This repository has been archived by the owner on Jan 26, 2024. It is now read-only.

Commit

Promote till commit '6c224352715359721224b36e6ec5ef4b17517be5'
Change-Id: I22ec1e32c2c10311b26a83c7cf6347b87038711a
mangupta committed Mar 15, 2023
2 parents cb18e75 + 6c22435 commit 5a27d34
Showing 40 changed files with 217 additions and 986 deletions.
1 change: 0 additions & 1 deletion cmake/ROCclr.cmake
@@ -100,7 +100,6 @@ endif()
 
 target_compile_definitions(rocclr PUBLIC
   LITTLEENDIAN_CPU
-  WITH_LIQUID_FLASH=0
   ${AMD_OPENCL_DEFS})
 
 target_include_directories(rocclr PUBLIC
10 changes: 6 additions & 4 deletions device/blit.cpp
@@ -729,14 +729,16 @@ bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, siz
                                         std::vector<FillBufferInfo>& packed_info) {
 
   // 1. Validate input arguments
-  guarantee(fill_size >= pattern_size, "Pattern Size cannot be greater than fill size");
-  guarantee(fill_size <= memory.size(), "Cannot fill more than the mem object size");
+  guarantee(fill_size >= pattern_size, "Pattern Size: %u cannot be greater than fill size: %u \n",
+            pattern_size, fill_size);
+  guarantee(fill_size <= memory.size(), "Cannot fill: %u more than the mem object size:%u \n",
+            fill_size, memory.size());
 
   // 2. Calculate the next closest dword aligned address for faster processing
   size_t dst_addr = memory.virtualAddress() + fill_origin;
   size_t aligned_dst_addr = amd::alignUp(dst_addr, sizeof(size_t));
-  guarantee(aligned_dst_addr >= dst_addr, "Aligned address cannot be greater than destination"
-            "address");
+  guarantee(aligned_dst_addr >= dst_addr, "Aligned address: %u cannot be greater than destination"
+            "address :%u \n", aligned_dst_addr, dst_addr);
 
   // 3. If given address is not aligned calculate head and tail size.
   size_t head_size = std::min(aligned_dst_addr - dst_addr, fill_size);
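For reference, the head/body/tail split that PackInfo computes around the size_t-aligned destination can be sketched as follows. This is a standalone illustration with hypothetical helper names (alignUp here is a local re-implementation), not the actual HostBlitManager code:

#include <algorithm>
#include <cstddef>

// Round an address up to the next multiple of align (align must be a power of two).
static size_t alignUp(size_t value, size_t align) {
  return (value + align - 1) & ~(align - 1);
}

struct FillSplit {
  size_t head;  // unaligned bytes before the first size_t-aligned address
  size_t body;  // size_t-aligned middle portion
  size_t tail;  // leftover bytes smaller than one size_t word
};

static FillSplit SplitFill(size_t dst_addr, size_t fill_size) {
  size_t aligned = alignUp(dst_addr, sizeof(size_t));
  FillSplit s{};
  s.head = std::min(aligned - dst_addr, fill_size);
  size_t remaining = fill_size - s.head;
  s.tail = remaining % sizeof(size_t);
  s.body = remaining - s.tail;
  return s;
}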
1 change: 1 addition & 0 deletions device/devhcprintf.cpp
@@ -23,6 +23,7 @@
 
 #include <assert.h>
 #include <cstdarg>
+#include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include <string>
7 changes: 3 additions & 4 deletions device/devhostcall.cpp
@@ -84,10 +84,11 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t*
       if (!messages.handlePayload(service, payload)) {
         ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: invalid request for service \"%d\".",
                 service);
-        amd::report_fatal(__FILE__, __LINE__, "Hostcall: invalid service request.");
+        guarantee(false, "Hostcall: invalid service request %d \n", service);
       }
       return;
     case SERVICE_DEVMEM: {
+      guarantee(payload[0] != 0 || payload[1] != 0, "Both payloads cannot be 0 \n");
       if (payload[0]) {
         amd::Memory* mem = amd::MemObjMap::FindMemObj(reinterpret_cast<void*>(payload[0]));
         if (mem) {
@@ -114,9 +115,7 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t*
       return;
     }
     default:
-      ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: no handler found for service ID \"%d\".",
-              service);
-      amd::report_fatal(__FILE__, __LINE__, "Hostcall service not supported.");
+      guarantee(false, "Hostcall: no handler found for service ID %d \n", service);
       return;
   }
 }
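Several hunks in this commit replace a log-then-report_fatal sequence with a single printf-style guarantee(condition, format, ...) call. A minimal sketch of such a helper, assuming variadic formatting and abort-on-failure semantics (the real ROCclr implementation may differ):

#include <cstdarg>
#include <cstdio>
#include <cstdlib>

// If cond is false, print the formatted message and terminate the process.
static void guarantee(bool cond, const char* fmt, ...) {
  if (cond) return;
  va_list args;
  va_start(args, fmt);
  std::vfprintf(stderr, fmt, args);
  va_end(args);
  std::abort();
}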
14 changes: 4 additions & 10 deletions device/device.cpp
@@ -289,11 +289,8 @@ void MemObjMap::AddMemObj(const void* k, amd::Memory* v) {
 void MemObjMap::RemoveMemObj(const void* k) {
   amd::ScopedLock lock(AllocatedLock_);
   auto rval = MemObjMap_.erase(reinterpret_cast<uintptr_t>(k));
-  if (rval != 1) {
-    DevLogPrintfError("Memobj map does not have ptr: 0x%x",
-                      reinterpret_cast<uintptr_t>(k));
-    guarantee(false, "Memobj map does not have ptr");
-  }
+  guarantee(rval == 1, "Memobj map does not have ptr: 0x%x",
+            reinterpret_cast<uintptr_t>(k));
 }
 
 amd::Memory* MemObjMap::FindMemObj(const void* k, size_t* offset) {
@@ -328,11 +325,8 @@ void MemObjMap::AddVirtualMemObj(const void* k, amd::Memory* v) {
 void MemObjMap::RemoveVirtualMemObj(const void* k) {
   amd::ScopedLock lock(AllocatedLock_);
   auto rval = VirtualMemObjMap_.erase(reinterpret_cast<uintptr_t>(k));
-  if (rval != 1) {
-    DevLogPrintfError("Virtual Memobj map does not have ptr: 0x%x",
-                      reinterpret_cast<uintptr_t>(k));
-    guarantee(false, "VirtualMemobj map does not have ptr");
-  }
+  guarantee(rval == 1, "Virtual Memobj map does not have ptr: 0x%x",
+            reinterpret_cast<uintptr_t>(k));
 }
 
 amd::Memory* MemObjMap::FindVirtualMemObj(const void* k) {
24 changes: 11 additions & 13 deletions device/device.hpp
@@ -89,7 +89,6 @@ class SvmFillMemoryCommand;
 class SvmMapMemoryCommand;
 class SvmUnmapMemoryCommand;
 class SvmPrefetchAsyncCommand;
-class TransferBufferFileCommand;
 class StreamOperationCommand;
 class VirtualMapCommand;
 class ExternalSemaphoreCmd;
@@ -154,7 +153,6 @@ enum OclExtensions {
   ClKhrD3d9Sharing,
 #endif
   ClKhrImage2dFromBuffer,
-  ClAmdSemaphore,
   ClAMDBusAddressableMemory,
   ClAMDC11Atomics,
   ClKhrSpir,
@@ -163,8 +161,6 @@ enum OclExtensions {
   ClKhrDepthImages,
   ClKhrMipMapImage,
   ClKhrMipMapImageWrites,
-  ClKhrIlProgram,
-  ClAMDLiquidFlash,
   ClAmdCopyBufferP2P,
   ClAmdAssemblyProgram,
 #if defined(_WIN32)
@@ -200,7 +196,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ",
   "cl_khr_dx9_media_sharing ",
 #endif
   "cl_khr_image2d_from_buffer ",
-  "",
   "cl_amd_bus_addressable_memory ",
   "cl_amd_c11_atomics ",
   "cl_khr_spir ",
@@ -209,8 +204,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ",
   "cl_khr_depth_images ",
   "cl_khr_mipmap_image ",
   "cl_khr_mipmap_image_writes ",
-  "",
-  "cl_amd_liquid_flash ",
   "cl_amd_copy_buffer_p2p ",
   "cl_amd_assembly_program ",
 #if defined(_WIN32)
@@ -1241,9 +1234,6 @@ class VirtualDevice : public amd::HeapObject {
   /// Optional extensions
   virtual void submitSignal(amd::SignalCommand& cmd) = 0;
   virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) = 0;
-  virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) {
-    ShouldNotReachHere();
-  }
   virtual void submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
     ShouldNotReachHere();
   }
@@ -1270,6 +1260,9 @@ class VirtualDevice : public amd::HeapObject {
   //! Returns fence state of the VirtualGPU
   virtual bool isFenceDirty() const = 0;
 
+  //! Resets fence state of the VirtualGPU
+  virtual void resetFenceDirty() = 0;
+
 private:
   //! Disable default copy constructor
   VirtualDevice& operator=(const VirtualDevice&);
@@ -1796,9 +1789,14 @@ class Device : public RuntimeObject {
 
   // Returns the status of HW event, associated with amd::Event
   virtual bool IsHwEventReady(
-      const amd::Event& event,  //!< AMD event for HW status validation
-      bool wait = false         //!< If true then forces the event completion
-      ) const {
+      const amd::Event& event,    //!< AMD event for HW status validation
+      bool wait = false) const {  //!< If true then forces the event completion
     return false;
   };
+
+  // Returns the status of HW event, associated with amd::Event
+  virtual bool IsHwEventReadyForcedWait(
+      const amd::Event& event) const {  //!< AMD event for HW status validation
+    return false;
+  };
 
2 changes: 1 addition & 1 deletion device/devprogram.cpp
@@ -2977,7 +2977,7 @@ bool Program::runInitFiniKernel(kernel_kind_t kind) const {
   amd::HostQueue* queue = nullptr;
 
   for (const auto& i : kernels_) {
-    LogPrintfInfo("For Init/Fini: Kernel Name: %s", i.first.c_str());
+    ClPrint(amd::LOG_INFO, amd::LOG_INIT, "For Init/Fini: Kernel Name: %s", i.first.c_str());
     const auto &kernel = i.second;
     if ((kernel->isInitKernel() && kind == kernel_kind_t::InitKernel) ||
         (kernel->isFiniKernel() && kind == kernel_kind_t::FiniKernel)) {
1 change: 1 addition & 0 deletions device/devwavelimiter.cpp
@@ -328,6 +328,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
 // ================================================================================================
 void WaveLimiterManager::enable(bool isSupported) {
   if (fixed_ > 0) {
+    enable_ = GPU_WAVE_LIMIT_ENABLE;
     return;
   }
 
11 changes: 7 additions & 4 deletions device/pal/paldevice.cpp
@@ -108,6 +108,7 @@ static constexpr PalDevice supportedPalDevices[] = {
   {11, 0, 1, Pal::GfxIpLevel::GfxIp11_0, "gfx1101", Pal::AsicRevision::Navi32},
   {11, 0, 2, Pal::GfxIpLevel::GfxIp11_0, "gfx1102", Pal::AsicRevision::Navi33},
   {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix1},
+  {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix2},
 };
 
 static std::tuple<const amd::Isa*, const char*> findIsa(Pal::AsicRevision asicRevision,
@@ -632,7 +633,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
   info_.cooperativeGroups_ = settings().enableCoopGroups_;
   info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_;
 
-  if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) {
+  if (amd::IS_HIP) {
+    info_.largeBar_ = false;
+  } else if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) {
     info_.largeBar_ = true;
     ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Resizable bar enabled");
   }
@@ -2356,8 +2359,8 @@ void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
 }
 
 // ================================================================================================
-void Device::HiddenHeapAlloc() {
-  auto HeapAlloc = [this]() -> bool {
+void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
+  auto HeapAlloc = [this, &gpu]() -> bool {
     // Allocate initial heap for device memory allocator
     static constexpr size_t HeapBufferSize = 128 * Ki;
     heap_buffer_ = createMemory(HeapBufferSize);
@@ -2369,7 +2372,7 @@ void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
       LogError("Heap buffer allocation failed!");
       return false;
     }
-    bool result = static_cast<const KernelBlitManager&>(xferMgr()).initHeap(
+    bool result = static_cast<const KernelBlitManager&>(gpu.blitMgr()).initHeap(
         heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi));
 
     return result;
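The change above routes heap initialization through the blit manager of the queue that triggered the allocation, rather than the device-wide transfer manager. A rough sketch of that lazy, once-only pattern follows; the types (QueueSketch, BlitSketch) are purely illustrative and the std::call_once guard is only an assumption about how the real code serializes the allocation:

#include <cstddef>
#include <mutex>
#include <vector>

struct BlitSketch {
  // Pretend to initialize the heap buffer through this queue's blit engine.
  bool initHeap(std::vector<char>& heap, std::size_t size) {
    heap.assign(size, 0);
    return true;
  }
};

struct QueueSketch {
  BlitSketch& blitMgr() { return blit_; }
  BlitSketch blit_;
};

class DeviceSketch {
 public:
  // Allocate the hidden heap at most once, using the caller's queue.
  void HiddenHeapAlloc(QueueSketch& gpu) {
    std::call_once(once_, [this, &gpu]() {
      allocated_ = gpu.blitMgr().initHeap(heap_, 128 * 1024);
    });
  }

 private:
  std::once_flag once_;
  std::vector<char> heap_;
  bool allocated_ = false;
};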
2 changes: 1 addition & 1 deletion device/pal/paldevice.hpp
@@ -629,7 +629,7 @@ class Device : public NullDevice {
 #endif
 #endif
   //! Allocates hidden heap for device memory allocations
-  void HiddenHeapAlloc();
+  void HiddenHeapAlloc(const VirtualGPU& gpu);
 
 private:
   static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
45 changes: 38 additions & 7 deletions device/pal/palgpuopen.cpp
@@ -54,9 +54,7 @@ RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device)
       se_mask_(0),
       perf_counter_mem_limit_(0),
       perf_counter_frequency_(0),
-      trace_enabled_(false),
-      inst_tracing_enabled_(false),
-      perf_counters_enabled_(false) {
+      value_(0) {
   memset(&trace_, 0, sizeof(trace_));
 }
 
@@ -176,6 +174,8 @@ bool RgpCaptureMgr::Update(Pal::IPlatform* platform) {
     PostDeviceCreate();
   }
 
+  static_vm_id_ = device_.properties().gfxipProperties.flags.supportStaticVmid;
+
   return result;
 }
 
@@ -189,12 +189,12 @@ bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue,
   // Get the OS context handle for this queue (this is a thing that RGP needs on DX clients;
   // it may be optional for Vulkan, but we provide it anyway if available).
   Pal::KernelContextInfo kernelContextInfo = {};
-
   Pal::Result palResult = iQueue->QueryKernelContextInfo(&kernelContextInfo);
 
   // Ensure we've acquired the debug VMID (note that some platforms do not
   // implement this function, so don't fail the whole trace if so)
   *debug_vmid = kernelContextInfo.flags.hasDebugVmid;
+  assert((static_vm_id_ || *debug_vmid) && "Can't capture multiple queues!");
 
   // Register the queue with the GPA session class for timed queue operation support.
   if (trace_.gpa_session_->RegisterTimedQueue(
@@ -278,6 +278,21 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) {
         // continue until we find the right queue...
       } else if (Pal::Result::Success == res) {
         trace_.sqtt_disp_count_ = 0;
+        // Stop the trace and save the result. Currently runtime can't delay upload in HIP,
+        // because default stream doesn't have explicit destruction and
+        // OS kills all threads on exit without any notification. That includes PAL RGP threads.
+        {
+          if (trace_.status_ == TraceStatus::WaitingForSqtt) {
+            auto result = EndRGPTrace(gpu);
+          }
+          // Check if runtime is waiting for the final trace results
+          if (trace_.status_ == TraceStatus::WaitingForResults) {
+            // If results are ready, then finish the trace
+            if (CheckForTraceResults() == Pal::Result::Success) {
+              FinishRGPTrace(gpu, false);
+            }
+          }
+        }
       } else {
         FinishRGPTrace(gpu, true);
       }
@@ -517,11 +532,17 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) {
     }
   }
 
-  // Notify the RGP server that we are starting a trace
-  if (rgp_server_->BeginTrace() != DevDriver::Result::Success) {
-    result = Pal::Result::ErrorUnknown;
+  if (static_vm_id_) {
+    result = device_.iDev()->SetStaticVmidMode(true);
+    assert(result == Pal::Result::Success && "Static VM ID setup failed!");
   }
 
+  if (result == Pal::Result::Success) {
+    // Notify the RGP server that we are starting a trace
+    if (rgp_server_->BeginTrace() != DevDriver::Result::Success) {
+      result = Pal::Result::ErrorUnknown;
+    }
+  }
   // Tell the GPA session class we're starting a trace
   if (result == Pal::Result::Success) {
     GpuUtil::GpaSessionBeginInfo info = {};
@@ -707,6 +728,7 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) {
     return;
   }
 
+  auto disp_count = trace_.sqtt_disp_count_;
   // Finish the trace if the queue was destroyed before OCL reached
   // the number of captured dispatches
   if (trace_.sqtt_disp_count_ != 0) {
@@ -736,9 +758,18 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) {
   } else {
     rgp_server_->EndTrace();
   }
+
+  if (static_vm_id_) {
+    auto result = device_.iDev()->SetStaticVmidMode(false);
+    assert(result == Pal::Result::Success && "Static VM ID setup failed!");
+  }
+
   if (trace_.gpa_session_ != nullptr) {
     trace_.gpa_session_->Reset();
   }
+  // If applicaiton exits, then Windows kills all threads and
+  // RGP can't finish data write into a file.
+  amd::Os::sleep(10 * disp_count + 500);
   // Reset tracing state to idle
   trace_.prepared_disp_count_ = 0;
   trace_.sqtt_disp_count_ = 0;
7 changes: 4 additions & 3 deletions device/pal/palgpuopen.hpp
@@ -410,9 +410,10 @@ class RgpCaptureMgr {
 
   union {
     struct {
-      uint32_t trace_enabled_ : 1;      // True if tracing is currently enabled (master flag)
-      uint32_t inst_tracing_enabled_;   // Enable instruction-level SQTT tokens
-      uint32_t perf_counters_enabled_;  // True if perf counters are enabled
+      uint32_t trace_enabled_: 1;         // True if tracing is currently enabled (master flag)
+      uint32_t inst_tracing_enabled_: 1;  // Enable instruction-level SQTT tokens
+      uint32_t perf_counters_enabled_: 1; // True if perf counters are enabled
+      uint32_t static_vm_id_: 1;          // Static VM ID can be used for capture
     };
     uint32_t value_;
  };
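The missing ": 1" widths are the notable part of this hunk: without them the middle members occupy full 32-bit words, so the anonymous struct no longer overlays the single uint32_t value_ that the constructor now zeroes with value_(0). A standalone sketch (not the ROCclr header) showing the packed layout:

#include <cstdint>

// Mirrors the anonymous-struct-in-union pattern used above (a common
// compiler extension accepted by GCC, Clang, and MSVC).
union Flags {
  struct {
    uint32_t trace_enabled : 1;
    uint32_t inst_tracing_enabled : 1;
    uint32_t perf_counters_enabled : 1;
    uint32_t static_vm_id : 1;
  };
  uint32_t value;
};

// With explicit 1-bit widths, all four flags share one 32-bit word,
// so writing value = 0 clears every flag at once.
static_assert(sizeof(Flags) == sizeof(uint32_t), "flags pack into one word");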
2 changes: 1 addition & 1 deletion device/pal/palkernel.cpp
@@ -362,7 +362,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
       case amd::KernelParameterDescriptor::HiddenHeap:
         // Allocate hidden heap for HIP applications only
         if ((amd::IS_HIP) && (palDevice().HeapBuffer() == nullptr)) {
-          const_cast<Device&>(palDevice()).HiddenHeapAlloc();
+          const_cast<Device&>(palDevice()).HiddenHeapAlloc(gpu);
         }
         if (palDevice().HeapBuffer() != nullptr) {
           // Add heap pointer to the code
6 changes: 4 additions & 2 deletions device/pal/palsettings.cpp
@@ -78,9 +78,10 @@ Settings::Settings() {
 
   // By default use host blit
   blitEngine_ = BlitEngineHost;
-  pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi;
+  pinnedXferSize_ = GPU_PINNED_XFER_SIZE * Mi;
+  size_t defaultMinXferSize = amd::IS_HIP ? 128: 4;
   pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)
-                           ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi;
+                           ? defaultMinXferSize * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi;
 
   // Disable FP_FAST_FMA defines by default
   reportFMAF_ = false;
@@ -205,6 +206,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
     case Pal::AsicRevision::Navi31:
     // Fall through for Navi2x ...
     case Pal::AsicRevision::Phoenix1:
+    case Pal::AsicRevision::Phoenix2:
     case Pal::AsicRevision::Raphael:
     case Pal::AsicRevision::Rembrandt:
     case Pal::AsicRevision::Navi24: