From 30f912cbd33ecf7c539db0c334320633d927cd1d Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Fri, 20 Jan 2023 03:00:09 -0500 Subject: [PATCH 01/45] SWDEV-2 - Change OpenCL version number from 3558 to 3559 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index 2a85df5e..dde35676 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3558 +#define AMD_PLATFORM_BUILD_NUMBER 3559 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 3b1753706b08fd8e17f986d56b216e908e7ed972 Mon Sep 17 00:00:00 2001 From: German Date: Thu, 19 Jan 2023 18:53:11 -0500 Subject: [PATCH 02/45] SWDEV-377991 - Remove liquidflash support Change-Id: Iba6455e5c0210c3223a06fec332404cd9f489154 --- cmake/ROCclr.cmake | 1 - device/device.hpp | 6 ---- device/devwavelimiter.cpp | 1 + device/pal/palkernel.cpp | 1 + device/pal/palvirtual.cpp | 44 +------------------------- device/pal/palvirtual.hpp | 1 - device/rocm/rocvirtual.cpp | 49 ++-------------------------- device/rocm/rocvirtual.hpp | 2 -- platform/command.cpp | 65 -------------------------------------- platform/command.hpp | 60 ----------------------------------- platform/context.cpp | 16 ---------- platform/memory.hpp | 36 --------------------- platform/object.hpp | 5 +-- 13 files changed, 6 insertions(+), 281 deletions(-) diff --git a/cmake/ROCclr.cmake b/cmake/ROCclr.cmake index 4d8f6a25..c7795ba8 100644 --- a/cmake/ROCclr.cmake +++ b/cmake/ROCclr.cmake @@ -100,7 +100,6 @@ endif() target_compile_definitions(rocclr PUBLIC LITTLEENDIAN_CPU - WITH_LIQUID_FLASH=0 ${AMD_OPENCL_DEFS}) target_include_directories(rocclr PUBLIC diff --git a/device/device.hpp b/device/device.hpp index 64d67465..5aad54bb 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -89,7 +89,6 @@ class SvmFillMemoryCommand; class SvmMapMemoryCommand; 
class SvmUnmapMemoryCommand; class SvmPrefetchAsyncCommand; -class TransferBufferFileCommand; class StreamOperationCommand; class VirtualMapCommand; class ExternalSemaphoreCmd; @@ -164,7 +163,6 @@ enum OclExtensions { ClKhrMipMapImage, ClKhrMipMapImageWrites, ClKhrIlProgram, - ClAMDLiquidFlash, ClAmdCopyBufferP2P, ClAmdAssemblyProgram, #if defined(_WIN32) @@ -210,7 +208,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ", "cl_khr_mipmap_image ", "cl_khr_mipmap_image_writes ", "", - "cl_amd_liquid_flash ", "cl_amd_copy_buffer_p2p ", "cl_amd_assembly_program ", #if defined(_WIN32) @@ -1241,9 +1238,6 @@ class VirtualDevice : public amd::HeapObject { /// Optional extensions virtual void submitSignal(amd::SignalCommand& cmd) = 0; virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) = 0; - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - ShouldNotReachHere(); - } virtual void submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { ShouldNotReachHere(); } diff --git a/device/devwavelimiter.cpp b/device/devwavelimiter.cpp index beec1bce..deaadd98 100644 --- a/device/devwavelimiter.cpp +++ b/device/devwavelimiter.cpp @@ -328,6 +328,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( // ================================================================================================ void WaveLimiterManager::enable(bool isSupported) { if (fixed_ > 0) { + enable_ = GPU_WAVE_LIMIT_ENABLE; return; } diff --git a/device/pal/palkernel.cpp b/device/pal/palkernel.cpp index 19c6d9fe..e6e4fd28 100644 --- a/device/pal/palkernel.cpp +++ b/device/pal/palkernel.cpp @@ -498,6 +498,7 @@ const LightningProgram& LightningKernel::prog() const { #if defined(USE_COMGR_LIBRARY) bool LightningKernel::init() { + waveLimiter_.enable(); return GetAttrCodePropMetadata(); } diff --git a/device/pal/palvirtual.cpp b/device/pal/palvirtual.cpp index 64b5bfa6..e7465c8b 100644 --- a/device/pal/palvirtual.cpp 
+++ b/device/pal/palvirtual.cpp @@ -3741,49 +3741,7 @@ bool VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) { return false; } -void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - size_t copySize = cmd.size()[0]; - size_t fileOffset = cmd.fileOffset(); - Memory* mem = dev().getGpuMemory(&cmd.memory()); - uint idx = 0; - - assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || - (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); - const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); - - if (writeBuffer) { - size_t dstOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; - dstSize = std::min(dstSize, copySize); - void* dstBuffer = staging->cpuMap(*this); - staging->cpuUnmap(*this); - - blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false); - flushDMA(staging->getGpuEvent(*this)->engineId_); - fileOffset += dstSize; - dstOffset += dstSize; - copySize -= dstSize; - } - } else { - size_t srcOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; - srcSize = std::min(srcSize, copySize); - blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false); - - void* srcBuffer = staging->cpuMap(*this); - staging->cpuUnmap(*this); - - fileOffset += srcSize; - srcOffset += srcSize; - copySize -= srcSize; - } - } -} - +// ================================================================================================ void* VirtualGPU::getOrCreateHostcallBuffer() { if (hostcallBuffer_ != nullptr) { return hostcallBuffer_; diff --git a/device/pal/palvirtual.hpp b/device/pal/palvirtual.hpp index 1bbfe8e9..86240649 100644 --- a/device/pal/palvirtual.hpp +++ b/device/pal/palvirtual.hpp @@ -345,7 +345,6 @@ class VirtualGPU : public device::VirtualDevice 
{ virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); virtual void submitVirtualMap(amd::VirtualMapCommand& cmd); virtual void submitStreamOperation(amd::StreamOperationCommand& cmd); void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd); diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 56b6022f..7747d147 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -3353,55 +3353,10 @@ amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) { return nullptr; } +// ================================================================================================ void VirtualGPU::enableSyncBlit() const { blitMgr_->enableSynchronization(); } -void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - size_t copySize = cmd.size()[0]; - size_t fileOffset = cmd.fileOffset(); - Memory* mem = dev().getRocMemory(&cmd.memory()); - uint idx = 0; - - assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || - (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); - const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); - - if (writeBuffer) { - size_t dstOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getRocMemory(&cmd.staging(idx)); - size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; - dstSize = std::min(dstSize, copySize); - void* dstBuffer = staging->cpuMap(*this); - - staging->cpuUnmap(*this); - - bool result = blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false); - fileOffset += dstSize; - dstOffset += dstSize; - copySize -= dstSize; - } - } else { - size_t srcOffset = cmd.origin()[0]; - while 
(copySize > 0) { - Memory* staging = dev().getRocMemory(&cmd.staging(idx)); - size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; - srcSize = std::min(srcSize, copySize); - bool result = blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false); - - void* srcBuffer = staging->cpuMap(*this); - - staging->cpuUnmap(*this); - - fileOffset += srcSize; - srcOffset += srcSize; - copySize -= srcSize; - } - } -} - +// ================================================================================================ void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index 11a3670e..bf0bb4ce 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -341,8 +341,6 @@ class VirtualGPU : public device::VirtualDevice { virtual void submitSignal(amd::SignalCommand& cmd) {} virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {} - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); - void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {} void submitThreadTrace(amd::ThreadTraceCommand& vcmd) {} diff --git a/platform/command.cpp b/platform/command.cpp index cc35579a..e02cb153 100644 --- a/platform/command.cpp +++ b/platform/command.cpp @@ -726,71 +726,6 @@ bool ThreadTraceMemObjectsCommand::validateMemory() { return true; } -void TransferBufferFileCommand::releaseResources() { - for (uint i = 0; i < NumStagingBuffers; ++i) { - if (NULL != staging_[i]) { - staging_[i]->release(); - } - } - - // Call the parent - OneMemoryArgCommand::releaseResources(); -} - -void TransferBufferFileCommand::submit(device::VirtualDevice& device) { - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (memory_->getMemFlags() & - (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | 
CL_MEM_USE_PERSISTENT_MEM_AMD)) { - void* srcDstBuffer = nullptr; - if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Lock protected multiple maps for persistent memory - amd::ScopedLock lock(mem->owner()->lockMemoryOps()); - srcDstBuffer = mem->cpuMap(device); - } else { - srcDstBuffer = mem->cpuMap(device); - } - // Make HD transfer to the host accessible memory - bool writeBuffer(type() == CL_COMMAND_READ_SSG_FILE_AMD); - if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Lock protected multiple maps for persistent memory - amd::ScopedLock lock(mem->owner()->lockMemoryOps()); - mem->cpuUnmap(device); - } else { - mem->cpuUnmap(device); - } - } else { - device.submitTransferBufferFromFile(*this); - } -} - -bool TransferBufferFileCommand::validateMemory() { - // Check if the destination buffer has direct host access - if (!(memory_->getMemFlags() & - (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD))) { - // Allocate staging buffers - for (uint i = 0; i < NumStagingBuffers; ++i) { - staging_[i] = new (memory_->getContext()) - Buffer(memory_->getContext(), StagingBufferMemType, StagingBufferSize); - if (NULL == staging_[i] || !staging_[i]->create(nullptr)) { - DevLogPrintfError("Staging Create failed, Staging[%d]: 0x%x", i, staging_[i]); - return false; - } - device::Memory* mem = staging_[i]->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate staging buffer - 0x%08X bytes!", staging_[i]->getSize()); - return false; - } - } - } - - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory_->getSize()); - return false; - } - return true; -} - bool CopyMemoryP2PCommand::validateMemory() { amd::Device* queue_device = &queue()->device(); diff --git a/platform/command.hpp b/platform/command.hpp index 7022ec3d..45d9e365 100644 --- a/platform/command.hpp +++ 
b/platform/command.hpp @@ -1642,66 +1642,6 @@ class SvmUnmapMemoryCommand : public Command { void* svmPtr() const { return svmPtr_; } }; -/*! \brief A generic transfer memory from/to file command. - * - * \details Currently supports buffers only. Buffers - * are treated as 1D structures so origin_[0] and size_[0] - * are equivalent to offset_ and count_ respectively. - */ -class TransferBufferFileCommand : public OneMemoryArgCommand { - public: - static constexpr uint NumStagingBuffers = 2; - static constexpr size_t StagingBufferSize = 4 * Mi; - static constexpr uint StagingBufferMemType = CL_MEM_USE_PERSISTENT_MEM_AMD; - - protected: - const Coord3D origin_; //!< Origin of the region to write to - const Coord3D size_; //!< Size of the region to write to - LiquidFlashFile* file_; //!< The file object for data read - size_t fileOffset_; //!< Offset in the file for data read - amd::Memory* staging_[NumStagingBuffers]; //!< Staging buffers for transfer - - public: - TransferBufferFileCommand(cl_command_type type, HostQueue& queue, - const EventWaitList& eventWaitList, Memory& memory, - const Coord3D& origin, const Coord3D& size, LiquidFlashFile* file, - size_t fileOffset) - : OneMemoryArgCommand(queue, type, eventWaitList, memory), - origin_(origin), - size_(size), - file_(file), - fileOffset_(fileOffset) { - // Sanity checks - assert(size.c[0] > 0 && "invalid"); - for (uint i = 0; i < NumStagingBuffers; ++i) { - staging_[i] = NULL; - } - } - - virtual void releaseResources(); - - virtual void submit(device::VirtualDevice& device); - - //! Return the memory object to write to - Memory& memory() const { return *memory_; } - - //! Return the host memory to read from - LiquidFlashFile* file() const { return file_; } - - //! Returns file offset - size_t fileOffset() const { return fileOffset_; } - - //! Return the region origin - const Coord3D& origin() const { return origin_; } - //! Return the region size - const Coord3D& size() const { return size_; } - - //! 
Return the staging buffer for transfer - Memory& staging(uint i) const { return *staging_[i]; } - - bool validateMemory(); -}; - /*! \brief A P2P copy memory command * * \details Used for buffers only. Backends are expected diff --git a/platform/context.cpp b/platform/context.cpp index dbb46c38..965269e8 100644 --- a/platform/context.cpp +++ b/platform/context.cpp @@ -34,14 +34,6 @@ #include "CL/cl_dx9_media_sharing.h" #endif //_WIN32 -#ifndef WITH_LIQUID_FLASH -#if (!defined(BUILD_HSA_TARGET) && defined(WITH_HSA_DEVICE) && \ - defined(WITH_AMDGPU_PRO)) || defined(_WIN32) -#define WITH_LIQUID_FLASH 1 -#include "lf.h" -#endif -#endif - namespace amd { Context::Context(const std::vector& devices, const Info& info) @@ -91,10 +83,6 @@ Context::~Context() { delete[] properties_; delete glenv_; - -#if WITH_LIQUID_FLASH - lfTerminate(); -#endif } int Context::checkProperties(const cl_context_properties* properties, Context::Info* info) { @@ -315,10 +303,6 @@ int Context::create(const intptr_t* properties) { } } -#if WITH_LIQUID_FLASH - lfInit(); -#endif - return result; } diff --git a/platform/memory.hpp b/platform/memory.hpp index 34ea71f8..0b6ace9f 100644 --- a/platform/memory.hpp +++ b/platform/memory.hpp @@ -663,42 +663,6 @@ class SvmBuffer : AllStatic { static Monitor AllocatedLock_; }; -#ifndef CL_COMMAND_WRITE_SSG_FILE_AMD -#define CL_COMMAND_WRITE_SSG_FILE_AMD 2 -#endif -#ifndef CL_COMMAND_READ_SSG_FILE_AMD -#define CL_COMMAND_READ_SSG_FILE_AMD 1 -#endif -#ifndef cl_file_flags_amd -typedef uint32_t cl_file_flags_amd; -#endif - //! 
Liquid flash extension -class LiquidFlashFile : public RuntimeObject { - private: - std::wstring name_; - cl_file_flags_amd flags_; - void* handle_; - uint32_t blockSize_; - uint64_t fileSize_; - - public: - LiquidFlashFile(const wchar_t* name, cl_file_flags_amd flags) - : name_(name), flags_(flags), handle_(NULL), blockSize_(0), fileSize_(0) {} - - ~LiquidFlashFile(); - - bool open(); - void close(); - - uint32_t blockSize() const { return blockSize_; }; - uint64_t fileSize() const { return fileSize_; }; - - bool transferBlock(bool read, void* dst, uint64_t bufferSize, uint64_t fileOffset, - uint64_t bufferOffset, uint64_t size) const; - - virtual ObjectType objectType() const { return ObjectTypeLiquidFlashFile; } -}; - class ArenaMemory: public Buffer { public: ArenaMemory(Context& context) diff --git a/platform/object.hpp b/platform/object.hpp index 9fca95ab..0f5ed636 100644 --- a/platform/object.hpp +++ b/platform/object.hpp @@ -41,9 +41,7 @@ #define AMD_CL_TYPES_DO(F) \ F(cl_counter_amd, Counter) \ F(cl_perfcounter_amd, PerfCounter) \ - F(cl_threadtrace_amd, ThreadTrace) \ - F(cl_file_amd, LiquidFlashFile) - + F(cl_threadtrace_amd, ThreadTrace) #define CL_TYPES_DO(F) \ KHR_CL_TYPES_DO(F) \ @@ -144,7 +142,6 @@ class RuntimeObject : public ReferenceCountedObject, public ICDDispatchedObject ObjectTypeQueue = 8, ObjectTypeSampler = 9, ObjectTypeThreadTrace = 10, - ObjectTypeLiquidFlashFile = 11 }; virtual ObjectType objectType() const = 0; From 2448d5d31cacd9facaadf37a6896b036f654afb1 Mon Sep 17 00:00:00 2001 From: Jacob Lambert Date: Mon, 16 Jan 2023 19:40:56 -0800 Subject: [PATCH 03/45] SWDEV-376413 - Replace deprecated Comgr device-lib action The Comgr ADD_DEVICE_LIBRARIES action has been deprecated. 
In place of the previous two-action approach: AMD_COMGR_COMPILE_SOURCE_TO_BC AMD_COMGR_ADD_DEVICE_LIBRARIES We can now use a single combined action: AMD_COMGR_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC This new action more closely aligns with how device library management is done by the clang driver. Change-Id: Id844e9031a1896dedeacec453440b9babc4b111a --- device/devprogram.cpp | 37 +++++++++++-------------------------- device/devprogram.hpp | 6 +++--- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/device/devprogram.cpp b/device/devprogram.cpp index 5455d80c..6e4287b5 100644 --- a/device/devprogram.cpp +++ b/device/devprogram.cpp @@ -346,7 +346,7 @@ amd_comgr_status_t Program::createAction(const amd_comgr_language_t oclver, bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, const std::vector& options, amd::option::Options* amdOptions, amd_comgr_data_set_t* output, - char* binaryData[], size_t* binarySize, const bool link_dev_libs) { + char* binaryData[], size_t* binarySize) { amd_comgr_language_t langver; setLanguage(amdOptions->oVariables->CLStd, &langver); @@ -358,28 +358,13 @@ bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, // Create the action for linking amd_comgr_action_info_t action; - amd_comgr_data_set_t dataSetDevLibs; bool hasAction = false; - bool hasDataSetDevLibs = false; amd_comgr_status_t status = createAction(langver, options, &action, &hasAction); - if (link_dev_libs) { - if (status == AMD_COMGR_STATUS_SUCCESS) { - status = amd::Comgr::create_data_set(&dataSetDevLibs); - } - - if (status == AMD_COMGR_STATUS_SUCCESS) { - hasDataSetDevLibs = true; - status = amd::Comgr::do_action(AMD_COMGR_ACTION_ADD_DEVICE_LIBRARIES, action, inputs, - dataSetDevLibs); - extractBuildLog(dataSetDevLibs); - } - } - if (status == AMD_COMGR_STATUS_SUCCESS) { status = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, - (link_dev_libs) ? 
dataSetDevLibs : inputs, *output); + inputs, *output); extractBuildLog(*output); } @@ -396,17 +381,14 @@ bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, amd::Comgr::destroy_action_info(action); } - if (hasDataSetDevLibs) { - amd::Comgr::destroy_data_set(dataSetDevLibs); - } - return (status == AMD_COMGR_STATUS_SUCCESS); } bool Program::compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, const std::vector& options, amd::option::Options* amdOptions, - char* binaryData[], size_t* binarySize) { + char* binaryData[], size_t* binarySize, + const bool link_dev_libs) { amd_comgr_language_t langver; setLanguage(amdOptions->oVariables->CLStd, &langver); @@ -483,8 +465,12 @@ bool Program::compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, // Compiling the source codes with precompiled headers or directly compileInputs if (status == AMD_COMGR_STATUS_SUCCESS) { - status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, - action, input, output); + if (link_dev_libs) + status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, + action, input, output); + else + status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, + action, input, output); extractBuildLog(output); } @@ -970,9 +956,8 @@ bool Program::linkImplLC(const std::vector& inputPrograms, char* binaryData = nullptr; size_t binarySize = 0; std::vector linkOptions; - constexpr bool kLinkDevLibs = false; bool ret = linkLLVMBitcode(inputs, linkOptions, options, &output, &binaryData, - &binarySize, kLinkDevLibs); + &binarySize); amd::Comgr::destroy_data_set(output); amd::Comgr::destroy_data_set(inputs); diff --git a/device/devprogram.hpp b/device/devprogram.hpp index 3bf0679a..a324750e 100644 --- a/device/devprogram.hpp +++ b/device/devprogram.hpp @@ -456,13 +456,13 @@ class Program : public amd::HeapObject { bool linkLLVMBitcode(const amd_comgr_data_set_t inputs, const std::vector& options, amd::option::Options* amdOptions, 
amd_comgr_data_set_t* output, - char* binaryData[] = nullptr, size_t* binarySize = nullptr, - const bool link_dev_libs = true); + char* binaryData[] = nullptr, size_t* binarySize = nullptr); //! Create the bitcode of the compiled input dataset bool compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, const std::vector& options, amd::option::Options* amdOptions, - char* binaryData[], size_t* binarySize); + char* binaryData[], size_t* binarySize, + const bool link_dev_libs = true); //! Compile and create the excutable of the input dataset bool compileAndLinkExecutable(const amd_comgr_data_set_t inputs, From 6a671455e0f8404b43054f074e0c46e68a53b43e Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Sat, 21 Jan 2023 03:00:06 -0500 Subject: [PATCH 04/45] SWDEV-2 - Change OpenCL version number from 3559 to 3560 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index dde35676..d724d3ca 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3559 +#define AMD_PLATFORM_BUILD_NUMBER 3560 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 945467e0e40d88b5294b8a634b7d8d67f1b03d39 Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Wed, 18 Jan 2023 13:25:07 -0800 Subject: [PATCH 05/45] SWDEV-378006 - Adding a new guarantee macro to support printing args. 
Change-Id: I9c838644e31a84d96a44b2bd10525a08d805a047 --- device/blit.cpp | 10 ++++++---- device/devhostcall.cpp | 6 ++---- device/device.cpp | 14 ++++---------- utils/debug.cpp | 11 ----------- utils/debug.hpp | 9 +++------ 5 files changed, 15 insertions(+), 35 deletions(-) diff --git a/device/blit.cpp b/device/blit.cpp index 3c6b5dbf..a2c25376 100644 --- a/device/blit.cpp +++ b/device/blit.cpp @@ -729,14 +729,16 @@ bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, siz std::vector& packed_info) { // 1. Validate input arguments - guarantee(fill_size >= pattern_size, "Pattern Size cannot be greater than fill size"); - guarantee(fill_size <= memory.size(), "Cannot fill more than the mem object size"); + guarantee(fill_size >= pattern_size, "Pattern Size: %u cannot be greater than fill size: %u \n", + pattern_size, fill_size); + guarantee(fill_size <= memory.size(), "Cannot fill: %u more than the mem object size:%u \n", + fill_size, memory.size()); // 2. Calculate the next closest dword aligned address for faster processing size_t dst_addr = memory.virtualAddress() + fill_origin; size_t aligned_dst_addr = amd::alignUp(dst_addr, sizeof(size_t)); - guarantee(aligned_dst_addr >= dst_addr, "Aligned address cannot be greater than destination" - "address"); + guarantee(aligned_dst_addr >= dst_addr, "Aligned address: %u cannot be greater than destination" + "address :%u \n", aligned_dst_addr, dst_addr); // 3. If given address is not aligned calculate head and tail size. 
size_t head_size = std::min(aligned_dst_addr - dst_addr, fill_size); diff --git a/device/devhostcall.cpp b/device/devhostcall.cpp index 8226e3da..12e5fe90 100644 --- a/device/devhostcall.cpp +++ b/device/devhostcall.cpp @@ -84,7 +84,7 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t* if (!messages.handlePayload(service, payload)) { ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: invalid request for service \"%d\".", service); - amd::report_fatal(__FILE__, __LINE__, "Hostcall: invalid service request."); + guarantee(false, "Hostcall: invalid service request %d \n", service); } return; case SERVICE_DEVMEM: { @@ -114,9 +114,7 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t* return; } default: - ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: no handler found for service ID \"%d\".", - service); - amd::report_fatal(__FILE__, __LINE__, "Hostcall service not supported."); + guarantee(false, "Hostcall: no handler found for service ID %d \n", service); return; } } diff --git a/device/device.cpp b/device/device.cpp index 2709febd..f87452db 100644 --- a/device/device.cpp +++ b/device/device.cpp @@ -289,11 +289,8 @@ void MemObjMap::AddMemObj(const void* k, amd::Memory* v) { void MemObjMap::RemoveMemObj(const void* k) { amd::ScopedLock lock(AllocatedLock_); auto rval = MemObjMap_.erase(reinterpret_cast(k)); - if (rval != 1) { - DevLogPrintfError("Memobj map does not have ptr: 0x%x", - reinterpret_cast(k)); - guarantee(false, "Memobj map does not have ptr"); - } + guarantee(rval == 1, "Memobj map does not have ptr: 0x%x", + reinterpret_cast(k)); } amd::Memory* MemObjMap::FindMemObj(const void* k, size_t* offset) { @@ -328,11 +325,8 @@ void MemObjMap::AddVirtualMemObj(const void* k, amd::Memory* v) { void MemObjMap::RemoveVirtualMemObj(const void* k) { amd::ScopedLock lock(AllocatedLock_); auto rval = VirtualMemObjMap_.erase(reinterpret_cast(k)); - if (rval != 1) { - DevLogPrintfError("Virtual Memobj map 
does not have ptr: 0x%x", - reinterpret_cast(k)); - guarantee(false, "VirtualMemobj map does not have ptr"); - } + guarantee(rval == 1, "Virtual Memobj map does not have ptr: 0x%x", + reinterpret_cast(k)); } amd::Memory* MemObjMap::FindVirtualMemObj(const void* k) { diff --git a/utils/debug.cpp b/utils/debug.cpp index fc607564..cdda2516 100644 --- a/utils/debug.cpp +++ b/utils/debug.cpp @@ -49,17 +49,6 @@ extern "C" void breakpoint(void) { } //! \endcond -// ================================================================================================ -void report_fatal(const char* file, int line, const char* message) { - // FIXME_lmoriche: Obfuscate the message string - #if (defined(DEBUG)) - fprintf(outFile, "%s:%d: %s\n", file, line, message); - #else - fprintf(outFile, "%s\n", message); - #endif - ::abort(); -} - // ================================================================================================ void report_warning(const char* message) { fprintf(outFile, "Warning: %s\n", message); } diff --git a/utils/debug.hpp b/utils/debug.hpp index dd3ffd6e..07cb3b61 100644 --- a/utils/debug.hpp +++ b/utils/debug.hpp @@ -66,9 +66,6 @@ extern FILE* outFile; extern "C" void breakpoint(); //! \endcond -//! \brief Report a Fatal exception message and abort. -extern void report_fatal(const char* file, int line, const char* message); - //! \brief Display a warning message. extern void report_warning(const char* message); @@ -94,10 +91,10 @@ extern void log_printf(LogLevel level, const char* file, int line, uint64_t *sta #endif // __INTEL_COMPILER //! \brief Abort the program if the invariant \a cond is false. -#define guarantee(cond, message) \ +#define guarantee(cond, format, ...) \ if (!(cond)) { \ - amd::report_fatal(__FILE__, __LINE__, XSTR(message) ); \ - amd::breakpoint(); \ + amd::log_printf(amd::LOG_NONE, __FILE__, __LINE__, format, ##__VA_ARGS__); \ + ::abort(); \ } #define fixme_guarantee(cond, ...) 
guarantee(cond, __VA_ARGS__) From d31d50f4ed98e2b406dbe02e9a1f906994717204 Mon Sep 17 00:00:00 2001 From: German Date: Fri, 20 Jan 2023 10:26:53 -0500 Subject: [PATCH 06/45] SWDEV-377991 - Remove liquidflash support Remove amdgpu-pro interface for persistent memory, used in Liquidflash Change-Id: I7d1720ad0875a62ebb2d7f96cba39601d560a5df --- device/device.hpp | 4 - device/rocm/pro/lnxheaders.h | 46 ------- device/rocm/pro/prodevice.cpp | 241 ---------------------------------- device/rocm/pro/prodevice.hpp | 81 ------------ device/rocm/pro/prodriver.hpp | 52 -------- device/rocm/pro/profuncs.hpp | 85 ------------ device/rocm/rocdevice.cpp | 20 --- device/rocm/rocdevice.hpp | 7 - device/rocm/rocmemory.cpp | 9 -- 9 files changed, 545 deletions(-) delete mode 100644 device/rocm/pro/lnxheaders.h delete mode 100644 device/rocm/pro/prodevice.cpp delete mode 100644 device/rocm/pro/prodevice.hpp delete mode 100644 device/rocm/pro/prodriver.hpp delete mode 100644 device/rocm/pro/profuncs.hpp diff --git a/device/device.hpp b/device/device.hpp index 5aad54bb..a493795e 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -153,7 +153,6 @@ enum OclExtensions { ClKhrD3d9Sharing, #endif ClKhrImage2dFromBuffer, - ClAmdSemaphore, ClAMDBusAddressableMemory, ClAMDC11Atomics, ClKhrSpir, @@ -162,7 +161,6 @@ enum OclExtensions { ClKhrDepthImages, ClKhrMipMapImage, ClKhrMipMapImageWrites, - ClKhrIlProgram, ClAmdCopyBufferP2P, ClAmdAssemblyProgram, #if defined(_WIN32) @@ -198,7 +196,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ", "cl_khr_dx9_media_sharing ", #endif "cl_khr_image2d_from_buffer ", - "", "cl_amd_bus_addressable_memory ", "cl_amd_c11_atomics ", "cl_khr_spir ", @@ -207,7 +204,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ", "cl_khr_depth_images ", "cl_khr_mipmap_image ", "cl_khr_mipmap_image_writes ", - "", "cl_amd_copy_buffer_p2p ", "cl_amd_assembly_program ", #if defined(_WIN32) diff --git a/device/rocm/pro/lnxheaders.h 
b/device/rocm/pro/lnxheaders.h deleted file mode 100644 index 8c9fbe14..00000000 --- a/device/rocm/pro/lnxheaders.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -// NOTE: Some of the Linux driver stack's headers don't wrap their C-style interface names in 'extern "C" { ... }' -// blocks when building with a C++ compiler, so we need to add that ourselves. 
-#if __cplusplus -extern "C" -{ -#endif - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -constexpr int32_t InvalidFd = -1; // value representing a invalid file descriptor for Linux - -#if __cplusplus -} // extern "C" -#endif diff --git a/device/rocm/pro/prodevice.cpp b/device/rocm/pro/prodevice.cpp deleted file mode 100644 index 1e366cd2..00000000 --- a/device/rocm/pro/prodevice.cpp +++ /dev/null @@ -1,241 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
*/ - -#ifndef WITHOUT_HSA_BACKEND - -#include "hsa/hsa_ext_amd.h" -#include "lnxheaders.h" -#include "prodevice.hpp" -#include "amdgpu_drm.h" - -namespace roc { - -constexpr uint32_t kMaxDevices = 32; -constexpr uint32_t kAtiVendorId = 0x1002; - -void* ProDevice::lib_drm_handle_ = nullptr; -bool ProDevice::initialized_ = false; -drm::Funcs ProDevice::funcs_; - -IProDevice* IProDevice::Init(uint32_t bus, uint32_t device, uint32_t func) -{ - // Make sure DRM lib is initialized - if (!ProDevice::DrmInit()) { - return nullptr; - } - - ProDevice* pro_device = new ProDevice(); - - if (pro_device == nullptr || !pro_device->Create(bus, dev, func)) { - delete pro_device; - return nullptr; - } - return pro_device; -} - -ProDevice::~ProDevice() { - delete alloc_ops_; - - if (dev_handle_ != nullptr) { - Funcs().AmdgpuDeviceDeinitialize(dev_handle_); - } - if (file_desc_ > 0) { - close(file_desc_); - } -} - -bool ProDevice::DrmInit() -{ - if (initialized_ == false) { - // Find symbols in libdrm_amdgpu.so.1 - lib_drm_handle_ = dlopen("libdrm_amdgpu.so.1", RTLD_NOW); - if (lib_drm_handle_ == nullptr) { - return false; - } else { - funcs_.DrmGetDevices = reinterpret_cast(dlsym( - lib_drm_handle_, - "drmGetDevices")); - if (funcs_.DrmGetDevices == nullptr) return false; - funcs_.AmdgpuDeviceInitialize = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_device_initialize")); - if (funcs_.AmdgpuDeviceInitialize == nullptr) return false; - funcs_.AmdgpuDeviceDeinitialize = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_device_deinitialize")); - if (funcs_.AmdgpuDeviceDeinitialize == nullptr) return false; - funcs_.AmdgpuQueryGpuInfo = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_query_gpu_info")); - if (funcs_.AmdgpuQueryGpuInfo == nullptr) return false; - funcs_.AmdgpuQueryInfo = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_query_info")); - if (funcs_.AmdgpuQueryInfo == nullptr) return false; - funcs_.AmdgpuBoAlloc = reinterpret_cast(dlsym( - lib_drm_handle_, 
- "amdgpu_bo_alloc")); - if (funcs_.AmdgpuBoAlloc == nullptr) return false; - funcs_.AmdgpuBoExport = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_export")); - if (funcs_.AmdgpuBoExport == nullptr) return false; - funcs_.AmdgpuBoFree = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_free")); - if (funcs_.AmdgpuBoFree == nullptr) return false; - funcs_.AmdgpuBoCpuMap = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_cpu_map")); - if (funcs_.AmdgpuBoCpuMap == nullptr) return false; - funcs_.AmdgpuBoCpuUnmap = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_cpu_unmap")); - if (funcs_.AmdgpuBoCpuUnmap == nullptr) return false; - } - } - - initialized_ = true; - return true; -} - -#ifndef AMDGPU_CAPABILITY_SSG_FLAG -#define AMDGPU_CAPABILITY_SSG_FLAG 4 -#endif - -// ================================================================================================ -// Open drm device and initialize it. And also get the drm information. -bool ProDevice::Create(uint32_t bus, uint32_t device, uint32_t func) { - drmDevicePtr devices[kMaxDevices] = { }; - int32_t device_count = Funcs().DrmGetDevices(devices, kMaxDevices); - bool result = false; - - for (int32_t i = 0; i < device_count; i++) { - // Check if the device vendor is AMD - if (devices[i]->deviceinfo.pci->vendor_id != kAtiVendorId) { - continue; - } - if ((devices[i]->businfo.pci->bus == bus) && - (devices[i]->businfo.pci->dev == device) && - (devices[i]->businfo.pci->func == func)) { - - // pDevices[i]->nodes[DRM_NODE_PRIMARY]; - // Using render node here so that we can do the off-screen rendering without authentication - file_desc_ = open(devices[i]->nodes[DRM_NODE_RENDER], O_RDWR, 0); - - if (file_desc_ > 0) { - void* data, *file, *cap; - - // Initialize the admgpu device. 
- if (Funcs().AmdgpuDeviceInitialize(file_desc_, &major_ver_, - &minor_ver_, &dev_handle_) == 0) { - uint32_t version = 0; - // amdgpu_query_gpu_info will never fail only if it is initialized - Funcs().AmdgpuQueryGpuInfo(dev_handle_, &gpu_info_); - - drm_amdgpu_capability cap = {}; - Funcs().AmdgpuQueryInfo(dev_handle_, AMDGPU_INFO_CAPABILITY, sizeof(drm_amdgpu_capability), &cap); - - // Check if DGMA and SSG are available - if ((cap.flag & (AMDGPU_CAPABILITY_DIRECT_GMA_FLAG | AMDGPU_CAPABILITY_SSG_FLAG)) == - (AMDGPU_CAPABILITY_DIRECT_GMA_FLAG | AMDGPU_CAPABILITY_SSG_FLAG)) { - result = true; - break; - } - } - } - } - } - - if (result) { - alloc_ops_ = new amd::Monitor("DGMA mem alloc lock", true); - if (nullptr == alloc_ops_) { - return true; - } - } - - return result; -} - -void* ProDevice::AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const -{ - amd::ScopedLock l(alloc_ops_); - void* ptr = nullptr; - amdgpu_bo_handle buf_handle = 0; - amdgpu_bo_alloc_request req = {0}; - *host_ptr = nullptr; - - req.alloc_size = size; - req.phys_alignment = 64 * Ki; - req.preferred_heap = AMDGPU_GEM_DOMAIN_DGMA; - - // Allocate buffer in DGMA heap - if (0 == Funcs().AmdgpuBoAlloc(dev_handle_, &req, &buf_handle)) { - amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd; - uint32_t shared_handle = 0; - // Find the base driver handle - if (0 == Funcs().AmdgpuBoExport(buf_handle, type, &shared_handle)) { - uint32_t flags = 0; - size_t buf_size = 0; - // Map memory object to HSA device - if (0 == hsa_amd_interop_map_buffer(1, &agent, shared_handle, - flags, &buf_size, &ptr, nullptr, nullptr)) { - // Ask GPUPro driver to provide CPU access to allocation - if (0 == Funcs().AmdgpuBoCpuMap(buf_handle, host_ptr)) { - allocs_.insert({ptr, {buf_handle, shared_handle}}); - } - else { - hsa_amd_interop_unmap_buffer(ptr); - close(shared_handle); - Funcs().AmdgpuBoFree(buf_handle); - } - } - else { - close(shared_handle); - Funcs().AmdgpuBoFree(buf_handle); - } - } 
- else { - Funcs().AmdgpuBoFree(buf_handle); - } - } - - return ptr; -} - -void ProDevice::FreeDmaBuffer(void* ptr) const -{ - amd::ScopedLock l(alloc_ops_); - auto it = allocs_.find(ptr); - if (it != allocs_.end()) { - Funcs().AmdgpuBoCpuUnmap(it->second.first); - // Unmap memory from HSA device - hsa_amd_interop_unmap_buffer(ptr); - // Close shared handle - close(it->second.second); - int error = Funcs().AmdgpuBoFree(it->second.first); - allocs_.erase(it); - } -} - -} - -#endif // WITHOUT_HSA_BACKEND - diff --git a/device/rocm/pro/prodevice.hpp b/device/rocm/pro/prodevice.hpp deleted file mode 100644 index 80ff3600..00000000 --- a/device/rocm/pro/prodevice.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -#include "profuncs.hpp" -#include "prodriver.hpp" -#include "thread/monitor.hpp" -#include - -/*! 
\addtogroup HSA - * @{ - */ - -//! HSA Device Implementation -namespace roc { - -class ProDevice : public IProDevice { -public: - static bool DrmInit(); - - ProDevice() - : file_desc_(0) - , major_ver_(0) - , minor_ver_(0) - , dev_handle_(nullptr) - , alloc_ops_(nullptr) {} - virtual ~ProDevice() override; - - bool Create(uint32_t bus, uint32_t device, uint32_t func); - - virtual void* AllocDmaBuffer( - hsa_agent_t agent, size_t size, void** host_ptr) const override; - virtual void FreeDmaBuffer(void* ptr) const override; - virtual void GetAsicIdAndRevisionId(uint32_t* asic_id, uint32_t* rev_id) const override - { - *asic_id = gpu_info_.asic_id; - *rev_id = gpu_info_.pci_rev_id; - } - -private: - static void* lib_drm_handle_; - static bool initialized_; - static drm::Funcs funcs_; - const drm::Funcs& Funcs() const { return funcs_; } - - int32_t file_desc_; //!< File descriptor for the device - uint32_t major_ver_; //!< Major driver version - uint32_t minor_ver_; //!< Minor driver version - amdgpu_device_handle dev_handle_; //!< AMD gpu device handle - amdgpu_gpu_info gpu_info_; //!< GPU info structure - amdgpu_heap_info heap_info_; //!< Information about memory - mutable std::unordered_map> allocs_; //!< Alloced memory mapping - amd::Monitor* alloc_ops_; //!< Serializes memory allocations/destructions -}; - -} // namespace roc - -/** - * @} - */ -#endif /*WITHOUT_HSA_BACKEND*/ diff --git a/device/rocm/pro/prodriver.hpp b/device/rocm/pro/prodriver.hpp deleted file mode 100644 index 819ade27..00000000 --- a/device/rocm/pro/prodriver.hpp +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. 
- - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -#include "top.hpp" -#include "hsa/hsa.h" - -/*! \addtogroup HSA - * @{ - */ - -namespace roc { - -//! Pro Device Interface -class IProDevice : public amd::HeapObject { -public: - static IProDevice* Init(uint32_t bus, uint32_t device, uint32_t func); - - virtual void* AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const = 0; - virtual void FreeDmaBuffer(void* ptr) const = 0; - virtual void GetAsicIdAndRevisionId(uint32_t* asic_id, uint32_t* rev_id) const = 0; - - IProDevice() {} - virtual ~IProDevice() {} -}; - -} // namespace roc - -/** - * @} - */ -#endif /*WITHOUT_HSA_BACKEND*/ diff --git a/device/rocm/pro/profuncs.hpp b/device/rocm/pro/profuncs.hpp deleted file mode 100644 index e878df0c..00000000 --- a/device/rocm/pro/profuncs.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. 
- - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
*/ - -#pragma once - -namespace roc -{ -namespace drm -{ -typedef int (*DrmGetDevices)( - drmDevicePtr* pDevices, - int maxDevices); - -typedef int (*AmdgpuDeviceInitialize)( - int fd, - uint32_t* pMajorVersion, - uint32_t* pMinorVersion, - amdgpu_device_handle* pDeviceHandle); - -typedef int (*AmdgpuDeviceDeinitialize)( - amdgpu_device_handle hDevice); - -typedef int (*AmdgpuQueryGpuInfo)( - amdgpu_device_handle hDevice, - struct amdgpu_gpu_info* pInfo); - -typedef int (*AmdgpuQueryInfo)( - amdgpu_device_handle hDevice, - unsigned infoId, - unsigned size, - void* pValue); - -typedef int (*AmdgpuBoAlloc)( - amdgpu_device_handle hDevice, - struct amdgpu_bo_alloc_request* pAllocBuffer, - amdgpu_bo_handle* pBufferHandle); - -typedef int (*AmdgpuBoExport)( - amdgpu_bo_handle hBuffer, - enum amdgpu_bo_handle_type type, - uint32_t* pFd); - -typedef int (*AmdgpuBoFree)( - amdgpu_bo_handle hBuffer); - -typedef int (*AmdgpuBoCpuMap)( - amdgpu_bo_handle hBuffer, - void** ppCpuAddress); - -typedef int (*AmdgpuBoCpuUnmap)( - amdgpu_bo_handle hBuffer); - -struct Funcs -{ - DrmGetDevices DrmGetDevices; - AmdgpuDeviceInitialize AmdgpuDeviceInitialize; - AmdgpuDeviceDeinitialize AmdgpuDeviceDeinitialize; - AmdgpuQueryGpuInfo AmdgpuQueryGpuInfo; - AmdgpuQueryInfo AmdgpuQueryInfo; - AmdgpuBoAlloc AmdgpuBoAlloc; - AmdgpuBoExport AmdgpuBoExport; - AmdgpuBoFree AmdgpuBoFree; - AmdgpuBoCpuMap AmdgpuBoCpuMap; - AmdgpuBoCpuUnmap AmdgpuBoCpuUnmap; -}; - -} //namespace drm -} //namespace roc diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 961227bb..e080cc17 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -40,9 +40,6 @@ #include "device/rocm/rocmemory.hpp" #include "device/rocm/rocglinterop.hpp" #include "device/rocm/rocsignal.hpp" -#ifdef WITH_AMDGPU_PRO -#include "pro/prodriver.hpp" -#endif #include "platform/sampler.hpp" #if defined(__clang__) @@ -163,8 +160,6 @@ Device::Device(hsa_agent_t bkendDevice) , xferQueue_(nullptr) , 
xferRead_(nullptr) , xferWrite_(nullptr) - , pro_device_(nullptr) - , pro_ena_(false) , freeMem_(0) , vgpusAccess_("Virtual GPU List Ops Lock", true) , hsa_exclusive_gpu_access_(false) @@ -218,9 +213,6 @@ void Device::checkAtomicSupport() { } Device::~Device() { -#ifdef WITH_AMDGPU_PRO - delete pro_device_; -#endif // Release cached map targets for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) { if ((*mapCache_)[i] != nullptr) { @@ -687,18 +679,6 @@ bool Device::create() { } info_.pciDomainID = pci_domain_id; -#ifdef WITH_AMDGPU_PRO - // Create amdgpu-pro device interface for SSG support - pro_device_ = IProDevice::Init( - info_.deviceTopology_.pcie.bus, - info_.deviceTopology_.pcie.device, - info_.deviceTopology_.pcie.function); - if (pro_device_ != nullptr) { - pro_ena_ = true; - pro_device_->GetAsicIdAndRevisionId(&info_.pcieDeviceId_, &info_.pcieRevisionId_); - } -#endif - // Get Agent HDP Flush Register Memory hsa_amd_hdp_flush_t hdpInfo; if (HSA_STATUS_SUCCESS != diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index b3da3783..3e254ff9 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -75,7 +75,6 @@ class Memory; class Resource; class VirtualDevice; class PrintfDbg; -class IProDevice; class ProfilingSignal : public amd::ReferenceCountedObject { public: @@ -478,10 +477,6 @@ class Device : public NullDevice { //! 
Create internal blit program bool createBlitProgram(); - // Returns AMD GPU Pro interfacs - const IProDevice& iPro() const { return *pro_device_; } - bool ProEna() const { return pro_ena_; } - // P2P agents avaialble for this device const std::vector& p2pAgents() const { return p2p_agents_; } @@ -598,8 +593,6 @@ class Device : public NullDevice { XferBuffers* xferRead_; //!< Transfer buffers read XferBuffers* xferWrite_; //!< Transfer buffers write - const IProDevice* pro_device_; //!< AMDGPUPro device - bool pro_ena_; //!< Extra functionality with AMDGPUPro device, beyond ROCr std::atomic freeMem_; //!< Total of free memory available mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode diff --git a/device/rocm/rocmemory.cpp b/device/rocm/rocmemory.cpp index fbe6bd01..0eebbb55 100644 --- a/device/rocm/rocmemory.cpp +++ b/device/rocm/rocmemory.cpp @@ -37,9 +37,6 @@ #include "platform/sampler.hpp" #include "amdocl/cl_gl_amd.hpp" #include "amdocl/cl_vk_amd.hpp" -#ifdef WITH_AMDGPU_PRO -#include "pro/prodriver.hpp" -#endif namespace roc { @@ -673,12 +670,6 @@ void Buffer::destroy() { return; } -#ifdef WITH_AMDGPU_PRO - if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) { - dev().iPro().FreeDmaBuffer(deviceMemory_); - return; - } -#endif if (deviceMemory_ != nullptr) { if (deviceMemory_ != owner()->getHostMem()) { // if they are identical, the host pointer will be From 590d947e6d3d502bd14fc6155452191856e0a013 Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Wed, 18 Jan 2023 18:53:06 +0000 Subject: [PATCH 07/45] SWDEV-378157 - Correct log message Change-Id: I6297693f67ae78a8874b976ac03353a81b728b1d --- device/rocm/rocblit.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/device/rocm/rocblit.cpp b/device/rocm/rocblit.cpp index f6b447ca..2f664b43 100644 --- a/device/rocm/rocblit.cpp +++ 
b/device/rocm/rocblit.cpp @@ -457,7 +457,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d // Copy memory line by line ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy Rect wait_event=0x%zx, completion_signal=0x%zx", + "HSA Async Copy Rect wait_event=0x%zx, completion_signal=0x%zx", (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset, &srcMem, &offset, &dim, agent, direction, wait_events.size(), wait_events.data(), active); @@ -478,7 +478,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d // Copy memory line by line ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy wait_event=0x%zx, completion_signal=0x%zx", + "HSA Async Copy wait_event=0x%zx, completion_signal=0x%zx", (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); hsa_status_t status = hsa_amd_memory_async_copy( (reinterpret_cast
(dst) + dstOffset), dstAgent, @@ -668,9 +668,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, // Use SDMA to transfer the data ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy dst=0x%zx, src=0x%zx, size=%d, wait_event=0x%zx, " - "completion_signal=0x%zx", - dst, src, (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); + "HSA Async Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, completion_signal=0x%zx", + dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], wait_events.size(), wait_events.data(), active); From 7760b95934b9b36b617fd6dd54aff8f2740d8e19 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Tue, 24 Jan 2023 03:00:08 -0500 Subject: [PATCH 08/45] SWDEV-2 - Change OpenCL version number from 3560 to 3561 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index d724d3ca..a0734464 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3560 +#define AMD_PLATFORM_BUILD_NUMBER 3561 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 5ffa1d9bd10667b88ed2360e05a155a494684880 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Mon, 23 Jan 2023 17:40:30 -0500 Subject: [PATCH 09/45] SWDEV-372790 - Copy AQL packet from runtime setup Scheduler in device queue requires relaunching itself. Make sure scheduler uses exactly the same AQL packet as the host launch. 
Change-Id: I4eb03c4c91bf2408a6d4607731f081a2e2c2c8ae --- device/rocm/rocblit.cpp | 24 +++--------------------- device/rocm/rocvirtual.cpp | 15 +++++++++++++-- device/rocm/rocvirtual.hpp | 3 ++- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/device/rocm/rocblit.cpp b/device/rocm/rocblit.cpp index 2f664b43..a7a3efd2 100644 --- a/device/rocm/rocblit.cpp +++ b/device/rocm/rocblit.cpp @@ -2181,7 +2181,7 @@ bool KernelBlitManager::fillBuffer3D(device::Memory& memory, const void* pattern // ================================================================================================ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, - const amd::Coord3D& sizeIn, bool entire, + const amd::Coord3D& sizeIn, bool entire, amd::CopyMetadata copyMetadata) const { amd::ScopedLock k(lockXferOps_); bool result = false; @@ -2681,31 +2681,13 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, Memory* schedulerMem = dev().getRocMemory(schedulerParam); sp->kernarg_address = reinterpret_cast(schedulerMem->getDeviceMemory()); - - sp->hidden_global_offset_x = 0; - sp->hidden_global_offset_y = 0; - sp->hidden_global_offset_z = 0; sp->thread_counter = 0; sp->child_queue = reinterpret_cast(schedulerQueue); sp->complete_signal = schedulerSignal; hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne); - sp->scheduler_aql.header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - sp->scheduler_aql.setup = 1; - sp->scheduler_aql.workgroup_size_x = 1; - sp->scheduler_aql.workgroup_size_y = 1; - sp->scheduler_aql.workgroup_size_z = 1; - sp->scheduler_aql.grid_size_x = threads; - sp->scheduler_aql.grid_size_y = 1; - sp->scheduler_aql.grid_size_z = 1; - 
sp->scheduler_aql.kernel_object = gpuKernel.KernelCodeHandle(); - sp->scheduler_aql.kernarg_address = (void*)sp->kernarg_address; - sp->scheduler_aql.private_segment_size = 0; - sp->scheduler_aql.group_segment_size = 0; + sp->vqueue_header = vqVM; sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam); @@ -2720,7 +2702,7 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, address parameters = captureArguments(kernels_[Scheduler]); if (!gpu().submitKernelInternal(ndrange, *kernels_[Scheduler], - parameters, nullptr)) { + parameters, nullptr, 0, nullptr, &sp->scheduler_aql)) { return false; } releaseArguments(parameters); diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 7747d147..0976b183 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -1817,7 +1817,7 @@ bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memo realSize.c[0] *= elemSize; } - result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, realSrcOrigin, realDstOrigin, + result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, realSrcOrigin, realDstOrigin, realSize, entire, copyMetadata); break; } @@ -2781,7 +2781,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) // ================================================================================================ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, void* eventHandle, - uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) { + uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd, + hsa_kernel_dispatch_packet_t* aql_packet) { device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); @@ -3108,6 +3109,16 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchPacket.reserved2 = 
vcmd->profilingInfo().correlation_id_; } + // Copy scheduler's AQL packet for possible relaunch from the scheduler itself + if (aql_packet != nullptr) { + *aql_packet = dispatchPacket; + aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + } + // Dispatch the packet if (!dispatchAqlPacket( &dispatchPacket, aqlHeaderWithOrder, diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index bf0bb4ce..74d624a1 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -312,7 +312,8 @@ class VirtualGPU : public device::VirtualDevice { const_address parameters, //!< Parameters for the kernel void* event_handle, //!< Handle to OCL event for debugging uint32_t sharedMemBytes = 0, //!< Shared memory size - amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command + amd::NDRangeKernelCommand* vcmd = nullptr, //!< Original launch command + hsa_kernel_dispatch_packet_t* aql_packet = nullptr //!< Scheduler launch ); void submitNativeFn(amd::NativeFnCommand& cmd); void submitMarker(amd::Marker& cmd); From e3caba84b84b109092cf9200698f40df40e09a22 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Wed, 25 Jan 2023 03:00:05 -0500 Subject: [PATCH 10/45] SWDEV-2 - Change OpenCL version number from 3561 to 3562 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index a0734464..d34c5ce3 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3561 +#define AMD_PLATFORM_BUILD_NUMBER 3562 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 
cdd151dcd4b62d0d13a1273c669f005bd89be7fe Mon Sep 17 00:00:00 2001 From: Jiabao Xie Date: Wed, 25 Jan 2023 15:54:46 -0500 Subject: [PATCH 11/45] SWDEV-366886 - Revert "SWDEV-366886 - force svm alloc for rocm" This reverts commit 275f4ddd209ecb39baedc7e127184c728479b510. Reason for revert: performance drop in Quicksilver app Change-Id: I2bdf42ad0a235a74e2bb4d38f86471e9affbd7a6 --- device/rocm/rocdevice.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index e080cc17..85aa32a6 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -2311,7 +2311,6 @@ bool Device::IpcDetach (void* dev_ptr) const { // ================================================================================================ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { - constexpr bool kForceAllocation = true; amd::Memory* mem = nullptr; if (nullptr == svmPtr) { @@ -2323,7 +2322,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_ return nullptr; } - if (!mem->create(nullptr, false, false, kForceAllocation)) { + if (!mem->create(nullptr)) { LogError("failed to create a svm hidden buffer!"); mem->release(); return nullptr; From 6eeda49e68a6507309bb94d84e871e05119a1477 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Fri, 27 Jan 2023 03:00:07 -0500 Subject: [PATCH 12/45] SWDEV-2 - Change OpenCL version number from 3562 to 3563 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index d34c5ce3..0ddc862d 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3562 +#define AMD_PLATFORM_BUILD_NUMBER 3563 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 
a4b3ea81f3bc9f729aa99e38ff8aedc6bbf732a5 Mon Sep 17 00:00:00 2001 From: Sourabh Betigeri Date: Thu, 26 Jan 2023 16:22:03 -0800 Subject: [PATCH 13/45] SWDEV-340649 - Removes calls to commitMemory in hmm path Change-Id: I8d381b4c3f5cf95628487e0d10ae643443c9709d --- device/rocm/rocmemory.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/device/rocm/rocmemory.cpp b/device/rocm/rocmemory.cpp index 0eebbb55..8aacd09d 100644 --- a/device/rocm/rocmemory.cpp +++ b/device/rocm/rocmemory.cpp @@ -643,8 +643,9 @@ void Buffer::destroy() { if (memFlags & CL_MEM_ALLOC_HOST_PTR) { if (dev().info().hmmSupported_) { // AMD HMM path. Destroy system memory - amd::Os::uncommitMemory(deviceMemory_, size()); - amd::Os::releaseMemory(deviceMemory_, size()); + if (!(amd::Os::releaseMemory(deviceMemory_, size()))) { + ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "[ROCClr] munmap failed \n"); + } } else { dev().hostFree(deviceMemory_, size()); } @@ -746,7 +747,6 @@ bool Buffer::create(bool alloc_local) { if (deviceMemory_ == NULL) { return false; } - amd::Os::commitMemory(deviceMemory_, size(), amd::Os::MEM_PROT_RW); // Currently HMM requires cirtain initial calls to mark sysmem allocation as // GPU accessible or prefetch memory into GPU if (!dev().SvmAllocInit(deviceMemory_, size())) { From 9c146a280777e02fc1bd34af4d15d3c973402288 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Tue, 31 Jan 2023 03:00:08 -0500 Subject: [PATCH 14/45] SWDEV-2 - Change OpenCL version number from 3563 to 3564 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index 0ddc862d..52981be4 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3563 +#define AMD_PLATFORM_BUILD_NUMBER 3564 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 54d9026fb5c1c05c5921c6b35263ebbb122cd312 
Mon Sep 17 00:00:00 2001 From: German Date: Mon, 30 Jan 2023 13:11:14 -0500 Subject: [PATCH 15/45] SWDEV-352197 - Destroy virtual device in thread destructor Windows kills threads on exit without any notification. However, runtime can still destroy VirtualGPU object from the host thread with HostQueue destruction. This change also forces RGP trace transfer on the last capture without any delays. Change-Id: I768e87e99e1d23a021e63c12f36e450817743759 --- device/pal/palgpuopen.cpp | 15 +++++++++++++++ device/pal/palvirtual.cpp | 8 ++++---- platform/commandqueue.cpp | 1 - platform/commandqueue.hpp | 14 +++++++++----- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/device/pal/palgpuopen.cpp b/device/pal/palgpuopen.cpp index 7c6357e9..c5d5c094 100644 --- a/device/pal/palgpuopen.cpp +++ b/device/pal/palgpuopen.cpp @@ -278,6 +278,21 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) { // continue until we find the right queue... } else if (Pal::Result::Success == res) { trace_.sqtt_disp_count_ = 0; + // Stop the trace and save the result. Currently runtime can't delay upload in HIP, + // because default stream doesn't have explicit destruction and + // OS kills all threads on exit without any notification. That includes PAL RGP threads. + { + if (trace_.status_ == TraceStatus::WaitingForSqtt) { + auto result = EndRGPTrace(gpu); + } + // Check if runtime is waiting for the final trace results + if (trace_.status_ == TraceStatus::WaitingForResults) { + // If results are ready, then finish the trace + if (CheckForTraceResults() == Pal::Result::Success) { + FinishRGPTrace(gpu, false); + } + } + } } else { FinishRGPTrace(gpu, true); } diff --git a/device/pal/palvirtual.cpp b/device/pal/palvirtual.cpp index e7465c8b..624379d1 100644 --- a/device/pal/palvirtual.cpp +++ b/device/pal/palvirtual.cpp @@ -1072,15 +1072,15 @@ bool VirtualGPU::allocHsaQueueMem() { } VirtualGPU::~VirtualGPU() { + // Not safe to remove a queue. 
So lock the device + amd::ScopedLock k(dev().lockAsyncOps()); + amd::ScopedLock lock(dev().vgpusAccess()); + // Destroy RGP trace if (rgpCaptureEna()) { dev().rgpCaptureMgr()->FinishRGPTrace(this, true); } - // Not safe to remove a queue. So lock the device - amd::ScopedLock k(dev().lockAsyncOps()); - amd::ScopedLock lock(dev().vgpusAccess()); - while (!freeCbQueue_.empty()) { auto cb = freeCbQueue_.front(); delete cb; diff --git a/platform/commandqueue.cpp b/platform/commandqueue.cpp index e9e56cb8..90573b4c 100644 --- a/platform/commandqueue.cpp +++ b/platform/commandqueue.cpp @@ -67,7 +67,6 @@ bool HostQueue::terminate() { marker->release(); } thread_.acceptingCommands_ = false; - thread_.Release(); } else { if (Os::isThreadAlive(thread_)) { Command* marker = nullptr; diff --git a/platform/commandqueue.hpp b/platform/commandqueue.hpp index 863f8eca..ea702a5e 100644 --- a/platform/commandqueue.hpp +++ b/platform/commandqueue.hpp @@ -162,15 +162,21 @@ class HostQueue : public CommandQueue { Thread() : amd::Thread("Command Queue Thread", CQ_THREAD_STACK_SIZE, !AMD_DIRECT_DISPATCH), acceptingCommands_(false), - virtualDevice_(NULL) {} + virtualDevice_(nullptr) {} + + virtual ~Thread() { + if (virtualDevice_ != nullptr) { + delete virtualDevice_; + virtualDevice_ = nullptr; + } + } //! The command queue thread entry point. void run(void* data) { HostQueue* queue = static_cast(data); virtualDevice_ = queue->device().createVirtualDevice(queue); - if (virtualDevice_ != NULL) { + if (virtualDevice_ != nullptr) { queue->loop(virtualDevice_); - Release(); } else { acceptingCommands_ = false; queue->flush(); @@ -184,8 +190,6 @@ class HostQueue : public CommandQueue { } } - void Release() const { delete virtualDevice_; } - //! 
Get virtual device for the current thread device::VirtualDevice* vdev() const { return virtualDevice_; } From a3f8e43c829bc1ce46302add1a4f94e6a5e59619 Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Tue, 31 Jan 2023 17:46:15 +0000 Subject: [PATCH 16/45] SWDEV-380258 - Fix invalid iterator after erase Fix rare segfault due to invalid iterator after erase Change-Id: Id5b54d6cf10075deff0d613fec12af249c6c55a3 --- device/rocm/rocdevice.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 85aa32a6..01c038c9 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -236,17 +236,17 @@ Device::~Device() { } for (auto& it : queuePool_) { - for (auto& qIter : it) { - hsa_queue_t* queue = qIter.first; - auto& qInfo = qIter.second; + for (auto qIter = it.begin(); qIter != it.end(); ) { + hsa_queue_t* queue = qIter->first; + auto& qInfo = qIter->second; if (qInfo.hostcallBuffer_) { ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p", - qInfo.hostcallBuffer_, qIter.first); + qInfo.hostcallBuffer_, qIter->first); disableHostcalls(qInfo.hostcallBuffer_); context().svmFree(qInfo.hostcallBuffer_); } ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue); - it.erase(queue); + qIter = it.erase(qIter); hsa_queue_destroy(queue); } } From 3286c59a4f1f8593775c29fba0902f218058a849 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Wed, 1 Feb 2023 03:00:07 -0500 Subject: [PATCH 17/45] SWDEV-2 - Change OpenCL version number from 3564 to 3565 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index 52981be4..d166183f 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3564 +#define AMD_PLATFORM_BUILD_NUMBER 3565 #endif 
// AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 95063d365feec040247aad5f1843d72d93bddc3f Mon Sep 17 00:00:00 2001 From: German Date: Tue, 31 Jan 2023 15:48:12 -0500 Subject: [PATCH 18/45] SWDEV-352197 - Use static VM ID for capture Use static VM ID if available for capture. Add a delay in RGP trace finish to help RGP server to complete file operations. Change-Id: Id0dab2fe4398ac2df79f25608fdd06b1f84e7fe0 --- device/pal/palgpuopen.cpp | 30 +++++++++++++++++++++++------- device/pal/palgpuopen.hpp | 7 ++++--- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/device/pal/palgpuopen.cpp b/device/pal/palgpuopen.cpp index c5d5c094..e5f7dd2e 100644 --- a/device/pal/palgpuopen.cpp +++ b/device/pal/palgpuopen.cpp @@ -54,9 +54,7 @@ RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device) se_mask_(0), perf_counter_mem_limit_(0), perf_counter_frequency_(0), - trace_enabled_(false), - inst_tracing_enabled_(false), - perf_counters_enabled_(false) { + value_(0) { memset(&trace_, 0, sizeof(trace_)); } @@ -176,6 +174,8 @@ bool RgpCaptureMgr::Update(Pal::IPlatform* platform) { PostDeviceCreate(); } + static_vm_id_ = device_.properties().gfxipProperties.flags.supportStaticVmid; + return result; } @@ -189,12 +189,12 @@ bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, // Get the OS context handle for this queue (this is a thing that RGP needs on DX clients; // it may be optional for Vulkan, but we provide it anyway if available). Pal::KernelContextInfo kernelContextInfo = {}; - Pal::Result palResult = iQueue->QueryKernelContextInfo(&kernelContextInfo); // Ensure we've acquired the debug VMID (note that some platforms do not // implement this function, so don't fail the whole trace if so) *debug_vmid = kernelContextInfo.flags.hasDebugVmid; + assert((static_vm_id_ || *debug_vmid) && "Can't capture multiple queues!"); // Register the queue with the GPA session class for timed queue operation support. 
if (trace_.gpa_session_->RegisterTimedQueue( @@ -532,11 +532,17 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) { } } - // Notify the RGP server that we are starting a trace - if (rgp_server_->BeginTrace() != DevDriver::Result::Success) { - result = Pal::Result::ErrorUnknown; + if (static_vm_id_) { + result = device_.iDev()->SetStaticVmidMode(true); + assert(result == Pal::Result::Success && "Static VM ID setup failed!"); } + if (result == Pal::Result::Success) { + // Notify the RGP server that we are starting a trace + if (rgp_server_->BeginTrace() != DevDriver::Result::Success) { + result = Pal::Result::ErrorUnknown; + } + } // Tell the GPA session class we're starting a trace if (result == Pal::Result::Success) { GpuUtil::GpaSessionBeginInfo info = {}; @@ -722,6 +728,7 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) { return; } + auto disp_count = trace_.sqtt_disp_count_; // Finish the trace if the queue was destroyed before OCL reached // the number of captured dispatches if (trace_.sqtt_disp_count_ != 0) { @@ -751,9 +758,18 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) { } else { rgp_server_->EndTrace(); } + + if (static_vm_id_) { + auto result = device_.iDev()->SetStaticVmidMode(false); + assert(result == Pal::Result::Success && "Static VM ID setup failed!"); + } + if (trace_.gpa_session_ != nullptr) { trace_.gpa_session_->Reset(); } + // If application exits, then Windows kills all threads and + // RGP can't finish data write into a file. 
+ amd::Os::sleep(10 * disp_count + 500); // Reset tracing state to idle trace_.prepared_disp_count_ = 0; trace_.sqtt_disp_count_ = 0; diff --git a/device/pal/palgpuopen.hpp b/device/pal/palgpuopen.hpp index ddae799b..33e9d35d 100644 --- a/device/pal/palgpuopen.hpp +++ b/device/pal/palgpuopen.hpp @@ -410,9 +410,10 @@ class RgpCaptureMgr { union { struct { - uint32_t trace_enabled_ : 1; // True if tracing is currently enabled (master flag) - uint32_t inst_tracing_enabled_; // Enable instruction-level SQTT tokens - uint32_t perf_counters_enabled_; // True if perf counters are enabled + uint32_t trace_enabled_: 1; // True if tracing is currently enabled (master flag) + uint32_t inst_tracing_enabled_: 1; // Enable instruction-level SQTT tokens + uint32_t perf_counters_enabled_: 1; // True if perf counters are enabled + uint32_t static_vm_id_: 1; // Static VM ID can be used for capture }; uint32_t value_; }; From 39ad907af1d0cc5ebda80ba83896778824bf6ff2 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Thu, 2 Feb 2023 03:00:07 -0500 Subject: [PATCH 19/45] SWDEV-2 - Change OpenCL version number from 3565 to 3566 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index d166183f..2ea8cdae 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3565 +#define AMD_PLATFORM_BUILD_NUMBER 3566 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 09be2b7049516e026a8c63c0df332d79a59dbce6 Mon Sep 17 00:00:00 2001 From: Ranjith Ramakrishnan Date: Tue, 31 Jan 2023 15:52:40 -0800 Subject: [PATCH 20/45] SWDEV-366831 - Correct the include path for new directory layout Change-Id: I7c826ed9c2f2d839d350e5150f578830655616f0 --- device/rocm/rocurilocator.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/device/rocm/rocurilocator.hpp 
b/device/rocm/rocurilocator.hpp index 880b6c72..76451087 100644 --- a/device/rocm/rocurilocator.hpp +++ b/device/rocm/rocurilocator.hpp @@ -22,7 +22,7 @@ #if defined(__clang__) #if __has_feature(address_sanitizer) #include "device/devurilocator.hpp" -#include "hsa_ven_amd_loader.h" +#include "hsa/hsa_ven_amd_loader.h" #include namespace roc { From 23ae2b64683d53753dd79a4c46944d0c4b9d7219 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Fri, 3 Feb 2023 03:00:10 -0500 Subject: [PATCH 21/45] SWDEV-2 - Change OpenCL version number from 3566 to 3567 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index 2ea8cdae..62bc75fe 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3566 +#define AMD_PLATFORM_BUILD_NUMBER 3567 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 21e50998dd2439708a6672dac5a04be1dd3da682 Mon Sep 17 00:00:00 2001 From: German Date: Fri, 3 Feb 2023 13:44:24 -0500 Subject: [PATCH 22/45] SWDEV-368235 - Remove obsolete env variables Change-Id: I7e14d53297e79e2f68b3a6cc40251ad7db9eb5ab --- device/device.cpp | 11 --- device/device.hpp | 3 +- device/devkernel.cpp | 144 ++++++++++++++++-------------------- device/pal/paldevice.cpp | 4 +- device/pal/palsettings.cpp | 39 +--------- device/pal/palsettings.hpp | 4 +- device/pal/palvirtual.cpp | 25 +------ device/rocm/rocdevice.cpp | 56 +++++++------- device/rocm/rocdevice.hpp | 2 +- device/rocm/rocsettings.cpp | 26 ------- device/rocm/rocsettings.hpp | 13 +--- utils/flags.hpp | 44 ----------- 12 files changed, 102 insertions(+), 269 deletions(-) diff --git a/device/device.cpp b/device/device.cpp index f87452db..288f84f6 100644 --- a/device/device.cpp +++ b/device/device.cpp @@ -798,17 +798,6 @@ Settings::Settings() : value_(0) { commandQueues_ = 200; //!< Field value set to maximum number 
//!< concurrent Virtual GPUs for default - overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0; - overrideLclSet |= - (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y)) - ? 2 - : 0; - overrideLclSet |= - (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z)) - ? 4 - : 0; - fenceScopeAgent_ = AMD_OPT_FLUSH; if (amd::IS_HIP) { if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) { diff --git a/device/device.hpp b/device/device.hpp index a493795e..ed206311 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -619,7 +619,6 @@ class Settings : public amd::HeapObject { uint64_t extensions_; //!< Supported OCL extensions union { struct { - uint overrideLclSet : 3; //!< Bit mask to override the local size uint apuSystem_ : 1; //!< Device is APU system with shared memory uint supportRA_ : 1; //!< Support RA channel order format uint waitCommand_ : 1; //!< Enables a wait for every submitted command @@ -639,7 +638,7 @@ class Settings : public amd::HeapObject { uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions - uint reserved_ : 11; + uint reserved_ : 14; }; uint value_; }; diff --git a/device/devkernel.cpp b/device/devkernel.cpp index e598dca7..42eb1857 100644 --- a/device/devkernel.cpp +++ b/device/devkernel.cpp @@ -677,100 +677,80 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, if (workGroupInfo()->compileSize_[0] == 0) { // Find the default local workgroup size, if it wasn't specified if (lclWorkSize[0] == 0) { - if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) { - // Find threads per group - size_t thrPerGrp = workGroupInfo()->size_; - - // Check if kernel uses images - if (flags_.imageEna_ && - // 
and thread group is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { - // Use 8x8 workgroup size if kernel has image writes - if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; - } - else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; - } + // Find threads per group + size_t thrPerGrp = workGroupInfo()->size_; + + // Check if kernel uses images + if (flags_.imageEna_ && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { + // Use 8x8 workgroup size if kernel has image writes + if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { + lclWorkSize[0] = 8; + lclWorkSize[1] = 8; } else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--) - ; - lclWorkSize[d] = div; - tmp /= div; - } + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; + } + } + else { + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way + for (uint d = 0; d < workDim; ++d) { + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--) + ; + lclWorkSize[d] = div; + tmp /= div; + } - // Assuming DWORD access - const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; + // Assuming DWORD access + const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; - // Check if we couldn't find optimal workload - if (((lclWorkSize.product() % 
workGroupInfo()->wavefrontSize_) != 0) || - // or size is too small for the cache line - (lclWorkSize[0] < cacheLineMatch)) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } + // Check if we couldn't find optimal workload + if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || + // or size is too small for the cache line + (lclWorkSize[0] < cacheLineMatch)) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; } - // Use X dimension as high priority. Runtime will assume that - // X dimension is more important for the address calculation - if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { - lclWorkSize[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 1; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } + } + // Use X dimension as high priority. 
Runtime will assume that + // X dimension is more important for the address calculation + if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { + lclWorkSize[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 1; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; } } - else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } + } + else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; } } } } } - else { - // Use overrides when app doesn't provide workgroup dimensions - if (workDim == 1) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; - } - else if (workDim == 2) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } - else if (workDim == 3) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; - lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } - else { - assert(0 && "Invalid workDim!"); - } - } } } else { diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 75cd3c3e..6c60ef05 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -291,7 +291,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve nullptr, nullptr, nullptr, - AMD_OCL_SC_LIB}; + nullptr}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); @@ -1013,7 +1013,7 @@ bool Device::create(Pal::IDevice* device) { nullptr, nullptr, nullptr, - AMD_OCL_SC_LIB}; + nullptr}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); 
diff --git a/device/pal/palsettings.cpp b/device/pal/palsettings.cpp index 1f4dc8e9..59ac82b8 100644 --- a/device/pal/palsettings.cpp +++ b/device/pal/palsettings.cpp @@ -76,8 +76,6 @@ Settings::Settings() { // Enable workload split by default (for 24 bit arithmetic or timeout) workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; - // By default use host blit - blitEngine_ = BlitEngineHost; pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi; pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE) ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; @@ -123,8 +121,6 @@ Settings::Settings() { //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS linearPersistentImage_ = false; - useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; - // Device enqueuing settings numDeviceEvents_ = 1024; numWaitEvents_ = 8; @@ -328,16 +324,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp, libSelector_ = amd::GPU_Library_CI; if (LP64_SWITCH(false, true)) { - oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ - ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) - : OpenCL12; - } - if (GPU_FORCE_OCL20_32BIT) { - force32BitOcl20_ = true; - oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ + oclVersion_ = !reportAsOCL12Device ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; } + if (OPENCL_VERSION < 200) { oclVersion_ = OpenCL12; } @@ -346,27 +337,13 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // Cap at OpenCL20 for now if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20; - // This needs to be cleaned once 64bit addressing is stable - if (oclVersion_ < OpenCL20) { - use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) - ? 
LP64_SWITCH(false, - /*calAttr.isWorkstation ||*/ true) - : GPU_FORCE_64BIT_PTR; - } else { - if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) { - use64BitPtr_ = true; - } - } + use64BitPtr_ = LP64_SWITCH(false, true); if (oclVersion_ >= OpenCL20) { supportDepthsRGB_ = true; } if (use64BitPtr_) { - if (GPU_ENABLE_LARGE_ALLOCATION) { - maxAllocSize_ = 64ULL * Gi; - } else { - maxAllocSize_ = 4048 * Mi; - } + maxAllocSize_ = 64ULL * Gi; } else { maxAllocSize_ = 3ULL * Gi; } @@ -447,9 +424,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp, imageSupport_ = true; - // Use kernels for blit if appropriate - blitEngine_ = BlitEngineKernel; - hostMemDirectAccess_ |= HostMemBuffer; // HW doesn't support untiled image writes // hostMemDirectAccess_ |= HostMemImage; @@ -542,11 +516,6 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } - // Override blit engine type - if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) { - blitEngine_ = GPU_BLIT_ENGINE_TYPE; - } - if (!flagIsDefault(DEBUG_GPU_FLAGS)) { debugFlags_ = DEBUG_GPU_FLAGS; } diff --git a/device/pal/palsettings.hpp b/device/pal/palsettings.hpp index 66984622..4bcc7e2e 100644 --- a/device/pal/palsettings.hpp +++ b/device/pal/palsettings.hpp @@ -70,7 +70,6 @@ class Settings : public device::Settings { uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features uint threadTraceEnable_ : 1; //!< Thread trace enable uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent - uint useSingleScratch_ : 1; //!< Allocates single scratch per device uint svmAtomics_ : 1; //!< SVM device atomics uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support uint useDeviceQueue_ : 1; //!< Submit to separate device queue @@ -82,7 +81,7 @@ class Settings : public device::Settings { uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 uint disableSdma_ : 1; //!< Disable SDMA support uint alwaysResident_ : 1; //!< Make resources resident at allocation time - uint 
reserved_ : 7; + uint reserved_ : 8; }; uint value_; }; @@ -95,7 +94,6 @@ class Settings : public device::Settings { uint workloadSplitSize_; //!< Workload split size uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms - uint blitEngine_; //!< Blit engine type uint cacheLineSize_; //!< Cache line size in bytes uint cacheSize_; //!< L1 cache size in bytes uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings diff --git a/device/pal/palvirtual.cpp b/device/pal/palvirtual.cpp index 624379d1..3a208171 100644 --- a/device/pal/palvirtual.cpp +++ b/device/pal/palvirtual.cpp @@ -905,11 +905,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, // because destructor calls eraseResourceList() even if create() failed dev().resizeResoureList(index()); - if (index() >= GPU_MAX_COMMAND_QUEUES) { - // Cap the maximum number of concurrent Virtual GPUs - return false; - } - // Virtual GPU will have profiling enabled state_.profiling_ = profiling; @@ -1020,18 +1015,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, return false; } - // Choose the appropriate class for blit engine - switch (dev().settings().blitEngine_) { - default: - // Fall through ... - case Settings::BlitEngineHost: - blitSetup.disableAll(); - // Fall through ... 
- case Settings::BlitEngineCAL: - case Settings::BlitEngineKernel: - blitMgr_ = new KernelBlitManager(*this, blitSetup); - break; - } + blitMgr_ = new KernelBlitManager(*this, blitSetup); if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) { LogError("Could not create BlitManager!"); return false; @@ -3269,11 +3253,8 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) { cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); uint64_t endTimeStampCPU = amd::Os::timeNanos(); - // Make sure the command batch has a valid GPU TS - if (!GPU_RAW_TIMESTAMP) { - // Adjust the base time by the execution time - readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; - } + // Adjust the base time by the execution time + readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; } } } diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 01c038c9..e63ab77e 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -894,37 +894,35 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo Device* dev = reinterpret_cast(data); switch (segment_type) { case HSA_REGION_SEGMENT_GLOBAL: { - if (dev->settings().enableLocalMemory_) { - uint32_t global_flag = 0; - hsa_status_t stat = - hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } + uint32_t global_flag = 0; + hsa_status_t stat = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } - if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { - dev->gpu_fine_grained_segment_ = pool; - } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { - dev->gpuvm_segment_ = pool; - - // If cpu agent cannot access this pool, the device does not support large bar. 
- hsa_amd_memory_pool_access_t tmp{}; - hsa_amd_agent_memory_pool_get_info( - dev->cpu_agent_, - pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &tmp); - - if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { - dev->info_.largeBar_ = false; - } else { - dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; - } + if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { + dev->gpu_fine_grained_segment_ = pool; + } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { + dev->gpuvm_segment_ = pool; + + // If cpu agent cannot access this pool, the device does not support large bar. + hsa_amd_memory_pool_access_t tmp{}; + hsa_amd_agent_memory_pool_get_info( + dev->cpu_agent_, + pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &tmp); + + if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + dev->info_.largeBar_ = false; + } else { + dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; } + } - if (dev->gpuvm_segment_.handle == 0) { - dev->gpuvm_segment_ = pool; - } + if (dev->gpuvm_segment_.handle == 0) { + dev->gpuvm_segment_ = pool; } break; } @@ -1232,7 +1230,7 @@ bool Device::populateOCLDeviceConstants() { info_.maxWorkItemDimensions_ = 3; - if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) { + if (gpuvm_segment_.handle != 0) { size_t global_segment_size = 0; if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE, diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index 3e254ff9..99d0976e 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -218,7 +218,7 @@ class NullDevice : public amd::Device { //! 
Determine if we can use device memory for SVM const bool forceFineGrain(amd::Memory* memory) const { - return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1); + return (memory->getContext().devices().size() > 1); } virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle) { diff --git a/device/rocm/rocsettings.cpp b/device/rocm/rocsettings.cpp index 71c341ad..24cb88dc 100644 --- a/device/rocm/rocsettings.cpp +++ b/device/rocm/rocsettings.cpp @@ -35,18 +35,9 @@ Settings::Settings() { // Set this to true when we drop the flag doublePrecision_ = ::CL_KHR_FP64; - enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE; - enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM; - maxWorkGroupSize_ = 1024; preferredWorkGroupSize_ = 256; - maxWorkGroupSize2DX_ = 16; - maxWorkGroupSize2DY_ = 16; - maxWorkGroupSize3DX_ = 4; - maxWorkGroupSize3DY_ = 4; - maxWorkGroupSize3DZ_ = 4; - kernargPoolSize_ = HSA_KERNARG_POOL_SIZE; // Determine if user is requesting Non-Coherent mode @@ -201,23 +192,6 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } - if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) { - maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X; - } - if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) { - maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } - - if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) { - maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X; - } - if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) { - maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y; - } - if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) { - maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } - if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; } diff --git a/device/rocm/rocsettings.hpp b/device/rocm/rocsettings.hpp index d2fffd73..5b5f81d7 100644 --- a/device/rocm/rocsettings.hpp +++ b/device/rocm/rocsettings.hpp @@ -42,8 +42,6 @@ class Settings : public device::Settings { union { struct { uint doublePrecision_ : 1; //!< 
Enables double precision support - uint enableLocalMemory_ : 1; //!< Enable GPUVM memory - uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory uint imageDMA_ : 1; //!< Enable direct image DMA transfers uint stagedXferRead_ : 1; //!< Uses a staged buffer read @@ -55,7 +53,7 @@ class Settings : public device::Settings { uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint coop_sync_ : 1; //!< grid and multi-grid sync for gfx940+ uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint reserved_ : 18; + uint reserved_ : 20; }; uint value_; }; @@ -66,15 +64,6 @@ class Settings : public device::Settings { //! Preferred workgroup size uint preferredWorkGroupSize_; - //! Default max workgroup sizes for 2D - int maxWorkGroupSize2DX_; - int maxWorkGroupSize2DY_; - - //! Default max workgroup sizes for 3D - int maxWorkGroupSize3DX_; - int maxWorkGroupSize3DY_; - int maxWorkGroupSize3DZ_; - uint kernargPoolSize_; uint numDeviceEvents_; //!< The number of device events uint numWaitEvents_; //!< The number of wait events for device enqueue diff --git a/utils/flags.hpp b/utils/flags.hpp index 186d059b..f563c6d3 100644 --- a/utils/flags.hpp +++ b/utils/flags.hpp @@ -30,22 +30,10 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \ "The mask to enable specific kinds of logs") \ debug(uint, DEBUG_GPU_FLAGS, 0, \ "The debug options for GPU device") \ -release(uint, GPU_MAX_COMMAND_QUEUES, 300, \ - "The maximum number of concurrent Virtual GPUs") \ release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! 
*/ \ "The default command queue thread stack size") \ release(int, GPU_MAX_WORKGROUP_SIZE, 0, \ "Maximum number of workitems in a workgroup for GPU, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \ - "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0, \ - "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \ debug(bool, CPU_MEMORY_GUARD_PAGES, false, \ "Use guard pages for CPU memory") \ debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \ @@ -70,12 +58,8 @@ release(uint, GPU_STAGING_BUFFER_SIZE, 4, \ "Size of the GPU staging buffer in MiB") \ release(bool, GPU_DUMP_BLIT_KERNELS, false, \ "Dump the kernels for blit manager") \ -release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \ - "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \ release(bool, GPU_FLUSH_ON_EXECUTION, false, \ "Submit commands to HW on every operation. 
0 - Disable, 1 - Enable") \ -release(bool, GPU_USE_SYNC_OBJECTS, true, \ - "If enabled, use sync objects instead of polling") \ release(bool, CL_KHR_FP64, true, \ "Enable/Disable support for double precision") \ release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \ @@ -86,12 +70,8 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0, \ "Set clLinkProgram()'s options (override)") \ release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \ "Append clLinkProgram()'s options") \ -release(cstring, AMD_OCL_SC_LIB, 0, \ - "Set shader compiler shared library name or path") \ debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \ "Specify binary substitution config file for OpenCL") \ -debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false, \ - "Enable the error dialog on Windows") \ release(size_t, GPU_PINNED_XFER_SIZE, 32, \ "The pinned buffer size for pinning in read/write transfers in MiB") \ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \ @@ -100,12 +80,6 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \ "The resource cache size in MB") \ release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \ "The maximum size accepted for suballocaitons in KB") \ -release(bool, GPU_FORCE_64BIT_PTR, 0, \ - "Forces 64 bit pointers on GPU") \ -release(bool, GPU_FORCE_OCL20_32BIT, 0, \ - "Forces 32 bit apps to take CLANG\HSAIL path") \ -release(bool, GPU_RAW_TIMESTAMP, 0, \ - "Reports GPU raw timestamps in GPU timeline") \ release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \ "Number of memory objects for dependency tracking") \ release(size_t, GPU_XFER_BUFFER_SIZE, 0, \ @@ -116,32 +90,20 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85, \ "Maximum size of a single allocation as percentage of total") \ release(uint, GPU_NUM_COMPUTE_RINGS, 2, \ "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \ -release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1, \ - "GPU select the compute rings ID -1 - disabled, 0 , 1,.. 
- the forced compute rings ID for submission") \ release(uint, GPU_WORKLOAD_SPLIT, 22, \ "Workload split size") \ -release(bool, GPU_USE_SINGLE_SCRATCH, false, \ - "Use single scratch buffer per device instead of per HW ring") \ release(bool, AMD_OCL_WAIT_COMMAND, false, \ "1 = Enable a wait for every submitted command") \ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \ "Prints the specified number of the child kernels") \ release(bool, GPU_USE_DEVICE_QUEUE, false, \ "Use a dedicated device queue for the actual submissions") \ -release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \ - "Enable >4GB single allocations") \ release(bool, AMD_THREAD_TRACE_ENABLE, true, \ "Enable thread trace extension") \ release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200), \ "Force GPU opencl verison") \ -release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \ - "Enable HSA device local memory usage") \ release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \ "Kernarg pool size") \ -release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true, \ - "Enable device memory for coarse grain SVM allocations") \ -release(bool, GPU_IFH_MODE, false, \ - "1 = Enable GPU IFH (infinitely fast hardware) mode. 
Any other value keeps setting disabled.") \ release(bool, GPU_MIPMAP, true, \ "Enables GPU mipmap extension") \ release(uint, GPU_ENABLE_PAL, 2, \ @@ -152,8 +114,6 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \ "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \ release(uint, OCL_SET_SVM_SIZE, 4*16384, \ "set SVM space size for discrete GPU") \ -debug(uint, OCL_SYSMEM_REQUIREMENT, 2, \ - "Use flag to change the minimum requirement of system memory not to downgrade") \ release(uint, GPU_WAVES_PER_SIMD, 0, \ "Force the number of waves per SIMD (1-10)") \ release(bool, GPU_WAVE_LIMIT_ENABLE, false, \ @@ -176,10 +136,6 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \ "File path prefix for dumping wave limiter output") \ release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \ "File path prefix for tracing wave limiter") \ -release(bool, OCL_CODE_CACHE_ENABLE, false, \ - "1 = Enable compiler code cache") \ -release(bool, OCL_CODE_CACHE_RESET, false, \ - "1 = Reset the compiler code cache storage") \ release(bool, PAL_DISABLE_SDMA, false, \ "1 = Disable SDMA for PAL") \ release(uint, PAL_RGP_DISP_COUNT, 10000, \ From a7db4ab080961c7c8896547587ef38f3080bd99f Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Mon, 30 Jan 2023 12:00:45 -0800 Subject: [PATCH 23/45] SWDEV-378648 - Adding checks in runtime to abort when bad/invalid payload is sent. 
Change-Id: Ibaa7927bacef1ba067ded5ccbf3f1111f31200a2 --- device/devhostcall.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/device/devhostcall.cpp b/device/devhostcall.cpp index 12e5fe90..1729e5eb 100644 --- a/device/devhostcall.cpp +++ b/device/devhostcall.cpp @@ -88,6 +88,7 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t* } return; case SERVICE_DEVMEM: { + guarantee(payload[0] != 0 || payload[1] != 0, "Both payloads cannot be 0 \n"); if (payload[0]) { amd::Memory* mem = amd::MemObjMap::FindMemObj(reinterpret_cast(payload[0])); if (mem) { From 3bed55f826b1f26b0c3ee2e0ea5673bf83ce8b44 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Tue, 7 Feb 2023 03:00:09 -0500 Subject: [PATCH 24/45] SWDEV-2 - Change OpenCL version number from 3567 to 3568 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index 62bc75fe..fc02163e 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3567 +#define AMD_PLATFORM_BUILD_NUMBER 3568 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From b1bd4632d7d0995c763c073ebe35210aaae69a52 Mon Sep 17 00:00:00 2001 From: jatang Date: Fri, 3 Feb 2023 17:42:01 -0500 Subject: [PATCH 25/45] SWDEV-380792 - Fix floating point exception when maxEngineClockFrequency_ is 0 Change-Id: Ic443ceae586c4c84995ed2abef9bd7f32f8b60f9 --- device/rocm/rocblit.cpp | 5 ++++- device/rocm/rocdevice.cpp | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/device/rocm/rocblit.cpp b/device/rocm/rocblit.cpp index a7a3efd2..1bd6a0ff 100644 --- a/device/rocm/rocblit.cpp +++ b/device/rocm/rocblit.cpp @@ -2691,7 +2691,10 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, sp->vqueue_header = vqVM; sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam); - sp->eng_clk = 
(1000 * 1024) / dev().info().maxEngineClockFrequency_; + + if (dev().info().maxEngineClockFrequency_ > 0) { + sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; + } // Use a device side global atomics to workaround the reliance of PCIe 3 atomics sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue); diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index e63ab77e..e352577d 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -1147,7 +1147,9 @@ bool Device::populateOCLDeviceConstants() { //TODO: add the assert statement for Raven if (!(isa().versionMajor() == 9 && isa().versionMinor() == 0 && isa().versionStepping() == 2)) { - assert(info_.maxEngineClockFrequency_ > 0); + if (info_.maxEngineClockFrequency_ <= 0) { + LogError("maxEngineClockFrequency_ is NOT positive!"); + } } if (HSA_STATUS_SUCCESS != From df0c903c86b5e67843ceb7434d620a8ced22f96e Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Wed, 8 Feb 2023 10:21:36 -0800 Subject: [PATCH 26/45] SWDEV-381633 - Better log Change-Id: Ie151d73aa4b1dae7d0d9acd0457e7dbdf35f21ee --- device/devprogram.cpp | 2 +- device/rocm/rocdevice.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/device/devprogram.cpp b/device/devprogram.cpp index 6e4287b5..bb40cce0 100644 --- a/device/devprogram.cpp +++ b/device/devprogram.cpp @@ -2962,7 +2962,7 @@ bool Program::runInitFiniKernel(kernel_kind_t kind) const { amd::HostQueue* queue = nullptr; for (const auto& i : kernels_) { - LogPrintfInfo("For Init/Fini: Kernel Name: %s", i.first.c_str()); + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "For Init/Fini: Kernel Name: %s", i.first.c_str()); const auto &kernel = i.second; if ((kernel->isInitKernel() && kind == kernel_kind_t::InitKernel) || (kernel->isFiniKernel() && kind == kernel_kind_t::FiniKernel)) { diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index e352577d..f9df3075 100644 --- a/device/rocm/rocdevice.cpp +++ 
b/device/rocm/rocdevice.cpp @@ -1622,7 +1622,7 @@ bool Device::populateOCLDeviceConstants() { LogError("HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS query failed."); } - LogPrintfInfo("HMM support: %d, xnack: %d, direct host access: %d\n", + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "HMM support: %d, xnack: %d, direct host access: %d\n", info_.hmmSupported_, info_.hmmCpuMemoryAccessible_, info_.hmmDirectHostAccess_); info_.globalCUMask_ = {}; From 2d5917b555272ea3dd0cf5728e128484c46b7871 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Thu, 9 Feb 2023 03:00:04 -0500 Subject: [PATCH 27/45] SWDEV-2 - Change OpenCL version number from 3568 to 3569 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index fc02163e..f03c7f4a 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3568 +#define AMD_PLATFORM_BUILD_NUMBER 3569 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From c5833a0ee114b01d16ebfeb1ba29c9e85cd9825c Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Wed, 8 Feb 2023 17:55:08 -0500 Subject: [PATCH 28/45] SWDEV-381633 - Use blit manager on app's queue Heap initialization used device queue, but it should be used for cooperative launches only. Heap initialization must use the same queue as the current dispatch.
Change-Id: I856621bf82bbdeb1c2d0fbc4970e90d09af805cb --- device/pal/paldevice.cpp | 6 +++--- device/pal/paldevice.hpp | 2 +- device/pal/palkernel.cpp | 2 +- device/rocm/rocdevice.cpp | 6 +++--- device/rocm/rocdevice.hpp | 2 +- device/rocm/rocvirtual.cpp | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 6c60ef05..5755f34a 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -2356,8 +2356,8 @@ void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const { } // ================================================================================================ -void Device::HiddenHeapAlloc() { - auto HeapAlloc = [this]() -> bool { +void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { + auto HeapAlloc = [this, &gpu]() -> bool { // Allocate initial heap for device memory allocator static constexpr size_t HeapBufferSize = 128 * Ki; heap_buffer_ = createMemory(HeapBufferSize); @@ -2369,7 +2369,7 @@ void Device::HiddenHeapAlloc() { LogError("Heap buffer allocation failed!"); return false; } - bool result = static_cast(xferMgr()).initHeap( + bool result = static_cast(gpu.blitMgr()).initHeap( heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi)); return result; diff --git a/device/pal/paldevice.hpp b/device/pal/paldevice.hpp index 5b3b4af5..0226ea40 100644 --- a/device/pal/paldevice.hpp +++ b/device/pal/paldevice.hpp @@ -629,7 +629,7 @@ class Device : public NullDevice { #endif #endif //! 
Allocates hidden heap for device memory allocations - void HiddenHeapAlloc(); + void HiddenHeapAlloc(const VirtualGPU& gpu); private: static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex, diff --git a/device/pal/palkernel.cpp b/device/pal/palkernel.cpp index e6e4fd28..f425e3d7 100644 --- a/device/pal/palkernel.cpp +++ b/device/pal/palkernel.cpp @@ -362,7 +362,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const case amd::KernelParameterDescriptor::HiddenHeap: // Allocate hidden heap for HIP applications only if ((amd::IS_HIP) && (palDevice().HeapBuffer() == nullptr)) { - const_cast(palDevice()).HiddenHeapAlloc(); + const_cast(palDevice()).HiddenHeapAlloc(gpu); } if (palDevice().HeapBuffer() != nullptr) { // Add heap pointer to the code diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index f9df3075..9ecaf22e 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -3243,8 +3243,8 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const { } // ================================================================================================ -void Device::HiddenHeapAlloc() { - auto HeapAllocZeroOut = [this]() -> bool { +void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { + auto HeapAllocZeroOut = [this, &gpu]() -> bool { // Allocate initial heap for device memory allocator static constexpr size_t HeapBufferSize = 128 * Ki; heap_buffer_ = createMemory(HeapBufferSize); @@ -3256,7 +3256,7 @@ void Device::HiddenHeapAlloc() { LogError("Heap buffer allocation failed!"); return false; } - bool result = static_cast(xferMgr()).initHeap( + bool result = static_cast(gpu.blitMgr()).initHeap( heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi)); return result; diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index 99d0976e..d9eb4f20 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ 
-547,7 +547,7 @@ class Device : public NullDevice { bool IsValidAllocation(const void* dev_ptr, size_t size) const; //! Allocates hidden heap for device memory allocations - void HiddenHeapAlloc(); + void HiddenHeapAlloc(const VirtualGPU& gpu); private: bool create(); diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 0976b183..3b88b564 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -2958,7 +2958,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, case amd::KernelParameterDescriptor::HiddenHeap: // Allocate hidden heap for HIP applications only if ((amd::IS_HIP) && (dev().HeapBuffer() == nullptr)) { - const_cast(dev()).HiddenHeapAlloc(); + const_cast(dev()).HiddenHeapAlloc(*this); } if (dev().HeapBuffer() != nullptr) { // Add heap pointer to the code From ea66c122995c3f040ed60e6302e915e3935a9e7e Mon Sep 17 00:00:00 2001 From: Jacob Lambert Date: Mon, 6 Feb 2023 19:10:53 -0500 Subject: [PATCH 29/45] SWDEV-376413 - Revert "SWDEV-376413 - Replace deprecated Comgr device-lib action" This reverts commit 2448d5d31cacd9facaadf37a6896b036f654afb1. Reason for revert: Test failures with Luxmark, blender, and Indigobench. 
Need to investigate before re-applying Change-Id: I6b08273a8f9c8bcaa4e7a06cd42d15048e52ca2a --- device/devprogram.cpp | 37 ++++++++++++++++++++++++++----------- device/devprogram.hpp | 6 +++--- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/device/devprogram.cpp b/device/devprogram.cpp index bb40cce0..f1a2cb2e 100644 --- a/device/devprogram.cpp +++ b/device/devprogram.cpp @@ -346,7 +346,7 @@ amd_comgr_status_t Program::createAction(const amd_comgr_language_t oclver, bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, const std::vector& options, amd::option::Options* amdOptions, amd_comgr_data_set_t* output, - char* binaryData[], size_t* binarySize) { + char* binaryData[], size_t* binarySize, const bool link_dev_libs) { amd_comgr_language_t langver; setLanguage(amdOptions->oVariables->CLStd, &langver); @@ -358,13 +358,28 @@ bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, // Create the action for linking amd_comgr_action_info_t action; + amd_comgr_data_set_t dataSetDevLibs; bool hasAction = false; + bool hasDataSetDevLibs = false; amd_comgr_status_t status = createAction(langver, options, &action, &hasAction); + if (link_dev_libs) { + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd::Comgr::create_data_set(&dataSetDevLibs); + } + + if (status == AMD_COMGR_STATUS_SUCCESS) { + hasDataSetDevLibs = true; + status = amd::Comgr::do_action(AMD_COMGR_ACTION_ADD_DEVICE_LIBRARIES, action, inputs, + dataSetDevLibs); + extractBuildLog(dataSetDevLibs); + } + } + if (status == AMD_COMGR_STATUS_SUCCESS) { status = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, - inputs, *output); + (link_dev_libs) ? 
dataSetDevLibs : inputs, *output); extractBuildLog(*output); } @@ -381,14 +396,17 @@ bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, amd::Comgr::destroy_action_info(action); } + if (hasDataSetDevLibs) { + amd::Comgr::destroy_data_set(dataSetDevLibs); + } + return (status == AMD_COMGR_STATUS_SUCCESS); } bool Program::compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, const std::vector& options, amd::option::Options* amdOptions, - char* binaryData[], size_t* binarySize, - const bool link_dev_libs) { + char* binaryData[], size_t* binarySize) { amd_comgr_language_t langver; setLanguage(amdOptions->oVariables->CLStd, &langver); @@ -465,12 +483,8 @@ bool Program::compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, // Compiling the source codes with precompiled headers or directly compileInputs if (status == AMD_COMGR_STATUS_SUCCESS) { - if (link_dev_libs) - status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, - action, input, output); - else - status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, - action, input, output); + status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, + action, input, output); extractBuildLog(output); } @@ -956,8 +970,9 @@ bool Program::linkImplLC(const std::vector& inputPrograms, char* binaryData = nullptr; size_t binarySize = 0; std::vector linkOptions; + constexpr bool kLinkDevLibs = false; bool ret = linkLLVMBitcode(inputs, linkOptions, options, &output, &binaryData, - &binarySize); + &binarySize, kLinkDevLibs); amd::Comgr::destroy_data_set(output); amd::Comgr::destroy_data_set(inputs); diff --git a/device/devprogram.hpp b/device/devprogram.hpp index a324750e..3bf0679a 100644 --- a/device/devprogram.hpp +++ b/device/devprogram.hpp @@ -456,13 +456,13 @@ class Program : public amd::HeapObject { bool linkLLVMBitcode(const amd_comgr_data_set_t inputs, const std::vector& options, amd::option::Options* amdOptions, amd_comgr_data_set_t* output, 
- char* binaryData[] = nullptr, size_t* binarySize = nullptr); + char* binaryData[] = nullptr, size_t* binarySize = nullptr, + const bool link_dev_libs = true); //! Create the bitcode of the compiled input dataset bool compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, const std::vector& options, amd::option::Options* amdOptions, - char* binaryData[], size_t* binarySize, - const bool link_dev_libs = true); + char* binaryData[], size_t* binarySize); //! Compile and create the excutable of the input dataset bool compileAndLinkExecutable(const amd_comgr_data_set_t inputs, From ca70cd7c3d63b1036e604116f3b269ca123e7904 Mon Sep 17 00:00:00 2001 From: Jeremy Newton Date: Wed, 8 Feb 2023 14:03:58 -0500 Subject: [PATCH 30/45] SWDEV-1 - device: Add missing include Newer GCC's seem to require this. Change-Id: I85926d4fa552b772f2eb9f8ede7863a546c47f54 Signed-off-by: Jeremy Newton --- device/devhcprintf.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/device/devhcprintf.cpp b/device/devhcprintf.cpp index b36f7758..a5aba4a4 100644 --- a/device/devhcprintf.cpp +++ b/device/devhcprintf.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include #include From 20dfaee8767f7bc9df722fdbc397d77eed267607 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Fri, 10 Feb 2023 03:00:12 -0500 Subject: [PATCH 31/45] SWDEV-2 - Change OpenCL version number from 3569 to 3570 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index f03c7f4a..c8c96219 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3569 +#define AMD_PLATFORM_BUILD_NUMBER 3570 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From c7bf6a007212e8fe387e11f74789c94c1b63f27b Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Mon, 7 Nov 2022 16:47:59 +0000 Subject: [PATCH 32/45] SWDEV-365908 - 
Do not align mem size for IPC Remove alignment to granularity for IPC handles as ROCr has a patch that will internally validate pointer sizes against requested size during allocation instead of size aligned to page size. This patch is needed together with this patch from ROCr: f8a42a3a:Use user requested size for memory fragments Change-Id: I28b25558ea03c836b44fafdb34b7330cf6887424 --- device/rocm/rocdevice.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 9ecaf22e..f2ebe0f0 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -2209,8 +2209,7 @@ bool Device::IpcCreate(void* dev_ptr, size_t* mem_size, void* handle, size_t* me return false; } - // Pass the pointer and memory size to retrieve the handle - hsa_status = hsa_amd_ipc_memory_create(orig_dev_ptr, amd::alignUp(*mem_size, alloc_granularity()), + hsa_status = hsa_amd_ipc_memory_create(orig_dev_ptr, *mem_size, reinterpret_cast(handle)); if (hsa_status != HSA_STATUS_SUCCESS) { From 03944161724c523dcdb5cb536df1bf349297584f Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Mon, 13 Feb 2023 14:58:05 -0800 Subject: [PATCH 33/45] SWDEV-301667 - Refactor code Change-Id: I8f20d994d88acb14f392eaa9d5cd14620667d7f6 --- device/rocm/rocvirtual.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 3b88b564..feeef138 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -433,8 +433,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( prof_signal->ts_ = ts; ts->AddProfilingSignal(prof_signal); if (AMD_DIRECT_DISPATCH) { - bool enqueHandler= false; - uint32_t init_value = kInitSignalValueOne; + bool enqueHandler = false; enqueHandler = (ts->command().Callback() != nullptr || ts->command().GetBatchHead() != nullptr ) && !ts->command().CpuWaitRequested(); From 53ed2f3252a7eef4ae7617154d9bdbf74761e321 Mon Sep 17 
00:00:00 2001 From: Anusha GodavarthySurya Date: Thu, 9 Feb 2023 23:46:06 +0000 Subject: [PATCH 34/45] SWDEV-380035 - Check for agent and ptr match for hsa LOCKED ptr Change-Id: I2503aa05512aebc3535963e188ca8d1dbfab08f0 --- device/rocm/rocdevice.cpp | 36 +++++++++++++++++++++--------------- device/rocm/rocdevice.hpp | 2 +- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index f2ebe0f0..95a4d375 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -3188,7 +3188,9 @@ device::Signal* Device::createSignal() const { amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size) { // Only create arena_mem_object if CPU memory is accessible from HMM // or if runtime received an interop from another ROCr's client - if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size)) { + hsa_amd_pointer_info_t ptr_info = {}; + ptr_info.size = sizeof(hsa_amd_pointer_info_t); + if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size, &ptr_info)) { return nullptr; } @@ -3203,12 +3205,18 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size return arena_mem_obj_; } } - // Calculate the offset of the pointer. 
- const void* dev_ptr = reinterpret_cast(arena_mem_obj_->getDeviceMemory( - *arena_mem_obj_->getContext().devices()[0])->virtualAddress()); - offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr); - + const void* dev_ptr = reinterpret_cast( + arena_mem_obj_->getDeviceMemory(*arena_mem_obj_->getContext().devices()[0]) + ->virtualAddress()); + // System memory which has been locked + if (ptr_info.type == HSA_EXT_POINTER_TYPE_LOCKED && + getCpuAgent().handle == ptr_info.agentOwner.handle && ptr_info.hostBaseAddress == ptr) { + offset = + reinterpret_cast(ptr_info.agentBaseAddress) - reinterpret_cast(dev_ptr); + } else { + offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr); + } return arena_mem_obj_; } @@ -3220,20 +3228,18 @@ void Device::ReleaseGlobalSignal(void* signal) const { } // ================================================================================================ -bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const { - hsa_amd_pointer_info_t ptr_info = {}; - ptr_info.size = sizeof(hsa_amd_pointer_info_t); +bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) { // Query ptr type to see if it's a HMM allocation - hsa_status_t status = hsa_amd_pointer_info( - const_cast(dev_ptr), &ptr_info, nullptr, nullptr, nullptr); + hsa_status_t status = + hsa_amd_pointer_info(const_cast(dev_ptr), ptr_info, nullptr, nullptr, nullptr); // The call should never fail in ROCR, but just check for an error and continue if (status != HSA_STATUS_SUCCESS) { LogError("hsa_amd_pointer_info() failed"); } - // Check if it's a legacy non-HMM allocation in ROCr - if (ptr_info.type != HSA_EXT_POINTER_TYPE_UNKNOWN) { - if ((size != 0) && ((reinterpret_cast(dev_ptr) - - reinterpret_cast(ptr_info.agentBaseAddress)) > size)) { + if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) { + if ((size != 0) && + ((reinterpret_cast(dev_ptr) - + reinterpret_cast(ptr_info->agentBaseAddress)) > size)) { return false; } 
return true; diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index d9eb4f20..275fe457 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -544,7 +544,7 @@ class Device : public NullDevice { const bool isFineGrainSupported() const; //! Returns True if memory pointer is known to ROCr (excludes HMM allocations) - bool IsValidAllocation(const void* dev_ptr, size_t size) const; + bool IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info); //! Allocates hidden heap for device memory allocations void HiddenHeapAlloc(const VirtualGPU& gpu); From 8aa5804efadc59b3b16e4266c16397d170dbde4b Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 15 Feb 2023 07:23:23 +0000 Subject: [PATCH 35/45] SWDEV-368235 - Revert "Remove obsolete env variables" This reverts commit 21e50998dd2439708a6672dac5a04be1dd3da682. Reason for revert: Deferred to a future release. Change-Id: Ia66c37f0ab9734dee73c930d10d7469d5fd57254 --- device/device.cpp | 11 +++ device/device.hpp | 3 +- device/devkernel.cpp | 144 ++++++++++++++++++++---------------- device/pal/paldevice.cpp | 4 +- device/pal/palsettings.cpp | 39 +++++++++- device/pal/palsettings.hpp | 4 +- device/pal/palvirtual.cpp | 25 ++++++- device/rocm/rocdevice.cpp | 56 +++++++------- device/rocm/rocdevice.hpp | 2 +- device/rocm/rocsettings.cpp | 26 +++++++ device/rocm/rocsettings.hpp | 13 +++- utils/flags.hpp | 44 +++++++++++ 12 files changed, 269 insertions(+), 102 deletions(-) diff --git a/device/device.cpp b/device/device.cpp index 288f84f6..f87452db 100644 --- a/device/device.cpp +++ b/device/device.cpp @@ -798,6 +798,17 @@ Settings::Settings() : value_(0) { commandQueues_ = 200; //!< Field value set to maximum number //!< concurrent Virtual GPUs for default + overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0; + overrideLclSet |= + (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y)) + ? 
2 + : 0; + overrideLclSet |= + (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z)) + ? 4 + : 0; + fenceScopeAgent_ = AMD_OPT_FLUSH; if (amd::IS_HIP) { if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) { diff --git a/device/device.hpp b/device/device.hpp index ed206311..a493795e 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -619,6 +619,7 @@ class Settings : public amd::HeapObject { uint64_t extensions_; //!< Supported OCL extensions union { struct { + uint overrideLclSet : 3; //!< Bit mask to override the local size uint apuSystem_ : 1; //!< Device is APU system with shared memory uint supportRA_ : 1; //!< Support RA channel order format uint waitCommand_ : 1; //!< Enables a wait for every submitted command @@ -638,7 +639,7 @@ class Settings : public amd::HeapObject { uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions - uint reserved_ : 14; + uint reserved_ : 11; }; uint value_; }; diff --git a/device/devkernel.cpp b/device/devkernel.cpp index 42eb1857..e598dca7 100644 --- a/device/devkernel.cpp +++ b/device/devkernel.cpp @@ -677,80 +677,100 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, if (workGroupInfo()->compileSize_[0] == 0) { // Find the default local workgroup size, if it wasn't specified if (lclWorkSize[0] == 0) { - // Find threads per group - size_t thrPerGrp = workGroupInfo()->size_; - - // Check if kernel uses images - if (flags_.imageEna_ && - // and thread group is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { - // Use 8x8 workgroup size if kernel has image writes - if 
(flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; + if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) { + // Find threads per group + size_t thrPerGrp = workGroupInfo()->size_; + + // Check if kernel uses images + if (flags_.imageEna_ && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { + // Use 8x8 workgroup size if kernel has image writes + if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { + lclWorkSize[0] = 8; + lclWorkSize[1] = 8; + } + else { + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; + } } else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; - } - } - else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--) - ; - lclWorkSize[d] = div; - tmp /= div; - } - - // Assuming DWORD access - const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; - - // Check if we couldn't find optimal workload - if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || - // or size is too small for the cache line - (lclWorkSize[0] < cacheLineMatch)) { - size_t maxSize = 0; - size_t maxDim = 0; + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--) + ; + lclWorkSize[d] = div; + tmp /= div; } - // Use X dimension as high priority. 
Runtime will assume that - // X dimension is more important for the address calculation - if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { - lclWorkSize[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 1; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; + + // Assuming DWORD access + const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; + + // Check if we couldn't find optimal workload + if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || + // or size is too small for the cache line + (lclWorkSize[0] < cacheLineMatch)) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; } } - } - else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; + // Use X dimension as high priority. Runtime will assume that + // X dimension is more important for the address calculation + if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { + lclWorkSize[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 1; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; + else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } } } } } } + else { + // Use overrides when app doesn't provide workgroup dimensions + if (workDim == 1) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; + } + else if (workDim == 2) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } + else if (workDim == 3) { + 
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; + lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } + else { + assert(0 && "Invalid workDim!"); + } + } } } else { diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 5755f34a..8b07d391 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -291,7 +291,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve nullptr, nullptr, nullptr, - nullptr}; + AMD_OCL_SC_LIB}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); @@ -1013,7 +1013,7 @@ bool Device::create(Pal::IDevice* device) { nullptr, nullptr, nullptr, - nullptr}; + AMD_OCL_SC_LIB}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); diff --git a/device/pal/palsettings.cpp b/device/pal/palsettings.cpp index 59ac82b8..1f4dc8e9 100644 --- a/device/pal/palsettings.cpp +++ b/device/pal/palsettings.cpp @@ -76,6 +76,8 @@ Settings::Settings() { // Enable workload split by default (for 24 bit arithmetic or timeout) workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; + // By default use host blit + blitEngine_ = BlitEngineHost; pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi; pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE) ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; @@ -121,6 +123,8 @@ Settings::Settings() { //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS linearPersistentImage_ = false; + useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; + // Device enqueuing settings numDeviceEvents_ = 1024; numWaitEvents_ = 8; @@ -324,11 +328,16 @@ bool Settings::create(const Pal::DeviceProperties& palProp, libSelector_ = amd::GPU_Library_CI; if (LP64_SWITCH(false, true)) { - oclVersion_ = !reportAsOCL12Device + oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ + ? 
XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) + : OpenCL12; + } + if (GPU_FORCE_OCL20_32BIT) { + force32BitOcl20_ = true; + oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; } - if (OPENCL_VERSION < 200) { oclVersion_ = OpenCL12; } @@ -337,13 +346,27 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // Cap at OpenCL20 for now if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20; - use64BitPtr_ = LP64_SWITCH(false, true); + // This needs to be cleaned once 64bit addressing is stable + if (oclVersion_ < OpenCL20) { + use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) + ? LP64_SWITCH(false, + /*calAttr.isWorkstation ||*/ true) + : GPU_FORCE_64BIT_PTR; + } else { + if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) { + use64BitPtr_ = true; + } + } if (oclVersion_ >= OpenCL20) { supportDepthsRGB_ = true; } if (use64BitPtr_) { - maxAllocSize_ = 64ULL * Gi; + if (GPU_ENABLE_LARGE_ALLOCATION) { + maxAllocSize_ = 64ULL * Gi; + } else { + maxAllocSize_ = 4048 * Mi; + } } else { maxAllocSize_ = 3ULL * Gi; } @@ -424,6 +447,9 @@ bool Settings::create(const Pal::DeviceProperties& palProp, imageSupport_ = true; + // Use kernels for blit if appropriate + blitEngine_ = BlitEngineKernel; + hostMemDirectAccess_ |= HostMemBuffer; // HW doesn't support untiled image writes // hostMemDirectAccess_ |= HostMemImage; @@ -516,6 +542,11 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } + // Override blit engine type + if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) { + blitEngine_ = GPU_BLIT_ENGINE_TYPE; + } + if (!flagIsDefault(DEBUG_GPU_FLAGS)) { debugFlags_ = DEBUG_GPU_FLAGS; } diff --git a/device/pal/palsettings.hpp b/device/pal/palsettings.hpp index 4bcc7e2e..66984622 100644 --- a/device/pal/palsettings.hpp +++ b/device/pal/palsettings.hpp @@ -70,6 +70,7 @@ class Settings : public device::Settings { uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 
features uint threadTraceEnable_ : 1; //!< Thread trace enable uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent + uint useSingleScratch_ : 1; //!< Allocates single scratch per device uint svmAtomics_ : 1; //!< SVM device atomics uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support uint useDeviceQueue_ : 1; //!< Submit to separate device queue @@ -81,7 +82,7 @@ class Settings : public device::Settings { uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 uint disableSdma_ : 1; //!< Disable SDMA support uint alwaysResident_ : 1; //!< Make resources resident at allocation time - uint reserved_ : 8; + uint reserved_ : 7; }; uint value_; }; @@ -94,6 +95,7 @@ class Settings : public device::Settings { uint workloadSplitSize_; //!< Workload split size uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms + uint blitEngine_; //!< Blit engine type uint cacheLineSize_; //!< Cache line size in bytes uint cacheSize_; //!< L1 cache size in bytes uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings diff --git a/device/pal/palvirtual.cpp b/device/pal/palvirtual.cpp index 3a208171..624379d1 100644 --- a/device/pal/palvirtual.cpp +++ b/device/pal/palvirtual.cpp @@ -905,6 +905,11 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, // because destructor calls eraseResourceList() even if create() failed dev().resizeResoureList(index()); + if (index() >= GPU_MAX_COMMAND_QUEUES) { + // Cap the maximum number of concurrent Virtual GPUs + return false; + } + // Virtual GPU will have profiling enabled state_.profiling_ = profiling; @@ -1015,7 +1020,18 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, return false; } - blitMgr_ = new KernelBlitManager(*this, blitSetup); + // Choose the appropriate class for blit engine + switch (dev().settings().blitEngine_) { + default: + // Fall through ... 
+ case Settings::BlitEngineHost: + blitSetup.disableAll(); + // Fall through ... + case Settings::BlitEngineCAL: + case Settings::BlitEngineKernel: + blitMgr_ = new KernelBlitManager(*this, blitSetup); + break; + } if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) { LogError("Could not create BlitManager!"); return false; @@ -3253,8 +3269,11 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) { cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); uint64_t endTimeStampCPU = amd::Os::timeNanos(); - // Adjust the base time by the execution time - readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; + // Make sure the command batch has a valid GPU TS + if (!GPU_RAW_TIMESTAMP) { + // Adjust the base time by the execution time + readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; + } } } } diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 95a4d375..21514671 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -894,35 +894,37 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo Device* dev = reinterpret_cast(data); switch (segment_type) { case HSA_REGION_SEGMENT_GLOBAL: { - uint32_t global_flag = 0; - hsa_status_t stat = - hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } + if (dev->settings().enableLocalMemory_) { + uint32_t global_flag = 0; + hsa_status_t stat = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } - if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { - dev->gpu_fine_grained_segment_ = pool; - } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { - dev->gpuvm_segment_ = pool; - - // If cpu agent cannot access this pool, the device does not support large bar. 
- hsa_amd_memory_pool_access_t tmp{}; - hsa_amd_agent_memory_pool_get_info( - dev->cpu_agent_, - pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &tmp); - - if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { - dev->info_.largeBar_ = false; - } else { - dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; + if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { + dev->gpu_fine_grained_segment_ = pool; + } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { + dev->gpuvm_segment_ = pool; + + // If cpu agent cannot access this pool, the device does not support large bar. + hsa_amd_memory_pool_access_t tmp{}; + hsa_amd_agent_memory_pool_get_info( + dev->cpu_agent_, + pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &tmp); + + if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + dev->info_.largeBar_ = false; + } else { + dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; + } } - } - if (dev->gpuvm_segment_.handle == 0) { - dev->gpuvm_segment_ = pool; + if (dev->gpuvm_segment_.handle == 0) { + dev->gpuvm_segment_ = pool; + } } break; } @@ -1232,7 +1234,7 @@ bool Device::populateOCLDeviceConstants() { info_.maxWorkItemDimensions_ = 3; - if (gpuvm_segment_.handle != 0) { + if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) { size_t global_segment_size = 0; if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE, diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index 275fe457..3fcbf039 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -218,7 +218,7 @@ class NullDevice : public amd::Device { //! 
Determine if we can use device memory for SVM const bool forceFineGrain(amd::Memory* memory) const { - return (memory->getContext().devices().size() > 1); + return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1); } virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle) { diff --git a/device/rocm/rocsettings.cpp b/device/rocm/rocsettings.cpp index 24cb88dc..71c341ad 100644 --- a/device/rocm/rocsettings.cpp +++ b/device/rocm/rocsettings.cpp @@ -35,9 +35,18 @@ Settings::Settings() { // Set this to true when we drop the flag doublePrecision_ = ::CL_KHR_FP64; + enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE; + enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM; + maxWorkGroupSize_ = 1024; preferredWorkGroupSize_ = 256; + maxWorkGroupSize2DX_ = 16; + maxWorkGroupSize2DY_ = 16; + maxWorkGroupSize3DX_ = 4; + maxWorkGroupSize3DY_ = 4; + maxWorkGroupSize3DZ_ = 4; + kernargPoolSize_ = HSA_KERNARG_POOL_SIZE; // Determine if user is requesting Non-Coherent mode @@ -192,6 +201,23 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } + if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) { + maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X; + } + if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) { + maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } + + if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) { + maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X; + } + if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) { + maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y; + } + if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) { + maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } + if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; } diff --git a/device/rocm/rocsettings.hpp b/device/rocm/rocsettings.hpp index 5b5f81d7..d2fffd73 100644 --- a/device/rocm/rocsettings.hpp +++ b/device/rocm/rocsettings.hpp @@ -42,6 +42,8 @@ class Settings : public device::Settings { union { struct { uint doublePrecision_ : 1; //!< 
Enables double precision support + uint enableLocalMemory_ : 1; //!< Enable GPUVM memory + uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory uint imageDMA_ : 1; //!< Enable direct image DMA transfers uint stagedXferRead_ : 1; //!< Uses a staged buffer read @@ -53,7 +55,7 @@ class Settings : public device::Settings { uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint coop_sync_ : 1; //!< grid and multi-grid sync for gfx940+ uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint reserved_ : 20; + uint reserved_ : 18; }; uint value_; }; @@ -64,6 +66,15 @@ class Settings : public device::Settings { //! Preferred workgroup size uint preferredWorkGroupSize_; + //! Default max workgroup sizes for 2D + int maxWorkGroupSize2DX_; + int maxWorkGroupSize2DY_; + + //! Default max workgroup sizes for 3D + int maxWorkGroupSize3DX_; + int maxWorkGroupSize3DY_; + int maxWorkGroupSize3DZ_; + uint kernargPoolSize_; uint numDeviceEvents_; //!< The number of device events uint numWaitEvents_; //!< The number of wait events for device enqueue diff --git a/utils/flags.hpp b/utils/flags.hpp index f563c6d3..186d059b 100644 --- a/utils/flags.hpp +++ b/utils/flags.hpp @@ -30,10 +30,22 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \ "The mask to enable specific kinds of logs") \ debug(uint, DEBUG_GPU_FLAGS, 0, \ "The debug options for GPU device") \ +release(uint, GPU_MAX_COMMAND_QUEUES, 300, \ + "The maximum number of concurrent Virtual GPUs") \ release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! 
*/ \ "The default command queue thread stack size") \ release(int, GPU_MAX_WORKGROUP_SIZE, 0, \ "Maximum number of workitems in a workgroup for GPU, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \ + "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0, \ + "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0, \ + "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0, \ + "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0, \ + "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \ debug(bool, CPU_MEMORY_GUARD_PAGES, false, \ "Use guard pages for CPU memory") \ debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \ @@ -58,8 +70,12 @@ release(uint, GPU_STAGING_BUFFER_SIZE, 4, \ "Size of the GPU staging buffer in MiB") \ release(bool, GPU_DUMP_BLIT_KERNELS, false, \ "Dump the kernels for blit manager") \ +release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \ + "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \ release(bool, GPU_FLUSH_ON_EXECUTION, false, \ "Submit commands to HW on every operation. 
0 - Disable, 1 - Enable") \ +release(bool, GPU_USE_SYNC_OBJECTS, true, \ + "If enabled, use sync objects instead of polling") \ release(bool, CL_KHR_FP64, true, \ "Enable/Disable support for double precision") \ release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \ @@ -70,8 +86,12 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0, \ "Set clLinkProgram()'s options (override)") \ release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \ "Append clLinkProgram()'s options") \ +release(cstring, AMD_OCL_SC_LIB, 0, \ + "Set shader compiler shared library name or path") \ debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \ "Specify binary substitution config file for OpenCL") \ +debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false, \ + "Enable the error dialog on Windows") \ release(size_t, GPU_PINNED_XFER_SIZE, 32, \ "The pinned buffer size for pinning in read/write transfers in MiB") \ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \ @@ -80,6 +100,12 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \ "The resource cache size in MB") \ release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \ "The maximum size accepted for suballocaitons in KB") \ +release(bool, GPU_FORCE_64BIT_PTR, 0, \ + "Forces 64 bit pointers on GPU") \ +release(bool, GPU_FORCE_OCL20_32BIT, 0, \ + "Forces 32 bit apps to take CLANG\HSAIL path") \ +release(bool, GPU_RAW_TIMESTAMP, 0, \ + "Reports GPU raw timestamps in GPU timeline") \ release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \ "Number of memory objects for dependency tracking") \ release(size_t, GPU_XFER_BUFFER_SIZE, 0, \ @@ -90,20 +116,32 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85, \ "Maximum size of a single allocation as percentage of total") \ release(uint, GPU_NUM_COMPUTE_RINGS, 2, \ "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \ +release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1, \ + "GPU select the compute rings ID -1 - disabled, 0 , 1,.. 
- the forced compute rings ID for submission") \ release(uint, GPU_WORKLOAD_SPLIT, 22, \ "Workload split size") \ +release(bool, GPU_USE_SINGLE_SCRATCH, false, \ + "Use single scratch buffer per device instead of per HW ring") \ release(bool, AMD_OCL_WAIT_COMMAND, false, \ "1 = Enable a wait for every submitted command") \ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \ "Prints the specified number of the child kernels") \ release(bool, GPU_USE_DEVICE_QUEUE, false, \ "Use a dedicated device queue for the actual submissions") \ +release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \ + "Enable >4GB single allocations") \ release(bool, AMD_THREAD_TRACE_ENABLE, true, \ "Enable thread trace extension") \ release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200), \ "Force GPU opencl verison") \ +release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \ + "Enable HSA device local memory usage") \ release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \ "Kernarg pool size") \ +release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true, \ + "Enable device memory for coarse grain SVM allocations") \ +release(bool, GPU_IFH_MODE, false, \ + "1 = Enable GPU IFH (infinitely fast hardware) mode. 
Any other value keeps setting disabled.") \ release(bool, GPU_MIPMAP, true, \ "Enables GPU mipmap extension") \ release(uint, GPU_ENABLE_PAL, 2, \ @@ -114,6 +152,8 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \ "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \ release(uint, OCL_SET_SVM_SIZE, 4*16384, \ "set SVM space size for discrete GPU") \ +debug(uint, OCL_SYSMEM_REQUIREMENT, 2, \ + "Use flag to change the minimum requirement of system memory not to downgrade") \ release(uint, GPU_WAVES_PER_SIMD, 0, \ "Force the number of waves per SIMD (1-10)") \ release(bool, GPU_WAVE_LIMIT_ENABLE, false, \ @@ -136,6 +176,10 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \ "File path prefix for dumping wave limiter output") \ release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \ "File path prefix for tracing wave limiter") \ +release(bool, OCL_CODE_CACHE_ENABLE, false, \ + "1 = Enable compiler code cache") \ +release(bool, OCL_CODE_CACHE_RESET, false, \ + "1 = Reset the compiler code cache storage") \ release(bool, PAL_DISABLE_SDMA, false, \ "1 = Disable SDMA for PAL") \ release(uint, PAL_RGP_DISP_COUNT, 10000, \ From c7905c502e7cc385bc4d1fb8798b621af2f97f5b Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Mon, 9 Jan 2023 15:03:48 -0800 Subject: [PATCH 36/45] SWDEV-376697 - Use PCIE fine grain memory pool when hipDeviceMallocUncached flag is used. 
Change-Id: I7234d456ef2df42b0b9b9e0d2647ee9bded565f9 --- device/rocm/rocmemory.cpp | 2 +- platform/memory.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/device/rocm/rocmemory.cpp b/device/rocm/rocmemory.cpp index 8aacd09d..d267e0d0 100644 --- a/device/rocm/rocmemory.cpp +++ b/device/rocm/rocmemory.cpp @@ -788,7 +788,7 @@ bool Buffer::create(bool alloc_local) { } else { assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!"); deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0, - (memFlags & ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN) != 0); + (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0); } owner()->setSvmPtr(deviceMemory_); } else { diff --git a/platform/memory.hpp b/platform/memory.hpp index 0b6ace9f..ba89e18c 100644 --- a/platform/memory.hpp +++ b/platform/memory.hpp @@ -41,7 +41,7 @@ #define ROCCLR_MEM_HSA_SIGNAL_MEMORY (1u << 30) #define ROCCLR_MEM_INTERNAL_MEMORY (1u << 29) #define CL_MEM_VA_RANGE_AMD (1u << 28) -#define ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN (1u << 27) +#define ROCCLR_MEM_HSA_UNCACHED (1u << 27) namespace device { class Memory; From abc782b807f1f4e202b292c49987bde5012ef90f Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Thu, 1 Dec 2022 13:32:35 +0000 Subject: [PATCH 37/45] SWDEV-366636 - Fix performance drop in TF-RCCL models Change-Id: Idc845bb0dab858b94b9d2720cae8308cac2e7328 --- device/device.hpp | 11 ++++++++--- device/rocm/rocdevice.cpp | 16 ++++++++++++++-- device/rocm/rocdevice.hpp | 2 ++ device/rocm/rocvirtual.hpp | 6 +++--- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/device/device.hpp b/device/device.hpp index a493795e..cf5bdb33 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -1786,9 +1786,14 @@ class Device : public RuntimeObject { // Returns the status of HW event, associated with amd::Event virtual bool IsHwEventReady( - const amd::Event& event, //!< AMD event for HW status validation - bool wait = false //!< If true 
then forces the event completion - ) const { + const amd::Event& event, //!< AMD event for HW status validation + bool wait = false) const { //!< If true then forces the event completion + return false; + }; + + // Returns the status of HW event, associated with amd::Event + virtual bool IsHwEventReadyForcedWait( + const amd::Event& event) const { //!< AMD event for HW status validation return false; }; diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 21514671..b46377dc 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -2710,10 +2710,22 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI return result; } +// ================================================================================================ +bool Device::IsHwEventReadyForcedWait(const amd::Event& event) const { + void* hw_event = + (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent(); + if (hw_event == nullptr) { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); + return false; + } + static constexpr bool Timeout = true; + return WaitForSignal(reinterpret_cast(hw_event)->signal_, false, true); +} + // ================================================================================================ bool Device::IsHwEventReady(const amd::Event& event, bool wait) const { - void* hw_event = (event.NotifyEvent() != nullptr) ? - event.NotifyEvent()->HwEvent() : event.HwEvent(); + void* hw_event = + (event.NotifyEvent() != nullptr) ? 
event.NotifyEvent()->HwEvent() : event.HwEvent(); if (hw_event == nullptr) { ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); return false; diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index 3fcbf039..b36d21ef 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -257,6 +257,7 @@ class NullDevice : public amd::Device { cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; } virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; } + virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const { return false; } virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {}; virtual void ReleaseGlobalSignal(void* signal) const {} @@ -442,6 +443,7 @@ class Device : public NullDevice { cl_set_device_clock_mode_output_amd* pSetClockModeOutput); virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const; + virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const; virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const; virtual void ReleaseGlobalSignal(void* signal) const; diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index 74d624a1..1553dfe9 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -46,10 +46,10 @@ constexpr static uint64_t kUnlimitedWait = std::numeric_limits::max(); // Active wait time out incase same sdma engine is used again, // then just wait instead of adding dependency wait signal. 
-constexpr static uint64_t kSDMAEngineTimeout = 10; +constexpr static uint64_t kForcedTimeout = 10; template -inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sdma_wait = false) { +inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool forced_wait = false) { if (hsa_signal_load_relaxed(signal) > 0) { uint64_t timeout = kTimeout100us; if (active_wait) { @@ -57,7 +57,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sd } if (active_wait_timeout) { // If diff engine, wait to 10 ms. Otherwise no wait - timeout = (sdma_wait ? kSDMAEngineTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K; + timeout = (forced_wait ? kForcedTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K; if (timeout == 0) { return false; } From fc61d9db113d1938bbc43d8cdde24d0d8ca5ba97 Mon Sep 17 00:00:00 2001 From: German Date: Thu, 16 Feb 2023 15:48:57 -0500 Subject: [PATCH 38/45] SWDEV-382397 - Move VirtualGPU destruction back to the thread exit OS can terminate unfinished queue thread from default stream at any time. Potentially leaving the queue lock in a bad state and causing a deadlock if runtime destroys VirtualGPU later from the host thread. 
Change-Id: I247f102ee84e6b4dba947504933395071945c85d --- platform/commandqueue.cpp | 1 + platform/commandqueue.hpp | 10 +++------- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/platform/commandqueue.cpp b/platform/commandqueue.cpp index 90573b4c..8b27b4ce 100644 --- a/platform/commandqueue.cpp +++ b/platform/commandqueue.cpp @@ -66,6 +66,7 @@ bool HostQueue::terminate() { marker->awaitCompletion(); marker->release(); } + thread_.Release(); thread_.acceptingCommands_ = false; } else { if (Os::isThreadAlive(thread_)) { diff --git a/platform/commandqueue.hpp b/platform/commandqueue.hpp index ea702a5e..f98332bf 100644 --- a/platform/commandqueue.hpp +++ b/platform/commandqueue.hpp @@ -164,19 +164,13 @@ class HostQueue : public CommandQueue { acceptingCommands_(false), virtualDevice_(nullptr) {} - virtual ~Thread() { - if (virtualDevice_ != nullptr) { - delete virtualDevice_; - virtualDevice_ = nullptr; - } - } - //! The command queue thread entry point. void run(void* data) { HostQueue* queue = static_cast(data); virtualDevice_ = queue->device().createVirtualDevice(queue); if (virtualDevice_ != nullptr) { queue->loop(virtualDevice_); + Release(); } else { acceptingCommands_ = false; queue->flush(); @@ -190,6 +184,8 @@ class HostQueue : public CommandQueue { } } + void Release() const { delete virtualDevice_; } + //! 
Get virtual device for the current thread device::VirtualDevice* vdev() const { return virtualDevice_; } From 43abc163a45ed68530d1021829186a180a49d1bf Mon Sep 17 00:00:00 2001 From: Todd tiantuo Li Date: Tue, 21 Feb 2023 02:54:35 -0800 Subject: [PATCH 39/45] SWDEV-382501 - support new ASIC Revision for Phoenix2 Change-Id: I0225511960c9fbc50ec541511599ef87e509b567 --- device/pal/paldevice.cpp | 1 + device/pal/palsettings.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 8b07d391..3d758bbd 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -108,6 +108,7 @@ static constexpr PalDevice supportedPalDevices[] = { {11, 0, 1, Pal::GfxIpLevel::GfxIp11_0, "gfx1101", Pal::AsicRevision::Navi32}, {11, 0, 2, Pal::GfxIpLevel::GfxIp11_0, "gfx1102", Pal::AsicRevision::Navi33}, {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix1}, + {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix2}, }; static std::tuple findIsa(Pal::AsicRevision asicRevision, diff --git a/device/pal/palsettings.cpp b/device/pal/palsettings.cpp index 1f4dc8e9..b85a2df0 100644 --- a/device/pal/palsettings.cpp +++ b/device/pal/palsettings.cpp @@ -205,6 +205,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp, case Pal::AsicRevision::Navi31: // Fall through for Navi2x ... 
case Pal::AsicRevision::Phoenix1: + case Pal::AsicRevision::Phoenix2: case Pal::AsicRevision::Raphael: case Pal::AsicRevision::Rembrandt: case Pal::AsicRevision::Navi24: From c5f131c5d32f57ff647f115c3e537f8def711e34 Mon Sep 17 00:00:00 2001 From: "Xie,AlexBin" Date: Fri, 17 Feb 2023 18:49:15 -0500 Subject: [PATCH 40/45] SWDEV-378367 - Observed performance drop for Geekbench5 Change-Id: I0beabc6e3bec095574c8168fcf52af1e94105792 --- device/pal/palsettings.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/device/pal/palsettings.cpp b/device/pal/palsettings.cpp index b85a2df0..080654c1 100644 --- a/device/pal/palsettings.cpp +++ b/device/pal/palsettings.cpp @@ -78,9 +78,10 @@ Settings::Settings() { // By default use host blit blitEngine_ = BlitEngineHost; - pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi; + pinnedXferSize_ = GPU_PINNED_XFER_SIZE * Mi; + size_t defaultMinXferSize = amd::IS_HIP ? 128: 4; pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE) - ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; + ? defaultMinXferSize * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; // Disable FP_FAST_FMA defines by default reportFMAF_ = false; From d287ba44be1245554c91a067b5250934ecf4c8bd Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Thu, 16 Feb 2023 16:57:50 +0000 Subject: [PATCH 41/45] SWDEV-382664 - Remove WaveLimiter init The wavelimiter init was unintentionally added in one change. It wasn't supposed to perform any logic, since LC doesn't support it and the number of waves can be overwritten only with an environment variable. 
Change-Id: I447bd1ad685800f874b6a6fb7409dc67e43640ff --- device/pal/palkernel.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/device/pal/palkernel.cpp b/device/pal/palkernel.cpp index f425e3d7..035a6288 100644 --- a/device/pal/palkernel.cpp +++ b/device/pal/palkernel.cpp @@ -498,7 +498,6 @@ const LightningProgram& LightningKernel::prog() const { #if defined(USE_COMGR_LIBRARY) bool LightningKernel::init() { - waveLimiter_.enable(); return GetAttrCodePropMetadata(); } From afebcb413aed0441ef684fdf1f0675f54676c6a5 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Fri, 20 Jan 2023 15:34:24 -0800 Subject: [PATCH 42/45] SWDEV-364604 - Add ROCclr support for hipEventDisableSystemFence Change-Id: I6127b432a8759359359a1890fda85bc401be6a56 --- device/device.hpp | 3 +++ device/pal/palvirtual.hpp | 2 ++ device/rocm/rocvirtual.cpp | 42 ++++++++++++++++++++++---------------- device/rocm/rocvirtual.hpp | 1 + platform/command.cpp | 1 - platform/commandqueue.cpp | 2 +- 6 files changed, 31 insertions(+), 20 deletions(-) diff --git a/device/device.hpp b/device/device.hpp index cf5bdb33..3e65a331 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -1260,6 +1260,9 @@ class VirtualDevice : public amd::HeapObject { //! Returns fence state of the VirtualGPU virtual bool isFenceDirty() const = 0; + //! Resets fence state of the VirtualGPU + virtual void resetFenceDirty() = 0; + private: //! Disable default copy constructor VirtualDevice& operator=(const VirtualDevice&); diff --git a/device/pal/palvirtual.hpp b/device/pal/palvirtual.hpp index 86240649..74fb039f 100644 --- a/device/pal/palvirtual.hpp +++ b/device/pal/palvirtual.hpp @@ -359,6 +359,8 @@ class VirtualGPU : public device::VirtualDevice { bool isFenceDirty() const { return false; } + void resetFenceDirty() {} + //! 
Returns GPU device object associated with this kernel const Device& dev() const { return gpuDevice_; } diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index feeef138..f2850b55 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -78,13 +78,8 @@ static constexpr uint16_t kBarrierPacketHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static constexpr uint16_t kBarrierPacketAgentScopeHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - static constexpr uint16_t kNopPacketHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); @@ -99,14 +94,16 @@ static constexpr uint16_t kBarrierPacketReleaseHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static constexpr uint16_t kBarrierVendorPacketHeader = - (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static constexpr uint16_t kBarrierVendorPacketAgentScopeHeader = - (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); +static constexpr uint16_t kBarrierVendorPacketNopScopeHeader = + 
(HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static constexpr hsa_barrier_and_packet_t kBarrierAcquirePacket = { kBarrierPacketAcquireHeader, 0, 0, {{0}}, 0, {0}}; @@ -989,6 +986,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + fence_dirty_ = true; auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); if (!skipSignal) { @@ -1001,7 +999,9 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, } // Reset fence_dirty_ flag if we submit a barrier - fence_dirty_ = false; + if (cache_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); hsa_barrier_and_packet_t* aql_loc = @@ -1063,6 +1063,10 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD } } + fence_dirty_ = true; + auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); + if (completionSignal.handle == 0) { // Get active signal for current dispatch if profiling is necessary barrier_value_packet_.completion_signal = @@ -1072,6 +1076,11 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD barrier_value_packet_.completion_signal = completionSignal; } + // Reset fence_dirty_ flag if we submit a barrier + if (cache_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } + uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= 
queueMask); hsa_amd_barrier_value_packet_t* aql_loc = &(reinterpret_cast( @@ -1079,9 +1088,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD *aql_loc = barrier_value_packet_; packet_store_release(reinterpret_cast(aql_loc), packetHeader, rest); - auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); - hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, @@ -3252,11 +3258,11 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { if (timestamp_ != nullptr) { const Settings& settings = dev().settings(); int32_t releaseFlags = vcmd.getEventScope(); - if (releaseFlags == Device::CacheState::kCacheStateAgent) { + if (releaseFlags == Device::CacheState::kCacheStateIgnore) { if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) { - dispatchBarrierValuePacket(kBarrierVendorPacketAgentScopeHeader, true); + dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true); } else { - dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false); + dispatchBarrierPacket(kNopPacketHeader, false); } } else { // Submit a barrier with a cache flushes. diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index 1553dfe9..8c002658 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -407,6 +407,7 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } + void resetFenceDirty() { fence_dirty_ = false; } // } roc OpenCL integration private: //! 
Dispatches a barrier with blocking HSA signals diff --git a/platform/command.cpp b/platform/command.cpp index e02cb153..279bfc3d 100644 --- a/platform/command.cpp +++ b/platform/command.cpp @@ -436,7 +436,6 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList profilingInfo_.clear(); profilingInfo_.callback_ = nullptr; profilingInfo_.marker_ts_ = true; - setEventScope(amd::Device::kCacheStateSystem); } kernel_.retain(); } diff --git a/platform/commandqueue.cpp b/platform/commandqueue.cpp index 8b27b4ce..ce8ba4ae 100644 --- a/platform/commandqueue.cpp +++ b/platform/commandqueue.cpp @@ -120,7 +120,7 @@ void HostQueue::finish() { return; } } - if (nullptr == command || vdev()->isHandlerPending()) { + if (nullptr == command || vdev()->isHandlerPending() || vdev()->isFenceDirty()) { if (nullptr != command) { command->release(); } From 9f495efc34e0e8a618dd82adc40ff23841da2fdd Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Mon, 20 Feb 2023 17:59:01 +0000 Subject: [PATCH 43/45] SWDEV-383397 - Set large bar false for windows. 
Change-Id: Iaef70e477afab7ca8694aa55d1f704e769e24ed9 --- device/pal/paldevice.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 3d758bbd..9adfabff 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -633,7 +633,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.cooperativeGroups_ = settings().enableCoopGroups_; info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_; - if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) { + if (amd::IS_HIP) { + info_.largeBar_ = false; + } else if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) { info_.largeBar_ = true; ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Resizable bar enabled"); } From 3b9d67083a5af334c3f2a26ecd02b817fb6e618a Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Thu, 23 Feb 2023 16:38:32 +0000 Subject: [PATCH 44/45] SWDEV-385050 - Fixed possible invalid queue access from kernelCommand::releaseResources Change-Id: I7c5d99987cb7ab4fa0aa634f2bb6a4d60331b3af --- platform/command.cpp | 2 +- platform/kernel.cpp | 2 +- platform/kernel.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/platform/command.cpp b/platform/command.cpp index 279bfc3d..35ec5113 100644 --- a/platform/command.cpp +++ b/platform/command.cpp @@ -441,7 +441,7 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList } void NDRangeKernelCommand::releaseResources() { - kernel_.parameters().release(parameters_, queue()->device()); + kernel_.parameters().release(parameters_); DEBUG_ONLY(parameters_ = NULL); kernel_.release(); Command::releaseResources(); diff --git a/platform/kernel.cpp b/platform/kernel.cpp index 80cb1a12..52d4936a 100644 --- a/platform/kernel.cpp +++ b/platform/kernel.cpp @@ -253,7 +253,7 @@ bool KernelParameters::boundToSvmPointer(const Device& device, const_address cap return svmBound[index]; } -void KernelParameters::release(address 
mem, const amd::Device& device) const { +void KernelParameters::release(address mem) const { if (mem == nullptr) { // nothing to do! return; diff --git a/platform/kernel.hpp b/platform/kernel.hpp index e46fc5a7..8cb3b7f7 100644 --- a/platform/kernel.hpp +++ b/platform/kernel.hpp @@ -215,7 +215,7 @@ class KernelParameters : protected HeapObject { //! Capture the state of the parameters and return the stack base pointer. address capture(device::VirtualDevice& vDev, uint64_t lclMemSize, int32_t* error); //! Release the captured state of the parameters. - void release(address parameters, const amd::Device& device) const; + void release(address parameters) const; //! Allocate memory for this instance as well as the required storage for // the values_, defined_, and rawPointer_ arrays. From 6c224352715359721224b36e6ec5ef4b17517be5 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Fri, 24 Feb 2023 12:24:43 -0800 Subject: [PATCH 45/45] SWDEV-380035 - Do not create Arena Memobj for pinned memory Change-Id: I4da281f0b7139efb4bb46b47f812358a0212445a --- device/rocm/rocdevice.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index b46377dc..287ece39 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -3219,18 +3219,13 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size return arena_mem_obj_; } } + // Calculate the offset of the pointer. 
const void* dev_ptr = reinterpret_cast( arena_mem_obj_->getDeviceMemory(*arena_mem_obj_->getContext().devices()[0]) ->virtualAddress()); - // System memory which has been locked - if (ptr_info.type == HSA_EXT_POINTER_TYPE_LOCKED && - getCpuAgent().handle == ptr_info.agentOwner.handle && ptr_info.hostBaseAddress == ptr) { - offset = - reinterpret_cast(ptr_info.agentBaseAddress) - reinterpret_cast(dev_ptr); - } else { - offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr); - } + offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr); + return arena_mem_obj_; } @@ -3250,6 +3245,13 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer if (status != HSA_STATUS_SUCCESS) { LogError("hsa_amd_pointer_info() failed"); } + + // Return false for pinned memory. A true return may result in a race because + // ROCclr may attempt to do a pin/copy/unpin underneath in a multithreaded environment + if (ptr_info->type == HSA_EXT_POINTER_TYPE_LOCKED) { + return false; + } + if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) { if ((size != 0) && ((reinterpret_cast(dev_ptr) -