This repository has been archived by the owner on Jan 26, 2024. It is now read-only.

Commit

Promote till commit '6c224352715359721224b36e6ec5ef4b17517be5'
Change-Id: I22ec1e32c2c10311b26a83c7cf6347b87038711a
mangupta committed Mar 15, 2023
2 parents cb18e75 + 6c22435 commit 5a27d34
Showing 40 changed files with 217 additions and 986 deletions.
1 change: 0 additions & 1 deletion cmake/ROCclr.cmake
@@ -100,7 +100,6 @@ endif()
 
 target_compile_definitions(rocclr PUBLIC
   LITTLEENDIAN_CPU
-  WITH_LIQUID_FLASH=0
   ${AMD_OPENCL_DEFS})
 
 target_include_directories(rocclr PUBLIC
10 changes: 6 additions & 4 deletions device/blit.cpp
@@ -729,14 +729,16 @@ bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, siz
                                         std::vector<FillBufferInfo>& packed_info) {
 
   // 1. Validate input arguments
-  guarantee(fill_size >= pattern_size, "Pattern Size cannot be greater than fill size");
-  guarantee(fill_size <= memory.size(), "Cannot fill more than the mem object size");
+  guarantee(fill_size >= pattern_size, "Pattern Size: %u cannot be greater than fill size: %u \n",
+            pattern_size, fill_size);
+  guarantee(fill_size <= memory.size(), "Cannot fill: %u more than the mem object size:%u \n",
+            fill_size, memory.size());
 
   // 2. Calculate the next closest dword aligned address for faster processing
   size_t dst_addr = memory.virtualAddress() + fill_origin;
   size_t aligned_dst_addr = amd::alignUp(dst_addr, sizeof(size_t));
-  guarantee(aligned_dst_addr >= dst_addr, "Aligned address cannot be greater than destination"
-            "address");
+  guarantee(aligned_dst_addr >= dst_addr, "Aligned address: %u cannot be greater than destination"
+            "address :%u \n", aligned_dst_addr, dst_addr);
 
   // 3. If given address is not aligned calculate head and tail size.
   size_t head_size = std::min(aligned_dst_addr - dst_addr, fill_size);
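For reference, the head/body/tail split that PackInfo computes around the size_t-aligned destination can be sketched as follows. This is a standalone illustration with hypothetical helper names (alignUp here is a local re-implementation), not the actual HostBlitManager code:

#include <algorithm>
#include <cstddef>

// Round an address up to the next multiple of align (align must be a power of two).
static size_t alignUp(size_t value, size_t align) {
  return (value + align - 1) & ~(align - 1);
}

struct FillSplit {
  size_t head;  // unaligned bytes before the first size_t-aligned address
  size_t body;  // size_t-aligned middle portion
  size_t tail;  // leftover bytes smaller than one size_t word
};

static FillSplit SplitFill(size_t dst_addr, size_t fill_size) {
  size_t aligned = alignUp(dst_addr, sizeof(size_t));
  FillSplit s{};
  s.head = std::min(aligned - dst_addr, fill_size);
  size_t remaining = fill_size - s.head;
  s.tail = remaining % sizeof(size_t);
  s.body = remaining - s.tail;
  return s;
}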
1 change: 1 addition & 0 deletions device/devhcprintf.cpp
@@ -23,6 +23,7 @@
 
 #include <assert.h>
 #include <cstdarg>
+#include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include <string>
7 changes: 3 additions & 4 deletions device/devhostcall.cpp
@@ -84,10 +84,11 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t*
       if (!messages.handlePayload(service, payload)) {
         ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: invalid request for service \"%d\".",
                 service);
-        amd::report_fatal(__FILE__, __LINE__, "Hostcall: invalid service request.");
+        guarantee(false, "Hostcall: invalid service request %d \n", service);
       }
       return;
     case SERVICE_DEVMEM: {
+      guarantee(payload[0] != 0 || payload[1] != 0, "Both payloads cannot be 0 \n");
       if (payload[0]) {
         amd::Memory* mem = amd::MemObjMap::FindMemObj(reinterpret_cast<void*>(payload[0]));
         if (mem) {
@@ -114,9 +115,7 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t*
       return;
     }
     default:
-      ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: no handler found for service ID \"%d\".",
-              service);
-      amd::report_fatal(__FILE__, __LINE__, "Hostcall service not supported.");
+      guarantee(false, "Hostcall: no handler found for service ID %d \n", service);
       return;
   }
 }
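Several hunks in this commit replace a log-then-report_fatal sequence with a single printf-style guarantee(condition, format, ...) call. A minimal sketch of such a helper, assuming variadic formatting and abort-on-failure semantics (the real ROCclr implementation may differ):

#include <cstdarg>
#include <cstdio>
#include <cstdlib>

// If cond is false, print the formatted message and terminate the process.
static void guarantee(bool cond, const char* fmt, ...) {
  if (cond) return;
  va_list args;
  va_start(args, fmt);
  std::vfprintf(stderr, fmt, args);
  va_end(args);
  std::abort();
}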
14 changes: 4 additions & 10 deletions device/device.cpp
@@ -289,11 +289,8 @@ void MemObjMap::AddMemObj(const void* k, amd::Memory* v) {
 void MemObjMap::RemoveMemObj(const void* k) {
   amd::ScopedLock lock(AllocatedLock_);
   auto rval = MemObjMap_.erase(reinterpret_cast<uintptr_t>(k));
-  if (rval != 1) {
-    DevLogPrintfError("Memobj map does not have ptr: 0x%x",
-                      reinterpret_cast<uintptr_t>(k));
-    guarantee(false, "Memobj map does not have ptr");
-  }
+  guarantee(rval == 1, "Memobj map does not have ptr: 0x%x",
+            reinterpret_cast<uintptr_t>(k));
 }
 
 amd::Memory* MemObjMap::FindMemObj(const void* k, size_t* offset) {
@@ -328,11 +325,8 @@ void MemObjMap::AddVirtualMemObj(const void* k, amd::Memory* v) {
 void MemObjMap::RemoveVirtualMemObj(const void* k) {
   amd::ScopedLock lock(AllocatedLock_);
   auto rval = VirtualMemObjMap_.erase(reinterpret_cast<uintptr_t>(k));
-  if (rval != 1) {
-    DevLogPrintfError("Virtual Memobj map does not have ptr: 0x%x",
-                      reinterpret_cast<uintptr_t>(k));
-    guarantee(false, "VirtualMemobj map does not have ptr");
-  }
+  guarantee(rval == 1, "Virtual Memobj map does not have ptr: 0x%x",
+            reinterpret_cast<uintptr_t>(k));
 }
 
 amd::Memory* MemObjMap::FindVirtualMemObj(const void* k) {
24 changes: 11 additions & 13 deletions device/device.hpp
@@ -89,7 +89,6 @@ class SvmFillMemoryCommand;
 class SvmMapMemoryCommand;
 class SvmUnmapMemoryCommand;
 class SvmPrefetchAsyncCommand;
-class TransferBufferFileCommand;
 class StreamOperationCommand;
 class VirtualMapCommand;
 class ExternalSemaphoreCmd;
@@ -154,7 +153,6 @@ enum OclExtensions {
   ClKhrD3d9Sharing,
 #endif
   ClKhrImage2dFromBuffer,
-  ClAmdSemaphore,
   ClAMDBusAddressableMemory,
   ClAMDC11Atomics,
   ClKhrSpir,
@@ -163,8 +161,6 @@ enum OclExtensions {
   ClKhrDepthImages,
   ClKhrMipMapImage,
   ClKhrMipMapImageWrites,
-  ClKhrIlProgram,
-  ClAMDLiquidFlash,
   ClAmdCopyBufferP2P,
   ClAmdAssemblyProgram,
 #if defined(_WIN32)
@@ -200,7 +196,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ",
   "cl_khr_dx9_media_sharing ",
 #endif
   "cl_khr_image2d_from_buffer ",
-  "",
   "cl_amd_bus_addressable_memory ",
   "cl_amd_c11_atomics ",
   "cl_khr_spir ",
@@ -209,8 +204,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ",
   "cl_khr_depth_images ",
   "cl_khr_mipmap_image ",
   "cl_khr_mipmap_image_writes ",
-  "",
-  "cl_amd_liquid_flash ",
   "cl_amd_copy_buffer_p2p ",
   "cl_amd_assembly_program ",
 #if defined(_WIN32)
@@ -1241,9 +1234,6 @@ class VirtualDevice : public amd::HeapObject {
   /// Optional extensions
   virtual void submitSignal(amd::SignalCommand& cmd) = 0;
   virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) = 0;
-  virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) {
-    ShouldNotReachHere();
-  }
   virtual void submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
     ShouldNotReachHere();
   }
@@ -1270,6 +1260,9 @@ class VirtualDevice : public amd::HeapObject {
   //! Returns fence state of the VirtualGPU
   virtual bool isFenceDirty() const = 0;
 
+  //! Resets fence state of the VirtualGPU
+  virtual void resetFenceDirty() = 0;
+
 private:
   //! Disable default copy constructor
   VirtualDevice& operator=(const VirtualDevice&);
@@ -1796,9 +1789,14 @@ class Device : public RuntimeObject {
 
   // Returns the status of HW event, associated with amd::Event
   virtual bool IsHwEventReady(
-      const amd::Event& event,  //!< AMD event for HW status validation
-      bool wait = false         //!< If true then forces the event completion
-      ) const {
+      const amd::Event& event,    //!< AMD event for HW status validation
+      bool wait = false) const {  //!< If true then forces the event completion
     return false;
   };
+
+  // Returns the status of HW event, associated with amd::Event
+  virtual bool IsHwEventReadyForcedWait(
+      const amd::Event& event) const {  //!< AMD event for HW status validation
+    return false;
+  };
 
2 changes: 1 addition & 1 deletion device/devprogram.cpp
@@ -2977,7 +2977,7 @@ bool Program::runInitFiniKernel(kernel_kind_t kind) const {
   amd::HostQueue* queue = nullptr;
 
   for (const auto& i : kernels_) {
-    LogPrintfInfo("For Init/Fini: Kernel Name: %s", i.first.c_str());
+    ClPrint(amd::LOG_INFO, amd::LOG_INIT, "For Init/Fini: Kernel Name: %s", i.first.c_str());
     const auto &kernel = i.second;
     if ((kernel->isInitKernel() && kind == kernel_kind_t::InitKernel) ||
         (kernel->isFiniKernel() && kind == kernel_kind_t::FiniKernel)) {
1 change: 1 addition & 0 deletions device/devwavelimiter.cpp
@@ -328,6 +328,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
 // ================================================================================================
 void WaveLimiterManager::enable(bool isSupported) {
   if (fixed_ > 0) {
+    enable_ = GPU_WAVE_LIMIT_ENABLE;
     return;
   }
 
11 changes: 7 additions & 4 deletions device/pal/paldevice.cpp
@@ -108,6 +108,7 @@ static constexpr PalDevice supportedPalDevices[] = {
   {11, 0, 1, Pal::GfxIpLevel::GfxIp11_0, "gfx1101", Pal::AsicRevision::Navi32},
   {11, 0, 2, Pal::GfxIpLevel::GfxIp11_0, "gfx1102", Pal::AsicRevision::Navi33},
   {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix1},
+  {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix2},
 };
 
 static std::tuple<const amd::Isa*, const char*> findIsa(Pal::AsicRevision asicRevision,
@@ -632,7 +633,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
   info_.cooperativeGroups_ = settings().enableCoopGroups_;
   info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_;
 
-  if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) {
+  if (amd::IS_HIP) {
+    info_.largeBar_ = false;
+  } else if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) {
     info_.largeBar_ = true;
     ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Resizable bar enabled");
   }
@@ -2356,8 +2359,8 @@ void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
 }
 
 // ================================================================================================
-void Device::HiddenHeapAlloc() {
-  auto HeapAlloc = [this]() -> bool {
+void Device::HiddenHeapAlloc(const VirtualGPU& gpu) {
+  auto HeapAlloc = [this, &gpu]() -> bool {
     // Allocate initial heap for device memory allocator
     static constexpr size_t HeapBufferSize = 128 * Ki;
     heap_buffer_ = createMemory(HeapBufferSize);
@@ -2369,7 +2372,7 @@ void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
       LogError("Heap buffer allocation failed!");
       return false;
     }
-    bool result = static_cast<const KernelBlitManager&>(xferMgr()).initHeap(
+    bool result = static_cast<const KernelBlitManager&>(gpu.blitMgr()).initHeap(
         heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi));
 
     return result;
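The change above routes heap initialization through the blit manager of the queue that triggered the allocation, rather than the device-wide transfer manager. A rough sketch of that lazy, once-only pattern follows; the types (QueueSketch, BlitSketch) are purely illustrative and the std::call_once guard is only an assumption about how the real code serializes the allocation:

#include <cstddef>
#include <mutex>
#include <vector>

struct BlitSketch {
  // Pretend to initialize the heap buffer through this queue's blit engine.
  bool initHeap(std::vector<char>& heap, std::size_t size) {
    heap.assign(size, 0);
    return true;
  }
};

struct QueueSketch {
  BlitSketch& blitMgr() { return blit_; }
  BlitSketch blit_;
};

class DeviceSketch {
 public:
  // Allocate the hidden heap at most once, using the caller's queue.
  void HiddenHeapAlloc(QueueSketch& gpu) {
    std::call_once(once_, [this, &gpu]() {
      allocated_ = gpu.blitMgr().initHeap(heap_, 128 * 1024);
    });
  }

 private:
  std::once_flag once_;
  std::vector<char> heap_;
  bool allocated_ = false;
};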
2 changes: 1 addition & 1 deletion device/pal/paldevice.hpp
@@ -629,7 +629,7 @@ class Device : public NullDevice {
 #endif
 #endif
   //! Allocates hidden heap for device memory allocations
-  void HiddenHeapAlloc();
+  void HiddenHeapAlloc(const VirtualGPU& gpu);
 
 private:
   static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
45 changes: 38 additions & 7 deletions device/pal/palgpuopen.cpp
@@ -54,9 +54,7 @@ RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device)
       se_mask_(0),
       perf_counter_mem_limit_(0),
       perf_counter_frequency_(0),
-      trace_enabled_(false),
-      inst_tracing_enabled_(false),
-      perf_counters_enabled_(false) {
+      value_(0) {
   memset(&trace_, 0, sizeof(trace_));
 }
 
@@ -176,6 +174,8 @@ bool RgpCaptureMgr::Update(Pal::IPlatform* platform) {
     PostDeviceCreate();
   }
 
+  static_vm_id_ = device_.properties().gfxipProperties.flags.supportStaticVmid;
+
   return result;
 }
 
@@ -189,12 +189,12 @@ bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue,
   // Get the OS context handle for this queue (this is a thing that RGP needs on DX clients;
   // it may be optional for Vulkan, but we provide it anyway if available).
   Pal::KernelContextInfo kernelContextInfo = {};
-
   Pal::Result palResult = iQueue->QueryKernelContextInfo(&kernelContextInfo);
 
   // Ensure we've acquired the debug VMID (note that some platforms do not
   // implement this function, so don't fail the whole trace if so)
   *debug_vmid = kernelContextInfo.flags.hasDebugVmid;
+  assert((static_vm_id_ || *debug_vmid) && "Can't capture multiple queues!");
 
   // Register the queue with the GPA session class for timed queue operation support.
   if (trace_.gpa_session_->RegisterTimedQueue(
@@ -278,6 +278,21 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) {
         // continue until we find the right queue...
       } else if (Pal::Result::Success == res) {
         trace_.sqtt_disp_count_ = 0;
+        // Stop the trace and save the result. Currently runtime can't delay upload in HIP,
+        // because default stream doesn't have explicit destruction and
+        // OS kills all threads on exit without any notification. That includes PAL RGP threads.
+        {
+          if (trace_.status_ == TraceStatus::WaitingForSqtt) {
+            auto result = EndRGPTrace(gpu);
+          }
+          // Check if runtime is waiting for the final trace results
+          if (trace_.status_ == TraceStatus::WaitingForResults) {
+            // If results are ready, then finish the trace
+            if (CheckForTraceResults() == Pal::Result::Success) {
+              FinishRGPTrace(gpu, false);
+            }
+          }
+        }
       } else {
         FinishRGPTrace(gpu, true);
       }
@@ -517,11 +532,17 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) {
     }
   }
 
-  // Notify the RGP server that we are starting a trace
-  if (rgp_server_->BeginTrace() != DevDriver::Result::Success) {
-    result = Pal::Result::ErrorUnknown;
+  if (static_vm_id_) {
+    result = device_.iDev()->SetStaticVmidMode(true);
+    assert(result == Pal::Result::Success && "Static VM ID setup failed!");
   }
 
+  if (result == Pal::Result::Success) {
+    // Notify the RGP server that we are starting a trace
+    if (rgp_server_->BeginTrace() != DevDriver::Result::Success) {
+      result = Pal::Result::ErrorUnknown;
+    }
+  }
   // Tell the GPA session class we're starting a trace
   if (result == Pal::Result::Success) {
     GpuUtil::GpaSessionBeginInfo info = {};
@@ -707,6 +728,7 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) {
     return;
   }
 
+  auto disp_count = trace_.sqtt_disp_count_;
   // Finish the trace if the queue was destroyed before OCL reached
   // the number of captured dispatches
   if (trace_.sqtt_disp_count_ != 0) {
@@ -736,9 +758,18 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) {
   } else {
     rgp_server_->EndTrace();
   }
+
+  if (static_vm_id_) {
+    auto result = device_.iDev()->SetStaticVmidMode(false);
+    assert(result == Pal::Result::Success && "Static VM ID setup failed!");
+  }
+
   if (trace_.gpa_session_ != nullptr) {
     trace_.gpa_session_->Reset();
   }
+  // If applicaiton exits, then Windows kills all threads and
+  // RGP can't finish data write into a file.
+  amd::Os::sleep(10 * disp_count + 500);
   // Reset tracing state to idle
   trace_.prepared_disp_count_ = 0;
   trace_.sqtt_disp_count_ = 0;
7 changes: 4 additions & 3 deletions device/pal/palgpuopen.hpp
@@ -410,9 +410,10 @@ class RgpCaptureMgr {
 
   union {
     struct {
-      uint32_t trace_enabled_ : 1;      // True if tracing is currently enabled (master flag)
-      uint32_t inst_tracing_enabled_;   // Enable instruction-level SQTT tokens
-      uint32_t perf_counters_enabled_;  // True if perf counters are enabled
+      uint32_t trace_enabled_: 1;         // True if tracing is currently enabled (master flag)
+      uint32_t inst_tracing_enabled_: 1;  // Enable instruction-level SQTT tokens
+      uint32_t perf_counters_enabled_: 1; // True if perf counters are enabled
+      uint32_t static_vm_id_: 1;          // Static VM ID can be used for capture
     };
     uint32_t value_;
  };
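The missing ": 1" widths are the notable part of this hunk: without them the middle members occupy full 32-bit words, so the anonymous struct no longer overlays the single uint32_t value_ that the constructor now zeroes with value_(0). A standalone sketch (not the ROCclr header) showing the packed layout:

#include <cstdint>

// Mirrors the anonymous-struct-in-union pattern used above (a common
// compiler extension accepted by GCC, Clang, and MSVC).
union Flags {
  struct {
    uint32_t trace_enabled : 1;
    uint32_t inst_tracing_enabled : 1;
    uint32_t perf_counters_enabled : 1;
    uint32_t static_vm_id : 1;
  };
  uint32_t value;
};

// With explicit 1-bit widths, all four flags share one 32-bit word,
// so writing value = 0 clears every flag at once.
static_assert(sizeof(Flags) == sizeof(uint32_t), "flags pack into one word");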
2 changes: 1 addition & 1 deletion device/pal/palkernel.cpp
@@ -362,7 +362,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
       case amd::KernelParameterDescriptor::HiddenHeap:
         // Allocate hidden heap for HIP applications only
         if ((amd::IS_HIP) && (palDevice().HeapBuffer() == nullptr)) {
-          const_cast<Device&>(palDevice()).HiddenHeapAlloc();
+          const_cast<Device&>(palDevice()).HiddenHeapAlloc(gpu);
         }
         if (palDevice().HeapBuffer() != nullptr) {
           // Add heap pointer to the code
6 changes: 4 additions & 2 deletions device/pal/palsettings.cpp
@@ -78,9 +78,10 @@ Settings::Settings() {
 
   // By default use host blit
   blitEngine_ = BlitEngineHost;
-  pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi;
+  pinnedXferSize_ = GPU_PINNED_XFER_SIZE * Mi;
+  size_t defaultMinXferSize = amd::IS_HIP ? 128: 4;
   pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)
-                           ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi;
+                           ? defaultMinXferSize * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi;
 
   // Disable FP_FAST_FMA defines by default
   reportFMAF_ = false;
@@ -205,6 +206,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
     case Pal::AsicRevision::Navi31:
     // Fall through for Navi2x ...
     case Pal::AsicRevision::Phoenix1:
+    case Pal::AsicRevision::Phoenix2:
     case Pal::AsicRevision::Raphael:
     case Pal::AsicRevision::Rembrandt:
     case Pal::AsicRevision::Navi24: