From 30f912cbd33ecf7c539db0c334320633d927cd1d Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Fri, 20 Jan 2023 03:00:09 -0500 Subject: [PATCH 01/45] SWDEV-2 - Change OpenCL version number from 3558 to 3559 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index 2a85df5e..dde35676 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3558 +#define AMD_PLATFORM_BUILD_NUMBER 3559 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 3b1753706b08fd8e17f986d56b216e908e7ed972 Mon Sep 17 00:00:00 2001 From: German Date: Thu, 19 Jan 2023 18:53:11 -0500 Subject: [PATCH 02/45] SWDEV-377991 - Remove liquidflash support Change-Id: Iba6455e5c0210c3223a06fec332404cd9f489154 --- cmake/ROCclr.cmake | 1 - device/device.hpp | 6 ---- device/devwavelimiter.cpp | 1 + device/pal/palkernel.cpp | 1 + device/pal/palvirtual.cpp | 44 +------------------------- device/pal/palvirtual.hpp | 1 - device/rocm/rocvirtual.cpp | 49 ++-------------------------- device/rocm/rocvirtual.hpp | 2 -- platform/command.cpp | 65 -------------------------------------- platform/command.hpp | 60 ----------------------------------- platform/context.cpp | 16 ---------- platform/memory.hpp | 36 --------------------- platform/object.hpp | 5 +-- 13 files changed, 6 insertions(+), 281 deletions(-) diff --git a/cmake/ROCclr.cmake b/cmake/ROCclr.cmake index 4d8f6a25..c7795ba8 100644 --- a/cmake/ROCclr.cmake +++ b/cmake/ROCclr.cmake @@ -100,7 +100,6 @@ endif() target_compile_definitions(rocclr PUBLIC LITTLEENDIAN_CPU - WITH_LIQUID_FLASH=0 ${AMD_OPENCL_DEFS}) target_include_directories(rocclr PUBLIC diff --git a/device/device.hpp b/device/device.hpp index 64d67465..5aad54bb 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -89,7 +89,6 @@ class SvmFillMemoryCommand; class SvmMapMemoryCommand; 
class SvmUnmapMemoryCommand; class SvmPrefetchAsyncCommand; -class TransferBufferFileCommand; class StreamOperationCommand; class VirtualMapCommand; class ExternalSemaphoreCmd; @@ -164,7 +163,6 @@ enum OclExtensions { ClKhrMipMapImage, ClKhrMipMapImageWrites, ClKhrIlProgram, - ClAMDLiquidFlash, ClAmdCopyBufferP2P, ClAmdAssemblyProgram, #if defined(_WIN32) @@ -210,7 +208,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ", "cl_khr_mipmap_image ", "cl_khr_mipmap_image_writes ", "", - "cl_amd_liquid_flash ", "cl_amd_copy_buffer_p2p ", "cl_amd_assembly_program ", #if defined(_WIN32) @@ -1241,9 +1238,6 @@ class VirtualDevice : public amd::HeapObject { /// Optional extensions virtual void submitSignal(amd::SignalCommand& cmd) = 0; virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) = 0; - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - ShouldNotReachHere(); - } virtual void submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { ShouldNotReachHere(); } diff --git a/device/devwavelimiter.cpp b/device/devwavelimiter.cpp index beec1bce..deaadd98 100644 --- a/device/devwavelimiter.cpp +++ b/device/devwavelimiter.cpp @@ -328,6 +328,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( // ================================================================================================ void WaveLimiterManager::enable(bool isSupported) { if (fixed_ > 0) { + enable_ = GPU_WAVE_LIMIT_ENABLE; return; } diff --git a/device/pal/palkernel.cpp b/device/pal/palkernel.cpp index 19c6d9fe..e6e4fd28 100644 --- a/device/pal/palkernel.cpp +++ b/device/pal/palkernel.cpp @@ -498,6 +498,7 @@ const LightningProgram& LightningKernel::prog() const { #if defined(USE_COMGR_LIBRARY) bool LightningKernel::init() { + waveLimiter_.enable(); return GetAttrCodePropMetadata(); } diff --git a/device/pal/palvirtual.cpp b/device/pal/palvirtual.cpp index 64b5bfa6..e7465c8b 100644 --- a/device/pal/palvirtual.cpp 
+++ b/device/pal/palvirtual.cpp @@ -3741,49 +3741,7 @@ bool VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) { return false; } -void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - size_t copySize = cmd.size()[0]; - size_t fileOffset = cmd.fileOffset(); - Memory* mem = dev().getGpuMemory(&cmd.memory()); - uint idx = 0; - - assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || - (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); - const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); - - if (writeBuffer) { - size_t dstOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; - dstSize = std::min(dstSize, copySize); - void* dstBuffer = staging->cpuMap(*this); - staging->cpuUnmap(*this); - - blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false); - flushDMA(staging->getGpuEvent(*this)->engineId_); - fileOffset += dstSize; - dstOffset += dstSize; - copySize -= dstSize; - } - } else { - size_t srcOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; - srcSize = std::min(srcSize, copySize); - blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false); - - void* srcBuffer = staging->cpuMap(*this); - staging->cpuUnmap(*this); - - fileOffset += srcSize; - srcOffset += srcSize; - copySize -= srcSize; - } - } -} - +// ================================================================================================ void* VirtualGPU::getOrCreateHostcallBuffer() { if (hostcallBuffer_ != nullptr) { return hostcallBuffer_; diff --git a/device/pal/palvirtual.hpp b/device/pal/palvirtual.hpp index 1bbfe8e9..86240649 100644 --- a/device/pal/palvirtual.hpp +++ b/device/pal/palvirtual.hpp @@ -345,7 +345,6 @@ class VirtualGPU : public device::VirtualDevice 
{ virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); virtual void submitVirtualMap(amd::VirtualMapCommand& cmd); virtual void submitStreamOperation(amd::StreamOperationCommand& cmd); void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd); diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 56b6022f..7747d147 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -3353,55 +3353,10 @@ amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) { return nullptr; } +// ================================================================================================ void VirtualGPU::enableSyncBlit() const { blitMgr_->enableSynchronization(); } -void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - size_t copySize = cmd.size()[0]; - size_t fileOffset = cmd.fileOffset(); - Memory* mem = dev().getRocMemory(&cmd.memory()); - uint idx = 0; - - assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || - (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); - const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); - - if (writeBuffer) { - size_t dstOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getRocMemory(&cmd.staging(idx)); - size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; - dstSize = std::min(dstSize, copySize); - void* dstBuffer = staging->cpuMap(*this); - - staging->cpuUnmap(*this); - - bool result = blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false); - fileOffset += dstSize; - dstOffset += dstSize; - copySize -= dstSize; - } - } else { - size_t srcOffset = cmd.origin()[0]; - while 
(copySize > 0) { - Memory* staging = dev().getRocMemory(&cmd.staging(idx)); - size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; - srcSize = std::min(srcSize, copySize); - bool result = blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false); - - void* srcBuffer = staging->cpuMap(*this); - - staging->cpuUnmap(*this); - - fileOffset += srcSize; - srcOffset += srcSize; - copySize -= srcSize; - } - } -} - +// ================================================================================================ void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index 11a3670e..bf0bb4ce 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -341,8 +341,6 @@ class VirtualGPU : public device::VirtualDevice { virtual void submitSignal(amd::SignalCommand& cmd) {} virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {} - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); - void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {} void submitThreadTrace(amd::ThreadTraceCommand& vcmd) {} diff --git a/platform/command.cpp b/platform/command.cpp index cc35579a..e02cb153 100644 --- a/platform/command.cpp +++ b/platform/command.cpp @@ -726,71 +726,6 @@ bool ThreadTraceMemObjectsCommand::validateMemory() { return true; } -void TransferBufferFileCommand::releaseResources() { - for (uint i = 0; i < NumStagingBuffers; ++i) { - if (NULL != staging_[i]) { - staging_[i]->release(); - } - } - - // Call the parent - OneMemoryArgCommand::releaseResources(); -} - -void TransferBufferFileCommand::submit(device::VirtualDevice& device) { - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (memory_->getMemFlags() & - (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | 
CL_MEM_USE_PERSISTENT_MEM_AMD)) { - void* srcDstBuffer = nullptr; - if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Lock protected multiple maps for persistent memory - amd::ScopedLock lock(mem->owner()->lockMemoryOps()); - srcDstBuffer = mem->cpuMap(device); - } else { - srcDstBuffer = mem->cpuMap(device); - } - // Make HD transfer to the host accessible memory - bool writeBuffer(type() == CL_COMMAND_READ_SSG_FILE_AMD); - if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Lock protected multiple maps for persistent memory - amd::ScopedLock lock(mem->owner()->lockMemoryOps()); - mem->cpuUnmap(device); - } else { - mem->cpuUnmap(device); - } - } else { - device.submitTransferBufferFromFile(*this); - } -} - -bool TransferBufferFileCommand::validateMemory() { - // Check if the destination buffer has direct host access - if (!(memory_->getMemFlags() & - (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD))) { - // Allocate staging buffers - for (uint i = 0; i < NumStagingBuffers; ++i) { - staging_[i] = new (memory_->getContext()) - Buffer(memory_->getContext(), StagingBufferMemType, StagingBufferSize); - if (NULL == staging_[i] || !staging_[i]->create(nullptr)) { - DevLogPrintfError("Staging Create failed, Staging[%d]: 0x%x", i, staging_[i]); - return false; - } - device::Memory* mem = staging_[i]->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate staging buffer - 0x%08X bytes!", staging_[i]->getSize()); - return false; - } - } - } - - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory_->getSize()); - return false; - } - return true; -} - bool CopyMemoryP2PCommand::validateMemory() { amd::Device* queue_device = &queue()->device(); diff --git a/platform/command.hpp b/platform/command.hpp index 7022ec3d..45d9e365 100644 --- a/platform/command.hpp +++ 
b/platform/command.hpp @@ -1642,66 +1642,6 @@ class SvmUnmapMemoryCommand : public Command { void* svmPtr() const { return svmPtr_; } }; -/*! \brief A generic transfer memory from/to file command. - * - * \details Currently supports buffers only. Buffers - * are treated as 1D structures so origin_[0] and size_[0] - * are equivalent to offset_ and count_ respectively. - */ -class TransferBufferFileCommand : public OneMemoryArgCommand { - public: - static constexpr uint NumStagingBuffers = 2; - static constexpr size_t StagingBufferSize = 4 * Mi; - static constexpr uint StagingBufferMemType = CL_MEM_USE_PERSISTENT_MEM_AMD; - - protected: - const Coord3D origin_; //!< Origin of the region to write to - const Coord3D size_; //!< Size of the region to write to - LiquidFlashFile* file_; //!< The file object for data read - size_t fileOffset_; //!< Offset in the file for data read - amd::Memory* staging_[NumStagingBuffers]; //!< Staging buffers for transfer - - public: - TransferBufferFileCommand(cl_command_type type, HostQueue& queue, - const EventWaitList& eventWaitList, Memory& memory, - const Coord3D& origin, const Coord3D& size, LiquidFlashFile* file, - size_t fileOffset) - : OneMemoryArgCommand(queue, type, eventWaitList, memory), - origin_(origin), - size_(size), - file_(file), - fileOffset_(fileOffset) { - // Sanity checks - assert(size.c[0] > 0 && "invalid"); - for (uint i = 0; i < NumStagingBuffers; ++i) { - staging_[i] = NULL; - } - } - - virtual void releaseResources(); - - virtual void submit(device::VirtualDevice& device); - - //! Return the memory object to write to - Memory& memory() const { return *memory_; } - - //! Return the host memory to read from - LiquidFlashFile* file() const { return file_; } - - //! Returns file offset - size_t fileOffset() const { return fileOffset_; } - - //! Return the region origin - const Coord3D& origin() const { return origin_; } - //! Return the region size - const Coord3D& size() const { return size_; } - - //! 
Return the staging buffer for transfer - Memory& staging(uint i) const { return *staging_[i]; } - - bool validateMemory(); -}; - /*! \brief A P2P copy memory command * * \details Used for buffers only. Backends are expected diff --git a/platform/context.cpp b/platform/context.cpp index dbb46c38..965269e8 100644 --- a/platform/context.cpp +++ b/platform/context.cpp @@ -34,14 +34,6 @@ #include "CL/cl_dx9_media_sharing.h" #endif //_WIN32 -#ifndef WITH_LIQUID_FLASH -#if (!defined(BUILD_HSA_TARGET) && defined(WITH_HSA_DEVICE) && \ - defined(WITH_AMDGPU_PRO)) || defined(_WIN32) -#define WITH_LIQUID_FLASH 1 -#include "lf.h" -#endif -#endif - namespace amd { Context::Context(const std::vector& devices, const Info& info) @@ -91,10 +83,6 @@ Context::~Context() { delete[] properties_; delete glenv_; - -#if WITH_LIQUID_FLASH - lfTerminate(); -#endif } int Context::checkProperties(const cl_context_properties* properties, Context::Info* info) { @@ -315,10 +303,6 @@ int Context::create(const intptr_t* properties) { } } -#if WITH_LIQUID_FLASH - lfInit(); -#endif - return result; } diff --git a/platform/memory.hpp b/platform/memory.hpp index 34ea71f8..0b6ace9f 100644 --- a/platform/memory.hpp +++ b/platform/memory.hpp @@ -663,42 +663,6 @@ class SvmBuffer : AllStatic { static Monitor AllocatedLock_; }; -#ifndef CL_COMMAND_WRITE_SSG_FILE_AMD -#define CL_COMMAND_WRITE_SSG_FILE_AMD 2 -#endif -#ifndef CL_COMMAND_READ_SSG_FILE_AMD -#define CL_COMMAND_READ_SSG_FILE_AMD 1 -#endif -#ifndef cl_file_flags_amd -typedef uint32_t cl_file_flags_amd; -#endif - //! 
Liquid flash extension -class LiquidFlashFile : public RuntimeObject { - private: - std::wstring name_; - cl_file_flags_amd flags_; - void* handle_; - uint32_t blockSize_; - uint64_t fileSize_; - - public: - LiquidFlashFile(const wchar_t* name, cl_file_flags_amd flags) - : name_(name), flags_(flags), handle_(NULL), blockSize_(0), fileSize_(0) {} - - ~LiquidFlashFile(); - - bool open(); - void close(); - - uint32_t blockSize() const { return blockSize_; }; - uint64_t fileSize() const { return fileSize_; }; - - bool transferBlock(bool read, void* dst, uint64_t bufferSize, uint64_t fileOffset, - uint64_t bufferOffset, uint64_t size) const; - - virtual ObjectType objectType() const { return ObjectTypeLiquidFlashFile; } -}; - class ArenaMemory: public Buffer { public: ArenaMemory(Context& context) diff --git a/platform/object.hpp b/platform/object.hpp index 9fca95ab..0f5ed636 100644 --- a/platform/object.hpp +++ b/platform/object.hpp @@ -41,9 +41,7 @@ #define AMD_CL_TYPES_DO(F) \ F(cl_counter_amd, Counter) \ F(cl_perfcounter_amd, PerfCounter) \ - F(cl_threadtrace_amd, ThreadTrace) \ - F(cl_file_amd, LiquidFlashFile) - + F(cl_threadtrace_amd, ThreadTrace) #define CL_TYPES_DO(F) \ KHR_CL_TYPES_DO(F) \ @@ -144,7 +142,6 @@ class RuntimeObject : public ReferenceCountedObject, public ICDDispatchedObject ObjectTypeQueue = 8, ObjectTypeSampler = 9, ObjectTypeThreadTrace = 10, - ObjectTypeLiquidFlashFile = 11 }; virtual ObjectType objectType() const = 0; From 2448d5d31cacd9facaadf37a6896b036f654afb1 Mon Sep 17 00:00:00 2001 From: Jacob Lambert Date: Mon, 16 Jan 2023 19:40:56 -0800 Subject: [PATCH 03/45] SWDEV-376413 - Replace deprecated Comgr device-lib action The Comgr ADD_DEVICE_LIBRARIES action has been deprecated. 
In place of the previous two-action approach: AMD_COMGR_COMPILE_SOURCE_TO_BC AMD_COMGR_ADD_DEVICE_LIBRARIES We can now use a single combined action: AMD_COMGR_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC This new action more closely aligns with how device library management is done by the clang driver. Change-Id: Id844e9031a1896dedeacec453440b9babc4b111a --- device/devprogram.cpp | 37 +++++++++++-------------------------- device/devprogram.hpp | 6 +++--- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/device/devprogram.cpp b/device/devprogram.cpp index 5455d80c..6e4287b5 100644 --- a/device/devprogram.cpp +++ b/device/devprogram.cpp @@ -346,7 +346,7 @@ amd_comgr_status_t Program::createAction(const amd_comgr_language_t oclver, bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, const std::vector& options, amd::option::Options* amdOptions, amd_comgr_data_set_t* output, - char* binaryData[], size_t* binarySize, const bool link_dev_libs) { + char* binaryData[], size_t* binarySize) { amd_comgr_language_t langver; setLanguage(amdOptions->oVariables->CLStd, &langver); @@ -358,28 +358,13 @@ bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, // Create the action for linking amd_comgr_action_info_t action; - amd_comgr_data_set_t dataSetDevLibs; bool hasAction = false; - bool hasDataSetDevLibs = false; amd_comgr_status_t status = createAction(langver, options, &action, &hasAction); - if (link_dev_libs) { - if (status == AMD_COMGR_STATUS_SUCCESS) { - status = amd::Comgr::create_data_set(&dataSetDevLibs); - } - - if (status == AMD_COMGR_STATUS_SUCCESS) { - hasDataSetDevLibs = true; - status = amd::Comgr::do_action(AMD_COMGR_ACTION_ADD_DEVICE_LIBRARIES, action, inputs, - dataSetDevLibs); - extractBuildLog(dataSetDevLibs); - } - } - if (status == AMD_COMGR_STATUS_SUCCESS) { status = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, - (link_dev_libs) ? 
dataSetDevLibs : inputs, *output); + inputs, *output); extractBuildLog(*output); } @@ -396,17 +381,14 @@ bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, amd::Comgr::destroy_action_info(action); } - if (hasDataSetDevLibs) { - amd::Comgr::destroy_data_set(dataSetDevLibs); - } - return (status == AMD_COMGR_STATUS_SUCCESS); } bool Program::compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, const std::vector& options, amd::option::Options* amdOptions, - char* binaryData[], size_t* binarySize) { + char* binaryData[], size_t* binarySize, + const bool link_dev_libs) { amd_comgr_language_t langver; setLanguage(amdOptions->oVariables->CLStd, &langver); @@ -483,8 +465,12 @@ bool Program::compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, // Compiling the source codes with precompiled headers or directly compileInputs if (status == AMD_COMGR_STATUS_SUCCESS) { - status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, - action, input, output); + if (link_dev_libs) + status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, + action, input, output); + else + status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, + action, input, output); extractBuildLog(output); } @@ -970,9 +956,8 @@ bool Program::linkImplLC(const std::vector& inputPrograms, char* binaryData = nullptr; size_t binarySize = 0; std::vector linkOptions; - constexpr bool kLinkDevLibs = false; bool ret = linkLLVMBitcode(inputs, linkOptions, options, &output, &binaryData, - &binarySize, kLinkDevLibs); + &binarySize); amd::Comgr::destroy_data_set(output); amd::Comgr::destroy_data_set(inputs); diff --git a/device/devprogram.hpp b/device/devprogram.hpp index 3bf0679a..a324750e 100644 --- a/device/devprogram.hpp +++ b/device/devprogram.hpp @@ -456,13 +456,13 @@ class Program : public amd::HeapObject { bool linkLLVMBitcode(const amd_comgr_data_set_t inputs, const std::vector& options, amd::option::Options* amdOptions, 
amd_comgr_data_set_t* output, - char* binaryData[] = nullptr, size_t* binarySize = nullptr, - const bool link_dev_libs = true); + char* binaryData[] = nullptr, size_t* binarySize = nullptr); //! Create the bitcode of the compiled input dataset bool compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, const std::vector& options, amd::option::Options* amdOptions, - char* binaryData[], size_t* binarySize); + char* binaryData[], size_t* binarySize, + const bool link_dev_libs = true); //! Compile and create the excutable of the input dataset bool compileAndLinkExecutable(const amd_comgr_data_set_t inputs, From 6a671455e0f8404b43054f074e0c46e68a53b43e Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Sat, 21 Jan 2023 03:00:06 -0500 Subject: [PATCH 04/45] SWDEV-2 - Change OpenCL version number from 3559 to 3560 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index dde35676..d724d3ca 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3559 +#define AMD_PLATFORM_BUILD_NUMBER 3560 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 945467e0e40d88b5294b8a634b7d8d67f1b03d39 Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Wed, 18 Jan 2023 13:25:07 -0800 Subject: [PATCH 05/45] SWDEV-378006 - Adding a new guarantee macro to support printing args. 
Change-Id: I9c838644e31a84d96a44b2bd10525a08d805a047 --- device/blit.cpp | 10 ++++++---- device/devhostcall.cpp | 6 ++---- device/device.cpp | 14 ++++---------- utils/debug.cpp | 11 ----------- utils/debug.hpp | 9 +++------ 5 files changed, 15 insertions(+), 35 deletions(-) diff --git a/device/blit.cpp b/device/blit.cpp index 3c6b5dbf..a2c25376 100644 --- a/device/blit.cpp +++ b/device/blit.cpp @@ -729,14 +729,16 @@ bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, siz std::vector& packed_info) { // 1. Validate input arguments - guarantee(fill_size >= pattern_size, "Pattern Size cannot be greater than fill size"); - guarantee(fill_size <= memory.size(), "Cannot fill more than the mem object size"); + guarantee(fill_size >= pattern_size, "Pattern Size: %u cannot be greater than fill size: %u \n", + pattern_size, fill_size); + guarantee(fill_size <= memory.size(), "Cannot fill: %u more than the mem object size:%u \n", + fill_size, memory.size()); // 2. Calculate the next closest dword aligned address for faster processing size_t dst_addr = memory.virtualAddress() + fill_origin; size_t aligned_dst_addr = amd::alignUp(dst_addr, sizeof(size_t)); - guarantee(aligned_dst_addr >= dst_addr, "Aligned address cannot be greater than destination" - "address"); + guarantee(aligned_dst_addr >= dst_addr, "Aligned address: %u cannot be greater than destination" + "address :%u \n", aligned_dst_addr, dst_addr); // 3. If given address is not aligned calculate head and tail size. 
size_t head_size = std::min(aligned_dst_addr - dst_addr, fill_size); diff --git a/device/devhostcall.cpp b/device/devhostcall.cpp index 8226e3da..12e5fe90 100644 --- a/device/devhostcall.cpp +++ b/device/devhostcall.cpp @@ -84,7 +84,7 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t* if (!messages.handlePayload(service, payload)) { ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: invalid request for service \"%d\".", service); - amd::report_fatal(__FILE__, __LINE__, "Hostcall: invalid service request."); + guarantee(false, "Hostcall: invalid service request %d \n", service); } return; case SERVICE_DEVMEM: { @@ -114,9 +114,7 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t* return; } default: - ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: no handler found for service ID \"%d\".", - service); - amd::report_fatal(__FILE__, __LINE__, "Hostcall service not supported."); + guarantee(false, "Hostcall: no handler found for service ID %d \n", service); return; } } diff --git a/device/device.cpp b/device/device.cpp index 2709febd..f87452db 100644 --- a/device/device.cpp +++ b/device/device.cpp @@ -289,11 +289,8 @@ void MemObjMap::AddMemObj(const void* k, amd::Memory* v) { void MemObjMap::RemoveMemObj(const void* k) { amd::ScopedLock lock(AllocatedLock_); auto rval = MemObjMap_.erase(reinterpret_cast(k)); - if (rval != 1) { - DevLogPrintfError("Memobj map does not have ptr: 0x%x", - reinterpret_cast(k)); - guarantee(false, "Memobj map does not have ptr"); - } + guarantee(rval == 1, "Memobj map does not have ptr: 0x%x", + reinterpret_cast(k)); } amd::Memory* MemObjMap::FindMemObj(const void* k, size_t* offset) { @@ -328,11 +325,8 @@ void MemObjMap::AddVirtualMemObj(const void* k, amd::Memory* v) { void MemObjMap::RemoveVirtualMemObj(const void* k) { amd::ScopedLock lock(AllocatedLock_); auto rval = VirtualMemObjMap_.erase(reinterpret_cast(k)); - if (rval != 1) { - DevLogPrintfError("Virtual Memobj map 
does not have ptr: 0x%x", - reinterpret_cast(k)); - guarantee(false, "VirtualMemobj map does not have ptr"); - } + guarantee(rval == 1, "Virtual Memobj map does not have ptr: 0x%x", + reinterpret_cast(k)); } amd::Memory* MemObjMap::FindVirtualMemObj(const void* k) { diff --git a/utils/debug.cpp b/utils/debug.cpp index fc607564..cdda2516 100644 --- a/utils/debug.cpp +++ b/utils/debug.cpp @@ -49,17 +49,6 @@ extern "C" void breakpoint(void) { } //! \endcond -// ================================================================================================ -void report_fatal(const char* file, int line, const char* message) { - // FIXME_lmoriche: Obfuscate the message string - #if (defined(DEBUG)) - fprintf(outFile, "%s:%d: %s\n", file, line, message); - #else - fprintf(outFile, "%s\n", message); - #endif - ::abort(); -} - // ================================================================================================ void report_warning(const char* message) { fprintf(outFile, "Warning: %s\n", message); } diff --git a/utils/debug.hpp b/utils/debug.hpp index dd3ffd6e..07cb3b61 100644 --- a/utils/debug.hpp +++ b/utils/debug.hpp @@ -66,9 +66,6 @@ extern FILE* outFile; extern "C" void breakpoint(); //! \endcond -//! \brief Report a Fatal exception message and abort. -extern void report_fatal(const char* file, int line, const char* message); - //! \brief Display a warning message. extern void report_warning(const char* message); @@ -94,10 +91,10 @@ extern void log_printf(LogLevel level, const char* file, int line, uint64_t *sta #endif // __INTEL_COMPILER //! \brief Abort the program if the invariant \a cond is false. -#define guarantee(cond, message) \ +#define guarantee(cond, format, ...) \ if (!(cond)) { \ - amd::report_fatal(__FILE__, __LINE__, XSTR(message) ); \ - amd::breakpoint(); \ + amd::log_printf(amd::LOG_NONE, __FILE__, __LINE__, format, ##__VA_ARGS__); \ + ::abort(); \ } #define fixme_guarantee(cond, ...) 
guarantee(cond, __VA_ARGS__) From d31d50f4ed98e2b406dbe02e9a1f906994717204 Mon Sep 17 00:00:00 2001 From: German Date: Fri, 20 Jan 2023 10:26:53 -0500 Subject: [PATCH 06/45] SWDEV-377991 - Remove liquidflash support Remove amdgpu-pro interface for persistent memory, used in Liquidflash Change-Id: I7d1720ad0875a62ebb2d7f96cba39601d560a5df --- device/device.hpp | 4 - device/rocm/pro/lnxheaders.h | 46 ------- device/rocm/pro/prodevice.cpp | 241 ---------------------------------- device/rocm/pro/prodevice.hpp | 81 ------------ device/rocm/pro/prodriver.hpp | 52 -------- device/rocm/pro/profuncs.hpp | 85 ------------ device/rocm/rocdevice.cpp | 20 --- device/rocm/rocdevice.hpp | 7 - device/rocm/rocmemory.cpp | 9 -- 9 files changed, 545 deletions(-) delete mode 100644 device/rocm/pro/lnxheaders.h delete mode 100644 device/rocm/pro/prodevice.cpp delete mode 100644 device/rocm/pro/prodevice.hpp delete mode 100644 device/rocm/pro/prodriver.hpp delete mode 100644 device/rocm/pro/profuncs.hpp diff --git a/device/device.hpp b/device/device.hpp index 5aad54bb..a493795e 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -153,7 +153,6 @@ enum OclExtensions { ClKhrD3d9Sharing, #endif ClKhrImage2dFromBuffer, - ClAmdSemaphore, ClAMDBusAddressableMemory, ClAMDC11Atomics, ClKhrSpir, @@ -162,7 +161,6 @@ enum OclExtensions { ClKhrDepthImages, ClKhrMipMapImage, ClKhrMipMapImageWrites, - ClKhrIlProgram, ClAmdCopyBufferP2P, ClAmdAssemblyProgram, #if defined(_WIN32) @@ -198,7 +196,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ", "cl_khr_dx9_media_sharing ", #endif "cl_khr_image2d_from_buffer ", - "", "cl_amd_bus_addressable_memory ", "cl_amd_c11_atomics ", "cl_khr_spir ", @@ -207,7 +204,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ", "cl_khr_depth_images ", "cl_khr_mipmap_image ", "cl_khr_mipmap_image_writes ", - "", "cl_amd_copy_buffer_p2p ", "cl_amd_assembly_program ", #if defined(_WIN32) diff --git a/device/rocm/pro/lnxheaders.h 
b/device/rocm/pro/lnxheaders.h deleted file mode 100644 index 8c9fbe14..00000000 --- a/device/rocm/pro/lnxheaders.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -// NOTE: Some of the Linux driver stack's headers don't wrap their C-style interface names in 'extern "C" { ... }' -// blocks when building with a C++ compiler, so we need to add that ourselves. 
-#if __cplusplus -extern "C" -{ -#endif - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -constexpr int32_t InvalidFd = -1; // value representing a invalid file descriptor for Linux - -#if __cplusplus -} // extern "C" -#endif diff --git a/device/rocm/pro/prodevice.cpp b/device/rocm/pro/prodevice.cpp deleted file mode 100644 index 1e366cd2..00000000 --- a/device/rocm/pro/prodevice.cpp +++ /dev/null @@ -1,241 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
*/ - -#ifndef WITHOUT_HSA_BACKEND - -#include "hsa/hsa_ext_amd.h" -#include "lnxheaders.h" -#include "prodevice.hpp" -#include "amdgpu_drm.h" - -namespace roc { - -constexpr uint32_t kMaxDevices = 32; -constexpr uint32_t kAtiVendorId = 0x1002; - -void* ProDevice::lib_drm_handle_ = nullptr; -bool ProDevice::initialized_ = false; -drm::Funcs ProDevice::funcs_; - -IProDevice* IProDevice::Init(uint32_t bus, uint32_t device, uint32_t func) -{ - // Make sure DRM lib is initialized - if (!ProDevice::DrmInit()) { - return nullptr; - } - - ProDevice* pro_device = new ProDevice(); - - if (pro_device == nullptr || !pro_device->Create(bus, dev, func)) { - delete pro_device; - return nullptr; - } - return pro_device; -} - -ProDevice::~ProDevice() { - delete alloc_ops_; - - if (dev_handle_ != nullptr) { - Funcs().AmdgpuDeviceDeinitialize(dev_handle_); - } - if (file_desc_ > 0) { - close(file_desc_); - } -} - -bool ProDevice::DrmInit() -{ - if (initialized_ == false) { - // Find symbols in libdrm_amdgpu.so.1 - lib_drm_handle_ = dlopen("libdrm_amdgpu.so.1", RTLD_NOW); - if (lib_drm_handle_ == nullptr) { - return false; - } else { - funcs_.DrmGetDevices = reinterpret_cast(dlsym( - lib_drm_handle_, - "drmGetDevices")); - if (funcs_.DrmGetDevices == nullptr) return false; - funcs_.AmdgpuDeviceInitialize = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_device_initialize")); - if (funcs_.AmdgpuDeviceInitialize == nullptr) return false; - funcs_.AmdgpuDeviceDeinitialize = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_device_deinitialize")); - if (funcs_.AmdgpuDeviceDeinitialize == nullptr) return false; - funcs_.AmdgpuQueryGpuInfo = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_query_gpu_info")); - if (funcs_.AmdgpuQueryGpuInfo == nullptr) return false; - funcs_.AmdgpuQueryInfo = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_query_info")); - if (funcs_.AmdgpuQueryInfo == nullptr) return false; - funcs_.AmdgpuBoAlloc = reinterpret_cast(dlsym( - lib_drm_handle_, 
- "amdgpu_bo_alloc")); - if (funcs_.AmdgpuBoAlloc == nullptr) return false; - funcs_.AmdgpuBoExport = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_export")); - if (funcs_.AmdgpuBoExport == nullptr) return false; - funcs_.AmdgpuBoFree = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_free")); - if (funcs_.AmdgpuBoFree == nullptr) return false; - funcs_.AmdgpuBoCpuMap = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_cpu_map")); - if (funcs_.AmdgpuBoCpuMap == nullptr) return false; - funcs_.AmdgpuBoCpuUnmap = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_cpu_unmap")); - if (funcs_.AmdgpuBoCpuUnmap == nullptr) return false; - } - } - - initialized_ = true; - return true; -} - -#ifndef AMDGPU_CAPABILITY_SSG_FLAG -#define AMDGPU_CAPABILITY_SSG_FLAG 4 -#endif - -// ================================================================================================ -// Open drm device and initialize it. And also get the drm information. -bool ProDevice::Create(uint32_t bus, uint32_t device, uint32_t func) { - drmDevicePtr devices[kMaxDevices] = { }; - int32_t device_count = Funcs().DrmGetDevices(devices, kMaxDevices); - bool result = false; - - for (int32_t i = 0; i < device_count; i++) { - // Check if the device vendor is AMD - if (devices[i]->deviceinfo.pci->vendor_id != kAtiVendorId) { - continue; - } - if ((devices[i]->businfo.pci->bus == bus) && - (devices[i]->businfo.pci->dev == device) && - (devices[i]->businfo.pci->func == func)) { - - // pDevices[i]->nodes[DRM_NODE_PRIMARY]; - // Using render node here so that we can do the off-screen rendering without authentication - file_desc_ = open(devices[i]->nodes[DRM_NODE_RENDER], O_RDWR, 0); - - if (file_desc_ > 0) { - void* data, *file, *cap; - - // Initialize the admgpu device. 
- if (Funcs().AmdgpuDeviceInitialize(file_desc_, &major_ver_, - &minor_ver_, &dev_handle_) == 0) { - uint32_t version = 0; - // amdgpu_query_gpu_info will never fail only if it is initialized - Funcs().AmdgpuQueryGpuInfo(dev_handle_, &gpu_info_); - - drm_amdgpu_capability cap = {}; - Funcs().AmdgpuQueryInfo(dev_handle_, AMDGPU_INFO_CAPABILITY, sizeof(drm_amdgpu_capability), &cap); - - // Check if DGMA and SSG are available - if ((cap.flag & (AMDGPU_CAPABILITY_DIRECT_GMA_FLAG | AMDGPU_CAPABILITY_SSG_FLAG)) == - (AMDGPU_CAPABILITY_DIRECT_GMA_FLAG | AMDGPU_CAPABILITY_SSG_FLAG)) { - result = true; - break; - } - } - } - } - } - - if (result) { - alloc_ops_ = new amd::Monitor("DGMA mem alloc lock", true); - if (nullptr == alloc_ops_) { - return true; - } - } - - return result; -} - -void* ProDevice::AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const -{ - amd::ScopedLock l(alloc_ops_); - void* ptr = nullptr; - amdgpu_bo_handle buf_handle = 0; - amdgpu_bo_alloc_request req = {0}; - *host_ptr = nullptr; - - req.alloc_size = size; - req.phys_alignment = 64 * Ki; - req.preferred_heap = AMDGPU_GEM_DOMAIN_DGMA; - - // Allocate buffer in DGMA heap - if (0 == Funcs().AmdgpuBoAlloc(dev_handle_, &req, &buf_handle)) { - amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd; - uint32_t shared_handle = 0; - // Find the base driver handle - if (0 == Funcs().AmdgpuBoExport(buf_handle, type, &shared_handle)) { - uint32_t flags = 0; - size_t buf_size = 0; - // Map memory object to HSA device - if (0 == hsa_amd_interop_map_buffer(1, &agent, shared_handle, - flags, &buf_size, &ptr, nullptr, nullptr)) { - // Ask GPUPro driver to provide CPU access to allocation - if (0 == Funcs().AmdgpuBoCpuMap(buf_handle, host_ptr)) { - allocs_.insert({ptr, {buf_handle, shared_handle}}); - } - else { - hsa_amd_interop_unmap_buffer(ptr); - close(shared_handle); - Funcs().AmdgpuBoFree(buf_handle); - } - } - else { - close(shared_handle); - Funcs().AmdgpuBoFree(buf_handle); - } - } 
- else { - Funcs().AmdgpuBoFree(buf_handle); - } - } - - return ptr; -} - -void ProDevice::FreeDmaBuffer(void* ptr) const -{ - amd::ScopedLock l(alloc_ops_); - auto it = allocs_.find(ptr); - if (it != allocs_.end()) { - Funcs().AmdgpuBoCpuUnmap(it->second.first); - // Unmap memory from HSA device - hsa_amd_interop_unmap_buffer(ptr); - // Close shared handle - close(it->second.second); - int error = Funcs().AmdgpuBoFree(it->second.first); - allocs_.erase(it); - } -} - -} - -#endif // WITHOUT_HSA_BACKEND - diff --git a/device/rocm/pro/prodevice.hpp b/device/rocm/pro/prodevice.hpp deleted file mode 100644 index 80ff3600..00000000 --- a/device/rocm/pro/prodevice.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -#include "profuncs.hpp" -#include "prodriver.hpp" -#include "thread/monitor.hpp" -#include - -/*! 
\addtogroup HSA - * @{ - */ - -//! HSA Device Implementation -namespace roc { - -class ProDevice : public IProDevice { -public: - static bool DrmInit(); - - ProDevice() - : file_desc_(0) - , major_ver_(0) - , minor_ver_(0) - , dev_handle_(nullptr) - , alloc_ops_(nullptr) {} - virtual ~ProDevice() override; - - bool Create(uint32_t bus, uint32_t device, uint32_t func); - - virtual void* AllocDmaBuffer( - hsa_agent_t agent, size_t size, void** host_ptr) const override; - virtual void FreeDmaBuffer(void* ptr) const override; - virtual void GetAsicIdAndRevisionId(uint32_t* asic_id, uint32_t* rev_id) const override - { - *asic_id = gpu_info_.asic_id; - *rev_id = gpu_info_.pci_rev_id; - } - -private: - static void* lib_drm_handle_; - static bool initialized_; - static drm::Funcs funcs_; - const drm::Funcs& Funcs() const { return funcs_; } - - int32_t file_desc_; //!< File descriptor for the device - uint32_t major_ver_; //!< Major driver version - uint32_t minor_ver_; //!< Minor driver version - amdgpu_device_handle dev_handle_; //!< AMD gpu device handle - amdgpu_gpu_info gpu_info_; //!< GPU info structure - amdgpu_heap_info heap_info_; //!< Information about memory - mutable std::unordered_map> allocs_; //!< Alloced memory mapping - amd::Monitor* alloc_ops_; //!< Serializes memory allocations/destructions -}; - -} // namespace roc - -/** - * @} - */ -#endif /*WITHOUT_HSA_BACKEND*/ diff --git a/device/rocm/pro/prodriver.hpp b/device/rocm/pro/prodriver.hpp deleted file mode 100644 index 819ade27..00000000 --- a/device/rocm/pro/prodriver.hpp +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. 
- - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -#include "top.hpp" -#include "hsa/hsa.h" - -/*! \addtogroup HSA - * @{ - */ - -namespace roc { - -//! Pro Device Interface -class IProDevice : public amd::HeapObject { -public: - static IProDevice* Init(uint32_t bus, uint32_t device, uint32_t func); - - virtual void* AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const = 0; - virtual void FreeDmaBuffer(void* ptr) const = 0; - virtual void GetAsicIdAndRevisionId(uint32_t* asic_id, uint32_t* rev_id) const = 0; - - IProDevice() {} - virtual ~IProDevice() {} -}; - -} // namespace roc - -/** - * @} - */ -#endif /*WITHOUT_HSA_BACKEND*/ diff --git a/device/rocm/pro/profuncs.hpp b/device/rocm/pro/profuncs.hpp deleted file mode 100644 index e878df0c..00000000 --- a/device/rocm/pro/profuncs.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. 
- - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
*/ - -#pragma once - -namespace roc -{ -namespace drm -{ -typedef int (*DrmGetDevices)( - drmDevicePtr* pDevices, - int maxDevices); - -typedef int (*AmdgpuDeviceInitialize)( - int fd, - uint32_t* pMajorVersion, - uint32_t* pMinorVersion, - amdgpu_device_handle* pDeviceHandle); - -typedef int (*AmdgpuDeviceDeinitialize)( - amdgpu_device_handle hDevice); - -typedef int (*AmdgpuQueryGpuInfo)( - amdgpu_device_handle hDevice, - struct amdgpu_gpu_info* pInfo); - -typedef int (*AmdgpuQueryInfo)( - amdgpu_device_handle hDevice, - unsigned infoId, - unsigned size, - void* pValue); - -typedef int (*AmdgpuBoAlloc)( - amdgpu_device_handle hDevice, - struct amdgpu_bo_alloc_request* pAllocBuffer, - amdgpu_bo_handle* pBufferHandle); - -typedef int (*AmdgpuBoExport)( - amdgpu_bo_handle hBuffer, - enum amdgpu_bo_handle_type type, - uint32_t* pFd); - -typedef int (*AmdgpuBoFree)( - amdgpu_bo_handle hBuffer); - -typedef int (*AmdgpuBoCpuMap)( - amdgpu_bo_handle hBuffer, - void** ppCpuAddress); - -typedef int (*AmdgpuBoCpuUnmap)( - amdgpu_bo_handle hBuffer); - -struct Funcs -{ - DrmGetDevices DrmGetDevices; - AmdgpuDeviceInitialize AmdgpuDeviceInitialize; - AmdgpuDeviceDeinitialize AmdgpuDeviceDeinitialize; - AmdgpuQueryGpuInfo AmdgpuQueryGpuInfo; - AmdgpuQueryInfo AmdgpuQueryInfo; - AmdgpuBoAlloc AmdgpuBoAlloc; - AmdgpuBoExport AmdgpuBoExport; - AmdgpuBoFree AmdgpuBoFree; - AmdgpuBoCpuMap AmdgpuBoCpuMap; - AmdgpuBoCpuUnmap AmdgpuBoCpuUnmap; -}; - -} //namespace drm -} //namespace roc diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 961227bb..e080cc17 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -40,9 +40,6 @@ #include "device/rocm/rocmemory.hpp" #include "device/rocm/rocglinterop.hpp" #include "device/rocm/rocsignal.hpp" -#ifdef WITH_AMDGPU_PRO -#include "pro/prodriver.hpp" -#endif #include "platform/sampler.hpp" #if defined(__clang__) @@ -163,8 +160,6 @@ Device::Device(hsa_agent_t bkendDevice) , xferQueue_(nullptr) , 
xferRead_(nullptr) , xferWrite_(nullptr) - , pro_device_(nullptr) - , pro_ena_(false) , freeMem_(0) , vgpusAccess_("Virtual GPU List Ops Lock", true) , hsa_exclusive_gpu_access_(false) @@ -218,9 +213,6 @@ void Device::checkAtomicSupport() { } Device::~Device() { -#ifdef WITH_AMDGPU_PRO - delete pro_device_; -#endif // Release cached map targets for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) { if ((*mapCache_)[i] != nullptr) { @@ -687,18 +679,6 @@ bool Device::create() { } info_.pciDomainID = pci_domain_id; -#ifdef WITH_AMDGPU_PRO - // Create amdgpu-pro device interface for SSG support - pro_device_ = IProDevice::Init( - info_.deviceTopology_.pcie.bus, - info_.deviceTopology_.pcie.device, - info_.deviceTopology_.pcie.function); - if (pro_device_ != nullptr) { - pro_ena_ = true; - pro_device_->GetAsicIdAndRevisionId(&info_.pcieDeviceId_, &info_.pcieRevisionId_); - } -#endif - // Get Agent HDP Flush Register Memory hsa_amd_hdp_flush_t hdpInfo; if (HSA_STATUS_SUCCESS != diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index b3da3783..3e254ff9 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -75,7 +75,6 @@ class Memory; class Resource; class VirtualDevice; class PrintfDbg; -class IProDevice; class ProfilingSignal : public amd::ReferenceCountedObject { public: @@ -478,10 +477,6 @@ class Device : public NullDevice { //! 
Create internal blit program bool createBlitProgram(); - // Returns AMD GPU Pro interfacs - const IProDevice& iPro() const { return *pro_device_; } - bool ProEna() const { return pro_ena_; } - // P2P agents avaialble for this device const std::vector& p2pAgents() const { return p2p_agents_; } @@ -598,8 +593,6 @@ class Device : public NullDevice { XferBuffers* xferRead_; //!< Transfer buffers read XferBuffers* xferWrite_; //!< Transfer buffers write - const IProDevice* pro_device_; //!< AMDGPUPro device - bool pro_ena_; //!< Extra functionality with AMDGPUPro device, beyond ROCr std::atomic freeMem_; //!< Total of free memory available mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode diff --git a/device/rocm/rocmemory.cpp b/device/rocm/rocmemory.cpp index fbe6bd01..0eebbb55 100644 --- a/device/rocm/rocmemory.cpp +++ b/device/rocm/rocmemory.cpp @@ -37,9 +37,6 @@ #include "platform/sampler.hpp" #include "amdocl/cl_gl_amd.hpp" #include "amdocl/cl_vk_amd.hpp" -#ifdef WITH_AMDGPU_PRO -#include "pro/prodriver.hpp" -#endif namespace roc { @@ -673,12 +670,6 @@ void Buffer::destroy() { return; } -#ifdef WITH_AMDGPU_PRO - if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) { - dev().iPro().FreeDmaBuffer(deviceMemory_); - return; - } -#endif if (deviceMemory_ != nullptr) { if (deviceMemory_ != owner()->getHostMem()) { // if they are identical, the host pointer will be From 590d947e6d3d502bd14fc6155452191856e0a013 Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Wed, 18 Jan 2023 18:53:06 +0000 Subject: [PATCH 07/45] SWDEV-378157 - Correct log message Change-Id: I6297693f67ae78a8874b976ac03353a81b728b1d --- device/rocm/rocblit.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/device/rocm/rocblit.cpp b/device/rocm/rocblit.cpp index f6b447ca..2f664b43 100644 --- a/device/rocm/rocblit.cpp +++ 
b/device/rocm/rocblit.cpp @@ -457,7 +457,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d // Copy memory line by line ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy Rect wait_event=0x%zx, completion_signal=0x%zx", + "HSA Async Copy Rect wait_event=0x%zx, completion_signal=0x%zx", (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset, &srcMem, &offset, &dim, agent, direction, wait_events.size(), wait_events.data(), active); @@ -478,7 +478,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d // Copy memory line by line ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy wait_event=0x%zx, completion_signal=0x%zx", + "HSA Async Copy wait_event=0x%zx, completion_signal=0x%zx", (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); hsa_status_t status = hsa_amd_memory_async_copy( (reinterpret_cast
(dst) + dstOffset), dstAgent, @@ -668,9 +668,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, // Use SDMA to transfer the data ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy dst=0x%zx, src=0x%zx, size=%d, wait_event=0x%zx, " - "completion_signal=0x%zx", - dst, src, (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); + "HSA Async Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, completion_signal=0x%zx", + dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], wait_events.size(), wait_events.data(), active); From 7760b95934b9b36b617fd6dd54aff8f2740d8e19 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Tue, 24 Jan 2023 03:00:08 -0500 Subject: [PATCH 08/45] SWDEV-2 - Change OpenCL version number from 3560 to 3561 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index d724d3ca..a0734464 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3560 +#define AMD_PLATFORM_BUILD_NUMBER 3561 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 5ffa1d9bd10667b88ed2360e05a155a494684880 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Mon, 23 Jan 2023 17:40:30 -0500 Subject: [PATCH 09/45] SWDEV-372790 - Copy AQL packet from runtime setup Scheduler in device queue requires relaunching itself. Make sure scheduler uses exactly the same AQL packet as the host launch. 
Change-Id: I4eb03c4c91bf2408a6d4607731f081a2e2c2c8ae --- device/rocm/rocblit.cpp | 24 +++--------------------- device/rocm/rocvirtual.cpp | 15 +++++++++++++-- device/rocm/rocvirtual.hpp | 3 ++- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/device/rocm/rocblit.cpp b/device/rocm/rocblit.cpp index 2f664b43..a7a3efd2 100644 --- a/device/rocm/rocblit.cpp +++ b/device/rocm/rocblit.cpp @@ -2181,7 +2181,7 @@ bool KernelBlitManager::fillBuffer3D(device::Memory& memory, const void* pattern // ================================================================================================ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, - const amd::Coord3D& sizeIn, bool entire, + const amd::Coord3D& sizeIn, bool entire, amd::CopyMetadata copyMetadata) const { amd::ScopedLock k(lockXferOps_); bool result = false; @@ -2681,31 +2681,13 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, Memory* schedulerMem = dev().getRocMemory(schedulerParam); sp->kernarg_address = reinterpret_cast(schedulerMem->getDeviceMemory()); - - sp->hidden_global_offset_x = 0; - sp->hidden_global_offset_y = 0; - sp->hidden_global_offset_z = 0; sp->thread_counter = 0; sp->child_queue = reinterpret_cast(schedulerQueue); sp->complete_signal = schedulerSignal; hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne); - sp->scheduler_aql.header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - sp->scheduler_aql.setup = 1; - sp->scheduler_aql.workgroup_size_x = 1; - sp->scheduler_aql.workgroup_size_y = 1; - sp->scheduler_aql.workgroup_size_z = 1; - sp->scheduler_aql.grid_size_x = threads; - sp->scheduler_aql.grid_size_y = 1; - sp->scheduler_aql.grid_size_z = 1; - 
sp->scheduler_aql.kernel_object = gpuKernel.KernelCodeHandle(); - sp->scheduler_aql.kernarg_address = (void*)sp->kernarg_address; - sp->scheduler_aql.private_segment_size = 0; - sp->scheduler_aql.group_segment_size = 0; + sp->vqueue_header = vqVM; sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam); @@ -2720,7 +2702,7 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, address parameters = captureArguments(kernels_[Scheduler]); if (!gpu().submitKernelInternal(ndrange, *kernels_[Scheduler], - parameters, nullptr)) { + parameters, nullptr, 0, nullptr, &sp->scheduler_aql)) { return false; } releaseArguments(parameters); diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 7747d147..0976b183 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -1817,7 +1817,7 @@ bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memo realSize.c[0] *= elemSize; } - result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, realSrcOrigin, realDstOrigin, + result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, realSrcOrigin, realDstOrigin, realSize, entire, copyMetadata); break; } @@ -2781,7 +2781,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) // ================================================================================================ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, void* eventHandle, - uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) { + uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd, + hsa_kernel_dispatch_packet_t* aql_packet) { device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); @@ -3108,6 +3109,16 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchPacket.reserved2 = 
vcmd->profilingInfo().correlation_id_; } + // Copy scheduler's AQL packet for possible relaunch from the scheduler itself + if (aql_packet != nullptr) { + *aql_packet = dispatchPacket; + aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + } + // Dispatch the packet if (!dispatchAqlPacket( &dispatchPacket, aqlHeaderWithOrder, diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index bf0bb4ce..74d624a1 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -312,7 +312,8 @@ class VirtualGPU : public device::VirtualDevice { const_address parameters, //!< Parameters for the kernel void* event_handle, //!< Handle to OCL event for debugging uint32_t sharedMemBytes = 0, //!< Shared memory size - amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command + amd::NDRangeKernelCommand* vcmd = nullptr, //!< Original launch command + hsa_kernel_dispatch_packet_t* aql_packet = nullptr //!< Scheduler launch ); void submitNativeFn(amd::NativeFnCommand& cmd); void submitMarker(amd::Marker& cmd); From e3caba84b84b109092cf9200698f40df40e09a22 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Wed, 25 Jan 2023 03:00:05 -0500 Subject: [PATCH 10/45] SWDEV-2 - Change OpenCL version number from 3561 to 3562 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index a0734464..d34c5ce3 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3561 +#define AMD_PLATFORM_BUILD_NUMBER 3562 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 
cdd151dcd4b62d0d13a1273c669f005bd89be7fe Mon Sep 17 00:00:00 2001 From: Jiabao Xie Date: Wed, 25 Jan 2023 15:54:46 -0500 Subject: [PATCH 11/45] SWDEV-366886 - Revert "SWDEV-366886 - force svm alloc for rocm" This reverts commit 275f4ddd209ecb39baedc7e127184c728479b510. Reason for revert: performance drop in Quicksilver app Change-Id: I2bdf42ad0a235a74e2bb4d38f86471e9affbd7a6 --- device/rocm/rocdevice.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index e080cc17..85aa32a6 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -2311,7 +2311,6 @@ bool Device::IpcDetach (void* dev_ptr) const { // ================================================================================================ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { - constexpr bool kForceAllocation = true; amd::Memory* mem = nullptr; if (nullptr == svmPtr) { @@ -2323,7 +2322,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_ return nullptr; } - if (!mem->create(nullptr, false, false, kForceAllocation)) { + if (!mem->create(nullptr)) { LogError("failed to create a svm hidden buffer!"); mem->release(); return nullptr; From 6eeda49e68a6507309bb94d84e871e05119a1477 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Fri, 27 Jan 2023 03:00:07 -0500 Subject: [PATCH 12/45] SWDEV-2 - Change OpenCL version number from 3562 to 3563 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index d34c5ce3..0ddc862d 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3562 +#define AMD_PLATFORM_BUILD_NUMBER 3563 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 
a4b3ea81f3bc9f729aa99e38ff8aedc6bbf732a5 Mon Sep 17 00:00:00 2001 From: Sourabh Betigeri Date: Thu, 26 Jan 2023 16:22:03 -0800 Subject: [PATCH 13/45] SWDEV-340649 - Removes calls to commitMemory in hmm path Change-Id: I8d381b4c3f5cf95628487e0d10ae643443c9709d --- device/rocm/rocmemory.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/device/rocm/rocmemory.cpp b/device/rocm/rocmemory.cpp index 0eebbb55..8aacd09d 100644 --- a/device/rocm/rocmemory.cpp +++ b/device/rocm/rocmemory.cpp @@ -643,8 +643,9 @@ void Buffer::destroy() { if (memFlags & CL_MEM_ALLOC_HOST_PTR) { if (dev().info().hmmSupported_) { // AMD HMM path. Destroy system memory - amd::Os::uncommitMemory(deviceMemory_, size()); - amd::Os::releaseMemory(deviceMemory_, size()); + if (!(amd::Os::releaseMemory(deviceMemory_, size()))) { + ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "[ROCClr] munmap failed \n"); + } } else { dev().hostFree(deviceMemory_, size()); } @@ -746,7 +747,6 @@ bool Buffer::create(bool alloc_local) { if (deviceMemory_ == NULL) { return false; } - amd::Os::commitMemory(deviceMemory_, size(), amd::Os::MEM_PROT_RW); // Currently HMM requires cirtain initial calls to mark sysmem allocation as // GPU accessible or prefetch memory into GPU if (!dev().SvmAllocInit(deviceMemory_, size())) { From 9c146a280777e02fc1bd34af4d15d3c973402288 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Tue, 31 Jan 2023 03:00:08 -0500 Subject: [PATCH 14/45] SWDEV-2 - Change OpenCL version number from 3563 to 3564 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index 0ddc862d..52981be4 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3563 +#define AMD_PLATFORM_BUILD_NUMBER 3564 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 54d9026fb5c1c05c5921c6b35263ebbb122cd312 
Mon Sep 17 00:00:00 2001 From: German Date: Mon, 30 Jan 2023 13:11:14 -0500 Subject: [PATCH 15/45] SWDEV-352197 - Destroy virtual device in thread destructor Windows kills threads on exit without any notification. However, runtime can still destroy VirtualGPU object from the host thread with HostQueue destruction. This change also forces RGP trace transfer on the last capture without any delays. Change-Id: I768e87e99e1d23a021e63c12f36e450817743759 --- device/pal/palgpuopen.cpp | 15 +++++++++++++++ device/pal/palvirtual.cpp | 8 ++++---- platform/commandqueue.cpp | 1 - platform/commandqueue.hpp | 14 +++++++++----- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/device/pal/palgpuopen.cpp b/device/pal/palgpuopen.cpp index 7c6357e9..c5d5c094 100644 --- a/device/pal/palgpuopen.cpp +++ b/device/pal/palgpuopen.cpp @@ -278,6 +278,21 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) { // continue until we find the right queue... } else if (Pal::Result::Success == res) { trace_.sqtt_disp_count_ = 0; + // Stop the trace and save the result. Currently runtime can't delay upload in HIP, + // because default stream doesn't have explicit destruction and + // OS kills all threads on exit without any notification. That includes PAL RGP threads. + { + if (trace_.status_ == TraceStatus::WaitingForSqtt) { + auto result = EndRGPTrace(gpu); + } + // Check if runtime is waiting for the final trace results + if (trace_.status_ == TraceStatus::WaitingForResults) { + // If results are ready, then finish the trace + if (CheckForTraceResults() == Pal::Result::Success) { + FinishRGPTrace(gpu, false); + } + } + } } else { FinishRGPTrace(gpu, true); } diff --git a/device/pal/palvirtual.cpp b/device/pal/palvirtual.cpp index e7465c8b..624379d1 100644 --- a/device/pal/palvirtual.cpp +++ b/device/pal/palvirtual.cpp @@ -1072,15 +1072,15 @@ bool VirtualGPU::allocHsaQueueMem() { } VirtualGPU::~VirtualGPU() { + // Not safe to remove a queue. 
So lock the device + amd::ScopedLock k(dev().lockAsyncOps()); + amd::ScopedLock lock(dev().vgpusAccess()); + // Destroy RGP trace if (rgpCaptureEna()) { dev().rgpCaptureMgr()->FinishRGPTrace(this, true); } - // Not safe to remove a queue. So lock the device - amd::ScopedLock k(dev().lockAsyncOps()); - amd::ScopedLock lock(dev().vgpusAccess()); - while (!freeCbQueue_.empty()) { auto cb = freeCbQueue_.front(); delete cb; diff --git a/platform/commandqueue.cpp b/platform/commandqueue.cpp index e9e56cb8..90573b4c 100644 --- a/platform/commandqueue.cpp +++ b/platform/commandqueue.cpp @@ -67,7 +67,6 @@ bool HostQueue::terminate() { marker->release(); } thread_.acceptingCommands_ = false; - thread_.Release(); } else { if (Os::isThreadAlive(thread_)) { Command* marker = nullptr; diff --git a/platform/commandqueue.hpp b/platform/commandqueue.hpp index 863f8eca..ea702a5e 100644 --- a/platform/commandqueue.hpp +++ b/platform/commandqueue.hpp @@ -162,15 +162,21 @@ class HostQueue : public CommandQueue { Thread() : amd::Thread("Command Queue Thread", CQ_THREAD_STACK_SIZE, !AMD_DIRECT_DISPATCH), acceptingCommands_(false), - virtualDevice_(NULL) {} + virtualDevice_(nullptr) {} + + virtual ~Thread() { + if (virtualDevice_ != nullptr) { + delete virtualDevice_; + virtualDevice_ = nullptr; + } + } //! The command queue thread entry point. void run(void* data) { HostQueue* queue = static_cast(data); virtualDevice_ = queue->device().createVirtualDevice(queue); - if (virtualDevice_ != NULL) { + if (virtualDevice_ != nullptr) { queue->loop(virtualDevice_); - Release(); } else { acceptingCommands_ = false; queue->flush(); @@ -184,8 +190,6 @@ class HostQueue : public CommandQueue { } } - void Release() const { delete virtualDevice_; } - //! 
Get virtual device for the current thread device::VirtualDevice* vdev() const { return virtualDevice_; } From a3f8e43c829bc1ce46302add1a4f94e6a5e59619 Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Tue, 31 Jan 2023 17:46:15 +0000 Subject: [PATCH 16/45] SWDEV-380258 - Fix invalid iterator after erase Fix rare segfault due to invalid iterator after erase Change-Id: Id5b54d6cf10075deff0d613fec12af249c6c55a3 --- device/rocm/rocdevice.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 85aa32a6..01c038c9 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -236,17 +236,17 @@ Device::~Device() { } for (auto& it : queuePool_) { - for (auto& qIter : it) { - hsa_queue_t* queue = qIter.first; - auto& qInfo = qIter.second; + for (auto qIter = it.begin(); qIter != it.end(); ) { + hsa_queue_t* queue = qIter->first; + auto& qInfo = qIter->second; if (qInfo.hostcallBuffer_) { ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p", - qInfo.hostcallBuffer_, qIter.first); + qInfo.hostcallBuffer_, qIter->first); disableHostcalls(qInfo.hostcallBuffer_); context().svmFree(qInfo.hostcallBuffer_); } ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue); - it.erase(queue); + qIter = it.erase(qIter); hsa_queue_destroy(queue); } } From 3286c59a4f1f8593775c29fba0902f218058a849 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Wed, 1 Feb 2023 03:00:07 -0500 Subject: [PATCH 17/45] SWDEV-2 - Change OpenCL version number from 3564 to 3565 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index 52981be4..d166183f 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3564 +#define AMD_PLATFORM_BUILD_NUMBER 3565 #endif 
// AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 95063d365feec040247aad5f1843d72d93bddc3f Mon Sep 17 00:00:00 2001 From: German Date: Tue, 31 Jan 2023 15:48:12 -0500 Subject: [PATCH 18/45] SWDEV-352197 - Use static VM ID for capture Use static VM ID if available for capture. Add a delay in RGP trace finish to help RGP server to complete file operations. Change-Id: Id0dab2fe4398ac2df79f25608fdd06b1f84e7fe0 --- device/pal/palgpuopen.cpp | 30 +++++++++++++++++++++++------- device/pal/palgpuopen.hpp | 7 ++++--- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/device/pal/palgpuopen.cpp b/device/pal/palgpuopen.cpp index c5d5c094..e5f7dd2e 100644 --- a/device/pal/palgpuopen.cpp +++ b/device/pal/palgpuopen.cpp @@ -54,9 +54,7 @@ RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device) se_mask_(0), perf_counter_mem_limit_(0), perf_counter_frequency_(0), - trace_enabled_(false), - inst_tracing_enabled_(false), - perf_counters_enabled_(false) { + value_(0) { memset(&trace_, 0, sizeof(trace_)); } @@ -176,6 +174,8 @@ bool RgpCaptureMgr::Update(Pal::IPlatform* platform) { PostDeviceCreate(); } + static_vm_id_ = device_.properties().gfxipProperties.flags.supportStaticVmid; + return result; } @@ -189,12 +189,12 @@ bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, // Get the OS context handle for this queue (this is a thing that RGP needs on DX clients; // it may be optional for Vulkan, but we provide it anyway if available). Pal::KernelContextInfo kernelContextInfo = {}; - Pal::Result palResult = iQueue->QueryKernelContextInfo(&kernelContextInfo); // Ensure we've acquired the debug VMID (note that some platforms do not // implement this function, so don't fail the whole trace if so) *debug_vmid = kernelContextInfo.flags.hasDebugVmid; + assert((static_vm_id_ || *debug_vmid) && "Can't capture multiple queues!"); // Register the queue with the GPA session class for timed queue operation support. 
if (trace_.gpa_session_->RegisterTimedQueue( @@ -532,11 +532,17 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) { } } - // Notify the RGP server that we are starting a trace - if (rgp_server_->BeginTrace() != DevDriver::Result::Success) { - result = Pal::Result::ErrorUnknown; + if (static_vm_id_) { + result = device_.iDev()->SetStaticVmidMode(true); + assert(result == Pal::Result::Success && "Static VM ID setup failed!"); } + if (result == Pal::Result::Success) { + // Notify the RGP server that we are starting a trace + if (rgp_server_->BeginTrace() != DevDriver::Result::Success) { + result = Pal::Result::ErrorUnknown; + } + } // Tell the GPA session class we're starting a trace if (result == Pal::Result::Success) { GpuUtil::GpaSessionBeginInfo info = {}; @@ -722,6 +728,7 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) { return; } + auto disp_count = trace_.sqtt_disp_count_; // Finish the trace if the queue was destroyed before OCL reached // the number of captured dispatches if (trace_.sqtt_disp_count_ != 0) { @@ -751,9 +758,18 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) { } else { rgp_server_->EndTrace(); } + + if (static_vm_id_) { + auto result = device_.iDev()->SetStaticVmidMode(false); + assert(result == Pal::Result::Success && "Static VM ID setup failed!"); + } + if (trace_.gpa_session_ != nullptr) { trace_.gpa_session_->Reset(); } + // If application exits, then Windows kills all threads and + // RGP can't finish data write into a file. 
+ amd::Os::sleep(10 * disp_count + 500); // Reset tracing state to idle trace_.prepared_disp_count_ = 0; trace_.sqtt_disp_count_ = 0; diff --git a/device/pal/palgpuopen.hpp b/device/pal/palgpuopen.hpp index ddae799b..33e9d35d 100644 --- a/device/pal/palgpuopen.hpp +++ b/device/pal/palgpuopen.hpp @@ -410,9 +410,10 @@ class RgpCaptureMgr { union { struct { - uint32_t trace_enabled_ : 1; // True if tracing is currently enabled (master flag) - uint32_t inst_tracing_enabled_; // Enable instruction-level SQTT tokens - uint32_t perf_counters_enabled_; // True if perf counters are enabled + uint32_t trace_enabled_: 1; // True if tracing is currently enabled (master flag) + uint32_t inst_tracing_enabled_: 1; // Enable instruction-level SQTT tokens + uint32_t perf_counters_enabled_: 1; // True if perf counters are enabled + uint32_t static_vm_id_: 1; // Static VM ID can be used for capture }; uint32_t value_; }; From 39ad907af1d0cc5ebda80ba83896778824bf6ff2 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Thu, 2 Feb 2023 03:00:07 -0500 Subject: [PATCH 19/45] SWDEV-2 - Change OpenCL version number from 3565 to 3566 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index d166183f..2ea8cdae 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3565 +#define AMD_PLATFORM_BUILD_NUMBER 3566 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 09be2b7049516e026a8c63c0df332d79a59dbce6 Mon Sep 17 00:00:00 2001 From: Ranjith Ramakrishnan Date: Tue, 31 Jan 2023 15:52:40 -0800 Subject: [PATCH 20/45] SWDEV-366831 - Correct the include path for new directory layout Change-Id: I7c826ed9c2f2d839d350e5150f578830655616f0 --- device/rocm/rocurilocator.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/device/rocm/rocurilocator.hpp 
b/device/rocm/rocurilocator.hpp index 880b6c72..76451087 100644 --- a/device/rocm/rocurilocator.hpp +++ b/device/rocm/rocurilocator.hpp @@ -22,7 +22,7 @@ #if defined(__clang__) #if __has_feature(address_sanitizer) #include "device/devurilocator.hpp" -#include "hsa_ven_amd_loader.h" +#include "hsa/hsa_ven_amd_loader.h" #include namespace roc { From 23ae2b64683d53753dd79a4c46944d0c4b9d7219 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Fri, 3 Feb 2023 03:00:10 -0500 Subject: [PATCH 21/45] SWDEV-2 - Change OpenCL version number from 3566 to 3567 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index 2ea8cdae..62bc75fe 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3566 +#define AMD_PLATFORM_BUILD_NUMBER 3567 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From 21e50998dd2439708a6672dac5a04be1dd3da682 Mon Sep 17 00:00:00 2001 From: German Date: Fri, 3 Feb 2023 13:44:24 -0500 Subject: [PATCH 22/45] SWDEV-368235 - Remove obsolete env variables Change-Id: I7e14d53297e79e2f68b3a6cc40251ad7db9eb5ab --- device/device.cpp | 11 --- device/device.hpp | 3 +- device/devkernel.cpp | 144 ++++++++++++++++-------------------- device/pal/paldevice.cpp | 4 +- device/pal/palsettings.cpp | 39 +--------- device/pal/palsettings.hpp | 4 +- device/pal/palvirtual.cpp | 25 +------ device/rocm/rocdevice.cpp | 56 +++++++------- device/rocm/rocdevice.hpp | 2 +- device/rocm/rocsettings.cpp | 26 ------- device/rocm/rocsettings.hpp | 13 +--- utils/flags.hpp | 44 ----------- 12 files changed, 102 insertions(+), 269 deletions(-) diff --git a/device/device.cpp b/device/device.cpp index f87452db..288f84f6 100644 --- a/device/device.cpp +++ b/device/device.cpp @@ -798,17 +798,6 @@ Settings::Settings() : value_(0) { commandQueues_ = 200; //!< Field value set to maximum number 
//!< concurrent Virtual GPUs for default - overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0; - overrideLclSet |= - (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y)) - ? 2 - : 0; - overrideLclSet |= - (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z)) - ? 4 - : 0; - fenceScopeAgent_ = AMD_OPT_FLUSH; if (amd::IS_HIP) { if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) { diff --git a/device/device.hpp b/device/device.hpp index a493795e..ed206311 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -619,7 +619,6 @@ class Settings : public amd::HeapObject { uint64_t extensions_; //!< Supported OCL extensions union { struct { - uint overrideLclSet : 3; //!< Bit mask to override the local size uint apuSystem_ : 1; //!< Device is APU system with shared memory uint supportRA_ : 1; //!< Support RA channel order format uint waitCommand_ : 1; //!< Enables a wait for every submitted command @@ -639,7 +638,7 @@ class Settings : public amd::HeapObject { uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions - uint reserved_ : 11; + uint reserved_ : 14; }; uint value_; }; diff --git a/device/devkernel.cpp b/device/devkernel.cpp index e598dca7..42eb1857 100644 --- a/device/devkernel.cpp +++ b/device/devkernel.cpp @@ -677,100 +677,80 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, if (workGroupInfo()->compileSize_[0] == 0) { // Find the default local workgroup size, if it wasn't specified if (lclWorkSize[0] == 0) { - if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) { - // Find threads per group - size_t thrPerGrp = workGroupInfo()->size_; - - // Check if kernel uses images - if (flags_.imageEna_ && - // 
and thread group is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { - // Use 8x8 workgroup size if kernel has image writes - if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; - } - else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; - } + // Find threads per group + size_t thrPerGrp = workGroupInfo()->size_; + + // Check if kernel uses images + if (flags_.imageEna_ && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { + // Use 8x8 workgroup size if kernel has image writes + if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { + lclWorkSize[0] = 8; + lclWorkSize[1] = 8; } else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--) - ; - lclWorkSize[d] = div; - tmp /= div; - } + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; + } + } + else { + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way + for (uint d = 0; d < workDim; ++d) { + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--) + ; + lclWorkSize[d] = div; + tmp /= div; + } - // Assuming DWORD access - const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; + // Assuming DWORD access + const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; - // Check if we couldn't find optimal workload - if (((lclWorkSize.product() % 
workGroupInfo()->wavefrontSize_) != 0) || - // or size is too small for the cache line - (lclWorkSize[0] < cacheLineMatch)) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } + // Check if we couldn't find optimal workload + if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || + // or size is too small for the cache line + (lclWorkSize[0] < cacheLineMatch)) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; } - // Use X dimension as high priority. Runtime will assume that - // X dimension is more important for the address calculation - if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { - lclWorkSize[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 1; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } + } + // Use X dimension as high priority. 
Runtime will assume that + // X dimension is more important for the address calculation + if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { + lclWorkSize[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 1; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; } } - else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } + } + else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; } } } } } - else { - // Use overrides when app doesn't provide workgroup dimensions - if (workDim == 1) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; - } - else if (workDim == 2) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } - else if (workDim == 3) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; - lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } - else { - assert(0 && "Invalid workDim!"); - } - } } } else { diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 75cd3c3e..6c60ef05 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -291,7 +291,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve nullptr, nullptr, nullptr, - AMD_OCL_SC_LIB}; + nullptr}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); @@ -1013,7 +1013,7 @@ bool Device::create(Pal::IDevice* device) { nullptr, nullptr, nullptr, - AMD_OCL_SC_LIB}; + nullptr}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); 
diff --git a/device/pal/palsettings.cpp b/device/pal/palsettings.cpp index 1f4dc8e9..59ac82b8 100644 --- a/device/pal/palsettings.cpp +++ b/device/pal/palsettings.cpp @@ -76,8 +76,6 @@ Settings::Settings() { // Enable workload split by default (for 24 bit arithmetic or timeout) workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; - // By default use host blit - blitEngine_ = BlitEngineHost; pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi; pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE) ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; @@ -123,8 +121,6 @@ Settings::Settings() { //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS linearPersistentImage_ = false; - useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; - // Device enqueuing settings numDeviceEvents_ = 1024; numWaitEvents_ = 8; @@ -328,16 +324,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp, libSelector_ = amd::GPU_Library_CI; if (LP64_SWITCH(false, true)) { - oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ - ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) - : OpenCL12; - } - if (GPU_FORCE_OCL20_32BIT) { - force32BitOcl20_ = true; - oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ + oclVersion_ = !reportAsOCL12Device ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; } + if (OPENCL_VERSION < 200) { oclVersion_ = OpenCL12; } @@ -346,27 +337,13 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // Cap at OpenCL20 for now if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20; - // This needs to be cleaned once 64bit addressing is stable - if (oclVersion_ < OpenCL20) { - use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) - ? 
LP64_SWITCH(false, - /*calAttr.isWorkstation ||*/ true) - : GPU_FORCE_64BIT_PTR; - } else { - if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) { - use64BitPtr_ = true; - } - } + use64BitPtr_ = LP64_SWITCH(false, true); if (oclVersion_ >= OpenCL20) { supportDepthsRGB_ = true; } if (use64BitPtr_) { - if (GPU_ENABLE_LARGE_ALLOCATION) { - maxAllocSize_ = 64ULL * Gi; - } else { - maxAllocSize_ = 4048 * Mi; - } + maxAllocSize_ = 64ULL * Gi; } else { maxAllocSize_ = 3ULL * Gi; } @@ -447,9 +424,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp, imageSupport_ = true; - // Use kernels for blit if appropriate - blitEngine_ = BlitEngineKernel; - hostMemDirectAccess_ |= HostMemBuffer; // HW doesn't support untiled image writes // hostMemDirectAccess_ |= HostMemImage; @@ -542,11 +516,6 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } - // Override blit engine type - if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) { - blitEngine_ = GPU_BLIT_ENGINE_TYPE; - } - if (!flagIsDefault(DEBUG_GPU_FLAGS)) { debugFlags_ = DEBUG_GPU_FLAGS; } diff --git a/device/pal/palsettings.hpp b/device/pal/palsettings.hpp index 66984622..4bcc7e2e 100644 --- a/device/pal/palsettings.hpp +++ b/device/pal/palsettings.hpp @@ -70,7 +70,6 @@ class Settings : public device::Settings { uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 features uint threadTraceEnable_ : 1; //!< Thread trace enable uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent - uint useSingleScratch_ : 1; //!< Allocates single scratch per device uint svmAtomics_ : 1; //!< SVM device atomics uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support uint useDeviceQueue_ : 1; //!< Submit to separate device queue @@ -82,7 +81,7 @@ class Settings : public device::Settings { uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 uint disableSdma_ : 1; //!< Disable SDMA support uint alwaysResident_ : 1; //!< Make resources resident at allocation time - uint 
reserved_ : 7; + uint reserved_ : 8; }; uint value_; }; @@ -95,7 +94,6 @@ class Settings : public device::Settings { uint workloadSplitSize_; //!< Workload split size uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms - uint blitEngine_; //!< Blit engine type uint cacheLineSize_; //!< Cache line size in bytes uint cacheSize_; //!< L1 cache size in bytes uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings diff --git a/device/pal/palvirtual.cpp b/device/pal/palvirtual.cpp index 624379d1..3a208171 100644 --- a/device/pal/palvirtual.cpp +++ b/device/pal/palvirtual.cpp @@ -905,11 +905,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, // because destructor calls eraseResourceList() even if create() failed dev().resizeResoureList(index()); - if (index() >= GPU_MAX_COMMAND_QUEUES) { - // Cap the maximum number of concurrent Virtual GPUs - return false; - } - // Virtual GPU will have profiling enabled state_.profiling_ = profiling; @@ -1020,18 +1015,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, return false; } - // Choose the appropriate class for blit engine - switch (dev().settings().blitEngine_) { - default: - // Fall through ... - case Settings::BlitEngineHost: - blitSetup.disableAll(); - // Fall through ... 
- case Settings::BlitEngineCAL: - case Settings::BlitEngineKernel: - blitMgr_ = new KernelBlitManager(*this, blitSetup); - break; - } + blitMgr_ = new KernelBlitManager(*this, blitSetup); if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) { LogError("Could not create BlitManager!"); return false; @@ -3269,11 +3253,8 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) { cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); uint64_t endTimeStampCPU = amd::Os::timeNanos(); - // Make sure the command batch has a valid GPU TS - if (!GPU_RAW_TIMESTAMP) { - // Adjust the base time by the execution time - readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; - } + // Adjust the base time by the execution time + readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; } } } diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 01c038c9..e63ab77e 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -894,37 +894,35 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo Device* dev = reinterpret_cast(data); switch (segment_type) { case HSA_REGION_SEGMENT_GLOBAL: { - if (dev->settings().enableLocalMemory_) { - uint32_t global_flag = 0; - hsa_status_t stat = - hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } + uint32_t global_flag = 0; + hsa_status_t stat = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } - if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { - dev->gpu_fine_grained_segment_ = pool; - } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { - dev->gpuvm_segment_ = pool; - - // If cpu agent cannot access this pool, the device does not support large bar. 
- hsa_amd_memory_pool_access_t tmp{}; - hsa_amd_agent_memory_pool_get_info( - dev->cpu_agent_, - pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &tmp); - - if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { - dev->info_.largeBar_ = false; - } else { - dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; - } + if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { + dev->gpu_fine_grained_segment_ = pool; + } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { + dev->gpuvm_segment_ = pool; + + // If cpu agent cannot access this pool, the device does not support large bar. + hsa_amd_memory_pool_access_t tmp{}; + hsa_amd_agent_memory_pool_get_info( + dev->cpu_agent_, + pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &tmp); + + if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + dev->info_.largeBar_ = false; + } else { + dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; } + } - if (dev->gpuvm_segment_.handle == 0) { - dev->gpuvm_segment_ = pool; - } + if (dev->gpuvm_segment_.handle == 0) { + dev->gpuvm_segment_ = pool; } break; } @@ -1232,7 +1230,7 @@ bool Device::populateOCLDeviceConstants() { info_.maxWorkItemDimensions_ = 3; - if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) { + if (gpuvm_segment_.handle != 0) { size_t global_segment_size = 0; if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE, diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index 3e254ff9..99d0976e 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -218,7 +218,7 @@ class NullDevice : public amd::Device { //! 
Determine if we can use device memory for SVM const bool forceFineGrain(amd::Memory* memory) const { - return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1); + return (memory->getContext().devices().size() > 1); } virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle) { diff --git a/device/rocm/rocsettings.cpp b/device/rocm/rocsettings.cpp index 71c341ad..24cb88dc 100644 --- a/device/rocm/rocsettings.cpp +++ b/device/rocm/rocsettings.cpp @@ -35,18 +35,9 @@ Settings::Settings() { // Set this to true when we drop the flag doublePrecision_ = ::CL_KHR_FP64; - enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE; - enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM; - maxWorkGroupSize_ = 1024; preferredWorkGroupSize_ = 256; - maxWorkGroupSize2DX_ = 16; - maxWorkGroupSize2DY_ = 16; - maxWorkGroupSize3DX_ = 4; - maxWorkGroupSize3DY_ = 4; - maxWorkGroupSize3DZ_ = 4; - kernargPoolSize_ = HSA_KERNARG_POOL_SIZE; // Determine if user is requesting Non-Coherent mode @@ -201,23 +192,6 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } - if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) { - maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X; - } - if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) { - maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } - - if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) { - maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X; - } - if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) { - maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y; - } - if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) { - maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } - if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; } diff --git a/device/rocm/rocsettings.hpp b/device/rocm/rocsettings.hpp index d2fffd73..5b5f81d7 100644 --- a/device/rocm/rocsettings.hpp +++ b/device/rocm/rocsettings.hpp @@ -42,8 +42,6 @@ class Settings : public device::Settings { union { struct { uint doublePrecision_ : 1; //!< 
Enables double precision support - uint enableLocalMemory_ : 1; //!< Enable GPUVM memory - uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory uint imageDMA_ : 1; //!< Enable direct image DMA transfers uint stagedXferRead_ : 1; //!< Uses a staged buffer read @@ -55,7 +53,7 @@ class Settings : public device::Settings { uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint coop_sync_ : 1; //!< grid and multi-grid sync for gfx940+ uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint reserved_ : 18; + uint reserved_ : 20; }; uint value_; }; @@ -66,15 +64,6 @@ class Settings : public device::Settings { //! Preferred workgroup size uint preferredWorkGroupSize_; - //! Default max workgroup sizes for 2D - int maxWorkGroupSize2DX_; - int maxWorkGroupSize2DY_; - - //! Default max workgroup sizes for 3D - int maxWorkGroupSize3DX_; - int maxWorkGroupSize3DY_; - int maxWorkGroupSize3DZ_; - uint kernargPoolSize_; uint numDeviceEvents_; //!< The number of device events uint numWaitEvents_; //!< The number of wait events for device enqueue diff --git a/utils/flags.hpp b/utils/flags.hpp index 186d059b..f563c6d3 100644 --- a/utils/flags.hpp +++ b/utils/flags.hpp @@ -30,22 +30,10 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \ "The mask to enable specific kinds of logs") \ debug(uint, DEBUG_GPU_FLAGS, 0, \ "The debug options for GPU device") \ -release(uint, GPU_MAX_COMMAND_QUEUES, 300, \ - "The maximum number of concurrent Virtual GPUs") \ release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! 
*/ \ "The default command queue thread stack size") \ release(int, GPU_MAX_WORKGROUP_SIZE, 0, \ "Maximum number of workitems in a workgroup for GPU, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \ - "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0, \ - "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \ -release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0, \ - "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \ debug(bool, CPU_MEMORY_GUARD_PAGES, false, \ "Use guard pages for CPU memory") \ debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \ @@ -70,12 +58,8 @@ release(uint, GPU_STAGING_BUFFER_SIZE, 4, \ "Size of the GPU staging buffer in MiB") \ release(bool, GPU_DUMP_BLIT_KERNELS, false, \ "Dump the kernels for blit manager") \ -release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \ - "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \ release(bool, GPU_FLUSH_ON_EXECUTION, false, \ "Submit commands to HW on every operation. 
0 - Disable, 1 - Enable") \ -release(bool, GPU_USE_SYNC_OBJECTS, true, \ - "If enabled, use sync objects instead of polling") \ release(bool, CL_KHR_FP64, true, \ "Enable/Disable support for double precision") \ release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \ @@ -86,12 +70,8 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0, \ "Set clLinkProgram()'s options (override)") \ release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \ "Append clLinkProgram()'s options") \ -release(cstring, AMD_OCL_SC_LIB, 0, \ - "Set shader compiler shared library name or path") \ debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \ "Specify binary substitution config file for OpenCL") \ -debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false, \ - "Enable the error dialog on Windows") \ release(size_t, GPU_PINNED_XFER_SIZE, 32, \ "The pinned buffer size for pinning in read/write transfers in MiB") \ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \ @@ -100,12 +80,6 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \ "The resource cache size in MB") \ release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \ "The maximum size accepted for suballocaitons in KB") \ -release(bool, GPU_FORCE_64BIT_PTR, 0, \ - "Forces 64 bit pointers on GPU") \ -release(bool, GPU_FORCE_OCL20_32BIT, 0, \ - "Forces 32 bit apps to take CLANG\HSAIL path") \ -release(bool, GPU_RAW_TIMESTAMP, 0, \ - "Reports GPU raw timestamps in GPU timeline") \ release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \ "Number of memory objects for dependency tracking") \ release(size_t, GPU_XFER_BUFFER_SIZE, 0, \ @@ -116,32 +90,20 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85, \ "Maximum size of a single allocation as percentage of total") \ release(uint, GPU_NUM_COMPUTE_RINGS, 2, \ "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \ -release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1, \ - "GPU select the compute rings ID -1 - disabled, 0 , 1,.. 
- the forced compute rings ID for submission") \ release(uint, GPU_WORKLOAD_SPLIT, 22, \ "Workload split size") \ -release(bool, GPU_USE_SINGLE_SCRATCH, false, \ - "Use single scratch buffer per device instead of per HW ring") \ release(bool, AMD_OCL_WAIT_COMMAND, false, \ "1 = Enable a wait for every submitted command") \ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \ "Prints the specified number of the child kernels") \ release(bool, GPU_USE_DEVICE_QUEUE, false, \ "Use a dedicated device queue for the actual submissions") \ -release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \ - "Enable >4GB single allocations") \ release(bool, AMD_THREAD_TRACE_ENABLE, true, \ "Enable thread trace extension") \ release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200), \ "Force GPU opencl verison") \ -release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \ - "Enable HSA device local memory usage") \ release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \ "Kernarg pool size") \ -release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true, \ - "Enable device memory for coarse grain SVM allocations") \ -release(bool, GPU_IFH_MODE, false, \ - "1 = Enable GPU IFH (infinitely fast hardware) mode. 
Any other value keeps setting disabled.") \ release(bool, GPU_MIPMAP, true, \ "Enables GPU mipmap extension") \ release(uint, GPU_ENABLE_PAL, 2, \ @@ -152,8 +114,6 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \ "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \ release(uint, OCL_SET_SVM_SIZE, 4*16384, \ "set SVM space size for discrete GPU") \ -debug(uint, OCL_SYSMEM_REQUIREMENT, 2, \ - "Use flag to change the minimum requirement of system memory not to downgrade") \ release(uint, GPU_WAVES_PER_SIMD, 0, \ "Force the number of waves per SIMD (1-10)") \ release(bool, GPU_WAVE_LIMIT_ENABLE, false, \ @@ -176,10 +136,6 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \ "File path prefix for dumping wave limiter output") \ release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \ "File path prefix for tracing wave limiter") \ -release(bool, OCL_CODE_CACHE_ENABLE, false, \ - "1 = Enable compiler code cache") \ -release(bool, OCL_CODE_CACHE_RESET, false, \ - "1 = Reset the compiler code cache storage") \ release(bool, PAL_DISABLE_SDMA, false, \ "1 = Disable SDMA for PAL") \ release(uint, PAL_RGP_DISP_COUNT, 10000, \ From a7db4ab080961c7c8896547587ef38f3080bd99f Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Mon, 30 Jan 2023 12:00:45 -0800 Subject: [PATCH 23/45] SWDEV-378648 - Adding checks in runtime to abort when bad/invalid payload is sent. 
Change-Id: Ibaa7927bacef1ba067ded5ccbf3f1111f31200a2 --- device/devhostcall.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/device/devhostcall.cpp b/device/devhostcall.cpp index 12e5fe90..1729e5eb 100644 --- a/device/devhostcall.cpp +++ b/device/devhostcall.cpp @@ -88,6 +88,7 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t* } return; case SERVICE_DEVMEM: { + guarantee(payload[0] != 0 || payload[1] != 0, "Both payloads cannot be 0 \n"); if (payload[0]) { amd::Memory* mem = amd::MemObjMap::FindMemObj(reinterpret_cast(payload[0])); if (mem) { From 3bed55f826b1f26b0c3ee2e0ea5673bf83ce8b44 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Tue, 7 Feb 2023 03:00:09 -0500 Subject: [PATCH 24/45] SWDEV-2 - Change OpenCL version number from 3567 to 3568 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index 62bc75fe..fc02163e 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3567 +#define AMD_PLATFORM_BUILD_NUMBER 3568 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From b1bd4632d7d0995c763c073ebe35210aaae69a52 Mon Sep 17 00:00:00 2001 From: jatang Date: Fri, 3 Feb 2023 17:42:01 -0500 Subject: [PATCH 25/45] SWDEV-380792 - Fix floating point exception when maxEngineClockFrequency_ is 0 Change-Id: Ic443ceae586c4c84995ed2abef9bd7f32f8b60f9 --- device/rocm/rocblit.cpp | 5 ++++- device/rocm/rocdevice.cpp | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/device/rocm/rocblit.cpp b/device/rocm/rocblit.cpp index a7a3efd2..1bd6a0ff 100644 --- a/device/rocm/rocblit.cpp +++ b/device/rocm/rocblit.cpp @@ -2691,7 +2691,10 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, sp->vqueue_header = vqVM; sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam); - sp->eng_clk = 
(1000 * 1024) / dev().info().maxEngineClockFrequency_; + + if (dev().info().maxEngineClockFrequency_ > 0) { + sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; + } // Use a device side global atomics to workaround the reliance of PCIe 3 atomics sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue); diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index e63ab77e..e352577d 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -1147,7 +1147,9 @@ bool Device::populateOCLDeviceConstants() { //TODO: add the assert statement for Raven if (!(isa().versionMajor() == 9 && isa().versionMinor() == 0 && isa().versionStepping() == 2)) { - assert(info_.maxEngineClockFrequency_ > 0); + if (info_.maxEngineClockFrequency_ <= 0) { + LogError("maxEngineClockFrequency_ is NOT positive!"); + } } if (HSA_STATUS_SUCCESS != From df0c903c86b5e67843ceb7434d620a8ced22f96e Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Wed, 8 Feb 2023 10:21:36 -0800 Subject: [PATCH 26/45] SWDEV-381633 - Better log Change-Id: Ie151d73aa4b1dae7d0d9acd0457e7dbdf35f21ee --- device/devprogram.cpp | 2 +- device/rocm/rocdevice.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/device/devprogram.cpp b/device/devprogram.cpp index 6e4287b5..bb40cce0 100644 --- a/device/devprogram.cpp +++ b/device/devprogram.cpp @@ -2962,7 +2962,7 @@ bool Program::runInitFiniKernel(kernel_kind_t kind) const { amd::HostQueue* queue = nullptr; for (const auto& i : kernels_) { - LogPrintfInfo("For Init/Fini: Kernel Name: %s", i.first.c_str()); + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "For Init/Fini: Kernel Name: %s", i.first.c_str()); const auto &kernel = i.second; if ((kernel->isInitKernel() && kind == kernel_kind_t::InitKernel) || (kernel->isFiniKernel() && kind == kernel_kind_t::FiniKernel)) { diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index e352577d..f9df3075 100644 --- a/device/rocm/rocdevice.cpp +++ 
b/device/rocm/rocdevice.cpp @@ -1622,7 +1622,7 @@ bool Device::populateOCLDeviceConstants() { LogError("HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS query failed."); } - LogPrintfInfo("HMM support: %d, xnack: %d, direct host access: %d\n", + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "HMM support: %d, xnack: %d, direct host access: %d\n", info_.hmmSupported_, info_.hmmCpuMemoryAccessible_, info_.hmmDirectHostAccess_); info_.globalCUMask_ = {}; From 2d5917b555272ea3dd0cf5728e128484c46b7871 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Thu, 9 Feb 2023 03:00:04 -0500 Subject: [PATCH 27/45] SWDEV-2 - Change OpenCL version number from 3568 to 3569 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index fc02163e..f03c7f4a 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3568 +#define AMD_PLATFORM_BUILD_NUMBER 3569 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From c5833a0ee114b01d16ebfeb1ba29c9e85cd9825c Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Wed, 8 Feb 2023 17:55:08 -0500 Subject: [PATCH 28/45] SWDEV-381633 - Use blit manager on app's queue Heap initialization used device queue, but it should be used for cooperative launches only. Heap initialization must use the same queue as the current dispatch.
Change-Id: I856621bf82bbdeb1c2d0fbc4970e90d09af805cb --- device/pal/paldevice.cpp | 6 +++--- device/pal/paldevice.hpp | 2 +- device/pal/palkernel.cpp | 2 +- device/rocm/rocdevice.cpp | 6 +++--- device/rocm/rocdevice.hpp | 2 +- device/rocm/rocvirtual.cpp | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 6c60ef05..5755f34a 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -2356,8 +2356,8 @@ void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const { } // ================================================================================================ -void Device::HiddenHeapAlloc() { - auto HeapAlloc = [this]() -> bool { +void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { + auto HeapAlloc = [this, &gpu]() -> bool { // Allocate initial heap for device memory allocator static constexpr size_t HeapBufferSize = 128 * Ki; heap_buffer_ = createMemory(HeapBufferSize); @@ -2369,7 +2369,7 @@ void Device::HiddenHeapAlloc() { LogError("Heap buffer allocation failed!"); return false; } - bool result = static_cast(xferMgr()).initHeap( + bool result = static_cast(gpu.blitMgr()).initHeap( heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi)); return result; diff --git a/device/pal/paldevice.hpp b/device/pal/paldevice.hpp index 5b3b4af5..0226ea40 100644 --- a/device/pal/paldevice.hpp +++ b/device/pal/paldevice.hpp @@ -629,7 +629,7 @@ class Device : public NullDevice { #endif #endif //! 
Allocates hidden heap for device memory allocations - void HiddenHeapAlloc(); + void HiddenHeapAlloc(const VirtualGPU& gpu); private: static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex, diff --git a/device/pal/palkernel.cpp b/device/pal/palkernel.cpp index e6e4fd28..f425e3d7 100644 --- a/device/pal/palkernel.cpp +++ b/device/pal/palkernel.cpp @@ -362,7 +362,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const case amd::KernelParameterDescriptor::HiddenHeap: // Allocate hidden heap for HIP applications only if ((amd::IS_HIP) && (palDevice().HeapBuffer() == nullptr)) { - const_cast(palDevice()).HiddenHeapAlloc(); + const_cast(palDevice()).HiddenHeapAlloc(gpu); } if (palDevice().HeapBuffer() != nullptr) { // Add heap pointer to the code diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index f9df3075..9ecaf22e 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -3243,8 +3243,8 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const { } // ================================================================================================ -void Device::HiddenHeapAlloc() { - auto HeapAllocZeroOut = [this]() -> bool { +void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { + auto HeapAllocZeroOut = [this, &gpu]() -> bool { // Allocate initial heap for device memory allocator static constexpr size_t HeapBufferSize = 128 * Ki; heap_buffer_ = createMemory(HeapBufferSize); @@ -3256,7 +3256,7 @@ void Device::HiddenHeapAlloc() { LogError("Heap buffer allocation failed!"); return false; } - bool result = static_cast(xferMgr()).initHeap( + bool result = static_cast(gpu.blitMgr()).initHeap( heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi)); return result; diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index 99d0976e..d9eb4f20 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ 
-547,7 +547,7 @@ class Device : public NullDevice { bool IsValidAllocation(const void* dev_ptr, size_t size) const; //! Allocates hidden heap for device memory allocations - void HiddenHeapAlloc(); + void HiddenHeapAlloc(const VirtualGPU& gpu); private: bool create(); diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 0976b183..3b88b564 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -2958,7 +2958,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, case amd::KernelParameterDescriptor::HiddenHeap: // Allocate hidden heap for HIP applications only if ((amd::IS_HIP) && (dev().HeapBuffer() == nullptr)) { - const_cast(dev()).HiddenHeapAlloc(); + const_cast(dev()).HiddenHeapAlloc(*this); } if (dev().HeapBuffer() != nullptr) { // Add heap pointer to the code From ea66c122995c3f040ed60e6302e915e3935a9e7e Mon Sep 17 00:00:00 2001 From: Jacob Lambert Date: Mon, 6 Feb 2023 19:10:53 -0500 Subject: [PATCH 29/45] SWDEV-376413 - Revert "SWDEV-376413 - Replace deprecated Comgr device-lib action" This reverts commit 2448d5d31cacd9facaadf37a6896b036f654afb1. Reason for revert: Test failures with Luxmark, blender, and Indigobench. 
Need to investigate before re-applying Change-Id: I6b08273a8f9c8bcaa4e7a06cd42d15048e52ca2a --- device/devprogram.cpp | 37 ++++++++++++++++++++++++++----------- device/devprogram.hpp | 6 +++--- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/device/devprogram.cpp b/device/devprogram.cpp index bb40cce0..f1a2cb2e 100644 --- a/device/devprogram.cpp +++ b/device/devprogram.cpp @@ -346,7 +346,7 @@ amd_comgr_status_t Program::createAction(const amd_comgr_language_t oclver, bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, const std::vector& options, amd::option::Options* amdOptions, amd_comgr_data_set_t* output, - char* binaryData[], size_t* binarySize) { + char* binaryData[], size_t* binarySize, const bool link_dev_libs) { amd_comgr_language_t langver; setLanguage(amdOptions->oVariables->CLStd, &langver); @@ -358,13 +358,28 @@ bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, // Create the action for linking amd_comgr_action_info_t action; + amd_comgr_data_set_t dataSetDevLibs; bool hasAction = false; + bool hasDataSetDevLibs = false; amd_comgr_status_t status = createAction(langver, options, &action, &hasAction); + if (link_dev_libs) { + if (status == AMD_COMGR_STATUS_SUCCESS) { + status = amd::Comgr::create_data_set(&dataSetDevLibs); + } + + if (status == AMD_COMGR_STATUS_SUCCESS) { + hasDataSetDevLibs = true; + status = amd::Comgr::do_action(AMD_COMGR_ACTION_ADD_DEVICE_LIBRARIES, action, inputs, + dataSetDevLibs); + extractBuildLog(dataSetDevLibs); + } + } + if (status == AMD_COMGR_STATUS_SUCCESS) { status = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, - inputs, *output); + (link_dev_libs) ? 
dataSetDevLibs : inputs, *output); extractBuildLog(*output); } @@ -381,14 +396,17 @@ bool Program::linkLLVMBitcode(const amd_comgr_data_set_t inputs, amd::Comgr::destroy_action_info(action); } + if (hasDataSetDevLibs) { + amd::Comgr::destroy_data_set(dataSetDevLibs); + } + return (status == AMD_COMGR_STATUS_SUCCESS); } bool Program::compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, const std::vector& options, amd::option::Options* amdOptions, - char* binaryData[], size_t* binarySize, - const bool link_dev_libs) { + char* binaryData[], size_t* binarySize) { amd_comgr_language_t langver; setLanguage(amdOptions->oVariables->CLStd, &langver); @@ -465,12 +483,8 @@ bool Program::compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, // Compiling the source codes with precompiled headers or directly compileInputs if (status == AMD_COMGR_STATUS_SUCCESS) { - if (link_dev_libs) - status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, - action, input, output); - else - status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, - action, input, output); + status = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, + action, input, output); extractBuildLog(output); } @@ -956,8 +970,9 @@ bool Program::linkImplLC(const std::vector& inputPrograms, char* binaryData = nullptr; size_t binarySize = 0; std::vector linkOptions; + constexpr bool kLinkDevLibs = false; bool ret = linkLLVMBitcode(inputs, linkOptions, options, &output, &binaryData, - &binarySize); + &binarySize, kLinkDevLibs); amd::Comgr::destroy_data_set(output); amd::Comgr::destroy_data_set(inputs); diff --git a/device/devprogram.hpp b/device/devprogram.hpp index a324750e..3bf0679a 100644 --- a/device/devprogram.hpp +++ b/device/devprogram.hpp @@ -456,13 +456,13 @@ class Program : public amd::HeapObject { bool linkLLVMBitcode(const amd_comgr_data_set_t inputs, const std::vector& options, amd::option::Options* amdOptions, amd_comgr_data_set_t* output, 
- char* binaryData[] = nullptr, size_t* binarySize = nullptr); + char* binaryData[] = nullptr, size_t* binarySize = nullptr, + const bool link_dev_libs = true); //! Create the bitcode of the compiled input dataset bool compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, const std::vector& options, amd::option::Options* amdOptions, - char* binaryData[], size_t* binarySize, - const bool link_dev_libs = true); + char* binaryData[], size_t* binarySize); //! Compile and create the excutable of the input dataset bool compileAndLinkExecutable(const amd_comgr_data_set_t inputs, From ca70cd7c3d63b1036e604116f3b269ca123e7904 Mon Sep 17 00:00:00 2001 From: Jeremy Newton Date: Wed, 8 Feb 2023 14:03:58 -0500 Subject: [PATCH 30/45] SWDEV-1 - device: Add missing include Newer GCC's seem to require this. Change-Id: I85926d4fa552b772f2eb9f8ede7863a546c47f54 Signed-off-by: Jeremy Newton --- device/devhcprintf.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/device/devhcprintf.cpp b/device/devhcprintf.cpp index b36f7758..a5aba4a4 100644 --- a/device/devhcprintf.cpp +++ b/device/devhcprintf.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include #include From 20dfaee8767f7bc9df722fdbc397d77eed267607 Mon Sep 17 00:00:00 2001 From: Chauncey Hui Date: Fri, 10 Feb 2023 03:00:12 -0500 Subject: [PATCH 31/45] SWDEV-2 - Change OpenCL version number from 3569 to 3570 --- utils/versions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/versions.hpp b/utils/versions.hpp index f03c7f4a..c8c96219 100644 --- a/utils/versions.hpp +++ b/utils/versions.hpp @@ -28,7 +28,7 @@ #endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -#define AMD_PLATFORM_BUILD_NUMBER 3569 +#define AMD_PLATFORM_BUILD_NUMBER 3570 #endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER From c7bf6a007212e8fe387e11f74789c94c1b63f27b Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Mon, 7 Nov 2022 16:47:59 +0000 Subject: [PATCH 32/45] SWDEV-365908 - 
Do not align mem size for IPC Remove alignment to granularity for IPC handles as ROCr has a patch that will internally validate pointer sizes against requested size during allocation instead of size aligned to page size. This patch is needed together with this patch from ROCr: f8a42a3a:Use user requested size for memory fragments Change-Id: I28b25558ea03c836b44fafdb34b7330cf6887424 --- device/rocm/rocdevice.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 9ecaf22e..f2ebe0f0 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -2209,8 +2209,7 @@ bool Device::IpcCreate(void* dev_ptr, size_t* mem_size, void* handle, size_t* me return false; } - // Pass the pointer and memory size to retrieve the handle - hsa_status = hsa_amd_ipc_memory_create(orig_dev_ptr, amd::alignUp(*mem_size, alloc_granularity()), + hsa_status = hsa_amd_ipc_memory_create(orig_dev_ptr, *mem_size, reinterpret_cast(handle)); if (hsa_status != HSA_STATUS_SUCCESS) { From 03944161724c523dcdb5cb536df1bf349297584f Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Mon, 13 Feb 2023 14:58:05 -0800 Subject: [PATCH 33/45] SWDEV-301667 - Refactor code Change-Id: I8f20d994d88acb14f392eaa9d5cd14620667d7f6 --- device/rocm/rocvirtual.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 3b88b564..feeef138 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -433,8 +433,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( prof_signal->ts_ = ts; ts->AddProfilingSignal(prof_signal); if (AMD_DIRECT_DISPATCH) { - bool enqueHandler= false; - uint32_t init_value = kInitSignalValueOne; + bool enqueHandler = false; enqueHandler = (ts->command().Callback() != nullptr || ts->command().GetBatchHead() != nullptr ) && !ts->command().CpuWaitRequested(); From 53ed2f3252a7eef4ae7617154d9bdbf74761e321 Mon Sep 17 
00:00:00 2001 From: Anusha GodavarthySurya Date: Thu, 9 Feb 2023 23:46:06 +0000 Subject: [PATCH 34/45] SWDEV-380035 - Check for agent and ptr match for hsa LOCKED ptr Change-Id: I2503aa05512aebc3535963e188ca8d1dbfab08f0 --- device/rocm/rocdevice.cpp | 36 +++++++++++++++++++++--------------- device/rocm/rocdevice.hpp | 2 +- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index f2ebe0f0..95a4d375 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -3188,7 +3188,9 @@ device::Signal* Device::createSignal() const { amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size) { // Only create arena_mem_object if CPU memory is accessible from HMM // or if runtime received an interop from another ROCr's client - if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size)) { + hsa_amd_pointer_info_t ptr_info = {}; + ptr_info.size = sizeof(hsa_amd_pointer_info_t); + if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size, &ptr_info)) { return nullptr; } @@ -3203,12 +3205,18 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size return arena_mem_obj_; } } - // Calculate the offset of the pointer. 
- const void* dev_ptr = reinterpret_cast(arena_mem_obj_->getDeviceMemory( - *arena_mem_obj_->getContext().devices()[0])->virtualAddress()); - offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr); - + const void* dev_ptr = reinterpret_cast( + arena_mem_obj_->getDeviceMemory(*arena_mem_obj_->getContext().devices()[0]) + ->virtualAddress()); + // System memory which has been locked + if (ptr_info.type == HSA_EXT_POINTER_TYPE_LOCKED && + getCpuAgent().handle == ptr_info.agentOwner.handle && ptr_info.hostBaseAddress == ptr) { + offset = + reinterpret_cast(ptr_info.agentBaseAddress) - reinterpret_cast(dev_ptr); + } else { + offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr); + } return arena_mem_obj_; } @@ -3220,20 +3228,18 @@ void Device::ReleaseGlobalSignal(void* signal) const { } // ================================================================================================ -bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const { - hsa_amd_pointer_info_t ptr_info = {}; - ptr_info.size = sizeof(hsa_amd_pointer_info_t); +bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) { // Query ptr type to see if it's a HMM allocation - hsa_status_t status = hsa_amd_pointer_info( - const_cast(dev_ptr), &ptr_info, nullptr, nullptr, nullptr); + hsa_status_t status = + hsa_amd_pointer_info(const_cast(dev_ptr), ptr_info, nullptr, nullptr, nullptr); // The call should never fail in ROCR, but just check for an error and continue if (status != HSA_STATUS_SUCCESS) { LogError("hsa_amd_pointer_info() failed"); } - // Check if it's a legacy non-HMM allocation in ROCr - if (ptr_info.type != HSA_EXT_POINTER_TYPE_UNKNOWN) { - if ((size != 0) && ((reinterpret_cast(dev_ptr) - - reinterpret_cast(ptr_info.agentBaseAddress)) > size)) { + if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) { + if ((size != 0) && + ((reinterpret_cast(dev_ptr) - + reinterpret_cast(ptr_info->agentBaseAddress)) > size)) { return false; } 
return true; diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index d9eb4f20..275fe457 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -544,7 +544,7 @@ class Device : public NullDevice { const bool isFineGrainSupported() const; //! Returns True if memory pointer is known to ROCr (excludes HMM allocations) - bool IsValidAllocation(const void* dev_ptr, size_t size) const; + bool IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info); //! Allocates hidden heap for device memory allocations void HiddenHeapAlloc(const VirtualGPU& gpu); From 8aa5804efadc59b3b16e4266c16397d170dbde4b Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 15 Feb 2023 07:23:23 +0000 Subject: [PATCH 35/45] SWDEV-368235 - Revert "Remove obsolete env variables" This reverts commit 21e50998dd2439708a6672dac5a04be1dd3da682. Reason for revert: Deferred to a future release. Change-Id: Ia66c37f0ab9734dee73c930d10d7469d5fd57254 --- device/device.cpp | 11 +++ device/device.hpp | 3 +- device/devkernel.cpp | 144 ++++++++++++++++++++---------------- device/pal/paldevice.cpp | 4 +- device/pal/palsettings.cpp | 39 +++++++++- device/pal/palsettings.hpp | 4 +- device/pal/palvirtual.cpp | 25 ++++++- device/rocm/rocdevice.cpp | 56 +++++++------- device/rocm/rocdevice.hpp | 2 +- device/rocm/rocsettings.cpp | 26 +++++++ device/rocm/rocsettings.hpp | 13 +++- utils/flags.hpp | 44 +++++++++++ 12 files changed, 269 insertions(+), 102 deletions(-) diff --git a/device/device.cpp b/device/device.cpp index 288f84f6..f87452db 100644 --- a/device/device.cpp +++ b/device/device.cpp @@ -798,6 +798,17 @@ Settings::Settings() : value_(0) { commandQueues_ = 200; //!< Field value set to maximum number //!< concurrent Virtual GPUs for default + overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0; + overrideLclSet |= + (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y)) + ? 
2 + : 0; + overrideLclSet |= + (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z)) + ? 4 + : 0; + fenceScopeAgent_ = AMD_OPT_FLUSH; if (amd::IS_HIP) { if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) { diff --git a/device/device.hpp b/device/device.hpp index ed206311..a493795e 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -619,6 +619,7 @@ class Settings : public amd::HeapObject { uint64_t extensions_; //!< Supported OCL extensions union { struct { + uint overrideLclSet : 3; //!< Bit mask to override the local size uint apuSystem_ : 1; //!< Device is APU system with shared memory uint supportRA_ : 1; //!< Support RA channel order format uint waitCommand_ : 1; //!< Enables a wait for every submitted command @@ -638,7 +639,7 @@ class Settings : public amd::HeapObject { uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions - uint reserved_ : 14; + uint reserved_ : 11; }; uint value_; }; diff --git a/device/devkernel.cpp b/device/devkernel.cpp index 42eb1857..e598dca7 100644 --- a/device/devkernel.cpp +++ b/device/devkernel.cpp @@ -677,80 +677,100 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, if (workGroupInfo()->compileSize_[0] == 0) { // Find the default local workgroup size, if it wasn't specified if (lclWorkSize[0] == 0) { - // Find threads per group - size_t thrPerGrp = workGroupInfo()->size_; - - // Check if kernel uses images - if (flags_.imageEna_ && - // and thread group is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { - // Use 8x8 workgroup size if kernel has image writes - if 
(flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; + if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) { + // Find threads per group + size_t thrPerGrp = workGroupInfo()->size_; + + // Check if kernel uses images + if (flags_.imageEna_ && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) { + // Use 8x8 workgroup size if kernel has image writes + if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) { + lclWorkSize[0] = 8; + lclWorkSize[1] = 8; + } + else { + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; + } } else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; - } - } - else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--) - ; - lclWorkSize[d] = div; - tmp /= div; - } - - // Assuming DWORD access - const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; - - // Check if we couldn't find optimal workload - if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || - // or size is too small for the cache line - (lclWorkSize[0] < cacheLineMatch)) { - size_t maxSize = 0; - size_t maxDim = 0; + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--) + ; + lclWorkSize[d] = div; + tmp /= div; } - // Use X dimension as high priority. 
Runtime will assume that - // X dimension is more important for the address calculation - if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { - lclWorkSize[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 1; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; + + // Assuming DWORD access + const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2; + + // Check if we couldn't find optimal workload + if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || + // or size is too small for the cache line + (lclWorkSize[0] < cacheLineMatch)) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; } } - } - else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; + // Use X dimension as high priority. Runtime will assume that + // X dimension is more important for the address calculation + if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { + lclWorkSize[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 1; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; + else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } } } } } } + else { + // Use overrides when app doesn't provide workgroup dimensions + if (workDim == 1) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; + } + else if (workDim == 2) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } + else if (workDim == 3) { + 
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; + lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } + else { + assert(0 && "Invalid workDim!"); + } + } } } else { diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 5755f34a..8b07d391 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -291,7 +291,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve nullptr, nullptr, nullptr, - nullptr}; + AMD_OCL_SC_LIB}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); @@ -1013,7 +1013,7 @@ bool Device::create(Pal::IDevice* device) { nullptr, nullptr, nullptr, - nullptr}; + AMD_OCL_SC_LIB}; // Initialize the compiler handle acl_error error; compiler_ = amd::Hsail::CompilerInit(&opts, &error); diff --git a/device/pal/palsettings.cpp b/device/pal/palsettings.cpp index 59ac82b8..1f4dc8e9 100644 --- a/device/pal/palsettings.cpp +++ b/device/pal/palsettings.cpp @@ -76,6 +76,8 @@ Settings::Settings() { // Enable workload split by default (for 24 bit arithmetic or timeout) workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; + // By default use host blit + blitEngine_ = BlitEngineHost; pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi; pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE) ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; @@ -121,6 +123,8 @@ Settings::Settings() { //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS linearPersistentImage_ = false; + useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; + // Device enqueuing settings numDeviceEvents_ = 1024; numWaitEvents_ = 8; @@ -324,11 +328,16 @@ bool Settings::create(const Pal::DeviceProperties& palProp, libSelector_ = amd::GPU_Library_CI; if (LP64_SWITCH(false, true)) { - oclVersion_ = !reportAsOCL12Device + oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ + ? 
XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) + : OpenCL12; + } + if (GPU_FORCE_OCL20_32BIT) { + force32BitOcl20_ = true; + oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; } - if (OPENCL_VERSION < 200) { oclVersion_ = OpenCL12; } @@ -337,13 +346,27 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // Cap at OpenCL20 for now if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20; - use64BitPtr_ = LP64_SWITCH(false, true); + // This needs to be cleaned once 64bit addressing is stable + if (oclVersion_ < OpenCL20) { + use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) + ? LP64_SWITCH(false, + /*calAttr.isWorkstation ||*/ true) + : GPU_FORCE_64BIT_PTR; + } else { + if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) { + use64BitPtr_ = true; + } + } if (oclVersion_ >= OpenCL20) { supportDepthsRGB_ = true; } if (use64BitPtr_) { - maxAllocSize_ = 64ULL * Gi; + if (GPU_ENABLE_LARGE_ALLOCATION) { + maxAllocSize_ = 64ULL * Gi; + } else { + maxAllocSize_ = 4048 * Mi; + } } else { maxAllocSize_ = 3ULL * Gi; } @@ -424,6 +447,9 @@ bool Settings::create(const Pal::DeviceProperties& palProp, imageSupport_ = true; + // Use kernels for blit if appropriate + blitEngine_ = BlitEngineKernel; + hostMemDirectAccess_ |= HostMemBuffer; // HW doesn't support untiled image writes // hostMemDirectAccess_ |= HostMemImage; @@ -516,6 +542,11 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } + // Override blit engine type + if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) { + blitEngine_ = GPU_BLIT_ENGINE_TYPE; + } + if (!flagIsDefault(DEBUG_GPU_FLAGS)) { debugFlags_ = DEBUG_GPU_FLAGS; } diff --git a/device/pal/palsettings.hpp b/device/pal/palsettings.hpp index 4bcc7e2e..66984622 100644 --- a/device/pal/palsettings.hpp +++ b/device/pal/palsettings.hpp @@ -70,6 +70,7 @@ class Settings : public device::Settings { uint gfx10Plus_ : 1; //!< gfx10 and post gfx10 
features uint threadTraceEnable_ : 1; //!< Thread trace enable uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent + uint useSingleScratch_ : 1; //!< Allocates single scratch per device uint svmAtomics_ : 1; //!< SVM device atomics uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support uint useDeviceQueue_ : 1; //!< Submit to separate device queue @@ -81,7 +82,7 @@ class Settings : public device::Settings { uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 uint disableSdma_ : 1; //!< Disable SDMA support uint alwaysResident_ : 1; //!< Make resources resident at allocation time - uint reserved_ : 8; + uint reserved_ : 7; }; uint value_; }; @@ -94,6 +95,7 @@ class Settings : public device::Settings { uint workloadSplitSize_; //!< Workload split size uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms + uint blitEngine_; //!< Blit engine type uint cacheLineSize_; //!< Cache line size in bytes uint cacheSize_; //!< L1 cache size in bytes uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings diff --git a/device/pal/palvirtual.cpp b/device/pal/palvirtual.cpp index 3a208171..624379d1 100644 --- a/device/pal/palvirtual.cpp +++ b/device/pal/palvirtual.cpp @@ -905,6 +905,11 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, // because destructor calls eraseResourceList() even if create() failed dev().resizeResoureList(index()); + if (index() >= GPU_MAX_COMMAND_QUEUES) { + // Cap the maximum number of concurrent Virtual GPUs + return false; + } + // Virtual GPU will have profiling enabled state_.profiling_ = profiling; @@ -1015,7 +1020,18 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, return false; } - blitMgr_ = new KernelBlitManager(*this, blitSetup); + // Choose the appropriate class for blit engine + switch (dev().settings().blitEngine_) { + default: + // Fall through ... 
+ case Settings::BlitEngineHost: + blitSetup.disableAll(); + // Fall through ... + case Settings::BlitEngineCAL: + case Settings::BlitEngineKernel: + blitMgr_ = new KernelBlitManager(*this, blitSetup); + break; + } if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) { LogError("Could not create BlitManager!"); return false; @@ -3253,8 +3269,11 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) { cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); uint64_t endTimeStampCPU = amd::Os::timeNanos(); - // Adjust the base time by the execution time - readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; + // Make sure the command batch has a valid GPU TS + if (!GPU_RAW_TIMESTAMP) { + // Adjust the base time by the execution time + readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; + } } } } diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 95a4d375..21514671 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -894,35 +894,37 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo Device* dev = reinterpret_cast(data); switch (segment_type) { case HSA_REGION_SEGMENT_GLOBAL: { - uint32_t global_flag = 0; - hsa_status_t stat = - hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } + if (dev->settings().enableLocalMemory_) { + uint32_t global_flag = 0; + hsa_status_t stat = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } - if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { - dev->gpu_fine_grained_segment_ = pool; - } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { - dev->gpuvm_segment_ = pool; - - // If cpu agent cannot access this pool, the device does not support large bar. 
- hsa_amd_memory_pool_access_t tmp{}; - hsa_amd_agent_memory_pool_get_info( - dev->cpu_agent_, - pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &tmp); - - if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { - dev->info_.largeBar_ = false; - } else { - dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; + if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { + dev->gpu_fine_grained_segment_ = pool; + } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { + dev->gpuvm_segment_ = pool; + + // If cpu agent cannot access this pool, the device does not support large bar. + hsa_amd_memory_pool_access_t tmp{}; + hsa_amd_agent_memory_pool_get_info( + dev->cpu_agent_, + pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &tmp); + + if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + dev->info_.largeBar_ = false; + } else { + dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; + } } - } - if (dev->gpuvm_segment_.handle == 0) { - dev->gpuvm_segment_ = pool; + if (dev->gpuvm_segment_.handle == 0) { + dev->gpuvm_segment_ = pool; + } } break; } @@ -1232,7 +1234,7 @@ bool Device::populateOCLDeviceConstants() { info_.maxWorkItemDimensions_ = 3; - if (gpuvm_segment_.handle != 0) { + if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) { size_t global_segment_size = 0; if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE, diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index 275fe457..3fcbf039 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -218,7 +218,7 @@ class NullDevice : public amd::Device { //! 
Determine if we can use device memory for SVM const bool forceFineGrain(amd::Memory* memory) const { - return (memory->getContext().devices().size() > 1); + return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1); } virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle) { diff --git a/device/rocm/rocsettings.cpp b/device/rocm/rocsettings.cpp index 24cb88dc..71c341ad 100644 --- a/device/rocm/rocsettings.cpp +++ b/device/rocm/rocsettings.cpp @@ -35,9 +35,18 @@ Settings::Settings() { // Set this to true when we drop the flag doublePrecision_ = ::CL_KHR_FP64; + enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE; + enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM; + maxWorkGroupSize_ = 1024; preferredWorkGroupSize_ = 256; + maxWorkGroupSize2DX_ = 16; + maxWorkGroupSize2DY_ = 16; + maxWorkGroupSize3DX_ = 4; + maxWorkGroupSize3DY_ = 4; + maxWorkGroupSize3DZ_ = 4; + kernargPoolSize_ = HSA_KERNARG_POOL_SIZE; // Determine if user is requesting Non-Coherent mode @@ -192,6 +201,23 @@ void Settings::override() { preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; } + if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) { + maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X; + } + if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) { + maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } + + if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) { + maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X; + } + if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) { + maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y; + } + if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) { + maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } + if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; } diff --git a/device/rocm/rocsettings.hpp b/device/rocm/rocsettings.hpp index 5b5f81d7..d2fffd73 100644 --- a/device/rocm/rocsettings.hpp +++ b/device/rocm/rocsettings.hpp @@ -42,6 +42,8 @@ class Settings : public device::Settings { union { struct { uint doublePrecision_ : 1; //!< 
Enables double precision support + uint enableLocalMemory_ : 1; //!< Enable GPUVM memory + uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory uint imageDMA_ : 1; //!< Enable direct image DMA transfers uint stagedXferRead_ : 1; //!< Uses a staged buffer read @@ -53,7 +55,7 @@ class Settings : public device::Settings { uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint coop_sync_ : 1; //!< grid and multi-grid sync for gfx940+ uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint reserved_ : 20; + uint reserved_ : 18; }; uint value_; }; @@ -64,6 +66,15 @@ class Settings : public device::Settings { //! Preferred workgroup size uint preferredWorkGroupSize_; + //! Default max workgroup sizes for 2D + int maxWorkGroupSize2DX_; + int maxWorkGroupSize2DY_; + + //! Default max workgroup sizes for 3D + int maxWorkGroupSize3DX_; + int maxWorkGroupSize3DY_; + int maxWorkGroupSize3DZ_; + uint kernargPoolSize_; uint numDeviceEvents_; //!< The number of device events uint numWaitEvents_; //!< The number of wait events for device enqueue diff --git a/utils/flags.hpp b/utils/flags.hpp index f563c6d3..186d059b 100644 --- a/utils/flags.hpp +++ b/utils/flags.hpp @@ -30,10 +30,22 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \ "The mask to enable specific kinds of logs") \ debug(uint, DEBUG_GPU_FLAGS, 0, \ "The debug options for GPU device") \ +release(uint, GPU_MAX_COMMAND_QUEUES, 300, \ + "The maximum number of concurrent Virtual GPUs") \ release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! 
*/ \ "The default command queue thread stack size") \ release(int, GPU_MAX_WORKGROUP_SIZE, 0, \ "Maximum number of workitems in a workgroup for GPU, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \ + "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0, \ + "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0, \ + "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0, \ + "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \ +release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0, \ + "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \ debug(bool, CPU_MEMORY_GUARD_PAGES, false, \ "Use guard pages for CPU memory") \ debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \ @@ -58,8 +70,12 @@ release(uint, GPU_STAGING_BUFFER_SIZE, 4, \ "Size of the GPU staging buffer in MiB") \ release(bool, GPU_DUMP_BLIT_KERNELS, false, \ "Dump the kernels for blit manager") \ +release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \ + "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \ release(bool, GPU_FLUSH_ON_EXECUTION, false, \ "Submit commands to HW on every operation. 
0 - Disable, 1 - Enable") \ +release(bool, GPU_USE_SYNC_OBJECTS, true, \ + "If enabled, use sync objects instead of polling") \ release(bool, CL_KHR_FP64, true, \ "Enable/Disable support for double precision") \ release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \ @@ -70,8 +86,12 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0, \ "Set clLinkProgram()'s options (override)") \ release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \ "Append clLinkProgram()'s options") \ +release(cstring, AMD_OCL_SC_LIB, 0, \ + "Set shader compiler shared library name or path") \ debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \ "Specify binary substitution config file for OpenCL") \ +debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false, \ + "Enable the error dialog on Windows") \ release(size_t, GPU_PINNED_XFER_SIZE, 32, \ "The pinned buffer size for pinning in read/write transfers in MiB") \ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \ @@ -80,6 +100,12 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \ "The resource cache size in MB") \ release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \ "The maximum size accepted for suballocaitons in KB") \ +release(bool, GPU_FORCE_64BIT_PTR, 0, \ + "Forces 64 bit pointers on GPU") \ +release(bool, GPU_FORCE_OCL20_32BIT, 0, \ + "Forces 32 bit apps to take CLANG\HSAIL path") \ +release(bool, GPU_RAW_TIMESTAMP, 0, \ + "Reports GPU raw timestamps in GPU timeline") \ release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \ "Number of memory objects for dependency tracking") \ release(size_t, GPU_XFER_BUFFER_SIZE, 0, \ @@ -90,20 +116,32 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85, \ "Maximum size of a single allocation as percentage of total") \ release(uint, GPU_NUM_COMPUTE_RINGS, 2, \ "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \ +release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1, \ + "GPU select the compute rings ID -1 - disabled, 0 , 1,.. 
- the forced compute rings ID for submission") \ release(uint, GPU_WORKLOAD_SPLIT, 22, \ "Workload split size") \ +release(bool, GPU_USE_SINGLE_SCRATCH, false, \ + "Use single scratch buffer per device instead of per HW ring") \ release(bool, AMD_OCL_WAIT_COMMAND, false, \ "1 = Enable a wait for every submitted command") \ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \ "Prints the specified number of the child kernels") \ release(bool, GPU_USE_DEVICE_QUEUE, false, \ "Use a dedicated device queue for the actual submissions") \ +release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \ + "Enable >4GB single allocations") \ release(bool, AMD_THREAD_TRACE_ENABLE, true, \ "Enable thread trace extension") \ release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200), \ "Force GPU opencl verison") \ +release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \ + "Enable HSA device local memory usage") \ release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \ "Kernarg pool size") \ +release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true, \ + "Enable device memory for coarse grain SVM allocations") \ +release(bool, GPU_IFH_MODE, false, \ + "1 = Enable GPU IFH (infinitely fast hardware) mode. 
Any other value keeps setting disabled.") \ release(bool, GPU_MIPMAP, true, \ "Enables GPU mipmap extension") \ release(uint, GPU_ENABLE_PAL, 2, \ @@ -114,6 +152,8 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \ "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \ release(uint, OCL_SET_SVM_SIZE, 4*16384, \ "set SVM space size for discrete GPU") \ +debug(uint, OCL_SYSMEM_REQUIREMENT, 2, \ + "Use flag to change the minimum requirement of system memory not to downgrade") \ release(uint, GPU_WAVES_PER_SIMD, 0, \ "Force the number of waves per SIMD (1-10)") \ release(bool, GPU_WAVE_LIMIT_ENABLE, false, \ @@ -136,6 +176,10 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \ "File path prefix for dumping wave limiter output") \ release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \ "File path prefix for tracing wave limiter") \ +release(bool, OCL_CODE_CACHE_ENABLE, false, \ + "1 = Enable compiler code cache") \ +release(bool, OCL_CODE_CACHE_RESET, false, \ + "1 = Reset the compiler code cache storage") \ release(bool, PAL_DISABLE_SDMA, false, \ "1 = Disable SDMA for PAL") \ release(uint, PAL_RGP_DISP_COUNT, 10000, \ From c7905c502e7cc385bc4d1fb8798b621af2f97f5b Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Mon, 9 Jan 2023 15:03:48 -0800 Subject: [PATCH 36/45] SWDEV-376697 - Use PCIE fine grain memory pool when hipDeviceMallocUncached flag is used. 
Change-Id: I7234d456ef2df42b0b9b9e0d2647ee9bded565f9 --- device/rocm/rocmemory.cpp | 2 +- platform/memory.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/device/rocm/rocmemory.cpp b/device/rocm/rocmemory.cpp index 8aacd09d..d267e0d0 100644 --- a/device/rocm/rocmemory.cpp +++ b/device/rocm/rocmemory.cpp @@ -788,7 +788,7 @@ bool Buffer::create(bool alloc_local) { } else { assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!"); deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0, - (memFlags & ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN) != 0); + (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0); } owner()->setSvmPtr(deviceMemory_); } else { diff --git a/platform/memory.hpp b/platform/memory.hpp index 0b6ace9f..ba89e18c 100644 --- a/platform/memory.hpp +++ b/platform/memory.hpp @@ -41,7 +41,7 @@ #define ROCCLR_MEM_HSA_SIGNAL_MEMORY (1u << 30) #define ROCCLR_MEM_INTERNAL_MEMORY (1u << 29) #define CL_MEM_VA_RANGE_AMD (1u << 28) -#define ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN (1u << 27) +#define ROCCLR_MEM_HSA_UNCACHED (1u << 27) namespace device { class Memory; From abc782b807f1f4e202b292c49987bde5012ef90f Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Thu, 1 Dec 2022 13:32:35 +0000 Subject: [PATCH 37/45] SWDEV-366636 - Fix performance drop in TF-RCCL models Change-Id: Idc845bb0dab858b94b9d2720cae8308cac2e7328 --- device/device.hpp | 11 ++++++++--- device/rocm/rocdevice.cpp | 16 ++++++++++++++-- device/rocm/rocdevice.hpp | 2 ++ device/rocm/rocvirtual.hpp | 6 +++--- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/device/device.hpp b/device/device.hpp index a493795e..cf5bdb33 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -1786,9 +1786,14 @@ class Device : public RuntimeObject { // Returns the status of HW event, associated with amd::Event virtual bool IsHwEventReady( - const amd::Event& event, //!< AMD event for HW status validation - bool wait = false //!< If true 
then forces the event completion - ) const { + const amd::Event& event, //!< AMD event for HW status validation + bool wait = false) const { //!< If true then forces the event completion + return false; + }; + + // Returns the status of HW event, associated with amd::Event + virtual bool IsHwEventReadyForcedWait( + const amd::Event& event) const { //!< AMD event for HW status validation return false; }; diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 21514671..b46377dc 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -2710,10 +2710,22 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI return result; } +// ================================================================================================ +bool Device::IsHwEventReadyForcedWait(const amd::Event& event) const { + void* hw_event = + (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent(); + if (hw_event == nullptr) { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); + return false; + } + static constexpr bool Timeout = true; + return WaitForSignal(reinterpret_cast(hw_event)->signal_, false, true); +} + // ================================================================================================ bool Device::IsHwEventReady(const amd::Event& event, bool wait) const { - void* hw_event = (event.NotifyEvent() != nullptr) ? - event.NotifyEvent()->HwEvent() : event.HwEvent(); + void* hw_event = + (event.NotifyEvent() != nullptr) ? 
event.NotifyEvent()->HwEvent() : event.HwEvent(); if (hw_event == nullptr) { ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); return false; diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index 3fcbf039..b36d21ef 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -257,6 +257,7 @@ class NullDevice : public amd::Device { cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; } virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; } + virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const { return false; } virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {}; virtual void ReleaseGlobalSignal(void* signal) const {} @@ -442,6 +443,7 @@ class Device : public NullDevice { cl_set_device_clock_mode_output_amd* pSetClockModeOutput); virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const; + virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const; virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const; virtual void ReleaseGlobalSignal(void* signal) const; diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index 74d624a1..1553dfe9 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -46,10 +46,10 @@ constexpr static uint64_t kUnlimitedWait = std::numeric_limits::max(); // Active wait time out incase same sdma engine is used again, // then just wait instead of adding dependency wait signal. 
-constexpr static uint64_t kSDMAEngineTimeout = 10; +constexpr static uint64_t kForcedTimeout = 10; template -inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sdma_wait = false) { +inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool forced_wait = false) { if (hsa_signal_load_relaxed(signal) > 0) { uint64_t timeout = kTimeout100us; if (active_wait) { @@ -57,7 +57,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sd } if (active_wait_timeout) { // If diff engine, wait to 10 ms. Otherwise no wait - timeout = (sdma_wait ? kSDMAEngineTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K; + timeout = (forced_wait ? kForcedTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K; if (timeout == 0) { return false; } From fc61d9db113d1938bbc43d8cdde24d0d8ca5ba97 Mon Sep 17 00:00:00 2001 From: German Date: Thu, 16 Feb 2023 15:48:57 -0500 Subject: [PATCH 38/45] SWDEV-382397 - Move VirtualGPU destruction back to the thread exit OS can terminate unfinished queue thread from default stream at any time. Potentially leaving the queue lock in a bad state and causing a deadlock if runtime destroys VirtualGPU later from the host thread. 
Change-Id: I247f102ee84e6b4dba947504933395071945c85d --- platform/commandqueue.cpp | 1 + platform/commandqueue.hpp | 10 +++------- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/platform/commandqueue.cpp b/platform/commandqueue.cpp index 90573b4c..8b27b4ce 100644 --- a/platform/commandqueue.cpp +++ b/platform/commandqueue.cpp @@ -66,6 +66,7 @@ bool HostQueue::terminate() { marker->awaitCompletion(); marker->release(); } + thread_.Release(); thread_.acceptingCommands_ = false; } else { if (Os::isThreadAlive(thread_)) { diff --git a/platform/commandqueue.hpp b/platform/commandqueue.hpp index ea702a5e..f98332bf 100644 --- a/platform/commandqueue.hpp +++ b/platform/commandqueue.hpp @@ -164,19 +164,13 @@ class HostQueue : public CommandQueue { acceptingCommands_(false), virtualDevice_(nullptr) {} - virtual ~Thread() { - if (virtualDevice_ != nullptr) { - delete virtualDevice_; - virtualDevice_ = nullptr; - } - } - //! The command queue thread entry point. void run(void* data) { HostQueue* queue = static_cast(data); virtualDevice_ = queue->device().createVirtualDevice(queue); if (virtualDevice_ != nullptr) { queue->loop(virtualDevice_); + Release(); } else { acceptingCommands_ = false; queue->flush(); @@ -190,6 +184,8 @@ class HostQueue : public CommandQueue { } } + void Release() const { delete virtualDevice_; } + //! 
Get virtual device for the current thread device::VirtualDevice* vdev() const { return virtualDevice_; } From 43abc163a45ed68530d1021829186a180a49d1bf Mon Sep 17 00:00:00 2001 From: Todd tiantuo Li Date: Tue, 21 Feb 2023 02:54:35 -0800 Subject: [PATCH 39/45] SWDEV-382501 - support new ASIC Revision for Phoenix2 Change-Id: I0225511960c9fbc50ec541511599ef87e509b567 --- device/pal/paldevice.cpp | 1 + device/pal/palsettings.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 8b07d391..3d758bbd 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -108,6 +108,7 @@ static constexpr PalDevice supportedPalDevices[] = { {11, 0, 1, Pal::GfxIpLevel::GfxIp11_0, "gfx1101", Pal::AsicRevision::Navi32}, {11, 0, 2, Pal::GfxIpLevel::GfxIp11_0, "gfx1102", Pal::AsicRevision::Navi33}, {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix1}, + {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix2}, }; static std::tuple findIsa(Pal::AsicRevision asicRevision, diff --git a/device/pal/palsettings.cpp b/device/pal/palsettings.cpp index 1f4dc8e9..b85a2df0 100644 --- a/device/pal/palsettings.cpp +++ b/device/pal/palsettings.cpp @@ -205,6 +205,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp, case Pal::AsicRevision::Navi31: // Fall through for Navi2x ... 
case Pal::AsicRevision::Phoenix1: + case Pal::AsicRevision::Phoenix2: case Pal::AsicRevision::Raphael: case Pal::AsicRevision::Rembrandt: case Pal::AsicRevision::Navi24: From c5f131c5d32f57ff647f115c3e537f8def711e34 Mon Sep 17 00:00:00 2001 From: "Xie,AlexBin" Date: Fri, 17 Feb 2023 18:49:15 -0500 Subject: [PATCH 40/45] SWDEV-378367 - Observed performance drop for Geekbench5 Change-Id: I0beabc6e3bec095574c8168fcf52af1e94105792 --- device/pal/palsettings.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/device/pal/palsettings.cpp b/device/pal/palsettings.cpp index b85a2df0..080654c1 100644 --- a/device/pal/palsettings.cpp +++ b/device/pal/palsettings.cpp @@ -78,9 +78,10 @@ Settings::Settings() { // By default use host blit blitEngine_ = BlitEngineHost; - pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi; + pinnedXferSize_ = GPU_PINNED_XFER_SIZE * Mi; + size_t defaultMinXferSize = amd::IS_HIP ? 128: 4; pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE) - ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; + ? defaultMinXferSize * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; // Disable FP_FAST_FMA defines by default reportFMAF_ = false; From d287ba44be1245554c91a067b5250934ecf4c8bd Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Thu, 16 Feb 2023 16:57:50 +0000 Subject: [PATCH 41/45] SWDEV-382664 - Remove WaveLimiter init The wavelimiter init was unintentionally added in one change. It wasn't supposed to perform any logic, since LC doesn't support it and the number of waves can be overwritten only with an environment variable. 
Change-Id: I447bd1ad685800f874b6a6fb7409dc67e43640ff --- device/pal/palkernel.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/device/pal/palkernel.cpp b/device/pal/palkernel.cpp index f425e3d7..035a6288 100644 --- a/device/pal/palkernel.cpp +++ b/device/pal/palkernel.cpp @@ -498,7 +498,6 @@ const LightningProgram& LightningKernel::prog() const { #if defined(USE_COMGR_LIBRARY) bool LightningKernel::init() { - waveLimiter_.enable(); return GetAttrCodePropMetadata(); } From afebcb413aed0441ef684fdf1f0675f54676c6a5 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Fri, 20 Jan 2023 15:34:24 -0800 Subject: [PATCH 42/45] SWDEV-364604 - Add ROCclr support for hipEventDisableSystemFence Change-Id: I6127b432a8759359359a1890fda85bc401be6a56 --- device/device.hpp | 3 +++ device/pal/palvirtual.hpp | 2 ++ device/rocm/rocvirtual.cpp | 42 ++++++++++++++++++++++---------------- device/rocm/rocvirtual.hpp | 1 + platform/command.cpp | 1 - platform/commandqueue.cpp | 2 +- 6 files changed, 31 insertions(+), 20 deletions(-) diff --git a/device/device.hpp b/device/device.hpp index cf5bdb33..3e65a331 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -1260,6 +1260,9 @@ class VirtualDevice : public amd::HeapObject { //! Returns fence state of the VirtualGPU virtual bool isFenceDirty() const = 0; + //! Resets fence state of the VirtualGPU + virtual void resetFenceDirty() = 0; + private: //! Disable default copy constructor VirtualDevice& operator=(const VirtualDevice&); diff --git a/device/pal/palvirtual.hpp b/device/pal/palvirtual.hpp index 86240649..74fb039f 100644 --- a/device/pal/palvirtual.hpp +++ b/device/pal/palvirtual.hpp @@ -359,6 +359,8 @@ class VirtualGPU : public device::VirtualDevice { bool isFenceDirty() const { return false; } + void resetFenceDirty() {} + //! 
Returns GPU device object associated with this kernel const Device& dev() const { return gpuDevice_; } diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index feeef138..f2850b55 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -78,13 +78,8 @@ static constexpr uint16_t kBarrierPacketHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static constexpr uint16_t kBarrierPacketAgentScopeHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - static constexpr uint16_t kNopPacketHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); @@ -99,14 +94,16 @@ static constexpr uint16_t kBarrierPacketReleaseHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static constexpr uint16_t kBarrierVendorPacketHeader = - (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static constexpr uint16_t kBarrierVendorPacketAgentScopeHeader = - (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); +static constexpr uint16_t kBarrierVendorPacketNopScopeHeader = + 
(HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static constexpr hsa_barrier_and_packet_t kBarrierAcquirePacket = { kBarrierPacketAcquireHeader, 0, 0, {{0}}, 0, {0}}; @@ -989,6 +986,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + fence_dirty_ = true; auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); if (!skipSignal) { @@ -1001,7 +999,9 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, } // Reset fence_dirty_ flag if we submit a barrier - fence_dirty_ = false; + if (cache_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); hsa_barrier_and_packet_t* aql_loc = @@ -1063,6 +1063,10 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD } } + fence_dirty_ = true; + auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); + if (completionSignal.handle == 0) { // Get active signal for current dispatch if profiling is necessary barrier_value_packet_.completion_signal = @@ -1072,6 +1076,11 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD barrier_value_packet_.completion_signal = completionSignal; } + // Reset fence_dirty_ flag if we submit a barrier + if (cache_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } + uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= 
queueMask); hsa_amd_barrier_value_packet_t* aql_loc = &(reinterpret_cast( @@ -1079,9 +1088,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD *aql_loc = barrier_value_packet_; packet_store_release(reinterpret_cast(aql_loc), packetHeader, rest); - auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); - hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, @@ -3252,11 +3258,11 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { if (timestamp_ != nullptr) { const Settings& settings = dev().settings(); int32_t releaseFlags = vcmd.getEventScope(); - if (releaseFlags == Device::CacheState::kCacheStateAgent) { + if (releaseFlags == Device::CacheState::kCacheStateIgnore) { if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) { - dispatchBarrierValuePacket(kBarrierVendorPacketAgentScopeHeader, true); + dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true); } else { - dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false); + dispatchBarrierPacket(kNopPacketHeader, false); } } else { // Submit a barrier with a cache flushes. diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index 1553dfe9..8c002658 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -407,6 +407,7 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } + void resetFenceDirty() { fence_dirty_ = false; } // } roc OpenCL integration private: //! 
Dispatches a barrier with blocking HSA signals diff --git a/platform/command.cpp b/platform/command.cpp index e02cb153..279bfc3d 100644 --- a/platform/command.cpp +++ b/platform/command.cpp @@ -436,7 +436,6 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList profilingInfo_.clear(); profilingInfo_.callback_ = nullptr; profilingInfo_.marker_ts_ = true; - setEventScope(amd::Device::kCacheStateSystem); } kernel_.retain(); } diff --git a/platform/commandqueue.cpp b/platform/commandqueue.cpp index 8b27b4ce..ce8ba4ae 100644 --- a/platform/commandqueue.cpp +++ b/platform/commandqueue.cpp @@ -120,7 +120,7 @@ void HostQueue::finish() { return; } } - if (nullptr == command || vdev()->isHandlerPending()) { + if (nullptr == command || vdev()->isHandlerPending() || vdev()->isFenceDirty()) { if (nullptr != command) { command->release(); } From 9f495efc34e0e8a618dd82adc40ff23841da2fdd Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Mon, 20 Feb 2023 17:59:01 +0000 Subject: [PATCH 43/45] SWDEV-383397 - Set large bar false for windows. 
Change-Id: Iaef70e477afab7ca8694aa55d1f704e769e24ed9 --- device/pal/paldevice.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 3d758bbd..9adfabff 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -633,7 +633,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.cooperativeGroups_ = settings().enableCoopGroups_; info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_; - if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) { + if (amd::IS_HIP) { + info_.largeBar_ = false; + } else if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) { info_.largeBar_ = true; ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Resizable bar enabled"); } From 3b9d67083a5af334c3f2a26ecd02b817fb6e618a Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Thu, 23 Feb 2023 16:38:32 +0000 Subject: [PATCH 44/45] SWDEV-385050 - Fixed possible invalid queue access from kernelCommand::releaseResources Change-Id: I7c5d99987cb7ab4fa0aa634f2bb6a4d60331b3af --- platform/command.cpp | 2 +- platform/kernel.cpp | 2 +- platform/kernel.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/platform/command.cpp b/platform/command.cpp index 279bfc3d..35ec5113 100644 --- a/platform/command.cpp +++ b/platform/command.cpp @@ -441,7 +441,7 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList } void NDRangeKernelCommand::releaseResources() { - kernel_.parameters().release(parameters_, queue()->device()); + kernel_.parameters().release(parameters_); DEBUG_ONLY(parameters_ = NULL); kernel_.release(); Command::releaseResources(); diff --git a/platform/kernel.cpp b/platform/kernel.cpp index 80cb1a12..52d4936a 100644 --- a/platform/kernel.cpp +++ b/platform/kernel.cpp @@ -253,7 +253,7 @@ bool KernelParameters::boundToSvmPointer(const Device& device, const_address cap return svmBound[index]; } -void KernelParameters::release(address 
mem, const amd::Device& device) const { +void KernelParameters::release(address mem) const { if (mem == nullptr) { // nothing to do! return; diff --git a/platform/kernel.hpp b/platform/kernel.hpp index e46fc5a7..8cb3b7f7 100644 --- a/platform/kernel.hpp +++ b/platform/kernel.hpp @@ -215,7 +215,7 @@ class KernelParameters : protected HeapObject { //! Capture the state of the parameters and return the stack base pointer. address capture(device::VirtualDevice& vDev, uint64_t lclMemSize, int32_t* error); //! Release the captured state of the parameters. - void release(address parameters, const amd::Device& device) const; + void release(address parameters) const; //! Allocate memory for this instance as well as the required storage for // the values_, defined_, and rawPointer_ arrays. From 6c224352715359721224b36e6ec5ef4b17517be5 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Fri, 24 Feb 2023 12:24:43 -0800 Subject: [PATCH 45/45] SWDEV-380035 - Do not create Arena Memobj for pinned memory Change-Id: I4da281f0b7139efb4bb46b47f812358a0212445a --- device/rocm/rocdevice.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index b46377dc..287ece39 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -3219,18 +3219,13 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size return arena_mem_obj_; } } + // Calculate the offset of the pointer. 
const void* dev_ptr = reinterpret_cast( arena_mem_obj_->getDeviceMemory(*arena_mem_obj_->getContext().devices()[0]) ->virtualAddress()); - // System memory which has been locked - if (ptr_info.type == HSA_EXT_POINTER_TYPE_LOCKED && - getCpuAgent().handle == ptr_info.agentOwner.handle && ptr_info.hostBaseAddress == ptr) { - offset = - reinterpret_cast(ptr_info.agentBaseAddress) - reinterpret_cast(dev_ptr); - } else { - offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr); - } + offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr); + return arena_mem_obj_; } @@ -3250,6 +3245,13 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer if (status != HSA_STATUS_SUCCESS) { LogError("hsa_amd_pointer_info() failed"); } + + // Return false for pinned memory. A true return may result in a race because + // ROCclr may attempt to do a pin/copy/unpin underneath in a multithreaded environment + if (ptr_info->type == HSA_EXT_POINTER_TYPE_LOCKED) { + return false; + } + if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) { if ((size != 0) && ((reinterpret_cast(dev_ptr) -