Merge branch 'dev/adunn/geometry_processing_thread_pool' into 'main'

CPU Perf: Geometry Processing Worker Threads See merge request lightspeedrtx/dxvk-remix-nv!972
NVIDIAGameWorks · Sep 5, 2024 · 9ef8b6e · 9ef8b6e
2 parents d726d9c + 371fe3a
commit 9ef8b6e
Show file tree

Hide file tree

Showing 4 changed files with 10 additions and 21 deletions.
diff --git a/RtxOptions.md b/RtxOptions.md
@@ -399,6 +399,7 @@ Tables below enumerate all the options and their defaults set by RTX Remix. Note
 |rtx.numFramesToKeepInstances|int|1||
 |rtx.numFramesToKeepLights|int|100||
 |rtx.numFramesToKeepMaterialTextures|int|5||
+|rtx.numGeometryProcessingThreads|int|2|The desired number of CPU threads to dedicate to geometry processing\.  Will be limited by the number of CPU cores\.  There may be some advantage to lowering this number in games which are fairly simple and use a low number of draw calls per frame\.  The default was determined by looking at a game with around 2000 draw calls per frame, and with a reasonably high average triangle count per draw\.|
 |rtx.opacityMicromap.buildRequests.customFiltersForBillboards|bool|True|Applies custom filters for staged Billboard requests\.|
 |rtx.opacityMicromap.buildRequests.enableAnimatedInstances|bool|False|Enables Opacity Micromaps for animated instances\.|
 |rtx.opacityMicromap.buildRequests.enableParticles|bool|True|Enables Opacity Micromaps for particles\.|

diff --git a/src/d3d9/d3d9_rtx.cpp b/src/d3d9/d3d9_rtx.cpp
@@ -32,7 +32,7 @@ namespace dxvk {
                           VK_ACCESS_TRANSFER_READ_BIT)
     , m_parent(d3d9Device)
     , m_enableDrawCallConversion(enableDrawCallConversion)
-    , m_pGeometryWorkers(enableDrawCallConversion ? std::make_unique<GeometryProcessor>(popcnt_uint8(D3D9Rtx::kAllThreads), "geometry-processing") : nullptr) {
+    , m_pGeometryWorkers(enableDrawCallConversion ? std::make_unique<GeometryProcessor>(numGeometryProcessingThreads(), "geometry-processing") : nullptr) {
 
     // Add space for 256 objects skinned with 256 bones each.
     m_stagedBones.resize(256 * 256);

diff --git a/src/d3d9/d3d9_rtx.h b/src/d3d9/d3d9_rtx.h
@@ -41,6 +41,7 @@ namespace dxvk {
     RTX_OPTION("rtx", bool, useVertexCapturedNormals, true, "When enabled, vertex normals are read from the input assembler and used in raytracing.  This doesn't always work as normals can be in any coordinate space, but can help sometimes.");
     RTX_OPTION("rtx", bool, useWorldMatricesForShaders, true, "When enabled, Remix will utilize the world matrices being passed from the game via D3D9 fixed function API, even when running with shaders.  Sometimes games pass these matrices and they are useful, however for some games they are very unreliable, and should be filtered out.  If you're seeing precision related issues with shader vertex capture, try disabling this setting.");
     RTX_OPTION("rtx", bool, enableIndexBufferMemoization, true, "CPU performance optimization, should generally be enabled.  Will reduce main thread time by caching processIndexBuffer operations and reusing when possible, this will come at the expense of some CPU RAM.");
+    // Requested size of the geometry processing worker pool; the description below is the user-facing help text for this option.
+    RTX_OPTION("rtx", uint32_t, numGeometryProcessingThreads, 2, "The desired number of CPU threads to dedicate to geometry processing.  Will be limited by the number of CPU cores.  There may be some advantage to lowering this number in games which are fairly simple and use a low number of draw calls per frame.  The default was determined by looking at a game with around 2000 draw calls per frame, and with a reasonably high average triangle count per draw.");
 
     // Copy of the parameters issued to D3D9 on DrawXXX
     struct DrawContext {
@@ -171,19 +172,6 @@ namespace dxvk {
     }
 
   private: 
-    // Give threads specific tasks, to reduce the chance of 
-    //  critical work being pre-empted.
-    enum WorkerTasks : uint8_t {
-      kSkinningThread = 1 << 0,
-
-      kHashingThread0 = 1 << 1,
-      kHashingThread1 = 1 << 2,
-      kHashingThread2 = 1 << 3,
-
-      kHashingThreads = (kHashingThread0 | kHashingThread1 | kHashingThread2),
-      kAllThreads = (kHashingThreads | kSkinningThread)
-    };
-
     inline static const uint32_t kMaxConcurrentDraws = 6 * 1024; // some games issuing >3000 draw calls per frame...  account for some consumer thread lag with x2
     using GeometryProcessor = WorkerThreadPool<kMaxConcurrentDraws>;
     const std::unique_ptr<GeometryProcessor> m_pGeometryWorkers;

diff --git a/src/util/util_threadpool.h b/src/util/util_threadpool.h
@@ -289,9 +289,9 @@ namespace dxvk {
 
   public:
     WorkerThreadPool(uint8_t numThreads, const char* workerName = "Nameless Worker Thread") 
-    : m_numThread(numThreads) {
+    : m_numThread(std::clamp(numThreads, (uint8_t)1u, (uint8_t)dxvk::thread::hardware_concurrency())) {
       // Note: round up to a closest power-of-two so we can use mask as modulo
-      m_taskCount = 1 << (32 - bit::lzcnt(static_cast<uint32_t>(NumTasksPerThread*numThreads) - 1));
+      m_taskCount = 1 << (32 - bit::lzcnt(static_cast<uint32_t>(NumTasksPerThread * m_numThread) - 1));
       m_tasks.resize(m_taskCount);
       m_workerTasks.resize(m_numThread);
       m_workerThreads.resize(m_numThread);
@@ -343,15 +343,11 @@ namespace dxvk {
     // Schedule a task to be executed by the thread pool
     template <uint8_t Affinity = 0xFF, typename F, typename R = std::invoke_result_t<std::decay_t<F>>>
     Future<R> Schedule(F&& f) {
-      // Add the task to the queue and notify a worker thread
-      //  just distribute evenly to all threads for some mask denoted by Affinity.
-      static size_t s_idx = 0;
-
       // Is the affinity mask valid?
       const uint8_t affinityMask = std::min(popcnt_uint8(Affinity), m_numThread);
 
       // Schedule work on the appropriate thread
-      const uint32_t thread = fast::findNthBit(Affinity, (uint8_t) (s_idx++ % affinityMask));
+      const uint32_t thread = fast::findNthBit(Affinity, (uint8_t) (m_schedulerIndex++ % affinityMask));
       assert(thread < m_numThread);
 
       // Atomic queue is SPSC, so we don't need to take a lock here
@@ -451,6 +447,10 @@ namespace dxvk {
     std::atomic<TaskId> m_taskId = 0;
     uint32_t m_taskCount;
 
+    // Running counter used by Schedule() to pick the next worker thread:
+    //  tasks are distributed evenly across the threads selected by the Affinity mask.
+    size_t m_schedulerIndex = 0;
+
     uint8_t m_numThread;
 
     std::atomic<bool> m_stopWork = false;