Optimize endoscopy tool tracking app
Modify ToolTrackingPostProcessorOp to generate coordinates using
CUDA. This avoids the roundtrip from GPU to CPU memory and back.
Use RMMAllocator for video replayer.
Fix type of `dla_core` parameter of TensorRtInference.
Remove unused `clock` parameter.

Latency measured with flow benchmarking, with the replayer's `realtime` parameter
set to `false`: 4.8 ms before the optimization, 3.09 ms after.

Signed-off-by: Andreas Heumann <[email protected]>
AndreasHeumann committed Oct 25, 2024
1 parent 17ed95c commit 571f03f
Showing 16 changed files with 239 additions and 210 deletions.
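
Context for the main change: generating the coordinates with CUDA keeps the tensor in device memory, eliminating a device-to-host-to-device copy per frame. A minimal CuPy sketch of the roundtrip being removed (illustrative only; not code from this commit):

import cupy as cp

coords = cp.zeros((7, 3), dtype=cp.float32)  # x, y, size for each of the 7 tools

# Before: coordinates took a host roundtrip on every frame
coords_host = cp.asnumpy(coords)        # GPU -> CPU copy (synchronizes)
coords_again = cp.asarray(coords_host)  # CPU -> GPU copy for the visualizer

# After: a CUDA kernel writes the coordinates directly into a device tensor,
# so no host copy is needed.
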
26 changes: 19 additions & 7 deletions applications/endoscopy_tool_tracking/cpp/main.cpp
@@ -1,6 +1,6 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights
* reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,14 +17,14 @@

#include <getopt.h>

#include <holoscan/holoscan.hpp>
#include <holoscan/operators/aja_source/aja_source.hpp>
#include <holoscan/operators/format_converter/format_converter.hpp>
#include <holoscan/operators/holoviz/holoviz.hpp>
#include <holoscan/operators/video_stream_recorder/video_stream_recorder.hpp>
#include <holoscan/operators/video_stream_replayer/video_stream_replayer.hpp>
#include <lstm_tensor_rt_inference.hpp>
#include <tool_tracking_postprocessor.hpp>
#include "holoscan/holoscan.hpp"
#ifdef VTK_RENDERER
#include <vtk_renderer.hpp>
#endif
@@ -38,6 +38,11 @@
#include <qcap_source.hpp>
#endif

#include <holoscan/version_config.hpp>

#define HOLOSCAN_VERSION \
(HOLOSCAN_VERSION_MAJOR * 10000 + HOLOSCAN_VERSION_MINOR * 100 + HOLOSCAN_VERSION_PATCH)

class App : public holoscan::Application {
public:
void set_source(const std::string& source) { source_ = source; }
@@ -111,6 +116,10 @@ class App : public holoscan::Application {
height = 480;
source = make_operator<ops::VideoStreamReplayerOp>(
"replayer", from_config("replayer"), Arg("directory", datapath));
#if HOLOSCAN_VERSION >= 20600
// RMMAllocator, supported since v2.6, is much faster than the default UnboundedAllocator
source->add_arg(Arg("allocator", make_resource<RMMAllocator>("video_replayer_allocator")));
#endif
source_block_size = width * height * 3 * 4;
source_num_blocks = 2;
}
@@ -151,16 +160,19 @@ class App : public holoscan::Application {
"pool", 1, lstm_inferer_block_size, lstm_inferer_num_blocks),
Arg("cuda_stream_pool") = cuda_stream_pool);

const uint64_t tool_tracking_postprocessor_block_size = 107 * 60 * 7 * 4;
const uint64_t tool_tracking_postprocessor_num_blocks = 2;
// the tool tracking postprocessor outputs
// - an RGBA float32 color mask
// - coordinates with x,y and size in float32
const uint64_t tool_tracking_postprocessor_block_size =
std::max(107 * 60 * 7 * 4 * sizeof(float), 7 * 3 * sizeof(float));
const uint64_t tool_tracking_postprocessor_num_blocks = 2 * 2;
auto tool_tracking_postprocessor = make_operator<ops::ToolTrackingPostprocessorOp>(
"tool_tracking_postprocessor",
Arg("device_allocator") =
make_resource<BlockMemoryPool>("device_allocator",
1,
tool_tracking_postprocessor_block_size,
tool_tracking_postprocessor_num_blocks),
Arg("host_allocator") = make_resource<UnboundedAllocator>("host_allocator"));
tool_tracking_postprocessor_num_blocks));

if (this->visualizer_name == "holoviz") {
std::shared_ptr<BlockMemoryPool> visualizer_allocator;
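
The `HOLOSCAN_VERSION` macro above packs major/minor/patch into one integer, so the `>= 20600` guard enables the RMMAllocator path from v2.6.0 onward. The arithmetic, checked in Python:

major, minor, patch = 2, 6, 0
version = major * 10000 + minor * 100 + patch
assert version == 20600  # first version where the RMMAllocator branch compiles in
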
@@ -28,7 +28,6 @@
BlockMemoryPool,
CudaStreamPool,
MemoryStorageType,
UnboundedAllocator,
)

from holohub.lstm_tensor_rt_inference import LSTMTensorRTInferenceOp
@@ -114,6 +113,12 @@ def compose(self):
directory=video_dir,
**self.kwargs("replayer"),
)
# RMMAllocator, supported since v2.6, is much faster than the default UnboundedAllocator
try:
from holoscan.resources import RMMAllocator
source.add_arg(allocator=RMMAllocator(self, name="video_replayer_allocator"))
except Exception:
pass
# 4 bytes/channel, 3 channels
source_block_size = width * height * 3 * 4
source_num_blocks = 2
@@ -133,9 +138,7 @@ def compose(self):
pool=BlockMemoryPool(self, name="pool", **source_pool_kwargs),
**self.kwargs("recorder_format_converter"),
)
recorder = VideoStreamRecorderOp(
name="recorder", fragment=self, **self.kwargs("recorder")
)
recorder = VideoStreamRecorderOp(name="recorder", fragment=self, **self.kwargs("recorder"))

config_key_name = "format_converter_" + self.source.lower()

@@ -177,8 +180,14 @@ def compose(self):
**self.kwargs("lstm_inference"),
)

tool_tracking_postprocessor_block_size = 107 * 60 * 7 * 4
tool_tracking_postprocessor_num_blocks = 2
# the tool tracking postprocessor outputs
# - an RGBA float32 color mask
# - coordinates with x,y and size in float32
bytes_per_float32 = 4
tool_tracking_postprocessor_block_size = max(
107 * 60 * 7 * 4 * bytes_per_float32, 7 * 3 * bytes_per_float32
)
tool_tracking_postprocessor_num_blocks = 2 * 2
tool_tracking_postprocessor = ToolTrackingPostprocessorOp(
self,
name="tool_tracking_postprocessor",
@@ -189,7 +198,6 @@ def compose(self):
block_size=tool_tracking_postprocessor_block_size,
num_blocks=tool_tracking_postprocessor_num_blocks,
),
host_allocator=UnboundedAllocator(self, name="host_allocator"),
)

if (record_type == "visualizer") and (self.source == "replayer"):
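
A back-of-the-envelope check of the pool sizing used in both the C++ and Python variants (numbers taken from the diff; the 107x60 extent is presumably the model's output resolution):

mask_bytes = 107 * 60 * 7 * 4 * 4  # 107x60 grid, 7 tools, RGBA, float32 -> 719,040 bytes
coord_bytes = 7 * 3 * 4            # 7 tools, (x, y, size) as float32 -> 84 bytes
block_size = max(mask_bytes, coord_bytes)  # each block must fit the larger output
num_blocks = 2 * 2                 # two outputs per frame, double-buffered
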
@@ -289,12 +289,6 @@ gxf_result_t TensorRtInference::registerInterface(gxf::Registrar* registrar) {
"Relaxed Dimension Check",
"Ignore dimensions of 1 for input tensor dimension check.",
true);
result &= registrar->parameter(clock_,
"clock",
"Clock",
"Instance of clock for publish time.",
gxf::Registrar::NoDefaultParameter(),
GXF_PARAMETER_FLAGS_OPTIONAL);

result &= registrar->parameter(rx_, "rx", "RX", "List of receivers to take input tensors");
result &= registrar->parameter(tx_, "tx", "TX", "Transmitter to publish output tensors");
@@ -32,7 +32,6 @@
#include "gxf/cuda/cuda_stream.hpp"
#include "gxf/cuda/cuda_stream_pool.hpp"
#include "gxf/std/allocator.hpp"
#include "gxf/std/clock.hpp"
#include "gxf/std/codelet.hpp"
#include "gxf/std/receiver.hpp"
#include "gxf/std/tensor.hpp"
@@ -110,12 +109,11 @@ class TensorRtInference : public gxf::Codelet {
gxf::Parameter<gxf::Handle<gxf::Allocator>> pool_;
gxf::Parameter<gxf::Handle<gxf::CudaStreamPool>> cuda_stream_pool_;
gxf::Parameter<int64_t> max_workspace_size_;
gxf::Parameter<int64_t> dla_core_;
gxf::Parameter<int32_t> dla_core_;
gxf::Parameter<int32_t> max_batch_size_;
gxf::Parameter<bool> enable_fp16_;
gxf::Parameter<bool> relaxed_dimension_check_;
gxf::Parameter<bool> verbose_;
gxf::Parameter<gxf::Handle<gxf::Clock>> clock_;

gxf::Parameter<std::vector<gxf::Handle<gxf::Receiver>>> rx_;
gxf::Parameter<gxf::Handle<gxf::Transmitter>> tx_;
4 changes: 1 addition & 3 deletions operators/lstm_tensor_rt_inference/README.md
@@ -38,7 +38,7 @@ This implementation is based on `nvidia::gxf::TensorRtInference`.
- **`max_workspace_size`**: Size of working space in bytes (default: `67108864l` (64MB))
- type: `int64_t`
- **`dla_core`**: DLA Core to use. Fallback to GPU is always enabled. Default to use GPU only (`optional`)
- type: `int64_t`
- type: `int32_t`
- **`max_batch_size`**: Maximum possible batch size in case the first dimension is dynamic and used as batch size (default: `1`)
- type: `int32_t`
- **`enable_fp16_`**: Enable inference with FP16 and FP32 fallback (default: `false`)
@@ -47,8 +47,6 @@ This implementation is based on `nvidia::gxf::TensorRtInference`.
- type: `bool`
- **`relaxed_dimension_check`**: Ignore dimensions of 1 for input tensor dimension check (default: `true`)
- type: `bool`
- **`clock`**: Instance of clock for publish time (`optional`)
- type: `gxf::Handle<gxf::Clock>`
- **`rx`**: List of receivers to take input tensors
- type: `std::vector<gxf::Handle<gxf::Receiver>>`
- **`tx`**: Transmitter to publish output tensors
@@ -94,7 +94,8 @@ void LSTMTensorRTInferenceOp::setup(OperatorSpec& spec) {
"dla_core",
"DLA Core",
"DLA Core to use. Fallback to GPU is always enabled. "
"Default to use GPU only.");
"Default to use GPU only.",
ParameterFlag::kOptional);
spec.param(max_batch_size_,
"max_batch_size",
"Max Batch Size",
@@ -117,7 +118,6 @@
"Relaxed Dimension Check",
"Ignore dimensions of 1 for input tensor dimension check.",
true);
spec.param(clock_, "clock", "Clock", "Instance of clock for publish time.");

spec.param(rx_, "rx", "RX", "List of receivers to take input tensors", {&in_tensor});
spec.param(tx_, "tx", "TX", "Transmitter to publish output tensors", &out_tensor);
@@ -58,12 +58,11 @@ class LSTMTensorRTInferenceOp : public holoscan::ops::GXFOperator {
Parameter<std::shared_ptr<Allocator>> pool_;
Parameter<std::shared_ptr<CudaStreamPool>> cuda_stream_pool_;
Parameter<int64_t> max_workspace_size_;
Parameter<int64_t> dla_core_;
Parameter<int32_t> dla_core_;
Parameter<int32_t> max_batch_size_;
Parameter<bool> enable_fp16_;
Parameter<bool> relaxed_dimension_check_;
Parameter<bool> verbose_;
Parameter<std::shared_ptr<Resource>> clock_;

Parameter<std::vector<IOSpec*>> rx_;
Parameter<IOSpec*> tx_;
@@ -1,6 +1,6 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights
* reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,11 +25,11 @@
#include <memory>
#include <string>

#include "../../operator_util.hpp"
#include <holoscan/core/fragment.hpp>
#include <holoscan/core/operator.hpp>
#include <holoscan/core/operator_spec.hpp>
#include <holoscan/core/resources/gxf/allocator.hpp>
#include "../../operator_util.hpp"
#include "holoscan/core/resources/gxf/cuda_stream_pool.hpp"

using std::string_literals::operator""s;
@@ -63,11 +63,8 @@ class PyLSTMTensorRTInferenceOp : public LSTMTensorRTInferenceOp {
const std::vector<std::string>& output_tensor_names,
const std::vector<std::string>& input_binding_names,
const std::vector<std::string>& output_binding_names, const std::string& model_file_path,
const std::string& engine_cache_dir,
// int64_t dla_core,
std::shared_ptr<holoscan::Allocator> pool,
std::shared_ptr<holoscan::CudaStreamPool> cuda_stream_pool,
// std::shared_ptr<holoscan::Resource> clock,
const std::string& engine_cache_dir, std::shared_ptr<holoscan::Allocator> pool,
std::shared_ptr<holoscan::CudaStreamPool> cuda_stream_pool, std::optional<int32_t> dla_core,
const std::string& plugins_lib_namespace = "",
const std::vector<std::string>& input_state_tensor_names = std::vector<std::string>{},
const std::vector<std::string>& output_state_tensor_names = std::vector<std::string>{},
@@ -80,10 +77,8 @@ class PyLSTMTensorRTInferenceOp : public LSTMTensorRTInferenceOp {
Arg{"output_binding_names", output_binding_names},
Arg{"model_file_path", model_file_path},
Arg{"engine_cache_dir", engine_cache_dir},
// Arg{"dla_core", dla_core},
Arg{"pool", pool},
Arg{"cuda_stream_pool", cuda_stream_pool},
// Arg{"clock", clock},
Arg{"plugins_lib_namespace", plugins_lib_namespace},
Arg{"input_state_tensor_names", input_state_tensor_names},
Arg{"output_state_tensor_names", output_state_tensor_names},
@@ -93,6 +88,7 @@
Arg{"relaxed_dimension_check", relaxed_dimension_check},
Arg{"max_workspace_size", max_workspace_size},
Arg{"max_batch_size", max_batch_size}}) {
if (dla_core.has_value()) { add_arg(Arg{"dla_core", dla_core.value()}); }
add_positional_condition_and_resource_args(this, args);
name_ = name;
fragment_ = fragment;
@@ -131,10 +127,9 @@ PYBIND11_MODULE(_lstm_tensor_rt_inference, m) {
const std::vector<std::string>&,
const std::string&,
const std::string&,
// int64_t, // dla_core
std::shared_ptr<holoscan::Allocator>,
std::shared_ptr<holoscan::CudaStreamPool>,
// std::shared_ptr<holoscan::Resource>, // clock
std::optional<int32_t>,
const std::string&,
const std::vector<std::string>&,
const std::vector<std::string>&,
@@ -152,10 +147,9 @@
"output_binding_names"_a,
"model_file_path"_a,
"engine_cache_dir"_a,
// "dla_core"_a,
"pool"_a,
"cuda_stream_pool"_a,
// "clock"_a,
"dla_core"_a = py::none(),
"plugins_lib_namespace"_a = "",
"input_state_tensor_names"_a = std::vector<std::string>{},
"output_state_tensor_names"_a = std::vector<std::string>{},
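
With the binding updated above, `dla_core` is now an optional keyword argument in Python. A usage sketch (operator import as in the application code; the surrounding arguments are assumed from context):

from holohub.lstm_tensor_rt_inference import LSTMTensorRTInferenceOp

lstm_inferer = LSTMTensorRTInferenceOp(
    self,  # the composing Fragment/Application
    name="lstm_inferer",
    pool=pool,
    cuda_stream_pool=cuda_stream_pool,
    dla_core=0,  # run on DLA core 0; omit to use the GPU only (the new default)
    **self.kwargs("lstm_inference"),
)
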
20 changes: 16 additions & 4 deletions operators/tool_tracking_postprocessor/CMakeLists.txt
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.20)

project(tool_tracking_postprocessor LANGUAGES CXX CUDA)

find_package(holoscan REQUIRED CONFIG
@@ -25,10 +26,21 @@ add_library(tool_tracking_postprocessor SHARED
tool_tracking_postprocessor.cuh
)

set_target_properties(tool_tracking_postprocessor PROPERTIES CUDA_ARCHITECTURES "70;80")
set_target_properties(tool_tracking_postprocessor
PROPERTIES
# separable compilation is required since we launch kernels from within kernels
CUDA_SEPARABLE_COMPILATION ON
)

target_link_libraries(tool_tracking_postprocessor holoscan::core)
target_include_directories(tool_tracking_postprocessor INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(tool_tracking_postprocessor
PRIVATE
holoscan::core
)

target_include_directories(tool_tracking_postprocessor
INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}
)

if(HOLOHUB_BUILD_PYTHON)
add_subdirectory(python)
@@ -73,12 +73,11 @@ class PyToolTrackingPostprocessorOp : public ToolTrackingPostprocessorOp {
// Define a constructor that fully initializes the object.
PyToolTrackingPostprocessorOp(
Fragment* fragment, const py::args& args, std::shared_ptr<Allocator> device_allocator,
std::shared_ptr<Allocator> host_allocator, float min_prob = 0.5f,
float min_prob = 0.5f,
std::vector<std::vector<float>> overlay_img_colors = VIZ_TOOL_DEFAULT_COLORS,
std::shared_ptr<holoscan::CudaStreamPool> cuda_stream_pool = nullptr,
const std::string& name = "tool_tracking_postprocessor")
: ToolTrackingPostprocessorOp(ArgList{Arg{"device_allocator", device_allocator},
Arg{"host_allocator", host_allocator},
Arg{"min_prob", min_prob},
Arg{"overlay_img_colors", overlay_img_colors}}) {
if (cuda_stream_pool) { this->add_arg(Arg{"cuda_stream_pool", cuda_stream_pool}); }
@@ -116,14 +115,12 @@ PYBIND11_MODULE(_tool_tracking_postprocessor, m) {
.def(py::init<Fragment*,
const py::args&,
std::shared_ptr<Allocator>,
std::shared_ptr<Allocator>,
float,
std::vector<std::vector<float>>,
std::shared_ptr<holoscan::CudaStreamPool>,
const std::string&>(),
"fragment"_a,
"device_allocator"_a,
"host_allocator"_a,
"min_prob"_a = 0.5f,
"overlay_img_colors"_a = VIZ_TOOL_DEFAULT_COLORS,
"cuda_stream_pool"_a = py::none(),
@@ -40,7 +40,7 @@ Operator performing post-processing for the endoscopy tool tracking demo.
**==Named Outputs==**
out_coords : nvidia::gxf::Tensor
Coordinates tensor, stored on the host (CPU).
Coordinates tensor, stored on the device (GPU).
out_mask : nvidia::gxf::Tensor
Binary mask tensor, stored on device (GPU).
@@ -51,8 +51,6 @@ fragment : Fragment
The fragment that the operator belongs to.
device_allocator : ``holoscan.resources.Allocator``
Output allocator used on the device side.
host_allocator : ``holoscan.resources.Allocator``
Output allocator used on the host side.
min_prob : float, optional
Minimum probability (in range [0, 1]). Default value is 0.5.
overlay_img_colors : sequence of sequence of float, optional
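
With `host_allocator` removed, constructing the postprocessor from Python needs only the device-side pool. A sketch consistent with the updated docstring (the `storage_type` value is an assumption; block size and count reuse the application's values above):

tool_tracking_postprocessor = ToolTrackingPostprocessorOp(
    self,
    name="tool_tracking_postprocessor",
    device_allocator=BlockMemoryPool(
        self,
        name="device_allocator",
        storage_type=MemoryStorageType.DEVICE,
        block_size=tool_tracking_postprocessor_block_size,
        num_blocks=tool_tracking_postprocessor_num_blocks,
    ),
)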