Optimize endoscopy tool tracking app
Modify ToolTrackingPostProcessorOp to generate coordinates using
CUDA. This avoids the roundtrip from GPU to CPU memory and back.
Use RMMAllocator for video replayer.
Fix type of `dla_core` parameter of TensorRtInference.
Remove unused `clock` parameter.

Latency measured with flow benchmarking, with the replayer's `realtime` parameter
set to `false`: 4.8 ms before the optimization, 3.09 ms after.

Signed-off-by: Andreas Heumann <[email protected]>
AndreasHeumann committed Oct 25, 2024
1 parent 17ed95c commit 571f03f
Showing 16 changed files with 239 additions and 210 deletions.
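
Context for the main change: generating the coordinates with CUDA keeps the tensor in device memory, eliminating a device-to-host-to-device copy per frame. A minimal CuPy sketch of the roundtrip being removed (illustrative only; not code from this commit):

import cupy as cp

coords = cp.zeros((7, 3), dtype=cp.float32)  # x, y, size for each of the 7 tools

# Before: coordinates took a host roundtrip on every frame
coords_host = cp.asnumpy(coords)        # GPU -> CPU copy (synchronizes)
coords_again = cp.asarray(coords_host)  # CPU -> GPU copy for the visualizer

# After: a CUDA kernel writes the coordinates directly into a device tensor,
# so no host copy is needed.
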
26 changes: 19 additions & 7 deletions applications/endoscopy_tool_tracking/cpp/main.cpp
@@ -1,6 +1,6 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights
* reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,14 +17,14 @@

#include <getopt.h>

#include <holoscan/holoscan.hpp>
#include <holoscan/operators/aja_source/aja_source.hpp>
#include <holoscan/operators/format_converter/format_converter.hpp>
#include <holoscan/operators/holoviz/holoviz.hpp>
#include <holoscan/operators/video_stream_recorder/video_stream_recorder.hpp>
#include <holoscan/operators/video_stream_replayer/video_stream_replayer.hpp>
#include <lstm_tensor_rt_inference.hpp>
#include <tool_tracking_postprocessor.hpp>
#include "holoscan/holoscan.hpp"
#ifdef VTK_RENDERER
#include <vtk_renderer.hpp>
#endif
@@ -38,6 +38,11 @@
#include <qcap_source.hpp>
#endif

#include <holoscan/version_config.hpp>

#define HOLOSCAN_VERSION \
(HOLOSCAN_VERSION_MAJOR * 10000 + HOLOSCAN_VERSION_MINOR * 100 + HOLOSCAN_VERSION_PATCH)

class App : public holoscan::Application {
public:
void set_source(const std::string& source) { source_ = source; }
@@ -111,6 +116,10 @@ class App : public holoscan::Application {
height = 480;
source = make_operator<ops::VideoStreamReplayerOp>(
"replayer", from_config("replayer"), Arg("directory", datapath));
#if HOLOSCAN_VERSION >= 20600
// RMMAllocator, supported since v2.6, is much faster than the default UnboundedAllocator
source->add_arg(Arg("allocator", make_resource<RMMAllocator>("video_replayer_allocator")));
#endif
source_block_size = width * height * 3 * 4;
source_num_blocks = 2;
}
@@ -151,16 +160,19 @@ class App : public holoscan::Application {
"pool", 1, lstm_inferer_block_size, lstm_inferer_num_blocks),
Arg("cuda_stream_pool") = cuda_stream_pool);

const uint64_t tool_tracking_postprocessor_block_size = 107 * 60 * 7 * 4;
const uint64_t tool_tracking_postprocessor_num_blocks = 2;
// the tool tracking postprocessor outputs
// - an RGBA float32 color mask
// - coordinates with x,y and size in float32
const uint64_t tool_tracking_postprocessor_block_size =
std::max(107 * 60 * 7 * 4 * sizeof(float), 7 * 3 * sizeof(float));
const uint64_t tool_tracking_postprocessor_num_blocks = 2 * 2;
auto tool_tracking_postprocessor = make_operator<ops::ToolTrackingPostprocessorOp>(
"tool_tracking_postprocessor",
Arg("device_allocator") =
make_resource<BlockMemoryPool>("device_allocator",
1,
tool_tracking_postprocessor_block_size,
tool_tracking_postprocessor_num_blocks),
Arg("host_allocator") = make_resource<UnboundedAllocator>("host_allocator"));
tool_tracking_postprocessor_num_blocks));

if (this->visualizer_name == "holoviz") {
std::shared_ptr<BlockMemoryPool> visualizer_allocator;
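
The `HOLOSCAN_VERSION` macro above packs major/minor/patch into one integer, so the `>= 20600` guard enables the RMMAllocator path from v2.6.0 onward. The arithmetic, checked in Python:

major, minor, patch = 2, 6, 0
version = major * 10000 + minor * 100 + patch
assert version == 20600  # first version where the RMMAllocator branch compiles in
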
@@ -28,7 +28,6 @@
BlockMemoryPool,
CudaStreamPool,
MemoryStorageType,
UnboundedAllocator,
)

from holohub.lstm_tensor_rt_inference import LSTMTensorRTInferenceOp
@@ -114,6 +113,12 @@ def compose(self):
directory=video_dir,
**self.kwargs("replayer"),
)
# RMMAllocator, supported since v2.6, is much faster than the default UnboundedAllocator
try:
from holoscan.resources import RMMAllocator
source.add_arg(allocator=RMMAllocator(self, name="video_replayer_allocator"))
except Exception:
pass
# 4 bytes/channel, 3 channels
source_block_size = width * height * 3 * 4
source_num_blocks = 2
@@ -133,9 +138,7 @@ def compose(self):
pool=BlockMemoryPool(self, name="pool", **source_pool_kwargs),
**self.kwargs("recorder_format_converter"),
)
recorder = VideoStreamRecorderOp(
name="recorder", fragment=self, **self.kwargs("recorder")
)
recorder = VideoStreamRecorderOp(name="recorder", fragment=self, **self.kwargs("recorder"))

config_key_name = "format_converter_" + self.source.lower()

@@ -177,8 +180,14 @@ def compose(self):
**self.kwargs("lstm_inference"),
)

tool_tracking_postprocessor_block_size = 107 * 60 * 7 * 4
tool_tracking_postprocessor_num_blocks = 2
# the tool tracking postprocessor outputs
# - an RGBA float32 color mask
# - coordinates with x,y and size in float32
bytes_per_float32 = 4
tool_tracking_postprocessor_block_size = max(
107 * 60 * 7 * 4 * bytes_per_float32, 7 * 3 * bytes_per_float32
)
tool_tracking_postprocessor_num_blocks = 2 * 2
tool_tracking_postprocessor = ToolTrackingPostprocessorOp(
self,
name="tool_tracking_postprocessor",
@@ -189,7 +198,6 @@ def compose(self):
block_size=tool_tracking_postprocessor_block_size,
num_blocks=tool_tracking_postprocessor_num_blocks,
),
host_allocator=UnboundedAllocator(self, name="host_allocator"),
)

if (record_type == "visualizer") and (self.source == "replayer"):
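
A back-of-the-envelope check of the pool sizing used in both the C++ and Python variants (numbers taken from the diff; the 107x60 extent is presumably the model's output resolution):

mask_bytes = 107 * 60 * 7 * 4 * 4  # 107x60 grid, 7 tools, RGBA, float32 -> 719,040 bytes
coord_bytes = 7 * 3 * 4            # 7 tools, (x, y, size) as float32 -> 84 bytes
block_size = max(mask_bytes, coord_bytes)  # each block must fit the larger output
num_blocks = 2 * 2                 # two outputs per frame, double-buffered
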
@@ -289,12 +289,6 @@ gxf_result_t TensorRtInference::registerInterface(gxf::Registrar* registrar) {
"Relaxed Dimension Check",
"Ignore dimensions of 1 for input tensor dimension check.",
true);
result &= registrar->parameter(clock_,
"clock",
"Clock",
"Instance of clock for publish time.",
gxf::Registrar::NoDefaultParameter(),
GXF_PARAMETER_FLAGS_OPTIONAL);

result &= registrar->parameter(rx_, "rx", "RX", "List of receivers to take input tensors");
result &= registrar->parameter(tx_, "tx", "TX", "Transmitter to publish output tensors");
@@ -32,7 +32,6 @@
#include "gxf/cuda/cuda_stream.hpp"
#include "gxf/cuda/cuda_stream_pool.hpp"
#include "gxf/std/allocator.hpp"
#include "gxf/std/clock.hpp"
#include "gxf/std/codelet.hpp"
#include "gxf/std/receiver.hpp"
#include "gxf/std/tensor.hpp"
@@ -110,12 +109,11 @@ class TensorRtInference : public gxf::Codelet {
gxf::Parameter<gxf::Handle<gxf::Allocator>> pool_;
gxf::Parameter<gxf::Handle<gxf::CudaStreamPool>> cuda_stream_pool_;
gxf::Parameter<int64_t> max_workspace_size_;
gxf::Parameter<int64_t> dla_core_;
gxf::Parameter<int32_t> dla_core_;
gxf::Parameter<int32_t> max_batch_size_;
gxf::Parameter<bool> enable_fp16_;
gxf::Parameter<bool> relaxed_dimension_check_;
gxf::Parameter<bool> verbose_;
gxf::Parameter<gxf::Handle<gxf::Clock>> clock_;

gxf::Parameter<std::vector<gxf::Handle<gxf::Receiver>>> rx_;
gxf::Parameter<gxf::Handle<gxf::Transmitter>> tx_;
4 changes: 1 addition & 3 deletions operators/lstm_tensor_rt_inference/README.md
@@ -38,7 +38,7 @@ This implementation is based on `nvidia::gxf::TensorRtInference`.
- **`max_workspace_size`**: Size of working space in bytes (default: `67108864l` (64MB))
- type: `int64_t`
- **`dla_core`**: DLA Core to use. Fallback to GPU is always enabled. Default to use GPU only (`optional`)
- type: `int64_t`
- type: `int32_t`
- **`max_batch_size`**: Maximum possible batch size in case the first dimension is dynamic and used as batch size (default: `1`)
- type: `int32_t`
- **`enable_fp16_`**: Enable inference with FP16 and FP32 fallback (default: `false`)
@@ -47,8 +47,6 @@ This implementation is based on `nvidia::gxf::TensorRtInference`.
- type: `bool`
- **`relaxed_dimension_check`**: Ignore dimensions of 1 for input tensor dimension check (default: `true`)
- type: `bool`
- **`clock`**: Instance of clock for publish time (`optional`)
- type: `gxf::Handle<gxf::Clock>`
- **`rx`**: List of receivers to take input tensors
- type: `std::vector<gxf::Handle<gxf::Receiver>>`
- **`tx`**: Transmitter to publish output tensors
@@ -94,7 +94,8 @@ void LSTMTensorRTInferenceOp::setup(OperatorSpec& spec) {
"dla_core",
"DLA Core",
"DLA Core to use. Fallback to GPU is always enabled. "
"Default to use GPU only.");
"Default to use GPU only.",
ParameterFlag::kOptional);
spec.param(max_batch_size_,
"max_batch_size",
"Max Batch Size",
@@ -117,7 +118,6 @@
"Relaxed Dimension Check",
"Ignore dimensions of 1 for input tensor dimension check.",
true);
spec.param(clock_, "clock", "Clock", "Instance of clock for publish time.");

spec.param(rx_, "rx", "RX", "List of receivers to take input tensors", {&in_tensor});
spec.param(tx_, "tx", "TX", "Transmitter to publish output tensors", &out_tensor);
@@ -58,12 +58,11 @@ class LSTMTensorRTInferenceOp : public holoscan::ops::GXFOperator {
Parameter<std::shared_ptr<Allocator>> pool_;
Parameter<std::shared_ptr<CudaStreamPool>> cuda_stream_pool_;
Parameter<int64_t> max_workspace_size_;
Parameter<int64_t> dla_core_;
Parameter<int32_t> dla_core_;
Parameter<int32_t> max_batch_size_;
Parameter<bool> enable_fp16_;
Parameter<bool> relaxed_dimension_check_;
Parameter<bool> verbose_;
Parameter<std::shared_ptr<Resource>> clock_;

Parameter<std::vector<IOSpec*>> rx_;
Parameter<IOSpec*> tx_;
@@ -1,6 +1,6 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights
* reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,11 +25,11 @@
#include <memory>
#include <string>

#include "../../operator_util.hpp"
#include <holoscan/core/fragment.hpp>
#include <holoscan/core/operator.hpp>
#include <holoscan/core/operator_spec.hpp>
#include <holoscan/core/resources/gxf/allocator.hpp>
#include "../../operator_util.hpp"
#include "holoscan/core/resources/gxf/cuda_stream_pool.hpp"

using std::string_literals::operator""s;
@@ -63,11 +63,8 @@ class PyLSTMTensorRTInferenceOp : public LSTMTensorRTInferenceOp {
const std::vector<std::string>& output_tensor_names,
const std::vector<std::string>& input_binding_names,
const std::vector<std::string>& output_binding_names, const std::string& model_file_path,
const std::string& engine_cache_dir,
// int64_t dla_core,
std::shared_ptr<holoscan::Allocator> pool,
std::shared_ptr<holoscan::CudaStreamPool> cuda_stream_pool,
// std::shared_ptr<holoscan::Resource> clock,
const std::string& engine_cache_dir, std::shared_ptr<holoscan::Allocator> pool,
std::shared_ptr<holoscan::CudaStreamPool> cuda_stream_pool, std::optional<int32_t> dla_core,
const std::string& plugins_lib_namespace = "",
const std::vector<std::string>& input_state_tensor_names = std::vector<std::string>{},
const std::vector<std::string>& output_state_tensor_names = std::vector<std::string>{},
@@ -80,10 +77,8 @@ class PyLSTMTensorRTInferenceOp : public LSTMTensorRTInferenceOp {
Arg{"output_binding_names", output_binding_names},
Arg{"model_file_path", model_file_path},
Arg{"engine_cache_dir", engine_cache_dir},
// Arg{"dla_core", dla_core},
Arg{"pool", pool},
Arg{"cuda_stream_pool", cuda_stream_pool},
// Arg{"clock", clock},
Arg{"plugins_lib_namespace", plugins_lib_namespace},
Arg{"input_state_tensor_names", input_state_tensor_names},
Arg{"output_state_tensor_names", output_state_tensor_names},
@@ -93,6 +88,7 @@
Arg{"relaxed_dimension_check", relaxed_dimension_check},
Arg{"max_workspace_size", max_workspace_size},
Arg{"max_batch_size", max_batch_size}}) {
if (dla_core.has_value()) { add_arg(Arg{"dla_core", dla_core.value()}); }
add_positional_condition_and_resource_args(this, args);
name_ = name;
fragment_ = fragment;
@@ -131,10 +127,9 @@ PYBIND11_MODULE(_lstm_tensor_rt_inference, m) {
const std::vector<std::string>&,
const std::string&,
const std::string&,
// int64_t, // dla_core
std::shared_ptr<holoscan::Allocator>,
std::shared_ptr<holoscan::CudaStreamPool>,
// std::shared_ptr<holoscan::Resource>, // clock
std::optional<int32_t>,
const std::string&,
const std::vector<std::string>&,
const std::vector<std::string>&,
@@ -152,10 +147,9 @@
"output_binding_names"_a,
"model_file_path"_a,
"engine_cache_dir"_a,
// "dla_core"_a,
"pool"_a,
"cuda_stream_pool"_a,
// "clock"_a,
"dla_core"_a = py::none(),
"plugins_lib_namespace"_a = "",
"input_state_tensor_names"_a = std::vector<std::string>{},
"output_state_tensor_names"_a = std::vector<std::string>{},
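
With the binding updated above, `dla_core` is now an optional keyword argument in Python. A usage sketch (operator import as in the application code; the surrounding arguments are assumed from context):

from holohub.lstm_tensor_rt_inference import LSTMTensorRTInferenceOp

lstm_inferer = LSTMTensorRTInferenceOp(
    self,  # the composing Fragment/Application
    name="lstm_inferer",
    pool=pool,
    cuda_stream_pool=cuda_stream_pool,
    dla_core=0,  # run on DLA core 0; omit to use the GPU only (the new default)
    **self.kwargs("lstm_inference"),
)
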
20 changes: 16 additions & 4 deletions operators/tool_tracking_postprocessor/CMakeLists.txt
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.20)

project(tool_tracking_postprocessor LANGUAGES CXX CUDA)

find_package(holoscan REQUIRED CONFIG
@@ -25,10 +26,21 @@ add_library(tool_tracking_postprocessor SHARED
tool_tracking_postprocessor.cuh
)

set_target_properties(tool_tracking_postprocessor PROPERTIES CUDA_ARCHITECTURES "70;80")
set_target_properties(tool_tracking_postprocessor
PROPERTIES
# separable compilation is required since we launch kernels from within kernels
CUDA_SEPARABLE_COMPILATION ON
)

target_link_libraries(tool_tracking_postprocessor holoscan::core)
target_include_directories(tool_tracking_postprocessor INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(tool_tracking_postprocessor
PRIVATE
holoscan::core
)

target_include_directories(tool_tracking_postprocessor
INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}
)

if(HOLOHUB_BUILD_PYTHON)
add_subdirectory(python)
@@ -73,12 +73,11 @@ class PyToolTrackingPostprocessorOp : public ToolTrackingPostprocessorOp {
// Define a constructor that fully initializes the object.
PyToolTrackingPostprocessorOp(
Fragment* fragment, const py::args& args, std::shared_ptr<Allocator> device_allocator,
std::shared_ptr<Allocator> host_allocator, float min_prob = 0.5f,
float min_prob = 0.5f,
std::vector<std::vector<float>> overlay_img_colors = VIZ_TOOL_DEFAULT_COLORS,
std::shared_ptr<holoscan::CudaStreamPool> cuda_stream_pool = nullptr,
const std::string& name = "tool_tracking_postprocessor")
: ToolTrackingPostprocessorOp(ArgList{Arg{"device_allocator", device_allocator},
Arg{"host_allocator", host_allocator},
Arg{"min_prob", min_prob},
Arg{"overlay_img_colors", overlay_img_colors}}) {
if (cuda_stream_pool) { this->add_arg(Arg{"cuda_stream_pool", cuda_stream_pool}); }
@@ -116,14 +115,12 @@ PYBIND11_MODULE(_tool_tracking_postprocessor, m) {
.def(py::init<Fragment*,
const py::args&,
std::shared_ptr<Allocator>,
std::shared_ptr<Allocator>,
float,
std::vector<std::vector<float>>,
std::shared_ptr<holoscan::CudaStreamPool>,
const std::string&>(),
"fragment"_a,
"device_allocator"_a,
"host_allocator"_a,
"min_prob"_a = 0.5f,
"overlay_img_colors"_a = VIZ_TOOL_DEFAULT_COLORS,
"cuda_stream_pool"_a = py::none(),
@@ -40,7 +40,7 @@ Operator performing post-processing for the endoscopy tool tracking demo.
**==Named Outputs==**
out_coords : nvidia::gxf::Tensor
Coordinates tensor, stored on the host (CPU).
Coordinates tensor, stored on the device (GPU).
out_mask : nvidia::gxf::Tensor
Binary mask tensor, stored on device (GPU).
@@ -51,8 +51,6 @@ fragment : Fragment
The fragment that the operator belongs to.
device_allocator : ``holoscan.resources.Allocator``
Output allocator used on the device side.
host_allocator : ``holoscan.resources.Allocator``
Output allocator used on the host side.
min_prob : float, optional
Minimum probability (in range [0, 1]). Default value is 0.5.
overlay_img_colors : sequence of sequence of float, optional
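
With `host_allocator` removed, constructing the postprocessor from Python needs only the device-side pool. A sketch consistent with the updated docstring (the `storage_type` value is an assumption; block size and count reuse the application's values above):

tool_tracking_postprocessor = ToolTrackingPostprocessorOp(
    self,
    name="tool_tracking_postprocessor",
    device_allocator=BlockMemoryPool(
        self,
        name="device_allocator",
        storage_type=MemoryStorageType.DEVICE,
        block_size=tool_tracking_postprocessor_block_size,
        num_blocks=tool_tracking_postprocessor_num_blocks,
    ),
)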