vcomp/cmpto_j2k: run GPU conv as prepreprocess

Set the pixfmt conversion CUDA kernel as cmpto_j2k_enc preprocessor, not run directly. This also eliminates to need to have the conversion kernel if conversion is needed - CPU conversion will be sufficient. Currently not effective, only R12L is converted for which there is the kernel. refer to GH-406
CESNET · Sep 12, 2024 · 92565ca · 92565ca
1 parent caacebc
commit 92565ca
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 65 deletions.
diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu
@@ -1,6 +1,9 @@
 /**
  * @file   cuda_wrapper/kernels.cu
  * @author Martin Pulec     <[email protected]>
+ *
+ * This file hosts various CUDA kernels. Currently there are only kernels
+ * for cmpto_j2k compression and decompression.
  */
 /*
  * Copyright (c) 2024 CESNET
@@ -298,7 +301,7 @@ int postprocess_rg48_to_r12l(
 
         MEASURE_KERNEL_DURATION_STOP(stream)
 
-        return 0;
+        return cudaGetLastError();
 }
 
 //     ___   ___   ___    __      __     ___   _____  ____  ___
@@ -446,28 +449,42 @@ r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out)
         }
 }
 
-void
-preprocess_r12l_to_rg48(int width, int height, void *src, void *dst)
+int
+preprocess_r12l_to_rg48(void *preprocessor, void *img_custom_data,
+                        size_t img_custom_data_size, int size_x, int size_y,
+                        struct cmpto_j2k_enc_comp_format *comp_formats,
+                        int comp_count, void *input_samples,
+                        size_t input_samples_size, void *output_samples,
+                        size_t output_samples_size, void *vstream)
 {
-        (void) width, (void) height, (void) src, (void) dst;
+        (void) preprocessor, (void) img_custom_data,
+            (void) img_custom_data_size, (void) comp_formats, (void) comp_count,
+            (void) input_samples_size, (void) output_samples_size;
+
+        cudaStream_t stream = (cudaStream_t) vstream;
         dim3 threads_per_block(256);
-        dim3 blocks((((width + 7) / 8) + 255) / 256, height);
+        dim3 blocks((((size_x+ 7) / 8) + 255) / 256, size_y);
 
-        MEASURE_KERNEL_DURATION_START(0)
-        if (width % 2 == 0) {
-                kernel_r12l_to_rg48<uint32_t><<<blocks, threads_per_block>>>(
-                    (uint8_t *) src, (uint8_t *) dst, width);
+        MEASURE_KERNEL_DURATION_START(stream)
+        if (size_x % 2 == 0) {
+                kernel_r12l_to_rg48<uint32_t>
+                    <<<blocks, threads_per_block, 0, stream>>>(
+                        (uint8_t *) input_samples, (uint8_t *) output_samples,
+                        size_x);
         } else {
                 thread_local bool warn_print;
                 if (!warn_print) {
                         fprintf(stderr,
                                 "%s: Odd width %d px will use slower kernel!\n",
-                                __func__, width);
+                                __func__, size_x);
                         warn_print = true;
                 }
-                kernel_r12l_to_rg48<uint16_t><<<blocks, threads_per_block>>>(
-                    (uint8_t *) src, (uint8_t *) dst, width);
+                kernel_r12l_to_rg48<uint16_t>
+                    <<<blocks, threads_per_block, 0, stream>>>(
+                        (uint8_t *) input_samples, (uint8_t *) output_samples,
+                        size_x);
         }
-        MEASURE_KERNEL_DURATION_STOP(0)
-}
+        MEASURE_KERNEL_DURATION_STOP(stream)
 
+        return cudaGetLastError();
+}
diff --git a/src/cuda_wrapper/kernels.hpp b/src/cuda_wrapper/kernels.hpp
@@ -40,8 +40,9 @@
 
 #include <cstddef>
 
+// for cmpto_j2k compress and decompress
 struct cmpto_j2k_dec_comp_format;
-
+struct cmpto_j2k_enc_comp_format;
 int postprocess_rg48_to_r12l(
     void * postprocessor,
     void * img_custom_data,
@@ -58,7 +59,19 @@ int postprocess_rg48_to_r12l(
     size_t output_buffer_size,
     void * stream
 );
-
-void preprocess_r12l_to_rg48(int width, int height, void *src, void *dst);
+int preprocess_r12l_to_rg48(
+    void * preprocessor,
+    void * img_custom_data,
+    size_t img_custom_data_size,
+    int size_x,
+    int size_y,
+    struct cmpto_j2k_enc_comp_format * comp_formats,
+    int comp_count,
+    void * input_samples,
+    size_t input_samples_size,
+    void * output_samples,
+    size_t output_samples_size,
+    void *stream
+);
 
 #endif // defined CUDA_WRAPPER_KERNELS_HPP_1A3F7B57_EE91_4363_8D50_9CDDC60CB74F
diff --git a/src/video_compress/cmpto_j2k.cpp b/src/video_compress/cmpto_j2k.cpp
@@ -72,8 +72,9 @@
 #include "utils/misc.h"
 #include "utils/parallel_conv.h"
 #include "utils/video_frame_pool.h"
-#include "video.h"
+#include "video_codec.h"             // for vc_get_linesize, codec_is_a_rgb
 #include "video_compress.h"
+#include "video_frame.h"             // for vf_alloc_desc, vf_free, vf_resto...
 
 #define MOD_NAME "[Cmpto J2K enc.] "
 
@@ -129,8 +130,6 @@ struct cmpto_j2k_enc_cuda_buffer_data_allocator
 };
 #endif
 
-typedef void (*cuda_convert_func_t)(int width, int height, void *src, void *dst);
-
 struct state_video_compress_j2k {
         struct module module_data{};
         struct cmpto_j2k_enc_ctx *context{};
@@ -157,9 +156,6 @@ struct state_video_compress_j2k {
         condition_variable configure_cv;
         bool               configured  = false;
         bool               should_exit = false;
-
-        cuda_convert_func_t cuda_convert_func = nullptr;
-        uint8_t            *cuda_conv_tmp_buf = nullptr;
 };
 
 // prototypes
@@ -184,18 +180,18 @@ static void parallel_conv(video_frame *dst, video_frame *src){
         }
 }
 
+const cmpto_j2k_enc_preprocessor_run_callback_cuda r12l_to_rg48_cuda =
 #ifdef HAVE_CUDA
-const cuda_convert_func_t r12l_to_rg48_cuda = preprocess_r12l_to_rg48;
+    preprocess_r12l_to_rg48;
 #else
-const cuda_convert_func_t r12l_to_rg48_cuda = nullptr;
+    nullptr;
 #endif
 
 static struct {
         codec_t ug_codec;
         enum cmpto_sample_format_type cmpto_sf;
         codec_t convert_codec;
-        /// must be not-NULL if convert_codec != VC_NONE and HAVE_CUDA
-        cuda_convert_func_t cuda_convert_func;
+        cmpto_j2k_enc_preprocessor_run_callback_cuda cuda_convert_func;
 } codecs[] = {
         {UYVY, CMPTO_422_U8_P1020, VIDEO_CODEC_NONE, nullptr},
         {v210, CMPTO_422_U10_V210, VIDEO_CODEC_NONE, nullptr},
@@ -212,51 +208,42 @@ ADD_TO_PARAM(
     "* " CPU_CONV_PARAM "\n"
     "  Enforce CPU conversion instead of CUDA (applicable to R12L now)\n");
 static void
-set_pool(struct state_video_compress_j2k *s, struct video_desc desc)
+set_pool(struct state_video_compress_j2k *s, bool have_gpu_preprocess)
 {
 #ifdef HAVE_CUDA
-        const bool force_cpu_conv =
-            get_commandline_param(CPU_CONV_PARAM) != nullptr;
         s->pool_in_device_memory = false;
         if (cuda_devices_count > 1) {
                 MSG(WARNING, "More than 1 CUDA device will use CPU buffers and "
                              "conversion...\n");
-        } else if (!force_cpu_conv || s->cuda_convert_func == nullptr) {
-                cuda_wrapper_set_device((int) cuda_devices[0]);
-
-                if (s->cuda_convert_func != nullptr) {
-                        cuda_wrapper_malloc(
-                            (void **) &s->cuda_conv_tmp_buf,
-                            vc_get_datalen(desc.width, desc.height, desc.color_spec) +
-                                MAX_PADDING);
-                }
-
+        } else if (s->precompress_codec == VC_NONE || have_gpu_preprocess) {
                 s->pool_in_device_memory = true;
                 s->pool                  = video_frame_pool(
                     s->max_in_frames,
                     cmpto_j2k_enc_cuda_buffer_data_allocator<
                                          cuda_wrapper_malloc, cuda_wrapper_free>());
                 return;
         }
-        s->cuda_convert_func = nullptr; // either was 0 or force_cpu_conv
         s->pool = video_frame_pool(
             s->max_in_frames,
             cmpto_j2k_enc_cuda_buffer_data_allocator<cuda_wrapper_malloc_host,
                                                      cuda_wrapper_free_host>());
 #else
+        assert(!have_gpu_preprocess); // if CUDA not found, we shouldn't have
         s->pool = video_frame_pool(s->max_in_frames, default_data_allocator());
 #endif
 }
 
 static bool configure_with(struct state_video_compress_j2k *s, struct video_desc desc){
         enum cmpto_sample_format_type sample_format;
+        cmpto_j2k_enc_preprocessor_run_callback_cuda cuda_convert_func =
+            nullptr;
         bool found = false;
 
         for(const auto &codec : codecs){
                 if(codec.ug_codec == desc.color_spec){
                         sample_format = codec.cmpto_sf;
                         s->precompress_codec = codec.convert_codec;
-                        s->cuda_convert_func = codec.cuda_convert_func;
+                        cuda_convert_func = codec.cuda_convert_func;
                         found = true;
                         break;
                 }
@@ -275,6 +262,10 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
                 s->configured = false;
         }
 
+        if (get_commandline_param(CPU_CONV_PARAM) != nullptr) {
+                cuda_convert_func = nullptr;
+        }
+
         struct cmpto_j2k_enc_ctx_cfg *ctx_cfg = nullptr;
         CHECK_OK(cmpto_j2k_enc_ctx_cfg_create(&ctx_cfg),
                  "Context configuration create", return false);
@@ -284,6 +275,11 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
                         ctx_cfg, cuda_devices[i], s->mem_limit, s->tile_limit),
                     "Setting CUDA device", return false);
         }
+        if (cuda_convert_func != nullptr) {
+                CHECK_OK(cmpto_j2k_enc_ctx_cfg_set_preprocessor_cuda(
+                             ctx_cfg, nullptr, nullptr, cuda_convert_func),
+                         "Setting CUDA preprocess", return false);
+        }
 
         CHECK_OK(cmpto_j2k_enc_ctx_create(ctx_cfg, &s->context),
                  "Context create", return false);
@@ -321,7 +317,7 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
                         "Setting MCT",
                         NOOP);
 
-        set_pool(s, desc);
+        set_pool(s, cuda_convert_func != nullptr);
 
         s->compressed_desc = desc;
         s->compressed_desc.color_spec = codec_is_a_rgb(desc.color_spec) ? J2KR : J2K;
@@ -341,26 +337,15 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
  * Does the pixel format conversion as well if specified.
  */
 static void
-do_gpu_copy(struct state_video_compress_j2k *s,
-             std::shared_ptr<video_frame> &ret, video_frame *in_frame)
+do_gpu_copy(std::shared_ptr<video_frame> &ret, video_frame *in_frame)
 {
 #ifdef HAVE_CUDA
         cuda_wrapper_set_device((int) cuda_devices[0]);
-        if (s->cuda_convert_func == nullptr) {
-                assert(s->precompress_codec == VC_NONE);
-                cuda_wrapper_memcpy(ret->tiles[0].data, in_frame->tiles[0].data,
-                                    in_frame->tiles[0].data_len,
-                                    CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE);
-                return;
-        }
-        cuda_wrapper_memcpy(s->cuda_conv_tmp_buf, in_frame->tiles[0].data,
+        cuda_wrapper_memcpy(ret->tiles[0].data, in_frame->tiles[0].data,
                             in_frame->tiles[0].data_len,
                             CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE);
-        s->cuda_convert_func((int) in_frame->tiles[0].width,
-                             (int) in_frame->tiles[0].height,
-                             s->cuda_conv_tmp_buf, ret->tiles[0].data);
 #else
-        (void) s, (void) ret, (void) in_frame;
+        (void) ret, (void) in_frame;
         abort(); // must not reach here
 #endif
 }
@@ -369,7 +354,7 @@ static shared_ptr<video_frame> get_copy(struct state_video_compress_j2k *s, vide
         std::shared_ptr<video_frame> ret = s->pool.get_frame();
 
         if (s->pool_in_device_memory) {
-                do_gpu_copy(s, ret, frame);
+                do_gpu_copy(ret, frame);
         } else if (s->precompress_codec != VC_NONE) {
                 parallel_conv(ret.get(), frame);
         } else {
@@ -615,7 +600,8 @@ static void j2k_compress_push(struct module *state, std::shared_ptr<video_frame>
                         return;
                 }
                 struct video_desc pool_desc = desc;
-                if (s->precompress_codec != VC_NONE) {
+                if (s->precompress_codec != VC_NONE &&
+                    !s->pool_in_device_memory) {
                         pool_desc.color_spec = s->precompress_codec;
                 }
                 s->pool.reconfigure(
@@ -646,9 +632,17 @@ static void j2k_compress_push(struct module *state, std::shared_ptr<video_frame>
         vf_store_metadata(tx.get(), udata->metadata);
 
         if (s->pool_in_device_memory) {
+                // cmpto_j2k_enc requires the size after postprocess, which
+                // doesn't equeal the IN frame data_len for R12L
+                const codec_t device_codec = s->precompress_codec == VC_NONE
+                                           ? udata->frame->color_spec
+                                           : s->precompress_codec;
+                const size_t  data_len =
+                    vc_get_datalen(udata->frame->tiles[0].width,
+                                   udata->frame->tiles[0].height, device_codec);
                 CHECK_OK(cmpto_j2k_enc_img_set_samples_cuda(
                              img, cuda_devices[0], udata->frame->tiles[0].data,
-                             udata->frame->tiles[0].data_len, release_cstream_cuda),
+                             data_len, release_cstream_cuda),
                          "Setting image samples", HANDLE_ERROR_COMPRESS_PUSH);
         } else {
                 CHECK_OK(cmpto_j2k_enc_img_set_samples(
@@ -694,11 +688,6 @@ cleanup_common(struct state_video_compress_j2k *s)
                 cmpto_j2k_enc_ctx_destroy(s->context);
         }
         s->context = nullptr;
-
-#ifdef HAVE_CUDA
-        cuda_wrapper_free(s->cuda_conv_tmp_buf);
-        s->cuda_conv_tmp_buf = nullptr;
-#endif
 }
 
 static compress_module_info get_cmpto_j2k_module_info(){