diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu index 0ae897f0b..3747a94e7 100644 --- a/src/cuda_wrapper/kernels.cu +++ b/src/cuda_wrapper/kernels.cu @@ -1,6 +1,9 @@ /** * @file cuda_wrapper/kernels.cu * @author Martin Pulec + * + * This file hosts various CUDA kernels. Currently there are only kernels + * for cmpto_j2k compression and decompression. */ /* * Copyright (c) 2024 CESNET @@ -298,7 +301,7 @@ int postprocess_rg48_to_r12l( MEASURE_KERNEL_DURATION_STOP(stream) - return 0; + return cudaGetLastError(); } // ___ ___ ___ __ __ ___ _____ ____ ___ @@ -446,28 +449,42 @@ r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out) } } -void -preprocess_r12l_to_rg48(int width, int height, void *src, void *dst) +int +preprocess_r12l_to_rg48(void *preprocessor, void *img_custom_data, + size_t img_custom_data_size, int size_x, int size_y, + struct cmpto_j2k_enc_comp_format *comp_formats, + int comp_count, void *input_samples, + size_t input_samples_size, void *output_samples, + size_t output_samples_size, void *vstream) { - (void) width, (void) height, (void) src, (void) dst; + (void) preprocessor, (void) img_custom_data, + (void) img_custom_data_size, (void) comp_formats, (void) comp_count, + (void) input_samples_size, (void) output_samples_size; + + cudaStream_t stream = (cudaStream_t) vstream; dim3 threads_per_block(256); - dim3 blocks((((width + 7) / 8) + 255) / 256, height); + dim3 blocks((((size_x+ 7) / 8) + 255) / 256, size_y); - MEASURE_KERNEL_DURATION_START(0) - if (width % 2 == 0) { - kernel_r12l_to_rg48<<>>( - (uint8_t *) src, (uint8_t *) dst, width); + MEASURE_KERNEL_DURATION_START(stream) + if (size_x % 2 == 0) { + kernel_r12l_to_rg48 + <<>>( + (uint8_t *) input_samples, (uint8_t *) output_samples, + size_x); } else { thread_local bool warn_print; if (!warn_print) { fprintf(stderr, "%s: Odd width %d px will use slower kernel!\n", - __func__, width); + __func__, size_x); warn_print = true; } - kernel_r12l_to_rg48<<>>( - (uint8_t *) src, (uint8_t *) dst, width); + kernel_r12l_to_rg48 + <<>>( + (uint8_t *) input_samples, (uint8_t *) output_samples, + size_x); } - MEASURE_KERNEL_DURATION_STOP(0) -} + MEASURE_KERNEL_DURATION_STOP(stream) + return cudaGetLastError(); +} diff --git a/src/cuda_wrapper/kernels.hpp b/src/cuda_wrapper/kernels.hpp index b23aaab6b..7c20404a8 100644 --- a/src/cuda_wrapper/kernels.hpp +++ b/src/cuda_wrapper/kernels.hpp @@ -40,8 +40,9 @@ #include +// for cmpto_j2k compress and decompress struct cmpto_j2k_dec_comp_format; - +struct cmpto_j2k_enc_comp_format; int postprocess_rg48_to_r12l( void * postprocessor, void * img_custom_data, @@ -58,7 +59,19 @@ int postprocess_rg48_to_r12l( size_t output_buffer_size, void * stream ); - -void preprocess_r12l_to_rg48(int width, int height, void *src, void *dst); +int preprocess_r12l_to_rg48( + void * preprocessor, + void * img_custom_data, + size_t img_custom_data_size, + int size_x, + int size_y, + struct cmpto_j2k_enc_comp_format * comp_formats, + int comp_count, + void * input_samples, + size_t input_samples_size, + void * output_samples, + size_t output_samples_size, + void *stream +); #endif // defined CUDA_WRAPPER_KERNELS_HPP_1A3F7B57_EE91_4363_8D50_9CDDC60CB74F diff --git a/src/video_compress/cmpto_j2k.cpp b/src/video_compress/cmpto_j2k.cpp index 4b0e12533..583f1da34 100644 --- a/src/video_compress/cmpto_j2k.cpp +++ b/src/video_compress/cmpto_j2k.cpp @@ -72,8 +72,9 @@ #include "utils/misc.h" #include "utils/parallel_conv.h" #include "utils/video_frame_pool.h" -#include "video.h" +#include "video_codec.h" // for vc_get_linesize, codec_is_a_rgb #include "video_compress.h" +#include "video_frame.h" // for vf_alloc_desc, vf_free, vf_resto... #define MOD_NAME "[Cmpto J2K enc.] " @@ -129,8 +130,6 @@ struct cmpto_j2k_enc_cuda_buffer_data_allocator }; #endif -typedef void (*cuda_convert_func_t)(int width, int height, void *src, void *dst); - struct state_video_compress_j2k { struct module module_data{}; struct cmpto_j2k_enc_ctx *context{}; @@ -157,9 +156,6 @@ struct state_video_compress_j2k { condition_variable configure_cv; bool configured = false; bool should_exit = false; - - cuda_convert_func_t cuda_convert_func = nullptr; - uint8_t *cuda_conv_tmp_buf = nullptr; }; // prototypes @@ -184,18 +180,18 @@ static void parallel_conv(video_frame *dst, video_frame *src){ } } +const cmpto_j2k_enc_preprocessor_run_callback_cuda r12l_to_rg48_cuda = #ifdef HAVE_CUDA -const cuda_convert_func_t r12l_to_rg48_cuda = preprocess_r12l_to_rg48; + preprocess_r12l_to_rg48; #else -const cuda_convert_func_t r12l_to_rg48_cuda = nullptr; + nullptr; #endif static struct { codec_t ug_codec; enum cmpto_sample_format_type cmpto_sf; codec_t convert_codec; - /// must be not-NULL if convert_codec != VC_NONE and HAVE_CUDA - cuda_convert_func_t cuda_convert_func; + cmpto_j2k_enc_preprocessor_run_callback_cuda cuda_convert_func; } codecs[] = { {UYVY, CMPTO_422_U8_P1020, VIDEO_CODEC_NONE, nullptr}, {v210, CMPTO_422_U10_V210, VIDEO_CODEC_NONE, nullptr}, @@ -212,25 +208,14 @@ ADD_TO_PARAM( "* " CPU_CONV_PARAM "\n" " Enforce CPU conversion instead of CUDA (applicable to R12L now)\n"); static void -set_pool(struct state_video_compress_j2k *s, struct video_desc desc) +set_pool(struct state_video_compress_j2k *s, bool have_gpu_preprocess) { #ifdef HAVE_CUDA - const bool force_cpu_conv = - get_commandline_param(CPU_CONV_PARAM) != nullptr; s->pool_in_device_memory = false; if (cuda_devices_count > 1) { MSG(WARNING, "More than 1 CUDA device will use CPU buffers and " "conversion...\n"); - } else if (!force_cpu_conv || s->cuda_convert_func == nullptr) { - cuda_wrapper_set_device((int) cuda_devices[0]); - - if (s->cuda_convert_func != nullptr) { - cuda_wrapper_malloc( - (void **) &s->cuda_conv_tmp_buf, - vc_get_datalen(desc.width, desc.height, desc.color_spec) + - MAX_PADDING); - } - + } else if (s->precompress_codec == VC_NONE || have_gpu_preprocess) { s->pool_in_device_memory = true; s->pool = video_frame_pool( s->max_in_frames, @@ -238,25 +223,27 @@ set_pool(struct state_video_compress_j2k *s, struct video_desc desc) cuda_wrapper_malloc, cuda_wrapper_free>()); return; } - s->cuda_convert_func = nullptr; // either was 0 or force_cpu_conv s->pool = video_frame_pool( s->max_in_frames, cmpto_j2k_enc_cuda_buffer_data_allocator()); #else + assert(!have_gpu_preprocess); // if CUDA not found, we shouldn't have s->pool = video_frame_pool(s->max_in_frames, default_data_allocator()); #endif } static bool configure_with(struct state_video_compress_j2k *s, struct video_desc desc){ enum cmpto_sample_format_type sample_format; + cmpto_j2k_enc_preprocessor_run_callback_cuda cuda_convert_func = + nullptr; bool found = false; for(const auto &codec : codecs){ if(codec.ug_codec == desc.color_spec){ sample_format = codec.cmpto_sf; s->precompress_codec = codec.convert_codec; - s->cuda_convert_func = codec.cuda_convert_func; + cuda_convert_func = codec.cuda_convert_func; found = true; break; } @@ -275,6 +262,10 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc s->configured = false; } + if (get_commandline_param(CPU_CONV_PARAM) != nullptr) { + cuda_convert_func = nullptr; + } + struct cmpto_j2k_enc_ctx_cfg *ctx_cfg = nullptr; CHECK_OK(cmpto_j2k_enc_ctx_cfg_create(&ctx_cfg), "Context configuration create", return false); @@ -284,6 +275,11 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc ctx_cfg, cuda_devices[i], s->mem_limit, s->tile_limit), "Setting CUDA device", return false); } + if (cuda_convert_func != nullptr) { + CHECK_OK(cmpto_j2k_enc_ctx_cfg_set_preprocessor_cuda( + ctx_cfg, nullptr, nullptr, cuda_convert_func), + "Setting CUDA preprocess", return false); + } CHECK_OK(cmpto_j2k_enc_ctx_create(ctx_cfg, &s->context), "Context create", return false); @@ -321,7 +317,7 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc "Setting MCT", NOOP); - set_pool(s, desc); + set_pool(s, cuda_convert_func != nullptr); s->compressed_desc = desc; s->compressed_desc.color_spec = codec_is_a_rgb(desc.color_spec) ? J2KR : J2K; @@ -341,26 +337,15 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc * Does the pixel format conversion as well if specified. */ static void -do_gpu_copy(struct state_video_compress_j2k *s, - std::shared_ptr &ret, video_frame *in_frame) +do_gpu_copy(std::shared_ptr &ret, video_frame *in_frame) { #ifdef HAVE_CUDA cuda_wrapper_set_device((int) cuda_devices[0]); - if (s->cuda_convert_func == nullptr) { - assert(s->precompress_codec == VC_NONE); - cuda_wrapper_memcpy(ret->tiles[0].data, in_frame->tiles[0].data, - in_frame->tiles[0].data_len, - CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE); - return; - } - cuda_wrapper_memcpy(s->cuda_conv_tmp_buf, in_frame->tiles[0].data, + cuda_wrapper_memcpy(ret->tiles[0].data, in_frame->tiles[0].data, in_frame->tiles[0].data_len, CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE); - s->cuda_convert_func((int) in_frame->tiles[0].width, - (int) in_frame->tiles[0].height, - s->cuda_conv_tmp_buf, ret->tiles[0].data); #else - (void) s, (void) ret, (void) in_frame; + (void) ret, (void) in_frame; abort(); // must not reach here #endif } @@ -369,7 +354,7 @@ static shared_ptr get_copy(struct state_video_compress_j2k *s, vide std::shared_ptr ret = s->pool.get_frame(); if (s->pool_in_device_memory) { - do_gpu_copy(s, ret, frame); + do_gpu_copy(ret, frame); } else if (s->precompress_codec != VC_NONE) { parallel_conv(ret.get(), frame); } else { @@ -615,7 +600,8 @@ static void j2k_compress_push(struct module *state, std::shared_ptr return; } struct video_desc pool_desc = desc; - if (s->precompress_codec != VC_NONE) { + if (s->precompress_codec != VC_NONE && + !s->pool_in_device_memory) { pool_desc.color_spec = s->precompress_codec; } s->pool.reconfigure( @@ -646,9 +632,17 @@ static void j2k_compress_push(struct module *state, std::shared_ptr vf_store_metadata(tx.get(), udata->metadata); if (s->pool_in_device_memory) { + // cmpto_j2k_enc requires the size after postprocess, which + // doesn't equeal the IN frame data_len for R12L + const codec_t device_codec = s->precompress_codec == VC_NONE + ? udata->frame->color_spec + : s->precompress_codec; + const size_t data_len = + vc_get_datalen(udata->frame->tiles[0].width, + udata->frame->tiles[0].height, device_codec); CHECK_OK(cmpto_j2k_enc_img_set_samples_cuda( img, cuda_devices[0], udata->frame->tiles[0].data, - udata->frame->tiles[0].data_len, release_cstream_cuda), + data_len, release_cstream_cuda), "Setting image samples", HANDLE_ERROR_COMPRESS_PUSH); } else { CHECK_OK(cmpto_j2k_enc_img_set_samples( @@ -694,11 +688,6 @@ cleanup_common(struct state_video_compress_j2k *s) cmpto_j2k_enc_ctx_destroy(s->context); } s->context = nullptr; - -#ifdef HAVE_CUDA - cuda_wrapper_free(s->cuda_conv_tmp_buf); - s->cuda_conv_tmp_buf = nullptr; -#endif } static compress_module_info get_cmpto_j2k_module_info(){