Skip to content

Commit

Permalink
vcomp/cmpto_j2k: run GPU conv as prepreprocess
Browse files Browse the repository at this point in the history
Set the pixfmt conversion CUDA kernel as cmpto_j2k_enc preprocessor,
not run directly.

This also eliminates to need to have the conversion kernel if conversion
is needed - CPU conversion will be sufficient. Currently not effective,
only R12L is converted for which there is the kernel.

refer to GH-406
  • Loading branch information
MartinPulec committed Sep 12, 2024
1 parent caacebc commit 92565ca
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 65 deletions.
45 changes: 31 additions & 14 deletions src/cuda_wrapper/kernels.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
/**
* @file cuda_wrapper/kernels.cu
* @author Martin Pulec <[email protected]>
*
* This file hosts various CUDA kernels. Currently there are only kernels
* for cmpto_j2k compression and decompression.
*/
/*
* Copyright (c) 2024 CESNET
Expand Down Expand Up @@ -298,7 +301,7 @@ int postprocess_rg48_to_r12l(

MEASURE_KERNEL_DURATION_STOP(stream)

return 0;
return cudaGetLastError();
}

// ___ ___ ___ __ __ ___ _____ ____ ___
Expand Down Expand Up @@ -446,28 +449,42 @@ r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out)
}
}

void
preprocess_r12l_to_rg48(int width, int height, void *src, void *dst)
int
preprocess_r12l_to_rg48(void *preprocessor, void *img_custom_data,
size_t img_custom_data_size, int size_x, int size_y,
struct cmpto_j2k_enc_comp_format *comp_formats,
int comp_count, void *input_samples,
size_t input_samples_size, void *output_samples,
size_t output_samples_size, void *vstream)
{
(void) width, (void) height, (void) src, (void) dst;
(void) preprocessor, (void) img_custom_data,
(void) img_custom_data_size, (void) comp_formats, (void) comp_count,
(void) input_samples_size, (void) output_samples_size;

cudaStream_t stream = (cudaStream_t) vstream;
dim3 threads_per_block(256);
dim3 blocks((((width + 7) / 8) + 255) / 256, height);
dim3 blocks((((size_x+ 7) / 8) + 255) / 256, size_y);

MEASURE_KERNEL_DURATION_START(0)
if (width % 2 == 0) {
kernel_r12l_to_rg48<uint32_t><<<blocks, threads_per_block>>>(
(uint8_t *) src, (uint8_t *) dst, width);
MEASURE_KERNEL_DURATION_START(stream)
if (size_x % 2 == 0) {
kernel_r12l_to_rg48<uint32_t>
<<<blocks, threads_per_block, 0, stream>>>(
(uint8_t *) input_samples, (uint8_t *) output_samples,
size_x);
} else {
thread_local bool warn_print;
if (!warn_print) {
fprintf(stderr,
"%s: Odd width %d px will use slower kernel!\n",
__func__, width);
__func__, size_x);
warn_print = true;
}
kernel_r12l_to_rg48<uint16_t><<<blocks, threads_per_block>>>(
(uint8_t *) src, (uint8_t *) dst, width);
kernel_r12l_to_rg48<uint16_t>
<<<blocks, threads_per_block, 0, stream>>>(
(uint8_t *) input_samples, (uint8_t *) output_samples,
size_x);
}
MEASURE_KERNEL_DURATION_STOP(0)
}
MEASURE_KERNEL_DURATION_STOP(stream)

return cudaGetLastError();
}
19 changes: 16 additions & 3 deletions src/cuda_wrapper/kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@

#include <cstddef>

// for cmpto_j2k compress and decompress
struct cmpto_j2k_dec_comp_format;

struct cmpto_j2k_enc_comp_format;
int postprocess_rg48_to_r12l(
void * postprocessor,
void * img_custom_data,
Expand All @@ -58,7 +59,19 @@ int postprocess_rg48_to_r12l(
size_t output_buffer_size,
void * stream
);

void preprocess_r12l_to_rg48(int width, int height, void *src, void *dst);
int preprocess_r12l_to_rg48(
void * preprocessor,
void * img_custom_data,
size_t img_custom_data_size,
int size_x,
int size_y,
struct cmpto_j2k_enc_comp_format * comp_formats,
int comp_count,
void * input_samples,
size_t input_samples_size,
void * output_samples,
size_t output_samples_size,
void *stream
);

#endif // defined CUDA_WRAPPER_KERNELS_HPP_1A3F7B57_EE91_4363_8D50_9CDDC60CB74F
85 changes: 37 additions & 48 deletions src/video_compress/cmpto_j2k.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@
#include "utils/misc.h"
#include "utils/parallel_conv.h"
#include "utils/video_frame_pool.h"
#include "video.h"
#include "video_codec.h" // for vc_get_linesize, codec_is_a_rgb
#include "video_compress.h"
#include "video_frame.h" // for vf_alloc_desc, vf_free, vf_resto...

#define MOD_NAME "[Cmpto J2K enc.] "

Expand Down Expand Up @@ -129,8 +130,6 @@ struct cmpto_j2k_enc_cuda_buffer_data_allocator
};
#endif

typedef void (*cuda_convert_func_t)(int width, int height, void *src, void *dst);

struct state_video_compress_j2k {
struct module module_data{};
struct cmpto_j2k_enc_ctx *context{};
Expand All @@ -157,9 +156,6 @@ struct state_video_compress_j2k {
condition_variable configure_cv;
bool configured = false;
bool should_exit = false;

cuda_convert_func_t cuda_convert_func = nullptr;
uint8_t *cuda_conv_tmp_buf = nullptr;
};

// prototypes
Expand All @@ -184,18 +180,18 @@ static void parallel_conv(video_frame *dst, video_frame *src){
}
}

const cmpto_j2k_enc_preprocessor_run_callback_cuda r12l_to_rg48_cuda =
#ifdef HAVE_CUDA
const cuda_convert_func_t r12l_to_rg48_cuda = preprocess_r12l_to_rg48;
preprocess_r12l_to_rg48;
#else
const cuda_convert_func_t r12l_to_rg48_cuda = nullptr;
nullptr;
#endif

static struct {
codec_t ug_codec;
enum cmpto_sample_format_type cmpto_sf;
codec_t convert_codec;
/// must be not-NULL if convert_codec != VC_NONE and HAVE_CUDA
cuda_convert_func_t cuda_convert_func;
cmpto_j2k_enc_preprocessor_run_callback_cuda cuda_convert_func;
} codecs[] = {
{UYVY, CMPTO_422_U8_P1020, VIDEO_CODEC_NONE, nullptr},
{v210, CMPTO_422_U10_V210, VIDEO_CODEC_NONE, nullptr},
Expand All @@ -212,51 +208,42 @@ ADD_TO_PARAM(
"* " CPU_CONV_PARAM "\n"
" Enforce CPU conversion instead of CUDA (applicable to R12L now)\n");
static void
set_pool(struct state_video_compress_j2k *s, struct video_desc desc)
set_pool(struct state_video_compress_j2k *s, bool have_gpu_preprocess)
{
#ifdef HAVE_CUDA
const bool force_cpu_conv =
get_commandline_param(CPU_CONV_PARAM) != nullptr;
s->pool_in_device_memory = false;
if (cuda_devices_count > 1) {
MSG(WARNING, "More than 1 CUDA device will use CPU buffers and "
"conversion...\n");
} else if (!force_cpu_conv || s->cuda_convert_func == nullptr) {
cuda_wrapper_set_device((int) cuda_devices[0]);

if (s->cuda_convert_func != nullptr) {
cuda_wrapper_malloc(
(void **) &s->cuda_conv_tmp_buf,
vc_get_datalen(desc.width, desc.height, desc.color_spec) +
MAX_PADDING);
}

} else if (s->precompress_codec == VC_NONE || have_gpu_preprocess) {
s->pool_in_device_memory = true;
s->pool = video_frame_pool(
s->max_in_frames,
cmpto_j2k_enc_cuda_buffer_data_allocator<
cuda_wrapper_malloc, cuda_wrapper_free>());
return;
}
s->cuda_convert_func = nullptr; // either was 0 or force_cpu_conv
s->pool = video_frame_pool(
s->max_in_frames,
cmpto_j2k_enc_cuda_buffer_data_allocator<cuda_wrapper_malloc_host,
cuda_wrapper_free_host>());
#else
assert(!have_gpu_preprocess); // if CUDA not found, we shouldn't have
s->pool = video_frame_pool(s->max_in_frames, default_data_allocator());
#endif
}

static bool configure_with(struct state_video_compress_j2k *s, struct video_desc desc){
enum cmpto_sample_format_type sample_format;
cmpto_j2k_enc_preprocessor_run_callback_cuda cuda_convert_func =
nullptr;
bool found = false;

for(const auto &codec : codecs){
if(codec.ug_codec == desc.color_spec){
sample_format = codec.cmpto_sf;
s->precompress_codec = codec.convert_codec;
s->cuda_convert_func = codec.cuda_convert_func;
cuda_convert_func = codec.cuda_convert_func;
found = true;
break;
}
Expand All @@ -275,6 +262,10 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
s->configured = false;
}

if (get_commandline_param(CPU_CONV_PARAM) != nullptr) {
cuda_convert_func = nullptr;
}

struct cmpto_j2k_enc_ctx_cfg *ctx_cfg = nullptr;
CHECK_OK(cmpto_j2k_enc_ctx_cfg_create(&ctx_cfg),
"Context configuration create", return false);
Expand All @@ -284,6 +275,11 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
ctx_cfg, cuda_devices[i], s->mem_limit, s->tile_limit),
"Setting CUDA device", return false);
}
if (cuda_convert_func != nullptr) {
CHECK_OK(cmpto_j2k_enc_ctx_cfg_set_preprocessor_cuda(
ctx_cfg, nullptr, nullptr, cuda_convert_func),
"Setting CUDA preprocess", return false);
}

CHECK_OK(cmpto_j2k_enc_ctx_create(ctx_cfg, &s->context),
"Context create", return false);
Expand Down Expand Up @@ -321,7 +317,7 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
"Setting MCT",
NOOP);

set_pool(s, desc);
set_pool(s, cuda_convert_func != nullptr);

s->compressed_desc = desc;
s->compressed_desc.color_spec = codec_is_a_rgb(desc.color_spec) ? J2KR : J2K;
Expand All @@ -341,26 +337,15 @@ static bool configure_with(struct state_video_compress_j2k *s, struct video_desc
* Does the pixel format conversion as well if specified.
*/
static void
do_gpu_copy(struct state_video_compress_j2k *s,
std::shared_ptr<video_frame> &ret, video_frame *in_frame)
do_gpu_copy(std::shared_ptr<video_frame> &ret, video_frame *in_frame)
{
#ifdef HAVE_CUDA
cuda_wrapper_set_device((int) cuda_devices[0]);
if (s->cuda_convert_func == nullptr) {
assert(s->precompress_codec == VC_NONE);
cuda_wrapper_memcpy(ret->tiles[0].data, in_frame->tiles[0].data,
in_frame->tiles[0].data_len,
CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE);
return;
}
cuda_wrapper_memcpy(s->cuda_conv_tmp_buf, in_frame->tiles[0].data,
cuda_wrapper_memcpy(ret->tiles[0].data, in_frame->tiles[0].data,
in_frame->tiles[0].data_len,
CUDA_WRAPPER_MEMCPY_HOST_TO_DEVICE);
s->cuda_convert_func((int) in_frame->tiles[0].width,
(int) in_frame->tiles[0].height,
s->cuda_conv_tmp_buf, ret->tiles[0].data);
#else
(void) s, (void) ret, (void) in_frame;
(void) ret, (void) in_frame;
abort(); // must not reach here
#endif
}
Expand All @@ -369,7 +354,7 @@ static shared_ptr<video_frame> get_copy(struct state_video_compress_j2k *s, vide
std::shared_ptr<video_frame> ret = s->pool.get_frame();

if (s->pool_in_device_memory) {
do_gpu_copy(s, ret, frame);
do_gpu_copy(ret, frame);
} else if (s->precompress_codec != VC_NONE) {
parallel_conv(ret.get(), frame);
} else {
Expand Down Expand Up @@ -615,7 +600,8 @@ static void j2k_compress_push(struct module *state, std::shared_ptr<video_frame>
return;
}
struct video_desc pool_desc = desc;
if (s->precompress_codec != VC_NONE) {
if (s->precompress_codec != VC_NONE &&
!s->pool_in_device_memory) {
pool_desc.color_spec = s->precompress_codec;
}
s->pool.reconfigure(
Expand Down Expand Up @@ -646,9 +632,17 @@ static void j2k_compress_push(struct module *state, std::shared_ptr<video_frame>
vf_store_metadata(tx.get(), udata->metadata);

if (s->pool_in_device_memory) {
// cmpto_j2k_enc requires the size after postprocess, which
// doesn't equeal the IN frame data_len for R12L
const codec_t device_codec = s->precompress_codec == VC_NONE
? udata->frame->color_spec
: s->precompress_codec;
const size_t data_len =
vc_get_datalen(udata->frame->tiles[0].width,
udata->frame->tiles[0].height, device_codec);
CHECK_OK(cmpto_j2k_enc_img_set_samples_cuda(
img, cuda_devices[0], udata->frame->tiles[0].data,
udata->frame->tiles[0].data_len, release_cstream_cuda),
data_len, release_cstream_cuda),
"Setting image samples", HANDLE_ERROR_COMPRESS_PUSH);
} else {
CHECK_OK(cmpto_j2k_enc_img_set_samples(
Expand Down Expand Up @@ -694,11 +688,6 @@ cleanup_common(struct state_video_compress_j2k *s)
cmpto_j2k_enc_ctx_destroy(s->context);
}
s->context = nullptr;

#ifdef HAVE_CUDA
cuda_wrapper_free(s->cuda_conv_tmp_buf);
s->cuda_conv_tmp_buf = nullptr;
#endif
}

static compress_module_info get_cmpto_j2k_module_info(){
Expand Down

0 comments on commit 92565ca

Please sign in to comment.