Skip to content

Commit

Permalink
rt48_to_r12l_compute_blk: compute |last_bl|%8 != 0
Browse files Browse the repository at this point in the history
Compute the last incomplete block in the CUDA kernel (as is already done
for the cmpto_j2k enc).
  • Loading branch information
MartinPulec committed Sep 3, 2024
1 parent c83f6ad commit bb19d58
Showing 1 changed file with 36 additions and 17 deletions.
53 changes: 36 additions & 17 deletions src/cuda_wrapper/kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,25 +72,10 @@

/**
* modified @ref vc_copylineRG48toR12L
* @todo fix the last block for widths not divisible by 8
*/
__global__ void
kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)
__device__ static void
rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst)
{
unsigned position_x = threadIdx.x + blockIdx.x * blockDim.x;
unsigned position_y = threadIdx.y + blockIdx.y * blockDim.y;
if (position_x >= (size_x + 7) / 8) {
return;
}
// drop last block if not complete (prevent overriding start of
// following line with junk and also possibly OOB access)
if (position_x > size_x / 8) {
return;
}
uint8_t *src = in + 2 * (position_y * 3 * size_x + position_x * 3 * 8);
uint8_t *dst =
out + (position_y * ((size_x + 7) / 8) + position_x) * 36;

// 0
dst[0] = src[0] >> 4;
dst[0] |= src[1] << 4;
Expand Down Expand Up @@ -208,6 +193,40 @@ kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)
src += 2;
}

/**
 * Converts the trailing, incomplete block of a line.
 *
 * The valid source bytes (6 per pixel) are copied into a 48-byte scratch
 * buffer, which is then handed to rt48_to_r12l_compute_blk() in place of
 * the real source. This keeps the block conversion from reading past the
 * valid pixels of the line.
 *
 * The scratch buffer is zero-initialized so that the bytes following the
 * valid pixels are well defined; otherwise the padding part of the output
 * block would be derived from uninitialized thread-local data.
 *
 * @param src   first byte of the incomplete block in the input line
 * @param dst   destination, passed unchanged to rt48_to_r12l_compute_blk()
 * @param width number of valid pixels in the block; expected to be less
 *              than 8, larger values are clamped to the scratch capacity
 */
__device__ static void
rt48_to_r12l_compute_last_blk(uint8_t *src, uint8_t *dst, unsigned width)
{
        enum {
                BLK_IN_BYTES = 48, ///< scratch capacity (8 pixels * 6 bytes)
        };
        uint8_t tmp[BLK_IN_BYTES] = { 0 };

        // never copy more than the scratch buffer can hold
        unsigned valid_bytes = width * 6;
        if (valid_bytes > BLK_IN_BYTES) {
                valid_bytes = BLK_IN_BYTES;
        }
        for (unsigned i = 0; i < valid_bytes; ++i) {
                tmp[i] = src[i];
        }
        rt48_to_r12l_compute_blk(tmp, dst);
}

/**
 * Converts RG48 lines to R12L; every thread handles one block of 8 pixels.
 *
 * The x thread index selects the block within a line, the y thread index
 * selects the line. A block starts 48 bytes after the previous one in the
 * input and 36 bytes after the previous one in the output. An output line
 * holds (size_x + 7) / 8 blocks, so a trailing incomplete block still
 * occupies a whole 36-byte output block.
 *
 * Widths not divisible by 8 are supported - the trailing incomplete block
 * is converted by rt48_to_r12l_compute_last_blk().
 *
 * @param in     input buffer, lines are size_x * 6 bytes long
 * @param out    output buffer, lines are ((size_x + 7) / 8) * 36 bytes long
 * @param size_x line width in pixels
 *
 * @note Only the x index is bounds-checked here; the y extent of the
 *       launch grid is assumed to match the number of lines (to be
 *       confirmed against the host-side launch code).
 */
__global__ static void
kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)
{
        const unsigned position_x = threadIdx.x + blockIdx.x * blockDim.x;
        const unsigned position_y = threadIdx.y + blockIdx.y * blockDim.y;
        const unsigned blocks_per_line = (size_x + 7) / 8;
        if (position_x >= blocks_per_line) {
                return;
        }

        // byte offsets are computed in size_t so that they cannot wrap
        // around the 32-bit range for large frames
        const size_t src_off =
            2 * ((size_t) position_y * 3 * size_x + (size_t) position_x * 3 * 8);
        const size_t dst_off =
            ((size_t) position_y * blocks_per_line + position_x) * 36;
        uint8_t *src = in + src_off;
        uint8_t *dst = out + dst_off;

        // handle incomplete blocks - reachable only if size_x % 8 != 0,
        // otherwise this index is already rejected by the guard above
        if (position_x == size_x / 8) {
                rt48_to_r12l_compute_last_blk(src, dst,
                                              size_x - position_x * 8);
                return;
        }
        rt48_to_r12l_compute_blk(src, dst);
}

/**
* @sa cmpto_j2k_dec_postprocessor_run_callback_cuda
*/
Expand Down

0 comments on commit bb19d58

Please sign in to comment.