Skip to content

Commit

Permalink
Merge branch 'CESNET:master' into wip-cmpto-j2k-cpu
Browse files Browse the repository at this point in the history
  • Loading branch information
ATrivialAtomic authored Sep 10, 2024
2 parents 392b3b4 + 8b7db97 commit d3f1ae4
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 41 deletions.
28 changes: 13 additions & 15 deletions src/audio/playback/alsa.c
Original file line number Diff line number Diff line change
Expand Up @@ -448,18 +448,18 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode,
enum {
REC_MIN_BUF_US = 5000,
};
unsigned int buf_len = 0;
unsigned int buf_len_us = 0;
int buf_dir = -1;
const char *buff_param = get_commandline_param("alsa-playback-buffer");

if (get_commandline_param("low-latency-audio") != NULL &&
buff_param == NULL) {
// set minimal value from the configuration space
CHECK_OK(snd_pcm_hw_params_set_buffer_time_first(
handle, params, &buf_len, &buf_dir));
handle, params, &buf_len_us, &buf_dir));
MSG(INFO, "ALSA driver buffer len set to: %lf ms\n",
buf_len / US_IN_1MS_DBL);
if (buf_len <= REC_MIN_BUF_US) {
US_TO_MS((double) buf_len_us));
if (buf_len_us <= REC_MIN_BUF_US) {
MSG(WARNING,
"ALSA driver buffer len less than %d usec seem to "
"be too loow, consider using alsa-playback-buffer "
Expand All @@ -469,22 +469,20 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode,
return;
}

if (buff_param != NULL) {
buf_len = atoi(buff_param);
} else {
buf_len = (playback_mode == SYNC ? BUF_LEN_DEFAULT_SYNC_MS
: BUF_LEN_DEFAULT_MS) *
US_IN_1MS;
}
buf_len_us = buff_param != NULL ? atoi(buff_param)
: MS_TO_US(playback_mode == SYNC
? BUF_LEN_DEFAULT_SYNC_MS
: BUF_LEN_DEFAULT_MS);

const int rc = snd_pcm_hw_params_set_buffer_time_near(
handle, params, &buf_len, &buf_dir);
handle, params, &buf_len_us, &buf_dir);
if (rc < 0) {
MSG(WARNING, "Warning - unable to set buffer to its size: %s\n",
snd_strerror(rc));
MSG(WARNING,
"Warning - unable to set buffer to its size %u us: %s\n",
buf_len_us, snd_strerror(rc));
}
MSG(INFO, "ALSA driver buffer len set to: %lf ms\n",
buf_len / US_IN_1MS_DBL);
US_TO_MS((double) buf_len_us));
}

ADD_TO_PARAM("alsa-play-period-size", "* alsa-play-period-size=<frames>\n"
Expand Down
119 changes: 98 additions & 21 deletions src/cuda_wrapper/kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,8 @@
#include <cstdio>
#include <cuda_runtime_api.h>

#ifdef DEBUG
#define D_PRINTF printf
#else
#define D_PRINTF(...)
#endif
extern volatile int log_level;
#define LOG_LEVEL_DEBUG 7

#define MEASURE_KERNEL_DURATION_START(stream) \
cudaEvent_t t0, t1; \
Expand All @@ -57,7 +54,9 @@
cudaEventSynchronize(t1); \
float elapsedTime = NAN; \
cudaEventElapsedTime(&elapsedTime, t0, t1); \
D_PRINTF("%s elapsed time: %f ms\n", __func__, elapsedTime); \
if (log_level >= LOG_LEVEL_DEBUG) { \
printf("%s elapsed time: %f ms\n", __func__, elapsedTime); \
} \
if (elapsedTime > 10.0) { \
fprintf( \
stderr, \
Expand All @@ -73,9 +72,26 @@
/**
* modified @ref vc_copylineRG48toR12L
*/
template <typename load_t>
__device__ static void
rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst)
rt48_to_r12l_compute_blk(const uint8_t *in, uint8_t *out)
{
// load the data from in to src_u32
auto *in_t = (load_t *) in;
uint32_t src_u32[12];
for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
static_assert(sizeof(load_t) == 2 || sizeof(load_t) == 4);
if constexpr (sizeof(load_t) == 4) {
src_u32[i] = in_t[i];
} else {
src_u32[i] = in_t[2 * i] | in_t[2 * i + 1] << 16;
}
}

uint32_t dst_u32[9];
auto *dst = (uint8_t *) dst_u32;
auto *src = (uint8_t *) src_u32;

// 0
dst[0] = src[0] >> 4;
dst[0] |= src[1] << 4;
Expand Down Expand Up @@ -191,21 +207,29 @@ rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst)
dst[32 + 2] |= src[0] & 0xF0;
dst[32 + 3] = src[1];
src += 2;

// store the result
auto *out_u32 = (uint32_t *) out;
for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) {
out_u32[i] = dst_u32[i];
}
}

template <typename load_t>
__device__ static void
rt48_to_r12l_compute_last_blk(uint8_t *src, uint8_t *dst, unsigned width)
{
uint8_t tmp[48];
alignas(uint32_t) uint8_t tmp[48];
for (unsigned i = 0; i < width * 6; ++i) {
tmp[i] = src[i];
}
rt48_to_r12l_compute_blk(tmp, dst);
rt48_to_r12l_compute_blk<load_t>(tmp, dst);
}

/**
* @todo fix the last block for widths not divisible by 8
*/
template <typename load_t>
__global__ static void
kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)
{
Expand All @@ -220,11 +244,11 @@ kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)

// handle incomplete blocks
if (position_x == size_x / 8) {
rt48_to_r12l_compute_last_blk(src, dst,
size_x - position_x * 8);
rt48_to_r12l_compute_last_blk<load_t>(src, dst,
size_x - position_x * 8);
return;
}
rt48_to_r12l_compute_blk(src, dst);
rt48_to_r12l_compute_blk<load_t>(src, dst);
}

/**
Expand Down Expand Up @@ -252,9 +276,24 @@ int postprocess_rg48_to_r12l(

MEASURE_KERNEL_DURATION_START(stream)

kernel_rg48_to_r12l<<<blocks, threads_per_block, 0,
(cudaStream_t) stream>>>(
(uint8_t *) input_samples, (uint8_t *) output_buffer, size_x);
if (size_x % 2 == 0) {
kernel_rg48_to_r12l<uint32_t>
<<<blocks, threads_per_block, 0, (cudaStream_t) stream>>>(
(uint8_t *) input_samples, (uint8_t *) output_buffer,
size_x);
} else {
thread_local bool warn_print;
if (!warn_print) {
fprintf(stderr,
"%s: Odd width %d px will use slower kernel!\n",
__func__, size_x);
warn_print = true;
}
kernel_rg48_to_r12l<uint16_t>
<<<blocks, threads_per_block, 0, (cudaStream_t) stream>>>(
(uint8_t *) input_samples, (uint8_t *) output_buffer,
size_x);
}

MEASURE_KERNEL_DURATION_STOP(stream)

Expand All @@ -266,9 +305,11 @@ int postprocess_rg48_to_r12l(
// / , _/ / / / __/ / /__/___/ > > / , _// (_ / /_ _// _ |
// /_/|_| /_/ /____/ /____/ /_/ /_/|_| \___/ /_/ \___/

template <typename store_t>
__device__ static void r12l_to_rg48_compute_blk(const uint8_t *src,
uint8_t *dst);

template <typename store_t>
__global__ static void
kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)
{
Expand All @@ -283,20 +324,32 @@ kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)

if (position_x == size_x / 8) {
// compute the last incomplete block
uint8_t tmp[48];
r12l_to_rg48_compute_blk(src, tmp);
alignas(uint32_t) uint8_t tmp[48];
r12l_to_rg48_compute_blk<store_t>(src, tmp);
for (unsigned i = 0; i < (size_x - position_x * 8) * 6; ++i) {
dst[i] = tmp[i];
}
return;
}
r12l_to_rg48_compute_blk(src, dst);
r12l_to_rg48_compute_blk<store_t>(src, dst);
}

/// adapted variant of @ref vc_copylineR12LtoRG48
template <typename store_t>
__device__ static void
r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst)
r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out)
{
// load the data from in to src_u32
auto *in_u32 = (uint32_t *) in;
uint32_t src_u32[9];
for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
src_u32[i] = in_u32[i];
}

uint32_t dst_u32[12];
uint8_t *dst = (uint8_t *) dst_u32;
uint8_t *src = (uint8_t *) src_u32;

// 0
// R
*dst++ = src[0] << 4;
Expand Down Expand Up @@ -377,6 +430,18 @@ r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst)

*dst++ = src[32 + 2] & 0xF0;
*dst++ = src[32 + 3];

// store the result
auto *out_t = (store_t *) out;
for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) {
static_assert(sizeof(store_t) == 2 || sizeof(store_t) == 4);
if constexpr (sizeof(store_t) == 4) {
out_t[i] = dst_u32[i];
} else {
out_t[2 * i] = dst_u32[i] & 0xFFFFU;
out_t[2 * i + 1] = dst_u32[i] >> 16;
}
}
}

void
Expand All @@ -387,8 +452,20 @@ preprocess_r12l_to_rg48(int width, int height, void *src, void *dst)
dim3 blocks((((width + 7) / 8) + 255) / 256, height);

MEASURE_KERNEL_DURATION_START(0)
kernel_r12l_to_rg48<<<blocks, threads_per_block>>>(
(uint8_t *) src, (uint8_t *) dst, width);
if (width % 2 == 0) {
kernel_r12l_to_rg48<uint32_t><<<blocks, threads_per_block>>>(
(uint8_t *) src, (uint8_t *) dst, width);
} else {
thread_local bool warn_print;
if (!warn_print) {
fprintf(stderr,
"%s: Odd width %d px will use slower kernel!\n",
__func__, width);
warn_print = true;
}
kernel_r12l_to_rg48<uint16_t><<<blocks, threads_per_block>>>(
(uint8_t *) src, (uint8_t *) dst, width);
}
MEASURE_KERNEL_DURATION_STOP(0)
}

9 changes: 6 additions & 3 deletions src/export.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <assert.h>
#include <dirent.h>
#include <errno.h> // for errno, EEXIST
#include <limits.h>
Expand Down Expand Up @@ -341,7 +340,9 @@ static void process_messages(struct exporter *s) {

void export_audio(struct exporter *s, struct audio_frame *frame)
{
assert(s != NULL);
if(!s){
return;
}

process_messages(s);

Expand All @@ -354,7 +355,9 @@ void export_audio(struct exporter *s, struct audio_frame *frame)

void export_video(struct exporter *s, struct video_frame *frame)
{
assert(s != NULL);
if(!s){
return;
}

process_messages(s);

Expand Down
6 changes: 4 additions & 2 deletions src/tv.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,6 @@ typedef long long time_ns_t;
#define MS_IN_NS_DBL 1000000.0
#define MS_IN_SEC 1000
#define MS_IN_SEC_DBL 1000.0
#define US_IN_1MS 1000
#define US_IN_1MS_DBL 1000.0
#define US_IN_SEC 1000000LL
#define US_IN_NS 1000LL
#define US_IN_SEC_DBL ((double) US_IN_SEC)
Expand All @@ -97,6 +95,10 @@ typedef long long time_ns_t;
#define NS_IN_SEC_DBL ((double) NS_IN_SEC)
#define NS_IN_US (NS_IN_SEC/US_IN_SEC)
#define NS_IN_US_DBL ((double) NS_IN_US)
#define US_TO_MS(val_us) ((val_us) / 1000)
#define MS_TO_US(val_ms) ((val_ms) * 1000)
#define NS_TO_MS(val_ns) ((val_ns) / 1000 / 1000)

static inline time_ns_t get_time_in_ns() {
#ifdef HAVE_TIMESPEC_GET
struct timespec ts = { 0, 0 };
Expand Down
5 changes: 5 additions & 0 deletions src/video_compress/cmpto_j2k.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,14 @@ static void parallel_conv(video_frame *dst, video_frame *src){
decoder_t decoder =
get_decoder_from_to(src->color_spec, dst->color_spec);
assert(decoder != nullptr);
time_ns_t t0 = get_time_in_ns();
parallel_pix_conv((int) src->tiles[0].height, dst->tiles[0].data,
dst_pitch, src->tiles[0].data, src_pitch,
decoder, 0);
if (log_level >= LOG_LEVEL_DEBUG) {
MSG(DEBUG, "pixfmt conversion duration: %f ms\n",
NS_TO_MS((double) (get_time_in_ns() - t0)));
}
}

static struct {
Expand Down
5 changes: 5 additions & 0 deletions src/video_decompress/cmpto_j2k.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,9 +212,14 @@ static void rg48_to_r12l(unsigned char *dst_buffer,
int dst_len = vc_get_linesize(width, R12L);
decoder_t vc_copylineRG48toR12L = get_decoder_from_to(RG48, R12L);

time_ns_t t0 = get_time_in_ns();
parallel_pix_conv((int) height, (char *) dst_buffer, dst_len,
(const char *) src_buffer, src_pitch,
vc_copylineRG48toR12L, 0);
if (log_level >= LOG_LEVEL_DEBUG) {
MSG(DEBUG, "pixfmt conversion duration: %f ms\n",
NS_TO_MS((double) (get_time_in_ns() - t0)));
}
}

static void print_dropped(unsigned long long int dropped, const j2k_decompress_platform& platform) {
Expand Down

0 comments on commit d3f1ae4

Please sign in to comment.