diff --git a/src/r3d.imageio/r3dinput.cpp b/src/r3d.imageio/r3dinput.cpp index bbdbc7c887..3f2d7c4b8a 100644 --- a/src/r3d.imageio/r3dinput.cpp +++ b/src/r3d.imageio/r3dinput.cpp @@ -9,8 +9,13 @@ // /opt/R3DSDKv8_5_1 directory and setting up the variable // export R3DSDK_ROOT="/opt/R3DSDKv8_5_1" +#define GPU +#define CUDA + +#include "OpenImageIO/platform.h" #include #include +#include #include #include @@ -58,7 +63,7 @@ OIIO_PLUGIN_NAMESPACE_BEGIN -#if 0 || !defined(NDEBUG) // allow R3D configuration debugging +#if 1 || !defined(NDEBUG) // allow R3D configuration debugging static bool r3d_debug = Strutil::stoi(Sysutil::getenv("OIIO_R3D_DEBUG")); # define DBG(...) \ if (r3d_debug) \ @@ -115,6 +120,11 @@ class R3dInput final : public ImageInput { std::unique_ptr m_config; // Saved copy of configuration spec R3DSDK::Clip* m_clip; R3DSDK::VideoDecodeJob m_job; +#ifdef GPU + bool m_gpu; + R3DSDK::DecodeStatus m_supported; + R3DSDK::AsyncDecompressJob m_async_decompress_job; +#endif // GPU unsigned char* m_image_buffer; int m_frames; int m_channels; @@ -170,6 +180,493 @@ OIIO_PLUGIN_EXPORTS_END +#ifdef CUDA + +namespace { +R3DSDK::GpuDecoder* GPU_DECODER; +R3DSDK::REDCuda* RED_CUDA; + +int CUDA_DEVICE_ID = 0; +volatile bool decodeDone = false; +} // namespace + +class SimpleMemoryPool { +public: + static SimpleMemoryPool* getInstance() + { + static SimpleMemoryPool* instance = NULL; + + if (instance == NULL) { + std::unique_lock lock(guard); + if (instance == NULL) { + instance = new SimpleMemoryPool(); + } + } + return instance; + } + + static cudaError_t cudaMalloc(void** p, size_t size) + { + DBG("cudaMalloc {}\n", size); + + return getInstance()->malloc_d(p, size); + } + + static cudaError_t cudaFree(void* p) { return getInstance()->free_d(p); } + + static cudaError_t cudaMallocArray(struct cudaArray** array, + const struct cudaChannelFormatDesc* desc, + size_t width, size_t height = 0, + unsigned int flags = 0) + { + DBG("cudaMallocArray {} {} {}\n", width, height, flags); + + return getInstance()->malloc_array(array, desc, width, height, flags); + } + + static cudaError_t + cudaMalloc3DArray(struct cudaArray** array, + const struct cudaChannelFormatDesc* desc, + struct cudaExtent ext, unsigned int flags = 0) + { + return getInstance()->malloc_array_3d(array, desc, ext, flags); + } + + static cudaError_t cudaFreeArray(cudaArray* p) + { + getInstance()->free_array(p); + + return cudaSuccess; + } + + static cudaError_t cudaMallocHost(void** p, size_t size) + { + return getInstance()->malloc_h(p, size); + } + + static cudaError_t cudaHostAlloc(void** p, size_t size, unsigned int flags) + { + return getInstance()->hostAlloc_h(p, size, flags); + } + + static cudaError_t cudaFreeHost(void* p) + { + getInstance()->free_h(p); + + return cudaSuccess; + } + +private: + static std::mutex guard; + + cudaError_t malloc_d(void** p, size_t size) + { + int device = 0; + cudaGetDevice(&device); + cudaError_t result = cudaSuccess; + *p = _device.findBlock(size, device); + + if (*p == NULL) { + result = ::cudaMalloc(p, size); + if (result != cudaSuccess) { + std::cerr << "Memory allocation of " << size + << " bytes failed: " << result << "\n"; + _device.sweep(); + _array.sweep(); + result = ::cudaMalloc(p, size); + } + if (result == cudaSuccess) + _device.addBlock(*p, size, device); + } + return result; + } + + cudaError_t free_d(void* p) + { + _device.releaseBlock(p); + return cudaSuccess; + } + + cudaError_t malloc_array(struct cudaArray** array, + const struct cudaChannelFormatDesc* desc, + size_t width, size_t height = 0, + unsigned int flags = 0) + { + int device = 0; + cudaGetDevice(&device); + cudaError_t result = cudaSuccess; + *array = (cudaArray*)_array.findBlock(width, height, 0, *desc, device); + + if (*array == NULL) { + result = ::cudaMallocArray(array, desc, width, height, flags); + if (result != cudaSuccess) { + DBG("Memory allocation failed: {}\n", static_cast(result)); + _device.sweep(); + _array.sweep(); + result = ::cudaMallocArray(array, desc, width, height, flags); + } + if (result == cudaSuccess) + _array.addBlock(*array, width, height, 0, *desc, device); + } + return result; + } + + cudaError_t malloc_array_3d(struct cudaArray** array, + const struct cudaChannelFormatDesc* desc, + const struct cudaExtent& ext, + unsigned int flags = 0) + { + int device = 0; + cudaGetDevice(&device); + cudaError_t result = cudaSuccess; + *array = (cudaArray*)_array.findBlock(ext.width, ext.height, ext.depth, + *desc, device); + + if (*array == NULL) { + result = ::cudaMalloc3DArray(array, desc, ext, flags); + if (result != cudaSuccess) { + DBG("Memory allocation failed: {}\n", static_cast(result)); + _device.sweep(); + _array.sweep(); + result = ::cudaMalloc3DArray(array, desc, ext, flags); + } + if (result == cudaSuccess) + _array.addBlock(*array, ext.width, ext.height, ext.depth, *desc, + device); + } + return result; + } + + void free_array(void* p) { _array.releaseBlock(p); } + + cudaError_t malloc_h(void** p, size_t size) + { + int device = 0; + cudaGetDevice(&device); + cudaError_t result = cudaSuccess; + *p = _host.findBlock(size, device); + + if (*p == NULL) { + result = ::cudaMallocHost(p, size); + if (result != cudaSuccess) { + DBG("Memory allocation failed: {}\n", static_cast(result)); + _host.sweep(); + result = ::cudaMallocHost(p, size); + } + if (result == cudaSuccess) + _host.addBlock(*p, size, device); + } + return result; + } + + void free_h(void* p) + { + if (!_host.releaseBlock(p)) { + _hostAlloc.releaseBlock(p); + } + } + + cudaError_t hostAlloc_h(void** p, size_t size, unsigned int flags) + { + int device = 0; + cudaGetDevice(&device); + cudaError_t result = cudaSuccess; + *p = _hostAlloc.findBlock(size, device); + + if (*p == NULL) { + result = ::cudaHostAlloc(p, size, flags); + if (result != cudaSuccess) { + DBG("Memory allocation failed: {}\n", static_cast(result)); + _hostAlloc.sweep(); + result = ::cudaHostAlloc(p, size, flags); + } + if (result == cudaSuccess) + _hostAlloc.addBlock(*p, size, device); + } + return result; + } + + struct BLOCK { + void* ptr; + size_t size; + int device; + }; + + struct ARRAY { + void* ptr; + size_t width; + size_t height; + size_t depth; + cudaChannelFormatDesc desc; + int device; + }; + + class Pool { + public: + void addBlock(void* ptr, size_t size, int device) + { + std::unique_lock lock(_guard); + + _inUse[ptr] = { ptr, size, device }; + } + + void* findBlock(size_t size, int device) + { + std::unique_lock lock(_guard); + + for (auto i = _free.begin(); i < _free.end(); ++i) { + if (i->size == size && i->device == device) { + void* p = i->ptr; + _inUse[p] = *i; + _free.erase(i); + return p; + } + } + return NULL; + } + + bool releaseBlock(void* ptr) + { + std::unique_lock lock(_guard); + + auto i = _inUse.find(ptr); + + if (i != _inUse.end()) { + _free.push_back(i->second); + _inUse.erase(i); + return true; + } + return false; + } + + void sweep() + { + std::unique_lock lock(_guard); + + for (auto i = _free.begin(); i < _free.end(); ++i) { + ::cudaFree(i->ptr); + } + _free.clear(); + } + + private: + std::map _inUse; + std::vector _free; + std::mutex _guard; + }; + + class ArrayPool { + public: + void addBlock(void* ptr, size_t width, size_t height, size_t depth, + const cudaChannelFormatDesc& desc, int device) + { + std::unique_lock lock(_guard); + + _inUse[ptr] = { ptr, width, height, depth, desc, device }; + } + + void* findBlock(size_t width, size_t height, size_t depth, + const cudaChannelFormatDesc& desc, int device) + { + std::unique_lock lock(_guard); + + for (auto i = _free.begin(); i < _free.end(); ++i) { + if (i->width == width && i->height == height + && i->depth == depth && i->desc.x == desc.x + && i->desc.y == desc.y && i->desc.z == desc.z + && i->desc.w == desc.w && i->desc.f == desc.f + && i->device == device) { + void* p = i->ptr; + _inUse[p] = *i; + _free.erase(i); + return p; + } + } + return NULL; + } + + bool releaseBlock(void* ptr) + { + std::unique_lock lock(_guard); + + auto i = _inUse.find(ptr); + + if (i != _inUse.end()) { + _free.push_back(i->second); + + _inUse.erase(i); + + return true; + } + return false; + } + + void sweep() + { + std::unique_lock lock(_guard); + + for (auto i = _free.begin(); i < _free.end(); ++i) { + ::cudaFree(i->ptr); + } + _free.clear(); + } + + private: + std::map _inUse; + std::vector _free; + std::mutex _guard; + }; + + Pool _device; + Pool _host; + Pool _hostAlloc; + ArrayPool _array; +}; + +std::mutex SimpleMemoryPool::guard; + + + +namespace { +R3DSDK::DebayerCudaJob* +DebayerAllocate(const R3DSDK::AsyncDecompressJob* job, + R3DSDK::ImageProcessingSettings* imageProcessingSettings, + R3DSDK::VideoPixelType pixelType) +{ + //allocate the debayer job + R3DSDK::DebayerCudaJob* data = RED_CUDA->createDebayerJob(); + + data->raw_host_mem = job->OutputBuffer; + data->mode = job->Mode; + data->imageProcessingSettings = imageProcessingSettings; + data->pixelType = pixelType; + + //create raw buffer on the CUDA device + cudaError_t err = SimpleMemoryPool::cudaMalloc(&(data->raw_device_mem), + job->OutputBufferSize); + + if (err != cudaSuccess) { + DBG("Failed to allocate raw frame on GPU: {}\n", static_cast(err)); + RED_CUDA->releaseDebayerJob(data); + return NULL; + } + + data->output_device_mem_size = R3DSDK::DebayerCudaJob::ResultFrameSize( + data); + DBG("data->output_device_mem_size = {}\n", data->output_device_mem_size); + + //YOU MUST specify an existing buffer for the result image + //Set DebayerCudaJob::output_device_mem_size >= result_buffer_size + //and a pointer to the device buffer in DebayerCudaJob::output_device_mem + err = SimpleMemoryPool::cudaMalloc(&(data->output_device_mem), + data->output_device_mem_size); + + if (err != cudaSuccess) { + DBG("Failed to allocate result frame on card {}\n", + static_cast(err)); + SimpleMemoryPool::cudaFree(data->raw_device_mem); + RED_CUDA->releaseDebayerJob(data); + return NULL; + } + + return data; +} + + + +void +DebayerFree(R3DSDK::DebayerCudaJob* job) +{ + SimpleMemoryPool::cudaFree(job->raw_device_mem); + SimpleMemoryPool::cudaFree(job->output_device_mem); + RED_CUDA->releaseDebayerJob(job); +} + + + +template class ConcurrentQueue { +private: + std::mutex QUEUE_MUTEX; + std::condition_variable QUEUE_CV; + std::list QUEUE; + +public: + void push(T* job) + { + std::unique_lock lck(QUEUE_MUTEX); + QUEUE.push_back(job); + QUEUE_CV.notify_all(); + } + + void pop(T*& job) + { + std::unique_lock lck(QUEUE_MUTEX); + + while (QUEUE.size() == 0) + QUEUE_CV.wait(lck); + + job = QUEUE.front(); + QUEUE.pop_front(); + } + + size_t size() const { return QUEUE.size(); } +}; + + + +void +CPU_callback(R3DSDK::AsyncDecompressJob* item, + R3DSDK::DecodeStatus decodeStatus) +{ + // DBG("CPU_callback()\n"); + Strutil::print("CPU_callback()\n"); + decodeDone = true; +} + + + +R3DSDK::REDCuda* +OpenCUDA(int& deviceId) +{ + //setup Cuda for the current thread + cudaDeviceProp deviceProp; + cudaError_t err = cudaChooseDevice(&deviceId, &deviceProp); + if (err != cudaSuccess) { + DBG("Failed to move raw frame to card {}\n", static_cast(err)); + return NULL; + } + + err = cudaSetDevice(deviceId); + if (err != cudaSuccess) { + DBG("Failed to move raw frame to card {}\n", static_cast(err)); + return NULL; + } + + //SETUP YOUR CUDA API FUNCTION POINTERS + R3DSDK::EXT_CUDA_API api; + api.cudaFree = SimpleMemoryPool::cudaFree; + api.cudaFreeArray = SimpleMemoryPool::cudaFreeArray; + api.cudaFreeHost = SimpleMemoryPool::cudaFreeHost; + api.cudaFreeMipmappedArray = ::cudaFreeMipmappedArray; + api.cudaHostAlloc = SimpleMemoryPool::cudaHostAlloc; + api.cudaMalloc = SimpleMemoryPool::cudaMalloc; + api.cudaMalloc3D = ::cudaMalloc3D; + api.cudaMalloc3DArray = SimpleMemoryPool::cudaMalloc3DArray; + api.cudaMallocArray = SimpleMemoryPool::cudaMallocArray; + api.cudaMallocHost = SimpleMemoryPool::cudaMallocHost; + api.cudaMallocMipmappedArray = ::cudaMallocMipmappedArray; + api.cudaMallocPitch = ::cudaMallocPitch; + + + //CREATE THE REDCuda CLASS + return new R3DSDK::REDCuda(api); +} +} //end anonymous namespace + +#endif // CUDA + + + void R3dInput::initialize() { @@ -188,9 +685,20 @@ R3dInput::initialize() #endif ); // initialize SDK - // R3DSDK::InitializeStatus init_status = R3DSDK::InitializeSdk(".", OPTION_RED_CUDA); + + unsigned int optional_components = +#ifdef CUDA + OPTION_RED_CUDA; +#elif defined(OpenCL) + OPTION_RED_OPENCL; +#elif defined(Metal) + OPTION_RED_METAL; +#else + OPTION_RED_NONE; +#endif + R3DSDK::InitializeStatus init_status - = R3DSDK::InitializeSdk(library_path.c_str(), OPTION_RED_NONE); + = R3DSDK::InitializeSdk(library_path.c_str(), optional_components); if (init_status != R3DSDK::ISInitializeOK) { R3DSDK::FinalizeSdk(); DBG("Failed to load R3DSDK Library\n"); @@ -198,7 +706,7 @@ R3dInput::initialize() } DBG("SDK VERSION: {}\n", R3DSDK::GetSdkVersion()); -#ifdef GPU +#ifdef CUDA // open CUDA device RED_CUDA = OpenCUDA(CUDA_DEVICE_ID); @@ -206,7 +714,9 @@ R3dInput::initialize() R3DSDK::FinalizeSdk(); DBG("Failed to initialize CUDA\n"); } -#endif // GPU + + m_gpu = true; +#endif // CUDA } @@ -245,30 +755,29 @@ R3dInput::open(const std::string& name, ImageSpec& newspec, } R3DSDK::VideoDecodeMode mode = R3DSDK::DECODE_FULL_RES_PREMIUM; - int scale = 1; - - switch (hint) - { - case 0: - mode = R3DSDK::DECODE_FULL_RES_PREMIUM; - scale = 1; - break; - case 1: - mode = R3DSDK::DECODE_HALF_RES_GOOD; - scale = 2; - break; - case 2: - mode = R3DSDK::DECODE_QUARTER_RES_GOOD; - scale = 4; - break; - case 3: - mode = R3DSDK::DECODE_EIGHT_RES_GOOD; - scale = 8; - break; - case 4: - mode = R3DSDK::DECODE_SIXTEENTH_RES_GOOD; - scale = 16; - break; + int scale = 1; + + switch (hint) { + case 0: + mode = R3DSDK::DECODE_FULL_RES_PREMIUM; + scale = 1; + break; + case 1: + mode = R3DSDK::DECODE_HALF_RES_GOOD; + scale = 2; + break; + case 2: + mode = R3DSDK::DECODE_QUARTER_RES_GOOD; + scale = 4; + break; + case 3: + mode = R3DSDK::DECODE_EIGHT_RES_GOOD; + scale = 8; + break; + case 4: + mode = R3DSDK::DECODE_SIXTEENTH_RES_GOOD; + scale = 16; + break; } // calculate how much ouput memory we're going to need @@ -303,32 +812,74 @@ R3dInput::open(const std::string& name, ImageSpec& newspec, return false; } - // letting the decoder know how big the buffer is - m_job.OutputBufferSize = memNeeded; +#ifdef GPU + if (m_gpu) { + // open GPU decoder + GPU_DECODER = new R3DSDK::GpuDecoder(); + GPU_DECODER->Open(); + + m_supported = GPU_DECODER->DecodeSupportedForClip(*m_clip); + } + + if (m_supported == R3DSDK::DSDecodeOK) { + m_async_decompress_job.Clip = m_clip; + + m_async_decompress_job.Mode = mode; - m_job.Mode = mode; + // letting the decoder know how big the buffer is + m_async_decompress_job.OutputBufferSize + = R3DSDK::AsyncDecoder::GetSizeBufferNeeded(m_async_decompress_job); - // store the image here - m_job.OutputBuffer = m_image_buffer; + DBG("OutputBufferSize = {}\n", m_async_decompress_job.OutputBufferSize); - // Interleaved RGB decoding in 16-bits per pixel - m_job.PixelType = R3DSDK::PixelType_16Bit_RGB_Interleaved; - m_job.BytesPerRow = m_channels * width * sizeof(uint16_t); + m_async_decompress_job.OutputBuffer = static_cast( + aligned_malloc(m_async_decompress_job.OutputBufferSize, 16)); - m_job.ImageProcessing = NULL; - m_job.HdrProcessing = NULL; + // Interleaved RGB decoding in 16-bits per pixel + // m_async_decompress_job.PixelType + // = R3DSDK::PixelType_16Bit_RGB_Interleaved; + // m_async_decompress_job.BytesPerRow = m_channels * width + // * sizeof(uint16_t); + + // m_async_decompress_job.ImageProcessing = NULL; + // m_async_decompress_job.HdrProcessing = NULL; + // m_async_decompress_job.Callback = CPU_callback; + } else +#endif // GPU + { + // letting the decoder know how big the buffer is + m_job.OutputBufferSize = memNeeded; + + m_job.Mode = mode; + + // store the image here + m_job.OutputBuffer = m_image_buffer; + + // Interleaved RGB decoding in 16-bits per pixel + m_job.PixelType = R3DSDK::PixelType_16Bit_RGB_Interleaved; + m_job.BytesPerRow = m_channels * width * sizeof(uint16_t); + + m_job.ImageProcessing = NULL; + m_job.HdrProcessing = NULL; + } m_spec = ImageSpec(width, height, m_channels, TypeDesc::UINT16); - int frame_rate_numerator = m_clip->MetadataItemAsInt(R3DSDK::RMD_FRAMERATE_NUMERATOR); - int frame_rate_denominator = m_clip->MetadataItemAsInt(R3DSDK::RMD_FRAMERATE_DENOMINATOR); + int frame_rate_numerator = m_clip->MetadataItemAsInt( + R3DSDK::RMD_FRAMERATE_NUMERATOR); + int frame_rate_denominator = m_clip->MetadataItemAsInt( + R3DSDK::RMD_FRAMERATE_DENOMINATOR); int frame_rate[2] = { frame_rate_numerator, frame_rate_denominator }; - bool record_frame_rate_exists = m_clip->MetadataExists(R3DSDK::RMD_RECORD_FRAMERATE_NUMERATOR); + bool record_frame_rate_exists = m_clip->MetadataExists( + R3DSDK::RMD_RECORD_FRAMERATE_NUMERATOR); if (record_frame_rate_exists) { - int record_frame_rate_numerator = m_clip->MetadataItemAsInt(R3DSDK::RMD_RECORD_FRAMERATE_NUMERATOR); - int record_frame_rate_denominator = m_clip->MetadataItemAsInt(R3DSDK::RMD_RECORD_FRAMERATE_DENOMINATOR); - int record_frame_rate[2] = { record_frame_rate_numerator, record_frame_rate_denominator }; + int record_frame_rate_numerator = m_clip->MetadataItemAsInt( + R3DSDK::RMD_RECORD_FRAMERATE_NUMERATOR); + int record_frame_rate_denominator = m_clip->MetadataItemAsInt( + R3DSDK::RMD_RECORD_FRAMERATE_DENOMINATOR); + int record_frame_rate[2] = { record_frame_rate_numerator, + record_frame_rate_denominator }; m_spec.attribute("FramesPerSecond", TypeRational, &record_frame_rate); } else { m_spec.attribute("FramesPerSecond", TypeRational, &frame_rate); @@ -337,6 +888,9 @@ R3dInput::open(const std::string& name, ImageSpec& newspec, m_spec.attribute("oiio:Movie", true); m_spec.attribute("oiio:subimages", int(m_frames)); m_spec.attribute("oiio:BitsPerSample", 16); +#ifdef GPU + m_spec.attribute("oiio:GPU", m_gpu); +#endif // GPU newspec = m_spec; m_next_scanline = 0; @@ -354,9 +908,125 @@ R3dInput::read_frame(int pos) seek(pos); } - R3DSDK::DecodeStatus decode_status = m_clip->DecodeVideoFrame(pos, m_job); - if (decode_status != R3DSDK::DSDecodeOK) { - DBG("Failed to decode frame {}\n", pos); +#ifdef GPU + if (m_gpu && (m_supported == R3DSDK::DSDecodeOK)) { + m_async_decompress_job.VideoFrameNo = pos; + m_async_decompress_job.VideoTrackNo = 0; + m_async_decompress_job.Callback = CPU_callback; + + decodeDone = false; + + int device = CUDA_DEVICE_ID; + cudaStream_t stream; + cudaError_t err; + + err = cudaStreamCreate(&stream); + if (err != cudaSuccess) { + DBG("Failed to create stream {}\n", static_cast(err)); + return; + } + + R3DSDK::DecodeStatus decode_status = GPU_DECODER->DecodeForGpuSdk( + m_async_decompress_job); + if (decode_status != R3DSDK::DSDecodeOK) { + DBG("Failed to decode frame {} with status {}\n", pos, + static_cast(decode_status)); + cudaStreamDestroy(stream); + return; + } + + while (!decodeDone) { + usleep(1000); + } + + R3DSDK::ImageProcessingSettings* ips + = new R3DSDK::ImageProcessingSettings(); + m_async_decompress_job.Clip->GetDefaultImageProcessingSettings(*ips); + + const R3DSDK::VideoPixelType pixelType + = R3DSDK::PixelType_16Bit_RGB_Interleaved; + + R3DSDK::DebayerCudaJob* debayer_cuda_job + = DebayerAllocate(&m_async_decompress_job, ips, pixelType); + if (debayer_cuda_job == nullptr) { + delete ips; + cudaStreamDestroy(stream); + return; + } + + m_async_decompress_job.PrivateData = debayer_cuda_job; + + DBG("debayer_cuda_job = {}\n", static_cast(debayer_cuda_job)); + DBG(" raw_host_mem = {}\n", + static_cast(debayer_cuda_job->raw_host_mem)); + DBG(" raw_device_mem = {}\n", + static_cast(debayer_cuda_job->raw_device_mem)); + DBG(" output_device_mem_size = {}\n", + debayer_cuda_job->output_device_mem_size); + DBG(" output_device_mem = {}\n", debayer_cuda_job->output_device_mem); + DBG(" mode = {}\n", static_cast(debayer_cuda_job->mode)); + DBG(" pixelType = {}\n", + static_cast(debayer_cuda_job->pixelType)); + + R3DSDK::REDCuda::Status status + = RED_CUDA->processAsync(device, stream, debayer_cuda_job, err); + + if (status != R3DSDK::REDCuda::Status_Ok) { + DBG("Failed to process frame, error {}\n", + static_cast(status)); + delete debayer_cuda_job->imageProcessingSettings; + debayer_cuda_job->imageProcessingSettings = NULL; + DebayerFree(debayer_cuda_job); + + cudaStreamDestroy(stream); + return; + } + + debayer_cuda_job->completeAsync(); + + size_t result_buffer_size = R3DSDK::DebayerCudaJob::ResultFrameSize( + debayer_cuda_job); + + DBG("result_buffer_size = {}\n", result_buffer_size); + + //allocate the result buffer in host memory. + if (result_buffer_size != debayer_cuda_job->output_device_mem_size) { + DBG("Result buffer size does not match expected size: Expected: {} Actual: {}\n", + result_buffer_size, debayer_cuda_job->output_device_mem_size); + } + + if (m_image_buffer != nullptr) { + //read the GPU buffer back to the host memory result buffer. - Note this is not always the optimal way to read back. (Use pinned memory in a real app) + cudaError_t err = cudaMemcpy(m_image_buffer, + debayer_cuda_job->output_device_mem, + result_buffer_size, + cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + DBG("Failed to read result frame from card {}\n", + static_cast(err)); + } else { + //ensure the read is complete. + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + DBG("Failed to finish after reading result frame from card {}\n", + static_cast(err)); + } + } + } + + delete debayer_cuda_job->imageProcessingSettings; + debayer_cuda_job->imageProcessingSettings = NULL; + DebayerFree(debayer_cuda_job); + + cudaStreamDestroy(stream); + } else +#endif // GPU + { + R3DSDK::DecodeStatus decode_status = m_clip->DecodeVideoFrame(pos, + m_job); + if (decode_status != R3DSDK::DSDecodeOK) { + DBG("Failed to decode frame {}\n", pos); + } } m_last_search_pos = pos; @@ -461,10 +1131,20 @@ R3dInput::close() // delete m_clip; m_clip = nullptr; } + if (m_image_buffer) { aligned_free(m_image_buffer); m_image_buffer = nullptr; } + + if (m_gpu) { + if (GPU_DECODER) { + GPU_DECODER->Close(); + delete GPU_DECODER; + GPU_DECODER = nullptr; + } + } + reset(); // Reset to initial state return true; }