From c7f1c2e2e6afe9ba01f1a57080a7a86720bf16c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20Kov=C3=A1=C5=99?= <peter.kovar@reflexion.tv>
Date: Thu, 9 May 2024 21:43:55 +0200
Subject: [PATCH] nVIDIA CUDA R3D
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Peter Kovář <peter.kovar@reflexion.tv>
---
 src/r3d.imageio/r3dinput.cpp | 776 ++++++++++++++++++++++++++++++++---
 1 file changed, 728 insertions(+), 48 deletions(-)
diff --git a/src/r3d.imageio/r3dinput.cpp b/src/r3d.imageio/r3dinput.cpp
index bbdbc7c887..3f2d7c4b8a 100644
--- a/src/r3d.imageio/r3dinput.cpp
+++ b/src/r3d.imageio/r3dinput.cpp
@@ -9,8 +9,13 @@
 // /opt/R3DSDKv8_5_1 directory and setting up the variable
 // export R3DSDK_ROOT="/opt/R3DSDKv8_5_1"
 
+#define GPU
+#define CUDA
+
+#include "OpenImageIO/platform.h"
 #include <algorithm>
 #include <cassert>
+#include <cstdint>
 #include <cstdio>
 
 #include <OpenImageIO/filesystem.h>
@@ -58,7 +63,7 @@
 
 OIIO_PLUGIN_NAMESPACE_BEGIN
 
-#if 0 || !defined(NDEBUG) // allow R3D configuration debugging
+#if 1 || !defined(NDEBUG)  // allow R3D configuration debugging
 static bool r3d_debug = Strutil::stoi(Sysutil::getenv("OIIO_R3D_DEBUG"));
 #    define DBG(...)   \
         if (r3d_debug) \
@@ -115,6 +120,11 @@ class R3dInput final : public ImageInput {
     std::unique_ptr<ImageSpec> m_config;  // Saved copy of configuration spec
     R3DSDK::Clip* m_clip;
     R3DSDK::VideoDecodeJob m_job;
+#ifdef GPU
+    bool m_gpu;
+    R3DSDK::DecodeStatus m_supported;
+    R3DSDK::AsyncDecompressJob m_async_decompress_job;
+#endif  // GPU
     unsigned char* m_image_buffer;
     int m_frames;
     int m_channels;
@@ -170,6 +180,493 @@ OIIO_PLUGIN_EXPORTS_END
 
 
 
+#ifdef CUDA
+
+namespace {
+R3DSDK::GpuDecoder* GPU_DECODER;
+R3DSDK::REDCuda* RED_CUDA;
+
+int CUDA_DEVICE_ID       = 0;
+volatile bool decodeDone = false;
+}  // namespace
+
+class SimpleMemoryPool {
+public:
+    static SimpleMemoryPool* getInstance()
+    {
+        static SimpleMemoryPool* instance = NULL;
+
+        if (instance == NULL) {
+            std::unique_lock<std::mutex> lock(guard);
+            if (instance == NULL) {
+                instance = new SimpleMemoryPool();
+            }
+        }
+        return instance;
+    }
+
+    static cudaError_t cudaMalloc(void** p, size_t size)
+    {
+        DBG("cudaMalloc {}\n", size);
+
+        return getInstance()->malloc_d(p, size);
+    }
+
+    static cudaError_t cudaFree(void* p) { return getInstance()->free_d(p); }
+
+    static cudaError_t cudaMallocArray(struct cudaArray** array,
+                                       const struct cudaChannelFormatDesc* desc,
+                                       size_t width, size_t height = 0,
+                                       unsigned int flags = 0)
+    {
+        DBG("cudaMallocArray {} {} {}\n", width, height, flags);
+
+        return getInstance()->malloc_array(array, desc, width, height, flags);
+    }
+
+    static cudaError_t
+    cudaMalloc3DArray(struct cudaArray** array,
+                      const struct cudaChannelFormatDesc* desc,
+                      struct cudaExtent ext, unsigned int flags = 0)
+    {
+        return getInstance()->malloc_array_3d(array, desc, ext, flags);
+    }
+
+    static cudaError_t cudaFreeArray(cudaArray* p)
+    {
+        getInstance()->free_array(p);
+
+        return cudaSuccess;
+    }
+
+    static cudaError_t cudaMallocHost(void** p, size_t size)
+    {
+        return getInstance()->malloc_h(p, size);
+    }
+
+    static cudaError_t cudaHostAlloc(void** p, size_t size, unsigned int flags)
+    {
+        return getInstance()->hostAlloc_h(p, size, flags);
+    }
+
+    static cudaError_t cudaFreeHost(void* p)
+    {
+        getInstance()->free_h(p);
+
+        return cudaSuccess;
+    }
+
+private:
+    static std::mutex guard;
+
+    cudaError_t malloc_d(void** p, size_t size)
+    {
+        int device = 0;
+        cudaGetDevice(&device);
+        cudaError_t result = cudaSuccess;
+        *p                 = _device.findBlock(size, device);
+
+        if (*p == NULL) {
+            result = ::cudaMalloc(p, size);
+            if (result != cudaSuccess) {
+                std::cerr << "Memory allocation of " << size
+                          << " bytes failed: " << result << "\n";
+                _device.sweep();
+                _array.sweep();
+                result = ::cudaMalloc(p, size);
+            }
+            if (result == cudaSuccess)
+                _device.addBlock(*p, size, device);
+        }
+        return result;
+    }
+
+    cudaError_t free_d(void* p)
+    {
+        _device.releaseBlock(p);
+        return cudaSuccess;
+    }
+
+    cudaError_t malloc_array(struct cudaArray** array,
+                             const struct cudaChannelFormatDesc* desc,
+                             size_t width, size_t height = 0,
+                             unsigned int flags = 0)
+    {
+        int device = 0;
+        cudaGetDevice(&device);
+        cudaError_t result = cudaSuccess;
+        *array = (cudaArray*)_array.findBlock(width, height, 0, *desc, device);
+
+        if (*array == NULL) {
+            result = ::cudaMallocArray(array, desc, width, height, flags);
+            if (result != cudaSuccess) {
+                DBG("Memory allocation failed: {}\n", static_cast<int>(result));
+                _device.sweep();
+                _array.sweep();
+                result = ::cudaMallocArray(array, desc, width, height, flags);
+            }
+            if (result == cudaSuccess)
+                _array.addBlock(*array, width, height, 0, *desc, device);
+        }
+        return result;
+    }
+
+    cudaError_t malloc_array_3d(struct cudaArray** array,
+                                const struct cudaChannelFormatDesc* desc,
+                                const struct cudaExtent& ext,
+                                unsigned int flags = 0)
+    {
+        int device = 0;
+        cudaGetDevice(&device);
+        cudaError_t result = cudaSuccess;
+        *array = (cudaArray*)_array.findBlock(ext.width, ext.height, ext.depth,
+                                              *desc, device);
+
+        if (*array == NULL) {
+            result = ::cudaMalloc3DArray(array, desc, ext, flags);
+            if (result != cudaSuccess) {
+                DBG("Memory allocation failed: {}\n", static_cast<int>(result));
+                _device.sweep();
+                _array.sweep();
+                result = ::cudaMalloc3DArray(array, desc, ext, flags);
+            }
+            if (result == cudaSuccess)
+                _array.addBlock(*array, ext.width, ext.height, ext.depth, *desc,
+                                device);
+        }
+        return result;
+    }
+
+    void free_array(void* p) { _array.releaseBlock(p); }
+
+    cudaError_t malloc_h(void** p, size_t size)
+    {
+        int device = 0;
+        cudaGetDevice(&device);
+        cudaError_t result = cudaSuccess;
+        *p                 = _host.findBlock(size, device);
+
+        if (*p == NULL) {
+            result = ::cudaMallocHost(p, size);
+            if (result != cudaSuccess) {
+                DBG("Memory allocation failed: {}\n", static_cast<int>(result));
+                _host.sweep();
+                result = ::cudaMallocHost(p, size);
+            }
+            if (result == cudaSuccess)
+                _host.addBlock(*p, size, device);
+        }
+        return result;
+    }
+
+    void free_h(void* p)
+    {
+        if (!_host.releaseBlock(p)) {
+            _hostAlloc.releaseBlock(p);
+        }
+    }
+
+    cudaError_t hostAlloc_h(void** p, size_t size, unsigned int flags)
+    {
+        int device = 0;
+        cudaGetDevice(&device);
+        cudaError_t result = cudaSuccess;
+        *p                 = _hostAlloc.findBlock(size, device);
+
+        if (*p == NULL) {
+            result = ::cudaHostAlloc(p, size, flags);
+            if (result != cudaSuccess) {
+                DBG("Memory allocation failed: {}\n", static_cast<int>(result));
+                _hostAlloc.sweep();
+                result = ::cudaHostAlloc(p, size, flags);
+            }
+            if (result == cudaSuccess)
+                _hostAlloc.addBlock(*p, size, device);
+        }
+        return result;
+    }
+
+    struct BLOCK {
+        void* ptr;
+        size_t size;
+        int device;
+    };
+
+    struct ARRAY {
+        void* ptr;
+        size_t width;
+        size_t height;
+        size_t depth;
+        cudaChannelFormatDesc desc;
+        int device;
+    };
+
+    class Pool {
+    public:
+        void addBlock(void* ptr, size_t size, int device)
+        {
+            std::unique_lock<std::mutex> lock(_guard);
+
+            _inUse[ptr] = { ptr, size, device };
+        }
+
+        void* findBlock(size_t size, int device)
+        {
+            std::unique_lock<std::mutex> lock(_guard);
+
+            for (auto i = _free.begin(); i < _free.end(); ++i) {
+                if (i->size == size && i->device == device) {
+                    void* p   = i->ptr;
+                    _inUse[p] = *i;
+                    _free.erase(i);
+                    return p;
+                }
+            }
+            return NULL;
+        }
+
+        bool releaseBlock(void* ptr)
+        {
+            std::unique_lock<std::mutex> lock(_guard);
+
+            auto i = _inUse.find(ptr);
+
+            if (i != _inUse.end()) {
+                _free.push_back(i->second);
+                _inUse.erase(i);
+                return true;
+            }
+            return false;
+        }
+
+        void sweep()
+        {
+            std::unique_lock<std::mutex> lock(_guard);
+
+            for (auto i = _free.begin(); i < _free.end(); ++i) {
+                ::cudaFree(i->ptr);
+            }
+            _free.clear();
+        }
+
+    private:
+        std::map<void*, BLOCK> _inUse;
+        std::vector<BLOCK> _free;
+        std::mutex _guard;
+    };
+
+    class ArrayPool {
+    public:
+        void addBlock(void* ptr, size_t width, size_t height, size_t depth,
+                      const cudaChannelFormatDesc& desc, int device)
+        {
+            std::unique_lock<std::mutex> lock(_guard);
+
+            _inUse[ptr] = { ptr, width, height, depth, desc, device };
+        }
+
+        void* findBlock(size_t width, size_t height, size_t depth,
+                        const cudaChannelFormatDesc& desc, int device)
+        {
+            std::unique_lock<std::mutex> lock(_guard);
+
+            for (auto i = _free.begin(); i < _free.end(); ++i) {
+                if (i->width == width && i->height == height
+                    && i->depth == depth && i->desc.x == desc.x
+                    && i->desc.y == desc.y && i->desc.z == desc.z
+                    && i->desc.w == desc.w && i->desc.f == desc.f
+                    && i->device == device) {
+                    void* p   = i->ptr;
+                    _inUse[p] = *i;
+                    _free.erase(i);
+                    return p;
+                }
+            }
+            return NULL;
+        }
+
+        bool releaseBlock(void* ptr)
+        {
+            std::unique_lock<std::mutex> lock(_guard);
+
+            auto i = _inUse.find(ptr);
+
+            if (i != _inUse.end()) {
+                _free.push_back(i->second);
+
+                _inUse.erase(i);
+
+                return true;
+            }
+            return false;
+        }
+
+        void sweep()
+        {
+            std::unique_lock<std::mutex> lock(_guard);
+
+            for (auto i = _free.begin(); i < _free.end(); ++i) {
+                ::cudaFree(i->ptr);
+            }
+            _free.clear();
+        }
+
+    private:
+        std::map<void*, ARRAY> _inUse;
+        std::vector<ARRAY> _free;
+        std::mutex _guard;
+    };
+
+    Pool _device;
+    Pool _host;
+    Pool _hostAlloc;
+    ArrayPool _array;
+};
+
+std::mutex SimpleMemoryPool::guard;
+
+
+
+namespace {
+R3DSDK::DebayerCudaJob*
+DebayerAllocate(const R3DSDK::AsyncDecompressJob* job,
+                R3DSDK::ImageProcessingSettings* imageProcessingSettings,
+                R3DSDK::VideoPixelType pixelType)
+{
+    //allocate the debayer job
+    R3DSDK::DebayerCudaJob* data = RED_CUDA->createDebayerJob();
+
+    data->raw_host_mem            = job->OutputBuffer;
+    data->mode                    = job->Mode;
+    data->imageProcessingSettings = imageProcessingSettings;
+    data->pixelType               = pixelType;
+
+    //create raw buffer on the CUDA device
+    cudaError_t err = SimpleMemoryPool::cudaMalloc(&(data->raw_device_mem),
+                                                   job->OutputBufferSize);
+
+    if (err != cudaSuccess) {
+        DBG("Failed to allocate raw frame on GPU: {}\n", static_cast<int>(err));
+        RED_CUDA->releaseDebayerJob(data);
+        return NULL;
+    }
+
+    data->output_device_mem_size = R3DSDK::DebayerCudaJob::ResultFrameSize(
+        data);
+    DBG("data->output_device_mem_size = {}\n", data->output_device_mem_size);
+
+    //YOU MUST specify an existing buffer for the result image
+    //Set DebayerCudaJob::output_device_mem_size >= result_buffer_size
+    //and a pointer to the device buffer in DebayerCudaJob::output_device_mem
+    err = SimpleMemoryPool::cudaMalloc(&(data->output_device_mem),
+                                       data->output_device_mem_size);
+
+    if (err != cudaSuccess) {
+        DBG("Failed to allocate result frame on card {}\n",
+            static_cast<int>(err));
+        SimpleMemoryPool::cudaFree(data->raw_device_mem);
+        RED_CUDA->releaseDebayerJob(data);
+        return NULL;
+    }
+
+    return data;
+}
+
+
+
+void
+DebayerFree(R3DSDK::DebayerCudaJob* job)
+{
+    SimpleMemoryPool::cudaFree(job->raw_device_mem);
+    SimpleMemoryPool::cudaFree(job->output_device_mem);
+    RED_CUDA->releaseDebayerJob(job);
+}
+
+
+
+template<typename T> class ConcurrentQueue {
+private:
+    std::mutex QUEUE_MUTEX;
+    std::condition_variable QUEUE_CV;
+    std::list<T*> QUEUE;
+
+public:
+    void push(T* job)
+    {
+        std::unique_lock<std::mutex> lck(QUEUE_MUTEX);
+        QUEUE.push_back(job);
+        QUEUE_CV.notify_all();
+    }
+
+    void pop(T*& job)
+    {
+        std::unique_lock<std::mutex> lck(QUEUE_MUTEX);
+
+        while (QUEUE.size() == 0)
+            QUEUE_CV.wait(lck);
+
+        job = QUEUE.front();
+        QUEUE.pop_front();
+    }
+
+    size_t size() const { return QUEUE.size(); }
+};
+
+
+
+void
+CPU_callback(R3DSDK::AsyncDecompressJob* item,
+             R3DSDK::DecodeStatus decodeStatus)
+{
+    // DBG("CPU_callback()\n");
+    Strutil::print("CPU_callback()\n");
+    decodeDone = true;
+}
+
+
+
+R3DSDK::REDCuda*
+OpenCUDA(int& deviceId)
+{
+    //setup Cuda for the current thread
+    cudaDeviceProp deviceProp;
+    cudaError_t err = cudaChooseDevice(&deviceId, &deviceProp);
+    if (err != cudaSuccess) {
+        DBG("Failed to move raw frame to card {}\n", static_cast<int>(err));
+        return NULL;
+    }
+
+    err = cudaSetDevice(deviceId);
+    if (err != cudaSuccess) {
+        DBG("Failed to move raw frame to card {}\n", static_cast<int>(err));
+        return NULL;
+    }
+
+    //SETUP YOUR CUDA API FUNCTION POINTERS
+    R3DSDK::EXT_CUDA_API api;
+    api.cudaFree                 = SimpleMemoryPool::cudaFree;
+    api.cudaFreeArray            = SimpleMemoryPool::cudaFreeArray;
+    api.cudaFreeHost             = SimpleMemoryPool::cudaFreeHost;
+    api.cudaFreeMipmappedArray   = ::cudaFreeMipmappedArray;
+    api.cudaHostAlloc            = SimpleMemoryPool::cudaHostAlloc;
+    api.cudaMalloc               = SimpleMemoryPool::cudaMalloc;
+    api.cudaMalloc3D             = ::cudaMalloc3D;
+    api.cudaMalloc3DArray        = SimpleMemoryPool::cudaMalloc3DArray;
+    api.cudaMallocArray          = SimpleMemoryPool::cudaMallocArray;
+    api.cudaMallocHost           = SimpleMemoryPool::cudaMallocHost;
+    api.cudaMallocMipmappedArray = ::cudaMallocMipmappedArray;
+    api.cudaMallocPitch          = ::cudaMallocPitch;
+
+
+    //CREATE THE REDCuda CLASS
+    return new R3DSDK::REDCuda(api);
+}
+}  //end anonymous namespace
+
+#endif  // CUDA
+
+
+
 void
 R3dInput::initialize()
 {
@@ -188,9 +685,20 @@ R3dInput::initialize()
 #endif
         );
     // initialize SDK
-    // R3DSDK::InitializeStatus init_status = R3DSDK::InitializeSdk(".", OPTION_RED_CUDA);
+
+    unsigned int optional_components =
+#ifdef CUDA
+        OPTION_RED_CUDA;
+#elif defined(OpenCL)
+        OPTION_RED_OPENCL;
+#elif defined(Metal)
+        OPTION_RED_METAL;
+#else
+        OPTION_RED_NONE;
+#endif
+
     R3DSDK::InitializeStatus init_status
-        = R3DSDK::InitializeSdk(library_path.c_str(), OPTION_RED_NONE);
+        = R3DSDK::InitializeSdk(library_path.c_str(), optional_components);
     if (init_status != R3DSDK::ISInitializeOK) {
         R3DSDK::FinalizeSdk();
         DBG("Failed to load R3DSDK Library\n");
@@ -198,7 +706,7 @@ R3dInput::initialize()
     }
 
     DBG("SDK VERSION: {}\n", R3DSDK::GetSdkVersion());
-#ifdef GPU
+#ifdef CUDA
     // open CUDA device
     RED_CUDA = OpenCUDA(CUDA_DEVICE_ID);
 
@@ -206,7 +714,9 @@ R3dInput::initialize()
         R3DSDK::FinalizeSdk();
         DBG("Failed to initialize CUDA\n");
     }
-#endif  // GPU
+
+    m_gpu = true;
+#endif  // CUDA
 }
 
 
@@ -245,30 +755,29 @@ R3dInput::open(const std::string& name, ImageSpec& newspec,
     }
 
     R3DSDK::VideoDecodeMode mode = R3DSDK::DECODE_FULL_RES_PREMIUM;
-    int scale = 1;
-
-    switch (hint)
-    {
-        case 0:
-            mode = R3DSDK::DECODE_FULL_RES_PREMIUM;
-            scale = 1;
-            break;
-        case 1:
-            mode = R3DSDK::DECODE_HALF_RES_GOOD;
-            scale = 2;
-            break;
-        case 2:
-            mode = R3DSDK::DECODE_QUARTER_RES_GOOD;
-            scale = 4;
-            break;
-        case 3:
-            mode = R3DSDK::DECODE_EIGHT_RES_GOOD;
-            scale = 8;
-            break;
-        case 4:
-            mode = R3DSDK::DECODE_SIXTEENTH_RES_GOOD;
-            scale = 16;
-            break;
+    int scale                    = 1;
+
+    switch (hint) {
+    case 0:
+        mode  = R3DSDK::DECODE_FULL_RES_PREMIUM;
+        scale = 1;
+        break;
+    case 1:
+        mode  = R3DSDK::DECODE_HALF_RES_GOOD;
+        scale = 2;
+        break;
+    case 2:
+        mode  = R3DSDK::DECODE_QUARTER_RES_GOOD;
+        scale = 4;
+        break;
+    case 3:
+        mode  = R3DSDK::DECODE_EIGHT_RES_GOOD;
+        scale = 8;
+        break;
+    case 4:
+        mode  = R3DSDK::DECODE_SIXTEENTH_RES_GOOD;
+        scale = 16;
+        break;
     }
 
     // calculate how much ouput memory we're going to need
@@ -303,32 +812,74 @@ R3dInput::open(const std::string& name, ImageSpec& newspec,
         return false;
     }
 
-    // letting the decoder know how big the buffer is
-    m_job.OutputBufferSize = memNeeded;
+#ifdef GPU
+    if (m_gpu) {
+        // open GPU decoder
+        GPU_DECODER = new R3DSDK::GpuDecoder();
+        GPU_DECODER->Open();
+
+        m_supported = GPU_DECODER->DecodeSupportedForClip(*m_clip);
+    }
+
+    if (m_supported == R3DSDK::DSDecodeOK) {
+        m_async_decompress_job.Clip = m_clip;
+
+        m_async_decompress_job.Mode = mode;
 
-    m_job.Mode = mode;
+        // letting the decoder know how big the buffer is
+        m_async_decompress_job.OutputBufferSize
+            = R3DSDK::AsyncDecoder::GetSizeBufferNeeded(m_async_decompress_job);
 
-    // store the image here
-    m_job.OutputBuffer = m_image_buffer;
+        DBG("OutputBufferSize = {}\n", m_async_decompress_job.OutputBufferSize);
 
-    // Interleaved RGB decoding in 16-bits per pixel
-    m_job.PixelType   = R3DSDK::PixelType_16Bit_RGB_Interleaved;
-    m_job.BytesPerRow = m_channels * width * sizeof(uint16_t);
+        m_async_decompress_job.OutputBuffer = static_cast<unsigned char*>(
+            aligned_malloc(m_async_decompress_job.OutputBufferSize, 16));
 
-    m_job.ImageProcessing = NULL;
-    m_job.HdrProcessing = NULL;
+        // Interleaved RGB decoding in 16-bits per pixel
+        // m_async_decompress_job.PixelType
+        //     = R3DSDK::PixelType_16Bit_RGB_Interleaved;
+        // m_async_decompress_job.BytesPerRow = m_channels * width
+        //                                      * sizeof(uint16_t);
+
+        // m_async_decompress_job.ImageProcessing = NULL;
+        // m_async_decompress_job.HdrProcessing   = NULL;
+        // m_async_decompress_job.Callback = CPU_callback;
+    } else
+#endif  // GPU
+    {
+        // letting the decoder know how big the buffer is
+        m_job.OutputBufferSize = memNeeded;
+
+        m_job.Mode = mode;
+
+        // store the image here
+        m_job.OutputBuffer = m_image_buffer;
+
+        // Interleaved RGB decoding in 16-bits per pixel
+        m_job.PixelType   = R3DSDK::PixelType_16Bit_RGB_Interleaved;
+        m_job.BytesPerRow = m_channels * width * sizeof(uint16_t);
+
+        m_job.ImageProcessing = NULL;
+        m_job.HdrProcessing   = NULL;
+    }
 
     m_spec = ImageSpec(width, height, m_channels, TypeDesc::UINT16);
 
-    int frame_rate_numerator = m_clip->MetadataItemAsInt(R3DSDK::RMD_FRAMERATE_NUMERATOR);
-    int frame_rate_denominator = m_clip->MetadataItemAsInt(R3DSDK::RMD_FRAMERATE_DENOMINATOR);
+    int frame_rate_numerator = m_clip->MetadataItemAsInt(
+        R3DSDK::RMD_FRAMERATE_NUMERATOR);
+    int frame_rate_denominator = m_clip->MetadataItemAsInt(
+        R3DSDK::RMD_FRAMERATE_DENOMINATOR);
     int frame_rate[2] = { frame_rate_numerator, frame_rate_denominator };
 
-    bool record_frame_rate_exists = m_clip->MetadataExists(R3DSDK::RMD_RECORD_FRAMERATE_NUMERATOR);
+    bool record_frame_rate_exists = m_clip->MetadataExists(
+        R3DSDK::RMD_RECORD_FRAMERATE_NUMERATOR);
     if (record_frame_rate_exists) {
-        int record_frame_rate_numerator = m_clip->MetadataItemAsInt(R3DSDK::RMD_RECORD_FRAMERATE_NUMERATOR);
-        int record_frame_rate_denominator = m_clip->MetadataItemAsInt(R3DSDK::RMD_RECORD_FRAMERATE_DENOMINATOR);
-        int record_frame_rate[2] = { record_frame_rate_numerator, record_frame_rate_denominator };
+        int record_frame_rate_numerator = m_clip->MetadataItemAsInt(
+            R3DSDK::RMD_RECORD_FRAMERATE_NUMERATOR);
+        int record_frame_rate_denominator = m_clip->MetadataItemAsInt(
+            R3DSDK::RMD_RECORD_FRAMERATE_DENOMINATOR);
+        int record_frame_rate[2] = { record_frame_rate_numerator,
+                                     record_frame_rate_denominator };
         m_spec.attribute("FramesPerSecond", TypeRational, &record_frame_rate);
     } else {
         m_spec.attribute("FramesPerSecond", TypeRational, &frame_rate);
@@ -337,6 +888,9 @@ R3dInput::open(const std::string& name, ImageSpec& newspec,
     m_spec.attribute("oiio:Movie", true);
     m_spec.attribute("oiio:subimages", int(m_frames));
     m_spec.attribute("oiio:BitsPerSample", 16);
+#ifdef GPU
+    m_spec.attribute("oiio:GPU", m_gpu);
+#endif  // GPU
 
     newspec         = m_spec;
     m_next_scanline = 0;
@@ -354,9 +908,125 @@ R3dInput::read_frame(int pos)
         seek(pos);
     }
 
-    R3DSDK::DecodeStatus decode_status = m_clip->DecodeVideoFrame(pos, m_job);
-    if (decode_status != R3DSDK::DSDecodeOK) {
-        DBG("Failed to decode frame {}\n", pos);
+#ifdef GPU
+    if (m_gpu && (m_supported == R3DSDK::DSDecodeOK)) {
+        m_async_decompress_job.VideoFrameNo = pos;
+        m_async_decompress_job.VideoTrackNo = 0;
+        m_async_decompress_job.Callback     = CPU_callback;
+
+        decodeDone = false;
+
+        int device = CUDA_DEVICE_ID;
+        cudaStream_t stream;
+        cudaError_t err;
+
+        err = cudaStreamCreate(&stream);
+        if (err != cudaSuccess) {
+            DBG("Failed to create stream {}\n", static_cast<int>(err));
+            return;
+        }
+
+        R3DSDK::DecodeStatus decode_status = GPU_DECODER->DecodeForGpuSdk(
+            m_async_decompress_job);
+        if (decode_status != R3DSDK::DSDecodeOK) {
+            DBG("Failed to decode frame {} with status {}\n", pos,
+                static_cast<int>(decode_status));
+            cudaStreamDestroy(stream);
+            return;
+        }
+
+        while (!decodeDone) {
+            usleep(1000);
+        }
+
+        R3DSDK::ImageProcessingSettings* ips
+            = new R3DSDK::ImageProcessingSettings();
+        m_async_decompress_job.Clip->GetDefaultImageProcessingSettings(*ips);
+
+        const R3DSDK::VideoPixelType pixelType
+            = R3DSDK::PixelType_16Bit_RGB_Interleaved;
+
+        R3DSDK::DebayerCudaJob* debayer_cuda_job
+            = DebayerAllocate(&m_async_decompress_job, ips, pixelType);
+        if (debayer_cuda_job == nullptr) {
+            delete ips;
+            cudaStreamDestroy(stream);
+            return;
+        }
+
+        m_async_decompress_job.PrivateData = debayer_cuda_job;
+
+        DBG("debayer_cuda_job = {}\n", static_cast<void*>(debayer_cuda_job));
+        DBG("  raw_host_mem = {}\n",
+            static_cast<void*>(debayer_cuda_job->raw_host_mem));
+        DBG("  raw_device_mem = {}\n",
+            static_cast<void*>(debayer_cuda_job->raw_device_mem));
+        DBG("  output_device_mem_size = {}\n",
+            debayer_cuda_job->output_device_mem_size);
+        DBG("  output_device_mem = {}\n", debayer_cuda_job->output_device_mem);
+        DBG("  mode = {}\n", static_cast<uint32_t>(debayer_cuda_job->mode));
+        DBG("  pixelType = {}\n",
+            static_cast<uint32_t>(debayer_cuda_job->pixelType));
+
+        R3DSDK::REDCuda::Status status
+            = RED_CUDA->processAsync(device, stream, debayer_cuda_job, err);
+
+        if (status != R3DSDK::REDCuda::Status_Ok) {
+            DBG("Failed to process frame, error {}\n",
+                static_cast<int>(status));
+            delete debayer_cuda_job->imageProcessingSettings;
+            debayer_cuda_job->imageProcessingSettings = NULL;
+            DebayerFree(debayer_cuda_job);
+
+            cudaStreamDestroy(stream);
+            return;
+        }
+
+        debayer_cuda_job->completeAsync();
+
+        size_t result_buffer_size = R3DSDK::DebayerCudaJob::ResultFrameSize(
+            debayer_cuda_job);
+
+        DBG("result_buffer_size = {}\n", result_buffer_size);
+
+        //allocate the result buffer in host memory.
+        if (result_buffer_size != debayer_cuda_job->output_device_mem_size) {
+            DBG("Result buffer size does not match expected size: Expected: {} Actual: {}\n",
+                result_buffer_size, debayer_cuda_job->output_device_mem_size);
+        }
+
+        if (m_image_buffer != nullptr) {
+            //read the GPU buffer back to the host memory result buffer. - Note this is not always the optimal way to read back. (Use pinned memory in a real app)
+            cudaError_t err = cudaMemcpy(m_image_buffer,
+                                         debayer_cuda_job->output_device_mem,
+                                         result_buffer_size,
+                                         cudaMemcpyDeviceToHost);
+            if (err != cudaSuccess) {
+                DBG("Failed to read result frame from card {}\n",
+                    static_cast<int>(err));
+            } else {
+                //ensure the read is complete.
+                err = cudaDeviceSynchronize();
+                if (err != cudaSuccess) {
+                    DBG("Failed to finish after reading result frame from card {}\n",
+                        static_cast<int>(err));
+                }
+            }
+        }
+
+        delete debayer_cuda_job->imageProcessingSettings;
+        debayer_cuda_job->imageProcessingSettings = NULL;
+        DebayerFree(debayer_cuda_job);
+
+        cudaStreamDestroy(stream);
+    } else
+#endif  // GPU
+    {
+        R3DSDK::DecodeStatus decode_status = m_clip->DecodeVideoFrame(pos,
+                                                                      m_job);
+        if (decode_status != R3DSDK::DSDecodeOK) {
+            DBG("Failed to decode frame {}\n", pos);
+        }
     }
 
     m_last_search_pos  = pos;
@@ -461,10 +1131,20 @@ R3dInput::close()
         // delete m_clip;
         m_clip = nullptr;
     }
+
     if (m_image_buffer) {
         aligned_free(m_image_buffer);
         m_image_buffer = nullptr;
     }
+
+    if (m_gpu) {
+        if (GPU_DECODER) {
+            GPU_DECODER->Close();
+            delete GPU_DECODER;
+            GPU_DECODER = nullptr;
+        }
+    }
+
     reset();  // Reset to initial state
     return true;
 }