From 8a6625ff993dea84a24b803e496363caf89858b5 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Sat, 26 Oct 2024 03:02:51 +0800
Subject: [PATCH] Add support for CUDA virtual device memory. (#10930)

---
 cmake/Utils.cmake                          |  3 +-
 src/common/io.h                            |  3 ++
 src/common/ref_resource_view.cuh           |  7 ++++
 src/common/resource.cuh                    | 31 +++++++++++++-
 src/data/ellpack_page_source.cu            |  3 +-
 tests/cpp/common/test_ref_resource_view.cu | 49 ++++++++++++++++++++++
 6 files changed, 92 insertions(+), 4 deletions(-)
 create mode 100644 tests/cpp/common/test_ref_resource_view.cu
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 4ad096ba0392..ec47bf6eb62a 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -115,8 +115,7 @@ function(xgboost_set_cuda_flags target)
   else()
     # If the downstream user is dynamically linking with libxgboost, it does not
     # need to link with CCCL and CUDA runtime.
-    target_link_libraries(${target}
-      PRIVATE CCCL::CCCL CUDA::cudart_static)
+    target_link_libraries(${target} PRIVATE CCCL::CCCL CUDA::cudart_static)
   endif()
   target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1)
   target_include_directories(
diff --git a/src/common/io.h b/src/common/io.h
index 1a540ea7338f..e3eaa4faf89b 100644
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -287,6 +287,7 @@ class ResourceHandler {
     kCudaMalloc = 2,     // CUDA device memory.
     kCudaMmap = 3,       // CUDA with mmap.
     kCudaHostCache = 4,  // CUDA pinned host memory.
+    kCudaGrowOnly = 5,   // CUDA virtual memory allocator.
   };
 
  private:
@@ -313,6 +314,8 @@ class ResourceHandler {
         return "CudaMmap";
       case kCudaHostCache:
         return "CudaHostCache";
+      case kCudaGrowOnly:
+        return "CudaGrowOnly";
     }
     LOG(FATAL) << "Unreachable.";
     return {};
diff --git a/src/common/ref_resource_view.cuh b/src/common/ref_resource_view.cuh
index 985938e08128..21d49333b579 100644
--- a/src/common/ref_resource_view.cuh
+++ b/src/common/ref_resource_view.cuh
@@ -22,6 +22,13 @@ template <typename T>
   return ref;
 }
 
+template <typename T>
+[[nodiscard]] RefResourceView<T> MakeCudaGrowOnly(std::size_t n_elements) {
+  auto resource = std::make_shared<common::CudaGrowOnlyResource>(n_elements * sizeof(T));
+  auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
+  return ref;
+}
+
 template <typename T>
 [[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
                                                             std::size_t n_elements, T const& init) {
diff --git a/src/common/resource.cuh b/src/common/resource.cuh
index 4930f8368182..4936cb798a07 100644
--- a/src/common/resource.cuh
+++ b/src/common/resource.cuh
@@ -6,7 +6,7 @@
 #include <functional>  // for function
 
 #include "cuda_pinned_allocator.h"  // for SamAllocator
-#include "device_vector.cuh"        // for DeviceUVector
+#include "device_vector.cuh"        // for DeviceUVector, GrowOnlyVirtualMemVec
 #include "io.h"                     // for ResourceHandler, MMAPFile
 #include "xgboost/string_view.h"    // for StringView
 
@@ -30,6 +30,35 @@ class CudaMallocResource : public ResourceHandler {
   void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
 };
 
+/**
+ * @brief Device resource that only grows in size.
+ */
+class CudaGrowOnlyResource : public ResourceHandler {
+  static auto MakeNew() {
+    return std::make_unique<dh::detail::GrowOnlyVirtualMemVec>(CU_MEM_LOCATION_TYPE_DEVICE);
+  }
+
+  std::unique_ptr<dh::detail::GrowOnlyVirtualMemVec> alloc_;
+  std::size_t n_bytes_{0};
+
+ public:
+  explicit CudaGrowOnlyResource(std::size_t n_bytes)
+      : ResourceHandler{kCudaGrowOnly}, alloc_{MakeNew()} {
+    this->Resize(n_bytes);
+  }
+  void Resize(std::size_t n_bytes) {
+    this->alloc_->GrowTo(n_bytes);
+    this->n_bytes_ = n_bytes;
+  }
+  void Clear() {
+    this->alloc_.reset();
+    this->alloc_ = MakeNew();
+    this->n_bytes_ = 0;
+  }
+  [[nodiscard]] void* Data() final { return this->alloc_->data(); }
+  [[nodiscard]] std::size_t Size() const final { return this->n_bytes_; }
+};
+
 class CudaPinnedResource : public ResourceHandler {
   std::vector<std::byte, cuda_impl::SamAllocator<std::byte>> storage_;
 
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index a186130b7b58..4901f900a7d5 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -25,7 +25,8 @@ namespace xgboost::data {
 namespace {
 [[nodiscard]] bool IsDevicePage(EllpackPageImpl const* page) {
   switch (page->gidx_buffer.Resource()->Type()) {
-    case common::ResourceHandler::kCudaMalloc: {
+    case common::ResourceHandler::kCudaMalloc:
+    case common::ResourceHandler::kCudaGrowOnly: {
       return true;
     }
     case common::ResourceHandler::kCudaHostCache:
diff --git a/tests/cpp/common/test_ref_resource_view.cu b/tests/cpp/common/test_ref_resource_view.cu
new file mode 100644
index 000000000000..ed69d087dc3c
--- /dev/null
+++ b/tests/cpp/common/test_ref_resource_view.cu
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ */
+#if defined(__linux__)
+
+#include <gtest/gtest.h>
+#include <thrust/equal.h>                       // for equal
+#include <thrust/fill.h>                        // for fill_n
+#include <thrust/iterator/constant_iterator.h>  // for make_constant_iterator
+#include <thrust/sequence.h>                    // for sequence
+
+#include "../../../src/common/ref_resource_view.cuh"
+#include "../helpers.h"  // for MakeCUDACtx
+
+namespace xgboost::common {
+class TestCudaGrowOnly : public ::testing::TestWithParam<std::size_t> {
+ public:
+  void TestGrow(std::size_t m, std::size_t n) {
+    auto ctx = MakeCUDACtx(0);
+    ctx.CUDACtx()->Stream().Sync();
+
+    auto ref = MakeCudaGrowOnly<double>(m);
+    ASSERT_EQ(ref.size_bytes(), m * sizeof(double));
+    thrust::sequence(ctx.CUDACtx()->CTP(), ref.begin(), ref.end(), 0.0);
+    auto res = std::dynamic_pointer_cast<common::CudaGrowOnlyResource>(ref.Resource());
+    CHECK(res);
+    res->Resize(n * sizeof(double));
+
+    auto ref1 = RefResourceView<double>(res->DataAs<double>(), res->Size() / sizeof(double),
+                                        ref.Resource());
+    ASSERT_EQ(res->Size(), n * sizeof(double));
+    ASSERT_EQ(ref1.size(), n);
+    thrust::sequence(ctx.CUDACtx()->CTP(), ref1.begin(), ref1.end(), static_cast<double>(0.0));
+    std::vector<double> h_vec(ref1.size());
+    dh::safe_cuda(cudaMemcpyAsync(h_vec.data(), ref1.data(), ref1.size_bytes(), cudaMemcpyDefault));
+    for (std::size_t i = 0; i < h_vec.size(); ++i) {
+      ASSERT_EQ(h_vec[i], i);
+    }
+  }
+
+  void Run(std::size_t n) { this->TestGrow(1024, n); }
+};
+
+TEST_P(TestCudaGrowOnly, Resize) { this->Run(this->GetParam()); }
+
+INSTANTIATE_TEST_SUITE_P(RefResourceView, TestCudaGrowOnly, ::testing::Values(1 << 20, 1 << 21));
+}  // namespace xgboost::common
+
+#endif  // defined(__linux__)