From 8626c96c1cc029696ae164aa2f10d3f3ac71afff Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sat, 26 Oct 2024 15:54:25 +0800
Subject: [PATCH] [WIP] Use memory pool for pinned allocation.

---
 src/common/device_vector.cu  | 12 ++++++++++++
 src/common/device_vector.cuh | 57 ++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/src/common/device_vector.cu b/src/common/device_vector.cu
index b7f300df61e2..fc4ae4680241 100644
--- a/src/common/device_vector.cu
+++ b/src/common/device_vector.cu
@@ -29,6 +29,18 @@ void ThrowOOMError(std::string const &err, std::size_t bytes) {
   return std::accumulate(it, it + this->handles_.size(), static_cast<std::size_t>(0));
 }
 
+void GrowOnlyPinnedMemPool::Grow(std::size_t n_bytes) {
+  // Grow-only: nothing to do when the current buffer is already large enough.
+  if (n_bytes <= this->cur_n_bytes) {
+    return;
+  }
+  // Content need not be preserved across growth; release before re-allocating.
+  if (storage != nullptr) {
+    safe_cuda(cudaFreeAsync(storage, dh::DefaultStream()));
+  }
+  safe_cuda(cudaMallocFromPoolAsync(&storage, n_bytes, pool.Handle(), dh::DefaultStream()));
+  this->cur_n_bytes = n_bytes;
+}
+
 void GrowOnlyVirtualMemVec::Reserve(std::size_t new_size) {
   auto va_capacity = this->Capacity();
   if (new_size < va_capacity) {
diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh
index 352ebf371d26..e9fa0a2b62cf 100644
--- a/src/common/device_vector.cuh
+++ b/src/common/device_vector.cuh
@@ -134,6 +134,63 @@ struct GrowOnlyPinnedMemoryImpl {
   }
 };
 
+/**
+ * @brief RAII wrapper around a CUDA memory pool for NUMA-pinned host memory.
+ */
+class HostMemPool {
+  cudaMemPool_t pool_;
+
+ public:
+  HostMemPool() {
+    cudaMemPoolProps props;
+    std::memset(&props, '\0', sizeof(props));
+    props.allocType = cudaMemAllocationTypePinned;
+    props.location.type = cudaMemLocationTypeHostNuma;
+    // Allocate on the NUMA node closest to the current device.
+    std::int32_t numa_id = -1;
+    safe_cuda(cudaDeviceGetAttribute(&numa_id, cudaDevAttrNumaId, cub::CurrentDevice()));
+    CHECK_GE(numa_id, 0);
+    props.location.id = numa_id;
+
+    safe_cuda(cudaMemPoolCreate(&pool_, &props));
+  }
+  HostMemPool(HostMemPool const &) = delete;
+  HostMemPool &operator=(HostMemPool const &) = delete;
+
+  auto Handle() { return pool_; }
+  ~HostMemPool() { safe_cuda(cudaMemPoolDestroy(pool_)); }
+};
+
+/**
+ * @brief Grow-only pinned host memory backed by a CUDA memory pool.
+ */
+struct GrowOnlyPinnedMemPool {
+  HostMemPool pool;
+  void *storage{nullptr};
+  std::size_t cur_n_bytes{0};
+
+  // Ensure the buffer holds at least n_bytes; existing content is not preserved.
+  void Grow(std::size_t n_bytes);
+
+  ~GrowOnlyPinnedMemPool() {
+    // Return the buffer to the pool before the pool member is destroyed.
+    // NOTE(review): the free is asynchronous; a stream sync may be needed
+    // before cudaMemPoolDestroy runs -- confirm.
+    if (storage != nullptr) {
+      safe_cuda(cudaFreeAsync(storage, dh::DefaultStream()));
+    }
+  }
+
+  /**
+   * @brief Get a span of `size` elements, growing the buffer if needed.
+   */
+  template <typename T>
+  xgboost::common::Span<T> GetSpan(std::size_t size) {
+    auto n_bytes = size * sizeof(T);
+    this->Grow(n_bytes);
+    return xgboost::common::Span<T>(static_cast<T *>(storage), size);
+  }
+};
+
 /**
  * @brief Use low-level virtual memory functions from CUDA driver API for grow-only memory
  * allocation.