From 687ed5c4b8b76d240aa76f801f5e3d9974b6893e Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 9 Sep 2024 18:47:39 -0400 Subject: [PATCH 01/27] [FEA] Allow setting `*_pool_size` with human-readable string (#1670) Closes #173 Authors: - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1670 --- README.md | 4 +- python/rmm/docs/guide.md | 8 +-- python/rmm/rmm/_lib/CMakeLists.txt | 3 +- python/rmm/rmm/_lib/helper.pxd | 16 +++++ python/rmm/rmm/_lib/helper.pyx | 78 +++++++++++++++++++++++++ python/rmm/rmm/_lib/memory_resource.pyx | 24 +++++--- python/rmm/rmm/rmm.py | 8 ++- python/rmm/rmm/tests/test_rmm.py | 24 +++++++- 8 files changed, 143 insertions(+), 22 deletions(-) create mode 100644 python/rmm/rmm/_lib/helper.pxd create mode 100644 python/rmm/rmm/_lib/helper.pyx diff --git a/README.md b/README.md index e90398c2b..1250d094b 100644 --- a/README.md +++ b/README.md @@ -771,8 +771,8 @@ of 1 GiB and a maximum size of 4 GiB. The pool uses >>> import rmm >>> pool = rmm.mr.PoolMemoryResource( ... rmm.mr.CudaMemoryResource(), -... initial_pool_size=2**30, -... maximum_pool_size=2**32 +... initial_pool_size="1GiB", # equivalent to initial_pool_size=2**30 +... maximum_pool_size="4GiB" ... ) >>> rmm.mr.set_current_device_resource(pool) ``` diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md index 911073b5d..22c0dc023 100644 --- a/python/rmm/docs/guide.md +++ b/python/rmm/docs/guide.md @@ -139,8 +139,8 @@ of 1 GiB and a maximum size of 4 GiB. The pool uses >>> import rmm >>> pool = rmm.mr.PoolMemoryResource( ... rmm.mr.CudaMemoryResource(), -... initial_pool_size=2**30, -... maximum_pool_size=2**32 +... initial_pool_size="1GiB", # equivalent to initial_pool_size=2**30 +... maximum_pool_size="4GiB" ... ) >>> rmm.mr.set_current_device_resource(pool) ``` @@ -151,8 +151,8 @@ Similarly, to use a pool of managed memory: >>> import rmm >>> pool = rmm.mr.PoolMemoryResource( ... rmm.mr.ManagedMemoryResource(), -... initial_pool_size=2**30, -... maximum_pool_size=2**32 +... initial_pool_size="1GiB", +... maximum_pool_size="4GiB" ... ) >>> rmm.mr.set_current_device_resource(pool) ``` diff --git a/python/rmm/rmm/_lib/CMakeLists.txt b/python/rmm/rmm/_lib/CMakeLists.txt index 1e629a402..7cdfed971 100644 --- a/python/rmm/rmm/_lib/CMakeLists.txt +++ b/python/rmm/rmm/_lib/CMakeLists.txt @@ -12,7 +12,8 @@ # the License. # ============================================================================= -set(cython_sources device_buffer.pyx lib.pyx logger.pyx memory_resource.pyx cuda_stream.pyx) +set(cython_sources device_buffer.pyx lib.pyx logger.pyx memory_resource.pyx cuda_stream.pyx + helper.pyx) set(linked_libraries rmm::rmm) # Build all of the Cython targets diff --git a/python/rmm/rmm/_lib/helper.pxd b/python/rmm/rmm/_lib/helper.pxd new file mode 100644 index 000000000..8ca151c00 --- /dev/null +++ b/python/rmm/rmm/_lib/helper.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +cdef object parse_bytes(object s) except * diff --git a/python/rmm/rmm/_lib/helper.pyx b/python/rmm/rmm/_lib/helper.pyx new file mode 100644 index 000000000..d442ee341 --- /dev/null +++ b/python/rmm/rmm/_lib/helper.pyx @@ -0,0 +1,78 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper functions for rmm""" + +import re + + +cdef dict BYTE_SIZES = { + 'b': 1, + '': 1, + 'kb': 1000, + 'mb': 1000**2, + 'gb': 1000**3, + 'tb': 1000**4, + 'pb': 1000**5, + 'kib': 1024, + 'mib': 1024**2, + 'gib': 1024**3, + 'tib': 1024**4, + 'pib': 1024**5, +} + + +pattern = re.compile(r"^([0-9]+(?:\.[0-9]*)?)[\t ]*((?i:(?:[kmgtp]i?)?b))?$") + +cdef object parse_bytes(object s): + """Parse a string or integer into a number of bytes. + + Parameters + ---------- + s : int | str + Size in bytes. If an integer is provided, it is returned as-is. + A string is parsed as a floating point number with an (optional, + case-insensitive) byte-specifier, both SI prefixes (kb, mb, ..., pb) + and binary prefixes (kib, mib, ..., pib) are supported. + + Returns + ------- + Requested size in bytes as an integer. + + Raises + ------ + ValueError + If it is not possible to parse the input as a byte specification. 
+ """ + cdef str suffix + cdef double n + cdef int multiplier + + if isinstance(s, int): + return s + + match = pattern.match(s) + + if match is None: + raise ValueError(f"Could not parse {s} as a byte specification") + + n = float(match.group(1)) + + suffix = match.group(2) + if suffix is None: + suffix = "" + + multiplier = BYTE_SIZES[suffix.lower()] + + return int(n*multiplier) diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index 5030c5d2d..231253e3f 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -32,10 +32,13 @@ from libcpp.string cimport string from cuda.cudart import cudaError_t from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice + from rmm._cuda.stream cimport Stream + from rmm._cuda.stream import DEFAULT_STREAM from rmm._lib.cuda_stream_view cimport cuda_stream_view +from rmm._lib.helper cimport parse_bytes from rmm._lib.memory_resource cimport ( available_device_memory as c_available_device_memory, percent_of_free_device_memory as c_percent_of_free_device_memory, @@ -44,6 +47,7 @@ from rmm._lib.per_device_resource cimport ( cuda_device_id, set_per_device_resource as cpp_set_per_device_resource, ) + from rmm.statistics import Statistics # Transparent handle of a C++ exception @@ -314,9 +318,9 @@ cdef class CudaAsyncMemoryResource(DeviceMemoryResource): Parameters ---------- - initial_pool_size : int, optional + initial_pool_size : int | str, optional Initial pool size in bytes. By default, half the available memory - on the device is used. + on the device is used. A string argument is parsed using `parse_bytes`. release_threshold: int, optional Release threshold in bytes. If the pool size grows beyond this value, unused memory held by the pool will be released at the @@ -334,7 +338,7 @@ cdef class CudaAsyncMemoryResource(DeviceMemoryResource): cdef optional[size_t] c_initial_pool_size = ( optional[size_t]() if initial_pool_size is None - else optional[size_t]( initial_pool_size) + else optional[size_t]( parse_bytes(initial_pool_size)) ) cdef optional[size_t] c_release_threshold = ( @@ -426,12 +430,12 @@ cdef class PoolMemoryResource(UpstreamResourceAdaptor): c_initial_pool_size = ( c_percent_of_free_device_memory(50) if initial_pool_size is None - else initial_pool_size + else parse_bytes(initial_pool_size) ) c_maximum_pool_size = ( optional[size_t]() if maximum_pool_size is None - else optional[size_t]( maximum_pool_size) + else optional[size_t]( parse_bytes(maximum_pool_size)) ) self.c_obj.reset( new pool_memory_resource[device_memory_resource]( @@ -456,10 +460,10 @@ cdef class PoolMemoryResource(UpstreamResourceAdaptor): upstream_mr : DeviceMemoryResource The DeviceMemoryResource from which to allocate blocks for the pool. - initial_pool_size : int, optional + initial_pool_size : int | str, optional Initial pool size in bytes. By default, half the available memory on the device is used. - maximum_pool_size : int, optional + maximum_pool_size : int | str, optional Maximum size in bytes, that the pool can grow to. 
""" pass @@ -1091,8 +1095,10 @@ cpdef void _initialize( typ = PoolMemoryResource args = (upstream(),) kwargs = dict( - initial_pool_size=initial_pool_size, - maximum_pool_size=maximum_pool_size + initial_pool_size=None if initial_pool_size is None + else parse_bytes(initial_pool_size), + maximum_pool_size=None if maximum_pool_size is None + else parse_bytes(maximum_pool_size) ) else: typ = upstream diff --git a/python/rmm/rmm/rmm.py b/python/rmm/rmm/rmm.py index e5290905c..bac04b477 100644 --- a/python/rmm/rmm/rmm.py +++ b/python/rmm/rmm/rmm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -45,14 +45,16 @@ def reinitialize( performance. managed_memory : bool, default False If True, use managed memory for device memory allocation - initial_pool_size : int, default None + initial_pool_size : int | str, default None When `pool_allocator` is True, this indicates the initial pool size in bytes. By default, 1/2 of the total GPU memory is used. When `pool_allocator` is False, this argument is ignored if provided. - maximum_pool_size : int, default None + A string argument is parsed using `parse_bytes`. + maximum_pool_size : int | str, default None When `pool_allocator` is True, this indicates the maximum pool size in bytes. By default, the total available memory on the GPU is used. When `pool_allocator` is False, this argument is ignored if provided. + A string argument is parsed using `parse_bytes`. devices : int or List[int], default 0 GPU device IDs to register. By default registers only GPU 0. logging : bool, default False diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index c4fd90c45..c88d21b38 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -432,8 +432,8 @@ def test_rmm_pool_cupy_allocator_stream_lifetime(): def test_pool_memory_resource(dtype, nelem, alloc): mr = rmm.mr.PoolMemoryResource( rmm.mr.CudaMemoryResource(), - initial_pool_size=1 << 22, - maximum_pool_size=1 << 23, + initial_pool_size="4MiB", + maximum_pool_size="8MiB", ) rmm.mr.set_current_device_resource(mr) assert rmm.mr.get_current_device_resource_type() is type(mr) @@ -507,7 +507,7 @@ def test_binning_memory_resource(dtype, nelem, alloc, upstream_mr): def test_reinitialize_max_pool_size(): rmm.reinitialize( - pool_allocator=True, initial_pool_size=0, maximum_pool_size=1 << 23 + pool_allocator=True, initial_pool_size=0, maximum_pool_size="8MiB" ) rmm.DeviceBuffer().resize((1 << 23) - 1) @@ -530,6 +530,24 @@ def test_reinitialize_initial_pool_size_gt_max(): assert "Initial pool size exceeds the maximum pool size" in str(e.value) +def test_reinitialize_with_valid_str_arg_pool_size(): + rmm.reinitialize( + pool_allocator=True, + initial_pool_size="2kib", + maximum_pool_size="8kib", + ) + + +def test_reinitialize_with_invalid_str_arg_pool_size(): + with pytest.raises(ValueError) as e: + rmm.reinitialize( + pool_allocator=True, + initial_pool_size="2k", # 2kb valid, not 2k + maximum_pool_size="8k", + ) + assert "Could not parse" in str(e.value) + + @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("nelem", _nelems) @pytest.mark.parametrize("alloc", _allocs) From 6729deff42fcb771555186af3027e662a34985ea Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:39:41 +1000 Subject: [PATCH 02/27] 
Update RMM adaptors, containers and tests to use get/set_current_device_resource_ref() (#1661) Closes #1660. This adds a constructor to each MR adaptor to take a resource_ref rather than an `Upstream*`. It also updates RMM to use `get_current_device_resource_ref()` everywhere: in containers, in tests, in adaptors, Thrust allocator, polymorphic allocator, execution_policy, etc. Importantly, this PR also modifies `set_current_device_resource()` to basically call `set_current_device_resource_ref()`. This is necessary, because while RMM C++ uses `get_current_device_resource_ref()` everywhere, the Python API still uses the raw pointer API `set_current_device_resource()`. So we need the latter to update the state for the former. This is a temporary bootstrap to help with the refactoring. Authors: - Mark Harris (https://github.com/harrism) Approvers: - Michael Schellenberger Costa (https://github.com/miscco) - Lawrence Mitchell (https://github.com/wence-) - Rong Ou (https://github.com/rongou) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/1661 --- .../device_uvector/device_uvector_bench.cu | 18 ++-- .../multi_stream_allocations_bench.cu | 4 +- include/rmm/device_buffer.hpp | 14 +-- include/rmm/device_scalar.hpp | 6 +- include/rmm/device_uvector.hpp | 6 +- include/rmm/exec_policy.hpp | 4 +- .../mr/device/aligned_resource_adaptor.hpp | 51 ++++++---- .../rmm/mr/device/arena_memory_resource.hpp | 30 +++++- .../rmm/mr/device/binning_memory_resource.hpp | 82 ++++++++++------ .../mr/device/callback_memory_resource.hpp | 13 +-- include/rmm/mr/device/detail/arena.hpp | 33 +++---- .../failure_callback_resource_adaptor.hpp | 39 +++++--- .../mr/device/fixed_size_memory_resource.hpp | 50 +++++++--- .../mr/device/limiting_resource_adaptor.hpp | 53 +++++++---- .../mr/device/logging_resource_adaptor.hpp | 93 +++++++++++++++---- include/rmm/mr/device/per_device_resource.hpp | 46 ++++++--- .../rmm/mr/device/polymorphic_allocator.hpp | 4 +- .../rmm/mr/device/pool_memory_resource.hpp | 43 ++++++--- .../mr/device/prefetch_resource_adaptor.hpp | 26 +++--- .../mr/device/statistics_resource_adaptor.hpp | 32 ++++--- .../device/thread_safe_resource_adaptor.hpp | 28 +++--- .../mr/device/thrust_allocator_adaptor.hpp | 2 +- .../mr/device/tracking_resource_adaptor.hpp | 33 ++++--- include/rmm/resource_ref.hpp | 16 ++++ python/rmm/rmm/_lib/_torch_allocator.cpp | 14 ++- tests/CMakeLists.txt | 1 + tests/container_multidevice_tests.cu | 24 ++--- tests/cuda_stream_tests.cpp | 4 +- tests/device_buffer_tests.cu | 14 +-- tests/device_check_resource_adaptor.hpp | 16 ++-- tests/device_scalar_tests.cpp | 4 +- tests/device_uvector_tests.cpp | 2 +- tests/mock_resource.hpp | 5 + tests/mr/device/adaptor_tests.cpp | 3 +- tests/mr/device/aligned_mr_tests.cpp | 6 +- tests/mr/device/arena_mr_tests.cpp | 50 +++++----- tests/mr/device/callback_mr_tests.cpp | 39 ++++---- tests/mr/device/failure_callback_mr_tests.cpp | 3 +- tests/mr/device/limiting_mr_tests.cpp | 16 ++-- tests/mr/device/pool_mr_tests.cpp | 40 +++----- tests/mr/device/statistics_mr_tests.cpp | 20 ++-- tests/mr/device/thrust_allocator_tests.cu | 10 +- tests/mr/device/tracking_mr_tests.cpp | 26 +++--- tests/mr/host/mr_ref_tests.cpp | 3 + 44 files changed, 639 insertions(+), 387 deletions(-) diff --git a/benchmarks/device_uvector/device_uvector_bench.cu b/benchmarks/device_uvector/device_uvector_bench.cu index 36c9183f9..0eddb1d92 100644 --- a/benchmarks/device_uvector/device_uvector_bench.cu +++ b/benchmarks/device_uvector/device_uvector_bench.cu 
@@ -40,7 +40,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state) rmm::mr::cuda_memory_resource cuda_mr{}; rmm::mr::pool_memory_resource mr{ &cuda_mr, rmm::percent_of_free_device_memory(50)}; - rmm::mr::set_current_device_resource(&mr); + rmm::mr::set_current_device_resource_ref(mr); for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores) rmm::device_uvector vec(state.range(0), rmm::cuda_stream_view{}); @@ -49,7 +49,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state) state.SetItemsProcessed(static_cast(state.iterations())); - rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } BENCHMARK(BM_UvectorSizeConstruction) @@ -62,7 +62,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state) rmm::mr::cuda_memory_resource cuda_mr{}; rmm::mr::pool_memory_resource mr{ &cuda_mr, rmm::percent_of_free_device_memory(50)}; - rmm::mr::set_current_device_resource(&mr); + rmm::mr::set_current_device_resource_ref(mr); for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores) rmm::device_vector vec(state.range(0)); @@ -71,7 +71,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state) state.SetItemsProcessed(static_cast(state.iterations())); - rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } BENCHMARK(BM_ThrustVectorSizeConstruction) @@ -140,7 +140,7 @@ template void BM_VectorWorkflow(benchmark::State& state) { rmm::mr::cuda_async_memory_resource cuda_async_mr{}; - rmm::mr::set_current_device_resource(&cuda_async_mr); + rmm::mr::set_current_device_resource_ref(cuda_async_mr); rmm::cuda_stream input_stream; std::vector streams(4); @@ -158,7 +158,7 @@ void BM_VectorWorkflow(benchmark::State& state) auto const bytes = num_elements * sizeof(std::int32_t) * num_accesses; state.SetBytesProcessed(static_cast(state.iterations() * bytes)); - rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT @@ -167,9 +167,9 @@ BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT ->Unit(benchmark::kMicrosecond) ->UseManualTime(); -// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()` -// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use -// `cuda_async_memory_resource`, which is faster. +// The only difference here is that `rmm::device_vector` uses +// `rmm::get_current_device_resource_ref()` for allocation while `thrust::device_vector` uses +// cudaMalloc/cudaFree. In the benchmarks we use `cuda_async_memory_resource`, which is faster. 
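(For reference, a minimal standalone sketch of the pattern these benchmark changes adopt: install a resource with the new `set_current_device_resource_ref()` so that RMM containers pick it up by default, then restore the default with `reset_current_device_resource_ref()` instead of passing `nullptr`. The include paths and the `int` element type below are assumptions for illustration, not part of this patch.)

```cpp
#include <rmm/cuda_device.hpp>  // percent_of_free_device_memory (assumed path)
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{
    &cuda_mr, rmm::percent_of_free_device_memory(50)};

  // The new API takes the resource by reference rather than by pointer.
  rmm::mr::set_current_device_resource_ref(pool_mr);

  // Containers whose `mr` parameter defaults to
  // rmm::mr::get_current_device_resource_ref() now allocate from the pool.
  rmm::device_uvector<int> vec(1'000, rmm::cuda_stream_view{});

  // Restore the default resource instead of passing nullptr.
  rmm::mr::reset_current_device_resource_ref();
  return 0;
}
```
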
BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector) // NOLINT ->RangeMultiplier(10) // NOLINT ->Range(100'000, 100'000'000) // NOLINT diff --git a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu index b73ef54f8..86e761c80 100644 --- a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu +++ b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu @@ -75,7 +75,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con { auto mr = factory(); - rmm::mr::set_current_device_resource(mr.get()); + rmm::mr::set_current_device_resource_ref(mr.get()); auto num_streams = state.range(0); auto num_kernels = state.range(1); @@ -92,7 +92,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con state.SetItemsProcessed(static_cast(state.iterations() * num_kernels)); - rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } inline auto make_cuda() { return std::make_shared(); } diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index e6664e0f6..3ddd37415 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -41,7 +41,7 @@ namespace RMM_NAMESPACE { * * This class allocates untyped and *uninitialized* device memory using a * `device_async_resource_ref`. If not explicitly specified, the memory resource - * returned from `get_current_device_resource()` is used. + * returned from `get_current_device_resource_ref()` is used. * * @note Unlike `std::vector` or `thrust::device_vector`, the device memory * allocated by a `device_buffer` is uninitialized. Therefore, it is undefined @@ -95,7 +95,7 @@ class device_buffer { // `__host__ __device__` specifiers to the defaulted constructor when it is called within the // context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host- // device function. This causes warnings/errors because this ctor invokes host-only functions. 
- device_buffer() : _mr{rmm::mr::get_current_device_resource()} {} + device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {} /** * @brief Constructs a new device buffer of `size` uninitialized bytes @@ -109,7 +109,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { cuda_set_device_raii dev{_device}; @@ -138,7 +138,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { cuda_set_device_raii dev{_device}; @@ -169,7 +169,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -419,8 +419,8 @@ class device_buffer { cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation rmm::device_async_resource_ref _mr{ - rmm::mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + rmm::mr::get_current_device_resource_ref()}; ///< The memory resource used to + ///< allocate/deallocate device memory cuda_device_id _device{get_current_cuda_device()}; /** diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp index 95388eca9..62b004e2f 100644 --- a/include/rmm/device_scalar.hpp +++ b/include/rmm/device_scalar.hpp @@ -95,7 +95,7 @@ class device_scalar { * @param mr Optional, resource with which to allocate. 
*/ explicit device_scalar(cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{1, stream, mr} { } @@ -118,7 +118,7 @@ class device_scalar { */ explicit device_scalar(value_type const& initial_value, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{1, stream, mr} { set_value_async(initial_value, stream); @@ -138,7 +138,7 @@ class device_scalar { */ device_scalar(device_scalar const& other, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{other._storage, stream, mr} { } diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index e1610a73a..13f566150 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -48,7 +48,7 @@ namespace RMM_NAMESPACE { * * Example: * @code{.cpp} - * rmm::mr::device_memory_resource * mr = new my_custom_resource(); + * auto mr = new my_custom_resource(); * rmm::cuda_stream_view s{}; * * // Allocates *uninitialized* device memory on stream `s` sufficient for 100 ints using the @@ -126,7 +126,7 @@ class device_uvector { */ explicit device_uvector(std::size_t size, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -142,7 +142,7 @@ class device_uvector { */ explicit device_uvector(device_uvector const& other, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{other._storage, stream, mr} { } diff --git a/include/rmm/exec_policy.hpp b/include/rmm/exec_policy.hpp index 1c9a07abd..019a8245a 100644 --- a/include/rmm/exec_policy.hpp +++ b/include/rmm/exec_policy.hpp @@ -57,7 +57,7 @@ class exec_policy : public thrust_exec_policy_t { * @param mr The resource to use for allocating temporary memory */ explicit exec_policy(cuda_stream_view stream = cuda_stream_default, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : thrust_exec_policy_t( thrust::cuda::par(mr::thrust_allocator(stream, mr)).on(stream.value())) { @@ -81,7 +81,7 @@ using thrust_exec_policy_nosync_t = class exec_policy_nosync : public thrust_exec_policy_nosync_t { public: explicit exec_policy_nosync(cuda_stream_view stream = cuda_stream_default, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : thrust_exec_policy_nosync_t( thrust::cuda::par_nosync(mr::thrust_allocator(stream, mr)).on(stream.value())) { diff --git a/include/rmm/mr/device/aligned_resource_adaptor.hpp b/include/rmm/mr/device/aligned_resource_adaptor.hpp index 85eddb427..4df2c4d2d 100644 --- a/include/rmm/mr/device/aligned_resource_adaptor.hpp +++ b/include/rmm/mr/device/aligned_resource_adaptor.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -59,7 +60,6 @@ class aligned_resource_adaptor final : public device_memory_resource { /** * @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests. 
* - * @throws rmm::logic_error if `upstream == nullptr` * @throws rmm::logic_error if `allocation_alignment` is not a power of 2 * * @param upstream The resource used for allocating/deallocating device memory. @@ -67,12 +67,33 @@ class aligned_resource_adaptor final : public device_memory_resource { * @param alignment_threshold Only allocations with a size larger than or equal to this threshold * are aligned. */ - explicit aligned_resource_adaptor(Upstream* upstream, + explicit aligned_resource_adaptor(device_async_resource_ref upstream, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT, std::size_t alignment_threshold = default_alignment_threshold) : upstream_{upstream}, alignment_{alignment}, alignment_threshold_{alignment_threshold} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + RMM_EXPECTS(rmm::is_supported_alignment(alignment), + "Allocation alignment is not a power of 2."); + } + + /** + * @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests. + * + * @throws rmm::logic_error if `upstream == nullptr` + * @throws rmm::logic_error if `alignment` is not a power of 2 + * + * @param upstream The resource used for allocating/deallocating device memory. + * @param alignment The size used for allocation alignment. + * @param alignment_threshold Only allocations with a size larger than or equal to this threshold + * are aligned. + */ + explicit aligned_resource_adaptor(Upstream* upstream, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT, + std::size_t alignment_threshold = default_alignment_threshold) + : upstream_{to_device_async_resource_ref_checked(upstream)}, + alignment_{alignment}, + alignment_threshold_{alignment_threshold} + { RMM_EXPECTS(rmm::is_supported_alignment(alignment), "Allocation alignment is not a power of 2."); } @@ -92,11 +113,6 @@ class aligned_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief The default alignment used by the adaptor. */ @@ -106,8 +122,8 @@ class aligned_resource_adaptor final : public device_memory_resource { using lock_guard = std::lock_guard; /** - * @brief Allocates memory of size at least `bytes` using the upstream resource with the specified - * alignment. + * @brief Allocates memory of size at least `bytes` using the upstream resource with the + * specified alignment. * * @throws rmm::bad_alloc if the requested allocation could not be fulfilled * by the upstream resource. 
@@ -119,10 +135,10 @@ class aligned_resource_adaptor final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) { - return upstream_->allocate(bytes, stream); + return get_upstream_resource().allocate_async(bytes, 1, stream); } auto const size = upstream_allocation_size(bytes); - void* pointer = upstream_->allocate(size, stream); + void* pointer = get_upstream_resource().allocate_async(size, 1, stream); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) auto const address = reinterpret_cast(pointer); auto const aligned_address = rmm::align_up(address, alignment_); @@ -145,7 +161,7 @@ class aligned_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, 1, stream); } else { { lock_guard lock(mtx_); @@ -155,7 +171,7 @@ class aligned_resource_adaptor final : public device_memory_resource { pointers_.erase(iter); } } - upstream_->deallocate(ptr, upstream_allocation_size(bytes), stream); + get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream); } } @@ -176,8 +192,8 @@ class aligned_resource_adaptor final : public device_memory_resource { } /** - * @brief Calculate the allocation size needed from upstream to account for alignments of both the - * size and the base pointer. + * @brief Calculate the allocation size needed from upstream to account for alignments of both + * the size and the base pointer. * * @param bytes The requested allocation size. * @return Allocation size needed from upstream to align both the size and the base pointer. @@ -188,7 +204,8 @@ class aligned_resource_adaptor final : public device_memory_resource { return aligned_size + alignment_ - rmm::CUDA_ALLOCATION_ALIGNMENT; } - Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests + /// The upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; std::unordered_map pointers_; ///< Map of aligned pointers to upstream pointers. std::size_t alignment_; ///< The size used for allocation alignment std::size_t alignment_threshold_; ///< The size above which allocations should be aligned diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 388182e6a..417b7d2b4 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -82,6 +83,26 @@ namespace mr { template class arena_memory_resource final : public device_memory_resource { public: + /** + * @brief Construct an `arena_memory_resource`. + * + * @param upstream_mr The memory resource from which to allocate blocks for the global arena. + * @param arena_size Size in bytes of the global arena. Defaults to half of the available + * memory on the current device. + * @param dump_log_on_failure If true, dump memory log when running out of memory. 
+ */ + explicit arena_memory_resource(device_async_resource_ref upstream_mr, + std::optional arena_size = std::nullopt, + bool dump_log_on_failure = false) + : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} + { + if (dump_log_on_failure_) { + logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); + // Set the level to `debug` for more detailed output. + logger_->set_level(spdlog::level::info); + } + } + /** * @brief Construct an `arena_memory_resource`. * @@ -95,7 +116,8 @@ class arena_memory_resource final : public device_memory_resource { explicit arena_memory_resource(Upstream* upstream_mr, std::optional arena_size = std::nullopt, bool dump_log_on_failure = false) - : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} + : global_arena_{to_device_async_resource_ref_checked(upstream_mr), arena_size}, + dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); @@ -113,8 +135,8 @@ class arena_memory_resource final : public device_memory_resource { arena_memory_resource& operator=(arena_memory_resource&&) noexcept = delete; private: - using global_arena = rmm::mr::detail::arena::global_arena; - using arena = rmm::mr::detail::arena::arena; + using global_arena = rmm::mr::detail::arena::global_arena; + using arena = rmm::mr::detail::arena::arena; /** * @brief Allocates memory of size at least `bytes`. @@ -274,7 +296,7 @@ class arena_memory_resource final : public device_memory_resource { std::unique_lock lock(map_mtx_); auto thread_arena = std::make_shared(global_arena_); thread_arenas_.emplace(thread_id, thread_arena); - thread_local detail::arena::arena_cleaner cleaner{thread_arena}; + thread_local detail::arena::arena_cleaner cleaner{thread_arena}; return *thread_arena; } } diff --git a/include/rmm/mr/device/binning_memory_resource.hpp b/include/rmm/mr/device/binning_memory_resource.hpp index 773035231..a5ef64665 100644 --- a/include/rmm/mr/device/binning_memory_resource.hpp +++ b/include/rmm/mr/device/binning_memory_resource.hpp @@ -27,6 +27,7 @@ #include #include #include +#include #include namespace RMM_NAMESPACE { @@ -52,16 +53,48 @@ class binning_memory_resource final : public device_memory_resource { * Initially has no bins, so simply uses the upstream_resource until bin resources are added * with `add_bin`. * - * @throws rmm::logic_error if size_base is not a power of two. + * @param upstream_resource The upstream memory resource used to allocate bin pools. + */ + explicit binning_memory_resource(device_async_resource_ref upstream_resource) + : upstream_mr_{upstream_resource} + { + } + + /** + * @brief Construct a new binning memory resource object. + * + * Initially has no bins, so simply uses the upstream_resource until bin resources are added + * with `add_bin`. + * + * @throws rmm::logic_error if upstream_resource is nullptr * * @param upstream_resource The upstream memory resource used to allocate bin pools. */ explicit binning_memory_resource(Upstream* upstream_resource) - : upstream_mr_{[upstream_resource]() { - RMM_EXPECTS(nullptr != upstream_resource, "Unexpected null upstream pointer."); - return upstream_resource; - }()} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_resource)} + { + } + + /** + * @brief Construct a new binning memory resource object with a range of initial bins. 
+ * + * Constructs a new binning memory resource and adds bins backed by `fixed_size_memory_resource` + * in the range [2^min_size_exponent, 2^max_size_exponent]. For example if `min_size_exponent==18` + * and `max_size_exponent==22`, creates bins of sizes 256KiB, 512KiB, 1024KiB, 2048KiB and + * 4096KiB. + * + * @param upstream_resource The upstream memory resource used to allocate bin pools. + * @param min_size_exponent The minimum base-2 exponent bin size. + * @param max_size_exponent The maximum base-2 exponent bin size. + */ + binning_memory_resource(device_async_resource_ref upstream_resource, + int8_t min_size_exponent, // NOLINT(bugprone-easily-swappable-parameters) + int8_t max_size_exponent) + : upstream_mr_{upstream_resource} { + for (auto i = min_size_exponent; i <= max_size_exponent; i++) { + add_bin(1 << i); + } } /** @@ -72,6 +105,8 @@ class binning_memory_resource final : public device_memory_resource { * and `max_size_exponent==22`, creates bins of sizes 256KiB, 512KiB, 1024KiB, 2048KiB and * 4096KiB. * + * @throws rmm::logic_error if upstream_resource is nullptr + * * @param upstream_resource The upstream memory resource used to allocate bin pools. * @param min_size_exponent The minimum base-2 exponent bin size. * @param max_size_exponent The maximum base-2 exponent bin size. @@ -79,10 +114,7 @@ class binning_memory_resource final : public device_memory_resource { binning_memory_resource(Upstream* upstream_resource, int8_t min_size_exponent, // NOLINT(bugprone-easily-swappable-parameters) int8_t max_size_exponent) - : upstream_mr_{[upstream_resource]() { - RMM_EXPECTS(nullptr != upstream_resource, "Unexpected null upstream pointer."); - return upstream_resource; - }()} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_resource)} { for (auto i = min_size_exponent; i <= max_size_exponent; i++) { add_bin(1 << i); @@ -102,23 +134,17 @@ class binning_memory_resource final : public device_memory_resource { binning_memory_resource& operator=(binning_memory_resource&&) = delete; /** - * @briefreturn{rmm::device_async_resource_ref to the upstream resource} + * @briefreturn{device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_mr_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_mr_; } - /** * @brief Add a bin allocator to this resource * - * Adds `bin_resource` if it is not null; otherwise constructs and adds a - * fixed_size_memory_resource. + * Adds `bin_resource` if provided; otherwise constructs and adds a fixed_size_memory_resource. * * This bin will be used for any allocation smaller than `allocation_size` that is larger than * the next smaller bin's allocation size. 
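(As an aside, a minimal sketch of how the exponent-range constructor documented above is typically used; the CUDA upstream, the 18-22 exponent range, and the request sizes are illustrative assumptions rather than content of this patch.)

```cpp
#include <rmm/mr/device/binning_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr{};

  // Bins of 256KiB, 512KiB, 1024KiB, 2048KiB and 4096KiB, each backed by a
  // fixed_size_memory_resource built on the upstream.
  rmm::mr::binning_memory_resource<rmm::mr::cuda_memory_resource> binning_mr{
    &cuda_mr, 18, 22};

  // A 100KiB request is served by the smallest bin that fits it (256KiB).
  void* small_ptr = binning_mr.allocate(100 * 1024);
  binning_mr.deallocate(small_ptr, 100 * 1024);

  // An 8MiB request is larger than the largest bin, so it falls through to
  // the upstream resource.
  void* big_ptr = binning_mr.allocate(8 * 1024 * 1024);
  binning_mr.deallocate(big_ptr, 8 * 1024 * 1024);
  return 0;
}
```
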
@@ -130,14 +156,14 @@ class binning_memory_resource final : public device_memory_resource { * @param allocation_size The maximum size that this bin allocates * @param bin_resource The memory resource for the bin */ - void add_bin(std::size_t allocation_size, device_memory_resource* bin_resource = nullptr) + void add_bin(std::size_t allocation_size, + std::optional bin_resource = std::nullopt) { - allocation_size = rmm::align_up(allocation_size, rmm::CUDA_ALLOCATION_ALIGNMENT); + allocation_size = align_up(allocation_size, CUDA_ALLOCATION_ALIGNMENT); - if (nullptr != bin_resource) { - resource_bins_.insert({allocation_size, bin_resource}); + if (bin_resource.has_value()) { + resource_bins_.insert({allocation_size, bin_resource.value()}); } else if (resource_bins_.count(allocation_size) == 0) { // do nothing if bin already exists - owned_bin_resources_.push_back( std::make_unique>(upstream_mr_, allocation_size)); resource_bins_.insert({allocation_size, owned_bin_resources_.back().get()}); @@ -153,11 +179,10 @@ class binning_memory_resource final : public device_memory_resource { * @param bytes Requested allocation size in bytes * @return Get the resource reference for the requested size. */ - rmm::device_async_resource_ref get_resource_ref(std::size_t bytes) + device_async_resource_ref get_resource_ref(std::size_t bytes) { auto iter = resource_bins_.lower_bound(bytes); - return (iter != resource_bins_.cend()) ? rmm::device_async_resource_ref{iter->second} - : get_upstream_resource(); + return (iter != resource_bins_.cend()) ? iter->second : get_upstream_resource(); } /** @@ -188,11 +213,12 @@ class binning_memory_resource final : public device_memory_resource { get_resource_ref(bytes).deallocate_async(ptr, bytes, stream); } - Upstream* upstream_mr_; // The upstream memory_resource from which to allocate blocks. + device_async_resource_ref + upstream_mr_; // The upstream memory_resource from which to allocate blocks. std::vector>> owned_bin_resources_; - std::map resource_bins_; + std::map resource_bins_; }; /** @} */ // end of group diff --git a/include/rmm/mr/device/callback_memory_resource.hpp b/include/rmm/mr/device/callback_memory_resource.hpp index c569f7dd6..fa2d8056d 100644 --- a/include/rmm/mr/device/callback_memory_resource.hpp +++ b/include/rmm/mr/device/callback_memory_resource.hpp @@ -86,12 +86,13 @@ class callback_memory_resource final : public device_memory_resource { * It is the caller's responsibility to maintain the lifetime of the pointed-to data * for the duration of the lifetime of the `callback_memory_resource`. 
*/ - callback_memory_resource(allocate_callback_t allocate_callback, - deallocate_callback_t deallocate_callback, - void* allocate_callback_arg = nullptr, - void* deallocate_callback_arg = nullptr) noexcept - : allocate_callback_(allocate_callback), - deallocate_callback_(deallocate_callback), + callback_memory_resource( + allocate_callback_t allocate_callback, + deallocate_callback_t deallocate_callback, + void* allocate_callback_arg = nullptr, // NOLINT(bugprone-easily-swappable-parameters) + void* deallocate_callback_arg = nullptr) noexcept + : allocate_callback_(std::move(allocate_callback)), + deallocate_callback_(std::move(deallocate_callback)), allocate_callback_arg_(allocate_callback_arg), deallocate_callback_arg_(deallocate_callback_arg) { diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 3d24cfebf..6f8303c83 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -494,22 +495,18 @@ inline auto max_free_size(std::set const& superblocks) * @tparam Upstream Memory resource to use for allocating the arena. Implements * rmm::mr::device_memory_resource interface. */ -template class global_arena final { public: /** * @brief Construct a global arena. * - * @throws rmm::logic_error if `upstream_mr == nullptr`. - * * @param upstream_mr The memory resource from which to allocate blocks for the pool * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory * on the current device. */ - global_arena(Upstream* upstream_mr, std::optional arena_size) + global_arena(device_async_resource_ref upstream_mr, std::optional arena_size) : upstream_mr_{upstream_mr} { - RMM_EXPECTS(nullptr != upstream_mr_, "Unexpected null upstream pointer."); auto const size = rmm::align_down(arena_size.value_or(default_size()), rmm::CUDA_ALLOCATION_ALIGNMENT); RMM_EXPECTS(size >= superblock::minimum_size, @@ -530,7 +527,7 @@ class global_arena final { ~global_arena() { std::lock_guard lock(mtx_); - upstream_mr_->deallocate(upstream_block_.pointer(), upstream_block_.size()); + upstream_mr_.deallocate(upstream_block_.pointer(), upstream_block_.size()); } /** @@ -539,7 +536,7 @@ class global_arena final { * @param size The size in bytes of the allocation. * @return bool True if the allocation should be handled by the global arena. */ - bool handles(std::size_t size) const { return size > superblock::minimum_size; } + static bool handles(std::size_t size) { return size > superblock::minimum_size; } /** * @brief Acquire a superblock that can fit a block of the given size. @@ -610,7 +607,7 @@ class global_arena final { * @param stream Stream on which to perform deallocation. * @return bool true if the allocation is found, false otherwise. */ - bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) + bool deallocate_async(void* ptr, std::size_t size, cuda_stream_view stream) { RMM_LOGGING_ASSERT(handles(size)); stream.synchronize_no_throw(); @@ -692,7 +689,7 @@ class global_arena final { * @brief Default size of the global arena if unspecified. * @return the default global arena size. 
*/ - constexpr std::size_t default_size() const + static std::size_t default_size() { auto const [free, total] = rmm::available_device_memory(); return free / 2; @@ -705,7 +702,7 @@ class global_arena final { */ void initialize(std::size_t size) { - upstream_block_ = {upstream_mr_->allocate(size), size}; + upstream_block_ = {upstream_mr_.allocate(size), size}; superblocks_.emplace(upstream_block_.pointer(), size); } @@ -777,7 +774,7 @@ class global_arena final { } /// The upstream resource to allocate memory from. - Upstream* upstream_mr_; + device_async_resource_ref upstream_mr_; /// Block allocated from upstream so that it can be quickly freed. block upstream_block_; /// Address-ordered set of superblocks. @@ -795,7 +792,6 @@ class global_arena final { * @tparam Upstream Memory resource to use for allocating the global arena. Implements * rmm::mr::device_memory_resource interface. */ -template class arena { public: /** @@ -803,7 +799,7 @@ class arena { * * @param global_arena The global arena from which to allocate superblocks. */ - explicit arena(global_arena& global_arena) : global_arena_{global_arena} {} + explicit arena(global_arena& global_arena) : global_arena_{global_arena} {} // Disable copy (and move) semantics. arena(arena const&) = delete; @@ -837,7 +833,9 @@ class arena { */ bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - if (global_arena_.handles(size) && global_arena_.deallocate(ptr, size, stream)) { return true; } + if (global_arena::handles(size) && global_arena_.deallocate_async(ptr, size, stream)) { + return true; + } return deallocate(ptr, size); } @@ -961,7 +959,7 @@ class arena { } /// The global arena to allocate superblocks from. - global_arena& global_arena_; + global_arena& global_arena_; /// Acquired superblocks. std::set superblocks_; /// Mutex for exclusive lock. @@ -976,10 +974,9 @@ class arena { * @tparam Upstream Memory resource to use for allocating the global arena. Implements * rmm::mr::device_memory_resource interface. */ -template class arena_cleaner { public: - explicit arena_cleaner(std::shared_ptr> const& arena) : arena_(arena) {} + explicit arena_cleaner(std::shared_ptr const& arena) : arena_(arena) {} // Disable copy (and move) semantics. arena_cleaner(arena_cleaner const&) = delete; @@ -997,7 +994,7 @@ class arena_cleaner { private: /// A non-owning pointer to the arena that may need cleaning. 
- std::weak_ptr> arena_; + std::weak_ptr arena_; }; } // namespace mr::detail::arena diff --git a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp index eeebaac3b..fdb385748 100644 --- a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -81,9 +82,9 @@ using failure_callback_t = std::function; * { * bool retried{false}; * failure_callback_adaptor mr{ - * rmm::mr::get_current_device_resource(), failure_handler, &retried + * rmm::mr::get_current_device_resource_ref(), failure_handler, &retried * }; - * rmm::mr::set_current_device_resource(&mr); + * rmm::mr::set_current_device_resource_ref(mr); * } * @endcode * @@ -95,6 +96,21 @@ class failure_callback_resource_adaptor final : public device_memory_resource { public: using exception_type = ExceptionType; ///< The type of exception this object catches/throws + /** + * @brief Construct a new `failure_callback_resource_adaptor` using `upstream` to satisfy + * allocation requests. + * + * @param upstream The resource used for allocating/deallocating device memory + * @param callback Callback function @see failure_callback_t + * @param callback_arg Extra argument passed to `callback` + */ + failure_callback_resource_adaptor(device_async_resource_ref upstream, + failure_callback_t callback, + void* callback_arg) + : upstream_{upstream}, callback_{std::move(callback)}, callback_arg_{callback_arg} + { + } + /** * @brief Construct a new `failure_callback_resource_adaptor` using `upstream` to satisfy * allocation requests. @@ -108,9 +124,10 @@ class failure_callback_resource_adaptor final : public device_memory_resource { failure_callback_resource_adaptor(Upstream* upstream, failure_callback_t callback, void* callback_arg) - : upstream_{upstream}, callback_{std::move(callback)}, callback_arg_{callback_arg} + : upstream_{to_device_async_resource_ref_checked(upstream)}, + callback_{std::move(callback)}, + callback_arg_{callback_arg} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } failure_callback_resource_adaptor() = delete; @@ -130,11 +147,6 @@ class failure_callback_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream @@ -153,7 +165,7 @@ class failure_callback_resource_adaptor final : public device_memory_resource { while (true) { try { - ret = upstream_->allocate(bytes, stream); + ret = get_upstream_resource().allocate_async(bytes, stream); break; } catch (exception_type const& e) { if (!callback_(bytes, callback_arg_)) { throw; } @@ -171,7 +183,7 @@ class failure_callback_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -185,11 +197,12 @@ class failure_callback_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == 
cast->get_upstream_resource(); } - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; failure_callback_t callback_; void* callback_arg_; }; diff --git a/include/rmm/mr/device/fixed_size_memory_resource.hpp b/include/rmm/mr/device/fixed_size_memory_resource.hpp index 84cb3b0c6..249af77dd 100644 --- a/include/rmm/mr/device/fixed_size_memory_resource.hpp +++ b/include/rmm/mr/device/fixed_size_memory_resource.hpp @@ -65,7 +65,31 @@ class fixed_size_memory_resource /** * @brief Construct a new `fixed_size_memory_resource` that allocates memory from - * `upstream_resource`. + * `upstream_mr`. + * + * When the pool of blocks is all allocated, grows the pool by allocating + * `blocks_to_preallocate` more blocks from `upstream_mr`. + * + * @param upstream_mr The device_async_resource_ref from which to allocate blocks for the pool. + * @param block_size The size of blocks to allocate. + * @param blocks_to_preallocate The number of blocks to allocate to initialize the pool. + */ + explicit fixed_size_memory_resource( + device_async_resource_ref upstream_mr, + // NOLINTNEXTLINE bugprone-easily-swappable-parameters + std::size_t block_size = default_block_size, + std::size_t blocks_to_preallocate = default_blocks_to_preallocate) + : upstream_mr_{upstream_mr}, + block_size_{align_up(block_size, CUDA_ALLOCATION_ALIGNMENT)}, + upstream_chunk_size_{block_size_ * blocks_to_preallocate} + { + // allocate initial blocks and insert into free list + this->insert_blocks(std::move(blocks_from_upstream(cuda_stream_legacy)), cuda_stream_legacy); + } + + /** + * @brief Construct a new `fixed_size_memory_resource` that allocates memory from + * `upstream_mr`. * * When the pool of blocks is all allocated, grows the pool by allocating * `blocks_to_preallocate` more blocks from `upstream_mr`. @@ -76,11 +100,12 @@ class fixed_size_memory_resource */ explicit fixed_size_memory_resource( Upstream* upstream_mr, + // NOLINTNEXTLINE bugprone-easily-swappable-parameters std::size_t block_size = default_block_size, std::size_t blocks_to_preallocate = default_blocks_to_preallocate) - : upstream_mr_{upstream_mr}, - block_size_{rmm::align_up(block_size, rmm::CUDA_ALLOCATION_ALIGNMENT)}, - upstream_chunk_size_{block_size * blocks_to_preallocate} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_mr)}, + block_size_{align_up(block_size, CUDA_ALLOCATION_ALIGNMENT)}, + upstream_chunk_size_{block_size_ * blocks_to_preallocate} { // allocate initial blocks and insert into free list this->insert_blocks(std::move(blocks_from_upstream(cuda_stream_legacy)), cuda_stream_legacy); @@ -99,18 +124,13 @@ class fixed_size_memory_resource fixed_size_memory_resource& operator=(fixed_size_memory_resource&&) = delete; /** - * @briefreturn{rmm::device_async_resource_ref to the upstream resource} + * @briefreturn{device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_mr_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_mr_; } - /** * @brief Get the size of blocks allocated by this memory resource. 
* @@ -200,7 +220,7 @@ class fixed_size_memory_resource { // Deallocating a fixed-size block just inserts it in the free list, which is // handled by the parent class - RMM_LOGGING_ASSERT(rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT) <= block_size_); + RMM_LOGGING_ASSERT(align_up(size, CUDA_ALLOCATION_ALIGNMENT) <= block_size_); return block_type{ptr}; } @@ -254,10 +274,10 @@ class fixed_size_memory_resource } private: - Upstream* upstream_mr_; // The resource from which to allocate new blocks + device_async_resource_ref upstream_mr_; // The resource from which to allocate new blocks - std::size_t const block_size_; // size of blocks this MR allocates - std::size_t const upstream_chunk_size_; // size of chunks allocated from heap MR + std::size_t block_size_; // size of blocks this MR allocates + std::size_t upstream_chunk_size_; // size of chunks allocated from heap MR // blocks allocated from heap: so they can be easily freed std::vector upstream_blocks_; diff --git a/include/rmm/mr/device/limiting_resource_adaptor.hpp b/include/rmm/mr/device/limiting_resource_adaptor.hpp index c3ef72e09..d19fa3a0a 100644 --- a/include/rmm/mr/device/limiting_resource_adaptor.hpp +++ b/include/rmm/mr/device/limiting_resource_adaptor.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,24 @@ namespace mr { template class limiting_resource_adaptor final : public device_memory_resource { public: + /** + * @brief Construct a new limiting resource adaptor using `upstream` to satisfy + * allocation requests and limiting the total allocation amount possible. + * + * @param upstream The resource used for allocating/deallocating device memory + * @param allocation_limit Maximum memory allowed for this allocator + * @param alignment Alignment in bytes for the start of each allocated buffer + */ + limiting_resource_adaptor(device_async_resource_ref upstream, + std::size_t allocation_limit, + std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT) + : upstream_{upstream}, + allocation_limit_{allocation_limit}, + allocated_bytes_(0), + alignment_(alignment) + { + } + /** * @brief Construct a new limiting resource adaptor using `upstream` to satisfy * allocation requests and limiting the total allocation amount possible. 
@@ -57,13 +76,12 @@ class limiting_resource_adaptor final : public device_memory_resource { */ limiting_resource_adaptor(Upstream* upstream, std::size_t allocation_limit, - std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) - : allocation_limit_{allocation_limit}, + std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT) + : upstream_{to_device_async_resource_ref_checked(upstream)}, + allocation_limit_{allocation_limit}, allocated_bytes_(0), - alignment_(alignment), - upstream_{upstream} + alignment_(alignment) { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } limiting_resource_adaptor() = delete; @@ -76,18 +94,13 @@ class limiting_resource_adaptor final : public device_memory_resource { default; ///< @default_move_assignment{limiting_resource_adaptor} /** - * @briefreturn{rmm::device_async_resource_ref to the upstream resource} + * @briefreturn{device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Query the number of bytes that have been allocated. Note that * this can not be used to know how large of an allocation is possible due @@ -124,11 +137,11 @@ class limiting_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - auto const proposed_size = rmm::align_up(bytes, alignment_); + auto const proposed_size = align_up(bytes, alignment_); auto const old = allocated_bytes_.fetch_add(proposed_size); if (old + proposed_size <= allocation_limit_) { try { - return upstream_->allocate(bytes, stream); + return get_upstream_resource().allocate_async(bytes, stream); } catch (...) { allocated_bytes_ -= proposed_size; throw; @@ -148,8 +161,8 @@ class limiting_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - std::size_t allocated_size = rmm::align_up(bytes, alignment_); - upstream_->deallocate(ptr, bytes, stream); + std::size_t allocated_size = align_up(bytes, alignment_); + get_upstream_resource().deallocate_async(ptr, bytes, stream); allocated_bytes_ -= allocated_size; } @@ -164,10 +177,13 @@ class limiting_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto const* cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } + // The upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; + // maximum bytes this allocator is allowed to allocate. std::size_t allocation_limit_; @@ -176,9 +192,6 @@ class limiting_resource_adaptor final : public device_memory_resource { // todo: should be some way to ask the upstream... 
std::size_t alignment_; - - Upstream* upstream_; ///< The upstream resource used for satisfying - ///< allocation requests }; /** diff --git a/include/rmm/mr/device/logging_resource_adaptor.hpp b/include/rmm/mr/device/logging_resource_adaptor.hpp index 1b4d80f14..595ab2e4e 100644 --- a/include/rmm/mr/device/logging_resource_adaptor.hpp +++ b/include/rmm/mr/device/logging_resource_adaptor.hpp @@ -77,10 +77,8 @@ class logging_resource_adaptor final : public device_memory_resource { logging_resource_adaptor(Upstream* upstream, std::string const& filename = get_default_filename(), bool auto_flush = false) - : logger_{make_logger(filename)}, upstream_{upstream} + : logger_{make_logger(filename)}, upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - init_logger(auto_flush); } @@ -99,10 +97,8 @@ class logging_resource_adaptor final : public device_memory_resource { * performance. */ logging_resource_adaptor(Upstream* upstream, std::ostream& stream, bool auto_flush = false) - : logger_{make_logger(stream)}, upstream_{upstream} + : logger_{make_logger(stream)}, upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - init_logger(auto_flush); } @@ -123,10 +119,76 @@ class logging_resource_adaptor final : public device_memory_resource { logging_resource_adaptor(Upstream* upstream, spdlog::sinks_init_list sinks, bool auto_flush = false) - : logger_{make_logger(sinks)}, upstream_{upstream} + : logger_{make_logger(sinks)}, upstream_{to_device_async_resource_ref_checked(upstream)} + { + init_logger(auto_flush); + } + + /** + * @brief Construct a new logging resource adaptor using `upstream` to satisfy + * allocation requests and logging information about each allocation/free to + * the file specified by `filename`. + * + * The logfile will be written using CSV formatting. + * + * Clears the contents of `filename` if it already exists. + * + * Creating multiple `logging_resource_adaptor`s with the same `filename` will + * result in undefined behavior. + * + * @throws spdlog::spdlog_ex if opening `filename` failed + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + * @param filename Name of file to write log info. If not specified, retrieves + * the file name from the environment variable "RMM_LOG_FILE". + * @param auto_flush If true, flushes the log for every (de)allocation. Warning, this will degrade + * performance. + */ + logging_resource_adaptor(device_async_resource_ref upstream, + std::string const& filename = get_default_filename(), + bool auto_flush = false) + : logger_{make_logger(filename)}, upstream_{upstream} + { + init_logger(auto_flush); + } + + /** + * @brief Construct a new logging resource adaptor using `upstream` to satisfy + * allocation requests and logging information about each allocation/free to + * the ostream specified by `stream`. + * + * The logfile will be written using CSV formatting. + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + * @param stream The ostream to write log info. + * @param auto_flush If true, flushes the log for every (de)allocation. Warning, this will degrade + * performance. 
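With the resource-ref overload added above, a logging adaptor can wrap whatever the current device resource happens to be. A sketch assuming a log file named "rmm_log.csv" and `auto_flush` enabled (both are arbitrary choices, not defaults from the patch):

```cpp
#include <rmm/mr/device/logging_resource_adaptor.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  // Each allocation and free is written to the file as a CSV record.
  rmm::mr::logging_resource_adaptor<rmm::mr::device_memory_resource> log_mr{
    rmm::mr::get_current_device_resource_ref(), "rmm_log.csv", /*auto_flush=*/true};

  void* ptr = log_mr.allocate(1024);
  log_mr.deallocate(ptr, 1024);
  return 0;
}
```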
+ */ + logging_resource_adaptor(device_async_resource_ref upstream, + std::ostream& stream, + bool auto_flush = false) + : logger_{make_logger(stream)}, upstream_{upstream} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + init_logger(auto_flush); + } + /** + * @brief Construct a new logging resource adaptor using `upstream` to satisfy + * allocation requests and logging information about each allocation/free to + * the ostream specified by `stream`. + * + * The logfile will be written using CSV formatting. + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + * @param sinks A list of logging sinks to which log output will be written. + * @param auto_flush If true, flushes the log for every (de)allocation. Warning, this will degrade + * performance. + */ + logging_resource_adaptor(device_async_resource_ref upstream, + spdlog::sinks_init_list sinks, + bool auto_flush = false) + : logger_{make_logger(sinks)}, upstream_{upstream} + { init_logger(auto_flush); } @@ -147,11 +209,6 @@ class logging_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Flush logger contents. */ @@ -239,7 +296,7 @@ class logging_resource_adaptor final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { try { - auto const ptr = upstream_->allocate(bytes, stream); + auto const ptr = get_upstream_resource().allocate_async(bytes, stream); logger_->info("allocate,{},{},{}", ptr, bytes, fmt::ptr(stream.value())); return ptr; } catch (...) { @@ -265,7 +322,7 @@ class logging_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { logger_->info("free,{},{},{}", ptr, bytes, fmt::ptr(stream.value())); - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -279,14 +336,14 @@ class logging_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto const* cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } std::shared_ptr logger_; ///< spdlog logger object - Upstream* upstream_; ///< The upstream resource used for satisfying - ///< allocation requests + device_async_resource_ref upstream_; ///< The upstream resource used for satisfying + ///< allocation requests }; /** diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index ff6321e25..855d4c0bf 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -193,6 +193,31 @@ inline device_memory_resource* get_per_device_resource(cuda_device_id device_id) : found->second; } +namespace detail { + +// The non-thread-safe implementation of `set_per_device_resource_ref`. This exists because +// we need to call this function from two places: the thread-safe version of +// `set_per_device_resource_ref` and the thread-safe version of `set_per_device_resource`, +// both of which take the lock, so we need an implementation that doesn't take the lock. 
+/// @private +inline device_async_resource_ref set_per_device_resource_ref_unsafe( + cuda_device_id device_id, device_async_resource_ref new_resource_ref) +{ + auto& map = detail::get_ref_map(); + auto const old_itr = map.find(device_id.value()); + // If a resource didn't previously exist for `device_id`, return pointer to initial_resource + // Note: because resource_ref is not default-constructible, we can't use std::map::operator[] + if (old_itr == map.end()) { + map.insert({device_id.value(), new_resource_ref}); + return device_async_resource_ref{detail::initial_resource()}; + } + + auto old_resource_ref = old_itr->second; + old_itr->second = new_resource_ref; // update map directly via iterator + return old_resource_ref; +} +} // namespace detail + /** * @brief Set the `device_memory_resource` for the specified device. * @@ -224,6 +249,14 @@ inline device_memory_resource* set_per_device_resource(cuda_device_id device_id, device_memory_resource* new_mr) { std::lock_guard lock{detail::map_lock()}; + + // Note: even though set_per_device_resource() and set_per_device_resource_ref() are not + // interchangeable, we call the latter from the former to maintain resource_ref + // state consistent with the resource pointer state. This is necessary because the + // Python API still uses the raw pointer API. Once the Python API is updated to use + // resource_ref, this call can be removed. + detail::set_per_device_resource_ref_unsafe(device_id, new_mr); + auto& map = detail::get_map(); auto const old_itr = map.find(device_id.value()); // If a resource didn't previously exist for `id`, return pointer to initial_resource @@ -350,18 +383,7 @@ inline device_async_resource_ref set_per_device_resource_ref( cuda_device_id device_id, device_async_resource_ref new_resource_ref) { std::lock_guard lock{detail::ref_map_lock()}; - auto& map = detail::get_ref_map(); - auto const old_itr = map.find(device_id.value()); - // If a resource didn't previously exist for `device_id`, return pointer to initial_resource - // Note: because resource_ref is not default-constructible, we can't use std::map::operator[] - if (old_itr == map.end()) { - map.insert({device_id.value(), new_resource_ref}); - return device_async_resource_ref{detail::initial_resource()}; - } - - auto old_resource_ref = old_itr->second; - old_itr->second = new_resource_ref; // update map directly via iterator - return old_resource_ref; + return detail::set_per_device_resource_ref_unsafe(device_id, new_resource_ref); } /** diff --git a/include/rmm/mr/device/polymorphic_allocator.hpp b/include/rmm/mr/device/polymorphic_allocator.hpp index 6fb068410..442632d4f 100644 --- a/include/rmm/mr/device/polymorphic_allocator.hpp +++ b/include/rmm/mr/device/polymorphic_allocator.hpp @@ -52,7 +52,7 @@ class polymorphic_allocator { using value_type = T; ///< T, the value type of objects allocated by this allocator /** * @brief Construct a `polymorphic_allocator` using the return value of - * `rmm::mr::get_current_device_resource()` as the underlying memory resource. + * `rmm::mr::get_current_device_resource_ref()` as the underlying memory resource. 
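Because the public setters take the lock and then delegate to the unlocked helper, the pointer-based and ref-based per-device state stay consistent. A usage sketch of the ref-based API, assuming device 0 is the active device (the device id and sizes are arbitrary):

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr;
  rmm::cuda_device_id const device{0};

  // Install a resource for device 0 and keep the previous ref for restoration.
  auto old_ref = rmm::mr::set_per_device_resource_ref(device, cuda_mr);

  auto current = rmm::mr::get_per_device_resource_ref(device);
  void* ptr    = current.allocate_async(256, rmm::cuda_stream_view{});
  current.deallocate_async(ptr, 256, rmm::cuda_stream_view{});

  rmm::mr::set_per_device_resource_ref(device, old_ref);
  return 0;
}
```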
* */ polymorphic_allocator() = default; @@ -116,7 +116,7 @@ class polymorphic_allocator { private: rmm::device_async_resource_ref mr_{ - get_current_device_resource()}; ///< Underlying resource used for (de)allocation + get_current_device_resource_ref()}; ///< Underlying resource used for (de)allocation }; /** diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index d22b53404..f63de21ff 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -114,6 +115,33 @@ class pool_memory_resource final friend class detail::stream_ordered_memory_resource, detail::coalescing_free_list>; + /** + * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using + * `upstream_mr`. + * + * @throws rmm::logic_error if `initial_pool_size` is not aligned to a multiple of + * pool_memory_resource::allocation_alignment bytes. + * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a + * multiple of pool_memory_resource::allocation_alignment bytes. + * + * @param upstream_mr The memory_resource from which to allocate blocks for the pool. + * @param initial_pool_size Minimum size, in bytes, of the initial pool. + * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all + * of the available from the upstream resource. + */ + explicit pool_memory_resource(device_async_resource_ref upstream_mr, + std::size_t initial_pool_size, + std::optional maximum_pool_size = std::nullopt) + : upstream_mr_{upstream_mr} + { + RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT), + "Error, Initial pool size required to be a multiple of 256 bytes"); + RMM_EXPECTS(rmm::is_aligned(maximum_pool_size.value_or(0), rmm::CUDA_ALLOCATION_ALIGNMENT), + "Error, Maximum pool size required to be a multiple of 256 bytes"); + + initialize_pool(initial_pool_size, maximum_pool_size); + } + /** * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using * `upstream_mr`. 
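The new resource-ref constructor keeps the same alignment requirements as the pointer-based one. A construction sketch over the current device resource, with illustrative 64 MiB initial and 128 MiB maximum sizes (both multiples of 256 bytes, as required):

```cpp
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <cstddef>

int main()
{
  auto const initial = std::size_t{64} << 20;   // 64 MiB
  auto const maximum = std::size_t{128} << 20;  // 128 MiB

  // Sizes not aligned to rmm::CUDA_ALLOCATION_ALIGNMENT make this constructor
  // throw rmm::logic_error.
  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool{
    rmm::mr::get_current_device_resource_ref(), initial, maximum};

  void* ptr = pool.allocate(1 << 20);
  pool.deallocate(ptr, 1 << 20);
  return 0;
}
```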
@@ -132,10 +160,7 @@ class pool_memory_resource final explicit pool_memory_resource(Upstream* upstream_mr, std::size_t initial_pool_size, std::optional maximum_pool_size = std::nullopt) - : upstream_mr_{[upstream_mr]() { - RMM_EXPECTS(nullptr != upstream_mr, "Unexpected null upstream pointer."); - return upstream_mr; - }()} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_mr)} { RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT), "Error, Initial pool size required to be a multiple of 256 bytes"); @@ -184,16 +209,11 @@ class pool_memory_resource final /** * @briefreturn{rmm::device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_mr_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_mr_; } - /** * @brief Computes the size of the current pool * @@ -466,7 +486,8 @@ class pool_memory_resource final } private: - Upstream* upstream_mr_; // The "heap" to allocate the pool from + // The "heap" to allocate the pool from + device_async_resource_ref upstream_mr_; std::size_t current_pool_size_{}; std::optional maximum_pool_size_{}; diff --git a/include/rmm/mr/device/prefetch_resource_adaptor.hpp b/include/rmm/mr/device/prefetch_resource_adaptor.hpp index 59ce8e036..d3a4c676a 100644 --- a/include/rmm/mr/device/prefetch_resource_adaptor.hpp +++ b/include/rmm/mr/device/prefetch_resource_adaptor.hpp @@ -41,6 +41,14 @@ namespace mr { template class prefetch_resource_adaptor final : public device_memory_resource { public: + /** + * @brief Construct a new prefetch resource adaptor using `upstream` to satisfy + * allocation requests. + * + * @param upstream The resource_ref used for allocating/deallocating device memory + */ + prefetch_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {} + /** * @brief Construct a new prefetch resource adaptor using `upstream` to satisfy * allocation requests. 
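The prefetch adaptor is usually layered over managed memory so that each allocation can be prefetched to the active device; the following sketch assumes that pairing (a common use, not something mandated by the patch):

```cpp
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/mr/device/prefetch_resource_adaptor.hpp>
#include <rmm/resource_ref.hpp>

int main()
{
  rmm::mr::managed_memory_resource managed_mr;

  // Upstream passed by resource ref via the new constructor.
  rmm::mr::prefetch_resource_adaptor<rmm::mr::managed_memory_resource> prefetch_mr{
    rmm::device_async_resource_ref{managed_mr}};

  void* ptr = prefetch_mr.allocate(1 << 20);  // prefetched to the current device
  prefetch_mr.deallocate(ptr, 1 << 20);
  return 0;
}
```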
@@ -49,9 +57,9 @@ class prefetch_resource_adaptor final : public device_memory_resource { * * @param upstream The resource used for allocating/deallocating device memory */ - prefetch_resource_adaptor(Upstream* upstream) : upstream_{upstream} + prefetch_resource_adaptor(Upstream* upstream) + : upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } prefetch_resource_adaptor() = delete; @@ -71,11 +79,6 @@ class prefetch_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream @@ -92,7 +95,7 @@ class prefetch_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_->allocate(bytes, stream); + void* ptr = get_upstream_resource().allocate_async(bytes, stream); rmm::prefetch(ptr, bytes, rmm::get_current_cuda_device(), stream); return ptr; } @@ -106,7 +109,7 @@ class prefetch_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -120,11 +123,12 @@ class prefetch_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; }; /** @} */ // end of group diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp index bf78c669a..025c51aa7 100644 --- a/include/rmm/mr/device/statistics_resource_adaptor.hpp +++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -113,17 +114,25 @@ class statistics_resource_adaptor final : public device_memory_resource { } }; + /** + * @brief Construct a new statistics resource adaptor using `upstream` to satisfy + * allocation requests. + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + */ + statistics_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {} + /** * @brief Construct a new statistics resource adaptor using `upstream` to satisfy * allocation requests. * * @throws rmm::logic_error if `upstream == nullptr` * - * @param upstream The resource used for allocating/deallocating device memory + * @param upstream The resource used for allocating/deallocating device memory. 
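Note the changed `do_is_equal` semantics above: when the `dynamic_cast` fails, the adaptor now reports inequality instead of deferring to the upstream's own `is_equal`. Two adaptors of the same type over the same upstream still compare equal through their resource refs, as in this sketch:

```cpp
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/mr/device/prefetch_resource_adaptor.hpp>

#include <cassert>

int main()
{
  rmm::mr::managed_memory_resource managed_mr;

  rmm::mr::prefetch_resource_adaptor<rmm::mr::managed_memory_resource> a{&managed_mr};
  rmm::mr::prefetch_resource_adaptor<rmm::mr::managed_memory_resource> b{&managed_mr};

  assert(a.is_equal(b));  // same adaptor type, same upstream resource
  return 0;
}
```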
*/ - statistics_resource_adaptor(Upstream* upstream) : upstream_{upstream} + statistics_resource_adaptor(Upstream* upstream) + : upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } statistics_resource_adaptor() = delete; @@ -143,11 +152,6 @@ class statistics_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Returns a `counter` struct for this adaptor containing the current, * peak, and total number of allocated bytes for this @@ -226,7 +230,7 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_->allocate(bytes, stream); + void* ptr = get_upstream_resource().allocate_async(bytes, stream); // increment the stats { @@ -249,7 +253,7 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); { write_lock_t lock(mtx_); @@ -271,7 +275,7 @@ class statistics_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -279,14 +283,14 @@ class statistics_resource_adaptor final : public device_memory_resource { // Invariant: the stack always contains at least one entry std::stack> counter_stack_{{std::make_pair(counter{}, counter{})}}; std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_ - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; }; /** * @brief Convenience factory to return a `statistics_resource_adaptor` around the * upstream resource `upstream`. * - * @tparam Upstream Type of the upstream `device_memory_resource`. * @param upstream Pointer to the upstream resource * @return The new statistics resource adaptor */ @@ -297,7 +301,7 @@ template "instead.")]] statistics_resource_adaptor make_statistics_adaptor(Upstream* upstream) { - return statistics_resource_adaptor{upstream}; + return statistics_resource_adaptor{upstream}; } /** @} */ // end of group diff --git a/include/rmm/mr/device/thread_safe_resource_adaptor.hpp b/include/rmm/mr/device/thread_safe_resource_adaptor.hpp index 9979d1e08..6881aa19e 100644 --- a/include/rmm/mr/device/thread_safe_resource_adaptor.hpp +++ b/include/rmm/mr/device/thread_safe_resource_adaptor.hpp @@ -45,6 +45,16 @@ class thread_safe_resource_adaptor final : public device_memory_resource { public: using lock_t = std::lock_guard; ///< Type of lock used to synchronize access + /** + * @brief Construct a new thread safe resource adaptor using `upstream` to satisfy + * allocation requests. + * + * All allocations and frees are protected by a mutex lock + * + * @param upstream The resource used for allocating/deallocating device memory. 
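Aside from the constructor change, the statistics adaptor's counter interface is unchanged; a usage sketch over the current device resource (the 1 MiB allocation is arbitrary):

```cpp
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/statistics_resource_adaptor.hpp>

#include <iostream>

int main()
{
  rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource> stats_mr{
    rmm::mr::get_current_device_resource_ref()};

  void* ptr = stats_mr.allocate(1 << 20);
  stats_mr.deallocate(ptr, 1 << 20);

  // After the free: value drops to 0, while peak and total both record the 1 MiB.
  auto const bytes = stats_mr.get_bytes_counter();
  std::cout << bytes.value << " " << bytes.peak << " " << bytes.total << "\n";
  return 0;
}
```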
+ */ + thread_safe_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {} + /** * @brief Construct a new thread safe resource adaptor using `upstream` to satisfy * allocation requests. @@ -55,9 +65,9 @@ class thread_safe_resource_adaptor final : public device_memory_resource { * * @param upstream The resource used for allocating/deallocating device memory. */ - thread_safe_resource_adaptor(Upstream* upstream) : upstream_{upstream} + thread_safe_resource_adaptor(Upstream* upstream) + : upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } thread_safe_resource_adaptor() = delete; @@ -75,11 +85,6 @@ class thread_safe_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream @@ -95,7 +100,7 @@ class thread_safe_resource_adaptor final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { lock_t lock(mtx); - return upstream_->allocate(bytes, stream); + return get_upstream_resource().allocate_async(bytes, stream); } /** @@ -108,7 +113,7 @@ class thread_safe_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { lock_t lock(mtx); - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -122,12 +127,13 @@ class thread_safe_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } std::mutex mutable mtx; // mutex for thread safe access to upstream - Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests + device_async_resource_ref + upstream_; ///< The upstream resource used for satisfying allocation requests }; /** @} */ // end of group diff --git a/include/rmm/mr/device/thrust_allocator_adaptor.hpp b/include/rmm/mr/device/thrust_allocator_adaptor.hpp index 2055a0633..b7b990c3d 100644 --- a/include/rmm/mr/device/thrust_allocator_adaptor.hpp +++ b/include/rmm/mr/device/thrust_allocator_adaptor.hpp @@ -150,7 +150,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { private: cuda_stream_view _stream{}; - rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()}; + rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource_ref()}; cuda_device_id _device{get_current_cuda_device()}; }; /** @} */ // end of group diff --git a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp index 3d3188b23..6a5916e5c 100644 --- a/include/rmm/mr/device/tracking_resource_adaptor.hpp +++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -83,6 +84,18 @@ class tracking_resource_adaptor final : public device_memory_resource { allocation_size{size} {}; }; + /** + * @brief Construct a new tracking resource adaptor using `upstream` to satisfy + * allocation requests. 
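The thread-safe adaptor keeps its mutex-per-call behavior; only the way the upstream is stored changes. A sketch with several host threads sharing one adaptor (thread count and sizes are arbitrary, and the default CUDA resource used as upstream is already thread safe, so this is purely illustrative):

```cpp
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/thread_safe_resource_adaptor.hpp>

#include <thread>
#include <vector>

int main()
{
  rmm::mr::thread_safe_resource_adaptor<rmm::mr::device_memory_resource> ts_mr{
    rmm::mr::get_current_device_resource_ref()};

  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([&ts_mr] {
      void* ptr = ts_mr.allocate(4096);  // serialized by the adaptor's mutex
      ts_mr.deallocate(ptr, 4096);
    });
  }
  for (auto& worker : workers) { worker.join(); }
  return 0;
}
```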
+ * + * @param upstream The resource used for allocating/deallocating device memory + * @param capture_stacks If true, capture stacks for allocation calls + */ + tracking_resource_adaptor(device_async_resource_ref upstream, bool capture_stacks = false) + : capture_stacks_{capture_stacks}, allocated_bytes_{0}, upstream_{upstream} + { + } + /** * @brief Construct a new tracking resource adaptor using `upstream` to satisfy * allocation requests. @@ -93,9 +106,10 @@ class tracking_resource_adaptor final : public device_memory_resource { * @param capture_stacks If true, capture stacks for allocation calls */ tracking_resource_adaptor(Upstream* upstream, bool capture_stacks = false) - : capture_stacks_{capture_stacks}, allocated_bytes_{0}, upstream_{upstream} + : capture_stacks_{capture_stacks}, + allocated_bytes_{0}, + upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } tracking_resource_adaptor() = delete; @@ -115,11 +129,6 @@ class tracking_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Get the outstanding allocations map * @@ -199,8 +208,7 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_->allocate(bytes, stream); - + void* ptr = get_upstream_resource().allocate_async(bytes, stream); // track it. { write_lock_t lock(mtx_); @@ -220,7 +228,7 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); { write_lock_t lock(mtx_); @@ -265,7 +273,7 @@ class tracking_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -273,7 +281,8 @@ class tracking_resource_adaptor final : public device_memory_resource { std::map allocations_; // map of active allocations std::atomic allocated_bytes_; // number of bytes currently allocated std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_ - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; // the upstream resource used for satisfying + // allocation requests }; /** diff --git a/include/rmm/resource_ref.hpp b/include/rmm/resource_ref.hpp index 08942a040..56049522f 100644 --- a/include/rmm/resource_ref.hpp +++ b/include/rmm/resource_ref.hpp @@ -65,5 +65,21 @@ using host_device_resource_ref = using host_device_async_resource_ref = cuda::mr::async_resource_ref; +/** + * @brief Convert pointer to memory resource into `device_async_resource_ref`, checking for + * `nullptr` + * + * @tparam Resource The type of the memory resource. + * @param res A pointer to the memory resource. + * @return A `device_async_resource_ref` to the memory resource. + * @throws std::logic_error if the memory resource pointer is null. 
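The tracking adaptor's query interface is unchanged by the ref conversion. A brief sketch (`capture_stacks` left off, allocation size arbitrary):

```cpp
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/tracking_resource_adaptor.hpp>

#include <iostream>

int main()
{
  rmm::mr::tracking_resource_adaptor<rmm::mr::device_memory_resource> track_mr{
    rmm::mr::get_current_device_resource_ref(), /*capture_stacks=*/false};

  void* ptr = track_mr.allocate(1 << 20);
  std::cout << "outstanding bytes: " << track_mr.get_allocated_bytes() << "\n";
  track_mr.deallocate(ptr, 1 << 20);
  return 0;
}
```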
+ */ +template +device_async_resource_ref to_device_async_resource_ref_checked(Resource* res) +{ + RMM_EXPECTS(res, "Unexpected null resource pointer."); + return device_async_resource_ref{*res}; +} + /** @} */ // end of group } // namespace RMM_NAMESPACE diff --git a/python/rmm/rmm/_lib/_torch_allocator.cpp b/python/rmm/rmm/_lib/_torch_allocator.cpp index dc92e4639..bfe94c2d0 100644 --- a/python/rmm/rmm/_lib/_torch_allocator.cpp +++ b/python/rmm/rmm/_lib/_torch_allocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,8 +39,9 @@ extern "C" void* allocate(std::size_t size, int device, void* stream) { rmm::cuda_device_id const device_id{device}; rmm::cuda_set_device_raii with_device{device_id}; - auto mr = rmm::mr::get_per_device_resource(device_id); - return mr->allocate(size, rmm::cuda_stream_view{static_cast(stream)}); + auto mr = rmm::mr::get_per_device_resource_ref(device_id); + return mr.allocate_async( + size, rmm::CUDA_ALLOCATION_ALIGNMENT, rmm::cuda_stream_view{static_cast(stream)}); } /** @@ -55,6 +56,9 @@ extern "C" void deallocate(void* ptr, std::size_t size, int device, void* stream { rmm::cuda_device_id const device_id{device}; rmm::cuda_set_device_raii with_device{device_id}; - auto mr = rmm::mr::get_per_device_resource(device_id); - mr->deallocate(ptr, size, rmm::cuda_stream_view{static_cast(stream)}); + auto mr = rmm::mr::get_per_device_resource_ref(device_id); + mr.deallocate_async(ptr, + size, + rmm::CUDA_ALLOCATION_ALIGNMENT, + rmm::cuda_stream_view{static_cast(stream)}); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5a4f88166..ea1af58cd 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -40,6 +40,7 @@ function(ConfigureTestInternal TEST_NAME) PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") target_compile_options(${TEST_NAME} PUBLIC $<$:-Wall -Werror -Wno-error=deprecated-declarations>) + target_compile_options(${TEST_NAME} PUBLIC "$<$:-O0>") if(DISABLE_DEPRECATION_WARNING) target_compile_options( diff --git a/tests/container_multidevice_tests.cu b/tests/container_multidevice_tests.cu index e58ba53a2..55432feb0 100644 --- a/tests/container_multidevice_tests.cu +++ b/tests/container_multidevice_tests.cu @@ -42,9 +42,9 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateDestroyDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); { if constexpr (std::is_same_v>) { @@ -57,7 +57,7 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateDestroyDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } @@ -69,9 +69,9 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateMoveDestroyDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = 
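`to_device_async_resource_ref_checked` is now the single place where the null-pointer check lives, so every pointer-taking adaptor constructor inherits it. A sketch of both the success and failure paths (sizes and the explicit alignment are illustrative):

```cpp
#include <rmm/aligned.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/error.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr;

  // Non-null pointer: wrapped into a device_async_resource_ref.
  auto ref = rmm::to_device_async_resource_ref_checked(&cuda_mr);
  void* ptr =
    ref.allocate_async(256, rmm::CUDA_ALLOCATION_ALIGNMENT, rmm::cuda_stream_view{});
  ref.deallocate_async(ptr, 256, rmm::CUDA_ALLOCATION_ALIGNMENT, rmm::cuda_stream_view{});

  // Null pointer: RMM_EXPECTS fires and rmm::logic_error is thrown.
  rmm::mr::cuda_memory_resource* null_mr{nullptr};
  try {
    auto bad = rmm::to_device_async_resource_ref_checked(null_mr);
    (void)bad;
  } catch (rmm::logic_error const&) {
  }
  return 0;
}
```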
device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); { auto buf_1 = []() { @@ -97,7 +97,7 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateMoveDestroyDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } @@ -109,9 +109,9 @@ TYPED_TEST(ContainerMultiDeviceTest, ResizeDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); if constexpr (not std::is_same_v>) { auto buf = TypeParam(128, rmm::cuda_stream_view{}); @@ -120,7 +120,7 @@ TYPED_TEST(ContainerMultiDeviceTest, ResizeDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } @@ -132,9 +132,9 @@ TYPED_TEST(ContainerMultiDeviceTest, ShrinkDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); if constexpr (not std::is_same_v>) { auto buf = TypeParam(128, rmm::cuda_stream_view{}); @@ -144,6 +144,6 @@ TYPED_TEST(ContainerMultiDeviceTest, ShrinkDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } diff --git a/tests/cuda_stream_tests.cpp b/tests/cuda_stream_tests.cpp index 1cc068434..ec7e6c3e9 100644 --- a/tests/cuda_stream_tests.cpp +++ b/tests/cuda_stream_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
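The save/wrap/restore pattern used throughout these tests generalizes to any adaptor. A sketch using `statistics_resource_adaptor` in place of the test-only `device_check_resource_adaptor`:

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/statistics_resource_adaptor.hpp>

int main()
{
  auto orig = rmm::mr::get_current_device_resource_ref();

  rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource> wrapped{orig};
  rmm::mr::set_current_device_resource_ref(wrapped);

  {
    rmm::device_buffer buf{128, rmm::cuda_stream_view{}};  // served by `wrapped`
  }

  rmm::mr::set_current_device_resource_ref(orig);  // restore before `wrapped` is destroyed
  return 0;
}
```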
@@ -96,6 +96,6 @@ TEST_F(CudaStreamDeathTest, TestSyncNoThrow) // should assert here or in `~cuda_stream()` stream_a.synchronize_no_throw(); }; - EXPECT_DEATH(test(), "Assertion"); + EXPECT_DEATH(test(), ""); } #endif diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index c095eecf8..5e48504d6 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -75,7 +75,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -87,7 +87,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } @@ -121,7 +121,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); @@ -138,7 +138,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); @@ -152,7 +152,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -180,7 +180,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff_copy.memory_resource(), - rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); + rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -223,7 +223,7 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_EQ(buff_copy.memory_resource(), - rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); + rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), diff --git 
a/tests/device_check_resource_adaptor.hpp b/tests/device_check_resource_adaptor.hpp index fcb578fdf..6780f56d7 100644 --- a/tests/device_check_resource_adaptor.hpp +++ b/tests/device_check_resource_adaptor.hpp @@ -17,13 +17,14 @@ #include #include #include +#include #include #include class device_check_resource_adaptor final : public rmm::mr::device_memory_resource { public: - device_check_resource_adaptor(rmm::mr::device_memory_resource* upstream) + device_check_resource_adaptor(rmm::device_async_resource_ref upstream) : device_id{rmm::get_current_cuda_device()}, upstream_(upstream) { } @@ -36,11 +37,6 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour return upstream_; } - /** - * @briefreturn{device_memory_resource* to the upstream memory resource} - */ - [[nodiscard]] device_memory_resource* get_upstream() const noexcept { return upstream_; } - private: [[nodiscard]] bool check_device_id() const { return device_id == rmm::get_current_cuda_device(); } @@ -48,7 +44,7 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { bool const is_correct_device = check_device_id(); EXPECT_TRUE(is_correct_device); - if (is_correct_device) { return upstream_->allocate(bytes, stream); } + if (is_correct_device) { return get_upstream_resource().allocate_async(bytes, stream); } return nullptr; } @@ -56,7 +52,7 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { bool const is_correct_device = check_device_id(); EXPECT_TRUE(is_correct_device); - if (is_correct_device) { upstream_->deallocate(ptr, bytes, stream); } + if (is_correct_device) { get_upstream_resource().deallocate_async(ptr, bytes, stream); } } [[nodiscard]] bool do_is_equal( @@ -64,10 +60,10 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { if (this == &other) { return true; } auto const* cast = dynamic_cast(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } rmm::cuda_device_id device_id; - rmm::mr::device_memory_resource* upstream_{}; + rmm::device_async_resource_ref upstream_; }; diff --git a/tests/device_scalar_tests.cpp b/tests/device_scalar_tests.cpp index 6f80a5de1..323894a6a 100644 --- a/tests/device_scalar_tests.cpp +++ b/tests/device_scalar_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,7 +38,7 @@ struct DeviceScalarTest : public ::testing::Test { std::default_random_engine generator{}; T value{}; rmm::cuda_stream stream{}; - rmm::device_async_resource_ref mr{rmm::mr::get_current_device_resource()}; + rmm::device_async_resource_ref mr{rmm::mr::get_current_device_resource_ref()}; DeviceScalarTest() : value{random_value()} {} diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 1c93ef138..90955c24c 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -39,7 +39,7 @@ TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); EXPECT_EQ(vec.memory_resource(), - rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); + rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) diff --git a/tests/mock_resource.hpp b/tests/mock_resource.hpp index e06148d3a..555cf0d74 100644 --- a/tests/mock_resource.hpp +++ b/tests/mock_resource.hpp @@ -25,7 +25,12 @@ class mock_resource : public rmm::mr::device_memory_resource { public: MOCK_METHOD(void*, do_allocate, (std::size_t, cuda_stream_view), (override)); MOCK_METHOD(void, do_deallocate, (void*, std::size_t, cuda_stream_view), (override)); + bool operator==(mock_resource const&) const noexcept { return true; } + bool operator!=(mock_resource const&) const { return false; } + friend void get_property(mock_resource const&, cuda::mr::device_accessible) noexcept {} using size_pair = std::pair; }; +static_assert(cuda::mr::async_resource_with); + } // namespace rmm::test diff --git a/tests/mr/device/adaptor_tests.cpp b/tests/mr/device/adaptor_tests.cpp index a757a78b0..286daa841 100644 --- a/tests/mr/device/adaptor_tests.cpp +++ b/tests/mr/device/adaptor_tests.cpp @@ -129,8 +129,7 @@ TYPED_TEST(AdaptorTest, Equality) } { - rmm::mr::device_memory_resource* device_mr = &this->cuda; - auto other_mr = aligned_resource_adaptor{device_mr}; + auto other_mr = aligned_resource_adaptor{&this->cuda}; EXPECT_FALSE(this->mr->is_equal(other_mr)); } } diff --git a/tests/mr/device/aligned_mr_tests.cpp b/tests/mr/device/aligned_mr_tests.cpp index b9ecbc8ca..9b90bf751 100644 --- a/tests/mr/device/aligned_mr_tests.cpp +++ b/tests/mr/device/aligned_mr_tests.cpp @@ -59,13 +59,13 @@ TEST(AlignedTest, ThrowOnInvalidAllocationAlignment) TEST(AlignedTest, SupportsGetMemInfo) { mock_resource mock; - aligned_mock mr{&mock}; + aligned_mock mr{mock}; } TEST(AlignedTest, DefaultAllocationAlignmentPassthrough) { mock_resource mock; - aligned_mock mr{&mock}; + aligned_mock mr{mock}; cuda_stream_view stream; void* const pointer = int_to_address(123); @@ -204,7 +204,7 @@ TEST(AlignedTest, AlignRealPointer) { auto const alignment{4096}; auto const threshold{65536}; - aligned_real mr{rmm::mr::get_current_device_resource(), alignment, threshold}; + aligned_real mr{rmm::mr::get_current_device_resource_ref(), alignment, threshold}; void* alloc = mr.allocate(threshold); EXPECT_TRUE(rmm::is_pointer_aligned(alloc, alignment)); mr.deallocate(alloc, threshold); diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 1eb38888e..bdc0f2438 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -23,6 +23,9 @@ #include #include #include +#include + +#include #include #include @@ -37,15 +40,22 @@ namespace { class mock_memory_resource { public: - MOCK_METHOD(void*, allocate, (std::size_t)); - MOCK_METHOD(void, deallocate, (void*, 
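For a type to be usable through `device_async_resource_ref`, it has to model the libcu++ async resource concept itself: (de)allocate members with an alignment parameter, the `_async` variants taking a `cuda::stream_ref`, equality comparison, and a `get_property` hook for `cuda::mr::device_accessible`. A standalone sketch of such a minimal type (it uses `malloc` purely to keep the example self-contained, and depending on the libcu++ version the `<cuda/memory_resource>` header may require the experimental memory-resource opt-in macro):

```cpp
#include <cuda/memory_resource>
#include <cuda/stream_ref>

#include <cstddef>
#include <cstdlib>

struct minimal_device_resource {
  void* allocate(std::size_t bytes, std::size_t) { return std::malloc(bytes); }
  void deallocate(void* ptr, std::size_t, std::size_t) { std::free(ptr); }

  void* allocate_async(std::size_t bytes, std::size_t, cuda::stream_ref)
  {
    return std::malloc(bytes);
  }
  void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref)
  {
    std::free(ptr);
  }

  bool operator==(minimal_device_resource const&) const { return true; }
  bool operator!=(minimal_device_resource const&) const { return false; }

  // Claims device accessibility so the concept is satisfied; a real resource
  // would have to actually return device-accessible memory.
  friend void get_property(minimal_device_resource const&,
                           cuda::mr::device_accessible) noexcept
  {
  }
};

static_assert(
  cuda::mr::async_resource_with<minimal_device_resource, cuda::mr::device_accessible>);
```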
std::size_t)); + MOCK_METHOD(void*, allocate, (std::size_t, std::size_t)); + MOCK_METHOD(void, deallocate, (void*, std::size_t, std::size_t)); + MOCK_METHOD(void*, allocate_async, (std::size_t, std::size_t, cuda::stream_ref)); + MOCK_METHOD(void, deallocate_async, (void*, std::size_t, std::size_t, cuda::stream_ref)); + bool operator==(mock_memory_resource const&) const noexcept { return true; } + bool operator!=(mock_memory_resource const&) const { return false; } + friend void get_property(mock_memory_resource const&, cuda::mr::device_accessible) noexcept {} }; +static_assert(cuda::mr::async_resource_with); + using rmm::mr::detail::arena::block; using rmm::mr::detail::arena::byte_span; using rmm::mr::detail::arena::superblock; -using global_arena = rmm::mr::detail::arena::global_arena; -using arena = rmm::mr::detail::arena::arena; +using global_arena = rmm::mr::detail::arena::global_arena; +using arena = rmm::mr::detail::arena::arena; using arena_mr = rmm::mr::arena_memory_resource; using ::testing::Return; @@ -59,9 +69,10 @@ auto const fake_address4 = reinterpret_cast(superblock::minimum_size * 2) struct ArenaTest : public ::testing::Test { void SetUp() override { - EXPECT_CALL(mock_mr, allocate(arena_size)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size)); - global = std::make_unique(&mock_mr, arena_size); + EXPECT_CALL(mock_mr, allocate(arena_size, ::testing::_)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size, ::testing::_)); + + global = std::make_unique(mock_mr, arena_size); per_thread = std::make_unique(*global); } @@ -293,13 +304,6 @@ TEST_F(ArenaTest, SuperblockMaxFreeSizeWhenFull) // NOLINT /** * Test global_arena. */ - -TEST_F(ArenaTest, GlobalArenaNullUpstream) // NOLINT -{ - auto construct_nullptr = []() { global_arena global{nullptr, std::nullopt}; }; - EXPECT_THROW(construct_nullptr(), rmm::logic_error); // NOLINT(cppcoreguidelines-avoid-goto) -} - TEST_F(ArenaTest, GlobalArenaAcquire) // NOLINT { auto const sblk = global->acquire(256); @@ -378,7 +382,7 @@ TEST_F(ArenaTest, GlobalArenaDeallocate) // NOLINT { auto* ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); - global->deallocate(ptr, superblock::minimum_size * 2, {}); + global->deallocate_async(ptr, superblock::minimum_size * 2, {}); ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); } @@ -387,8 +391,8 @@ TEST_F(ArenaTest, GlobalArenaDeallocateAlignUp) // NOLINT { auto* ptr = global->allocate(superblock::minimum_size + 256); auto* ptr2 = global->allocate(superblock::minimum_size + 512); - global->deallocate(ptr, superblock::minimum_size + 256, {}); - global->deallocate(ptr2, superblock::minimum_size + 512, {}); + global->deallocate_async(ptr, superblock::minimum_size + 256, {}); + global->deallocate_async(ptr2, superblock::minimum_size + 512, {}); EXPECT_EQ(global->allocate(arena_size), fake_address3); } @@ -479,7 +483,7 @@ TEST_F(ArenaTest, ThrowOnNullUpstream) // NOLINT TEST_F(ArenaTest, SizeSmallerThanSuperblockSize) // NOLINT { - auto construct_small = []() { arena_mr mr{rmm::mr::get_current_device_resource(), 256}; }; + auto construct_small = []() { arena_mr mr{rmm::mr::get_current_device_resource_ref(), 256}; }; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) EXPECT_THROW(construct_small(), rmm::logic_error); } @@ -490,14 +494,14 @@ TEST_F(ArenaTest, AllocateNinetyPercent) // NOLINT auto const free = rmm::available_device_memory().first; auto const 
ninety_percent = rmm::align_up( static_cast(static_cast(free) * 0.9), rmm::CUDA_ALLOCATION_ALIGNMENT); - arena_mr mr(rmm::mr::get_current_device_resource(), ninety_percent); + arena_mr mr(rmm::mr::get_current_device_resource_ref(), ninety_percent); }()); } TEST_F(ArenaTest, SmallMediumLarge) // NOLINT { EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) - arena_mr mr(rmm::mr::get_current_device_resource()); + arena_mr mr(rmm::mr::get_current_device_resource_ref()); auto* small = mr.allocate(256); auto* medium = mr.allocate(64_MiB); auto const free = rmm::available_device_memory().first; @@ -512,7 +516,7 @@ TEST_F(ArenaTest, Defragment) // NOLINT { EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) auto const arena_size = superblock::minimum_size * 4; - arena_mr mr(rmm::mr::get_current_device_resource(), arena_size); + arena_mr mr(rmm::mr::get_current_device_resource_ref(), arena_size); std::vector threads; std::size_t num_threads{4}; threads.reserve(num_threads); @@ -539,7 +543,7 @@ TEST_F(ArenaTest, PerThreadToStreamDealloc) // NOLINT // arena that then moved to global arena during a defragmentation // and then moved to a stream arena. auto const arena_size = superblock::minimum_size * 2; - arena_mr mr(rmm::mr::get_current_device_resource(), arena_size); + arena_mr mr(rmm::mr::get_current_device_resource_ref(), arena_size); // Create an allocation from a per thread arena void* thread_ptr = mr.allocate(256, rmm::cuda_stream_per_thread); // Create an allocation in a stream arena to force global arena @@ -565,7 +569,7 @@ TEST_F(ArenaTest, PerThreadToStreamDealloc) // NOLINT TEST_F(ArenaTest, DumpLogOnFailure) // NOLINT { - arena_mr mr{rmm::mr::get_current_device_resource(), 1_MiB, true}; + arena_mr mr{rmm::mr::get_current_device_resource_ref(), 1_MiB, true}; { // make the log interesting std::vector threads; diff --git a/tests/mr/device/callback_mr_tests.cpp b/tests/mr/device/callback_mr_tests.cpp index 34a2cc8cc..a56efa60c 100644 --- a/tests/mr/device/callback_mr_tests.cpp +++ b/tests/mr/device/callback_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,47 +36,50 @@ using ::testing::_; TEST(CallbackTest, TestCallbacksAreInvoked) { - auto base_mr = mock_resource(); + auto base_mr = mock_resource(); + auto base_ref = device_async_resource_ref{base_mr}; EXPECT_CALL(base_mr, do_allocate(10_MiB, cuda_stream_view{})).Times(1); EXPECT_CALL(base_mr, do_deallocate(_, 10_MiB, cuda_stream_view{})).Times(1); auto allocate_callback = [](std::size_t size, cuda_stream_view stream, void* arg) { - auto base_mr = static_cast(arg); - return base_mr->allocate(size, stream); + auto base_mr = *static_cast(arg); + return base_mr.allocate_async(size, stream); }; auto deallocate_callback = [](void* ptr, std::size_t size, cuda_stream_view stream, void* arg) { - auto base_mr = static_cast(arg); - base_mr->deallocate(ptr, size, stream); + auto base_mr = *static_cast(arg); + base_mr.deallocate_async(ptr, size, stream); }; auto mr = - rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_mr, &base_mr); - auto ptr = mr.allocate(10_MiB); - mr.deallocate(ptr, 10_MiB); + rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_ref, &base_ref); + auto const size = std::size_t{10_MiB}; + auto* ptr = mr.allocate(size); + mr.deallocate(ptr, size); } TEST(CallbackTest, LoggingTest) { testing::internal::CaptureStdout(); - auto base_mr = rmm::mr::get_current_device_resource(); + auto base_mr = rmm::mr::get_current_device_resource_ref(); auto allocate_callback = [](std::size_t size, cuda_stream_view stream, void* arg) { std::cout << "Allocating " << size << " bytes" << std::endl; - auto base_mr = static_cast(arg); - return base_mr->allocate(size, stream); + auto base_mr = *static_cast(arg); + return base_mr.allocate_async(size, stream); }; auto deallocate_callback = [](void* ptr, std::size_t size, cuda_stream_view stream, void* arg) { std::cout << "Deallocating " << size << " bytes" << std::endl; - auto base_mr = static_cast(arg); - base_mr->deallocate(ptr, size, stream); + auto base_mr = *static_cast(arg); + base_mr.deallocate_async(ptr, size, stream); }; auto mr = - rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, base_mr, base_mr); - auto ptr = mr.allocate(10_MiB); - mr.deallocate(ptr, 10_MiB); + rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_mr, &base_mr); + auto const size = std::size_t{10_MiB}; + auto* ptr = mr.allocate(size); + mr.deallocate(ptr, size); std::string output = testing::internal::GetCapturedStdout(); - std::string expect = fmt::format("Allocating {} bytes\nDeallocating {} bytes\n", 10_MiB, 10_MiB); + std::string expect = fmt::format("Allocating {} bytes\nDeallocating {} bytes\n", size, size); ASSERT_EQ(expect, output); } diff --git a/tests/mr/device/failure_callback_mr_tests.cpp b/tests/mr/device/failure_callback_mr_tests.cpp index 683aee86e..4b3d084d5 100644 --- a/tests/mr/device/failure_callback_mr_tests.cpp +++ b/tests/mr/device/failure_callback_mr_tests.cpp @@ -47,7 +47,8 @@ bool failure_handler(std::size_t /*bytes*/, void* arg) TEST(FailureCallbackTest, RetryAllocationOnce) { bool retried{false}; - failure_callback_adaptor<> mr{rmm::mr::get_current_device_resource(), failure_handler, &retried}; + failure_callback_adaptor<> mr{ + rmm::mr::get_current_device_resource_ref(), failure_handler, &retried}; EXPECT_EQ(retried, false); EXPECT_THROW(mr.allocate(512_GiB), std::bad_alloc); EXPECT_EQ(retried, true); diff --git a/tests/mr/device/limiting_mr_tests.cpp b/tests/mr/device/limiting_mr_tests.cpp index 777ce9428..e6cc97029 100644 --- 
a/tests/mr/device/limiting_mr_tests.cpp +++ b/tests/mr/device/limiting_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,19 +25,19 @@ namespace rmm::test { namespace { -using Limiting_adaptor = rmm::mr::limiting_resource_adaptor; +using limiting_adaptor = rmm::mr::limiting_resource_adaptor; TEST(LimitingTest, ThrowOnNullUpstream) { auto const max_size{5_MiB}; - auto construct_nullptr = []() { Limiting_adaptor mr{nullptr, max_size}; }; + auto construct_nullptr = []() { limiting_adaptor mr{nullptr, max_size}; }; EXPECT_THROW(construct_nullptr(), rmm::logic_error); } TEST(LimitingTest, TooBig) { auto const max_size{5_MiB}; - Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource_ref(), max_size}; EXPECT_THROW(mr.allocate(max_size + 1), rmm::out_of_memory); } @@ -45,15 +45,15 @@ TEST(LimitingTest, UpstreamFailure) { auto const max_size_1{2_MiB}; auto const max_size_2{5_MiB}; - Limiting_adaptor mr1{rmm::mr::get_current_device_resource(), max_size_1}; - Limiting_adaptor mr2{&mr1, max_size_2}; + limiting_adaptor mr1{rmm::mr::get_current_device_resource_ref(), max_size_1}; + limiting_adaptor mr2{&mr1, max_size_2}; EXPECT_THROW(mr2.allocate(4_MiB), rmm::out_of_memory); } TEST(LimitingTest, UnderLimitDueToFrees) { auto const max_size{10_MiB}; - Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource_ref(), max_size}; auto const size1{4_MiB}; auto* ptr1 = mr.allocate(size1); auto allocated_bytes = size1; @@ -81,7 +81,7 @@ TEST(LimitingTest, UnderLimitDueToFrees) TEST(LimitingTest, OverLimit) { auto const max_size{10_MiB}; - Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource_ref(), max_size}; auto const size1{4_MiB}; auto* ptr1 = mr.allocate(size1); auto allocated_bytes = size1; diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp index c63a61844..9db63eb1b 100644 --- a/tests/mr/device/pool_mr_tests.cpp +++ b/tests/mr/device/pool_mr_tests.cpp @@ -49,19 +49,7 @@ TEST(PoolTest, ThrowMaxLessThanInitial) auto max_less_than_initial = []() { const auto initial{1024}; const auto maximum{256}; - pool_mr mr{rmm::mr::get_current_device_resource(), initial, maximum}; - }; - EXPECT_THROW(max_less_than_initial(), rmm::logic_error); -} - -TEST(PoolTest, ReferenceThrowMaxLessThanInitial) -{ - // Make sure first argument is enough larger than the second that alignment rounding doesn't - // make them equal - auto max_less_than_initial = []() { - const auto initial{1024}; - const auto maximum{256}; - pool_mr mr{*rmm::mr::get_current_device_resource(), initial, maximum}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), initial, maximum}; }; EXPECT_THROW(max_less_than_initial(), rmm::logic_error); } @@ -72,7 +60,7 @@ TEST(PoolTest, AllocateNinetyPercent) auto const [free, total] = rmm::available_device_memory(); (void)total; auto const ninety_percent_pool = rmm::percent_of_free_device_memory(90); - pool_mr mr{rmm::mr::get_current_device_resource(), ninety_percent_pool}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), ninety_percent_pool}; }; EXPECT_NO_THROW(allocate_ninety()); } @@ -81,7 +69,7 @@ TEST(PoolTest, TwoLargeBuffers) { 
auto two_large = []() { [[maybe_unused]] auto const [free, total] = rmm::available_device_memory(); - pool_mr mr{rmm::mr::get_current_device_resource(), rmm::percent_of_free_device_memory(50)}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), rmm::percent_of_free_device_memory(50)}; auto* ptr1 = mr.allocate(free / 4); auto* ptr2 = mr.allocate(free / 4); mr.deallocate(ptr1, free / 4); @@ -116,7 +104,7 @@ TEST(PoolTest, ForceGrowth) TEST(PoolTest, DeletedStream) { - pool_mr mr{rmm::mr::get_current_device_resource(), 0}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), 0}; cudaStream_t stream{}; // we don't use rmm::cuda_stream here to make destruction more explicit const int size = 10000; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); @@ -129,7 +117,7 @@ TEST(PoolTest, DeletedStream) TEST(PoolTest, InitialAndMaxPoolSizeEqual) { EXPECT_NO_THROW([]() { - pool_mr mr(rmm::mr::get_current_device_resource(), 1000192, 1000192); + pool_mr mr(rmm::mr::get_current_device_resource_ref(), 1000192, 1000192); mr.allocate(1000); }()); } @@ -138,14 +126,14 @@ TEST(PoolTest, NonAlignedPoolSize) { EXPECT_THROW( []() { - pool_mr mr(rmm::mr::get_current_device_resource(), 1000031, 1000192); + pool_mr mr(rmm::mr::get_current_device_resource_ref(), 1000031, 1000192); mr.allocate(1000); }(), rmm::logic_error); EXPECT_THROW( []() { - pool_mr mr(rmm::mr::get_current_device_resource(), 1000192, 1000200); + pool_mr mr(rmm::mr::get_current_device_resource_ref(), 1000192, 1000200); mr.allocate(1000); }(), rmm::logic_error); @@ -203,18 +191,18 @@ namespace test_properties { class fake_async_resource { public: // To model `async_resource` - void* allocate(std::size_t, std::size_t) { return nullptr; } - void deallocate(void* ptr, std::size_t, std::size_t) {} - void* allocate_async(std::size_t, std::size_t, cuda::stream_ref) { return nullptr; } - void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref) {} + static void* allocate(std::size_t, std::size_t) { return nullptr; } + static void deallocate(void* ptr, std::size_t, std::size_t) {} + static void* allocate_async(std::size_t, std::size_t, cuda::stream_ref) { return nullptr; } + static void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref) {} bool operator==(const fake_async_resource& other) const { return true; } bool operator!=(const fake_async_resource& other) const { return false; } private: - void* do_allocate(std::size_t bytes, cuda_stream_view) { return nullptr; } - void do_deallocate(void* ptr, std::size_t, cuda_stream_view) {} - [[nodiscard]] bool do_is_equal(fake_async_resource const& other) const noexcept { return true; } + static void* do_allocate(std::size_t bytes, cuda_stream_view) { return nullptr; } + static void do_deallocate(void* ptr, std::size_t, cuda_stream_view) {} + [[nodiscard]] static bool do_is_equal(fake_async_resource const& other) noexcept { return true; } }; static_assert(!cuda::has_property); static_assert(!cuda::has_property, diff --git a/tests/mr/device/statistics_mr_tests.cpp b/tests/mr/device/statistics_mr_tests.cpp index 6c5700f0b..f796a4c00 100644 --- a/tests/mr/device/statistics_mr_tests.cpp +++ b/tests/mr/device/statistics_mr_tests.cpp @@ -40,7 +40,7 @@ TEST(StatisticsTest, ThrowOnNullUpstream) TEST(StatisticsTest, Empty) { - statistics_adaptor mr{rmm::mr::get_current_device_resource()}; + statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()}; EXPECT_EQ(mr.get_bytes_counter().peak, 0); EXPECT_EQ(mr.get_bytes_counter().total, 0); @@ -53,7 +53,7 @@ 
TEST(StatisticsTest, Empty) TEST(StatisticsTest, AllFreed) { - statistics_adaptor mr{rmm::mr::get_current_device_resource()}; + statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; allocations.reserve(num_allocations); @@ -71,7 +71,7 @@ TEST(StatisticsTest, AllFreed) TEST(StatisticsTest, PeakAllocations) { - statistics_adaptor mr{rmm::mr::get_current_device_resource()}; + statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; for (std::size_t i = 0; i < num_allocations; ++i) { @@ -127,9 +127,9 @@ TEST(StatisticsTest, PeakAllocations) TEST(StatisticsTest, MultiTracking) { - auto* orig_device_resource = rmm::mr::get_current_device_resource(); + auto orig_device_resource = rmm::mr::get_current_device_resource_ref(); statistics_adaptor mr{orig_device_resource}; - rmm::mr::set_current_device_resource(&mr); + rmm::mr::set_current_device_resource_ref(mr); std::vector> allocations; for (std::size_t i = 0; i < num_allocations; ++i) { @@ -139,8 +139,8 @@ TEST(StatisticsTest, MultiTracking) EXPECT_EQ(mr.get_allocations_counter().value, 10); - statistics_adaptor inner_mr{rmm::mr::get_current_device_resource()}; - rmm::mr::set_current_device_resource(&inner_mr); + statistics_adaptor inner_mr{rmm::mr::get_current_device_resource_ref()}; + rmm::mr::set_current_device_resource_ref(inner_mr); for (std::size_t i = 0; i < num_more_allocations; ++i) { allocations.emplace_back( @@ -172,7 +172,7 @@ TEST(StatisticsTest, MultiTracking) EXPECT_EQ(inner_mr.get_allocations_counter().peak, 5); // Reset the current device resource - rmm::mr::set_current_device_resource(orig_device_resource); + rmm::mr::set_current_device_resource_ref(orig_device_resource); } TEST(StatisticsTest, NegativeInnerTracking) @@ -180,7 +180,7 @@ TEST(StatisticsTest, NegativeInnerTracking) // This tests the unlikely scenario where pointers are deallocated on an inner // wrapped memory resource. 
This can happen if the MR is not saved with the // memory pointer - statistics_adaptor mr{rmm::mr::get_current_device_resource()}; + statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; for (std::size_t i = 0; i < num_allocations; ++i) { allocations.push_back(mr.allocate(ten_MiB)); @@ -236,7 +236,7 @@ TEST(StatisticsTest, NegativeInnerTracking) TEST(StatisticsTest, Nested) { - statistics_adaptor mr{rmm::mr::get_current_device_resource()}; + statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()}; void* a0 = mr.allocate(ten_MiB); EXPECT_EQ(mr.get_bytes_counter().value, ten_MiB); EXPECT_EQ(mr.get_allocations_counter().value, 1); diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu index 91ae396ed..84f599957 100644 --- a/tests/mr/device/thrust_allocator_tests.cu +++ b/tests/mr/device/thrust_allocator_tests.cu @@ -37,9 +37,9 @@ namespace { struct allocator_test : public mr_ref_test {}; // Disable until we support resource_ref with set_current_device_resource -/*TEST_P(allocator_test, first) +TEST_P(allocator_test, first) { - rmm::mr::set_current_device_resource(this->mr.get()); + rmm::mr::set_current_device_resource_ref(this->ref); auto const num_ints{100}; rmm::device_vector ints(num_ints, 1); EXPECT_EQ(num_ints, thrust::reduce(ints.begin(), ints.end())); @@ -47,12 +47,12 @@ struct allocator_test : public mr_ref_test {}; TEST_P(allocator_test, defaults) { - rmm::mr::set_current_device_resource(this->mr.get()); + rmm::mr::set_current_device_resource_ref(this->ref); rmm::mr::thrust_allocator allocator(rmm::cuda_stream_default); EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default); EXPECT_EQ(allocator.get_upstream_resource(), - rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); -}*/ + rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}); +} TEST_P(allocator_test, multi_device) { diff --git a/tests/mr/device/tracking_mr_tests.cpp b/tests/mr/device/tracking_mr_tests.cpp index 7c2532c60..acd540ae6 100644 --- a/tests/mr/device/tracking_mr_tests.cpp +++ b/tests/mr/device/tracking_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -42,14 +42,14 @@ TEST(TrackingTest, ThrowOnNullUpstream) TEST(TrackingTest, Empty) { - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; EXPECT_EQ(mr.get_outstanding_allocations().size(), 0); EXPECT_EQ(mr.get_allocated_bytes(), 0); } TEST(TrackingTest, AllFreed) { - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; allocations.reserve(num_allocations); for (int i = 0; i < num_allocations; ++i) { @@ -64,7 +64,7 @@ TEST(TrackingTest, AllFreed) TEST(TrackingTest, AllocationsLeftWithStacks) { - tracking_adaptor mr{rmm::mr::get_current_device_resource(), true}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref(), true}; std::vector allocations; allocations.reserve(num_allocations); for (int i = 0; i < num_allocations; ++i) { @@ -82,7 +82,7 @@ TEST(TrackingTest, AllocationsLeftWithStacks) TEST(TrackingTest, AllocationsLeftWithoutStacks) { - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; allocations.reserve(num_allocations); for (int i = 0; i < num_allocations; ++i) { @@ -101,9 +101,9 @@ TEST(TrackingTest, AllocationsLeftWithoutStacks) TEST(TrackingTest, MultiTracking) { - auto* orig_device_resource = rmm::mr::get_current_device_resource(); + auto orig_device_resource = rmm::mr::get_current_device_resource_ref(); tracking_adaptor mr{orig_device_resource, true}; - rmm::mr::set_current_device_resource(&mr); + rmm::mr::set_current_device_resource_ref(mr); std::vector> allocations; for (std::size_t i = 0; i < num_allocations; ++i) { @@ -113,8 +113,8 @@ TEST(TrackingTest, MultiTracking) EXPECT_EQ(mr.get_outstanding_allocations().size(), num_allocations); - tracking_adaptor inner_mr{rmm::mr::get_current_device_resource()}; - rmm::mr::set_current_device_resource(&inner_mr); + tracking_adaptor inner_mr{rmm::mr::get_current_device_resource_ref()}; + rmm::mr::set_current_device_resource_ref(inner_mr); for (std::size_t i = 0; i < num_more_allocations; ++i) { allocations.emplace_back( @@ -141,7 +141,7 @@ TEST(TrackingTest, MultiTracking) EXPECT_EQ(inner_mr.get_allocated_bytes(), 0); // Reset the current device resource - rmm::mr::set_current_device_resource(orig_device_resource); + rmm::mr::set_current_device_resource_ref(orig_device_resource); } TEST(TrackingTest, NegativeInnerTracking) @@ -149,7 +149,7 @@ TEST(TrackingTest, NegativeInnerTracking) // This tests the unlikely scenario where pointers are deallocated on an inner // wrapped memory resource. 
This can happen if the MR is not saved with the // memory pointer - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; for (std::size_t i = 0; i < num_allocations; ++i) { allocations.push_back(mr.allocate(ten_MiB)); @@ -181,7 +181,7 @@ TEST(TrackingTest, NegativeInnerTracking) TEST(TrackingTest, DeallocWrongBytes) { - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; for (std::size_t i = 0; i < num_allocations; ++i) { allocations.push_back(mr.allocate(ten_MiB)); @@ -207,7 +207,7 @@ TEST(TrackingTest, LogOutstandingAllocations) rmm::logger().sinks().push_back(oss_sink); auto old_level = rmm::logger().level(); - tracking_adaptor mr{rmm::mr::get_current_device_resource()}; + tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; for (std::size_t i = 0; i < num_allocations; ++i) { allocations.push_back(mr.allocate(ten_MiB)); diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp index 8445ab1f5..071739575 100644 --- a/tests/mr/host/mr_ref_tests.cpp +++ b/tests/mr/host/mr_ref_tests.cpp @@ -233,14 +233,17 @@ TYPED_TEST(MRRefTest, UnsupportedAlignmentTest) for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) { for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment; alignment *= TestedAlignmentMultiplier) { +#ifdef NDEBUG auto allocation_size = size_distribution(generator); void* ptr{nullptr}; // An unsupported alignment (like an odd number) should result in an // alignment of `alignof(std::max_align_t)` auto const bad_alignment = alignment + 1; + EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, bad_alignment)); EXPECT_TRUE(is_aligned(ptr, alignof(std::max_align_t))); EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, bad_alignment)); +#endif } } } From 800dc8749aa889c6b51046739449ef5322217ddf Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 10 Sep 2024 10:55:14 +0200 Subject: [PATCH 03/27] Add missing include to `resource_ref.hpp` (#1677) This has been found breaking CCCL CI when building cuDF Authors: - Michael Schellenberger Costa (https://github.com/miscco) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1677 --- include/rmm/resource_ref.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/rmm/resource_ref.hpp b/include/rmm/resource_ref.hpp index 56049522f..285726478 100644 --- a/include/rmm/resource_ref.hpp +++ b/include/rmm/resource_ref.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include From 68a51d3a12e9d13793dd79f053e63145e7107ee1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 18 Sep 2024 01:00:58 -0500 Subject: [PATCH 04/27] Recommend `miniforge` for conda install. (#1681) Recommending `miniforge` for conda install in installation docs. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1681 --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1250d094b..0a25cd348 100644 --- a/README.md +++ b/README.md @@ -33,19 +33,20 @@ For a walkthrough about the design of the RAPIDS Memory Manager, read [Fast, Fle ### Conda -RMM can be installed with Conda ([miniconda](https://conda.io/miniconda.html), or the full -[Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel: +RMM can be installed with conda. You can get a minimal conda installation with [miniforge](https://github.com/conda-forge/miniforge). + +Install RMM with: ```bash conda install -c rapidsai -c conda-forge -c nvidia rmm cuda-version=12.0 ``` -We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD +We also provide [nightly conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD of our latest development branch. Note: RMM is supported only on Linux, and only tested with Python versions 3.10, 3.11, and 3.12. -Note: The RMM package from Conda requires building with GCC 9 or later. Otherwise, your application may fail to build. +Note: The RMM package from conda requires building with GCC 9 or later. Otherwise, your application may fail to build. See the [Get RAPIDS version picker](https://rapids.ai/start.html) for more OS and version info. From 58039d3c8ff63ecfbfde8b3a103bc14d33be7bae Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 18 Sep 2024 14:43:51 -0500 Subject: [PATCH 05/27] Use CI workflow branch 'branch-24.10' again [skip ci] (#1683) --- .github/workflows/build.yaml | 16 ++++++++-------- .github/workflows/pr.yaml | 22 +++++++++++----------- .github/workflows/test.yaml | 6 +++--- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 7f54701d9..9b7efecde 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -56,7 +56,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -68,7 +68,7 @@ jobs: run_script: 
"ci/build_docs.sh" wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -79,7 +79,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -100,7 +100,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7d9fbb086..1160b93e9 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -23,40 +23,40 @@ jobs: - wheel-tests - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -66,7 +66,7 @@ jobs: wheel-build-cpp: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -74,20 +74,20 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: ci/build_wheel_python.sh wheel-tests: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index eec04005d..13838d888 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} From f4ec4631a66de1bfa199120321abe95782cf32c6 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 19 Sep 2024 11:43:47 -0400 Subject: [PATCH 06/27] DOC v24.12 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 6 ++--- .devcontainer/cuda11.8-pip/devcontainer.json | 6 ++--- .../cuda12.5-conda/devcontainer.json | 6 ++--- .devcontainer/cuda12.5-pip/devcontainer.json | 6 ++--- .github/workflows/build.yaml | 16 +++++++------- .github/workflows/pr.yaml | 22 +++++++++---------- .github/workflows/test.yaml | 6 ++--- VERSION | 2 +- dependencies.yaml | 6 ++--- python/rmm/pyproject.toml | 2 +- 10 files changed, 39 insertions(+), 39 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 3bfef6706..549ffa67b 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 5bfc30823..d6dd7b6ce 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index 925557b22..17e8d5cd0 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 2f9e1c493..54964d880 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 9b7efecde..6fa11225e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: 
build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -56,7 +56,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -68,7 +68,7 @@ jobs: run_script: "ci/build_docs.sh" wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -79,7 +79,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -100,7 +100,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1160b93e9..afc9f7487 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -23,40 +23,40 @@ jobs: - wheel-tests - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -66,7 +66,7 @@ jobs: wheel-build-cpp: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -74,20 +74,20 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: ci/build_wheel_python.sh wheel-tests: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: pull-request script: ci/test_wheel.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 with: arch: '["amd64"]' cuda: '["12.5"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 13838d888..34a0f746d 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/VERSION b/VERSION index 7c7ba0443..af28c42b5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.10.00 +24.12.00 diff --git a/dependencies.yaml b/dependencies.yaml index eff3560e7..5b5cded62 100644 --- a/dependencies.yaml 
+++ b/dependencies.yaml @@ -163,15 +163,15 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==24.10.*,>=0.0.0a0 + - librmm-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - librmm-cu11==24.10.*,>=0.0.0a0 + - librmm-cu11==24.12.*,>=0.0.0a0 - matrix: null packages: - - librmm==24.10.*,>=0.0.0a0 + - librmm==24.12.*,>=0.0.0a0 checks: common: - output_types: [conda, requirements] diff --git a/python/rmm/pyproject.toml b/python/rmm/pyproject.toml index 7577ad961..b148cdba7 100644 --- a/python/rmm/pyproject.toml +++ b/python/rmm/pyproject.toml @@ -130,7 +130,7 @@ requires = [ "cmake>=3.26.4,!=3.30.0", "cuda-python>=11.7.1,<12.0a0", "cython>=3.0.0", - "librmm==24.10.*,>=0.0.0a0", + "librmm==24.12.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 99e237ef6a42321a6bbd28b7aab9e4cc4105e6a3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 23 Sep 2024 15:11:29 -0500 Subject: [PATCH 07/27] Update fmt (to 11.0.2) and spdlog (to 1.14.1). (#1678) * Update fmt (to 11.0.2) and spdlog (to 1.14.1). * simplify get_spdlog * copyright * Apply suggestions from code review Co-authored-by: Bradley Dice * Apply suggestions from code review * test with changes from https://github.com/rapidsai/rapids-cmake/pull/689/commits/d7671a30f99bc2b239653b5c1089ff51bcc37dc4 * Update cmake/thirdparty/get_spdlog.cmake * move rapids-cmake overrides [skip ci] * try reverting get_spdlog export changes [skip ci] * more fiddling with export sets [skip ci] * more exporting [skip ci] * more export set fiddling [skip ci] * more [skip ci] * exports [skip ci] * run a build * restore tests * branch references * remove testing-only changes [skip ci] --------- Co-authored-by: Bradley Dice --- cmake/thirdparty/get_spdlog.cmake | 18 +++++------------- .../environments/all_cuda-118_arch-x86_64.yaml | 4 ++-- .../environments/all_cuda-125_arch-x86_64.yaml | 4 ++-- conda/recipes/librmm/conda_build_config.yaml | 4 ++-- dependencies.yaml | 4 ++-- 5 files changed, 13 insertions(+), 21 deletions(-) diff --git a/cmake/thirdparty/get_spdlog.cmake b/cmake/thirdparty/get_spdlog.cmake index 296370469..7f80b3726 100644 --- a/cmake/thirdparty/get_spdlog.cmake +++ b/cmake/thirdparty/get_spdlog.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,19 +16,11 @@ function(find_and_configure_spdlog) include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET rmm-exports) - rapids_export_package(BUILD spdlog rmm-exports) + rapids_cpm_spdlog( + FMT_OPTION "EXTERNAL_FMT_HO" + INSTALL_EXPORT_SET rmm-exports + BUILD_EXPORT_SET rmm-exports) - if(spdlog_ADDED) - rapids_export( - BUILD spdlog - EXPORT_SET spdlog - GLOBAL_TARGETS spdlog spdlog_header_only - NAMESPACE spdlog::) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET rmm-exports) - endif() endfunction() find_and_configure_spdlog() diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 33b54b3f9..bf64d4d55 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -16,7 +16,7 @@ dependencies: - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - gcc_linux-64=11.* - gcovr>=5.0 - graphviz @@ -35,7 +35,7 @@ dependencies: - python>=3.10,<3.13 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-copybutton - sphinx-markdown-tables diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 5946a9786..112c635a8 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -16,7 +16,7 @@ dependencies: - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - gcc_linux-64=11.* - gcovr>=5.0 - graphviz @@ -34,7 +34,7 @@ dependencies: - python>=3.10,<3.13 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-copybutton - sphinx-markdown-tables diff --git a/conda/recipes/librmm/conda_build_config.yaml b/conda/recipes/librmm/conda_build_config.yaml index 6afd32c32..b4791745f 100644 --- a/conda/recipes/librmm/conda_build_config.yaml +++ b/conda/recipes/librmm/conda_build_config.yaml @@ -14,10 +14,10 @@ cmake_version: - ">=3.26.4,!=3.30.0" fmt_version: - - ">=10.1.1,<11" + - ">=11.0.2,<12" spdlog_version: - - ">=1.12.0,<1.13" + - ">=1.14.1,<1.15" c_stdlib: - sysroot diff --git a/dependencies.yaml b/dependencies.yaml index eff3560e7..483c21e61 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -111,8 +111,8 @@ dependencies: packages: - c-compiler - cxx-compiler - - fmt>=10.1.1,<11 - - spdlog>=1.12.0,<1.13 + - fmt>=11.0.2,<12 + - spdlog>=1.14.1,<1.15 specific: - output_types: conda matrices: From ab6e2961d7b8f833f688775e941c4e2ed2bd4d8a Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Tue, 24 Sep 2024 14:09:28 -0500 Subject: [PATCH 08/27] update update-version.sh to use packaging lib (#1685) --- ci/release/update-version.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index acec1c658..ddd093bea 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -24,7 +24,7 @@ NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} # Need to distutils-normalize the original version -NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern 
import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") +NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))") echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" From b51447393c523cc929608d84850c70a3eae08af3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 27 Sep 2024 11:56:06 -0500 Subject: [PATCH 09/27] exclude 'gcovr' from list of development pip packages (#1688) This project currently lists `gcovr` (https://pypi.org/project/gcovr/) as a pip dependency for development. I strongly suspect that that was unintentional... it doesn't look like it has any reliance on getting that package via `pip` (just conda, in the C++ test jobs and for local C++ development). This proposes removing `gcovr` from the list of pip dependencies, so it won't get installed in the DLFW images or other places where `rapids-make-pip-env` from https://github.com/rapidsai/devcontainers is called. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1688 --- dependencies.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index b9a1e1b36..9f1ed9c40 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -232,13 +232,11 @@ dependencies: packages: develop: common: - - output_types: [conda, requirements] - packages: - - gcovr>=5.0 - output_types: conda packages: - clang==16.0.6 - clang-tools==16.0.6 + - gcovr>=5.0 docs: common: - output_types: conda From 9e410c0591f38aa6c0a17c4e2c2edc4f6bfed058 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 30 Sep 2024 18:28:44 +0100 Subject: [PATCH 10/27] Use `rmm::percent_of_free_device_memory` in arena test (#1689) Rather than hand-coding a fraction of the device memory use the utility routine. 
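For reference, a minimal sketch contrasting the hand-coded computation with the utility call (it mirrors the arena test change below; the `rmm/aligned.hpp` include for `rmm::align_up` is an assumption, while `rmm/cuda_device.hpp` matches the declarations already used elsewhere in this repository):

```cpp
#include <rmm/aligned.hpp>      // assumed header for rmm::align_up and rmm::CUDA_ALLOCATION_ALIGNMENT
#include <rmm/cuda_device.hpp>  // rmm::available_device_memory, rmm::percent_of_free_device_memory

#include <cstddef>

std::size_t ninety_percent_of_free_device_memory()
{
  // Hand-coded variant removed by this change: query free device memory,
  // scale it by 0.9, and align the result to the CUDA allocation alignment.
  auto const free    = rmm::available_device_memory().first;
  auto const by_hand = rmm::align_up(
    static_cast<std::size_t>(static_cast<double>(free) * 0.9), rmm::CUDA_ALLOCATION_ALIGNMENT);
  (void)by_hand;

  // Utility routine the test now calls instead; it performs the
  // percentage-of-free-memory computation internally.
  return rmm::percent_of_free_device_memory(90);
}
```
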
- Closes #1674 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Rong Ou (https://github.com/rongou) URL: https://github.com/rapidsai/rmm/pull/1689 --- tests/mr/device/arena_mr_tests.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index bdc0f2438..95cc9c9c1 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -491,9 +491,7 @@ TEST_F(ArenaTest, SizeSmallerThanSuperblockSize) // NOLINT TEST_F(ArenaTest, AllocateNinetyPercent) // NOLINT { EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) - auto const free = rmm::available_device_memory().first; - auto const ninety_percent = rmm::align_up( - static_cast(static_cast(free) * 0.9), rmm::CUDA_ALLOCATION_ALIGNMENT); + auto const ninety_percent = rmm::percent_of_free_device_memory(90); arena_mr mr(rmm::mr::get_current_device_resource_ref(), ninety_percent); }()); } From 6489bb7df63a3784b4a94067e3a8fa8917523ab7 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 2 Oct 2024 23:14:10 -0400 Subject: [PATCH 11/27] [Improvement] Reorganize Cython to separate C++ bindings and make Cython classes public (#1676) Closes #1280 Authors: - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1676 --- .gitignore | 11 +- python/rmm/CMakeLists.txt | 3 +- python/rmm/docs/guide.md | 6 +- python/rmm/rmm/__init__.py | 20 +- python/rmm/rmm/_cuda/stream.pxd | 4 +- python/rmm/rmm/_cuda/stream.pyx | 4 +- python/rmm/rmm/_lib/__init__.py | 4 +- python/rmm/rmm/_lib/cuda_stream.pxd | 27 +- python/rmm/rmm/_lib/cuda_stream_pool.pxd | 14 +- python/rmm/rmm/_lib/cuda_stream_view.pxd | 26 +- python/rmm/rmm/_lib/device_buffer.pxd | 115 +-------- python/rmm/rmm/_lib/device_uvector.pxd | 28 +-- python/rmm/rmm/_lib/helper.pxd | 3 +- python/rmm/rmm/_lib/logger.pxd | 24 ++ python/rmm/rmm/_lib/memory_resource.pxd | 138 ++++------- python/rmm/rmm/_lib/per_device_resource.pxd | 42 ++-- python/rmm/rmm/allocators/cupy.py | 6 +- python/rmm/rmm/allocators/numba.py | 6 +- python/rmm/rmm/allocators/torch.py | 8 +- .../rmm/rmm/{_lib => librmm}/CMakeLists.txt | 3 +- .../{_lib/__init__.pxd => librmm/__init__.py} | 0 python/rmm/rmm/librmm/_logger.pxd | 66 +++++ .../rmm/{_lib/lib.pxd => librmm/_logger.pyx} | 9 +- .../rmm/{_lib => librmm}/_torch_allocator.cpp | 0 python/rmm/rmm/librmm/cuda_stream.pxd | 28 +++ python/rmm/rmm/librmm/cuda_stream_pool.pxd | 23 ++ python/rmm/rmm/librmm/cuda_stream_view.pxd | 32 +++ python/rmm/rmm/librmm/device_buffer.pxd | 58 +++++ python/rmm/rmm/librmm/device_uvector.pxd | 39 +++ python/rmm/rmm/librmm/memory_resource.pxd | 230 ++++++++++++++++++ python/rmm/rmm/librmm/per_device_resource.pxd | 36 +++ python/rmm/rmm/mr.py | 2 +- python/rmm/rmm/pylibrmm/CMakeLists.txt | 27 ++ python/rmm/rmm/pylibrmm/__init__.py | 15 ++ python/rmm/rmm/pylibrmm/cuda_stream.pxd | 27 ++ .../rmm/{_lib => pylibrmm}/cuda_stream.pyx | 4 +- python/rmm/rmm/pylibrmm/device_buffer.pxd | 71 ++++++ .../rmm/{_lib => pylibrmm}/device_buffer.pyx | 19 +- .../rmm/{_lib/lib.pyx => pylibrmm/helper.pxd} | 5 +- python/rmm/rmm/{_lib => pylibrmm}/helper.pyx | 0 python/rmm/rmm/{_lib => pylibrmm}/logger.pyx | 57 +---- python/rmm/rmm/pylibrmm/memory_resource.pxd | 83 +++++++ .../{_lib => pylibrmm}/memory_resource.pyx | 228 
+++-------------- .../rmm/{_lib => pylibrmm}/tests/__init__.py | 0 .../tests/test_device_buffer.pyx | 7 +- python/rmm/rmm/tests/test_cython.py | 4 +- python/rmm/rmm/tests/test_rmm.py | 2 +- 47 files changed, 965 insertions(+), 599 deletions(-) create mode 100644 python/rmm/rmm/_lib/logger.pxd rename python/rmm/rmm/{_lib => librmm}/CMakeLists.txt (93%) rename python/rmm/rmm/{_lib/__init__.pxd => librmm/__init__.py} (100%) create mode 100644 python/rmm/rmm/librmm/_logger.pxd rename python/rmm/rmm/{_lib/lib.pxd => librmm/_logger.pyx} (70%) rename python/rmm/rmm/{_lib => librmm}/_torch_allocator.cpp (100%) create mode 100644 python/rmm/rmm/librmm/cuda_stream.pxd create mode 100644 python/rmm/rmm/librmm/cuda_stream_pool.pxd create mode 100644 python/rmm/rmm/librmm/cuda_stream_view.pxd create mode 100644 python/rmm/rmm/librmm/device_buffer.pxd create mode 100644 python/rmm/rmm/librmm/device_uvector.pxd create mode 100644 python/rmm/rmm/librmm/memory_resource.pxd create mode 100644 python/rmm/rmm/librmm/per_device_resource.pxd create mode 100644 python/rmm/rmm/pylibrmm/CMakeLists.txt create mode 100644 python/rmm/rmm/pylibrmm/__init__.py create mode 100644 python/rmm/rmm/pylibrmm/cuda_stream.pxd rename python/rmm/rmm/{_lib => pylibrmm}/cuda_stream.pyx (91%) create mode 100644 python/rmm/rmm/pylibrmm/device_buffer.pxd rename python/rmm/rmm/{_lib => pylibrmm}/device_buffer.pyx (96%) rename python/rmm/rmm/{_lib/lib.pyx => pylibrmm/helper.pxd} (86%) rename python/rmm/rmm/{_lib => pylibrmm}/helper.pyx (100%) rename python/rmm/rmm/{_lib => pylibrmm}/logger.pyx (77%) create mode 100644 python/rmm/rmm/pylibrmm/memory_resource.pxd rename python/rmm/rmm/{_lib => pylibrmm}/memory_resource.pyx (82%) rename python/rmm/rmm/{_lib => pylibrmm}/tests/__init__.py (100%) rename python/rmm/rmm/{_lib => pylibrmm}/tests/test_device_buffer.pyx (83%) diff --git a/.gitignore b/.gitignore index 2d0b150e1..36aafe643 100644 --- a/.gitignore +++ b/.gitignore @@ -22,10 +22,13 @@ rmm.egg-info/ python/build python/*/build python/rmm/docs/_build -python/rmm/**/_lib/**/*.cpp -!python/rmm/_lib/_torch_allocator.cpp -python/rmm/**/_lib/**/*.h -python/rmm/**/_lib/.nfs* +python/rmm/**/librmmm/**/*.cpp +!python/rmm/librmmm/_torch_allocator.cpp +python/rmm/**/librmm/**/*.h +python/rmm/**/librmm/.nfs* +python/rmm/**/pylibrmmm/**/*.cpp +python/rmm/**/pylibrmmm/**/*.h +python/rmm/**/pylibrmmm/.nfs* python/rmm/_cuda/*.cpp python/rmm/tests/*.cpp python/rmm/*.ipynb diff --git a/python/rmm/CMakeLists.txt b/python/rmm/CMakeLists.txt index 6c2515102..ac8495e14 100644 --- a/python/rmm/CMakeLists.txt +++ b/python/rmm/CMakeLists.txt @@ -30,4 +30,5 @@ rapids_cython_init() add_compile_definitions("SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") add_subdirectory(rmm/_cuda) -add_subdirectory(rmm/_lib) +add_subdirectory(rmm/librmm) +add_subdirectory(rmm/pylibrmm) diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md index 22c0dc023..c7e940497 100644 --- a/python/rmm/docs/guide.md +++ b/python/rmm/docs/guide.md @@ -236,17 +236,17 @@ Common to both usages is that they modify the currently active RMM memory resour >>> # We start with the default cuda memory resource >>> rmm.mr.get_current_device_resource() - + >>> # When using statistics, we get a StatisticsResourceAdaptor with the context >>> with rmm.statistics.statistics(): ... 
rmm.mr.get_current_device_resource() - + >>> # We can also enable statistics globally >>> rmm.statistics.enable_statistics() >>> print(rmm.mr.get_current_device_resource()) - + ``` With statistics enabled, you can query statistics of the current and peak bytes and number of allocations performed by the current RMM memory resource: diff --git a/python/rmm/rmm/__init__.py b/python/rmm/rmm/__init__.py index 1e3b5c8b1..b23ad68f9 100644 --- a/python/rmm/rmm/__init__.py +++ b/python/rmm/rmm/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,8 +13,10 @@ # limitations under the License. from rmm import mr -from rmm._lib.device_buffer import DeviceBuffer -from rmm._lib.logger import ( +from rmm._version import __git_commit__, __version__ +from rmm.mr import disable_logging, enable_logging, get_log_filenames +from rmm.pylibrmm.device_buffer import DeviceBuffer +from rmm.pylibrmm.logger import ( flush_logger, get_flush_level, get_logging_level, @@ -23,8 +25,6 @@ set_logging_level, should_log, ) -from rmm._version import __git_commit__, __version__ -from rmm.mr import disable_logging, enable_logging, get_log_filenames from rmm.rmm import ( RMMError, is_initialized, @@ -52,3 +52,13 @@ "should_log", "unregister_reinitialize_hook", ] + + +def __getattr__(name): + if name == "_lib": + import importlib + + module = importlib.import_module("rmm.pylibrmm") + return module + else: + raise AttributeError(f"Module '{__name__}' has no attribute '{name}'") diff --git a/python/rmm/rmm/_cuda/stream.pxd b/python/rmm/rmm/_cuda/stream.pxd index 3c3d3aa6f..e91e2ce58 100644 --- a/python/rmm/rmm/_cuda/stream.pxd +++ b/python/rmm/rmm/_cuda/stream.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ from cuda.ccudart cimport cudaStream_t from libc.stdint cimport uintptr_t from libcpp cimport bool -from rmm._lib.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef class Stream: diff --git a/python/rmm/rmm/_cuda/stream.pyx b/python/rmm/rmm/_cuda/stream.pyx index 4d5ff5232..37dcbd610 100644 --- a/python/rmm/rmm/_cuda/stream.pyx +++ b/python/rmm/rmm/_cuda/stream.pyx @@ -16,13 +16,13 @@ from cuda.ccudart cimport cudaStream_t from libc.stdint cimport uintptr_t from libcpp cimport bool -from rmm._lib.cuda_stream cimport CudaStream -from rmm._lib.cuda_stream_view cimport ( +from rmm.librmm.cuda_stream_view cimport ( cuda_stream_default, cuda_stream_legacy, cuda_stream_per_thread, cuda_stream_view, ) +from rmm.pylibrmm.cuda_stream cimport CudaStream cdef class Stream: diff --git a/python/rmm/rmm/_lib/__init__.py b/python/rmm/rmm/_lib/__init__.py index 0b8672ef6..7cfddab60 100644 --- a/python/rmm/rmm/_lib/__init__.py +++ b/python/rmm/rmm/_lib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .device_buffer import DeviceBuffer +from rmm.pylibrmm import * diff --git a/python/rmm/rmm/_lib/cuda_stream.pxd b/python/rmm/rmm/_lib/cuda_stream.pxd index e224cf9af..afc365fbb 100644 --- a/python/rmm/rmm/_lib/cuda_stream.pxd +++ b/python/rmm/rmm/_lib/cuda_stream.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,26 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -cimport cython -from cuda.ccudart cimport cudaStream_t -from libcpp cimport bool -from libcpp.memory cimport unique_ptr - -from rmm._lib.cuda_stream_view cimport cuda_stream_view - - -cdef extern from "rmm/cuda_stream.hpp" namespace "rmm" nogil: - cdef cppclass cuda_stream: - cuda_stream() except + - bool is_valid() except + - cudaStream_t value() except + - cuda_stream_view view() except + - void synchronize() except + - void synchronize_no_throw() - - -@cython.final -cdef class CudaStream: - cdef unique_ptr[cuda_stream] c_obj - cdef cudaStream_t value(self) except * nogil - cdef bool is_valid(self) except * nogil +from rmm.librmm.cuda_stream cimport cuda_stream +from rmm.pylibrmm.cuda_stream cimport CudaStream diff --git a/python/rmm/rmm/_lib/cuda_stream_pool.pxd b/python/rmm/rmm/_lib/cuda_stream_pool.pxd index 0286a9377..4da59cc68 100644 --- a/python/rmm/rmm/_lib/cuda_stream_pool.pxd +++ b/python/rmm/rmm/_lib/cuda_stream_pool.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -cimport cython - -from rmm._lib.cuda_stream_view cimport cuda_stream_view - - -cdef extern from "rmm/cuda_stream_pool.hpp" namespace "rmm" nogil: - cdef cppclass cuda_stream_pool: - cuda_stream_pool(size_t pool_size) - cuda_stream_view get_stream() - cuda_stream_view get_stream(size_t stream_id) except + - size_t get_pool_size() +from rmm.librmm.cuda_stream_pool cimport cuda_stream_pool diff --git a/python/rmm/rmm/_lib/cuda_stream_view.pxd b/python/rmm/rmm/_lib/cuda_stream_view.pxd index bf0d33c24..c336b0fe8 100644 --- a/python/rmm/rmm/_lib/cuda_stream_view.pxd +++ b/python/rmm/rmm/_lib/cuda_stream_view.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,21 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from cuda.ccudart cimport cudaStream_t -from libcpp cimport bool - - -cdef extern from "rmm/cuda_stream_view.hpp" namespace "rmm" nogil: - cdef cppclass cuda_stream_view: - cuda_stream_view() - cuda_stream_view(cudaStream_t) - cudaStream_t value() - bool is_default() - bool is_per_thread_default() - void synchronize() except + - - cdef bool operator==(cuda_stream_view const, cuda_stream_view const) - - const cuda_stream_view cuda_stream_default - const cuda_stream_view cuda_stream_legacy - const cuda_stream_view cuda_stream_per_thread +from rmm.librmm.cuda_stream_view cimport ( + cuda_stream_default, + cuda_stream_legacy, + cuda_stream_per_thread, + cuda_stream_view, +) diff --git a/python/rmm/rmm/_lib/device_buffer.pxd b/python/rmm/rmm/_lib/device_buffer.pxd index 0da9ace0c..22833b1b8 100644 --- a/python/rmm/rmm/_lib/device_buffer.pxd +++ b/python/rmm/rmm/_lib/device_buffer.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,105 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from libc.stdint cimport uintptr_t -from libcpp.memory cimport unique_ptr - -from rmm._cuda.stream cimport Stream -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.memory_resource cimport ( - DeviceMemoryResource, - device_memory_resource, +from rmm.librmm.device_buffer cimport ( + cuda_device_id, + device_buffer, + get_current_cuda_device, + prefetch, +) +from rmm.pylibrmm.device_buffer cimport ( + DeviceBuffer, + copy_device_to_ptr, + copy_host_to_ptr, + copy_ptr_to_host, + to_device, ) - - -cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: - cdef cppclass cuda_device_id: - ctypedef int value_type - cuda_device_id() - cuda_device_id(value_type id) - value_type value() - - cdef cuda_device_id get_current_cuda_device() - -cdef extern from "rmm/prefetch.hpp" namespace "rmm" nogil: - cdef void prefetch(const void* ptr, - size_t bytes, - cuda_device_id device, - cuda_stream_view stream) except + - -cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: - cdef cppclass device_buffer: - device_buffer() - device_buffer( - size_t size, - cuda_stream_view stream, - device_memory_resource * - ) except + - device_buffer( - const void* source_data, - size_t size, - cuda_stream_view stream, - device_memory_resource * - ) except + - device_buffer( - const device_buffer buf, - cuda_stream_view stream, - device_memory_resource * - ) except + - void reserve(size_t new_capacity, cuda_stream_view stream) except + - void resize(size_t new_size, cuda_stream_view stream) except + - void shrink_to_fit(cuda_stream_view stream) except + - void* data() - size_t size() - size_t capacity() - - -cdef class DeviceBuffer: - cdef unique_ptr[device_buffer] c_obj - - # Holds a reference to the DeviceMemoryResource used for allocation. - # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is - # needed for deallocation - cdef DeviceMemoryResource mr - - # Holds a reference to the stream used by the underlying `device_buffer`. 
- # Ensures the stream does not get destroyed before this DeviceBuffer - cdef Stream stream - - @staticmethod - cdef DeviceBuffer c_from_unique_ptr( - unique_ptr[device_buffer] ptr, - Stream stream=*, - DeviceMemoryResource mr=*, - ) - - @staticmethod - cdef DeviceBuffer c_to_device(const unsigned char[::1] b, - Stream stream=*) except * - cpdef copy_to_host(self, ary=*, Stream stream=*) - cpdef copy_from_host(self, ary, Stream stream=*) - cpdef copy_from_device(self, cuda_ary, Stream stream=*) - cpdef bytes tobytes(self, Stream stream=*) - - cdef size_t c_size(self) except * - cpdef void reserve(self, size_t new_capacity, Stream stream=*) except * - cpdef void resize(self, size_t new_size, Stream stream=*) except * - cpdef size_t capacity(self) except * - cdef void* c_data(self) except * - - cdef device_buffer c_release(self) except * - -cpdef DeviceBuffer to_device(const unsigned char[::1] b, - Stream stream=*) -cpdef void copy_ptr_to_host(uintptr_t db, - unsigned char[::1] hb, - Stream stream=*) except * - -cpdef void copy_host_to_ptr(const unsigned char[::1] hb, - uintptr_t db, - Stream stream=*) except * - -cpdef void copy_device_to_ptr(uintptr_t d_src, - uintptr_t d_dst, - size_t count, - Stream stream=*) except * diff --git a/python/rmm/rmm/_lib/device_uvector.pxd b/python/rmm/rmm/_lib/device_uvector.pxd index 29e122bbf..230b0afb3 100644 --- a/python/rmm/rmm/_lib/device_uvector.pxd +++ b/python/rmm/rmm/_lib/device_uvector.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,28 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.device_buffer cimport device_buffer -from rmm._lib.memory_resource cimport device_memory_resource - - -cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: - cdef cppclass device_uvector[T]: - device_uvector(size_t size, cuda_stream_view stream) except + - T* element_ptr(size_t index) - void set_element(size_t element_index, const T& v, cuda_stream_view s) - void set_element_async( - size_t element_index, - const T& v, - cuda_stream_view s - ) except + - T front_element(cuda_stream_view s) except + - T back_element(cuda_stream_view s) except + - void reserve(size_t new_capacity, cuda_stream_view stream) except + - void resize(size_t new_size, cuda_stream_view stream) except + - void shrink_to_fit(cuda_stream_view stream) except + - device_buffer release() - size_t capacity() - T* data() - size_t size() - device_memory_resource* memory_resource() +from rmm.librmm.device_uvector cimport device_uvector diff --git a/python/rmm/rmm/_lib/helper.pxd b/python/rmm/rmm/_lib/helper.pxd index 8ca151c00..4a5159435 100644 --- a/python/rmm/rmm/_lib/helper.pxd +++ b/python/rmm/rmm/_lib/helper.pxd @@ -12,5 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. - -cdef object parse_bytes(object s) except * +from rmm.pylibrmm.helper cimport parse_bytes diff --git a/python/rmm/rmm/_lib/logger.pxd b/python/rmm/rmm/_lib/logger.pxd new file mode 100644 index 000000000..bef05c903 --- /dev/null +++ b/python/rmm/rmm/_lib/logger.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.librmm._logger cimport logger, logging_level, spdlog_logger +from rmm.pylibrmm.logger cimport ( + _validate_level_type, + flush_logger, + get_flush_level, + get_logging_level, + set_flush_level, + set_logging_level, + should_log, +) diff --git a/python/rmm/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd index 000a3fe1e..983063914 100644 --- a/python/rmm/rmm/_lib/memory_resource.pxd +++ b/python/rmm/rmm/_lib/memory_resource.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,92 +12,50 @@ # See the License for the specific language governing permissions and # limitations under the License. -from libc.stdint cimport int8_t -from libcpp.memory cimport shared_ptr -from libcpp.pair cimport pair -from libcpp.string cimport string -from libcpp.vector cimport vector - -from rmm._lib.cuda_stream_view cimport cuda_stream_view - - -cdef extern from "rmm/mr/device/device_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass device_memory_resource: - void* allocate(size_t bytes) except + - void* allocate(size_t bytes, cuda_stream_view stream) except + - void deallocate(void* ptr, size_t bytes) except + - void deallocate( - void* ptr, - size_t bytes, - cuda_stream_view stream - ) except + - -cdef extern from "rmm/cuda_device.hpp" namespace "rmm" nogil: - size_t percent_of_free_device_memory(int percent) except + - pair[size_t, size_t] available_device_memory() except + - -cdef class DeviceMemoryResource: - cdef shared_ptr[device_memory_resource] c_obj - cdef device_memory_resource* get_mr(self) noexcept nogil - -cdef class UpstreamResourceAdaptor(DeviceMemoryResource): - cdef readonly DeviceMemoryResource upstream_mr - - cpdef DeviceMemoryResource get_upstream(self) - -cdef class CudaMemoryResource(DeviceMemoryResource): - pass - -cdef class ManagedMemoryResource(DeviceMemoryResource): - pass - -cdef class SystemMemoryResource(DeviceMemoryResource): - pass - -cdef class SamHeadroomMemoryResource(DeviceMemoryResource): - pass - -cdef class CudaAsyncMemoryResource(DeviceMemoryResource): - pass - -cdef class PoolMemoryResource(UpstreamResourceAdaptor): - pass - -cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor): - pass - -cdef class BinningMemoryResource(UpstreamResourceAdaptor): - - cdef readonly list _bin_mrs - - cpdef add_bin( - self, - size_t allocation_size, - DeviceMemoryResource bin_resource=*) - -cdef class CallbackMemoryResource(DeviceMemoryResource): - cdef object _allocate_func - cdef object _deallocate_func - -cdef class LimitingResourceAdaptor(UpstreamResourceAdaptor): - pass - -cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor): - cdef object _log_file_name - cpdef get_file_name(self) - cpdef flush(self) - -cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor): - pass - -cdef 
class TrackingResourceAdaptor(UpstreamResourceAdaptor): - pass - -cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): - cdef object _callback - -cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): - pass - -cpdef DeviceMemoryResource get_current_device_resource() +from rmm.librmm.memory_resource cimport ( + CppExcept, + allocate_callback_t, + allocation_handle_type, + available_device_memory, + binning_memory_resource, + callback_memory_resource, + cuda_async_memory_resource, + cuda_memory_resource, + deallocate_callback_t, + device_memory_resource, + failure_callback_resource_adaptor, + failure_callback_t, + fixed_size_memory_resource, + limiting_resource_adaptor, + logging_resource_adaptor, + managed_memory_resource, + percent_of_free_device_memory, + pool_memory_resource, + prefetch_resource_adaptor, + sam_headroom_memory_resource, + statistics_resource_adaptor, + system_memory_resource, + throw_cpp_except, + tracking_resource_adaptor, + translate_python_except_to_cpp, +) +from rmm.pylibrmm.memory_resource cimport ( + BinningMemoryResource, + CallbackMemoryResource, + CudaAsyncMemoryResource, + CudaMemoryResource, + DeviceMemoryResource, + FailureCallbackResourceAdaptor, + FixedSizeMemoryResource, + LimitingResourceAdaptor, + LoggingResourceAdaptor, + ManagedMemoryResource, + PoolMemoryResource, + PrefetchResourceAdaptor, + SamHeadroomMemoryResource, + StatisticsResourceAdaptor, + SystemMemoryResource, + TrackingResourceAdaptor, + UpstreamResourceAdaptor, + get_current_device_resource, +) diff --git a/python/rmm/rmm/_lib/per_device_resource.pxd b/python/rmm/rmm/_lib/per_device_resource.pxd index c33217622..29487f503 100644 --- a/python/rmm/rmm/_lib/per_device_resource.pxd +++ b/python/rmm/rmm/_lib/per_device_resource.pxd @@ -1,23 +1,21 @@ -from rmm._lib.memory_resource cimport device_memory_resource +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. - -cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: - cdef cppclass cuda_device_id: - ctypedef int value_type - - cuda_device_id(value_type id) - - value_type value() - -cdef extern from "rmm/mr/device/per_device_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef device_memory_resource* set_current_device_resource( - device_memory_resource* new_mr - ) - cdef device_memory_resource* get_current_device_resource() - cdef device_memory_resource* set_per_device_resource( - cuda_device_id id, device_memory_resource* new_mr - ) - cdef device_memory_resource* get_per_device_resource ( - cuda_device_id id - ) +from rmm.librmm.per_device_resource cimport ( + cuda_device_id, + get_current_device_resource, + get_per_device_resource, + set_current_device_resource, + set_per_device_resource, +) diff --git a/python/rmm/rmm/allocators/cupy.py b/python/rmm/rmm/allocators/cupy.py index 89947c46b..780ff2abf 100644 --- a/python/rmm/rmm/allocators/cupy.py +++ b/python/rmm/rmm/allocators/cupy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
+# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from rmm import _lib as librmm +from rmm import pylibrmm from rmm._cuda.stream import Stream try: @@ -34,7 +34,7 @@ def rmm_cupy_allocator(nbytes): raise ModuleNotFoundError("No module named 'cupy'") stream = Stream(obj=cupy.cuda.get_current_stream()) - buf = librmm.device_buffer.DeviceBuffer(size=nbytes, stream=stream) + buf = pylibrmm.device_buffer.DeviceBuffer(size=nbytes, stream=stream) dev_id = -1 if buf.ptr else cupy.cuda.device.get_device_id() mem = cupy.cuda.UnownedMemory( ptr=buf.ptr, size=buf.size, owner=buf, device_id=dev_id diff --git a/python/rmm/rmm/allocators/numba.py b/python/rmm/rmm/allocators/numba.py index 5e87b87b6..fd9bacb5a 100644 --- a/python/rmm/rmm/allocators/numba.py +++ b/python/rmm/rmm/allocators/numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ from numba import config, cuda from numba.cuda import HostOnlyCUDAMemoryManager, IpcHandle, MemoryPointer -from rmm import _lib as librmm +from rmm import pylibrmm def _make_emm_plugin_finalizer(handle, allocations): @@ -70,7 +70,7 @@ def memalloc(self, size): """ Allocate an on-device array from the RMM pool. """ - buf = librmm.DeviceBuffer(size=size) + buf = pylibrmm.DeviceBuffer(size=size) ctx = self.context if config.CUDA_USE_NVIDIA_BINDING: diff --git a/python/rmm/rmm/allocators/torch.py b/python/rmm/rmm/allocators/torch.py index 753da66da..eee0e9df9 100644 --- a/python/rmm/rmm/allocators/torch.py +++ b/python/rmm/rmm/allocators/torch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,10 +28,10 @@ # allocator .so relative to the current file because the current file # is pure Python and will therefore be in the source directory. # Instead, we search relative to an arbitrary file in the compiled - # package. We use the _lib.lib module because it is small. - from rmm._lib import lib + # package. We use the librmm._logger module because it is small. + from rmm.librmm import _logger - sofile = pathlib.Path(lib.__file__).parent / "_torch_allocator.so" + sofile = pathlib.Path(_logger.__file__).parent / "_torch_allocator.so" rmm_torch_allocator = CUDAPluggableAllocator( str(sofile.absolute()), alloc_fn_name="allocate", diff --git a/python/rmm/rmm/_lib/CMakeLists.txt b/python/rmm/rmm/librmm/CMakeLists.txt similarity index 93% rename from python/rmm/rmm/_lib/CMakeLists.txt rename to python/rmm/rmm/librmm/CMakeLists.txt index 7cdfed971..5da2a1a01 100644 --- a/python/rmm/rmm/_lib/CMakeLists.txt +++ b/python/rmm/rmm/librmm/CMakeLists.txt @@ -12,8 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources device_buffer.pyx lib.pyx logger.pyx memory_resource.pyx cuda_stream.pyx - helper.pyx) +set(cython_sources _logger.pyx) set(linked_libraries rmm::rmm) # Build all of the Cython targets diff --git a/python/rmm/rmm/_lib/__init__.pxd b/python/rmm/rmm/librmm/__init__.py similarity index 100% rename from python/rmm/rmm/_lib/__init__.pxd rename to python/rmm/rmm/librmm/__init__.py diff --git a/python/rmm/rmm/librmm/_logger.pxd b/python/rmm/rmm/librmm/_logger.pxd new file mode 100644 index 000000000..241a748c3 --- /dev/null +++ b/python/rmm/rmm/librmm/_logger.pxd @@ -0,0 +1,66 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from libcpp cimport bool + + +cdef extern from "spdlog/common.h" namespace "spdlog::level" nogil: + cpdef enum logging_level "spdlog::level::level_enum": + """ + The debug logging level for RMM. + + Debug logging prints messages to a log file. See + `Debug Logging `_ + for more information. + + Valid levels, in decreasing order of verbosity, are TRACE, DEBUG, + INFO, WARN, ERR, CRITICAL, and OFF. Default is INFO. + + Examples + -------- + >>> import rmm + >>> rmm.logging_level.DEBUG + + >>> rmm.logging_level.DEBUG.value + 1 + >>> rmm.logging_level.DEBUG.name + 'DEBUG' + + See Also + -------- + set_logging_level : Set the debug logging level + get_logging_level : Get the current debug logging level + """ + TRACE "spdlog::level::trace" + DEBUG "spdlog::level::debug" + INFO "spdlog::level::info" + WARN "spdlog::level::warn" + ERR "spdlog::level::err" + CRITICAL "spdlog::level::critical" + OFF "spdlog::level::off" + + +cdef extern from "spdlog/spdlog.h" namespace "spdlog" nogil: + cdef cppclass spdlog_logger "spdlog::logger": + spdlog_logger() except + + void set_level(logging_level level) + logging_level level() + void flush() except + + void flush_on(logging_level level) + logging_level flush_level() + bool should_log(logging_level msg_level) + + +cdef extern from "rmm/logger.hpp" namespace "rmm" nogil: + cdef spdlog_logger& logger() except + diff --git a/python/rmm/rmm/_lib/lib.pxd b/python/rmm/rmm/librmm/_logger.pyx similarity index 70% rename from python/rmm/rmm/_lib/lib.pxd rename to python/rmm/rmm/librmm/_logger.pyx index e35b672e4..4392cb106 100644 --- a/python/rmm/rmm/_lib/lib.pxd +++ b/python/rmm/rmm/librmm/_logger.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from libc.stdint cimport uintptr_t -from libcpp cimport bool -from libcpp.utility cimport pair -from libcpp.vector cimport vector - -ctypedef pair[const char*, unsigned int] caller_pair +from rmm.librmm._logger cimport logging_level # no-cython-lint diff --git a/python/rmm/rmm/_lib/_torch_allocator.cpp b/python/rmm/rmm/librmm/_torch_allocator.cpp similarity index 100% rename from python/rmm/rmm/_lib/_torch_allocator.cpp rename to python/rmm/rmm/librmm/_torch_allocator.cpp diff --git a/python/rmm/rmm/librmm/cuda_stream.pxd b/python/rmm/rmm/librmm/cuda_stream.pxd new file mode 100644 index 000000000..3f2ac3361 --- /dev/null +++ b/python/rmm/rmm/librmm/cuda_stream.pxd @@ -0,0 +1,28 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool + +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + + +cdef extern from "rmm/cuda_stream.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream: + cuda_stream() except + + bool is_valid() except + + cudaStream_t value() except + + cuda_stream_view view() except + + void synchronize() except + + void synchronize_no_throw() diff --git a/python/rmm/rmm/librmm/cuda_stream_pool.pxd b/python/rmm/rmm/librmm/cuda_stream_pool.pxd new file mode 100644 index 000000000..4f2cbb36d --- /dev/null +++ b/python/rmm/rmm/librmm/cuda_stream_pool.pxd @@ -0,0 +1,23 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + + +cdef extern from "rmm/cuda_stream_pool.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream_pool: + cuda_stream_pool(size_t pool_size) + cuda_stream_view get_stream() + cuda_stream_view get_stream(size_t stream_id) except + + size_t get_pool_size() diff --git a/python/rmm/rmm/librmm/cuda_stream_view.pxd b/python/rmm/rmm/librmm/cuda_stream_view.pxd new file mode 100644 index 000000000..bf0d33c24 --- /dev/null +++ b/python/rmm/rmm/librmm/cuda_stream_view.pxd @@ -0,0 +1,32 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool + + +cdef extern from "rmm/cuda_stream_view.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream_view: + cuda_stream_view() + cuda_stream_view(cudaStream_t) + cudaStream_t value() + bool is_default() + bool is_per_thread_default() + void synchronize() except + + + cdef bool operator==(cuda_stream_view const, cuda_stream_view const) + + const cuda_stream_view cuda_stream_default + const cuda_stream_view cuda_stream_legacy + const cuda_stream_view cuda_stream_per_thread diff --git a/python/rmm/rmm/librmm/device_buffer.pxd b/python/rmm/rmm/librmm/device_buffer.pxd new file mode 100644 index 000000000..1c503ac9a --- /dev/null +++ b/python/rmm/rmm/librmm/device_buffer.pxd @@ -0,0 +1,58 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: + cdef cppclass cuda_device_id: + ctypedef int value_type + cuda_device_id() + cuda_device_id(value_type id) + value_type value() + + cdef cuda_device_id get_current_cuda_device() + +cdef extern from "rmm/prefetch.hpp" namespace "rmm" nogil: + cdef void prefetch(const void* ptr, + size_t bytes, + cuda_device_id device, + cuda_stream_view stream) except + + +cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: + cdef cppclass device_buffer: + device_buffer() + device_buffer( + size_t size, + cuda_stream_view stream, + device_memory_resource * + ) except + + device_buffer( + const void* source_data, + size_t size, + cuda_stream_view stream, + device_memory_resource * + ) except + + device_buffer( + const device_buffer buf, + cuda_stream_view stream, + device_memory_resource * + ) except + + void reserve(size_t new_capacity, cuda_stream_view stream) except + + void resize(size_t new_size, cuda_stream_view stream) except + + void shrink_to_fit(cuda_stream_view stream) except + + void* data() + size_t size() + size_t capacity() diff --git a/python/rmm/rmm/librmm/device_uvector.pxd b/python/rmm/rmm/librmm/device_uvector.pxd new file mode 100644 index 000000000..f560a9e38 --- /dev/null +++ b/python/rmm/rmm/librmm/device_uvector.pxd @@ -0,0 +1,39 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.device_buffer cimport device_buffer +from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: + cdef cppclass device_uvector[T]: + device_uvector(size_t size, cuda_stream_view stream) except + + T* element_ptr(size_t index) + void set_element(size_t element_index, const T& v, cuda_stream_view s) + void set_element_async( + size_t element_index, + const T& v, + cuda_stream_view s + ) except + + T front_element(cuda_stream_view s) except + + T back_element(cuda_stream_view s) except + + void reserve(size_t new_capacity, cuda_stream_view stream) except + + void resize(size_t new_size, cuda_stream_view stream) except + + void shrink_to_fit(cuda_stream_view stream) except + + device_buffer release() + size_t capacity() + T* data() + size_t size() + device_memory_resource* memory_resource() diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd new file mode 100644 index 000000000..9ddaf04b9 --- /dev/null +++ b/python/rmm/rmm/librmm/memory_resource.pxd @@ -0,0 +1,230 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This import is needed for Cython typing in translate_python_except_to_cpp +# See https://github.com/cython/cython/issues/5589 +from builtins import BaseException + +from libc.stddef cimport size_t +from libc.stdint cimport int8_t, int64_t +from libcpp cimport bool +from libcpp.optional cimport optional +from libcpp.pair cimport pair +from libcpp.string cimport string + +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/device_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass device_memory_resource: + void* allocate(size_t bytes) except + + void* allocate(size_t bytes, cuda_stream_view stream) except + + void deallocate(void* ptr, size_t bytes) except + + void deallocate( + void* ptr, + size_t bytes, + cuda_stream_view stream + ) except + + +cdef extern from "rmm/cuda_device.hpp" namespace "rmm" nogil: + size_t percent_of_free_device_memory(int percent) except + + pair[size_t, size_t] available_device_memory() except + + +# Transparent handle of a C++ exception +ctypedef pair[int, string] CppExcept + +cdef inline CppExcept translate_python_except_to_cpp(err: BaseException) noexcept: + """Translate a Python exception into a C++ exception handle + + The returned exception handle can then be thrown by `throw_cpp_except()`, + which MUST be done without holding the GIL. + + This is useful when C++ calls a Python function and needs to catch or + propagate exceptions. + """ + if isinstance(err, MemoryError): + return CppExcept(0, str.encode(str(err))) + return CppExcept(-1, str.encode(str(err))) + +# Implementation of `throw_cpp_except()`, which throws a given `CppExcept`. +# This function MUST be called without the GIL otherwise the thrown C++ +# exception are translated back into a Python exception. 
+cdef extern from *: + """ + #include + #include + + void throw_cpp_except(std::pair res) { + switch(res.first) { + case 0: + throw rmm::out_of_memory(res.second); + default: + throw std::runtime_error(res.second); + } + } + """ + void throw_cpp_except(CppExcept) nogil + + +cdef extern from "rmm/mr/device/cuda_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass cuda_memory_resource(device_memory_resource): + cuda_memory_resource() except + + +cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass managed_memory_resource(device_memory_resource): + managed_memory_resource() except + + +cdef extern from "rmm/mr/device/system_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass system_memory_resource(device_memory_resource): + system_memory_resource() except + + +cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass sam_headroom_memory_resource(device_memory_resource): + sam_headroom_memory_resource(size_t headroom) except + + +cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + + cdef cppclass cuda_async_memory_resource(device_memory_resource): + cuda_async_memory_resource( + optional[size_t] initial_pool_size, + optional[size_t] release_threshold, + optional[allocation_handle_type] export_handle_type) except + + +# TODO: when we adopt Cython 3.0 use enum class +cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \ + namespace \ + "rmm::mr::cuda_async_memory_resource::allocation_handle_type" \ + nogil: + enum allocation_handle_type \ + "rmm::mr::cuda_async_memory_resource::allocation_handle_type": + none + posix_file_descriptor + win32 + win32_kmt + + +cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass pool_memory_resource[Upstream](device_memory_resource): + pool_memory_resource( + Upstream* upstream_mr, + size_t initial_pool_size, + optional[size_t] maximum_pool_size) except + + size_t pool_size() + +cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass fixed_size_memory_resource[Upstream](device_memory_resource): + fixed_size_memory_resource( + Upstream* upstream_mr, + size_t block_size, + size_t block_to_preallocate) except + + +cdef extern from "rmm/mr/device/callback_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + ctypedef void* (*allocate_callback_t)(size_t, cuda_stream_view, void*) + ctypedef void (*deallocate_callback_t)(void*, size_t, cuda_stream_view, void*) + + cdef cppclass callback_memory_resource(device_memory_resource): + callback_memory_resource( + allocate_callback_t allocate_callback, + deallocate_callback_t deallocate_callback, + void* allocate_callback_arg, + void* deallocate_callback_arg + ) except + + +cdef extern from "rmm/mr/device/binning_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass binning_memory_resource[Upstream](device_memory_resource): + binning_memory_resource(Upstream* upstream_mr) except + + binning_memory_resource( + Upstream* upstream_mr, + int8_t min_size_exponent, + int8_t max_size_exponent) except + + + void add_bin(size_t allocation_size) except + + void add_bin( + size_t allocation_size, + device_memory_resource* bin_resource) except + + +cdef extern from "rmm/mr/device/limiting_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass limiting_resource_adaptor[Upstream](device_memory_resource): + 
limiting_resource_adaptor( + Upstream* upstream_mr, + size_t allocation_limit) except + + + size_t get_allocated_bytes() except + + size_t get_allocation_limit() except + + +cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass logging_resource_adaptor[Upstream](device_memory_resource): + logging_resource_adaptor( + Upstream* upstream_mr, + string filename) except + + + void flush() except + + +cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource): + struct counter: + counter() + + int64_t value + int64_t peak + int64_t total + + statistics_resource_adaptor(Upstream* upstream_mr) except + + + counter get_bytes_counter() except + + counter get_allocations_counter() except + + pair[counter, counter] pop_counters() except + + pair[counter, counter] push_counters() except + + +cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass tracking_resource_adaptor[Upstream](device_memory_resource): + tracking_resource_adaptor( + Upstream* upstream_mr, + bool capture_stacks) except + + + size_t get_allocated_bytes() except + + string get_outstanding_allocations_str() except + + void log_outstanding_allocations() except + + +cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + ctypedef bool (*failure_callback_t)(size_t, void*) + cdef cppclass failure_callback_resource_adaptor[Upstream]( + device_memory_resource + ): + failure_callback_resource_adaptor( + Upstream* upstream_mr, + failure_callback_t callback, + void* callback_arg + ) except + + +cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass prefetch_resource_adaptor[Upstream](device_memory_resource): + prefetch_resource_adaptor(Upstream* upstream_mr) except + diff --git a/python/rmm/rmm/librmm/per_device_resource.pxd b/python/rmm/rmm/librmm/per_device_resource.pxd new file mode 100644 index 000000000..63ee29056 --- /dev/null +++ b/python/rmm/rmm/librmm/per_device_resource.pxd @@ -0,0 +1,36 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: + cdef cppclass cuda_device_id: + ctypedef int value_type + + cuda_device_id(value_type id) + + value_type value() + +cdef extern from "rmm/mr/device/per_device_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef device_memory_resource* set_current_device_resource( + device_memory_resource* new_mr + ) + cdef device_memory_resource* get_current_device_resource() + cdef device_memory_resource* set_per_device_resource( + cuda_device_id id, device_memory_resource* new_mr + ) + cdef device_memory_resource* get_per_device_resource ( + cuda_device_id id + ) diff --git a/python/rmm/rmm/mr.py b/python/rmm/rmm/mr.py index 6eb94da0f..3f0c3fce3 100644 --- a/python/rmm/rmm/mr.py +++ b/python/rmm/rmm/mr.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from rmm._lib.memory_resource import ( +from rmm.pylibrmm.memory_resource import ( BinningMemoryResource, CallbackMemoryResource, CudaAsyncMemoryResource, diff --git a/python/rmm/rmm/pylibrmm/CMakeLists.txt b/python/rmm/rmm/pylibrmm/CMakeLists.txt new file mode 100644 index 000000000..0e88f01bb --- /dev/null +++ b/python/rmm/rmm/pylibrmm/CMakeLists.txt @@ -0,0 +1,27 @@ +# ============================================================================= +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources device_buffer.pyx logger.pyx memory_resource.pyx cuda_stream.pyx helper.pyx) +set(linked_libraries rmm::rmm) + +# Build all of the Cython targets +rapids_cython_create_modules(SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" + CXX) + +# mark all symbols in these Cython targets "hidden" by default, so they won't collide with symbols +# loaded from other DSOs +foreach(_cython_target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + set_target_properties(${_cython_target} PROPERTIES C_VISIBILITY_PRESET hidden + CXX_VISIBILITY_PRESET hidden) +endforeach() diff --git a/python/rmm/rmm/pylibrmm/__init__.py b/python/rmm/rmm/pylibrmm/__init__.py new file mode 100644 index 000000000..0b8672ef6 --- /dev/null +++ b/python/rmm/rmm/pylibrmm/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .device_buffer import DeviceBuffer diff --git a/python/rmm/rmm/pylibrmm/cuda_stream.pxd b/python/rmm/rmm/pylibrmm/cuda_stream.pxd new file mode 100644 index 000000000..dd38387c2 --- /dev/null +++ b/python/rmm/rmm/pylibrmm/cuda_stream.pxd @@ -0,0 +1,27 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cimport cython +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool +from libcpp.memory cimport unique_ptr + +from rmm.librmm.cuda_stream cimport cuda_stream + + +@cython.final +cdef class CudaStream: + cdef unique_ptr[cuda_stream] c_obj + cdef cudaStream_t value(self) except * nogil + cdef bool is_valid(self) except * nogil diff --git a/python/rmm/rmm/_lib/cuda_stream.pyx b/python/rmm/rmm/pylibrmm/cuda_stream.pyx similarity index 91% rename from python/rmm/rmm/_lib/cuda_stream.pyx rename to python/rmm/rmm/pylibrmm/cuda_stream.pyx index 0861f0663..d6aa4edc7 100644 --- a/python/rmm/rmm/_lib/cuda_stream.pyx +++ b/python/rmm/rmm/pylibrmm/cuda_stream.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ cimport cython from cuda.ccudart cimport cudaStream_t from libcpp cimport bool +from rmm.librmm.cuda_stream cimport cuda_stream + @cython.final cdef class CudaStream: diff --git a/python/rmm/rmm/pylibrmm/device_buffer.pxd b/python/rmm/rmm/pylibrmm/device_buffer.pxd new file mode 100644 index 000000000..a0d287423 --- /dev/null +++ b/python/rmm/rmm/pylibrmm/device_buffer.pxd @@ -0,0 +1,71 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from libc.stdint cimport uintptr_t +from libcpp.memory cimport unique_ptr + +from rmm._cuda.stream cimport Stream +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource + + +cdef class DeviceBuffer: + cdef unique_ptr[device_buffer] c_obj + + # Holds a reference to the DeviceMemoryResource used for allocation. + # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is + # needed for deallocation + cdef DeviceMemoryResource mr + + # Holds a reference to the stream used by the underlying `device_buffer`. 
+ # Ensures the stream does not get destroyed before this DeviceBuffer + cdef Stream stream + + @staticmethod + cdef DeviceBuffer c_from_unique_ptr( + unique_ptr[device_buffer] ptr, + Stream stream=*, + DeviceMemoryResource mr=*, + ) + + @staticmethod + cdef DeviceBuffer c_to_device(const unsigned char[::1] b, + Stream stream=*) except * + cpdef copy_to_host(self, ary=*, Stream stream=*) + cpdef copy_from_host(self, ary, Stream stream=*) + cpdef copy_from_device(self, cuda_ary, Stream stream=*) + cpdef bytes tobytes(self, Stream stream=*) + + cdef size_t c_size(self) except * + cpdef void reserve(self, size_t new_capacity, Stream stream=*) except * + cpdef void resize(self, size_t new_size, Stream stream=*) except * + cpdef size_t capacity(self) except * + cdef void* c_data(self) except * + + cdef device_buffer c_release(self) except * + +cpdef DeviceBuffer to_device(const unsigned char[::1] b, + Stream stream=*) +cpdef void copy_ptr_to_host(uintptr_t db, + unsigned char[::1] hb, + Stream stream=*) except * + +cpdef void copy_host_to_ptr(const unsigned char[::1] hb, + uintptr_t db, + Stream stream=*) except * + +cpdef void copy_device_to_ptr(uintptr_t d_src, + uintptr_t d_dst, + size_t count, + Stream stream=*) except * diff --git a/python/rmm/rmm/_lib/device_buffer.pyx b/python/rmm/rmm/pylibrmm/device_buffer.pyx similarity index 96% rename from python/rmm/rmm/_lib/device_buffer.pyx rename to python/rmm/rmm/pylibrmm/device_buffer.pyx index 94a4dc771..76fbceef8 100644 --- a/python/rmm/rmm/_lib/device_buffer.pyx +++ b/python/rmm/rmm/pylibrmm/device_buffer.pyx @@ -32,9 +32,16 @@ from cuda.ccudart cimport ( cudaStream_t, ) -from rmm._lib.memory_resource cimport ( +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.device_buffer cimport ( + cuda_device_id, + device_buffer, + get_current_cuda_device, + prefetch, +) +from rmm.librmm.memory_resource cimport device_memory_resource +from rmm.pylibrmm.memory_resource cimport ( DeviceMemoryResource, - device_memory_resource, get_current_device_resource, ) @@ -394,7 +401,7 @@ cpdef DeviceBuffer to_device(const unsigned char[::1] b, Examples -------- >>> import rmm - >>> db = rmm._lib.device_buffer.to_device(b"abc") + >>> db = rmm.pylibrmm.device_buffer.to_device(b"abc") >>> print(bytes(db)) b'abc' """ @@ -460,7 +467,7 @@ cpdef void copy_ptr_to_host(uintptr_t db, >>> import rmm >>> db = rmm.DeviceBuffer.to_device(b"abc") >>> hb = bytearray(db.nbytes) - >>> rmm._lib.device_buffer.copy_ptr_to_host(db.ptr, hb) + >>> rmm.pylibrmm.device_buffer.copy_ptr_to_host(db.ptr, hb) >>> print(hb) bytearray(b'abc') """ @@ -502,7 +509,7 @@ cpdef void copy_host_to_ptr(const unsigned char[::1] hb, >>> import rmm >>> db = rmm.DeviceBuffer(size=10) >>> hb = b"abc" - >>> rmm._lib.device_buffer.copy_host_to_ptr(hb, db.ptr) + >>> rmm.pylibrmm.device_buffer.copy_host_to_ptr(hb, db.ptr) >>> hb = db.copy_to_host() >>> print(hb) array([97, 98, 99, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) @@ -541,7 +548,7 @@ cpdef void copy_device_to_ptr(uintptr_t d_src, >>> import rmm >>> db = rmm.DeviceBuffer(size=5) >>> db2 = rmm.DeviceBuffer.to_device(b"abc") - >>> rmm._lib.device_buffer.copy_device_to_ptr(db2.ptr, db.ptr, db2.size) + >>> rmm.pylibrmm.device_buffer.copy_device_to_ptr(db2.ptr, db.ptr, db2.size) >>> hb = db.copy_to_host() >>> hb array([97, 98, 99, 0, 0], dtype=uint8) diff --git a/python/rmm/rmm/_lib/lib.pyx b/python/rmm/rmm/pylibrmm/helper.pxd similarity index 86% rename from python/rmm/rmm/_lib/lib.pyx rename to python/rmm/rmm/pylibrmm/helper.pxd index 
46753baa3..8ca151c00 100644 --- a/python/rmm/rmm/_lib/lib.pyx +++ b/python/rmm/rmm/pylibrmm/helper.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,3 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + +cdef object parse_bytes(object s) except * diff --git a/python/rmm/rmm/_lib/helper.pyx b/python/rmm/rmm/pylibrmm/helper.pyx similarity index 100% rename from python/rmm/rmm/_lib/helper.pyx rename to python/rmm/rmm/pylibrmm/helper.pyx diff --git a/python/rmm/rmm/_lib/logger.pyx b/python/rmm/rmm/pylibrmm/logger.pyx similarity index 77% rename from python/rmm/rmm/_lib/logger.pyx rename to python/rmm/rmm/pylibrmm/logger.pyx index 029bbdd79..119e1c92f 100644 --- a/python/rmm/rmm/_lib/logger.pyx +++ b/python/rmm/rmm/pylibrmm/logger.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,58 +14,9 @@ import warnings -from libcpp cimport bool - - -cdef extern from "spdlog/common.h" namespace "spdlog::level" nogil: - cpdef enum logging_level "spdlog::level::level_enum": - """ - The debug logging level for RMM. - - Debug logging prints messages to a log file. See - `Debug Logging `_ - for more information. - - Valid levels, in decreasing order of verbosity, are TRACE, DEBUG, - INFO, WARN, ERR, CRITICAL, and OFF. Default is INFO. - - Examples - -------- - >>> import rmm - >>> rmm.logging_level.DEBUG - - >>> rmm.logging_level.DEBUG.value - 1 - >>> rmm.logging_level.DEBUG.name - 'DEBUG' - - See Also - -------- - set_logging_level : Set the debug logging level - get_logging_level : Get the current debug logging level - """ - TRACE "spdlog::level::trace" - DEBUG "spdlog::level::debug" - INFO "spdlog::level::info" - WARN "spdlog::level::warn" - ERR "spdlog::level::err" - CRITICAL "spdlog::level::critical" - OFF "spdlog::level::off" - - -cdef extern from "spdlog/spdlog.h" namespace "spdlog" nogil: - cdef cppclass spdlog_logger "spdlog::logger": - spdlog_logger() except + - void set_level(logging_level level) - logging_level level() - void flush() except + - void flush_on(logging_level level) - logging_level flush_level() - bool should_log(logging_level msg_level) - - -cdef extern from "rmm/logger.hpp" namespace "rmm" nogil: - cdef spdlog_logger& logger() except + +from rmm.librmm._logger cimport logger + +from rmm.librmm._logger import logging_level def _validate_level_type(level): diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pxd b/python/rmm/rmm/pylibrmm/memory_resource.pxd new file mode 100644 index 000000000..985d5d31b --- /dev/null +++ b/python/rmm/rmm/pylibrmm/memory_resource.pxd @@ -0,0 +1,83 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from libcpp.memory cimport shared_ptr + +from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef class DeviceMemoryResource: + cdef shared_ptr[device_memory_resource] c_obj + cdef device_memory_resource* get_mr(self) noexcept nogil + +cdef class UpstreamResourceAdaptor(DeviceMemoryResource): + cdef readonly DeviceMemoryResource upstream_mr + + cpdef DeviceMemoryResource get_upstream(self) + +cdef class CudaMemoryResource(DeviceMemoryResource): + pass + +cdef class ManagedMemoryResource(DeviceMemoryResource): + pass + +cdef class SystemMemoryResource(DeviceMemoryResource): + pass + +cdef class SamHeadroomMemoryResource(DeviceMemoryResource): + pass + +cdef class CudaAsyncMemoryResource(DeviceMemoryResource): + pass + +cdef class PoolMemoryResource(UpstreamResourceAdaptor): + pass + +cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor): + pass + +cdef class BinningMemoryResource(UpstreamResourceAdaptor): + + cdef readonly list _bin_mrs + + cpdef add_bin( + self, + size_t allocation_size, + DeviceMemoryResource bin_resource=*) + +cdef class CallbackMemoryResource(DeviceMemoryResource): + cdef object _allocate_func + cdef object _deallocate_func + +cdef class LimitingResourceAdaptor(UpstreamResourceAdaptor): + pass + +cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor): + cdef object _log_file_name + cpdef get_file_name(self) + cpdef flush(self) + +cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor): + pass + +cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor): + pass + +cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): + cdef object _callback + +cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): + pass + +cpdef DeviceMemoryResource get_current_device_resource() diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx similarity index 82% rename from python/rmm/rmm/_lib/memory_resource.pyx rename to python/rmm/rmm/pylibrmm/memory_resource.pyx index 231253e3f..021125567 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx @@ -22,12 +22,11 @@ from collections import defaultdict cimport cython from cython.operator cimport dereference as deref from libc.stddef cimport size_t -from libc.stdint cimport int8_t, int64_t, uintptr_t +from libc.stdint cimport int8_t, uintptr_t from libcpp cimport bool from libcpp.memory cimport make_unique, unique_ptr from libcpp.optional cimport optional from libcpp.pair cimport pair -from libcpp.string cimport string from cuda.cudart import cudaError_t @@ -37,206 +36,43 @@ from rmm._cuda.stream cimport Stream from rmm._cuda.stream import DEFAULT_STREAM -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.helper cimport parse_bytes -from rmm._lib.memory_resource cimport ( - available_device_memory as c_available_device_memory, - percent_of_free_device_memory as c_percent_of_free_device_memory, -) -from rmm._lib.per_device_resource cimport ( +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.per_device_resource cimport ( cuda_device_id, 
set_per_device_resource as cpp_set_per_device_resource, ) +from rmm.pylibrmm.helper cimport parse_bytes from rmm.statistics import Statistics -# Transparent handle of a C++ exception -ctypedef pair[int, string] CppExcept - -cdef CppExcept translate_python_except_to_cpp(err: BaseException) noexcept: - """Translate a Python exception into a C++ exception handle - - The returned exception handle can then be thrown by `throw_cpp_except()`, - which MUST be done without holding the GIL. - - This is useful when C++ calls a Python function and needs to catch or - propagate exceptions. - """ - if isinstance(err, MemoryError): - return CppExcept(0, str.encode(str(err))) - return CppExcept(-1, str.encode(str(err))) - -# Implementation of `throw_cpp_except()`, which throws a given `CppExcept`. -# This function MUST be called without the GIL otherwise the thrown C++ -# exception are translated back into a Python exception. -cdef extern from *: - """ - #include - #include - - void throw_cpp_except(std::pair res) { - switch(res.first) { - case 0: - throw rmm::out_of_memory(res.second); - default: - throw std::runtime_error(res.second); - } - } - """ - void throw_cpp_except(CppExcept) nogil - - -# NOTE: Keep extern declarations in .pyx file as much as possible to avoid -# leaking dependencies when importing RMM Cython .pxd files -cdef extern from "rmm/mr/device/cuda_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass cuda_memory_resource(device_memory_resource): - cuda_memory_resource() except + - -cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass managed_memory_resource(device_memory_resource): - managed_memory_resource() except + - -cdef extern from "rmm/mr/device/system_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass system_memory_resource(device_memory_resource): - system_memory_resource() except + - -cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass sam_headroom_memory_resource(device_memory_resource): - sam_headroom_memory_resource(size_t headroom) except + - -cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - - cdef cppclass cuda_async_memory_resource(device_memory_resource): - cuda_async_memory_resource( - optional[size_t] initial_pool_size, - optional[size_t] release_threshold, - optional[allocation_handle_type] export_handle_type) except + - -# TODO: when we adopt Cython 3.0 use enum class -cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \ - namespace \ - "rmm::mr::cuda_async_memory_resource::allocation_handle_type" \ - nogil: - enum allocation_handle_type \ - "rmm::mr::cuda_async_memory_resource::allocation_handle_type": - none - posix_file_descriptor - win32 - win32_kmt - - -cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass pool_memory_resource[Upstream](device_memory_resource): - pool_memory_resource( - Upstream* upstream_mr, - size_t initial_pool_size, - optional[size_t] maximum_pool_size) except + - size_t pool_size() - -cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass fixed_size_memory_resource[Upstream](device_memory_resource): - fixed_size_memory_resource( - Upstream* upstream_mr, - size_t block_size, - size_t block_to_preallocate) except + - -cdef extern from "rmm/mr/device/callback_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - ctypedef void* 
(*allocate_callback_t)(size_t, cuda_stream_view, void*) - ctypedef void (*deallocate_callback_t)(void*, size_t, cuda_stream_view, void*) - - cdef cppclass callback_memory_resource(device_memory_resource): - callback_memory_resource( - allocate_callback_t allocate_callback, - deallocate_callback_t deallocate_callback, - void* allocate_callback_arg, - void* deallocate_callback_arg - ) except + - -cdef extern from "rmm/mr/device/binning_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass binning_memory_resource[Upstream](device_memory_resource): - binning_memory_resource(Upstream* upstream_mr) except + - binning_memory_resource( - Upstream* upstream_mr, - int8_t min_size_exponent, - int8_t max_size_exponent) except + - - void add_bin(size_t allocation_size) except + - void add_bin( - size_t allocation_size, - device_memory_resource* bin_resource) except + - -cdef extern from "rmm/mr/device/limiting_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass limiting_resource_adaptor[Upstream](device_memory_resource): - limiting_resource_adaptor( - Upstream* upstream_mr, - size_t allocation_limit) except + - - size_t get_allocated_bytes() except + - size_t get_allocation_limit() except + - -cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass logging_resource_adaptor[Upstream](device_memory_resource): - logging_resource_adaptor( - Upstream* upstream_mr, - string filename) except + - - void flush() except + - -cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource): - struct counter: - counter() - - int64_t value - int64_t peak - int64_t total - - statistics_resource_adaptor(Upstream* upstream_mr) except + - - counter get_bytes_counter() except + - counter get_allocations_counter() except + - pair[counter, counter] pop_counters() except + - pair[counter, counter] push_counters() except + - -cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass tracking_resource_adaptor[Upstream](device_memory_resource): - tracking_resource_adaptor( - Upstream* upstream_mr, - bool capture_stacks) except + - - size_t get_allocated_bytes() except + - string get_outstanding_allocations_str() except + - void log_outstanding_allocations() except + - -cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - ctypedef bool (*failure_callback_t)(size_t, void*) - cdef cppclass failure_callback_resource_adaptor[Upstream]( - device_memory_resource - ): - failure_callback_resource_adaptor( - Upstream* upstream_mr, - failure_callback_t callback, - void* callback_arg - ) except + - -cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass prefetch_resource_adaptor[Upstream](device_memory_resource): - prefetch_resource_adaptor(Upstream* upstream_mr) except + +from rmm.librmm.memory_resource cimport ( + CppExcept, + allocate_callback_t, + allocation_handle_type, + available_device_memory as c_available_device_memory, + binning_memory_resource, + callback_memory_resource, + cuda_async_memory_resource, + cuda_memory_resource, + deallocate_callback_t, + device_memory_resource, + failure_callback_resource_adaptor, + failure_callback_t, + fixed_size_memory_resource, + limiting_resource_adaptor, + logging_resource_adaptor, + managed_memory_resource, + percent_of_free_device_memory as 
c_percent_of_free_device_memory, + pool_memory_resource, + posix_file_descriptor, + prefetch_resource_adaptor, + sam_headroom_memory_resource, + statistics_resource_adaptor, + system_memory_resource, + throw_cpp_except, + tracking_resource_adaptor, + translate_python_except_to_cpp, +) cdef class DeviceMemoryResource: diff --git a/python/rmm/rmm/_lib/tests/__init__.py b/python/rmm/rmm/pylibrmm/tests/__init__.py similarity index 100% rename from python/rmm/rmm/_lib/tests/__init__.py rename to python/rmm/rmm/pylibrmm/tests/__init__.py diff --git a/python/rmm/rmm/_lib/tests/test_device_buffer.pyx b/python/rmm/rmm/pylibrmm/tests/test_device_buffer.pyx similarity index 83% rename from python/rmm/rmm/_lib/tests/test_device_buffer.pyx rename to python/rmm/rmm/pylibrmm/tests/test_device_buffer.pyx index 733383827..ec2ff4def 100644 --- a/python/rmm/rmm/_lib/tests/test_device_buffer.pyx +++ b/python/rmm/rmm/pylibrmm/tests/test_device_buffer.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,8 +16,9 @@ import numpy as np from libcpp.memory cimport make_unique -from rmm._lib.cuda_stream_view cimport cuda_stream_default -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.cuda_stream_view cimport cuda_stream_default +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer def test_release(): diff --git a/python/rmm/rmm/tests/test_cython.py b/python/rmm/rmm/tests/test_cython.py index 82eba2451..5df933435 100644 --- a/python/rmm/rmm/tests/test_cython.py +++ b/python/rmm/rmm/tests/test_cython.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ def wrapped(*args, **kwargs): return wrapped -cython_test_modules = ["rmm._lib.tests.test_device_buffer"] +cython_test_modules = ["rmm.pylibrmm.tests.test_device_buffer"] for mod in cython_test_modules: diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index c88d21b38..c03b9e501 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -354,7 +354,7 @@ def test_rmm_pool_numba_stream(stream): rmm.reinitialize(pool_allocator=True) stream = rmm._cuda.stream.Stream(stream) - a = rmm._lib.device_buffer.DeviceBuffer(size=3, stream=stream) + a = rmm.pylibrmm.device_buffer.DeviceBuffer(size=3, stream=stream) assert a.size == 3 assert a.ptr != 0 From 815003232d90a45fe6867214e73284649c639066 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 4 Oct 2024 15:10:01 -0400 Subject: [PATCH 12/27] Fix `rmm ._lib` imports (#1693) This PR fixes a bug in #1676. It makes sure that rmm imports work correctly using both `from rmm._lib...` and `import rmm._lib...` syntax. I'm adding DO NOT MERGE until I do some more testing. 
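A minimal smoke test of the behavior this PR targets (a hypothetical sketch, not part of the patch itself; it assumes the `rmm._lib` shim modules added below simply re-export from `rmm.pylibrmm`, as the new `rmm/_lib/device_buffer.py` does):

    # Both spellings should keep working and resolve to the same object.
    import rmm._lib.device_buffer                        # "import rmm._lib..." style
    from rmm._lib.device_buffer import DeviceBuffer      # "from rmm._lib..." style
    from rmm.pylibrmm.device_buffer import DeviceBuffer as NewDeviceBuffer

    # The shim re-exports the class, so all three names are the same object.
    assert rmm._lib.device_buffer.DeviceBuffer is DeviceBuffer is NewDeviceBuffer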
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1693 --- python/rmm/rmm/_lib/cuda_stream.py | 15 +++++++++ python/rmm/rmm/_lib/device_buffer.py | 21 ++++++++++++ python/rmm/rmm/_lib/logger.py | 24 ++++++++++++++ python/rmm/rmm/_lib/memory_resource.py | 44 ++++++++++++++++++++++++++ 4 files changed, 104 insertions(+) create mode 100644 python/rmm/rmm/_lib/cuda_stream.py create mode 100644 python/rmm/rmm/_lib/device_buffer.py create mode 100644 python/rmm/rmm/_lib/logger.py create mode 100644 python/rmm/rmm/_lib/memory_resource.py diff --git a/python/rmm/rmm/_lib/cuda_stream.py b/python/rmm/rmm/_lib/cuda_stream.py new file mode 100644 index 000000000..1eb424e12 --- /dev/null +++ b/python/rmm/rmm/_lib/cuda_stream.py @@ -0,0 +1,15 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.pylibrmm.cuda_stream import CudaStream # noqa: F401 diff --git a/python/rmm/rmm/_lib/device_buffer.py b/python/rmm/rmm/_lib/device_buffer.py new file mode 100644 index 000000000..c531bca5f --- /dev/null +++ b/python/rmm/rmm/_lib/device_buffer.py @@ -0,0 +1,21 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.pylibrmm.device_buffer import ( # noqa: F401 + DeviceBuffer, + copy_device_to_ptr, + copy_host_to_ptr, + copy_ptr_to_host, + to_device, +) diff --git a/python/rmm/rmm/_lib/logger.py b/python/rmm/rmm/_lib/logger.py new file mode 100644 index 000000000..1e9b519b8 --- /dev/null +++ b/python/rmm/rmm/_lib/logger.py @@ -0,0 +1,24 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from rmm.librmm._logger import logging_level # noqa: F401 +from rmm.pylibrmm.logger import ( # noqa: F401 + _validate_level_type, + flush_logger, + get_flush_level, + get_logging_level, + set_flush_level, + set_logging_level, + should_log, +) diff --git a/python/rmm/rmm/_lib/memory_resource.py b/python/rmm/rmm/_lib/memory_resource.py new file mode 100644 index 000000000..0d47e8c9b --- /dev/null +++ b/python/rmm/rmm/_lib/memory_resource.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.pylibrmm.memory_resource import ( # noqa: F401 + BinningMemoryResource, + CallbackMemoryResource, + CudaAsyncMemoryResource, + CudaMemoryResource, + DeviceMemoryResource, + FailureCallbackResourceAdaptor, + FixedSizeMemoryResource, + LimitingResourceAdaptor, + LoggingResourceAdaptor, + ManagedMemoryResource, + PoolMemoryResource, + PrefetchResourceAdaptor, + SamHeadroomMemoryResource, + StatisticsResourceAdaptor, + SystemMemoryResource, + TrackingResourceAdaptor, + UpstreamResourceAdaptor, + _flush_logs, + available_device_memory, + disable_logging, + enable_logging, + get_current_device_resource, + get_current_device_resource_type, + get_log_filenames, + get_per_device_resource_type, + is_initialized, + set_current_device_resource, + set_per_device_resource, +) From c494395e58288cac16321ce90e9b15f3508ae89a Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 4 Oct 2024 16:18:35 -0400 Subject: [PATCH 13/27] Prune workflows based on changed files (#1695) Contributes to https://github.com/rapidsai/build-planning/issues/94 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/rmm/pull/1695 --- .github/workflows/pr.yaml | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index afc9f7487..4dfcaf1ae 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,6 +12,7 @@ concurrency: jobs: pr-builder: needs: + - changed-files - checks - conda-cpp-build - conda-cpp-tests @@ -24,6 +25,29 @@ jobs: - devcontainer secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 + if: always() + with: + needs: ${{ toJSON(needs) }} + changed-files: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 + with: + files_yaml: | + test_cpp: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!CONTRIBUTING.md' + - '!README.md' + - '!img/**' + - '!python/**' + test_python: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!CONTRIBUTING.md' + - '!README.md' + - '!img/**' checks: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 @@ -36,9 +60,10 @@ jobs: with: build_type: pull-request conda-cpp-tests: - needs: conda-cpp-build + needs: [conda-cpp-build, changed-files] secrets: inherit uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: @@ -48,9 +73,10 @@ jobs: with: build_type: pull-request conda-python-tests: - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request docs-build: @@ -79,9 +105,10 @@ jobs: build_type: pull-request script: ci/build_wheel_python.sh wheel-tests: - needs: wheel-build-python + needs: [wheel-build-python, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel.sh From afe0a3336397b17a96bb703e82f3b6365ee7c41e Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 9 Oct 2024 09:39:40 -0400 Subject: [PATCH 14/27] Update Changelog [skip ci] --- CHANGELOG.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 986cf7052..1268762b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,50 @@ +# rmm 24.10.00 (9 Oct 2024) + +## 🚨 Breaking Changes + +- Inline functions that return static references must have default visibility ([#1653](https://github.com/rapidsai/rmm/pull/1653)) [@wence-](https://github.com/wence-) +- Hide visibility of non-public symbols ([#1644](https://github.com/rapidsai/rmm/pull/1644)) [@jameslamb](https://github.com/jameslamb) +- Deprecate adaptor factories. ([#1626](https://github.com/rapidsai/rmm/pull/1626)) [@bdice](https://github.com/bdice) + +## 🐛 Bug Fixes + +- Add missing include to `resource_ref.hpp` ([#1677](https://github.com/rapidsai/rmm/pull/1677)) [@miscco](https://github.com/miscco) +- Remove the friend declaration with an attribute ([#1669](https://github.com/rapidsai/rmm/pull/1669)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Fix `build.sh clean` to delete python build directory ([#1658](https://github.com/rapidsai/rmm/pull/1658)) [@rongou](https://github.com/rongou) +- Stream synchronize before deallocating SAM ([#1655](https://github.com/rapidsai/rmm/pull/1655)) [@rongou](https://github.com/rongou) +- Explicitly mark RMM headers with `RMM_EXPORT` ([#1654](https://github.com/rapidsai/rmm/pull/1654)) [@robertmaynard](https://github.com/robertmaynard) +- Inline functions that return static references must have default visibility ([#1653](https://github.com/rapidsai/rmm/pull/1653)) [@wence-](https://github.com/wence-) +- Use `tool.scikit-build.cmake.version` ([#1637](https://github.com/rapidsai/rmm/pull/1637)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) + +## 📖 Documentation + +- Recommend `miniforge` for conda install. 
([#1681](https://github.com/rapidsai/rmm/pull/1681)) [@bdice](https://github.com/bdice) +- Fix docs cross reference in DeviceBuffer.prefetch ([#1636](https://github.com/rapidsai/rmm/pull/1636)) [@bdice](https://github.com/bdice) + +## 🚀 New Features + +- [FEA] Allow setting `*_pool_size` with human-readable string ([#1670](https://github.com/rapidsai/rmm/pull/1670)) [@Matt711](https://github.com/Matt711) +- Update RMM adaptors, containers and tests to use get/set_current_device_resource_ref() ([#1661](https://github.com/rapidsai/rmm/pull/1661)) [@harrism](https://github.com/harrism) +- Deprecate adaptor factories. ([#1626](https://github.com/rapidsai/rmm/pull/1626)) [@bdice](https://github.com/bdice) +- Allow testing of earliest/latest dependencies ([#1613](https://github.com/rapidsai/rmm/pull/1613)) [@seberg](https://github.com/seberg) +- Add resource_ref versions of get/set_current_device_resource ([#1598](https://github.com/rapidsai/rmm/pull/1598)) [@harrism](https://github.com/harrism) + +## 🛠️ Improvements + +- Update update-version.sh to use packaging lib ([#1685](https://github.com/rapidsai/rmm/pull/1685)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use CI workflow branch 'branch-24.10' again ([#1683](https://github.com/rapidsai/rmm/pull/1683)) [@jameslamb](https://github.com/jameslamb) +- Update fmt (to 11.0.2) and spdlog (to 1.14.1). ([#1678](https://github.com/rapidsai/rmm/pull/1678)) [@jameslamb](https://github.com/jameslamb) +- Attempt to address oom failures in test suite ([#1672](https://github.com/rapidsai/rmm/pull/1672)) [@wence-](https://github.com/wence-) +- Add support for Python 3.12 ([#1666](https://github.com/rapidsai/rmm/pull/1666)) [@jameslamb](https://github.com/jameslamb) +- Update rapidsai/pre-commit-hooks ([#1663](https://github.com/rapidsai/rmm/pull/1663)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Drop Python 3.9 support ([#1659](https://github.com/rapidsai/rmm/pull/1659)) [@jameslamb](https://github.com/jameslamb) +- Remove NumPy <2 pin ([#1650](https://github.com/rapidsai/rmm/pull/1650)) [@seberg](https://github.com/seberg) +- Hide visibility of non-public symbols ([#1644](https://github.com/rapidsai/rmm/pull/1644)) [@jameslamb](https://github.com/jameslamb) +- Update pre-commit hooks ([#1643](https://github.com/rapidsai/rmm/pull/1643)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Improve update-version.sh ([#1640](https://github.com/rapidsai/rmm/pull/1640)) [@bdice](https://github.com/bdice) +- Install headers into `${CMAKE_INSTALL_INCLUDEDIR}` ([#1633](https://github.com/rapidsai/rmm/pull/1633)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Merge branch-24.08 into branch-24.10 ([#1631](https://github.com/rapidsai/rmm/pull/1631)) [@jameslamb](https://github.com/jameslamb) + # rmm 24.08.00 (7 Aug 2024) ## 🚨 Breaking Changes From 4e519bbf94dd1641dfb69fc171f714c38a7d0894 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 9 Oct 2024 18:09:39 +0100 Subject: [PATCH 15/27] Update cross-link to cuda-python object (#1699) nvidia/cuda-python#137 reorganised the low-level binding structure which broke our cross-linking, update to the new name to fix. 
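
For reference, the relocated attribute is used as follows when prefetching a managed buffer to the CPU. This is a minimal sketch assuming cuda-python's new `cuda.bindings` layout and a managed-memory resource (prefetching applies to managed memory only); the 1024-byte size is illustrative.

```python
# Minimal sketch: prefetch a managed DeviceBuffer to the CPU using the
# relocated cuda-python binding referenced in the updated docstring.
from cuda.bindings import runtime  # new module layout from nvidia/cuda-python#137
import rmm

rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())
buf = rmm.DeviceBuffer(size=1024)

# cudaCpuDeviceId asks the driver to prefetch the allocation to host memory.
buf.prefetch(device=runtime.cudaCpuDeviceId)
```
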
- Closes #1698 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Murray (https://github.com/Matt711) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/1699 --- python/rmm/rmm/pylibrmm/device_buffer.pyx | 2 +- python/rmm/rmm/statistics.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/rmm/rmm/pylibrmm/device_buffer.pyx b/python/rmm/rmm/pylibrmm/device_buffer.pyx index 76fbceef8..c2e95e845 100644 --- a/python/rmm/rmm/pylibrmm/device_buffer.pyx +++ b/python/rmm/rmm/pylibrmm/device_buffer.pyx @@ -156,7 +156,7 @@ cdef class DeviceBuffer: device : optional The CUDA device to which to prefetch the memory for this buffer. Defaults to the current CUDA device. To prefetch to the CPU, pass - :py:attr:`~cuda.cudart.cudaCpuDeviceId` as the device. + :py:attr:`~cuda.bindings.runtime.cudaCpuDeviceId` as the device. stream : optional CUDA stream to use for prefetching. Defaults to self.stream """ diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py index 279e45dc6..2dabedce6 100644 --- a/python/rmm/rmm/statistics.py +++ b/python/rmm/rmm/statistics.py @@ -74,8 +74,8 @@ def enable_statistics() -> None: def get_statistics() -> Optional[Statistics]: """Get the current allocation statistics. - Return - ------ + Returns + ------- If enabled, returns the current tracked statistics. If disabled, returns None. """ @@ -94,8 +94,8 @@ def push_statistics() -> Optional[Statistics]: If statistics are disabled (the current memory resource is not an instance of StatisticsResourceAdaptor), this function is a no-op. - Return - ------ + Returns + ------- If enabled, returns the current tracked statistics _before_ the pop. If disabled, returns None. """ @@ -114,8 +114,8 @@ def pop_statistics() -> Optional[Statistics]: If statistics are disabled (the current memory resource is not an instance of StatisticsResourceAdaptor), this function is a no-op. - Return - ------ + Returns + ------- If enabled, returns the popped counters. If disabled, returns None. """ @@ -232,8 +232,8 @@ def report( ordered_by Sort the statistics by this attribute. - Return - ------ + Returns + ------- The pretty formatted string of the memory statistics """ @@ -279,8 +279,8 @@ def _get_descriptive_name_of_object(obj: object) -> str: obj Object in question - Return - ------ + Returns + ------- A string including filename, line number, and object name. """ From 69a297d82641fcb61eac92ac6de42658cfa651f6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 9 Oct 2024 19:00:50 +0100 Subject: [PATCH 16/27] Add BUILD_SHARED_LIBS option defaulting to ON (#1702) This means that downstream libraries that get their `fmt` dependency from RMM will use `-DFMT_SHARED` in the compile command: this matches what the rapids combined devcontainers do, so we get sccache hits. 
- Closes #1701 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/rmm/pull/1702 --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 39d5dccde..26fcf1fd0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,9 @@ rapids_cmake_build_type(Release) option(RMM_NVTX "Build RMM with NVTX support" OFF) option(BUILD_TESTS "Configure CMake to build tests" ON) option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF) +# This is mostly so that dependent libraries, such as fmt, are configured in shared mode for +# downstream dependents of RMM that get their common dependencies transitively. +option(BUILD_SHARED_LIBS "Build RMM shared libraries" ON) set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") From f7155183645f640fa5695a0558d9708703f5b2a6 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 9 Oct 2024 13:28:23 -0500 Subject: [PATCH 17/27] make conda installs in CI stricter (#1696) Contributes to https://github.com/rapidsai/build-planning/issues/106 Proposes specifying the RAPIDS version in `conda install` calls in CI that install CI artifacts, to reduce the risk of CI jobs picking up artifacts from other releases. Authors: - James Lamb (https://github.com/jameslamb) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/rmm/pull/1696 --- ci/build_docs.sh | 9 +++++---- ci/test_cpp.sh | 5 ++++- ci/test_python.sh | 5 ++++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 970417c1d..fadaf0f27 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -6,6 +6,9 @@ set -euo pipefail rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" + rapids-dependency-file-generator \ --output conda \ --file-key docs \ @@ -23,11 +26,9 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - rmm librmm + "rmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" -export RAPIDS_VERSION="$(rapids-version)" -export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 9ad1c9536..02435f249 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -8,6 +8,8 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ --output conda \ @@ -29,7 +31,8 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - librmm librmm-tests + "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "librmm-tests=${RAPIDS_VERSION_MAJOR_MINOR}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python.sh b/ci/test_python.sh index 386d0b063..7a688107e 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -9,6 +9,8 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ rapids-logger "Create test conda environment" . 
/opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-dependency-file-generator \ --output conda \ --file-key test_python \ @@ -28,7 +30,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - rmm librmm + "rmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} From 90a5631e1093ce44c4feceb88fcf557c3dfc043b Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:03:33 -0400 Subject: [PATCH 18/27] Fix typos in .gitignore (#1697) Small fix to some typos that cropped up in the .gitignore with #1676 Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1697 --- .gitignore | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 36aafe643..df9d920d5 100644 --- a/.gitignore +++ b/.gitignore @@ -22,13 +22,13 @@ rmm.egg-info/ python/build python/*/build python/rmm/docs/_build -python/rmm/**/librmmm/**/*.cpp -!python/rmm/librmmm/_torch_allocator.cpp +python/rmm/**/librmm/**/*.cpp +!python/rmm/librmm/_torch_allocator.cpp python/rmm/**/librmm/**/*.h python/rmm/**/librmm/.nfs* -python/rmm/**/pylibrmmm/**/*.cpp -python/rmm/**/pylibrmmm/**/*.h -python/rmm/**/pylibrmmm/.nfs* +python/rmm/**/pylibrmm/**/*.cpp +python/rmm/**/pylibrmm/**/*.h +python/rmm/**/pylibrmm/.nfs* python/rmm/_cuda/*.cpp python/rmm/tests/*.cpp python/rmm/*.ipynb From 1b70ffdd5ab460ac481f1575c42e8c1fccfda792 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 11 Oct 2024 15:09:11 -0500 Subject: [PATCH 19/27] make conda installs in CI stricter (part 2) (#1703) Follow-up to #1696 Changes relative to that PR: * switches to pinning CI conda installs to the output of `rapids-version` (`{major}.{minor}.{patch}`) instead of `rapids-version-major-minor` (`{major}.{minor}`), to get a bit more protection in the presence of hotfix releases * restores some exporting of variables needed for docs builds In #1696, I'd missed that this project's Doxygen setup is expecting to find `RAPIDS_VERSION_MAJOR_MINOR` defined in the environment. https://github.com/rapidsai/rmm/blob/90a5631e1093ce44c4feceb88fcf557c3dfc043b/ci/build_docs.sh#L36 https://github.com/rapidsai/rmm/blob/90a5631e1093ce44c4feceb88fcf557c3dfc043b/doxygen/Doxyfile#L41 Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Mike Sarahan (https://github.com/msarahan) URL: https://github.com/rapidsai/rmm/pull/1703 --- ci/build_docs.sh | 10 +++++----- ci/test_cpp.sh | 6 +++--- ci/test_python.sh | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index fadaf0f27..844dae1c6 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -6,8 +6,8 @@ set -euo pipefail rapids-logger "Create test conda environment" . 
/opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" +RAPIDS_VERSION="$(rapids-version)" +export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" rapids-dependency-file-generator \ --output conda \ @@ -26,8 +26,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "rmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" + "rmm=${RAPIDS_VERSION}" \ + "librmm=${RAPIDS_VERSION}" export RAPIDS_DOCS_DIR="$(mktemp -d)" @@ -45,4 +45,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/rmm/html" mv _build/dirhtml/* "${RAPIDS_DOCS_DIR}/rmm/html" popd -rapids-upload-docs +RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 02435f249..975477a6e 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -8,7 +8,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ . /opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ @@ -31,8 +31,8 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "librmm-tests=${RAPIDS_VERSION_MAJOR_MINOR}" + "librmm=${RAPIDS_VERSION}" \ + "librmm-tests=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python.sh b/ci/test_python.sh index 7a688107e..51d0a48c3 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -9,7 +9,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-dependency-file-generator \ --output conda \ @@ -30,8 +30,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "rmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" + "rmm=${RAPIDS_VERSION}" \ + "librmm=${RAPIDS_VERSION}" RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} From de42f5711386f6b914cef0fc54d3081a936c5740 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 16 Oct 2024 01:22:40 -0700 Subject: [PATCH 20/27] Deprecate support for directly accessing logger (#1690) Contributes to https://github.com/rapidsai/build-planning/issues/104 This PR removes support for accessing rmm's underlying spdlog logger directly. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1690 --- include/rmm/logger.hpp | 25 ++++++++++++++++--------- python/rmm/rmm/librmm/_logger.pxd | 2 +- tests/mr/device/tracking_mr_tests.cpp | 10 +++++----- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/include/rmm/logger.hpp b/include/rmm/logger.hpp index 326385f16..eba3f122b 100644 --- a/include/rmm/logger.hpp +++ b/include/rmm/logger.hpp @@ -96,6 +96,11 @@ struct bytes { } }; +inline spdlog::logger& logger() +{ + static detail::logger_wrapper wrapped{}; + return wrapped.logger_; +} } // namespace detail /** @@ -107,10 +112,12 @@ struct bytes { * * @return spdlog::logger& The logger. */ -RMM_EXPORT inline spdlog::logger& logger() +[[deprecated( + "Support for direct access to spdlog loggers in rmm is planned for " + "removal")]] RMM_EXPORT inline spdlog::logger& +logger() { - static detail::logger_wrapper wrapped{}; - return wrapped.logger_; + return detail::logger(); } //! @cond Doxygen_Suppress @@ -118,12 +125,12 @@ RMM_EXPORT inline spdlog::logger& logger() // The default is INFO, but it should be used sparingly, so that by default a log file is only // output if there is important information, warnings, errors, and critical failures // Log messages that require computation should only be used at level TRACE and DEBUG -#define RMM_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&rmm::logger(), __VA_ARGS__) -#define RMM_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&rmm::logger(), __VA_ARGS__) -#define RMM_LOG_INFO(...) SPDLOG_LOGGER_INFO(&rmm::logger(), __VA_ARGS__) -#define RMM_LOG_WARN(...) SPDLOG_LOGGER_WARN(&rmm::logger(), __VA_ARGS__) -#define RMM_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&rmm::logger(), __VA_ARGS__) -#define RMM_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&rmm::logger(), __VA_ARGS__) +#define RMM_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&rmm::detail::logger(), __VA_ARGS__) +#define RMM_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&rmm::detail::logger(), __VA_ARGS__) +#define RMM_LOG_INFO(...) SPDLOG_LOGGER_INFO(&rmm::detail::logger(), __VA_ARGS__) +#define RMM_LOG_WARN(...) SPDLOG_LOGGER_WARN(&rmm::detail::logger(), __VA_ARGS__) +#define RMM_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&rmm::detail::logger(), __VA_ARGS__) +#define RMM_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&rmm::detail::logger(), __VA_ARGS__) //! 
@endcond diff --git a/python/rmm/rmm/librmm/_logger.pxd b/python/rmm/rmm/librmm/_logger.pxd index 241a748c3..fb2126b2f 100644 --- a/python/rmm/rmm/librmm/_logger.pxd +++ b/python/rmm/rmm/librmm/_logger.pxd @@ -62,5 +62,5 @@ cdef extern from "spdlog/spdlog.h" namespace "spdlog" nogil: bool should_log(logging_level msg_level) -cdef extern from "rmm/logger.hpp" namespace "rmm" nogil: +cdef extern from "rmm/logger.hpp" namespace "rmm::detail" nogil: cdef spdlog_logger& logger() except + diff --git a/tests/mr/device/tracking_mr_tests.cpp b/tests/mr/device/tracking_mr_tests.cpp index acd540ae6..3fce55fb8 100644 --- a/tests/mr/device/tracking_mr_tests.cpp +++ b/tests/mr/device/tracking_mr_tests.cpp @@ -204,8 +204,8 @@ TEST(TrackingTest, LogOutstandingAllocations) { std::ostringstream oss; auto oss_sink = std::make_shared(oss); - rmm::logger().sinks().push_back(oss_sink); - auto old_level = rmm::logger().level(); + rmm::detail::logger().sinks().push_back(oss_sink); + auto old_level = rmm::detail::logger().level(); tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; @@ -213,7 +213,7 @@ TEST(TrackingTest, LogOutstandingAllocations) allocations.push_back(mr.allocate(ten_MiB)); } - rmm::logger().set_level(spdlog::level::debug); + rmm::detail::logger().set_level(spdlog::level::debug); EXPECT_NO_THROW(mr.log_outstanding_allocations()); #if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG @@ -224,8 +224,8 @@ TEST(TrackingTest, LogOutstandingAllocations) mr.deallocate(allocation, ten_MiB); } - rmm::logger().set_level(old_level); - rmm::logger().sinks().pop_back(); + rmm::detail::logger().set_level(old_level); + rmm::detail::logger().sinks().pop_back(); } } // namespace From 50e60a868af05cc9f65b9980753d708e7170f3a1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 16 Oct 2024 16:16:32 -0500 Subject: [PATCH 21/27] Fix docs warning (#1706) Closes https://github.com/rapidsai/rmm/issues/1705. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/rmm/pull/1706 --- python/rmm/docs/conf.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/python/rmm/docs/conf.py b/python/rmm/docs/conf.py index d48dc2b42..0b2c21d5a 100644 --- a/python/rmm/docs/conf.py +++ b/python/rmm/docs/conf.py @@ -12,6 +12,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # +import datetime import os import re @@ -22,8 +23,8 @@ # -- Project information ----------------------------------------------------- project = "rmm" -copyright = "2020-2023, NVIDIA" -author = "NVIDIA" +copyright = f"2018-{datetime.datetime.today().year}, NVIDIA Corporation" +author = "NVIDIA Corporation" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -118,19 +119,6 @@ html_theme = "sphinx_rtd_theme" -# on_rtd is whether we are on readthedocs.org -on_rtd = os.environ.get("READTHEDOCS", None) == "True" - -if not on_rtd: - # only import and set the theme if we're building docs locally - # otherwise, readthedocs.org uses their theme by default, - # so no need to specify it - import sphinx_rtd_theme - - html_theme = "sphinx_rtd_theme" - html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - - # Theme options are theme-specific and customize the look and feel of a theme # further. 
For a list of options available for each theme, see the # documentation. From 1024a1250cfde7e93d26dc6d5e063e84c4a39824 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 22 Oct 2024 08:20:02 -0400 Subject: [PATCH 22/27] Update rmm tests to use rapids_cmake_support_conda_env (#1707) Fixes issue brought up in https://github.com/rapidsai/rapids-cmake/issues/634#issuecomment-2345129521 where rmm wasn't using rapids_cmake_support_conda_env Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1707 --- tests/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ea1af58cd..0258c59c5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -19,13 +19,16 @@ option(CODE_COVERAGE "Enable generating code coverage with gcov." OFF) include(rapids-test) rapids_test_init() +# Ensure tests are using the conda env, so they have the correct release/debug compile flags +rapids_cmake_support_conda_env(conda_env) + # This function takes in a test name and test source and handles setting all of the associated # properties and linking to build the test function(ConfigureTestInternal TEST_NAME) add_executable(${TEST_NAME} ${ARGN}) target_include_directories(${TEST_NAME} PRIVATE "$") target_link_libraries(${TEST_NAME} GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main - pthread rmm) + pthread rmm $) set_target_properties( ${TEST_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON @@ -40,7 +43,6 @@ function(ConfigureTestInternal TEST_NAME) PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") target_compile_options(${TEST_NAME} PUBLIC $<$:-Wall -Werror -Wno-error=deprecated-declarations>) - target_compile_options(${TEST_NAME} PUBLIC "$<$:-O0>") if(DISABLE_DEPRECATION_WARNING) target_compile_options( From 1ebfe0a4ee5f83a2ad54afcf99716944d20598dd Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Fri, 25 Oct 2024 19:18:44 +0200 Subject: [PATCH 23/27] devcontainer: replace `VAULT_HOST` with `AWS_ROLE_ARN` (#1708) This PR is replacing the `VAULT_HOST` variable with `AWS_ROLE_ARN`. This is required to use the new token service to get AWS credentials. Authors: - Jordan Jacobelli (https://github.com/jjacobelli) Approvers: - Paul Taylor (https://github.com/trxcllnt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/1708 --- .devcontainer/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9d35e3f97..5d1d53670 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -26,5 +26,5 @@ ENV PYTHONDONTWRITEBYTECODE="1" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" From 47dae24b5578894ac0efc3c06930b7a5a069d988 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 29 Oct 2024 13:39:06 -0500 Subject: [PATCH 24/27] print sccache stats in builds (#1712) Contributes to https://github.com/rapidsai/build-planning/issues/111 Proposes some small packaging/CI changes, matching similar changes being made across RAPIDS. 
* printing `sccache` stats to CI logs * updating to the latest `rapids-dependency-file-generator` (v1.16.0) * reducing verbosity of `pip wheel` in wheel builds Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/rmm/pull/1712 --- .pre-commit-config.yaml | 2 +- ci/build_cpp.sh | 4 ++++ ci/build_python.sh | 4 ++++ ci/build_wheel_cpp.sh | 7 ++++++- ci/build_wheel_python.sh | 6 +++++- 5 files changed, 20 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f114abec4..56c972b4e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -82,7 +82,7 @@ repos: - id: verify-copyright - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 + rev: v1.16.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index a601ecaae..9d14cd072 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -15,7 +15,11 @@ rapids-print-env rapids-logger "Begin cpp build" +sccache --zero-stats + # This calls mambabuild when boa is installed (as is the case in the CI images) RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild conda/recipes/librmm +sccache --show-adv-stats + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index fcd2c55e7..7a9df5fc7 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -19,7 +19,11 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +sccache --zero-stats + # This calls mambabuild when boa is installed (as is the case in the CI images) RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild -c "${CPP_CHANNEL}" conda/recipes/rmm +sccache --show-adv-stats + rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh index 2c5cc0560..12e099bdb 100755 --- a/ci/build_wheel_cpp.sh +++ b/ci/build_wheel_cpp.sh @@ -14,7 +14,12 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" cd "${package_dir}" -python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +sccache --zero-stats + +python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check + +sccache --show-adv-stats + python -m pip install wheel python -m wheel tags --platform any dist/* --remove RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp dist diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index 555974b50..b497b76d3 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -22,8 +22,12 @@ CPP_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-down # are used when created the isolated build environment echo "librmm-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${CPP_WHEELHOUSE}/librmm_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" > ./build-constraints.txt +sccache --zero-stats + PIP_CONSTRAINT="${PWD}/build-constraints.txt" \ - python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check + python -m pip wheel . 
-w dist -v --no-deps --disable-pip-version-check + +sccache --show-adv-stats mkdir -p final_dist python -m auditwheel repair -w final_dist dist/* From 8d49fffdb93b55ce70c72981d2e1d5511692eaa2 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 31 Oct 2024 19:44:38 -0400 Subject: [PATCH 25/27] Deprecate `rmm._lib` (#1713) Follows up #1676 to add deprecation warnings to the `rmm._lib` sub package. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1713 --- python/rmm/rmm/__init__.py | 8 ++++++++ python/rmm/rmm/_lib/__init__.py | 8 ++++++++ python/rmm/rmm/tests/test_rmm.py | 6 ++++++ 3 files changed, 22 insertions(+) diff --git a/python/rmm/rmm/__init__.py b/python/rmm/rmm/__init__.py index b23ad68f9..832fec095 100644 --- a/python/rmm/rmm/__init__.py +++ b/python/rmm/rmm/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings + from rmm import mr from rmm._version import __git_commit__, __version__ from rmm.mr import disable_logging, enable_logging, get_log_filenames @@ -58,6 +60,12 @@ def __getattr__(name): if name == "_lib": import importlib + warnings.warn( + "The `rmm._lib` module is deprecated in will be removed in a future release. Use `rmm.pylibrmm` instead.", + FutureWarning, + stacklevel=2, + ) + module = importlib.import_module("rmm.pylibrmm") return module else: diff --git a/python/rmm/rmm/_lib/__init__.py b/python/rmm/rmm/_lib/__init__.py index 7cfddab60..7e01bda77 100644 --- a/python/rmm/rmm/_lib/__init__.py +++ b/python/rmm/rmm/_lib/__init__.py @@ -12,4 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings + from rmm.pylibrmm import * + +warnings.warn( + "The `rmm._lib` module is deprecated in will be removed in a future release. Use `rmm.pylibrmm` instead.", + FutureWarning, + stacklevel=2, +) diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index c03b9e501..9872ba89d 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -1076,3 +1076,9 @@ def test_available_device_memory(): assert initial_memory[1] == final_memory[1] assert initial_memory[0] > 0 assert final_memory[0] > 0 + + +# TODO: Remove test when rmm._lib is removed in 25.02 +def test_deprecate_rmm_lib(): + with pytest.warns(FutureWarning): + rmm._lib.device_buffer.DeviceBuffer(size=100) From 9b76d366d2d971839d4997c437e2d20490d9d65e Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 1 Nov 2024 13:47:08 -0400 Subject: [PATCH 26/27] Correct rmm tests for validity of device pointers (#1714) The `is_host_memory` function has been updated to understand that `cudaMemoryTypeUnregistered` is returned when provided pointers allocated by `malloc` and other host side allocation functions. The `is_device_memory` function has been restricted to report only when device pointer that is usable by the calling cuda context. For that reason the tests now also set the active cuda device for all calling threads. 
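
The same CUDA behaviour can be observed from Python. The snippet below is an illustrative sketch using cuda-python's runtime bindings; it is not the C++ test code changed in this patch, and it assumes `cuda.bindings.runtime` and NumPy are available.

```python
# Illustrative sketch: a plain host allocation reports
# cudaMemoryTypeUnregistered, while an RMM device allocation reports a
# non-null devicePointer for the calling CUDA context.
import numpy as np
from cuda.bindings import runtime
import rmm

host_arr = np.zeros(16, dtype="u1")   # host memory from the system allocator
dev_buf = rmm.DeviceBuffer(size=16)   # device memory from RMM

_err, host_attrs = runtime.cudaPointerGetAttributes(host_arr.ctypes.data)
_err, dev_attrs = runtime.cudaPointerGetAttributes(dev_buf.ptr)

print(host_attrs.type)          # cudaMemoryType.cudaMemoryTypeUnregistered
print(dev_attrs.devicePointer)  # non-zero: usable by the calling context
```
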
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Rong Ou (https://github.com/rongou) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1714 --- .../mr/device/mr_ref_multithreaded_tests.cpp | 19 +++++++++++++++++-- tests/mr/device/test_utils.hpp | 7 ++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp index 7d749efd1..944ba1807 100644 --- a/tests/mr/device/mr_ref_multithreaded_tests.cpp +++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp @@ -109,8 +109,13 @@ TEST_P(mr_ref_test_mt, SetCurrentDeviceResourceRef_mt) { // single thread changes default resource, then multiple threads use it auto old = rmm::mr::set_current_device_resource_ref(this->ref); + test_get_current_device_resource_ref(); - spawn([mr = this->ref]() { + int device; + RMM_CUDA_TRY(cudaGetDevice(&device)); + + spawn([device, mr = this->ref]() { + RMM_CUDA_TRY(cudaSetDevice(device)); EXPECT_EQ(mr, rmm::mr::get_current_device_resource_ref()); test_get_current_device_resource_ref(); // test allocating with the new default resource }); @@ -156,7 +161,17 @@ TEST_P(mr_ref_test_mt, SetCurrentDeviceResourceRefPerThread_mt) } } -TEST_P(mr_ref_test_mt, Allocate) { spawn(test_various_allocations, this->ref); } +TEST_P(mr_ref_test_mt, Allocate) +{ + int device; + RMM_CUDA_TRY(cudaGetDevice(&device)); + + auto mr = this->ref; + spawn([device, mr]() { + RMM_CUDA_TRY(cudaSetDevice(device)); + test_various_allocations(mr); + }); +} TEST_P(mr_ref_test_mt, AllocateDefaultStream) { diff --git a/tests/mr/device/test_utils.hpp b/tests/mr/device/test_utils.hpp index 2b9513793..5b7ef197b 100644 --- a/tests/mr/device/test_utils.hpp +++ b/tests/mr/device/test_utils.hpp @@ -31,17 +31,14 @@ inline bool is_device_accessible_memory(void* ptr) { cudaPointerAttributes attributes{}; if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } - return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged) or - ((attributes.type == cudaMemoryTypeHost) and (attributes.devicePointer != nullptr)) or - ((attributes.type == cudaMemoryTypeUnregistered) and - (rmm::mr::detail::is_system_memory_supported(rmm::get_current_cuda_device()))); + return attributes.devicePointer != nullptr; } inline bool is_host_memory(void* ptr) { cudaPointerAttributes attributes{}; if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } - return attributes.type == cudaMemoryTypeHost; + return attributes.hostPointer != nullptr || attributes.type == cudaMemoryTypeUnregistered; } inline bool is_properly_aligned(void* ptr) From dbae8c08b0bed1d14ff1b5fe1bc5332b0c175cf8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 2 Nov 2024 01:58:15 +0800 Subject: [PATCH 27/27] [fea] Expose the arena mr to the Python interface. (#1711) Close https://github.com/rapidsai/rmm/issues/830 . - Add the arena allocator to the public Python interface. - Small changes to the logger initialization to avoid exposing spdlog in the shared objects. 
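
A minimal usage sketch of the newly exposed class (the `"512MiB"` arena size and the plain CUDA upstream are illustrative choices, not defaults):

```python
# Hedged sketch: route allocations through the new ArenaMemoryResource.
import rmm

upstream = rmm.mr.CudaMemoryResource()

# arena_size accepts an int or a human-readable string (parsed with the same
# helper used for pool sizes); omit it to default to half of the available
# memory on the current device.
mr = rmm.mr.ArenaMemoryResource(
    upstream,
    arena_size="512MiB",
    dump_log_on_failure=False,
)
rmm.mr.set_current_device_resource(mr)

buf = rmm.DeviceBuffer(size=1 << 20)  # 1 MiB allocation served by the arena
```
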
Authors: - Jiaming Yuan (https://github.com/trivialfis) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1711 --- .../rmm/mr/device/arena_memory_resource.hpp | 10 ++++- include/rmm/mr/device/detail/arena.hpp | 2 +- python/rmm/rmm/_lib/memory_resource.pxd | 1 + python/rmm/rmm/_lib/memory_resource.py | 1 + python/rmm/rmm/librmm/memory_resource.pxd | 9 ++++ python/rmm/rmm/mr.py | 2 + python/rmm/rmm/pylibrmm/memory_resource.pxd | 3 ++ python/rmm/rmm/pylibrmm/memory_resource.pyx | 43 +++++++++++++++++++ python/rmm/rmm/tests/test_rmm.py | 22 ++++++++++ 9 files changed, 90 insertions(+), 3 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 417b7d2b4..9b380ffb9 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -97,7 +97,10 @@ class arena_memory_resource final : public device_memory_resource { : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { - logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); + logger_ = + std::make_shared("arena_memory_dump", + std::make_shared( + "rmm_arena_memory_dump.log", true /*truncate file*/)); // Set the level to `debug` for more detailed output. logger_->set_level(spdlog::level::info); } @@ -120,7 +123,10 @@ class arena_memory_resource final : public device_memory_resource { dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { - logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); + logger_ = + std::make_shared("arena_memory_dump", + std::make_shared( + "rmm_arena_memory_dump.log", true /*truncate file*/)); // Set the level to `debug` for more detailed output. logger_->set_level(spdlog::level::info); } diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 6f8303c83..da64ca85b 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -647,7 +647,7 @@ class global_arena final { * * @param logger the spdlog logger to use */ - void dump_memory_log(std::shared_ptr const& logger) const + RMM_HIDDEN void dump_memory_log(std::shared_ptr const& logger) const { std::lock_guard lock(mtx_); diff --git a/python/rmm/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd index 983063914..0d11001a4 100644 --- a/python/rmm/rmm/_lib/memory_resource.pxd +++ b/python/rmm/rmm/_lib/memory_resource.pxd @@ -40,6 +40,7 @@ from rmm.librmm.memory_resource cimport ( translate_python_except_to_cpp, ) from rmm.pylibrmm.memory_resource cimport ( + ArenaMemoryResource, BinningMemoryResource, CallbackMemoryResource, CudaAsyncMemoryResource, diff --git a/python/rmm/rmm/_lib/memory_resource.py b/python/rmm/rmm/_lib/memory_resource.py index 0d47e8c9b..f3a24f635 100644 --- a/python/rmm/rmm/_lib/memory_resource.py +++ b/python/rmm/rmm/_lib/memory_resource.py @@ -13,6 +13,7 @@ # limitations under the License. 
from rmm.pylibrmm.memory_resource import ( # noqa: F401 + ArenaMemoryResource, BinningMemoryResource, CallbackMemoryResource, CudaAsyncMemoryResource, diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd index 9ddaf04b9..9e7b70c4f 100644 --- a/python/rmm/rmm/librmm/memory_resource.pxd +++ b/python/rmm/rmm/librmm/memory_resource.pxd @@ -130,6 +130,15 @@ cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \ optional[size_t] maximum_pool_size) except + size_t pool_size() +cdef extern from "rmm/mr/device/arena_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass arena_memory_resource[Upstream](device_memory_resource): + arena_memory_resource( + Upstream* upstream_mr, + optional[size_t] arena_size, + bool dump_log_on_failure + ) except + + cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \ namespace "rmm::mr" nogil: cdef cppclass fixed_size_memory_resource[Upstream](device_memory_resource): diff --git a/python/rmm/rmm/mr.py b/python/rmm/rmm/mr.py index 3f0c3fce3..82729271f 100644 --- a/python/rmm/rmm/mr.py +++ b/python/rmm/rmm/mr.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from rmm.pylibrmm.memory_resource import ( + ArenaMemoryResource, BinningMemoryResource, CallbackMemoryResource, CudaAsyncMemoryResource, @@ -45,6 +46,7 @@ ) __all__ = [ + "ArenaMemoryResource", "BinningMemoryResource", "CallbackMemoryResource", "CudaAsyncMemoryResource", diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pxd b/python/rmm/rmm/pylibrmm/memory_resource.pxd index 985d5d31b..d1e5610db 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource.pxd +++ b/python/rmm/rmm/pylibrmm/memory_resource.pxd @@ -26,6 +26,9 @@ cdef class UpstreamResourceAdaptor(DeviceMemoryResource): cpdef DeviceMemoryResource get_upstream(self) +cdef class ArenaMemoryResource(UpstreamResourceAdaptor): + pass + cdef class CudaMemoryResource(DeviceMemoryResource): pass diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx index 021125567..b41890fca 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource.pyx +++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx @@ -49,6 +49,7 @@ from rmm.librmm.memory_resource cimport ( CppExcept, allocate_callback_t, allocation_handle_type, + arena_memory_resource, available_device_memory as c_available_device_memory, binning_memory_resource, callback_memory_resource, @@ -310,6 +311,48 @@ cdef class PoolMemoryResource(UpstreamResourceAdaptor): ) return c_mr.pool_size() +cdef class ArenaMemoryResource(UpstreamResourceAdaptor): + def __cinit__( + self, DeviceMemoryResource upstream_mr, + arena_size=None, + dump_log_on_failure=False + ): + cdef optional[size_t] c_arena_size = ( + optional[size_t]() if + arena_size is None + else optional[size_t]( parse_bytes(arena_size)) + ) + self.c_obj.reset( + new arena_memory_resource[device_memory_resource]( + upstream_mr.get_mr(), + c_arena_size, + dump_log_on_failure, + ) + ) + + def __init__( + self, + DeviceMemoryResource upstream_mr, + object arena_size=None, + bool dump_log_on_failure=False + ): + """ + A suballocator that emphasizes fragmentation avoidance and scalable concurrency + support. + + Parameters + ---------- + upstream_mr : DeviceMemoryResource + The DeviceMemoryResource from which to allocate memory for arenas. + arena_size : int, optional + Size in bytes of the global arena. Defaults to half of the available memory + on the current device. 
+ dump_log_on_failure : bool, optional + Whether to dump the arena on allocation failure. + """ + pass + + cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor): def __cinit__( self, diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index 9872ba89d..b52ea0179 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -505,6 +505,28 @@ def test_binning_memory_resource(dtype, nelem, alloc, upstream_mr): array_tester(dtype, nelem, alloc) +@pytest.mark.parametrize("dtype", _dtypes) +@pytest.mark.parametrize("nelem", _nelems) +@pytest.mark.parametrize("alloc", _allocs) +@pytest.mark.parametrize( + "upstream_mr", + [ + lambda: rmm.mr.CudaMemoryResource(), + lambda: rmm.mr.ManagedMemoryResource(), + lambda: rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), 1 << 20 + ), + ], +) +def test_arena_memory_resource(dtype, nelem, alloc, upstream_mr): + upstream = upstream_mr() + mr = rmm.mr.ArenaMemoryResource(upstream) + + rmm.mr.set_current_device_resource(mr) + assert rmm.mr.get_current_device_resource_type() is type(mr) + array_tester(dtype, nelem, alloc) + + def test_reinitialize_max_pool_size(): rmm.reinitialize( pool_allocator=True, initial_pool_size=0, maximum_pool_size="8MiB"