Prefetch resource adaptor #1608

Merged
merged 6 commits on Jul 19, 2024
143 changes: 143 additions & 0 deletions include/rmm/mr/device/prefetch_resource_adaptor.hpp
@@ -0,0 +1,143 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/prefetch.hpp>
#include <rmm/resource_ref.hpp>

#include <cstddef>
#include <mutex>
#include <shared_mutex>
#include <stack>

namespace rmm::mr {
/**
* @addtogroup device_resource_adaptors
* @{
* @file
*/
/**
* @brief Resource that prefetches all memory allocations.
*
* @tparam Upstream Type of the upstream resource used for
* allocation/deallocation.
*/
template <typename Upstream>
class prefetch_resource_adaptor final : public device_memory_resource {
public:
/**
* @brief Construct a new prefetch resource adaptor using `upstream` to satisfy
* allocation requests.
*
* @throws rmm::logic_error if `upstream == nullptr`
*
* @param upstream The resource used for allocating/deallocating device memory
*/
prefetch_resource_adaptor(Upstream* upstream) : upstream_{upstream}
{
RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
}

prefetch_resource_adaptor() = delete;
~prefetch_resource_adaptor() override = default;
prefetch_resource_adaptor(prefetch_resource_adaptor const&) = delete;
prefetch_resource_adaptor& operator=(prefetch_resource_adaptor const&) = delete;
prefetch_resource_adaptor(prefetch_resource_adaptor&&) noexcept =
default; ///< @default_move_constructor
prefetch_resource_adaptor& operator=(prefetch_resource_adaptor&&) noexcept =
default; ///< @default_move_assignment{prefetch_resource_adaptor}

/**
* @briefreturn{rmm::device_async_resource_ref to the upstream resource}
*/
[[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept
{
return upstream_;
}

/**
* @briefreturn{Upstream* to the upstream memory resource}
*/
[[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; }

private:
/**
* @brief Allocates memory of size at least `bytes` using the upstream resource.
*
* @note The allocation is always prefetched to the current device.
*
* @throws rmm::bad_alloc if the requested allocation could not be fulfilled
* by the upstream resource.
*
* @param bytes The size, in bytes, of the allocation
* @param stream Stream on which to perform the allocation
* @return void* Pointer to the newly allocated memory
*/
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
void* ptr = upstream_->allocate(bytes, stream);
rmm::prefetch(ptr, bytes, rmm::get_current_cuda_device(), stream);
return ptr;
}

/**
* @brief Free allocation of size `bytes` pointed to by `ptr`
*
* @param ptr Pointer to be deallocated
* @param bytes Size of the allocation
* @param stream Stream on which to perform the deallocation
*/
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
{
upstream_->deallocate(ptr, bytes, stream);
}

/**
* @brief Compare the upstream resource to another.
*
* @param other The other resource to compare to
* @return true If the two resources are equivalent
* @return false If the two resources are not equal
*/
bool do_is_equal(device_memory_resource const& other) const noexcept override
{
if (this == &other) { return true; }
auto cast = dynamic_cast<prefetch_resource_adaptor<Upstream> const*>(&other);
if (cast == nullptr) { return upstream_->is_equal(other); }
return get_upstream_resource() == cast->get_upstream_resource();
}

Upstream* upstream_; // the upstream resource used for satisfying allocation requests
};

/**
* @brief Convenience factory to return a `prefetch_resource_adaptor` around the
* upstream resource `upstream`.
*
* @tparam Upstream Type of the upstream `device_memory_resource`.
* @param upstream Pointer to the upstream resource
* @return The new prefetch resource adaptor
*/
template <typename Upstream>
prefetch_resource_adaptor<Upstream> make_prefetch_adaptor(Upstream* upstream)
Contributor:

question: can't this just be a constructor? Why do we need a factory function?

Contributor Author:

Good question — I followed prior art from the statistics adaptor.

Contributor:

My guess is that all of the make_* functions in rmm predate C++17 and were workarounds for not having CTAD. We can probably remove them now (not necessary in this PR but we can remove this one).
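To make the CTAD point concrete, a minimal sketch (C++17; the variable names are illustrative only):

rmm::mr::cuda_memory_resource upstream;
// CTAD deduces prefetch_resource_adaptor<rmm::mr::cuda_memory_resource> from the pointer argument
rmm::mr::prefetch_resource_adaptor adaptor{&upstream};
// ...which does the same job as the factory added in this PR:
auto adaptor2 = rmm::mr::make_prefetch_adaptor(&upstream);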

Member:

yep. Also, we should have avoided creating another adaptor with an Upstream template parameter, since that is something we are trying to eliminate from the current adaptors as part of our refactoring. Unfortunately, I was not around to tell you this.

(The Upstream parameter is not necessary, it can be type erased, and will be with resource refs. See #1457)
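For illustration only, a rough sketch of the type-erased shape being described (hypothetical; the actual refactor is tracked in #1457, and the allocate/deallocate/prefetch plumbing is elided):

class prefetch_resource_adaptor final : public device_memory_resource {
 public:
  prefetch_resource_adaptor(rmm::device_async_resource_ref upstream) : upstream_{upstream} {}
  // do_allocate / do_deallocate would forward through upstream_ (and prefetch),
  // with no Upstream template parameter anywhere in the interface
 private:
  rmm::device_async_resource_ref upstream_;  // type-erased upstream resource
};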

Contributor (@vyasr, Jul 22, 2024):

Ugh sorry, I should have caught this. I'm not yet used to the new paradigms with resource refs.

Contributor Author:

I tried to refactor the prefetching adaptor to use resource refs but got stuck at the point where I didn't know how to implement do_is_equal, since refs don't have an is_equal method (afaict). I didn't see any other adaptors that have been refactored yet.

{
return prefetch_resource_adaptor<Upstream>{upstream};
}

/** @} */ // end of group
} // namespace rmm::mr
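For reference, a minimal usage sketch of the new adaptor (it mirrors the tests added below and assumes a CUDA-capable device plus <rmm/device_buffer.hpp> and <rmm/mr/device/managed_memory_resource.hpp>):

rmm::mr::managed_memory_resource upstream;
rmm::mr::prefetch_resource_adaptor<rmm::mr::managed_memory_resource> mr{&upstream};
// every allocation made through `mr` is also prefetched to the current device
rmm::device_buffer buf(1024, rmm::cuda_stream_default, &mr);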
3 changes: 3 additions & 0 deletions python/rmm/rmm/_lib/memory_resource.pxd
@@ -91,4 +91,7 @@ cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor):
cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor):
cdef object _callback

cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor):
pass

cpdef DeviceMemoryResource get_current_device_resource()
31 changes: 31 additions & 0 deletions python/rmm/rmm/_lib/memory_resource.pyx
@@ -219,6 +219,11 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \
void* callback_arg
) except +

cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \
namespace "rmm::mr" nogil:
cdef cppclass prefetch_resource_adaptor[Upstream](device_memory_resource):
prefetch_resource_adaptor(Upstream* upstream_mr) except +


cdef class DeviceMemoryResource:

@@ -987,6 +992,32 @@ cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor):
"""
pass

cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor):

def __cinit__(
self,
DeviceMemoryResource upstream_mr
):
self.c_obj.reset(
new prefetch_resource_adaptor[device_memory_resource](
upstream_mr.get_mr()
)
)

def __init__(
self,
DeviceMemoryResource upstream_mr
):
"""
Memory resource that prefetches all allocations.

Parameters
----------
upstream_mr : DeviceMemoryResource
The upstream memory resource.
"""
pass


# Global per-device memory resources; dict of int:DeviceMemoryResource
cdef _per_device_mrs = defaultdict(CudaMemoryResource)
2 changes: 2 additions & 0 deletions python/rmm/rmm/mr.py
@@ -23,6 +23,7 @@
LoggingResourceAdaptor,
ManagedMemoryResource,
PoolMemoryResource,
PrefetchResourceAdaptor,
StatisticsResourceAdaptor,
TrackingResourceAdaptor,
UpstreamResourceAdaptor,
@@ -52,6 +53,7 @@
"LoggingResourceAdaptor",
"ManagedMemoryResource",
"PoolMemoryResource",
"PrefetchResourceAdaptor",
"StatisticsResourceAdaptor",
"TrackingResourceAdaptor",
"FailureCallbackResourceAdaptor",
24 changes: 24 additions & 0 deletions python/rmm/rmm/tests/test_rmm.py
@@ -733,6 +733,30 @@ def callback(nbytes: int) -> bool:
assert retried[0]


@pytest.mark.parametrize("managed", [True, False])
def test_prefetch_resource_adaptor(managed):
if managed:
upstream_mr = rmm.mr.ManagedMemoryResource()
else:
upstream_mr = rmm.mr.CudaMemoryResource()
mr = rmm.mr.PrefetchResourceAdaptor(upstream_mr)
rmm.mr.set_current_device_resource(mr)

# This allocation should be prefetched
db = rmm.DeviceBuffer.to_device(np.zeros(256, dtype="u1"))

err, device = cudart.cudaGetDevice()
assert err == cudart.cudaError_t.cudaSuccess

if managed:
assert_prefetched(db, device)
db.prefetch() # just test that it doesn't throw
if managed:
err, device = cudart.cudaGetDevice()
assert err == cudart.cudaError_t.cudaSuccess
assert_prefetched(db, device)


def test_failure_callback_resource_adaptor_error():
def callback(nbytes: int) -> bool:
raise RuntimeError("MyError")
3 changes: 3 additions & 0 deletions tests/CMakeLists.txt
@@ -148,6 +148,9 @@ ConfigureTest(TRACKING_TEST mr/device/tracking_mr_tests.cpp GPUS 1 PERCENT 100)
# out-of-memory callback adaptor tests
ConfigureTest(FAILURE_CALLBACK_TEST mr/device/failure_callback_mr_tests.cpp)

# prefetch adaptor tests
ConfigureTest(PREFETCH_ADAPTOR_TEST mr/device/prefetch_resource_adaptor_tests.cpp)

# aligned adaptor tests
ConfigureTest(ALIGNED_TEST mr/device/aligned_mr_tests.cpp)

101 changes: 101 additions & 0 deletions tests/mr/device/prefetch_resource_adaptor_tests.cpp
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "../../byte_literals.hpp"

#include <rmm/cuda_stream.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/mr/device/prefetch_resource_adaptor.hpp>

#include <gtest/gtest.h>

#include <cstddef>
#include <random>

using prefetch_adaptor = rmm::mr::prefetch_resource_adaptor<rmm::mr::device_memory_resource>;

template <typename MemoryResourceType>
struct PrefetchAdaptorTest : public ::testing::Test {
rmm::cuda_stream stream{};
std::size_t size{};
MemoryResourceType mr{};

PrefetchAdaptorTest()
{
std::default_random_engine generator;

auto constexpr range_min{1000};
auto constexpr range_max{100000};
std::uniform_int_distribution<std::size_t> distribution(range_min, range_max);
size = distribution(generator);
}

// Test that the memory range was last prefetched to the specified device
void expect_prefetched(void const* ptr, std::size_t size, rmm::cuda_device_id device)
{
if constexpr (std::is_same_v<MemoryResourceType, rmm::mr::managed_memory_resource>) {
int prefetch_location{0};
// See the CUDA documentation for cudaMemRangeGetAttribute
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g8048f6ea5ad77917444567656c140c5a
// specifically for when cudaMemRangeAttribute::cudaMemRangeAttributeLastPrefetchLocation is
// used.
constexpr size_t prefetch_data_size = 4;
RMM_CUDA_TRY(
cudaMemRangeGetAttribute(&prefetch_location,
prefetch_data_size,
cudaMemRangeAttribute::cudaMemRangeAttributeLastPrefetchLocation,
ptr,
size));
EXPECT_EQ(prefetch_location, device.value());
}
}
};

using resources = ::testing::Types<rmm::mr::cuda_memory_resource, rmm::mr::managed_memory_resource>;

TYPED_TEST_CASE(PrefetchAdaptorTest, resources);

// The following tests simply test compilation and that there are no exceptions thrown
// due to prefetching non-managed memory.

TYPED_TEST(PrefetchAdaptorTest, PointerAndSize)
{
auto* orig_device_resource = &this->mr;
prefetch_adaptor prefetch_mr{orig_device_resource};
rmm::device_buffer buff(this->size, this->stream, &prefetch_mr);
// verify data range has been prefetched
this->expect_prefetched(buff.data(), buff.size(), rmm::get_current_cuda_device());
// verify that prefetching does not error
rmm::prefetch(buff.data(), buff.size(), rmm::get_current_cuda_device(), this->stream);
// reverify data range has been prefetched
this->expect_prefetched(buff.data(), buff.size(), rmm::get_current_cuda_device());
}

TYPED_TEST(PrefetchAdaptorTest, NotPrefetchedWithoutAdaptor)
{
// verify not prefetched without adaptor
rmm::device_buffer buff(this->size, this->stream, &this->mr);
this->expect_prefetched(buff.data(), buff.size(), rmm::cuda_device_id(cudaInvalidDeviceId));
}

TEST(PrefetchAdaptorTestNullUpstream, ThrowOnNullUpstream)
{
auto construct_nullptr = []() { prefetch_adaptor mr{nullptr}; };
EXPECT_THROW(construct_nullptr(), rmm::logic_error);
}
7 changes: 6 additions & 1 deletion tests/prefetch_tests.cpp
@@ -49,11 +49,16 @@ struct PrefetchTest : public ::testing::Test {
// Test that the memory range was last prefetched to the specified device
void expect_prefetched(void const* ptr, std::size_t size, rmm::cuda_device_id device)
{
// See the CUDA documentation for cudaMemRangeGetAttribute
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g8048f6ea5ad77917444567656c140c5a
// specifically for when cudaMemRangeAttribute::cudaMemRangeAttributeLastPrefetchLocation is
// used.
constexpr size_t prefetch_data_size = 4;
if constexpr (std::is_same_v<MemoryResourceType, rmm::mr::managed_memory_resource>) {
int prefetch_location{0};
RMM_CUDA_TRY(
cudaMemRangeGetAttribute(&prefetch_location,
4,
prefetch_data_size,
cudaMemRangeAttribute::cudaMemRangeAttributeLastPrefetchLocation,
ptr,
size));