NVIDIA · pciolkosz · Nov 11, 2024 · Oct 25, 2024 · Oct 25, 2024 · Oct 28, 2024
diff --git a/examples/cudax/CMakeLists.txt b/examples/cudax/CMakeLists.txt
@@ -66,7 +66,11 @@ endif()
 add_executable(vector_add vector_add/vector_add.cu)
 target_link_libraries(vector_add PUBLIC cudax_samples_interface)
 
+add_executable(simple_p2p simple_p2p/simple_p2p.cu)
+target_link_libraries(simple_p2p PUBLIC cudax_samples_interface)
+
 # This is only relevant for internal testing and not needed by end users.
 include(CTest)
 enable_testing()
 add_test(NAME vector_add COMMAND vector_add)
+add_test(NAME simple_p2p COMMAND simple_p2p)
diff --git a/examples/cudax/simple_p2p/simple_p2p.cu b/examples/cudax/simple_p2p/simple_p2p.cu
@@ -0,0 +1,253 @@
+/* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This sample demonstrates a combination of Peer-to-Peer (P2P) and
+ * Unified Virtual Address Space (UVA) features.
+ */
+
+#include <cuda/memory_resource>
+
+#include <cuda/experimental/algorithm.cuh>
+#include <cuda/experimental/buffer.cuh>
+#include <cuda/experimental/device.cuh>
+#include <cuda/experimental/launch.cuh>
+#include <cuda/experimental/memory_resource.cuh>
+
+#include <algorithm>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+namespace cudax = cuda::experimental;
+
+struct simple_kernel
+{
+  template <typename Dimensions>
+  __device__ void operator()(Dimensions dims, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)
+  {
+    // Just a dummy kernel, doing enough for us to verify that everything worked
+    const auto idx = dims.rank(cudax::thread);
+    dst[idx]       = src[idx] * 2.0f;
+  }
+};
+
+std::vector<cudax::device_ref> find_peers_group()
+{
+  // Check possibility for peer access
+  printf("\nChecking GPU(s) for support of peer to peer memory access...\n");
+
+  std::vector<cudax::device_ref> peers;
+  for (auto& dev_i : cudax::devices)
+  {
+    for (auto& dev_j : cudax::devices)
+    {
+      if (dev_i != dev_j)
+      {
+        bool can_access_peer = dev_i.is_peer_accessible_from(dev_j);
+        // Save all peers of a first device found with a peer
+        if (can_access_peer && peers.size() == 0)
+        {
+          peers = dev_i.get_peers();
+          peers.insert(peers.begin(), dev_i);
+        }
+        printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
+               dev_i.get_name().c_str(),
+               dev_i.get(),
+               dev_j.get_name().c_str(),
+               dev_j.get(),
+               can_access_peer ? "Yes" : "No");
+      }
+    }
+  }
+
+  return peers;
+}
+
+template <typename BufferType>
+void benchmark_cross_device_ping_pong_copy(
+  cudax::stream_ref dev0_stream, cudax::stream_ref dev1_stream, BufferType& dev0_buffer, BufferType& dev1_buffer)
+{
+  // Use dev1 stream due to some surprising performance issue
+  constexpr int cpy_count = 100;
+  auto start_event        = dev1_stream.record_timed_event();
+  for (int i = 0; i < cpy_count; i++)
+  {
+    // Ping-pong copy between GPUs
+    if (i % 2 == 0)
+    {
+      cudax::copy_bytes(dev1_stream, dev0_buffer, dev1_buffer);
+    }
+    else
+    {
+      cudax::copy_bytes(dev1_stream, dev1_buffer, dev0_buffer);
+    }
+  }
+
+  auto end_event = dev1_stream.record_timed_event();
+  dev1_stream.wait();
+  cuda::std::chrono::duration<double> duration(end_event - start_event);
+  printf("Peer copy between GPU%d and GPU%d: %.2fGB/s\n",
+         dev0_stream.device().get(),
+         dev1_stream.device().get(),
+         (static_cast<float>(cpy_count * dev0_buffer.size_bytes()) / (1024 * 1024 * 1024) / duration.count()));
+}
+
+template <typename BufferType>
+void test_cross_device_access_from_kernel(
+  cudax::stream_ref dev0_stream, cudax::stream_ref dev1_stream, BufferType& dev0_buffer, BufferType& dev1_buffer)
+{
+  cudax::device_ref dev0 = dev0_stream.device().get();
+  cudax::device_ref dev1 = dev1_stream.device().get();
+
+  // Prepare host buffer and copy to GPU 0
+  printf("Preparing host buffer and copy to GPU%d...\n", dev0.get());
+
+  // This will be a pinned memory vector once available
+  cudax::uninitialized_buffer<float, cuda::mr::host_accessible> host_buffer(
+    cuda::mr::pinned_memory_resource(), dev0_buffer.size());
+  std::generate(host_buffer.begin(), host_buffer.end(), []() {
+    static int i = 0;
+    return (i++) % 4096;
+  });
+
+  cudax::copy_bytes(dev0_stream, host_buffer, dev0_buffer);
+  dev1_stream.wait(dev0_stream);
+
+  // Kernel launch configuration
+  auto dims = cudax::distribute<512>(dev0_buffer.size());
+
+  // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing output to the GPU 1 buffer
+  printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
+         "GPU%d...\n",
+         dev1.get(),
+         dev0.get(),
+         dev1.get());
+  cudax::launch(dev1_stream, dims, simple_kernel{}, dev0_buffer, dev1_buffer);
+  dev0_stream.wait(dev1_stream);
+
+  // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing output to the GPU 0 buffer
+  printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
+         "GPU%d...\n",
+         dev0.get(),
+         dev1.get(),
+         dev0.get());
+  cudax::launch(dev0_stream, dims, simple_kernel{}, dev1_buffer, dev0_buffer);
+
+  // Copy data back to host and verify
+  printf("Copy data back to host from GPU%d and verify results...\n", dev0.get());
+  cudax::copy_bytes(dev0_stream, dev0_buffer, host_buffer);
+  dev0_stream.wait();
+
+  int error_count = 0;
+  for (int i = 0; i < host_buffer.size(); i++)
+  {
+    cuda::std::span host_span(host_buffer);
+    // Re-generate input data and apply 2x '* 2.0f' computation of both kernel runs
+    float expected = float(i % 4096) * 2.0f * 2.0f;
+    if (host_span[i] != expected)
+    {
+      printf("Verification error @ element %i: val = %f, ref = %f\n", i, host_span[i], expected);
+
+      if (error_count++ > 10)
+      {
+        break;
+      }
+    }
+  }
+  if (error_count != 0)
+  {
+    printf("Test failed!\n");
+    exit(EXIT_FAILURE);
+  }
+}
+
+int main(int argc, char** argv)
+try
+{
+  const int test_waived = 2;
+  printf("[%s] - Starting...\n", argv[0]);
+
+  // Number of GPUs
+  printf("Checking for multiple GPUs...\n");
+  printf("CUDA-capable device count: %lu\n", cudax::devices.size());
+
+  if (cudax::devices.size() < 2)
+  {
+    printf("Two or more GPUs with Peer-to-Peer access capability are required for %s.\n", argv[0]);
+    printf("Waiving test.\n");
+    exit(test_waived);
+  }
+
+  auto peers = find_peers_group();
+
+  if (peers.size() == 0)
+  {
+    printf("Two or more GPUs with Peer-to-Peer access capability are required, waving the test.\n");
+    exit(test_waived);
+  }
+
+  cudax::stream dev0_stream(peers[0]);
+  cudax::stream dev1_stream(peers[1]);
+
+  printf("Enabling peer access between GPU%d and GPU%d...\n", peers[0].get(), peers[1].get());
+  cudax::mr::async_memory_resource dev0_resource(peers[0]);
+  dev0_resource.enable_peer_access(peers[1]);
+  cudax::mr::async_memory_resource dev1_resource(peers[1]);
+  dev1_resource.enable_peer_access(peers[0]);
+
+  // Allocate buffers
+  constexpr size_t buf_cnt = 1024 * 1024 * 16;
+  printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n",
+         int(buf_cnt / 1024 / 1024 * sizeof(float)),
+         peers[0].get(),
+         peers[1].get());
+
+  cudax::uninitialized_buffer<float, cuda::mr::device_accessible> dev0_buffer(dev0_resource, buf_cnt);
+  cudax::uninitialized_buffer<float, cuda::mr::device_accessible> dev1_buffer(dev1_resource, buf_cnt);
+
+  benchmark_cross_device_ping_pong_copy(dev0_stream, dev1_stream, dev0_buffer, dev1_buffer);
+
+  test_cross_device_access_from_kernel(dev0_stream, dev1_stream, dev0_buffer, dev1_buffer);
+
+  // Disable peer access
+  printf("Disabling peer access...\n");
+  dev0_resource.disable_peer_access(peers[1]);
+  dev1_resource.disable_peer_access(peers[0]);
+
+  // No cleanup needed
+  printf("Test passed\n");
+  return 0;
+}
+catch (const std::exception& e)
+{
+  printf("caught an exception: \"%s\"\n", e.what());
+}
+catch (...)
+{
+  printf("caught an unknown exception\n");
+}