From 42a59a6ea83c03225dcb45879b8b920d9b92a023 Mon Sep 17 00:00:00 2001
From: Robert Remen <robik75@gmail.com>
Date: Sun, 8 Sep 2024 17:12:30 +0200
Subject: [PATCH] feat: add distribute function (#14)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What ❔

This PR adds the distribute function.

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Documentation comments have been added / updated.
---
 src/bellman-cuda.cu |  5 +++++
 src/bellman-cuda.h  |  8 ++++++++
 src/pn_kernels.cu   | 17 +++++++++++++++++
 src/pn_kernels.cuh  |  2 ++
 4 files changed, 32 insertions(+)
diff --git a/src/bellman-cuda.cu b/src/bellman-cuda.cu
index c4916fd..65e823e 100644
--- a/src/bellman-cuda.cu
+++ b/src/bellman-cuda.cu
@@ -280,6 +280,11 @@ bc_error pn_set_values_from_packed_bits(void *values, const void *packet_bits, c
                                                                static_cast<cudaStream_t>(stream.handle)));
 }
 
+bc_error pn_distribute_values(const void *src, void *dst, const unsigned count, const unsigned stride, bc_stream stream) {
+  return static_cast<bc_error>(pn::distribute_values(static_cast<const fd_q::storage *>(src), static_cast<fd_q::storage *>(dst), count, stride,
+                                                               static_cast<cudaStream_t>(stream.handle)));
+}
+
 bc_error pn_tear_down() { return static_cast<bc_error>(pn::tear_down()); };
 
 bc_error msm_set_up() { return static_cast<bc_error>(msm::set_up()); }
diff --git a/src/bellman-cuda.h b/src/bellman-cuda.h
index e618a37..4528287 100644
--- a/src/bellman-cuda.h
+++ b/src/bellman-cuda.h
@@ -458,6 +458,14 @@ bc_error pn_generate_permutation_polynomials(generate_permutation_polynomials_co
 // stream - Stream on which this operation will be scheduled
 bc_error pn_set_values_from_packed_bits(void *values, const void *packet_bits, unsigned count, bc_stream stream);
 
+// Distribute field element values with a stride
+// src - device pointer to the vector of field elements from where the values will be read
+// dst - device pointer to the vector of field elements to where the results will be written
+// count - number of values to distribute
+// stride - stride with which the values will be distributed
+// stream - Stream on which this operation will be scheduled
+bc_error pn_distribute_values(const void *src, void *dst, unsigned count, unsigned stride, bc_stream stream);
+
 // release all resources associated with the internal state for polynomial computations
 bc_error pn_tear_down();
 
diff --git a/src/pn_kernels.cu b/src/pn_kernels.cu
index 9b9b205..77d2fb4 100644
--- a/src/pn_kernels.cu
+++ b/src/pn_kernels.cu
@@ -141,4 +141,21 @@ cudaError_t set_values_from_packed_bits(fd_q::storage *values, const unsigned *p
   return cudaGetLastError();
 }
 
+__global__ void distribute_values_kernel(const fd_q::storage *src, fd_q::storage *dst, const unsigned count, const unsigned stride) {
+  typedef fd_q::storage storage;
+  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid >= count)
+    return;
+  const auto value = memory::load<storage, memory::ld_modifier::cs>(src + gid);
+  memory::store<storage, memory::st_modifier::cs>(dst + gid * stride, value);
+}
+
+cudaError_t distribute_values(const fd_q::storage *src, fd_q::storage *dst, const unsigned count, const unsigned stride, cudaStream_t stream) {
+  const unsigned threads_per_block = 128;
+  const dim3 block_dim = count < threads_per_block ? count : threads_per_block;
+  const dim3 grid_dim = (count - 1) / block_dim.x + 1;
+  distribute_values_kernel<<<grid_dim, block_dim, 0, stream>>>(src, dst, count, stride);
+  return cudaGetLastError();
+}
+
 } // namespace pn
\ No newline at end of file
diff --git a/src/pn_kernels.cuh b/src/pn_kernels.cuh
index 5b5cff0..91fe081 100644
--- a/src/pn_kernels.cuh
+++ b/src/pn_kernels.cuh
@@ -18,4 +18,6 @@ cudaError_t generate_permutation_matrix(fd_q::storage *values, const fd_q::stora
 
 cudaError_t set_values_from_packed_bits(fd_q::storage *values, const unsigned *packet_bits, unsigned count, cudaStream_t stream);
 
+cudaError_t distribute_values(const fd_q::storage *src, fd_q::storage *dst, unsigned count, unsigned stride, cudaStream_t stream);
+
 } // namespace pn
\ No newline at end of file