Skip to content

Commit

Permalink
refactor: change calls to deprecated CUB functions and fix compiler w…
Browse files Browse the repository at this point in the history
…arnings about return values (#15)

# What ❔

This PR changes calls to deprecated CUB functions and fix compiler
warnings about return values.
  • Loading branch information
robik75 authored Sep 25, 2024
1 parent 42a59a6 commit 5ce3c31
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 34 deletions.
37 changes: 16 additions & 21 deletions src/bellman-cuda-cub.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,32 +7,28 @@ namespace common {
using namespace cub;

cudaError_t sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, int num_items, int begin_bit,
int end_bit, cudaStream_t stream, bool debug_synchronous) {
return DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream, debug_synchronous);
int end_bit, cudaStream_t stream) {
return DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream);
}

cudaError_t sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, const unsigned *d_values_in,
unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) {
return DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream,
debug_synchronous);
unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream) {
return DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream);
}

cudaError_t sort_pairs_descending(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out,
const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream,
bool debug_synchronous) {
const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream) {
return DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit,
end_bit, stream, debug_synchronous);
end_bit, stream);
}

cudaError_t run_length_encode(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_unique_out, unsigned *d_counts_out,
unsigned *d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous) {
return DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream,
debug_synchronous);
unsigned *d_num_runs_out, int num_items, cudaStream_t stream) {
return DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);
}

cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream,
bool debug_synchronous) {
return DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream) {
return DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
}

} // namespace common
Expand All @@ -51,21 +47,20 @@ struct fq_mul {
__device__ __forceinline__ storage operator()(const storage &a, const storage &b) const { return fd_q::mul(a, b); }
};

cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream,
bool debug_synchronous) {
return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, fq_add(), fd_q::storage(), stream, debug_synchronous);
cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream) {
return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, fq_add(), fd_q::storage(), stream);
}

cudaError_t inclusive_prefix_product(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
cudaStream_t stream, bool debug_synchronous) {
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, fq_mul(), num_items, stream, debug_synchronous);
cudaStream_t stream) {
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, fq_mul(), num_items, stream);
}

cudaError_t inclusive_prefix_product_reverse(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
cudaStream_t stream, bool debug_synchronous) {
cudaStream_t stream) {
auto i_in = std::reverse_iterator(d_in + num_items);
auto i_out = std::reverse_iterator(d_out + num_items);
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, i_in, i_out, fq_mul(), num_items, stream, debug_synchronous);
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, i_in, i_out, fq_mul(), num_items, stream);
}

} // namespace ff
21 changes: 10 additions & 11 deletions src/bellman-cuda-cub.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,32 @@
namespace common {

cudaError_t sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, int num_items, int begin_bit = 0,
int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr, bool debug_synchronous = false);
int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr);

cudaError_t sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, const unsigned *d_values_in,
unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr,
bool debug_synchronous = false);
unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr);

cudaError_t sort_pairs_descending(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out,
const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8,
cudaStream_t stream = nullptr, bool debug_synchronous = false);
cudaStream_t stream = nullptr);

cudaError_t run_length_encode(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_unique_out, unsigned *d_counts_out,
unsigned *d_num_runs_out, int num_items, cudaStream_t stream = nullptr, bool debug_synchronous = false);
unsigned *d_num_runs_out, int num_items, cudaStream_t stream = nullptr);

cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream = nullptr,
bool debug_synchronous = false);
cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items,
cudaStream_t stream = nullptr);

} // namespace common

namespace ff {

cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream = nullptr,
bool debug_synchronous = false);
cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
cudaStream_t stream = nullptr);

cudaError_t inclusive_prefix_product(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
cudaStream_t stream = nullptr, bool debug_synchronous = false);
cudaStream_t stream = nullptr);

cudaError_t inclusive_prefix_product_reverse(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
cudaStream_t stream = nullptr, bool debug_synchronous = false);
cudaStream_t stream = nullptr);

} // namespace ff
1 change: 1 addition & 0 deletions src/memory.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ template <typename T, ld_modifier MODIFIER> static constexpr __device__ __forcei
case ld_modifier::cv:
return __ldcv(ptr);
}
return *ptr;
}

enum class st_modifier { none, wb, cg, cs, wt };
Expand Down
3 changes: 1 addition & 2 deletions src/ntt_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,8 @@ __device__ __forceinline__ typename FD::storage *index_to_addr(const per_device_
// "addrs" passed from ntt_smem_stages_kernel should be in constant memory, which is dynamically indexable.
// I guess nvcc moved ntt_smem_stages_kernel "inputs" and "outputs" to registers then tried to dynamically
// index addr.data here in index_to_addr. Smart :eyeroll: Whatever, switch statement works.
} else {
return addrs.data[0] + idx;
}
return addrs.data[0] + idx;
}

// Carries out up to MAX_SMEM_STAGES - log_tile_sz C-T stages in shared memory.
Expand Down

0 comments on commit 5ce3c31

Please sign in to comment.