diff --git a/GraphBLAS/CMakeLists.txt b/GraphBLAS/CMakeLists.txt
index 9157a4b906..8229506f9d 100644
--- a/GraphBLAS/CMakeLists.txt
+++ b/GraphBLAS/CMakeLists.txt
@@ -203,11 +203,6 @@
 configure_file ( "Config/README.md.in"
     "${PROJECT_SOURCE_DIR}/README.md"
     NEWLINE_STYLE LF )
-# for CUDA
-configure_file ( "CUDA/Config/GB_cuda_common_jitFactory.hpp.in"
-    "${PROJECT_SOURCE_DIR}/CUDA/GB_cuda_common_jitFactory.hpp"
-    NEWLINE_STYLE LF )
-
 #-------------------------------------------------------------------------------
 # include directories for both graphblas and the demos
 #-------------------------------------------------------------------------------
@@ -465,6 +460,7 @@ if ( GRAPHBLAS_HAS_OPENMP )
         target_link_libraries ( GraphBLAS_static PRIVATE OpenMP::OpenMP_C )
     endif ( )
     message ( STATUS "CMAKE OpenMP C flags: ${OpenMP_C_FLAGS}" )
+    set ( GB_OPENMP_C_FLAGS "${OpenMP_C_FLAGS}" )
 else ( )
     message ( WARNING
     "WARNING: OpenMP was not found (or was disabled with "
@@ -485,6 +481,7 @@ else ( )
         "The C compiler does not support thread-local-storage; "
         "GxB_Context_engage will return GrB_NOT_IMPLEMENTED." )
     endif ( )
+    set ( GB_OPENMP_C_FLAGS "" )
 endif ( )
 
 if ( SUITESPARSE_HAS_CUDA AND GRAPHBLAS_USE_CUDA )
diff --git a/GraphBLAS/CUDA/.gitignore b/GraphBLAS/CUDA/.gitignore
index 2650c12fe3..8d9e4b49ad 100644
--- a/GraphBLAS/CUDA/.gitignore
+++ b/GraphBLAS/CUDA/.gitignore
@@ -2,8 +2,6 @@
 *.o
 *.a
 *.so
-jitFactory
-stringify
 rmm_log.txt
 
 # Do not ignore this file
diff --git a/GraphBLAS/CUDA/CMakeLists.txt b/GraphBLAS/CUDA/CMakeLists.txt
index 2b477a36bf..c0c74d825d 100644
--- a/GraphBLAS/CUDA/CMakeLists.txt
+++ b/GraphBLAS/CUDA/CMakeLists.txt
@@ -2,7 +2,7 @@
 # GraphBLAS/CUDA/CMakeLists.txt: cmake script for GraphBLAS/CUDA
 #-------------------------------------------------------------------------------
 
-# SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+# SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
 # Some files in this folder are (c) NVIDIA or (c) Google. Please refer
 # to their individual licenses (Apache, BSD, or others).
 
@@ -12,11 +12,6 @@
 
 cmake_minimum_required ( VERSION 3.20 )     # GraphBLAS can be built stand-alone
 
-# CMake build for generating googletest c++ files that can be compiled and
-# executed in parallel.  Build can be customized to speed up development by
-# allowing the targeting of specific specific parameters.  The output of this
-# build is an executable that can be used to run the gtests.
-
 project ( GRAPHBLAS_CUDA
     VERSION "${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB}"
     LANGUAGES CXX CUDA )
@@ -29,7 +24,6 @@
 set ( CMAKE_CUDA_FLAGS "-cudart=static -lineinfo " )
 set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++17 -fPIC " )
 
 add_compile_definitions ( GBNCPUFEAT )
-add_compile_definitions ( GBCUDA_CPLUSPLUS )
 
 message ( STATUS "C++ flags for CUDA: ${CMAKE_CXX_FLAGS}" )
 
@@ -160,126 +154,13 @@ if ( NOT MSVC )
 endif ( )
 
 #-------------------------------------------------------------------------------
-# test suite for the CUDA kernels
+# report
 #-------------------------------------------------------------------------------
 
-if ( 0 )
-
-# 1. Execute enumify/stringify/jitify logic to compile ptx kernels and
-#    compile/link w/ relevant *.cu files.
-
-# TODO: Need to do this piece in cmake
-
-# 2. Generate test .cu files named "{semiring_operation}_test_instances.hpp"
-set ( CUDA_TEST_SUITES
-    AxB_dot3
-#    reduce_to_scalar
-)
-
-#
-set ( CUDA_TEST_MONOIDS PLUS MIN MAX) # TIMES ANY )
-set ( CUDA_TEST_BINOPS TIMES PLUS MIN MAX DIV ) #MINUS RDIV RMINUS FIRST SECOND PAIR )
-set ( CUDA_TEST_SEMIRINGS PLUS_TIMES MIN_PLUS MAX_PLUS )
-set ( CUDA_TEST_DATATYPES int32_t int64_t uint32_t uint64_t float double )
-set ( CUDA_TEST_KERNELS vsvs) # mp vsvs dndn spdn vssp )
-set ( CUDA_TEST_FORMATS sparse dense sparse_dense reduce )
-
-# TODO: Update testGen.py to accept the above CUDA_TEST_* params as arguments
-
-# Note: I don't believe there's a way to do this particular piece in parallel but
-# once all the files are written, we should be able to compile them in parallel
-
-# Separate individual kernels from larger "overview" test (e.g. 2-level testing structure)
-# We want to test all the *_cuda versions
-
-# TODO: make this a shorter test
-set(CUDA_TEST_CPP_FILES "")
-if ( FALSE ) # TODO: use a cmake option
-    foreach(var ${CUDA_TEST_SUITES})
-        foreach(semiring ${CUDA_TEST_SEMIRINGS})
-            foreach(kernel ${CUDA_TEST_KERNELS})
-                foreach(format ${CUDA_TEST_FORMATS})
-                    # TODO: Have Python script also build separate cudaTest.cpp (named something
-                    # like AxB_dot3_cuda_tests.cpp) for each suite. This way we should be able to
-                    # easily ignore them from the build
-                    add_custom_command(
-                        OUTPUT
-                            ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_${format}_test_instances.hpp
-                            ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_${format}_cuda_tests.cpp
-#                        DEPENDS
-#                            jitFactory.hpp
-                        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/test/testGen_cmake.py "\"${CMAKE_CURRENT_SOURCE_DIR}\"" "\"${var}\"" "\"${CUDA_TEST_MONOIDS}\""
-                            "\"${CUDA_TEST_BINOPS}\"" "\"${semiring}\"" "\"${CUDA_TEST_DATATYPES}\""
-                            "\"${kernel}\""
-                    )
-                    # Construct final list of files to compile (in parallel)
-                    list(APPEND CUDA_TEST_CPP_FILES ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_${format}_cuda_tests.cpp)
-                endforeach()
-            endforeach()
-        endforeach()
-    endforeach()
-endif ( )
-
-include(FetchContent)
-FetchContent_Declare(
-    googletest
-    # Specify the commit you depend on and update it regularly.
-    URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip
-)
-# For Windows: Prevent overriding the parent project's compiler/linker settings
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-FetchContent_GetProperties(googletest)
-if(NOT googletest_POPULATED)
-    FetchContent_Populate(googletest)
-    add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL)
-endif()
-
-#FetchContent_MakeAvailable(googletest EC)
-
-
-#file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes)
-#execute_process(
-#    COMMAND git clone "https://github.com/google/googletest.git" googletest
-#    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes)
-#
-#include_directories(${CMAKE_CURRENT_BINARY_DIR}/external_includes/googletest/googletest/include)
-
-#add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/external_includes/googletest/googletest/)
-
-# 3. Compile/link individual {test_suite_name}_cuda_tests.cpp files into a gtest executable
-set(GRAPHBLAS_CUDA_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/test)
+message ( STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES} ")
+message ( STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER} ")
+message ( STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS} ")
+message ( STATUS "CMAKE_CUDA_FLAGS_RELEASE: ${CMAKE_CUDA_FLAGS_RELEASE} ")
+message ( STATUS "CMAKE_CUDA_FLAGS_DEBUG: ${CMAKE_CUDA_FLAGS_DEBUG} ")
 
-message(STATUS "CUDA tests files: " "${CUDA_TEST_CPP_FILES}")
-
-add_executable(graphblascuda_test ${CUDA_TEST_CPP_FILES} ${CMAKE_CURRENT_SOURCE_DIR}/test/run_tests.cpp)
-
-set_target_properties(graphblascuda_test PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(graphblascuda_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-set_target_properties(graphblascuda_test PROPERTIES CUDA_ARCHITECTURES "52;75;80" )
-
-include(GoogleTest)
-
-if ( ENABLE_SHARED_LIBS )
-    target_link_libraries ( graphblascuda_test PUBLIC GraphBLAS )
-else ( )
-    target_link_libraries ( graphblascuda_test PUBLIC GraphBLAS_static )
-endif ( )
-
-target_link_libraries ( graphblascuda_test
-    PUBLIC
-        GraphBLAS_CUDA
-        RMM_wrap
-        CUDA::cudart_static
-        CUDA::nvrtc
-        ${ADDITIONAL_DEPS}
-    PRIVATE
-        gtest_main )
-
-target_include_directories ( graphblascuda_test
-    PUBLIC
-        rmm_wrap
-        ${ADDITIONAL_INCLUDES}
-        ${CUDAToolkit_INCLUDE_DIRS}
-        ${GRAPHBLAS_CUDA_INCLUDES} )
-
-endif ( )
diff --git a/GraphBLAS/CUDA/Config/GB_cuda_common_jitFactory.hpp.in b/GraphBLAS/CUDA/Config/GB_cuda_common_jitFactory.hpp.in
deleted file mode 100644
index 5d7ad01e6f..0000000000
--- a/GraphBLAS/CUDA/Config/GB_cuda_common_jitFactory.hpp.in
+++ /dev/null
@@ -1,82 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_common_jitFactory.hpp: for all jitFactory classes
-//------------------------------------------------------------------------------
-
-// (c) Nvidia Corp. 2023 All rights reserved
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-// Common defines for all jitFactory classes:
-// iostream callback to deliver the buffer to jitify as if read from a file
-// compiler flags
-// Include this file along with any jitFactory you need.
-
-// NOTE: do not edit the GB_cuda_common_jitFactory.hpp directly.  It is
-// configured by cmake from the following file:
-// GraphBLAS/CUDA/Config/GB_cuda_common_jitFactory.hpp.in
-
-#ifndef GB_CUDA_COMMON_JITFACTORY_HPP
-#define GB_CUDA_COMMON_JITFACTORY_HPP
-
-#pragma once
-
-#include "GraphBLAS_cuda.h"
-
-extern "C"
-{
-    #include "GB.h"
-    #include "GB_stringify.h"
-}
-
-#include
-#include
-#include "GB_cuda_jitify_cache.h"
-#include "GB_cuda_jitify_launcher.h"
-#include "GB_cuda_mxm_factory.hpp"
-#include "GB_cuda_error.h"
-#include "../rmm_wrap/rmm_wrap.h"
-#include "GB_iceil.h"
-
-// amount of shared memory to use in CUDA kernel launches
-constexpr unsigned int SMEM = 0 ;
-
-#if 0
-
-static const std::vector<std::string> GB_jit_cuda_compiler_flags{   // OLD
-    "-std=c++17",
-    //"-G",
-    "-remove-unused-globals",
-    "-w",
-    "-D__CUDACC_RTC__",
-//  "-I" + jit::get_user_home_cache_dir(),  // FIXME: add +/cu/00
-//  "-I" + jit::get_user_home_cache_dir() + "/src",
-    "-I/usr/local/cuda/include",
-    // FIXME: add SUITESPARSE_CUDA_ARCHITECTURES here, via config
-};
-
-#endif
-
-inline std::vector<std::string> GB_cuda_jit_compiler_flags ( )
-{
-    return (
-        std::vector<std::string> (
-        {"-std=c++17",
-        //"-G",
-        "-remove-unused-globals",
-        "-w",
-        "-D__CUDACC_RTC__",
-        "-I" + jit::get_user_home_cache_dir(),  // FIXME: add +/cu/00
-        "-I" + jit::get_user_home_cache_dir() + "/src",
-        "-I/usr/local/cuda/include"
-        // FIXME: add SUITESPARSE_CUDA_ARCHITECTURES here, via config
-        })) ;
-} ;
-
-// FIXME: rename GB_jit_cuda_header_names or something
-static const std::vector<std::string> header_names ={};
-
-// FIXME: rename GB_jit_cuda_file_callback
-inline std::istream* (*file_callback)(std::string, std::iostream&);
-
-#endif
diff --git a/GraphBLAS/CUDA/Config/GraphBLAS_CUDA.pc.in b/GraphBLAS/CUDA/Config/GraphBLAS_CUDA.pc.in
index 2f5a31ea12..befb30bbe6 100644
--- a/GraphBLAS/CUDA/Config/GraphBLAS_CUDA.pc.in
+++ b/GraphBLAS/CUDA/Config/GraphBLAS_CUDA.pc.in
@@ -1,4 +1,4 @@
-# GraphBLAS_CUDA, Copyright (c) 2017-2023, Timothy A. Davis.
+# GraphBLAS_CUDA, Copyright (c) 2017-2024, FIXME
 # All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
 
diff --git a/GraphBLAS/CUDA/Config/GraphBLAS_CUDAConfig.cmake.in b/GraphBLAS/CUDA/Config/GraphBLAS_CUDAConfig.cmake.in
index 41db265312..6db344ffb7 100644
--- a/GraphBLAS/CUDA/Config/GraphBLAS_CUDAConfig.cmake.in
+++ b/GraphBLAS/CUDA/Config/GraphBLAS_CUDAConfig.cmake.in
@@ -4,7 +4,7 @@
 # The following copyright and license applies to just this file only, not to
 # the library itself:
-# GraphBLASConfig.cmake, Copyright (c) 2023, Timothy A. Davis.  All Rights Reserved.
+# GraphBLASConfig.cmake, Copyright (c) 2023-2024, FIXME
 # SPDX-License-Identifier: BSD-3-clause
 
 #-------------------------------------------------------------------------------
diff --git a/GraphBLAS/CUDA/GB_cuda.h b/GraphBLAS/CUDA/GB_cuda.h
deleted file mode 100644
index 3dac3a7c5e..0000000000
--- a/GraphBLAS/CUDA/GB_cuda.h
+++ /dev/null
@@ -1,139 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda.h
-//------------------------------------------------------------------------------
-
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-#ifndef GB_CUDA_H
-#define GB_CUDA_H
-
-extern "C"
-{
-    #include "GB_dev.h"
-    #include "GB_compiler.h"
-    #include "GB_cpu_features.h"
-    #include "GB_warnings.h"
-}
-
-#include "GraphBLAS_cuda.h"
-
-extern "C"
-{
-    #include
-    #include
-    #include "GB.h"
-}
-
-// Finally, include the CUDA definitions
-#include "cuda_runtime.h"
-#include "cuda.h"
-// #include "cub.cuh"
-#include "jitify.hpp"
-#include "GB_cuda_mxm_factory.hpp"
-
-#include
-
-#define CHECK_CUDA_SIMPLE(call)                                           \
-  do {                                                                    \
-    cudaError_t err = call;                                               \
-    if (err != cudaSuccess) {                                             \
-      const char* str = cudaGetErrorName( err);                           \
-      std::cout << "(CUDA runtime) returned " << str;                     \
-      std::cout << " (" << __FILE__ << ":" << __LINE__ << ":" << __func__ \
-                << "())" << std::endl;                                    \
-      return (GrB_PANIC) ;                                                \
-    }                                                                     \
-  } while (0)
-
-#define CU_OK(call) CHECK_CUDA_SIMPLE(call)
-
-//------------------------------------------------------------------------------
-// GB_CUDA_CATCH: catch error from a try { ... } region
-//------------------------------------------------------------------------------
-
-// #define GB_FREE_ALL { some macro to free all temporaries }
-// GrB_Info info ;
-// try { ... do stuff that can throw an exception }
-// GB_CUDA_CATCH (info) ;
-
-#define GB_CUDA_CATCH(info)                                 \
-    catch (std::exception& e)                               \
-    {                                                       \
-        printf ("CUDA error: %s\n", e.what ( )) ;           \
-        info = GrB_PANIC ;                                  \
-        /* out_of_memory : info = GrB_OUT_OF_MEMORY ; */    \
-        /* nulltpr: info = ... ; */                         \
-        /* no gpus here: info = GrB_PANIC ; */              \
-    }                                                       \
-    if (info != GrB_SUCCESS)                                \
-    {                                                       \
-        /* CUDA failed */                                   \
-        GB_FREE_ALL ;                                       \
-        return (info) ;                                     \
-    }
-
-// NBUCKETS buckets: computed by up to NBUCKETS-1 kernel launches (zombies need
-// no work...), using different kernels (with different configurations
-// depending on the bucket).
-
-#include "GB_cuda_buckets.h"
-
-extern "C"
-{
-    #include "GB_stringify.h"
-}
-
-//------------------------------------------------------------------------------
-// prefetch and memadvise
-//------------------------------------------------------------------------------
-
-// for the "which" parameter of GB_cuda_matrix_prefetch:
-// FIXME: rename this to GB_WHATEVER_P for GB_cuda_matrix_advise
-#define GB_PREFETCH_P 1
-#define GB_PREFETCH_H 2
-#define GB_PREFETCH_Y 4
-#define GB_PREFETCH_B 8
-#define GB_PREFETCH_I 16
-#define GB_PREFETCH_X 32
-#define GB_PREFETCH_PIX (GB_PREFETCH_P + GB_PREFETCH_I + GB_PREFETCH_X)
-#define GB_PREFETCH_PYI (GB_PREFETCH_P + GB_PREFETCH_Y + GB_PREFETCH_I)
-#define GB_PREFETCH_PYBI (GB_PREFETCH_PYI + GB_PREFETCH_B)
-#define GB_PREFETCH_PYBIX (GB_PREFETCH_PYBI + GB_PREFETCH_X)
-#define GB_PREFETCH_PHI (GB_PREFETCH_P + GB_PREFETCH_H + GB_PREFETCH_I)
-#define GB_PREFETCH_PHBI (GB_PREFETCH_PHI + GB_PREFETCH_B)
-#define GB_PREFETCH_PHBIX (GB_PREFETCH_PHBI + GB_PREFETCH_X)
-
-GrB_Info GB_cuda_matrix_prefetch
-(
-    GrB_Matrix A,
-    int which,              // which components to prefetch (phybix control)
-    int device,             // GPU device or cudaCpuDeviceId
-    cudaStream_t stream
-) ;
-
-#if 0
-// do we need this function too?
-GrB_Info GB_cuda_matrix_advise
-(
-    GrB_Matrix A,
-
-    p, h, y, b, i, x?   6 bools
-
-    what to do:  advise (prefer location?  access by)?  prefetch?  nothing?
-    avdice: enum (1 to 6)
-
-    int device,             // GPU device or cudaCpuDeviceId
-) ;
-#endif
-
-void GB_cuda_upscale_identity
-(
-    GB_void *identity_upscaled,     // output: at least sizeof (uint16_t)
-    GrB_Monoid monoid               // input: monoid to upscale
-) ;
-
-#endif
-
diff --git a/GraphBLAS/CUDA/GB_cuda.hpp b/GraphBLAS/CUDA/GB_cuda.hpp
new file mode 100644
index 0000000000..baa0bc23ee
--- /dev/null
+++ b/GraphBLAS/CUDA/GB_cuda.hpp
@@ -0,0 +1,97 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/GB_cuda.hpp: include file for host CUDA methods (not for JIT)
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+#ifndef GB_CUDA_HPP
+#define GB_CUDA_HPP
+
+extern "C"
+{
+    #include "GB_dev.h"
+    #include "GB_compiler.h"
+    #include "GB_cpu_features.h"
+    #include "GB_warnings.h"
+}
+
+#include "GraphBLAS_cuda.hpp"
+
+extern "C"
+{
+    #include
+    #include
+    #include "GB.h"
+    #include "GB_stringify.h"
+    #include "GB_iceil.h"
+}
+
+// Finally, include the CUDA definitions
+#include "cuda_runtime.h"
+#include "cuda.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "GB_cuda_error.hpp"
+#include "GB_cuda_timer.hpp"
+
+//------------------------------------------------------------------------------
+// prefetch and memadvise
+//------------------------------------------------------------------------------
+
+// for the "which" parameter of GB_cuda_matrix_prefetch:
+// FIXME: rename this to GB_WHATEVER_P for GB_cuda_matrix_advise
+
+#define GB_PREFETCH_P 1
+#define GB_PREFETCH_H 2
+#define GB_PREFETCH_Y 4
+#define GB_PREFETCH_B 8
+#define GB_PREFETCH_I 16
+#define GB_PREFETCH_X 32
+#define GB_PREFETCH_PIX (GB_PREFETCH_P + GB_PREFETCH_I + GB_PREFETCH_X)
+#define GB_PREFETCH_PYI (GB_PREFETCH_P + GB_PREFETCH_Y + GB_PREFETCH_I)
+#define GB_PREFETCH_PYBI (GB_PREFETCH_PYI + GB_PREFETCH_B)
+#define GB_PREFETCH_PYBIX (GB_PREFETCH_PYBI + GB_PREFETCH_X)
+#define GB_PREFETCH_PHI (GB_PREFETCH_P + GB_PREFETCH_H + GB_PREFETCH_I)
+#define GB_PREFETCH_PHBI (GB_PREFETCH_PHI + GB_PREFETCH_B)
+#define GB_PREFETCH_PHBIX (GB_PREFETCH_PHBI + GB_PREFETCH_X)
+
+GrB_Info GB_cuda_matrix_prefetch
+(
+    GrB_Matrix A,
+    int which,              // which components to prefetch (phybix control)
+    int device,             // GPU device or cudaCpuDeviceId
+    cudaStream_t stream
+) ;
+
+#if 0
+// do we need this function too?
+GrB_Info GB_cuda_matrix_advise
+(
+    GrB_Matrix A,
+
+    p, h, y, b, i, x?   6 bools
+
+    what to do:  advise (prefer location?  access by)?  prefetch?  nothing?
+    advice: enum (1 to 6)
+
+    int device,             // GPU device or cudaCpuDeviceId
+) ;
+#endif
+
+void GB_cuda_upscale_identity
+(
+    GB_void *identity_upscaled,     // output: at least sizeof (uint32_t)
+    GrB_Monoid monoid               // input: monoid to upscale
+) ;
+
+#endif
+
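The GB_PREFETCH_* masks defined in GB_cuda.hpp above are plain bit flags: each component of a matrix (p, h, Y, b, i, x) gets one bit, and the composite masks are sums of those bits. A minimal standalone sketch of how such a mask can drive cudaMemPrefetchAsync on managed arrays follows; the names prefetch_parts, PF_P, PF_I, and PF_X are hypothetical stand-ins, not part of this patch.

    #include <cuda_runtime.h>
    #include <cstdint>

    #define PF_P 1      // hypothetical stand-ins for GB_PREFETCH_P, _I, _X
    #define PF_I 16
    #define PF_X 32

    // prefetch only the selected components of a (p,i,x) sparse structure
    static cudaError_t prefetch_parts (int64_t *p, int64_t *i, double *x,
        size_t plen, size_t ilen, size_t xlen, int which, int device,
        cudaStream_t stream)
    {
        cudaError_t err = cudaSuccess ;
        if (err == cudaSuccess && (which & PF_P))
            err = cudaMemPrefetchAsync (p, plen * sizeof (int64_t), device, stream) ;
        if (err == cudaSuccess && (which & PF_I))
            err = cudaMemPrefetchAsync (i, ilen * sizeof (int64_t), device, stream) ;
        if (err == cudaSuccess && (which & PF_X))
            err = cudaMemPrefetchAsync (x, xlen * sizeof (double), device, stream) ;
        return (err) ;
    }
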
diff --git a/GraphBLAS/CUDA/GB_cuda_AxB.hpp b/GraphBLAS/CUDA/GB_cuda_AxB.hpp
new file mode 100644
index 0000000000..19a319777e
--- /dev/null
+++ b/GraphBLAS/CUDA/GB_cuda_AxB.hpp
@@ -0,0 +1,33 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/GB_cuda_AxB.hpp
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+#ifndef GB_CUDA_AXB_H
+#define GB_CUDA_AXB_H
+
+#include "GB_cuda.hpp"
+#include "GB_hash.h"
+
+GrB_Info GB_cuda_AxB_dot3_jit
+(
+    // input/output:
+    GrB_Matrix C,               // FIXME: allow iso for this kernel
+    // input:
+    const GrB_Matrix M, const bool Mask_struct,
+    const GrB_Matrix A,
+    const GrB_Matrix B,
+    const GrB_Semiring semiring,
+    const bool flipxy,
+    // CUDA stream, device, and # of SMs
+    cudaStream_t stream,
+    int device,
+    int number_of_sms
+) ;
+
+#endif
+
diff --git a/GraphBLAS/CUDA/GB_cuda_AxB_dot3.cpp b/GraphBLAS/CUDA/GB_cuda_AxB_dot3.cpp
new file mode 100644
index 0000000000..df14d833cf
--- /dev/null
+++ b/GraphBLAS/CUDA/GB_cuda_AxB_dot3.cpp
@@ -0,0 +1,257 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/GB_cuda_AxB_dot3: compute C = A'*B on GPU(s)
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+// This function computes C=A'*B on the GPUs.  The mask must be present,
+// sparse or hypersparse, and not complemented.  The mask is always applied.  A
+// and B can have any sparsity format.  C is computed as sparse or hypersparse,
+// with the same format as M.
+
+#define GB_FREE_WORKSPACE                                       \
+{                                                               \
+    /* FIXME: use a stream pool instead */                      \
+    if (stream != nullptr) cudaStreamDestroy (stream) ;         \
+    stream = nullptr ;                                          \
+}
+
+#define GB_FREE_ALL                                             \
+{                                                               \
+    GB_FREE_WORKSPACE ;                                         \
+    GB_phybix_free (C) ;                                        \
+}
+
+#include "GB_cuda_AxB.hpp"
+
+//------------------------------------------------------------------------------
+// GB_cuda_AxB_dot3
+//------------------------------------------------------------------------------
+
+GrB_Info GB_cuda_AxB_dot3           // C = A'*B using dot product method
+(
+    GrB_Matrix C,                   // output matrix
+    const GrB_Matrix M,             // mask matrix
+    const bool Mask_struct,         // if true, use only the structure of M
+    const GrB_Matrix A,             // input matrix
+    const GrB_Matrix B,             // input matrix
+    const GrB_Semiring semiring,    // semiring that defines C=A*B
+    const bool flipxy               // if true, do z=fmult(b,a) vs fmult(a,b)
+)
+{
+
+    cudaStream_t stream = nullptr ;
+
+    //--------------------------------------------------------------------------
+    // create the stream
+    //--------------------------------------------------------------------------
+
+    // FIXME: pass in a stream instead, or checkout a stream
+    CUDA_OK (cudaStreamCreate (&stream)) ;
+    GpuTimer kernel_timer;
+
+    //--------------------------------------------------------------------------
+    // check inputs
+    //--------------------------------------------------------------------------
+
+    // when CUDA is enabled, no static headers are used in all of GraphBLAS
+    GrB_Info info ;
+    ASSERT (C != NULL && !(C->static_header)) ;
+    ASSERT (M != NULL && !(M->static_header)) ;
+    ASSERT (A != NULL && !(A->static_header)) ;
+    ASSERT (B != NULL && !(B->static_header)) ;
+
+    ASSERT_MATRIX_OK (M, "M for dot3 cuda A'*B", GB0) ;
+    ASSERT_MATRIX_OK (A, "A for dot3 cuda A'*B", GB0) ;
+    ASSERT_MATRIX_OK (B, "B for dot3 cuda A'*B", GB0) ;
+
+    ASSERT (!GB_PENDING (M)) ;
+    ASSERT (GB_JUMBLED_OK (M)) ;
+    ASSERT (!GB_ZOMBIES (M)) ;
+
+    ASSERT (!GB_PENDING (A)) ;
+    ASSERT (!GB_JUMBLED (A)) ;
+    ASSERT (!GB_ZOMBIES (A)) ;
+
+    ASSERT (!GB_PENDING (B)) ;
+    ASSERT (!GB_ZOMBIES (B)) ;
+    ASSERT (!GB_JUMBLED (B)) ;
+
+    ASSERT_SEMIRING_OK (semiring, "semiring for dot3 numeric A'*B", GB0) ;
+
+    ASSERT (A->vlen == B->vlen) ;
+    GBURBLE ("(GPU dot3) ") ;
+
+    //--------------------------------------------------------------------------
+    // initializations
+    //--------------------------------------------------------------------------
+
+    int device = -1;
+
+    // FIXME: control the GPU to use via the descriptor
+    CUDA_OK (cudaSetDevice ( 0 )) ;
+    CUDA_OK (cudaGetDevice (&device)) ;
+    int number_of_sms = GB_Global_gpu_sm_get (0) ;
+
+    //--------------------------------------------------------------------------
+    // get M, A, and B
+    //--------------------------------------------------------------------------
+
+    const int64_t mvlen = M->vlen ;
+    const int64_t mvdim = M->vdim ;
+    const int64_t mnz = GB_nnz (M) ;
+    const int64_t mnvec = M->nvec ;
+    const bool M_is_hyper = GB_IS_HYPERSPARSE( M ) ;
+
+    const int64_t anz = GB_nnz (A) ;
+    const int64_t anvec = A->nvec ;
+    bool A_is_sparse = GB_IS_SPARSE (A) ;
+    bool A_is_hyper = GB_IS_HYPERSPARSE (A) ;
+    bool A_is_bitmap = GB_IS_BITMAP (A) ;
+    bool A_is_full = GB_IS_FULL (A) ;
+    bool A_is_sparse_or_hyper = A_is_sparse || A_is_hyper ;
+    bool A_is_bitmap_or_full = A_is_bitmap || A_is_full ;
+
+    const int64_t bnz = GB_nnz (B) ;
+    const int64_t bnvec = B->nvec ;
+    bool B_is_sparse = GB_IS_SPARSE (B) ;
+    bool B_is_hyper = GB_IS_HYPERSPARSE (B) ;
+    bool B_is_bitmap = GB_IS_BITMAP (B) ;
+    bool B_is_full = GB_IS_FULL (B) ;
+    bool B_is_sparse_or_hyper = B_is_sparse || B_is_hyper ;
+    bool B_is_bitmap_or_full = B_is_bitmap || B_is_full ;
+
+    //--------------------------------------------------------------------------
+    // get the semiring operators
+    //--------------------------------------------------------------------------
+
+    GrB_BinaryOp mult = semiring->multiply ;
+    GrB_Monoid add = semiring->add ;
+    ASSERT (mult->ztype == add->op->ztype) ;
+    GB_Opcode mult_opcode = mult->opcode ;
+    if (mult->xtype->code == GB_BOOL_code)
+    {
+        mult_opcode = GB_boolean_rename (mult_opcode) ;
+    }
+    bool A_is_pattern, B_is_pattern ;
+    GB_binop_pattern (&A_is_pattern, &B_is_pattern, flipxy, mult_opcode) ;
+
+    //--------------------------------------------------------------------------
+    // allocate C, the same size and # of entries as M
+    //--------------------------------------------------------------------------
+
+    // FUTURE: ctype need not be the op->ztype
+    GrB_Type ctype = add->op->ztype ;
+    int64_t cvlen = mvlen ;
+    int64_t cvdim = mvdim ;
+    int64_t cnz = mnz ;
+    int64_t cnvec = mnvec ;
+
+    int M_sparsity = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
+    int C_sparsity = M_sparsity ;
+    bool C_iso = false ;        // FIXME: pass in C_iso and cscalar
+    bool C_in_iso = false ;     // FIXME: pass in C_in_iso and cscalar
+
+    if (C_iso)
+    {
+        A_is_pattern = true ;
+        B_is_pattern = true ;
+    }
+
+    GB_OK (GB_new_bix (&C, // sparse or hyper (from M), existing header
+        ctype, cvlen, cvdim, GB_Ap_malloc, true,
+        M_sparsity, false, M->hyper_switch, cnvec,
+        cnz+1,  // add one to cnz for GB_cumsum of Cwork
+        true, C_iso)) ;
+
+    //--------------------------------------------------------------------------
+    // Pre-fetch arrays that will be used on the device
+    //--------------------------------------------------------------------------
+
+    // GB_cuda_matrix_advise (C, cnvec, cnz, which, what, device)
+    // advise C
+    CUDA_OK (cudaMemAdvise (C->p, (cnvec+1) * sizeof ( int64_t),
+        cudaMemAdviseSetPreferredLocation, device)) ;
+    if (M_is_hyper)
+    {
+        CUDA_OK (cudaMemAdvise (C->h, cnvec * sizeof ( int64_t),
+            cudaMemAdviseSetPreferredLocation, device)) ;
+    }
+    CUDA_OK (cudaMemAdvise (C->i, (cnz+1) * sizeof ( int64_t),
+        cudaMemAdviseSetPreferredLocation, device)) ;
+    if (!C_iso)
+    {
+        CUDA_OK (cudaMemAdvise (C->x, (cnz+1) * C->type->size ,
+            cudaMemAdviseSetPreferredLocation, device)) ;
+    }
+
+    // prefetch M (if M hypersparse: using M->h not M->Y)
+    GB_OK (GB_cuda_matrix_prefetch (M,
+        Mask_struct ? GB_PREFETCH_PHBI : GB_PREFETCH_PHBIX, device, stream)) ;
+
+    //--------------------------------------------------------------------------
+    // copy Mp and Mh into C
+    //--------------------------------------------------------------------------
+
+    // FIXME: use shallow?
+    CUDA_OK (cudaMemcpyAsync (C->p, M->p, (cnvec+1) * sizeof (int64_t),
+        cudaMemcpyDefault, stream)) ;
+    if (M_is_hyper)
+    {
+        CUDA_OK (cudaMemcpyAsync (C->h, M->h, cnvec * sizeof (int64_t),
+            cudaMemcpyDefault, stream)) ;
+    }
+
+    C->nvals = cnz ;
+    C->magic = GB_MAGIC ;
+    C->nvec_nonempty = M->nvec_nonempty ;
+    C->jumbled = GB_JUMBLED (M) ;   // C is jumbled if M is jumbled
+
+    GBURBLE ("(GPU C created and copied from M) ") ;
+
+    //--------------------------------------------------------------------------
+    // prefetch A and B
+    //--------------------------------------------------------------------------
+
+    // M might be very very sparse.  A(:,i) is not needed if M(:,i) is empty.
+    // Likewise, B(:,j) is not needed if M(:,j) is empty.  For now, try this
+    // heuristic:  if M is hypersparse, then do not prefetch A->b or A->x.
+
+    int prefetch_b = (M_is_hyper) ? 0 : GB_PREFETCH_B ;
+    int prefetch_x = (M_is_hyper) ? 0 : GB_PREFETCH_X ;
+    int prefetch_pybi = GB_PREFETCH_PYI + prefetch_b ;
+
+    // prefetch A (if A hypersparse: using A->Y)
+    GB_OK (GB_cuda_matrix_prefetch (A, prefetch_pybi +
+        (A_is_pattern ? 0 : prefetch_x), device, stream)) ;
+
+    // prefetch B (if B hypersparse: using B->Y)
+    GB_OK (GB_cuda_matrix_prefetch (B, prefetch_pybi +
+        (B_is_pattern ? 0 : prefetch_x), device, stream)) ;
+
+    //--------------------------------------------------------------------------
+    // C=A'*B on CUDA, in the JIT
+    //--------------------------------------------------------------------------
+
+// final call looks like this:
+//  GB_OK (GB_cuda_AxB_dot3_jit (C, M, Mask_struct, A, B, semiring, flipxy,
+//      stream, device, number_of_sms)) ;
+
+// debugging for now, to die early if the CUDA fails to compile, load, or run:
+    info = GB_cuda_AxB_dot3_jit (C, M, Mask_struct, A, B, semiring, flipxy,
+        stream, device, number_of_sms) ;
+    if (info == GrB_NO_VALUE) info = GrB_PANIC ;
+    GB_OK (info) ;
+
+    //--------------------------------------------------------------------------
+    // free workspace and return result
+    //--------------------------------------------------------------------------
+
+    GB_FREE_WORKSPACE ;
+    return GrB_SUCCESS;
+}
+
diff --git a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch.cpp b/GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch.cpp
index c69dc2132f..cac90233f2 100644
--- a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch.cpp
+++ b/GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch.cpp
@@ -2,20 +2,14 @@
 // GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch: decide to use GPU for dot3
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-// Decide branch direction for GPU use for the dot-product MxM
+// Decide branch direction for GPU use for the dot-product C=A'*B
 
-#include "GraphBLAS_cuda.h"
-
-extern "C"
-{
-    #include "GB_mxm.h"
-}
-#include "GB_cuda.h"
+#include "GB_cuda.hpp"
 
 #include
 
 bool GB_cuda_AxB_dot3_branch
@@ -36,8 +30,12 @@ bool GB_cuda_AxB_dot3_branch
         !GB_cuda_type_branch (semiring->multiply->ztype))
     {
         // one or more types are not yet supported on the GPU
-        // FIXME: remove debug output here:
-        std::cout << "Not using cuda path: type size not supported" << std::endl;
+        return (false) ;
+    }
+
+    if (A->vlen == 0)
+    {
+        // C has no entries: no need to compute it on the GPU
         return (false) ;
     }
 
@@ -46,9 +44,6 @@ bool GB_cuda_AxB_dot3_branch
     double bdeg = ((double) GB_nnz (B)) / ((double) GB_IMAX (1, B->nvec)) ;
     double work = GB_nnz (M) * GB_IMIN (adeg, bdeg) ;
 
-    // TODO if A or B are not accessed (first, 2nd, or pair ops)
-    // then the type if A can be user-defined here, for CUDA.
-
     int ngpus_to_use = GB_ngpus_to_use (work) ;
     GBURBLE (" work:%g GPUs:%d ", work, ngpus_to_use) ;
     if (ngpus_to_use > 0)
@@ -60,9 +55,8 @@ bool GB_cuda_AxB_dot3_branch
     }
     else
     {
-        // FIXME: remove debug output here:
-        std::cout << "Not using cuda path." << std::endl;
+//      std::cout << "Not using cuda path for dot3." << std::endl;
         return false ;
     }
-}
+}
+
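GB_cuda_AxB_dot3 and its branch predicate above are reached from GrB_mxm: the dot3 method applies when a mask is present and A arrives transposed. A minimal user-level sketch of such a call follows (assuming a CUDA-enabled build of SuiteSparse:GraphBLAS; the helper name masked_dot_product is hypothetical).

    #include "GraphBLAS.h"

    GrB_Info masked_dot_product (GrB_Matrix C, GrB_Matrix M,
        GrB_Matrix A, GrB_Matrix B)
    {
        // C<M> = A'*B with the PLUS_TIMES semiring; with a sparse mask and
        // GrB_DESC_T0 (transpose A), SuiteSparse can select the dot3 method,
        // and GB_cuda_AxB_dot3_branch decides whether it runs on the GPU.
        return (GrB_mxm (C, M, NULL, GrB_PLUS_TIMES_SEMIRING_FP64, A, B,
            GrB_DESC_T0)) ;
    }
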
diff --git a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit.cpp b/GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit.cpp
index 5933403dd9..fab1a7bdad 100644
--- a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit.cpp
+++ b/GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit.cpp
@@ -1,464 +1,65 @@
 //------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit: compute C = A'*B on GPU(s)
+// GB_cuda_AxB_dot3_jit: compute C=A'*B via the CUDA JIT
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-// This function computes C=A'*B on the GPUs.  The mask must be present,
-// sparse or hypersparse, and not complemented.  The mask is always applied.  A
-// and B can have any sparsity format.  C is computed as sparse or hypersparse,
-// with the same format as M.
+#include "GB_cuda_AxB.hpp"
 
-#include "GB_cuda.h"
 extern "C"
 {
-    #include "GB_mxm.h"
+    typedef GB_JIT_CUDA_KERNEL_DOT3_PROTO ((*GB_jit_dl_function)) ;
 }
 
-#include "GB_cuda_jitify_cache.h"
-#include "GB_cuda_common_jitFactory.hpp"
-#include "GB_cuda_reduce_jitFactory.hpp"
-#include "GB_cuda_mxm_dot3_jitFactory.hpp"
-#include "test/GpuTimer.h"
-
-/*
-template <typename I>
-void print_array(void *arr, I size, const char *name) {
-    std::cout << "Printing " << name << std::endl;
-    for(I i = 0; i < size; ++i) {
-        std::cout << static_cast<I*>(arr)[i] << ", ";
-    }
-    std::cout << std::endl << "Done." << std::endl;
-}
-*/
-
-#undef  GB_FREE_WORKSPACE
-#define GB_FREE_WORKSPACE                                   \
-{                                                           \
-    /* FIXME: use a stream pool instead */                  \
-    CU_OK (cudaStreamSynchronize(stream));                  \
-    CU_OK (cudaStreamDestroy(stream));                      \
-    GB_FREE_WORK (&Nanobuckets, Nb_size) ;                  \
-    GB_FREE_WORK (&Blockbucket, Bb_size) ;                  \
-    GB_FREE_WORK (&Bucketp, Bup_size) ;                     \
-    GB_FREE_WORK (&offset, O_size) ;                        \
-    GB_FREE_WORK (&Bucket, Bu_size) ;                       \
-}
-
-#undef  GB_FREE_ALL
-#define GB_FREE_ALL                                         \
-{                                                           \
-    GB_FREE_WORKSPACE ;                                     \
-    GB_phybix_free (C) ;                                    \
-}
-
-//------------------------------------------------------------------------------
-// GB_AxB_dot3_cuda
-//------------------------------------------------------------------------------
-
-GrB_Info GB_cuda_AxB_dot3_jit       // C = A'*B using dot product method
+GrB_Info GB_cuda_AxB_dot3_jit
 (
-    GrB_Matrix C,                   // output matrix
-    const GrB_Matrix M,             // mask matrix
-    const bool Mask_struct,         // if true, use only the structure of M
-    const GrB_Matrix A,             // input matrix
-    const GrB_Matrix B,             // input matrix
-    const GrB_Semiring semiring,    // semiring that defines C=A*B
-    const bool flipxy               // if true, do z=fmult(b,a) vs fmult(a,b)
+    // input/output:
+    GrB_Matrix C,               // FIXME: allow iso for this kernel
+    // input:
+    const GrB_Matrix M, const bool Mask_struct,
+    const GrB_Matrix A,
+    const GrB_Matrix B,
+    const GrB_Semiring semiring,
+    const bool flipxy,
+    // CUDA stream, device, and # of SMs
+    cudaStream_t stream,
+    int device,
+    int number_of_sms
 )
-{
-
-    // FIXME: pass in a stream instead, or checkout a stream
-    cudaStream_t stream = NULL ;
-    CU_OK (cudaStreamCreate(&stream));
-
-    GpuTimer kernel_timer;
-
-    //--------------------------------------------------------------------------
-    // check inputs
-    //--------------------------------------------------------------------------
-
-    // when CUDA is enabled, no static headers are used in all of GraphBLAS
-    GrB_Info info ;
-    ASSERT (C != NULL && !(C->static_header)) ;
-    ASSERT (M != NULL && !(M->static_header)) ;
-    ASSERT (A != NULL && !(A->static_header)) ;
-    ASSERT (B != NULL && !(B->static_header)) ;
-
-    ASSERT_MATRIX_OK (M, "M for dot3 cuda A'*B", GB0) ;
-    ASSERT_MATRIX_OK (A, "A for dot3 cuda A'*B", GB0) ;
-    ASSERT_MATRIX_OK (B, "B for dot3 cuda A'*B", GB0) ;
-
-    ASSERT (!GB_PENDING (M)) ;
-    ASSERT (GB_JUMBLED_OK (M)) ;
-    ASSERT (!GB_ZOMBIES (M)) ;
-
-    ASSERT (!GB_PENDING (A)) ;
-    ASSERT (!GB_JUMBLED (A)) ;
-    ASSERT (!GB_ZOMBIES (A)) ;
-
-    ASSERT (!GB_PENDING (B)) ;
-    ASSERT (!GB_ZOMBIES (B)) ;
-    ASSERT (!GB_JUMBLED (B)) ;
-
-    ASSERT_SEMIRING_OK (semiring, "semiring for dot3 numeric A'*B", GB0) ;
-
-    ASSERT (A->vlen == B->vlen) ;
-    GBURBLE ("(GPU dot3) ") ;
-    //printf ("\nM -------------\n") ; GxB_Matrix_fprint (M, "M", GxB_SHORT, stdout) ;
-    //printf ("\nA -------------\n") ; GxB_Matrix_fprint (A, "A", GxB_SHORT, stdout) ;
-    //printf ("\nB -------------\n") ; GxB_Matrix_fprint (B, "B", GxB_SHORT, stdout) ;
-
-    //--------------------------------------------------------------------------
-    // initializations
-    //--------------------------------------------------------------------------
-
-    int64_t *Nanobuckets = NULL ; size_t Nb_size = 0 ;
-    int64_t *Blockbucket = NULL ; size_t Bb_size = 0 ;
-    int64_t *Bucket = NULL ; size_t Bu_size = 0 ;
-    int64_t *Bucketp = NULL ; size_t Bup_size = 0 ;
-    int64_t *offset = NULL ; size_t O_size = 0 ;
-
-    int device = -1;
-
-    // FIXME: control the GPU to use via the descriptor
-    CU_OK (cudaSetDevice( 0 ));
-    CU_OK (cudaGetDevice(&device));
-
-    //--------------------------------------------------------------------------
-    // get M
-    //--------------------------------------------------------------------------
-
-    const int64_t mvlen = M->vlen ;
-    const int64_t mvdim = M->vdim ;
-    const int64_t mnz = GB_nnz (M) ;
-    const int64_t mnvec = M->nvec ;
-    const bool M_is_hyper = GB_IS_HYPERSPARSE( M ) ;
-
-    const int64_t anz = GB_nnz (A) ;
-    const int64_t anvec = A->nvec ;
-
-    const int64_t bnz = GB_nnz (B) ;
-    const int64_t bnvec = B->nvec ;
-
-    //--------------------------------------------------------------------------
-    // allocate C, the same size and # of entries as M
-    //--------------------------------------------------------------------------
-
-    // FUTURE: ctype need not be the op->ztype
-    GrB_Type ctype = semiring->add->op->ztype ;
-    int64_t cvlen = mvlen ;
-    int64_t cvdim = mvdim ;
-    int64_t cnz = mnz ;
-    int64_t cnvec = mnvec ;
-
-    int M_sparsity = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
-    int C_sparsity = M_sparsity ;
-    bool C_iso = false ;        // FIXME: pass in C_iso and cscalar
-    bool C_in_iso = false ;     // FIXME: pass in C_in_iso and cscalar
-    info = GB_new_bix (&C, // sparse or hyper (from M), existing header
-        ctype, cvlen, cvdim, GB_Ap_malloc, true,
-        M_sparsity, false, M->hyper_switch, cnvec,
-        cnz+1,  // add one to cnz for GB_cumsum of Cwork
-        true, C_iso) ;
-
-    if (info != GrB_SUCCESS)
-    {
-        // out of memory
-        GB_FREE_ALL ;
-        return (info) ;
-    }
-
-// try this with GB_Ap_null, above in GB_new_bix
-// C->p = M->p ; C->p_shallow = true ;
-// C->h = M->h ; C->h_shallow = true ;
-
-    //--------------------------------------------------------------------------
-    // Pre-fetch arrays that will be used on the device
-    //--------------------------------------------------------------------------
-
-    // GB_cuda_matrix_advise (C, cnvec, cnz, which, what, device)
-    // advise C
-    CU_OK (cudaMemAdvise (C->p, (cnvec+1) * sizeof ( int64_t),
-        cudaMemAdviseSetPreferredLocation, device)) ;
-    if (M_is_hyper)
-    {
-        CU_OK (cudaMemAdvise (C->h, cnvec * sizeof ( int64_t),
-            cudaMemAdviseSetPreferredLocation, device)) ;
-    }
-    CU_OK (cudaMemAdvise (C->i, (cnz+1) * sizeof ( int64_t),
-        cudaMemAdviseSetPreferredLocation, device)) ;
-    if (!C_iso)
-    {
-        CU_OK (cudaMemAdvise (C->x, (cnz+1) * C->type->size ,
-            cudaMemAdviseSetPreferredLocation, device)) ;
-    }
-
-    // prefetch M (if M hypersparse: using M->h not M->Y)
-    GB_OK (GB_cuda_matrix_prefetch (M,
-        Mask_struct ? GB_PREFETCH_PHBI : GB_PREFETCH_PHBIX, device, stream)) ;
-
-    //--------------------------------------------------------------------------
-    // copy Mp and Mh into C
-    //--------------------------------------------------------------------------
-
-    // FIXME: use shallow?
-    CU_OK (cudaMemcpyAsync (C->p, M->p, (cnvec+1) * sizeof (int64_t),
-        cudaMemcpyDefault, stream)) ;
-    if (M_is_hyper)
-    {
-        CU_OK (cudaMemcpyAsync (C->h, M->h, cnvec * sizeof (int64_t),
-            cudaMemcpyDefault, stream)) ;
-    }
-
-    C->nvals = cnz ;
-    C->magic = GB_MAGIC ;
-    C->nvec_nonempty = M->nvec_nonempty ;
-    C->jumbled = GB_JUMBLED (M) ;   // C is jumbled if M is jumbled
-
-    GBURBLE ("(GPU C created and copied from M) ") ;
+{
 
     //--------------------------------------------------------------------------
-    // stringify the semiring and the mask
+    // encodify the problem
     //--------------------------------------------------------------------------
 
-    GB_cuda_mxm_factory my_mxm_spec = GB_cuda_mxm_factory ( ) ;
-
-    // (1) create the mxm code and name
-    my_mxm_spec.mxm_factory ( C_iso, C_in_iso, C_sparsity, ctype,
+    GB_jit_encoding encoding ;
+    char *suffix ;
+    uint64_t hash = GB_encodify_mxm (&encoding, &suffix,
+        GB_JIT_CUDA_KERNEL_AXB_DOT3,
+        // FIXME: allow C to be iso
+        /* C->iso: */ false, false, GB_sparsity (C), C->type,
         M, Mask_struct, false, semiring, flipxy, A, B) ;
 
-    // (2) ensure the jitifier has "GB_mxm_[my_mxm_spec.sr_code].h"
-    jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
-    filecache.getFile (my_mxm_spec) ;
-
-    GBURBLE ("(GPU stringified srcode = %lu)\n", my_mxm_spec.sr_code) ;
-
-    //--------------------------------------------------------------------------
-    // get A and B
-    //--------------------------------------------------------------------------
-
-    // FIXME: add acode, bcode to the GB_cuda_mxm_factory object
-    int acode = GB_RSHIFT (my_mxm_spec.sr_code, 12, 4) ;    // if 0: A is pattern
-    int bcode = GB_RSHIFT (my_mxm_spec.sr_code,  8, 4) ;    // if 0: B is pattern
-
-    bool A_is_sparse = GB_IS_SPARSE (A) ;
-    bool A_is_hyper = GB_IS_HYPERSPARSE (A) ;
-    bool A_is_bitmap = GB_IS_BITMAP (A) ;
-    bool A_is_full = GB_IS_FULL (A) ;
-    bool A_is_sparse_or_hyper = A_is_sparse || A_is_hyper ;
-    bool A_is_bitmap_or_full = A_is_bitmap || A_is_full ;
-    bool A_is_pattern = (acode == 0) ;
-
-    bool B_is_sparse = GB_IS_SPARSE (B) ;
-    bool B_is_hyper = GB_IS_HYPERSPARSE (B) ;
-    bool B_is_bitmap = GB_IS_BITMAP (B) ;
-    bool B_is_full = GB_IS_FULL (B) ;
-    bool B_is_sparse_or_hyper = B_is_sparse || B_is_hyper ;
-    bool B_is_bitmap_or_full = B_is_bitmap || B_is_full ;
-    bool B_is_pattern = (bcode == 0) ;
-
-    // M might be very very sparse.  A(:,i) is not needed if M(:,i) is empty.
-    // Likewise, B(:,j) is not needed if M(:,j) is empty.  For now, try this
-    // heuristic:  if M is hypersparse, then do not prefetch A->b or A->x.
-
-    int prefetch_b = (M_is_hyper) ? 0 : GB_PREFETCH_B ;
-    int prefetch_x = (M_is_hyper) ? 0 : GB_PREFETCH_X ;
-    int prefetch_pybi = GB_PREFETCH_PYI + prefetch_b ;
-
-    // prefetch A (if A hypersparse: using A->Y)
-    GB_OK (GB_cuda_matrix_prefetch (A, prefetch_pybi +
-        (A_is_pattern ? 0 : prefetch_x), device, stream)) ;
-
-    // prefetch B (if B hypersparse: using B->Y)
-    GB_OK (GB_cuda_matrix_prefetch (B, prefetch_pybi +
-        (B_is_pattern ? 0 : prefetch_x), device, stream)) ;
-
     //--------------------------------------------------------------------------
-    // C=A'*B via jitified kernels
+    // get the kernel function pointer, loading or compiling it if needed
     //--------------------------------------------------------------------------
 
-    if (A_is_bitmap_or_full && B_is_bitmap_or_full)
-    {
-
-        //----------------------------------------------------------------------
-        // (full or bitmap) times (full or bitmap)
-        //----------------------------------------------------------------------
-
-        dense_phase1launchFactory dp1lf(my_mxm_spec);
-
-        GBURBLE ("(GPU dense phase1 start nblk = %d) ",
-            dp1lf.get_number_of_blocks(M)) ;
-        kernel_timer.Start();
-        dp1lf.jitGridBlockLaunch(C, M, A, B, stream);
-        CU_OK (cudaStreamSynchronize(stream));
-        kernel_timer.Stop();
-        GBURBLE ("(GPU phase1 done %12.6g ms )\n", kernel_timer.Elapsed()) ;
-
-        mxm_dense_launchFactory mdlf(my_mxm_spec);
-        GBURBLE ("(GPU Dense full x full launch ) ") ;
-        kernel_timer.Start();
-        mdlf.jitGridBlockLaunch( C, M, A, B, stream);
-        CU_OK (cudaStreamSynchronize(stream));  // only for timing
-        kernel_timer.Stop();
-        GBURBLE ("(GPU Dense full x full done %12.6g ms, rate=%12.6g)\n",
-            kernel_timer.Elapsed(), (mnvec)/(1000*kernel_timer.Elapsed())) ;
-
-    }
-    else
-    {
-
-        //----------------------------------------------------------------------
-        // (sparse or hyper) times (sparse or hyper)
-        // (sparse or hyper) times (bitmap or full)
-        // (bitmap or full) times (sparse or hyper)
-        //----------------------------------------------------------------------
-
-        //----------------------------------------------------------------------
-        // construct the tasks for phase1 and phase2
-        //----------------------------------------------------------------------
-
-        // on the CPU: nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;
-        // on the GPU:
-        phase1launchFactory p1lf(my_mxm_spec);
-        phase2launchFactory p2lf;
-        phase2endlaunchFactory p2elf;
-
-        // # of threads in phase1 and phase2 kernel launches are related
-        // # by the size of the warp.  ph2_task = ph1_task/32 for example
-        int nthrd = p2lf.get_threads_per_block();
-        int ntasks = p2elf.get_number_of_blocks(M);
-
-        int64_t nanobuckets_size = NBUCKETS * nthrd * ntasks;
-        int64_t blockbuckets_size = NBUCKETS * ntasks;
-
-        Nanobuckets = GB_MALLOC_WORK (nanobuckets_size, int64_t, &Nb_size) ;
-        Blockbucket = GB_MALLOC_WORK (blockbuckets_size, int64_t, &Bb_size) ;
-        Bucketp = GB_MALLOC_WORK (NBUCKETS+1, int64_t, &Bup_size) ;
-        offset = GB_MALLOC_WORK (NBUCKETS, int64_t, &O_size) ;
-        Bucket = GB_MALLOC_WORK (mnz, int64_t, &Bu_size) ;
-
-        if (Nanobuckets == NULL || Blockbucket == NULL || Bucketp == NULL
-            || Bucket == NULL || offset == NULL)
-        {
-            // out of memory
-            GB_FREE_ALL ;
-            return (GrB_OUT_OF_MEMORY) ;
-        }
-
-        // FIXME: do async with streams
-        // FIXME: do we need any of these?
-        //CU_OK (cudaMemsetAsync(Nanobuckets, 0,
-        //    nanobuckets_size * sizeof(int64_t), stream));
-        //CU_OK (cudaMemsetAsync(Blockbucket, 0,
-        //    blockbuckets_size * sizeof(int64_t), stream));
-        CU_OK (cudaMemsetAsync(Bucketp, 0,
-            (NBUCKETS+1) * sizeof(int64_t), stream));
-        CU_OK (cudaMemsetAsync(offset, 0,
-            NBUCKETS * sizeof(int64_t), stream));
-        //CU_OK (cudaMemsetAsync(Bucket, 0,
-        //    mnz * sizeof(int64_t), stream));
-
-        //----------------------------------------------------------------------
-        // phase1 and phase2: place each C(i,j) in a bucket
-        //----------------------------------------------------------------------
-
-        CU_OK (cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t),
-            cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
-        CU_OK (cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t),
-            cudaMemAdviseSetAccessedBy, device));
-
-        CU_OK (cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t),
-            cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
-        CU_OK (cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t),
-            cudaMemAdviseSetAccessedBy, device));
-
-        //----------------------------------------------------------------------
-        // phase1: assign each C(i,j) to a bucket, and count them
-        //----------------------------------------------------------------------
-
-        GBURBLE ("(GPU sparse phase1 start nblk = %d) ",
-            p1lf.get_number_of_blocks(M));
-        kernel_timer.Start();
-        p1lf.jitGridBlockLaunch(Nanobuckets, Blockbucket, C, M, A, B, stream);
-        CU_OK (cudaStreamSynchronize(stream));
-        kernel_timer.Stop();
-
-        GBURBLE ("(GPU phase1 done %12.6g ms )\n", kernel_timer.Elapsed()) ;
-
-        //----------------------------------------------------------------------
-        // phase2: cumsum across the blockbuckets, propagate to thread level
-        //----------------------------------------------------------------------
-
-        GBURBLE ("(GPU phase2 start nblk=%d ) ", ntasks) ;
-
-        kernel_timer.Start();
-        p2lf.jitGridBlockLaunch(Blockbucket, offset, M, stream);
-        kernel_timer.Stop();
-
-        CU_OK (cudaStreamSynchronize(stream));
-
-        int64_t s= offset[0];
-        C->nzombies = s;
-        bool all_in_one = false;
-        for ( int bucket = 1 ; bucket < NBUCKETS+1; ++bucket)
-        {
-            Bucketp[bucket] = s;
-            s += offset[bucket];
-            if ( (Bucketp[bucket] - Bucketp[bucket-1] ) == mnz )
-            {
-                all_in_one = true;
-            }
-        }
-
-        GBURBLE ("(GPU phase2 done %12.6g ms )\n", kernel_timer.Elapsed()) ;
-
-        if (!all_in_one)
-        {
-            GBURBLE ("(GPU phase2end start nblk=%d) ", ntasks) ;
-
-            kernel_timer.Start();
-            p2elf.jitGridBlockLaunch(Nanobuckets, Blockbucket,
-                Bucketp, Bucket, offset, C, M, stream);
-
-            CU_OK (cudaStreamSynchronize(stream));
-            kernel_timer.Stop();
-            GBURBLE ("(GPU phase2end done %12.6g ms)\n",kernel_timer.Elapsed());
-        }
-
-        //----------------------------------------------------------------------
-        // phase3: do the numerical work
-        //----------------------------------------------------------------------
-
-        for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket)
-        {
-            int64_t start = Bucketp[bucket];
-            int64_t end = Bucketp[bucket + 1 ];
-            if (end - start > 0)
-            {
-                // TODO: Use stream pool
-                phase3launchFactory p3lf(my_mxm_spec, (GB_bucket_code)bucket);
-                GBURBLE ("(GPU phase3 bucket %d launch ) ", bucket) ;
-                kernel_timer.Start();
-                p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket,
-                    C, M, A, B, stream);
-                CU_OK (cudaStreamSynchronize(stream));  // only for timing
-                kernel_timer.Stop();
-                GBURBLE ("(GPU phase3 bucket %d done %12.6g ms, rate=%12.6g)\n",
-                    bucket, kernel_timer.Elapsed(),
-                    (end-start)/(1000*kernel_timer.Elapsed())) ;
-            }
-        }
-    }
+    void *dl_function ;
+    GrB_Info info = GB_jitifyer_load (&dl_function,
+        GB_jit_mxm_family, "cuda_AxB_dot3",
+        hash, &encoding, suffix, semiring, NULL,
+        NULL, C->type, A->type, B->type) ;
+    if (info != GrB_SUCCESS) return (info) ;
 
     //--------------------------------------------------------------------------
-    // free workspace and return result
+    // call the jit kernel and return result
     //--------------------------------------------------------------------------
 
-    GB_FREE_WORKSPACE ;
-    return GrB_SUCCESS;
+    GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ;
+    return (GB_jit_kernel (C, M, A, B, stream, device, number_of_sms,
+        &GB_callback)) ;
 }
 
diff --git a/GraphBLAS/CUDA/GB_cuda_common_jitFactory.hpp b/GraphBLAS/CUDA/GB_cuda_common_jitFactory.hpp
deleted file mode 100644
index 5d7ad01e6f..0000000000
--- a/GraphBLAS/CUDA/GB_cuda_common_jitFactory.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_common_jitFactory.hpp: for all jitFactory classes
-//------------------------------------------------------------------------------
-
-// (c) Nvidia Corp. 2023 All rights reserved
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-// Common defines for all jitFactory classes:
-// iostream callback to deliver the buffer to jitify as if read from a file
-// compiler flags
-// Include this file along with any jitFactory you need.
-
-// NOTE: do not edit the GB_cuda_common_jitFactory.hpp directly.  It is
-// configured by cmake from the following file:
-// GraphBLAS/CUDA/Config/GB_cuda_common_jitFactory.hpp.in
-
-#ifndef GB_CUDA_COMMON_JITFACTORY_HPP
-#define GB_CUDA_COMMON_JITFACTORY_HPP
-
-#pragma once
-
-#include "GraphBLAS_cuda.h"
-
-extern "C"
-{
-    #include "GB.h"
-    #include "GB_stringify.h"
-}
-
-#include
-#include
-#include "GB_cuda_jitify_cache.h"
-#include "GB_cuda_jitify_launcher.h"
-#include "GB_cuda_mxm_factory.hpp"
-#include "GB_cuda_error.h"
-#include "../rmm_wrap/rmm_wrap.h"
-#include "GB_iceil.h"
-
-// amount of shared memory to use in CUDA kernel launches
-constexpr unsigned int SMEM = 0 ;
-
-#if 0
-
-static const std::vector<std::string> GB_jit_cuda_compiler_flags{   // OLD
-    "-std=c++17",
-    //"-G",
-    "-remove-unused-globals",
-    "-w",
-    "-D__CUDACC_RTC__",
-//  "-I" + jit::get_user_home_cache_dir(),  // FIXME: add +/cu/00
-//  "-I" + jit::get_user_home_cache_dir() + "/src",
-    "-I/usr/local/cuda/include",
-    // FIXME: add SUITESPARSE_CUDA_ARCHITECTURES here, via config
-};
-
-#endif
-
-inline std::vector<std::string> GB_cuda_jit_compiler_flags ( )
-{
-    return (
-        std::vector<std::string> (
-        {"-std=c++17",
-        //"-G",
-        "-remove-unused-globals",
-        "-w",
-        "-D__CUDACC_RTC__",
-        "-I" + jit::get_user_home_cache_dir(),  // FIXME: add +/cu/00
-        "-I" + jit::get_user_home_cache_dir() + "/src",
-        "-I/usr/local/cuda/include"
-        // FIXME: add SUITESPARSE_CUDA_ARCHITECTURES here, via config
-        })) ;
-} ;
-
-// FIXME: rename GB_jit_cuda_header_names or something
-static const std::vector<std::string> header_names ={};
-
-// FIXME: rename GB_jit_cuda_file_callback
-inline std::istream* (*file_callback)(std::string, std::iostream&);
-
-#endif
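The deleted jitFactory machinery above cached jitify programs and kernel instantiations by name; the replacement path in GB_cuda_AxB_dot3_jit instead encodes the problem, hashes the encoding, and asks GB_jitifyer_load for a ready function pointer. A generic sketch of that hash-keyed kernel-cache pattern follows (illustrative only; none of these names are GraphBLAS APIs).

    #include <cstdint>
    #include <unordered_map>
    #include <mutex>

    using kernel_fn = int (*) (void *args) ;    // generic kernel entry point

    static std::unordered_map<uint64_t, kernel_fn> cache ;
    static std::mutex cache_lock ;

    kernel_fn get_kernel (uint64_t hash, kernel_fn (*compile) (uint64_t))
    {
        std::lock_guard<std::mutex> guard (cache_lock) ;
        auto it = cache.find (hash) ;
        if (it != cache.end ( )) return (it->second) ;  // hit: reuse kernel
        kernel_fn f = compile (hash) ;                  // miss: compile it
        cache [hash] = f ;
        return (f) ;
    }
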
diff --git a/GraphBLAS/CUDA/GB_cuda_error.h b/GraphBLAS/CUDA/GB_cuda_error.h
deleted file mode 100644
index d9aec9b3ff..0000000000
--- a/GraphBLAS/CUDA/GB_cuda_error.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_error.h
-//------------------------------------------------------------------------------
-
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-/*
- * Copyright (c) 2023 NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#ifndef GB_CUDA_ERROR_H
-#define GB_CUDA_ERROR_H
-
-#include
-
-static const char *_cudaGetErrorEnum(cudaError_t error) {
-  return cudaGetErrorName(error);
-}
-
-template <typename T>
-void check(T result, char const *const func, const char *const file,
-           int const line) {
-  if (result) {
-    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
-            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
-    exit(EXIT_FAILURE);
-  }
-}
-
-#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
-
-// This will output the proper error string when calling cudaGetLastError
-#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
-
-inline void __getLastCudaError(const char *errorMessage, const char *file,
-                               const int line) {
-  cudaError_t err = cudaGetLastError();
-
-  if (cudaSuccess != err) {
-    fprintf(stderr,
-            "%s(%i) : getLastCudaError() CUDA error :"
-            " %s : (%d) %s.\n",
-            file, line, errorMessage, static_cast<int>(err),
-            cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
-}
-
-// This will only print the proper error string when calling cudaGetLastError
-// but not exit program incase error detected.
-#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
-
-inline void __printLastCudaError(const char *errorMessage, const char *file,
-                                 const int line) {
-  cudaError_t err = cudaGetLastError();
-
-  if (cudaSuccess != err) {
-    fprintf(stderr,
-            "%s(%i) : getLastCudaError() CUDA error :"
-            " %s : (%d) %s.\n",
-            file, line, errorMessage, static_cast<int>(err),
-            cudaGetErrorString(err));
-  }
-}
-#define CHECK_CUDA(call) checkCudaErrors( call )
-
-#endif
diff --git a/GraphBLAS/CUDA/GB_cuda_get_device_count.cu b/GraphBLAS/CUDA/GB_cuda_get_device_count.cu
index 7cad833fa1..3f0d074dd4 100644
--- a/GraphBLAS/CUDA/GB_cuda_get_device_count.cu
+++ b/GraphBLAS/CUDA/GB_cuda_get_device_count.cu
@@ -2,12 +2,13 @@
 // GraphBLAS/CUDA/GB_cuda_get_device_count.cu: find out how many GPUs exist
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-#include "GB_cuda.h"
+#include "GB_cuda.hpp"
 
 bool GB_cuda_get_device_count   // true if OK, false if failure
 (
diff --git a/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu b/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu
index 7bb7e1407f..daaac9a214 100644
--- a/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu
+++ b/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu
@@ -2,12 +2,17 @@
 // GraphBLAS/CUDA/GB_cuda_get_device_properties: get the properties of a GPU
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-#include "GB_cuda.h"
+#include "GB_cuda.hpp"
 
+#define CU_OK(cudaMethod)                               \
+{                                                       \
+    if ((cudaMethod) != cudaSuccess) return (false) ;   \
+}
 
 //------------------------------------------------------------------------------
 // GB_cuda_get_device: get the current GPU
@@ -20,7 +25,7 @@ bool GB_cuda_get_device (int &device)
         // invalid inputs
         return (false) ;
     }
-    CHECK_CUDA_SIMPLE (cudaGetDevice (&device)) ;
+    CU_OK (cudaGetDevice (&device)) ;
     return (true) ;
 }
 
@@ -35,7 +40,7 @@ bool GB_cuda_set_device (int device)
         // invalid inputs
         return (false) ;
     }
-    CHECK_CUDA_SIMPLE (cudaSetDevice (device)) ;
+    CU_OK (cudaSetDevice (device)) ;
    return (true) ;
 }
 
@@ -64,7 +69,7 @@ bool GB_cuda_get_device_properties  // true if OK, false if failure
     memset (prop, 0, sizeof (GB_cuda_device)) ;
     int old_device ;
-    CHECK_CUDA_SIMPLE ( cudaGetDevice( &old_device ) ) ;
+    CU_OK (cudaGetDevice (&old_device )) ;
 
     //--------------------------------------------------------------------------
     // get the properties
@@ -73,26 +78,24 @@ bool GB_cuda_get_device_properties  // true if OK, false if failure
 
     int num_sms, compute_capability_major, compute_capability_minor ;
     size_t memfree, memtotal ;
-    CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&num_sms,
-                                               cudaDevAttrMultiProcessorCount,
-                                               device) ) ;
-    CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&compute_capability_major,
-                                               cudaDevAttrComputeCapabilityMajor,
-                                               device) ) ;
-    CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&compute_capability_minor,
-                                               cudaDevAttrComputeCapabilityMajor,
-                                               device) ) ;
+    CU_OK (cudaDeviceGetAttribute (&num_sms,
+        cudaDevAttrMultiProcessorCount, device)) ;
+    CU_OK (cudaDeviceGetAttribute (&compute_capability_major,
+        cudaDevAttrComputeCapabilityMajor, device)) ;
+    CU_OK (cudaDeviceGetAttribute (&compute_capability_minor,
+        cudaDevAttrComputeCapabilityMinor, device)) ;
 
-    CHECK_CUDA_SIMPLE ( cudaSetDevice( device ) ) ;
-    CHECK_CUDA_SIMPLE ( cudaMemGetInfo( & memfree, &memtotal) ) ;
-    CHECK_CUDA_SIMPLE ( cudaSetDevice( old_device ) ) ;
+    CU_OK (cudaSetDevice (device )) ;
+    CU_OK (cudaMemGetInfo (&memfree, &memtotal)) ;
+    CU_OK (cudaSetDevice (old_device )) ;
 
     prop->total_global_memory = memtotal ;
     prop->number_of_sms = num_sms ;
     prop->compute_capability_major = compute_capability_major ;
     prop->compute_capability_minor = compute_capability_minor ;
 
-    printf ("Device: %d: memory: %ld SMs: %d compute: %d.%d\n",
+    // FIXME: remove this printf
+    printf ("\nDevice: %d: memory: %ld SMs: %d compute: %d.%d\n",
         device, prop->total_global_memory, prop->number_of_sms,
         prop->compute_capability_major, prop->compute_capability_minor) ;
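The rewritten GB_cuda_get_device_properties above also fixes a real bug: the old code queried cudaDevAttrComputeCapabilityMajor twice, so the minor version was never read. A standalone sketch of the same attribute queries (illustrative, not part of the patch):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main (void)
    {
        int device = 0, sms = 0, major = 0, minor = 0 ;
        // query SM count and compute capability for device 0
        if (cudaDeviceGetAttribute (&sms, cudaDevAttrMultiProcessorCount,
            device) != cudaSuccess) return (1) ;
        cudaDeviceGetAttribute (&major, cudaDevAttrComputeCapabilityMajor,
            device) ;
        cudaDeviceGetAttribute (&minor, cudaDevAttrComputeCapabilityMinor,
            device) ;
        printf ("device %d: %d SMs, compute %d.%d\n", device, sms, major,
            minor) ;
        return (0) ;
    }
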
prop->compute_capability_major, prop->compute_capability_minor) ; diff --git a/GraphBLAS/CUDA/GB_cuda_init.c b/GraphBLAS/CUDA/GB_cuda_init.c index 25dd233b88..ed920f9758 100644 --- a/GraphBLAS/CUDA/GB_cuda_init.c +++ b/GraphBLAS/CUDA/GB_cuda_init.c @@ -2,7 +2,8 @@ // GraphBLAS/CUDA/GB_cuda_init: initialize the GPUs for use by GraphBLAS //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -12,7 +13,6 @@ // assumed. Then each GPU is "warmed up" by allocating a small amount of // memory. -#undef GBCUDA_CPLUSPLUS #include "GB.h" GrB_Info GB_cuda_init (void) @@ -55,7 +55,6 @@ GrB_Info GB_cuda_init (void) GB_cuda_set_device (0) ; // make GPU 0 the default device GB_Context_gpu_id_set (NULL, 0) ; // set GxB_CONTEXT_WORLD->gpu_id to 0 - GB_Global_hack_set (2, 0) ; // gpu_hack default // also check for jit cache, pre-load library of common kernels ... return (GrB_SUCCESS) ; diff --git a/GraphBLAS/CUDA/GB_cuda_jitify_cache.cu b/GraphBLAS/CUDA/GB_cuda_jitify_cache.cu deleted file mode 100644 index 3e66d735b5..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_jitify_cache.cu +++ /dev/null @@ -1,233 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_jitify_cache.cu -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -/* - * Copyright (c) 2019,2023 NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-//------------------------------------------------------------------------------
-
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <cstdio>
-#include <cstring>
-#include <iostream>
-#include <mutex>
-#include <string>
-
-#include "GB_cuda_jitify_cache.h"
-#include "GraphBLAS_cuda.h"
-
-extern "C"
-{
-    #include "GB.h"
-    #include "GB_jitifyer.h"
-}
-
-namespace jit {
-
-// Get the directory in home to use for storing the cache
-    std::string get_user_home_cache_dir() {
-        const char *path = GB_jitifyer_get_cache_path ( ) ;
-        if (path == NULL)
-        {
-            return std::string ("") ;
-        }
-        else
-        {
-            return std::string (path) ;
-        }
-    }
-
-GBJitCache::GBJitCache() { }
-
-GBJitCache::~GBJitCache() { }
-
-
-std::mutex GBJitCache::_kernel_cache_mutex;
-std::mutex GBJitCache::_program_cache_mutex;
-
-std::string GBJitCache::getFile(
-    File_Desc &file_object )
-{
-    // Lock for thread safety
-    std::lock_guard<std::mutex> lock(_program_cache_mutex);
-
-    // Macrofied version
-    auto cached_file = getCachedFile( file_object, file_map );
-    return *std::get<1>( cached_file ).get();
-}
-
-named_prog<jitify::experimental::Program> GBJitCache::getProgram(
-    std::string const& prog_name,
-    std::string const& cuda_source,
-    std::vector<std::string> const& given_headers,
-    std::vector<std::string> const& given_options,
-    jitify::experimental::file_callback_type file_callback)
-{
-    // Lock for thread safety
-    std::lock_guard<std::mutex> lock(_program_cache_mutex);
-//  printf(" jit_cache get program %s\n", prog_name.c_str());
-
-    return getCached(prog_name, program_map,
-        [&](){
-            return jitify::experimental::Program(cuda_source,
-                                                 given_headers,
-                                                 given_options,
-                                                 file_callback);
-        }
-    );
-}
-
-named_prog<jitify::experimental::KernelInstantiation>
-GBJitCache::getKernelInstantiation(
-    std::string const& kern_name,
-    named_prog<jitify::experimental::Program> const& named_program,
-    std::vector<std::string> const& arguments)
-{
-    // Lock for thread safety
-    std::lock_guard<std::mutex> lock(_kernel_cache_mutex);
-
-    std::string prog_name = std::get<0>(named_program);
-    jitify::experimental::Program& program = *std::get<1>(named_program);
-
-    // Make instance name e.g. "prog_binop.kernel_v_v_int_int_long int_Add"
-    std::string kern_inst_name = kern_name;
-    for ( auto&& arg : arguments ) kern_inst_name += '_' + arg;
-
-    //printf("  got kernel instance %s\n",kern_inst_name.c_str());
-
-    return getCached(kern_inst_name, kernel_inst_map,
-        [&](){return program.kernel(kern_name)
-                            .instantiate(arguments);
-        }
-    );
-}
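getProgram and getKernelInstantiation above share one shape: take a static mutex, probe an unordered_map, and build the entry only on a miss. A generic sketch of that shape, assuming only the C++ standard library; `get_or_build` is a hypothetical name, not GraphBLAS API:

```c++
#include <mutex>
#include <string>
#include <unordered_map>

// Serialize all lookups with a lock_guard; construct the cached value only
// on a miss, mirroring the deleted getProgram/getKernelInstantiation above.
template <typename T, typename Build>
T& get_or_build (std::unordered_map<std::string, T> &map,
                 std::string const &key, Build build)
{
    static std::mutex lock ;                     // one lock per instantiation
    std::lock_guard<std::mutex> guard (lock) ;   // released on scope exit
    auto it = map.find (key) ;
    if (it == map.end ())
    {
        it = map.emplace (key, build ()).first ; // cache miss: build once
    }
    return it->second ;
}
```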
-
-// Another overload for getKernelInstantiation which might be useful to get
-// kernel instantiations in one step
-// ------------------------------------------------------------------------
-/*
-jitify::experimental::KernelInstantiation GBJitCache::getKernelInstantiation(
-    std::string const& kern_name,
-    std::string const& prog_name,
-    std::string const& cuda_source = "",
-    std::vector<std::string> const& given_headers = {},
-    std::vector<std::string> const& given_options = {},
-    file_callback_type file_callback = nullptr)
-{
-    auto program = getProgram(prog_name,
-                              cuda_source,
-                              given_headers,
-                              given_options,
-                              file_callback);
-    return getKernelInstantiation(kern_name, program);
-}
-*/
-
-GBJitCache::cacheFile::cacheFile(std::string file_name)
-    : _file_name{file_name}
-{ }
-
-GBJitCache::cacheFile::~cacheFile() { }
-
-std::string GBJitCache::cacheFile::read_file()
-{
-    // Open file (duh)
-    int fd = open ( _file_name.c_str(), O_RDWR );
-    if ( fd == -1 ) {
-        // TODO: connect errors to GrB_error result
-//      printf(" failed to open cache file %s\n",_file_name.c_str());
-        successful_read = false;
-        return std::string();
-    }
-
-    // Lock the file descriptor; we are now the only ones using the file
-    if ( lockf(fd, F_LOCK, 0) == -1 ) {
-        successful_read = false;
-        return std::string();
-    }
-
-    // Get file descriptor from file pointer
-    FILE *fp = fdopen( fd, "rb" );
-
-    // Get file length
-    fseek( fp , 0L , SEEK_END);
-    size_t file_size = ftell( fp );
-    rewind( fp );
-
-    // Allocate memory of file length size
-    std::string content;
-    content.resize(file_size);
-
-    char *buffer = content.data();
-
-    // Copy file into buffer
-    if( fread(buffer, file_size, 1, fp) != 1 ) {
-        //printf(" failed to read cache file %s\n",_file_name.c_str());
-        successful_read = false;
-        fclose(fp);
-//      free(buffer); FIXME: Shouldn't need to free buffer since it's RAII
-        return content; // FIXME: use unique_ptr here
-    }
-
-//  printf("about to close\n");
-    fclose(fp);
-    successful_read = true;
-//  printf(" read cache file %s\n",_file_name.c_str());
-
-    return content;
-}
-
-void GBJitCache::cacheFile::write(std::string content)
-{
-    // Open file and create if it doesn't exist, with access 0600
-    int fd = open ( _file_name.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR );
-    if ( fd == -1 ) {
-        //printf(" failed to open cache file for write %s\n",_file_name.c_str());
-        successful_write = false;
-        return;
-    }
-
-    // Lock the file descriptor; we are now the only ones using the file
-    if ( lockf(fd, F_LOCK, 0) == -1 ) {
-        successful_write = false;
-        return;
-    }
-
-    // Get file descriptor from file pointer
-    FILE *fp = fdopen( fd, "wb" );
-
-    // Copy string into file
-    if( fwrite(content.c_str(), content.length(), 1, fp) != 1 ) {
-        //printf(" failed to write cache file %s\n",_file_name.c_str());
-        successful_write = false;
-        fclose(fp);
-        return;
-    }
-    fclose(fp);
-
-    successful_write = true;
-    //printf(" wrote cache file %s\n",_file_name.c_str());
-
-    return;
-}
-
-} // namespace jit
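Both cacheFile methods above rely on lockf for inter-process exclusion, since (as the deleted header below notes) in-process mutexes cannot stop other processes from touching the cache file. A minimal sketch of the same read-under-lock discipline; `read_locked` is an illustrative name, not GraphBLAS API:

```c++
// lockf_read_sketch.cpp -- illustrative only, not part of this patch
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>
#include <string>

// Read a whole file while holding an exclusive POSIX lock, mirroring the
// lockf() discipline in cacheFile::read_file() above.
static bool read_locked (const std::string &path, std::string &content)
{
    int fd = open (path.c_str (), O_RDWR) ;
    if (fd == -1) return (false) ;              // cannot open
    if (lockf (fd, F_LOCK, 0) == -1)            // blocks until we own the lock
    {
        close (fd) ;
        return (false) ;
    }
    FILE *fp = fdopen (fd, "rb") ;              // fclose releases fd and lock
    fseek (fp, 0L, SEEK_END) ;
    size_t n = (size_t) ftell (fp) ;
    rewind (fp) ;
    content.resize (n) ;
    bool ok = (n == 0) || (fread (&content [0], n, 1, fp) == 1) ;
    fclose (fp) ;
    return (ok) ;
}
```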
diff --git a/GraphBLAS/CUDA/GB_cuda_jitify_cache.h b/GraphBLAS/CUDA/GB_cuda_jitify_cache.h
deleted file mode 100644
index 36124da469..0000000000
--- a/GraphBLAS/CUDA/GB_cuda_jitify_cache.h
+++ /dev/null
@@ -1,327 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_jitify_cache.h
-//------------------------------------------------------------------------------
-
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-/*
- * Copyright (c) 2019,2020 NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef GB_JIT_CACHE_H_
-#define GB_JIT_CACHE_H_
-
-#include <cstdio>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-
-#define JITIFY_USE_CACHE 1
-
-namespace jit {
-
-std::string get_user_home_cache_dir();
-
-template <typename T>
-using named_prog = std::pair<std::string, std::shared_ptr<T>>;
-
-// Basic file descriptor to enable file manipulation with caching
-class File_Desc
-{
-public:
-    virtual void open( const char *path_and_file, const char *mode) {}
-    virtual void close() {}
-    virtual void macrofy() {
-        printf("Uh oh. this isn't good\n");
-    }
-    std::string filename;
-};
-
-/**
- * @brief Get the string path to the JITIFY kernel cache directory.
- *
- * This path can be overridden at runtime by defining an environment variable
- * named `GB_CUDA_KERNEL_CACHE_PATH`. The value of this variable must be a path
- * under which the process' user has read/write privileges.
- *
- * This function returns a path to the cache directory, creating it if it
- * doesn't exist.
- *
- * The default cache directory is `~/.GraphBLAS_kernel_cache`.
- **/
-
-class GBJitCache
-{
-public:
-
-    /**---------------------------------------------------------------------------*
-     * @brief Get a process wide singleton cache object
-     *
-     *---------------------------------------------------------------------------**/
-    static GBJitCache& Instance() {
-        // Meyers' singleton is thread safe in C++11
-        // Link: https://stackoverflow.com/a/1661564
-        static GBJitCache cache;
-        return cache;
-    }
-
-    GBJitCache();
-    ~GBJitCache();
-
-    /**---------------------------------------------------------------------------*
-     * @brief Get the file object
-     *
-     * Searches an internal in-memory cache and file based cache for the file
-     * and if not found, opens the file, calls macrofy, closes the file
-     *
-     * @param file_desc [in] object representing file: open, macrofy, close
-     * @return string name of file, or 'error' if not able to create file
-     *---------------------------------------------------------------------------**/
-    std::string getFile( File_Desc & file_obj );
-
-    /**---------------------------------------------------------------------------*
-     * @brief Get the Kernel Instantiation object
-     *
-     * Searches an internal in-memory cache and file based cache for the kernel
-     * and if not found, JIT compiles and returns the kernel
-     *
-     * @param kern_name [in] name of kernel to return
-     * @param program   [in] Jitify preprocessed program to get the kernel from
-     * @param arguments [in] template arguments for kernel in vector of strings
-     * @return Pair of string kernel identifier and compiled kernel object
-     *---------------------------------------------------------------------------**/
-    named_prog<jitify::experimental::KernelInstantiation> getKernelInstantiation(
-        std::string const& kern_name,
-        named_prog<jitify::experimental::Program> const& program,
-        std::vector<std::string> const& arguments);
-
-    /**---------------------------------------------------------------------------*
-     * @brief Get the Jitify preprocessed Program object
-     *
-     * Searches an internal in-memory cache and file based cache for the Jitify
-     * pre-processed program and if not found, JIT processes and returns it
-     *
-     * @param prog_file_name [in] name of program to return
-     * @param cuda_source    [in] string source code of program to compile
-     * @param given_headers  [in] vector of strings representing source or names of
-     *  each header included in cuda_source
-     * @param given_options  [in] vector of strings options to pass to NVRTC
-     * @param file_callback  [in] pointer to callback function to call whenever a
-     *  header needs to be loaded
-     * @return named_prog<jitify::experimental::Program>
-     *---------------------------------------------------------------------------**/
-    named_prog<jitify::experimental::Program> getProgram(
-        std::string const& prog_file_name,
-        std::string const& cuda_source = "",
-        std::vector<std::string> const& given_headers = {},
-        std::vector<std::string> const& given_options = {},
-        jitify::experimental::file_callback_type file_callback = nullptr);
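Instance() above is a Meyers singleton: C++11 guarantees that initialization of a function-local static happens exactly once, even under concurrent first use, so no explicit locking is needed to create the process-wide cache. Stripped to its essentials (the class name is illustrative):

```c++
class Cache                             // stand-in for GBJitCache
{
public:
    // Meyers' singleton: the local static is constructed on first call,
    // and C++11 makes that construction thread safe.
    static Cache& Instance ()
    {
        static Cache cache ;
        return cache ;
    }
    Cache (const Cache &) = delete ;            // no copies of the singleton
    Cache& operator= (const Cache &) = delete ;
private:
    Cache () = default ;                        // only Instance() constructs
} ;
```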
-
-private:
-    template <typename T>
-    using umap_str_shptr = std::unordered_map<std::string, std::shared_ptr<T>>;
-
-    umap_str_shptr<std::string>                                 file_map;
-    umap_str_shptr<jitify::experimental::KernelInstantiation>  kernel_inst_map;
-    umap_str_shptr<jitify::experimental::Program>               program_map;
-
-    /*
-    Even though this class can be used as a non-singleton, the file cache
-    access should remain limited to one thread per process. The lockf locks can
-    prevent multiple processes from accessing the file but are ineffective in
-    preventing multiple threads from doing so as the lock is shared by the
-    entire process.
-    Therefore the mutexes are static.
-    */
-    static std::mutex _file_cache_mutex;
-    static std::mutex _kernel_cache_mutex;
-    static std::mutex _program_cache_mutex;
-
-private:
-    /**---------------------------------------------------------------------------*
-     * @brief Class to allow process wise exclusive access to cache files
-     *
-     *---------------------------------------------------------------------------**/
-    class cacheFile
-    {
-    private:
-        std::string _file_name ;
-        // FIXME this isn't used, is it?
-        std::string _dir_name = "~/.GraphBLAS_kernel_cache/"; // FIXME
-        bool successful_read = false;
-        bool successful_write = false;
-    public:
-        cacheFile(std::string file_name);
-        ~cacheFile();
-
-        /**---------------------------------------------------------------------------*
-         * @brief Read this file and return the contents as a std::string
-         *
-         *---------------------------------------------------------------------------**/
-        std::string read_file();
-
-        /**---------------------------------------------------------------------------*
-         * @brief Write the passed string to this file
-         *
-         *---------------------------------------------------------------------------**/
-        void write(std::string);
-
-        /**---------------------------------------------------------------------------*
-         * @brief Check whether the read() operation on the file completed successfully
-         *
-         * @return true  Read was successful. String returned by `read()` is valid
-         * @return false Read was unsuccessful. String returned by `read()` is empty
-         *---------------------------------------------------------------------------**/
-        bool is_read_successful() { return successful_read; }
-
-        /**---------------------------------------------------------------------------*
-         * @brief Check whether the write() operation on the file completed successfully
-         *
-         * @return true  Write was successful.
-         * @return false Write was unsuccessful. File state is undefined
-         *---------------------------------------------------------------------------**/
-        bool is_write_successful() { return successful_write; }
-    };
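getCachedFile and getCached (next) both walk the same two-level lookup: the in-memory map, then a serialized copy on disk, then a fallback JIT compile. A hedged, generic sketch of that flow, assuming a type T with serialize/deserialize methods; all names here are illustrative, and load_file/store_file stand in for the cacheFile reads and writes above:

```c++
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>

// Generic two-level memoization in the style of getCached() below:
// 1) in-memory map, 2) on-disk serialized copy, 3) fallback compile.
template <typename T, typename Fallback>
std::shared_ptr<T> get_cached (
    std::string const &name,
    std::unordered_map<std::string, std::shared_ptr<T>> &map,
    std::function<bool(std::string const&, std::string&)> load_file,
    std::function<void(std::string const&, std::string const&)> store_file,
    Fallback compile)
{
    auto it = map.find (name) ;
    if (it != map.end ()) return (it->second) ;     // level 1: memory hit
    std::string serialized ;
    if (!load_file (name, serialized))              // level 2: disk hit?
    {
        serialized = compile ().serialize () ;      // level 3: JIT compile
        store_file (name, serialized) ;             // persist for next process
    }
    auto obj = std::make_shared<T> (T::deserialize (serialized)) ;
    map [name] = obj ;                              // populate memory cache
    return (obj) ;
}
```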
-
-private:
-
-    template <typename FileDescType>
-    named_prog<std::string> getCachedFile(
-        FileDescType &file_object,
-        umap_str_shptr<std::string>& map )
-    {
-
-//      printf("INside get cached file\n");
-        std::string name = file_object.filename;
-
-        // Find memory cached T object
-        auto it = map.find(name);
-        if ( it != map.end()) {
-//          std::cout<<"found memory-cached file "<<name<<std::endl;
-            return std::make_pair(name, it->second);
-        }
-        else { // Find file cached T object
-            bool successful_read = false;
-            std::string serialized;
-            std::string cache_dir = get_user_home_cache_dir();
-            std::string file_name = cache_dir + "/" + name;
-            if (not cache_dir.empty() ) {
-                // TODO: Use OS-agnostic path separator here
-//              std::cout<<"looking for prog in file "<<file_name<<std::endl;
-                cacheFile file{file_name};
-                serialized = file.read_file();
-                successful_read = file.is_read_successful();
-            }
-            if (not successful_read) {
-                // File not in the cache; macrofy it and write it to the cache
-                file_object.open(file_name.c_str(), "w");
-                file_object.macrofy();
-                file_object.close();
-            }
-            // Add file to the memory cache and return
-            auto object = std::make_shared<std::string>(serialized);
-            //std::cout<<"storing file in memory "<<file_name<<std::endl;
-            map[name] = object;
-            return std::make_pair(name, object);
-        }
-    }
-
-    template <typename T, typename FallbackFunc>
-    named_prog<T> getCached(
-        std::string const& name,
-        umap_str_shptr<T>& map,
-        FallbackFunc func) {
-
-        // Find memory cached T object
-        auto it = map.find(name);
-        if ( it != map.end()) {
-//          std::cout<<"found memory-cached prog "<<name<<std::endl;
-            return std::make_pair(name, it->second);
-        }
-        else { // Find file cached T object
-            bool successful_read = false;
-            std::string serialized;
-            std::string cache_dir = get_user_home_cache_dir() ;
-            std::string file_name = cache_dir + "/" + name;
-            #if defined(JITIFY_USE_CACHE)
-                if (not cache_dir.empty() ) {
-                    // TODO: Use OS-agnostic path separator
-                    //std::cout<<"looking for prog in file "<<file_name<<std::endl;
-                    cacheFile file{file_name};
-                    serialized = file.read_file();
-                    successful_read = file.is_read_successful();
-                }
-            #endif
-            if (not successful_read) {
-                // JIT compile the program, then serialize it to the file cache
-                serialized = func().serialize();
-                #if defined(JITIFY_USE_CACHE)
-                    if (not cache_dir.empty() ) {
-                        cacheFile file{file_name};
-                        file.write(serialized);
-                    }
-                #endif
-            }
-            // Add deserialized T to cache and return
-            auto program = std::make_shared<T>(T::deserialize(serialized));
-            map[name] = program;
-            //std::cout<<"storing prog in memory "<<name<<std::endl;
-            return std::make_pair(name, program);
-        }
-    }
-};
-
-} // namespace jit
-
-#endif // GB_JIT_CACHE_H_
diff --git a/GraphBLAS/CUDA/GB_cuda_jitify_launcher.cu b/GraphBLAS/CUDA/GB_cuda_jitify_launcher.cu
deleted file mode 100644
--- a/GraphBLAS/CUDA/GB_cuda_jitify_launcher.cu
+++ /dev/null
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_jitify_launcher.cu
-//------------------------------------------------------------------------------
-
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-/*
- * Copyright (c) 2019,2023 NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "GB_cuda_jitify_launcher.h"
-#include <utility>
-
-namespace jit {
-
-    launcher::launcher(
-        const std::string& hash,
-        const std::string& cuda_source,
-        const std::vector<std::string>& header_names,
-        const std::vector<std::string>& compiler_flags,
-        jitify::experimental::file_callback_type file_callback,
-        cudaStream_t stream
-    )
-    : cache_instance{jit::GBJitCache::Instance()}
-    , stream(stream)
-    {
-        program = cache_instance.getProgram(
-            hash,
-            cuda_source.c_str(),
-            header_names,
-            compiler_flags,
-            file_callback
-        );
-    }
-
-    launcher::launcher(launcher&& launcher)
-    : program {std::move(launcher.program)}
-    , cache_instance {jit::GBJitCache::Instance()}
-    , kernel_inst {std::move(launcher.kernel_inst)}
-    , stream {launcher.stream}
-    { }
-
-} // namespace jit
diff --git a/GraphBLAS/CUDA/GB_cuda_jitify_launcher.h b/GraphBLAS/CUDA/GB_cuda_jitify_launcher.h
deleted file mode 100644
index 088a2bd77a..0000000000
--- a/GraphBLAS/CUDA/GB_cuda_jitify_launcher.h
+++ /dev/null
@@ -1,152 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_jitify_launcher.h
-//------------------------------------------------------------------------------
-
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-/*
- * Copyright (c) 2019,2023 NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-//------------------------------------------------------------------------------
-
-// FIXME: rename .hpp?
-
-#ifndef GB_CUDA_JITIFY_LAUNCHER_H
-#define GB_CUDA_JITIFY_LAUNCHER_H
-
-#include "GB_cuda_jitify_cache.h"
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-#undef  JITIFY_PRINT_INSTANTIATION
-#define JITIFY_PRINT_INSTANTIATION 0
-#undef  JITIFY_PRINT_SOURCE
-#define JITIFY_PRINT_SOURCE 1
-#undef  JITIFY_PRINT_LOG
-#define JITIFY_PRINT_LOG 1
-#undef  JITIFY_PRINT_PTX
-#define JITIFY_PRINT_PTX 1
-#undef  JITIFY_PRINT_LINKER_LOG
-#define JITIFY_PRINT_LINKER_LOG 0
-#undef  JITIFY_PRINT_LAUNCH
-#define JITIFY_PRINT_LAUNCH 1
-#include "jitify.hpp"
-
-
-namespace jit {
-
-/**
- * @brief Class used to handle compilation and execution of JIT kernels
- *
- */
-class launcher {
- public:
-    launcher() = delete;
-
-    /**
-     * @brief C'tor of the launcher class
-     *
-     * Method to generate vector containing all template types for a JIT kernel.
-     * This vector is used to get the compiled kernel for one set of types and set
-     * it as the kernel to launch using this launcher.
-     *
-     * @param hash The hash to be used as the key for caching
-     * @param cuda_code The CUDA code that contains the kernel to be launched
-     * @param header_names Strings of header_names or strings that contain content
-     *  of the header files
-     * @param compiler_flags Strings of compiler flags
-     * @param file_callback a function that returns header file contents given header
-     *  file names.
-     * @param stream The non-owned stream to use for execution
-     */
-    launcher(
-        const std::string& hash,
-        const std::string& cuda_source,
-        const std::vector<std::string>& header_names,
-        const std::vector<std::string>& compiler_flags,
-        jitify::experimental::file_callback_type file_callback,
-        cudaStream_t stream = 0
-    );
-    launcher(launcher&&);
-    launcher(const launcher&) = delete;
-    launcher& operator=(launcher&&) = delete;
-    launcher& operator=(const launcher&) = delete;
-
-    /**
-     * @brief Sets the kernel to launch using this launcher
-     *
-     * Method to generate vector containing all template types for a JIT kernel.
-     * This vector is used to get the compiled kernel for one set of types and set
-     * it as the kernel to launch using this launcher.
-     *
-     * @param kernel_name The kernel to be launched
-     * @param arguments The template arguments to be used to instantiate the kernel
-     * @return launcher& ref to this launcher object
-     */
-    launcher& set_kernel_inst(
-        const std::string& kernel_name,
-        const std::vector<std::string>& arguments
-    )
-    {   // program is a member variable of the launcher
-        kernel_inst = cache_instance.getKernelInstantiation(kernel_name, program, arguments);
-        return *this;
-    }
-
-    /**
-     * @brief Handle the Jitify API to launch using information
-     *  contained in the members of `this`
-     *
-     * @param grid and block sizes
-     * @return Return launcher reference if successful
-     */
-    jitify::experimental::KernelLauncher configure( dim3 grid, dim3 block, unsigned int smem = 0, cudaStream_t stream = 0){
-        return get_kernel().configure( grid, block, smem, stream);
-        //return get_kernel().configure_1d_max_occupancy( max_block_size=block.x);
-    }
-
-    /**
-     * @brief Handle the Jitify API to launch using information
-     *  contained in the members of `this`
-     *
-     * @param args All parameters to launch the kernel
-     * @return Return GDF_SUCCESS if successful
-     */
-    template <typename... Args>
-    void launch(Args ... args) {
-        get_kernel().configure_1d_max_occupancy(32, 0, 0, stream).launch(args...);
-    }
-
- private:
-    jit::GBJitCache& cache_instance;
-    jit::named_prog<jitify::experimental::Program> program;
-    jit::named_prog<jitify::experimental::KernelInstantiation> kernel_inst;
-    cudaStream_t stream;
-
-    jitify::experimental::KernelInstantiation& get_kernel() { return *std::get<1>(kernel_inst); }
-};
-
-} // namespace jit
-
-#endif
diff --git a/GraphBLAS/CUDA/GB_cuda_matrix_prefetch.cpp b/GraphBLAS/CUDA/GB_cuda_matrix_prefetch.cpp
index c71dc0cd6c..d3e5710c33 100644
--- a/GraphBLAS/CUDA/GB_cuda_matrix_prefetch.cpp
+++ b/GraphBLAS/CUDA/GB_cuda_matrix_prefetch.cpp
@@ -2,12 +2,13 @@
 // GraphBLAS/CUDA/GB_cuda_matrix_prefetch: prefetch a matrix to a GPU or the CPU
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
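The matrix-prefetch hunks below wrap each cudaMemPrefetchAsync call in the CUDA_OK error macro, one call per matrix component (A->p, A->h, A->b, A->i, A->x). A stand-alone sketch of prefetching unified memory to a device, illustrative only; GraphBLAS adds the per-component logic shown below:

```c++
// prefetch_sketch.cu -- illustrative only, not part of this patch
#include <cuda_runtime.h>
#include <cstdio>

int main (void)
{
    int device = 0 ;
    cudaStream_t stream ;
    cudaStreamCreate (&stream) ;

    // allocate unified (managed) memory, visible to both CPU and GPU
    size_t n = 1 << 20 ;
    double *x = NULL ;
    cudaMallocManaged (&x, n * sizeof (double)) ;

    // migrate the pages to the GPU ahead of a kernel launch, as
    // GB_cuda_matrix_prefetch does for each component of a matrix
    cudaError_t err = cudaMemPrefetchAsync (x, n * sizeof (double),
        device, stream) ;
    if (err != cudaSuccess) printf ("%s\n", cudaGetErrorString (err)) ;

    cudaStreamSynchronize (stream) ;
    cudaFree (x) ;
    cudaStreamDestroy (stream) ;
    return (0) ;
}
```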
// SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ -#include "GB_cuda.h" +#include "GB_cuda.hpp" #define GB_FREE_ALL ; GrB_Info GB_cuda_matrix_prefetch @@ -25,33 +26,39 @@ GrB_Info GB_cuda_matrix_prefetch if (A->p != NULL && (which & GB_PREFETCH_P)) { - CU_OK (cudaMemPrefetchAsync (A->p, (anvec+1) * sizeof (int64_t), device, stream)) ; + CUDA_OK (cudaMemPrefetchAsync (A->p, (anvec+1) * sizeof (int64_t), + device, stream)) ; } if (A->h != NULL && (which & GB_PREFETCH_H)) { - CU_OK (cudaMemPrefetchAsync (A->h, anvec * sizeof (int64_t), device, stream)) ; + CUDA_OK (cudaMemPrefetchAsync (A->h, anvec * sizeof (int64_t), + device, stream)) ; } if (A->Y != NULL && (which & GB_PREFETCH_Y)) { // prefetch the hyper_hash: A->Y->p, A->Y->i, and A->Y->x - GB_OK (GB_cuda_matrix_prefetch (A->Y, GB_PREFETCH_PIX, device, stream)) ; + GB_OK (GB_cuda_matrix_prefetch (A->Y, GB_PREFETCH_PIX, + device, stream)) ; } if (A->b != NULL && (which & GB_PREFETCH_B)) { - CU_OK (cudaMemPrefetchAsync (A->b, anz * sizeof (int8_t), device, stream)) ; + CUDA_OK (cudaMemPrefetchAsync (A->b, anz * sizeof (int8_t), + device, stream)) ; } if (A->i != NULL && (which & GB_PREFETCH_I)) { - CU_OK (cudaMemPrefetchAsync (A->i, anz * sizeof (int64_t), device, stream)) ; + CUDA_OK (cudaMemPrefetchAsync (A->i, anz * sizeof (int64_t), + device, stream)) ; } if (A->x != NULL && (which & GB_PREFETCH_X)) { - CU_OK (cudaMemPrefetchAsync (A->x, (A->iso ? 1:anz) * A->type->size, device, stream)) ; + CUDA_OK (cudaMemPrefetchAsync (A->x, (A->iso ? 1:anz) * A->type->size, + device, stream)) ; } return (GrB_SUCCESS) ; diff --git a/GraphBLAS/CUDA/GB_cuda_mxm_dot3_jitFactory.hpp b/GraphBLAS/CUDA/GB_cuda_mxm_dot3_jitFactory.hpp deleted file mode 100644 index 3fce511b40..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_mxm_dot3_jitFactory.hpp +++ /dev/null @@ -1,832 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_mxm_dot3_jitFactory.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -/* - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef GB_MXM_DOT3_JITFACTORY_H -#define GB_MXM_DOT3_JITFACTORY_H - -#pragma once - -/** - * This file is responsible for picking all the parameters and what kernel - * variaiton we will use for a given instance - * - data types - * - semiring types - * - binary ops - * - monoids - * - * Kernel factory says "Here's the actual instance I want you to build with the - * given parameters" - */ - - -//AxB_dot3_phase1 kernel launchers -template class phase1launchFactory ; -template class dense_phase1launchFactory ; - -//AxB_dot3_phase3 kernel launchers - -//------------------------------------------------------------------------------ -// dot3: dense_phase1launchFactory -//------------------------------------------------------------------------------ - -// Handles full/bitmap cases, which means we don't need buckets and zombies. -// This is a much simpler kernel as a result, it only does the i,j lookup -// and stores the values in Mi and Ci. - - -template -class dense_phase1launchFactory -{ - // FIXME: this is the full name. Why? See below for partial name. - // Need to be consistent in naming schemes. - std::string kernel_name = "GB_cuda_jit_AxB_dot3_dense_phase1"; - - GB_cuda_mxm_factory &mxm_factory_; - -public: - - int get_number_of_blocks(GrB_Matrix M) { - int number_of_sms = GB_Global_gpu_sm_get (0); - int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size; - return GB_IMIN( nblks, chunk_size * number_of_sms); - } - - int get_threads_per_block() { - return threads_per_block; - } - - // This assumes the needed state on the GB_cuda_mxm_factory - // has already been populated - dense_phase1launchFactory(GB_cuda_mxm_factory &mxm_factory): mxm_factory_(mxm_factory){} - - bool jitGridBlockLaunch( GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, cudaStream_t stream = 0) { - - // Idea is to have each task work on a continguous block of columns of C - // Note: for small tests, mnz is small so ntasks is be governed by - // chunksize, not chunk_size*number_of_sms. For large problems in - // production, chunksize is less important since ntasks will likely be - // bounded by chunk_size*number_of_sms (say 128*80 = 10,240 on a V100, for - // the default chunk_size of 128). - - // Defining dummy instance only so we can introspect type -// // (1) create the mxm code and name - -// // (2) ensure the jitifier has "GB_mxm_[mymxm.sr_code].h" - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (mxm_factory_) ; - - uint64_t sr_code = mxm_factory_.sr_code ; - int mask_ecode = GB_RSHIFT (sr_code, 20, 4) ; - bool mask_no_type = (mask_ecode < 4) ; - auto sr_code_str = std::to_string(sr_code) ; - std::vector template_types = { - (mask_no_type) ? 
"bool" : M->type->name, sr_code_str }; - - std::stringstream string_to_be_jitted ; - - string_to_be_jitted << kernel_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << mxm_factory_.filename << R"(")" << std::endl << - R"(#include ")" << kernel_name << R"(.cuh")" << std::endl; - - bool result = false; - - dim3 grid(get_number_of_blocks(M)); - dim3 block(get_threads_per_block()); - - std::cout << "HERE I AM 7" << std::endl ; - jit::launcher( kernel_name + "_" + sr_code_str + ".jtfy", - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback /* FIXME: make NULL */) - .set_kernel_inst( kernel_name, template_types) - .configure(grid, block, SMEM, stream) - .launch( C, M); - - result = true; - - return result; - } -}; - -//------------------------------------------------------------------------------ -// dot3: phase1launchFactory -//------------------------------------------------------------------------------ - -// FIXME: We probably want to remove this type template altogether and provide -// a macro/function that can convert from a GrB_Type instance to the name of a -// type that the jitifier will accept. - -template -class phase1launchFactory -{ - std::string kernel_name = "GB_cuda_jit_AxB_dot3_phase1"; - - GB_cuda_mxm_factory &mxm_factory_; - -public: - - int get_number_of_blocks(GrB_Matrix M) { - int number_of_sms = GB_Global_gpu_sm_get (0); - int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size; - return GB_IMIN( nblks, chunk_size * number_of_sms); - } - - int get_threads_per_block() { - return threads_per_block; - } - - // This assumes the needed state on the GB_cuda_mxm_factory - // has already been populated - phase1launchFactory(GB_cuda_mxm_factory &mxm_factory): mxm_factory_(mxm_factory){} - - bool jitGridBlockLaunch(int64_t *nanobuckets, int64_t *blockBucket, - GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, cudaStream_t stream = 0) { - - // Idea is to have each task work on a continguous block of columns of C - // Note: for small tests, mnz is small so ntasks is be governed by - // chunksize, not chunk_size*number_of_sms. For large problems in - // production, chunksize is less important since ntasks will likely be - // bounded by chunk_size*number_of_sms (say 128*80 = 10,240 on a V100, for - // the default chunk_size of 128). - - // Defining dummy instance only so we can introspect type -// // (1) create the mxm code and name - -// // (2) ensure the jitifier has "GB_mxm_[mymxm.sr_code].h" - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (mxm_factory_) ; - - uint64_t sr_code = mxm_factory_.sr_code ; - int mask_ecode = GB_RSHIFT (sr_code, 20, 4) ; - bool mask_no_type = (mask_ecode < 4) ; - auto sr_code_str = std::to_string(sr_code) ; - std::vector template_types = { - (mask_no_type) ? 
"bool" : M->type->name, sr_code_str }; - - std::stringstream string_to_be_jitted ; - - string_to_be_jitted << kernel_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << mxm_factory_.filename << R"(")" << std::endl << - R"(#include ")" << kernel_name << R"(.cuh")" << std::endl; - - std::cout << "header names:" << std::endl ; -// std::cout << header_names << std::endl ; - for (std::string s : header_names) - { - std::cout << " " << s << std::endl ; - } -// std::cout << "string_to_be_jitted :" << std::endl ; -// std::cout << string_to_be_jitted << std::endl ; - std::cout << "GB_cuda_jit_compiler_flags ( ):" << std::endl ; - for (std::string s : GB_cuda_jit_compiler_flags ( )) - { - std::cout << " " << s << std::endl ; - } - std::cout << "kernel_name + sr_code_str .jtfy:" << std::endl ; - std::cout << kernel_name + "_" + sr_code_str + ".jtfy" << std::endl ; - std::cout << "jit::get_user_home_cache_dir ( ):" << std::endl ; - std::cout << jit::get_user_home_cache_dir ( ) << std::endl ; - - bool result = false; - - dim3 grid(get_number_of_blocks(M)); - dim3 block(get_threads_per_block()); - - std::cout << "HERE I AM 1" << std::endl ; - jit::launcher( kernel_name + "_" + sr_code_str + ".jtfy", - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst( kernel_name, template_types) - .configure(grid, block, SMEM, stream) - .launch( nanobuckets, blockBucket, C, M, A, B); - - result = true; - - return result; - } -}; - -//------------------------------------------------------------------------------ -// dot3: phase2launchFactory -//------------------------------------------------------------------------------ - -template -class phase2launchFactory -{ - - std::string base_name = "GB_cuda_jit"; - // FIXME: this is the partial name. Why? See above. 
- std::string kernel_name = "AxB_phase2"; - -public: - - int get_threads_per_block() { - return threads_per_block; - } - - int get_number_of_blocks(GrB_Matrix M) { - const int64_t mnz = GB_nnz (M) ; - int ntasks = ( mnz +chunk_size -1)/chunk_size; - // Idea is to have each task work on a continguous block of columns of C - ntasks = GB_IMIN( ntasks, chunk_size*GB_Global_gpu_sm_get (0)) ; // ntasks will be grid.x - return (ntasks + threads_per_block - 1) / threads_per_block ; - } - - int get_number_of_phase1_blocks( GrB_Matrix M){ - const int64_t mnz = GB_nnz (M) ; - int number_of_sms = GB_Global_gpu_sm_get (0); - int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size; - return GB_IMIN( nblks, chunk_size * number_of_sms); - } - - bool jitGridBlockLaunch(// parameters to AxB_phase2: - int64_t *blockBucket, int64_t *offset, GrB_Matrix M, cudaStream_t stream = 0) { - - bool result = false; - - dim3 grid(get_number_of_blocks(M)); - dim3 block(get_threads_per_block()); - - std::string hashable_name = base_name + "_" + kernel_name; - std::stringstream string_to_be_jitted ; - string_to_be_jitted << hashable_name << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - const int64_t mnz = GB_nnz (M) ; - std::cout << "HERE I AM 2" << std::endl ; - jit::launcher( hashable_name, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst( kernel_name, {}) - .configure(grid, block, SMEM, stream) - // parameters to AxB_phase2: - .launch( blockBucket, offset, get_number_of_phase1_blocks(M)); - - result= true; - - return result; - } - -}; - -//------------------------------------------------------------------------------ -// dot3: phase2endlaunchFactory -//------------------------------------------------------------------------------ - -template< int threads_per_block = 32, int chunk_size = 128> -class phase2endlaunchFactory -{ - - std::string base_name = "GB_cuda_jit"; - std::string kernel_name = "AxB_phase2end"; - -public: - - int get_threads_per_block() { - return threads_per_block; - } - - int get_number_of_blocks(GrB_Matrix M) { - const int64_t mnz = GB_nnz (M) ; - int ntasks = ( mnz +chunk_size -1)/chunk_size; - int number_of_sms = GB_Global_gpu_sm_get (0); - - // Idea is to have each task work on a continguous block of columns of C - return GB_IMIN( ntasks, chunk_size*number_of_sms) ; // ntasks will be grid.x - } - - bool jitGridBlockLaunch(int64_t *nanobuckets, int64_t *blockBucket, - int64_t *bucketp, int64_t *bucket, int64_t *offset, - GrB_Matrix C, GrB_Matrix M, cudaStream_t stream = 0) - { - - bool result = false; - - dim3 grid(get_number_of_blocks(M)); - dim3 block(get_threads_per_block()); - - std::string hashable_name = base_name + "_" + kernel_name; - std::stringstream string_to_be_jitted ; - string_to_be_jitted << hashable_name << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - std::cout << "HERE I AM 3" << std::endl ; - jit::launcher( hashable_name, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst( kernel_name , {}) - .configure(grid, block, SMEM, stream) - .launch( nanobuckets, blockBucket, bucketp, bucket, offset, C, GB_nnz (M)); - - result= true; - - return result; - } - -}; - - -//------------------------------------------------------------------------------ -// dot3: mxm_dense_launchFactory -//------------------------------------------------------------------------------ - -class mxm_dense_launchFactory -{ - 
std::string base_name = "GB_cuda_jit"; - std::string kernel_name = "AxB_dot3_phase3_dndn"; - - GB_cuda_mxm_factory &mxm_factory_; - -public: - - /** - * This assumes the needed state on the GB_cuda_mxm_factory has already been populated. - * The `bucket_code` determines which kernel is launched - */ - mxm_dense_launchFactory(GB_cuda_mxm_factory &mymxmfactory): - mxm_factory_(mymxmfactory) {} - - bool jitGridBlockLaunch( GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, - cudaStream_t stream = 0) { - - bool result = false; - - //---------------------------------------------------------------------- - // do the numerical work - //---------------------------------------------------------------------- - - const int64_t nz = GB_nnz(M); // number of dots in the mask - const int64_t mnvec = M->nvec ; - - int gridsz, blocksz; - - std::stringstream final_kernel_name_ss; - final_kernel_name_ss << kernel_name; - - /** - * Configure geometry and kernel function name based on sparsity of C and number of vectors in M - */ - configure( nz, mnvec, final_kernel_name_ss, blocksz, gridsz); - - auto sr_code = std::to_string(mxm_factory_.sr_code); // FIXME: make hexadecimal - - GrB_BinaryOp mult = mxm_factory_.semiring->multiply ; - - std::string hashable_name = base_name + "_" + final_kernel_name_ss.str(); - std::stringstream string_to_be_jitted ; - std::vector template_types = - { - C->type->name, A->type->name, B->type->name, - mult->ztype->name, mult->xtype->name, mult->ytype->name, - sr_code - }; - - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (mxm_factory_) ; - - string_to_be_jitted << hashable_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << mxm_factory_.filename << R"(")" << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - dim3 grid(gridsz); - dim3 block(blocksz); - - GBURBLE ("(GPU dot3 mxm dense launch nblocks,blocksize= %d,%d )\n", gridsz,blocksz) ; - std::cout << "HERE I AM 4" << std::endl ; - jit::launcher( hashable_name + "_" + sr_code, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst(final_kernel_name_ss.str(), template_types ) - // { C->type->name, - // A->type->name, - // B->type->name }) - .configure(grid, block, SMEM, stream) //if commented, use implicit 1D configure in launch - .launch( - C, // final output matrix - // inputs, not modified: - M, // Mi used for column index - A, // A matrix - B // B matrix - ); - - result= true; - - return result; - } - -private: - void configure(std::int64_t Cnz, std::int64_t mnvec, std::stringstream &opname, - int &blocksz, int &gridsz) { - int number_of_sms = GB_Global_gpu_sm_get (0) ; - - int work_per_thread; - - blocksz = 64; - work_per_thread = 8; - - if( Cnz > 1024){ - blocksz = 512; - work_per_thread = 64; - } - - // gridsz = ceiling (Cnz / work_per_thread*blocksz) - gridsz = GB_ICEIL (Cnz, work_per_thread*blocksz) ; - - } -}; - -//------------------------------------------------------------------------------ -// FIXME: rename GB_cuda_mxm_dot3_jitFactory_sparse_dense_launchFactory -//------------------------------------------------------------------------------ - -class mxm_sparse_dense_launchFactory -{ - std::string base_name = "GB_cuda_jit"; - std::string kernel_name = "AxB_dot3"; - - GB_cuda_mxm_factory &mxm_factory_; - -public: - - /** - * This assumes the needed state on the GB_cuda_mxm_factory has already been populated. 
- * The `bucket_code` determines which kernel is launched - */ - mxm_sparse_dense_launchFactory(GB_cuda_mxm_factory &mymxmfactory): - mxm_factory_(mymxmfactory) {} - - bool jitGridBlockLaunch( GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, - cudaStream_t stream = 0) { - - bool result = false; - - //---------------------------------------------------------------------- - // do the numerical work - //---------------------------------------------------------------------- - - const int64_t nz = GB_nnz(M); // number of dots in the mask - const int64_t mnvec = M->nvec ; - - int gridsz, blocksz; - - std::stringstream final_kernel_name_ss; - final_kernel_name_ss << kernel_name; - - /** - * Configure geometry and kernel function name based on sparsity of C and number of vectors in M - */ - configure( nz, mnvec, final_kernel_name_ss, blocksz, gridsz); - - auto sr_code = std::to_string(mxm_factory_.sr_code); - - GrB_BinaryOp mult = mxm_factory_.semiring->multiply ; - - std::string hashable_name = base_name + "_" + final_kernel_name_ss.str(); - std::stringstream string_to_be_jitted ; - std::vector template_types = - { - C->type->name, A->type->name, B->type->name, - mult->ztype->name, mult->xtype->name, mult->ytype->name, - sr_code - }; - - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (mxm_factory_) ; - - string_to_be_jitted << hashable_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << mxm_factory_.filename << R"(")" << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - dim3 grid(gridsz); - dim3 block(blocksz); - - GBURBLE ("(GPU dot3 mxm sparse_dense launch nblocks,blocksize= %d,%d )\n", gridsz,blocksz) ; - std::cout << "HERE I AM 5" << std::endl ; - jit::launcher( hashable_name + "_" + sr_code, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst(final_kernel_name_ss.str(), template_types ) - // { C->type->name, - // A->type->name, - // B->type->name }) - .configure(grid, block, SMEM, stream) //if commented, use implicit 1D configure in launch - .launch( - C, // final output matrix - // inputs, not modified: - M, // Mi used for column index - A, // A matrix - B // B matrix - ); - - result= true; - - return result; - } - -private: - void configure(std::int64_t Cnz, std::int64_t mnvec, std::stringstream &opname, - int &blocksz, int &gridsz) { - int number_of_sms = GB_Global_gpu_sm_get (0) ; - - int work_per_thread; - - blocksz = 64; - work_per_thread = 8; - - if( Cnz > 1024){ - blocksz = 512; - work_per_thread = 64; - } - - // gridsz = ceiling (Cnz / work_per_thread*blocksz) - gridsz = GB_ICEIL (Cnz, work_per_thread*blocksz) ; - - } -}; - -//------------------------------------------------------------------------------ -// dot3: phase3launchFactory -//------------------------------------------------------------------------------ - -class phase3launchFactory -{ - std::string base_name = "GB_cuda_jit"; - std::string kernel_name = "AxB_dot3"; - - GB_cuda_mxm_factory &mxm_factory_; - - GB_bucket_code bucket_code_; - -public: - - std::string Opname; - - /** - * This assumes the needed state on the GB_cuda_mxm_factory has already been populated. 
- * The `bucket_code` determines which kernel is launched - */ - phase3launchFactory(GB_cuda_mxm_factory &mymxmfactory, GB_bucket_code bucket_code): - mxm_factory_(mymxmfactory), bucket_code_(bucket_code) {} - - bool jitGridBlockLaunch(int64_t start, int64_t end, int64_t *bucketp, int64_t *bucket, - GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, - cudaStream_t stream = 0) { - - bool result = false; - - //---------------------------------------------------------------------- - // phase3: do the numerical work - //---------------------------------------------------------------------- - - const int64_t nz = end - start; // number of dots in this bucket - const int64_t mnvec = M->nvec ; - - int gridsz, blocksz, sz = 4; - - std::stringstream final_kernel_name_ss; - final_kernel_name_ss << kernel_name << "_"; - - /** - * Configure geometry and kernel function name based on sparsity of C and number of vectors in M - */ - auto sr_code = std::to_string(mxm_factory_.sr_code); - - configure2( nz, mnvec, final_kernel_name_ss, blocksz, gridsz, sz, mxm_factory_.sr_code); - - - GrB_BinaryOp mult = mxm_factory_.semiring->multiply ; - - std::string hashable_name = base_name + "_" + final_kernel_name_ss.str(); - std::stringstream string_to_be_jitted ; - std::vector template_types = - { - C->type->name, A->type->name, B->type->name, - mult->ztype->name, mult->xtype->name, mult->ytype->name, - sr_code - }; - - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (mxm_factory_) ; - - // FIXME: why is "hashable_name" used sometimes, and sometimes "kernel_name"? - string_to_be_jitted << hashable_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << mxm_factory_.filename << R"(")" << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - dim3 grid(gridsz); - dim3 block(blocksz); - - GBURBLE ("(GPU phase3 launch %s st,end=%ld,%ld nblocks,blocksize= %d,%d )\n", this->Opname.c_str(), - start,end,gridsz,blocksz) ; - std::cout << "HERE I AM 6" << std::endl ; - jit::launcher( hashable_name + "_" + sr_code, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst(final_kernel_name_ss.str(), template_types ) - // { C->type->name, - // A->type->name, - // B->type->name }) - .configure(grid, block, SMEM, stream) //if commented, use implicit 1D configure in launch - .launch( - start, // input/output: - end, // global bucket cumsum, of size NBUCKETS+1 - bucket, // global buckets, of size cnz (== mnz) - C, // final output matrix - // inputs, not modified: - M, // Mi used for column index - A, // A matrix - B, // B matrix - sz // only used for sparse-sparse cases - ); - - result= true; - - return result; - } - -private: - void configure2(std::int64_t Cnz, std::int64_t mnvec, std::stringstream &opname, - int &blocksz, int &gridsz, int &sz, uint64_t sr_code) { - int number_of_sms = GB_Global_gpu_sm_get (0) ; - - int work_per_thread; - - // 0:hyper, 1:sparse, 2:bitmap, 3:full - int asparsity = GB_RSHIFT (sr_code, 2, 2) ; - int bsparsity = GB_RSHIFT (sr_code, 0, 2) ; - - if (asparsity <= 1 && bsparsity <= 1) - { - // both A and B are sparse/hyper - switch (bucket_code_) - { - - //-------------------------------------------------------------- - // not a bucket ... 
bring out your dead: - //-------------------------------------------------------------- - - case GB_BUCKET_ZOMBIE : // C(i,j) is a zombie (not a bucket) - break ; - - //-------------------------------------------------------------- - // CUDA kernel: vsvs bucket: - //-------------------------------------------------------------- - - case GB_BUCKET_VSVS : - Opname = "phase3_vsvs" ; - blocksz = 256; - work_per_thread = 4; - - if( Cnz > (2<<12)){ - blocksz = 512; - work_per_thread = 4; - } - - // gridsz = ceiling (Cnz / work_per_thread*blocksz) - gridsz = GB_ICEIL (Cnz, work_per_thread*blocksz) ; - if (gridsz > 256*number_of_sms) gridsz = 256*number_of_sms; - break ; - - //-------------------------------------------------------------- - // CUDA kernel: mp, use the merge-path method: - //-------------------------------------------------------------- - - case GB_BUCKET_MERGEPATH : - Opname = "phase3_mp" ; - blocksz = 32; - work_per_thread = 256 ; - - if( Cnz > (2<<20)){ - work_per_thread = 1024; - } - gridsz = GB_ICEIL (Cnz, work_per_thread) ; - if ((gridsz < number_of_sms) && (Cnz > (2<<20))) - { - gridsz = number_of_sms; - } - if (gridsz > 256*number_of_sms) gridsz = 256*number_of_sms; - break ; - - default: - break ; - } - - } - else - { - // either A or B are bitmap/full - switch (bucket_code_) - { - - //-------------------------------------------------------------- - // not a bucket ... bring out your dead: - //-------------------------------------------------------------- - - case GB_BUCKET_ZOMBIE : // C(i,j) is a zombie (not a bucket) - break ; - - //-------------------------------------------------------------- - // CUDA kernel: vsdn bucket: one thread per C(i,j) dot product - //-------------------------------------------------------------- - - case GB_BUCKET_VSDN : - Opname = "phase3_vsdn" ; - - // FIXME: - blocksz = 256; - work_per_thread = 4; - - if( Cnz > (2<<12)){ - blocksz = 512; - work_per_thread = 4; - } - - // gridsz = ceiling (Cnz / work_per_thread*blocksz) - gridsz = GB_ICEIL (Cnz, work_per_thread*blocksz) ; - if (gridsz > 256*number_of_sms) gridsz = 256*number_of_sms; - break ; - - //-------------------------------------------------------------- - // CUDA kernel: spdn bucket: one warp per C(i,j) dot product - //-------------------------------------------------------------- - - case GB_BUCKET_SPDN : - Opname = "phase3_spdn" ; - - // FIXME: - blocksz = 32; - work_per_thread = 256 ; - - if( Cnz > (2<<20)){ - work_per_thread = 1024; - } - gridsz = GB_ICEIL (Cnz, work_per_thread) ; - if ((gridsz < number_of_sms) && (Cnz > (2<<20))) - { - gridsz = number_of_sms; - } - if (gridsz > 256*number_of_sms) gridsz = 256*number_of_sms; - break ; - - default: - break ; - } - - } - - opname << Opname; - } -}; - -#endif diff --git a/GraphBLAS/CUDA/GB_cuda_mxm_factory.hpp b/GraphBLAS/CUDA/GB_cuda_mxm_factory.hpp deleted file mode 100644 index 227b776f65..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_mxm_factory.hpp +++ /dev/null @@ -1,167 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_mxm_factory.hpp -//------------------------------------------------------------------------------ - -// (c) Nvidia Corp. 2023 All rights reserved -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// Class to manage both stringify functions from mxm, ops and monoids to a -// header file. 
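The configure/configure2 heuristics above, and the mxm factory that begins here, all key off sr_code: a single packed integer encoding the whole problem. Fields such as the mask ecode (4 bits at position 20) and the A/B sparsity (the low 4 bits, two bits each) are recovered by shift-and-mask, which is what the GB_RSHIFT uses above imply. A small sketch of that unpacking; the sample code value is made up for illustration:

```c++
#include <cstdint>
#include <cstdio>

// GB_RSHIFT-style field extraction: shift right, then mask off nbits
static inline uint64_t rshift (uint64_t code, int shift, int nbits)
{
    return ((code >> shift) & ((UINT64_C(1) << nbits) - 1)) ;
}

int main (void)
{
    uint64_t sr_code = 0x30000E ;   // hypothetical value, not a real encoding
    int mask_ecode = (int) rshift (sr_code, 20, 4) ;
    int asparsity  = (int) rshift (sr_code, 2, 2) ; // 0:hyper 1:sparse 2:bitmap 3:full
    int bsparsity  = (int) rshift (sr_code, 0, 2) ;
    printf ("mask %d, A %d, B %d\n", mask_ecode, asparsity, bsparsity) ;
    return (0) ;
}
```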
- -// Implementations of string callbacks -#pragma once - -// FIXME do we need the iostrean any more? -#include -#include -#include "GB_cuda_jitify_cache.h" - -extern "C" -{ - #include "GB.h" - #include "GB_binop.h" - #include "GB_stringify.h" -} - -// FIXME: do we need the file_callback method? -// Define function pointer we will use later -//std::istream* (*file_callback)(std::string, std::iostream&); - -//------------------------------------------------------------------------------ -// GB_cuda_mxm_factory -//------------------------------------------------------------------------------ - -// Define a factory class for building any mxm text definitions - -class GB_cuda_mxm_factory: public jit::File_Desc -{ - - //-------------------------------------------------------------------------- - // public members of the object - //-------------------------------------------------------------------------- - - public: - - uint64_t sr_code ; // unique 62-bit code for a GrB_mxm problem - GrB_Semiring semiring ; // the semiring for GrB_mxm - GrB_Type ctype, atype, btype ; // the types of C, A, and B - FILE *fp ; // file for GB_mxm_*.h header - - //-------------------------------------------------------------------------- - // open/close: access the GB_mxm_*.h header file for a specific instance - //-------------------------------------------------------------------------- - - void open (const char *path_and_file, const char *mode) - { - fp = fopen (path_and_file, mode) ; - } - - void close( ) - { - fclose (fp) ; - } - - //-------------------------------------------------------------------------- - // mxm_factory: create unique code for a GrB_mxm problem - //-------------------------------------------------------------------------- - - // mxm_factory takes a set of inputs describing and operation (semiring, - // mask, datatypes, sparsity formats, etc) and produces a numerical unique - // value for those. This allows rapid lookups to see if we have handled this - // case before, and avoids the need to generate and manage strings at this - // stage. 
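The factory classes here follow one protocol: enumify the problem into a scalar code, derive a header name such as GB_mxm_<code>.h, then open/macrofy/close to write the macro definitions. A hedged sketch of that generate-once flow; ensure_header and write_macros are illustrative stand-ins, not GraphBLAS functions:

```c++
#include <cstdint>
#include <cstdio>
#include <string>
#include <sys/stat.h>

// Derive a header name from the problem code and write it only if absent.
// write_macros stands in for GB_macrofy_mxm / GB_macrofy_reduce.
static bool ensure_header (const std::string &cache_dir, uint64_t code,
                           void (*write_macros) (FILE *, uint64_t))
{
    char name [64] ;
    snprintf (name, sizeof (name), "GB_mxm_%llu.h",
        (unsigned long long) code) ;
    std::string path = cache_dir + "/" + name ;
    struct stat st ;
    if (stat (path.c_str (), &st) == 0) return (true) ; // already cached
    FILE *fp = fopen (path.c_str (), "w") ;             // open
    if (fp == NULL) return (false) ;
    write_macros (fp, code) ;                           // macrofy
    fclose (fp) ;                                       // close
    return (true) ;
}
```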
- - // FIXME: pass in user's C_in matrix, in case C_in+=A*B can be done - // in-place - // FIXME: handle hypersparse case in dot3 - - void mxm_factory - ( - // C matrix: - bool C_iso, // true if C is iso-valued - bool C_in_iso, // C input iso status - int C_sparsity, // sparsity structure of C - GrB_Type ctype, // the type of C - // M matrix: - GrB_Matrix M, // may be NULL - bool Mask_struct, // mask is structural - bool Mask_comp, // mask is complemented - // semiring: - GrB_Semiring semiring, // the semiring to enumify - bool flipxy, // multiplier is: mult(a,b) or mult(b,a) - // A and B: - GrB_Matrix A, - GrB_Matrix B - ) - { - - if (C_iso) - { - // the kernel does not access any values of C, A, or B - semiring = GxB_ANY_PAIR_BOOL ; - flipxy = false ; - } - - uint64_t scode ; - - GB_enumify_mxm ( - // output: - &scode, // unique encoding of the entire semiring - // input: - C_iso, // true if C is iso-valued - C_in_iso, - C_sparsity, // sparsity structure of C - ctype, // the type of C - // M matrix: - M, - Mask_struct, // mask is structural - Mask_comp, // mask is complemented - // semiring: - semiring, // the semiring to enumify - flipxy, // multiplier is: mult(a,b) or mult(b,a) - // A and B: - A, - B - ) ; - - this->sr_code = scode; - this->semiring = semiring ; - this->atype = A->type ; - this->btype = B->type ; - this->ctype = ctype ; - - std::stringstream ss; - // FIXME: use same name scheme as the CPU jit - ss << "GB_mxm_" << this->sr_code << ".h"; - - std::string new_filename = ss.str(); - filename.resize(new_filename.size()); - strcpy(filename.data(), new_filename.data()); - - } - - //-------------------------------------------------------------------------- - // macrofy: create macros from sr_code and data types - //-------------------------------------------------------------------------- - - // macrofy takes a code and creates the corresponding string macros for - // operators, datatypes, sparsity formats and writes its results to a file. - - void macrofy ( ) override - { - GB_macrofy_mxm ( - // output to file : - fp, - // input: - this->sr_code, - this->semiring, - this->ctype, - this->atype, - this->btype - ) ; - } - -} ; // GB_cuda_mxm_factory - diff --git a/GraphBLAS/CUDA/GB_cuda_reduce.hpp b/GraphBLAS/CUDA/GB_cuda_reduce.hpp new file mode 100644 index 0000000000..3dfe07372f --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_reduce.hpp @@ -0,0 +1,30 @@ +//------------------------------------------------------------------------------ +// GB_cuda_reduce.hpp: CPU definitions for CUDA reductions +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#ifndef GB_CUDA_REDUCE_H +#define GB_CUDA_REDUCE_H + +#include "GB_cuda.hpp" + +GrB_Info GB_cuda_reduce_to_scalar_jit // z = reduce_to_scalar (A) via CUDA JIT +( + // output: + GB_void *z, // result if has_cheeseburger is true + GrB_Matrix V, // result if has_cheeseburger is false + // input: + const GrB_Monoid monoid, // monoid to do the reduction + const GrB_Matrix A, // matrix to reduce + // CUDA stream and launch parameters: + cudaStream_t stream, + int32_t gridsz, + int32_t blocksz +) ; + +#endif + diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_factory.hpp b/GraphBLAS/CUDA/GB_cuda_reduce_factory.hpp deleted file mode 100644 index e0f7aae75b..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_reduce_factory.hpp +++ /dev/null @@ -1,105 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_reduce_factory.hpp -//------------------------------------------------------------------------------ - -// (c) Nvidia Corp. 2023 All rights reserved -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// Class to manage both stringify functions from mxm, ops and monoids to a -// header file. - -// FIXME: does it? -// Also provides a iostream callback to deliver the buffer to jitify as -// if read from a file - -// Implementations of string callbacks -#pragma once - -// FIXME: do we use iostream? -#include -#include -#include "GB_cuda_jitify_cache.h" - -extern "C" -{ - #include "GB.h" - #include "GB_stringify.h" -} - -//------------------------------------------------------------------------------ -// GB_cuda_reduce_factory: construct code and header file for reduce jit kernel -//------------------------------------------------------------------------------ - -class GB_cuda_reduce_factory: public jit::File_Desc { - -public: - - uint64_t rcode ; // unique encoding from GB_enumify_reduce - GrB_Monoid monoid ; // monoid to perform the reduction - GrB_Type atype ; // input matrix data type - FILE *fp ; // file pointer for GB_reduce_*.h header file - - //-------------------------------------------------------------------------- - // open/close: access the GB_reduce_*.h header file for a specific instance - //-------------------------------------------------------------------------- - - void open (const char *path_and_file, const char *mode) - { - fp = fopen (path_and_file, mode) ; - } - - void close( ) - { - fclose (fp) ; - } - - //-------------------------------------------------------------------------- - // reduce_factory: encode the reduction problem into a scalar rcode - //-------------------------------------------------------------------------- - - void reduce_factory (GrB_Monoid monoid, GrB_Matrix A) - { - uint64_t rcode ; - - GB_enumify_reduce - ( - // output: - &rcode, // unique encoding of entire monoid - // input: - monoid, // monoid to use for the reduction - A // matrix to reduce - ) ; - - this->rcode = rcode ; - this->monoid = monoid ; - this->atype = A->type ; - - // FIXME: use same name scheme as the CPU jit - std::stringstream ss ; - ss << "GB_reduce_" << this->rcode << ".h"; - - std::string new_filename = ss.str() ; - filename.resize(new_filename.size()) ; - strcpy(filename.data(), new_filename.data()) ; - } - - //-------------------------------------------------------------------------- - // macrofy: construct a header file from the rcode and 
data types - //-------------------------------------------------------------------------- - - void macrofy ( ) override - { - GB_macrofy_reduce ( - // output to file : - fp, - // input: - this->rcode, - this->monoid, - this->atype - ) ; - } - -} ; // GB_cuda_reduce_factory - diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_jitFactory.hpp b/GraphBLAS/CUDA/GB_cuda_reduce_jitFactory.hpp deleted file mode 100644 index fd618bddf0..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_reduce_jitFactory.hpp +++ /dev/null @@ -1,254 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_reduce_jitFactory.hpp: kernel for reduction to scalar -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -/* - * Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -//------------------------------------------------------------------------------ - -// Constructs an instance of the template/GB_jit_reduce.cuh kernel to reduce -// a GrB_Matrix to a scalar. 
- -#ifndef GB_REDUCE_JITFACTORY_H -#define GB_REDUCE_JITFACTORY_H - -#pragma once -#include "GB_cuda_reduce_factory.hpp" - -/** - * This file is responsible for picking all the parameters and what kernel - * variaiton we will use for a given instance - * - data types - * - semiring types - * - binary ops - * - monoids - * - * Kernel factory says "Here's the actual instance I want you to build with the - * given parameters" - */ - -// Kernel jitifiers -class reduceFactory ; - -//------------------------------------------------------------------------------ -// reduceFactory -//------------------------------------------------------------------------------ - -class reduceFactory -{ - - //-------------------------------------------------------------------------- - // class properties - //-------------------------------------------------------------------------- - - std::string base_name = "GB_cuda_jit"; - std::string kernel_name = "reduce"; - - int threads_per_block = 320 ; - int work_per_thread = 256; -// int number_of_sms = GB_Global_gpu_sm_get (0) ; - - GB_cuda_reduce_factory &reduce_factory_ ; - - public: - - //-------------------------------------------------------------------------- - // class constructor - //-------------------------------------------------------------------------- - - reduceFactory (GB_cuda_reduce_factory &myreducefactory) : - reduce_factory_(myreducefactory) {} - - //-------------------------------------------------------------------------- - // GB_get_threads_per_block: determine # of threads in a threadBlock - //-------------------------------------------------------------------------- - - int GB_get_threads_per_block ( ) - { - return threads_per_block ; - } - - //-------------------------------------------------------------------------- - // GB_get_number_of_blocks: determine # of threadBlocks to use - //-------------------------------------------------------------------------- - - int GB_get_number_of_blocks - ( - int64_t anvals // # of entries in input matrix - ) - { - // FIXME: this is a lot of blocks. Use a smaller number (cap at, say, - // 64K), to simplify the non-atomic reductions - return (anvals + work_per_thread*threads_per_block - 1) / - (work_per_thread*threads_per_block) ; - } - - //-------------------------------------------------------------------------- - // jitGridBlockLaunch: construct and launch the GB_jit_reduce kernel - //-------------------------------------------------------------------------- - - // Note: this does assume the erased types are compatible w/ the monoid's - // ztype (an erased type is the type overwritten by a pun type). 
- - bool jitGridBlockLaunch // FIXME: return GrB_Info - ( - GrB_Matrix A, // matrix to reduce to a scalar - GB_void *output, // output scalar (static on CPU), of size zsize - GrB_Matrix *V_handle, // result of a partial reduction - GrB_Monoid monoid, // monoid to use for the reducution - cudaStream_t stream = 0 // stream to use, default stream 0 - ) - { - GBURBLE ("\n(launch reduce factory) \n") ; - - GrB_Type ztype = monoid->op->ztype ; - size_t zsize = ztype->size ; - - GB_void *zscalar = NULL ; - (*V_handle) = NULL ; - GrB_Matrix V = NULL ; - - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (reduce_factory_) ; - - auto rcode = std::to_string(reduce_factory_.rcode); - bool has_cheeseburger = GB_RSHIFT (reduce_factory_.rcode, 27, 1) ; - GBURBLE ("has_cheeseburger %d\n", has_cheeseburger) ; - - std::string hashable_name = base_name + "_" + kernel_name; - std::stringstream string_to_be_jitted ; - string_to_be_jitted << hashable_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << reduce_factory_.filename << R"(")" << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - int64_t anvals = GB_nnz_held (A) ; - - // determine kernel launch geometry - int blocksz = GB_get_threads_per_block ( ) ; - int gridsz = GB_get_number_of_blocks (anvals) ; - dim3 grid (gridsz) ; - dim3 block (blocksz) ; - - // determine the kind of reduction: partial (to &V), or complete - // (to the scalar output) - if (has_cheeseburger) - { - // the kernel launch can reduce A to zscalar all by itself - // allocate and initialize zscalar (upscaling it to at least 32 bits) - size_t zscalar_size = GB_IMAX (zsize, sizeof (uint32_t)) ; - (GB_void *) rmm_wrap_malloc (zscalar_size) ; - zscalar = (GB_void *) rmm_wrap_malloc (zscalar_size) ; - if (zscalar == NULL) - { - // out of memory - return (GrB_OUT_OF_MEMORY) ; - } - GB_cuda_upscale_identity (zscalar, monoid) ; - } - else - { - // allocate a full GrB_Matrix V for the partial result, of size - // gridsz-by-1, and of type ztype. V is allocated but not - // initialized. - GrB_Info info = GB_new_bix (&V, ztype, gridsz, 1, GB_Ap_null, - true, GxB_FULL, false, 0, -1, gridsz, true, false) ; - if (info != GrB_SUCCESS) - { - // out of memory - return (info) ; - } - } - - GBURBLE ("(cuda reduce launch %d threads in %d blocks)", - blocksz, gridsz ) ; - - // construct and launch the kernel - // FIXME: use same name scheme as the CPU jit - // FIXME: where does it go if it fails? try/catch? - jit::launcher(hashable_name + "_" + rcode, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) // FIXME: where is file_callback defined? - .set_kernel_inst( hashable_name , - { A->type->name, monoid->op->ztype->name }) - .configure(grid, block, SMEM, stream) - .launch (A, zscalar, V, anvals) ; - - // synchronize before copying result to host - CHECK_CUDA (cudaStreamSynchronize (stream)) ; - - // FIXME: sometimes we use CHECK_CUDA, sometimes CU_OK. Need to - // be consistent. Also, if this method fails, zscalar - // must be freed: we can do this in the CU_OK or CHECK_CUDA macros. - // Or in a try/catch? 
-
-        if (has_cheeseburger)
-        {
-            // return the scalar result
-            // output = zscalar (but only the first zsize bytes of it)
-            memcpy (output, zscalar, zsize) ;
-            rmm_wrap_free (zscalar) ;
-        }
-        else
-        {
-            // return the partial reduction
-            (*V_handle) = V ;
-        }
-
-        return (GrB_SUCCESS) ;
-    }
-} ;
-
-//------------------------------------------------------------------------------
-// GB_cuda_reduce
-//------------------------------------------------------------------------------
-
-inline bool GB_cuda_reduce      // FIXME: return GrB_Info, not bool
-(
-    GB_cuda_reduce_factory &myreducefactory,    // reduction JIT factory
-    GrB_Matrix A,               // matrix to reduce
-    GB_void *output,            // result of size monoid->op->ztype->size
-    GrB_Matrix *V_handle,       // result of a partial reduction
-    GrB_Monoid monoid,          // monoid for the reduction
-    cudaStream_t stream = 0     // stream to use
-)
-{
-    reduceFactory rf(myreducefactory);
-    GBURBLE ("(starting cuda reduce)" ) ;
-    bool result = rf.jitGridBlockLaunch (A, output, V_handle, monoid, stream) ;
-    GBURBLE ("(ending cuda reduce)" ) ;
-    return (result) ;
-}
-
-#endif
-
diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar.cpp b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar.cpp
new file mode 100644
index 0000000000..e1da05383c
--- /dev/null
+++ b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar.cpp
@@ -0,0 +1,153 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/GB_cuda_reduce_to_scalar: reduce on the GPU with semiring
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+// Reduce a matrix A to a scalar s, or to a smaller matrix V if the GPU was
+// only able to do a partial reduction.  This case occurs if the GPU
+// cannot do an atomic update for the monoid.  To handle this case, the GPU
+// returns a full GrB_Matrix V, of size gridsize-by-1, with one entry per
+// threadblock.  Then GB_reduce_to_scalar on the CPU sees this V as the result,
+// and calls itself recursively to continue the reduction.
+
+#define GB_FREE_ALL                                         \
+{                                                           \
+    GB_FREE_WORK (&zscalar, zscalar_size) ;                 \
+    GB_Matrix_free (&V) ;                                   \
+    if (stream != nullptr) cudaStreamDestroy (stream) ;     \
+    stream = nullptr ;                                      \
+}
+
+#include "GB_cuda_reduce.hpp"
+
+GrB_Info GB_cuda_reduce_to_scalar
+(
+    // output:
+    GB_void *s,             // note: statically allocated on CPU stack; if
+                            // the result is in s then V is NULL.
+    GrB_Matrix *V_handle,   // partial result if unable to reduce to scalar;
+                            // NULL if result is in s.
+    // input:
+    const GrB_Monoid monoid,
+    const GrB_Matrix A
+)
+{
+
+    //--------------------------------------------------------------------------
+    // check inputs
+    //--------------------------------------------------------------------------
+
+    GB_void *zscalar = NULL ;
+    size_t zscalar_size = 0 ;
+    GrB_Matrix V = NULL ;
+    (*V_handle) = NULL ;
+    GrB_Info info = GrB_SUCCESS ;
+    cudaStream_t stream = nullptr ;
+
+    //--------------------------------------------------------------------------
+    // create the stream
+    //--------------------------------------------------------------------------
+
+    // FIXME: use the stream pool
+    CUDA_OK (cudaStreamCreate (&stream)) ;
+
+    //--------------------------------------------------------------------------
+    // determine problem characteristics and allocate workspace
+    //--------------------------------------------------------------------------
+
+    int threads_per_block = 320 ;
+    int work_per_thread = 256;
+//  int number_of_sms = GB_Global_gpu_sm_get (0) ;
+
+    GrB_Type ztype = monoid->op->ztype ;
+    size_t zsize = ztype->size ;
+
+    // determine kernel launch geometry
+    int64_t anvals = GB_nnz_held (A) ;
+    int blocksz = threads_per_block ;
+    int gridsz =
+        // FIXME: this is a lot of blocks.  Use a smaller number (cap at,
+        // say, 64K), to simplify the non-atomic reductions
+        (anvals + work_per_thread*threads_per_block - 1) /
+        (work_per_thread*threads_per_block) ;
+
+    // FIXME: GB_enumify_reduce is called twice: here (to get has_cheeseburger)
+    // and in GB_cuda_reduce_to_scalar_jit.  Can we just call it once?  One
+    // solution: The code from here to the call to GB_cuda_reduce_to_scalar_jit
+    // could be added to the GB_cuda_reduce_to_scalar_jit function itself.
+
+    uint64_t rcode ;
+    GB_enumify_reduce (&rcode, monoid, A) ;
+    bool has_cheeseburger = GB_RSHIFT (rcode, 27, 1) ;
+    GBURBLE ("has_cheeseburger %d\n", has_cheeseburger) ;
+
+    // determine the kind of reduction: partial (to &V), or complete
+    // (to the scalar output)
+    if (has_cheeseburger)
+    {
+        // the kernel launch can reduce A to zscalar all by itself
+        // allocate and initialize zscalar (upscaling it to at least 32 bits)
+        size_t zscalar_space = GB_IMAX (zsize, sizeof (uint32_t)) ;
+        zscalar = GB_MALLOC (zscalar_space, GB_void, &zscalar_size) ;
+        if (zscalar == NULL)
+        {
+            // out of memory
+            GB_FREE_ALL ;
+            return (GrB_OUT_OF_MEMORY) ;
+        }
+        GB_cuda_upscale_identity (zscalar, monoid) ;
+    }
+    else
+    {
+        // allocate a full GrB_Matrix V for the partial result, of size
+        // gridsz-by-1, and of type ztype.  V is allocated but not
+        // initialized.
+ GB_OK (GB_new_bix (&V, ztype, gridsz, 1, GB_Ap_null, + true, GxB_FULL, false, 0, -1, gridsz, true, false)) ; + } + + GBURBLE ("(cuda reduce launch %d threads in %d blocks)", + blocksz, gridsz ) ; + + //-------------------------------------------------------------------------- + // reduce C to a scalar via the CUDA JIT + //-------------------------------------------------------------------------- + +// final call looks like this: +// GB_OK (GB_cuda_reduce_to_scalar_jit (zscalar, V, monoid, A, +// stream, gridsz, blocksz)) ; + +// debugging for now, to die early if the CUDA fails to compile, load, or run: + info = (GB_cuda_reduce_to_scalar_jit (zscalar, V, monoid, A, + stream, gridsz, blocksz)) ; + if (info == GrB_NO_VALUE) info = GrB_PANIC ; + GB_OK (info) ; + + //-------------------------------------------------------------------------- + // return result and destroy the stream + //-------------------------------------------------------------------------- + + CUDA_OK (cudaStreamSynchronize (stream)) ; + + if (has_cheeseburger) + { + // return the scalar result + // s = zscalar (but only the first zsize bytes of it) + memcpy (s, zscalar, zsize) ; + GB_FREE_WORK (&zscalar, zscalar_size) ; + } + else + { + // return the partial reduction + (*V_handle) = V ; + } + + CUDA_OK (cudaStreamDestroy (stream)) ; + return (GrB_SUCCESS) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch.cpp b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch.cpp index f336ce8002..353201b65b 100644 --- a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch.cpp +++ b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch.cpp @@ -2,14 +2,14 @@ // GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch: decide to use GPU for reduce //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ // Decide branch direction for GPU use for the reduction to scalar -#include "GB_cuda.h" +#include "GB_cuda_reduce.hpp" bool GB_cuda_reduce_to_scalar_branch // return true to use the GPU ( @@ -22,9 +22,6 @@ bool GB_cuda_reduce_to_scalar_branch // return true to use the GPU !GB_cuda_type_branch (monoid->op->ztype)) { // one or more types are not yet supported on the GPU - // FIXME: remove debug output here: - std::cout << "Not using cuda path: type size not supported" - << std::endl ; return (false) ; } @@ -48,11 +45,11 @@ bool GB_cuda_reduce_to_scalar_branch // return true to use the GPU { // FIXME: gpu_id = GB_Context_gpu_id_get ( ) ; // cudaSetDevice (gpu_id) ; - return true; + return (true) ; } else { - return false; + return (false) ; } } diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit.cpp b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit.cpp index 0d7c6578b0..cc8603c708 100644 --- a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit.cpp +++ b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit.cpp @@ -1,62 +1,62 @@ //------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit: reduce on the GPU with semiring +// GB_cuda_reduce_to_scalar_jit: reduce a matrix to a scalar, via the CUDA JIT //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. 
Davis, (c) 2017-2024, All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-// Reduce a matrix A to a scalar s, or to a smaller matrix V if the GPU was
-// only able to do a partial reduction.  This case occurs if the GPU does not
-// cannot do an atomic update for the monoid.  To handle this case, the GPU
-// returns a full GrB_Matrix V, of size gridsize-by-1, with one entry per
-// threadblock.  Then GB_reduce_to_scalar on the CPU sees this V as the result,
-// and calls itself recursively to continue the reduction.
-
-#include "GraphBLAS_cuda.h"
+#include "GB_cuda_reduce.hpp"
 
 extern "C"
 {
-    #include "GB_reduce.h"
+    typedef GB_JIT_CUDA_KERNEL_REDUCE_PROTO ((*GB_jit_dl_function)) ;
 }
 
-#include "GB_cuda.h"
-#include "GB_cuda_jitify_cache.h"
-#include "GB_cuda_common_jitFactory.hpp"
-#include "GB_cuda_reduce_jitFactory.hpp"
-
-GrB_Info GB_cuda_reduce_to_scalar_jit
+GrB_Info GB_cuda_reduce_to_scalar_jit  // z = reduce_to_scalar (A) via CUDA JIT
 (
     // output:
-    GB_void *s,             // note: statically allocated on CPU stack; if
-                            // the result is in s then V is NULL.
-    GrB_Matrix *V_handle,   // partial result if unable to reduce to scalar;
-                            // NULL if result is in s.
+    GB_void *z,             // result if has_cheeseburger is true
+    GrB_Matrix V,           // result if has_cheeseburger is false
     // input:
-    const GrB_Monoid monoid,
-    const GrB_Matrix A
+    const GrB_Monoid monoid,    // monoid to do the reduction
+    const GrB_Matrix A,         // matrix to reduce
+    // CUDA stream and launch parameters:
+    cudaStream_t stream,
+    int32_t gridsz,
+    int32_t blocksz
 )
-{
-
-    // FIXME: use the stream pool
-    cudaStream_t stream ;
-    CHECK_CUDA (cudaStreamCreate (&stream)) ;
+{
 
     //--------------------------------------------------------------------------
-    // reduce C to a scalar
+    // encodify the problem
    //--------------------------------------------------------------------------
 
-    // FIXME: check error conditions (out of memory, etc)
-    GB_cuda_reduce_factory myreducefactory ;
-    myreducefactory.reduce_factory (monoid, A) ;
+    GB_jit_encoding encoding ;
+    char *suffix ;
+    uint64_t hash = GB_encodify_reduce (&encoding, &suffix,
+        GB_JIT_CUDA_KERNEL_REDUCE, monoid, A) ;
+
+    // FIXME: could get has_cheeseburger here, and allocate zscalar
+    // and V accordingly.
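
[Annotation, not part of the patch: a worked example of the two result kinds. With the launch geometry used by GB_cuda_reduce_to_scalar above (threads_per_block = 320 and work_per_thread = 256, so 320*256 = 81,920 entries per threadblock), a matrix with anvals = 10,000,000 entries held gets gridsz = ceil (10,000,000 / 81,920) = 123 threadblocks. If has_cheeseburger is true the kernel reduces A all the way to the scalar z; otherwise it writes one partial result per threadblock into the 123-by-1 full matrix V, and GB_reduce_to_scalar on the CPU finishes the reduction recursively.]
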
- // FIXME: get GrB_Info result from GB_cuda_reduce - GB_cuda_reduce (myreducefactory, A, s, V_handle, monoid, stream) ; + //-------------------------------------------------------------------------- + // get the kernel function pointer, loading or compiling it if needed + //-------------------------------------------------------------------------- + + void *dl_function ; + GrB_Info info = GB_jitifyer_load (&dl_function, + GB_jit_reduce_family, "cuda_reduce", + hash, &encoding, suffix, NULL, monoid, + NULL, A->type, NULL, NULL) ; + if (info != GrB_SUCCESS) return (info) ; - CHECK_CUDA (cudaStreamSynchronize (stream)) ; - CHECK_CUDA (cudaStreamDestroy (stream)) ; + //-------------------------------------------------------------------------- + // call the jit kernel and return result + //-------------------------------------------------------------------------- - return (GrB_SUCCESS) ; + GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ; + return (GB_jit_kernel (z, V, A, stream, gridsz, blocksz)) ; } diff --git a/GraphBLAS/CUDA/GB_cuda_type_bits.c b/GraphBLAS/CUDA/GB_cuda_type_bits.c deleted file mode 100644 index 17de151f3f..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_type_bits.c +++ /dev/null @@ -1,35 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_type_bits -//------------------------------------------------------------------------------ - -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#undef GBCUDA_CPLUSPLUS -#include "GB.h" - -size_t GB_cuda_type_bits (GB_Type_code); - -size_t GB_cuda_type_bits (GB_Type_code type_code) -{ - switch (type_code) - { - case GB_BOOL_code : return (8) ; - case GB_INT8_code : return (8) ; - case GB_INT16_code : return (16) ; - case GB_INT32_code : return (32) ; - case GB_INT64_code : return (64) ; - case GB_UINT8_code : return (8) ; - case GB_UINT16_code : return (16) ; - case GB_UINT32_code : return (32) ; - case GB_UINT64_code : return (64) ; - case GB_FP32_code : return (32) ; - case GB_FP64_code : return (64) ; -// case GB_FC32_code : return (64) ; -// case GB_FC64_code : return (128) ; - default : return (0) ; - } -} - diff --git a/GraphBLAS/CUDA/GB_cuda_type_branch.cpp b/GraphBLAS/CUDA/GB_cuda_type_branch.cpp index ba268b2a33..1debd0eb4b 100644 --- a/GraphBLAS/CUDA/GB_cuda_type_branch.cpp +++ b/GraphBLAS/CUDA/GB_cuda_type_branch.cpp @@ -2,7 +2,7 @@ // GraphBLAS/CUDA/GB_cuda_type_branch: decide if GPU can be used on a type //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -12,9 +12,11 @@ // bytes or less. If user-defined type has a different size, it cannot be done // on the GPU. +// FIXME: get the CUDA kernels to work on large types + // All built-in types pass this rule. 
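
[Annotation, not part of the patch: a minimal sketch of the size rule that GB_cuda_type_branch enforces, per the comments above. The helper name and the bound parameter are hypothetical; the actual byte bound is defined in this file outside the hunk shown here.]

// illustrative sketch only: type_fits_gpu and max_bytes are hypothetical
static bool type_fits_gpu (GrB_Type type, size_t max_bytes)
{
    // all built-in types pass this rule; a user-defined type passes only
    // if its size does not exceed the bound the CUDA kernels support
    return (type->code != GB_UDT_code || type->size <= max_bytes) ;
}
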
-#include "GB_cuda.h" +#include "GB_cuda.hpp" bool GB_cuda_type_branch // return true if the type is OK on GPU ( diff --git a/GraphBLAS/CUDA/GB_cuda_upscale_identity.cpp b/GraphBLAS/CUDA/GB_cuda_upscale_identity.cpp index d7d5ec8a9e..c034f4b4ad 100644 --- a/GraphBLAS/CUDA/GB_cuda_upscale_identity.cpp +++ b/GraphBLAS/CUDA/GB_cuda_upscale_identity.cpp @@ -2,7 +2,7 @@ // GraphBLAS/CUDA/GB_cuda_upscale_identity: return identity, >= 16 bits in size //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -11,7 +11,7 @@ // for 2-byte values. This method initializes the identity value of a monoid, // scaling up the 1-byte and 2-byte cases to 4-bytes. -#include "GB_cuda.h" +#include "GB_cuda.hpp" extern "C" { #include "GB_binop.h" diff --git a/GraphBLAS/CUDA/GB_cuda_warmup.cu b/GraphBLAS/CUDA/GB_cuda_warmup.cu index 4b8016e59c..8a7322b9f6 100644 --- a/GraphBLAS/CUDA/GB_cuda_warmup.cu +++ b/GraphBLAS/CUDA/GB_cuda_warmup.cu @@ -2,50 +2,55 @@ // GraphBLAS/CUDA/GB_cuda_warmup.cu: warmup the GPU //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ -#include "GB_cuda.h" +#include "GB_cuda.hpp" bool GB_cuda_warmup (int device) { - // allocate 'nothing' just to load the drivers. - // No need to free the result. - bool ok = GB_cuda_set_device( device ); - if (!ok) + + //-------------------------------------------------------------------------- + // set the device + //-------------------------------------------------------------------------- + + if (!GB_cuda_set_device (device)) { - printf ("invalid GPU: %d\n", device) ; + // invalid device return (false) ; } - double gpu_memory_size = GB_Global_gpu_memorysize_get (device); + // FIXME: why do we need this? + double gpu_memory_size = GB_Global_gpu_memorysize_get (device) ; + + //-------------------------------------------------------------------------- + // allocate two small blocks just to load the drivers + //-------------------------------------------------------------------------- size_t size = 0 ; void *p = GB_malloc_memory (1, 1, &size) ; if (p == NULL) { - printf ("Hey!! where's da memory???\n") ; + // no memory on the device return (false) ; } -// printf ("oooo nice block of memory of size %lu\n", size) ; - GB_free_memory ( &p, size) ; -// printf ("be free, block of memory of size %lu\n", size) ; + GB_free_memory (&p, size) ; -// printf ("good ol' cudaMalloc just to be sure\n"); - cudaMalloc ( &p, size ) ; + cudaMalloc (&p, size ) ; if (p == NULL) { - printf ("Hey!! where's da GPU???\n") ; + // no memory on the device return (false) ; } cudaFree (p) ; -// printf ("GPU %d nice and toasty now\n", device) ; - - // TODO check for jit cache? or in GB_init? 
+ //-------------------------------------------------------------------------- + // return result + //-------------------------------------------------------------------------- - return true; //(err == cudaSuccess) ; + return (true) ; } diff --git a/GraphBLAS/CUDA/GraphBLAS_cuda.h b/GraphBLAS/CUDA/GraphBLAS_cuda.hpp similarity index 87% rename from GraphBLAS/CUDA/GraphBLAS_cuda.h rename to GraphBLAS/CUDA/GraphBLAS_cuda.hpp index 4bb6872dc4..c3968e37a7 100644 --- a/GraphBLAS/CUDA/GraphBLAS_cuda.h +++ b/GraphBLAS/CUDA/GraphBLAS_cuda.hpp @@ -1,8 +1,8 @@ //------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GraphBLAS_cuda.h +// GraphBLAS/CUDA/GraphBLAS_cuda.hpp //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_dense_phase1.cuh b/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_dense_phase1.cuh deleted file mode 100644 index 2d971a5ecc..0000000000 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_dense_phase1.cuh +++ /dev/null @@ -1,166 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_dense_phase1.cuh -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// phase1 for dot3, A and B are bitmap/full -// dense phase1: symbolic load balancing and data partition -// to assign work to different 'buckets' for later compute - -// This kernel scans the non-zero pattern in A and B, takes into account the -// mask and computes total work required to form C. Then it classifies each -// dot product into a set of buckets for efficient compute. - -#pragma once - -#include -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include "GB_cuda_buckets.h" -#include -#include - -using namespace cooperative_groups; - -//------------------------------------------------------------------------------ -// GB_jit_AxB_dot3_dense_phase1: lookup i,j pairs and store in Mi, Ci -//------------------------------------------------------------------------------ - -// GB_AxB_dense_phase1 is a CUDA kernel that scans all entries in M and -// assigns i,j coordinates for each entries and stores in Mi and Ci. 
- -template -__global__ void GB_jit_AxB_dot3_dense_phase1 -( - // input/output: - GrB_Matrix C, // final output matrix - const GrB_Matrix M // mask matrix -) -{ - - //-------------------------------------------------------------------------- - // get C, M, A, and B - //-------------------------------------------------------------------------- - - const int64_t *__restrict__ Mp = M->p ; - const int64_t *__restrict__ Mi = M->i ; - #if !GB_MASK_STRUCT - const GB_M_TYPE *__restrict__ Mx = (GB_M_TYPE *) M->x ; - #endif - const int64_t mnvec = M->nvec ; - const int64_t mvlen = M->vlen ; -// const int64_t mnz = GB_nnz(M) ; - const GB_M_NVALS (mnz) ; - - int64_t *__restrict__ Ci = C->i ; // for zombies, or bucket assignment - - // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a - // zombie, or (k << 4) + bucket otherwise, where C(:,j) is the kth vector - // of C (j = Ch [k] if hypersparse or j = k if standard sparse), and - // where bucket is the bucket assignment for C(i,j). - // bucket can be recovered from Ci by bucket = Ci & 0xF - - // ASSERT (mnz > 0) ; - // ASSERT (gridDim.x <= mnz) ; - - // shared cache used for coordinate search - __shared__ int64_t ks [chunk_size] ; - - //-------------------------------------------------------------------------- - // assign all entries of C to the buckets - //-------------------------------------------------------------------------- - - // all threads in this block will compute the same values for these: - int64_t pfirst, plast, kfirst, klast ; - - int64_t chunk_max = GB_ICEIL (mnz, chunk_size) ; - // (mnz + chunk_size -1)/chunk_size; - for ( int64_t chunk = blockIdx.x; - chunk < chunk_max; - chunk += gridDim.x ) - { - - //---------------------------------------------------------------------- - // determine the work done by this iteration, "chunk" - //---------------------------------------------------------------------- - - // The slice for each task contains entries pfirst:plast-1 of M and C. - // This iteration "chunk" computes Ci and Cx [pfirst...plast-1], using - // Mi and Mx [pfirst:plast-1]. All threads in the thread block are - // used for this "chunk". - pfirst = chunk_size * chunk ; - plast = pfirst + chunk_size ; - // plast = GB_IMIN (plast, mnz) ; - if (plast > mnz) plast = mnz ; - int64_t my_chunk_size = plast - pfirst ; - - // find the first vector of the slice for this chunk: the - // vector that owns the entry Mi [pfirst] and Mx [pfirst]. - kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec, mvlen) ; - - // find the last vector of the slice for task blockIdx.x: the - // vector that owns the entry Mi [plast-1] and Mx [plast-1]. - klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec, mvlen); - - // number of vectors in C and M for this "chunk" iteration, where - // Mp [kfirst:klast] will be operated on. 
- int64_t nk = klast - kfirst + 1 ; - - //---------------------------------------------------------------------- - // fill ks to find all indices - //---------------------------------------------------------------------- - - // search for k values for each entry pfirst:plast-1 - float slope = ((float) nk) / ((float) my_chunk_size) ; - int64_t mnvec1 = mnvec - 1 ; - for (int64_t kk = threadIdx.x ; kk < my_chunk_size ; kk += blockDim.x) - { - // get a rough estimate of k for the kkth entry in ks - int64_t k = kfirst + (int64_t) (slope * ((float) kk)) ; - // k cannot be smaller than kfirst, but might be bigger than - // mnvec-1, so ensure it is in the valid range, kfirst to mnvec-1 - // k = GB_IMIN (k, mnvec-1) ; - if (k > mnvec1) k = mnvec1 ; - // look for p in Mp, where p is in range pfirst:plast-1 - // where pfirst >= 0 and plast < mnz - int64_t p = kk + pfirst ; - // linear-time search for the k value of the pth entry - while ( Mp [ k + 1 ] <= p ) k++ ; - while ( Mp [ k ] > p ) k-- ; - ks [kk] = k ; - } - this_thread_block().sync(); - - //---------------------------------------------------------------------- - // assign entries in C(i,j) to the buckets - //---------------------------------------------------------------------- - - for ( int64_t pM = pfirst + threadIdx.x; - pM < pfirst + my_chunk_size; - pM += blockDim.x ) - { - int64_t k = ks [pM - pfirst] ; // get the k value of Mi,Mx [pM]. - // j = k or j = Mh [k] if C and M are hypersparse, but j is not - // needed here. - - #if GB_MASK_STRUCT - { - // no need to check the value of M(i,j); no prezombies - Ci[pM] = (k << 4) ; - } - #else - { - bool mij = (bool) GB_MCAST (Mx,pM,) ; - int64_t i = Mi [ pM ] ; - // FIXME: no need for k<<4, just place k or GB_FLIP(i) in Ci - Ci[pM] = (!mij) * ( GB_FLIP(i) << 4) - + mij * ((k<<4) ) ; - } - #endif - } - } -} - diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh b/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh deleted file mode 100644 index 80a4e2020b..0000000000 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh +++ /dev/null @@ -1,255 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// This CUDA kernel produces the semiring product of two -// dense matrices of types T_A and T_B and common index space size n, to a -// output matrix of type T_C. The matrices are dense, with uniform -// non-zeros and sparsity patterns. -// ie. we want to produce C = A'*B in the sense of the given semi-ring. - -// This version uses a simple warp-based dense dot product algorithm, when the -// vectors coming from both A and B are dense, for any size of N. - -// Both the grid and block are 1D, so blockDim.x is the # threads in a -// threadblock, and the # of threadblocks is grid.x - -// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number -// of active threads = min( min(nzA, nzB), 32) - -// Thus, threadblock b owns a semi-ring dot product on a pair of vectors. -// The work is to load the data, do the multiply and add work and finally -// reduce this data to a scalar, and write it to Cx[pair]. 
- -// int64_t start <- start of vector pairs for this kernel -// int64_t end <- end of vector pairs for this kernel -// int64_t *Bucket <- array of pair indices for all kernels -// GrB_Matrix C <- result matrix -// GrB_Matrix M <- mask matrix -// GrB_Matrix A <- input matrix A -// GrB_Matrix B <- input matrix B -// int sz <- size parameter (not used) - -/* FIXME: This kernel needs to be split into 4 methods: - - (A bitmap) * (B bitmap) - (A full ) * (B bitmap) - (A bitmap) * (B full) - (A full) * (B full) - - The buckets are not needed at all. A single pass can be done. - C and M would still be sparse or hypersparse. - - See also denseDotProduct.cu. -*/ - -#pragma once -#include -#include -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include - -// Using tile size fixed at compile time, we don't need shared memory -#define tile_sz 32 - -using namespace cooperative_groups; - -//------------------------------------------------------------------------------ -// warp_ReduceSum -//------------------------------------------------------------------------------ - -template< typename T_Z, int warp_sz> -__inline__ __device__ T_Z warp_ReduceSum(thread_block_tile g, T_Z val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - // FIXME: only works if sizeof(T_Z) <= 32 bytes - // FIXME: the ANY monoid needs the cij_exists for each thread - for (int i = g.size() / 2; i > 0; i /= 2) - { - T_Z next = g.shfl_down( val, i) ; - GB_ADD( val, val, next ); - } - return val; // note: only thread 0 will return full sum -} - -//------------------------------------------------------------------------------ -// AxB_dot3_phase3_dndn -//------------------------------------------------------------------------------ - -template< - typename T_C, typename T_A, typename T_B, - typename T_Z, typename T_X, typename T_Y, - uint64_t srcode> -__global__ void AxB_dot3_phase3_dndn -( - GrB_Matrix C, - GrB_Matrix M, - GrB_Matrix A, - GrB_Matrix B -) -{ - // TODO: Figure out how to use graphblas-specific INFINITY macro - #ifndef INFINITY - #define INFINITY std::numeric_limits::max() - #endif - - const T_A *__restrict__ Ax = (T_A *)A->x ; - const T_B *__restrict__ Bx = (T_B *)B->x ; - T_C *__restrict__ Cx = (T_C *)C->x ; - int64_t *__restrict__ Ci = C->i ; - const int64_t *__restrict__ Mi = M->i ; - #if GB_M_IS_HYPER - const int64_t *__restrict__ Mh = M->h ; - #endif - // A and B are either bitmap or full - #if GB_A_IS_BITMAP - const int8_t *__restrict__ Ab = A->b ; - #endif - #if GB_B_IS_BITMAP - const int8_t *__restrict__ Bb = B->b ; - #endif - - // zombie count - int64_t zc = 0; - - int64_t start = 0; - int64_t end = M->p[M->nvec]; - - // total items to be inspected - int64_t nnzA = A->vlen; - int64_t nnzB = B->vlen; - int s = blockDim.x; - - // Main loop over pairs - for ( int64_t pair_id = start + blockIdx.x; //warp per pair - pair_id < end; - pair_id += gridDim.x ) - { - - // get M(i,j) and C(i,j) - int64_t i = Mi[pair_id]; - int64_t kk = Ci[pair_id] >> 4; // FIXME: can remove ">> 4" - bool cij_exists = false ; - GB_DECLARE_IDENTITY (cij) ; // GB_Z_TYPE cij = identity - - // skip if C(i,j) is a prezombie - if (kk >= 0) - { - - // j = kk or j = Mh [kk] if C and M are hypersparse - int64_t j = GBH_M (Mh, kk) ; - - int64_t pA = (A->vlen)*i; - int64_t pA_end = pA +(A->vlen); - - int64_t pB = (B->vlen)*j; - int64_t pB_end = pB +(B->vlen); - - // if (threadIdx.x == 0 ){ - // printf("tid=%d, i,j = %d,%d nnzA= %d, nnzB=%d\n", - // threadIdx.x, 
(int)i,(int)j, (int)nnzA, (int)nnzB); - // } - // __syncthreads(); - - // convert global data pointer to the local pointer of this block - GB_DECLAREA (aki) ; - GB_DECLAREB (bkj) ; - - #if GB_A_IS_FULL && GB_B_IS_FULL - { - cij_exists = true ; - for (int64_t k = threadIdx.x ; k < nnzA ; k += s) - { - // cij += A(k,i) * B(k,j) - GB_GETA (aki, Ax, pA+k, ) ; // aki = A(k,i) - GB_GETB (bkj, Bx, pB+k, ) ; // bkj = B(k,j) - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj - } - } - #elif GB_A_IS_BITMAP && GB_B_IS_BITMAP - { - for ( int64_t k = threadIdx.x ; k < nnzA ; k += s) - { - GB_GETA (aki, Ax, pA+k, ) ; // aki = A(k,i) - GB_GETB (bkj, Bx, pB+k, ) ; // bkj = B(k,j) - int8_t b = (Ab [pA+k] && Bb [pB+k]) ; - cij_exists |= b ; - if (b) - { - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj - } - } - } - #elif GB_A_IS_FULL && GB_B_IS_BITMAP - { - for ( int64_t k = threadIdx.x ; k < nnzA ; k += s) - { - if (Bb [pB+k]) - { - GB_GETA (aki, Ax, pA+k, ) ; // aki = A(k,i) - GB_GETB (bkj, Bx, pB+k, ) ; // bkj = B(k,j) - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj - cij_exists = true ; - } - } - } - #elif GB_A_IS_BITMAP && GB_B_IS_FULL - { - for ( int64_t k = threadIdx.x ; k < nnzA ; k += s) - { - if (Ab [pB+k]) - { - GB_GETA (aki, Ax, pA+k, ) ; // aki = A(k,i) - GB_GETB (bkj, Bx, pB+k, ) ; // bkj = B(k,j) - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj - cij_exists = true ; - } - } - } - #endif - } - - //---------------------------------------------------------------------- - // reduce per-thread sums to a single scalar - //---------------------------------------------------------------------- - - // Do vote here for control. - thread_block_tile<32> tile = tiled_partition<32>( this_thread_block() ); - cij_exists = tile.any( cij_exists); - tile.sync(); - - #if !GB_C_ISO - // FIXME: the ANY monoid needs the cij_exists for each thread - cij = warp_ReduceSum ( tile, cij); - #endif - - // write result for this block to global mem - if (threadIdx.x == 0) - { - if (cij_exists) - { - GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (T_C) cij - Ci [pair_id] = i ; - } - else - { - // cij is a zombie - zc++; - Ci [pair_id] = GB_FLIP (i) ; - } - } - //__syncthreads ( ) ; - - if( threadIdx.x ==0 && zc > 0) - { - GB_cuda_atomic_add ( &(C->nzombies), zc) ; - } - } -} - diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_phase2.cuh b/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_phase2.cuh deleted file mode 100644 index 3d9f7d39cb..0000000000 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_phase2.cuh +++ /dev/null @@ -1,193 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_GB_AxB_phase2.cuh -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ -// fill the global buckets -//------------------------------------------------------------------------------ - -#pragma once -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include "GB_cuda_buckets.h" -#include -#include -#include - -using namespace cooperative_groups; - -//------------------------------------------------------------------------------ -// BlockPrefixCallbackOp -//------------------------------------------------------------------------------ - -// A stateful callback functor that maintains a running prefix to be applied -// during consecutive scan operations. 
-struct BlockPrefixCallbackOp -{ - // Running prefix - int64_t running_total; - // Constructor - __device__ BlockPrefixCallbackOp(int64_t running_total) : running_total(running_total) {} - - // Callback operator to be entered by the first warp of threads in the block. - // Thread-0 is responsible for returning a value for seeding the block-wide scan. - __device__ int64_t operator()(int64_t block_aggregate) - { - int64_t old_prefix = running_total; - running_total += block_aggregate; - return old_prefix; - } -}; - -//------------------------------------------------------------------------------ -// blockBucketExclusiveSum -//------------------------------------------------------------------------------ - -__inline__ -__device__ void blockBucketExclusiveSum(int bucketId, int64_t *d_data, int nblocks) -{ - #define blocksize 32 - - // Specialize BlockScan for a 1D block of 32 threads - typedef cub::BlockScan BlockScan; - - // Allocate shared memory for BlockScan - __shared__ typename BlockScan::TempStorage temp_storage; - - // Initialize running total - BlockPrefixCallbackOp prefix_op(0); - - // Have the block iterate over segments of items - int64_t data=0; - - int64_t *blockbucket= d_data; - - for (int block_id = 0; block_id < nblocks; block_id += blocksize) - { - // Load a segment of consecutive items that are blocked across threads - - //printf("block %d entering sum\n",blockIdx.x); - int loc = block_id + threadIdx.x; - if ( loc < nblocks) - { - //printf("block %di loading tid=%d\n",block_id,tid); - data = blockbucket[bucketId*nblocks +loc ] ; - } - this_thread_block().sync(); - - //printf("bb%d_%d s0 before prefix= %ld \n", block_id,bucketId, - // blockbucket[bucketId*nblocks +loc] ) ; - // Collectively compute the block-wide exclusive prefix sum - BlockScan(temp_storage).ExclusiveSum( data, data, prefix_op); - this_thread_block().sync(); - - if ( loc < nblocks) - { - blockbucket[bucketId*nblocks +loc ] = data ; - } - //this_thread_block().sync(); - - //printf("bb%d_%d = %ld \n", block_id, bucketId, blockbucket[bucketId*nblocks +loc] ) ; - - data = 0; - } -} - -//------------------------------------------------------------------------------ -// warp_ReduceSumPlus_uint64 -//------------------------------------------------------------------------------ - -template< int tile_sz> -__inline__ __device__ uint64_t warp_ReduceSumPlus_uint64( thread_block_tile tile, uint64_t val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = tile.size() / 2; i > 0; i /= 2) { - val += tile.shfl_down( val, i); - } - return val; // note: only thread 0 will return full sum -} - -//------------------------------------------------------------------------------ -// AxB_phase2 -//------------------------------------------------------------------------------ - -// GB_AxB_cuda_dot3_phase2 is a CUDA kernel that takes as input the -// nanobuckets and blockbucket arrays computed by the first phase kernel, -// GB_AxB_cuda_dot3_phase1. The launch geometry of this kernel must match the -// GB_AxB_cuda_dot3_phase1 kernel, with the same # of threads and threadblocks. 
- -__global__ void AxB_phase2 // FIXME rename -( - // input, not modified: - int64_t *__restrict__ blockbucket, // global bucket count, of size NBUCKETS*nblocks - // output: - int64_t *__restrict__ offset, // global offsets, for each bucket - // inputs, not modified: - const int nblocks // input number of blocks to reduce across, ie size of vector for 1 bucket -) -{ - - //-------------------------------------------------------------------------- - // sum up the bucket counts of prior threadblocks - //-------------------------------------------------------------------------- - - // blockbucket is an array of size NBUCKETS-by-nblocks, held by row. The - // entry blockbucket [bucket * nblocks + t] holds the # of entries - // in the bucket (in range 0 to NBUCKETS-1) found by threadblock t. - - //__shared__ uint64_t offset [NBUCKETS] ; - uint64_t s[NBUCKETS]; - - #pragma unroll - for(int b = 0; b < NBUCKETS; ++b){ - s[b] = 0; - } - - thread_block_tile<32> tile = tiled_partition<32>(this_thread_block() ); - - //printf("block %d,dim %d entering sum %d nblocks\n",blockIdx.x, blockDim.x, nblocks); - int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; - - - #pragma unroll - for(int b = 0; b < NBUCKETS; ++b) { - for( tid = threadIdx.x + blockIdx.x * blockDim.x; - tid < nblocks; - tid += blockDim.x*gridDim.x) { - s[b] += blockbucket[ b * nblocks +tid] ; - } - this_thread_block().sync(); - - s[b] = warp_ReduceSumPlus_uint64<32>( tile, s[b]); - } - - if (threadIdx.x ==0 ) - { - #pragma unroll - for(int b = 0; b < NBUCKETS; ++b) { - atomicAdd( (unsigned long long int*)&(offset[b]), s[b]); - } - } - this_thread_block().sync(); - - if( gridDim.x >= NBUCKETS) - { - // Cumulative sum across blocks for each bucket - if (blockIdx.x -//#include -//using namespace cooperative_groups; - -__global__ -void AxB_phase2end - ( - // input, not modified: - const int64_t *__restrict__ nanobuckets, // array of size NBUCKETS-blockDim.x-by-nblocks - const int64_t *__restrict__ blockbucket, // global bucket count, of size NBUCKETS*nblocks - // output: - const int64_t *__restrict__ bucketp, // global bucket cumsum, of size NBUCKETS+1 - int64_t *__restrict__ bucket, // global buckets, of size cnz (== mnz) - const int64_t *__restrict__ offset, // global offsets, for each bucket - // inputs, not modified: - const GrB_Matrix C, // output matrix - const int64_t cnz // number of entries in C and M - ) -{ - - //-------------------------------------------------------------------------- - // get C information - //-------------------------------------------------------------------------- - - // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a - // zombie, or (k << 4) + bucket otherwise, where C(:,j) is the kth vector - // of C (j = Ch [k] if hypersparse or j = k if standard sparse), and - // where bucket is the bucket assignment for C(i,j). This phase does not - // need k, just the bucket for each entry C(i,j). - - int64_t *__restrict__ Ci = C->i ; // for zombies, or bucket assignment - //int64_t *Mp = C->p ; // for offset calculations - //int64_t mnvec = C->nvec; - - //-------------------------------------------------------------------------- - // load and shift the nanobuckets for this thread block - //-------------------------------------------------------------------------- - - // The taskbucket for this threadblock is an array of size - // NBUCKETS-by-blockDim.x, held by row. It forms a 2D array within the 3D - // nanobuckets array. 
- const int64_t *taskbucket = nanobuckets + blockIdx.x * (NBUCKETS * blockDim.x) ; - - //printf("block%d thd%d blockbucket= %ld\n", blockIdx.x, threadIdx.x, - // blockbucket[blockIdx.x*gridDim.x+blockIdx.x]); - - // Each thread in this threadblock owns one column of this taskbucket, for - // its set of NBUCKETS nanobuckets. The nanobuckets are a column of length NBUCKETS, - // with stride equal to blockDim.x. - const int64_t *nanobucket = taskbucket + threadIdx.x; - - // Each thread loads its NBUCKETS nanobucket values into registers. - int64_t my_bucket[NBUCKETS]; - - #pragma unroll - for(int b = 0; b < NBUCKETS; ++b) { - my_bucket[b] = nanobucket [b * blockDim.x] - + blockbucket [b * gridDim.x + blockIdx.x] - + bucketp [b] ; - - //if(b==3) printf("blk:%d tid: %d my_buck[%d]=%lu \n", blockIdx.x, threadIdx.x, b, my_bucket[b]); - } - - // Now each thread has an index into the global set of NBUCKETS buckets, - // held in bucket, of where to place its own entries. - - //-------------------------------------------------------------------------- - // construct the global buckets - //-------------------------------------------------------------------------- - - // The slice for task blockIdx.x contains entries pfirst:plast-1 of M and - // C, which is the part of C operated on by this threadblock. - int64_t pfirst, plast ; - - __shared__ int64_t bucket_idx[chunksize]; - //__shared__ int64_t bucket_s[NBUCKETS][chunksize]; - - int chunk_max= (cnz + chunksize -1)/chunksize; - for ( int chunk = blockIdx.x; - chunk < chunk_max; - chunk += gridDim.x ) - { - - pfirst = chunksize * chunk ; - plast = GB_IMIN( chunksize * (chunk+1), cnz ) ; - - for ( int64_t p = pfirst + threadIdx.x; p < plast ; p += blockDim.x ) - { - // get the entry C(i,j), and extract its bucket. Then - // place the entry C(i,j) in the global bucket it belongs to. - int tid = p - pfirst; - - // TODO: these writes to global are not coalesced. Instead: each - // threadblock could buffer its writes to NBUCKETS buffers and when the - // buffers are full they can be written to global. 
- int ibucket = Ci[p] & 0xF; - //printf(" thd: %d p,Ci[p] = %ld,%ld,%d\n", threadIdx.x, p, Ci[p], irow ); - - //bucket[my_bucket[ibucket]++] = p; - //int idx = (my_bucket[ibucket] - pfirst); - //my_bucket[ibucket] += 1; //blockDim.x; - //int idx = (my_bucket[ibucket]++ - pfirst) & 0x7F; - //bucket_s[ibucket][ idx ] = p; - bucket_idx[tid] = my_bucket[ibucket]++; - Ci[p] = (ibucket==0) * (Ci[p] >> 4) + (ibucket > 0)* Ci[p]; - //if(ibucket == 0) { - //// bucket[my_bucket[0]++] = p; - // Ci[p] = Ci[p] >> 4; - //} else { - // bucket[my_bucket[ibucket]++] = p; - //} - } - - for ( int64_t p = pfirst + threadIdx.x; p < plast ; p+= blockDim.x ) - { - int tid = p - pfirst; - //int ibucket = Ci[p] & 0xF; - //bucket[ p ] = bucket_s[ibucket][tid]; - bucket [ bucket_idx[tid] ] = p; - //printf("ibucket = %d tid=%d p=%lu idx = %lu val = %lu \n",ibucket, threadIdx.x,p, tid, bucket_s[ibucket][tid]); - //printf("ibucket = %d tid=%d p=%lu idx = %lu \n",ibucket, threadIdx.x, p, bucket_idx[tid]); - } - } -} - diff --git a/GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_AxB_dot3.cu b/GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_AxB_dot3.cu new file mode 100644 index 0000000000..f515ef2177 --- /dev/null +++ b/GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_AxB_dot3.cu @@ -0,0 +1,552 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_AxB_dot3.cu +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// GB_jit_kernel_cuda_AxB_dot3: C=A'*B using the dot3 method on the GPU. 
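
[Annotation, not part of the patch: for orientation, the dot3 method computes C<M>=A'*B only at positions where the mask M has entries, so each C(i,j) is the dot product of A(:,i) with B(:,j). A minimal serial sketch of those semantics for the dense-dense case follows, assuming full column-major A and B, a sparse CSC mask (Mp, Mi), and the PLUS_TIMES semiring over double; the function name is hypothetical.]

// serial reference for the dense-dense dot3 semantics (illustrative only)
static void dot3_reference (double *Cx, const int64_t *Mp, const int64_t *Mi,
    int64_t mnvec, const double *Ax, const double *Bx, int64_t vlen)
{
    for (int64_t j = 0 ; j < mnvec ; j++)               // each vector M(:,j)
    {
        for (int64_t p = Mp [j] ; p < Mp [j+1] ; p++)   // each entry M(i,j)
        {
            int64_t i = Mi [p] ;
            double cij = 0 ;                        // PLUS monoid identity
            for (int64_t k = 0 ; k < vlen ; k++)    // cij = A(:,i)'*B(:,j)
            {
                cij += Ax [i*vlen + k] * Bx [j*vlen + k] ;
            }
            Cx [p] = cij ;                          // C has the pattern of M
        }
    }
}
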
+ +#define GB_FREE_ALL ; + +#if GB_C_ISO +// FIXME +#error "kernel undefined for C iso" +#endif + +// FIXME: Figure out how to use graphblas-specific INFINITY macro +#ifndef INFINITY +#define INFINITY std::numeric_limits::max() +#endif + +//------------------------------------------------------------------------------ +// kernel launch geometry +//------------------------------------------------------------------------------ + +// FIXME: some duplicates here +#define chunk_size 128 +#define log2_chunk_size 7 +#define tile_sz 32 +#define shared_vector_size 128 +#define blocksize 32 +#define threads_per_block 32 + +//------------------------------------------------------------------------------ +// operators +//------------------------------------------------------------------------------ + +#if GB_C_ISO + + #define GB_DOT_TERMINAL( c ) break + #define GB_DOT_MERGE(pA,pB) \ + { \ + cij_exists = true ; \ + } + #define GB_CIJ_EXIST_POSTCHECK + +#else + + #define GB_DOT_TERMINAL( c ) GB_IF_TERMINAL_BREAK ( c, zterminal ) + + #if GB_IS_PLUS_PAIR_REAL_SEMIRING + + // cij += A(k,i) * B(k,j), for merge operation (plus_pair_real semiring) + #if GB_Z_IGNORE_OVERFLOW + // plus_pair for int64, uint64, float, or double + #define GB_DOT_MERGE(pA,pB) cij++ ; + #define GB_CIJ_EXIST_POSTCHECK cij_exists = (cij != 0) ; + #else + // plus_pair semiring for small integers + #define GB_DOT_MERGE(pA,pB) \ + { \ + cij_exists = true ; \ + cij++ ; \ + } + #define GB_CIJ_EXIST_POSTCHECK + #endif + + #else + + // cij += A(k,i) * B(k,j), for merge operation (general case) + #define GB_DOT_MERGE(pA,pB) \ + { \ + GB_GETA ( aki, Ax, pA, ) ; /* aki = A(k,i) */ \ + GB_GETB ( bkj, Bx, pB, ) ; /* bkj = B(k,j) */ \ + cij_exists = true ; \ + GB_MULTADD ( cij, aki, bkj, i, k, j ) ; /* cij += aki * bkj */ \ + } + #define GB_CIJ_EXIST_POSTCHECK + + #endif + +#endif + +//------------------------------------------------------------------------------ +// dot3 buckets +//------------------------------------------------------------------------------ + +#define NBUCKETS 3 + +// NBUCKETS buckets: computed by up to NBUCKETS-1 kernel launches (zombies need +// no work...), each using different kernels (with different configurations +// depending on the bucket). 
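
[Annotation, not part of the patch: the bucket assignment travels in Ci itself. As the phase1 kernels removed above describe, a non-zombie entry C(i,j) stores (k << 4) + bucket in Ci [p], where C(:,j) is the kth vector of C, and a zombie stores GB_FLIP (i). A decode sketch, with hypothetical helper names:]

// decode the phase1 encoding of Ci (illustrative only)
static inline int     GB_bucket_of (int64_t ci) { return (int) (ci & 0xF) ; }
static inline int64_t GB_kvec_of   (int64_t ci) { return (ci >> 4) ; }
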
+ +// dot3: C=A'B, M is sparse or hyper, C is sparse or hyper +// 32 kernels A,B: (hyper,sparse,bitmap,full)^2 x (M and C are sparse/hyper) + +typedef enum +{ + GB_BUCKET_ZOMBIE = 0, // C(i,j) is a zombie (not a bucket) + // both A and B are sparse/hyper: + GB_BUCKET_VSVS = 1, // vsvs: both A(:,i) and B(:,j) are very sparse + GB_BUCKET_MERGEPATH = 2, // mp: use the merge-path method + // A is sparse/hyper and B is bitmap/full, or + // A is bitmap/full and B is sparse/hyper + GB_BUCKET_VSDN = 1, // vsdn: the sparse vector is very sparse + GB_BUCKET_SPDN = 2, // spdn: sparse vector has lots of entries; + // use a whole warp for each dot product +} +GB_bucket_code ; // FIXME: rename GB_dot3_bucket_code + +// These may use another bucket enum: + + // two full/(sparse,hyper) kernels: + // // CUDA kernel: spdn, handles 4 buckets: + // // A(:,i) is dense and B(:,j) is very sparse (< 256 entries) + // GB_BUCKET_DNVS = 2, + // // A(:,i) is dense and B(:,j) is sparse (>= 256 entries) + // GB_BUCKET_DNSP = 3, + + // a sparse/full kernel + // // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense + // GB_BUCKET_VSDN = 4, + // // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense + // GB_BUCKET_SPDN = 5, + + // a sparse/bitmap kernel + // a bitmap/bitmap kernel + // a bitmap/sparse kernel + // ... + +#include "GB_cuda_shfl_down.cuh" + +//------------------------------------------------------------------------------ +// CUDA device kernels for each case +//------------------------------------------------------------------------------ + +#include "GB_cuda_ek_slice.cuh" + +#if ((GB_A_IS_BITMAP || GB_A_IS_FULL) && (GB_B_IS_BITMAP || GB_B_IS_FULL)) + // dense-dense + #include "GB_cuda_jit_AxB_dot3_dense_phase1.cuh" + #include "GB_cuda_jit_AxB_dot3_phase3_dndn.cuh" +#else + // sparse-sparse, sparse-dense, or dense-sparse + + #undef GB_FREE_ALL + #define GB_FREE_ALL \ + { \ + GB_FREE_WORK (&Nanobuckets, Nb_size) ; \ + GB_FREE_WORK (&Blockbucket, Bb_size) ; \ + GB_FREE_WORK (&Bucketp, Bup_size) ; \ + GB_FREE_WORK (&offset, O_size) ; \ + GB_FREE_WORK (&Bucket, Bu_size) ; \ + } + + #include "GB_cuda_jit_AxB_dot3_phase1.cuh" + #include "GB_cuda_jit_AxB_dot3_phase2.cuh" + #include "GB_cuda_jit_AxB_dot3_phase2end.cuh" + #if ((GB_A_IS_SPARSE || GB_A_IS_HYPER) && \ + (GB_B_IS_SPARSE || GB_B_IS_HYPER)) + // sparse-sparse + #include "GB_cuda_jit_AxB_dot3_phase3_mp.cuh" + #include "GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh" + #else + // sparse-dense or dense-sparse + #include "GB_cuda_jit_AxB_dot3_phase3_spdn.cuh" + #include "GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh" + #endif +#endif + +//------------------------------------------------------------------------------ +// host function to launch the CUDA kernels for dot3 on the GPU +//------------------------------------------------------------------------------ + +// #include "GB_cuda_timer.hpp" + +extern "C" +{ + GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel) ; +} + +GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel) +{ + + // GpuTimer kernel_timer ; + + //-------------------------------------------------------------------------- + // get callback functions + //-------------------------------------------------------------------------- + + #ifdef GB_JIT_RUNTIME + // get callback functions + GB_free_memory_f GB_free_memory = my_callback->GB_free_memory_func ; + GB_malloc_memory_f GB_malloc_memory = my_callback->GB_malloc_memory_func ; + #endif + + //-------------------------------------------------------------------------- + // declare workspace + 
//--------------------------------------------------------------------------
+
+    #if ((GB_A_IS_BITMAP || GB_A_IS_FULL) && (GB_B_IS_BITMAP || GB_B_IS_FULL))
+    // the dense-dense case requires no workspace
+    #else
+    // sparse-sparse, sparse-dense, and dense-sparse require workspace
+    int64_t *Nanobuckets = NULL ; size_t Nb_size = 0 ;
+    int64_t *Blockbucket = NULL ; size_t Bb_size = 0 ;
+    int64_t *Bucket = NULL ;      size_t Bu_size = 0 ;
+    int64_t *Bucketp = NULL ;     size_t Bup_size = 0 ;
+    int64_t *offset = NULL ;      size_t O_size = 0 ;
+    #endif
+
+    //--------------------------------------------------------------------------
+    // get problem size
+    //--------------------------------------------------------------------------
+
+    const GB_M_NVALS (mnz) ;
+    int nblks_1 = (mnz + chunk_size - 1) / chunk_size ;
+    int number_of_blocks_1 = GB_IMIN (nblks_1, chunk_size * number_of_sms) ;
+
+    // most methods can use these launch geometries:
+    dim3 grid_1 (number_of_blocks_1) ;
+    dim3 block (threads_per_block) ;
+
+    //--------------------------------------------------------------------------
+    // C=A'*B via jitified kernels
+    //--------------------------------------------------------------------------
+
+    #if ((GB_A_IS_BITMAP || GB_A_IS_FULL) && (GB_B_IS_BITMAP || GB_B_IS_FULL))
+    {
+
+        //----------------------------------------------------------------------
+        // (full or bitmap) times (full or bitmap)
+        //----------------------------------------------------------------------
+
+        // full/bitmap cases, which means we don't need buckets and zombies.
+        // This is a much simpler kernel as a result; it only does the i,j
+        // lookup and stores the values in Mi and Ci.
+
+        // The idea is to have each task work on a contiguous block of columns
+        // of C.  Note: for small tests, mnz is small, so ntasks is governed by
+        // chunk_size, not chunk_size*number_of_sms.  For large problems in
+        // production, chunk_size is less important since ntasks will likely be
+        // bounded by chunk_size*number_of_sms (say 128*80 = 10,240 on a V100,
+        // for the default chunk_size of 128).
+
+        //----------------------------------------------------------------------
+        // dense case, phase 1
+        //----------------------------------------------------------------------
+
+        // kernel_timer.Start();
+        GB_cuda_AxB_dot3_dense_phase1_kernel <<<grid_1, block, 0, stream>>>
+            (C, M) ;
+
+        CUDA_OK (cudaStreamSynchronize(stream)) ;   // is this needed?
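//------------------------------------------------------------------------------
// aside: the grid-size arithmetic used by the launches above
//------------------------------------------------------------------------------

// A host-side restatement of how number_of_blocks_1 is chosen: one threadblock
// per chunk of chunk_size entries, capped at chunk_size * number_of_sms blocks
// (the 128*80 = 10,240 V100 figure quoted in the comment above).  This is a
// sketch for illustration only; the helper name is hypothetical:

#include <stdint.h>

#define CHUNK 128                                   // mirrors chunk_size above

static inline int blocks_for_phase1 (int64_t mnz, int number_of_sms)
{
    int64_t nblks = (mnz + CHUNK - 1) / CHUNK ;     // GB_ICEIL (mnz, CHUNK)
    int64_t cap = ((int64_t) CHUNK) * number_of_sms ;
    return ((int) ((nblks < cap) ? nblks : cap)) ;  // GB_IMIN (nblks, cap)
}

// e.g. mnz = 1,000,000 on an 80-SM V100: nblks = 7813, cap = 10240 -> 7813.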
+
+        // kernel_timer.Stop();
+        // printf ("(GPU phase1 %12.6g ms )\n", kernel_timer.Elapsed()) ;
+
+        //----------------------------------------------------------------------
+        // dense case, phase "3" (FIXME: rename to dense_phase2)
+        //----------------------------------------------------------------------
+
+        int work_per_thread = 8 ;
+        int blocksz = 64 ;
+        work_per_thread = 8 ;
+        if (mnz > 1024)
+        {
+            blocksz = 512 ;
+            work_per_thread = 64 ;
+        }
+        int gridsz = GB_ICEIL (mnz, work_per_thread*blocksz) ;
+        dim3 grid_2 (gridsz) ;
+
+        // kernel_timer.Start();
+
+        GB_cuda_AxB_dot3_phase3_dndn_kernel <<<grid_2, blocksz, 0, stream>>>
+            (C, M, A, B) ;
+
+    }
+    #else
+    {
+
+        //----------------------------------------------------------------------
+        // (sparse or hyper) times (sparse or hyper)
+        // (sparse or hyper) times (bitmap or full)
+        // (bitmap or full) times (sparse or hyper)
+        //----------------------------------------------------------------------
+
+        //----------------------------------------------------------------------
+        // construct the tasks for phase1 and phase2
+        //----------------------------------------------------------------------
+
+        // The # of threads in the phase1 and phase2 kernel launches are
+        // related by the size of the warp: ph2_task = ph1_task/32, for example.
+
+        int64_t blockbuckets_size = NBUCKETS * number_of_blocks_1 ;
+        int64_t nanobuckets_size = blockbuckets_size * threads_per_block ;
+
+        Nanobuckets = GB_MALLOC_WORK (nanobuckets_size, int64_t, &Nb_size) ;
+        Blockbucket = GB_MALLOC_WORK (blockbuckets_size, int64_t, &Bb_size) ;
+        Bucketp = GB_MALLOC_WORK (NBUCKETS+1, int64_t, &Bup_size) ;
+        offset = GB_MALLOC_WORK (NBUCKETS, int64_t, &O_size) ;
+        Bucket = GB_MALLOC_WORK (mnz, int64_t, &Bu_size) ;
+
+        if (Nanobuckets == NULL || Blockbucket == NULL || Bucketp == NULL
+            || Bucket == NULL || offset == NULL)
+        {
+            // out of memory
+            GB_FREE_ALL ;
+            return (GrB_OUT_OF_MEMORY) ;
+        }
+
+        // FIXME: do async with streams
+        // FIXME: do we need any of these?
+        //CUDA_OK (cudaMemsetAsync(Nanobuckets, 0,
+        //    nanobuckets_size * sizeof(int64_t), stream));
+        //CUDA_OK (cudaMemsetAsync(Blockbucket, 0,
+        //    blockbuckets_size * sizeof(int64_t), stream));
+        CUDA_OK (cudaMemsetAsync(Bucketp, 0,
+            (NBUCKETS+1) * sizeof(int64_t), stream));
+        CUDA_OK (cudaMemsetAsync(offset, 0,
+            NBUCKETS * sizeof(int64_t), stream));
+        //CUDA_OK (cudaMemsetAsync(Bucket, 0,
+        //    mnz * sizeof(int64_t), stream));
+
+        //----------------------------------------------------------------------
+        // phase1 and phase2: place each C(i,j) in a bucket
+        //----------------------------------------------------------------------
+
+        CUDA_OK (cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t),
+            cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
+        CUDA_OK (cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t),
+            cudaMemAdviseSetAccessedBy, device));
+
+        CUDA_OK (cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t),
+            cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
+        CUDA_OK (cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t),
+            cudaMemAdviseSetAccessedBy, device));
+
+        //----------------------------------------------------------------------
+        // phase1: assign each C(i,j) to a bucket, and count them
+        //----------------------------------------------------------------------
+
+        // kernel_timer.Start();
+
+        GB_jit_AxB_dot3_phase1_kernel <<<grid_1, block, 0, stream>>>
+            (Nanobuckets, Blockbucket, C, M, A, B) ;
+
+        CUDA_OK (cudaStreamSynchronize (stream)) ;
+
+        // kernel_timer.Stop();
+        // printf ("(GPU phase1 %12.6g ms )\n", kernel_timer.Elapsed()) ;
+
+        //----------------------------------------------------------------------
+        // phase2: cumsum across the blockbuckets, propagate to thread level
+        //----------------------------------------------------------------------
+
+        // # of blocks for phase2:
+        int number_of_blocks_2 = (number_of_blocks_1 + threads_per_block - 1)
+            / threads_per_block ;
+
+        dim3 grid_2 (number_of_blocks_2) ;
+
+        // kernel_timer.Start();
+
+        GB_cuda_AxB_dot3_phase2_kernel <<<grid_2, block, 0, stream>>>
+            (Blockbucket, offset, number_of_blocks_1) ;
+
+        CUDA_OK (cudaStreamSynchronize (stream)) ;
+
+        int64_t s = offset [0] ;
+        C->nzombies = s ;
+        bool all_in_one = false ;
+        for (int bucket = 1 ; bucket < NBUCKETS+1 ; bucket++)
+        {
+            Bucketp [bucket] = s ;
+            s += offset [bucket] ;
+            if ((Bucketp [bucket] - Bucketp [bucket-1]) == mnz)
+            {
+                all_in_one = true ;
+            }
+        }
+
+        // kernel_timer.Stop();
+        // printf ("(GPU phase2 %12.6g ms )\n", kernel_timer.Elapsed()) ;
+
+        //----------------------------------------------------------------------
+        // phase2end
+        //----------------------------------------------------------------------
+
+        if (!all_in_one)
+        {
+            // kernel_timer.Start();
+
+            GB_cuda_AxB_dot3_phase2end_kernel <<<grid_1, block, 0, stream>>>
+                (Nanobuckets, Blockbucket, Bucketp, Bucket, offset, C, mnz) ;
+
+            CUDA_OK (cudaStreamSynchronize (stream)) ;
+            // kernel_timer.Stop();
+            // printf ("(GPU phase2end %12.6g ms)\n",kernel_timer.Elapsed());
+        }
+
+        //----------------------------------------------------------------------
+        // phase3: do the numerical work
+        //----------------------------------------------------------------------
+
+        // kernel_timer.Start();
+
+        for (int bucket = 1 ; bucket < NBUCKETS ; bucket++)
+        {
+            int64_t start = Bucketp [bucket] ;
+            int64_t end = Bucketp [bucket + 1] ;
+            int64_t cnz_in_bucket = end - start ;
+            int gridsz, blocksz, work_per_thread ;
+            if (cnz_in_bucket > 0)
+            {
+
+                #if ((GB_A_IS_SPARSE || GB_A_IS_HYPER) && \
+                     (GB_B_IS_SPARSE || GB_B_IS_HYPER))
+
+                switch (bucket)
+                {
+
+                    //------------------------------------------------------
+                    // vsvs bucket: both vectors very sparse
+                    //------------------------------------------------------
+
+                    case GB_BUCKET_VSVS :
+                    {
+                        // FIXME: should be a function of cuda architecture
+                        blocksz = 256 ;
+                        work_per_thread = 4 ;
+                        if (cnz_in_bucket > (2<<12))
+                        {
+                            blocksz = 512 ;
+                        }
+                        gridsz = GB_ICEIL (cnz_in_bucket,
+                            work_per_thread*blocksz) ;
+                        gridsz = GB_IMIN (gridsz, 256*number_of_sms) ;
+                        dim3 grid_3 (gridsz) ;
+                        GB_cuda_AxB_dot3_phase3_vsvs_kernel
+                            <<<grid_3, blocksz, 0, stream>>>
+                            (start, end, Bucket, C, M, A, B) ;
+                    }
+                    break ;
+
+                    //------------------------------------------------------
+                    // mergepath bucket:
+                    //------------------------------------------------------
+
+                    case GB_BUCKET_MERGEPATH :
+                    {
+                        // FIXME: should be a function of cuda architecture
+                        blocksz = 32 ;
+                        work_per_thread = 256 ;
+                        if (cnz_in_bucket > (2<<20))
+                        {
+                            work_per_thread = 1024 ;
+                        }
+                        gridsz = GB_ICEIL (cnz_in_bucket, work_per_thread) ;
+                        if ((gridsz < number_of_sms) &&
+                            (cnz_in_bucket > (2<<20)))
+                        {
+                            gridsz = number_of_sms ;
+                        }
+                        gridsz = GB_IMIN (gridsz, 256*number_of_sms) ;
+                        dim3 grid_3 (gridsz) ;
+                        GB_cuda_AxB_dot3_phase3_mp_kernel
+                            <<<grid_3, blocksz, 0, stream>>>
+                            (start, end, Bucket, C, M, A, B) ;
+                    }
+                    break ;
+                }
+
+                #else
+
+                switch (bucket)
+                {
+
+                    //------------------------------------------------------
+                    // vsdn bucket: one thread per C(i,j) dot product
+                    //------------------------------------------------------
+
+                    case GB_BUCKET_VSDN :
+                    {
+                        // FIXME: should be a function of cuda architecture
+                        blocksz = 256 ;
+                        work_per_thread = 4 ;
+                        if (cnz_in_bucket > (2<<12))
+                        {
+                            blocksz = 512 ;
+                        }
+                        gridsz = GB_ICEIL (cnz_in_bucket,
+                            work_per_thread*blocksz) ;
+                        gridsz = GB_IMIN (gridsz, 256*number_of_sms) ;
+                        dim3 grid_3 (gridsz) ;
+                        GB_cuda_AxB_dot3_phase3_vsdn_kernel
+                            <<<grid_3, blocksz, 0, stream>>>
+                            (start, end, Bucket, C, M, A, B) ;
+                    }
+                    break ;
+
+                    //------------------------------------------------------
+                    // spdn bucket: one warp per C(i,j) dot product
+                    //------------------------------------------------------
+
+                    case GB_BUCKET_SPDN :
+                    {
+                        // FIXME: should be a function of cuda architecture
+                        blocksz = 32 ;
+                        work_per_thread = 256 ;
+                        if (cnz_in_bucket > (2<<20))
+                        {
+                            work_per_thread = 1024 ;
+                        }
+                        gridsz = GB_ICEIL (cnz_in_bucket, work_per_thread) ;
+                        if ((gridsz < number_of_sms) &&
+                            (cnz_in_bucket > (2<<20)))
+                        {
+                            gridsz = number_of_sms ;
+                        }
+                        gridsz = GB_IMIN (gridsz, 256*number_of_sms) ;
+                        dim3 grid_3 (gridsz) ;
+                        GB_cuda_AxB_dot3_phase3_spdn_kernel
+                            <<<grid_3, blocksz, 0, stream>>>
+                            (start, end, Bucket, C, M, A, B) ;
+                        break ;
+                    }
+                }
+                #endif
+            }
+        }
+    }
+    #endif
+
+    //--------------------------------------------------------------------------
+    // free workspace and return result
+    //--------------------------------------------------------------------------
+
+    CUDA_OK (cudaStreamSynchronize (stream)) ;
+
+    // kernel_timer.Stop();
+    // printf ("(GPU phase3 %12.6g ms, rate=%12.6g)\n",
+    //     kernel_timer.Elapsed(), mnz/(1000*kernel_timer.Elapsed())) ;
+
+    GB_FREE_ALL ;
+    return (GrB_SUCCESS) ;
+}
+
diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_reduce.cuh b/GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_reduce.cu
similarity index 71%
rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_reduce.cuh
rename to GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_reduce.cu
index 354e3b216c..94de78d706 100644
--- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_reduce.cuh
+++ b/GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_reduce.cu
@@ -1,14 +1,17 @@
 //------------------------------------------------------------------------------
-// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_reduce.cuh
+// GraphBLAS/CUDA/JitKernels/GB_jit_cuda_reduce.cu
 //------------------------------------------------------------------------------
 
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-// The GB_cuda_jit_reduce CUDA kernel reduces a GrB_Matrix A of any type T_A,
-// to a scalar of type T_Z.  Each threadblock (blockIdx.x) reduces its portion
-// of Ax to a single scalar, and then atomics are used across the threadblocks.
+// The GB_cuda_jit_reduce CUDA kernel reduces a GrB_Matrix A of any type
+// GB_A_TYPE, to a scalar of type GB_Z_TYPE.  Each threadblock (blockIdx.x)
+// reduces its portion of Ax to a single scalar, and then atomics are used
+// across the threadblocks.
 
 // Both the grid and block are 1D, so blockDim.x is the # threads in a
 // threadblock, and the # of threadblocks is grid.x
@@ -21,63 +24,33 @@
 
 // If the reduction is done on the GPU, A will never be iso-valued.
 
-#include <limits>
-#include <type_traits>
-#include "GB_cuda_kernel.h"
-#include "GB_monoid_shared_definitions.h"
-#include "GB_cuda_atomics.cuh"
-#include <cstdint>
-#include <cooperative_groups.h>
-
 #if GB_C_ISO
 #error "kernel undefined for C iso"
 #endif
 
-using namespace cooperative_groups;
-
-//------------------------------------------------------------------------------
-// GB_warp_Reduce: reduce all entries in a warp to a single scalar
-//------------------------------------------------------------------------------
-
-// GB_warp_Reduce assumes WARPSIZE is 32 threads.
+// FIXME: put these definitions in GB_cuda_kernel.h:
+#define tile_sz 32
+#define log2_tile_sz 5
 
-template <typename T_Z>
-__inline__ __device__
-T_Z GB_warp_Reduce( thread_block_tile<WARPSIZE> g, T_Z val)
-{
-    // Each iteration halves the number of active threads
-    // Each thread adds its partial val[k] to val[lane+k]
-
-    // FIXME: doesn't work unless sizeof(T_Z) <= 32 bytes
-
-    T_Z fold = g.shfl_down ( val, 16) ;
-    GB_ADD ( val, val, fold ) ;
-    fold = g.shfl_down ( val, 8) ;
-    GB_ADD ( val, val, fold ) ;
-    fold = g.shfl_down ( val, 4) ;
-    GB_ADD ( val, val, fold ) ;
-    fold = g.shfl_down ( val, 2) ;
-    GB_ADD ( val, val, fold ) ;
-    fold = g.shfl_down ( val, 1) ;
-    GB_ADD ( val, val, fold ) ;
-    return (val) ;      // note: only thread 0 will return full val
-}
+#include "GB_cuda_shfl_down.cuh"
 
 //------------------------------------------------------------------------------
 // GB_block_Reduce: reduce across all warps into a single scalar
 //------------------------------------------------------------------------------
 
-template <typename T_Z>
-__inline__ __device__
-T_Z GB_block_Reduce(thread_block g, T_Z val)
+__inline__ __device__ GB_Z_TYPE GB_block_Reduce
+(
+    thread_block g,
+    GB_Z_TYPE val
+)
 {
-    static __shared__ T_Z shared [WARPSIZE] ;
-    int lane = threadIdx.x & (WARPSIZE-1) ;
-    int wid = threadIdx.x >> LOG2_WARPSIZE ;
-    thread_block_tile<WARPSIZE> tile = tiled_partition<WARPSIZE>( g ) ;
+    static __shared__ GB_Z_TYPE shared [tile_sz] ;
+    int lane = threadIdx.x & (tile_sz-1) ;
+    int wid = threadIdx.x >> log2_tile_sz ;
+    thread_block_tile<tile_sz> tile = tiled_partition<tile_sz>( g ) ;
 
     // Each warp performs partial reduction
-    val = GB_warp_Reduce( tile, val) ;
+    val = GB_cuda_warp_reduce_ztype (tile, val) ;
 
     // Wait for all partial reductions
     if (lane == 0)
@@ -88,25 +61,25 @@ T_Z GB_block_Reduce(thread_block g, T_Z val)
 
     GB_DECLARE_IDENTITY_CONST (zid) ;   // const GB_Z_TYPE zid = identity ;
 
-    val = (threadIdx.x < (blockDim.x >> LOG2_WARPSIZE)) ?
-        shared [lane] : zid ;
+    val = (threadIdx.x < (blockDim.x >> LOG2_WARPSIZE)) ? shared [lane] : zid ;
 
     // Final reduce within first warp
-    val = GB_warp_Reduce( tile, val) ;
+    val = GB_cuda_warp_reduce_ztype (tile, val) ;
     return (val) ;
 }
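//------------------------------------------------------------------------------
// aside: the warp-level pattern behind GB_cuda_warp_reduce_ztype
//------------------------------------------------------------------------------

// The warp reduction that GB_block_Reduce now delegates to
// GB_cuda_shfl_down.cuh follows the standard shuffle-down ladder that the
// deleted GB_warp_Reduce spelled out.  A standalone sketch for a plain int
// sum (the real one folds GB_Z_TYPE values with GB_ADD):

#include <cooperative_groups.h>
using namespace cooperative_groups ;

__inline__ __device__ int warp_sum (thread_block_tile<32> g, int val)
{
    // each shuffle halves the number of lanes still holding a partial sum
    for (int offset = 16 ; offset > 0 ; offset >>= 1)
    {
        val += g.shfl_down (val, offset) ;
    }
    return (val) ;      // only lane 0 ends up with the full sum
}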
 
 //------------------------------------------------------------------------------
-// GB_jit_reduce: reduce all entries in a matrix to a single scalar
+// GB_cuda_reduce_kernel: reduce all entries in a matrix to a single scalar
 //------------------------------------------------------------------------------
 
-template< typename T_A, typename T_Z>
-__global__ void GB_jit_reduce  // FIXME rename
+__global__ void GB_cuda_reduce_kernel
 (
-    GrB_Matrix A,       // matrix to reduce
+    // output:
     void *zscalar,      // scalar result, at least sizeof (uint32_t)
     GrB_Matrix V,       // matrix result, for partial reduction (or NULL)
-    int64_t anz         // # of entries in A: anz = GB_nnz_held (A)
+    // input:
+    GrB_Matrix A,       // matrix to reduce
+    int64_t anz         // # of entries in A
 )
 {
 
@@ -114,19 +87,20 @@ __global__ void GB_jit_reduce  // FIXME rename
     // initializations
     //--------------------------------------------------------------------------
 
-    const T_A *__restrict__ Ax = (T_A *) A->x ;
+    const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *) A->x ;
 
-    // each thread reduces its result into zmine, of type T_Z
+    // each thread reduces its result into zmine, of type GB_Z_TYPE
     GB_DECLARE_IDENTITY (zmine) ;       // GB_Z_TYPE zmine = identity ;
 
     // On input, zscalar is already initialized to the monoid identity value.
-    // If T_Z has size less than 4 bytes, zscalar has been upscaled to 4 bytes.
+    // If GB_Z_TYPE has size less than 4 bytes, zscalar has been upscaled to 4
+    // bytes.
 
     //--------------------------------------------------------------------------
     // phase 1: each thread reduces a part of the matrix to its own scalar
     //--------------------------------------------------------------------------
 
-    #if GB_A_IS_SPARSE || GB_A_IS_HYPERSPARSE
+    #if GB_A_IS_SPARSE || GB_A_IS_HYPER
     {
 
         //----------------------------------------------------------------------
@@ -180,7 +154,7 @@ __global__ void GB_jit_reduce  // FIXME rename
         // A is bitmap
         //----------------------------------------------------------------------
 
-        const uint8_t *__restrict__ Ab = A->b ;
+        const int8_t *__restrict__ Ab = A->b ;
 
         for (int64_t p = blockIdx.x * blockDim.x + threadIdx.x ;
             p < anz ;
            p += blockDim.x * gridDim.x)
@@ -197,7 +171,7 @@ __global__ void GB_jit_reduce  // FIXME rename
     // phase 2: each threadblock reduces all threads into a scalar
     //--------------------------------------------------------------------------
 
-    zmine = GB_block_Reduce< T_Z >( this_thread_block(), zmine) ;
+    zmine = GB_block_Reduce( this_thread_block(), zmine) ;
     this_thread_block().sync() ;
 
    //--------------------------------------------------------------------------
@@ -229,3 +203,21 @@ __global__ void GB_jit_reduce  // FIXME rename
     }
 }
 
+//------------------------------------------------------------------------------
+// host function to launch the CUDA kernel
+//------------------------------------------------------------------------------
+
+extern "C"
+{
+    GB_JIT_CUDA_KERNEL_REDUCE_PROTO (GB_jit_kernel) ;
+}
+
+GB_JIT_CUDA_KERNEL_REDUCE_PROTO (GB_jit_kernel)
+{
+    dim3 grid (gridsz) ;
+    dim3 block (blocksz) ;
+    GB_A_NHELD (anz) ;      // anz = # of entries held in A
+    GB_cuda_reduce_kernel <<<grid, block, 0, stream>>> (zscalar, V, A, anz) ;
+    return (GrB_SUCCESS) ;
+}
+
diff --git a/GraphBLAS/CUDA/README.txt b/GraphBLAS/CUDA/README.txt
index 48ef3edd09..71d84593c4 100644
--- a/GraphBLAS/CUDA/README.txt
+++ b/GraphBLAS/CUDA/README.txt
@@ -2,11 +2,10 @@ GraphBLAS/CUDA: CUDA acceleration for SuiteSparse:GraphBLAS
 
 Dependencies:
     local_cub           BSD 3-clause, (c) NVIDIA (part of CUDA Toolkit)
-    rmm_wrap            Apache 2.0, (c) FIXME
+    rmm_wrap            Apache 2.0, (c) NVIDIA
     cuCollections       Apache 2.0, (c) NVIDIA Rapids
     cuco/cub            BSD 3-clause, (c) NVIDIA
     cuco/libcudacxx     BSD?, (c) NVIDIA
     cuco/libcxx         Apache 2.0, (c) NVIDIA
-    google-benchmark    ?, (c) Google
 
diff --git a/GraphBLAS/CUDA/TODO.txt b/GraphBLAS/CUDA/TODO.txt
index 4bc2524747..1b4fa1620e 100644
--- a/GraphBLAS/CUDA/TODO.txt
+++ b/GraphBLAS/CUDA/TODO.txt
@@ -1,3 +1,21 @@
+TODO (Mar 2024):
+
+    set/get cuda architectures
+    CUDA PreJIT kernels
+    GB_cuda_matrix_advise: write it
+    dot3: allow iso
+    use a stream pool (from RMM)
+    can rmm_wrap be thread safe?
+    # of threadblocks in reduce
+    reduce calls GB_enumify_reduce twice
+    set/get which GPU(s) to use
+    data types > 32 bytes
+    handling nvcc compiler errors
+    static device function for computing ks (acts like GB_ek_slice,
+        so call it GB_ek_slice_device)
+
+--------------------------------------------------------------------------------
+
 all the FIXMEs
 
 clean up comments and code style
 
diff --git a/GraphBLAS/CUDA/Template/GB_cuda_atomics.cuh b/GraphBLAS/CUDA/Template/GB_cuda_atomics.cuh
index 9eeddf43dc..6152d003e9 100644
--- a/GraphBLAS/CUDA/Template/GB_cuda_atomics.cuh
+++ b/GraphBLAS/CUDA/Template/GB_cuda_atomics.cuh
@@ -2,7 +2,13 @@
 // GraphBLAS/CUDA/Template/GB_cuda_atomics.cuh: CUDA atomics for GraphBLAS
 //------------------------------------------------------------------------------
 
-// yet still more stuff here
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+//------------------------------------------------------------------------------
+
+// Atomic device functions for CUDA JIT kernels.  Not used on the host.
 
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
@@ -31,41 +37,40 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * SPDX-License-Identifier: BSD-3-Clause
 */

//------------------------------------------------------------------------------
// Specializations for different atomic operations on different types
//------------------------------------------------------------------------------

-// No 1-byte methods are available (bool, uint8_t, int8_t), because CUDA does
-// not support atomicCAS for a single byte.  Instead, to compute a single byte
-// atomically, GraphBLAS must operate on a larger temporary type (typically
-// uint32_t, but it could also use a 16-bit type), and when all results are
-// computed and the kernel launch is done, the final value is copied to the
-// single byte result on the host.
+// No 1- or 2-byte methods are available (bool, uint8_t, int8_t, uint16_t,
+// int16_t), because CUDA does not support atomicCAS for just one or two bytes.
+// Instead, to compute one or two bytes atomically, GraphBLAS must operate on a
+// larger temporary type (typically uint32_t) and when all results are computed
+// and the kernel launch is done, the final value is copied to the one or two
+// bytes result on the host.
 //
 // The GxB_FC64_t type is supported only by GB_cuda_atomic_add.
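//------------------------------------------------------------------------------
// aside: the compare-and-swap loop shared by these atomics
//------------------------------------------------------------------------------

// The surviving 32- and 64-bit specializations below all use the same pattern
// the deleted 16-bit versions did: type-pun the target to an atomicCAS-capable
// integer, compute the new value from the assumed old one, and retry until no
// other thread intervened.  A standalone sketch for a hypothetical float
// "times" atomic (not one of the GB_cuda_atomic_* templates):

__device__ __inline__ void atomic_times_float (float *ptr, float val)
{
    unsigned int *p = (unsigned int *) ptr ;
    unsigned int assumed ;
    unsigned int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // compute the new value from the type-punned assumed value
        float new_value = __uint_as_float (assumed) * val ;
        // swap it in only if *ptr still holds the assumed value
        old = atomicCAS (p, assumed, __float_as_uint (new_value)) ;
    }
    while (assumed != old) ;
}

// The per-operation type lists continue below.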
// // GB_cuda_atomic_write, GB_cuda_atomic_times: // -// int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, +// int32_t, uint32_t, int64_t, uint64_t, // float, double, and GxB_FC32_t (not GxB_FC64_t). // // GB_cuda_atomic_min, GB_cuda_atomic_max: // -// int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, +// int32_t, uint32_t, int64_t, uint64_t, // float, and double (not GxB_FC32_t or GxB_FC64_t). // // GB_cuda_atomic_add: // -// int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, +// int32_t, uint32_t, int64_t, uint64_t, // float, double, GxB_FC32_t, and GxB_FC64_t. // // GB_cuda_atomic_bor, GB_cuda_atomic_band, // GB_cuda_atomic_bxor, GB_cuda_atomic_bxnor : // -// uint16_t, uint32_t, uint64_t +// uint32_t, uint64_t // // GB_cuda_atomic_lock, GB_cuda_atomic_unlock: // @@ -95,49 +100,9 @@ __device__ __inline__ void GB_cuda_unlock (uint32_t *mutex) ; // GB_cuda_atomic_write //------------------------------------------------------------------------------ -// atomic write (16, 32, and 64 bits) +// atomic write (32 and 64 bits) // no atomic write for GxB_FC64_t -template<> __device__ __inline__ void GB_cuda_atomic_write -( - int16_t *ptr, // target to modify - int16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = GB_PUN (unsigned short int, val) ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, v) ; - } - while (assumed != old) ; -} - -template<> __device__ __inline__ void GB_cuda_atomic_write -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, v) ; - } - while (assumed != old) ; -} - template<> __device__ __inline__ void GB_cuda_atomic_write ( int32_t *ptr, // target to modify @@ -217,48 +182,7 @@ template<> __device__ __inline__ void GB_cuda_atomic_write // GB_cuda_atomic_add for built-in types //------------------------------------------------------------------------------ -// types: int and uint [16,32,64], float, double, GxB_FC32_t, complex double - -template<> __device__ __inline__ void GB_cuda_atomic_add -( - int16_t *ptr, // target to modify - int16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value: - int16_t new_value = GB_PUN (int16_t, assumed) + val ; - // modify it atomically: - old = atomicCAS (p, assumed, GB_PUN (unsigned short int, new_value)) ; - } - while (assumed != old) ; -} - -template<> __device__ __inline__ void GB_cuda_atomic_add -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, assumed + v) ; - } - while (assumed != old) ; -} +// types: int and uint [32,64], float, double, GxB_FC32_t, complex double template<> __device__ 
__inline__ void GB_cuda_atomic_add ( @@ -351,50 +275,9 @@ template<> __device__ __inline__ void GB_cuda_atomic_add // GB_cuda_atomic_times for built-in types //------------------------------------------------------------------------------ -// types: int and uint [16,32,64], float, double, GxB_FC32_t +// types: int and uint [32,64], float, double, GxB_FC32_t // no GxB_FC64_t. -template<> __device__ __inline__ void GB_cuda_atomic_times -( - int16_t *ptr, // target to modify - int16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value: - int16_t new_value = GB_PUN (int16_t, assumed) * val ; - // modify it atomically: - old = atomicCAS (p, assumed, GB_PUN (unsigned short int, new_value)) ; - } - while (assumed != old) ; -} - -template<> __device__ __inline__ void GB_cuda_atomic_times -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, assumed * v) ; - } - while (assumed != old) ; -} - template<> __device__ __inline__ void GB_cuda_atomic_times ( int32_t *ptr, // target to modify @@ -546,53 +429,9 @@ template<> __device__ __inline__ void GB_cuda_atomic_times // GB_cuda_atomic_min //------------------------------------------------------------------------------ -// types: int and uint [16,32,64], float, and double +// types: int and uint [32,64], float, and double // no complex types -template<> __device__ __inline__ void GB_cuda_atomic_min -( - int16_t *ptr, // target to modify - int16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value - int16_t assumed_int16 = GB_PUN (int16_t, assumed) ; - int16_t new_value = GB_IMIN (assumed_int16, val) ; - // modify it atomically: - old = atomicCAS (p, assumed, GB_PUN (unsigned short int, new_value)) ; - } - while (assumed != old) ; -} - -template<> __device__ __inline__ void GB_cuda_atomic_min -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value - unsigned short int new_value = GB_IMIN (assumed, v) ; - // modify it atomically: - old = atomicCAS (p, assumed, new_value) ; - } - while (assumed != old) ; -} - template<> __device__ __inline__ void GB_cuda_atomic_min ( int32_t *ptr, // target to modify @@ -679,53 +518,9 @@ template<> __device__ __inline__ void GB_cuda_atomic_min // GB_cuda_atomic_max //------------------------------------------------------------------------------ -// types: int and uint [16,32,64], float, and double +// types: int and uint [32,64], float, and double // no complex types -template<> __device__ __inline__ void GB_cuda_atomic_max -( - int16_t *ptr, // target to modify - int16_t val // value to modify the target with -) -{ - unsigned short int *p = 
(unsigned short int *) ptr ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value - int16_t assumed_int16 = GB_PUN (int16_t, assumed) ; - int16_t new_value = GB_IMIN (assumed_int16, val) ; - // modify it atomically: - old = atomicCAS (p, assumed, GB_PUN (unsigned short int, new_value)) ; - } - while (assumed != old) ; -} - -template<> __device__ __inline__ void GB_cuda_atomic_max -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value - unsigned short int new_value = GB_IMIN (assumed, v) ; - // modify it atomically: - old = atomicCAS (p, assumed, new_value) ; - } - while (assumed != old) ; -} - template<> __device__ __inline__ void GB_cuda_atomic_max ( int32_t *ptr, // target to modify @@ -812,27 +607,7 @@ template<> __device__ __inline__ void GB_cuda_atomic_max // GB_cuda_atomic_bor //------------------------------------------------------------------------------ -// bitwise: on uint [16,32,64] - -template<> __device__ __inline__ void GB_cuda_atomic_bor -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, assumed | v) ; - } - while (assumed != old) ; -} +// bitwise: on uint [32,64] template<> __device__ __inline__ void GB_cuda_atomic_bor ( @@ -858,27 +633,7 @@ template<> __device__ __inline__ void GB_cuda_atomic_bor // GB_cuda_atomic_band //------------------------------------------------------------------------------ -// bitwise: on uint [16,32,64] - -template<> __device__ __inline__ void GB_cuda_atomic_band -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, assumed & v) ; - } - while (assumed != old) ; -} +// bitwise: on uint [32,64] template<> __device__ __inline__ void GB_cuda_atomic_band ( @@ -904,27 +659,7 @@ template<> __device__ __inline__ void GB_cuda_atomic_band // GB_cuda_atomic_bxor //------------------------------------------------------------------------------ -// bitwise: on uint [16,32,64] - -template<> __device__ __inline__ void GB_cuda_atomic_bxor -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, assumed ^ v) ; - } - while (assumed != old) ; -} +// bitwise: on uint [32,64] template<> __device__ __inline__ void GB_cuda_atomic_bxor ( @@ -950,27 +685,7 @@ template<> __device__ __inline__ void GB_cuda_atomic_bxor // GB_cuda_atomic_bxnor 
//------------------------------------------------------------------------------ -// bitwise: on uint [16,32,64] - -template<> __device__ __inline__ void GB_cuda_atomic_bxnor -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, ~(assumed ^ v)) ; - } - while (assumed != old) ; -} +// bitwise: on uint [32,64] template<> __device__ __inline__ void GB_cuda_atomic_bxnor ( diff --git a/GraphBLAS/CUDA/Template/GB_cuda_buckets.h b/GraphBLAS/CUDA/Template/GB_cuda_buckets.h deleted file mode 100644 index 57cc9ebc4a..0000000000 --- a/GraphBLAS/CUDA/Template/GB_cuda_buckets.h +++ /dev/null @@ -1,59 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/Template/GB_cuda_buckets.h: bucket definitions for dot3 -//------------------------------------------------------------------------------ - -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -// FIXME: rename this file to GB_cuda_dot3_buckets.h (or .cuh? .hpp?) - -//------------------------------------------------------------------------------ - -#ifndef GB_CUDA_BUCKETS_H -#define GB_CUDA_BUCKETS_H - -#define NBUCKETS 3 - -// NBUCKETS buckets: computed by up to NBUCKETS-1 kernel launches (zombies need -// no work...), each using different kernels (with different configurations -// depending on the bucket). - -// dot3: C=A'B, M is sparse or hyper, C is sparse or hyper -// 32 kernels A,B: (hyper,sparse,bitmap,full)^2 x (M and C are sparse/hyper) - -// FIXME: rename enum values to GB_DOT3_BUCKET* -typedef enum -{ - GB_BUCKET_ZOMBIE = 0, // C(i,j) is a zombie (not a bucket) - // both A and B are sparse/hyper: - GB_BUCKET_VSVS = 1, // vsvs: both A(:,i) and B(:,j) are very sparse - GB_BUCKET_MERGEPATH = 2, // mp: use the merge-path method - // A is sparse/hyper and B is bitmap/full, or - // A is bitmap/full and B is sparse/hyper - GB_BUCKET_VSDN = 1, // vsdn: the sparse vector is very sparse - GB_BUCKET_SPDN = 2, // spdn: sparse vector has lots of entries; - // use a whole warp for each dot product -} -GB_bucket_code ; // FIXME: rename GB_dot3_bucket_code - -// These may use another bucket enum: - - // two full/(sparse,hyper) kernels: - // // CUDA kernel: spdn, handles 4 buckets: - // // A(:,i) is dense and B(:,j) is very sparse (< 256 entries) - // GB_BUCKET_DNVS = 2, - // // A(:,i) is dense and B(:,j) is sparse (>= 256 entries) - // GB_BUCKET_DNSP = 3, - - // a sparse/full kernel - // // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense - // GB_BUCKET_VSDN = 4, - // // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense - // GB_BUCKET_SPDN = 5, - - // a sparse/bitmap kernel - // a bitmap/bitmap kernel - // a bitmap/sparse kernel - // ... 
- -#endif diff --git a/GraphBLAS/CUDA/Template/GB_cuda_dot3_defn.h b/GraphBLAS/CUDA/Template/GB_cuda_dot3_defn.h deleted file mode 100644 index 651ec5c064..0000000000 --- a/GraphBLAS/CUDA/Template/GB_cuda_dot3_defn.h +++ /dev/null @@ -1,72 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/Template/GB_cuda_dot3_defn.h: defns just for dot3 -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// FIXME: rename this to .cuh? It is only #included by GB_cuda_jit* - -#pragma once - -//------------------------------------------------------------------------------ -// operators -//------------------------------------------------------------------------------ - -#if GB_C_ISO - -// GB_MULTADD now defined in header -// #define GB_MULTADD( c, a ,b, i, k, j) - #define GB_DOT_TERMINAL( c ) break - #define GB_DOT_MERGE(pA,pB) \ - { \ - cij_exists = true ; \ - } - #define GB_CIJ_EXIST_POSTCHECK - -#else - -// GB_MULTADD now defined in header -// #define GB_MULTADD( c, a, b, i, k, j ) \ -// { \ -// GB_Z_TYPE x_op_y ; \ -// GB_MULT ( x_op_y, a, b, i, k, j ) ; /* x_op_y = a*b */ \ -// GB_ADD ( c, c, x_op_y ) ; /* c += x_op_y */ \ -// } - - #define GB_DOT_TERMINAL( c ) GB_IF_TERMINAL_BREAK ( c, zterminal ) - - #if GB_IS_PLUS_PAIR_REAL_SEMIRING - - // cij += A(k,i) * B(k,j), for merge operation (plus_pair_real semiring) - #if GB_Z_IGNORE_OVERFLOW - // plus_pair for int64, uint64, float, or double - #define GB_DOT_MERGE(pA,pB) cij++ ; - #define GB_CIJ_EXIST_POSTCHECK cij_exists = (cij != 0) ; - #else - // plus_pair semiring for small integers - #define GB_DOT_MERGE(pA,pB) \ - { \ - cij_exists = true ; \ - cij++ ; \ - } - #define GB_CIJ_EXIST_POSTCHECK - #endif - - #else - - // cij += A(k,i) * B(k,j), for merge operation (general case) - #define GB_DOT_MERGE(pA,pB) \ - { \ - GB_GETA ( aki, Ax, pA, ) ; /* aki = A(k,i) */ \ - GB_GETB ( bkj, Bx, pB, ) ; /* bkj = B(k,j) */ \ - cij_exists = true ; \ - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; /* cij += aki * bkj */ \ - } - #define GB_CIJ_EXIST_POSTCHECK - - #endif - -#endif - diff --git a/GraphBLAS/CUDA/Template/GB_cuda_ek_slice.cuh b/GraphBLAS/CUDA/Template/GB_cuda_ek_slice.cuh new file mode 100644 index 0000000000..8b864a22fd --- /dev/null +++ b/GraphBLAS/CUDA/Template/GB_cuda_ek_slice.cuh @@ -0,0 +1,192 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/Template/GB_cuda_ek_slice.cuh +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+// GB_cuda_ek_slice_setup
+//------------------------------------------------------------------------------
+
+static __device__ __inline__ int64_t GB_cuda_ek_slice_setup
+(
+    // inputs, not modified:
+    const int64_t *Ap,          // array of size anvec+1
+    const int64_t anvec,        // # of vectors in the matrix A
+    const int64_t anz,          // # of entries in the sparse/hyper matrix A
+    const int64_t pfirst,       // first entry in A to find k
+    const int64_t max_pchunk,   // max # of entries in A to find k
+    // output:
+    int64_t *my_chunk_size,     // size of the chunk for this threadblock
+    int64_t *anvec1,            // anvec-1
+    float *slope                // slope of vectors from kfirst to klast
+)
+{
+
+    //--------------------------------------------------------------------------
+    // determine the range of entries pfirst:plast-1 for this chunk
+    //--------------------------------------------------------------------------
+
+    // The slice for each threadblock contains entries pfirst:plast-1 of A.
+    // The threadblock works on a chunk of entries in Ai/Ax [pfirst...plast-1].
+
+    ASSERT (pfirst < anz) ;
+    ASSERT (max_pchunk > 0) ;
+    int64_t plast = pfirst + max_pchunk ;
+    plast = GB_IMIN (plast, anz) ;
+    (*my_chunk_size) = plast - pfirst ;
+    ASSERT ((*my_chunk_size) > 0) ;
+
+    //--------------------------------------------------------------------------
+    // estimate the first and last vectors for this chunk
+    //--------------------------------------------------------------------------
+
+    // find kfirst, the first vector of the slice for this chunk.  kfirst is
+    // the vector that owns the entry Ai [pfirst] and Ax [pfirst].  The search
+    // does not need to be exact, so kfirst is an estimate.
+
+    int64_t kfirst = 0 ;
+    int64_t kright = anvec ;
+    GB_TRIM_BINARY_SEARCH (pfirst, Ap, kfirst, kright) ;
+
+    // find klast, the last vector of the slice for this chunk.  klast is the
+    // vector that owns the entry Ai [plast-1] and Ax [plast-1].  The search
+    // does not have to be exact, so klast is an estimate.
+
+    int64_t klast = kfirst ;
+    kright = anvec ;
+    GB_TRIM_BINARY_SEARCH (plast, Ap, klast, kright) ;
+
+    //--------------------------------------------------------------------------
+    // find slope of vectors in this chunk, and return result
+    //--------------------------------------------------------------------------
+
+    // number of vectors in A for this chunk, where
+    // Ap [kfirst:klast-1] will be searched.
+    int64_t nk = klast - kfirst + 1 ;
+
+    // slope is the estimated # of vectors in this chunk, divided by the
+    // chunk size.
+    (*slope) = ((float) nk) / ((float) (*my_chunk_size)) ;
+
+    (*anvec1) = anvec - 1 ;
+    return (kfirst) ;
+}
+
+//------------------------------------------------------------------------------
+// GB_cuda_ek_slice_entry
+//------------------------------------------------------------------------------
+
+// Let p = kk + pfirst, where kk ranges from 0:my_chunk_size-1, and so p ranges
+// from pfirst:(pfirst+my_chunk_size-1), and where my_chunk_size is normally of
+// size max_pchunk, unless this is the last chunk in the entire matrix.
+// GB_cuda_ek_slice_entry computes k for this entry, so that the kth vector
+// contains the entry aij with row index i = Ai [p] and value aij = Ax [p]
+// (assuming that A is a sparse or hypersparse matrix held by column).  That
+// is, Ap [k] <= p < Ap [k+1] will hold.  If A is sparse and held by column,
+// then aij is in column j = k.
If A is hypersparse, then aij is in column j = +// Ah [k]. + +// The method returns the index k of the vector in A that contains the pth +// entry in A, at position p = kk + pfirst. + +static __device__ __inline__ int64_t GB_cuda_ek_slice_entry +( + // inputs, not modified: + const int64_t kk, // find the k value of the kkth entry + const int64_t pfirst, // first entry in A to find k (for which kk=0) + const int64_t *Ap, // array of size anvec+1 + const int64_t anvec1, // anvec-1 + const int64_t kfirst, // estimate of first vector in the chunk + const float slope // estimate # vectors in chunk / my_chunk_size +) +{ + + // get a rough estimate of k for the kkth entry + int64_t k = kfirst + (int64_t) (slope * ((float) kk)) ; + + // The estimate of k cannot be smaller than kfirst, but it might be bigger + // than anvec-1, so ensure it is in the valid range, kfirst to anvec-1. + k = GB_IMIN (k, anvec1) ; + + // look for p in Ap, where p is in range pfirst:plast-1 + // where pfirst >= 0 and plast < anz + int64_t p = kk + pfirst ; + + // linear-time search for the k value of the pth entry + while (Ap [k+1] <= p) k++ ; + while (Ap [k ] > p) k-- ; + + // the pth entry of A is contained in the kth vector of A + ASSERT (Ap [k] <= p && p < Ap [k+1]) ; + + // return the result k + return (k) ; +} + +//------------------------------------------------------------------------------ +// GB_cuda_ek_slice +//------------------------------------------------------------------------------ + +// GB_cuda_ek_slice finds the vector k that owns each entry in the sparse or +// hypersparse matrix A, in Ai/Ax [pfirst:plast-1], where plast = min (anz, +// pfirst+max_pchunk). Returns my_chunk_size = plast - pfirst, which is the +// size of the chunk operated on by this threadblock. + +// The function GB_cuda_ek_slice behaves somewhat like GB_ek_slice used on the +// CPU. The latter is for OpenMP parallelism on the CPU only; it does not +// need to compute ks. 
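//------------------------------------------------------------------------------
// aside: the estimate-then-correct search, exercised on the host
//------------------------------------------------------------------------------

// The slope estimate plus the two linear-correction loops used by
// GB_cuda_ek_slice_entry can be tried out on the CPU with a toy Ap array.
// All values below are hypothetical; plain C so it runs anywhere:

#include <stdint.h>
#include <stdio.h>

int main (void)
{
    // 4 vectors, 10 entries: vector k owns entries Ap [k] .. Ap [k+1]-1
    const int64_t Ap [ ] = { 0, 3, 3, 8, 10 } ;
    const int64_t anvec1 = 3 ;              // anvec-1
    const float slope = 4.0f / 10.0f ;      // nk / my_chunk_size, kfirst = 0
    for (int64_t p = 0 ; p < 10 ; p++)
    {
        int64_t k = (int64_t) (slope * ((float) p)) ;   // rough estimate
        if (k > anvec1) k = anvec1 ;                    // clamp to valid range
        while (Ap [k+1] <= p) k++ ;                     // correct forwards ...
        while (Ap [k  ] >  p) k-- ;                     // ... or backwards
        printf ("entry %2d lives in vector %d\n", (int) p, (int) k) ;
    }
    return (0) ;
}

// The device version, GB_cuda_ek_slice, follows.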
+ +static __device__ __inline__ int64_t GB_cuda_ek_slice // returns my_chunk_size +( + // inputs, not modified: + const int64_t *Ap, // array of size anvec+1 + const int64_t anvec, // # of vectors in the matrix A + const int64_t anz, // # of entries in the sparse/hyper matrix A + const int64_t pfirst, // first entry in A to find k + const int64_t max_pchunk, // max # of entries in A to find k + // output: + int64_t *ks // k value for each pfirst:plast-1 +) +{ + + //-------------------------------------------------------------------------- + // determine the chunk for this threadblock and its slope + //-------------------------------------------------------------------------- + + int64_t my_chunk_size, anvec1 ; + float slope ; + int64_t kfirst = GB_cuda_ek_slice_setup (Ap, anvec, anz, pfirst, + max_pchunk, &my_chunk_size, &anvec1, &slope) ; + + //-------------------------------------------------------------------------- + // find the kth vector that contains each entry p = pfirst:plast-1 + //-------------------------------------------------------------------------- + + for (int64_t kk = threadIdx.x ; kk < my_chunk_size ; kk += blockDim.x) + { + + //---------------------------------------------------------------------- + // determine the kth vector that contains the pth entry + //---------------------------------------------------------------------- + + int64_t k = GB_cuda_ek_slice_entry (kk, pfirst, Ap, anvec1, kfirst, + slope) ; + + //---------------------------------------------------------------------- + // save the result in ks + //---------------------------------------------------------------------- + + ks [kk] = k ; + } + + //-------------------------------------------------------------------------- + // sync all threads and return result + //-------------------------------------------------------------------------- + + this_thread_block().sync() ; + return (my_chunk_size) ; +} + diff --git a/GraphBLAS/CUDA/Template/GB_cuda_error.hpp b/GraphBLAS/CUDA/Template/GB_cuda_error.hpp new file mode 100644 index 0000000000..fe7815c6c2 --- /dev/null +++ b/GraphBLAS/CUDA/Template/GB_cuda_error.hpp @@ -0,0 +1,37 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/GB_cuda_error.hpp: call a cuda method and check its result +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#ifndef GB_CUDA_ERROR_HPP +#define GB_CUDA_ERROR_HPP + +//------------------------------------------------------------------------------ +// CUDA_OK: like GB_OK but for calls to cuda* methods +//------------------------------------------------------------------------------ + +// FIXME: GrB_NO_VALUE means something in CUDA failed, and the caller will then +// do the computation on the CPU. Need to turn off the JIT for CUDA kernels +// (but not CPU kernels) if some CUDA error occurred. Current JIT control does +// not distinguish between CPU and CUDA failures. + +#define CUDA_OK(cudaMethod) \ +{ \ + cudaError_t cuda_error = cudaMethod ; \ + if (cuda_error != cudaSuccess) \ + { \ + GrB_Info info = (cuda_error == cudaErrorMemoryAllocation) ? 
\
+        GrB_OUT_OF_MEMORY : GrB_NO_VALUE ;                                  \
+        GBURBLE ("(cuda failed: %d:%s file:%s line:%d) ", (int) cuda_error, \
+            cudaGetErrorString (cuda_error), __FILE__, __LINE__) ;          \
+        GB_FREE_ALL ;                                                       \
+        return (info) ;                                                     \
+    }                                                                       \
+}
+
+#endif
+
diff --git a/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh
new file mode 100644
index 0000000000..4c202eeaf5
--- /dev/null
+++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh
@@ -0,0 +1,122 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_dense_phase1.cuh
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+// phase1 for dot3, A and B are bitmap/full.
+// dense phase1: symbolic load balancing and data partition.
+
+// This kernel scans the non-zero pattern in A and B, takes into account the
+// mask, and computes the total work required to form C.  Then it computes the
+// vector k that contains each entry C(i,j) that isn't a zombie, or sets C(i,j)
+// to its zombie status.
+
+//------------------------------------------------------------------------------
+// GB_cuda_AxB_dot3_dense_phase1_kernel: lookup i,k pairs and store in Ci
+//------------------------------------------------------------------------------
+
+// GB_cuda_AxB_dot3_dense_phase1_kernel is a CUDA kernel that scans all entries
+// in M, assigns i,j coordinates for each entry, and stores the result in Ci.
+// A and B are both bitmap/full.  C and M are sparse/hypersparse.
+
+__global__ void GB_cuda_AxB_dot3_dense_phase1_kernel
+(
+    // input/output:
+    GrB_Matrix C,           // final output matrix
+    const GrB_Matrix M      // mask matrix
+)
+{
+
+    //--------------------------------------------------------------------------
+    // get C, M, A, and B
+    //--------------------------------------------------------------------------
+
+    const int64_t *__restrict__ Mp = M->p ;
+    const int64_t *__restrict__ Mi = M->i ;
+    #if !GB_MASK_STRUCT
+    const GB_M_TYPE *__restrict__ Mx = (GB_M_TYPE *) M->x ;
+    #endif
+    const int64_t mnvec = M->nvec ;
+    const GB_M_NVALS (mnz) ;
+
+    int64_t *__restrict__ Ci = C->i ;   // for zombies, or vector k
+
+    // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a
+    // zombie, or k otherwise, where C(:,j) is the kth vector of C (j = Ch [k]
+    // if hypersparse or j = k if standard sparse).
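//------------------------------------------------------------------------------
// aside: the zombie encoding used in Ci
//------------------------------------------------------------------------------

// The GB_FLIP encoding mentioned above packs two cases into one int64_t: a
// nonnegative Ci [p] is the vector index k, while a negative value marks a
// zombie whose row index remains recoverable because the flip is its own
// inverse.  A sketch assuming the conventional GraphBLAS definition
// GB_FLIP(i) = -(i)-2:

#include <assert.h>
#include <stdint.h>

static inline int64_t flip (int64_t i) { return (-i - 2) ; }

int main (void)
{
    int64_t i = 42 ;
    int64_t z = flip (i) ;          // z = -44: C(i,j) is now a zombie
    assert (z < 0) ;                // zombies are recognized by their sign
    assert (flip (z) == i) ;        // flipping again restores the row index
    return (0) ;
}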
+ + //-------------------------------------------------------------------------- + // determine the vector k of all entries in C(i,j), one chunk at a time + //-------------------------------------------------------------------------- + +#if 0 + __shared__ int64_t ks [chunk_size] ; +#endif + +// int64_t chunk_max = GB_ICEIL (mnz, chunk_size) ; +// for (int64_t chunk = blockIdx.x ; chunk < chunk_max ; chunk += gridDim.x ) + + for (int64_t pfirst = blockIdx.x << log2_chunk_size ; + pfirst < mnz ; + pfirst += gridDim.x << log2_chunk_size) + { + + //---------------------------------------------------------------------- + // find the vector k that contains each entry C(i,j) in this chunk + //---------------------------------------------------------------------- + + // This threadblock works on Mi/Mx and Ci/Cx, in positions pfirst to + // pfirst + my_chunk_size - 1. + +#if 0 + int64_t my_chunk_size = GB_cuda_ek_slice (Mp, mnvec, mnz, pfirst, + chunk_size, /* output: */ ks) ; +#else + int64_t my_chunk_size, mnvec1 ; + float slope ; + int64_t kfirst = GB_cuda_ek_slice_setup (Mp, mnvec, mnz, pfirst, + chunk_size, &my_chunk_size, &mnvec1, &slope) ; +#endif + + //---------------------------------------------------------------------- + // assign entries in C(i,j): either its vector k or its zombie status + //---------------------------------------------------------------------- + +// for (int64_t pM = pfirst + threadIdx.x ; +// pM < pfirst + my_chunk_size ; +// pM += blockDim.x) + + for (int64_t kk = threadIdx.x ; kk < my_chunk_size ; kk += blockDim.x) + { + +#if 0 + int64_t k = ks [kk] ; // get the k value of Mi,Mx [pM]. +#else + int64_t k = GB_cuda_ek_slice_entry (kk, pfirst, Mp, mnvec1, kfirst, + slope) ; +#endif + + int64_t pM = kk + pfirst ; + + #if GB_MASK_STRUCT + { + // no need to check the value of M(i,j); no prezombies + Ci [pM] = k ; + } + #else + { + bool mij = (bool) GB_MCAST (Mx, pM, ) ; + int64_t i = Mi [pM] ; + Ci [pM] = (!mij) * (GB_FLIP (i)) + + mij * (k) ; + } + #endif + } + } +} + diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase1.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase1.cuh similarity index 63% rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase1.cuh rename to GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase1.cuh index 23b6b272aa..346b5de04a 100644 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase1.cuh +++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase1.cuh @@ -2,6 +2,8 @@ // GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase1.cuh //------------------------------------------------------------------------------ +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -11,27 +13,9 @@ // dot3, phase1: symbolic load balancing and data partition // to assign work to different 'buckets' for later compute -// This kernel scans the non-zero pattern in A and B, takes into account the -// mask and computes total work required to form C. Then it classifies each -// dot product into a set of buckets for efficient compute. 
-
-#pragma once
-
-#include <limits>
-#include <cstdint>
-#include "GB_cuda_kernel.h"
-#include "GB_mxm_shared_definitions.h"
-#include "GB_hash.h"
-#include "GB_hyper_hash_lookup.h"
-#include "GB_cuda_buckets.h"
-#include <cstdio>
-#include <cooperative_groups.h>
-
-using namespace cooperative_groups;
-
-//------------------------------------------------------------------------------
-// GB_jit_AxB_dot3_phase1: build nanobuckets, hunt for pre-zombies
-//------------------------------------------------------------------------------
+// This kernel scans the non-zero pattern in A and B, takes into account the
+// mask and computes total work required to form C.  Then it classifies each dot
+// product into a set of buckets for efficient compute.
 
 // GB_AxB_cuda_dot3_phase1 is a CUDA kernel that scans all entries in C and
 // assigns them to each of the NBUCKETS buckets.  The output is a
@@ -54,8 +38,7 @@ using namespace cooperative_groups;
 // FIXME: What if all entries are in one bucket;
 // can we skip the bucket creation?
 
-template <int threads_per_block, int chunk_size>
-__global__ void GB_jit_AxB_dot3_phase1
+__global__ void GB_jit_AxB_dot3_phase1_kernel
 (
     // outputs, preallocated in global memory:
     int64_t *nanobuckets,   // array of size NBUCKETS-blockDim.x-by-gridDim.x
@@ -73,30 +56,26 @@ __global__ void GB_jit_AxB_dot3_phase1
     // get C, M, A, and B
     //--------------------------------------------------------------------------
 
+    #if GB_M_IS_HYPER
     const int64_t *__restrict__ Mh = M->h ;
+    #endif
     const int64_t *__restrict__ Mp = M->p ;
     const int64_t *__restrict__ Mi = M->i ;
     #if !GB_MASK_STRUCT
     const GB_M_TYPE *__restrict__ Mx = (GB_M_TYPE *) M->x ;
     #endif
     const int64_t mnvec = M->nvec ;
-    const int64_t mvlen = M->vlen ;
-//  const int64_t mnz = GB_nnz(M) ;
+    // const int64_t mvlen = M->vlen ;
     const GB_M_NVALS (mnz) ;
-    const bool M_is_hyper = M->h != NULL ;
     ASSERT (GB_M_IS_SPARSE || GB_M_IS_HYPER) ;
 
+    #if GB_A_IS_SPARSE || GB_A_IS_HYPER
     const int64_t *__restrict__ Ap = A->p ;
-    const int64_t *__restrict__ Ai = A->i ;
-    const int64_t avlen = A->vlen ;
-//  const int64_t anz = GB_nnz(A) ;
-    const GB_A_NVALS (anz) ;
+    #endif
 
+    #if GB_B_IS_SPARSE || GB_B_IS_HYPER
     const int64_t *__restrict__ Bp = B->p ;
-    const int64_t *__restrict__ Bi = B->i ;
-    const int64_t bvlen = B->vlen ;
-//  const int64_t bnz = GB_nnz(B);
-    const GB_B_NVALS (bnz) ;
+    #endif
 
     #if GB_A_IS_HYPER
     const int64_t anvec = A->nvec ;
@@ -131,139 +110,124 @@ __global__ void GB_jit_AxB_dot3_phase1
     //--------------------------------------------------------------------------
     // clear the bucket counters
     //--------------------------------------------------------------------------
-    int64_t my_bucket[NBUCKETS];
-
-    // ASSERT (mnz > 0) ;
-    // ASSERT (gridDim.x <= mnz) ;
+    int64_t my_bucket [NBUCKETS] ;
 
     // each thread uses NBUCKETS bucket counters, held in register
     #pragma unroll
-    for(int b = 0; b < NBUCKETS; ++b) {
-        my_bucket[b] = 0;
+    for (int b = 0 ; b < NBUCKETS ; b++)
+    {
+        my_bucket [b] = 0 ;
     }
 
-    __shared__ int64_t ks [chunk_size] ;
-
     //--------------------------------------------------------------------------
-    // assign all entries of C to the buckets
+    // assign buckets to all entries in C(i,j), one chunk at a time
     //--------------------------------------------------------------------------
 
-    // all threads in this block will compute the same values for these:
-    int64_t pfirst, plast, kfirst, klast ;
+#if 0
+    // removing ks saves about 10% of the phase1 time
+    // (19.5 msec to 17.5 msec for the com-Orkut matrix)
+    __shared__ int64_t ks [chunk_size] ;
+#endif
 
-    int64_t chunk_max = GB_ICEIL (mnz, chunk_size) ;
chunk_size -1)/chunk_size; - for ( int64_t chunk = blockIdx.x; - chunk < chunk_max; - chunk += gridDim.x ) + for (int64_t pfirst = blockIdx.x << log2_chunk_size ; + pfirst < mnz ; + pfirst += gridDim.x << log2_chunk_size) { //---------------------------------------------------------------------- - // determine the work done by this iteration, "chunk" + // find the vector k that contains each entry C(i,j) in this chunk //---------------------------------------------------------------------- - // The slice for each task contains entries pfirst:plast-1 of M and C. - // This iteration "chunk" computes Ci and Cx [pfirst...plast-1], using - // Mi and Mx [pfirst:plast-1]. All threads in the thread block are - // used for this "chunk". - pfirst = chunk_size * chunk ; - plast = pfirst + chunk_size ; - // plast = GB_IMIN (plast, mnz) ; - if (plast > mnz) plast = mnz ; - int64_t my_chunk_size = plast - pfirst ; - - // find the first vector of the slice for this chunk: the - // vector that owns the entry Mi [pfirst] and Mx [pfirst]. - kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec, mvlen) ; - - // find the last vector of the slice for task blockIdx.x: the - // vector that owns the entry Mi [plast-1] and Mx [plast-1]. - klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec, mvlen); + // This threadblock works on Mi/Mx and Ci/Mx, in positions pfirst to + // pfirst + my_chunk_size - 1. - // number of vectors in C and M for this "chunk" iteration, where - // Mp [kfirst:klast] will be operated on. - int64_t nk = klast - kfirst + 1 ; +#if 0 + int64_t my_chunk_size = GB_cuda_ek_slice (Mp, mnvec, mnz, pfirst, + chunk_size, /* output: */ ks) ; +#else + int64_t my_chunk_size, mnvec1 ; + float slope ; + int64_t kfirst = GB_cuda_ek_slice_setup (Mp, mnvec, mnz, pfirst, + chunk_size, &my_chunk_size, &mnvec1, &slope) ; +#endif //---------------------------------------------------------------------- - // fill ks to find all indices + // assign entries in C(i,j) to the buckets //---------------------------------------------------------------------- - // search for k values for each entry pfirst:plast-1 - float slope = ((float) nk) / ((float) my_chunk_size) ; - int64_t mnvec1 = mnvec - 1 ; for (int64_t kk = threadIdx.x ; kk < my_chunk_size ; kk += blockDim.x) { - // get a rough estimate of k for the kkth entry in ks - int64_t k = kfirst + (int64_t) (slope * ((float) kk)) ; - // k cannot be smaller than kfirst, but might be bigger than - // mnvec-1, so ensure it is in the valid range, kfirst to mnvec-1 - // k = GB_IMIN (k, mnvec-1) ; - if (k > mnvec1) k = mnvec1 ; - // look for p in Mp, where p is in range pfirst:plast-1 - // where pfirst >= 0 and plast < mnz - int64_t p = kk + pfirst ; - // linear-time search for the k value of the pth entry - while ( Mp [ k + 1 ] <= p ) k++ ; - while ( Mp [ k ] > p ) k-- ; - ks [kk] = k ; - } - this_thread_block().sync(); - //---------------------------------------------------------------------- - // assign entries in C(i,j) to the buckets - //---------------------------------------------------------------------- + //------------------------------------------------------------------ + // determine the kth vector that contains the pth entry + //------------------------------------------------------------------ + +#if 0 + int64_t k = ks [kk] ; // get the k value of Mi,Mx [pM] +#else + int64_t k = GB_cuda_ek_slice_entry (kk, pfirst, Mp, mnvec1, kfirst, + slope) ; +#endif + + //------------------------------------------------------------------ + // get C(i,j): zombie 
if A(:,i) and B(:,j) are empty or M(i,j) false + //------------------------------------------------------------------ + + // C(i,j) is in the kth vector of C, where j == k if C is sparse, + // or j = Mh [k] if C is hypersparse - for ( int64_t pM = pfirst + threadIdx.x; - pM < pfirst + my_chunk_size; - pM += blockDim.x ) - { GB_bucket_code bucket = GB_BUCKET_ZOMBIE ; - int64_t k = ks [pM - pfirst] ; // get the k value of Mi,Mx [pM]. - int64_t i = Mi [ pM ] ; - int64_t j = GBH_M (Mh, k) ; // note that Ch and Mh are the same - if ( GB_MCAST ( Mx, pM, ) ) + int64_t pM = kk + pfirst ; + int64_t i = Mi [pM] ; + + if (GB_MCAST (Mx, pM, )) // if (M (i,j) is true): { //-------------------------------------------------------------- // get B(:,j) //-------------------------------------------------------------- - int64_t pB, pB_end ; + #if GB_B_IS_SPARSE || GB_B_IS_HYPER + int64_t j = GBH_M (Mh, k) ; // that Ch and Mh are the same + int64_t pB, pB_end, bjnz ; + #endif + #if GB_B_IS_HYPER GB_hyper_hash_lookup (Bh, bnvec, Bp, B_Yp, B_Yi, B_Yx, B_hash_bits, j, &pB, &pB_end) ; + bjnz = pB_end - pB ; + if (bjnz > 0) #elif GB_B_IS_SPARSE - pB = Bp[j] ; - pB_end = Bp[j+1] ; + pB = Bp [j] ; + pB_end = Bp [j+1] ; + bjnz = pB_end - pB ; // # of entries in B(:,j) + if (bjnz > 0) #else - // B is bitmap or full - pB = bvlen * j ; - pB_end = pB + j ; + // B is bitmap or full: no need to look up B(:,j) #endif - - int64_t bjnz = pB_end - pB ; - if (bjnz > 0) { //---------------------------------------------------------- // get A(:,i) //---------------------------------------------------------- - int64_t pA, pA_end ; + #if GB_A_IS_SPARSE || GB_A_IS_HYPER + int64_t pA, pA_end, ainz ; + #endif + #if GB_A_IS_HYPER GB_hyper_hash_lookup (Ah, anvec, Ap, A_Yp, A_Yi, A_Yx, A_hash_bits, i, &pA, &pA_end) ; + ainz = pA_end - pA ; + if (ainz > 0) #elif GB_A_IS_SPARSE - pA = Ap[i] ; - pA_end = Ap[i+1] ; + pA = Ap [i] ; + pA_end = Ap [i+1] ; + ainz = pA_end - pA ; // # of entries in A(:,i) + if (ainz > 0) #else - // A is bitmap or full - pA = avlen * i ; - pA_end = pA + i ; + // A is bitmap or full: no need to look up A(:,i) #endif - - int64_t ainz = pA_end - pA ; - if (ainz > 0) { // determine the bucket for C(i,j) #if (GB_A_IS_SPARSE || GB_A_IS_HYPER) && \ @@ -291,12 +255,20 @@ __global__ void GB_jit_AxB_dot3_phase1 } } - Ci[pM] = (bucket == GB_BUCKET_ZOMBIE) * ( GB_FLIP(i) << 4) - + (bucket != GB_BUCKET_ZOMBIE) * ((k<<4) + bucket) ; - my_bucket[bucket]++; + //------------------------------------------------------------------ + // assign C(i,j) to its bucket + //------------------------------------------------------------------ + + // encode the bucket or zombie status in the row index of C(i,j) + Ci [pM] = (bucket == GB_BUCKET_ZOMBIE) * ( GB_FLIP(i) << 4) + + (bucket != GB_BUCKET_ZOMBIE) * ((k<<4) + bucket) ; + + // each thread counts its own bucket sizes + my_bucket [bucket]++ ; } } - this_thread_block().sync(); + + this_thread_block().sync() ; //-------------------------------------------------------------------------- // cumulative sum of each bucket @@ -313,17 +285,17 @@ __global__ void GB_jit_AxB_dot3_phase1 nanobuckets + blockIdx.x * (NBUCKETS * blockDim.x) + threadIdx.x ; #pragma unroll - for (int b = 0; b < NBUCKETS; ++b) + for (int b = 0 ; b < NBUCKETS ; b++) { if ( threadIdx.x == blockDim.x-1) { blockbucket [blockIdx.x + b * gridDim.x] = my_bucket[b] ; } - this_thread_block().sync(); + this_thread_block().sync() ; BlockCumSum(temp_storage).ExclusiveSum( my_bucket[b], my_bucket[b]) ; - this_thread_block().sync(); + 
this_thread_block().sync() ;
 
         nanobucket [b * blockDim.x] = my_bucket[b] ;
     }
@@ -337,7 +309,7 @@ __global__ void GB_jit_AxB_dot3_phase1
     if (threadIdx.x == blockDim.x - 1 )
     {
         #pragma unroll
-        for(int b = 0; b < NBUCKETS; ++b)
+        for (int b = 0; b < NBUCKETS; ++b)
         {
             blockbucket [b * gridDim.x + blockIdx.x] += my_bucket[b];
         }
diff --git a/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase2.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase2.cuh
new file mode 100644
index 0000000000..171347573f
--- /dev/null
+++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase2.cuh
@@ -0,0 +1,176 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase2.cuh
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+// AxB_dot3_phase2: fill the global buckets
+
+//------------------------------------------------------------------------------
+// BlockPrefixCallbackOp
+//------------------------------------------------------------------------------
+
+// A stateful callback functor that maintains a running prefix to be applied
+// during consecutive scan operations.
+struct BlockPrefixCallbackOp
+{
+    // Running prefix
+    int64_t running_total ;
+
+    // Constructor
+    __device__ BlockPrefixCallbackOp (int64_t running_total) :
+        running_total(running_total) {}
+
+    // Callback operator to be entered by the first warp of threads in the
+    // block.  Thread-0 is responsible for returning a value for seeding the
+    // block-wide scan.
+ __device__ int64_t operator()(int64_t block_aggregate) + { + int64_t old_prefix = running_total ; + running_total += block_aggregate ; + return old_prefix ; + } +} ; + +//------------------------------------------------------------------------------ +// blockBucketExclusiveSum +//------------------------------------------------------------------------------ + +__inline__ __device__ void blockBucketExclusiveSum +( + int bucketId, + int64_t *d_data, + int nblocks +) +{ + + // Specialize BlockScan for a 1D block of 32 threads + typedef cub::BlockScan BlockScan ; + + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage ; + + // Initialize running total + BlockPrefixCallbackOp prefix_op (0) ; + + // Have the block iterate over segments of items + int64_t data = 0 ; + + int64_t *blockbucket = d_data ; + + for (int block_id = 0 ; block_id < nblocks ; block_id += blocksize) + { + // Load a segment of consecutive items that are blocked across threads + + int loc = block_id + threadIdx.x; + if (loc < nblocks) + { + data = blockbucket [bucketId*nblocks + loc] ; + } + this_thread_block().sync() ; + + // Collectively compute the block-wide exclusive prefix sum + BlockScan(temp_storage).ExclusiveSum (data, data, prefix_op) ; + this_thread_block().sync() ; + + if (loc < nblocks) + { + blockbucket [bucketId*nblocks + loc] = data ; + } + + // this_thread_block().sync(); + + data = 0 ; + } +} + +//------------------------------------------------------------------------------ +// GB_cuda_AxB_dot3_phase2_kernel +//------------------------------------------------------------------------------ + +// GB_cuda_AxB__dot3_phase2 is a CUDA kernel that takes as input the +// nanobuckets and blockbucket arrays computed by the first phase kernel, +// GB_cuda_AxB__dot3_phase1. The launch geometry of this kernel must match +// the GB_cuda_AxB_dot3_phase1 kernel, with the same # of threads and +// threadblocks. + +__global__ void GB_cuda_AxB_dot3_phase2_kernel +( + // input, not modified: + int64_t *__restrict__ blockbucket, // global bucket count, + // of size NBUCKETS*nblocks + // output: + int64_t *__restrict__ offset, // global offsets, for each bucket + // inputs, not modified: + const int nblocks // input number of blocks to reduce + // across, ie size of vector for 1 bucket +) +{ + + //-------------------------------------------------------------------------- + // sum up the bucket counts of prior threadblocks + //-------------------------------------------------------------------------- + + // blockbucket is an array of size NBUCKETS-by-nblocks, held by row. The + // entry blockbucket [bucket * nblocks + t] holds the # of entries + // in the bucket (in range 0 to NBUCKETS-1) found by threadblock t. 
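As an illustrative host-side model of what this kernel and blockBucketExclusiveSum above produce from that NBUCKETS-by-nblocks array (assumption: small made-up counts; the kernel itself uses warp reductions, atomicAdd, and cub::BlockScan):

#include <cstdint>
#include <cstdio>

#define NBUCKETS 3

int main (void)
{
    // blockbucket [b*nblocks + t] = # of C entries that threadblock t of
    // phase1 assigned to bucket b
    const int nblocks = 4 ;
    int64_t blockbucket [NBUCKETS * nblocks] =
    {
        5, 2, 0, 3,     // bucket 0 (zombies), per threadblock
        1, 4, 2, 0,     // bucket 1
        0, 1, 3, 2      // bucket 2
    } ;

    // offset [b] = total # of entries in bucket b (the atomicAdd's below)
    int64_t offset [NBUCKETS] = { 0, 0, 0 } ;
    for (int b = 0 ; b < NBUCKETS ; b++)
        for (int t = 0 ; t < nblocks ; t++)
            offset [b] += blockbucket [b * nblocks + t] ;

    // exclusive prefix sum within each bucket row, so that threadblock t
    // knows where its share of bucket b starts (blockBucketExclusiveSum)
    for (int b = 0 ; b < NBUCKETS ; b++)
    {
        int64_t running = 0 ;
        for (int t = 0 ; t < nblocks ; t++)
        {
            int64_t c = blockbucket [b * nblocks + t] ;
            blockbucket [b * nblocks + t] = running ;
            running += c ;
        }
    }

    printf ("bucket sizes: %lld %lld %lld\n", (long long) offset [0],
        (long long) offset [1], (long long) offset [2]) ;
    // prints 10 7 6; the rows become 0 5 7 7, 0 1 5 7, and 0 0 1 4
    return (0) ;
}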
+ + uint64_t s [NBUCKETS] ; + + #pragma unroll + for (int b = 0 ; b < NBUCKETS ; b++) + { + s [b] = 0 ; + } + + thread_block_tile<32> tile = tiled_partition<32>(this_thread_block() ); + + #pragma unroll + for (int b = 0 ; b < NBUCKETS ; b++) + { + for (int64_t tid = threadIdx.x + blockIdx.x * blockDim.x ; + tid < nblocks ; + tid += blockDim.x*gridDim.x) + { + s [b] += blockbucket [b * nblocks + tid] ; + } + this_thread_block().sync(); + + s [b] = GB_cuda_warp_sum_uint64 (tile, s [b]) ; + } + + if (threadIdx.x == 0) + { + #pragma unroll + for (int b = 0 ; b < NBUCKETS ; b++) + { + atomicAdd ((unsigned long long int*) &(offset [b]), s [b]) ; + } + } + this_thread_block().sync(); + + if (gridDim.x >= NBUCKETS) + { + // Cumulative sum across blocks for each bucket + if (blockIdx.x i ; // for zombies, or bucket assignment + //int64_t *Mp = C->p ; // for offset calculations + //int64_t mnvec = C->nvec; + + //-------------------------------------------------------------------------- + // load and shift the nanobuckets for this thread block + //-------------------------------------------------------------------------- + + // The taskbucket for this threadblock is an array of size + // NBUCKETS-by-blockDim.x, held by row. It forms a 2D array within the 3D + // nanobuckets array. + const int64_t *taskbucket = nanobuckets + + blockIdx.x * (NBUCKETS * blockDim.x) ; + + // Each thread in this threadblock owns one column of this taskbucket, for + // its set of NBUCKETS nanobuckets. The nanobuckets are a column of length + // NBUCKETS, with stride equal to blockDim.x. + + const int64_t *nanobucket = taskbucket + threadIdx.x ; + + // Each thread loads its NBUCKETS nanobucket values into registers. + int64_t my_bucket [NBUCKETS] ; + + #pragma unroll + for (int b = 0 ; b < NBUCKETS ; b++) + { + my_bucket [b] = nanobucket [b * blockDim.x] + + blockbucket [b * gridDim.x + blockIdx.x] + + bucketp [b] ; + } + + // Now each thread has an index into the global set of NBUCKETS buckets, + // held in bucket, of where to place its own entries. + + //-------------------------------------------------------------------------- + // construct the global buckets + //-------------------------------------------------------------------------- + + // The slice for task blockIdx.x contains entries pfirst:plast-1 of M and + // C, which is the part of C operated on by this threadblock. + + // FIXME: why is bucket_idx needed? + __shared__ int64_t bucket_idx [chunk_size] ; + +// int64_t chunk_max = (cnz + chunk_size -1) / chunk_size ; +// for (int64_t chunk = blockIdx.x ; chunk < chunk_max ; chunk += gridDim.x) + + for (int64_t pfirst = blockIdx.x << log2_chunk_size ; + pfirst < cnz ; + pfirst += gridDim.x << log2_chunk_size) + { + + // pfirst = chunk_size * chunk ; + // plast = GB_IMIN( chunk_size * (chunk+1), cnz ) ; + int64_t plast = pfirst + chunk_size ; + plast = GB_IMIN (plast, cnz) ; + + for (int64_t p = pfirst + threadIdx.x ; p < plast ; p += blockDim.x) + { + // get the entry C(i,j), and extract its bucket. Then + // place the entry C(i,j) in the global bucket it belongs to. + int tid = p - pfirst ; + + // TODO: these writes to global are not coalesced. Instead: each + // threadblock could buffer its writes to NBUCKETS buffers and when + // the buffers are full they can be written to global. 
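Each entry of Ci was packed by phase1 as (k << 4) + bucket, with zombies carrying (GB_FLIP(i) << 4) in bucket 0; the loop below unpacks that tag. A minimal round-trip sketch (GB_FLIP here is a simplified stand-in, assumed to be the usual GraphBLAS zombie involution -(i)-2, whose definition lies outside this patch):

#include <cstdint>
#include <cstdio>

#define GB_FLIP(i)          (-(i)-2)    // assumed zombie involution
#define GB_BUCKET_ZOMBIE    0

int main (void)
{
    int64_t i = 7, k = 3 ;              // C(i,j) lives in vector k
    int bucket = 2 ;                    // phase1 chose bucket 2

    // phase1: pack the vector index and bucket into Ci, branch-free
    int64_t Ci = (bucket == GB_BUCKET_ZOMBIE) * (GB_FLIP (i) << 4)
               + (bucket != GB_BUCKET_ZOMBIE) * ((k << 4) + bucket) ;

    // phase2end: unpack
    int ibucket = (int) (Ci & 0xF) ;    // low 4 bits hold the bucket
    // bucket 0 entries are zombies: clear the tag so Ci holds GB_FLIP(i);
    // nonzero buckets keep (k<<4)+bucket for the phase3 kernels
    Ci = (ibucket == 0) * (Ci >> 4) + (ibucket > 0) * Ci ;

    printf ("bucket %d, Ci = %lld (k = %lld)\n",
        ibucket, (long long) Ci, (long long) (Ci >> 4)) ;
    // prints: bucket 2, Ci = 50 (k = 3)
    return (0) ;
}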
+ + int ibucket = Ci [p] & 0xF; + + //bucket[my_bucket[ibucket]++] = p; + //int idx = (my_bucket[ibucket] - pfirst); + //my_bucket[ibucket] += 1; //blockDim.x ; + //int idx = (my_bucket[ibucket]++ - pfirst) & 0x7F; + //bucket_s[ibucket][ idx ] = p; + + bucket_idx [tid] = my_bucket [ibucket]++ ; + Ci [p] = (ibucket==0) * (Ci [p] >> 4) + (ibucket > 0) * Ci [p] ; + + //if(ibucket == 0) { + //// bucket[my_bucket[0]++] = p; + // Ci[p] = Ci[p] >> 4; + //} else { + // bucket[my_bucket[ibucket]++] = p; + //} + } + + // FIXME: can't this be merged with the loop above? Or is it a + // partial implementation of a coalesced write to the global bucket + // array? + + for (int64_t p = pfirst + threadIdx.x ; p < plast ; p += blockDim.x) + { + int tid = p - pfirst ; + bucket [bucket_idx [tid]] = p ; + } + } +} + diff --git a/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh new file mode 100644 index 0000000000..c36f35d0cc --- /dev/null +++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh @@ -0,0 +1,222 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// This CUDA kernel produces the semiring product of two dense matrices of +// types GB_A_TYPE and GB_B_TYPE and common index space size n, to an output +// matrix of type GB_C_TYPE. The matrices are dense, with uniform non-zeros and +// sparsity patterns. ie. we want to produce C = A'*B in the sense of the +// given semi-ring. + +// This version uses a simple warp-based dense dot product algorithm, when the +// vectors coming from both A and B are dense, for any size of N. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(nzA, nzB), 32) + +// Thus, threadblock b owns a semi-ring dot product on a pair of vectors. +// The work is to load the data, do the multiply and add work and finally +// reduce this data to a scalar, and write it to Cx[pair]. 
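A stripped-down, self-contained version of that per-pair warp dot product (assumptions: a single 32-thread block, plain float arithmetic in place of the JIT-generated GB_GETA/GB_GETB/GB_MULTADD macros, and a raw __shfl_down_sync reduction in place of GB_cuda_warp_reduce_ztype):

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_dense_dot (const float *x, const float *y, int64_t n,
    float *result)
{
    float cij = 0 ;                     // the monoid identity
    // strided loop: lane t handles entries t, t+32, t+64, ...
    for (int64_t k = threadIdx.x ; k < n ; k += blockDim.x)
    {
        cij += x [k] * y [k] ;          // cij += x(k) * y(k)
    }
    // warp shuffle reduction: each step halves the # of active lanes
    for (int offset = 16 ; offset > 0 ; offset >>= 1)
    {
        cij += __shfl_down_sync (0xFFFFFFFF, cij, offset) ;
    }
    if (threadIdx.x == 0) *result = cij ;   // lane 0 holds the full sum
}

int main (void)
{
    const int64_t n = 1000 ;
    float *x, *y, *r ;
    cudaMallocManaged (&x, n * sizeof (float)) ;
    cudaMallocManaged (&y, n * sizeof (float)) ;
    cudaMallocManaged (&r, sizeof (float)) ;
    for (int64_t k = 0 ; k < n ; k++) { x [k] = 1 ; y [k] = 2 ; }
    warp_dense_dot <<<1, 32>>> (x, y, n, r) ;
    cudaDeviceSynchronize ( ) ;
    printf ("dot = %g (expected 2000)\n", (double) *r) ;
    return (0) ;
}

The real kernel also loops over many C(i,j) pairs per threadblock and handles bitmap inputs and zombies; only the cooperative load/multiply/reduce skeleton is shown here.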
+
+//------------------------------------------------------------------------------
+// GB_cuda_AxB_dot3_phase3_dndn_kernel
+//------------------------------------------------------------------------------
+
+__global__ void GB_cuda_AxB_dot3_phase3_dndn_kernel
+(
+    GrB_Matrix C,   // result matrix
+    GrB_Matrix M,   // mask matrix
+    GrB_Matrix A,   // input matrix A
+    GrB_Matrix B    // input matrix B
+)
+{
+
+    //--------------------------------------------------------------------------
+    // get C, M, A, and B
+    //--------------------------------------------------------------------------
+
+    #if !GB_A_IS_PATTERN
+    const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *)A->x ;
+    #endif
+    #if !GB_B_IS_PATTERN
+    const GB_B_TYPE *__restrict__ Bx = (GB_B_TYPE *)B->x ;
+    #endif
+    GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *)C->x ;
+    int64_t *__restrict__ Ci = C->i ;
+    const int64_t *__restrict__ Mi = M->i ;
+    #if GB_M_IS_HYPER
+    const int64_t *__restrict__ Mh = M->h ;
+    #endif
+    // A and B are either bitmap or full
+    #if GB_A_IS_BITMAP
+    const int8_t *__restrict__ Ab = A->b ;
+    #endif
+    #if GB_B_IS_BITMAP
+    const int8_t *__restrict__ Bb = B->b ;
+    #endif
+
+    // zombie count
+    uint64_t zc = 0 ;
+
+    GB_M_NVALS (mnz) ;
+
+    // total items to be inspected
+    int64_t vlen = A->vlen ;
+    ASSERT (vlen == B->vlen) ;
+    ASSERT (vlen > 0) ;
+
+    //--------------------------------------------------------------------------
+    // compute C(i,j) = A(:,i)'*B(:,j) for each entry in M(i,j)
+    //--------------------------------------------------------------------------
+
+    for (int64_t pM = blockIdx.x ; pM < mnz ; pM += gridDim.x)
+    {
+
+        //----------------------------------------------------------------------
+        // get M(i,j) and C(i,j)
+        //----------------------------------------------------------------------
+
+        int64_t i = Mi [pM] ;
+        int64_t kth = Ci [pM] ;     // C(i,j) is in the kth vector of C
+        bool cij_exists = false ;
+        GB_DECLARE_IDENTITY (cij) ; // GB_Z_TYPE cij = identity
+
+        //----------------------------------------------------------------------
+        // The threadblock cooperates to compute a single entry C(i,j)
+        //----------------------------------------------------------------------
+
+        #ifndef GB_MASK_STRUCT
+        // skip if C(i,j) is a prezombie
+        if (kth >= 0)
+        #endif
+        {
+
+            // j = kth or j = Mh [kth] if C and M are hypersparse
+            int64_t j = GBH_M (Mh, kth) ;
+            int64_t pA = vlen * i ;
+            int64_t pB = vlen * j ;
+
+            GB_DECLAREA (aki) ;
+            GB_DECLAREB (bkj) ;
+
+            #if GB_A_IS_FULL && GB_B_IS_FULL
+            {
+                cij_exists = true ;
+                for (int64_t k = threadIdx.x ; k < vlen ; k += blockDim.x)
+                {
+                    // cij += A(k,i) * B(k,j)
+                    GB_GETA (aki, Ax, pA+k, ) ;             // aki = A(k,i)
+                    GB_GETB (bkj, Bx, pB+k, ) ;             // bkj = B(k,j)
+                    GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj
+                }
+            }
+            #elif GB_A_IS_BITMAP && GB_B_IS_BITMAP
+            {
+                for ( int64_t k = threadIdx.x ; k < vlen ; k += blockDim.x)
+                {
+                    GB_GETA (aki, Ax, pA+k, ) ;             // aki = A(k,i)
+                    GB_GETB (bkj, Bx, pB+k, ) ;             // bkj = B(k,j)
+                    int8_t b = (Ab [pA+k] && Bb [pB+k]) ;
+                    cij_exists |= b ;
+                    if (b)
+                    {
+                        // cij += aki * bkj
+                        GB_MULTADD ( cij, aki, bkj, i, k, j ) ;
+                    }
+                }
+            }
+            #elif GB_A_IS_FULL && GB_B_IS_BITMAP
+            {
+                for ( int64_t k = threadIdx.x ; k < vlen ; k += blockDim.x)
+                {
+                    if (Bb [pB+k])
+                    {
+                        GB_GETA (aki, Ax, pA+k, ) ;         // aki = A(k,i)
+                        GB_GETB (bkj, Bx, pB+k, ) ;         // bkj = B(k,j)
+                        // cij += aki * bkj
+                        GB_MULTADD ( cij, aki, bkj, i, k, j ) ;
+                        cij_exists = true ;
+                    }
+                }
+            }
+            #elif GB_A_IS_BITMAP && GB_B_IS_FULL
+            {
+                for ( int64_t k = threadIdx.x ; k < vlen ; k += blockDim.x)
+                {
+                    if (Ab [pA+k])
+                    {
+                        GB_GETA (aki, Ax, pA+k, ) ;         // aki = A(k,i)
+                        GB_GETB (bkj, Bx, pB+k, ) ;         // bkj = B(k,j)
+                        // cij += aki * bkj
+                        GB_MULTADD ( cij, aki, bkj, i, k, j ) ;
+                        cij_exists = true ;
+                    }
+                }
+            }
+            #endif
+        }
+
+        //----------------------------------------------------------------------
+        // reduce per-thread sums to a single scalar
+        //----------------------------------------------------------------------
+
+        // FIXME: no need to do this if C(i,j) is a zombie (cij_exists is
+        // always false), or if A and B are both full and C(i,j) is not a
+        // zombie (cij_exists is always true).
+
+        // FIXME: this only works if the size of the thread block is 32,
+        // right?
+
+        // Do vote here for control.
+        thread_block_tile<32> tile = tiled_partition<32> (this_thread_block()) ;
+
+        // FIXME: tile.any takes an int predicate, not bool.  How does this work?
+        cij_exists = tile.any (cij_exists) ;
+        tile.sync();
+
+        #if !GB_C_ISO
+        // FIXME: the ANY monoid needs the cij_exists for each thread
+        cij = GB_cuda_warp_reduce_ztype (tile, cij) ;
+        #endif
+
+        // FIXME: if A and B are full, and GB_MASK_STRUCT is true, cij_exists
+        // is always true because vlen > 0 always holds for this kernel.
+
+        // FIXME: if kth < 0, C(i,j) is a prezombie, and Ci [pM] already holds
+        // GB_FLIP (i).
+
+        // write result for this block to global mem
+        if (threadIdx.x == 0)
+        {
+            if (cij_exists)
+            {
+                // Cx [pM] = (GB_C_TYPE) cij
+                GB_PUTC (cij, Cx, pM) ;
+                Ci [pM] = i ;
+            }
+            else
+            {
+                // cij is a zombie
+                zc++ ;
+                Ci [pM] = GB_FLIP (i) ;
+            }
+        }
+
+        // __syncthreads ( ) ;
+    }
+
+    if (threadIdx.x == 0 && zc > 0)
+    {
+        // this threadblock accumulates its zombie count into the global
+        // zombie count, once, after the loop over all of its C(i,j) entries
+        GB_cuda_atomic_add (&(C->nzombies), zc) ;
+    }
+}
+
diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_mp.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
similarity index 64%
rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
rename to GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
index 838b7e4ccf..3fb4ead9e8 100644
--- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
+++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
@@ -2,88 +2,46 @@
 // GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-// This CUDA kernel produces the semi-ring product of two
-// sparse matrices of types T_A and T_B and common index space size n, to a
-// output matrix of type T_C. The matrices are sparse, with different numbers
-// of non-zeros and different sparsity patterns.
-// ie. we want to produce C = A'*B in the sense of the given semi-ring.
+// This CUDA kernel produces the semi-ring product of two sparse matrices of
+// types GB_A_TYPE and GB_B_TYPE and common index space size n, to an output
+// matrix of type GB_C_TYPE. The matrices are sparse, with different numbers of
+// non-zeros and different sparsity patterns; i.e., we want to produce C = A'*B
+// in the sense of the given semi-ring.
 
-// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are
-// relatively close in size, neither is very sparse nor dense, for any size of N.
-// Handles arbitrary sparsity patterns with guaranteed load balance. +// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are +// relatively close in size, neither is very sparse nor dense, for any size of +// N. Handles arbitrary sparsity patterns with guaranteed load balance. // Both the grid and block are 1D, so blockDim.x is the # threads in a // threadblock, and the # of threadblocks is grid.x -// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number -// of active threads = min( min(g_xnz, g_ynz), 32) +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number of +// active threads = min( min(g_xnz, g_ynz), 32) -// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. Its job -// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot -// product on those items in the intersection, and finally reduce this data to a scalar, -// on exit write it to g_odata [b]. +// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. +// Its job is to find the intersection of the index sets g_xi and g_yi, perform +// the semi-ring dot product on those items in the intersection, and finally +// reduce this data to a scalar, on exit write it to g_odata [b]. // int64_t start <- start of vector pairs for this kernel // int64_t end <- end of vector pairs for this kernel // int64_t *Bucket <- array of pair indices for all kernels -// matrix *C <- result matrix -// matrix *M <- mask matrix -// matrix *A <- input matrix A -// matrix *B <- input matrix B - -#pragma once - -#include -#include -#include -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include "GB_cuda_atomics.cuh" -#include "GB_hash.h" -#include "GB_hyper_hash_lookup.h" -#include "GB_cuda_dot3_defn.h" - -// Using tile size fixed at compile time, we don't need shared memory -#define tile_sz 32 - -using namespace cooperative_groups; - -//------------------------------------------------------------------------------ -// GB_reduce_sum -//------------------------------------------------------------------------------ - -template< typename T_Z, int warp_sz> -__device__ __inline__ -T_Z GB_reduce_sum(thread_block_tile g, T_Z val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - // Temporary T_Z is necessary to handle arbirary ops - // FIXME: only works if sizeof(T_Z) <= 32 bytes - // FIXME: the ANY monoid needs the cij_exists for each thread - #pragma unroll - for (int i = warp_sz >> 1; i > 0; i >>= 1) - { - T_Z next = g.shfl_down( val, i); - GB_ADD( val, val, next ); - } - return val; -} +// GrB_Matrix C <- result matrix +// GrB_Matrix M <- mask matrix +// GrB_Matrix A <- input matrix A +// GrB_Matrix B <- input matrix B //------------------------------------------------------------------------------ -// AxB_dot3_phase3_mp +// GB_cuda_AxB_dot3_phase3_mp_kernel //------------------------------------------------------------------------------ - -template< - typename T_C, typename T_A, typename T_B, - typename T_Z, typename T_X, typename T_Y, - uint64_t srcode> -__global__ void AxB_dot3_phase3_mp // FIXME rename + +__global__ void GB_cuda_AxB_dot3_phase3_mp_kernel ( int64_t start, int64_t end, @@ -91,19 +49,17 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, - GrB_Matrix B, - int sz + GrB_Matrix B ) { - // TODO: Figure out how to use graphblas-specific INFINITY macro - #ifndef INFINITY - 
#define INFINITY std::numeric_limits::max() + #if !GB_A_IS_PATTERN + const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *)A->x ; #endif - - const T_A *__restrict__ Ax = (T_A *)A->x ; - const T_B *__restrict__ Bx = (T_B *)B->x ; - T_C *__restrict__ Cx = (T_C *)C->x ; + #if !GB_B_IS_PATTERN + const GB_B_TYPE *__restrict__ Bx = (GB_B_TYPE *)B->x ; + #endif + GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *)C->x ; int64_t *__restrict__ Ci = C->i ; const int64_t *__restrict__ Mi = M->i ; #if GB_M_IS_HYPER @@ -141,18 +97,10 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename // zombie count int64_t zc = 0; - int64_t pair_id; - // set thread ID - int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; +// int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; int tid = threadIdx.x; - int b = blockIdx.x ; - - // total items to be inspected - int64_t ainz = 0; - int64_t bjnz = 0; - thread_block_tile tile = tiled_partition( this_thread_block()); int all_in_one = ( (end - start) == (M->p)[(M->nvec)] ) ; @@ -163,7 +111,7 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename kk += gridDim.x ) { - pair_id = all_in_one ? kk : Bucket [kk] ; + int64_t pair_id = all_in_one ? kk : Bucket [kk] ; int64_t i = Mi[pair_id]; int64_t k = Ci[pair_id] >> 4; @@ -180,7 +128,7 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename pA_end = Ap[i+1] ; #endif - ainz = pA_end - pA_start ; + int64_t ainz = pA_end - pA_start ; GB_DECLAREA (aki) ; GB_DECLAREB (bkj) ; @@ -188,7 +136,6 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename int cij_exists = 0 ; // FIXME: make a bool - #define shared_vector_size 128 __shared__ int64_t Ai_s[shared_vector_size]; int shared_steps_A = (ainz + shared_vector_size -1)/shared_vector_size; @@ -210,7 +157,7 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename pB_end = Bp[j+1] ; #endif - bjnz = pB_end - pB_start; // bjnz + int64_t bjnz = pB_end - pB_start; // bjnz int shared_steps_B = (bjnz + shared_vector_size -1)/shared_vector_size; __shared__ int64_t Bj_s[shared_vector_size]; @@ -221,14 +168,7 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename Bj_s[i] = Bi[ i + pB_start]; } this_thread_block().sync(); - - //if (threadIdx.x ==0 ) { - // printf("block %d doing dot %lld i,j= %lld,%lld\n", blockIdx.x, pair_id, i, j); - // printf("block %d doing dot %lld ainz,bjnz= %lld,%lld, A_steps=%d, B_steps=%d\n", - // blockIdx.x, pair_id, ainz, bjnz, shared_steps_A, shared_steps_B); - //} - //this_thread_block().sync(); - + //we want more than one intersection per thread while ( (shared_steps_A > 0) && (shared_steps_B > 0) ) { @@ -238,40 +178,28 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename if ( shared_steps_B > 1) bwork = shared_vector_size; int64_t nxy = awork + bwork; - int work_per_thread = (nxy + blockDim.x -1)/blockDim.x; // ceil Divide by 32 = blockDim.x + // ceil Divide by 32 = blockDim.x : + int work_per_thread = (nxy + blockDim.x -1)/blockDim.x; int diag = GB_IMIN( work_per_thread*tid, nxy); int diag_end = GB_IMIN( diag + work_per_thread, nxy); - //printf(" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\n",tid, blockDim.x, work_per_thread, diag, diag_end); - //if (1) //(threadIdx.x == 0) - //{ - // printf ("pair %ld tid%d work_per_thread %d nxy %ld parts %d diag %d diag_end %d Astep=%d, Bstep=%d\n", - // pair_id, threadIdx.x, work_per_thread, nxy, blockDim.x, diag, diag_end,shared_steps_A,shared_steps_B) ; - //} - //this_thread_block().sync(); + // bwork takes place of bjnz: + int x_min = GB_IMAX( (diag - bwork) , 0); - int x_min = GB_IMAX( (diag - bwork) , 0); //bwork takes place of bjnz - int 
x_max = GB_IMIN( diag, awork); //awork takes place of ainz + //awork takes place of ainz: + int x_max = GB_IMIN( diag, awork); while ( x_min < x_max) { //binary search for correct diag break int pivot = (x_min +x_max) >> 1; - //printf("start search thd%u piv=%u xmin,xmax = %u,%u diag_end=%d\n", tid_global, pivot, x_min, x_max, diag_end); int64_t Apiv = Ai_s[pivot] ; int64_t Bpiv = Bj_s[diag -pivot -1] ; - // if ( Apiv < Bpiv ) { - // x_min = pivot +1; - // } - // else { - // x_max = pivot; - // } - x_min = (pivot + 1)* (Apiv < Bpiv) + x_min * (1 - (Apiv < Bpiv)); - x_max = pivot * (1 - (Apiv < Bpiv)) + x_max * (Apiv < Bpiv); + x_min = (pivot + 1)* (Apiv < Bpiv) + x_min * (1 - (Apiv < Bpiv)); + x_max = pivot * (1 - (Apiv < Bpiv)) + x_max * (Apiv < Bpiv); } - //printf("start search thd%u xcoord= %u diag=%d, diag_end=%d\n", tid_global, x_min, diag, diag_end); int xcoord = x_min; int ycoord = diag -x_min -1; @@ -285,8 +213,6 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename int tx_start = xcoord; // +pA_start; int ty_start = diag -xcoord; // +pB_start; - //if (x_start != y_start) - // printf("start thd%u xs,ys = %i,%i\n", tid_global, x_start, y_start); x_min = GB_IMAX( (diag_end - bwork), 0); //bwork replace bjnz x_max = GB_IMIN( diag_end, awork); //awork replace ainz @@ -297,16 +223,10 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename int64_t Apiv = Ai_s[pivot] ; int64_t Bpiv = Bj_s[diag_end -pivot -1] ; - //if ( Apiv < Bpiv ) { - // x_min = pivot +1; - //} - //else { - // x_max = pivot; - //} x_min = (pivot + 1)* (Apiv < Bpiv) + x_min * (1 - (Apiv < Bpiv)); x_max = pivot * (1 - (Apiv < Bpiv)) + x_max * (Apiv < Bpiv); } - //printf("end search thd%u x_coord = %u diag=%d, diag_end=%d\n", tid_global, x_min, diag, diag_end); + xcoord = x_min; ycoord = diag_end -x_min -1; @@ -318,21 +238,6 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename int64_t pA = tx_start; // pA int64_t pB = ty_start; // pB - //if (1) // threadIdx.x == 0) - //{ - // printf ("%d tx_start %d\n", threadIdx.x, tx_start) ; - // printf ("%d tx_end %d\n", threadIdx.x, tx_end ) ; - // printf ("%d ty_start %d\n", threadIdx.x, ty_start) ; - // printf ("%d ty_end %d\n", threadIdx.x, ty_end ) ; - //} - //this_thread_block().sync(); - - // if(threadIdx.x == 0 ) { - // printf("blk%d, thd%d k=%d, l=%d, tx_start=%d, ty_start=%d, tx_end=%d, ty_end=%d\n", - // blockIdx.x, tid_global, k, l, tx_start, ty_start, tx_end, ty_end); - // } - // this_thread_block().sync(); - while ( pA < tx_end && pB < ty_end ) { int64_t Aind = Ai_s[pA] ; @@ -416,14 +321,6 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename // reduce sum per-thread values to a single scalar, get OR of flag //---------------------------------------------------------------------- - /* - if (tid == 0) - { - printf ("reduce %d : %d exists = %d\n", b, cij, cij_exists) ; - } - __syncthreads(); - */ - // Do vote here for control. 
cij_exists = tile.any (cij_exists) ; tile.sync ( ) ; @@ -432,7 +329,7 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename if (cij_exists) { // FIXME: the ANY monoid needs the cij_exists for each thread - cij = GB_reduce_sum( tile, cij ); + cij = GB_cuda_warp_reduce_ztype (tile, cij) ; } #endif @@ -441,7 +338,8 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename { if (cij_exists) { - GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (T_C) cij + // Cx [pair_id] = (GB_C_TYPE) cij + GB_PUTC (cij, Cx, pair_id) ; Ci [pair_id] = i ; } else diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh similarity index 52% rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh rename to GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh index e8986d86cf..c0e04e9361 100644 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh +++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh @@ -2,96 +2,55 @@ // GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh //------------------------------------------------------------------------------ +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ -// This CUDA kernel produces the semi-ring product of two -// sparse matrices of types T_A and T_B and common index space size n, to a -// output matrix of type T_C. The matrices are sparse, with different numbers -// of non-zeros and different sparsity patterns. -// ie. we want to produce C = A'*B in the sense of the given semi-ring. +// This CUDA kernel produces the semi-ring product of two sparse matrices of +// types GB_A_TYPE and GB_B_TYPE and common index space size n, to an output +// matrix of type GB_C_TYPE. The matrices are sparse, with different numbers of +// non-zeros and different sparsity patterns. ie. we want to produce C = A'*B +// in the sense of the given semi-ring. // This version uses an entire threadblock to compute each C(i,j) dot product. 
// Both the grid and block are 1D, so blockDim.x is the # threads in a // threadblock, and the # of threadblocks is grid.x -// int64_t start <- start of vector pairs for this kernel -// int64_t end <- end of vector pairs for this kernel -// int64_t *Bucket <- array of pair indices for all kernels -// matrix *C <- result matrix -// matrix *M <- mask matrix -// matrix *A <- input matrix A -// matrix *B <- input matrix B - -#pragma once - -#include -#include -#include -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include "GB_hash.h" -#include "GB_hyper_hash_lookup.h" -#include "GB_cuda_dot3_defn.h" - -// Using tile size fixed at compile time, we don't need shared memory -#define tile_sz 32 - -using namespace cooperative_groups; - //------------------------------------------------------------------------------ -// GB_reduce_sum +// GB_cuda_AxB_dot3_phase3_spdn_kernel //------------------------------------------------------------------------------ -template< typename T_Z, int warp_sz> -__device__ __inline__ -T_Z GB_reduce_sum(thread_block_tile g, T_Z val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - // Temporary T_Z is necessary to handle arbirary ops - // FIXME: only works if sizeof(T_Z) <= 32 bytes - // FIXME: the ANY monoid needs the cij_exists for each thread - #pragma unroll - for (int i = warp_sz >> 1; i > 0; i >>= 1) - { - T_Z next = g.shfl_down( val, i); - GB_ADD( val, val, next ); - } - return val; -} - -//------------------------------------------------------------------------------ -// AxB_dot3_phase3_spdn -//------------------------------------------------------------------------------ - -template< - typename T_C, typename T_A, typename T_B, - typename T_Z, typename T_X, typename T_Y, - uint64_t srcode> -__global__ void AxB_dot3_phase3_spdn // FIXME rename +__global__ void GB_cuda_AxB_dot3_phase3_spdn_kernel ( - int64_t start, - int64_t end, + int64_t start, // start of vector pairs for this kernel + int64_t end, // end of vector pairs for this kernel int64_t *Bucket, // do the work in Bucket [start:end-1] - GrB_Matrix C, - GrB_Matrix M, - GrB_Matrix A, - GrB_Matrix B, - int sz // FIXME: unused + GrB_Matrix C, // result matrix + GrB_Matrix M, // mask matrix + GrB_Matrix A, // input matrix A + GrB_Matrix B // input matrix B ) { - // TODO: Figure out how to use graphblas-specific INFINITY macro - #ifndef INFINITY - #define INFINITY std::numeric_limits::max() + // sparse-times-dense or dense-times-sparse + #if !(((GB_A_IS_SPARSE || GB_A_IS_HYPER) && \ + (GB_B_IS_BITMAP || GB_B_IS_FULL)) \ + || \ + ((GB_B_IS_SPARSE || GB_B_IS_HYPER) && \ + (GB_A_IS_BITMAP || GB_A_IS_FULL))) + #error "spdn: for sparse-dense or dense-sparse cases only" #endif - const T_A *__restrict__ Ax = (T_A *)A->x ; - const T_B *__restrict__ Bx = (T_B *)B->x ; - T_C *__restrict__ Cx = (T_C *)C->x ; + #if !GB_A_IS_PATTERN + const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *)A->x ; + #endif + #if !GB_B_IS_PATTERN + const GB_B_TYPE *__restrict__ Bx = (GB_B_TYPE *)B->x ; + #endif + GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *)C->x ; int64_t *__restrict__ Ci = C->i ; const int64_t *__restrict__ Mi = M->i ; #if GB_M_IS_HYPER @@ -101,6 +60,8 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename #if GB_A_IS_HYPER || GB_A_IS_SPARSE const int64_t *__restrict__ Ai = A->i ; const int64_t *__restrict__ Ap = A->p ; + #else + const int64_t avlen = A->vlen ; #endif #if GB_A_IS_BITMAP @@ -110,6 +71,8 @@ __global__ void AxB_dot3_phase3_spdn // 
FIXME rename #if GB_B_IS_HYPER || GB_B_IS_SPARSE const int64_t *__restrict__ Bi = B->i ; const int64_t *__restrict__ Bp = B->p ; + #else + const int64_t bvlen = B->vlen ; #endif #if GB_B_IS_BITMAP @@ -136,87 +99,94 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename const int64_t B_hash_bits = (B->Y == NULL) ? 0 : (B->Y->vdim - 1) ; #endif - // zombie count - int64_t zc = 0; + // zombie count for this threadblock + uint64_t zc = 0 ; - int64_t pair_id; + thread_block_tile tile = + tiled_partition (this_thread_block()) ; - thread_block_tile tile = tiled_partition( this_thread_block()); - int all_in_one = ( (end - start) == (M->p)[(M->nvec)] ) ; + GB_M_NVALS (mnz) ; + ASSERT (GB_M_IS_SPARSE || GB_M_IS_HYPER) ; + int64_t cnz_in_bucket = end - start ; + int all_in_one = (cnz_in_bucket == mnz) ; // Main loop over pairs int64_t kk ; - for (kk = start+ blockIdx.x; // warp per C(i,j)=A(:,i)'*B(:,j) dot product - kk < end; - kk += gridDim.x ) + for (kk = start + blockIdx.x ; // warp per C(i,j)=A(:,i)'*B(:,j) dot product + kk < end ; + kk += gridDim.x) { - pair_id = all_in_one ? kk : Bucket [kk] ; - int64_t i = Mi[pair_id]; - int64_t k = Ci[pair_id] >> 4; + //---------------------------------------------------------------------- + // get M(i,j) and C(i,j) + //---------------------------------------------------------------------- + int64_t pair_id = all_in_one ? kk : Bucket [kk] ; + int64_t i = Mi [pair_id] ; + int64_t k = Ci [pair_id] >> 4 ; // j = k or j = Mh [k] if C and M are hypersparse int64_t j = GBH_M (Mh, k) ; - // find A(:,i) - int64_t pA, pA_end ; + //---------------------------------------------------------------------- + // get A(:,i) + //---------------------------------------------------------------------- + #if GB_A_IS_HYPER + int64_t pA, pA_end ; GB_hyper_hash_lookup (Ah, anvec, Ap, A_Yp, A_Yi, A_Yx, A_hash_bits, i, &pA, &pA_end) ; #elif GB_A_IS_SPARSE - pA = Ap[i] ; - pA_end = Ap[i+1] ; + int64_t pA = Ap [i] ; + int64_t pA_end = Ap [i+1] ; #else - // A is bitmap or full - pA = A->vlen * i ; - pA_end = pA + i ; + // A is bitmap or full: only pA is needed + int64_t pA = avlen * i ; #endif - GB_DECLAREA (aki) ; - GB_DECLAREB (bkj) ; - GB_DECLARE_IDENTITY (cij) ; // GB_Z_TYPE cij = identity - - int cij_exists = 0 ; // FIXME: make a bool + //---------------------------------------------------------------------- + // get B(:,j) + //---------------------------------------------------------------------- - // find B(:,j) - int64_t pB, pB_end ; #if GB_B_IS_HYPER + int64_t pB, pB_end ; GB_hyper_hash_lookup (Bh, bnvec, Bp, B_Yp, B_Yi, B_Yx, B_hash_bits, j, &pB, &pB_end) ; #elif GB_B_IS_SPARSE - pB = Bp[j] ; - pB_end = Bp[j+1] ; + int64_t pB = Bp [j] ; + int64_t pB_end = Bp [j+1] ; #else - // B is bitmap or full - pB = B->vlen * j ; - pB_end = pB + j ; + // B is bitmap or full: only pB is needed + int64_t pB = bvlen * j ; #endif //---------------------------------------------------------------------- - // compute C(i,j) = A(:,i)'*B(:,j) using the entire threadblock + // C(i,j) = A(:,i)'*B(:,j) using the entire threadblock //---------------------------------------------------------------------- + GB_DECLAREA (aki) ; + GB_DECLAREB (bkj) ; + GB_DECLARE_IDENTITY (cij) ; // GB_Z_TYPE cij = identity + int cij_exists = 0 ; + #if ( GB_A_IS_FULL ) { -// int64_t bjnz = pB_end - pB ; // bjnz = nnz (B (:,j)) -// if (bjnz > 0) // will always be >= 128 - { - //-------------------------------------------------------------- - // A is full and B is sparse/hyper - 
//-------------------------------------------------------------- + //------------------------------------------------------------------ + // A is full and B is sparse/hyper + //------------------------------------------------------------------ - cij_exists = true ; - for (int64_t p = pB + threadIdx.x ; p < pB_end ; p += blockDim.x) - { - int64_t k = Bi [p] ; // next row index of B(:,j) - // cij += A(k,i) * B(k,j) - GB_GETA ( aki, Ax, pA+k, ) ; // aki = A(k,i) - GB_GETB ( bkj, Bx, p, ) ; // bkj = B(k,j) - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj - GB_DOT_TERMINAL (cij) ; // break if cij == terminal - } + cij_exists = true ; + for (int64_t p = pB + threadIdx.x ; p < pB_end ; p += blockDim.x) + { + int64_t k = Bi [p] ; // next row index of B(:,j) + // cij += A(k,i) * B(k,j) + GB_GETA ( aki, Ax, pA+k, ) ; // aki = A(k,i) + GB_GETB ( bkj, Bx, p, ) ; // bkj = B(k,j) + // cij += aki * bkj + GB_MULTADD ( cij, aki, bkj, i, k, j ) ; + GB_DOT_TERMINAL (cij) ; // break if cij == terminal } + } #elif ( GB_A_IS_BITMAP ) { @@ -237,25 +207,23 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename } #elif ( GB_B_IS_FULL ) { -// int64_t ainz = pA_end - pA ; // ainz = nnz (A (:,i)) -// if (ainz > 0) // will always be >= 128 - { - //-------------------------------------------------------------- - // A is sparse/hyper and B is full - //-------------------------------------------------------------- + //------------------------------------------------------------------ + // A is sparse/hyper and B is full + //------------------------------------------------------------------ - cij_exists = true ; - for (int64_t p = pA + threadIdx.x ; p < pA_end ; p += blockDim.x) - { - int64_t k = Ai [p] ; // next row index of A(:,i) - // cij += A(k,i) * B(k,j) - GB_GETA ( aki, Ax, p, ) ; // aki = A(i,k) - GB_GETB ( bkj, Bx, pB+k, ) ; // bkj = B(j,k) - GB_MULTADD ( cij, aki, bkj, i, k, j) ; // cij += aik * bjk - GB_DOT_TERMINAL (cij) ; // break if cij == terminal - } + cij_exists = true ; + for (int64_t p = pA + threadIdx.x ; p < pA_end ; p += blockDim.x) + { + int64_t k = Ai [p] ; // next row index of A(:,i) + // cij += A(k,i) * B(k,j) + GB_GETA ( aki, Ax, p, ) ; // aki = A(i,k) + GB_GETB ( bkj, Bx, pB+k, ) ; // bkj = B(j,k) + // cij += aik * bjk + GB_MULTADD ( cij, aki, bkj, i, k, j) ; + GB_DOT_TERMINAL (cij) ; // break if cij == terminal } + } #elif ( GB_B_IS_BITMAP ) { @@ -277,13 +245,17 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename } #endif + //---------------------------------------------------------------------- + // save C(i,j) or declare it a zombie + //---------------------------------------------------------------------- + GB_CIJ_EXIST_POSTCHECK //---------------------------------------------------------------------- // reduce sum per-thread values to a single scalar, get OR of flag //---------------------------------------------------------------------- - // Do vote here for control. 
+ // Do vote here for control cij_exists = tile.any (cij_exists) ; tile.sync ( ) ; @@ -291,7 +263,7 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename if (cij_exists) { // FIXME: the ANY monoid needs cij_exists for each thread - cij = GB_reduce_sum( tile, cij ); + cij = GB_cuda_warp_reduce_ztype (tile, cij) ; } #endif @@ -300,13 +272,14 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename { if (cij_exists) { - GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (T_C) cij + // Cx [pair_id] = (GB_C_TYPE) cij + GB_PUTC (cij, Cx, pair_id) ; Ci [pair_id] = i ; } else { // cij is a zombie - zc++; + zc++ ; Ci [pair_id] = GB_FLIP (i) ; } } @@ -319,7 +292,7 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename if (threadIdx.x == 0 && zc > 0) { - GB_cuda_atomic_add ( &(C->nzombies), zc) ; + GB_cuda_atomic_add (&(C->nzombies), zc) ; } } diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh similarity index 64% rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh rename to GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh index e5168527e0..018df8c1ae 100644 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh +++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh @@ -2,6 +2,8 @@ // GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh //------------------------------------------------------------------------------ +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -11,76 +13,46 @@ // Each thread in this kernel is responsible for m vector-pairs(x,y), // m = 256/sz, where sz is in {4, 16, 64, 256} // We know each non-zero on the sparse side will hit a dense value. -// Template on // Parameters: -// matrix *C <- C result matrix -// matrix *M <- Mask matrix -// matrix *A <- A matrix to multiply, sparse -// matrix *B <- B matrix to multiply, dense in sparse format? +// C <- C result matrix +// M <- Mask matrix +// A <- A matrix to multiply, sparse +// B <- B matrix to multiply, dense in sparse format? 
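Since vsdn gives each thread one whole dot product, the inner loop is just a serial sparse-times-dense merge; a host-side sketch of that per-thread loop (assumptions: CSC-style index/value arrays and float arithmetic standing in for the JIT macros):

#include <cstdint>
#include <cstdio>

// one dot product C(i,j) = A(:,i)'*B(:,j), with A sparse and B full:
// every stored entry of A(:,i) is guaranteed to hit a value of B(:,j)
static float vsdn_dot (const int64_t *Ai, const float *Ax,
    int64_t pA, int64_t pA_end,         // A(:,i) = Ai,Ax [pA:pA_end-1]
    const float *Bx, int64_t pB)        // B(:,j) starts at Bx [pB]
{
    float cij = 0 ;
    for (int64_t p = pA ; p < pA_end ; p++)
    {
        int64_t k = Ai [p] ;            // row index of A(k,i)
        cij += Ax [p] * Bx [pB + k] ;   // cij += A(k,i) * B(k,j)
    }
    return (cij) ;
}

int main (void)
{
    // A(:,0) has entries at rows 1 and 3 ; B(:,0) is full with vlen = 4
    const int64_t Ai [ ] = { 1, 3 } ;
    const float   Ax [ ] = { 2, 5 } ;
    const float   Bx [ ] = { 10, 20, 30, 40 } ;
    printf ("cij = %g (expected 2*20 + 5*40 = 240)\n",
        (double) vsdn_dot (Ai, Ax, 0, 2, Bx, 0)) ;
    return (0) ;
}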
//****************************************************************************** -#pragma once -#include -#include -#include -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include "GB_hash.h" -#include "GB_hyper_hash_lookup.h" -#include -#define tile_sz 32 -//#include "local_cub/block/block_reduce.cuh" -#include -#include "GB_cuda_dot3_defn.h" - -using namespace cooperative_groups; - -//------------------------------------------------------------------------------ -// reduce_sum_int64 -//------------------------------------------------------------------------------ - -// for counting zombies only (always int64_t) -template< int warpSize > -__device__ int64_t reduce_sum_int64(thread_block_tile g, int64_t val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = g.size() / 2; i > 0; i /= 2) - { - val += g.shfl_down(val,i) ; - } - return val; // note: only thread 0 will return full sum -} - //------------------------------------------------------------------------------ -// AxB_dot3_phase3_vsdn +// GB_cuda_AxB_dot3_phase3_vsdn_kernel //------------------------------------------------------------------------------ -template< - typename T_C, typename T_A, typename T_B, - typename T_Z, typename T_X, typename T_Y, - uint64_t srcode> -__global__ void AxB_dot3_phase3_vsdn +__global__ void GB_cuda_AxB_dot3_phase3_vsdn_kernel ( - int64_t start, - int64_t end, - int64_t *Bucket, // do the work in Bucket [start:end-1] - GrB_Matrix C, - GrB_Matrix M, - GrB_Matrix A, - GrB_Matrix B, - int sz // unused (FIXME: remove this) + int64_t start, + int64_t end, + int64_t *Bucket, // do the work in Bucket [start:end-1] + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B ) { - // TODO: Figure out how to use graphblas-specific INFINITY macro - #ifndef INFINITY - #define INFINITY std::numeric_limits::max() + + // sparse-times-dense or dense-times-sparse + #if !(((GB_A_IS_SPARSE || GB_A_IS_HYPER) && \ + (GB_B_IS_BITMAP || GB_B_IS_FULL)) \ + || \ + ((GB_B_IS_SPARSE || GB_B_IS_HYPER) && \ + (GB_A_IS_BITMAP || GB_A_IS_FULL))) + #error "vsdn: for sparse-dense or dense-sparse cases only" #endif - const T_A *__restrict__ Ax = (T_A *)A->x ; - const T_B *__restrict__ Bx = (T_B *)B->x ; - T_C *__restrict__ Cx = (T_C *)C->x ; + #if !GB_A_IS_PATTERN + const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *)A->x ; + #endif + #if !GB_B_IS_PATTERN + const GB_B_TYPE *__restrict__ Bx = (GB_B_TYPE *)B->x ; + #endif + GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *)C->x ; int64_t *__restrict__ Ci = C->i ; const int64_t *__restrict__ Mi = M->i ; #if GB_M_IS_HYPER @@ -90,6 +62,8 @@ __global__ void AxB_dot3_phase3_vsdn #if GB_A_IS_HYPER || GB_A_IS_SPARSE const int64_t *__restrict__ Ai = A->i ; const int64_t *__restrict__ Ap = A->p ; + #else + const int64_t avlen = A->vlen ; #endif #if GB_A_IS_BITMAP @@ -99,6 +73,8 @@ __global__ void AxB_dot3_phase3_vsdn #if GB_B_IS_HYPER || GB_B_IS_SPARSE const int64_t *__restrict__ Bi = B->i ; const int64_t *__restrict__ Bp = B->p ; + #else + const int64_t bvlen = B->vlen ; #endif #if GB_B_IS_BITMAP @@ -125,73 +101,72 @@ __global__ void AxB_dot3_phase3_vsdn const int64_t B_hash_bits = (B->Y == NULL) ? 
0 : (B->Y->vdim - 1) ; #endif -// typedef cub::BlockReduce BlockReduce; -// __shared__ typename BlockReduce::TempStorage temp_storage; + uint64_t zc = 0 ; // zombie count -// if( threadIdx.x ==0) -// printf("thd:%d %d dots/thrd, nvec = %d blockDim=%d\n",threadIdx.x, sz, nvec, blockDim.x); -// __syncthreads(); + GB_M_NVALS (mnz) ; + ASSERT (GB_M_IS_SPARSE || GB_M_IS_HYPER) ; + int64_t cnz_in_bucket = end - start ; + int all_in_one = (cnz_in_bucket == mnz) ; - int64_t pair_id; - - int64_t zc = 0 ; - -// if (threadIdx.x ==0) -// printf("thd%u pi=%lld\n",tid, start+threadIdx.x); -// __syncthreads(); - - int all_in_one = ( (end - start) == (M->p)[(M->nvec)] ) ; - - for (int64_t kk = start +threadIdx.x +blockIdx.x*blockDim.x; - kk < end ; - kk += gridDim.x*blockDim.x ) + for (int64_t kk = start + threadIdx.x + blockIdx.x*blockDim.x ; + kk < end ; + kk += gridDim.x*blockDim.x) { + //---------------------------------------------------------------------- + // get the entry C(i,j) + //---------------------------------------------------------------------- + int64_t pair_id = all_in_one ? kk : Bucket[ kk ]; - int64_t i = Mi[pair_id]; // cols from mask + int64_t i = Mi [pair_id] ; - // FIXME: use another variable, not "k" here: - int64_t k = Ci[pair_id] >> 4; // vector of C encoded in phase1 + int64_t k = Ci [pair_id] >> 4; // vector of C encoded in phase1 // j = k or j = Mh [k] if C and M are hypersparse int64_t j = GBH_M (Mh, k) ; - // Prep row offsets for both A and B + //---------------------------------------------------------------------- + // get A(:,i) + //---------------------------------------------------------------------- - // find A(:,i) - int64_t pA, pA_end ; #if GB_A_IS_HYPER + int64_t pA, pA_end ; GB_hyper_hash_lookup (Ah, anvec, Ap, A_Yp, A_Yi, A_Yx, A_hash_bits, i, &pA, &pA_end) ; #elif GB_A_IS_SPARSE - pA = Ap[i] ; - pA_end = Ap[i+1] ; + int64_t pA = Ap[i] ; + int64_t pA_end = Ap[i+1] ; #else - // A is bitmap or full - pA = (A->vlen)*i; - pA_end = pA +(A->vlen); + // A is bitmap or full: only pA is needed + int64_t pA = avlen * i ; #endif - // find B(:,j) - int64_t pB, pB_end ; + //---------------------------------------------------------------------- + // get B(:,j) + //---------------------------------------------------------------------- + #if GB_B_IS_HYPER + int64_t pB, pB_end ; GB_hyper_hash_lookup (Bh, bnvec, Bp, B_Yp, B_Yi, B_Yx, B_hash_bits, j, &pB, &pB_end) ; #elif GB_B_IS_SPARSE - pB = Bp[j]; // col of C - pB_end = Bp[j+1]; + int64_t pB = Bp [j] ; + int64_t pB_end = Bp [j+1] ; #else - // B is bitmap or full - pB = (B->vlen)*j; - pB_end = pB +(B->vlen); + // B is bitmap or full: only pB is needed + int64_t pB = bvlen * j ; #endif + //---------------------------------------------------------------------- + // C(i,j) = A(:,i)'*B(:,j) + //---------------------------------------------------------------------- + GB_DECLAREA (aki) ; GB_DECLAREB (bkj) ; GB_DECLARE_IDENTITY (cij) ; // GB_Z_TYPE cij = identity bool cij_exists = false ; - int64_t my_nzombies = 0; + uint64_t my_nzombies = 0 ; #if ( GB_A_IS_FULL ) { @@ -204,7 +179,7 @@ __global__ void AxB_dot3_phase3_vsdn //-------------------------------------------------------------- cij_exists = true ; - for (int64_t p = pB ; p < pB_end ; ++p) + for (int64_t p = pB ; p < pB_end ; p++) { int64_t k = Bi [p] ; // next row index of B(:,j) // cij += A(k,i) * B(k,j) @@ -221,7 +196,7 @@ __global__ void AxB_dot3_phase3_vsdn // A is bitmap and B is sparse/hyper //------------------------------------------------------------------ - for 
(int64_t p = pB ; p < pB_end ; ++p) + for (int64_t p = pB ; p < pB_end ; p++) { int64_t k = Bi [p] ; // next row index of B(:,j) if (Ab [pA+k]) // check if A(k,i) exists @@ -243,7 +218,7 @@ __global__ void AxB_dot3_phase3_vsdn //-------------------------------------------------------------- cij_exists = true ; - for (int64_t p = pA ; p < pA_end ; ++p) + for (int64_t p = pA ; p < pA_end ; p++) { int64_t k = Ai [p] ; // next row index of A(:,i) // cij += A(k,i) * B(k,j) @@ -261,7 +236,7 @@ __global__ void AxB_dot3_phase3_vsdn // A is sparse/hyper and B is bitmap //------------------------------------------------------------------ - for (int64_t p = pA ; p < pA_end ; ++p) + for (int64_t p = pA ; p < pA_end ; p++) { int64_t k = Ai [p] ; // next row index of A(:,i) if (Bb [pB+k]) // check if B(k,j) exists @@ -274,10 +249,15 @@ __global__ void AxB_dot3_phase3_vsdn } #endif + //---------------------------------------------------------------------- + // save C(i,j) or declare it a zombie + //---------------------------------------------------------------------- + GB_CIJ_EXIST_POSTCHECK if (cij_exists) { - GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (T_C) cij + // Cx [pair_id] = (GB_C_TYPE) cij + GB_PUTC (cij, Cx, pair_id) ; Ci [pair_id] = i ; } else @@ -286,17 +266,17 @@ __global__ void AxB_dot3_phase3_vsdn Ci [pair_id] = GB_FLIP (i) ; } - // FIXME: use the same method as vsvs for counting zombies // sum up the zombie count: - thread_block_tile tile = tiled_partition( this_thread_block()); - zc += reduce_sum_int64(tile, my_nzombies); + thread_block_tile tile = + tiled_partition (this_thread_block ()) ; + zc += GB_cuda_warp_sum_uint64 (tile, my_nzombies) ; } - if(threadIdx.x == 0 && zc > 0) + if (threadIdx.x == 0 && zc > 0) { // this threadblock accumulates its zombie count into the global // zombie count - GB_cuda_atomic_add ( &(C->nzombies), zc) ; + GB_cuda_atomic_add ( &(C->nzombies), zc) ; } } diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh similarity index 53% rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh rename to GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh index bcd0d4d25c..edf539634d 100644 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh +++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh @@ -2,6 +2,8 @@ // GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh //------------------------------------------------------------------------------ +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -13,144 +15,97 @@ // using a serial merge algorithm on the sparse vectors. // m = 256/sz, where sz is in {4, 16, 64, 256} // For a vector-pair, sz = xnz + ynz -// Template on // Parameters: // int64_t start <- start of vector pairs for this kernel // int64_t end <- end of vector pairs for this kernel // int64_t *Bucket <- array of pair indices for all kernels -// matrix *C <- result matrix -// matrix *M <- mask matrix -// matrix *A <- input matrix A -// matrix *B <- input matrix B -// int sz <- nnz of very sparse vectors +// C <- result matrix +// M <- mask matrix +// A <- input matrix A +// B <- input matrix B // Blocksize is 1024, uses warp and block reductions to count zombies produced. 
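The zombie counting mentioned here is a two-level sum: a shuffle reduction within each warp, then a shared-memory combine of the 32 per-warp partials, matching GB_cuda_warp_sum_uint64 and the GB_block_ReduceSum_uint64 defined below. A self-contained sketch under the stated 1024-thread blocksize (warp_sum here is an illustrative stand-in, not the library routine):

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__inline__ __device__ uint64_t warp_sum (uint64_t val)
{
    // each step halves the number of active lanes
    for (int offset = 16 ; offset > 0 ; offset >>= 1)
    {
        val += __shfl_down_sync (0xFFFFFFFF, val, offset) ;
    }
    return (val) ;      // only lane 0 holds the full warp sum
}

__global__ void block_sum (const uint64_t *in, uint64_t *out)
{
    __shared__ uint64_t partial [32] ;  // one slot per warp
    int lane = threadIdx.x & 31 ;
    int wid  = threadIdx.x >> 5 ;

    // level 1: each warp reduces its own 32 values
    uint64_t val = warp_sum (in [threadIdx.x]) ;
    if (lane == 0) partial [wid] = val ;
    __syncthreads ( ) ;

    // level 2: warp 0 reduces the per-warp partials
    val = (threadIdx.x < blockDim.x / 32) ? partial [lane] : 0 ;
    if (wid == 0) val = warp_sum (val) ;
    if (threadIdx.x == 0) *out = val ;
}

int main (void)
{
    uint64_t *in, *out ;
    cudaMallocManaged (&in, 1024 * sizeof (uint64_t)) ;
    cudaMallocManaged (&out, sizeof (uint64_t)) ;
    for (int t = 0 ; t < 1024 ; t++) in [t] = 1 ;
    block_sum <<<1, 1024>>> (in, out) ;
    cudaDeviceSynchronize ( ) ;
    printf ("sum = %llu (expected 1024)\n", (unsigned long long) *out) ;
    return (0) ;
}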
//******************************************************************************
-#pragma once
-#include
-#include
-#include
-#include
-#include
-#include "GB_cuda_kernel.h"
-#include "GB_mxm_shared_definitions.h"
-#include "GB_cuda_atomics.cuh"
-#include "GB_hash.h"
-#include "GB_hyper_hash_lookup.h"
-#include "GB_cuda_dot3_defn.h"
-
-using namespace cooperative_groups;
-
-//------------------------------------------------------------------------------
-// GB_warp_ReduceSumPlus_int64
-//------------------------------------------------------------------------------
-
-template< int tile_sz>
-__inline__ __device__
-int64_t GB_warp_ReduceSumPlus_int64( thread_block_tile<tile_sz> g, int64_t val)
-{
- // Each iteration halves the number of active threads
- // Each thread adds its partial sum[i] to sum[lane+i]
- /*
- #pragma unroll
- for (int i = tile_sz >> 1; i > 0; i >>= 1) {
- val += g.shfl_down( val, i);
- }
- */
- val += g.shfl_down( val, 16);
- val += g.shfl_down( val, 8);
- val += g.shfl_down( val, 4);
- val += g.shfl_down( val, 2);
- val += g.shfl_down( val, 1);
- return val; // note: only thread 0 will return full sum
-}
-
//------------------------------------------------------------------------------
-// GB_block_ReduceSum_int64
+// GB_block_ReduceSum_uint64
//------------------------------------------------------------------------------
-template
-__inline__ __device__
-int64_t GB_block_ReduceSum_int64(thread_block g, int64_t val)
+__inline__ __device__ uint64_t GB_block_ReduceSum_uint64
+(
+ thread_block g, // FIXME: g is used for thread_block_tile elsewhere;
+ // be consistent.
+ uint64_t val
+)
{
- static __shared__ int64_t shared[warpSize]; // Shared mem for 32 partial sums
+ // Shared mem for 32 partial sums
+ static __shared__ uint64_t shared [tile_sz] ;
- int lane = threadIdx.x & 31 ; // % warpSize;
- int wid = threadIdx.x >> 5 ; // / warpSize;
- thread_block_tile tile = tiled_partition( g );
+ // FIXME: assumes tile_sz is 32: (use an #if .. #else ... #endif)
+ int lane = threadIdx.x & 31 ; // % tile_sz;
+ int wid = threadIdx.x >> 5 ; // / tile_sz;
+ thread_block_tile<tile_sz> tile = tiled_partition<tile_sz> (g) ;
- // Each warp performs partial reduction
- val = GB_warp_ReduceSumPlus_int64( tile, val);
+ // Each warp performs partial reduction
+ val = GB_cuda_warp_sum_uint64 (tile, val) ;
- // Wait for all partial reductions
- if (lane==0) shared[wid]=val; // Write reduced value to shared memory
- g.sync(); // Wait for all partial reductions
+ // Wait for all partial reductions
+ if (lane == 0)
+ {
+ shared [wid] = val ; // Write reduced value to shared memory
+ }
- //if (wid > 0 ) return val;
+ g.sync(); // Wait for all partial reductions
- //read from shared memory only if that warp existed
- val = (threadIdx.x < (blockDim.x / warpSize ) ) ? shared[lane] : 0;
+ // read from shared memory only if that warp existed
+ val = (threadIdx.x < (blockDim.x / tile_sz ) ) ?
shared[lane] : 0; - // Final reduce within first warp - if (wid==0) val = GB_warp_ReduceSumPlus_int64( tile, val); + // Final reduce within first warp + if (wid == 0) + { + val = GB_cuda_warp_sum_uint64 (tile, val) ; + } - return val; + return (val) ; } //------------------------------------------------------------------------------ -// AxB_dot3_phase3_vsvs +// GB_cuda_AxB_dot3_phase3_vsvs_kernel //------------------------------------------------------------------------------ -template< - typename T_C, typename T_A, typename T_B, - typename T_Z, typename T_X, typename T_Y, uint64_t srcode> -__global__ void AxB_dot3_phase3_vsvs -( - int64_t start, - int64_t end, - int64_t *Bucket, // do the work in Bucket [start:end-1] - GrB_Matrix C, - GrB_Matrix M, - GrB_Matrix A, - GrB_Matrix B, - int sz // unused +__global__ void GB_cuda_AxB_dot3_phase3_vsvs_kernel +( + int64_t start, + int64_t end, + int64_t *Bucket, // do the work in Bucket [start:end-1] + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B ) { - // TODO: Figure out how to use graphblas-specific INFINITY macro - #ifndef INFINITY - #define INFINITY std::numeric_limits::max() + #if !GB_A_IS_PATTERN + const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *)A->x ; #endif - - int64_t dots = end - start; - // sz = expected non-zeros per dot -// /* -// int m = (gridDim.x*blockDim.x)*256/sz; -// int dpt = (nvecs+ gridDim.x*blockDim.x -1)/(gridDim.x*blockDim.x); -// m = dpt < m ? dpt : m; -// -// int dots = (nvecs +m -1)/m; -// */ - const T_A *__restrict__ Ax = (T_A *)A->x ; - const T_B *__restrict__ Bx = (T_B *)B->x ; - T_C *__restrict__ Cx = (T_C *)C->x ; + #if !GB_B_IS_PATTERN + const GB_B_TYPE *__restrict__ Bx = (GB_B_TYPE *)B->x ; + #endif + GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *)C->x ; int64_t *__restrict__ Ci = C->i ; const int64_t *__restrict__ Mi = M->i ; #if GB_M_IS_HYPER const int64_t *__restrict__ Mh = M->h ; #endif - #if GB_A_IS_HYPER || GB_A_IS_SPARSE + ASSERT (GB_A_IS_HYPER || GB_A_IS_SPARSE) ; const int64_t *__restrict__ Ai = A->i ; const int64_t *__restrict__ Ap = A->p ; - #endif - #if GB_B_IS_HYPER || GB_B_IS_SPARSE + ASSERT (GB_B_IS_HYPER || GB_B_IS_SPARSE) ; const int64_t *__restrict__ Bi = B->i ; const int64_t *__restrict__ Bp = B->p ; - #endif #if GB_A_IS_HYPER const int64_t anvec = A->nvec ; @@ -172,20 +127,14 @@ __global__ void AxB_dot3_phase3_vsvs const int64_t B_hash_bits = (B->Y == NULL) ? 0 : (B->Y->vdim - 1) ; #endif - //int64_t pfirst, plast; - - //GB_PARTITION (pfirst, plast, dots, blockIdx.x, gridDim.x ) ; - - int64_t my_nzombies = 0 ; + uint64_t my_nzombies = 0 ; - int all_in_one = ( (end - start) == (M->p)[(M->nvec)] ) ; + GB_M_NVALS (mnz) ; + int all_in_one = ( (end - start) == mnz ) ; - //for ( int64_t kk = pfirst+ threadIdx.x ; - // kk < plast; - // kk += blockDim.x ) - for ( int64_t kk = start+ threadIdx.x +blockDim.x*blockIdx.x ; - kk < end; - kk += blockDim.x*gridDim.x ) + for (int64_t kk = start + threadIdx.x + blockDim.x*blockIdx.x ; + kk < end ; + kk += blockDim.x*gridDim.x ) { int64_t pair_id = all_in_one ? 
kk : Bucket[ kk ]; @@ -201,8 +150,8 @@ __global__ void AxB_dot3_phase3_vsvs GB_hyper_hash_lookup (Ah, anvec, Ap, A_Yp, A_Yi, A_Yx, A_hash_bits, i, &pA, &pA_end) ; #else - pA = Ap[i] ; - pA_end = Ap[i+1] ; + pA = Ap [i] ; + pA_end = Ap [i+1] ; #endif // find B(:,j): B is always sparse or hypersparse @@ -211,8 +160,8 @@ __global__ void AxB_dot3_phase3_vsvs GB_hyper_hash_lookup (Bh, bnvec, Bp, B_Yp, B_Yi, B_Yx, B_hash_bits, j, &pB, &pB_end) ; #else - pB = Bp[j] ; - pB_end = Bp[j+1] ; + pB = Bp [j] ; + pB_end = Bp [j+1] ; #endif GB_DECLAREA (aki) ; @@ -243,7 +192,7 @@ __global__ void AxB_dot3_phase3_vsvs GB_CIJ_EXIST_POSTCHECK ; if (cij_exists) { - GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (T_C) cij + GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (GB_C_TYPE) cij Ci [pair_id] = i ; } else @@ -257,12 +206,12 @@ __global__ void AxB_dot3_phase3_vsvs // FIXME: use this in spdn and vsdn: this_thread_block().sync(); - my_nzombies = GB_block_ReduceSum_int64<32>( this_thread_block(), my_nzombies); + my_nzombies = GB_block_ReduceSum_uint64 (this_thread_block(), my_nzombies) ; this_thread_block().sync(); if( threadIdx.x == 0 && my_nzombies > 0) { - GB_cuda_atomic_add ( &(C->nzombies), (uint64_t) my_nzombies) ; + GB_cuda_atomic_add ( &(C->nzombies), my_nzombies) ; } } diff --git a/GraphBLAS/CUDA/Template/GB_cuda_kernel.cuh b/GraphBLAS/CUDA/Template/GB_cuda_kernel.cuh new file mode 100644 index 0000000000..e37259530d --- /dev/null +++ b/GraphBLAS/CUDA/Template/GB_cuda_kernel.cuh @@ -0,0 +1,79 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/Template/GB_cuda_kernel.cuh: definitions for CUDA kernels +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// This file is #include'd into all device functions for CUDA JIT kernels for +// GraphBLAS. It provides a subset of GraphBLAS.h and GB.h, plus other +// definitions. It is not used on the host. 
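The zombie-count accumulation in the kernels above follows a common CUDA idiom: count privately per thread, combine within the warp (or block), and issue one atomic add per warp or block rather than one per thread. A self-contained sketch of the same idiom, using the cooperative-groups reduce that ships with CUDA 11 and later instead of GraphBLAS's own helpers (count_flagged, flag, and total are illustrative names, not part of GraphBLAS):

#include <cstdint>
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups ;

__global__ void count_flagged (const int *flag, int64_t n,
    unsigned long long *total)
{
    unsigned long long mine = 0 ;
    for (int64_t p = threadIdx.x + (int64_t) blockIdx.x * blockDim.x ;
         p < n ; p += (int64_t) gridDim.x * blockDim.x)
    {
        mine += (flag [p] != 0) ;   // each thread counts its own entries
    }
    // reduce across the 32-thread warp, then issue one atomic per warp
    auto tile = cg::tiled_partition<32> (cg::this_thread_block ()) ;
    unsigned long long warp_sum =
        cg::reduce (tile, mine, cg::plus<unsigned long long> ()) ;
    if (tile.thread_rank () == 0 && warp_sum > 0)
    {
        atomicAdd (total, warp_sum) ;
    }
}

Skipping the atomic when the partial sum is zero, as the kernels above also do for C->nzombies, avoids contention in the common case where a block produces no zombies.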
+ +#pragma once + +//------------------------------------------------------------------------------ +// C++ and CUDA #include files +//------------------------------------------------------------------------------ + +#include +#include +#include +#include +#include +#include +#include +using namespace cooperative_groups ; + +//------------------------------------------------------------------------------ +// CUDA kernel definitions +//------------------------------------------------------------------------------ + +#define GB_CUDA_KERNEL + +#undef ASSERT +#define ASSERT(x) + +//------------------------------------------------------------------------------ +// NVIDIA warp size +//------------------------------------------------------------------------------ + +#define WARPSIZE 32 +#define LOG2_WARPSIZE 5 + +//------------------------------------------------------------------------------ + +// for internal static inline functions +#undef GB_STATIC_INLINE +#define GB_STATIC_INLINE static __device__ __inline__ + +//------------------------------------------------------------------------------ +// subset of GraphBLAS.h +//------------------------------------------------------------------------------ + +#include "GraphBLAS_h_subset.cuh" + +//------------------------------------------------------------------------------ +// subset of GB.h +//------------------------------------------------------------------------------ + +#include "GB_h_subset.cuh" + +//------------------------------------------------------------------------------ +// final #include files +//------------------------------------------------------------------------------ + +#include "GB_cuda_error.hpp" +#include "GB_printf_kernels.h" +#include "GB_cuda_atomics.cuh" +#include "GB_hash.h" +#include "GB_hyper_hash_lookup.h" + +extern "C" +{ + #include "GB_werk.h" + #include "GB_callback.h" +} + diff --git a/GraphBLAS/CUDA/Template/GB_cuda_kernel.h b/GraphBLAS/CUDA/Template/GB_cuda_kernel.h deleted file mode 100644 index b4eb500a56..0000000000 --- a/GraphBLAS/CUDA/Template/GB_cuda_kernel.h +++ /dev/null @@ -1,263 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/Template/GB_cuda_kernel.h: definitions for CUDA kernels -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// This file is #include'd into all CUDA kernels for GraphBLAS. It provides -// a subset of GraphBLAS.h and GB.h, plus other definitions. - -// FIXME: rename to .cuh? 
- -#pragma once - -#define GB_CUDA_KERNEL - -#undef ASSERT -#define ASSERT(x) - -//------------------------------------------------------------------------------ -// TODO: this will be in the jit code: -#define chunksize 128 - -//------------------------------------------------------------------------------ -// NVIDIA warp size -//------------------------------------------------------------------------------ - -#define WARPSIZE 32 -#define LOG2_WARPSIZE 5 - -//------------------------------------------------------------------------------ - -#ifndef INFINITY -#define INFINITY (std::numeric_limits::max()) -#endif - -// for internal static inline functions -#undef GB_STATIC_INLINE -#define GB_STATIC_INLINE static __device__ __inline__ - -//------------------------------------------------------------------------------ -// subset of GraphBLAS.h -//------------------------------------------------------------------------------ - -#ifndef GRAPHBLAS_H -#define GRAPHBLAS_H - -#undef restrict -#undef GB_restrict -#define GB_restrict __restrict__ -#define restrict GB_restrict - -#include -//#include -#include -#include - -#undef GB_GLOBAL -#define GB_GLOBAL extern - -// GB_STR: convert the content of x into a string "x" -#define GB_XSTR(x) GB_STR(x) -#define GB_STR(x) #x - -#undef GxB_MAX_NAME_LEN -#define GxB_MAX_NAME_LEN 128 - -typedef uint64_t GrB_Index ; -typedef struct GB_Descriptor_opaque *GrB_Descriptor ; -typedef struct GB_Type_opaque *GrB_Type ; -typedef struct GB_UnaryOp_opaque *GrB_UnaryOp ; -typedef struct GB_BinaryOp_opaque *GrB_BinaryOp ; -typedef struct GB_IndexUnaryOp_opaque *GrB_IndexUnaryOp ; -typedef struct GB_Monoid_opaque *GrB_Monoid ; -typedef struct GB_Semiring_opaque *GrB_Semiring ; -typedef struct GB_Scalar_opaque *GrB_Scalar ; -typedef struct GB_Vector_opaque *GrB_Vector ; -typedef struct GB_Matrix_opaque *GrB_Matrix ; -typedef struct GB_Context_opaque *GxB_Context ; -typedef struct GB_Global_opaque *GrB_Global ; -typedef struct GB_Iterator_opaque *GxB_Iterator ; - -#define GxB_HYPERSPARSE 1 // store matrix in hypersparse form -#define GxB_SPARSE 2 // store matrix as sparse form (compressed vector) -#define GxB_BITMAP 4 // store matrix as a bitmap -#define GxB_FULL 8 // store matrix as full; all entries must be present - -typedef void (*GxB_unary_function) (void *, const void *) ; -typedef void (*GxB_binary_function) (void *, const void *, const void *) ; - -typedef bool (*GxB_select_function) // return true if A(i,j) is kept -( - GrB_Index i, // row index of A(i,j) - GrB_Index j, // column index of A(i,j) - const void *x, // value of A(i,j) - const void *thunk // optional input for select function -) ; - -typedef void (*GxB_index_unary_function) -( - void *z, // output value z, of type ztype - const void *x, // input value x of type xtype; value of v(i) or A(i,j) - GrB_Index i, // row index of A(i,j) - GrB_Index j, // column index of A(i,j), or zero for v(i) - const void *y // input scalar y -) ; - -#define GxB_GLOBAL_GPU_ID 26 - -typedef enum -{ - // for all GrB_Descriptor fields: - GxB_DEFAULT = 0, // default behavior of the method - - // for GrB_OUTP only: - GrB_REPLACE = 1, // clear the output before assigning new values to it - - // for GrB_MASK only: - GrB_COMP = 2, // use the structural complement of the input - GrB_SCMP = 2, // same as GrB_COMP (historical; use GrB_COMP instead) - GrB_STRUCTURE = 4, // use the only pattern of the mask, not its values - - // for GrB_INP0 and GrB_INP1 only: - GrB_TRAN = 3, // use the transpose of the input - - // for GxB_AxB_METHOD 
only: - GxB_AxB_GUSTAVSON = 1001, // gather-scatter saxpy method - GxB_AxB_DOT = 1003, // dot product - GxB_AxB_HASH = 1004, // hash-based saxpy method - GxB_AxB_SAXPY = 1005 // saxpy method (any kind) -} -GrB_Desc_Value ; - -#endif - -//------------------------------------------------------------------------------ -// subset of GB.h -//------------------------------------------------------------------------------ - -//#include GB_iceil.h -#define GB_ICEIL(a,b) (((a) + (b) - 1) / (b)) -//#include GB_imin.h -#define GB_IMAX(x,y) (((x) > (y)) ? (x) : (y)) -#define GB_IMIN(x,y) (((x) < (y)) ? (x) : (y)) -//#include GB_zombie.h -#define GB_FLIP(i) (-(i)-2) -#define GB_IS_FLIPPED(i) ((i) < 0) -#define GB_IS_ZOMBIE(i) ((i) < 0) -#define GB_IS_NOT_FLIPPED(i) ((i) >= 0) -#define GB_UNFLIP(i) (((i) < 0) ? GB_FLIP(i) : (i)) -#define GBI_UNFLIP(Ai,p,avlen) \ - ((Ai == NULL) ? ((p) % (avlen)) : GB_UNFLIP (Ai [p])) - -#include "GB_index.h" -#include "GB_partition.h" -#include "GB_pun.h" -#include "GB_opaque.h" -#include "GB_int64_mult.h" -#define GB_HAS_CMPLX_MACROS 1 -#include "GB_complex.h" - -// version for the GPU, with fewer branches -#define GB_TRIM_BINARY_SEARCH(i,X,pleft,pright) \ -{ \ - /* binary search of X [pleft ... pright] for integer i */ \ - while (pleft < pright) \ - { \ - int64_t pmiddle = (pleft + pright) >> 1 ; \ - bool less = (X [pmiddle] < i) ; \ - pleft = less ? (pmiddle+1) : pleft ; \ - pright = less ? pright : pmiddle ; \ - } \ - /* binary search is narrowed down to a single item */ \ - /* or it has found the list is empty */ \ - ASSERT (pleft == pright || pleft == pright + 1) ; \ -} - -#define GB_BINARY_SEARCH(i,X,pleft,pright,found) \ -{ \ - GB_TRIM_BINARY_SEARCH (i, X, pleft, pright) ; \ - found = (pleft == pright && X [pleft] == i) ; \ -} - -#define GB_SPLIT_BINARY_SEARCH(i,X,pleft,pright,found) \ -{ \ - GB_BINARY_SEARCH (i, X, pleft, pright, found) \ - if (!found && (pleft == pright)) \ - { \ - if (i > X [pleft]) \ - { \ - pleft++ ; \ - } \ - else \ - { \ - pright++ ; \ - } \ - } \ -} - -static __device__ __inline__ int64_t GB_search_for_vector_device -( - const int64_t p, // search for vector k that contains p - const int64_t *restrict Ap, // vector pointers to search - int64_t kleft, // left-most k to search - int64_t anvec, // Ap is of size anvec+1 - int64_t avlen // A->vlen -) -{ - - //-------------------------------------------------------------------------- - // check inputs - //-------------------------------------------------------------------------- - - if (Ap == NULL) - { - // A is full or bitmap - ASSERT (p >= 0 && p < avlen * anvec) ; - return ((avlen == 0) ? 0 : (p / avlen)) ; - } - - // A is sparse - ASSERT (p >= 0 && p < Ap [anvec]) ; - - //-------------------------------------------------------------------------- - // search for k - //-------------------------------------------------------------------------- - - int64_t k = kleft ; - int64_t kright = anvec ; - bool found ; - GB_SPLIT_BINARY_SEARCH (p, Ap, k, kright, found) ; - if (found) - { - // Ap [k] == p has been found, but if k is an empty vector, then the - // next vector will also contain the entry p. In that case, k needs to - // be incremented until finding the first non-empty vector for which - // Ap [k] == p. - ASSERT (Ap [k] == p) ; - while (k < anvec-1 && Ap [k+1] == p) - { - k++ ; - } - } - else - { - // p has not been found in Ap, so it appears in the middle of Ap [k-1] - // ... Ap [k], as computed by the binary search. 
This is the range of
- entries for the vector k-1, so k must be decremented.
- k-- ;
- }
-
- //--------------------------------------------------------------------------
- // return result
- //--------------------------------------------------------------------------
-
- // The entry p must reside in a non-empty vector.
- ASSERT (k >= 0 && k < anvec) ;
- ASSERT (Ap [k] <= p && p < Ap [k+1]) ;
-
- return (k) ;
-}
-
diff --git a/GraphBLAS/CUDA/Template/GB_cuda_shfl_down.cuh b/GraphBLAS/CUDA/Template/GB_cuda_shfl_down.cuh
new file mode 100644
index 0000000000..13b5c505f0
--- /dev/null
+++ b/GraphBLAS/CUDA/Template/GB_cuda_shfl_down.cuh
@@ -0,0 +1,384 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/Template/GB_cuda_shfl_down.cuh: warp-level reductions
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+//------------------------------------------------------------------------------
+
+// shfl_down is a method in the cooperative_groups namespace. It allows all
+// threads in a warp (or other thread partition) to work together in a
+// cooperative fashion.
+//
+// Suppose we have a tile that defines a single warp of 32 threads:
+//
+// #define tile_sz 32
+// thread_block_tile<tile_sz> tile =
+// tiled_partition<tile_sz> (this_thread_block()) ;
+//
+// Suppose each thread has two scalars dest and src of type T. Then:
+//
+// T dest, src ;
+// dest = tile.shfl_down (src, delta) ;
+//
+// performs the following computation for each thread i:
+//
+// if (i+delta < tile_sz)
+// {
+// dest = (the value of src on thread i+delta)
+// }
+//
+// where i ranges from 0 to tile_sz-1, and tile_sz is the size of the tile
+// (given by tile.num_threads() and by the #define'd value tile_sz; here the
+// warp size of 32). If i+delta >= tile_sz for the ith thread, then nothing
+// happens for that thread; it is inactive in the shuffle.
+//
+// Restrictions: tile_sz must be a power of 2, and it must be 32 or less for
+// tile.shfl_down(). The type T must be trivially copyable (that is,
+// is_trivially_copyable<T>::value must be true), and sizeof (T) <= 32 must
+// hold (that is, the size of T must be 32 bytes or less). Types larger than
+// 32 bytes are handled by GB_cuda_shfl_down_large_ztype, which uses repeated
+// calls to tile.shfl_down on 32-byte chunks.
+
+// FIXME for tile.shfl_down(...), delta is an int, so can it be negative?
+// For the __shfl_down warp shuffle function, delta is an unsigned int.
+
+//------------------------------------------------------------------------------
+// GB_cuda_warp_sum_uint64: reduce a uint64_t value across a single warp
+//------------------------------------------------------------------------------
+
+// On input, each thread in the tile holds a single uint64_t value. On output,
+// thread zero holds the sum of the values from all threads in the tile.
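The halving pattern that GB_cuda_warp_sum_uint64 hard-codes for tile_sz == 32 can also be written with the raw warp intrinsic: after the offset-16 step, each lane i < 16 holds v[i] + v[i+16]; after the offset-8 step, the sum of four lanes; and so on until lane 0 holds the sum of all 32. A minimal sketch assuming a full, converged 32-thread warp (illustrative only; this file uses cooperative groups instead):

#include <cstdint>

__device__ __forceinline__ uint64_t warp_sum_sketch (uint64_t value)
{
    for (int offset = 16 ; offset > 0 ; offset >>= 1)
    {
        // add the value held by the lane 'offset' positions higher
        value += __shfl_down_sync (0xFFFFFFFFu, value, offset) ;
    }
    return (value) ;    // only lane 0 holds the full 32-lane sum
}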
+
+__device__ __inline__ uint64_t GB_cuda_warp_sum_uint64
+(
+ thread_block_tile<tile_sz> tile,
+ uint64_t value
+)
+{
+
+ //--------------------------------------------------------------------------
+ // sum value on all threads to a single value
+ //--------------------------------------------------------------------------
+
+ #if (tile_sz == 32)
+ {
+ // this is the typical case
+ value += tile.shfl_down (value, 16) ;
+ value += tile.shfl_down (value, 8) ;
+ value += tile.shfl_down (value, 4) ;
+ value += tile.shfl_down (value, 2) ;
+ value += tile.shfl_down (value, 1) ;
+ }
+ #else
+ {
+ #pragma unroll
+ for (int i = tile_sz >> 1 ; i > 0 ; i >>= 1)
+ {
+ value += tile.shfl_down (value, i) ;
+ }
+ }
+ #endif
+
+ //--------------------------------------------------------------------------
+ // return result
+ //--------------------------------------------------------------------------
+
+ // Note that only thread 0 will have the full summation of all values in
+ // the tile. To broadcast it to all threads, use the following:
+
+ // value = tile.shfl (value, 0) ;
+
+ return (value) ;
+}
+
+#if 0
+
+//------------------------------------------------------------------------------
+// warp_ReduceSumPlus_uint64: for dot3_phase2
+//------------------------------------------------------------------------------
+
+__inline__ __device__ uint64_t warp_ReduceSumPlus_uint64
+(
+ thread_block_tile<tile_sz> tile,
+ uint64_t val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial sum[i] to sum[lane+i]
+ for (int i = tile.num_threads() / 2; i > 0; i /= 2)
+ {
+ val += tile.shfl_down (val, i) ;
+ }
+ return val; // note: only thread 0 will return full sum
+}
+
+//------------------------------------------------------------------------------
+// GB_warp_ReduceSumPlus_uint64_vsvs: for vsvs kernel
+//------------------------------------------------------------------------------
+
+__inline__ __device__ uint64_t GB_warp_ReduceSumPlus_uint64_vsvs
+(
+ thread_block_tile<tile_sz> g,
+ uint64_t val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial sum[i] to sum[lane+i]
+ /*
+ #pragma unroll
+ for (int i = tile_sz >> 1; i > 0; i >>= 1) {
+ val += g.shfl_down( val, i);
+ }
+ */
+ // assuming tile_sz is 32:
+ val += g.shfl_down( val, 16);
+ val += g.shfl_down( val, 8);
+ val += g.shfl_down( val, 4);
+ val += g.shfl_down( val, 2);
+ val += g.shfl_down( val, 1);
+ return val; // note: only thread 0 will return full sum
+}
+
+//------------------------------------------------------------------------------
+// reduce_sum_int64: for vsdn
+//------------------------------------------------------------------------------
+
+// for counting zombies only (always int64_t)
+__device__ int64_t reduce_sum_int64
+(
+ thread_block_tile<tile_sz> g,
+ int64_t val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial sum[i] to sum[lane+i]
+ for (int64_t i = g.num_threads() / 2; i > 0; i /= 2)
+ {
+ val += g.shfl_down(val,i) ;
+ }
+ return val; // note: only thread 0 will return full sum
+}
+
+#endif
+
+//------------------------------------------------------------------------------
+// GB_cuda_shfl_down_large_ztype: shfl_down a type larger than 32 bytes
+//------------------------------------------------------------------------------
+
+// This returns result = tile.shfl_down (value, delta), where value has type
+// GB_Z_TYPE, and sizeof (GB_Z_TYPE) > 32.
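As a worked example of the decomposition described here, suppose a hypothetical GB_Z_TYPE of 72 bytes: then GB_Z_NCHUNKS = 72/32 = 2 full 32-byte chunks, GB_Z_LEFTOVER = 72 - 2*32 = 8 bytes, and one logical shfl_down of such a value issues three hardware shuffles (two 32-byte chunks plus one 8-byte leftover struct). The same arithmetic, restated as a compile-time check (the _EXAMPLE names are illustrative, not part of this file):

// illustrative only: mirrors the arithmetic used below, for a 72-byte ztype
#define ZSIZE_EXAMPLE 72
#define NCHUNKS_EXAMPLE (ZSIZE_EXAMPLE / 32)                    // == 2
#define LEFTOVER_EXAMPLE (ZSIZE_EXAMPLE - NCHUNKS_EXAMPLE*32)   // == 8
static_assert (NCHUNKS_EXAMPLE == 2 && LEFTOVER_EXAMPLE == 8,
    "72 bytes = 2 chunks of 32 bytes + 8 leftover bytes") ;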
+
+#if ( GB_Z_SIZE > 32 )
+
+ // # of 32-byte chunks to hold a single GB_Z_TYPE, excluding leftover
+ // chunk; GB_Z_SIZE is sizeof (GB_Z_TYPE) as a hard-coded constant.
+ #define GB_Z_NCHUNKS ( GB_Z_SIZE / 32 )
+
+ // ztype_chunk is always 32 bytes in size
+ typedef struct { uint8_t bytes [32] ; } ztype_chunk ;
+
+ // size of the single leftover chunk of size 0 to < 32 bytes
+ #define GB_Z_LEFTOVER ( GB_Z_SIZE - ( GB_Z_NCHUNKS * 32 ) )
+
+ #if ( GB_Z_LEFTOVER > 0 )
+ // leftover chunk is not defined if GB_Z_SIZE is a multiple of 32
+ typedef struct { uint8_t bytes [GB_Z_LEFTOVER] ; } ztype_leftover ;
+ #endif
+
+ __device__ __inline__ void GB_cuda_shfl_down_large_ztype
+ (
+ GB_Z_TYPE *result,
+ thread_block_tile<tile_sz> tile,
+ GB_Z_TYPE *value,
+ int delta
+ )
+ {
+
+ // get pointers to value and result, as chunks of size 32 bytes
+ struct ztype_chunk *v = (struct ztype_chunk *) value ;
+ struct ztype_chunk *r = (struct ztype_chunk *) result ;
+
+ // shfl_down value into result, one chunk at a time
+ #pragma unroll
+ for (int chunk = 0 ; chunk < GB_Z_NCHUNKS ; chunk++, r++, v++)
+ {
+ (*r) = tile.shfl_down (*v, delta) ;
+ }
+
+ #if ( GB_Z_LEFTOVER > 0 )
+ // handle the leftover chunk, if it has nonzero size
+ struct ztype_leftover *v_leftover = (struct ztype_leftover *) v ;
+ struct ztype_leftover *r_leftover = (struct ztype_leftover *) r ;
+ (*r_leftover) = tile.shfl_down (*v_leftover, delta) ;
+ #endif
+ }
+
+#endif
+
+//------------------------------------------------------------------------------
+// GB_cuda_warp_reduce_ztype: reduce a ztype to a scalar, on a single warp
+//------------------------------------------------------------------------------
+
+// FIXME: make value parameter *value, and return type void?
+
+__device__ __inline__ GB_Z_TYPE GB_cuda_warp_reduce_ztype
+(
+ thread_block_tile<tile_sz> tile,
+ GB_Z_TYPE value
+)
+{
+
+ #if ( GB_Z_SIZE <= 32 )
+ {
+
+ //----------------------------------------------------------------------
+ // the GB_Z_TYPE reduction can be done with a single shfl_down
+ //----------------------------------------------------------------------
+
+ #if ( tile_sz == 32 )
+ {
+ // this is the typical case
+ GB_Z_TYPE next ;
+ next = tile.shfl_down (value, 16) ;
+ GB_ADD (value, value, next) ;
+ next = tile.shfl_down (value, 8) ;
+ GB_ADD (value, value, next) ;
+ next = tile.shfl_down (value, 4) ;
+ GB_ADD (value, value, next) ;
+ next = tile.shfl_down (value, 2) ;
+ GB_ADD (value, value, next) ;
+ next = tile.shfl_down (value, 1) ;
+ GB_ADD (value, value, next) ;
+ }
+ #else
+ {
+
+ #pragma unroll
+ for (int i = tile_sz >> 1 ; i > 0 ; i >>= 1)
+ {
+ GB_Z_TYPE next = tile.shfl_down (value, i) ;
+ GB_ADD (value, value, next) ;
+ }
+
+ }
+ #endif
+ }
+ #else
+ {
+
+ //----------------------------------------------------------------------
+ // sizeof (GB_Z_TYPE) is too large for a single shfl_down
+ //----------------------------------------------------------------------
+
+ #pragma unroll
+ for (int i = tile_sz >> 1 ; i > 0 ; i >>= 1)
+ {
+ GB_Z_TYPE next ;
+ GB_cuda_shfl_down_large_ztype (&next, tile, &value, i) ;
+ GB_ADD (value, value, next) ;
+ }
+ }
+ #endif
+
+ //--------------------------------------------------------------------------
+ // return result
+ //--------------------------------------------------------------------------
+
+ // Note that only thread 0 will have the full summation of all values in
+ // the tile.
To broadcast it to all threads, use the following:
+
+ // value = tile.shfl (value, 0) ;
+
+ // or if the ztype is large:
+ // GB_cuda_shfl_down_large_ztype (&value, tile, &value, 0) ;
+
+ return (value) ;
+}
+
+#if 0
+
+//------------------------------------------------------------------------------
+// warp_ReduceSum_dndn: for dndn kernel
+//------------------------------------------------------------------------------
+
+__inline__ __device__ GB_Z_TYPE warp_ReduceSum_dndn
+(
+ thread_block_tile<32> g,
+ GB_Z_TYPE val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial sum[i] to sum[lane+i]
+ // FIXME: only works if sizeof(GB_Z_TYPE) <= 32 bytes
+ // FIXME: the ANY monoid needs the cij_exists for each thread
+ for (int i = g.num_threads() / 2; i > 0; i /= 2)
+ {
+ GB_Z_TYPE next = g.shfl_down( val, i) ;
+ GB_ADD( val, val, next );
+ }
+ return val; // note: only thread 0 will return full sum
+}
+
+//------------------------------------------------------------------------------
+// GB_reduce_sum: for dot3 mp and spdn
+//------------------------------------------------------------------------------
+
+__device__ __inline__ GB_Z_TYPE GB_reduce_sum
+(
+ thread_block_tile<tile_sz> g,
+ GB_Z_TYPE val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial sum[i] to sum[lane+i]
+ // Temporary GB_Z_TYPE is necessary to handle arbitrary ops
+ // FIXME: only works if sizeof(GB_Z_TYPE) <= 32 bytes
+ // FIXME: the ANY monoid needs the cij_exists for each thread
+ #pragma unroll
+ for (int i = tile_sz >> 1 ; i > 0 ; i >>= 1)
+ {
+ GB_Z_TYPE next = g.shfl_down (val, i) ;
+ GB_ADD (val, val, next) ;
+ }
+ return val;
+}
+
+//------------------------------------------------------------------------------
+// GB_warp_Reduce: for cuda_reduce
+//------------------------------------------------------------------------------
+
+__device__ __inline__ GB_Z_TYPE GB_warp_Reduce
+(
+ thread_block_tile<tile_sz> g,
+ GB_Z_TYPE val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial val[k] to val[lane+k]
+
+ // FIXME: doesn't work unless sizeof(GB_Z_TYPE) <= 32 bytes
+
+#if ( GB_Z_SIZE <= 32 )
+ // assumes tile_sz is 32:
+ GB_Z_TYPE fold = g.shfl_down ( val, 16) ;
+ GB_ADD ( val, val, fold ) ;
+ fold = g.shfl_down ( val, 8) ;
+ GB_ADD ( val, val, fold ) ;
+ fold = g.shfl_down ( val, 4) ;
+ GB_ADD ( val, val, fold ) ;
+ fold = g.shfl_down ( val, 2) ;
+ GB_ADD ( val, val, fold ) ;
+ fold = g.shfl_down ( val, 1) ;
+ GB_ADD ( val, val, fold ) ;
#else
+ // use shared memory and do not use shfl_down?
+ // or use repeated calls to shfl_down, on chunks of 32 bytes each?
+ #error "not implemented yet"
+#endif
+
+ return (val) ; // note: only thread 0 will return full val
+}
+#endif
diff --git a/GraphBLAS/CUDA/Template/GB_cuda_timer.hpp b/GraphBLAS/CUDA/Template/GB_cuda_timer.hpp
new file mode 100644
index 0000000000..12a6e87d6b
--- /dev/null
+++ b/GraphBLAS/CUDA/Template/GB_cuda_timer.hpp
@@ -0,0 +1,52 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/Template/GB_cuda_timer.hpp
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
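A usage sketch for the GpuTimer that this new file defines, timing a hypothetical kernel launch (some_kernel is illustrative); Elapsed() synchronizes on the stop event and returns milliseconds:

// GpuTimer timer ;
// timer.Start () ;
// some_kernel <<< grid, block >>> (args) ;     // hypothetical kernel
// timer.Stop () ;
// printf ("kernel time: %g ms\n", timer.Elapsed ()) ;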
+// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#ifndef GB_CUDA_TIMER_HPP +#define GB_CUDA_TIMER_HPP + +#include +struct GpuTimer +{ + cudaEvent_t start; + cudaEvent_t stop; + + GpuTimer() + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + } + + ~GpuTimer() + { + cudaEventDestroy(start); + cudaEventDestroy(stop); + } + + void Start() + { + cudaEventRecord(start, 0); + } + + void Stop() + { + cudaEventRecord(stop, 0); + } + + float Elapsed() + { + float elapsed; + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed, start, stop); + return elapsed; + } +} ; + +#endif + diff --git a/GraphBLAS/CUDA/Template/GB_h_subset.cuh b/GraphBLAS/CUDA/Template/GB_h_subset.cuh new file mode 100644 index 0000000000..f371da6041 --- /dev/null +++ b/GraphBLAS/CUDA/Template/GB_h_subset.cuh @@ -0,0 +1,77 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/Template/GB_h_subset.cuh: subset of GB.h +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// Note the header guard is the same as GB.h: +#ifndef GB_H +#define GB_H + +// from GB_iceil.h: +#define GB_ICEIL(a,b) (((a) + (b) - 1) / (b)) +// from GB_imin.h: +#define GB_IMAX(x,y) (((x) > (y)) ? (x) : (y)) +#define GB_IMIN(x,y) (((x) < (y)) ? (x) : (y)) +// from GB_zombie.h: +#define GB_FLIP(i) (-(i)-2) +#define GB_IS_FLIPPED(i) ((i) < 0) +#define GB_IS_ZOMBIE(i) ((i) < 0) +#define GB_IS_NOT_FLIPPED(i) ((i) >= 0) +#define GB_UNFLIP(i) (((i) < 0) ? GB_FLIP(i) : (i)) +#define GBI_UNFLIP(Ai,p,avlen) \ + ((Ai == NULL) ? ((p) % (avlen)) : GB_UNFLIP (Ai [p])) + +#include "GB_index.h" +#include "GB_partition.h" +#include "GB_pun.h" +#include "GB_opaque.h" +#include "GB_int64_mult.h" +#define GB_HAS_CMPLX_MACROS 1 +#include "GB_complex.h" +#include "GB_memory_macros.h" + +// version for the GPU, with fewer branches +#define GB_TRIM_BINARY_SEARCH(i,X,pleft,pright) \ +{ \ + /* binary search of X [pleft ... pright] for integer i */ \ + while (pleft < pright) \ + { \ + int64_t pmiddle = (pleft + pright) >> 1 ; \ + bool less = (X [pmiddle] < i) ; \ + pleft = less ? (pmiddle+1) : pleft ; \ + pright = less ? 
pright : pmiddle ; \
} \
/* binary search is narrowed down to a single item */ \
/* or it has found the list is empty */ \
ASSERT (pleft == pright || pleft == pright + 1) ; \
+}
+
+#define GB_BINARY_SEARCH(i,X,pleft,pright,found) \
+{ \
+ GB_TRIM_BINARY_SEARCH (i, X, pleft, pright) ; \
+ found = (pleft == pright && X [pleft] == i) ; \
+}
+
+#define GB_SPLIT_BINARY_SEARCH(i,X,pleft,pright,found) \
+{ \
+ GB_BINARY_SEARCH (i, X, pleft, pright, found) \
+ if (!found && (pleft == pright)) \
+ { \
+ if (i > X [pleft]) \
+ { \
+ pleft++ ; \
+ } \
+ else \
+ { \
+ pright++ ; \
+ } \
+ } \
+}
+
+
+#endif
+
diff --git a/GraphBLAS/CUDA/Template/GraphBLAS_h_subset.cuh b/GraphBLAS/CUDA/Template/GraphBLAS_h_subset.cuh
new file mode 100644
index 0000000000..53085666b5
--- /dev/null
+++ b/GraphBLAS/CUDA/Template/GraphBLAS_h_subset.cuh
@@ -0,0 +1,135 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/Template/GraphBLAS_h_subset.cuh: subset of GraphBLAS.h
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+// Note the header guard is the same as GraphBLAS.h:
+#ifndef GRAPHBLAS_H
+#define GRAPHBLAS_H
+
+typedef enum
+{
+
+ GrB_SUCCESS = 0, // all is well
+
+ //--------------------------------------------------------------------------
+ // informational codes, not an error:
+ //--------------------------------------------------------------------------
+
+ GrB_NO_VALUE = 1, // A(i,j) requested but not there
+ GxB_EXHAUSTED = 7089, // iterator is exhausted
+
+ //--------------------------------------------------------------------------
+ // errors:
+ //--------------------------------------------------------------------------
+
+ GrB_UNINITIALIZED_OBJECT = -1, // object has not been initialized
+ GrB_NULL_POINTER = -2, // input pointer is NULL
+ GrB_INVALID_VALUE = -3, // generic error; some value is bad
+ GrB_INVALID_INDEX = -4, // row or column index is out of bounds
+ GrB_DOMAIN_MISMATCH = -5, // object domains are not compatible
+ GrB_DIMENSION_MISMATCH = -6, // matrix dimensions do not match
+ GrB_OUTPUT_NOT_EMPTY = -7, // output matrix already has values
+ GrB_NOT_IMPLEMENTED = -8, // method not implemented
+ GrB_ALREADY_SET = -9, // field already written to
+ GrB_PANIC = -101, // unknown error
+ GrB_OUT_OF_MEMORY = -102, // out of memory
+ GrB_INSUFFICIENT_SPACE = -103, // output array not large enough
+ GrB_INVALID_OBJECT = -104, // object is corrupted
+ GrB_INDEX_OUT_OF_BOUNDS = -105, // row or col index out of bounds
+ GrB_EMPTY_OBJECT = -106 // an object does not contain a value
+
+}
+GrB_Info ;
+
+#undef restrict
+#undef GB_restrict
+#define GB_restrict __restrict__
+#define restrict GB_restrict
+
+#include
+#include
+#include
+
+#undef GB_GLOBAL
+#define GB_GLOBAL extern
+
+// GB_STR: convert the content of x into a string "x"
+#define GB_XSTR(x) GB_STR(x)
+#define GB_STR(x) #x
+
+#undef GxB_MAX_NAME_LEN
+#define GxB_MAX_NAME_LEN 128
+
+typedef uint64_t GrB_Index ;
+typedef struct GB_Descriptor_opaque *GrB_Descriptor ;
+typedef struct GB_Type_opaque *GrB_Type ;
+typedef struct GB_UnaryOp_opaque *GrB_UnaryOp ;
+typedef struct GB_BinaryOp_opaque *GrB_BinaryOp ;
+typedef struct GB_IndexUnaryOp_opaque *GrB_IndexUnaryOp ;
+typedef struct GB_Monoid_opaque *GrB_Monoid ;
+typedef struct GB_Semiring_opaque
*GrB_Semiring ; +typedef struct GB_Scalar_opaque *GrB_Scalar ; +typedef struct GB_Vector_opaque *GrB_Vector ; +typedef struct GB_Matrix_opaque *GrB_Matrix ; +typedef struct GB_Context_opaque *GxB_Context ; +typedef struct GB_Global_opaque *GrB_Global ; +typedef struct GB_Iterator_opaque *GxB_Iterator ; + +#define GxB_HYPERSPARSE 1 // store matrix in hypersparse form +#define GxB_SPARSE 2 // store matrix as sparse form (compressed vector) +#define GxB_BITMAP 4 // store matrix as a bitmap +#define GxB_FULL 8 // store matrix as full; all entries must be present + +typedef void (*GxB_unary_function) (void *, const void *) ; +typedef void (*GxB_binary_function) (void *, const void *, const void *) ; + +typedef bool (*GxB_select_function) // return true if A(i,j) is kept +( + GrB_Index i, // row index of A(i,j) + GrB_Index j, // column index of A(i,j) + const void *x, // value of A(i,j) + const void *thunk // optional input for select function +) ; + +typedef void (*GxB_index_unary_function) +( + void *z, // output value z, of type ztype + const void *x, // input value x of type xtype; value of v(i) or A(i,j) + GrB_Index i, // row index of A(i,j) + GrB_Index j, // column index of A(i,j), or zero for v(i) + const void *y // input scalar y +) ; + +#define GxB_GLOBAL_GPU_ID 26 + +typedef enum +{ + // for all GrB_Descriptor fields: + GxB_DEFAULT = 0, // default behavior of the method + + // for GrB_OUTP only: + GrB_REPLACE = 1, // clear the output before assigning new values to it + + // for GrB_MASK only: + GrB_COMP = 2, // use the structural complement of the input + GrB_SCMP = 2, // same as GrB_COMP (historical; use GrB_COMP instead) + GrB_STRUCTURE = 4, // use the only pattern of the mask, not its values + + // for GrB_INP0 and GrB_INP1 only: + GrB_TRAN = 3, // use the transpose of the input + + // for GxB_AxB_METHOD only: + GxB_AxB_GUSTAVSON = 1001, // gather-scatter saxpy method + GxB_AxB_DOT = 1003, // dot product + GxB_AxB_HASH = 1004, // hash-based saxpy method + GxB_AxB_SAXPY = 1005 // saxpy method (any kind) +} +GrB_Desc_Value ; + +#endif + diff --git a/GraphBLAS/CUDA/go b/GraphBLAS/CUDA/go deleted file mode 100755 index 74b4fa0787..0000000000 --- a/GraphBLAS/CUDA/go +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# nuke the cached kernels and src -find ~/.SuiteSparse/GrB9.0.1 -mindepth 1 -delete - -# rebuild the JITpackage -( cd ../JITpackage ; make purge ; make ) - -# rebuild GraphBLAS -( cd .. ; make ) - -# run a demo -../build/wathen_demo - diff --git a/GraphBLAS/CUDA/jitify.hpp b/GraphBLAS/CUDA/jitify.hpp deleted file mode 100644 index 4dc3a9b9b6..0000000000 --- a/GraphBLAS/CUDA/jitify.hpp +++ /dev/null @@ -1,4196 +0,0 @@ -/* - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * SPDX-License-Identifier: BSD-3-Clause - */ - -/* - ----------- - Jitify 0.9 - ----------- - A C++ library for easy integration of CUDA runtime compilation into - existing codes. - - -------------- - How to compile - -------------- - Compiler dependencies: , -std=c++11 - Linker dependencies: dl cuda nvrtc - - -------------------------------------- - Embedding source files into executable - -------------------------------------- - g++ ... -ldl -rdynamic -DJITIFY_ENABLE_EMBEDDED_FILES=1 - -Wl,-b,binary,my_kernel.cu,include/my_header.cuh,-b,default nvcc ... -ldl - -Xcompiler "-rdynamic - -Wl\,-b\,binary\,my_kernel.cu\,include/my_header.cuh\,-b\,default" - JITIFY_INCLUDE_EMBEDDED_FILE(my_kernel_cu); - JITIFY_INCLUDE_EMBEDDED_FILE(include_my_header_cuh); - - ---- - TODO - ---- - Extract valid compile options and pass the rest to cuModuleLoadDataEx - See if can have stringified headers automatically looked-up - by having stringify add them to a (static) global map. - The global map can be updated by creating a static class instance - whose constructor performs the registration. - Can then remove all headers from JitCache constructor in example code - See other TODOs in code -*/ - -/*! \file jitify.hpp - * \brief The Jitify library header - */ - -/*! \mainpage Jitify - A C++ library that simplifies the use of NVRTC - * \p Use class jitify::JitCache to manage and launch JIT-compiled CUDA - * kernels. - * - * \p Use namespace jitify::reflection to reflect types and values into - * code-strings. - * - * \p Use JITIFY_INCLUDE_EMBEDDED_FILE() to declare files that have been - * embedded into the executable using the GCC linker. - * - * \p Use jitify::parallel_for and JITIFY_LAMBDA() to generate and launch - * simple kernels. - */ - -#pragma once - -#ifndef JITIFY_THREAD_SAFE -#define JITIFY_THREAD_SAFE 1 -#endif - -#if JITIFY_ENABLE_EMBEDDED_FILES -#include -#endif -#include -#include -#include -#include // For strtok_r etc. -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if JITIFY_THREAD_SAFE -#include -#endif - -#include -#include // For dim3, cudaStream_t -#if CUDA_VERSION >= 8000 -#define NVRTC_GET_TYPE_NAME 1 -#endif -#include - -// For use by get_current_executable_path(). 
-#ifdef __linux__ -#include // For PATH_MAX - -#include // For realpath -#define JITIFY_PATH_MAX PATH_MAX -#elif defined(_WIN32) || defined(_WIN64) -#include -#define JITIFY_PATH_MAX MAX_PATH -#else -#error "Unsupported platform" -#endif - -#ifdef _MSC_VER // MSVC compiler -#include // For UnDecorateSymbolName -#else -#include // For abi::__cxa_demangle -#endif - -#if defined(_WIN32) || defined(_WIN64) -// WAR for strtok_r being called strtok_s on Windows -#pragma push_macro("strtok_r") -#undef strtok_r -#define strtok_r strtok_s -// WAR for min and max possibly being macros defined by windows.h -#pragma push_macro("min") -#pragma push_macro("max") -#undef min -#undef max -#endif - -#ifndef JITIFY_PRINT_LOG -#define JITIFY_PRINT_LOG 1 -#endif - -#define JITIFY_PRINT_ALL 0 - -#if JITIFY_PRINT_ALL -#define JITIFY_PRINT_INSTANTIATION 1 -#define JITIFY_PRINT_SOURCE 1 -#define JITIFY_PRINT_LOG 1 -#define JITIFY_PRINT_PTX 1 -#define JITIFY_PRINT_LINKER_LOG 1 -#define JITIFY_PRINT_LAUNCH 1 -#define JITIFY_PRINT_HEADER_PATHS 1 -#endif - -#if JITIFY_ENABLE_EMBEDDED_FILES -#define JITIFY_FORCE_UNDEFINED_SYMBOL(x) void* x##_forced = (void*)&x -/*! Include a source file that has been embedded into the executable using the - * GCC linker. - * \param name The name of the source file (not as a string), which must - * be sanitized by replacing non-alpha-numeric characters with underscores. - * E.g., \code{.cpp}JITIFY_INCLUDE_EMBEDDED_FILE(my_header_h)\endcode will - * include the embedded file "my_header.h". - * \note Files declared with this macro can be referenced using - * their original (unsanitized) filenames when creating a \p - * jitify::Program instance. - */ -#define JITIFY_INCLUDE_EMBEDDED_FILE(name) \ - extern "C" uint8_t _jitify_binary_##name##_start[] asm("_binary_" #name \ - "_start"); \ - extern "C" uint8_t _jitify_binary_##name##_end[] asm("_binary_" #name \ - "_end"); \ - JITIFY_FORCE_UNDEFINED_SYMBOL(_jitify_binary_##name##_start); \ - JITIFY_FORCE_UNDEFINED_SYMBOL(_jitify_binary_##name##_end) -#endif // JITIFY_ENABLE_EMBEDDED_FILES - -/*! Jitify library namespace - */ -namespace jitify { - -/*! Source-file load callback. - * - * \param filename The name of the requested source file. - * \param tmp_stream A temporary stream that can be used to hold source code. - * \return A pointer to an input stream containing the source code, or NULL - * to defer loading of the file to Jitify's file-loading mechanisms. - */ -typedef std::istream* (*file_callback_type)(std::string filename, - std::iostream& tmp_stream); -// Exclude from Doxygen -//! 
\cond - -class JitCache; - -// Simple cache using LRU discard policy -template -class ObjectCache { - public: - typedef KeyType key_type; - typedef ValueType value_type; - - private: - typedef std::map object_map; - typedef std::deque key_rank; - typedef typename key_rank::iterator rank_iterator; - object_map _objects; - key_rank _ranked_keys; - size_t _capacity; - - inline void discard_old(size_t n = 0) { - if (n > _capacity) { - throw std::runtime_error("Insufficient capacity in cache"); - } - while (_objects.size() > _capacity - n) { - key_type discard_key = _ranked_keys.back(); - _ranked_keys.pop_back(); - _objects.erase(discard_key); - } - } - - public: - inline ObjectCache(size_t capacity = 8) : _capacity(capacity) {} - inline void resize(size_t capacity) { - _capacity = capacity; - this->discard_old(); - } - inline bool contains(const key_type& k) const { - return (bool)_objects.count(k); - } - inline void touch(const key_type& k) { - if (!this->contains(k)) { - throw std::runtime_error("Key not found in cache"); - } - rank_iterator rank = std::find(_ranked_keys.begin(), _ranked_keys.end(), k); - if (rank != _ranked_keys.begin()) { - // Move key to front of ranks - _ranked_keys.erase(rank); - _ranked_keys.push_front(k); - } - } - inline value_type& get(const key_type& k) { - if (!this->contains(k)) { - throw std::runtime_error("Key not found in cache"); - } - this->touch(k); - return _objects[k]; - } - inline value_type& insert(const key_type& k, - const value_type& v = value_type()) { - this->discard_old(1); - _ranked_keys.push_front(k); - return _objects.insert(std::make_pair(k, v)).first->second; - } - template - inline value_type& emplace(const key_type& k, Args&&... args) { - this->discard_old(1); - // Note: Use of piecewise_construct allows non-movable non-copyable types - auto iter = _objects - .emplace(std::piecewise_construct, std::forward_as_tuple(k), - std::forward_as_tuple(args...)) - .first; - _ranked_keys.push_front(iter->first); - return iter->second; - } -}; - -namespace detail { - -// Convenience wrapper for std::vector that provides handy constructors -template -class vector : public std::vector { - typedef std::vector super_type; - - public: - vector() : super_type() {} - vector(size_t n) : super_type(n) {} // Note: Not explicit, allows =0 - vector(std::vector const& vals) : super_type(vals) {} - template - vector(T const (&vals)[N]) : super_type(vals, vals + N) {} - vector(std::vector&& vals) : super_type(vals) {} - vector(std::initializer_list vals) : super_type(vals) {} -}; - -// Helper functions for parsing/manipulating source code - -inline std::string replace_characters(std::string str, - std::string const& oldchars, - char newchar) { - size_t i = str.find_first_of(oldchars); - while (i != std::string::npos) { - str[i] = newchar; - i = str.find_first_of(oldchars, i + 1); - } - return str; -} -inline std::string sanitize_filename(std::string name) { - return replace_characters(name, "/\\.-: ?%*|\"<>", '_'); -} - -#if JITIFY_ENABLE_EMBEDDED_FILES -class EmbeddedData { - void* _app; - EmbeddedData(EmbeddedData const&); - EmbeddedData& operator=(EmbeddedData const&); - - public: - EmbeddedData() { - _app = dlopen(NULL, RTLD_LAZY); - if (!_app) { - throw std::runtime_error(std::string("dlopen failed: ") + dlerror()); - } - dlerror(); // Clear any existing error - } - ~EmbeddedData() { - if (_app) { - dlclose(_app); - } - } - const uint8_t* operator[](std::string key) const { - key = sanitize_filename(key); - key = "_binary_" + key; - uint8_t const* data = 
(uint8_t const*)dlsym(_app, key.c_str()); - if (!data) { - throw std::runtime_error(std::string("dlsym failed: ") + dlerror()); - } - return data; - } - const uint8_t* begin(std::string key) const { - return (*this)[key + "_start"]; - } - const uint8_t* end(std::string key) const { return (*this)[key + "_end"]; } -}; -#endif // JITIFY_ENABLE_EMBEDDED_FILES - -inline bool is_tokenchar(char c) { - return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || c == '_'; -} -inline std::string replace_token(std::string src, std::string token, - std::string replacement) { - size_t i = src.find(token); - while (i != std::string::npos) { - if (i == 0 || i == src.size() - token.size() || - (!is_tokenchar(src[i - 1]) && !is_tokenchar(src[i + token.size()]))) { - src.replace(i, token.size(), replacement); - i += replacement.size(); - } else { - i += token.size(); - } - i = src.find(token, i); - } - return src; -} -inline std::string path_base(std::string p) { - // "/usr/local/myfile.dat" -> "/usr/local" - // "foo/bar" -> "foo" - // "foo/bar/" -> "foo/bar" -#if defined _WIN32 || defined _WIN64 - char sep = '\\'; -#else - char sep = '/'; -#endif - size_t i = p.find_last_of(sep); - if (i != std::string::npos) { - return p.substr(0, i); - } else { - return ""; - } -} -inline std::string path_join(std::string p1, std::string p2) { -#ifdef _WIN32 - char sep = '\\'; -#else - char sep = '/'; -#endif - if (p1.size() && p2.size() && p2[0] == sep) { - throw std::invalid_argument("Cannot join to absolute path"); - } - if (p1.size() && p1[p1.size() - 1] != sep) { - p1 += sep; - } - return p1 + p2; -} -// Elides "/." and "/.." tokens from path. -inline std::string path_simplify(const std::string& path) { - std::vector dirs; - std::string cur_dir; - bool after_slash = false; - for (int i = 0; i < (int)path.size(); ++i) { - if (path[i] == '/') { - if (after_slash) continue; // Ignore repeat slashes - after_slash = true; - if (cur_dir == ".." 
&& !dirs.empty() && dirs.back() != "..") { - if (dirs.size() == 1 && dirs.front().empty()) { - throw std::runtime_error( - "Invalid path: back-traversals exceed depth of absolute path"); - } - dirs.pop_back(); - } else if (cur_dir != ".") { // Ignore /./ - dirs.push_back(cur_dir); - } - cur_dir.clear(); - } else { - after_slash = false; - cur_dir.push_back(path[i]); - } - } - if (!after_slash) { - dirs.push_back(cur_dir); - } - std::stringstream ss; - for (int i = 0; i < (int)dirs.size() - 1; ++i) { - ss << dirs[i] << "/"; - } - if (!dirs.empty()) ss << dirs.back(); - if (after_slash) ss << "/"; - return ss.str(); -} -inline unsigned long long hash_larson64(const char* s, - unsigned long long seed = 0) { - unsigned long long hash = seed; - while (*s) { - hash = hash * 101 + *s++; - } - return hash; -} - -inline uint64_t hash_combine(uint64_t a, uint64_t b) { - // Note: The magic number comes from the golden ratio - return a ^ (0x9E3779B97F4A7C17ull + b + (b >> 2) + (a << 6)); -} - -inline bool extract_include_info_from_compile_error(std::string log, - std::string& name, - std::string& parent, - int& line_num) { - static const std::vector pattern = { - "could not open source file \"", "cannot open source file \""}; - - for (auto& p : pattern) { - size_t beg = log.find(p); - if (beg != std::string::npos) { - beg += p.size(); - size_t end = log.find("\"", beg); - name = log.substr(beg, end - beg); - - size_t line_beg = log.rfind("\n", beg); - if (line_beg == std::string::npos) { - line_beg = 0; - } else { - line_beg += 1; - } - - size_t split = log.find("(", line_beg); - parent = log.substr(line_beg, split - line_beg); - line_num = - atoi(log.substr(split + 1, log.find(")", split + 1) - (split + 1)) - .c_str()); - - return true; - } - } - - return false; -} - -inline bool is_include_directive_with_quotes(const std::string& source, - int line_num) { - // TODO: Check each find() for failure. 
- size_t beg = 0; - for (int i = 1; i < line_num; ++i) { - beg = source.find("\n", beg) + 1; - } - beg = source.find("include", beg) + 7; - beg = source.find_first_of("\"<", beg); - return source[beg] == '"'; -} - -inline std::string comment_out_code_line(int line_num, std::string source) { - size_t beg = 0; - for (int i = 1; i < line_num; ++i) { - beg = source.find("\n", beg) + 1; - } - return (source.substr(0, beg) + "//" + source.substr(beg)); -} - -inline void print_with_line_numbers(std::string const& source) { - int linenum = 1; - std::stringstream source_ss(source); - for (std::string line; std::getline(source_ss, line); ++linenum) { - std::cout << std::setfill(' ') << std::setw(3) << linenum << " " << line - << std::endl; - } -} - -inline void print_compile_log(std::string program_name, - std::string const& log) { - std::cout << "---------------------------------------------------" - << std::endl; - std::cout << "--- JIT compile log for " << program_name << " ---" - << std::endl; - std::cout << "---------------------------------------------------" - << std::endl; - std::cout << log << std::endl; - std::cout << "---------------------------------------------------" - << std::endl; -} - -inline std::vector split_string(std::string str, - long maxsplit = -1, - std::string delims = " \t") { - std::vector results; - if (maxsplit == 0) { - results.push_back(str); - return results; - } - // Note: +1 to include NULL-terminator - std::vector v_str(str.c_str(), str.c_str() + (str.size() + 1)); - char* c_str = v_str.data(); - char* saveptr = c_str; - char* token = nullptr; - for (long i = 0; i != maxsplit; ++i) { - token = ::strtok_r(c_str, delims.c_str(), &saveptr); - c_str = 0; - if (!token) { - return results; - } - results.push_back(token); - } - // Check if there's a final piece - token += ::strlen(token) + 1; - if (token - v_str.data() < (ptrdiff_t)str.size()) { - // Find the start of the final piece - token += ::strspn(token, delims.c_str()); - if (*token) { - results.push_back(token); - } - } - return results; -} - -static const std::map& get_jitsafe_headers_map(); - -inline bool load_source( - std::string filename, std::map& sources, - std::string current_dir = "", - std::vector include_paths = std::vector(), - file_callback_type file_callback = 0, - std::map* fullpaths = nullptr, - bool search_current_dir = true) { - std::istream* source_stream = 0; - std::stringstream string_stream; - std::ifstream file_stream; - // First detect direct source-code string ("my_program\nprogram_code...") - size_t newline_pos = filename.find("\n"); - if (newline_pos != std::string::npos) { - std::string source = filename.substr(newline_pos + 1); - filename = filename.substr(0, newline_pos); - string_stream << source; - source_stream = &string_stream; - } - if (sources.count(filename)) { - // Already got this one - return true; - } - if (!source_stream) { - std::string fullpath = path_join(current_dir, filename); - // Try loading from callback - if (!file_callback || - !(source_stream = file_callback(fullpath, string_stream))) { -#if JITIFY_ENABLE_EMBEDDED_FILES - // Try loading as embedded file - EmbeddedData embedded; - std::string source; - try { - source.assign(embedded.begin(fullpath), embedded.end(fullpath)); - string_stream << source; - source_stream = &string_stream; - } catch (std::runtime_error const&) -#endif // JITIFY_ENABLE_EMBEDDED_FILES - { - // Try loading from filesystem - bool found_file = false; - if (search_current_dir) { - file_stream.open(fullpath.c_str()); - if (file_stream) { - 
source_stream = &file_stream; - found_file = true; - } - } - // Search include directories - if (!found_file) { - for (int i = 0; i < (int)include_paths.size(); ++i) { - fullpath = path_join(include_paths[i], filename); - file_stream.open(fullpath.c_str()); - if (file_stream) { - source_stream = &file_stream; - found_file = true; - break; - } - } - if (!found_file) { - // Try loading from builtin headers - fullpath = path_join("__jitify_builtin", filename); - auto it = get_jitsafe_headers_map().find(filename); - if (it != get_jitsafe_headers_map().end()) { - string_stream << it->second; - source_stream = &string_stream; - } else { - return false; - } - } - } - } - } - if (fullpaths) { - // Record the full file path corresponding to this include name. - (*fullpaths)[filename] = path_simplify(fullpath); - } - } - sources[filename] = std::string(); - std::string& source = sources[filename]; - std::string line; - size_t linenum = 0; - unsigned long long hash = 0; - bool pragma_once = false; - bool remove_next_blank_line = false; - while (std::getline(*source_stream, line)) { - ++linenum; - - // HACK WAR for static variables not allowed on the device (unless - // __shared__) - // TODO: This breaks static member variables - // line = replace_token(line, "static const", "/*static*/ const"); - - // TODO: Need to watch out for /* */ comments too - std::string cleanline = - line.substr(0, line.find("//")); // Strip line comments - // if( cleanline.back() == "\r" ) { // Remove Windows line ending - // cleanline = cleanline.substr(0, cleanline.size()-1); - //} - // TODO: Should trim whitespace before checking .empty() - if (cleanline.empty() && remove_next_blank_line) { - remove_next_blank_line = false; - continue; - } - // Maintain a file hash for use in #pragma once WAR - hash = hash_larson64(line.c_str(), hash); - if (cleanline.find("#pragma once") != std::string::npos) { - pragma_once = true; - // Note: This is an attempt to recover the original line numbering, - // which otherwise gets off-by-one due to the include guard. - remove_next_blank_line = true; - // line = "//" + line; // Comment out the #pragma once line - continue; - } - - // HACK WAR for Thrust using "#define FOO #pragma bar" - size_t pragma_beg = cleanline.find("#pragma "); - if (pragma_beg != std::string::npos) { - std::string line_after_pragma = line.substr(pragma_beg); - std::vector pragma_split = - split_string(line_after_pragma, 2); - line = - (line.substr(0, pragma_beg) + "_Pragma(\"" + pragma_split[1] + "\")"); - if (pragma_split.size() == 3) { - line += " " + pragma_split[2]; - } - } - - source += line + "\n"; - } - // HACK TESTING (WAR for cub) - // source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source; - ////source = "cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }\n" + - /// source; - - // WAR for #pragma once causing problems when there are multiple inclusions - // of the same header from different paths. 
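-  // For illustration, a header containing "#pragma once" is rewritten as:
-  //   #ifndef _JITIFY_INCLUDE_GUARD_<8-hex-digit-hash>
-  //   #define _JITIFY_INCLUDE_GUARD_<8-hex-digit-hash>
-  //   <header body, with the #pragma once line removed>
-  //   #endif // _JITIFY_INCLUDE_GUARD_<8-hex-digit-hash>
-  // where the hash is the Larson hash of the file contents, so the same
-  // header reached via different paths still maps to one guard symbol.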
- if (pragma_once) { - std::stringstream ss; - ss << std::uppercase << std::hex << std::setw(8) << std::setfill('0') - << hash; - std::string include_guard_name = "_JITIFY_INCLUDE_GUARD_" + ss.str() + "\n"; - std::string include_guard_header; - include_guard_header += "#ifndef " + include_guard_name; - include_guard_header += "#define " + include_guard_name; - std::string include_guard_footer; - include_guard_footer += "#endif // " + include_guard_name; - source = include_guard_header + source + "\n" + include_guard_footer; - } - // return filename; - return true; -} - -} // namespace detail - -//! \endcond - -/*! Jitify reflection utilities namespace - */ -namespace reflection { - -// Provides type and value reflection via a function 'reflect': -// reflect() -> "Type" -// reflect(value) -> "(T)value" -// reflect() -> "VAL" -// reflect -> "VAL" -// reflect_template,char>() -> "" -// reflect_template({"float", "7", "char"}) -> "" - -/*! A wrapper class for non-type template parameters. - */ -template -struct NonType { - constexpr static T VALUE = VALUE_; -}; - -// Forward declaration -template -inline std::string reflect(T const& value); - -//! \cond - -namespace detail { - -template -inline std::string value_string(const T& x) { - std::stringstream ss; - ss << x; - return ss.str(); -} -// WAR for non-printable characters -template <> -inline std::string value_string(const char& x) { - std::stringstream ss; - ss << (int)x; - return ss.str(); -} -template <> -inline std::string value_string(const signed char& x) { - std::stringstream ss; - ss << (int)x; - return ss.str(); -} -template <> -inline std::string value_string(const unsigned char& x) { - std::stringstream ss; - ss << (int)x; - return ss.str(); -} -template <> -inline std::string value_string(const wchar_t& x) { - std::stringstream ss; - ss << (long)x; - return ss.str(); -} -// Specialisation for bool true/false literals -template <> -inline std::string value_string(const bool& x) { - return x ? "true" : "false"; -} - -// Removes all tokens that start with double underscores. -inline void strip_double_underscore_tokens(char* s) { - using jitify::detail::is_tokenchar; - char* w = s; - do { - if (*s == '_' && *(s + 1) == '_') { - while (is_tokenchar(*++s)) - ; - } - } while ((*w++ = *s++)); -} - -//#if CUDA_VERSION < 8000 -#ifdef _MSC_VER // MSVC compiler -inline std::string demangle_cuda_symbol(const char* mangled_name) { - // We don't have a way to demangle CUDA symbol names under MSVC. - return mangled_name; -} -inline std::string demangle_native_type(const std::type_info& typeinfo) { - // Get the decorated name and skip over the leading '.'. - const char* decorated_name = typeinfo.raw_name() + 1; - char undecorated_name[4096]; - if (UnDecorateSymbolName( - decorated_name, undecorated_name, - sizeof(undecorated_name) / sizeof(*undecorated_name), - UNDNAME_NO_ARGUMENTS | // Treat input as a type name - UNDNAME_NAME_ONLY // No "class" and "struct" prefixes - /*UNDNAME_NO_MS_KEYWORDS*/)) { // No "__cdecl", "__ptr64" etc. - // WAR for UNDNAME_NO_MS_KEYWORDS messing up function types. 
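-    // e.g. a function type may come back as "int __cdecl(int)"; the call
-    // below strips any token beginning with "__", leaving "int (int)".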
- strip_double_underscore_tokens(undecorated_name); - return undecorated_name; - } - throw std::runtime_error("UnDecorateSymbolName failed"); -} -#else // not MSVC -inline std::string demangle_cuda_symbol(const char* mangled_name) { - size_t bufsize = 0; - char* buf = nullptr; - std::string demangled_name; - int status; - auto demangled_ptr = std::unique_ptr( - abi::__cxa_demangle(mangled_name, buf, &bufsize, &status), free); - if (status == 0) { - demangled_name = demangled_ptr.get(); // all worked as expected - } else if (status == -2) { - demangled_name = mangled_name; // we interpret this as plain C name - } else if (status == -1) { - throw std::runtime_error( - std::string("memory allocation failure in __cxa_demangle")); - } else if (status == -3) { - throw std::runtime_error(std::string("invalid argument to __cxa_demangle")); - } - return demangled_name; -} -inline std::string demangle_native_type(const std::type_info& typeinfo) { - return demangle_cuda_symbol(typeinfo.name()); -} -#endif // not MSVC -//#endif // CUDA_VERSION < 8000 - -template -class JitifyTypeNameWrapper_ {}; - -template -struct type_reflection { - inline static std::string name() { - //#if CUDA_VERSION < 8000 - // TODO: Use nvrtcGetTypeName once it has the same behavior as this. - // WAR for typeid discarding cv qualifiers on value-types - // Wrap type in dummy template class to preserve cv-qualifiers, then strip - // off the wrapper from the resulting string. - std::string wrapped_name = - demangle_native_type(typeid(JitifyTypeNameWrapper_)); - // Note: The reflected name of this class also has namespace prefixes. - const std::string wrapper_class_name = "JitifyTypeNameWrapper_<"; - size_t start = wrapped_name.find(wrapper_class_name); - if (start == std::string::npos) { - throw std::runtime_error("Type reflection failed: " + wrapped_name); - } - start += wrapper_class_name.size(); - std::string name = - wrapped_name.substr(start, wrapped_name.size() - (start + 1)); - return name; - //#else - // std::string ret; - // nvrtcResult status = nvrtcGetTypeName(&ret); - // if( status != NVRTC_SUCCESS ) { - // throw std::runtime_error(std::string("nvrtcGetTypeName - // failed: - //")+ nvrtcGetErrorString(status)); - // } - // return ret; - //#endif - } -}; // namespace detail -template -struct type_reflection > { - inline static std::string name() { - return jitify::reflection::reflect(VALUE); - } -}; - -} // namespace detail - -//! \endcond - -/*! Create an Instance object that contains a const reference to the - * value. We use this to wrap abstract objects from which we want to extract - * their type at runtime (e.g., derived type). This is used to facilitate - * templating on derived type when all we know at compile time is abstract - * type. - */ -template -struct Instance { - const T& value; - Instance(const T& value) : value(value) {} -}; - -/*! Create an Instance object from which we can extract the value's run-time - * type. - * \param value The const value to be captured. - */ -template -inline Instance instance_of(T const& value) { - return Instance(value); -} - -/*! A wrapper used for representing types as values. - */ -template -struct Type {}; - -// Type reflection -// E.g., reflect() -> "float" -// Note: This strips trailing const and volatile qualifiers -/*! Generate a code-string for a type. - * \code{.cpp}reflect() --> "float"\endcode - */ -template -inline std::string reflect() { - return detail::type_reflection::name(); -} -// Value reflection -// E.g., reflect(3.14f) -> "(float)3.14" -/*! 
Generate a code-string for a value. - * \code{.cpp}reflect(3.14f) --> "(float)3.14"\endcode - */ -template -inline std::string reflect(T const& value) { - return "(" + reflect() + ")" + detail::value_string(value); -} -// Non-type template arg reflection (implicit conversion to int64_t) -// E.g., reflect<7>() -> "(int64_t)7" -/*! Generate a code-string for an integer non-type template argument. - * \code{.cpp}reflect<7>() --> "(int64_t)7"\endcode - */ -template -inline std::string reflect() { - return reflect >(); -} -// Non-type template arg reflection (explicit type) -// E.g., reflect() -> "(int)7" -/*! Generate a code-string for a generic non-type template argument. - * \code{.cpp} reflect() --> "(int)7" \endcode - */ -template -inline std::string reflect() { - return reflect >(); -} -// Type reflection via value -// E.g., reflect(Type()) -> "float" -/*! Generate a code-string for a type wrapped as a Type instance. - * \code{.cpp}reflect(Type()) --> "float"\endcode - */ -template -inline std::string reflect(jitify::reflection::Type) { - return reflect(); -} - -/*! Generate a code-string for a type wrapped as an Instance instance. - * \code{.cpp}reflect(Instance(3.1f)) --> "float"\endcode - * or more simply when passed to a instance_of helper - * \code{.cpp}reflect(instance_of(3.1f)) --> "float"\endcodei - * This is specifically for the case where we want to extract the run-time - * type, e.g., derived type, of an object pointer. - */ -template -inline std::string reflect(jitify::reflection::Instance& value) { - return detail::demangle_native_type(typeid(value.value)); -} - -// Type from value -// E.g., type_of(3.14f) -> Type() -/*! Create a Type object representing a value's type. - * \param value The value whose type is to be captured. - */ -template -inline Type type_of(T& value) { - return Type(); -} -/*! Create a Type object representing a value's type. - * \param value The const value whose type is to be captured. - */ -template -inline Type type_of(T const& value) { - return Type(); -} - -// Multiple value reflections one call, returning list of strings -template -inline std::vector reflect_all(Args... args) { - return {reflect(args)...}; -} - -inline std::string reflect_list(jitify::detail::vector const& args, - std::string opener = "", - std::string closer = "") { - std::stringstream ss; - ss << opener; - for (int i = 0; i < (int)args.size(); ++i) { - if (i > 0) ss << ","; - ss << args[i]; - } - ss << closer; - return ss.str(); -} - -// Template instantiation reflection -// inline std::string reflect_template(std::vector const& args) { -inline std::string reflect_template( - jitify::detail::vector const& args) { - // Note: The space in " >" is a WAR to avoid '>>' appearing - return reflect_list(args, "<", " >"); -} -// TODO: See if can make this evaluate completely at compile-time -template -inline std::string reflect_template() { - return reflect_template({reflect()...}); - // return reflect_template({reflect()...}); -} - -} // namespace reflection - -//! \cond - -namespace detail { - -// Demangles nested variable names using the PTX name mangling scheme -// (which follows the Itanium64 ABI). E.g., _ZN1a3Foo2bcE -> a::Foo::bc. -inline std::string demangle_ptx_variable_name(const char* name) { - std::stringstream ss; - const char* c = name; - if (*c++ != '_' || *c++ != 'Z') return name; // Non-mangled name - if (*c++ != 'N') return ""; // Not a nested name, unsupported - while (true) { - // Parse identifier length. 
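-    // Identifiers are length-prefixed per the Itanium ABI, e.g.
-    //   _ZN1a3Foo2bcE -> "1a" "3Foo" "2bc" -> a::Foo::bc
-    // so each pass reads a decimal length n, then n identifier characters.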
- int n = 0; - while (std::isdigit(*c)) { - n = n * 10 + (*c - '0'); - c++; - } - if (!n) return ""; // Invalid or unsupported mangled name - // Parse identifier. - const char* c0 = c; - while (n-- && *c) c++; - if (!*c) return ""; // Mangled name is truncated - std::string id(c0, c); - // Identifiers starting with "_GLOBAL" are anonymous namespaces. - ss << (id.substr(0, 7) == "_GLOBAL" ? "(anonymous namespace)" : id); - // Nested name specifiers end with 'E'. - if (*c == 'E') break; - // There are more identifiers to come, add join token. - ss << "::"; - } - return ss.str(); -} - -static const char* get_current_executable_path() { - static const char* path = []() -> const char* { - static char buffer[JITIFY_PATH_MAX] = {}; -#ifdef __linux__ - if (!::realpath("/proc/self/exe", buffer)) return nullptr; -#elif defined(_WIN32) || defined(_WIN64) - if (!GetModuleFileNameA(nullptr, buffer, JITIFY_PATH_MAX)) return nullptr; -#endif - return buffer; - }(); - return path; -} - -inline bool endswith(const std::string& str, const std::string& suffix) { - return str.size() >= suffix.size() && - str.substr(str.size() - suffix.size()) == suffix; -} - -// Infers the JIT input type from the filename suffix. If no known suffix is -// present, the filename is assumed to refer to a library, and the associated -// suffix (and possibly prefix) is automatically added to the filename. -inline CUjitInputType get_cuda_jit_input_type(std::string* filename) { - if (endswith(*filename, ".ptx")) { - return CU_JIT_INPUT_PTX; - } else if (endswith(*filename, ".cubin")) { - return CU_JIT_INPUT_CUBIN; - } else if (endswith(*filename, ".fatbin")) { - return CU_JIT_INPUT_FATBINARY; - } else if (endswith(*filename, -#if defined _WIN32 || defined _WIN64 - ".obj" -#else // Linux - ".o" -#endif - )) { - return CU_JIT_INPUT_OBJECT; - } else { // Assume library -#if defined _WIN32 || defined _WIN64 - if (!endswith(*filename, ".lib")) { - *filename += ".lib"; - } -#else // Linux - if (!endswith(*filename, ".a")) { - *filename = "lib" + *filename + ".a"; - } -#endif - return CU_JIT_INPUT_LIBRARY; - } -} - -class CUDAKernel { - std::vector _link_files; - std::vector _link_paths; - CUlinkState _link_state; - CUmodule _module; - CUfunction _kernel; - std::string _func_name; - std::string _ptx; - std::map _global_map; - std::vector _opts; - std::vector _optvals; -#ifdef JITIFY_PRINT_LINKER_LOG - static const unsigned int _log_size = 8192; - char _error_log[_log_size]; - char _info_log[_log_size]; -#endif - - inline void cuda_safe_call(CUresult res) const { - if (res != CUDA_SUCCESS) { - const char* msg; - cuGetErrorName(res, &msg); - throw std::runtime_error(msg); - } - } - inline void create_module(std::vector link_files, - std::vector link_paths) { - CUresult result; -#ifndef JITIFY_PRINT_LINKER_LOG - // WAR since linker log does not seem to be constructed using a single call - // to cuModuleLoadDataEx. - if (link_files.empty()) { - result = - cuModuleLoadDataEx(&_module, _ptx.c_str(), (unsigned)_opts.size(), - _opts.data(), _optvals.data()); - } else -#endif - { - cuda_safe_call(cuLinkCreate((unsigned)_opts.size(), _opts.data(), - _optvals.data(), &_link_state)); - cuda_safe_call(cuLinkAddData(_link_state, CU_JIT_INPUT_PTX, - (void*)_ptx.c_str(), _ptx.size(), - "jitified_source.ptx", 0, 0, 0)); - for (int i = 0; i < (int)link_files.size(); ++i) { - std::string link_file = link_files[i]; - CUjitInputType jit_input_type; - if (link_file == ".") { - // Special case for linking to current executable. 
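-          // A link file of "." names the running binary itself, letting
-          // JIT-linked PTX resolve device symbols that were compiled into
-          // the host executable.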
- link_file = get_current_executable_path(); - jit_input_type = CU_JIT_INPUT_OBJECT; - } else { - // Infer based on filename. - jit_input_type = get_cuda_jit_input_type(&link_file); - } - CUresult result = cuLinkAddFile(_link_state, jit_input_type, - link_file.c_str(), 0, 0, 0); - int path_num = 0; - while (result == CUDA_ERROR_FILE_NOT_FOUND && - path_num < (int)link_paths.size()) { - std::string filename = path_join(link_paths[path_num++], link_file); - result = cuLinkAddFile(_link_state, jit_input_type, filename.c_str(), - 0, 0, 0); - } -#if JITIFY_PRINT_LINKER_LOG - if (result == CUDA_ERROR_FILE_NOT_FOUND) { - std::cerr << "Linker error: Device library not found: " << link_file - << std::endl; - } else if (result != CUDA_SUCCESS) { - std::cerr << "Linker error: Failed to add file: " << link_file - << std::endl; - std::cerr << _error_log << std::endl; - } -#endif - cuda_safe_call(result); - } - size_t cubin_size; - void* cubin; - result = cuLinkComplete(_link_state, &cubin, &cubin_size); - if (result == CUDA_SUCCESS) { - result = cuModuleLoadData(&_module, cubin); - } - } -#ifdef JITIFY_PRINT_LINKER_LOG - std::cout << "---------------------------------------" << std::endl; - std::cout << "--- Linker for " - << reflection::detail::demangle_cuda_symbol(_func_name.c_str()) - << " ---" << std::endl; - std::cout << "---------------------------------------" << std::endl; - std::cout << _info_log << std::endl; - std::cout << std::endl; - std::cout << _error_log << std::endl; - std::cout << "---------------------------------------" << std::endl; -#endif - cuda_safe_call(result); - // Allow _func_name to be empty to support cases where we want to generate - // PTX containing extern symbol definitions but no kernels. - if (!_func_name.empty()) { - cuda_safe_call( - cuModuleGetFunction(&_kernel, _module, _func_name.c_str())); - } - } - inline void destroy_module() { - if (_link_state) { - cuda_safe_call(cuLinkDestroy(_link_state)); - } - _link_state = 0; - if (_module) { - cuModuleUnload(_module); - } - _module = 0; - } - - // create a map of __constant__ and __device__ variables in the ptx file - // mapping demangled to mangled name - inline void create_global_variable_map() { - size_t pos = 0; - while (pos < _ptx.size()) { - pos = std::min(_ptx.find(".const .align", pos), - _ptx.find(".global .align", pos)); - if (pos == std::string::npos) break; - size_t end = _ptx.find_first_of(";=", pos); - if (_ptx[end] == '=') --end; - std::string line = _ptx.substr(pos, end - pos); - pos = end; - size_t symbol_start = line.find_last_of(" ") + 1; - size_t symbol_end = line.find_last_of("["); - std::string entry = line.substr(symbol_start, symbol_end - symbol_start); - std::string key = detail::demangle_ptx_variable_name(entry.c_str()); - // Skip unsupported mangled names. E.g., a static variable defined inside - // a function (such variables are not directly addressable from outside - // the function, so skipping them is the correct behavior). 
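-      // For illustration, a PTX line such as
-      //   .global .align 4 .b8 _ZN3foo3barE[4];
-      // yields entry "_ZN3foo3barE" and demangled key "foo::bar", which
-      // get_global_ptr("foo::bar") later maps back to the mangled symbol.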
- if (key == "") continue; - _global_map[key] = entry; - } - } - - inline void set_linker_log() { -#ifdef JITIFY_PRINT_LINKER_LOG - _opts.push_back(CU_JIT_INFO_LOG_BUFFER); - _optvals.push_back((void*)_info_log); - _opts.push_back(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES); - _optvals.push_back((void*)(long)_log_size); - _opts.push_back(CU_JIT_ERROR_LOG_BUFFER); - _optvals.push_back((void*)_error_log); - _opts.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); - _optvals.push_back((void*)(long)_log_size); - _opts.push_back(CU_JIT_LOG_VERBOSE); - _optvals.push_back((void*)1); -#endif - } - - public: - inline CUDAKernel() : _link_state(0), _module(0), _kernel(0) {} - inline CUDAKernel(const CUDAKernel& other) = delete; - inline CUDAKernel& operator=(const CUDAKernel& other) = delete; - inline CUDAKernel(CUDAKernel&& other) = delete; - inline CUDAKernel& operator=(CUDAKernel&& other) = delete; - inline CUDAKernel(const char* func_name, const char* ptx, - std::vector link_files, - std::vector link_paths, unsigned int nopts = 0, - CUjit_option* opts = 0, void** optvals = 0) - : _link_files(link_files), - _link_paths(link_paths), - _link_state(0), - _module(0), - _kernel(0), - _func_name(func_name), - _ptx(ptx), - _opts(opts, opts + nopts), - _optvals(optvals, optvals + nopts) { - this->set_linker_log(); - this->create_module(link_files, link_paths); - this->create_global_variable_map(); - } - - inline CUDAKernel& set(const char* func_name, const char* ptx, - std::vector link_files, - std::vector link_paths, - unsigned int nopts = 0, CUjit_option* opts = 0, - void** optvals = 0) { - this->destroy_module(); - _func_name = func_name; - _ptx = ptx; - _link_files = link_files; - _link_paths = link_paths; - _opts.assign(opts, opts + nopts); - _optvals.assign(optvals, optvals + nopts); - this->set_linker_log(); - this->create_module(link_files, link_paths); - this->create_global_variable_map(); - return *this; - } - inline ~CUDAKernel() { this->destroy_module(); } - inline operator CUfunction() const { return _kernel; } - - inline CUresult launch(dim3 grid, dim3 block, unsigned int smem, - CUstream stream, std::vector arg_ptrs) const { - return cuLaunchKernel(_kernel, grid.x, grid.y, grid.z, block.x, block.y, - block.z, smem, stream, arg_ptrs.data(), NULL); - } - - inline CUdeviceptr get_global_ptr(const char* name, - size_t* size = nullptr) const { - CUdeviceptr global_ptr = 0; - auto global = _global_map.find(name); - if (global != _global_map.end()) { - cuda_safe_call(cuModuleGetGlobal(&global_ptr, size, _module, - global->second.c_str())); - } else { - throw std::runtime_error(std::string("failed to look up global ") + name); - } - return global_ptr; - } - - template - inline CUresult get_global_data(const char* name, T* data, size_t count, - CUstream stream = 0) const { - size_t size_bytes; - CUdeviceptr ptr = get_global_ptr(name, &size_bytes); - size_t given_size_bytes = count * sizeof(T); - if (given_size_bytes != size_bytes) { - throw std::runtime_error( - std::string("Value for global variable ") + name + - " has wrong size: got " + std::to_string(given_size_bytes) + - " bytes, expected " + std::to_string(size_bytes)); - } - return cuMemcpyDtoH(data, ptr, size_bytes); - } - - template - inline CUresult set_global_data(const char* name, const T* data, size_t count, - CUstream stream = 0) const { - size_t size_bytes; - CUdeviceptr ptr = get_global_ptr(name, &size_bytes); - size_t given_size_bytes = count * sizeof(T); - if (given_size_bytes != size_bytes) { - throw std::runtime_error( - std::string("Value 
for global variable ") + name + - " has wrong size: got " + std::to_string(given_size_bytes) + - " bytes, expected " + std::to_string(size_bytes)); - } - return cuMemcpyHtoD(ptr, data, size_bytes); - } - - const std::string& function_name() const { return _func_name; } - const std::string& ptx() const { return _ptx; } - const std::vector& link_files() const { return _link_files; } - const std::vector& link_paths() const { return _link_paths; } -}; - -static const char* jitsafe_header_preinclude_h = R"( -//// WAR for Thrust (which appears to have forgotten to include this in result_of_adaptable_function.h -//#include - -//// WAR for Thrust (which appear to have forgotten to include this in error_code.h) -//#include - -// WAR for Thrust (which only supports gnuc, clang or msvc) -#define __GNUC__ 4 - -// WAR for generics/shfl.h -#define THRUST_STATIC_ASSERT(x) - -// WAR for CUB -#ifdef __host__ -#undef __host__ -#endif -#define __host__ - -// WAR to allow exceptions to be parsed -#define try -#define catch(...) -)"; - - -static const char* jitsafe_header_float_h = R"( -#pragma once - -#define FLT_RADIX 2 -#define FLT_MANT_DIG 24 -#define DBL_MANT_DIG 53 -#define FLT_DIG 6 -#define DBL_DIG 15 -#define FLT_MIN_EXP -125 -#define DBL_MIN_EXP -1021 -#define FLT_MIN_10_EXP -37 -#define DBL_MIN_10_EXP -307 -#define FLT_MAX_EXP 128 -#define DBL_MAX_EXP 1024 -#define FLT_MAX_10_EXP 38 -#define DBL_MAX_10_EXP 308 -#define FLT_MAX 3.4028234e38f -#define DBL_MAX 1.7976931348623157e308 -#define FLT_EPSILON 1.19209289e-7f -#define DBL_EPSILON 2.220440492503130e-16 -#define FLT_MIN 1.1754943e-38f; -#define DBL_MIN 2.2250738585072013e-308 -#define FLT_ROUNDS 1 -#if defined __cplusplus && __cplusplus >= 201103L -#define FLT_EVAL_METHOD 0 -#define DECIMAL_DIG 21 -#endif -)"; - -static const char* jitsafe_header_limits_h = R"( -#pragma once - -#if defined _WIN32 || defined _WIN64 - #define __WORDSIZE 32 -#else - #if defined __x86_64__ && !defined __ILP32__ - #define __WORDSIZE 64 - #else - #define __WORDSIZE 32 - #endif -#endif -#define MB_LEN_MAX 16 -#define CHAR_BIT 8 -#define SCHAR_MIN (-128) -#define SCHAR_MAX 127 -#define UCHAR_MAX 255 -enum { - _JITIFY_CHAR_IS_UNSIGNED = (char)-1 >= 0, - CHAR_MIN = _JITIFY_CHAR_IS_UNSIGNED ? 0 : SCHAR_MIN, - CHAR_MAX = _JITIFY_CHAR_IS_UNSIGNED ? 
UCHAR_MAX : SCHAR_MAX, -}; -#define SHRT_MIN (-32768) -#define SHRT_MAX 32767 -#define USHRT_MAX 65535 -#define INT_MIN (-INT_MAX - 1) -#define INT_MAX 2147483647 -#define UINT_MAX 4294967295U -#if __WORDSIZE == 64 - # define LONG_MAX 9223372036854775807L -#else - # define LONG_MAX 2147483647L -#endif -#define LONG_MIN (-LONG_MAX - 1L) -#if __WORDSIZE == 64 - #define ULONG_MAX 18446744073709551615UL -#else - #define ULONG_MAX 4294967295UL -#endif -#define LLONG_MAX 9223372036854775807LL -#define LLONG_MIN (-LLONG_MAX - 1LL) -#define ULLONG_MAX 18446744073709551615ULL -)"; - -static const char* jitsafe_header_iterator = R"( -#pragma once - -namespace __jitify_iterator_ns { -struct output_iterator_tag {}; -struct input_iterator_tag {}; -struct forward_iterator_tag {}; -struct bidirectional_iterator_tag {}; -struct random_access_iterator_tag {}; -template -struct iterator_traits { - typedef typename Iterator::iterator_category iterator_category; - typedef typename Iterator::value_type value_type; - typedef typename Iterator::difference_type difference_type; - typedef typename Iterator::pointer pointer; - typedef typename Iterator::reference reference; -}; -template -struct iterator_traits { - typedef random_access_iterator_tag iterator_category; - typedef T value_type; - typedef ptrdiff_t difference_type; - typedef T* pointer; - typedef T& reference; -}; -template -struct iterator_traits { - typedef random_access_iterator_tag iterator_category; - typedef T value_type; - typedef ptrdiff_t difference_type; - typedef T const* pointer; - typedef T const& reference; -}; -} // namespace __jitify_iterator_ns -namespace std { using namespace __jitify_iterator_ns; } -using namespace __jitify_iterator_ns; -)"; - -// TODO: This is incomplete; need floating point limits -// Joe Eaton: added IEEE float and double types, none of the smaller types -// using type specific structs since we can't template on floats. 
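-// The string below is what JIT-compiled code sees for #include <limits>;
-// load_source() serves it from get_jitsafe_headers_map() when the real
-// header cannot be found. Its digits/exponent constants mirror IEEE-754
-// binary32 and binary64.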
-static const char* jitsafe_header_limits = R"( -#pragma once -#include -#include -// TODO: epsilon(), infinity(), etc -namespace __jitify_detail { -#if __cplusplus >= 201103L -#define JITIFY_CXX11_CONSTEXPR constexpr -#define JITIFY_CXX11_NOEXCEPT noexcept -#else -#define JITIFY_CXX11_CONSTEXPR -#define JITIFY_CXX11_NOEXCEPT -#endif - -struct FloatLimits { -#if __cplusplus >= 201103L - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - float lowest() JITIFY_CXX11_NOEXCEPT { return -FLT_MAX;} - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - float min() JITIFY_CXX11_NOEXCEPT { return FLT_MIN; } - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - float max() JITIFY_CXX11_NOEXCEPT { return FLT_MAX; } -#endif // __cplusplus >= 201103L - enum { - is_specialized = true, - is_signed = true, - is_integer = false, - is_exact = false, - has_infinity = true, - has_quiet_NaN = true, - has_signaling_NaN = true, - has_denorm = 1, - has_denorm_loss = true, - round_style = 1, - is_iec559 = true, - is_bounded = true, - is_modulo = false, - digits = 24, - digits10 = 6, - max_digits10 = 9, - radix = 2, - min_exponent = -125, - min_exponent10 = -37, - max_exponent = 128, - max_exponent10 = 38, - tinyness_before = false, - traps = false - }; -}; -struct DoubleLimits { -#if __cplusplus >= 201103L - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - double lowest() noexcept { return -DBL_MAX; } - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - double min() noexcept { return DBL_MIN; } - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - double max() noexcept { return DBL_MAX; } -#endif // __cplusplus >= 201103L - enum { - is_specialized = true, - is_signed = true, - is_integer = false, - is_exact = false, - has_infinity = true, - has_quiet_NaN = true, - has_signaling_NaN = true, - has_denorm = 1, - has_denorm_loss = true, - round_style = 1, - is_iec559 = true, - is_bounded = true, - is_modulo = false, - digits = 53, - digits10 = 15, - max_digits10 = 17, - radix = 2, - min_exponent = -1021, - min_exponent10 = -307, - max_exponent = 1024, - max_exponent10 = 308, - tinyness_before = false, - traps = false - }; -}; -template -struct IntegerLimits { - static inline __host__ __device__ T min() { return Min; } - static inline __host__ __device__ T max() { return Max; } -#if __cplusplus >= 201103L - static constexpr inline __host__ __device__ T lowest() noexcept { - return Min; - } -#endif // __cplusplus >= 201103L - enum { - is_specialized = true, - digits = (Digits == -1) ? 
(int)(sizeof(T)*8 - (Min != 0)) : Digits, - digits10 = (digits * 30103) / 100000, - is_signed = ((T)(-1)<0), - is_integer = true, - is_exact = true, - radix = 2, - is_bounded = true, - is_modulo = false - }; -}; -} // namespace __jitify_detail -namespace std { using namespace __jitify_detail; } -namespace __jitify_limits_ns { -template struct numeric_limits { - enum { is_specialized = false }; -}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits {}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits {}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits {}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -//template struct numeric_limits { static const bool -//is_signed = ((T)(-1)<0); }; -template<> struct numeric_limits : public -__jitify_detail::FloatLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::DoubleLimits -{}; -} // namespace __jitify_limits_ns -namespace std { using namespace __jitify_limits_ns; } -using namespace __jitify_limits_ns; -)"; - -// TODO: This is highly incomplete -static const char* jitsafe_header_type_traits = R"( - #pragma once - #if __cplusplus >= 201103L - namespace __jitify_type_traits_ns { - - template struct enable_if {}; - template struct enable_if { typedef T type; }; - #if __cplusplus >= 201402L - template< bool B, class T = void > using enable_if_t = typename enable_if::type; - #endif - - struct true_type { - enum { value = true }; - operator bool() const { return true; } - }; - struct false_type { - enum { value = false }; - operator bool() const { return false; } - }; - - template struct is_floating_point : false_type {}; - template<> struct is_floating_point : true_type {}; - template<> struct is_floating_point : true_type {}; - template<> struct is_floating_point : true_type {}; - - template struct is_integral : false_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - - template struct is_signed : false_type {}; - template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - 
template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - - template struct is_unsigned : false_type {}; - template<> struct is_unsigned : true_type {}; - template<> struct is_unsigned : true_type {}; - template<> struct is_unsigned : true_type {}; - template<> struct is_unsigned : true_type {}; - template<> struct is_unsigned : true_type {}; - - template struct is_same : false_type {}; - template struct is_same : true_type {}; - - template struct is_array : false_type {}; - template struct is_array : true_type {}; - template struct is_array : true_type {}; - - //partial implementation only of is_function - template struct is_function : false_type { }; - template struct is_function : true_type {}; //regular - template struct is_function : true_type {}; // variadic - - template struct result_of; - template - struct result_of { - // TODO: This is a hack; a proper implem is quite complicated. - typedef typename F::result_type type; - }; - - template struct remove_reference { typedef T type; }; - template struct remove_reference { typedef T type; }; - template struct remove_reference { typedef T type; }; - #if __cplusplus >= 201402L - template< class T > using remove_reference_t = typename remove_reference::type; - #endif - - template struct remove_extent { typedef T type; }; - template struct remove_extent { typedef T type; }; - template struct remove_extent { typedef T type; }; - #if __cplusplus >= 201402L - template< class T > using remove_extent_t = typename remove_extent::type; - #endif - - template< class T > struct remove_const { typedef T type; }; - template< class T > struct remove_const { typedef T type; }; - template< class T > struct remove_volatile { typedef T type; }; - template< class T > struct remove_volatile { typedef T type; }; - template< class T > struct remove_cv { typedef typename remove_volatile::type>::type type; }; - #if __cplusplus >= 201402L - template< class T > using remove_cv_t = typename remove_cv::type; - template< class T > using remove_const_t = typename remove_const::type; - template< class T > using remove_volatile_t = typename remove_volatile::type; - #endif - - template struct conditional { typedef T type; }; - template struct conditional { typedef F type; }; - #if __cplusplus >= 201402L - template< bool B, class T, class F > using conditional_t = typename conditional::type; - #endif - - namespace __jitify_detail { - template< class T, bool is_function_type = false > struct add_pointer { using type = typename remove_reference::type*; }; - template< class T > struct add_pointer { using type = T; }; - template< class T, class... Args > struct add_pointer { using type = T(*)(Args...); }; - template< class T, class... 
Args > struct add_pointer { using type = T(*)(Args..., ...); }; - } - template< class T > struct add_pointer : __jitify_detail::add_pointer::value> {}; - #if __cplusplus >= 201402L - template< class T > using add_pointer_t = typename add_pointer::type; - #endif - - template< class T > struct decay { - private: - typedef typename remove_reference::type U; - public: - typedef typename conditional::value, typename remove_extent::type*, - typename conditional::value,typename add_pointer::type,typename remove_cv::type - >::type>::type type; - }; - #if __cplusplus >= 201402L - template< class T > using decay_t = typename decay::type; - #endif - - } // namespace __jtiify_type_traits_ns - namespace std { using namespace __jitify_type_traits_ns; } - using namespace __jitify_type_traits_ns; - #endif // c++11 -)"; - -// TODO: INT_FAST8_MAX et al. and a few other misc constants -static const char* jitsafe_header_stdint_h = - "#pragma once\n" - "#include \n" - "namespace __jitify_stdint_ns {\n" - "typedef signed char int8_t;\n" - "typedef signed short int16_t;\n" - "typedef signed int int32_t;\n" - "typedef signed long long int64_t;\n" - "typedef signed char int_fast8_t;\n" - "typedef signed short int_fast16_t;\n" - "typedef signed int int_fast32_t;\n" - "typedef signed long long int_fast64_t;\n" - "typedef signed char int_least8_t;\n" - "typedef signed short int_least16_t;\n" - "typedef signed int int_least32_t;\n" - "typedef signed long long int_least64_t;\n" - "typedef signed long long intmax_t;\n" - "typedef signed long intptr_t; //optional\n" - "typedef unsigned char uint8_t;\n" - "typedef unsigned short uint16_t;\n" - "typedef unsigned int uint32_t;\n" - "typedef unsigned long long uint64_t;\n" - "typedef unsigned char uint_fast8_t;\n" - "typedef unsigned short uint_fast16_t;\n" - "typedef unsigned int uint_fast32_t;\n" - "typedef unsigned long long uint_fast64_t;\n" - "typedef unsigned char uint_least8_t;\n" - "typedef unsigned short uint_least16_t;\n" - "typedef unsigned int uint_least32_t;\n" - "typedef unsigned long long uint_least64_t;\n" - "typedef unsigned long long uintmax_t;\n" - "typedef unsigned long uintptr_t; //optional\n" - "#define INT8_MIN SCHAR_MIN\n" - "#define INT16_MIN SHRT_MIN\n" - "#define INT32_MIN INT_MIN\n" - "#define INT64_MIN LLONG_MIN\n" - "#define INT8_MAX SCHAR_MAX\n" - "#define INT16_MAX SHRT_MAX\n" - "#define INT32_MAX INT_MAX\n" - "#define INT64_MAX LLONG_MAX\n" - "#define UINT8_MAX UCHAR_MAX\n" - "#define UINT16_MAX USHRT_MAX\n" - "#define UINT32_MAX UINT_MAX\n" - "#define UINT64_MAX ULLONG_MAX\n" - "#define INTPTR_MIN LONG_MIN\n" - "#define INTMAX_MIN LLONG_MIN\n" - "#define INTPTR_MAX LONG_MAX\n" - "#define INTMAX_MAX LLONG_MAX\n" - "#define UINTPTR_MAX ULONG_MAX\n" - "#define UINTMAX_MAX ULLONG_MAX\n" - "#define PTRDIFF_MIN INTPTR_MIN\n" - "#define PTRDIFF_MAX INTPTR_MAX\n" - "#define SIZE_MAX UINT64_MAX\n" - "} // namespace __jitify_stdint_ns\n" - "namespace std { using namespace __jitify_stdint_ns; }\n" - "using namespace __jitify_stdint_ns;\n"; - -// TODO: offsetof -static const char* jitsafe_header_stddef_h = - "#pragma once\n" - "#include \n" - "namespace __jitify_stddef_ns {\n" - "#if __cplusplus >= 201103L\n" - "typedef decltype(nullptr) nullptr_t;\n" - "#if defined(_MSC_VER)\n" - " typedef double max_align_t;\n" - "#elif defined(__APPLE__)\n" - " typedef long double max_align_t;\n" - "#else\n" - " // Define max_align_t to match the GCC definition.\n" - " typedef struct {\n" - " long long __jitify_max_align_nonce1\n" - " 
__attribute__((__aligned__(__alignof__(long long))));\n" - " long double __jitify_max_align_nonce2\n" - " __attribute__((__aligned__(__alignof__(long double))));\n" - " } max_align_t;\n" - "#endif\n" - "#endif // __cplusplus >= 201103L\n" - "#if __cplusplus >= 201703L\n" - "enum class byte : unsigned char {};\n" - "#endif // __cplusplus >= 201703L\n" - "} // namespace __jitify_stddef_ns\n" - "namespace std {\n" - " // NVRTC provides built-in definitions of ::size_t and ::ptrdiff_t.\n" - " using ::size_t;\n" - " using ::ptrdiff_t;\n" - " using namespace __jitify_stddef_ns;\n" - "} // namespace std\n" - "using namespace __jitify_stddef_ns;\n"; - -static const char* jitsafe_header_stdlib_h = - "#pragma once\n" - "#include \n"; -static const char* jitsafe_header_stdio_h = - "#pragma once\n" - "#include \n" - "#define FILE int\n" - "int fflush ( FILE * stream );\n" - "int fprintf ( FILE * stream, const char * format, ... );\n"; - -static const char* jitsafe_header_string_h = - "#pragma once\n" - "char* strcpy ( char * destination, const char * source );\n" - "int strcmp ( const char * str1, const char * str2 );\n" - "char* strerror( int errnum );\n"; - -static const char* jitsafe_header_cstring = - "#pragma once\n" - "\n" - "namespace __jitify_cstring_ns {\n" - "char* strcpy ( char * destination, const char * source );\n" - "int strcmp ( const char * str1, const char * str2 );\n" - "char* strerror( int errnum );\n" - "} // namespace __jitify_cstring_ns\n" - "namespace std { using namespace __jitify_cstring_ns; }\n" - "using namespace __jitify_cstring_ns;\n"; - -// HACK TESTING (WAR for cub) -static const char* jitsafe_header_iostream = - "#pragma once\n" - "#include \n" - "#include \n"; -// HACK TESTING (WAR for Thrust) -static const char* jitsafe_header_ostream = - "#pragma once\n" - "\n" - "namespace __jitify_ostream_ns {\n" - "template\n" // = std::char_traits - // >\n" - "struct basic_ostream {\n" - "};\n" - "typedef basic_ostream ostream;\n" - "ostream& endl(ostream& os);\n" - "ostream& operator<<( ostream&, ostream& (*f)( ostream& ) );\n" - "template< class CharT, class Traits > basic_ostream& endl( " - "basic_ostream& os );\n" - "template< class CharT, class Traits > basic_ostream& " - "operator<<( basic_ostream& os, const char* c );\n" - "#if __cplusplus >= 201103L\n" - "template< class CharT, class Traits, class T > basic_ostream& operator<<( basic_ostream&& os, const T& value );\n" - "#endif // __cplusplus >= 201103L\n" - "} // namespace __jitify_ostream_ns\n" - "namespace std { using namespace __jitify_ostream_ns; }\n" - "using namespace __jitify_ostream_ns;\n"; - -static const char* jitsafe_header_istream = - "#pragma once\n" - "\n" - "namespace __jitify_istream_ns {\n" - "template\n" // = std::char_traits - // >\n" - "struct basic_istream {\n" - "};\n" - "typedef basic_istream istream;\n" - "} // namespace __jitify_istream_ns\n" - "namespace std { using namespace __jitify_istream_ns; }\n" - "using namespace __jitify_istream_ns;\n"; - -static const char* jitsafe_header_sstream = - "#pragma once\n" - "#include \n" - "#include \n"; - -static const char* jitsafe_header_utility = - "#pragma once\n" - "namespace __jitify_utility_ns {\n" - "template\n" - "struct pair {\n" - " T1 first;\n" - " T2 second;\n" - " inline pair() {}\n" - " inline pair(T1 const& first_, T2 const& second_)\n" - " : first(first_), second(second_) {}\n" - " // TODO: Standard includes many more constructors...\n" - " // TODO: Comparison operators\n" - "};\n" - "template\n" - "pair make_pair(T1 const& first, T2 
const& second) {\n" - " return pair(first, second);\n" - "}\n" - "} // namespace __jitify_utility_ns\n" - "namespace std { using namespace __jitify_utility_ns; }\n" - "using namespace __jitify_utility_ns;\n"; - -// TODO: incomplete -static const char* jitsafe_header_vector = - "#pragma once\n" - "namespace __jitify_vector_ns {\n" - "template\n" // = std::allocator> \n" - "struct vector {\n" - "};\n" - "} // namespace __jitify_vector_ns\n" - "namespace std { using namespace __jitify_vector_ns; }\n" - "using namespace __jitify_vector_ns;\n"; - -// TODO: incomplete -static const char* jitsafe_header_string = - "#pragma once\n" - "namespace __jitify_string_ns {\n" - "template\n" - "struct basic_string {\n" - "basic_string();\n" - "basic_string( const CharT* s );\n" //, const Allocator& alloc = - // Allocator() );\n" - "const CharT* c_str() const;\n" - "bool empty() const;\n" - "void operator+=(const char *);\n" - "void operator+=(const basic_string &);\n" - "};\n" - "typedef basic_string string;\n" - "} // namespace __jitify_string_ns\n" - "namespace std { using namespace __jitify_string_ns; }\n" - "using namespace __jitify_string_ns;\n"; - -// TODO: incomplete -static const char* jitsafe_header_stdexcept = - "#pragma once\n" - "namespace __jitify_stdexcept_ns {\n" - "struct runtime_error {\n" - "explicit runtime_error( const std::string& what_arg );" - "explicit runtime_error( const char* what_arg );" - "virtual const char* what() const;\n" - "};\n" - "} // namespace __jitify_stdexcept_ns\n" - "namespace std { using namespace __jitify_stdexcept_ns; }\n" - "using namespace __jitify_stdexcept_ns;\n"; - -// TODO: incomplete -static const char* jitsafe_header_complex = - "#pragma once\n" - "namespace __jitify_complex_ns {\n" - "template\n" - "class complex {\n" - " T _real;\n" - " T _imag;\n" - "public:\n" - " complex() : _real(0), _imag(0) {}\n" - " complex(T const& real, T const& imag)\n" - " : _real(real), _imag(imag) {}\n" - " complex(T const& real)\n" - " : _real(real), _imag(static_cast(0)) {}\n" - " T const& real() const { return _real; }\n" - " T& real() { return _real; }\n" - " void real(const T &r) { _real = r; }\n" - " T const& imag() const { return _imag; }\n" - " T& imag() { return _imag; }\n" - " void imag(const T &i) { _imag = i; }\n" - " complex& operator+=(const complex z)\n" - " { _real += z.real(); _imag += z.imag(); return *this; }\n" - "};\n" - "template\n" - "complex operator*(const complex& lhs, const complex& rhs)\n" - " { return complex(lhs.real()*rhs.real()-lhs.imag()*rhs.imag(),\n" - " lhs.real()*rhs.imag()+lhs.imag()*rhs.real()); }\n" - "template\n" - "complex operator*(const complex& lhs, const T & rhs)\n" - " { return complexs(lhs.real()*rhs,lhs.imag()*rhs); }\n" - "template\n" - "complex operator*(const T& lhs, const complex& rhs)\n" - " { return complexs(rhs.real()*lhs,rhs.imag()*lhs); }\n" - "} // namespace __jitify_complex_ns\n" - "namespace std { using namespace __jitify_complex_ns; }\n" - "using namespace __jitify_complex_ns;\n"; - -// TODO: This is incomplete (missing binary and integer funcs, macros, -// constants, types) -static const char* jitsafe_header_math = - "#pragma once\n" - "namespace __jitify_math_ns {\n" - "#if __cplusplus >= 201103L\n" - "#define DEFINE_MATH_UNARY_FUNC_WRAPPER(f) \\\n" - " inline double f(double x) { return ::f(x); } \\\n" - " inline float f##f(float x) { return ::f(x); } \\\n" - " /*inline long double f##l(long double x) { return ::f(x); }*/ \\\n" - " inline float f(float x) { return ::f(x); } \\\n" - " /*inline long double 
f(long double x) { return ::f(x); }*/\n" - "#else\n" - "#define DEFINE_MATH_UNARY_FUNC_WRAPPER(f) \\\n" - " inline double f(double x) { return ::f(x); } \\\n" - " inline float f##f(float x) { return ::f(x); } \\\n" - " /*inline long double f##l(long double x) { return ::f(x); }*/\n" - "#endif\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(cos)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(sin)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(tan)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(acos)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(asin)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(atan)\n" - "template inline T atan2(T y, T x) { return ::atan2(y, x); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(cosh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(sinh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(tanh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(exp)\n" - "template inline T frexp(T x, int* exp) { return ::frexp(x, " - "exp); }\n" - "template inline T ldexp(T x, int exp) { return ::ldexp(x, " - "exp); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(log)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(log10)\n" - "template inline T modf(T x, T* intpart) { return ::modf(x, " - "intpart); }\n" - "template inline T pow(T x, T y) { return ::pow(x, y); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(sqrt)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(ceil)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(floor)\n" - "template inline T fmod(T n, T d) { return ::fmod(n, d); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(fabs)\n" - "template inline T abs(T x) { return ::abs(x); }\n" - "#if __cplusplus >= 201103L\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(acosh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(asinh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(atanh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(exp2)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(expm1)\n" - "template inline int ilogb(T x) { return ::ilogb(x); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(log1p)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(log2)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(logb)\n" - "template inline T scalbn (T x, int n) { return ::scalbn(x, " - "n); }\n" - "template inline T scalbln(T x, long n) { return ::scalbn(x, " - "n); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(cbrt)\n" - "template inline T hypot(T x, T y) { return ::hypot(x, y); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(erf)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(erfc)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(tgamma)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(lgamma)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(trunc)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(round)\n" - "template inline long lround(T x) { return ::lround(x); }\n" - "template inline long long llround(T x) { return ::llround(x); " - "}\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(rint)\n" - "template inline long lrint(T x) { return ::lrint(x); }\n" - "template inline long long llrint(T x) { return ::llrint(x); " - "}\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(nearbyint)\n" - // TODO: remainder, remquo, copysign, nan, nextafter, nexttoward, fdim, - // fmax, fmin, fma - "#endif\n" - "#undef DEFINE_MATH_UNARY_FUNC_WRAPPER\n" - "} // namespace __jitify_math_ns\n" - "namespace std { using namespace __jitify_math_ns; }\n" - "#define M_PI 3.14159265358979323846\n" - // Note: Global namespace already includes CUDA math funcs - "//using namespace __jitify_math_ns;\n"; - -static const char* jitsafe_header_memory_h = R"( - #pragma once - #include - )"; - -// TODO: incomplete -static const char* jitsafe_header_mutex = R"( - #pragma once - #if __cplusplus >= 201103L - namespace __jitify_mutex_ns { - class mutex { - public: - void lock(); - bool try_lock(); - void unlock(); - }; - } // namespace __jitify_mutex_ns - namespace std { 
using namespace __jitify_mutex_ns; } - using namespace __jitify_mutex_ns; - #endif - )"; - -static const char* jitsafe_header_algorithm = R"( - #pragma once - #if __cplusplus >= 201103L - namespace __jitify_algorithm_ns { - - #if __cplusplus == 201103L - #define JITIFY_CXX14_CONSTEXPR - #else - #define JITIFY_CXX14_CONSTEXPR constexpr - #endif - - template JITIFY_CXX14_CONSTEXPR const T& max(const T& a, const T& b) - { - return (b > a) ? b : a; - } - template JITIFY_CXX14_CONSTEXPR const T& min(const T& a, const T& b) - { - return (b < a) ? b : a; - } - - } // namespace __jitify_algorithm_ns - namespace std { using namespace __jitify_algorithm_ns; } - using namespace __jitify_algorithm_ns; - #endif - )"; - -static const char* jitsafe_header_time_h = R"( - #pragma once - #define NULL 0 - #define CLOCKS_PER_SEC 1000000 - namespace __jitify_time_ns { - typedef long time_t; - struct tm { - int tm_sec; - int tm_min; - int tm_hour; - int tm_mday; - int tm_mon; - int tm_year; - int tm_wday; - int tm_yday; - int tm_isdst; - }; - #if __cplusplus >= 201703L - struct timespec { - time_t tv_sec; - long tv_nsec; - }; - #endif - } // namespace __jitify_time_ns - namespace std { - // NVRTC provides built-in definitions of ::size_t and ::clock_t. - using ::size_t; - using ::clock_t; - using namespace __jitify_time_ns; - } - using namespace __jitify_time_ns; - )"; - -// WAR: These need to be pre-included as a workaround for NVRTC implicitly using -// /usr/include as an include path. The other built-in headers will be included -// lazily as needed. -static const char* preinclude_jitsafe_header_names[] = { - "jitify_preinclude.h", - "limits.h", - "math.h", - "memory.h", - "stdint.h", - "stdlib.h", - "stdio.h", - "string.h", - "time.h", -}; - -template -int array_size(T (&)[N]) { - return N; -} -const int preinclude_jitsafe_headers_count = - array_size(preinclude_jitsafe_header_names); - -static const std::map& get_jitsafe_headers_map() { - static const std::map jitsafe_headers_map = { - {"jitify_preinclude.h", jitsafe_header_preinclude_h}, - {"float.h", jitsafe_header_float_h}, - {"cfloat", jitsafe_header_float_h}, - {"limits.h", jitsafe_header_limits_h}, - {"climits", jitsafe_header_limits_h}, - {"stdint.h", jitsafe_header_stdint_h}, - {"cstdint", jitsafe_header_stdint_h}, - {"stddef.h", jitsafe_header_stddef_h}, - {"cstddef", jitsafe_header_stddef_h}, - {"stdlib.h", jitsafe_header_stdlib_h}, - {"cstdlib", jitsafe_header_stdlib_h}, - {"stdio.h", jitsafe_header_stdio_h}, - {"cstdio", jitsafe_header_stdio_h}, - {"string.h", jitsafe_header_string_h}, - {"cstring", jitsafe_header_cstring}, - {"iterator", jitsafe_header_iterator}, - {"limits", jitsafe_header_limits}, - {"type_traits", jitsafe_header_type_traits}, - {"utility", jitsafe_header_utility}, - {"math.h", jitsafe_header_math}, - {"cmath", jitsafe_header_math}, - {"memory.h", jitsafe_header_memory_h}, - {"complex", jitsafe_header_complex}, - {"iostream", jitsafe_header_iostream}, - {"ostream", jitsafe_header_ostream}, - {"istream", jitsafe_header_istream}, - {"sstream", jitsafe_header_sstream}, - {"vector", jitsafe_header_vector}, - {"string", jitsafe_header_string}, - {"stdexcept", jitsafe_header_stdexcept}, - {"mutex", jitsafe_header_mutex}, - {"algorithm", jitsafe_header_algorithm}, - {"time.h", jitsafe_header_time_h}, - {"ctime", jitsafe_header_time_h}, - }; - return jitsafe_headers_map; -} - -inline void add_options_from_env(std::vector& options) { - // Add options from environment variable - const char* env_options = std::getenv("JITIFY_OPTIONS"); 
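-  // JITIFY_OPTIONS holds a whitespace-separated option list, e.g.
-  //   JITIFY_OPTIONS="-I/opt/myproj/include -std=c++14"
-  // (a hypothetical layout); the loop below tokenizes it into `options`.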
- if (env_options) { - std::stringstream ss; - ss << env_options; - std::string opt; - while (!(ss >> opt).fail()) { - options.push_back(opt); - } - } - // Add options from JITIFY_OPTIONS macro -#ifdef JITIFY_OPTIONS -#define JITIFY_TOSTRING_IMPL(x) #x -#define JITIFY_TOSTRING(x) JITIFY_TOSTRING_IMPL(x) - std::stringstream ss; - ss << JITIFY_TOSTRING(JITIFY_OPTIONS); - std::string opt; - while (!(ss >> opt).fail()) { - options.push_back(opt); - } -#undef JITIFY_TOSTRING -#undef JITIFY_TOSTRING_IMPL -#endif // JITIFY_OPTIONS -} - -inline void detect_and_add_cuda_arch(std::vector& options) { - for (int i = 0; i < (int)options.size(); ++i) { - // Note that this will also match the middle of "--gpu-architecture". - if (options[i].find("-arch") != std::string::npos) { - // Arch already specified in options - return; - } - } - // Use the compute capability of the current device - // TODO: Check these API calls for errors - cudaError_t status; - int device; - status = cudaGetDevice(&device); - if (status != cudaSuccess) { - throw std::runtime_error( - std::string( - "Failed to detect GPU architecture: cudaGetDevice failed: ") + - cudaGetErrorString(status)); - } - int cc_major; - cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device); - int cc_minor; - cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device); - int cc = cc_major * 10 + cc_minor; - // Note: We must limit the architecture to the max supported by the current - // version of NVRTC, otherwise newer hardware will cause errors - // on older versions of CUDA. - // TODO: It would be better to detect this somehow, rather than hard-coding it - - // Tegra chips do not have forwards compatibility so we need to special case - // them. - bool is_tegra = ((cc_major == 3 && cc_minor == 2) || // Logan - (cc_major == 5 && cc_minor == 3) || // Erista - (cc_major == 6 && cc_minor == 2) || // Parker - (cc_major == 7 && cc_minor == 2)); // Xavier - if (!is_tegra) { - // ensure that future CUDA versions just work (even if suboptimal) - const int cuda_major = std::min(10, CUDA_VERSION / 1000); - // clang-format off - switch (cuda_major) { - case 10: cc = std::min(cc, 75); break; // Turing - case 9: cc = std::min(cc, 70); break; // Volta - case 8: cc = std::min(cc, 61); break; // Pascal - case 7: cc = std::min(cc, 52); break; // Maxwell - default: - throw std::runtime_error("Unexpected CUDA major version " + - std::to_string(cuda_major)); - } - // clang-format on - } - - std::stringstream ss; - ss << cc; - options.push_back("-arch=compute_" + ss.str()); -} - -inline void detect_and_add_cxx11_flag(std::vector& options) { - // Reverse loop so we can erase on the fly. - for (int i = (int)options.size() - 1; i >= 0; --i) { - if (options[i].find("-std=c++98") != std::string::npos) { - // NVRTC doesn't support specifying c++98 explicitly, so we remove it. - options.erase(options.begin() + i); - return; - } else if (options[i].find("-std") != std::string::npos) { - // Some other standard was explicitly specified, don't change anything. - return; - } - } - // Jitify must be compiled with C++11 support, so we default to enabling it - // for the JIT-compiled code too. 
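-  // Net effect: an explicit "-std=c++98" is dropped (NVRTC cannot honor
-  // it), any other explicit "-std=..." is left alone, and with no "-std"
-  // at all the line below defaults the JIT compile to C++11.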
-  options.push_back("-std=c++11");
-}
-
-inline void split_compiler_and_linker_options(
-    std::vector<std::string> options,
-    std::vector<std::string>* compiler_options,
-    std::vector<std::string>* linker_files,
-    std::vector<std::string>* linker_paths) {
-  for (int i = 0; i < (int)options.size(); ++i) {
-    std::string opt = options[i];
-    std::string flag = opt.substr(0, 2);
-    std::string value = opt.substr(2);
-    if (flag == "-l") {
-      linker_files->push_back(value);
-    } else if (flag == "-L") {
-      linker_paths->push_back(value);
-    } else {
-      compiler_options->push_back(opt);
-    }
-  }
-}
-
-inline bool pop_remove_unused_globals_flag(std::vector<std::string>* options) {
-  auto it = std::remove_if(
-      options->begin(), options->end(), [](const std::string& opt) {
-        return opt.find("-remove-unused-globals") != std::string::npos;
-      });
-  if (it != options->end()) {
-    options->resize(it - options->begin());
-    return true;
-  }
-  return false;
-}
-
-inline std::string ptx_parse_decl_name(const std::string& line) {
-  size_t name_end = line.find_first_of("[;");
-  if (name_end == std::string::npos) {
-    throw std::runtime_error(
-        "Failed to parse .global/.const declaration in PTX: expected a "
-        "semicolon");
-  }
-  size_t name_start_minus1 = line.find_last_of(" \t", name_end);
-  if (name_start_minus1 == std::string::npos) {
-    throw std::runtime_error(
-        "Failed to parse .global/.const declaration in PTX: expected "
-        "whitespace");
-  }
-  size_t name_start = name_start_minus1 + 1;
-  std::string name = line.substr(name_start, name_end - name_start);
-  return name;
-}
-
-inline void ptx_remove_unused_globals(std::string* ptx) {
-  std::istringstream iss(*ptx);
-  std::vector<std::string> lines;
-  std::unordered_map<size_t, std::string> line_num_to_global_name;
-  std::unordered_set<std::string> name_set;
-  for (std::string line; std::getline(iss, line);) {
-    size_t line_num = lines.size();
-    lines.push_back(line);
-    auto terms = split_string(line);
-    if (terms.size() <= 1) continue;  // Ignore lines with no arguments
-    if (terms[0].substr(0, 2) == "//") continue;  // Ignore comment lines
-    if (terms[0].substr(0, 7) == ".global" ||
-        terms[0].substr(0, 6) == ".const") {
-      line_num_to_global_name.emplace(line_num, ptx_parse_decl_name(line));
-      continue;
-    }
-    if (terms[0][0] == '.') continue;  // Ignore .version, .reg, .param etc.
-    // Note: The first term will always be an instruction name; starting at 1
-    // also allows unchecked inspection of the previous term.
-    for (int i = 1; i < (int)terms.size(); ++i) {
-      if (terms[i].substr(0, 2) == "//") break;  // Ignore comments
-      // Note: The characters '.' and '%' are not treated as delimiters.
-      const char* token_delims = " \t()[]{},;+-*/~&|^?:=!<>\"'\\";
-      for (auto token : split_string(terms[i], -1, token_delims)) {
-        if (  // Ignore non-names
-            !(std::isalpha(token[0]) || token[0] == '_' || token[0] == '$') ||
-            token.find('.') != std::string::npos ||
-            // Ignore variable/parameter declarations
-            terms[i - 1][0] == '.' ||
-            // Ignore branch instructions
-            (token == "bra" && terms[i - 1][0] == '@') ||
-            // Ignore branch labels
-            (token.substr(0, 2) == "BB" &&
-             terms[i - 1].substr(0, 3) == "bra")) {
-          continue;
-        }
-        name_set.insert(token);
-      }
-    }
-  }
-  std::ostringstream oss;
-  for (size_t line_num = 0; line_num < lines.size(); ++line_num) {
-    auto it = line_num_to_global_name.find(line_num);
-    if (it != line_num_to_global_name.end()) {
-      const std::string& name = it->second;
-      if (!name_set.count(name)) {
-        continue;  // Remove unused .global declaration.
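#if 0  // For reference only: how split_compiler_and_linker_options (defined
       // above) partitions a mixed option list. Values are illustrative.
std::vector<std::string> opts = {"-lcudadevrt", "-L/usr/local/cuda/lib64",
                                 "-std=c++14"};
std::vector<std::string> compiler, lfiles, lpaths;
detail::split_compiler_and_linker_options(opts, &compiler, &lfiles, &lpaths);
// Result: compiler == {"-std=c++14"}, lfiles == {"cudadevrt"},
//         lpaths == {"/usr/local/cuda/lib64"} -- "-l"/"-L" prefixes stripped.
#endif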
-      }
-    }
-    oss << lines[line_num] << '\n';
-  }
-  *ptx = oss.str();
-}
-
-inline nvrtcResult compile_kernel(std::string program_name,
-                                  std::map<std::string, std::string> sources,
-                                  std::vector<std::string> options,
-                                  std::string instantiation = "",
-                                  std::string* log = 0, std::string* ptx = 0,
-                                  std::string* mangled_instantiation = 0) {
-  std::string program_source = sources[program_name];
-  // Build arrays of header names and sources
-  std::vector<const char*> header_names_c;
-  std::vector<const char*> header_sources_c;
-  int num_headers = (int)(sources.size() - 1);
-  header_names_c.reserve(num_headers);
-  header_sources_c.reserve(num_headers);
-  typedef std::map<std::string, std::string> source_map;
-  for (source_map::const_iterator iter = sources.begin(); iter != sources.end();
-       ++iter) {
-    std::string const& name = iter->first;
-    std::string const& code = iter->second;
-    if (name == program_name) {
-      continue;
-    }
-    header_names_c.push_back(name.c_str());
-    header_sources_c.push_back(code.c_str());
-  }
-
-  // TODO: This WAR is expected to be unnecessary as of CUDA > 10.2.
-  bool should_remove_unused_globals =
-      detail::pop_remove_unused_globals_flag(&options);
-
-  std::vector<const char*> options_c(options.size() + 2);
-  options_c[0] = "--device-as-default-execution-space";
-  options_c[1] = "--pre-include=jitify_preinclude.h";
-  for (int i = 0; i < (int)options.size(); ++i) {
-    options_c[i + 2] = options[i].c_str();
-  }
-
-#if CUDA_VERSION < 8000
-  std::string inst_dummy;
-  if (!instantiation.empty()) {
-    // WAR for no nvrtcAddNameExpression before CUDA 8.0
-    // Force template instantiation by adding dummy reference to kernel
-    inst_dummy = "__jitify_instantiation";
-    program_source +=
-        "\nvoid* " + inst_dummy + " = (void*)" + instantiation + ";\n";
-  }
-#endif
-
-#define CHECK_NVRTC(call)       \
-  do {                          \
-    nvrtcResult ret = call;     \
-    if (ret != NVRTC_SUCCESS) { \
-      return ret;               \
-    }                           \
-  } while (0)
-
-  nvrtcProgram nvrtc_program;
-  CHECK_NVRTC(nvrtcCreateProgram(
-      &nvrtc_program, program_source.c_str(), program_name.c_str(), num_headers,
-      header_sources_c.data(), header_names_c.data()));
-
-#if CUDA_VERSION >= 8000
-  if (!instantiation.empty()) {
-    CHECK_NVRTC(nvrtcAddNameExpression(nvrtc_program, instantiation.c_str()));
-  }
-#endif
-
-  nvrtcResult ret = nvrtcCompileProgram(nvrtc_program, (int)options_c.size(),
-                                        options_c.data());
-  if (log) {
-    size_t logsize;
-    CHECK_NVRTC(nvrtcGetProgramLogSize(nvrtc_program, &logsize));
-    std::vector<char> vlog(logsize, 0);
-    CHECK_NVRTC(nvrtcGetProgramLog(nvrtc_program, vlog.data()));
-    log->assign(vlog.data(), logsize);
-  }
-  if (ret != NVRTC_SUCCESS) {
-    return ret;
-  }
-
-  if (ptx) {
-    size_t ptxsize;
-    CHECK_NVRTC(nvrtcGetPTXSize(nvrtc_program, &ptxsize));
-    std::vector<char> vptx(ptxsize);
-    CHECK_NVRTC(nvrtcGetPTX(nvrtc_program, vptx.data()));
-    ptx->assign(vptx.data(), ptxsize);
-    if (should_remove_unused_globals) {
-      detail::ptx_remove_unused_globals(ptx);
-    }
-  }
-
-  if (!instantiation.empty() && mangled_instantiation) {
-#if CUDA_VERSION >= 8000
-    const char* mangled_instantiation_cstr;
-    // Note: The returned string pointer becomes invalid after
-    //       nvrtcDestroyProgram has been called, so we save it.
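#if 0  // For reference only: the bare NVRTC round trip that compile_kernel()
       // wraps, as a minimal sketch. Every call returns an nvrtcResult that
       // real code should check; the arch option is an assumption.
inline std::string compile_to_ptx_sketch(const char* source) {
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, source, "demo.cu", 0, nullptr, nullptr);
  const char* opts[] = {"--gpu-architecture=compute_70"};
  nvrtcCompileProgram(prog, 1, opts);
  size_t ptx_size = 0;
  nvrtcGetPTXSize(prog, &ptx_size);
  std::vector<char> ptx(ptx_size);
  nvrtcGetPTX(prog, ptx.data());
  nvrtcDestroyProgram(&prog);
  return std::string(ptx.data(), ptx_size);
}
#endif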
-    CHECK_NVRTC(nvrtcGetLoweredName(nvrtc_program, instantiation.c_str(),
-                                    &mangled_instantiation_cstr));
-    *mangled_instantiation = mangled_instantiation_cstr;
-#else
-    // Extract mangled kernel template instantiation from PTX
-    inst_dummy += " = ";  // Note: This must match how the PTX is generated
-    int mi_beg = ptx->find(inst_dummy) + inst_dummy.size();
-    int mi_end = ptx->find(";", mi_beg);
-    *mangled_instantiation = ptx->substr(mi_beg, mi_end - mi_beg);
-#endif
-  }
-
-  CHECK_NVRTC(nvrtcDestroyProgram(&nvrtc_program));
-#undef CHECK_NVRTC
-  return NVRTC_SUCCESS;
-}
-
-inline void load_program(std::string const& cuda_source,
-                         std::vector<std::string> const& headers,
-                         file_callback_type file_callback,
-                         std::vector<std::string>* include_paths,
-                         std::map<std::string, std::string>* program_sources,
-                         std::vector<std::string>* program_options,
-                         std::string* program_name) {
-  // Extract include paths from compile options
-  std::vector<std::string>::iterator iter = program_options->begin();
-  while (iter != program_options->end()) {
-    std::string const& opt = *iter;
-    if (opt.substr(0, 2) == "-I") {
-      include_paths->push_back(opt.substr(2));
-      iter = program_options->erase(iter);
-    } else {
-      ++iter;
-    }
-  }
-
-  // Load program source
-  if (!detail::load_source(cuda_source, *program_sources, "", *include_paths,
-                           file_callback)) {
-    throw std::runtime_error("Source not found: " + cuda_source);
-  }
-  *program_name = program_sources->begin()->first;
-
-  // Maps header include names to their full file paths.
-  std::map<std::string, std::string> header_fullpaths;
-
-  // Load header sources
-  for (std::string const& header : headers) {
-    if (!detail::load_source(header, *program_sources, "", *include_paths,
-                             file_callback, &header_fullpaths)) {
-      // **TODO: Deal with source not found
-      throw std::runtime_error("Source not found: " + header);
-    }
-  }
-
-#if JITIFY_PRINT_SOURCE
-  std::string& program_source = (*program_sources)[*program_name];
-  std::cout << "---------------------------------------" << std::endl;
-  std::cout << "--- Source of " << *program_name << " ---" << std::endl;
-  std::cout << "---------------------------------------" << std::endl;
-  detail::print_with_line_numbers(program_source);
-  std::cout << "---------------------------------------" << std::endl;
-#endif
-
-  std::vector<std::string> compiler_options, linker_files, linker_paths;
-  detail::split_compiler_and_linker_options(*program_options, &compiler_options,
-                                            &linker_files, &linker_paths);
-
-  // If no arch is specified at this point we use whatever the current
-  // context is. This ensures we pick up the correct internal headers
-  // for arch-dependent compilation, e.g., some intrinsics are only
-  // present for specific architectures.
-  detail::detect_and_add_cuda_arch(compiler_options);
-  detail::detect_and_add_cxx11_flag(compiler_options);
-
-  // Iteratively try to compile the sources, and use the resulting errors to
-  // identify missing headers.
-  std::string log;
-  nvrtcResult ret;
-  while ((ret = detail::compile_kernel(*program_name, *program_sources,
-                                       compiler_options, "", &log)) ==
-         NVRTC_ERROR_COMPILATION) {
-    std::string include_name;
-    std::string include_parent;
-    int line_num = 0;
-    if (!detail::extract_include_info_from_compile_error(
-            log, include_name, include_parent, line_num)) {
-#if JITIFY_PRINT_LOG
-      detail::print_compile_log(*program_name, log);
-#endif
-      // There was a non include-related compilation error
-      // TODO: How to handle error?
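#if 0  // For reference only: the retry loop below keys off the shape of
       // NVRTC's "file not found" diagnostics, roughly of the form
       //   parent.cu(3): catastrophic error: cannot open source file "hdr.h"
       // A hedged sketch of that extraction (the real
       // extract_include_info_from_compile_error handles more variants);
       // assumes <cstdlib> for std::atoi.
inline bool parse_missing_include_sketch(const std::string& log,
                                         std::string* name,
                                         std::string* parent, int* line) {
  const std::string tag = "cannot open source file \"";
  size_t q1 = log.find(tag);
  if (q1 == std::string::npos) return false;
  q1 += tag.size();
  size_t q2 = log.find('"', q1);
  *name = log.substr(q1, q2 - q1);             // e.g. "hdr.h"
  size_t paren = log.find('(');
  *parent = log.substr(0, paren);              // e.g. "parent.cu"
  *line = std::atoi(log.c_str() + paren + 1);  // e.g. 3
  return true;
}
#endif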
-      throw std::runtime_error("Runtime compilation failed");
-    }
-
-    bool is_included_with_quotes = false;
-    if (program_sources->count(include_parent)) {
-      const std::string& parent_source = (*program_sources)[include_parent];
-      is_included_with_quotes =
-          is_include_directive_with_quotes(parent_source, line_num);
-    }
-
-    // Try to load the new header
-    // Note: This fullpath lookup is needed because the compiler error
-    // messages have the include name of the header instead of its full path.
-    std::string include_parent_fullpath = header_fullpaths[include_parent];
-    std::string include_path = detail::path_base(include_parent_fullpath);
-    if (detail::load_source(include_name, *program_sources, include_path,
-                            *include_paths, file_callback, &header_fullpaths,
-                            is_included_with_quotes)) {
-#if JITIFY_PRINT_HEADER_PATHS
-      std::cout << "Found #include " << include_name << " from "
-                << include_parent << ":" << line_num << " ["
-                << include_parent_fullpath << "]"
-                << " at:\n " << header_fullpaths[include_name] << std::endl;
-#endif
-    } else {  // Failed to find header file.
-      // Comment-out the include line and print a warning
-      if (!program_sources->count(include_parent)) {
-        // ***TODO: Unless there's another mechanism (e.g., potentially
-        //          the parent path vs. filename problem), getting
-        //          here means include_parent was found automatically
-        //          in a system include path.
-        //          We need a WAR to zap it from *its parent*.
-
-        typedef std::map<std::string, std::string> source_map;
-        for (source_map::const_iterator it = program_sources->begin();
-             it != program_sources->end(); ++it) {
-          std::cout << "  " << it->first << std::endl;
-        }
-        throw std::out_of_range(include_parent +
-                                " not in loaded sources!"
-                                " This may be due to a header being loaded by"
-                                " NVRTC without Jitify's knowledge.");
-      }
-      std::string& parent_source = (*program_sources)[include_parent];
-      parent_source = detail::comment_out_code_line(line_num, parent_source);
-#if JITIFY_PRINT_LOG
-      std::cout << include_parent << "(" << line_num
-                << "): warning: " << include_name << ": [jitify] File not found"
-                << std::endl;
-#endif
-    }
-  }
-  if (ret != NVRTC_SUCCESS) {
-#if JITIFY_PRINT_LOG
-    if (ret == NVRTC_ERROR_INVALID_OPTION) {
-      std::cout << "Compiler options: ";
-      for (int i = 0; i < (int)compiler_options.size(); ++i) {
-        std::cout << compiler_options[i] << " ";
-      }
-      std::cout << std::endl;
-    }
-#endif
-    throw std::runtime_error(std::string("NVRTC error: ") +
-                             nvrtcGetErrorString(ret));
-  }
-}
-
-inline void instantiate_kernel(
-    std::string const& program_name,
-    std::map<std::string, std::string> const& program_sources,
-    std::string const& instantiation, std::vector<std::string> const& options,
-    std::string* log, std::string* ptx, std::string* mangled_instantiation,
-    std::vector<std::string>* linker_files,
-    std::vector<std::string>* linker_paths) {
-  std::vector<std::string> compiler_options;
-  detail::split_compiler_and_linker_options(options, &compiler_options,
-                                            linker_files, linker_paths);
-
-  std::cout << "About to compile kernel" << std::endl;
-  nvrtcResult ret =
-      detail::compile_kernel(program_name, program_sources, compiler_options,
-                             instantiation, log, ptx, mangled_instantiation);
-#if JITIFY_PRINT_LOG
-  if (log->size() > 1) {
-    detail::print_compile_log(program_name, *log);
-  }
-#endif
-  if (ret != NVRTC_SUCCESS) {
-    throw std::runtime_error(std::string("NVRTC error: ") +
-                             nvrtcGetErrorString(ret));
-  }
-  std::cout << "done compiling" << std::endl;
-
-#if JITIFY_PRINT_PTX
-  std::cout << "---------------------------------------" << std::endl;
-  std::cout << *mangled_instantiation << std::endl;
-  std::cout << "---------------------------------------" << std::endl;
"---------------------------------------" << std::endl; - std::cout << "--- PTX for " << mangled_instantiation << " in " << program_name - << " ---" << std::endl; - std::cout << "---------------------------------------" << std::endl; - std::cout << *ptx << std::endl; - std::cout << "---------------------------------------" << std::endl; -#endif -} - -inline void get_1d_max_occupancy(CUfunction func, - CUoccupancyB2DSize smem_callback, - unsigned int* smem, int max_block_size, - unsigned int flags, int* grid, int* block) { - if (!func) { - throw std::runtime_error( - "Kernel pointer is NULL; you may need to define JITIFY_THREAD_SAFE " - "1"); - } - CUresult res = cuOccupancyMaxPotentialBlockSizeWithFlags( - grid, block, func, smem_callback, *smem, max_block_size, flags); - if (res != CUDA_SUCCESS) { - const char* msg; - cuGetErrorName(res, &msg); - throw std::runtime_error(msg); - } - if (smem_callback) { - *smem = (unsigned int)smem_callback(*block); - } -} - -} // namespace detail - -//! \endcond - -class KernelInstantiation; -class Kernel; -class Program; -class JitCache; - -struct ProgramConfig { - std::vector options; - std::vector include_paths; - std::string name; - typedef std::map source_map; - source_map sources; -}; - -class JitCache_impl { - friend class Program_impl; - friend class KernelInstantiation_impl; - friend class KernelLauncher_impl; - typedef uint64_t key_type; - jitify::ObjectCache _kernel_cache; - jitify::ObjectCache _program_config_cache; - std::vector _options; -#if JITIFY_THREAD_SAFE - std::mutex _kernel_cache_mutex; - std::mutex _program_cache_mutex; -#endif - public: - inline JitCache_impl(size_t cache_size) - : _kernel_cache(cache_size), _program_config_cache(cache_size) { - detail::add_options_from_env(_options); - - // Bootstrap the cuda context to avoid errors - cudaFree(0); - } -}; - -class Program_impl { - // A friendly class - friend class Kernel_impl; - friend class KernelLauncher_impl; - friend class KernelInstantiation_impl; - // TODO: This can become invalid if JitCache is destroyed before the - // Program object is. However, this can't happen if JitCache - // instances are static. 
-  JitCache_impl& _cache;
-  uint64_t _hash;
-  ProgramConfig* _config;
-  void load_sources(std::string source, std::vector<std::string> headers,
-                    std::vector<std::string> options,
-                    file_callback_type file_callback);
-
- public:
-  inline Program_impl(JitCache_impl& cache, std::string source,
-                      jitify::detail::vector<std::string> headers = 0,
-                      jitify::detail::vector<std::string> options = 0,
-                      file_callback_type file_callback = 0);
-  inline Program_impl(Program_impl const&) = default;
-  inline Program_impl(Program_impl&&) = default;
-  inline std::vector<std::string> const& options() const {
-    return _config->options;
-  }
-  inline std::string const& name() const { return _config->name; }
-  inline ProgramConfig::source_map const& sources() const {
-    return _config->sources;
-  }
-  inline std::vector<std::string> const& include_paths() const {
-    return _config->include_paths;
-  }
-};
-
-class Kernel_impl {
-  friend class KernelLauncher_impl;
-  friend class KernelInstantiation_impl;
-  Program_impl _program;
-  std::string _name;
-  std::vector<std::string> _options;
-  uint64_t _hash;
-
- public:
-  inline Kernel_impl(Program_impl const& program, std::string name,
-                     jitify::detail::vector<std::string> options = 0);
-  inline Kernel_impl(Kernel_impl const&) = default;
-  inline Kernel_impl(Kernel_impl&&) = default;
-};
-
-class KernelInstantiation_impl {
-  friend class KernelLauncher_impl;
-  Kernel_impl _kernel;
-  uint64_t _hash;
-  std::string _template_inst;
-  std::vector<std::string> _options;
-  detail::CUDAKernel* _cuda_kernel;
-  inline void print() const;
-  void build_kernel();
-
- public:
-  inline KernelInstantiation_impl(
-      Kernel_impl const& kernel, std::vector<std::string> const& template_args);
-  inline KernelInstantiation_impl(KernelInstantiation_impl const&) = default;
-  inline KernelInstantiation_impl(KernelInstantiation_impl&&) = default;
-  detail::CUDAKernel const& cuda_kernel() const { return *_cuda_kernel; }
-};
-
-class KernelLauncher_impl {
-  KernelInstantiation_impl _kernel_inst;
-  dim3 _grid;
-  dim3 _block;
-  unsigned int _smem;
-  cudaStream_t _stream;
-
- public:
-  inline KernelLauncher_impl(KernelInstantiation_impl const& kernel_inst,
-                             dim3 grid, dim3 block, unsigned int smem = 0,
-                             cudaStream_t stream = 0)
-      : _kernel_inst(kernel_inst),
-        _grid(grid),
-        _block(block),
-        _smem(smem),
-        _stream(stream) {}
-  inline KernelLauncher_impl(KernelLauncher_impl const&) = default;
-  inline KernelLauncher_impl(KernelLauncher_impl&&) = default;
-  inline CUresult launch(
-      jitify::detail::vector<void*> arg_ptrs,
-      jitify::detail::vector<std::string> arg_types = 0) const;
-};
-
-/*! An object representing a configured and instantiated kernel ready
- *  for launching.
- */
-class KernelLauncher {
-  std::unique_ptr<KernelLauncher_impl> _impl;
-
- public:
-  inline KernelLauncher(KernelInstantiation const& kernel_inst, dim3 grid,
-                        dim3 block, unsigned int smem = 0,
-                        cudaStream_t stream = 0);
-
-  // Note: It's important that there is no implicit conversion required
-  //       for arg_ptrs, because otherwise the parameter pack version
-  //       below gets called instead (probably resulting in a segfault).
-  /*! Launch the kernel.
-   *
-   *  \param arg_ptrs  A vector of pointers to each function argument for the
-   *    kernel.
-   *  \param arg_types  A vector of function argument types represented
-   *    as code-strings. This parameter is optional and is only used to print
-   *    out the function signature.
-   */
-  inline CUresult launch(
-      std::vector<void*> arg_ptrs = std::vector<void*>(),
-      jitify::detail::vector<std::string> arg_types = 0) const {
-    return _impl->launch(arg_ptrs, arg_types);
-  }
-  // Regular function call syntax
-  /*! Launch the kernel.
-   *
-   *  \see launch
-   */
-  template <typename... ArgTypes>
-  inline CUresult operator()(ArgTypes... args) const {
-    return this->launch(args...);
-  }
-  /*! Launch the kernel.
-   *
-   *  \param args Function arguments for the kernel.
-   */
-  template <typename... ArgTypes>
-  inline CUresult launch(ArgTypes... args) const {
-    return this->launch(std::vector<void*>({(void*)&args...}),
-                        {reflection::reflect<ArgTypes>()...});
-  }
-};
-
-/*! An object representing a kernel instantiation made up of a Kernel and
- *  template arguments.
- */
-class KernelInstantiation {
-  friend class KernelLauncher;
-  std::unique_ptr<KernelInstantiation_impl> _impl;
-
- public:
-  inline KernelInstantiation(Kernel const& kernel,
-                             std::vector<std::string> const& template_args);
-
-  /*! Implicit conversion to the underlying CUfunction object.
-   *
-   * \note This allows use of CUDA APIs like
-   *   cuOccupancyMaxActiveBlocksPerMultiprocessor.
-   */
-  inline operator CUfunction() const { return _impl->cuda_kernel(); }
-
-  /*! Configure the kernel launch.
-   *
-   *  \see configure
-   */
-  inline KernelLauncher operator()(dim3 grid, dim3 block, unsigned int smem = 0,
-                                   cudaStream_t stream = 0) const {
-    return this->configure(grid, block, smem, stream);
-  }
-  /*! Configure the kernel launch.
-   *
-   *  \param grid   The thread grid dimensions for the launch.
-   *  \param block  The thread block dimensions for the launch.
-   *  \param smem   The amount of shared memory to dynamically allocate, in
-   *    bytes.
-   *  \param stream The CUDA stream to launch the kernel in.
-   */
-  inline KernelLauncher configure(dim3 grid, dim3 block, unsigned int smem = 0,
-                                  cudaStream_t stream = 0) const {
-    return KernelLauncher(*this, grid, block, smem, stream);
-  }
-  /*! Configure the kernel launch with a 1-dimensional block and grid chosen
-   *  automatically to maximise occupancy.
-   *
-   *  \param max_block_size  The upper limit on the block size, or 0 for no
-   *    limit.
-   *  \param smem  The amount of shared memory to dynamically allocate, in bytes.
-   *  \param smem_callback  A function returning smem for a given block size
-   *    (overrides \p smem).
-   *  \param stream The CUDA stream to launch the kernel in.
-   *  \param flags  The flags to pass to
-   *    cuOccupancyMaxPotentialBlockSizeWithFlags.
-   */
-  inline KernelLauncher configure_1d_max_occupancy(
-      int max_block_size = 0, unsigned int smem = 0,
-      CUoccupancyB2DSize smem_callback = 0, cudaStream_t stream = 0,
-      unsigned int flags = 0) const {
-    int grid;
-    int block;
-    CUfunction func = _impl->cuda_kernel();
-    detail::get_1d_max_occupancy(func, smem_callback, &smem, max_block_size,
-                                 flags, &grid, &block);
-    return this->configure(grid, block, smem, stream);
-  }
-
-  /*
-   * \deprecated Use \p get_global_ptr instead.
-   */
-  inline CUdeviceptr get_constant_ptr(const char* name,
-                                      size_t* size = nullptr) const {
-    return get_global_ptr(name, size);
-  }
-
-  /*
-   * Get a device pointer to a global __constant__ or __device__ variable using
-   * its un-mangled name. If provided, *size is set to the size of the variable
-   * in bytes.
-   */
-  inline CUdeviceptr get_global_ptr(const char* name,
-                                    size_t* size = nullptr) const {
-    return _impl->cuda_kernel().get_global_ptr(name, size);
-  }
-
-  /*
-   * Copy data from a global __constant__ or __device__ array to the host using
-   * its un-mangled name.
-   */
-  template <typename T>
-  inline CUresult get_global_array(const char* name, T* data, size_t count,
-                                   CUstream stream = 0) const {
-    return _impl->cuda_kernel().get_global_data(name, data, count, stream);
-  }
-
-  /*
-   * Copy a value from a global __constant__ or __device__ variable to the host
-   * using its un-mangled name.
-   */
-  template <typename T>
-  inline CUresult get_global_value(const char* name, T* value,
-                                   CUstream stream = 0) const {
-    return get_global_array(name, value, 1, stream);
-  }
-
-  /*
-   * Copy data from the host to a global __constant__ or __device__ array using
-   * its un-mangled name.
-   */
-  template <typename T>
-  inline CUresult set_global_array(const char* name, const T* data,
-                                   size_t count, CUstream stream = 0) const {
-    return _impl->cuda_kernel().set_global_data(name, data, count, stream);
-  }
-
-  /*
-   * Copy a value from the host to a global __constant__ or __device__ variable
-   * using its un-mangled name.
-   */
-  template <typename T>
-  inline CUresult set_global_value(const char* name, const T& value,
-                                   CUstream stream = 0) const {
-    return set_global_array(name, &value, 1, stream);
-  }
-
-  const std::string& mangled_name() const {
-    return _impl->cuda_kernel().function_name();
-  }
-
-  const std::string& ptx() const { return _impl->cuda_kernel().ptx(); }
-
-  const std::vector<std::string>& link_files() const {
-    return _impl->cuda_kernel().link_files();
-  }
-
-  const std::vector<std::string>& link_paths() const {
-    return _impl->cuda_kernel().link_paths();
-  }
-};
-
-/*! An object representing a kernel made up of a Program, a name and options.
- */
-class Kernel {
-  friend class KernelInstantiation;
-  std::unique_ptr<Kernel_impl> _impl;
-
- public:
-  Kernel(Program const& program, std::string name,
-         jitify::detail::vector<std::string> options = 0);
-
-  /*! Instantiate the kernel.
-   *
-   *  \param template_args A vector of template arguments represented as
-   *    code-strings. These can be generated using
-   *    \code{.cpp}jitify::reflection::reflect<type>()\endcode or
-   *    \code{.cpp}jitify::reflection::reflect(value)\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  // inline KernelInstantiation instantiate(std::vector<std::string> const&
-  //  template_args) const {
-  inline KernelInstantiation instantiate(
-      std::vector<std::string> const& template_args =
-          std::vector<std::string>()) const {
-    return KernelInstantiation(*this, template_args);
-  }
-
-  // Regular template instantiation syntax (note limited flexibility)
-  /*! Instantiate the kernel.
-   *
-   *  \note The template arguments specified on this function are
-   *    used to instantiate the kernel. Non-type template arguments must
-   *    be wrapped with
-   *    \code{.cpp}jitify::reflection::NonType<type,value>\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  template <typename... TemplateArgs>
-  inline KernelInstantiation instantiate() const {
-    return this->instantiate(
-        std::vector<std::string>({reflection::reflect<TemplateArgs>()...}));
-  }
-  // Template-like instantiation syntax
-  //   E.g., instantiate(myvar,Type<MyType>())(grid,block)
-  /*! Instantiate the kernel.
-   *
-   *  \param targs The template arguments for the kernel, represented as
-   *    values. Types must be wrapped with
-   *    \code{.cpp}jitify::reflection::Type<type>()\endcode or
-   *    \code{.cpp}jitify::reflection::type_of(value)\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  template <typename... TemplateArgs>
-  inline KernelInstantiation instantiate(TemplateArgs... targs) const {
-    return this->instantiate(
-        std::vector<std::string>({reflection::reflect(targs)...}));
-  }
-};
-
-/*! An object representing a program made up of source code, headers
- *  and options.
- */
-class Program {
-  friend class Kernel;
-  std::unique_ptr<Program_impl> _impl;
-
- public:
-  Program(JitCache& cache, std::string source,
-          jitify::detail::vector<std::string> headers = 0,
-          jitify::detail::vector<std::string> options = 0,
-          file_callback_type file_callback = 0);
-
-  /*! Select a kernel.
-   *
-   * \param name The name of the kernel (unmangled and without
-   *   template arguments).
-   * \param options A vector of options to be passed to the NVRTC
-   *   compiler when compiling this kernel.
-   */
-  inline Kernel kernel(std::string name,
-                       jitify::detail::vector<std::string> options = 0) const {
-    return Kernel(*this, name, options);
-  }
-  /*! Select a kernel.
-   *
-   *  \see kernel
-   */
-  inline Kernel operator()(
-      std::string name, jitify::detail::vector<std::string> options = 0) const {
-    return this->kernel(name, options);
-  }
-};
-
-/*! An object that manages a cache of JIT-compiled CUDA kernels.
- *
- */
-class JitCache {
-  friend class Program;
-  std::unique_ptr<JitCache_impl> _impl;
-
- public:
-  /*! JitCache constructor.
-   *  \param cache_size The number of kernels to hold in the cache
-   *    before overwriting the least-recently-used ones.
-   */
-  enum { DEFAULT_CACHE_SIZE = 128 };
-  JitCache(size_t cache_size = DEFAULT_CACHE_SIZE)
-      : _impl(new JitCache_impl(cache_size)) {}
-
-  /*! Create a program.
-   *
-   *  \param source A string containing either the source filename or
-   *    the source itself; in the latter case, the first line must be
-   *    the name of the program.
-   *  \param headers A vector of strings representing the source of
-   *    each header file required by the program. Each entry can be
-   *    either the header filename or the header source itself; in
-   *    the latter case, the first line must be the name of the header
-   *    (i.e., the name by which the header is #included).
-   *  \param options A vector of options to be passed to the
-   *    NVRTC compiler. Include paths specified with \p -I
-   *    are added to the search paths used by Jitify. The environment
-   *    variable JITIFY_OPTIONS can also be used to define additional
-   *    options.
-   *  \param file_callback A pointer to a callback function that is
-   *    invoked whenever a source file needs to be loaded. Inside this
-   *    function, the user can either load/specify the source themselves
-   *    or defer to Jitify's file-loading mechanisms.
-   *  \note Program or header source files referenced by filename are
-   *  looked-up using the following mechanisms (in this order):
-   *  \note 1) By calling file_callback.
-   *  \note 2) By looking for the file embedded in the executable via the GCC
-   *  linker.
-   *  \note 3) By looking for the file in the filesystem.
-   *
-   *  \note Jitify recursively scans all source files for \p #include
-   *  directives and automatically adds them to the set of headers needed
-   *  by the program.
-   *  If a \p #include directive references a header that cannot be found,
-   *  the directive is automatically removed from the source code to prevent
-   *  immediate compilation failure. This may result in compilation errors
-   *  if the header was required by the program.
-   *
-   *  \note Jitify automatically includes NVRTC-safe versions of some
-   *  standard library headers.
-   */
-  inline Program program(std::string source,
-                         jitify::detail::vector<std::string> headers = 0,
-                         jitify::detail::vector<std::string> options = 0,
-                         file_callback_type file_callback = 0) {
-    return Program(*this, source, headers, options, file_callback);
-  }
-};
-
-inline Program::Program(JitCache& cache, std::string source,
-                        jitify::detail::vector<std::string> headers,
-                        jitify::detail::vector<std::string> options,
-                        file_callback_type file_callback)
-    : _impl(new Program_impl(*cache._impl, source, headers, options,
-                             file_callback)) {}
-
-inline Kernel::Kernel(Program const& program, std::string name,
-                      jitify::detail::vector<std::string> options)
-    : _impl(new Kernel_impl(*program._impl, name, options)) {}
-
-inline KernelInstantiation::KernelInstantiation(
-    Kernel const& kernel, std::vector<std::string> const& template_args)
-    : _impl(new KernelInstantiation_impl(*kernel._impl, template_args)) {}
-
-inline KernelLauncher::KernelLauncher(KernelInstantiation const& kernel_inst,
-                                      dim3 grid, dim3 block, unsigned int smem,
-                                      cudaStream_t stream)
-    : _impl(new KernelLauncher_impl(*kernel_inst._impl, grid, block, smem,
-                                    stream)) {}
-
-inline std::ostream& operator<<(std::ostream& stream, dim3 d) {
-  if (d.y == 1 && d.z == 1) {
-    stream << d.x;
-  } else {
-    stream << "(" << d.x << "," << d.y << "," << d.z << ")";
-  }
-  return stream;
-}
-
-inline CUresult KernelLauncher_impl::launch(
-    jitify::detail::vector<void*> arg_ptrs,
-    jitify::detail::vector<std::string> arg_types) const {
-#if JITIFY_PRINT_LAUNCH
-  Kernel_impl const& kernel = _kernel_inst._kernel;
-  std::string arg_types_string =
-      (arg_types.empty() ? "..." : reflection::reflect_list(arg_types));
-  std::cout << "Launching " << kernel._name << _kernel_inst._template_inst
-            << "<<<" << _grid << "," << _block << "," << _smem << "," << _stream
-            << ">>>"
-            << "(" << arg_types_string << ")" << std::endl;
-#endif
-  if (!_kernel_inst._cuda_kernel) {
-    throw std::runtime_error(
-        "Kernel pointer is NULL; you may need to define JITIFY_THREAD_SAFE 1");
-  }
-  return _kernel_inst._cuda_kernel->launch(_grid, _block, _smem, _stream,
-                                           arg_ptrs);
-}
-
-inline KernelInstantiation_impl::KernelInstantiation_impl(
-    Kernel_impl const& kernel, std::vector<std::string> const& template_args)
-    : _kernel(kernel), _options(kernel._options) {
-  _template_inst =
-      (template_args.empty() ? ""
"" - : reflection::reflect_template(template_args)); - using detail::hash_combine; - using detail::hash_larson64; - _hash = _kernel._hash; - _hash = hash_combine(_hash, hash_larson64(_template_inst.c_str())); - JitCache_impl& cache = _kernel._program._cache; - uint64_t cache_key = _hash; -#if JITIFY_THREAD_SAFE - std::lock_guard lock(cache._kernel_cache_mutex); -#endif - if (cache._kernel_cache.contains(cache_key)) { -#if JITIFY_PRINT_INSTANTIATION - std::cout << "Found "; - this->print(); -#endif - _cuda_kernel = &cache._kernel_cache.get(cache_key); - } else { -#if JITIFY_PRINT_INSTANTIATION - std::cout << "Building "; - this->print(); -#endif - _cuda_kernel = &cache._kernel_cache.emplace(cache_key); - this->build_kernel(); - } -} - -inline void KernelInstantiation_impl::print() const { - std::string options_string = reflection::reflect_list(_options); - std::cout << _kernel._name << _template_inst << " [" << options_string << "]" - << std::endl; -} - -inline void KernelInstantiation_impl::build_kernel() { - Program_impl const& program = _kernel._program; - - std::string instantiation = _kernel._name + _template_inst; - - std::string log, ptx, mangled_instantiation; - std::vector linker_files, linker_paths; - detail::instantiate_kernel(program.name(), program.sources(), instantiation, - _options, &log, &ptx, &mangled_instantiation, - &linker_files, &linker_paths); - - _cuda_kernel->set(mangled_instantiation.c_str(), ptx.c_str(), linker_files, - linker_paths); -} - -Kernel_impl::Kernel_impl(Program_impl const& program, std::string name, - jitify::detail::vector options) - : _program(program), _name(name), _options(options) { - // Merge options from parent - _options.insert(_options.end(), _program.options().begin(), - _program.options().end()); - detail::detect_and_add_cuda_arch(_options); - detail::detect_and_add_cxx11_flag(_options); - std::string options_string = reflection::reflect_list(_options); - using detail::hash_combine; - using detail::hash_larson64; - _hash = _program._hash; - _hash = hash_combine(_hash, hash_larson64(_name.c_str())); - _hash = hash_combine(_hash, hash_larson64(options_string.c_str())); -} - -Program_impl::Program_impl(JitCache_impl& cache, std::string source, - jitify::detail::vector headers, - jitify::detail::vector options, - file_callback_type file_callback) - : _cache(cache) { - // Compute hash of source, headers and options - std::string options_string = reflection::reflect_list(options); - using detail::hash_combine; - using detail::hash_larson64; - _hash = hash_combine(hash_larson64(source.c_str()), - hash_larson64(options_string.c_str())); - for (size_t i = 0; i < headers.size(); ++i) { - _hash = hash_combine(_hash, hash_larson64(headers[i].c_str())); - } - _hash = hash_combine(_hash, (uint64_t)file_callback); - // Add pre-include built-in JIT-safe headers - for (int i = 0; i < detail::preinclude_jitsafe_headers_count; ++i) { - const char* hdr_name = detail::preinclude_jitsafe_header_names[i]; - const std::string& hdr_source = - detail::get_jitsafe_headers_map().at(hdr_name); - headers.push_back(std::string(hdr_name) + "\n" + hdr_source); - } - // Merge options from parent - options.insert(options.end(), _cache._options.begin(), _cache._options.end()); - // Load sources -#if JITIFY_THREAD_SAFE - std::lock_guard lock(cache._program_cache_mutex); -#endif - if (!cache._program_config_cache.contains(_hash)) { - _config = &cache._program_config_cache.insert(_hash); - this->load_sources(source, headers, options, file_callback); - } else { - _config = 
-  }
-}
-
-inline void Program_impl::load_sources(std::string source,
-                                       std::vector<std::string> headers,
-                                       std::vector<std::string> options,
-                                       file_callback_type file_callback) {
-  _config->options = options;
-  detail::load_program(source, headers, file_callback, &_config->include_paths,
-                       &_config->sources, &_config->options, &_config->name);
-}
-
-enum Location { HOST, DEVICE };
-
-/*! Specifies location and parameters for execution of an algorithm.
- *  \param stream The CUDA stream on which to execute.
- *  \param headers A vector of headers to include in the code.
- *  \param options Options to pass to the NVRTC compiler.
- *  \param file_callback See jitify::Program.
- *  \param block_size The size of the CUDA thread block with which to
- *    execute.
- *  \param cache_size The number of kernels to store in the cache
- *    before overwriting the least-recently-used ones.
- */
-struct ExecutionPolicy {
-  /*! Location (HOST or DEVICE) on which to execute.*/
-  Location location;
-  /*! List of headers to include when compiling the algorithm.*/
-  std::vector<std::string> headers;
-  /*! List of compiler options.*/
-  std::vector<std::string> options;
-  /*! Optional callback for loading source files.*/
-  file_callback_type file_callback;
-  /*! CUDA stream on which to execute.*/
-  cudaStream_t stream;
-  /*! CUDA device on which to execute.*/
-  int device;
-  /*! CUDA block size with which to execute.*/
-  int block_size;
-  /*! The number of instantiations to store in the cache before overwriting
-   *  the least-recently-used ones.*/
-  size_t cache_size;
-  ExecutionPolicy(Location location_ = DEVICE,
-                  jitify::detail::vector<std::string> headers_ = 0,
-                  jitify::detail::vector<std::string> options_ = 0,
-                  file_callback_type file_callback_ = 0,
-                  cudaStream_t stream_ = 0, int device_ = 0,
-                  int block_size_ = 256,
-                  size_t cache_size_ = JitCache::DEFAULT_CACHE_SIZE)
-      : location(location_),
-        headers(headers_),
-        options(options_),
-        file_callback(file_callback_),
-        stream(stream_),
-        device(device_),
-        block_size(block_size_),
-        cache_size(cache_size_) {}
-};
-
-template <typename T>
-class Lambda;
-
-/*! An object that captures a set of variables for use in a parallel_for
- *  expression. See JITIFY_CAPTURE().
- */
-class Capture {
- public:
-  std::vector<std::string> _arg_decls;
-  std::vector<void*> _arg_ptrs;
-
- public:
-  template <typename... Args>
-  inline Capture(std::vector<std::string> arg_names, Args const&... args)
-      : _arg_ptrs{(void*)&args...} {
-    std::vector<std::string> arg_types = {reflection::reflect<Args>()...};
-    _arg_decls.resize(arg_names.size());
-    for (int i = 0; i < (int)arg_names.size(); ++i) {
-      _arg_decls[i] = arg_types[i] + " " + arg_names[i];
-    }
-  }
-};
-
-/*! An object that captures the instantiated Lambda function for use
-    in a parallel_for expression and the function string for NVRTC
-    compilation
- */
-template <typename Func>
-class Lambda {
- public:
-  Capture _capture;
-  std::string _func_string;
-  Func _func;
-
- public:
-  inline Lambda(Capture const& capture, std::string func_string, Func func)
-      : _capture(capture), _func_string(func_string), _func(func) {}
-};
-
-template <typename T>
-inline Lambda<T> make_Lambda(Capture const& capture, std::string func,
-                             T lambda) {
-  return Lambda<T>(capture, func, lambda);
-}
-
-#define JITIFY_CAPTURE(...)                                            \
-  jitify::Capture(jitify::detail::split_string(#__VA_ARGS__, -1, ","), \
-                  __VA_ARGS__)
-
-#define JITIFY_MAKE_LAMBDA(capture, x, ...)               \
-  jitify::make_Lambda(capture, std::string(#__VA_ARGS__), \
-                      [x](int i) { __VA_ARGS__; })
-
-#define JITIFY_ARGS(...) __VA_ARGS__
-
-#define JITIFY_LAMBDA_(x, ...) \
-  JITIFY_MAKE_LAMBDA(JITIFY_CAPTURE(x), JITIFY_ARGS(x), __VA_ARGS__)
-
-// macro sequence to strip surrounding brackets
-#define JITIFY_STRIP_PARENS(X) X
-#define JITIFY_PASS_PARAMETERS(X) JITIFY_STRIP_PARENS(JITIFY_ARGS X)
-
-/*! Creates a Lambda object with captured variables and a function
- *  definition.
- *  \param capture A bracket-enclosed list of variables to capture.
- *  \param ... The function definition.
- *
- *  \code{.cpp}
- *  float* capture_me;
- *  int    capture_me_too;
- *  auto my_lambda = JITIFY_LAMBDA( (capture_me, capture_me_too),
- *                                  capture_me[i] = i*capture_me_too );
- *  \endcode
- */
-#define JITIFY_LAMBDA(capture, ...)                            \
-  JITIFY_LAMBDA_(JITIFY_ARGS(JITIFY_PASS_PARAMETERS(capture)), \
-                 JITIFY_ARGS(__VA_ARGS__))
-
-// TODO: Try to implement for_each that accepts iterators instead of indices
-//       Add compile guard for NOCUDA compilation
-/*! Call a function for a range of indices
- *
- *  \param policy Determines the location and device parameters for
- *    execution of the parallel_for.
- *  \param begin  The starting index.
- *  \param end    The ending index.
- *  \param lambda A Lambda object created using the JITIFY_LAMBDA() macro.
- *
- *  \code{.cpp}
- *  char const* in;
- *  float*      out;
- *  parallel_for(0, 100, JITIFY_LAMBDA( (in, out), {char x = in[i]; out[i] =
- *  x*x; } ); \endcode
- */
-template <typename IndexType, class Func>
-CUresult parallel_for(ExecutionPolicy policy, IndexType begin, IndexType end,
-                      Lambda<Func> const& lambda) {
-  using namespace jitify;
-
-  if (policy.location == HOST) {
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-    for (IndexType i = begin; i < end; i++) {
-      lambda._func(i);
-    }
-    return CUDA_SUCCESS;  // FIXME - replace with non-CUDA enum type?
-  }
-
-  thread_local static JitCache kernel_cache(policy.cache_size);
-
-  std::vector<std::string> arg_decls;
-  arg_decls.push_back("I begin, I end");
-  arg_decls.insert(arg_decls.end(), lambda._capture._arg_decls.begin(),
-                   lambda._capture._arg_decls.end());
-
-  std::stringstream source_ss;
-  source_ss << "parallel_for_program\n";
-  for (auto const& header : policy.headers) {
-    std::string header_name = header.substr(0, header.find("\n"));
-    source_ss << "#include <" << header_name << ">\n";
-  }
-  source_ss << "template<typename I>\n"
-               "__global__\n"
-               "void parallel_for_kernel("
-            << reflection::reflect_list(arg_decls)
-            << ") {\n"
-               "  I i0 = threadIdx.x + blockDim.x*blockIdx.x;\n"
-               "  for( I i=i0+begin; i<end; i+=blockDim.x*gridDim.x ) {\n"
-               "    " << lambda._func_string << ";\n"
-               "  }\n"
-               "}\n";
-
-  Program program = kernel_cache.program(source_ss.str(), policy.headers,
-                                         policy.options, policy.file_callback);
-
-  std::vector<void*> arg_ptrs;
-  arg_ptrs.push_back(&begin);
-  arg_ptrs.push_back(&end);
-  arg_ptrs.insert(arg_ptrs.end(), lambda._capture._arg_ptrs.begin(),
-                  lambda._capture._arg_ptrs.end());
-
-  size_t n = end - begin;
-  dim3 block(policy.block_size);
-  dim3 grid((unsigned int)std::min((n - 1) / block.x + 1, size_t(65535)));
-  cudaSetDevice(policy.device);
-  return program.kernel("parallel_for_kernel")
-      .instantiate<IndexType>()
-      .configure(grid, block, 0, policy.stream)
-      .launch(arg_ptrs);
-}
-
-namespace experimental {
-
-using jitify::file_callback_type;
-
-namespace serialization {
-
-namespace detail {
-
-// This should be incremented whenever the serialization format changes in any
-// incompatible way.
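#if 0  // For reference only: end-to-end use of parallel_for/JITIFY_LAMBDA as
       // defined above. Assumes `data` is a valid device allocation of 100
       // floats and that a CUDA context exists (illustrative host code).
void scale_on_device(float* data) {
  float scale = 2.0f;
  jitify::ExecutionPolicy policy;  // defaults: DEVICE, block_size = 256
  jitify::parallel_for(policy, 0, 100,
                       JITIFY_LAMBDA((data, scale), data[i] *= scale));
}
#endif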
-static constexpr const size_t kSerializationVersion = 1;
-
-inline void serialize(std::ostream& stream, size_t u) {
-  uint64_t u64 = u;
-  stream.write(reinterpret_cast<char const*>(&u64), sizeof(u64));
-}
-
-inline bool deserialize(std::istream& stream, size_t* size) {
-  uint64_t u64;
-  stream.read(reinterpret_cast<char*>(&u64), sizeof(u64));
-  *size = u64;
-  return stream.good();
-}
-
-inline void serialize(std::ostream& stream, std::string const& s) {
-  serialize(stream, s.size());
-  stream.write(s.data(), s.size());
-}
-
-inline bool deserialize(std::istream& stream, std::string* s) {
-  size_t size;
-  if (!deserialize(stream, &size)) return false;
-  s->resize(size);
-  if (s->size()) {
-    stream.read(&(*s)[0], s->size());
-  }
-  return stream.good();
-}
-
-inline void serialize(std::ostream& stream, std::vector<std::string> const& v) {
-  serialize(stream, v.size());
-  for (auto const& s : v) {
-    serialize(stream, s);
-  }
-}
-
-inline bool deserialize(std::istream& stream, std::vector<std::string>* v) {
-  size_t size;
-  if (!deserialize(stream, &size)) return false;
-  v->resize(size);
-  for (auto& s : *v) {
-    if (!deserialize(stream, &s)) return false;
-  }
-  return true;
-}
-
-inline void serialize(std::ostream& stream,
-                      std::map<std::string, std::string> const& m) {
-  serialize(stream, m.size());
-  for (auto const& kv : m) {
-    serialize(stream, kv.first);
-    serialize(stream, kv.second);
-  }
-}
-
-inline bool deserialize(std::istream& stream,
-                        std::map<std::string, std::string>* m) {
-  size_t size;
-  if (!deserialize(stream, &size)) return false;
-  for (size_t i = 0; i < size; ++i) {
-    std::string key;
-    if (!deserialize(stream, &key)) return false;
-    if (!deserialize(stream, &(*m)[key])) return false;
-  }
-  return true;
-}
-
-template <typename T, typename... Rest>
-inline void serialize(std::ostream& stream, T const& value, Rest... rest) {
-  serialize(stream, value);
-  serialize(stream, rest...);
-}
-
-template <typename T, typename... Rest>
-inline bool deserialize(std::istream& stream, T* value, Rest... rest) {
-  if (!deserialize(stream, value)) return false;
-  return deserialize(stream, rest...);
-}
-
-inline void serialize_magic_number(std::ostream& stream) {
-  stream.write("JTFY", 4);
-  serialize(stream, kSerializationVersion);
-}
-
-inline bool deserialize_magic_number(std::istream& stream) {
-  char magic_number[4] = {0, 0, 0, 0};
-  stream.read(&magic_number[0], 4);
-  if (!(magic_number[0] == 'J' && magic_number[1] == 'T' &&
-        magic_number[2] == 'F' && magic_number[3] == 'Y')) {
-    return false;
-  }
-  size_t serialization_version;
-  if (!deserialize(stream, &serialization_version)) return false;
-  return serialization_version == kSerializationVersion;
-}
-
-}  // namespace detail
-
-template <typename... Values>
-inline std::string serialize(Values const&... values) {
-  std::ostringstream ss(std::stringstream::out | std::stringstream::binary);
-  detail::serialize_magic_number(ss);
-  detail::serialize(ss, values...);
-  return ss.str();
-}
-
-template <typename... Values>
-inline bool deserialize(std::string const& serialized, Values*... values) {
-  std::istringstream ss(serialized,
-                        std::stringstream::in | std::stringstream::binary);
-  if (!detail::deserialize_magic_number(ss)) return false;
-  return detail::deserialize(ss, values...);
-}
-
-}  // namespace serialization
-
-class Program;
-class Kernel;
-class KernelInstantiation;
-class KernelLauncher;
-
-/*! An object representing a program made up of source code, headers
- *  and options.
- */
-class Program {
- private:
-  friend class KernelInstantiation;
-  std::string _name;
-  std::vector<std::string> _options;
-  std::map<std::string, std::string> _sources;
-
-  // Private constructor used by deserialize()
-  Program() {}
-
- public:
-  /*! Create a program.
-   *
-   *  \param source A string containing either the source filename or
-   *    the source itself; in the latter case, the first line must be
-   *    the name of the program.
-   *  \param headers A vector of strings representing the source of
-   *    each header file required by the program. Each entry can be
-   *    either the header filename or the header source itself; in
-   *    the latter case, the first line must be the name of the header
-   *    (i.e., the name by which the header is #included).
-   *  \param options A vector of options to be passed to the
-   *    NVRTC compiler. Include paths specified with \p -I
-   *    are added to the search paths used by Jitify. The environment
-   *    variable JITIFY_OPTIONS can also be used to define additional
-   *    options.
-   *  \param file_callback A pointer to a callback function that is
-   *    invoked whenever a source file needs to be loaded. Inside this
-   *    function, the user can either load/specify the source themselves
-   *    or defer to Jitify's file-loading mechanisms.
-   *  \note Program or header source files referenced by filename are
-   *  looked-up using the following mechanisms (in this order):
-   *  \note 1) By calling file_callback.
-   *  \note 2) By looking for the file embedded in the executable via the GCC
-   *  linker.
-   *  \note 3) By looking for the file in the filesystem.
-   *
-   *  \note Jitify recursively scans all source files for \p #include
-   *  directives and automatically adds them to the set of headers needed
-   *  by the program.
-   *  If a \p #include directive references a header that cannot be found,
-   *  the directive is automatically removed from the source code to prevent
-   *  immediate compilation failure. This may result in compilation errors
-   *  if the header was required by the program.
-   *
-   *  \note Jitify automatically includes NVRTC-safe versions of some
-   *  standard library headers.
-   */
-  Program(std::string const& cuda_source,
-          std::vector<std::string> const& given_headers = {},
-          std::vector<std::string> const& given_options = {},
-          file_callback_type file_callback = nullptr) {
-    // Add pre-include built-in JIT-safe headers
-    std::vector<std::string> headers = given_headers;
-    for (int i = 0; i < detail::preinclude_jitsafe_headers_count; ++i) {
-      const char* hdr_name = detail::preinclude_jitsafe_header_names[i];
-      const std::string& hdr_source =
-          detail::get_jitsafe_headers_map().at(hdr_name);
-      headers.push_back(std::string(hdr_name) + "\n" + hdr_source);
-    }
-
-    _options = given_options;
-    detail::add_options_from_env(_options);
-    std::vector<std::string> include_paths;
-    detail::load_program(cuda_source, headers, file_callback, &include_paths,
-                         &_sources, &_options, &_name);
-  }
-
-  /*! Restore a serialized program.
-   *
-   *  \param serialized_program The serialized program to restore.
-   *
-   *  \see serialize
-   */
-  static Program deserialize(std::string const& serialized_program) {
-    Program program;
-    if (!serialization::deserialize(serialized_program, &program._name,
-                                    &program._options, &program._sources)) {
-      throw std::runtime_error("Failed to deserialize program");
-    }
-    return program;
-  }
-
-  /*! Save the program.
-   *
-   *  \see deserialize
-   */
-  std::string serialize() const {
-    // Note: Must update kSerializationVersion if this is changed.
-    return serialization::serialize(_name, _options, _sources);
-  };
-
-  /*! Select a kernel.
-   *
-   * \param name The name of the kernel (unmangled and without
-   *   template arguments).
-   * \param options A vector of options to be passed to the NVRTC
-   *   compiler when compiling this kernel.
-   */
-  Kernel kernel(std::string const& name,
-                std::vector<std::string> const& options = {}) const;
-};
-
-class Kernel {
-  friend class KernelInstantiation;
-  Program const* _program;
-  std::string _name;
-  std::vector<std::string> _options;
-
- public:
-  Kernel(Program const* program, std::string const& name,
-         std::vector<std::string> const& options = {})
-      : _program(program), _name(name), _options(options) {}
-
-  /*! Instantiate the kernel.
-   *
-   *  \param template_args A vector of template arguments represented as
-   *    code-strings. These can be generated using
-   *    \code{.cpp}jitify::reflection::reflect<type>()\endcode or
-   *    \code{.cpp}jitify::reflection::reflect(value)\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  KernelInstantiation instantiate(
-      std::vector<std::string> const& template_args =
-          std::vector<std::string>()) const;
-
-  // Regular template instantiation syntax (note limited flexibility)
-  /*! Instantiate the kernel.
-   *
-   *  \note The template arguments specified on this function are
-   *    used to instantiate the kernel. Non-type template arguments must
-   *    be wrapped with
-   *    \code{.cpp}jitify::reflection::NonType<type,value>\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  template <typename... TemplateArgs>
-  KernelInstantiation instantiate() const;
-
-  // Template-like instantiation syntax
-  //   E.g., instantiate(myvar,Type<MyType>())(grid,block)
-  /*! Instantiate the kernel.
-   *
-   *  \param targs The template arguments for the kernel, represented as
-   *    values. Types must be wrapped with
-   *    \code{.cpp}jitify::reflection::Type<type>()\endcode or
-   *    \code{.cpp}jitify::reflection::type_of(value)\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  template <typename... TemplateArgs>
-  KernelInstantiation instantiate(TemplateArgs... targs) const;
-};
-
-class KernelInstantiation {
-  friend class KernelLauncher;
-  std::unique_ptr<detail::CUDAKernel> _cuda_kernel;
-
-  // Private constructor used by deserialize()
-  KernelInstantiation(std::string const& func_name, std::string const& ptx,
-                      std::vector<std::string> const& link_files,
-                      std::vector<std::string> const& link_paths)
-      : _cuda_kernel(new detail::CUDAKernel(func_name.c_str(), ptx.c_str(),
-                                            link_files, link_paths)) {}
-
- public:
-  KernelInstantiation(Kernel const& kernel,
-                      std::vector<std::string> const& template_args) {
-    Program const* program = kernel._program;
-
-    std::string template_inst =
-        (template_args.empty() ? ""
-                               : reflection::reflect_template(template_args));
-    std::string instantiation = kernel._name + template_inst;
-
-    std::vector<std::string> options;
-    options.insert(options.begin(), program->_options.begin(),
-                   program->_options.end());
-    options.insert(options.begin(), kernel._options.begin(),
-                   kernel._options.end());
-    detail::detect_and_add_cuda_arch(options);
-    detail::detect_and_add_cxx11_flag(options);
-
-    std::string log, ptx, mangled_instantiation;
-    std::vector<std::string> linker_files, linker_paths;
-
-    std::cout << "About to instantiate kernel" << std::endl;
-    detail::instantiate_kernel(program->_name, program->_sources, instantiation,
-                               options, &log, &ptx, &mangled_instantiation,
-                               &linker_files, &linker_paths);
-
-    std::cout << "instantiated kernel" << std::endl;
-    _cuda_kernel.reset(new detail::CUDAKernel(mangled_instantiation.c_str(),
-                                              ptx.c_str(), linker_files,
-                                              linker_paths));
-  }
-
-  /*! Implicit conversion to the underlying CUfunction object.
-   *
-   * \note This allows use of CUDA APIs like
-   *   cuOccupancyMaxActiveBlocksPerMultiprocessor.
-   */
-  operator CUfunction() const { return *_cuda_kernel; }
-
-  /*! Restore a serialized kernel instantiation.
-   *
-   *  \param serialized_kernel_inst The serialized kernel instantiation to
-   *    restore.
-   *
-   *  \see serialize
-   */
-  static KernelInstantiation deserialize(
-      std::string const& serialized_kernel_inst) {
-    std::string func_name, ptx;
-    std::vector<std::string> link_files, link_paths;
-    if (!serialization::deserialize(serialized_kernel_inst, &func_name, &ptx,
-                                    &link_files, &link_paths)) {
-      throw std::runtime_error("Failed to deserialize kernel instantiation");
-    }
-    return KernelInstantiation(func_name, ptx, link_files, link_paths);
-  }
-
-  /*! Save the program.
-   *
-   *  \see deserialize
-   */
-  std::string serialize() const {
-    // Note: Must update kSerializationVersion if this is changed.
-
-    std::cout << "Inside serialize!!!!" << std::endl;
-    return serialization::serialize(
-        _cuda_kernel->function_name(), _cuda_kernel->ptx(),
-        _cuda_kernel->link_files(), _cuda_kernel->link_paths());
-  }
-
-  /*! Configure the kernel launch.
-   *
-   *  \param grid   The thread grid dimensions for the launch.
-   *  \param block  The thread block dimensions for the launch.
-   *  \param smem   The amount of shared memory to dynamically allocate, in
-   *    bytes.
-   *  \param stream The CUDA stream to launch the kernel in.
-   */
-  KernelLauncher configure(dim3 grid, dim3 block, unsigned int smem = 0,
-                           cudaStream_t stream = 0) const;
-
-  /*! Configure the kernel launch with a 1-dimensional block and grid chosen
-   *  automatically to maximise occupancy.
-   *
-   *  \param max_block_size  The upper limit on the block size, or 0 for no
-   *    limit.
-   *  \param smem  The amount of shared memory to dynamically allocate, in bytes.
-   *  \param smem_callback  A function returning smem for a given block size
-   *    (overrides \p smem).
-   *  \param stream The CUDA stream to launch the kernel in.
-   *  \param flags  The flags to pass to
-   *    cuOccupancyMaxPotentialBlockSizeWithFlags.
-   */
-  KernelLauncher configure_1d_max_occupancy(
-      int max_block_size = 0, unsigned int smem = 0,
-      CUoccupancyB2DSize smem_callback = 0, cudaStream_t stream = 0,
-      unsigned int flags = 0) const;
-
-  /*
-   * \deprecated Use \p get_global_ptr instead.
-   */
-  CUdeviceptr get_constant_ptr(const char* name, size_t* size = nullptr) const {
-    return get_global_ptr(name, size);
-  }
-
-  /*
-   * Get a device pointer to a global __constant__ or __device__ variable using
-   * its un-mangled name. If provided, *size is set to the size of the variable
-   * in bytes.
-   */
-  CUdeviceptr get_global_ptr(const char* name, size_t* size = nullptr) const {
-    return _cuda_kernel->get_global_ptr(name, size);
-  }
-
-  /*
-   * Copy data from a global __constant__ or __device__ array to the host using
-   * its un-mangled name.
-   */
-  template <typename T>
-  CUresult get_global_array(const char* name, T* data, size_t count,
-                            CUstream stream = 0) const {
-    return _cuda_kernel->get_global_data(name, data, count, stream);
-  }
-
-  /*
-   * Copy a value from a global __constant__ or __device__ variable to the host
-   * using its un-mangled name.
-   */
-  template <typename T>
-  CUresult get_global_value(const char* name, T* value,
-                            CUstream stream = 0) const {
-    return get_global_array(name, value, 1, stream);
-  }
-
-  /*
-   * Copy data from the host to a global __constant__ or __device__ array using
-   * its un-mangled name.
-   */
-  template <typename T>
-  CUresult set_global_array(const char* name, const T* data, size_t count,
-                            CUstream stream = 0) const {
-    return _cuda_kernel->set_global_data(name, data, count, stream);
-  }
-
-  /*
-   * Copy a value from the host to a global __constant__ or __device__ variable
-   * using its un-mangled name.
-   */
-  template <typename T>
-  CUresult set_global_value(const char* name, const T& value,
-                            CUstream stream = 0) const {
-    return set_global_array(name, &value, 1, stream);
-  }
-
-  const std::string& mangled_name() const {
-    return _cuda_kernel->function_name();
-  }
-
-  const std::string& ptx() const { return _cuda_kernel->ptx(); }
-
-  const std::vector<std::string>& link_files() const {
-    return _cuda_kernel->link_files();
-  }
-
-  const std::vector<std::string>& link_paths() const {
-    return _cuda_kernel->link_paths();
-  }
-};
-
-class KernelLauncher {
-  KernelInstantiation const* _kernel_inst;
-  dim3 _grid;
-  dim3 _block;
-  unsigned int _smem;
-  cudaStream_t _stream;
-
- public:
-  KernelLauncher(KernelInstantiation const* kernel_inst, dim3 grid, dim3 block,
-                 unsigned int smem = 0, cudaStream_t stream = 0)
-      : _kernel_inst(kernel_inst),
-        _grid(grid),
-        _block(block),
-        _smem(smem),
-        _stream(stream) {}
-
-  // Note: It's important that there is no implicit conversion required
-  //       for arg_ptrs, because otherwise the parameter pack version
-  //       below gets called instead (probably resulting in a segfault).
-  /*! Launch the kernel.
-   *
-   *  \param arg_ptrs  A vector of pointers to each function argument for the
-   *    kernel.
-   *  \param arg_types  A vector of function argument types represented
-   *    as code-strings. This parameter is optional and is only used to print
-   *    out the function signature.
-   */
-  CUresult launch(std::vector<void*> arg_ptrs = {},
-                  std::vector<std::string> arg_types = {}) const {
-#if JITIFY_PRINT_LAUNCH
-    std::string arg_types_string =
-        (arg_types.empty() ? "..." : reflection::reflect_list(arg_types));
-    std::cout << "Launching " << _kernel_inst->_cuda_kernel->function_name()
-              << "<<<" << _grid << "," << _block << "," << _smem << ","
-              << _stream << ">>>"
-              << "(" << arg_types_string << ")" << std::endl;
-#endif
-
-    return _kernel_inst->_cuda_kernel->launch(_grid, _block, _smem, _stream,
-                                              arg_ptrs);
-  }
-
-  /*! Launch the kernel.
-   *
-   *  \param args Function arguments for the kernel.
-   */
-  template <typename... ArgTypes>
-  CUresult launch(ArgTypes... args) const {
-    return this->launch(std::vector<void*>({(void*)&args...}),
-                        {reflection::reflect<ArgTypes>()...});
-  }
-};
-
-inline Kernel Program::kernel(std::string const& name,
-                              std::vector<std::string> const& options) const {
-  return Kernel(this, name, options);
-}
-
-inline KernelInstantiation Kernel::instantiate(
-    std::vector<std::string> const& template_args) const {
-  return KernelInstantiation(*this, template_args);
-}
-
-template <typename... TemplateArgs>
-inline KernelInstantiation Kernel::instantiate() const {
-  return this->instantiate(
-      std::vector<std::string>({reflection::reflect<TemplateArgs>()...}));
-}
-
-template <typename... TemplateArgs>
-inline KernelInstantiation Kernel::instantiate(TemplateArgs... targs) const {
targs) const { - return this->instantiate( - std::vector({reflection::reflect(targs)...})); -} - -inline KernelLauncher KernelInstantiation::configure( - dim3 grid, dim3 block, unsigned int smem, cudaStream_t stream) const { - return KernelLauncher(this, grid, block, smem, stream); -} - -inline KernelLauncher KernelInstantiation::configure_1d_max_occupancy( - int max_block_size, unsigned int smem, CUoccupancyB2DSize smem_callback, - cudaStream_t stream, unsigned int flags) const { - int grid; - int block; - CUfunction func = *_cuda_kernel; - detail::get_1d_max_occupancy(func, smem_callback, &smem, max_block_size, - flags, &grid, &block); - return this->configure(grid, block, smem, stream); -} - -} // namespace experimental - -} // namespace jitify - -#if defined(_WIN32) || defined(_WIN64) -#pragma pop_macro("max") -#pragma pop_macro("min") -#pragma pop_macro("strtok_r") -#endif diff --git a/GraphBLAS/CUDA/test/.gitignore b/GraphBLAS/CUDA/test/.gitignore deleted file mode 100644 index ab8dd2b30f..0000000000 --- a/GraphBLAS/CUDA/test/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -# Ignore these files: -graphblascuda_test - -# Do not ignore this file -!.gitignore - diff --git a/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp b/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp deleted file mode 100644 index ca7c5de350..0000000000 --- a/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// Test AxB_dot3_cuda kernels -// Using data generators and test classes, cover -// all NBUCKETS cases for the masked GEMM ( C, M, A, B) in GraphBLAS -// Tests Semirings, data types and a range of data input sizes and shapes -// Connects to the jitFactory for launches. - -#include -#include -#include -#include -#include - -//Test instances and groupings - diff --git a/GraphBLAS/CUDA/test/GB_cuda_type_wrap.hpp b/GraphBLAS/CUDA/test/GB_cuda_type_wrap.hpp deleted file mode 100644 index 3acf26553f..0000000000 --- a/GraphBLAS/CUDA/test/GB_cuda_type_wrap.hpp +++ /dev/null @@ -1,246 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/GB_cuda_type_wrap.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -/* - * Copyright (c) 2019,2020 NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once -#ifndef GB_CONV_TYPE_H -#define GB_CONV_TYPE_H - -extern "C" { -#include "GB.h" -}; -#include -#include -#include -#include -#include -#include - -/**---------------------------------------------------------------------------* - * @file type_convert.hpp - * @brief Defines the mapping between concrete C++ types and Grb types. - *---------------------------------------------------------------------------**/ -namespace cuda::jit { - -template -GrB_Type to_grb_type(); - -template<> inline GrB_Type to_grb_type() { return GrB_INT8; } -template<> inline GrB_Type to_grb_type() { return GrB_INT16; } -template<> inline GrB_Type to_grb_type() { return GrB_INT32; } -template<> inline GrB_Type to_grb_type() { return GrB_INT64; } -template<> inline GrB_Type to_grb_type() { return GrB_UINT8; } -template<> inline GrB_Type to_grb_type() { return GrB_UINT16; } -template<> inline GrB_Type to_grb_type() { return GrB_UINT32; } -template<> inline GrB_Type to_grb_type() { return GrB_UINT64; } -template<> inline GrB_Type to_grb_type() { return GrB_FP32; } -template<> inline GrB_Type to_grb_type() { return GrB_FP64; } -template<> inline GrB_Type to_grb_type() { return GrB_BOOL; } - - -template -void set_element(GrB_Matrix A, T x, int64_t i, int64_t j); - -template<> inline void set_element(GrB_Matrix A, int8_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_INT8(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, int16_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_INT16(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, int32_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_INT32(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, int64_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_INT64(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, uint8_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_UINT8(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, uint16_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_UINT16(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, uint32_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_UINT32(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, uint64_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_UINT64(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, float x, int64_t i, int64_t j) { GrB_Matrix_setElement_FP32(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, double x, int64_t i, int64_t j) { GrB_Matrix_setElement_FP64(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, bool x, int64_t i, int64_t j) { GrB_Matrix_setElement_BOOL(A, x, i, j); } - - -template -void vector_set_element(GrB_Vector A, T x, int64_t i); - -template<> inline void vector_set_element(GrB_Vector A, int8_t x, int64_t i) { GrB_Vector_setElement_INT8(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, int16_t x, int64_t i) { GrB_Vector_setElement_INT16(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, int32_t x, int64_t i) { GrB_Vector_setElement_INT32(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, int64_t x, int64_t i) { GrB_Vector_setElement_INT64(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, uint8_t x, int64_t i) { GrB_Vector_setElement_UINT8(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, uint16_t x, int64_t i) { GrB_Vector_setElement_UINT16(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, uint32_t x, int64_t i) { 
GrB_Vector_setElement_UINT32(A, x, i); }
-template<> inline void vector_set_element(GrB_Vector A, uint64_t x, int64_t i) { GrB_Vector_setElement_UINT64(A, x, i); }
-template<> inline void vector_set_element(GrB_Vector A, float x, int64_t i) { GrB_Vector_setElement_FP32(A, x, i); }
-template<> inline void vector_set_element(GrB_Vector A, double x, int64_t i) { GrB_Vector_setElement_FP64(A, x, i); }
-template<> inline void vector_set_element(GrB_Vector A, bool x, int64_t i) { GrB_Vector_setElement_BOOL(A, x, i); }
-
-
- template<typename T>
- void scalar_set_element(GrB_Scalar A, T x);
-
- template<> inline void scalar_set_element(GrB_Scalar A, int8_t x) { GrB_Scalar_setElement_INT8(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, int16_t x) { GrB_Scalar_setElement_INT16(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, int32_t x) { GrB_Scalar_setElement_INT32(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, int64_t x) { GrB_Scalar_setElement_INT64(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, uint8_t x) { GrB_Scalar_setElement_UINT8(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, uint16_t x) { GrB_Scalar_setElement_UINT16(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, uint32_t x) { GrB_Scalar_setElement_UINT32(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, uint64_t x) { GrB_Scalar_setElement_UINT64(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, float x) { GrB_Scalar_setElement_FP32(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, double x) { GrB_Scalar_setElement_FP64(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, bool x) { GrB_Scalar_setElement_BOOL(A, x); }
-
-
-template<typename T>
-GrB_Info vector_reduce(T *scalar, GrB_Vector A, GrB_Monoid op);
-
-template<> inline GrB_Info vector_reduce(int8_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_INT8(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(int16_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_INT16(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(int32_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_INT32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(int64_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_INT64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(uint8_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_UINT8(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(uint16_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_UINT16(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(uint32_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_UINT32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(uint64_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_UINT64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(float *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_FP32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(double *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_FP64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(bool *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_BOOL(scalar, NULL, op, A, NULL); }
-
-/**
- * GxB_Matrix_reduce_FC32 // c = accum (c, reduce_to_scalar (A))
- (
- GxB_FC32_t *c, // result scalar
- const GrB_BinaryOp accum, // optional accum for c=accum(c,t)
- const GrB_Monoid monoid, // monoid to do the reduction
- const GrB_Matrix A, // matrix to reduce
- const GrB_Descriptor desc
-
- * @tparam T
- * @param scalar
- * @param A
- * @param op
- * @return
- */
-
-template<typename T>
-GrB_Info matrix_reduce(T *scalar, GrB_Matrix A, GrB_Monoid op);
-
-template<> inline GrB_Info matrix_reduce(int8_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT8(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(int16_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT16(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(int32_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(int64_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(uint8_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT8(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(uint16_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT16(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(uint32_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(uint64_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(float *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_FP32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(double *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_FP64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(bool *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_BOOL(scalar, NULL, op, A, NULL); }
-
-
-template<typename T>
-GrB_Info get_element(GrB_Matrix A, T* x, int64_t i, int64_t j);
-template<> inline GrB_Info get_element(GrB_Matrix A, int8_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_INT8(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, int16_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_INT16(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, int32_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_INT32(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, int64_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_INT64(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, uint8_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_UINT8(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, uint16_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_UINT16(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, uint32_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_UINT32(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, uint64_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_UINT64(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, float *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_FP32(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, double *x, int64_t i, int64_t j) { return
GrB_Matrix_extractElement_FP64(x, A, i, j); } -template<> inline GrB_Info get_element(GrB_Matrix A, bool *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_BOOL(x, A, i, j); } - - - - - -template -class type_name { -public: - static const char *name; -}; - -#define DECLARE_TYPE_NAME(x) template<> inline const char *type_name::name = #x; -#define GET_TYPE_NAME(x) (type_name::name) - - DECLARE_TYPE_NAME(int); - DECLARE_TYPE_NAME(int&); - DECLARE_TYPE_NAME(int*); - DECLARE_TYPE_NAME(int8_t); - DECLARE_TYPE_NAME(int8_t&); - DECLARE_TYPE_NAME(int8_t*); - DECLARE_TYPE_NAME(unsigned char); - DECLARE_TYPE_NAME(unsigned char&); - DECLARE_TYPE_NAME(unsigned char*); -// DECLARE_TYPE_NAME(unsigned int); -// DECLARE_TYPE_NAME(unsigned int&); -// DECLARE_TYPE_NAME(unsigned int*); -// DECLARE_TYPE_NAME(unsigned int32_t); -// DECLARE_TYPE_NAME(unsigned int32_t&); -// DECLARE_TYPE_NAME(unsigned int32_t*); - DECLARE_TYPE_NAME(unsigned int64_t); - DECLARE_TYPE_NAME(unsigned int64_t&); - DECLARE_TYPE_NAME(unsigned int64_t*); - DECLARE_TYPE_NAME(long); - DECLARE_TYPE_NAME(long&); - DECLARE_TYPE_NAME(long*); - DECLARE_TYPE_NAME(float); - DECLARE_TYPE_NAME(float&); - DECLARE_TYPE_NAME(float*); - DECLARE_TYPE_NAME(double); - DECLARE_TYPE_NAME(double&); - DECLARE_TYPE_NAME(double*); - DECLARE_TYPE_NAME(bool); - - - - inline const std::string grb_str_type(GB_Type_code grb_type_code) { - switch(grb_type_code) { - case GB_BOOL_code: - return "bool"; - case GB_INT8_code: - return "int8_t"; - case GB_UINT8_code: - return "uint8_t"; - case GB_INT16_code: - return "int16_t"; - case GB_UINT16_code: - return "uint16_t"; - case GB_INT32_code: - return "int32_t"; - case GB_UINT32_code: - return "uint32_t"; - case GB_INT64_code: - return "int64_t"; - case GB_UINT64_code: - return "uint64_t"; - case GB_FP32_code: - return "float"; - case GB_FP64_code: - return "double"; - default: - printf("Error: GrB_Type not supported.\n"); - exit(1); - } - } - - -} // namespace cuda::jit -#endif diff --git a/GraphBLAS/CUDA/test/GpuTimer.h b/GraphBLAS/CUDA/test/GpuTimer.h deleted file mode 100644 index ed5b52520d..0000000000 --- a/GraphBLAS/CUDA/test/GpuTimer.h +++ /dev/null @@ -1,49 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/GpuTimer.h -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#ifndef __GPU_TIMER_H__ -#define __GPU_TIMER_H__ - -#include -struct GpuTimer -{ - cudaEvent_t start; - cudaEvent_t stop; - - GpuTimer() - { - cudaEventCreate(&start); - cudaEventCreate(&stop); - } - - ~GpuTimer() - { - cudaEventDestroy(start); - cudaEventDestroy(stop); - } - - void Start() - { - cudaEventRecord(start, 0); - } - - void Stop() - { - cudaEventRecord(stop, 0); - } - - float Elapsed() - { - float elapsed; - cudaEventSynchronize(stop); - cudaEventElapsedTime(&elapsed, start, stop); - return elapsed; - } -}; - -#endif /* __GPU_TIMER_H__ */ diff --git a/GraphBLAS/CUDA/test/Makefile b/GraphBLAS/CUDA/test/Makefile deleted file mode 100644 index 412ac47294..0000000000 --- a/GraphBLAS/CUDA/test/Makefile +++ /dev/null @@ -1,133 +0,0 @@ -#------------------------------------------------------------------------------- -# GraphBLAS/CUDA/test/Makefile -#------------------------------------------------------------------------------- - -# cuda 10.1+ is assumed - -all: cudaTest - - -LIBS = -L/usr/local/cuda/lib64 
-L/usr/local/cuda/lib64/stubs -lpthreads -lcudadevrt -lcudart -lnvrtc -INC += -I$(CUDA_DIR)/include -I../ -I../../Source -I../../Include -I../../Source/Template -I$(TEMPLATE_DIR) -Igoogletest/include - -CUDA_OPTS = -O2 --cudart=shared --gpu-architecture=compute_75\ - --relocatable-device-code true --device-c\ - --std=c++17 -Xcompiler -fPIC - -%.o: %.cu - nvcc -c $(I) $(CUDA_OPTS) $(INC) -o $@ $< - -config: - nvidia-smi - nvcc --version - @echo " " - @echo "SO_NAME: " $(SO_NAME) - @echo "SO_OPTS: " $(SO_OPTS) - @echo "LIBS: " $(LIBS) - @echo "CUDA_OPTS: " $(CUDA_OPTS) - @echo "SRC: " $(SRC) - @echo "OBJ: " $(OBJ) - @echo "I: " $(I) - @echo " " - gcc --version - icc --version - -clean: - rm -f *.o - rm -f stringify - rm -f cudaTest - rm -f testJit -.PHONY: clean - -distclean: clean - rm -f *.so *.a - -purge: distclean - -################################################################################ - -GXX ?= g++ -GCC ?= gcc -DOXYGEN ?= doxygen -CXXFLAGS ?= -O3 -Wall -g -fmessage-length=80 -CFLAGS ?= -O2 -g -std=c11 - -CXX11 ?= 1 - -CUDA_DIR ?= /usr/local/cuda - -CXXFLAGS += -pthread - -ifeq ($(CXX11),1) - CXXFLAGS += -std=c++17 -endif - -EMBED_BEGIN = -rdynamic -Wl,-b,binary, -EMBED_END = ,-b,default - -UNAME_S := $(shell uname -s) -ifeq ($(UNAME_S),Linux) - CXXFLAGS += -D LINUX - CUDA_LIB_DIR = $(CUDA_DIR)/lib64 -else ifeq ($(UNAME_S),Darwin) - CUDA_LIB_DIR = $(CUDA_DIR)/lib -endif - -TEMPLATE_DIR ?= ../templates - -LIB += -ldl -L$(CUDA_LIB_DIR) -L$(CUDA_LIB_DIR)/stubs -lcuda -lcudadevrt -lcudart -lnvrtc - -# FIXME: file names in HEADERS are old -HEADERS = jitify.hpp dataFactory.hpp jitFactory.hpp jitTestFactory.hpp semiringFactory.hpp \ - ../type_name.hpp - -TEMPLATES := $(wildcard $(TEMPLATE_DIR)/*.cu) - -CU_OBJS := ../GB_cuda_jitify_cache.o ../GB_cuda_jitify_launcher.o - -CFILES := $(wildcard ../*.c) - -COBJS := $(patsubst %.c, %.o, $(CFILES) ) - -JIT_TEMP := $(patsubst %.cu, %.cu.jit, $(TEMPLATES)) - -GTEST_LIB := googletest/build/lib/libgtest.a googletest/build/lib/libgtest_main.a - -%.cu.jit: %.cu - ../stringify $? > $@ - -stringify: stringify.cpp - $(GXX) -o $@ $< -O3 -Wall - -%.o: %.c - $(GXX) -c -o $@ $< $(CFLAGS) $(INC) - -%.o: %.cpp - $(GXX) -c -o $@ $< $(CXXFLAGS) $(INC) - -cu_link.o: $(CU_OBJS) - nvcc --gpu-architecture=compute_75 --device-link $(CU_OBJS) --output-file cu_link.o - - -testJit: ../tofix/testJit.cpp $(OBJS) $(HEADERS) $(JIT_TEMP) - $(GXX) -o $@ $< $(CXXFLAGS) $(INC) $(OBJS) $(LIB) - -AxB_dot3_test_instances.hpp: testGen.py - python3 testGen.py - - -instances := AxB_dot3_test_instances.hpp - - -cudaTest: cudaTest.cpp.bak $(COBJS) $(OBJS) $(HEADERS) $(JIT_TEMP) cu_link.o AxB_dot3_cuda_tests.hpp.bak $(instances) - $(GXX) -o $@ $< $(CXXFLAGS) $(INC) $(COBJS) $(CU_OBJS) cu_link.o $(LIB) $(GTEST_LIB) - -%.cu: %.cutmp - cp $? 
$@ - - -doc: jitify.hpp Doxyfile - $(DOXYGEN) Doxyfile -.PHONY: doc - - diff --git a/GraphBLAS/CUDA/test/cuda_tests_template.cpp b/GraphBLAS/CUDA/test/cuda_tests_template.cpp deleted file mode 100644 index 340ded031b..0000000000 --- a/GraphBLAS/CUDA/test/cuda_tests_template.cpp +++ /dev/null @@ -1,25 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/cuda_tests_template.cpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// Test AxB_dot3_cuda kernels -// Using data generators and test classes, cover -// all NBUCKETS cases for the masked GEMM ( C, M, A, B) in GraphBLAS -// Tests Semirings, data types and a range of data input sizes and shapes -// Connects to the jitFactory for launches. - -#include -#include -#include -#include -#include -#include "problem_spec.hpp" -#include "jitTestFactory.hpp" -#include "../GB_cuda_buckets.h" - -//Test instances and groupings - diff --git a/GraphBLAS/CUDA/test/dataFactory.hpp b/GraphBLAS/CUDA/test/dataFactory.hpp deleted file mode 100644 index 5c6263c918..0000000000 --- a/GraphBLAS/CUDA/test/dataFactory.hpp +++ /dev/null @@ -1,382 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/dataFactory.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#pragma once - -#include -#include -#include -#include - -#include "GB.h" -#include "GB_cuda_type_wrap.hpp" -#include "test_utility.hpp" -#include "GB_cuda_error.h" - -// CAUTION: This assumes our indices are small enough to fit into a 32-bit int. 
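-// [Editorial note] gen_key packs j into the low 32 bits of the key, but the
-// unpacking code later in this file recovers j with `k & 0x0000ffff`, a
-// 16-bit mask, so column indices >= 2^16 would be silently corrupted. A
-// matching pack/unpack pair would look like this sketch (names hypothetical):
-//
-//   inline std::int64_t pack_key(std::int64_t i, std::int64_t j) {
-//       return (i << 32) | (j & 0xffffffffLL);  // i in high 32 bits, j in low 32
-//   }
-//   inline std::int64_t key_row(std::int64_t k) { return k >> 32; }          // recover i
-//   inline std::int64_t key_col(std::int64_t k) { return k & 0xffffffffLL; } // recover j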
-inline std::int64_t gen_key(std::int64_t i, std::int64_t j) { - return (std::int64_t) i << 32 | (std::int64_t) j; -} - -//Vector generators -template -void fillvector_linear( int N, T *vec, int start=0) { - for (int i = start; i< N+start; ++i) vec[i] = T(i); -} -template -void fillvector_constant( int N, T *vec, T val) { - for (int i = 0; i< N; ++i) vec[i] = val; -} - -// Mix-in class to enable unified memory -class Managed { -public: - void *operator new(size_t len) { - void *ptr = nullptr; - //std::cout<<"in new operator, alloc for "< -class matrix : public Managed { - int64_t nrows_; - int64_t ncols_; - - public: - GrB_Matrix mat; - - matrix(int64_t nrows, int64_t ncols): nrows_(nrows), ncols_(ncols) {} - - GrB_Matrix get_grb_matrix() { - return mat; - } - - ~matrix() { - if(mat != NULL) { - GrB_Matrix_free(&mat); - mat = NULL; - } - } - - uint64_t get_zombie_count() { return mat->nzombies;} - - void clear() { - GRB_TRY (GrB_Matrix_clear (mat)) ; - } - - void alloc() { - GrB_Type type = cuda::jit::to_grb_type(); - - GRB_TRY (GrB_Matrix_new (&mat, type, nrows_, ncols_)) ; - - // GxB_Matrix_Option_set (mat, GxB_SPARSITY_CONTROL, - // GxB_SPARSE) ; - // or: - // GxB_HYPERSPARSE, GxB_BITMAP, GxB_FULL - } - - - void fill_random( int64_t nnz, int gxb_sparsity_control, int gxb_format, std::int64_t seed = 12345ULL, T val_min = 0.0, T val_max = 2.0 , bool debug_print = false) { - -// std::cout << "inside fill_random, using seed "<< seed << std::endl; - alloc(); - - double inv_sparsity ; - if (nnz < 0) - { - // build a matrix with all entries present - inv_sparsity = 1 ; - } - else - { - inv_sparsity = ceil(((double)nrows_*ncols_)/nnz); //= values not taken per value occupied in index space - } -// -// std::cout<< "fill_random nrows="<< nrows_<<"ncols=" << ncols_ <<" need "<< nnz<<" values, invsparse = "< dis(0.0, 1.0); - - if (nnz < 0 || inv_sparsity == 1.) - { -// std::cout<<"filling dense"< (mat, x, i, j) ; - // A (j,i) = x - cuda::jit::set_element (mat, x, j, i) ; - } - else - { - // A (i,j) = x - cuda::jit::set_element (mat, x, i, j) ; - } - } - } - -// std::cout << "done." 
<< std::endl; - } - else - { -// std::cout<<"filling sparse"< row_lookup; - unordered_set key_lookup; - for ( int co = 0; co < 2*nrows_; co++ ) - { - GrB_Index i = ((GrB_Index) (dis(r) * nrows_)) % ((GrB_Index) nrows_) ; - - row_lookup.insert( i ); - } - int remain= nnz; //countdown to done - - while ( remain > 0) - { -// std::cout<< remain<<" nonzeroes left to fill.."< 0 ) - { - GrB_Index j = ((GrB_Index) (dis(r) * ncols_)) % ((GrB_Index) ncols_) ; - if (key_lookup.count( gen_key(i,j) ) == 1) continue; - if (no_self_edges && (i == j)) continue ; - - key_lookup.insert( gen_key(i, j) ); - col_guess--; - remain= (nnz- key_lookup.size() ); - if (remain <= 0) break; - if (make_symmetric) { - // A (j,i) = x - if (key_lookup.count( gen_key( j, i) ) == 0) - { - key_lookup.insert( gen_key( j, i) ) ; - col_guess--; - remain= (nnz- key_lookup.size() ); - } - } - if (remain <= 0) break; - } - if (remain <= 0) break; - //std::cout<< remain<<" nonzeroes left..."< 0 - /* - while(key_lookup.size() < nnz) { - GrB_Index i = ((GrB_Index) (dis(r) * nrows_)) % ((GrB_Index) nrows_) ; - GrB_Index j = ((GrB_Index) (dis(r) * ncols_)) % ((GrB_Index) ncols_) ; - - key_lookup.insert( gen_key(i, j) ); - if (make_symmetric) { - // A (j,i) = x - key_lookup.insert( gen_key( j, i) ) ; - } - } */ - - for (int64_t k : key_lookup) - { - GrB_Index i = k >> 32; - GrB_Index j = k & 0x0000ffff; - - T x = (T)val_min + (T)(dis(r) * (val_max - val_min)) ; - // A (i,j) = x - cuda::jit::set_element (mat, x, i, j) ; - if (make_symmetric) { - // A (j,i) = x - cuda::jit::set_element(mat, x, j, i) ; - } - } - } - - GRB_TRY (GrB_Matrix_wait (mat, GrB_MATERIALIZE)) ; - GB_convert_any_to_non_iso (mat, true) ; - // TODO: Need to specify these - GRB_TRY (GxB_Matrix_Option_set (mat, GxB_SPARSITY_CONTROL, gxb_sparsity_control)) ; - GRB_TRY (GxB_Matrix_Option_set(mat, GxB_FORMAT, gxb_format)); - GRB_TRY (GrB_Matrix_wait (mat, GrB_MATERIALIZE)) ; - GRB_TRY (GrB_Matrix_nvals ((GrB_Index *) &nnz, mat)) ; - //GRB_TRY (GxB_Matrix_fprint (mat, "my random mat", GxB_SHORT_VERBOSE, stdout)) ; - - bool iso ; - GRB_TRY (GxB_Matrix_iso (&iso, mat)) ; - if (iso) - { - printf ("Die! 
(cannot do iso)\n") ; - GRB_TRY (GrB_Matrix_free (&mat)) ; - } - - } - -}; - - - -template< typename T_C, typename T_M, typename T_A, typename T_B> -class SpGEMM_problem_generator { - - float Anzpercent,Bnzpercent,Mnzpercent; - int64_t Mnz; - int64_t *Bucket = nullptr; - - int64_t BucketStart[NBUCKETS+1]; - unsigned seed = 13372801; - bool ready = false; - - int64_t nrows_; - int64_t ncols_; - - public: - - matrix *C= nullptr; - matrix *M= nullptr; - matrix *A= nullptr; - matrix *B= nullptr; - - SpGEMM_problem_generator() {}; - - SpGEMM_problem_generator(int64_t nrows, int64_t ncols): nrows_(nrows), ncols_(ncols) { - - // Create sparse matrices - C = new matrix(nrows_, ncols_); - M = new matrix(nrows_, ncols_); - A = new matrix(nrows_, ncols_); - B = new matrix(nrows_, ncols_); - }; - - void initDim ( int64_t nrows, int64_t ncols){ - nrows_ = nrows; - ncols_ = ncols; - // Create sparse matrices - C = new matrix(nrows_, ncols_); - M = new matrix(nrows_, ncols_); - A = new matrix(nrows_, ncols_); - B = new matrix(nrows_, ncols_); - } - - matrix* getCptr(){ return C;} - matrix* getMptr(){ return M;} - matrix* getAptr(){ return A;} - matrix* getBptr(){ return B;} - - void init_A(std::int64_t Anz, int gxb_sparsity_control, int gxb_format, std::int64_t seed = 12345ULL, T_A min_val = 0.0, T_A max_val = 2.0) { - Anzpercent = float(Anz)/float(nrows_*ncols_); - A->fill_random(Anz, gxb_sparsity_control, gxb_format, seed, min_val, max_val); - } - - void init_B(std::int64_t Bnz, int gxb_sparsity_control, int gxb_format, std::int64_t seed = 54321ULL, T_B min_val = 0.0, T_B max_val = 2.0) { - Bnzpercent = float(Bnz)/float(nrows_*ncols_); - B->fill_random(Bnz, gxb_sparsity_control, gxb_format, seed, min_val, max_val); - } - - GrB_Matrix getC(){ return C->get_grb_matrix();} - GrB_Matrix getM(){ return M->get_grb_matrix();} - GrB_Matrix getA(){ return A->get_grb_matrix();} - GrB_Matrix getB(){ return B->get_grb_matrix();} - - int64_t* getBucket() { return Bucket;} - int64_t* getBucketStart(){ return BucketStart;} - - void init_C(float Mnzp, std::int64_t seed_c = 23456ULL, std::int64_t seed_m = 4567ULL){ - - // Get sizes relative to fully dense matrices - Mnzpercent = Mnzp; - Mnz = (int64_t)(Mnzp * nrows_ * ncols_); - - //Seed the generator - //std::cout<<"filling matrices"<fill_random(Mnz, GxB_SPARSE, GxB_BY_ROW, seed_m); - M->fill_random(Mnz, GxB_SPARSE, GxB_BY_ROW, seed_m); - - } - - void del(){ - C->clear(); - M->clear(); - A->clear(); - B->clear(); - //if (Bucket != nullptr) CHECK_CUDA( cudaFree(Bucket) ); - delete C; - delete M; - delete A; - delete B; - CHECK_CUDA( cudaDeviceSynchronize() ); - } - - // - void fill_buckets( int fill_bucket){ - - std::cout< fill_bucket) BucketStart[b] = Mnz; - //std::cout<< " one bucket "<< b<<"starts at "<\n", - "#include \n", - "\n", - "template\n", - "class TestData {\n", - "\n", - "public:\n", - " TestData( std::vector A_indptr_,\n", - " std::vector A_indices_,\n", - " std::vector A_data_,\n", - "\n", - " std::vector B_indptr_,\n", - " std::vector B_indices_,\n", - " std::vector B_data_,\n", - "\n", - "\n", - " std::vector C_indptr_,\n", - " std::vector C_indices_,\n", - " std::vector C_data_,\n", - "\n", - " std::vector M_indptr_,\n", - " std::vector M_indices_,\n", - " std::vector M_data_):\n", - " A_indptr(A_indptr_), A_indices(A_indices_), A_data(A_data_),\n", - " B_indptr(B_indptr_), B_indices(B_indices_), B_data(B_data_),\n", - " C_indptr(C_indptr_), C_indices(C_indices_), C_data(C_data_),\n", - " M_indptr(M_indptr_), M_indices(M_indices_), 
M_data(M_data_){}\n", - "\n", - "\n", - " std::vector A_indptr;\n", - " std::vector A_indices;\n", - " std::vector A_data;\n", - " \n", - " std::vector B_indptr;\n", - " std::vector B_indices;\n", - " std::vector B_data;\n", - " \n", - " \n", - " std::vector C_indptr;\n", - " std::vector C_indices;\n", - " std::vector C_data;\n", - "\n", - " std::vector M_indptr;\n", - " std::vector M_indices;\n", - " std::vector M_data;\n", - "\n", - "};\n", - "\n", - "template\n", - "std::unique_ptr> make_karate_tricount() {\n", - "\n", - " std::vector A_indptr = %s;\n", - " std::vector A_indices = %s;\n", - " std::vector A_data = %s;\n", - "\n", - " std::vector B_indptr = %s;\n", - " std::vector B_indices = %s;\n", - " std::vector B_data = %s;\n", - "\n", - " std::vector M_indptr = %s;\n", - " std::vector M_indices = %s;\n", - " std::vector M_data = %s;\n", - "\n", - " std::vector C_indptr = %s;\n", - " std::vector C_indices = %s;\n", - " std::vector C_data = %s;\n", - "\n", - " TestData karate_tricount(A_indptr, A_indices, A_data,\n", - " B_indptr, B_indices, B_data,\n", - " C_indptr, C_indices, C_data,\n", - " M_indptr, M_indices, M_data);\n", - "\n", - " return std::make_unique>(karate_tricount);\n", - "}\n", - "\n", - "\n", - "\n", - "TestData karate_tricount;\n", - "karate.A_indptr = %s;\n", - "karate.A_indices = %s;\n", - "karate.A_data = %s;\n", - "\n", - "karate.B_indptr = %s;\n", - "karate.B_indices = %s;\n", - "karate.B_data = %s;\n", - "\n", - "karate.M_indptr = %s;\n", - "karate.M_indices = %s;\n", - "karate.M_data = %s;\n", - "\n", - "karate.C_indptr = %s;\n", - "karate.C_indices = %s;\n", - "karate.C_data = %s;\n", - "\"\"\" % data" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "b8c2c497-7156-449e-9c44-6fa19bdedea3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\ntemplate\\nstruct TestData {\\n\\n std::vector A_indptr;\\n std::vector A_indices;\\n std::vector A_data;\\n \\n std::vector B_indptr;\\n std::vector B_indices;\\n std::vector B_data;\\n \\n \\n std::vector C_indptr;\\n std::vector C_indices;\\n std::vector C_data;\\n\\n std::vector M_indptr;\\n std::vector M_indices;\\n std::vector M_data;\\n\\n}\\n\\n\\nTestData karate_tricount;\\nkarate.A_indptr = { 0,16,24,32,35,37,40,41,41,44,45,45,45,45,46,48,50,50,50,52,53,55,55,57,\\n 62,65,66,68,69,71,73,75,77,78,78};\\nkarate.A_indices = { 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,17,19,21,31, 2, 3, 7,13,17,19,21,30,\\n 3, 7, 8, 9,13,27,28,32, 7,12,13, 6,10, 6,10,16,16,30,32,33,33,33,32,33,\\n 32,33,32,33,33,32,33,32,33,25,27,29,32,33,25,27,31,31,29,33,33,31,33,32,\\n 33,32,33,32,33,33};\\nkarate.A_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1};\\n\\nkarate.B_indptr = { 0, 0, 1, 3, 6, 7, 8,11,15,17,18,21,22,24,28,28,28,30,32,32,34,34,36,36,\\n 36,36,38,38,41,42,44,46,50,61,78};\\nkarate.B_indices = { 0, 0, 1, 3, 6, 7, 8,11,15,17,18,21,22,24,28,28,28,30,32,32,34,34,36,36,\\n 36,36,38,38,41,42,44,46,50,61,78};\\nkarate.B_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1};\\n\\nkarate.M_indptr = { 0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80,\\n 82, 84, 87, 89, 91, 93, 98,101,104,106,110,113,117,121,127,139,156};\\nkarate.M_indices = { 0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80,\\n 82, 84, 87, 89, 91, 
93, 98,101,104,106,110,113,117,121,127,139,156};\\nkarate.M_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1};\\n\\nkarate.C_indptr = { 0, 0, 7,12,17,19,21,24,27,29,29,31,31,32,35,35,35,36,37,37,38,38,39,39,\\n 39,39,40,40,41,41,43,45,47,51,56};\\nkarate.C_indices = { 0, 0, 7,12,17,19,21,24,27,29,29,31,31,32,35,35,35,36,37,37,38,38,39,39,\\n 39,39,40,40,41,41,43,45,47,51,56};\\nkarate.C_data = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1,\\n 1, 2, 3, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,\\n 1, 1,10, 1, 2, 1, 1,10};\\n'" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_str" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "f0c11d60-a542-41b7-afe6-38d93df094f7", - "metadata": {}, - "outputs": [], - "source": [ - "def store_file(output_string, filename = \"test_data.hpp\"):\n", - " with open(filename, 'w') as f:\n", - " f.write(output_string)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "bc22e70f-5373-4e80-88d5-c18e9c03cf76", - "metadata": {}, - "outputs": [], - "source": [ - "store_file(output_str)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "255965d0-b90b-4ad7-a53a-9f5eb4745906", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (cuml_2204_0222222_2)", - "language": "python", - "name": "cuml_2204_022222_2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/GraphBLAS/CUDA/test/jitTestFactory.hpp b/GraphBLAS/CUDA/test/jitTestFactory.hpp deleted file mode 100644 index 9e83ae4dd0..0000000000 --- a/GraphBLAS/CUDA/test/jitTestFactory.hpp +++ /dev/null @@ -1,916 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/jitTestFactory.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#pragma once - -#include -#include -#include -#include -#include -#include "GpuTimer.h" -#include "GB_cuda_buckets.h" -#include -#include "test_data.hpp" -#include "../rmm_wrap/rmm_wrap.hpp" -#include "problem_spec.hpp" - -extern "C" { - #include "GB.h" -} - -#include "GB_cuda_common_jitFactory.hpp" -#include "GB_cuda_mxm_dot3_jitFactory.hpp" -#include "GB_cuda_reduce_jitFactory.hpp" -#include "GB_cuda_reduce_factory.hpp" -#include "dataFactory.hpp" - -////Operations for test results on CPU -//template T myOP_plus( T a, T b) { return a + b;} -//template T myOP_min ( T a, T b) { return a < b ? a : b;} -//template T myOP_max ( T a, T b) { return a > b ? 
a : b;} -//template T myOP_first ( T a, T b) { return a ;} -//template T myOP_second ( T a, T b) { return b ;} -//template T myOP_times ( T a, T b) { return a * b ;} -// -//template T (*myOpPTR)(T a, T b); -//template T (*ADD_ptr)(T a, T b); -//template T (*MUL_ptr)(T a, T b); - -//AxB_dot3_phase1 kernels -template -bool test_AxB_phase1_factory( int64_t , int64_t , int64_t , int64_t ) ; - -//AxB_dot3_phase2 kernels -template -bool test_AxB_dot3_phase2_factory( int , int64_t , int64_t , int64_t, int64_t ) ; - -template -void make_grb_matrix(GrB_Matrix mat, int64_t n_rows, int64_t n_cols, - std::vector &indptr, - std::vector &indices, T *data, - int gxb_sparsity_control = GxB_SPARSE, - int gxb_format = GxB_BY_ROW) ; - -//Fixture to generate valid inputs and hold them for tests -class AxB_dot3_Test : public ::testing::Test -{ - void SetUp() {} - - void TearDown() {} -}; - -template -void print_array(void *arr, I size, const char *name) { - std::cout << "Printing " << name << std::endl; - for(I i = 0; i < size; ++i) { - std::cout << static_cast(arr)[i] << ", "; - } - std::cout << std::endl; -} - -//------------------------------------------------------------------------------ -// test_AxB_phase1_factory: test phase1 -//------------------------------------------------------------------------------ - -// Test generator code, to allow parameterized tests -// Uses jitFactory, dataFactory and GB_jit -template -bool test_AxB_phase1_factory(mxm_problem_spec &problem_spec) -{ - - cudaStream_t stream = (cudaStream_t)rmm_wrap_get_main_stream(); - - /******************** - * Launch kernel - */ - GB_cuda_mxm_factory mysemiringfactory = problem_spec.get_mxm_factory(); - phase1launchFactory p1lF(mysemiringfactory); - - GpuTimer kernTimer; - - int nthrd = p1lF.get_threads_per_block(); - int ntasks = p1lF.get_number_of_blocks(problem_spec.getM()); - - // TODO: Verify that RMM is checking and throwing exceptions - int nanobuckets_size = NBUCKETS * nthrd * ntasks; - int blockbuckets_size = NBUCKETS * ntasks; - - int64_t *Nanobuckets = (int64_t*)rmm_wrap_malloc(nanobuckets_size * sizeof (int64_t)); - int64_t *Blockbucket = (int64_t*)rmm_wrap_malloc(blockbuckets_size * sizeof (int64_t)); - - kernTimer.Start(); - p1lF.jitGridBlockLaunch(Nanobuckets, Blockbucket, - problem_spec.getC(), problem_spec.getM(), - problem_spec.getA(), problem_spec.getB(), stream); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - kernTimer.Stop(); - std::cout<<"returned from phase1 kernel "<(Nanobuckets, nanobuckets_size, "Nanobuckets"); -// print_array(Blockbucket, blockbuckets_size, "Blockbucket"); - std::cout<<"==== phase1 done=============================" < -bool test_AxB_dense_phase1_factory(mxm_problem_spec &problem_spec) -{ - cudaStream_t stream = (cudaStream_t)rmm_wrap_get_main_stream(); - - /******************** - * Launch kernel - */ - GB_cuda_mxm_factory mysemiringfactory = problem_spec.get_mxm_factory(); - dense_phase1launchFactory p1lF(mysemiringfactory); - p1lF.jitGridBlockLaunch(problem_spec.getC(), problem_spec.getM(), problem_spec.getA(), problem_spec.getB(), stream); - return true; -} - - -//------------------------------------------------------------------------------ -// test_AxB_phase2_factory: test phase2 and phase2end -//------------------------------------------------------------------------------ - -template -bool test_AxB_phase2_factory(mxm_problem_spec &problem_spec) -{ - cudaStream_t stream = (cudaStream_t)rmm_wrap_get_main_stream(); - - auto mymxm = problem_spec.get_mxm_factory(); - phase1launchFactory 
p1lF(mymxm); - phase2launchFactory p2lF; - phase2endlaunchFactory p2elF; - - GpuTimer kernTimer; - kernTimer.Start(); - - const int64_t mnz = GB_nnz (problem_spec.getM()) ; - - int nthrd = p2lF.get_threads_per_block(); - int ntasks = p2elF.get_number_of_blocks(problem_spec.getM()); - - // fabricate data as if it came from phase1: - int64_t *nanobuckets = (int64_t*)rmm_wrap_malloc(NBUCKETS * nthrd * ntasks * sizeof (int64_t)); - int64_t *blockbucket = (int64_t*)rmm_wrap_malloc(NBUCKETS * ntasks * sizeof (int64_t)); - int64_t *bucketp = (int64_t*)rmm_wrap_malloc((NBUCKETS+1) * sizeof (int64_t)); - int64_t *offset = (int64_t*)rmm_wrap_malloc(NBUCKETS * sizeof (int64_t)); - int64_t *bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t)); - - fillvector_constant(NBUCKETS, bucketp, (int64_t)0); - fillvector_constant(NBUCKETS, offset, (int64_t)0); - //fillvector_constant(problem_spec.getCnnz(), bucket, (int64_t)0); - - std::cout << "Running phase1 kernel" << std::endl; - kernTimer.Start(); - p1lF.jitGridBlockLaunch(nanobuckets, blockbucket, - problem_spec.getC(), problem_spec.getM(), - problem_spec.getA(), problem_spec.getB(), stream); - - - CHECK_CUDA(cudaStreamSynchronize(stream)); - kernTimer.Stop(); - - std::cout << " phase1 internal phase2 "<< kernTimer.Elapsed() <<"ms Done." << std::endl; - - // // launch phase2 (just with p2ntasks as the # of tasks) - kernTimer.Start(); - p2lF.jitGridBlockLaunch(blockbucket, offset, problem_spec.getM(), stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - kernTimer.Stop(); - std::cout << " phase2 kern "<< kernTimer.Elapsed() <<"ms Done." << std::endl; - -// -// // do the reduction between phase2 and phase2end - int64_t s= 0; - for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket) - { - bucketp[bucket] = s; - s+= offset[bucket]; - } - - // launch phase2end: note same # of tasks as phase1 - kernTimer.Start(); - p2elF.jitGridBlockLaunch( nanobuckets, blockbucket, - bucketp, bucket, offset, problem_spec.getC(), - problem_spec.getM(),stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - kernTimer.Stop(); - std::cout<<"returned from phase2end kernel "<(bucketp, NBUCKETS, "bucketp"); -// print_array(bucket, mnz, "bucket"); - std::cout<<"phase2 done =================="< -void make_grb_matrix(GrB_Matrix mat, int64_t n_rows, int64_t n_cols, - std::vector &indptr, - std::vector &indices, T *data, - int gxb_sparsity_control, - int gxb_format ) -{ - - GrB_Type type = cuda::jit::to_grb_type(); - - GRB_TRY (GrB_Matrix_new (&mat, type, n_rows, n_cols)) ; - - for(int64_t row = 0; row < n_rows; ++row) { - int64_t start = indptr[row]; - int64_t stop = indptr[row+1]; - - for(int64_t offset = start; offset < stop; ++offset) { - GrB_Index i = (GrB_Index) row; - GrB_Index j = (GrB_Index) indices[offset]; - T x = data[offset]; - - cuda::jit::set_element (mat, x, i, j) ; - } - } - - GRB_TRY (GrB_Matrix_wait (mat, GrB_MATERIALIZE)) ; - GRB_TRY (GB_convert_any_to_non_iso (mat, true)) ; - GRB_TRY (GxB_Matrix_Option_set (mat, GxB_SPARSITY_CONTROL, gxb_sparsity_control)) ; - GRB_TRY (GxB_Matrix_Option_set(mat, GxB_FORMAT, gxb_format)); - - -} - -template < - typename T_C, typename T_M, typename T_A,typename T_B, - typename T_X, typename T_Y, typename T_Z> -bool test_AxB_dot3_sparse_factory(mxm_problem_spec &problem_spec) { - - // FIXME: Allow the adaptive tests in this guy - std::cout << "sparse test ======================" << std::endl; - - GpuTimer kernTimer; - - cudaStream_t strm; - CHECK_CUDA(cudaStreamCreate(&strm)); - - std::cout << "sr_code: " << 
problem_spec.get_mxm_factory().sr_code << std::endl; - - bool result = false; - - int64_t N = problem_spec.getN(); - /** - * Run Phase 1, phase 2 and phase2end: Compute nanobuckets and blockbuckets - */ - - auto mymxm = problem_spec.get_mxm_factory(); - phase1launchFactory p1lF(mymxm); - phase2launchFactory p2lF; - phase2endlaunchFactory p2elF; - - GrB_Matrix C = problem_spec.getC(); - GrB_Matrix M = problem_spec.getM(); - GrB_Matrix A = problem_spec.getA(); - GrB_Matrix B = problem_spec.getB(); - - const int64_t mnz = GB_nnz (M) ; - const int64_t cnz = GB_nnz (C) ; - const int64_t cvlen = C->vlen ; - const int64_t cvdim = C->vdim ; - const int64_t cnvec = C->nvec ; - - bool C_iso = false ; - int C_sparsity = GB_sparsity (M) ; - int M_sparsity = GB_sparsity (M) ; - GrB_Type ctype = problem_spec.getBinaryOp()->ztype ; - - int nthrd = p2lF.get_threads_per_block(); - int ntasks = p2elF.get_number_of_blocks(M); - - // fabricate data as if it came from phase1: - int64_t *nanobuckets = (int64_t*)rmm_wrap_malloc(NBUCKETS * nthrd * ntasks * sizeof (int64_t)); - int64_t *blockbucket = (int64_t*)rmm_wrap_malloc(NBUCKETS * ntasks * sizeof (int64_t)); - int64_t *bucketp = (int64_t*)rmm_wrap_malloc((NBUCKETS+1) * sizeof (int64_t)); - int64_t *bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t)); - int64_t *offset = (int64_t*)rmm_wrap_malloc(NBUCKETS * sizeof (int64_t)); - - fillvector_constant(NBUCKETS, bucketp, (int64_t)0); - fillvector_constant(NBUCKETS, offset, (int64_t)0); - //fillvector_constant(problem_spec.getCnnz(), bucket, (int64_t)0); - - std::cout << "sparse phase1 kernel" << std::endl; - kernTimer.Start(); - p1lF.jitGridBlockLaunch(nanobuckets, blockbucket, - C, M, A, B, strm); - CHECK_CUDA(cudaStreamSynchronize(strm)); - kernTimer.Stop(); - std::cout<<"sparse test phase1 kernel "<nzombies += (bucketp[1]); //add pre-zombies to the count; - - GRB_TRY(GrB_Matrix_wait(C, GrB_MATERIALIZE)); - fflush(stdout); - - GrB_Matrix C_expected; - GrB_Type type = cuda::jit::to_grb_type(); - GRB_TRY (GrB_Matrix_new (&C_expected, type, N, N)) ; - - // ensure the GPU is not used - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, -1)) ; - GB_Global_hack_set (2, 2) ; // hack(2) = 2: never use the GPU - - // Use GrB_DESC_S for structural because dot3 mask will never be complemented - // The order of B and A is swapped to account for CSR vs CSC assumption - GRB_TRY (GrB_mxm(C_expected, problem_spec.getM(), NULL, problem_spec.get_semiring(), problem_spec.getB(), - problem_spec.getA(), problem_spec.get_mask_struct() ? 
GrB_DESC_ST1 : GrB_DESC_T1)); - - - GRB_TRY(GrB_Matrix_wait(C_expected, GrB_MATERIALIZE)); - - // compare - double tol = 0 ; - GrB_Index nvals1 = 0, nvals2 = 0 ; - GRB_TRY (GrB_Matrix_nvals (&nvals1, C)) ; - GRB_TRY (GrB_Matrix_nvals (&nvals2, C_expected)) ; - if (nvals1 != nvals2) { printf ("Wrong number of nonzeroes found, test fail!!!\n") ; ADD_FAILURE( ) ; } - GrB_Index nrows, ncols ; - GrB_Matrix_nrows (&nrows, C_expected) ; - GrB_Matrix_ncols (&ncols, C_expected) ; - - GrB_Matrix T; - - GRB_TRY (GrB_Matrix_new (&T, GrB_BOOL, nrows, ncols)) ; - GrB_BinaryOp op = NULL; - GrB_UnaryOp op_abs = NULL ; - if (type == GrB_BOOL ) op = GrB_EQ_BOOL ; - else if (type == GrB_INT8 ) op = GrB_EQ_INT8 ; - else if (type == GrB_INT16 ) op = GrB_EQ_INT16 ; - else if (type == GrB_INT32 ) op = GrB_EQ_INT32 ; - else if (type == GrB_INT64 ) op = GrB_EQ_INT64 ; - else if (type == GrB_UINT8 ) op = GrB_EQ_UINT8 ; - else if (type == GrB_UINT16) op = GrB_EQ_UINT16 ; - else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; - else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; - else if (type == GrB_FP32 ) - { tol = 1e-6; - op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; - op_abs = GrB_ABS_FP32 ; - } - else if (type == GrB_FP64 ) - { tol = 1e12; - op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; - op_abs = GrB_ABS_FP64 ; - } - else if (type == GxB_FC32 ) - { tol = 2e-6; - op = (tol == 0)? GxB_EQ_FC32 : GxB_MINUS_FC32 ; - op_abs = GxB_ABS_FC32 ; - } - else if (type == GxB_FC64 ) - { tol = 2e-12; - op = (tol == 0)? GxB_EQ_FC64 : GxB_MINUS_FC64 ; - op_abs = GxB_ABS_FC64 ; - } - - - - if (tol == 0) - { - // check for perfect equality - GRB_TRY (GrB_Matrix_eWiseMult_BinaryOp (T, NULL, NULL, op, C, C_expected, - NULL)) ; - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, T)) ; -// if (nvals1 != nvals3) { printf (" difference matrix wrong size, test fail!!\n") ; ADD_FAILURE( ) ; } - bool is_same = false ; - GRB_TRY (GrB_Matrix_reduce_BOOL (&is_same, NULL, GrB_LAND_MONOID_BOOL, - T, NULL)) ; - if (!is_same) { printf (" results don't match, test fail!!\n") ; ADD_FAILURE ( ) ; } - GRB_TRY (GrB_Matrix_free (&T)) ; - } - else - { - // TODO: check with roundoff - // Diff = C - C_expected - GrB_Matrix Diff ; - GRB_TRY (GrB_Matrix_new (&Diff, GrB_FP64, nrows, ncols)) ; - GRB_TRY (GrB_Matrix_apply (Diff, NULL, NULL, GrB_AINV_FP64, C_expected, NULL)) ; - GRB_TRY (GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_PLUS_FP64, - C, Diff, NULL)) ; - GRB_TRY( GrB_Matrix_apply( Diff, NULL, NULL, op_abs, Diff, NULL) ); - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, Diff)) ; - if (nvals1 != nvals3) { printf ("fp difference matrix wrong size, test fail!!\n") ; ADD_FAILURE( ) ; } - double is_same = false ; - GRB_TRY (GrB_Matrix_reduce_FP64 (&is_same, NULL, GrB_PLUS_MONOID_FP64, - Diff, NULL)) ; - printf("difference = %12.6g, rel_l1_err=%12.6g\n", is_same, is_same/nvals3 ); - EXPECT_LT( is_same/nvals3, tol); - GRB_TRY (GrB_Matrix_free (&Diff)) ; - - } - - // re-enable the GPU - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, 0)) ; - GB_Global_hack_set (2, 1) ; // hack(2) = 1: always use the GPU - - rmm_wrap_free(nanobuckets); - rmm_wrap_free(blockbucket); - rmm_wrap_free(bucketp); - rmm_wrap_free(bucket); - rmm_wrap_free(offset); - GRB_TRY(GrB_Matrix_free(&C_expected)); - CHECK_CUDA(cudaStreamDestroy(strm)); - - std::cout << "phase 3 test complete ======================" << std::endl; - return result; -} - -template < - typename T_C, typename T_M, typename T_A,typename T_B, - typename T_X, typename T_Y, typename T_Z> 
-bool test_AxB_dot3_dense_factory(mxm_problem_spec &problem_spec) { - - std::cout << "phase dense test ======================" << std::endl; - - GpuTimer kernTimer; - - cudaStream_t strm = (cudaStream_t)rmm_wrap_get_main_stream(); - - bool result = false; - - int64_t N = problem_spec.getN(); - - auto mymxm = problem_spec.get_mxm_factory(); - dense_phase1launchFactory p1lF(mymxm); - - GrB_Matrix C = problem_spec.getC(); - GrB_Matrix M = problem_spec.getM(); - GrB_Matrix A = problem_spec.getA(); - GrB_Matrix B = problem_spec.getB(); - - problem_spec.set_sparsity_control( A, GxB_FULL, GxB_BY_ROW); - problem_spec.set_sparsity_control( B, GxB_FULL, GxB_BY_ROW); - - const int64_t mnz = GB_nnz (M) ; - const int64_t cnz = GB_nnz (C) ; - const int64_t cvlen = C->vlen ; - const int64_t cvdim = C->vdim ; - const int64_t cnvec = C->nvec ; - - bool C_iso = false ; - GrB_Type ctype = problem_spec.getBinaryOp()->ztype ; - - std::cout << "Running phase1 kernel" << std::endl; - kernTimer.Start(); - p1lF.jitGridBlockLaunch(C, M, A, B, strm); - CHECK_CUDA(cudaStreamSynchronize(strm)); - kernTimer.Stop(); - std::cout<<"Dense internal phase1 kernel done "<(); - GRB_TRY (GrB_Matrix_new (&C_expected, type, N, N)) ; - - // ensure the GPU is not used - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, -1)) ; - GB_Global_hack_set (2, 2) ; // hack(2) = 2: never use the GPU - - // Use GrB_DESC_S for structural because dot3 mask will never be complemented - // The order of B and A is swapped to account for CSR vs CSC assumption - GRB_TRY (GrB_mxm(C_expected, problem_spec.getM(), NULL, problem_spec.get_semiring(), problem_spec.getB(), - problem_spec.getA(), problem_spec.get_mask_struct() ? GrB_DESC_ST1 : GrB_DESC_T1)); - - GRB_TRY(GrB_Matrix_wait(C_expected, GrB_MATERIALIZE)); - std::cout << "nnz: " << GB_nnz (C_expected) << std::endl ; - - // compare - double tol = 0 ; - GrB_Index nvals1 = 0, nvals2 = 0 ; - GRB_TRY (GrB_Matrix_nvals (&nvals1, C)) ; - GRB_TRY (GrB_Matrix_nvals (&nvals2, C_expected)) ; - if (nvals1 != nvals2) { printf ("Wrong number of nonzeroes found, test fail!!! nvals1=%lu, nvals2=%lu\n", nvals1, nvals2) ; ADD_FAILURE( ) ; } - GrB_Index nrows, ncols ; - GrB_Matrix_nrows (&nrows, C_expected) ; - GrB_Matrix_ncols (&ncols, C_expected) ; - - GrB_Matrix T; - - GRB_TRY (GrB_Matrix_new (&T, GrB_BOOL, nrows, ncols)) ; - GrB_BinaryOp op = NULL; - GrB_UnaryOp op_abs = NULL ; - if (type == GrB_BOOL ) op = GrB_EQ_BOOL ; - else if (type == GrB_INT8 ) op = GrB_EQ_INT8 ; - else if (type == GrB_INT16 ) op = GrB_EQ_INT16 ; - else if (type == GrB_INT32 ) op = GrB_EQ_INT32 ; - else if (type == GrB_INT64 ) op = GrB_EQ_INT64 ; - else if (type == GrB_UINT8 ) op = GrB_EQ_UINT8 ; - else if (type == GrB_UINT16) op = GrB_EQ_UINT16 ; - else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; - else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; - else if (type == GrB_FP32 ) - { tol = 5e-6; - op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; - op_abs = GrB_ABS_FP32 ; - } - else if (type == GrB_FP64 ) - { tol = 1e12; - op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; - op_abs = GrB_ABS_FP64 ; - } - else if (type == GxB_FC32 ) - { tol = 2e-6; - op = (tol == 0)? GxB_EQ_FC32 : GxB_MINUS_FC32 ; - op_abs = GxB_ABS_FC32 ; - } - else if (type == GxB_FC64 ) - { tol = 2e-12; - op = (tol == 0)? 
GxB_EQ_FC64 : GxB_MINUS_FC64 ; - op_abs = GxB_ABS_FC64 ; - } - - - - if (tol == 0) - { - // check for perfect equality - GRB_TRY (GrB_Matrix_eWiseMult_BinaryOp (T, NULL, NULL, op, C, C_expected, - NULL)) ; - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, T)) ; -// if (nvals1 != nvals3) { printf (" difference matrix wrong size, test fail!! nvals1=%ld nvals3=%ld\n", nvals1, nvals3) ; ADD_FAILURE( ) ; } - bool is_same = false ; - GRB_TRY (GrB_Matrix_reduce_BOOL (&is_same, NULL, GrB_LAND_MONOID_BOOL, - T, NULL)) ; - if (!is_same) { printf (" results don't match, test fail!!\n") ; ADD_FAILURE ( ) ; } - GRB_TRY (GrB_Matrix_free (&T)) ; - } - else - { - // TODO: check with roundoff - // Diff = C - C_expected - GrB_Matrix Diff ; - GRB_TRY (GrB_Matrix_new (&Diff, GrB_FP64, nrows, ncols)) ; - GRB_TRY (GrB_Matrix_apply (Diff, NULL, NULL, GrB_AINV_FP64, C_expected, NULL)) ; - GRB_TRY (GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_PLUS_FP64, - C, Diff, NULL)) ; - GRB_TRY( GrB_Matrix_apply( Diff, NULL, NULL, op_abs, Diff, NULL) ); - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, Diff)) ; - if (nvals1 != nvals3) { printf ("fp difference matrix wrong size, test fail!!\n") ; ADD_FAILURE( ) ; } - double is_same = false ; - GRB_TRY (GrB_Matrix_reduce_FP64 (&is_same, NULL, GrB_PLUS_MONOID_FP64, - Diff, NULL)) ; - printf("difference = %12.6g, rel_l1_err=%12.6g\n", is_same, is_same/nvals3 ); - EXPECT_LT( is_same/nvals3, tol); - GRB_TRY (GrB_Matrix_free (&Diff)) ; - - } - - // re-enable the GPU - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, 0)) ; - GB_Global_hack_set (2, 1) ; // hack(2) = 1: always use the GPU - - - GRB_TRY(GrB_Matrix_free(&C_expected)); - - std::cout << "phase 3 dense test complete ======================" << std::endl; - return result; -} - -template < - typename T_C, typename T_M, typename T_A,typename T_B, - typename T_X, typename T_Y, typename T_Z> -bool test_AxB_dot3_sparse_dense_factory(mxm_problem_spec &problem_spec) { - - std::cout << "sparse dense test ======================" << std::endl; - - GpuTimer kernTimer; - - cudaStream_t strm; - CHECK_CUDA(cudaStreamCreate(&strm)); - - bool result = false; - - int64_t N = problem_spec.getN(); - - GrB_Matrix C = problem_spec.getC(); - GrB_Matrix M = problem_spec.getM(); - GrB_Matrix A = problem_spec.getA(); - GrB_Matrix B = problem_spec.getB(); - - problem_spec.set_sparsity_control( A, GxB_SPARSE, GxB_BY_ROW); - - // TODO: Need to make sure the format itself is actually dense. 
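-    // [Editorial sketch] One way to resolve the TODO above: after the
-    // set_sparsity_control call below, wait for the conversion to complete
-    // and query the sparsity status, asserting that B really is stored as
-    // GxB_FULL (the conversion cannot happen if any entries are missing):
-    //
-    //   GRB_TRY (GrB_Matrix_wait (B, GrB_MATERIALIZE)) ;
-    //   int B_status = 0 ;
-    //   GRB_TRY (GxB_Matrix_Option_get (B, GxB_SPARSITY_STATUS, &B_status)) ;
-    //   ASSERT_EQ (B_status, GxB_FULL) ;  // gtest: fail fast if still sparse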
- problem_spec.set_sparsity_control( B, GxB_FULL, GxB_BY_ROW); - - auto mymxm = problem_spec.get_mxm_factory(); - dense_phase1launchFactory p1lF(mymxm); - - const int64_t mnz = GB_nnz (M) ; - const int64_t cnz = GB_nnz (C) ; - const int64_t cvlen = C->vlen ; - const int64_t cvdim = C->vdim ; - const int64_t cnvec = C->nvec ; - - bool C_iso = false ; - GrB_Type ctype = problem_spec.getBinaryOp()->ztype ; - - std::cout << "Running dense_phase1 kernel" << std::endl; - kernTimer.Start(); - p1lF.jitGridBlockLaunch(C, M, A, B, strm); - CHECK_CUDA(cudaStreamSynchronize(strm)); - kernTimer.Stop(); - std::cout<<"Dense internal phase1 kernel done "<(); - GRB_TRY (GrB_Matrix_new (&C_expected, type, N, N)) ; - - // ensure the GPU is not used - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, -1)) ; - GB_Global_hack_set (2, 2) ; // hack(2) = 2: never use the GPU - - // Use GrB_DESC_S for structural because dot3 mask will never be complemented - // The order of B and A is swapped to account for CSR vs CSC assumption - GRB_TRY (GrB_mxm(C_expected, problem_spec.getM(), NULL, problem_spec.get_semiring(), problem_spec.getB(), - problem_spec.getA(), problem_spec.get_mask_struct() ? GrB_DESC_ST1 : GrB_DESC_T1)); - - GRB_TRY(GrB_Matrix_wait(C_expected, GrB_MATERIALIZE)); - std::cout << "nnz: " << GB_nnz (C_expected) << std::endl ; - - // compare - double tol = 0 ; - GrB_Index nvals1 = 0, nvals2 = 0 ; - GRB_TRY (GrB_Matrix_nvals (&nvals1, C)) ; - GRB_TRY (GrB_Matrix_nvals (&nvals2, C_expected)) ; - if (nvals1 != nvals2) { printf ("Wrong number of nonzeroes found, test fail!!! nvals1=%lu, nvals2=%lu\n", nvals1, nvals2) ; ADD_FAILURE( ) ; } - GrB_Index nrows, ncols ; - GrB_Matrix_nrows (&nrows, C_expected) ; - GrB_Matrix_ncols (&ncols, C_expected) ; - - GrB_Matrix T; - - GRB_TRY (GrB_Matrix_new (&T, GrB_BOOL, nrows, ncols)) ; - GrB_BinaryOp op = NULL; - GrB_UnaryOp op_abs = NULL ; - if (type == GrB_BOOL ) op = GrB_EQ_BOOL ; - else if (type == GrB_INT8 ) op = GrB_EQ_INT8 ; - else if (type == GrB_INT16 ) op = GrB_EQ_INT16 ; - else if (type == GrB_INT32 ) op = GrB_EQ_INT32 ; - else if (type == GrB_INT64 ) op = GrB_EQ_INT64 ; - else if (type == GrB_UINT8 ) op = GrB_EQ_UINT8 ; - else if (type == GrB_UINT16) op = GrB_EQ_UINT16 ; - else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; - else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; - else if (type == GrB_FP32 ) - { tol = 5e-6; - op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; - op_abs = GrB_ABS_FP32 ; - } - else if (type == GrB_FP64 ) - { tol = 1e12; - op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; - op_abs = GrB_ABS_FP64 ; - } - else if (type == GxB_FC32 ) - { tol = 2e-6; - op = (tol == 0)? GxB_EQ_FC32 : GxB_MINUS_FC32 ; - op_abs = GxB_ABS_FC32 ; - } - else if (type == GxB_FC64 ) - { tol = 2e-12; - op = (tol == 0)? GxB_EQ_FC64 : GxB_MINUS_FC64 ; - op_abs = GxB_ABS_FC64 ; - } - - - - if (tol == 0) - { - // check for perfect equality - GRB_TRY (GrB_Matrix_eWiseMult_BinaryOp (T, NULL, NULL, op, C, C_expected, - NULL)) ; - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, T)) ; -// if (nvals1 != nvals3) { printf (" difference matrix wrong size, test fail!! 
nvals1=%ld nvals3=%ld\n", nvals1, nvals3) ; ADD_FAILURE( ) ; } - bool is_same = false ; - GRB_TRY (GrB_Matrix_reduce_BOOL (&is_same, NULL, GrB_LAND_MONOID_BOOL, - T, NULL)) ; - if (!is_same) { printf (" results don't match, test fail!!\n") ; ADD_FAILURE ( ) ; } - GRB_TRY (GrB_Matrix_free (&T)) ; - } - else - { - // TODO: check with roundoff - // Diff = C - C_expected - GrB_Matrix Diff ; - GRB_TRY (GrB_Matrix_new (&Diff, GrB_FP64, nrows, ncols)) ; - GRB_TRY (GrB_Matrix_apply (Diff, NULL, NULL, GrB_AINV_FP64, C_expected, NULL)) ; - GRB_TRY (GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_PLUS_FP64, - C, Diff, NULL)) ; - GRB_TRY( GrB_Matrix_apply( Diff, NULL, NULL, op_abs, Diff, NULL) ); - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, Diff)) ; - if (nvals1 != nvals3) { printf ("fp difference matrix wrong size, test fail!!\n") ; ADD_FAILURE( ) ; } - double is_same = false ; - GRB_TRY (GrB_Matrix_reduce_FP64 (&is_same, NULL, GrB_PLUS_MONOID_FP64, - Diff, NULL)) ; - printf("difference = %12.6g, rel_l1_err=%12.6g\n", is_same, is_same/nvals3 ); - EXPECT_LT( is_same/nvals3, tol); - GRB_TRY (GrB_Matrix_free (&Diff)) ; - - } - - // re-enable the GPU - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, 0)) ; - GB_Global_hack_set (2, 1) ; // hack(2) = 1: always use the GPU - - - GRB_TRY(GrB_Matrix_free(&C_expected)); - CHECK_CUDA(cudaStreamDestroy(strm)); - - std::cout << "phase 3 dense test complete ======================" << std::endl; - return result; -} - - -template -bool test_reduce_factory(mxm_problem_spec &problem_spec) { - - std::cout << "reduce test ======================" << std::endl; - - // TODO: This test doesn't really fit the `mxm` category - GrB_Monoid monoid = problem_spec.getMonoid(); - int64_t N = problem_spec.getN(); - - GrB_Matrix A; - - // TODO: Using C here so that the reduced type matches - GrB_Matrix_dup(&A, problem_spec.getC()); - GrB_Type type = cuda::jit::to_grb_type(); - - A->i[0] = GB_FLIP(A->i[0]); // FIXME - A->i[1] = GB_FLIP(A->i[1]); // FIXME - A->nzombies = 2; // FIXME: use an opaque method to insert zombies into A - - //GRB_TRY (GxB_Matrix_fprint (A, "A", GxB_SHORT_VERBOSE, stdout)) ; - - GB_cuda_reduce_factory myreducefactory; - myreducefactory.reduce_factory(monoid, A); - - T_C actual; - GB_cuda_reduce(myreducefactory, A, &actual, monoid ); - - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, -1)) ; - GB_Global_hack_set (2, 2) ; // hack(2) = 2: never use the GPU - - T_C expected; - GRB_TRY(cuda::jit::matrix_reduce(&expected, A, monoid)); - - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, 0)) ; - GB_Global_hack_set (2, 1) ; // hack(2) = 1: always use the GPU - - double tol = 0; - GrB_BinaryOp op = NULL; - GrB_UnaryOp op_abs = NULL ; - - if (type == GrB_BOOL ) op = GrB_EQ_BOOL ; - else if (type == GrB_INT8 ) op = GrB_EQ_INT8 ; - else if (type == GrB_INT16 ) op = GrB_EQ_INT16 ; - else if (type == GrB_INT32 ) op = GrB_EQ_INT32 ; - else if (type == GrB_INT64 ) op = GrB_EQ_INT64 ; - else if (type == GrB_UINT8 ) op = GrB_EQ_UINT8 ; - else if (type == GrB_UINT16) op = GrB_EQ_UINT16 ; - else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; - else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; - else if (type == GrB_FP32 ) - { tol = 1e-6; - op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; - op_abs = GrB_ABS_FP32 ; - } - else if (type == GrB_FP64 ) - { tol = 1e12; - op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; - op_abs = GrB_ABS_FP64 ; - } - else if (type == GxB_FC32 ) - { tol = 2e-6; - op = (tol == 0)? 
GxB_EQ_FC32 : GxB_MINUS_FC32 ; - op_abs = GxB_ABS_FC32 ; - } - else if (type == GxB_FC64 ) - { tol = 2e-12; - op = (tol == 0)? GxB_EQ_FC64 : GxB_MINUS_FC64 ; - op_abs = GxB_ABS_FC64 ; - } - - if(tol == 0) { - EXPECT_EQ( actual , expected); - //std::cout << "results do not match: reduced=" << expected << ", actual=" << actual << std::endl; - //exit(1); - } else if ( (tol > 0) && ( ( type ==GrB_FP32) || ( type ==GxB_FC32) - || ( type ==GrB_FP64) || ( type ==GxB_FC64) ) ){ - EXPECT_LT( abs((double)actual - (double)expected)/(abs((double)expected)+1.e-12), tol) ; - } - - std::cout<< expected<< " " << actual<< "reduce test complete ======================" << std::endl; - GRB_TRY(GrB_Matrix_free(&A)); - - return expected == actual; -} - diff --git a/GraphBLAS/CUDA/test/problem_spec.hpp b/GraphBLAS/CUDA/test/problem_spec.hpp deleted file mode 100644 index c1997771ab..0000000000 --- a/GraphBLAS/CUDA/test/problem_spec.hpp +++ /dev/null @@ -1,129 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/problem_spec.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#pragma once - -#include -#include -#include -#include -#include -#include "GpuTimer.h" -#include "GB_cuda_buckets.h" -#include "../../rmm_wrap/rmm_wrap.h" -#include -#include "test_data.hpp" -extern "C" { -#include "GB.h" -} - -#include "../GB_cuda_common_jitFactory.hpp" -#include "../GB_cuda_mxm_dot3_jitFactory.hpp" -#include "../GB_cuda_reduce_jitFactory.hpp" -#include "dataFactory.hpp" - -template -class mxm_problem_spec { - -public: - mxm_problem_spec(GrB_Monoid monoid_, GrB_BinaryOp binop_, int64_t N_, int64_t Annz_, int64_t Bnnz_, int64_t Cnnz_, - int sparsity_control_A_ = GxB_SPARSE, int sparsity_control_B_ = GxB_SPARSE) : - mysemiring(), binop(binop_), monoid(monoid_), N(N_), - G(N_, N_), Annz(Annz_), Bnnz(Bnnz_), Cnnz(Cnnz_), mask_struct(true), flipxy(false), mask_comp(false) { - - // FIXME: This should be getting set automatically somehow. 
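Note (for context, not part of this patch): the generated gtest suites produced by testGen_cmake.py later in this diff construct this class as follows; the types and sizes here are one representative instantiation (the "tinyxtiny" shape), not a fixed API:

    GrB_Monoid monoid = GrB_PLUS_MONOID_INT32 ;
    GrB_BinaryOp binop = GrB_TIMES_INT32 ;
    mxm_problem_spec<int32_t, int32_t, int32_t, int32_t>
        problem_spec (monoid, binop, 128, 1256, 1028, 1640,
                      GxB_SPARSE, GxB_SPARSE) ;
    test_AxB_dot3_sparse_factory<int32_t, int32_t, int32_t, int32_t,
        int32_t, int32_t, int32_t> (problem_spec) ;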
- float Cnzpercent = (float) Cnnz_/(N_*N_); - - // TODO: Allocate and fill arrays for buckets and nano buckets - G.init_A(Annz_, sparsity_control_A_, GxB_BY_ROW); - G.init_B(Bnnz_, sparsity_control_B_, GxB_BY_ROW); - G.init_C(Cnzpercent); -// G.fill_buckets( TB ); // all elements go to testbucket= TB - - /************************ - * Create mxm factory - */ - auto grb_info = GrB_Semiring_new(&mysemiring, monoid_, binop_); - GRB_TRY (grb_info) ; - GrB_Matrix A = G.getA(); - GrB_Matrix B = G.getB(); - //GRB_TRY (GxB_Matrix_fprint (A, "A", GxB_SHORT_VERBOSE, stdout)) ; - //GRB_TRY (GxB_Matrix_fprint (B, "B", GxB_SHORT_VERBOSE, stdout)) ; - } - - ~mxm_problem_spec() { - - std::cout << "Calling G.del()" << std::endl; - G.del(); - - } - - GrB_Matrix getC(){ return G.getC(); } - GrB_Matrix getM(){ return G.getM(); } - GrB_Matrix getA(){ return G.getA(); } - GrB_Matrix getB(){ return G.getB(); } - - GrB_Monoid getMonoid() { return monoid; } - GrB_BinaryOp getBinaryOp() { return binop; } - - int64_t getN() { return N; } - int64_t getAnnz() { return Annz; } - int64_t getBnnz() { return Bnnz; } - int64_t getCnnz() { return Cnnz; } - - auto &getG() { return G; } - - GB_cuda_mxm_factory &get_mxm_factory() { - - // Lazily create the mxm factory - if(!mymxmfactory.has_value()) { - - mymxmfactory.emplace(GB_cuda_mxm_factory()); - GrB_Matrix C = G.getC(); - GrB_Matrix M = G.getM(); - GrB_Matrix A = G.getA(); - GrB_Matrix B = G.getB(); - - bool C_iso = false ; - int C_sparsity = GB_sparsity (M) ; - GrB_Type ctype = binop->ztype ; - - (*mymxmfactory).mxm_factory ( - C_iso, C_sparsity, ctype, - M, mask_struct, mask_comp, - mysemiring, flipxy, - A, B) ; - } - return *mymxmfactory; - } - GrB_Semiring get_semiring() { return mysemiring; } - - void set_sparsity_control(GrB_Matrix mat, int gxb_sparsity_control, int gxb_format) { - GRB_TRY (GxB_Matrix_Option_set (mat, GxB_SPARSITY_CONTROL, gxb_sparsity_control)) ; - GRB_TRY (GxB_Matrix_Option_set(mat, GxB_FORMAT, gxb_format)); - GRB_TRY (GrB_Matrix_wait (mat, GrB_MATERIALIZE)) ; - } - - bool get_mask_struct() { return mask_struct; } - -private: - - bool mask_struct{false}; - bool flipxy{false}; - bool mask_comp{false}; - - int64_t Annz; - int64_t Bnnz; - int64_t Cnnz; - int64_t N; - GrB_BinaryOp binop; - GrB_Monoid monoid; - GrB_Semiring mysemiring; - std::optional mymxmfactory; - SpGEMM_problem_generator G; -}; diff --git a/GraphBLAS/CUDA/test/run_tests.cpp b/GraphBLAS/CUDA/test/run_tests.cpp deleted file mode 100644 index 2618b9f688..0000000000 --- a/GraphBLAS/CUDA/test/run_tests.cpp +++ /dev/null @@ -1,45 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/run_tests.cpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#include - -#include "GraphBLAS_cuda.h" -#include "rmm_wrap.h" - -#include "test_utility.hpp" - -int main(int argc, char **argv) { - - size_t init_size, max_size, stream_pool_size; - init_size = 256*(1ULL<<10); - max_size = 256*(1ULL<<20); - stream_pool_size = 1; - - printf(" pool init size %ld, max size %ld\n", init_size, max_size); - rmm_wrap_initialize_all_same( rmm_wrap_managed, init_size, max_size, stream_pool_size); - - GRB_TRY (GxB_init (GxB_NONBLOCKING_GPU, - rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, rmm_wrap_free)) ; - - std::cout << "Done initializing graphblas and rmm" << std::endl; - - GRB_TRY 
(GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, 0)) ; - - size_t buff_size = (1ULL<<13)+152; - void *p = (void *)rmm_wrap_allocate( &buff_size ); - - ::testing::InitGoogleTest(&argc, argv); - auto r = RUN_ALL_TESTS(); - - rmm_wrap_deallocate( p, buff_size); - GRB_TRY (GrB_finalize()); - rmm_wrap_finalize(); - std::cout << "Tests complete" << std::endl; - - return r; -} diff --git a/GraphBLAS/CUDA/test/testGen_cmake.py b/GraphBLAS/CUDA/test/testGen_cmake.py deleted file mode 100644 index 52857f0bd0..0000000000 --- a/GraphBLAS/CUDA/test/testGen_cmake.py +++ /dev/null @@ -1,176 +0,0 @@ -#------------------------------------------------------------------------------- -# GraphBLAS/CUDA/test/testGen_cmake.py -#------------------------------------------------------------------------------- - -# SPDX-License-Identifier: Apache-2.0 - -#------------------------------------------------------------------------------- - -# Generate test instances from a large tensor product set of options - -GB_TYPE_PREFIX = "GrB" - -SUPPORTED_TYPES = { - "int32_t": "INT32", - "uint32_t": "UINT32", - "int64_t": "INT64", - "uint64_t": "UINT64", - "bool": "BOOL", - "float": "FP32", - "double": "FP64" -} - -DOT3_BUCKETS = [1, 2] # NBUCKETS, hard-coded - -DataShapes ={ - "nanoxnano": {'N':32, 'Anz':64, 'Bnz':56, 'Cnz': 256}, - "tinyxtiny": {'N':128, 'Anz':1256, 'Bnz':1028, 'Cnz': 1640}, - "smallxsmall": {'N':1024, 'Anz': 65_536, 'Bnz':65_536, 'Cnz': 10000}, - "ti_denxti_den": {'N':32, 'Anz':1024, 'Bnz':1024, 'Cnz': 1024}, - "ti_spaxti_den": {'N':32, 'Anz':256, 'Bnz':1024, 'Cnz': 1024}, - "medxmed": {'N':4096, 'Anz': 2**20, 'Bnz':2**20}, - "largexlarge": {'N':2**16, 'Anz': 64*2**20, 'Bnz':64*2**20} -} - -FORMATS = { "sparse": ["phase1", "phase2", "mxm_sparse"], - "dense": ["dense_phase1", "mxm_dense"], - "sparse_dense": ["dense_phase1", "mxm_sparse_dense"], - "reduce": ["reduce"]} - -FORMAT_INPUTS = { - "sparse": [("GxB_SPARSE", "GxB_SPARSE")], - "dense": [("GxB_FULL", "GxB_FULL"), ("GxB_BITMAP", "GxB_BITMAP")], - "sparse_dense": [("GxB_SPARSE", "GxB_FULL")], - "reduce": [("GxB_SPARSE", "GxB_SPARSE")] -} - -FORMAT_DATASETS = { - "sparse": ["nanoxnano", "tinyxtiny", "smallxsmall"], - "dense": ["ti_denxti_den"], - "sparse_dense": ["ti_spaxti_den"], - "reduce": ["nanoxnano", "smallxsmall", "ti_denxti_den", "ti_spaxti_den"] -} - -def std_type_to_gb_type(t): - return SUPPORTED_TYPES[t] - -def build_gb_monioid(t, m): - # Example: GrB_PLUS_MONIOD_UINT64 - gb_type = std_type_to_gb_type(t) - return f"{GB_TYPE_PREFIX}_{m}_MONOID_{gb_type}" - -def build_gb_binop(t, b): - # Example: GrB_TIMES_UINT64 - gb_type = std_type_to_gb_type(t) - return f"{GB_TYPE_PREFIX}_{b}_{gb_type}" - - - - -def buildTest(ts="TestsuiteName", ds="tiny-tiny", df=("GxB_SPARSE", "GxB_SPARSE"), - SUM="PLUS", PRODUCT="TIMES", - typeC="int32_t",typeM="int32_t", - typeA="int32_t",typeB="int32_t", - type_x="int32_t", type_y="int32_t",type_z="int32_t"): - - # build string interpolation from pieces - format_A, format_B = df - - Test_name = f"{ds}{SUM}_{PRODUCT}__{format_A}_{format_B}__C{typeC}M{typeM}A{typeA}B{typeB}X{type_x}Y{type_y}Z{type_z}" - Test_suite = f"{ts}" - - N = DataShapes[ds]['N'] - Anz = DataShapes[ds]['Anz'] - Bnz = DataShapes[ds]['Bnz'] - Cnz = DataShapes[ds]['Cnz'] - - gb_monoid = build_gb_monioid(typeC, SUM) - gb_binop = build_gb_binop(typeC, PRODUCT) - - TEST_HEAD = f""" - TEST( {Test_suite}, {Test_name}) {{ - - /************************** - * Create reference and input data - */ - GrB_Monoid monoid = {gb_monoid}; - GrB_BinaryOp binop = {gb_binop}; - - 
mxm_problem_spec<{typeC}, {typeM}, {typeA}, {typeB}> problem_spec(monoid, binop, {N}, {Anz}, {Bnz}, {Cnz}, - {format_A}, {format_B}); - """ - phase1_body= f""" test_AxB_phase1_factory< {typeC}, {typeM}, {typeA}, {typeB}>(problem_spec);""" - phase2_body= f""" test_AxB_phase2_factory< {typeC}, {typeM}, {typeA}, {typeB} >(problem_spec);""" - dense_phase1_body = f""" test_AxB_dense_phase1_factory<{typeC}, {typeM}, {typeA}, {typeB}>(problem_spec);""" - mxm_sparse_body = f""" test_AxB_dot3_sparse_factory< {typeC},{typeM},{typeA},{typeB},{type_x},{type_y},{type_z} > (problem_spec);\n""" - mxm_dense_body = f""" test_AxB_dot3_dense_factory< {typeC},{typeM},{typeA},{typeB},{type_x},{type_y},{type_z} > (problem_spec);\n""" - mxm_sparse_dense_body = f""" test_AxB_dot3_sparse_dense_factory< {typeC},{typeM},{typeA},{typeB},{type_x},{type_y},{type_z} > (problem_spec);\n""" - reduce_body = f""" test_reduce_factory<{typeC}, {typeM}, {typeA}, {typeB}>(problem_spec);""" - phasedict = { "phase1": phase1_body, - "phase2": phase2_body, - "mxm_sparse": mxm_sparse_body, - "mxm_dense": mxm_dense_body, - "mxm_sparse_dense": mxm_sparse_dense_body, - "reduce": reduce_body, - "dense_phase1": dense_phase1_body } - - return TEST_HEAD, phasedict - -def load_types(argv): - test_suite_name = argv[2] - Monoids = argv[3].split(";") - Binops = argv[4].split(";") - Semirings = argv[5] - DataTypes = argv[6].split(";") - - # Hard-coding data shapes for now - Kernels= argv[7] - - return argv[1], test_suite_name, Monoids, Binops, Semirings, DataTypes, DataShapes, Kernels - -def write_test_instances_header(test_suite_name, mat_format, tests, Monoids, Binops, Semirings, DataTypes, DataShapes, Kernels): - outfile = f'{test_suite_name}_{Semirings}_{mat_format}_test_instances.hpp' - with open(outfile, 'w') as fp: - fp.write("#pragma once\n#include \"problem_spec.hpp\"\n"); - m, b = Semirings.split("_") - Test_suite = f'{test_suite_name}_tests_{mat_format}_{m}_{b}' - for dtC in DataTypes: - dtX = dtC - dtY = dtC - dtZ = dtC - for dtM in ["bool", "int32_t", "int64_t", "float", "double"]: - for dtA in DataTypes: - for dtB in DataTypes: - for ds in FORMAT_DATASETS[mat_format]: - for df in FORMAT_INPUTS[mat_format]: - TEST_HEAD, TEST_BODY = buildTest( Test_suite, ds, df, m, b, - dtC, dtM, dtA, dtB, dtX, dtY, dtZ) - fp.write( TEST_HEAD) - for test in tests: - fp.write( TEST_BODY[test] ) - fp.write( "}\n") - -def write_cuda_test(source_dir, test_suite_name, mat_format, semiring, kernel): - import shutil - - shutil.copy(f"{source_dir}/test/cuda_tests_template.cpp", f"{test_suite_name}_{semiring}_{mat_format}_cuda_tests.cpp") - - with open(f"{test_suite_name}_{semiring}_{mat_format}_cuda_tests.cpp", "a") as file_object: - # Keeping this as a separate file for now to allow for further nesting - # of test instances for each test_suite_name - file_object.write(f"\n#include \"{test_suite_name}_{semiring}_{mat_format}_test_instances.hpp\"") - -if __name__ == "__main__": - import sys - - if(len(sys.argv) != 8): - raise ValueError("Expected 7 arguments but only got %s" % len(sys.argv)) - - """ - First load values - """ - source_dir, test_suite_name, Monoids, Binops, Semirings, DataTypes, DataShapes, Kernels = load_types(sys.argv) - - for mat_format, tests in FORMATS.items(): - write_test_instances_header(test_suite_name, mat_format, tests, Monoids, Binops, Semirings, DataTypes, DataShapes, DOT3_BUCKETS) - write_cuda_test(source_dir, test_suite_name, mat_format, Semirings, Kernels) diff --git a/GraphBLAS/CUDA/test/test_data.hpp 
b/GraphBLAS/CUDA/test/test_data.hpp deleted file mode 100644 index d6cca87d0e..0000000000 --- a/GraphBLAS/CUDA/test/test_data.hpp +++ /dev/null @@ -1,113 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/test_data.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#include -#include - -#pragma once - -template -class TestData { - -public: - TestData( std::vector A_indptr_, - std::vector A_indices_, - std::vector A_data_, - - std::vector B_indptr_, - std::vector B_indices_, - std::vector B_data_, - - - std::vector C_indptr_, - std::vector C_indices_, - std::vector C_data_, - - std::vector M_indptr_, - std::vector M_indices_, - std::vector M_data_): - A_indptr(A_indptr_), A_indices(A_indices_), A_data(A_data_), - B_indptr(B_indptr_), B_indices(B_indices_), B_data(B_data_), - C_indptr(C_indptr_), C_indices(C_indices_), C_data(C_data_), - M_indptr(M_indptr_), M_indices(M_indices_), M_data(M_data_){} - - - std::vector A_indptr; - std::vector A_indices; - std::vector A_data; - - std::vector B_indptr; - std::vector B_indices; - std::vector B_data; - - - std::vector C_indptr; - std::vector C_indices; - std::vector C_data; - - std::vector M_indptr; - std::vector M_indices; - std::vector M_data; - -}; - -template -std::unique_ptr> make_karate_tricount() { - - std::vector A_indptr = { 0,16,24,32,35,37,40,41,41,44,45,45,45,45,46,48,50,50,50,52,53,55,55,57, - 62,65,66,68,69,71,73,75,77,78,78}; - std::vector A_indices = { 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,17,19,21,31, 2, 3, 7,13,17,19,21,30, - 3, 7, 8, 9,13,27,28,32, 7,12,13, 6,10, 6,10,16,16,30,32,33,33,33,32,33, - 32,33,32,33,33,32,33,32,33,25,27,29,32,33,25,27,31,31,29,33,33,31,33,32, - 33,32,33,32,33,33}; - std::vector A_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1}; - - std::vector B_indptr = { 0, 0, 1, 3, 6, 7, 8,11,15,17,18,21,22,24,28,28,28,30,32,32,34,34,36,36, - 36,36,38,38,41,42,44,46,50,61,78}; - std::vector B_indices = { 0, 0, 1, 0, 1, 2, 0, 0, 0, 4, 5, 0, 1, 2, 3, 0, 2, 2, 0, 4, 5, 0, 0, 3, - 0, 1, 2, 3, 5, 6, 0, 1, 0, 1, 0, 1,23,24, 2,23,24, 2,23,26, 1, 8, 0,24, - 25,28, 2, 8,14,15,18,20,22,23,29,30,31, 8, 9,13,14,15,18,19,20,22,23,26, - 27,28,29,30,31,32}; - std::vector B_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1}; - - std::vector M_indptr = { 0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, - 82, 84, 87, 89, 91, 93, 98,101,104,106,110,113,117,121,127,139,156}; - std::vector M_indices = { 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,17,19,21,31, 0, 2, 3, 7,13,17,19,21, - 30, 0, 1, 3, 7, 8, 9,13,27,28,32, 0, 1, 2, 7,12,13, 0, 6,10, 0, 6,10,16, - 0, 4, 5,16, 0, 1, 2, 3, 0, 2,30,32,33, 2,33, 0, 4, 5, 0, 0, 3, 0, 1, 2, - 3,33,32,33,32,33, 5, 6, 0, 1,32,33, 0, 1,33,32,33, 0, 1,32,33,25,27,29, - 32,33,25,27,31,23,24,31,29,33, 2,23,24,33, 2,31,33,23,26,32,33, 1, 8,32, - 33, 0,24,25,28,32,33, 2, 8,14,15,18,20,22,23,29,30,31,33, 8, 9,13,14,15, - 18,19,20,22,23,26,27,28,29,30,31,32}; - std::vector M_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1}; - - std::vector C_indptr = { 0, 0, 7,12,17,19,21,24,27,29,29,31,31,32,35,35,35,36,37,37,38,38,39,39, - 39,39,40,40,41,41,43,45,47,51,56}; - std::vector C_indices = { 2, 3, 7,13,17,19,21, 1, 3, 7, 8,13, 1, 2, 7,12,13, 6,10, 6,10, 4, 5,16, - 1, 2, 3, 2,32, 4, 5, 3, 1, 2, 3, 6, 1, 1, 1,31,33,32,33,32,33,25,33, 8, - 29,30,33,27,29,30,31,32}; - std::vector C_data = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 3, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, - 1, 1,10, 1, 2, 1, 1,10}; - - TestData karate_tricount(A_indptr, A_indices, A_data, - B_indptr, B_indices, B_data, - C_indptr, C_indices, C_data, - M_indptr, M_indices, M_data); - - return std::make_unique>(karate_tricount); -} - diff --git a/GraphBLAS/CUDA/test/test_jitify.cpp b/GraphBLAS/CUDA/test/test_jitify.cpp deleted file mode 100644 index 30aab0adc2..0000000000 --- a/GraphBLAS/CUDA/test/test_jitify.cpp +++ /dev/null @@ -1,51 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/test_jitify.cpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#include "jitify.hpp" -#include "GB_cuda_jitify_launcher.h" - -int main(int argc, char **argv) { - -#if 0 - -BROKEN - - std::string named_program = "GB_jit_AxB_phase2"; - std::string kern_name = "AxB_phase2"; - - - jitify::experimental::Program& program = *std::get<1>(named_program); - auto instantiated_kernel = program.kernel(kern_name).instantiate({}); - - // hashable name is program name - // string to be jitted is the actual prgram - // - - dim3 grid(1); - dim3 block(1); - -// std::cout<< kernel_name<<" with types " <mat, cnz); - - -#endif - -} diff --git a/GraphBLAS/CUDA/test/test_utility.hpp b/GraphBLAS/CUDA/test/test_utility.hpp deleted file mode 100644 index 3eb6ba942e..0000000000 --- a/GraphBLAS/CUDA/test/test_utility.hpp +++ /dev/null @@ -1,22 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/test_utility.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#pragma once - -// try calling a GrB_method and check the result -#define GRB_TRY(GrB_method) \ -{ \ - GrB_Info GB_info_result = GrB_method ; \ - if (GB_info_result < GrB_SUCCESS) \ - { \ - printf ("test failure: file %s line %d status %d\n", \ - __FILE__, __LINE__, GB_info_result) ; \ - exit (EXIT_FAILURE) ; \ - } \ -} - diff --git a/GraphBLAS/CUDA/GB_cuda_cumsum.cu b/GraphBLAS/CUDA/unused/GB_cuda_cumsum.cu similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_cumsum.cu rename to GraphBLAS/CUDA/unused/GB_cuda_cumsum.cu index da6b32f504..8ed9726afa 100644 --- a/GraphBLAS/CUDA/GB_cuda_cumsum.cu +++ b/GraphBLAS/CUDA/unused/GB_cuda_cumsum.cu @@ -18,7 +18,7 @@ // sum (count [0..j-1]). count [n] is implicitly zero on input. // On output, count [n] is the total sum. 
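Note (sketch, not part of this patch): the renamed, now-unused GB_cuda_cumsum.cu keeps the contract stated in the comment above; a minimal host-side illustration of that contract:

    // on input, count [0..n-1] holds counts and count [n] is implicitly
    // zero; on output, count [j] = sum (count [0..j-1]) and count [n]
    // holds the total sum
    static void cumsum (int64_t *count, int64_t n)
    {
        int64_t s = 0 ;
        for (int64_t j = 0 ; j <= n ; j++)
        {
            int64_t c = (j < n) ? count [j] : 0 ;
            count [j] = s ;
            s += c ;
        }
    }
    // example: with n = 4, count = {3,1,0,2,*} becomes {0,3,4,4,6}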
-#include "GB_cuda.h" +#include "GB_cuda.hpp" // #include #include diff --git a/GraphBLAS/CUDA/unused/GB_search_for_vector_device.cuh b/GraphBLAS/CUDA/unused/GB_search_for_vector_device.cuh new file mode 100644 index 0000000000..6384c1c840 --- /dev/null +++ b/GraphBLAS/CUDA/unused/GB_search_for_vector_device.cuh @@ -0,0 +1,69 @@ +//------------------------------------------------------------------------------ +// GB_search_for_vector_device +//------------------------------------------------------------------------------ + +static __device__ __inline__ int64_t GB_search_for_vector_device +( + const int64_t p, // search for vector k that contains p + const int64_t *restrict Ap, // vector pointers to search + int64_t kleft, // left-most k to search + int64_t anvec, // Ap is of size anvec+1 + int64_t avlen // A->vlen +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + if (Ap == NULL) + { + // A is full or bitmap + ASSERT (p >= 0 && p < avlen * anvec) ; + return ((avlen == 0) ? 0 : (p / avlen)) ; + } + + // A is sparse or hypersparse + ASSERT (p >= 0 && p < Ap [anvec]) ; + + //-------------------------------------------------------------------------- + // search for k + //-------------------------------------------------------------------------- + + int64_t k = kleft ; + int64_t kright = anvec ; + bool found ; + GB_SPLIT_BINARY_SEARCH (p, Ap, k, kright, found) ; + + // FIXME: this is not needed if the search is approximate: + if (found) + { + // Ap [k] == p has been found, but if k is an empty vector, then the + // next vector will also contain the entry p. In that case, k needs to + // be incremented until finding the first non-empty vector for which + // Ap [k] == p. + ASSERT (Ap [k] == p) ; + while (k < anvec-1 && Ap [k+1] == p) + { + k++ ; + } + } + else + { + // p has not been found in Ap, so it appears in the middle of Ap [k-1] + // ... Ap [k], as computed by the binary search. This is the range of + // entries for the vector k-1, so k must be decremented. + k-- ; + } + + //-------------------------------------------------------------------------- + // return result + //-------------------------------------------------------------------------- + + // The entry p must reside in a non-empty vector. + ASSERT (k >= 0 && k < anvec) ; + ASSERT (Ap [k] <= p && p < Ap [k+1]) ; + + return (k) ; +} + diff --git a/GraphBLAS/Config/GB_config.h.in b/GraphBLAS/Config/GB_config.h.in index 3585ce4c20..513cc40f94 100644 --- a/GraphBLAS/Config/GB_config.h.in +++ b/GraphBLAS/Config/GB_config.h.in @@ -21,7 +21,7 @@ // GB_C_FLAGS: the C compiler flags used to compile GraphBLAS. 
Used // for compiling and linking: #ifndef GB_C_FLAGS -#define GB_C_FLAGS "@GB_C_FLAGS@" +#define GB_C_FLAGS "@GB_C_FLAGS@ @GB_OPENMP_C_FLAGS@" #endif // GB_C_LINK_FLAGS: the flags passed to the C compiler for the link phase: @@ -49,7 +49,7 @@ #define GB_OMP_INC "@GB_OMP_INC@" #endif -// GB_OMP_INC_DIRS: include directories OpenMP, if in use by GraphBLAS, +// GB_OMP_INC_DIRS: include directories for OpenMP, if in use by GraphBLAS, // for cmake: #ifndef GB_OMP_INC_DIRS #define GB_OMP_INC_DIRS "@GB_OMP_INC_DIRS@" @@ -65,5 +65,25 @@ #define GB_CMAKE_LIBRARIES "@GB_CMAKE_LIBRARIES@" #endif +// GB_CUDA_COMPILER: the CUDA compiler to compile CUDA JIT kernels: +#ifndef GB_CUDA_COMPILER +#define GB_CUDA_COMPILER "@GB_CUDA_COMPILER@" +#endif + +// GB_CUDA_FLAGS: the CUDA flags to compile CUDA JIT kernels: +#ifndef GB_CUDA_FLAGS +#define GB_CUDA_FLAGS "@GB_CUDA_FLAGS@" +#endif + +// GB_CUDA_INC: -I includes for CUDA JIT kernels: +#ifndef GB_CUDA_INC +#define GB_CUDA_INC "@GB_CUDA_INC@" +#endif + +// GB_CUDA_ARCHITECTURES: the CUDA ARCHITECTURES for CUDA JIT kernels: +#ifndef GB_CUDA_ARCHITECTURES +#define GB_CUDA_ARCHITECTURES "@GB_CUDA_ARCHITECTURES@" +#endif + #endif diff --git a/GraphBLAS/Config/GraphBLAS.h.in b/GraphBLAS/Config/GraphBLAS.h.in index e01f63c2c3..0dbdd56fe8 100644 --- a/GraphBLAS/Config/GraphBLAS.h.in +++ b/GraphBLAS/Config/GraphBLAS.h.in @@ -279,7 +279,7 @@ // The 'spec' string describes the GraphBLAS spec: #define GxB_SPEC_ABOUT \ "GraphBLAS C API, by Benjamin Brock, Aydin Buluc, Raye Kimmerer,\n" \ -"Jim Kitchen, Major Kumar, Timothy Mattson, Scott McMillan, Jose' Moreira,\n" \ +"Jim Kitchen, Manoj Kumar, Timothy Mattson, Scott McMillan, Jose' Moreira,\n" \ "Erik Welch, and Carl Yang. Based on 'GraphBLAS Mathematics by Jeremy\n" \ "Kepner. See also 'Graph Algorithms in the Language of Linear Algebra,'\n" \ "edited by J. Kepner and J. Gilbert, SIAM, 2011.\n" @@ -3772,6 +3772,8 @@ typedef enum // for global options or matrix options GxB_JIT_USE_CMAKE = 7032, // CPU JIT: use cmake or direct compile GxB_JIT_ERROR_LOG = 7033, // CPU JIT: error log file + GxB_JIT_CUDA_PREFACE = 7100, // CUDA JIT C++ preface + //------------------------------------------------------------ // GrB_get for GrB_Matrix: //------------------------------------------------------------ @@ -3973,7 +3975,7 @@ GrB_Info GxB_Context_get (GxB_Context, GxB_Context_Field, ...) ; _Generic \ ( \ (arg1), \ - default : GxB_Global_Option_set , \ + default: GxB_Global_Option_set , \ GxB_Option_Field : GxB_Global_Option_set , \ GrB_Vector : GxB_Vector_Option_set , \ GrB_Matrix : GxB_Matrix_Option_set , \ @@ -3986,7 +3988,7 @@ GrB_Info GxB_Context_get (GxB_Context, GxB_Context_Field, ...) 
; _Generic \ ( \ (arg1), \ - default : GxB_Global_Option_get , \ + default: GxB_Global_Option_get , \ GxB_Option_Field : GxB_Global_Option_get , \ GrB_Vector : GxB_Vector_Option_get , \ GrB_Matrix : GxB_Matrix_Option_get , \ diff --git a/GraphBLAS/Demo/Program/gauss_demo.c b/GraphBLAS/Demo/Program/gauss_demo.c index 3411a31a8a..b32ec76b06 100644 --- a/GraphBLAS/Demo/Program/gauss_demo.c +++ b/GraphBLAS/Demo/Program/gauss_demo.c @@ -7,7 +7,7 @@ //------------------------------------------------------------------------------ -#include "GraphBLAS.h" +#include "graphblas_demos.h" #undef I //------------------------------------------------------------------------------ diff --git a/GraphBLAS/Demo/Program/reduce_demo.c b/GraphBLAS/Demo/Program/reduce_demo.c index 492b51423d..303cca39a4 100644 --- a/GraphBLAS/Demo/Program/reduce_demo.c +++ b/GraphBLAS/Demo/Program/reduce_demo.c @@ -7,7 +7,7 @@ //------------------------------------------------------------------------------ -#include "GraphBLAS.h" +#include "graphblas_demos.h" #undef I #if defined ( _OPENMP ) #include diff --git a/GraphBLAS/Demo/Program/wildtype_demo.c b/GraphBLAS/Demo/Program/wildtype_demo.c index 8eb761ee95..36e312746e 100644 --- a/GraphBLAS/Demo/Program/wildtype_demo.c +++ b/GraphBLAS/Demo/Program/wildtype_demo.c @@ -10,7 +10,7 @@ // Each "scalar" entry of this type consists of a 4x4 matrix and a string of // length 64. -#include "GraphBLAS.h" +#include "graphblas_demos.h" #undef I #if defined __INTEL_COMPILER @@ -196,13 +196,7 @@ int main (void) { // start GraphBLAS - #if 1 GrB_init (GrB_NONBLOCKING) ; - #else - GxB_init (GxB_NONBLOCKING_GPU, NULL, NULL, NULL, NULL, NULL) ; - GxB_set (GxB_GPU_ID, 0) ; - GB_Global_hack_set (2, 1) ; // always use the GPU - #endif GxB_Global_Option_set (GxB_BURBLE, true) ; int nthreads ; diff --git a/GraphBLAS/Doc/ChangeLog b/GraphBLAS/Doc/ChangeLog index 15e87e79d8..e1ef26ee59 100644 --- a/GraphBLAS/Doc/ChangeLog +++ b/GraphBLAS/Doc/ChangeLog @@ -1,4 +1,4 @@ -Feb XX, 2024: version 9.1.0 +Mar 22, 2024: version 9.1.0 * minor updates to build system * C11 complex type detection: this is now detected and configured by @@ -9,6 +9,18 @@ Feb XX, 2024: version 9.1.0 GraphBLAS.h to indicate which kind of complex data types are available in C11 or MSVC. Contributed by Markus Mützel. * port to clang-cl: fixing the GxB_get and GxB_set macro + * (53) bug fix: eWiseAdd C=A+B when M, A, and B are all hypersparse; + access to M was incorrect (also affects C+=T for any operation, if + M and T are both hypersparse). + +Mar 1, 2024: version 9.0.3 + + * (52) performance bug fix: JIT kernels since v8.3.1 were not compiled with + OpenMP. + +Feb 26, 2024: version 9.0.2 + + * (51) bug fix: GraphBLAS/Makefile "make static" was incorrect. Jan 20, 2024: version 9.0.1 diff --git a/GraphBLAS/Doc/FUTURE.txt b/GraphBLAS/Doc/FUTURE.txt index 7327aa3ccf..79dd37efa8 100644 --- a/GraphBLAS/Doc/FUTURE.txt +++ b/GraphBLAS/Doc/FUTURE.txt @@ -12,9 +12,13 @@ CUDA: Future features: + cumulative sum (or other monoid) + pack/unpack COO kernel fusion CUDA kernels + distributed framework + fine-grain parallelism for dot-product based mxm, mxv, vxm, then add GxB_vxvt (outer product) and GxB_vtxv (inner product) (or call them GxB_outerProduct and GxB_innerProduct?) 
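Note (cross-reference, not part of this patch): bug (53) above is the aliasing error fixed later in this diff, in GB_add_phase0.c, where the hyper-hash index and value arrays of a hypersparse mask M were both read from the pointer array M->Y->p, so mask lookups consulted the wrong arrays:

    // before: M_Yi and M_Yx both alias the hyper_hash pointer array
    //   const int64_t *restrict M_Yi = (M->Y == NULL) ? NULL : M->Y->p ;
    //   const int64_t *restrict M_Yx = (M->Y == NULL) ? NULL : M->Y->p ;
    // after: indices come from M->Y->i and values from M->Y->x
    //   const int64_t *restrict M_Yi = (M->Y == NULL) ? NULL : M->Y->i ;
    //   const int64_t *restrict M_Yx = (M->Y == NULL) ? NULL : M->Y->x ;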
@@ -40,3 +44,21 @@ Future features:
     GrB_set (op, GxB_DEFN, "string"
     also for all ops
 
+    candidates for kernel fusion:
+    * triangle counting: mxm then reduce to scalar
+    * lcc: mxm then reduce to vector
+    * FusedMM: see https://arxiv.org/pdf/2011.06391.pdf
+
+    more:
+    * consider algorithms where fusion can occur
+    * performance monitor, or revised burble, to detect generic cases
+    * check if vectorization of GrB_mxm is effective when using clang
+    * see how HNSW vector search could be implemented in GraphBLAS
+
+CUDA JIT:
+
+    https://developer.nvidia.com/blog/cuda-12-0-compiler-support-for-runtime-lto-using-nvjitlink-library/
+    Developer webpage talking about ways to do nvJit with link time
+    optimization using CUDA 12.0.  Shows precompiled path and JIT path to
+    generate kernels.
+
diff --git a/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf b/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf
index 2560e6e7aa..15a176c960 100644
Binary files a/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf and b/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf differ
diff --git a/GraphBLAS/Doc/GraphBLAS_UserGuide.tex b/GraphBLAS/Doc/GraphBLAS_UserGuide.tex
index 07f87969a8..598b4c7484 100644
--- a/GraphBLAS/Doc/GraphBLAS_UserGuide.tex
+++ b/GraphBLAS/Doc/GraphBLAS_UserGuide.tex
@@ -14768,7 +14768,7 @@ \section{Release Notes}
 \begin{itemize}
-\item Feb XX, 2024: version 9.1.0 % FIXME for SuiteSparse 7.7.0
+\item Mar 22, 2024: version 9.1.0
     \begin{itemize}
     \item minor updates to build system
@@ -14780,6 +14780,23 @@ \section{Release Notes}
     \verb'GxB_HAVE_COMPLEX*' to GraphBLAS.h to indicate which kind of
     complex data types are available in C11 or MSVC.  Contributed by
     Markus M\"{u}tzel.
+    \item (53) bug fix: eWiseAdd \verb'C=A+B' when \verb'M', \verb'A',
+    and \verb'B' are all hypersparse; access to \verb'M' was incorrect
+    (also affects \verb'C+=T' for any operation, if \verb'M' and
+    \verb'T' are both hypersparse).
+    \end{itemize}
+
+\item Mar 1, 2024: version 9.0.3
+
+    \begin{itemize}
+    \item (52) performance bug fix: JIT kernels since v8.3.1 were not compiled
+    with OpenMP.
+    \end{itemize}
+
+\item Feb 26, 2024: version 9.0.2
+
+    \begin{itemize}
+    \item GraphBLAS/Makefile \verb"make static" was incorrect.
\end{itemize} \item Jan 20, 2024: version 9.0.1 diff --git a/GraphBLAS/Doc/GraphBLAS_version.tex b/GraphBLAS/Doc/GraphBLAS_version.tex index e970a35c25..c4b9b9002b 100644 --- a/GraphBLAS/Doc/GraphBLAS_version.tex +++ b/GraphBLAS/Doc/GraphBLAS_version.tex @@ -1,5 +1,5 @@ % version of SuiteSparse:GraphBLAS \date{VERSION 9.1.0, -Feb XX, 2024} +Mar 22, 2024} diff --git a/GraphBLAS/GraphBLAS/CMakeLists.txt b/GraphBLAS/GraphBLAS/CMakeLists.txt index e69ed35478..51156a96ea 100644 --- a/GraphBLAS/GraphBLAS/CMakeLists.txt +++ b/GraphBLAS/GraphBLAS/CMakeLists.txt @@ -42,6 +42,7 @@ include ( SuiteSparsePolicy ) # option ( GRAPHBLAS_USE_CUDA "ON (default): enable CUDA acceleration for GraphBLAS, OFF: do not use CUDA" ${SUITESPARSE_USE_CUDA} ) set ( GRAPHBLAS_HAS_CUDA OFF ) +message ( STATUS "GraphBLAS CUDA JIT: disabled for MATLAB" ) # check for strict usage if ( SUITESPARSE_USE_STRICT AND GRAPHBLAS_USE_CUDA AND NOT GRAPHBLAS_HAS_CUDA ) diff --git a/GraphBLAS/GraphBLAS/rename/GB_rename.h b/GraphBLAS/GraphBLAS/rename/GB_rename.h index a2a7b272ee..d3362f4ee5 100644 --- a/GraphBLAS/GraphBLAS/rename/GB_rename.h +++ b/GraphBLAS/GraphBLAS/rename/GB_rename.h @@ -535,6 +535,7 @@ #define GB_jitifyer_get_C_link_flags GM_jitifyer_get_C_link_flags #define GB_jitifyer_get_control GM_jitifyer_get_control #define GB_jitifyer_get_C_preface GM_jitifyer_get_C_preface +#define GB_jitifyer_get_CUDA_preface GM_jitifyer_get_CUDA_preface #define GB_jitifyer_get_error_log GM_jitifyer_get_error_log #define GB_jitifyer_get_use_cmake GM_jitifyer_get_use_cmake #define GB_jitifyer_hash_encoding GM_jitifyer_hash_encoding @@ -544,6 +545,7 @@ #define GB_jitifyer_load GM_jitifyer_load #define GB_jitifyer_load_worker GM_jitifyer_load_worker #define GB_jitifyer_lookup GM_jitifyer_lookup +#define GB_jitifyer_nvcc_compile GM_jitifyer_nvcc_compile #define GB_jitifyer_path_256 GM_jitifyer_path_256 #define GB_jitifyer_query GM_jitifyer_query #define GB_jitifyer_set_cache_path GM_jitifyer_set_cache_path @@ -561,6 +563,8 @@ #define GB_jitifyer_set_control GM_jitifyer_set_control #define GB_jitifyer_set_C_preface GM_jitifyer_set_C_preface #define GB_jitifyer_set_C_preface_worker GM_jitifyer_set_C_preface_worker +#define GB_jitifyer_set_CUDA_preface GM_jitifyer_set_CUDA_preface +#define GB_jitifyer_set_CUDA_preface_worker GM_jitifyer_set_CUDA_preface_worker #define GB_jitifyer_set_error_log GM_jitifyer_set_error_log #define GB_jitifyer_set_error_log_worker GM_jitifyer_set_error_log_worker #define GB_jitifyer_set_use_cmake GM_jitifyer_set_use_cmake @@ -696,8 +700,27 @@ #define GB_JITpackage_214 GM_JITpackage_214 #define GB_JITpackage_215 GM_JITpackage_215 #define GB_JITpackage_216 GM_JITpackage_216 +#define GB_JITpackage_217 GM_JITpackage_217 +#define GB_JITpackage_218 GM_JITpackage_218 +#define GB_JITpackage_219 GM_JITpackage_219 #define GB_JITpackage_21 GM_JITpackage_21 +#define GB_JITpackage_220 GM_JITpackage_220 +#define GB_JITpackage_221 GM_JITpackage_221 +#define GB_JITpackage_222 GM_JITpackage_222 +#define GB_JITpackage_223 GM_JITpackage_223 +#define GB_JITpackage_224 GM_JITpackage_224 +#define GB_JITpackage_225 GM_JITpackage_225 +#define GB_JITpackage_226 GM_JITpackage_226 +#define GB_JITpackage_227 GM_JITpackage_227 +#define GB_JITpackage_228 GM_JITpackage_228 +#define GB_JITpackage_229 GM_JITpackage_229 #define GB_JITpackage_22 GM_JITpackage_22 +#define GB_JITpackage_230 GM_JITpackage_230 +#define GB_JITpackage_231 GM_JITpackage_231 +#define GB_JITpackage_232 GM_JITpackage_232 +#define GB_JITpackage_233 GM_JITpackage_233 +#define 
GB_JITpackage_234 GM_JITpackage_234 +#define GB_JITpackage_235 GM_JITpackage_235 #define GB_JITpackage_23 GM_JITpackage_23 #define GB_JITpackage_24 GM_JITpackage_24 #define GB_JITpackage_25 GM_JITpackage_25 @@ -885,6 +908,7 @@ #define GB_macrofy_input GM_macrofy_input #define GB_macrofy_mask GM_macrofy_mask #define GB_macrofy_monoid GM_macrofy_monoid +#define GB_macrofy_multadd GM_macrofy_multadd #define GB_macrofy_mxm GM_macrofy_mxm #define GB_macrofy_name GM_macrofy_name #define GB_macrofy_nvals GM_macrofy_nvals diff --git a/GraphBLAS/Include/GraphBLAS.h b/GraphBLAS/Include/GraphBLAS.h index 6659650a9a..680ebdfd92 100644 --- a/GraphBLAS/Include/GraphBLAS.h +++ b/GraphBLAS/Include/GraphBLAS.h @@ -234,7 +234,7 @@ // The version of this implementation, and the GraphBLAS API version: #define GxB_IMPLEMENTATION_NAME "SuiteSparse:GraphBLAS" -#define GxB_IMPLEMENTATION_DATE "Feb XX, 2024" +#define GxB_IMPLEMENTATION_DATE "Mar 22, 2024" #define GxB_IMPLEMENTATION_MAJOR 9 #define GxB_IMPLEMENTATION_MINOR 1 #define GxB_IMPLEMENTATION_SUB 0 @@ -279,7 +279,7 @@ // The 'spec' string describes the GraphBLAS spec: #define GxB_SPEC_ABOUT \ "GraphBLAS C API, by Benjamin Brock, Aydin Buluc, Raye Kimmerer,\n" \ -"Jim Kitchen, Major Kumar, Timothy Mattson, Scott McMillan, Jose' Moreira,\n" \ +"Jim Kitchen, Manoj Kumar, Timothy Mattson, Scott McMillan, Jose' Moreira,\n" \ "Erik Welch, and Carl Yang. Based on 'GraphBLAS Mathematics by Jeremy\n" \ "Kepner. See also 'Graph Algorithms in the Language of Linear Algebra,'\n" \ "edited by J. Kepner and J. Gilbert, SIAM, 2011.\n" @@ -3772,6 +3772,8 @@ typedef enum // for global options or matrix options GxB_JIT_USE_CMAKE = 7032, // CPU JIT: use cmake or direct compile GxB_JIT_ERROR_LOG = 7033, // CPU JIT: error log file + GxB_JIT_CUDA_PREFACE = 7100, // CUDA JIT C++ preface + //------------------------------------------------------------ // GrB_get for GrB_Matrix: //------------------------------------------------------------ @@ -3973,7 +3975,7 @@ GrB_Info GxB_Context_get (GxB_Context, GxB_Context_Field, ...) ; _Generic \ ( \ (arg1), \ - default : GxB_Global_Option_set , \ + default: GxB_Global_Option_set , \ GxB_Option_Field : GxB_Global_Option_set , \ GrB_Vector : GxB_Vector_Option_set , \ GrB_Matrix : GxB_Matrix_Option_set , \ @@ -3986,7 +3988,7 @@ GrB_Info GxB_Context_get (GxB_Context, GxB_Context_Field, ...) ; _Generic \ ( \ (arg1), \ - default : GxB_Global_Option_get , \ + default: GxB_Global_Option_get , \ GxB_Option_Field : GxB_Global_Option_get , \ GrB_Vector : GxB_Vector_Option_get , \ GrB_Matrix : GxB_Matrix_Option_get , \ diff --git a/GraphBLAS/JITpackage/CMakeLists.txt b/GraphBLAS/JITpackage/CMakeLists.txt index 3b5d11ab68..3d3dab7563 100644 --- a/GraphBLAS/JITpackage/CMakeLists.txt +++ b/GraphBLAS/JITpackage/CMakeLists.txt @@ -78,8 +78,8 @@ if ( TARGET grb_jitpackage ) "../Include/GraphBLAS.h" "../Source/Template/*.[ch]" "../Source/JitKernels/*.[ch]" - "../CUDA/Template/*h" - "../CUDA/JitKernels/*h" + "../CUDA/Template/*" + "../CUDA/JitKernels/*" "../Source/Shared/*.h" ) add_custom_command ( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/GB_JITpackage.c diff --git a/GraphBLAS/Makefile b/GraphBLAS/Makefile index ff91ca7266..f1d5ecef93 100644 --- a/GraphBLAS/Makefile +++ b/GraphBLAS/Makefile @@ -80,7 +80,7 @@ setup: # build the static library static: - ( cd build && cmake $(CMAKE_OPTIONS) -DNSTATIC=0 .. && cmake --build . --config Release -j$(JOBS) ) + ( cd build && cmake $(CMAKE_OPTIONS) -DBUILD_STATIC_LIBS=ON -DBUILD_SHARED_LIBS=OFF .. && cmake --build . 
--config Release -j$(JOBS) ) # installs GraphBLAS to the install location defined by cmake, usually # /usr/local/lib and /usr/local/include diff --git a/GraphBLAS/README.md b/GraphBLAS/README.md index cbb8760b20..b2ce94c06e 100644 --- a/GraphBLAS/README.md +++ b/GraphBLAS/README.md @@ -4,7 +4,7 @@ SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. SPDX-License-Identifier: Apache-2.0 -VERSION 9.1.0, Feb XX, 2024 +VERSION 9.1.0, Mar 22, 2024 SuiteSparse:GraphBLAS is a complete implementation of the GraphBLAS standard, which defines a set of sparse matrix operations on an extended algebra of diff --git a/GraphBLAS/Source/Factories/GB_search_for_vector_template.c b/GraphBLAS/Source/Factories/GB_search_for_vector_template.c index 4a8ff482d1..fd7d8588ba 100644 --- a/GraphBLAS/Source/Factories/GB_search_for_vector_template.c +++ b/GraphBLAS/Source/Factories/GB_search_for_vector_template.c @@ -18,13 +18,11 @@ #ifdef GB_CUDA_KERNEL __device__ -static inline int64_t GB_search_for_vector_device -#else -static inline int64_t GB_search_for_vector // return vector k that contains p #endif +static inline int64_t GB_search_for_vector // return vector k that contains p ( const int64_t p, // search for vector k that contains p - const int64_t *restrict Ap, // vector pointers to search + const int64_t *restrict Ap, // vector pointers to search int64_t kleft, // left-most k to search int64_t anvec, // Ap is of size anvec+1 int64_t avlen // A->vlen @@ -42,7 +40,7 @@ static inline int64_t GB_search_for_vector // return vector k that contains p return ((avlen == 0) ? 0 : (p / avlen)) ; } - // A is sparse + // A is sparse or hypersparse ASSERT (p >= 0 && p < Ap [anvec]) ; //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Source/Factories/GB_twotype_factory.c b/GraphBLAS/Source/Factories/GB_twotype_factory.c index 3548761d68..d3cbeeec90 100644 --- a/GraphBLAS/Source/Factories/GB_twotype_factory.c +++ b/GraphBLAS/Source/Factories/GB_twotype_factory.c @@ -36,11 +36,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _bool, bool, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _bool, bool, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _bool, bool, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _bool, bool, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _bool, bool, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -62,11 +59,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _int8, int8_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _int8, int8_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _int8, int8_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _int8, int8_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _int8, int8_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -88,11 +82,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _int16, int16_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _int16, int16_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _int16, int16_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _int16, int16_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _int16, int16_t, _fc64, GxB_FC64_t) - 
#endif default: ; } break ; @@ -114,11 +105,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _int32, int32_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _int32, int32_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _int32, int32_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _int32, int32_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _int32, int32_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -140,11 +128,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _int64, int64_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _int64, int64_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _int64, int64_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _int64, int64_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _int64, int64_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -166,11 +151,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _uint8, uint8_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _uint8, uint8_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _uint8, uint8_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _uint8, uint8_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _uint8, uint8_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -192,11 +174,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _uint16, uint16_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _uint16, uint16_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _uint16, uint16_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _uint16, uint16_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _uint16, uint16_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -218,11 +197,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _uint32, uint32_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _uint32, uint32_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _uint32, uint32_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _uint32, uint32_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _uint32, uint32_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -244,11 +220,8 @@ switch (code1) #endif case GB_FP32_code : GB_WORKER (GB_OPNAME, _uint64, uint64_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _uint64, uint64_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _uint64, uint64_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _uint64, uint64_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -270,11 +243,8 @@ switch (code1) case GB_FP32_code : GB_WORKER (GB_OPNAME, _fp32, float, _fp32, float ) #endif case GB_FP64_code : GB_WORKER (GB_OPNAME, _fp32, float, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _fp32, float, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _fp32, float, _fc64, GxB_FC64_t) 
- #endif default: ; } break ; @@ -296,11 +266,8 @@ switch (code1) #if !defined ( GB_EXCLUDE_SAME_TYPES ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _fp64, double, _fp64, double ) #endif - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _fp64, double, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _fp64, double, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -320,13 +287,10 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _fc32, GxB_FC32_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _fc32, GxB_FC32_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _fc32, GxB_FC32_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA #if !defined ( GB_EXCLUDE_SAME_TYPES ) case GB_FC32_code : GB_WORKER (GB_OPNAME, _fc32, GxB_FC32_t, _fc32, GxB_FC32_t) #endif case GB_FC64_code : GB_WORKER (GB_OPNAME, _fc32, GxB_FC32_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -346,13 +310,10 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _fc64, GxB_FC64_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _fc64, GxB_FC64_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _fc64, GxB_FC64_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _fc64, GxB_FC64_t, _fc32, GxB_FC32_t) #if !defined ( GB_EXCLUDE_SAME_TYPES ) case GB_FC64_code : GB_WORKER (GB_OPNAME, _fc64, GxB_FC64_t, _fc64, GxB_FC64_t) #endif - #endif default: ; } break ; diff --git a/GraphBLAS/Source/FactoryKernels/GB_AxB__include2.h b/GraphBLAS/Source/FactoryKernels/GB_AxB__include2.h index a3ed1a352f..f662aa1138 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_AxB__include2.h +++ b/GraphBLAS/Source/FactoryKernels/GB_AxB__include2.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_AxB.h +#include "GB_math.h" GrB_Info GB (_Adot2B__plus_pair_int8) ( diff --git a/GraphBLAS/Source/FactoryKernels/GB_aop__include.h b/GraphBLAS/Source/FactoryKernels/GB_aop__include.h index a5a6add698..ee91742be7 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_aop__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_aop__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_aop.h +#include "GB_math.h" GrB_Info GB (_subassign_23__first_bool) ( diff --git a/GraphBLAS/Source/FactoryKernels/GB_as__include.h b/GraphBLAS/Source/FactoryKernels/GB_as__include.h index 89776b5481..e02fbfddf9 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_as__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_as__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_as.h +#include "GB_math.h" GrB_Info GB (_subassign_05d__bool) ( diff --git a/GraphBLAS/Source/FactoryKernels/GB_bld__include.h b/GraphBLAS/Source/FactoryKernels/GB_bld__include.h index ddf238d831..6197da7d5b 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_bld__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_bld__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_bld.h +#include "GB_math.h" GrB_Info GB (_bld__min_int8) ( diff --git a/GraphBLAS/Source/FactoryKernels/GB_ew__include.h b/GraphBLAS/Source/FactoryKernels/GB_ew__include.h index 630b328472..c3be2bc1ce 100644 --- 
a/GraphBLAS/Source/FactoryKernels/GB_ew__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_ew__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_ew.h +#include "GB_math.h" GrB_Info GB (_Cewise_fulln__first_bool) diff --git a/GraphBLAS/Source/FactoryKernels/GB_red__include.h b/GraphBLAS/Source/FactoryKernels/GB_red__include.h index 1dd63cdb28..ac6b8122b1 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_red__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_red__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_red.h +#include "GB_math.h" GrB_Info GB (_red__min_int8) ( diff --git a/GraphBLAS/Source/FactoryKernels/GB_sel__include.h b/GraphBLAS/Source/FactoryKernels/GB_sel__include.h index 2667deb07e..c495c3f5e9 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_sel__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_sel__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_sel.h +#include "GB_math.h" GrB_Info GB (_sel_phase2__nonzombie_bool) diff --git a/GraphBLAS/Source/FactoryKernels/GB_unop__include.h b/GraphBLAS/Source/FactoryKernels/GB_unop__include.h index edef1e6ddc..cef8329258 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_unop__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_unop__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_unop.h +#include "GB_math.h" GrB_Info GB (_unop_tran__identity_bool_bool) diff --git a/GraphBLAS/Source/GB.h b/GraphBLAS/Source/GB.h index 580dee9cfe..1ee458b44f 100644 --- a/GraphBLAS/Source/GB.h +++ b/GraphBLAS/Source/GB.h @@ -19,8 +19,6 @@ #endif #include "GB_static_header.h" #include "GB_positional.h" -#include "GB_casting.h" -#include "GB_math.h" #include "GB_bitwise.h" #include "GB_check.h" #include "GB_nnz.h" @@ -53,7 +51,6 @@ #include "GB_cast.h" #include "GB_wait.h" #include "GB_convert.h" -#include "GB_ops.h" #include "GB_where.h" #include "GB_Context.h" #include "GB_cuda_gateway.h" diff --git a/GraphBLAS/Source/GB_AxB__include1.h b/GraphBLAS/Source/GB_AxB__include1.h index 22bec25456..03fdcc2f41 100644 --- a/GraphBLAS/Source/GB_AxB__include1.h +++ b/GraphBLAS/Source/GB_AxB__include1.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_AxB.h +#include "GB_math.h" GrB_Info GB (_Adot2B__any_pair_iso) ( diff --git a/GraphBLAS/Source/GB_AxB_dot.c b/GraphBLAS/Source/GB_AxB_dot.c index f43a4dc80e..785091002a 100644 --- a/GraphBLAS/Source/GB_AxB_dot.c +++ b/GraphBLAS/Source/GB_AxB_dot.c @@ -187,10 +187,10 @@ GrB_Info GB_AxB_dot // dot product (multiple methods) GB_sparsity_char_matrix (B)) ; #if defined ( GRAPHBLAS_HAS_CUDA ) - if (!C_iso && // fixme for CUDA, remove and create C iso on output + if (!C_iso && // FIXME for CUDA, remove and create C iso on output GB_cuda_AxB_dot3_branch (M, Mask_struct, A, B, semiring, flipxy)) { - info = (GB_cuda_AxB_dot3_jit (C, M, Mask_struct, A, B, semiring, + info = (GB_cuda_AxB_dot3 (C, M, Mask_struct, A, B, semiring, flipxy)) ; } else diff --git a/GraphBLAS/Source/GB_AxB_dot3.c b/GraphBLAS/Source/GB_AxB_dot3.c index daf0cf0b68..8893196d8d 100644 --- a/GraphBLAS/Source/GB_AxB_dot3.c +++ b/GraphBLAS/Source/GB_AxB_dot3.c @@ -192,6 +192,7 @@ GrB_Info GB_AxB_dot3 // C = A'*B using dot product method // M is sparse or hypersparse; C is the same as 
M
 nthreads = GB_nthreads (cnvec, chunk, nthreads_max) ;
 
+    // TODO: try this with Cp and Ch shallow
     GB_memcpy (Cp, Mp, (cnvec+1) * sizeof (int64_t), nthreads) ;
     if (M_is_hyper)
@@ -304,6 +305,11 @@ GrB_Info GB_AxB_dot3 // C = A'*B using dot product method
         {
             #include "GB_AxB_factory.c"
         }
+
+        if (info == GrB_SUCCESS)
+        {
+            GBURBLE (" factory ") ;
+        }
     }
     #endif
 
diff --git a/GraphBLAS/Source/GB_AxB_saxpy.h b/GraphBLAS/Source/GB_AxB_saxpy.h
index 9f0df1790b..5a37f7d999 100644
--- a/GraphBLAS/Source/GB_AxB_saxpy.h
+++ b/GraphBLAS/Source/GB_AxB_saxpy.h
@@ -10,6 +10,7 @@
 #ifndef GB_AXB_SAXPY_H
 #define GB_AXB_SAXPY_H
 #include "GB.h"
+#include "GB_math.h"
 #include "GB_AxB_saxpy3.h"
 
 //------------------------------------------------------------------------------
diff --git a/GraphBLAS/Source/GB_AxB_saxpy3.h b/GraphBLAS/Source/GB_AxB_saxpy3.h
index d672effd33..6d23a54897 100644
--- a/GraphBLAS/Source/GB_AxB_saxpy3.h
+++ b/GraphBLAS/Source/GB_AxB_saxpy3.h
@@ -14,6 +14,7 @@
 #define GB_AXB_SAXPY3_H
 
 #include "GB.h"
+#include "GB_math.h"
 
 GrB_Info GB_AxB_saxpy3              // C = A*B using Gustavson+Hash
 (
diff --git a/GraphBLAS/Source/GB_add.h b/GraphBLAS/Source/GB_add.h
index c8b974c0c4..1f3a6152b8 100644
--- a/GraphBLAS/Source/GB_add.h
+++ b/GraphBLAS/Source/GB_add.h
@@ -10,6 +10,7 @@
 #ifndef GB_ADD_H
 #define GB_ADD_H
 #include "GB.h"
+#include "GB_math.h"
 
 GrB_Info GB_add             // C=A+B, C=A+B, or C=A+B
 (
diff --git a/GraphBLAS/Source/GB_add_phase0.c b/GraphBLAS/Source/GB_add_phase0.c
index 067eded6b7..a965bf5486 100644
--- a/GraphBLAS/Source/GB_add_phase0.c
+++ b/GraphBLAS/Source/GB_add_phase0.c
@@ -677,8 +677,8 @@ GrB_Info GB_add_phase0 // find vectors in C for C=A+B or C=A+B
         GB_OK (GB_hyper_hash_build (M, Werk)) ;
         const int64_t *restrict M_Yp = (M->Y == NULL) ? NULL : M->Y->p ;
-        const int64_t *restrict M_Yi = (M->Y == NULL) ? NULL : M->Y->p ;
-        const int64_t *restrict M_Yx = (M->Y == NULL) ? NULL : M->Y->p ;
+        const int64_t *restrict M_Yi = (M->Y == NULL) ? NULL : M->Y->i ;
+        const int64_t *restrict M_Yx = (M->Y == NULL) ? NULL : M->Y->x ;
         const int64_t M_hash_bits = (M->Y == NULL) ? 0 : (M->Y->vdim - 1) ;
 
         int64_t k ;
diff --git a/GraphBLAS/Source/GB_assign.h b/GraphBLAS/Source/GB_assign.h
index 3b891f33d0..c9ce0530fb 100644
--- a/GraphBLAS/Source/GB_assign.h
+++ b/GraphBLAS/Source/GB_assign.h
@@ -10,6 +10,7 @@
 #ifndef GB_ASSIGN_H
 #define GB_ASSIGN_H
 #include "GB.h"
+#include "GB_math.h"
 
 GrB_Info GB_assign                  // C(Rows,Cols) += A or A'
 (
diff --git a/GraphBLAS/Source/GB_bitmap_assign.h b/GraphBLAS/Source/GB_bitmap_assign.h
index f294e89cdf..a45a05bfbe 100644
--- a/GraphBLAS/Source/GB_bitmap_assign.h
+++ b/GraphBLAS/Source/GB_bitmap_assign.h
@@ -10,6 +10,7 @@
 #ifndef GB_BITMAP_ASSIGN_H
 #define GB_BITMAP_ASSIGN_H
 #include "GB.h"
+#include "GB_math.h"
 
 GrB_Info GB_bitmap_assign
 (
diff --git a/GraphBLAS/Source/GB_cast.h b/GraphBLAS/Source/GB_cast.h
index fbf3587465..5312d08c58 100644
--- a/GraphBLAS/Source/GB_cast.h
+++ b/GraphBLAS/Source/GB_cast.h
@@ -10,6 +10,18 @@
 #ifndef GB_CAST_H
 #define GB_CAST_H
 
+//------------------------------------------------------------------------------
+// pointer casting function, returned by GB_cast_factory.
+//------------------------------------------------------------------------------ + +typedef void (*GB_cast_function) (void *, const void *, size_t) ; + +GB_cast_function GB_cast_factory // returns pointer to function to cast x to z +( + const GB_Type_code code1, // the type of z, the output value + const GB_Type_code code2 // the type of x, the input value +) ; + //------------------------------------------------------------------------------ // GB_cast_scalar: typecast or copy a scalar //------------------------------------------------------------------------------ diff --git a/GraphBLAS/Source/GB_cast_factory.c b/GraphBLAS/Source/GB_cast_factory.c index c97e60cef5..67907852fa 100644 --- a/GraphBLAS/Source/GB_cast_factory.c +++ b/GraphBLAS/Source/GB_cast_factory.c @@ -16,6 +16,7 @@ // function GB_copy_user_user. #include "GB.h" +#include "GB_casting.h" GB_cast_function GB_cast_factory // returns pointer to function to cast x to z ( diff --git a/GraphBLAS/Source/GB_casting.c b/GraphBLAS/Source/GB_casting.c index 90def26d33..bac1c2d87c 100644 --- a/GraphBLAS/Source/GB_casting.c +++ b/GraphBLAS/Source/GB_casting.c @@ -8,6 +8,7 @@ //------------------------------------------------------------------------------ #include "GB.h" +#include "GB_casting.h" //------------------------------------------------------------------------------ // typecasting from double to integer diff --git a/GraphBLAS/Source/GB_casting.h b/GraphBLAS/Source/GB_casting.h index e6cc7b4edb..cfe83d362b 100644 --- a/GraphBLAS/Source/GB_casting.h +++ b/GraphBLAS/Source/GB_casting.h @@ -9,21 +9,13 @@ // The GJ_cast* methods are only used in JIT kernels. +#ifdef __cplusplus +#error "not used for C++" +#endif + #ifndef GB_CASTING_H #define GB_CASTING_H -//------------------------------------------------------------------------------ -// pointer casting function, returned by GB_cast_factory. 
-//------------------------------------------------------------------------------ - -typedef void (*GB_cast_function) (void *, const void *, size_t) ; - -GB_cast_function GB_cast_factory // returns pointer to function to cast x to z -( - const GB_Type_code code1, // the type of z, the output value - const GB_Type_code code2 // the type of x, the input value -) ; - //------------------------------------------------------------------------------ // typecasting from double to integer //------------------------------------------------------------------------------ @@ -215,16 +207,12 @@ GB_CAST_FUNCTION (bool , uint32_t ) GB_CAST_FUNCTION (bool , uint64_t ) GB_CAST_FUNCTION (bool , float ) GB_CAST_FUNCTION (bool , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) (GB_crealf (x) != 0 || GB_cimagf (x) != 0) GB_CAST_FUNCTION (bool , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) (GB_creal (x) != 0 || GB_cimag (x) != 0) GB_CAST_FUNCTION (bool , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to int8_t @@ -246,15 +234,11 @@ GB_CAST_FUNCTION (int8_t , uint64_t ) GB_CAST_FUNCTION (int8_t , float ) GB_CAST_FUNCTION (int8_t , double ) #undef GB_CAST - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #define GB_CAST(ztype,x) GB_cast_to_int8_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (int8_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int8_t (GB_creal (x)) GB_CAST_FUNCTION (int8_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to int16_t @@ -275,16 +259,12 @@ GB_CAST_FUNCTION (int16_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_int16_t ((double) x) GB_CAST_FUNCTION (int16_t , float ) GB_CAST_FUNCTION (int16_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int16_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (int16_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int16_t (GB_creal (x)) GB_CAST_FUNCTION (int16_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to int32_t @@ -305,16 +285,12 @@ GB_CAST_FUNCTION (int32_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_int32_t ((double) x) GB_CAST_FUNCTION (int32_t , float ) GB_CAST_FUNCTION (int32_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int32_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (int32_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int32_t (GB_creal (x)) GB_CAST_FUNCTION (int32_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to int64_t @@ -335,16 +311,12 @@ GB_CAST_FUNCTION (int64_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_int64_t ((double) x) GB_CAST_FUNCTION (int64_t , float ) GB_CAST_FUNCTION (int64_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int64_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (int64_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int64_t (GB_creal (x)) GB_CAST_FUNCTION (int64_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to uint8_t @@ -365,16 
+337,12 @@ GB_CAST_FUNCTION (uint8_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_uint8_t ((double) x) GB_CAST_FUNCTION (uint8_t , float ) GB_CAST_FUNCTION (uint8_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint8_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (uint8_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint8_t (GB_creal (x)) GB_CAST_FUNCTION (uint8_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to uint16_t @@ -395,16 +363,12 @@ GB_CAST_FUNCTION (uint16_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_uint16_t ((double) x) GB_CAST_FUNCTION (uint16_t , float ) GB_CAST_FUNCTION (uint16_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint16_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (uint16_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint16_t (GB_creal (x)) GB_CAST_FUNCTION (uint16_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to uint32_t @@ -425,16 +389,12 @@ GB_CAST_FUNCTION (uint32_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_uint32_t ((double) x) GB_CAST_FUNCTION (uint32_t , float ) GB_CAST_FUNCTION (uint32_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint32_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (uint32_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint32_t (GB_creal (x)) GB_CAST_FUNCTION (uint32_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to uint64_t @@ -455,16 +415,12 @@ GB_CAST_FUNCTION (uint64_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_uint64_t ((double) x) GB_CAST_FUNCTION (uint64_t , float ) GB_CAST_FUNCTION (uint64_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint64_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (uint64_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint64_t (GB_creal (x)) GB_CAST_FUNCTION (uint64_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to float @@ -483,16 +439,12 @@ GB_CAST_FUNCTION (float , uint32_t ) GB_CAST_FUNCTION (float , uint64_t ) GB_CAST_FUNCTION (float , float ) GB_CAST_FUNCTION (float , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_crealf (x) GB_CAST_FUNCTION (float , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) ((float) GB_creal (x)) GB_CAST_FUNCTION (float , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to double @@ -511,24 +463,17 @@ GB_CAST_FUNCTION (double , uint32_t ) GB_CAST_FUNCTION (double , uint64_t ) GB_CAST_FUNCTION (double , float ) GB_CAST_FUNCTION (double , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) ((double) GB_crealf (x)) GB_CAST_FUNCTION (double , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_creal (x) GB_CAST_FUNCTION (double , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to 
float complex //------------------------------------------------------------------------------ -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA - #undef GB_CAST #define GB_CAST(ztype,x) GB_CMPLX32 ((float) x, (float) 0) GB_CAST_FUNCTION (GxB_FC32_t, bool ) @@ -573,8 +518,6 @@ GB_CAST_FUNCTION (GxB_FC64_t, GxB_FC32_t) #define GB_CAST(ztype,x) x GB_CAST_FUNCTION (GxB_FC64_t, GxB_FC64_t) -#endif - #undef GB_CAST #undef GB_CAST_FUNCTION diff --git a/GraphBLAS/Source/GB_copy_user_user.c b/GraphBLAS/Source/GB_copy_user_user.c index 84c69a3a86..797f94e3d1 100644 --- a/GraphBLAS/Source/GB_copy_user_user.c +++ b/GraphBLAS/Source/GB_copy_user_user.c @@ -8,6 +8,7 @@ //------------------------------------------------------------------------------ #include "GB.h" +#include "GB_casting.h" void GB_copy_user_user (void *z, const void *x, size_t s) { diff --git a/GraphBLAS/Source/GB_cuda_gateway.h b/GraphBLAS/Source/GB_cuda_gateway.h index cfb00a72ed..72bd217d73 100644 --- a/GraphBLAS/Source/GB_cuda_gateway.h +++ b/GraphBLAS/Source/GB_cuda_gateway.h @@ -66,17 +66,15 @@ static inline int GB_ngpus_to_use // get # of GPUs available int gpu_count = GB_Global_gpu_count_get ( ) ; - if (gpu_hack == 2 || gpu_count == 0) + if (gpu_hack == 2 || gpu_count == 0 || work == 0) { // never use the GPU(s) - // printf ("(GPU: disabled, gpu_count: %d) ", gpu_count) ; return (0) ; } else if (gpu_hack == 1) { // always use all available GPU(s) // fixme for CUDA: allow 1 to gpu_count to be requested - // printf ("(using the GPU: %d) ", gpu_count) ; return (gpu_count) ; } else @@ -84,15 +82,12 @@ static inline int GB_ngpus_to_use // default: use no more than max_gpus_to_use double gpu_chunk = 2e6 ; double max_gpus_to_use = floor (work / gpu_chunk) ; - // printf ("(work %g gpu_chunk: %g max gpus to use: %g) ", - // work, gpu_chunk, max_gpus_to_use) ; // but use no more than the # of GPUs available if (max_gpus_to_use > gpu_count) return (gpu_count) ; return ((int) max_gpus_to_use) ; } } - //------------------------------------------------------------------------------ // GB_cuda_* gateway functions //------------------------------------------------------------------------------ @@ -116,13 +111,18 @@ bool GB_cuda_get_device_properties GB_cuda_device *prop ) ; +bool GB_cuda_type_branch // return true if the type is OK on GPU +( + const GrB_Type type // type to query +) ; + bool GB_cuda_reduce_to_scalar_branch // return true to use the GPU ( const GrB_Monoid monoid, // monoid to do the reduction const GrB_Matrix A // input matrix ) ; -GrB_Info GB_cuda_reduce_to_scalar_jit +GrB_Info GB_cuda_reduce_to_scalar ( // output: GB_void *s, // note: statically allocated on CPU stack; if @@ -134,12 +134,7 @@ GrB_Info GB_cuda_reduce_to_scalar_jit const GrB_Matrix A ) ; -bool GB_cuda_type_branch // return true if the type is OK on GPU -( - const GrB_Type type // type to query -) ; - -GrB_Info GB_cuda_AxB_dot3_jit // C = A'*B using dot product method +GrB_Info GB_cuda_AxB_dot3 // C = A'*B using dot product method ( GrB_Matrix C, // output matrix, static header const GrB_Matrix M, // mask matrix diff --git a/GraphBLAS/Source/GB_emult.h b/GraphBLAS/Source/GB_emult.h index c5661f9c00..7a9e7e2a03 100644 --- a/GraphBLAS/Source/GB_emult.h +++ b/GraphBLAS/Source/GB_emult.h @@ -10,6 +10,7 @@ #ifndef GB_EMULT_H #define GB_EMULT_H #include "GB.h" +#include "GB_math.h" #include "GB_bitmap_assign_methods.h" #define GB_EMULT_METHOD1_ADD 1 /* use GB_add instead of emult */ diff --git a/GraphBLAS/Source/GB_encodify_reduce.c 
b/GraphBLAS/Source/GB_encodify_reduce.c index 61fb3f5425..5112883246 100644 --- a/GraphBLAS/Source/GB_encodify_reduce.c +++ b/GraphBLAS/Source/GB_encodify_reduce.c @@ -17,6 +17,7 @@ uint64_t GB_encodify_reduce // encode a GrB_reduce problem // except for the suffix char **suffix, // suffix for user-defined kernel // input: + const GB_jit_kcode kcode, // kernel to encode GrB_Monoid monoid, // the monoid to enumify GrB_Matrix A // input matrix to reduce ) @@ -40,7 +41,7 @@ uint64_t GB_encodify_reduce // encode a GrB_reduce problem GB_enumify_reduce (&encoding->code, monoid, A) ; bool builtin = (monoid->hash == 0) ; - encoding->kcode = GB_JIT_KERNEL_REDUCE ; + encoding->kcode = kcode ; //-------------------------------------------------------------------------- // determine the suffix and its length diff --git a/GraphBLAS/Source/GB_enumify_cuda_atomic.c b/GraphBLAS/Source/GB_enumify_cuda_atomic.c index 315f78ed5c..e10b97dc39 100644 --- a/GraphBLAS/Source/GB_enumify_cuda_atomic.c +++ b/GraphBLAS/Source/GB_enumify_cuda_atomic.c @@ -37,12 +37,11 @@ bool GB_enumify_cuda_atomic { // user defined monoid: can apply GB_ADD via atomicCAS if the ztype has - // 16, 32, or 64 bits + // 32 or 64 bits case 0 : (*user_monoid_atomically) = - (zsize == sizeof (uint16_t) || - zsize == sizeof (uint32_t) || + (zsize == sizeof (uint32_t) || zsize == sizeof (uint64_t)) ; break ; @@ -234,14 +233,10 @@ bool GB_enumify_cuda_atomic { //---------------------------------------------------------------------- - // user-defined monoid with a type of 16, 32, or 64 bits + // user-defined monoid with a type of 32 or 64 bits //---------------------------------------------------------------------- - if (zsize == sizeof (uint16_t)) - { - (*cuda_type) = "unsigned short int" ; - } - else if (zsize == sizeof (uint32_t)) + if (zsize == sizeof (uint32_t)) { (*cuda_type) = "unsigned int" ; } @@ -261,7 +256,7 @@ bool GB_enumify_cuda_atomic //---------------------------------------------------------------------- // either built-in (GxB_ANY_FC64_MONOID or GxB_TIMES_FC64_MONOID), - // or user-defined where the type is not 16, 32, or 64 bits in size + // or user-defined where the type is not 32 or 64 bits in size has_cheeseburger = false ; diff --git a/GraphBLAS/Source/GB_ewise_kernels.h b/GraphBLAS/Source/GB_ewise_kernels.h index de2665caa1..b90a8a4f65 100644 --- a/GraphBLAS/Source/GB_ewise_kernels.h +++ b/GraphBLAS/Source/GB_ewise_kernels.h @@ -7,7 +7,6 @@ //------------------------------------------------------------------------------ -#include "GB.h" #include "GB_emult.h" #include "GB_ek_slice.h" #include "GB_bitmap_assign_methods.h" diff --git a/GraphBLAS/Source/GB_helper.h b/GraphBLAS/Source/GB_helper.h index 317b6ac3ba..5cb05a5c27 100644 --- a/GraphBLAS/Source/GB_helper.h +++ b/GraphBLAS/Source/GB_helper.h @@ -14,6 +14,7 @@ #define GB_HELPER_H #include "GB.h" +#include "GB_math.h" double GB_helper0 (void) ; diff --git a/GraphBLAS/Source/GB_init.c b/GraphBLAS/Source/GB_init.c index 7b55498369..c3d6e6d253 100644 --- a/GraphBLAS/Source/GB_init.c +++ b/GraphBLAS/Source/GB_init.c @@ -31,7 +31,8 @@ // The calloc function pointer is also optional and can be NULL. // If the mode is GxB_BLOCKING_GPU or GxB_NONBLOCKING_GPU, the 4 function -// pointers are ignored, and rmm_wrap_malloc/.../rmm_wrap_free are used instead. +// pointers are ignored, and rmm_wrap_malloc/.../rmm_wrap_free are used +// instead. 
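+// (So a call sketched as +// GB_init (GxB_NONBLOCKING_GPU, malloc, calloc, realloc, free, Werk) +// behaves as if rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, and +// rmm_wrap_free had been passed instead; see the body of GB_init below.)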
#define GB_FREE_ALL ; #include "GB.h" @@ -44,7 +45,7 @@ GrB_Info GB_init // start up GraphBLAS ( - const GrB_Mode mode, // blocking or non-blocking mode + GrB_Mode mode, // blocking or non-blocking mode // pointers to memory management functions. void * (* malloc_function ) (size_t), // required @@ -78,7 +79,10 @@ GrB_Info GB_init // start up GraphBLAS // establish malloc/calloc/realloc/free //-------------------------------------------------------------------------- + bool malloc_is_thread_safe = true ; + #if defined ( GRAPHBLAS_HAS_CUDA ) + mode = GxB_NONBLOCKING_GPU ; // HACK FIXME if (mode == GxB_NONBLOCKING_GPU || mode == GxB_BLOCKING_GPU) { // ignore the memory management function pointers and use rmm_wrap_* @@ -86,6 +90,8 @@ GrB_Info GB_init // start up GraphBLAS calloc_function = rmm_wrap_calloc ; realloc_function = rmm_wrap_realloc ; free_function = rmm_wrap_free ; + // the rmm_wrap methods are not thread-safe + malloc_is_thread_safe = false ; } #endif @@ -104,7 +110,7 @@ GrB_Info GB_init // start up GraphBLAS GB_Global_realloc_function_set (realloc_function) ; // ok if NULL GB_Global_free_function_set (free_function ) ; // cannot be NULL - GB_Global_malloc_is_thread_safe_set (true) ; // malloc must be thread-safe + GB_Global_malloc_is_thread_safe_set (malloc_is_thread_safe) ; GB_Global_memtable_clear ( ) ; GB_Global_malloc_tracking_set (false) ; @@ -182,7 +188,7 @@ GrB_Info GB_init // start up GraphBLAS GB_Global_timing_clear_all ( ) ; //-------------------------------------------------------------------------- - // set up the JIT folder locations and compiler flags + // set up the JIT settings and emit the source to the cache folder //-------------------------------------------------------------------------- GB_OK (GB_jitifyer_init ( )) ; @@ -192,6 +198,12 @@ GrB_Info GB_init // start up GraphBLAS //-------------------------------------------------------------------------- #pragma omp flush + #if defined ( GRAPHBLAS_HAS_CUDA ) +// this hack_get setting is used by GB_ngpus_to_use: +// GB_Global_hack_set (2,0) ; // HACK FIXME: default: GPU for big enough probs +// GB_Global_hack_set (2,1) ; // HACK FIXME: force the GPU always to be used +// GB_Global_hack_set (2,2) ; // HACK FIXME: force the GPU never to be used + #endif return (GrB_SUCCESS) ; } diff --git a/GraphBLAS/Source/GB_init.h b/GraphBLAS/Source/GB_init.h index 9bd15d962e..8664293dc3 100644 --- a/GraphBLAS/Source/GB_init.h +++ b/GraphBLAS/Source/GB_init.h @@ -12,7 +12,7 @@ GrB_Info GB_init // start up GraphBLAS ( - const GrB_Mode mode, // blocking or non-blocking mode + GrB_Mode mode, // blocking or non-blocking mode // pointers to memory management functions. void * (* malloc_function ) (size_t), // required diff --git a/GraphBLAS/Source/GB_jitifyer.c b/GraphBLAS/Source/GB_jitifyer.c index 7e96b03f04..d430977e57 100644 --- a/GraphBLAS/Source/GB_jitifyer.c +++ b/GraphBLAS/Source/GB_jitifyer.c @@ -1,5 +1,5 @@ //------------------------------------------------------------------------------ -// GB_jitifyer.c: CPU jitifyer +// GB_jitifyer.c: CPU / CUDA jitifyer //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. 
@@ -71,10 +71,14 @@ static size_t GB_jit_C_libraries_allocated = 0 ; static char *GB_jit_C_cmake_libs = NULL ; static size_t GB_jit_C_cmake_libs_allocated = 0 ; -// preface to add to each JIT kernel: +// preface to add to each CPU JIT kernel: static char *GB_jit_C_preface = NULL ; static size_t GB_jit_C_preface_allocated = 0 ; +// preface to add to each CUDA JIT kernel: +static char *GB_jit_CUDA_preface = NULL ; +static size_t GB_jit_CUDA_preface_allocated = 0 ; + // temporary workspace for filenames and system commands: static char *GB_jit_temp = NULL ; static size_t GB_jit_temp_allocated = 0 ; @@ -208,11 +212,12 @@ void GB_jitifyer_finalize (void) GB_FREE_STUFF (GB_jit_C_libraries) ; GB_FREE_STUFF (GB_jit_C_cmake_libs) ; GB_FREE_STUFF (GB_jit_C_preface) ; + GB_FREE_STUFF (GB_jit_CUDA_preface) ; GB_FREE_STUFF (GB_jit_temp) ; } //------------------------------------------------------------------------------ -// GB_jitifyer_init: initialize the CPU and CUDA JIT folders, flags, etc +// GB_jitifyer_init: initialize the JIT folders, flags, etc //------------------------------------------------------------------------------ // Returns GrB_SUCCESS or GrB_OUT_OF_MEMORY. If any other error occurs (such @@ -308,6 +313,7 @@ GrB_Info GB_jitifyer_init (void) GB_COPY_STUFF (GB_jit_C_libraries, GB_C_LIBRARIES) ; GB_COPY_STUFF (GB_jit_C_cmake_libs, GB_CMAKE_LIBRARIES) ; GB_COPY_STUFF (GB_jit_C_preface, "") ; + GB_COPY_STUFF (GB_jit_CUDA_preface, "") ; OK (GB_jitifyer_alloc_space ( )) ; //-------------------------------------------------------------------------- @@ -438,6 +444,7 @@ GrB_Info GB_jitifyer_init (void) else if (IS ("union" )) c = GB_JIT_KERNEL_UNION ; else if (IS ("user_op" )) c = GB_JIT_KERNEL_USEROP ; else if (IS ("user_type" )) c = GB_JIT_KERNEL_USERTYPE ; + else if (IS ("cuda_reduce" )) c = GB_JIT_CUDA_KERNEL_REDUCE ; else { // PreJIT error: kernel_name is invalid; ignore this kernel @@ -548,8 +555,6 @@ GrB_Info GB_jitifyer_establish_paths (GrB_Info error_condition) // construct the c, lib, and lock paths and their 256 subfolders ok = ok && GB_jitifyer_path_256 ("c") ; - ok = ok && GB_jitifyer_path_256 ("cu") ; - ok = ok && GB_jitifyer_path_256 ("libcu") ; ok = ok && GB_jitifyer_path_256 ("lib") ; ok = ok && GB_jitifyer_path_256 ("lock") ; @@ -558,7 +563,7 @@ GrB_Info GB_jitifyer_establish_paths (GrB_Info error_condition) ok = ok && GB_file_mkdir (GB_jit_temp) ; // construct the tmp path - snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/tmp", GB_jit_cache_path); + snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/tmp", GB_jit_cache_path) ; ok = ok && GB_file_mkdir (GB_jit_temp) ; //-------------------------------------------------------------------------- @@ -1230,7 +1235,6 @@ GrB_Info GB_jitifyer_set_C_cmake_libs_worker (const char *new_cmake_libs) return (GB_jitifyer_alloc_space ( )) ; } - //------------------------------------------------------------------------------ // GB_jitifyer_get_C_preface: return the current C preface //------------------------------------------------------------------------------ @@ -1286,6 +1290,61 @@ GrB_Info GB_jitifyer_set_C_preface_worker (const char *new_C_preface) return (GrB_SUCCESS) ; } +//------------------------------------------------------------------------------ +// GB_jitifyer_get_CUDA_preface: return the current CUDA preface +//------------------------------------------------------------------------------ + +const char *GB_jitifyer_get_CUDA_preface (void) +{ + const char *s ; + #pragma omp critical (GB_jitifyer_worker) + { + s = 
GB_jit_CUDA_preface ; + } + return (s) ; +} + +//------------------------------------------------------------------------------ +// GB_jitifyer_set_CUDA_preface: set new CUDA preface +//------------------------------------------------------------------------------ + +GrB_Info GB_jitifyer_set_CUDA_preface (const char *new_CUDA_preface) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + if (new_CUDA_preface == NULL) + { + return (GrB_NULL_POINTER) ; + } + + //-------------------------------------------------------------------------- + // set the CUDA preface in a critical section + //-------------------------------------------------------------------------- + + GrB_Info info ; + #pragma omp critical (GB_jitifyer_worker) + { + info = GB_jitifyer_set_CUDA_preface_worker (new_CUDA_preface) ; + } + return (info) ; +} + +//------------------------------------------------------------------------------ +// GB_jitifyer_set_CUDA_preface_worker: set CUDA preface in a critical section +//------------------------------------------------------------------------------ + +GrB_Info GB_jitifyer_set_CUDA_preface_worker (const char *new_CUDA_preface) +{ + // free the old CUDA preface string + GB_FREE_STUFF (GB_jit_CUDA_preface) ; + // allocate the new GB_jit_CUDA_preface + GB_COPY_STUFF (GB_jit_CUDA_preface, new_CUDA_preface) ; + return (GrB_SUCCESS) ; +} + //------------------------------------------------------------------------------ // GB_jitifyer_query: check if the type/op/monoid definitions match //------------------------------------------------------------------------------ @@ -1677,6 +1736,10 @@ GrB_Info GB_jitifyer_worker // lock the kernel //-------------------------------------------------------------------------- + // FIXME: add kernel_name to the lock filename. If the lock fails, + // sleep for 1 second and try again repeatedly, with a timeout limit of + // (say) 60 seconds. + uint32_t bucket = hash & 0xFF ; snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/lock/%02x/%016" PRIx64 "_lock", GB_jit_cache_path, bucket, hash) ; @@ -1797,15 +1860,19 @@ GrB_Info GB_jitifyer_load_worker //---------------------------------------------------------------------- GBURBLE ("(jit: compile and load) ") ; + GB_jit_kcode kcode = encoding->kcode ; + const char *kernel_filetype = + (kcode < GB_JIT_CUDA_KERNEL) ? 
"c" : "cu" ; // create (or recreate) the kernel source, compile it, and load it - snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/c/%02x/%s.c", - GB_jit_cache_path, bucket, kernel_name) ; + snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/c/%02x/%s.%s", + GB_jit_cache_path, bucket, kernel_name, kernel_filetype) ; FILE *fp = fopen (GB_jit_temp, "w") ; if (fp != NULL) { // create the preface - GB_macrofy_preface (fp, kernel_name, GB_jit_C_preface) ; + GB_macrofy_preface (fp, kernel_name, + GB_jit_C_preface, GB_jit_CUDA_preface, kcode) ; // macrofy the kernel operators, types, and matrix formats GB_macrofy_family (fp, family, encoding->code, semiring, monoid, op, type1, type2, type3) ; @@ -1814,12 +1881,14 @@ GrB_Info GB_jitifyer_load_worker "#define GB_jit_kernel %s\n" "#define GB_jit_query %s_query\n" "#endif\n" - "#include \"GB_jit_kernel_%s.c\"\n", - kernel_name, kernel_name, kname) ; + "#include \"GB_jit_kernel_%s.%s\"\n", + kernel_name, kernel_name, kname, + kernel_filetype) ; + // macrofy the query function bool builtin = (encoding->suffix_len == 0) ; GB_macrofy_query (fp, builtin, monoid, op1, op2, type1, type2, - type3, hash) ; + type3, hash, kcode) ; fclose (fp) ; } @@ -1827,16 +1896,22 @@ GrB_Info GB_jitifyer_load_worker // gracefully fail. // compile the kernel to get the lib*.so file - if (GB_jit_use_cmake) + if (kcode >= GB_JIT_CUDA_KERNEL) + { + // use NVCC to directly compile the CUDA kernel + GB_jitifyer_nvcc_compile (kernel_name, bucket) ; + } + else if (GB_jit_use_cmake) { - // use cmake to compile the kernel + // use cmake to compile the CPU kernel GB_jitifyer_cmake_compile (kernel_name, hash) ; } else { - // use the compiler to directly compile the kernel + // use the compiler to directly compile the CPU kernel GB_jitifyer_direct_compile (kernel_name, bucket) ; } + // load the kernel from the lib*.so file snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/lib/%02x/%s%s%s", GB_jit_cache_path, bucket, GB_LIB_PREFIX, kernel_name, @@ -1865,7 +1940,7 @@ GrB_Info GB_jitifyer_load_worker } //-------------------------------------------------------------------------- - // get the jit_kernel_function pointer + // get the GB_jit_kernel function pointer //-------------------------------------------------------------------------- (*dl_function) = GB_file_dlsym (dl_handle, "GB_jit_kernel") ; @@ -2314,6 +2389,87 @@ void GB_jitifyer_cmake_compile (char *kernel_name, uint64_t hash) #endif } +//------------------------------------------------------------------------------ +// GB_jitifyer_nvcc_compile: compile a CUDA kernel with NVRTC +//------------------------------------------------------------------------------ + +// Compiles a CUDA JIT kernel in a *.cu file, containing host code that +// launches one or more device kernels. + +// The input file has the form: +// +// %s/c/%02x/%s or [cache_path]/c/[bucket]/[kernel_name].cu +// +// and the libary file is linked as +// +// %s/lib/%02x/lib%s.so or [cache_path]/lib/[bucket]/lib[kernel_name].so +// +// All other temporary files (including *.o object files) are removed. + +void GB_jitifyer_nvcc_compile (char *kernel_name, uint32_t bucket) +{ + +#if defined ( GRAPHBLAS_HAS_CUDA ) && !defined ( NJIT ) + + char *burble_stdout = GB_Global_burble_get ( ) ? "" : GB_DEV_NULL ; + char *err_redirect = (strlen (GB_jit_error_log) > 0) ? 
" 2>> " : "" ; + + GBURBLE ("(jit compiling cuda with nvcc: %s/c/%02x/%s.cu) ", + GB_jit_cache_path, bucket, kernel_name) ; + + snprintf (GB_jit_temp, GB_jit_temp_allocated, + + // compile: + "sh -c \"" // execute with POSIX shell + "nvcc " // compiler command + "-forward-unknown-to-host-compiler " + "-DGB_JIT_RUNTIME=1 " // nvcc flags + "-I/usr/local/cuda/include -std=c++17 -arch=sm_60 -fPIC " + "-I%s/src " // include source directory + "-o %s/c/%02x/%s%s " // *.o output file + "-c %s/c/%02x/%s.cu " // *.cu input file + "%s " // burble stdout + "%s %s ; " // error log file + + // link: + "nvcc " // compiler + "-DGB_JIT_RUNTIME=1 " // nvcc flags + "-I/usr/local/cuda/include -std=c++17 -arch=sm_60 " + " -shared " + "-o %s/lib/%02x/%s%s%s " // lib*.so output file + "%s/c/%02x/%s%s " // *.o input file + " -cudart shared " +// "%s " // libraries to link with (any?) + "%s " // burble stdout + "%s %s\"", // error log file + + // compile: + GB_jit_cache_path, // include source directory (cache/src) + GB_jit_cache_path, bucket, kernel_name, GB_OBJ_SUFFIX, // *.o output file + GB_jit_cache_path, bucket, kernel_name, // *.cu input file + burble_stdout, // burble stdout + err_redirect, GB_jit_error_log, // error log file + + // link: + GB_jit_cache_path, bucket, + GB_LIB_PREFIX, kernel_name, GB_LIB_SUFFIX, // lib*.so file + GB_jit_cache_path, bucket, kernel_name, GB_OBJ_SUFFIX, // *.o input file +// GB_jit_C_libraries // libraries to link with + burble_stdout, // burble stdout + err_redirect, GB_jit_error_log) ; // error log file + + // compile the library and return result + GBURBLE ("\n(jit: %s) ", GB_jit_temp) ; + GB_jitifyer_command (GB_jit_temp) ; // OK: see security comment above + + // remove the *.o file + snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/c/%02x/%s%s", + GB_jit_cache_path, bucket, kernel_name, GB_OBJ_SUFFIX) ; + remove (GB_jit_temp) ; + +#endif +} + //------------------------------------------------------------------------------ // GB_jitifyer_direct_compile: compile a kernel with just the compiler //------------------------------------------------------------------------------ diff --git a/GraphBLAS/Source/GB_jitifyer.h b/GraphBLAS/Source/GB_jitifyer.h index 7aaefde893..cd75830a2b 100644 --- a/GraphBLAS/Source/GB_jitifyer.h +++ b/GraphBLAS/Source/GB_jitifyer.h @@ -169,6 +169,25 @@ typedef enum GB_JIT_KERNEL_CONVERTBITMAP = 85, // GB_convert_bitmap_worker GB_JIT_KERNEL_EXPANDISO = 86, // GB_expand_iso GB_JIT_KERNEL_SORT = 87, // GB_sort + + //-------------------------------------------------------------------------- + // future:: CUDA kernels + //-------------------------------------------------------------------------- + + GB_JIT_CUDA_KERNEL = 1000, // no CUDA kernel + + // reduce to scalar in CUDA + GB_JIT_CUDA_KERNEL_REDUCE = 1001, // GB_cuda_reduce_to_scalar + + // C = A*B, except for row/col scale (which are ewise methods) + // ... + GB_JIT_CUDA_KERNEL_AXB_DOT3 = 1004, // GB_cuda_AxB_dot3 + + // ewise methods: + // ... + GB_JIT_CUDA_KERNEL_ROWSCALE = 1011, + // ... 
+ } GB_jit_kcode ; @@ -326,6 +345,7 @@ bool GB_jitifyer_query void GB_jitifyer_cmake_compile (char *kernel_name, uint64_t hash) ; void GB_jitifyer_direct_compile (char *kernel_name, uint32_t bucket) ; +void GB_jitifyer_nvcc_compile (char *kernel_name, uint32_t bucket) ; GrB_Info GB_jitifyer_init (void) ; // initialize the JIT @@ -372,6 +392,10 @@ const char *GB_jitifyer_get_C_preface (void) ; GrB_Info GB_jitifyer_set_C_preface (const char *new_C_preface) ; GrB_Info GB_jitifyer_set_C_preface_worker (const char *new_C_preface) ; +const char *GB_jitifyer_get_CUDA_preface (void) ; +GrB_Info GB_jitifyer_set_CUDA_preface (const char *new_CUDA_preface) ; +GrB_Info GB_jitifyer_set_CUDA_preface_worker (const char *new_CUDA_preface) ; + const char *GB_jitifyer_get_error_log (void) ; GrB_Info GB_jitifyer_set_error_log (const char *new_error_log) ; GrB_Info GB_jitifyer_set_error_log_worker (const char *new_error_log) ; diff --git a/GraphBLAS/Source/GB_macrofy_assign.c b/GraphBLAS/Source/GB_macrofy_assign.c index 776faf2583..4517fa023c 100644 --- a/GraphBLAS/Source/GB_macrofy_assign.c +++ b/GraphBLAS/Source/GB_macrofy_assign.c @@ -176,7 +176,7 @@ void GB_macrofy_assign // construct all macros for GrB_assign { fprintf (fp, "\n// accum operator:\n") ; GB_macrofy_binop (fp, "GB_ACCUM_OP", false, true, false, accum_ecode, - C_iso, accum, NULL, NULL) ; + C_iso, accum, NULL, NULL, NULL) ; char *yname = "ywork" ; diff --git a/GraphBLAS/Source/GB_macrofy_binop.c b/GraphBLAS/Source/GB_macrofy_binop.c index 3d170bf2a8..08cb51d91a 100644 --- a/GraphBLAS/Source/GB_macrofy_binop.c +++ b/GraphBLAS/Source/GB_macrofy_binop.c @@ -8,6 +8,7 @@ //------------------------------------------------------------------------------ #include "GB.h" +#include "GB_math.h" #include "GB_stringify.h" #include @@ -25,8 +26,9 @@ void GB_macrofy_binop bool C_iso, // if true: C is iso GrB_BinaryOp op, // NULL if C is iso // output: - const char **f_handle, - const char **u_handle + const char **f_handle, // basic expression z=f(x,y) + const char **u_handle, // update z=f(z,y) for the CPU + const char **g_handle // update z=f(z,y) for the GPU (if different) ) { @@ -758,10 +760,11 @@ void GB_macrofy_binop } //-------------------------------------------------------------------------- - // return the u and f expressions + // return the u, f, and g expressions //-------------------------------------------------------------------------- if (u_handle != NULL) (*u_handle) = u ; if (f_handle != NULL) (*f_handle) = f ; + if (g_handle != NULL) (*g_handle) = g ; } diff --git a/GraphBLAS/Source/GB_macrofy_build.c b/GraphBLAS/Source/GB_macrofy_build.c index 1c824908d4..2f3b481283 100644 --- a/GraphBLAS/Source/GB_macrofy_build.c +++ b/GraphBLAS/Source/GB_macrofy_build.c @@ -85,7 +85,7 @@ void GB_macrofy_build // construct all macros for GB_build fprintf (fp, "\n// binary dup operator:\n") ; GB_macrofy_binop (fp, "GB_DUP", false, true, false, dup_ecode, false, dup, - NULL, NULL) ; + NULL, NULL, NULL) ; fprintf (fp, "\n// build copy/dup methods:\n") ; diff --git a/GraphBLAS/Source/GB_macrofy_cast_expression.c b/GraphBLAS/Source/GB_macrofy_cast_expression.c index 566ec1b071..001f26c9cf 100644 --- a/GraphBLAS/Source/GB_macrofy_cast_expression.c +++ b/GraphBLAS/Source/GB_macrofy_cast_expression.c @@ -10,6 +10,7 @@ // Return a typecast expression to cast from xtype to ztype. 
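// (For example, casting a double x to an int32_t z uses an expression like // GB_cast_to_int32_t (x), per the GB_CAST macros in GB_casting.h above.)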
#include "GB.h" +#include "GB_math.h" #include "GB_stringify.h" const char *GB_macrofy_cast_expression // return cast expression diff --git a/GraphBLAS/Source/GB_macrofy_ewise.c b/GraphBLAS/Source/GB_macrofy_ewise.c index 5f9ea7cb10..95168262a5 100644 --- a/GraphBLAS/Source/GB_macrofy_ewise.c +++ b/GraphBLAS/Source/GB_macrofy_ewise.c @@ -128,7 +128,7 @@ void GB_macrofy_ewise // construct all macros for GrB_eWise fprintf (fp, "\n// binary operator%s:\n", flipxy ? " (flipped)" : "") ; GB_macrofy_binop (fp, "GB_BINOP", flipxy, false, true, binop_ecode, C_iso, - binaryop, NULL, NULL) ; + binaryop, NULL, NULL, NULL) ; if (binaryop->opcode == GB_SECOND_binop_code) { diff --git a/GraphBLAS/Source/GB_macrofy_monoid.c b/GraphBLAS/Source/GB_macrofy_monoid.c index 46538ed278..a4fe05120b 100644 --- a/GraphBLAS/Source/GB_macrofy_monoid.c +++ b/GraphBLAS/Source/GB_macrofy_monoid.c @@ -24,7 +24,8 @@ void GB_macrofy_monoid // construct the macros for a monoid // semiring, times is normally a terminal monoid, but // it's not worth exploiting in GrB_mxm. // output: - const char **u_expression + const char **u_expression, + const char **g_expression ) { @@ -39,7 +40,7 @@ void GB_macrofy_monoid // construct the macros for a monoid //-------------------------------------------------------------------------- GB_macrofy_binop (fp, "GB_ADD", false, true, false, add_ecode, C_iso, - op, NULL, u_expression) ; + op, NULL, u_expression, g_expression) ; //-------------------------------------------------------------------------- // create macros for the identity value @@ -260,6 +261,7 @@ void GB_macrofy_monoid // construct the macros for a monoid // create macros for atomics on the CPU //-------------------------------------------------------------------------- + fprintf (fp, "#define GB_Z_SIZE %d\n", (int) zsize) ; fprintf (fp, "#define GB_Z_NBITS %d\n", 8 * (int) zsize) ; // atomic write diff --git a/GraphBLAS/Source/GB_macrofy_multadd.c b/GraphBLAS/Source/GB_macrofy_multadd.c new file mode 100644 index 0000000000..f739b65752 --- /dev/null +++ b/GraphBLAS/Source/GB_macrofy_multadd.c @@ -0,0 +1,47 @@ +//------------------------------------------------------------------------------ +// GB_macrofy_multadd: create a fused multiply-add operator +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#include "GB.h" +#include "GB_stringify.h" + +void GB_macrofy_multadd +( + FILE *fp, + const char *update_expression, // has the form "z = f(z,y)" + const char *multiply_expression, // has the form "z = mult(x,y)" + bool flipxy +) +{ + + // CPU kernels can use the fused multiply-add + if (flipxy) + { + fprintf (fp, "#define GB_MULTADD(z,y,x,j,k,i) ") ; + } + else + { + fprintf (fp, "#define GB_MULTADD(z,x,y,i,k,j) ") ; + } + for (const char *p = update_expression ; (*p) != '\0' ; p++) + { + // all update operators have a single 'y' + if ((*p) == 'y') + { + // inject the multiply operator; all have the form "z = ..." 
+ fprintf (fp, "%s", multiply_expression + 4) ; + } + else + { + // otherwise, print the update operator character + fprintf (fp, "%c", (*p)) ; + } + } + fprintf (fp, "\n") ; +} + diff --git a/GraphBLAS/Source/GB_macrofy_mxm.c b/GraphBLAS/Source/GB_macrofy_mxm.c index 58d917f3bd..919025e27c 100644 --- a/GraphBLAS/Source/GB_macrofy_mxm.c +++ b/GraphBLAS/Source/GB_macrofy_mxm.c @@ -2,7 +2,7 @@ // GB_macrofy_mxm: construct all macros for a semiring //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -10,10 +10,14 @@ #include "GB.h" #include "GB_stringify.h" -void GB_macrofy_mxm // construct all macros for GrB_mxm +//------------------------------------------------------------------------------ +// GB_macrofy_mxm: create all macros for GrB_mxm +//------------------------------------------------------------------------------ + +void GB_macrofy_mxm // construct all macros for GrB_mxm ( // output: - FILE *fp, // target file to write, already open + FILE *fp, // target file to write, already open // input: uint64_t scode, GrB_Semiring semiring, // the semiring to macrofy @@ -109,10 +113,10 @@ void GB_macrofy_mxm // construct all macros for GrB_mxm bool is_positional = GB_IS_BINARYOP_CODE_POSITIONAL (mult->opcode) ; fprintf (fp, "\n// monoid:\n") ; - const char *u_expr ; + const char *u_expr, *g_expr ; GB_macrofy_type (fp, "Z", "_", (zcode == 0) ? "GB_void" : ztype->name) ; GB_macrofy_monoid (fp, add_ecode, id_ecode, term_ecode, C_iso, monoid, - is_positional, &u_expr) ; + is_positional, &u_expr, &g_expr) ; //-------------------------------------------------------------------------- // construct macros for the multiply operator @@ -122,7 +126,7 @@ void GB_macrofy_mxm // construct all macros for GrB_mxm flipxy ? " (flipped)" : "") ; const char *f_expr ; GB_macrofy_binop (fp, "GB_MULT", flipxy, false, false, mult_ecode, C_iso, - mult, &f_expr, NULL) ; + mult, &f_expr, NULL, NULL) ; //-------------------------------------------------------------------------- // multiply-add operator @@ -165,29 +169,20 @@ void GB_macrofy_mxm // construct all macros for GrB_mxm // Since GB_MULT is not used, the fused GB_MULTADD must handle flipxy. - if (flipxy) - { - fprintf (fp, "#define GB_MULTADD(z,y,x,j,k,i) ") ; + if (g_expr == NULL) + { + // the CPU and GPU use the same macro + GB_macrofy_multadd (fp, u_expr, f_expr, flipxy) ; } else - { - fprintf (fp, "#define GB_MULTADD(z,x,y,i,k,j) ") ; - } - for (const char *p = u_expr ; (*p) != '\0' ; p++) { - // all update operators have a single 'y' - if ((*p) == 'y') - { - // inject the multiply operator; all have the form "z = ..." 
- fprintf (fp, "%s", f_expr + 4) ; - } - else - { - // otherwise, print the update operator character - fprintf (fp, "%c", (*p)) ; - } + // the CPU uses u_expr, and GPU uses g_expr + fprintf (fp, "#ifdef GB_CUDA_KERNEL\n") ; + GB_macrofy_multadd (fp, g_expr, f_expr, flipxy) ; + fprintf (fp, "#else\n") ; + GB_macrofy_multadd (fp, u_expr, f_expr, flipxy) ; + fprintf (fp, "#endif\n") ; } - fprintf (fp, "\n") ; } else diff --git a/GraphBLAS/Source/GB_macrofy_preface.c b/GraphBLAS/Source/GB_macrofy_preface.c index 0ad00c6f07..25363e5b75 100644 --- a/GraphBLAS/Source/GB_macrofy_preface.c +++ b/GraphBLAS/Source/GB_macrofy_preface.c @@ -14,9 +14,11 @@ void GB_macrofy_preface ( FILE *fp, // target file to write, already open char *kernel_name, // name of the kernel - char *preface // user-provided preface + char *C_preface, // user-provided preface for CPU JIT kernels + char *CUDA_preface, // user-provided preface for CUDA JIT kernels + GB_jit_kcode kcode ) -{ +{ const char *date = GxB_IMPLEMENTATION_DATE ; int len = (int) strlen (date) ; @@ -32,14 +34,25 @@ void GB_macrofy_preface "// The above copyright and license do not apply to any\n" "// user-defined types and operators defined below.\n" "//--------------------------------------" - "----------------------------------------\n" - "%s\n" - "#include \"GB_jit_kernel.h\"\n\n", + "----------------------------------------\n", kernel_name, GxB_IMPLEMENTATION_MAJOR, GxB_IMPLEMENTATION_MINOR, GxB_IMPLEMENTATION_SUB, - date + GB_IMAX (0, len - 4), - preface) ; + date + GB_IMAX (0, len - 4)) ; + + if (kcode >= GB_JIT_CUDA_KERNEL) + { + // for CUDA JIT kernels + fprintf (fp, "#define GB_CUDA_KERNEL\n%s\n", CUDA_preface) ; + } + else + { + // CPU JIT kernels + fprintf (fp, "%s\n", C_preface) ; + } + + // for all kernels: CPU and CUDA + fprintf (fp, "#include \"GB_jit_kernel.h\"\n\n") ; } diff --git a/GraphBLAS/Source/GB_macrofy_query.c b/GraphBLAS/Source/GB_macrofy_query.c index 87680f6190..c6ed31bbe8 100644 --- a/GraphBLAS/Source/GB_macrofy_query.c +++ b/GraphBLAS/Source/GB_macrofy_query.c @@ -20,7 +20,8 @@ void GB_macrofy_query GrB_Type type0, GrB_Type type1, GrB_Type type2, - uint64_t hash // hash code for the kernel + uint64_t hash, // hash code for the kernel + GB_jit_kcode kcode ) { @@ -28,8 +29,21 @@ void GB_macrofy_query // create the function header, and query the version //-------------------------------------------------------------------------- - fprintf (fp, - "GB_JIT_GLOBAL GB_JIT_QUERY_PROTO (GB_jit_query) ;\n" + if (kcode >= GB_JIT_CUDA_KERNEL) + { + // ensure the query function can be called from a C function + fprintf (fp, "extern \"C\"\n{\n") ; + } + + fprintf (fp, + "GB_JIT_GLOBAL GB_JIT_QUERY_PROTO (GB_jit_query) ;\n") ; + + if (kcode >= GB_JIT_CUDA_KERNEL) + { + fprintf (fp, "}\n") ; + } + + fprintf (fp, "GB_JIT_GLOBAL GB_JIT_QUERY_PROTO (GB_jit_query)\n" "{\n" " (*hash) = 0x%016" PRIx64 " ;\n" diff --git a/GraphBLAS/Source/GB_macrofy_reduce.c b/GraphBLAS/Source/GB_macrofy_reduce.c index 8c3f225f56..3ca25ac043 100644 --- a/GraphBLAS/Source/GB_macrofy_reduce.c +++ b/GraphBLAS/Source/GB_macrofy_reduce.c @@ -63,7 +63,7 @@ void GB_macrofy_reduce // construct all macros for GrB_reduce to scalar fprintf (fp, "\n// monoid:\n") ; GB_macrofy_type (fp, "Z", "_", monoid->op->ztype->name) ; GB_macrofy_monoid (fp, red_ecode, id_ecode, term_ecode, false, monoid, - false, NULL) ; + false, NULL, NULL) ; fprintf (fp, "#define GB_GETA_AND_UPDATE(z,Ax,p)") ; if (atype == monoid->op->ztype) diff --git a/GraphBLAS/Source/GB_macrofy_unop.c 
b/GraphBLAS/Source/GB_macrofy_unop.c index 3f074e527e..2c8658c627 100644 --- a/GraphBLAS/Source/GB_macrofy_unop.c +++ b/GraphBLAS/Source/GB_macrofy_unop.c @@ -16,6 +16,7 @@ // #define GB_UNARYOP(z,x,j,i,y) z = f (x,i,j,y) #include "GB.h" +#include "GB_math.h" #include "GB_stringify.h" #include diff --git a/GraphBLAS/Source/GB_mask.h b/GraphBLAS/Source/GB_mask.h index 27c6338f77..5033419bb3 100644 --- a/GraphBLAS/Source/GB_mask.h +++ b/GraphBLAS/Source/GB_mask.h @@ -10,6 +10,7 @@ #ifndef GB_MASK_H #define GB_MASK_H #include "GB.h" +#include "GB_math.h" GrB_Info GB_mask // C = Z ( diff --git a/GraphBLAS/Source/GB_math.c b/GraphBLAS/Source/GB_math.c index e4a6c1cf3b..c34f57ee11 100644 --- a/GraphBLAS/Source/GB_math.c +++ b/GraphBLAS/Source/GB_math.c @@ -8,6 +8,7 @@ //------------------------------------------------------------------------------ #include "GB.h" +#include "GB_math.h" #if !GB_HAS_CMPLX_MACROS // complex constructors when the C compiler does not provide CMPLX and CMPLXF diff --git a/GraphBLAS/Source/GB_math.h b/GraphBLAS/Source/GB_math.h index 559cdf16e2..8720efd778 100644 --- a/GraphBLAS/Source/GB_math.h +++ b/GraphBLAS/Source/GB_math.h @@ -7,6 +7,8 @@ //------------------------------------------------------------------------------ +#include "GB_casting.h" + #ifndef GB_MATH_H #define GB_MATH_H @@ -296,9 +298,6 @@ inline uint64_t GB_idiv_uint64 (uint64_t x, uint64_t y) // Three cases below are from ACM Algo 116, R. L. Smith, 1962. -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA - inline GxB_FC64_t GB_FC64_div (GxB_FC64_t x, GxB_FC64_t y) { double xr = GB_creal (x) ; @@ -401,8 +400,6 @@ inline GxB_FC32_t GB_FC32_div (GxB_FC32_t x, GxB_FC32_t y) " return (GJ_CMPLX32 ((float) GB_creal(zz), (float) GB_cimag(zz))) ; \n" \ "}" -#endif - //------------------------------------------------------------------------------ // z = x^y: wrappers for pow, powf, cpow, and cpowf //------------------------------------------------------------------------------ @@ -484,9 +481,6 @@ inline double GB_pow (double x, double y) " return (pow (x, y)) ; \n" \ "}" -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA - inline GxB_FC32_t GB_FC32_pow (GxB_FC32_t x, GxB_FC32_t y) { float xr = GB_crealf (x) ; @@ -608,7 +602,6 @@ inline GxB_FC64_t GB_FC64_pow (GxB_FC64_t x, GxB_FC64_t y) " } \n" \ " return (GB_cpow (x, y)) ; \n" \ "}" -#endif inline int8_t GB_pow_int8 (int8_t x, int8_t y) { @@ -792,9 +785,6 @@ inline double GB_signum (double x) " return ((double) ((x < 0) ? (-1) : ((x > 0) ? 1 : 0))) ; \n" \ "}" -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA - inline GxB_FC32_t GB_csignumf (GxB_FC32_t x) { if (GB_crealf (x) == 0 && GB_cimagf (x) == 0) @@ -1258,5 +1248,3 @@ inline bool GB_cisfinite (GxB_FC64_t x) #endif -#endif - diff --git a/GraphBLAS/Source/GB_ops.c b/GraphBLAS/Source/GB_ops.c index ccd8749bb5..58c61e51ec 100644 --- a/GraphBLAS/Source/GB_ops.c +++ b/GraphBLAS/Source/GB_ops.c @@ -11,6 +11,8 @@ // operators, index_unary operators, binary operators, monoids, and semirings. 
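// (This file is now compiled only as C: GB_ops.h and GB_casting.h both // raise #error under __cplusplus, per the guards added in this patch, so the // gcc pragma below no longer needs its __cplusplus check.)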
#include "GB.h" +#include "GB_math.h" +#include "GB_ops.h" //------------------------------------------------------------------------------ // compiler flags @@ -22,9 +24,7 @@ #pragma warning (disable: 144 ) #elif GB_COMPILER_GCC // disable gcc warnings - #if !defined ( __cplusplus ) #pragma GCC diagnostic ignored "-Wincompatible-pointer-types" - #endif #elif GB_COMPILER_MSC // disable MS Visual Studio warnings GB_PRAGMA (warning (disable : 4146 )) diff --git a/GraphBLAS/Source/GB_ops.h b/GraphBLAS/Source/GB_ops.h index d938bda669..0745ec3a64 100644 --- a/GraphBLAS/Source/GB_ops.h +++ b/GraphBLAS/Source/GB_ops.h @@ -7,6 +7,10 @@ //------------------------------------------------------------------------------ +#ifdef __cplusplus +#error "not used for C++" +#endif + #ifndef GB_OPS_H #define GB_OPS_H @@ -106,9 +110,6 @@ inline void GB_nonzombie_func (bool *z, const void *x, #define GB_DOUBLE #include "GB_ops_template.h" -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA - #define GB_TYPE GxB_FC32_t #define GB_XTYPE FC32 #define GB_BITS 64 @@ -127,5 +128,3 @@ inline void GB_nonzombie_func (bool *z, const void *x, #endif -#endif - diff --git a/GraphBLAS/Source/GB_reduce_to_scalar.c b/GraphBLAS/Source/GB_reduce_to_scalar.c index 838c5c5259..d743f6b9a8 100644 --- a/GraphBLAS/Source/GB_reduce_to_scalar.c +++ b/GraphBLAS/Source/GB_reduce_to_scalar.c @@ -114,7 +114,7 @@ GrB_Info GB_reduce_to_scalar // z = reduce_to_scalar (A) //---------------------------------------------------------------------- GrB_Matrix V = NULL ; - info = GB_cuda_reduce_to_scalar_jit (z, &V, monoid, A) ; + info = GB_cuda_reduce_to_scalar (z, &V, monoid, A) ; if (V != NULL) { @@ -137,12 +137,12 @@ GrB_Info GB_reduce_to_scalar // z = reduce_to_scalar (A) } } - // GB_cuda_reduce_to_scalar_jit may refuse to do the reduction and + // GB_cuda_reduce_to_scalar may refuse to do the reduction and // indicate this by returning GrB_NO_VALUE. If so, the CPU will do it // below. 
if (!(info == GrB_SUCCESS || info == GrB_NO_VALUE)) { - // GB_cuda_reduce_to_scalar_jit has returned an error + // GB_cuda_reduce_to_scalar has returned an error // (out of memory, or other error) return (info) ; } diff --git a/GraphBLAS/Source/GB_reduce_to_scalar_jit.c b/GraphBLAS/Source/GB_reduce_to_scalar_jit.c index 05ceaf6fb5..991c48b0e9 100644 --- a/GraphBLAS/Source/GB_reduce_to_scalar_jit.c +++ b/GraphBLAS/Source/GB_reduce_to_scalar_jit.c @@ -32,7 +32,8 @@ GrB_Info GB_reduce_to_scalar_jit // z = reduce_to_scalar (A) via the JIT GB_jit_encoding encoding ; char *suffix ; - uint64_t hash = GB_encodify_reduce (&encoding, &suffix, monoid, A) ; + uint64_t hash = GB_encodify_reduce (&encoding, &suffix, + GB_JIT_KERNEL_REDUCE, monoid, A) ; //-------------------------------------------------------------------------- // get the kernel function pointer, loading or compiling it if needed diff --git a/GraphBLAS/Source/GB_select.h b/GraphBLAS/Source/GB_select.h index 7c5e6598da..e049edd6af 100644 --- a/GraphBLAS/Source/GB_select.h +++ b/GraphBLAS/Source/GB_select.h @@ -10,6 +10,7 @@ #ifndef GB_SELECT_H #define GB_SELECT_H #include "GB.h" +#include "GB_math.h" #include "GB_is_nonzero.h" GrB_Info GB_select // C = accum (C, select(A,k)) or select(A',k) diff --git a/GraphBLAS/Source/GB_stringify.h b/GraphBLAS/Source/GB_stringify.h index 9b54b3d55d..630fcac01b 100644 --- a/GraphBLAS/Source/GB_stringify.h +++ b/GraphBLAS/Source/GB_stringify.h @@ -22,7 +22,9 @@ void GB_macrofy_preface ( FILE *fp, // target file to write, already open char *kernel_name, // name of the kernel - char *preface // user-provided preface + char *C_preface, // user-provided preface for CPU JIT kernels + char *CUDA_preface, // user-provided preface for CUDA JIT kernels + GB_jit_kcode kcode ) ; //------------------------------------------------------------------------------ @@ -73,6 +75,7 @@ uint64_t GB_encodify_reduce // encode a GrB_reduce problem // except for the suffix char **suffix, // suffix for user-defined kernel // input: + const GB_jit_kcode kcode, // kernel to encode GrB_Monoid monoid, // the monoid to enumify GrB_Matrix A // input matrix to reduce ) ; @@ -422,10 +425,10 @@ void GB_enumify_mxm // enumerate a GrB_mxm problem GrB_Matrix B ) ; -void GB_macrofy_mxm // construct all macros for GrB_mxm +void GB_macrofy_mxm // construct all macros for GrB_mxm ( // output: - FILE *fp, // target file to write, already open + FILE *fp, // target file to write, already open // input: uint64_t scode, GrB_Semiring semiring, // the semiring to macrofy @@ -434,6 +437,14 @@ void GB_macrofy_mxm // construct all macros for GrB_mxm GrB_Type btype ) ; +void GB_macrofy_multadd +( + FILE *fp, + const char *update_expression, // has the form "z = f(z,y)" + const char *multiply_expression, // has the form "z = mult(x,y)" + bool flipxy +) ; + GrB_Info GB_AxB_saxpy3_jit // C=A*B, saxpy3, via the JIT ( // input/output: @@ -639,7 +650,8 @@ void GB_macrofy_monoid // construct the macros for a monoid // semiring, times is normally a terminal monoid, but // it's not worth exploiting in GrB_mxm. 
// output: - const char **u_expression + const char **u_expression, + const char **g_expression ) ; bool GB_enumify_cuda_atomic // return true if CUDA can do it atomically @@ -665,7 +677,8 @@ void GB_macrofy_query GrB_Type type0, GrB_Type type1, GrB_Type type2, - uint64_t hash // hash code for the kernel + uint64_t hash, // hash code for the kernel + GB_jit_kcode kcode ) ; //------------------------------------------------------------------------------ @@ -687,16 +700,18 @@ void GB_macrofy_binop FILE *fp, // input: const char *macro_name, - bool flipxy, // if true: op is f(y,x) + bool flipxy, // if true: op is f(y,x) for a semiring bool is_monoid_or_build, // if true: additive operator for monoid, - // or binary op for GrB_Matrix_build + // or binary op for GrB_Matrix_build, or + // accum operator bool is_ewise, // if true: binop for ewise methods int ecode, bool C_iso, // if true: C is iso GrB_BinaryOp op, // NULL if C is iso // output: - const char **f_handle, - const char **u_handle + const char **f_handle, // basic expression z=f(x,y) + const char **u_handle, // update z=f(z,y) for the CPU + const char **g_handle // update z=f(z,y) for the GPU (if different) ) ; //------------------------------------------------------------------------------ diff --git a/GraphBLAS/Source/GrB_Global_get.c b/GraphBLAS/Source/GrB_Global_get.c index 31850b9e39..9bfb095a86 100644 --- a/GraphBLAS/Source/GrB_Global_get.c +++ b/GraphBLAS/Source/GrB_Global_get.c @@ -306,6 +306,11 @@ static GrB_Info GB_global_string_get (const char **value, int field) (*value) = GB_jitifyer_get_C_preface ( ) ; break ; + case GxB_JIT_CUDA_PREFACE : + + (*value) = GB_jitifyer_get_CUDA_preface ( ) ; + break ; + case GxB_JIT_ERROR_LOG : (*value) = GB_jitifyer_get_error_log ( ) ; diff --git a/GraphBLAS/Source/GrB_Global_set.c b/GraphBLAS/Source/GrB_Global_set.c index d8897cf680..540b69d06e 100644 --- a/GraphBLAS/Source/GrB_Global_set.c +++ b/GraphBLAS/Source/GrB_Global_set.c @@ -202,6 +202,10 @@ GrB_Info GrB_Global_set_String return (GB_jitifyer_set_C_preface (value)) ; + case GxB_JIT_CUDA_PREFACE : + + return (GB_jitifyer_set_CUDA_preface (value)) ; + case GxB_JIT_ERROR_LOG : return (GB_jitifyer_set_error_log (value)) ; diff --git a/GraphBLAS/Source/GrB_init.c b/GraphBLAS/Source/GrB_init.c index d0204449d4..c4fe7d0e51 100644 --- a/GraphBLAS/Source/GrB_init.c +++ b/GraphBLAS/Source/GrB_init.c @@ -8,7 +8,11 @@ //------------------------------------------------------------------------------ // GrB_init (or GxB_init) must be called before any other GraphBLAS operation. -// GrB_finalize must be called as the last GraphBLAS operation. +// GrB_finalize must be called as the last GraphBLAS operation. To use CUDA +// and its RMM memory manager: use a mode of GxB_BLOCKING_GPU or +// GxB_NONBLOCKING_GPU. + +// FIXME: rename GxB_*BLOCKING_GPU to GxB_*BLOCKING_CUDA. 
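+// (A sketch of the effect: calling GrB_init (GxB_NONBLOCKING_GPU) routes +// through GB_init with rmm_wrap_malloc/calloc/realloc/free in place of the +// C11 malloc/calloc/realloc/free, as the code below shows.)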
#include "GB.h" #include "GB_init.h" @@ -29,8 +33,17 @@ GrB_Info GrB_init // start up GraphBLAS // initialize GraphBLAS //-------------------------------------------------------------------------- - // default: use the C11 malloc memory manager, which is thread-safe - +#if defined ( GRAPHBLAS_HAS_CUDA ) + if (mode == GxB_BLOCKING_GPU || mode == GxB_NONBLOCKING_GPU) + { + return (GB_init (mode, // blocking or non-blocking mode + // RMM C memory management functions + rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, rmm_wrap_free, + Werk)) ; + } +#endif + + // default: use the C11 malloc memory manager, which is thread-safe return (GB_init (mode, // blocking or non-blocking mode malloc, calloc, realloc, free, // ANSI C memory management functions Werk)) ; diff --git a/GraphBLAS/Source/GxB_Global_Option_get.c b/GraphBLAS/Source/GxB_Global_Option_get.c index 20b41cef7b..226a315fb7 100644 --- a/GraphBLAS/Source/GxB_Global_Option_get.c +++ b/GraphBLAS/Source/GxB_Global_Option_get.c @@ -334,6 +334,11 @@ GrB_Info GxB_Global_Option_get_CHAR // gets the current global option (*value) = GB_jitifyer_get_C_preface ( ) ; break ; + case GxB_JIT_CUDA_PREFACE : + + (*value) = GB_jitifyer_get_CUDA_preface ( ) ; + break ; + case GxB_JIT_ERROR_LOG : (*value) = GB_jitifyer_get_error_log ( ) ; @@ -910,6 +915,17 @@ GrB_Info GxB_Global_Option_get // gets the current global option } break ; + case GxB_JIT_CUDA_PREFACE : + + { + va_start (ap, field) ; + const char **preface = va_arg (ap, const char **) ; + va_end (ap) ; + GB_RETURN_IF_NULL (preface) ; + (*preface) = GB_jitifyer_get_CUDA_preface ( ) ; + } + break ; + case GxB_JIT_C_CONTROL : { diff --git a/GraphBLAS/Source/GxB_Global_Option_set.c b/GraphBLAS/Source/GxB_Global_Option_set.c index ed47db4aeb..05905ddfc2 100644 --- a/GraphBLAS/Source/GxB_Global_Option_set.c +++ b/GraphBLAS/Source/GxB_Global_Option_set.c @@ -267,6 +267,10 @@ GrB_Info GxB_Global_Option_set_CHAR // set a global default option return (GB_jitifyer_set_C_preface (value)) ; + case GxB_JIT_CUDA_PREFACE : + + return (GB_jitifyer_set_CUDA_preface (value)) ; + case GxB_JIT_ERROR_LOG : return (GB_jitifyer_set_error_log (value)) ; @@ -544,6 +548,15 @@ GrB_Info GxB_Global_Option_set // set a global default option return (GB_jitifyer_set_C_preface (C_preface)) ; } + case GxB_JIT_CUDA_PREFACE : + + { + va_start (ap, field) ; + char *CUDA_preface = va_arg (ap, char *) ; + va_end (ap) ; + return (GB_jitifyer_set_CUDA_preface (CUDA_preface)) ; + } + case GxB_JIT_USE_CMAKE : { diff --git a/GraphBLAS/Source/GxB_init.c b/GraphBLAS/Source/GxB_init.c index d1d999caee..fb453ccf50 100644 --- a/GraphBLAS/Source/GxB_init.c +++ b/GraphBLAS/Source/GxB_init.c @@ -53,7 +53,7 @@ GrB_Info GxB_init // start up GraphBLAS and also define malloc, etc ( - GrB_Mode mode, // blocking or non-blocking mode, GPU or not + GrB_Mode mode, // blocking or non-blocking mode // pointers to memory management functions void * (* user_malloc_function ) (size_t), // required diff --git a/GraphBLAS/Source/JitKernels/GB_jit_kernel.h b/GraphBLAS/Source/JitKernels/GB_jit_kernel.h index 826a44cc70..f421485751 100644 --- a/GraphBLAS/Source/JitKernels/GB_jit_kernel.h +++ b/GraphBLAS/Source/JitKernels/GB_jit_kernel.h @@ -1,5 +1,5 @@ //------------------------------------------------------------------------------ -// GB_jit_kernel.h: JIT kernel #include for all kernels +// GB_jit_kernel.h: JIT kernel #include for all kernels (both CPU and CUDA) //------------------------------------------------------------------------------ // 
diff --git a/GraphBLAS/Source/JitKernels/GB_jit_kernel.h b/GraphBLAS/Source/JitKernels/GB_jit_kernel.h
index 826a44cc70..f421485751 100644
--- a/GraphBLAS/Source/JitKernels/GB_jit_kernel.h
+++ b/GraphBLAS/Source/JitKernels/GB_jit_kernel.h
@@ -1,5 +1,5 @@
 //------------------------------------------------------------------------------
-// GB_jit_kernel.h: JIT kernel #include for all kernels
+// GB_jit_kernel.h: JIT kernel #include for all kernels (both CPU and CUDA)
 //------------------------------------------------------------------------------
 
 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
@@ -13,10 +13,17 @@
 #define GB_JIT_KERNEL_H
 #define GB_JIT_KERNEL
 
-#include "GB_Template.h"
-#include "GB_jit_kernel_proto.h"
-// for JIT kernels
+#ifndef GB_CUDA_KERNEL
+    // for CPU JIT kernels:
+    #include "GB_Template.h"
+#else
+    // for CUDA JIT kernels:
+    #include "GB_cuda_kernel.cuh"
+#endif
+
+// for all JIT kernels
+#include "GB_jit_kernel_proto.h"
 
 #if defined (_MSC_VER) && !(defined (__INTEL_COMPILER) || defined(__INTEL_CLANG_COMPILER))
     #define GB_JIT_GLOBAL extern __declspec ( dllexport )
 #else
@@ -24,7 +31,7 @@
 #endif
 
 #ifndef GB_JIT_RUNTIME
-    // for PreJIT kernels
+    // for PreJIT kernels (CPU and CUDA)
     #include "GB_callbacks.h"
 #endif
diff --git a/GraphBLAS/Source/Template/GB_Template.h b/GraphBLAS/Source/Template/GB_Template.h
index c608047fa7..f89ae26041 100644
--- a/GraphBLAS/Source/Template/GB_Template.h
+++ b/GraphBLAS/Source/Template/GB_Template.h
@@ -27,6 +27,10 @@
 #include "GraphBLAS.h"
 #undef I
 
+#ifdef GBMATLAB
+#undef GRAPHBLAS_HAS_CUDA
+#endif
+
 //------------------------------------------------------------------------------
 // handle the restrict and 'static inline' keywords
 //------------------------------------------------------------------------------
diff --git a/GraphBLAS/Source/Template/GB_jit_kernel_proto.h b/GraphBLAS/Source/Template/GB_jit_kernel_proto.h
index 47b2fb6ed8..5c260789ab 100644
--- a/GraphBLAS/Source/Template/GB_jit_kernel_proto.h
+++ b/GraphBLAS/Source/Template/GB_jit_kernel_proto.h
@@ -15,7 +15,7 @@
 //------------------------------------------------------------------------------
 
 #define GB_JIT_QUERY_PROTO(query_func) \
-bool query_func (uint64_t *hash, int v [3], char *defn [5], \
+bool query_func (uint64_t *hash, int v [3], const char *defn [5], \
     void *id, void *term, size_t id_size, size_t term_size)
 
 #define GB_JIT_KERNEL_USER_OP_PROTO(GB_jit_kernel_user_op) \
@@ -557,6 +557,34 @@ GrB_Info GB_jit_kernel_union \
     const bool M_is_B \
 )
 
+//------------------------------------------------------------------------------
+// CUDA JIT prototypes
+//------------------------------------------------------------------------------
+
+#define GB_JIT_CUDA_KERNEL_REDUCE_PROTO(GB_jit_kernel_reduce) \
+GrB_Info GB_jit_kernel_reduce \
+( \
+    GB_void *zscalar, \
+    GrB_Matrix V, \
+    const GrB_Matrix A, \
+    cudaStream_t stream, \
+    int32_t gridsz, \
+    int32_t blocksz \
+)
+
+#define GB_JIT_CUDA_KERNEL_DOT3_PROTO(GB_jit_kernel_AxB_dot3) \
+GrB_Info GB_jit_kernel_AxB_dot3 \
+( \
+    GrB_Matrix C, \
+    const GrB_Matrix M, \
+    const GrB_Matrix A, \
+    const GrB_Matrix B, \
+    cudaStream_t stream, \
+    int device, \
+    int number_of_sms, \
+    const GB_callback_struct *restrict my_callback \
+)
+
 //------------------------------------------------------------------------------
 // shorthand macros for GB_prejit.c:
 //------------------------------------------------------------------------------
@@ -603,5 +631,12 @@ GrB_Info GB_jit_kernel_union \
 #define JIT_UTYP(g) GB_JIT_KERNEL_USER_TYPE_PROTO(g) ;
 #define JIT_Q(q) GB_JIT_QUERY_PROTO(q) ;
 
+//------------------------------------------------------------------------------
+// shorthand macros for GB_cuda_prejit.c:
+//------------------------------------------------------------------------------
+
+#define JIT_CUDA_RED(g) GB_JIT_CUDA_KERNEL_REDUCE_PROTO(g) ;
+#define JIT_CUDA_DOT3(g) GB_JIT_CUDA_KERNEL_DOT3_PROTO(g) ;
+
 #endif
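To see how these prototype macros are meant to be consumed, here is a sketch of a CUDA JIT kernel source file. The GB_jit_kernel entry-point name and the GB_CUDA_KERNEL define mirror the CPU JIT convention implied by GB_jit_kernel.h above; the body is elided, so this illustrates the structure only, not a working kernel:

// hypothetical generated file: GB_jit_kernel_cuda_reduce.cu
#define GB_CUDA_KERNEL
#include "GB_jit_kernel.h"

// declare, then define, the exported entry point
GB_JIT_GLOBAL GB_JIT_CUDA_KERNEL_REDUCE_PROTO (GB_jit_kernel) ;
GB_JIT_GLOBAL GB_JIT_CUDA_KERNEL_REDUCE_PROTO (GB_jit_kernel)
{
    // launch the device-side reduction of A on the given stream, with the
    // requested grid/block sizes, leaving the result in zscalar (or
    // partial results in V) ...
    return (GrB_SUCCESS) ;
}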
diff --git a/GraphBLAS/Source/codegen_aop.m b/GraphBLAS/Source/codegen_aop.m
index 5ebb329e63..709eaecbdb 100644
--- a/GraphBLAS/Source/codegen_aop.m
+++ b/GraphBLAS/Source/codegen_aop.m
@@ -17,7 +17,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_aop.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 % The ANY operator is not used as a binary operator in the generated functions.
diff --git a/GraphBLAS/Source/codegen_as.m b/GraphBLAS/Source/codegen_as.m
index 12796e3147..78a741395a 100644
--- a/GraphBLAS/Source/codegen_as.m
+++ b/GraphBLAS/Source/codegen_as.m
@@ -17,7 +17,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_as.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 codegen_as_template ('bool') ;
diff --git a/GraphBLAS/Source/codegen_axb.m b/GraphBLAS/Source/codegen_axb.m
index 354aed8efb..b0207b5290 100644
--- a/GraphBLAS/Source/codegen_axb.m
+++ b/GraphBLAS/Source/codegen_axb.m
@@ -28,7 +28,7 @@
     fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
     fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
     fprintf (fh, '// This file has been automatically generated from Generator/GB_AxB.h') ;
-    fprintf (fh, '\n\n') ;
+    fprintf (fh, '\n#include "GB_math.h"\n\n') ;
     fclose (fh) ;
 end
diff --git a/GraphBLAS/Source/codegen_ew.m b/GraphBLAS/Source/codegen_ew.m
index 93ea69ef85..8f705b9b84 100644
--- a/GraphBLAS/Source/codegen_ew.m
+++ b/GraphBLAS/Source/codegen_ew.m
@@ -17,7 +17,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_ew.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 % The ANY operator is not used as a binary operator in the generated functions.
diff --git a/GraphBLAS/Source/codegen_red.m b/GraphBLAS/Source/codegen_red.m
index 4b76152909..d48a24747d 100644
--- a/GraphBLAS/Source/codegen_red.m
+++ b/GraphBLAS/Source/codegen_red.m
@@ -17,7 +17,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_red.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 fh = fopen ('FactoryKernels/GB_bld__include.h', 'w') ;
@@ -28,7 +28,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_bld.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 %-------------------------------------------------------------------------------
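Each of these one-line codegen edits has the same effect: every generated FactoryKernels header now ends its preamble with an #include of GB_math.h instead of a blank line. Reconstructed from the fprintf calls above, the top of a regenerated header such as GB_aop__include.h reads:

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

// This file has been automatically generated from Generator/GB_aop.h
#include "GB_math.h"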
diff --git a/GraphBLAS/Source/codegen_sel.m b/GraphBLAS/Source/codegen_sel.m
index 2ff31c8e32..dd41aca256 100644
--- a/GraphBLAS/Source/codegen_sel.m
+++ b/GraphBLAS/Source/codegen_sel.m
@@ -18,7 +18,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_sel.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 % NONZOMBIE: name selector type
diff --git a/GraphBLAS/Source/codegen_unop.m b/GraphBLAS/Source/codegen_unop.m
index 9753df6449..45116d7ab1 100644
--- a/GraphBLAS/Source/codegen_unop.m
+++ b/GraphBLAS/Source/codegen_unop.m
@@ -17,7 +17,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_unop.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 codegen_unop_identity ;
diff --git a/GraphBLAS/Tcov/.gitignore b/GraphBLAS/Tcov/.gitignore
new file mode 100644
index 0000000000..8faafc0802
--- /dev/null
+++ b/GraphBLAS/Tcov/.gitignore
@@ -0,0 +1,2 @@
+# ignore these files
+log_GB_mex_test21.txt
diff --git a/GraphBLAS/Tcov/log_GB_mex_test21.txt b/GraphBLAS/Tcov/log_GB_mex_test21.txt
deleted file mode 100644
index 369fe5ae07..0000000000
--- a/GraphBLAS/Tcov/log_GB_mex_test21.txt
+++ /dev/null
@@ -1,2574 +0,0 @@
-
-
-================================================================================
-GB_macrofy_cast_output, ztype NULL
-#define GB_PUTC(z,Cx,p)
-
-
-================================================================================
-GB_macrofy_cast_output, cast FC64 to bool
-#define GB_PUTC(z,Cx,p) Cx [p] = (GB_creal (z) != 0 || GB_cimag (z) != 0)
-
-
-================================================================================
-GB_assign_describe
-C = A
-
-
-================================================================================
-GB_enumify_ewise / GB_macrofy_ewise, C iso
-// op: symbolic only (C is iso)
-
-// binary operator types:
-#define GB_Z_TYPE void
-#define GB_X_TYPE void
-#define GB_Y_TYPE void
-
-// binary operator:
-#define GB_BINOP(z,x,y,i,j)
-#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso)
-#define GB_COPY_B_to_C(Cx,pC,Bx,pB,B_iso)
-
-// C matrix: sparse
-#define GB_C_IS_HYPER 0
-#define GB_C_IS_SPARSE 1
-#define GB_C_IS_BITMAP 0
-#define GB_C_IS_FULL 0
-#define GBP_C(Cp,k,vlen) Cp [k]
-#define GBH_C(Ch,k) (k)
-#define GBI_C(Ci,p,vlen) Ci [p]
-#define GBB_C(Cb,p) 1
-#define GB_C_NVALS(e) int64_t e = C->nvals
-#define GB_C_NHELD(e) GB_C_NVALS(e)
-#define GB_C_ISO 1
-#define GB_C_IN_ISO 0
-#define GB_C_TYPE void
-#define GB_PUTC(c,Cx,p)
-#define GB_EWISEOP(Cx,p,aij,bij,i,j)
-
-// M matrix: none
-#define GB_M_TYPE void
-#define GB_MCAST(Mx,p,msize) 1
-#define GB_MASK_STRUCT 1
-#define GB_MASK_COMP 0
-#define GB_NO_MASK 1
-
-// A matrix: hypersparse
-#define GB_A_IS_HYPER 1
-#define GB_A_IS_SPARSE 0
-#define GB_A_IS_BITMAP 0
-#define GB_A_IS_FULL 0
-#define GBP_A(Ap,k,vlen) Ap [k]
-#define GBH_A(Ah,k) Ah [k]
-#define GBI_A(Ai,p,vlen) Ai [p]
-#define GBB_A(Ab,p) 1
-#define GB_A_NVALS(e) int64_t e = A->nvals
-#define GB_A_NHELD(e) GB_A_NVALS(e)
-#define GB_A_ISO 0
-#define GB_A_TYPE bool
-#define GB_A2TYPE void
-#define GB_DECLAREA(a)
-#define GB_GETA(a,Ax,p,iso)
-
-// B matrix: hypersparse
-#define GB_B_IS_HYPER 1
-#define GB_B_IS_SPARSE 0
-#define GB_B_IS_BITMAP 0
-#define GB_B_IS_FULL 0
-#define GBP_B(Bp,k,vlen) Bp [k]
-#define GBH_B(Bh,k) Bh [k]
-#define GBI_B(Bi,p,vlen) Bi [p]
-#define GBB_B(Bb,p) 1
-#define GB_B_NVALS(e) int64_t e = B->nvals
-#define GB_B_NHELD(e)
GB_B_NVALS(e) -#define GB_B_ISO 0 -#define GB_B_TYPE bool -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_ewise_shared_definitions.h" - - -================================================================================ -GB_enumify_ewise / GB_macrofy_ewise, C non iso -// op: (and, bool) - -// binary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool - -// binary operator: -#define GB_BINOP(z,x,y,i,j) z = ((x) && (y)) -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_B_to_C(Cx,pC,Bx,pB,B_iso) Cx [pC] = Bx [pB] - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(c,Cx,p) Cx [p] = c -#define GB_EWISEOP(Cx,p,aij,bij,i,j) GB_BINOP (Cx [p], aij, bij, i, j) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE bool -#define GB_DECLAREA(a) bool a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 0 -#define GB_B_TYPE bool -#define GB_B2TYPE bool -#define GB_DECLAREB(b) bool b -#define GB_GETB(b,Bx,p,iso) b = Bx [p] - -#include "GB_ewise_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, C iso -// semiring: symbolic only (C is iso) - -// monoid: -#define GB_Z_TYPE void -#define GB_UPDATE(z,y) -#define GB_ADD(z,x,y) -#define GB_DECLARE_IDENTITY(z) -#define GB_DECLARE_IDENTITY_CONST(z) -#define GB_IS_ANY_MONOID 1 -#define GB_Z_IGNORE_OVERFLOW 1 -#define GB_Z_NBITS 0 -#define GB_Z_ATOMIC_BITS 0 - -// multiplicative operator (flipped): -#define GB_MULT(z,x,y,i,k,j) - -// multiply-add operator: -#define GB_MULTADD(z,x,y,i,k,j) - -// special cases: - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(c,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] 
-#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, any_pair, flipxy -// semiring: symbolic only (C is iso) - -// monoid: -#define GB_Z_TYPE void -#define GB_UPDATE(z,y) -#define GB_ADD(z,x,y) -#define GB_DECLARE_IDENTITY(z) -#define GB_DECLARE_IDENTITY_CONST(z) -#define GB_IS_ANY_MONOID 1 -#define GB_Z_NBITS 0 -#define GB_Z_ATOMIC_BITS 0 - -// multiplicative operator (flipped): -#define GB_MULT(z,x,y,i,k,j) - -// multiply-add operator: -#define GB_MULTADD(z,x,y,i,k,j) - -// special cases: -#define GB_IS_ANY_PAIR_SEMIRING 1 -#define GB_IS_PAIR_MULTIPLIER 1 - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE void -#define GB_PUTC(c,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, any_pair fp32 -// semiring: (any, pair (flipped), float) - -// monoid: -#define GB_Z_TYPE float -#define GB_ADD(z,x,y) z = y -#define GB_UPDATE(z,y) z = y -#define GB_DECLARE_IDENTITY(z) float z = 0 -#define GB_DECLARE_IDENTITY_CONST(z) const float z = 0 -#define GB_HAS_IDENTITY_BYTE 1 -#define GB_IDENTITY_BYTE 0x00 -#define GB_IS_ANY_MONOID 1 -#define GB_Z_NBITS 32 -#define GB_Z_ATOMIC_BITS 32 -#define 
GB_Z_HAS_ATOMIC_UPDATE 1 -#define GB_Z_HAS_OMP_ATOMIC_UPDATE 1 -#define GB_Z_HAS_CUDA_ATOMIC_BUILTIN 1 -#define GB_Z_CUDA_ATOMIC GB_cuda_atomic_write -#define GB_Z_CUDA_ATOMIC_TYPE float - -// multiplicative operator (flipped): -#define GB_MULT(z,y,x,j,k,i) z = 1 - -// multiply-add operator: -#define GB_MULTADD(z,y,x,j,k,i) z = 1 - -// special cases: -#define GB_IS_PAIR_MULTIPLIER 1 - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE float -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowindex -// op: (rowindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowindex -// op: (rowindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) + (y)) -#define 
GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colindex -// op: (colindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colindex -// op: (colindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diagindex -// op: (diagindex, 
GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) - ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diagindex -// op: (diagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) - ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: tril -// op: (tril, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) <= ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_TRIL_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include 
"GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: triu -// op: (triu, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) >= ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_TRIU_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diag -// op: (diag, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) == ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_DIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: offdiag -// op: (offdiag, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) != ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 
-#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colle -// op: (colle, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) <= (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLLE_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colgt -// op: (colgt, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) > (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLGT_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowle -// op: (rowle, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) <= (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWLE_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define 
GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowgt -// op: (rowgt, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) > (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWGT_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: flipdiagindex -// op: (flipdiagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) - ((j) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: flipdiagindex -// op: (flipdiagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) - ((j) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define 
GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: opi32 -// op: opi32func, ztype: GxB_FC32_t, xtype: GxB_FC32_t, ytype: GxB_FC32_t - -// unary operator types: -#define GB_Z_TYPE GxB_FC32_t -#define GB_X_TYPE GxB_FC32_t -#define GB_Y_TYPE GxB_FC32_t - -// index unary operator: -#ifndef GB_GUARD_opi32func_DEFINED -#define GB_GUARD_opi32func_DEFINED -GB_STATIC_INLINE -void opi32func (GxB_FC32_t *z, const GxB_FC32_t *x, GrB_Index i, GrB_Index j, - const GxB_FC32_t *y) -{ - (*z) = (*x) ; -} -#define GB_opi32func_USER_DEFN \ -"void opi32func (GxB_FC32_t *z, const GxB_FC32_t *x, GrB_Index i, GrB_Index j, \n" \ -" const GxB_FC32_t *y) \n" \ -"{ \n" \ -" (*z) = (*x) ; \n" \ -"}" -#endif -#define GB_IDXUNOP(z,x,i,j,y) opi32func (&(z), &(x), i, j, &(y)) -#define GB_DEPENDS_ON_X 1 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_DECLAREA (x) ; \ - GB_GETA (x, Ax, p, ) ; \ - GB_IDXUNOP (z, x, i, j, y) ; \ - bool keep = (GB_crealf (z) != 0 || GB_cimagf (z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE GxB_FC32_t - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE GxB_FC32_t -#define GB_A2TYPE GxB_FC32_t -#define GB_DECLAREA(a) GxB_FC32_t a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: one -// op: (one, void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) 
int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni -// op: (positioni, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], i, , ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni -// op: (positioni, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], i, , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni1 -// op: (positioni1, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE 
void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], i, , ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni1 -// op: (positioni1, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], i, , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj -// op: (positionj, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], , j, ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define 
GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj -// op: (positionj, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , j, ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj1 -// op: (positionj1, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], , j, ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define 
GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj1 -// op: (positionj1, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , j, ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: sqrt -// op: (sqrt, GxB_FC64_t) - -// unary operator types: -#define GB_Z_TYPE GxB_FC64_t -#define GB_X_TYPE GxB_FC64_t -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = GB_csqrt (x) -#define GB_DEPENDS_ON_X 1 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREA (aij) ; \ - GB_GETA (aij, Ax, pA, ) ; \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, , , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#ifndef GB_GUARD_GJ_cast_to_int32_DEFINED -#define GB_GUARD_GJ_cast_to_int32_DEFINED -GB_STATIC_INLINE -int32_t GJ_cast_to_int32 (double x) -{ - if (isnan (x)) return (0) ; - if (x <= (double) INT32_MIN) return (INT32_MIN) ; - if (x >= (double) INT32_MAX) return (INT32_MAX) ; - return ((int32_t) x) ; -} -#endif -#define GB_PUTC(c,Cx,p) Cx [p] = GJ_cast_to_int32 (GB_creal (c)) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define 
GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_TYPE int32_t -#define GB_A2TYPE GxB_FC64_t -#define GB_DECLAREA(a) GxB_FC64_t a -#define GB_GETA(a,Ax,p,iso) a = (GxB_FC64_t) (Ax [p]) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_build / GB_macrofy_build: times -// op: (times, GxB_FC32_t) - -// binary dup operator types: -#define GB_Z_TYPE GxB_FC32_t -#define GB_X_TYPE GxB_FC32_t -#define GB_Y_TYPE GxB_FC32_t - -// S and T data types: -#define GB_T_TYPE bool -#define GB_S_TYPE bool - -// binary dup operator: -#define GB_DUP(z,x,y) z = GB_FC32_mul (x,y) -#define GB_UPDATE(z,y) GB_DUP(z,z,y) - -// build copy/dup methods: -#define GB_BLD_COPY(Tx,p,Sx,k) Tx [p] = Sx [k] -#define GB_BLD_DUP(Tx,p,Sx,k) \ - GxB_FC32_t y = (GxB_FC32_t) Sx [k] ; \ - GxB_FC32_t x = (GxB_FC32_t) Tx [p] ; \ - GxB_FC32_t z ; \ - GB_DUP (z, x, y) ; \ - Tx [p] = (GB_crealf (z) != 0 || GB_cimagf (z) != 0) ; - -#include "GB_kernel_shared_definitions.h" - - -================================================================================ -GB_enumify_build / GB_macrofy_build: and -// op: (and, bool) - -// binary dup operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool - -// S and T data types: -#define GB_T_TYPE GxB_FC32_t -#define GB_S_TYPE GxB_FC32_t - -// binary dup operator: -#define GB_DUP(z,x,y) z = ((x) && (y)) -#define GB_UPDATE(z,y) z &= y - -// build copy/dup methods: -#define GB_BLD_COPY(Tx,p,Sx,k) Tx [p] = Sx [k] -#define GB_BLD_DUP(Tx,p,Sx,k) \ - bool y = (GB_crealf (Sx [k]) != 0 || GB_cimagf (Sx [k]) != 0) ; \ - bool x = (GB_crealf (Tx [p]) != 0 || GB_cimagf (Tx [p]) != 0) ; \ - bool z ; \ - GB_DUP (z, x, y) ; \ - Tx [p] = (GxB_FC32_t) z ; - -#include "GB_kernel_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)=A (assign) -// assign/subassign: C(I,J) = A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_assign_shared_definitions.h" - - 
-================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:s:hi,lo:s:hi)=A (assign) -// assign/subassign: C(I,J) = A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_STRIDE -#define GB_J_KIND GB_STRIDE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(i,J)=s (row assign) -// assign/subassign: C(i,J) = A -#define GB_ASSIGN_KIND GB_ROW_ASSIGN -#define GB_I_KIND GB_ALL -#define GB_J_KIND GB_LIST -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) Cx [pC] = cwork -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(I,j)=s (col assign) -// assign/subassign: C(I,j) = A -#define GB_ASSIGN_KIND GB_COL_ASSIGN -#define GB_I_KIND GB_LIST -#define GB_J_KIND GB_ALL -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define 
GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) Cx [pC] = cwork -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)=A (assign) -// assign/subassign: C(I,J) = scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(cwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)+=s (assign) -// assign/subassign: C(I,J) += scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_UPDATE(z,y) -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_scalar(Cx,pC,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool -#define GB_COPY_scalar_to_ywork(ywork,scalar) ywork = (float) (scalar) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)+=s (assign) -// assign/subassign: C(I,J) += 
scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_UPDATE(z,y) -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_scalar(Cx,pC,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool -#define GB_COPY_scalar_to_ywork(ywork,scalar) ywork = (float) (scalar) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)+=A (assign) -// assign/subassign: C(I,J) += A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = (x) + (y) -#define GB_UPDATE(z,y) z += y -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_DECLAREY (ywork) ; \ - GB_GETA (ywork, Ax, pA, ) ; \ - GB_DECLAREX (xwork) ; \ - GB_COPY_C_to_xwork (xwork, Cx, pC) ; \ - GB_DECLAREZ (zwork) ; \ - GB_ACCUM_OP (zwork, xwork, ywork) ; \ - GB_PUTC (zwork, Cx, pC) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = ((zwork) != 0) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = (float) (Cx [p]) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t 
e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)&=A (assign) -// assign/subassign: C(I,J) &= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (and, bool) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) bool xwork -#define GB_DECLAREY(ywork) bool ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = ((x) && (y)) -#define GB_UPDATE(z,y) z &= y -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_UPDATE (Cx [pC], Ax [pA]) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = zwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = Cx [p] - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE bool -#define GB_DECLAREA(a) bool a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)<=A (assign) -// assign/subassign: C(I,J) lt= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (lt, float) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = ((x) < (y)) -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_DECLAREY (ywork) ; \ - GB_GETA (ywork, Ax, pA, ) ; \ - GB_DECLAREX (xwork) ; \ - GB_COPY_C_to_xwork (xwork, Cx, pC) ; \ - GB_ACCUM_OP (Cx [pC], xwork, ywork) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] 
-#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = zwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = (float) (Cx [p]) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)<=H (assign) -// assign/subassign: C(I,J) lt= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (lt, float) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" diff --git a/GraphBLAS/Test/.gitignore b/GraphBLAS/Test/.gitignore new file mode 100644 index 
0000000000..8faafc0802 --- /dev/null +++ b/GraphBLAS/Test/.gitignore @@ -0,0 +1,2 @@ +# ignore these files +log_GB_mex_test21.txt diff --git a/GraphBLAS/Test/GB_mex_test11.c b/GraphBLAS/Test/GB_mex_test11.c index 6ee30630ae..cd33b115c3 100644 --- a/GraphBLAS/Test/GB_mex_test11.c +++ b/GraphBLAS/Test/GB_mex_test11.c @@ -235,6 +235,17 @@ if (jit_enabled) OK (GxB_Global_Option_get_CHAR (GxB_JIT_C_PREFACE, &t)) ; CHECK (MATCH (t, "// more stuff here")) ; + OK (GxB_get (GxB_JIT_CUDA_PREFACE, &s)) ; + printf ("default CUDA preface [%s]\n", s) ; + OK (GxB_set (GxB_JIT_CUDA_PREFACE, "// cuda stuff here")) ; + OK (GxB_get (GxB_JIT_CUDA_PREFACE, &s)) ; + CHECK (MATCH (s, "// cuda stuff here")) ; + OK (GxB_Global_Option_get_CHAR (GxB_JIT_CUDA_PREFACE, &t)) ; + CHECK (MATCH (t, "// cuda stuff here")) ; + OK (GxB_Global_Option_set_CHAR (GxB_JIT_CUDA_PREFACE, + "// more cuda stuff here")) ; + OK (GxB_Global_Option_get_CHAR (GxB_JIT_CUDA_PREFACE, &t)) ; + CHECK (MATCH (t, "// more cuda stuff here")) ; OK (GxB_Type_new (&MyType, 0, "mytype", "typedef double mytype ;")) ; OK (GxB_Type_size (&mysize, MyType)) ; diff --git a/GraphBLAS/Test/GB_mex_test13.c b/GraphBLAS/Test/GB_mex_test13.c index fa359450bb..1e7872a342 100644 --- a/GraphBLAS/Test/GB_mex_test13.c +++ b/GraphBLAS/Test/GB_mex_test13.c @@ -57,6 +57,7 @@ void mexFunction ERR (GxB_Global_Option_set_CHAR (GxB_JIT_C_COMPILER_FLAGS, NULL)) ; ERR (GxB_Global_Option_set_CHAR (GxB_JIT_C_LINKER_FLAGS, NULL)) ; ERR (GxB_Global_Option_set_CHAR (GxB_JIT_C_PREFACE, NULL)) ; + ERR (GxB_Global_Option_set_CHAR (GxB_JIT_CUDA_PREFACE, NULL)) ; OK (GxB_Global_Option_set_CHAR (GxB_JIT_ERROR_LOG, NULL)) ; //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Test/GB_mex_test16.c b/GraphBLAS/Test/GB_mex_test16.c index 21e955e111..998f111825 100644 --- a/GraphBLAS/Test/GB_mex_test16.c +++ b/GraphBLAS/Test/GB_mex_test16.c @@ -120,7 +120,7 @@ void mexFunction GrB_FP32, NULL, false, false, s, false, A, B) ; CHECK (code == UINT64_MAX) ; - code = GB_encodify_reduce (&e, &suffix, mon, A) ; + code = GB_encodify_reduce (&e, &suffix, GB_JIT_KERNEL_REDUCE, mon, A) ; CHECK (code == UINT64_MAX) ; code = GB_encodify_assign (&e, &suffix, 0, C, false, 0, 0, NULL, diff --git a/GraphBLAS/Test/GB_mex_test21.c b/GraphBLAS/Test/GB_mex_test21.c index 37a5bad838..ef7f4c58dc 100644 --- a/GraphBLAS/Test/GB_mex_test21.c +++ b/GraphBLAS/Test/GB_mex_test21.c @@ -74,7 +74,7 @@ void mexFunction const char *a, *cuda_type ; bool user_monoid_atomically ; bool has_cheeseburger = GB_enumify_cuda_atomic (&a, - &user_monoid_atomically, &cuda_type, NULL, 0, sizeof (uint16_t), 0) ; + &user_monoid_atomically, &cuda_type, NULL, 0, sizeof (uint32_t), 0) ; CHECK (!has_cheeseburger) ; CHECK (user_monoid_atomically) ; CHECK (cuda_type == NULL) ; diff --git a/GraphBLAS/Test/GB_mex_test29.c b/GraphBLAS/Test/GB_mex_test29.c index 0ec8a45325..14a5b748e5 100644 --- a/GraphBLAS/Test/GB_mex_test29.c +++ b/GraphBLAS/Test/GB_mex_test29.c @@ -347,6 +347,15 @@ void mexFunction OK (GrB_Global_get_String_ (GrB_GLOBAL, defn2, GxB_JIT_C_PREFACE)) ; CHECK (MATCH (defn2, defn)) ; + OK (GrB_Global_get_String_ (GrB_GLOBAL, defn, GxB_JIT_CUDA_PREFACE)) ; + printf ("JIT CUDA preface: [%s]\n", defn) ; + OK (GrB_Global_set_String_ (GrB_GLOBAL, "// cu", GxB_JIT_CUDA_PREFACE)) ; + OK (GrB_Global_get_String_ (GrB_GLOBAL, defn2, GxB_JIT_CUDA_PREFACE)) ; + CHECK (MATCH (defn2, "// cu")) ; + OK (GrB_Global_set_String_ (GrB_GLOBAL, defn, GxB_JIT_CUDA_PREFACE)) ; + OK (GrB_Global_get_String_ 
(GrB_GLOBAL, defn2, GxB_JIT_CUDA_PREFACE)) ; + CHECK (MATCH (defn2, defn)) ; + OK (GrB_Global_get_String_ (GrB_GLOBAL, defn, GxB_JIT_ERROR_LOG)) ; printf ("JIT error log: [%s]\n", defn) ; OK (GrB_Global_set_String_ (GrB_GLOBAL, "errlog.txt", GxB_JIT_ERROR_LOG)) ; diff --git a/GraphBLAS/Test/GB_mex_test9.c b/GraphBLAS/Test/GB_mex_test9.c index e4f164c6b8..1f5de9ce09 100644 --- a/GraphBLAS/Test/GB_mex_test9.c +++ b/GraphBLAS/Test/GB_mex_test9.c @@ -174,7 +174,7 @@ void mexFunction FILE *fp = fopen ("/tmp/GB_tcov_gunk.h", "w") ; GB_macrofy_binop (fp, "nothing", false, false, false, - 199, false, NULL, NULL, NULL) ; + 199, false, NULL, NULL, NULL, NULL) ; fclose (fp) ; //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Test/log_GB_mex_test21.txt b/GraphBLAS/Test/log_GB_mex_test21.txt deleted file mode 100644 index 369fe5ae07..0000000000 --- a/GraphBLAS/Test/log_GB_mex_test21.txt +++ /dev/null @@ -1,2574 +0,0 @@ - - -================================================================================ -GB_macrofy_cast_output, ztype NULL -#define GB_PUTC(z,Cx,p) - - -================================================================================ -GB_macrofy_cast_output, cast FC64 to bool -#define GB_PUTC(z,Cx,p) Cx [p] = (GB_creal (z) != 0 || GB_cimag (z) != 0) - - -================================================================================ -GB_assign_describe -C = A - - -================================================================================ -GB_enumify_ewise / GB_macrofy_ewise, C iso -// op: symbolic only (C is iso) - -// binary operator types: -#define GB_Z_TYPE void -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// binary operator: -#define GB_BINOP(z,x,y,i,j) -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) -#define GB_COPY_B_to_C(Cx,pC,Bx,pB,B_iso) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE void -#define GB_PUTC(c,Cx,p) -#define GB_EWISEOP(Cx,p,aij,bij,i,j) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 0 -#define GB_B_TYPE bool -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_ewise_shared_definitions.h" - - -================================================================================ -GB_enumify_ewise / GB_macrofy_ewise, C non iso -// op: (and, bool) - -// binary operator 
types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool - -// binary operator: -#define GB_BINOP(z,x,y,i,j) z = ((x) && (y)) -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_B_to_C(Cx,pC,Bx,pB,B_iso) Cx [pC] = Bx [pB] - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(c,Cx,p) Cx [p] = c -#define GB_EWISEOP(Cx,p,aij,bij,i,j) GB_BINOP (Cx [p], aij, bij, i, j) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE bool -#define GB_DECLAREA(a) bool a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 0 -#define GB_B_TYPE bool -#define GB_B2TYPE bool -#define GB_DECLAREB(b) bool b -#define GB_GETB(b,Bx,p,iso) b = Bx [p] - -#include "GB_ewise_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, C iso -// semiring: symbolic only (C is iso) - -// monoid: -#define GB_Z_TYPE void -#define GB_UPDATE(z,y) -#define GB_ADD(z,x,y) -#define GB_DECLARE_IDENTITY(z) -#define GB_DECLARE_IDENTITY_CONST(z) -#define GB_IS_ANY_MONOID 1 -#define GB_Z_IGNORE_OVERFLOW 1 -#define GB_Z_NBITS 0 -#define GB_Z_ATOMIC_BITS 0 - -// multiplicative operator (flipped): -#define GB_MULT(z,x,y,i,k,j) - -// multiply-add operator: -#define GB_MULTADD(z,x,y,i,k,j) - -// special cases: - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(c,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define 
GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, any_pair, flipxy -// semiring: symbolic only (C is iso) - -// monoid: -#define GB_Z_TYPE void -#define GB_UPDATE(z,y) -#define GB_ADD(z,x,y) -#define GB_DECLARE_IDENTITY(z) -#define GB_DECLARE_IDENTITY_CONST(z) -#define GB_IS_ANY_MONOID 1 -#define GB_Z_NBITS 0 -#define GB_Z_ATOMIC_BITS 0 - -// multiplicative operator (flipped): -#define GB_MULT(z,x,y,i,k,j) - -// multiply-add operator: -#define GB_MULTADD(z,x,y,i,k,j) - -// special cases: -#define GB_IS_ANY_PAIR_SEMIRING 1 -#define GB_IS_PAIR_MULTIPLIER 1 - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE void -#define GB_PUTC(c,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, any_pair fp32 -// semiring: (any, pair (flipped), float) - -// monoid: -#define GB_Z_TYPE float -#define GB_ADD(z,x,y) z = y -#define GB_UPDATE(z,y) z = y -#define GB_DECLARE_IDENTITY(z) float z = 0 -#define GB_DECLARE_IDENTITY_CONST(z) const float z = 0 -#define GB_HAS_IDENTITY_BYTE 1 -#define GB_IDENTITY_BYTE 0x00 -#define GB_IS_ANY_MONOID 1 -#define GB_Z_NBITS 32 -#define GB_Z_ATOMIC_BITS 32 -#define GB_Z_HAS_ATOMIC_UPDATE 1 -#define GB_Z_HAS_OMP_ATOMIC_UPDATE 1 -#define GB_Z_HAS_CUDA_ATOMIC_BUILTIN 1 -#define GB_Z_CUDA_ATOMIC GB_cuda_atomic_write -#define GB_Z_CUDA_ATOMIC_TYPE float - -// multiplicative operator (flipped): -#define GB_MULT(z,y,x,j,k,i) z = 1 - -// multiply-add operator: -#define GB_MULTADD(z,y,x,j,k,i) z = 1 - -// special cases: -#define 
GB_IS_PAIR_MULTIPLIER 1 - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE float -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowindex -// op: (rowindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowindex -// op: (rowindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] 
- -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colindex -// op: (colindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colindex -// op: (colindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diagindex -// op: (diagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) - ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be 
kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diagindex -// op: (diagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) - ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: tril -// op: (tril, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) <= ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_TRIL_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: triu -// op: (triu, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) >= ((i) + 
(y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_TRIU_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diag -// op: (diag, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) == ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_DIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: offdiag -// op: (offdiag, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) != ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colle -// op: (colle, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define 
GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) <= (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLLE_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colgt -// op: (colgt, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) > (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLGT_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowle -// op: (rowle, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) <= (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWLE_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowgt 
-// op: (rowgt, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) > (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWGT_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: flipdiagindex -// op: (flipdiagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) - ((j) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: flipdiagindex -// op: (flipdiagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) - ((j) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define 
GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: opi32 -// op: opi32func, ztype: GxB_FC32_t, xtype: GxB_FC32_t, ytype: GxB_FC32_t - -// unary operator types: -#define GB_Z_TYPE GxB_FC32_t -#define GB_X_TYPE GxB_FC32_t -#define GB_Y_TYPE GxB_FC32_t - -// index unary operator: -#ifndef GB_GUARD_opi32func_DEFINED -#define GB_GUARD_opi32func_DEFINED -GB_STATIC_INLINE -void opi32func (GxB_FC32_t *z, const GxB_FC32_t *x, GrB_Index i, GrB_Index j, - const GxB_FC32_t *y) -{ - (*z) = (*x) ; -} -#define GB_opi32func_USER_DEFN \ -"void opi32func (GxB_FC32_t *z, const GxB_FC32_t *x, GrB_Index i, GrB_Index j, \n" \ -" const GxB_FC32_t *y) \n" \ -"{ \n" \ -" (*z) = (*x) ; \n" \ -"}" -#endif -#define GB_IDXUNOP(z,x,i,j,y) opi32func (&(z), &(x), i, j, &(y)) -#define GB_DEPENDS_ON_X 1 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_DECLAREA (x) ; \ - GB_GETA (x, Ax, p, ) ; \ - GB_IDXUNOP (z, x, i, j, y) ; \ - bool keep = (GB_crealf (z) != 0 || GB_cimagf (z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE GxB_FC32_t - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE GxB_FC32_t -#define GB_A2TYPE GxB_FC32_t -#define GB_DECLAREA(a) GxB_FC32_t a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: one -// op: (one, void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ 
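As a companion sketch for the position operators that follow (again assumed and simplified, not the actual generated code): a JIT apply kernel walks the entries of A, recovers the row index i from the sparse structure, and lets GB_UNOP expand to the operator. The macro definitions below are copied from the int32_t positioni dump that follows; the sparse-column data and the loop are invented.

/* toy driver: Cx = positioni (A) for one sparse column of length 4 */
#include <stdint.h>
#include <stdio.h>

#define GB_UNARYOP(z,x,i,j,y) z = (i)
#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], i, , )
#define GBP_A(Ap,k,vlen) Ap [k]
#define GBI_A(Ai,p,vlen) Ai [p]

int main (void)
{
    int64_t Ap [2] = { 0, 3 } ;        /* column pointers: one column, 3 entries */
    int64_t Ai [3] = { 0, 2, 3 } ;     /* row indices of the entries */
    int32_t Ax [3] = { 7, 8, 9 } ;     /* values (ignored by positioni) */
    int32_t Cx [3] ;
    for (int64_t k = 0 ; k < 1 ; k++)
    {
        for (int64_t p = GBP_A (Ap, k, 4) ; p < GBP_A (Ap, k+1, 4) ; p++)
        {
            int64_t i = GBI_A (Ai, p, 4) ;       /* row index of A(i,k) */
            GB_UNOP (Cx, p, Ax, p, 0, i, k, ) ;  /* expands to Cx [p] = (i) */
        }
    }
    for (int p = 0 ; p < 3 ; p++) printf ("%d ", Cx [p]) ;  /* prints 0 2 3 */
    printf ("\n") ;
    return (0) ;
}

Note the GB_DEPENDS_ON_I 1 flag in these dumps: it is what tells the kernel that i must actually be computed; for value-only operators the same loop can presumably skip the GBI_A lookup.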
-GB_enumify_apply / GB_macrofy_apply: positioni -// op: (positioni, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], i, , ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni -// op: (positioni, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], i, , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni1 -// op: (positioni1, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], i, , ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define 
GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni1 -// op: (positioni1, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], i, , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj -// op: (positionj, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], , j, ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] 
-#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj -// op: (positionj, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , j, ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj1 -// op: (positionj1, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], , j, ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / 
GB_macrofy_apply: positionj1 -// op: (positionj1, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , j, ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: sqrt -// op: (sqrt, GxB_FC64_t) - -// unary operator types: -#define GB_Z_TYPE GxB_FC64_t -#define GB_X_TYPE GxB_FC64_t -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = GB_csqrt (x) -#define GB_DEPENDS_ON_X 1 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREA (aij) ; \ - GB_GETA (aij, Ax, pA, ) ; \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, , , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#ifndef GB_GUARD_GJ_cast_to_int32_DEFINED -#define GB_GUARD_GJ_cast_to_int32_DEFINED -GB_STATIC_INLINE -int32_t GJ_cast_to_int32 (double x) -{ - if (isnan (x)) return (0) ; - if (x <= (double) INT32_MIN) return (INT32_MIN) ; - if (x >= (double) INT32_MAX) return (INT32_MAX) ; - return ((int32_t) x) ; -} -#endif -#define GB_PUTC(c,Cx,p) Cx [p] = GJ_cast_to_int32 (GB_creal (c)) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_TYPE int32_t -#define GB_A2TYPE GxB_FC64_t -#define GB_DECLAREA(a) GxB_FC64_t a -#define GB_GETA(a,Ax,p,iso) a = (GxB_FC64_t) (Ax [p]) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_build / GB_macrofy_build: 
times -// op: (times, GxB_FC32_t) - -// binary dup operator types: -#define GB_Z_TYPE GxB_FC32_t -#define GB_X_TYPE GxB_FC32_t -#define GB_Y_TYPE GxB_FC32_t - -// S and T data types: -#define GB_T_TYPE bool -#define GB_S_TYPE bool - -// binary dup operator: -#define GB_DUP(z,x,y) z = GB_FC32_mul (x,y) -#define GB_UPDATE(z,y) GB_DUP(z,z,y) - -// build copy/dup methods: -#define GB_BLD_COPY(Tx,p,Sx,k) Tx [p] = Sx [k] -#define GB_BLD_DUP(Tx,p,Sx,k) \ - GxB_FC32_t y = (GxB_FC32_t) Sx [k] ; \ - GxB_FC32_t x = (GxB_FC32_t) Tx [p] ; \ - GxB_FC32_t z ; \ - GB_DUP (z, x, y) ; \ - Tx [p] = (GB_crealf (z) != 0 || GB_cimagf (z) != 0) ; - -#include "GB_kernel_shared_definitions.h" - - -================================================================================ -GB_enumify_build / GB_macrofy_build: and -// op: (and, bool) - -// binary dup operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool - -// S and T data types: -#define GB_T_TYPE GxB_FC32_t -#define GB_S_TYPE GxB_FC32_t - -// binary dup operator: -#define GB_DUP(z,x,y) z = ((x) && (y)) -#define GB_UPDATE(z,y) z &= y - -// build copy/dup methods: -#define GB_BLD_COPY(Tx,p,Sx,k) Tx [p] = Sx [k] -#define GB_BLD_DUP(Tx,p,Sx,k) \ - bool y = (GB_crealf (Sx [k]) != 0 || GB_cimagf (Sx [k]) != 0) ; \ - bool x = (GB_crealf (Tx [p]) != 0 || GB_cimagf (Tx [p]) != 0) ; \ - bool z ; \ - GB_DUP (z, x, y) ; \ - Tx [p] = (GxB_FC32_t) z ; - -#include "GB_kernel_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)=A (assign) -// assign/subassign: C(I,J) = A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:s:hi,lo:s:hi)=A (assign) -// assign/subassign: C(I,J) = A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_STRIDE -#define GB_J_KIND GB_STRIDE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 
-#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(i,J)=s (row assign) -// assign/subassign: C(i,J) = A -#define GB_ASSIGN_KIND GB_ROW_ASSIGN -#define GB_I_KIND GB_ALL -#define GB_J_KIND GB_LIST -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) Cx [pC] = cwork -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(I,j)=s (col assign) -// assign/subassign: C(I,j) = A -#define GB_ASSIGN_KIND GB_COL_ASSIGN -#define GB_I_KIND GB_LIST -#define GB_J_KIND GB_ALL -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) Cx [pC] = cwork -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 
-#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)=A (assign) -// assign/subassign: C(I,J) = scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(cwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)+=s (assign) -// assign/subassign: C(I,J) += scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_UPDATE(z,y) -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_scalar(Cx,pC,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool -#define GB_COPY_scalar_to_ywork(ywork,scalar) ywork = (float) (scalar) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)+=s (assign) -// assign/subassign: C(I,J) += scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// 
accum operator: -#define GB_UPDATE(z,y) -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_scalar(Cx,pC,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool -#define GB_COPY_scalar_to_ywork(ywork,scalar) ywork = (float) (scalar) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)+=A (assign) -// assign/subassign: C(I,J) += A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = (x) + (y) -#define GB_UPDATE(z,y) z += y -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_DECLAREY (ywork) ; \ - GB_GETA (ywork, Ax, pA, ) ; \ - GB_DECLAREX (xwork) ; \ - GB_COPY_C_to_xwork (xwork, Cx, pC) ; \ - GB_DECLAREZ (zwork) ; \ - GB_ACCUM_OP (zwork, xwork, ywork) ; \ - GB_PUTC (zwork, Cx, pC) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = ((zwork) != 0) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = (float) (Cx [p]) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - 
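The C(I,J)+=A dump above illustrates how the generated macros handle typecasting when C is bool but the accum operator is (plus, float): the A entry and the C entry are both promoted to float, combined, and the result is cast back to bool on output. A hand expansion of one GB_ACCUMULATE_aij step, assembled from the macros shown above for illustration only:

    // effect of GB_ACCUMULATE_aij (Cx, pC, Ax, pA, A_iso, ywork), expanded by hand:
    {
        float ywork = (float) (Ax [pA]) ;   // GB_DECLAREY ; GB_GETA
        float xwork = (float) (Cx [pC]) ;   // GB_DECLAREX ; GB_COPY_C_to_xwork
        float zwork = xwork + ywork ;       // GB_DECLAREZ ; GB_ACCUM_OP (plus)
        Cx [pC] = ((zwork) != 0) ;          // GB_PUTC: cast float back to bool
    }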
-================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)&=A (assign) -// assign/subassign: C(I,J) &= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (and, bool) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) bool xwork -#define GB_DECLAREY(ywork) bool ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = ((x) && (y)) -#define GB_UPDATE(z,y) z &= y -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_UPDATE (Cx [pC], Ax [pA]) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = zwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = Cx [p] - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE bool -#define GB_DECLAREA(a) bool a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)<=A (assign) -// assign/subassign: C(I,J) lt= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (lt, float) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = ((x) < (y)) -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_DECLAREY (ywork) ; \ - GB_GETA (ywork, Ax, pA, ) ; \ - GB_DECLAREX (xwork) ; \ - GB_COPY_C_to_xwork (xwork, Cx, pC) ; \ - GB_ACCUM_OP (Cx [pC], xwork, ywork) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = zwork -#define GB_DECLAREC(cwork) bool cwork -#define 
GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = (float) (Cx [p]) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)<=H (assign) -// assign/subassign: C(I,J) lt= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (lt, float) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" diff --git a/GraphBLAS/Test/test169.m b/GraphBLAS/Test/test169.m new file mode 100644 index 0000000000..33c5509b92 --- /dev/null +++ b/GraphBLAS/Test/test169.m @@ -0,0 +1,49 @@ +function test169 +%TEST169 C=A+B with different sparsity formats + +% SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. 
+% SPDX-License-Identifier: Apache-2.0 + +rng ('default') ; + +fprintf ('test169:\n') ; + +n = 50 ; + +desc = struct ('mask', 'complement') ; + +for trial = 1:5 + + C = GB_spec_random (n, n, 0.5, 1, 'double') ; + M = GB_spec_random (n, n, 0.2, 1, 'double') ; + A = GB_spec_random (n, n, 0.5, 1, 'double') ; + B = GB_spec_random (n, n, 0.5, 1, 'double') ; + + for C_sparsity = [1 2 4 8] + C.sparsity = C_sparsity ; + + for M_sparsity = [1 2 4 8] + M.sparsity = M_sparsity ; + + for A_sparsity = [1 2 4 8] + A.sparsity = A_sparsity ; + + for B_sparsity = [1 2 4 8] + B.sparsity = B_sparsity ; + + C1 = GB_spec_Matrix_eWiseAdd (C, M, [], 'plus', A, B, desc); + C2 = GB_mex_Matrix_eWiseAdd (C, M, [], 'plus', A, B, desc); + GB_spec_compare (C1, C2) ; + + C1 = GB_spec_Matrix_eWiseAdd (C, M, [], 'plus', A, B, [ ]) ; + C2 = GB_mex_Matrix_eWiseAdd (C, M, [], 'plus', A, B, [ ]) ; + GB_spec_compare (C1, C2) ; + end + end + end + fprintf ('.') ; + end +end + +fprintf ('\ntest169: all tests passed\n') ; + diff --git a/GraphBLAS/Test/testall.m b/GraphBLAS/Test/testall.m index 17204c2a56..50cec10f01 100644 --- a/GraphBLAS/Test/testall.m +++ b/GraphBLAS/Test/testall.m @@ -99,6 +99,7 @@ function testall (threads,longtests) % tests with high rates (over 100/sec) %---------------------------------------- +logstat ('test169' ,t, j0 , f1 ) ; % C=A+B with many formats logstat ('test250' ,t, j44 , f10 ) ; % JIT tests, set/get, other tests logstat ('test279' ,t, j0 , f1 ) ; % blob get/set logstat ('test278' ,t, j0 , f1 ) ; % descriptor get/set diff --git a/GraphBLAS/Test/unused/test169.m b/GraphBLAS/Test/unused/test169_orig.m similarity index 100% rename from GraphBLAS/Test/unused/test169.m rename to GraphBLAS/Test/unused/test169_orig.m diff --git a/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake b/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake index db377d925e..0ae9693a6b 100644 --- a/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake +++ b/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake @@ -114,7 +114,7 @@ if ( GRAPHBLAS_USE_JIT OR GRAPHBLAS_USE_CUDA ) message ( STATUS "------------------------------------------------------------------------" ) # one or both JITs are enabled; make sure the cache path exists message ( STATUS "JIT C compiler: ${GB_C_COMPILER}" ) - message ( STATUS "JIT C flags: ${GB_C_FLAGS}" ) + message ( STATUS "JIT C flags: ${GB_C_FLAGS} ${GB_OPENMP_C_FLAGS}" ) message ( STATUS "JIT link flags: ${GB_C_LINK_FLAGS}" ) message ( STATUS "JIT lib prefix: ${GB_LIB_PREFIX}" ) message ( STATUS "JIT lib suffix: ${GB_LIB_SUFFIX}" ) @@ -133,6 +133,4 @@ file ( MAKE_DIRECTORY "${GRAPHBLAS_CACHE_PATH}/lib" ) file ( MAKE_DIRECTORY "${GRAPHBLAS_CACHE_PATH}/tmp" ) file ( MAKE_DIRECTORY "${GRAPHBLAS_CACHE_PATH}/lock" ) file ( MAKE_DIRECTORY "${GRAPHBLAS_CACHE_PATH}/c" ) -file ( MAKE_DIRECTORY "${GRAPHBLAS_CACHE_PATH}/cu" ) - diff --git a/GraphBLAS/cmake_modules/GraphBLAS_PreJIT.cmake b/GraphBLAS/cmake_modules/GraphBLAS_PreJIT.cmake index ffc8aa851b..e9078c8b0f 100644 --- a/GraphBLAS/cmake_modules/GraphBLAS_PreJIT.cmake +++ b/GraphBLAS/cmake_modules/GraphBLAS_PreJIT.cmake @@ -7,7 +7,10 @@ #------------------------------------------------------------------------------- -# create a list of files +#------------------------------------------------------------------------------- +# create a list of files of CPU PreJIT kernels +#------------------------------------------------------------------------------- + file ( GLOB PRE1 "PreJIT/GB_jit_*.c" ) set ( PREJIT "" ) set ( PREPRO "" ) @@ -112,3 +115,18 @@ 
configure_file ( "Config/GB_prejit.c.in" "${PROJECT_SOURCE_DIR}/Config/GB_prejit.c" NEWLINE_STYLE LF ) +#------------------------------------------------------------------------------- +# create a list of files of CUDA PreJIT kernels +#------------------------------------------------------------------------------- + +# FIXME: add CUDA PreJIT kernels. For example: + +# ... +# elseif ( ${F} MATCHES "^GB_jit__cuda_reduce" ) +# list ( APPEND PREPRO "JIT_CUDA_RED (" ${F} ")\n" ) +# endif ( ) + +# configure_file ( "CUDA/Config/GB_prejit.c.in" +# "${PROJECT_SOURCE_DIR}/CUDA/Config/GB_prejit.c" +# NEWLINE_STYLE LF ) + diff --git a/GraphBLAS/cmake_modules/GraphBLAS_version.cmake b/GraphBLAS/cmake_modules/GraphBLAS_version.cmake index f5df5d16ae..a3eba6df7d 100644 --- a/GraphBLAS/cmake_modules/GraphBLAS_version.cmake +++ b/GraphBLAS/cmake_modules/GraphBLAS_version.cmake @@ -8,7 +8,7 @@ #------------------------------------------------------------------------------- # version of SuiteSparse:GraphBLAS -set ( GraphBLAS_DATE "Feb XX, 2024" ) # FIXME for SuiteSparse 7.7.0 +set ( GraphBLAS_DATE "Mar 22, 2024" ) set ( GraphBLAS_VERSION_MAJOR 9 CACHE STRING "" FORCE ) set ( GraphBLAS_VERSION_MINOR 1 CACHE STRING "" FORCE ) set ( GraphBLAS_VERSION_SUB 0 CACHE STRING "" FORCE ) diff --git a/GraphBLAS/rmm_wrap/README.md b/GraphBLAS/rmm_wrap/README.md index 1b66003598..f0120f4b3e 100644 --- a/GraphBLAS/rmm_wrap/README.md +++ b/GraphBLAS/rmm_wrap/README.md @@ -5,7 +5,9 @@ SPDX-License-Identifier: Apache-2.0 rmm_wrap defines a single global object, the RMM_Wrap_Handle that holds an RMM (Rapids Memory Manager) memory resource and a hash map (C++ std::unordered_map). This allows rmm_wrap to provide 7 functions to a C -application: +application. + +Note that the rmm_wrap functions are NOT thread safe. Create/destroy an RMM resource: diff --git a/GraphBLAS/rmm_wrap/rmm_wrap.cpp b/GraphBLAS/rmm_wrap/rmm_wrap.cpp index 7246baca51..22362b7c98 100644 --- a/GraphBLAS/rmm_wrap/rmm_wrap.cpp +++ b/GraphBLAS/rmm_wrap/rmm_wrap.cpp @@ -36,6 +36,8 @@ // RMM_Wrap_Handle: a global object containing the RMM context //------------------------------------------------------------------------------ +// NOTE: this is not thread-safe + // rmm_wrap_context is a pointer to an array of global RMM_Wrap_Handle objects // (one per GPU) that all methods in this file can access. The array of // objects cannot be accessed outside this file.
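Because the rmm_wrap functions are not thread-safe, a host application must serialize initialization, allocation, and teardown. A minimal single-threaded usage sketch, based only on the signatures that appear in this diff (the pool sizes and the allocation size are illustrative assumptions, not recommendations):

    // hypothetical caller of the rmm_wrap API; all sizes are illustrative
    size_t init_pool_size   = 256 * 1024 * 1024 ;   // initial pool: 256 MiB
    size_t max_pool_size    = 1024 * 1024 * 1024 ;  // maximum pool: 1 GiB
    size_t stream_pool_size = 4 ;                   // must be > 0
    if (rmm_wrap_initialize_all_same (rmm_wrap_managed,
        init_pool_size, max_pool_size, stream_pool_size) == 0)
    {
        std::size_t size = 1000 ;                   // rounded up internally
        void *p = rmm_wrap_allocate (&size) ;       // size reports the true size
        if (p != NULL) rmm_wrap_deallocate (p, size) ;
        rmm_wrap_finalize ( ) ;
    }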
@@ -81,7 +83,6 @@ inline auto make_cuda() inline auto make_managed() { - std::cout << "Inside make_managed" << std::endl; return std::make_shared<rmm::mr::managed_memory_resource>() ; } @@ -131,16 +132,14 @@ inline auto make_and_set_managed_pool std::size_t maximum_size ) { - std::cout<< " make_managed_pool called with init_size" - << initial_size << " max_size " << maximum_size << std::endl; auto resource = rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource> ( make_managed(), initial_size, maximum_size ) ; - std::cout << "Created resource" << std::endl; + // std::cout << "Created resource" << std::endl; rmm::mr::set_current_device_resource( resource.get()) ; - std::cout << "Set resource" << std::endl; + // std::cout << "Set resource" << std::endl; return resource; } @@ -185,7 +184,9 @@ void rmm_wrap_finalize (void) //------------------------------------------------------------------------------ // get_current_device: helper to get id for currently selected device //------------------------------------------------------------------------------ -int get_current_device(void) { + +int get_current_device(void) +{ int device_id; cudaGetDevice(&device_id); return device_id; } @@ -194,26 +195,29 @@ int get_current_device(void) { //------------------------------------------------------------------------------ // rmm_wrap_initialize: initialize rmm_wrap_context[device_id] //------------------------------------------------------------------------------ -int rmm_wrap_initialize // returns -1 on error, 0 on success + +int rmm_wrap_initialize // returns -1 on error, 0 on success ( - uint32_t device_id, // 2, 5, or 7 - RMM_MODE mode, // TODO: describe. Should we default this? - size_t init_pool_size, // TODO: describe. Should we default this? - size_t max_pool_size, // TODO: describe. Should we default this? - size_t stream_pool_size // TODO: describe. Should we default this? + uint32_t device_id, // 2, 5, or 7 + RMM_MODE mode, // TODO: describe. Should we default this? + size_t init_pool_size, // TODO: describe. Should we default this? + size_t max_pool_size, // TODO: describe. Should we default this? + size_t stream_pool_size // TODO: describe. Should we default this?
) { //-------------------------------------------------------------------------- // check inputs //-------------------------------------------------------------------------- - if(rmm_wrap_context[device_id] != NULL) { - return (-1); + + if (rmm_wrap_context[device_id] != NULL) + { + return (-1) ; } if(stream_pool_size <= 0) { - std::cout << "Stream pool size must be >=0" << std::endl; + // std::cout << "Stream pool size must be >=0" << std::endl; // failed to create the alloc_map return (-1) ; } @@ -230,7 +234,7 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success //-------------------------------------------------------------------------- // Set CUDA stream pool - std::cout << "Creating rmm_wrap stream pool" << std::endl; + // std::cout << "Creating rmm_wrap stream pool" << std::endl; rmm_wrap_context[device_id]->stream_pool = make_and_set_cuda_stream_pool(stream_pool_size); RMM_WRAP_CHECK_CUDA(cudaStreamCreate(&(rmm_wrap_context[device_id]->main_stream))); @@ -255,8 +259,9 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success } else if ( mode == rmm_wrap_managed ) { - std::cout << "Seting managed pool" << std::endl; - rmm_wrap_context[device_id]->resource = make_and_set_managed_pool( init_pool_size, max_pool_size); + // std::cout << "Seting managed pool" << std::endl; + rmm_wrap_context[device_id]->resource = + make_and_set_managed_pool( init_pool_size, max_pool_size); } else { @@ -264,7 +269,7 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success return (-1) ; } - std::cout << "Setting mode for rmm_wrap context" << std::endl; + // std::cout << "Setting mode for rmm_wrap context" << std::endl; // Mark down the mode for reference later rmm_wrap_context[device_id]->mode = mode; @@ -272,11 +277,11 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success // create size map to lookup size of each allocation //-------------------------------------------------------------------------- - std::cout << "Setting size_map for rmm_wrap context" << std::endl; + // std::cout << "Setting size_map for rmm_wrap context" << std::endl; rmm_wrap_context[device_id]->size_map = std::make_shared <alloc_map> () ; if (rmm_wrap_context[device_id]->size_map.get() == NULL) { - std::cout << "Failed to create size_map" << std::endl; + // std::cout << "Failed to create size_map" << std::endl; // failed to create the alloc_map return (-1) ; } @@ -291,12 +296,14 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success int rmm_wrap_initialize_all_same ( RMM_MODE mode, // TODO: describe. Should we default this? - size_t init_pool_size, // TODO: describe. Should we default this? + size_t init_pool_size, // TODO: describe. Should we default this? size_t max_pool_size, // TODO: describe. Should we default this? size_t stream_pool_size // TODO: describe. Should we default this?
-) { +) +{ - if(rmm_wrap_context != NULL) { + if (rmm_wrap_context != NULL) + { return (-1); } @@ -316,7 +323,7 @@ int rmm_wrap_initialize_all_same intermediate.erase(std::remove_if(intermediate.begin(), intermediate.end(), ::isspace), intermediate.end()); uint32_t device_id = static_cast<uint32_t>(stoi(intermediate)); - std::cout << "Found device_id " << device_id << std::endl; + // std::cout << "Found device_id " << device_id << std::endl; devices.push_back(device_id); } /** @@ -325,7 +332,7 @@ */ } else { devices.push_back(0); - std::cout << "Using default device_id 0" << std::endl; + // std::cout << "Using default device_id 0" << std::endl; } // Allocate rmm_wrap_contexts @@ -333,7 +340,7 @@ int rmm_wrap_initialize_all_same for(int i = 0; i < devices.size(); ++i) { rmm_wrap_context[i] = NULL; uint32_t device_id = devices[i]; - std::cout << "Creating rmm_wrap_context for device_id " << device_id << std::endl; + // std::cout << "Creating rmm_wrap_context for device_id " << device_id << std::endl; int ret = rmm_wrap_initialize(device_id, mode, init_pool_size, max_pool_size, stream_pool_size); if(ret < 0) { return ret; } @@ -347,7 +354,8 @@ // rmm_wrap_get_next_stream_from_pool: return the next available stream from the pool // Output is cudaStream_t //------------------------------------------------------------------------------ -void* rmm_wrap_get_next_stream_from_pool(void) { +void* rmm_wrap_get_next_stream_from_pool(void) +{ return rmm_wrap_context[get_current_device()]->stream_pool->get_stream(); } @@ -355,7 +363,8 @@ void* rmm_wrap_get_next_stream_from_pool(void) { //------------------------------------------------------------------------------ // rmm_wrap_get_stream_from_pool: return specific stream from the pool // Output is cudaStream_t //------------------------------------------------------------------------------ -void* rmm_wrap_get_stream_from_pool(std::size_t stream_id) { +void* rmm_wrap_get_stream_from_pool(std::size_t stream_id) +{ return rmm_wrap_context[get_current_device()]->stream_pool->get_stream(stream_id); } @@ -363,7 +372,8 @@ void* rmm_wrap_get_stream_from_pool(std::size_t stream_id) { //------------------------------------------------------------------------------ // rmm_wrap_get_main_stream: return the main cuda stream // Output is cudaStream_t //------------------------------------------------------------------------------ -void* rmm_wrap_get_main_stream(void) { +void* rmm_wrap_get_main_stream(void) +{ return rmm_wrap_context[get_current_device()]->main_stream; } //------------------------------------------------------------------------------ @@ -477,7 +487,10 @@ void rmm_wrap_free (void *p) void *rmm_wrap_allocate( std::size_t *size) { - if (rmm_wrap_context == NULL) return (NULL) ; + if (rmm_wrap_context == NULL) + { + return (NULL) ; + } uint32_t device_id = get_current_device(); @@ -499,9 +512,6 @@ void *rmm_wrap_allocate( std::size_t *size) *size += (256 - aligned) ; } -// printf(" rmm_wrap_alloc %ld bytes\n",*size) ; - - rmm::mr::device_memory_resource *memoryresource = rmm::mr::get_current_device_resource() ; void *p = memoryresource->allocate( *size ) ; @@ -525,7 +535,10 @@ void rmm_wrap_deallocate( void *p, std::size_t size) { - if (rmm_wrap_context == NULL) return ; + if (rmm_wrap_context == NULL) + { + return ; + } // Note: there are 3 PANIC cases below. The API of rmm_wrap_deallocate // does not allow an error condition to be returned. These PANICs could be @@ -543,6 +556,7 @@ void rmm_wrap_deallocate( void *p, std::size_t size) } uint32_t device_id = get_current_device(); + // check the size given.
If the input size is zero, then the // size is unknown (say rmm_wrap_free(p)). In that case, just trust the // hashmap. Otherwise, double-check to make sure the size is correct. @@ -560,7 +574,7 @@ void rmm_wrap_deallocate( void *p, std::size_t size) //actual_size = am->at( (std::size_t)(p) ) ; auto iter = am->find( (std::size_t)(p) ) ; if (iter != am->end() ) actual_size = iter->second; - else std::cout<< " rmm_wrap:: tried to free unallocated pointer ! " << p ; + // else std::cout<< " rmm_wrap:: tried to free unallocated pointer ! " << p ; } if (actual_size == 0)
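To summarize the bookkeeping shown in the hunks above: rmm_wrap_allocate rounds each request up to a multiple of 256 bytes and records the pointer and its true size in the per-device hashmap, while rmm_wrap_deallocate looks the pointer up, trusting the hashmap when the caller passes size 0 and treating a size mismatch as a caller bug. A simplified, self-contained sketch of that pattern, with malloc standing in for the RMM memory resource and all names hypothetical:

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>
    #include <unordered_map>

    // stand-in for rmm_wrap's pointer -> size hashmap (its alloc_map)
    static std::unordered_map<std::size_t, std::size_t> size_map ;

    void *tracked_allocate (std::size_t *size)
    {
        // round the request up to a multiple of 256 bytes, as rmm_wrap_allocate does
        std::size_t aligned = (*size) % 256 ;
        if (aligned > 0) *size += (256 - aligned) ;
        void *p = std::malloc (*size) ;                     // RMM resource stand-in
        if (p != NULL) size_map [(std::size_t) p] = *size ; // record the true size
        return (p) ;
    }

    void tracked_deallocate (void *p, std::size_t size)
    {
        auto iter = size_map.find ((std::size_t) p) ;
        if (iter == size_map.end ())
        {
            std::printf ("freeing an unallocated pointer\n") ;  // rmm_wrap warns here
            return ;
        }
        std::size_t actual_size = iter->second ;
        if (size > 0 && size != actual_size)
        {
            std::printf ("size mismatch\n") ;   // one of rmm_wrap's PANIC cases
            return ;
        }
        size_map.erase (iter) ;
        std::free (p) ;
    }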