diff --git a/GraphBLAS/CMakeLists.txt b/GraphBLAS/CMakeLists.txt
index 9157a4b906..8229506f9d 100644
--- a/GraphBLAS/CMakeLists.txt
+++ b/GraphBLAS/CMakeLists.txt
@@ -203,11 +203,6 @@
 configure_file ( "Config/README.md.in"
     "${PROJECT_SOURCE_DIR}/README.md"
     NEWLINE_STYLE LF )
-# for CUDA
-configure_file ( "CUDA/Config/GB_cuda_common_jitFactory.hpp.in"
-    "${PROJECT_SOURCE_DIR}/CUDA/GB_cuda_common_jitFactory.hpp"
-    NEWLINE_STYLE LF )
-
 #-------------------------------------------------------------------------------
 # include directories for both graphblas and the demos
 #-------------------------------------------------------------------------------
@@ -465,6 +460,7 @@ if ( GRAPHBLAS_HAS_OPENMP )
         target_link_libraries ( GraphBLAS_static PRIVATE OpenMP::OpenMP_C )
     endif ( )
     message ( STATUS "CMAKE OpenMP C flags: ${OpenMP_C_FLAGS}" )
+    set ( GB_OPENMP_C_FLAGS "${OpenMP_C_FLAGS}" )
 else ( )
     message ( WARNING
     "WARNING: OpenMP was not found (or was disabled with "
@@ -485,6 +481,7 @@ else ( )
         "The C compiler does not support thread-local-storage; "
         "GxB_Context_engage will return GrB_NOT_IMPLEMENTED." )
     endif ( )
+    set ( GB_OPENMP_C_FLAGS "" )
 endif ( )
 
 if ( SUITESPARSE_HAS_CUDA AND GRAPHBLAS_USE_CUDA )
diff --git a/GraphBLAS/CUDA/.gitignore b/GraphBLAS/CUDA/.gitignore
index 2650c12fe3..8d9e4b49ad 100644
--- a/GraphBLAS/CUDA/.gitignore
+++ b/GraphBLAS/CUDA/.gitignore
@@ -2,8 +2,6 @@
 *.o
 *.a
 *.so
-jitFactory
-stringify
 rmm_log.txt
 
 # Do not ignore this file
diff --git a/GraphBLAS/CUDA/CMakeLists.txt b/GraphBLAS/CUDA/CMakeLists.txt
index 2b477a36bf..c0c74d825d 100644
--- a/GraphBLAS/CUDA/CMakeLists.txt
+++ b/GraphBLAS/CUDA/CMakeLists.txt
@@ -2,7 +2,7 @@
 # GraphBLAS/CUDA/CMakeLists.txt: cmake script for GraphBLAS/CUDA
 #-------------------------------------------------------------------------------
 
-# SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+# SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
 # Some files in this folder are (c) NVIDIA or (c) Google. Please refer
 # to their individual licenses (Apache, BSD, or others).
 
@@ -12,11 +12,6 @@
 
 cmake_minimum_required ( VERSION 3.20 )     # GraphBLAS can be built stand-alone
 
-# CMake build for generating googletest c++ files that can be compiled and
-# executed in parallel.  Build can be customized to speed up development by
-# allowing the targeting of specific specific parameters.  The output of this
-# build is an executable that can be used to run the gtests.
-
 project ( GRAPHBLAS_CUDA
     VERSION "${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB}"
     LANGUAGES CXX CUDA )
@@ -29,7 +24,6 @@
 set ( CMAKE_CUDA_FLAGS "-cudart=static -lineinfo " )
 set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++17 -fPIC " )
 
 add_compile_definitions ( GBNCPUFEAT )
-add_compile_definitions ( GBCUDA_CPLUSPLUS )
 
 message ( STATUS "C++ flags for CUDA: ${CMAKE_CXX_FLAGS}" )
 
@@ -160,126 +154,13 @@ if ( NOT MSVC )
 endif ( )
 
 #-------------------------------------------------------------------------------
-# test suite for the CUDA kernels
+# report
 #-------------------------------------------------------------------------------
 
-if ( 0 )
-
-# 1. Execute enumify/stringify/jitify logic to compile ptx kernels and
-#    compile/link w/ relevant *.cu files.
-
-# TODO: Need to do this piece in cmake
-
-# 2. Generate test .cu files named "{semiring_operation}_test_instances.hpp"
-set ( CUDA_TEST_SUITES
-    AxB_dot3
-#    reduce_to_scalar
-)
-
-#
-set ( CUDA_TEST_MONOIDS PLUS MIN MAX) # TIMES ANY )
-set ( CUDA_TEST_BINOPS TIMES PLUS MIN MAX DIV ) #MINUS RDIV RMINUS FIRST SECOND PAIR )
-set ( CUDA_TEST_SEMIRINGS PLUS_TIMES MIN_PLUS MAX_PLUS )
-set ( CUDA_TEST_DATATYPES int32_t int64_t uint32_t uint64_t float double )
-set ( CUDA_TEST_KERNELS vsvs) # mp vsvs dndn spdn vssp )
-set ( CUDA_TEST_FORMATS sparse dense sparse_dense reduce )
-
-# TODO: Update testGen.py to accept the above CUDA_TEST_* params as arguments
-
-# Note: I don't believe there's a way to do this particular piece in parallel but
-# once all the files are written, we should be able to compile them in parallel
-
-# Separate individual kernels from larger "overview" test (e.g. 2-level testing structure)
-# We want to test all the *_cuda versions
-
-# TODO: make this a shorter test
-set(CUDA_TEST_CPP_FILES "")
-if ( FALSE ) # TODO: use a cmake option
-    foreach(var ${CUDA_TEST_SUITES})
-        foreach(semiring ${CUDA_TEST_SEMIRINGS})
-            foreach(kernel ${CUDA_TEST_KERNELS})
-                foreach(format ${CUDA_TEST_FORMATS})
-                    # TODO: Have Python script also build separate cudaTest.cpp (named something
-                    # like AxB_dot3_cuda_tests.cpp) for each suite. This way we should be able to
-                    # easily ignore them from the build
-                    add_custom_command(
-                        OUTPUT
-                            ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_${format}_test_instances.hpp
-                            ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_${format}_cuda_tests.cpp
-#                        DEPENDS
-#                            jitFactory.hpp
-                        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/test/testGen_cmake.py "\"${CMAKE_CURRENT_SOURCE_DIR}\"" "\"${var}\"" "\"${CUDA_TEST_MONOIDS}\""
-                            "\"${CUDA_TEST_BINOPS}\"" "\"${semiring}\"" "\"${CUDA_TEST_DATATYPES}\""
-                            "\"${kernel}\""
-                    )
-                    # Construct final list of files to compile (in parallel)
-                    list(APPEND CUDA_TEST_CPP_FILES ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_${format}_cuda_tests.cpp)
-                endforeach()
-            endforeach()
-        endforeach()
-    endforeach()
-endif ( )
-
-include(FetchContent)
-FetchContent_Declare(
-    googletest
-    # Specify the commit you depend on and update it regularly.
-    URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip
-)
-# For Windows: Prevent overriding the parent project's compiler/linker settings
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-FetchContent_GetProperties(googletest)
-if(NOT googletest_POPULATED)
-    FetchContent_Populate(googletest)
-    add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL)
-endif()
-
-#FetchContent_MakeAvailable(googletest EC)
-
-
-#file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes)
-#execute_process(
-#    COMMAND git clone "https://github.com/google/googletest.git" googletest
-#    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes)
-#
-#include_directories(${CMAKE_CURRENT_BINARY_DIR}/external_includes/googletest/googletest/include)
-
-#add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/external_includes/googletest/googletest/)
-
-# 3. Compile/link individual {test_suite_name}_cuda_tests.cpp files into a gtest executable
-set(GRAPHBLAS_CUDA_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/test)
+message ( STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES} ")
+message ( STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER} ")
+message ( STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS} ")
+message ( STATUS "CMAKE_CUDA_FLAGS_RELEASE: ${CMAKE_CUDA_FLAGS_RELEASE} ")
+message ( STATUS "CMAKE_CUDA_FLAGS_DEBUG: ${CMAKE_CUDA_FLAGS_DEBUG} ")
 
-message(STATUS "CUDA tests files: " "${CUDA_TEST_CPP_FILES}")
-
-add_executable(graphblascuda_test ${CUDA_TEST_CPP_FILES} ${CMAKE_CURRENT_SOURCE_DIR}/test/run_tests.cpp)
-
-set_target_properties(graphblascuda_test PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(graphblascuda_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-set_target_properties(graphblascuda_test PROPERTIES CUDA_ARCHITECTURES "52;75;80" )
-
-include(GoogleTest)
-
-if ( ENABLE_SHARED_LIBS )
-    target_link_libraries ( graphblascuda_test PUBLIC GraphBLAS )
-else ( )
-    target_link_libraries ( graphblascuda_test PUBLIC GraphBLAS_static )
-endif ( )
-
-target_link_libraries ( graphblascuda_test
-    PUBLIC
-        GraphBLAS_CUDA
-        RMM_wrap
-        CUDA::cudart_static
-        CUDA::nvrtc
-        ${ADDITIONAL_DEPS}
-    PRIVATE
-        gtest_main )
-
-target_include_directories ( graphblascuda_test
-    PUBLIC
-        rmm_wrap
-        ${ADDITIONAL_INCLUDES}
-        ${CUDAToolkit_INCLUDE_DIRS}
-        ${GRAPHBLAS_CUDA_INCLUDES} )
-
-endif ( )
diff --git a/GraphBLAS/CUDA/Config/GB_cuda_common_jitFactory.hpp.in b/GraphBLAS/CUDA/Config/GB_cuda_common_jitFactory.hpp.in
deleted file mode 100644
index 5d7ad01e6f..0000000000
--- a/GraphBLAS/CUDA/Config/GB_cuda_common_jitFactory.hpp.in
+++ /dev/null
@@ -1,82 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_common_jitFactory.hpp: for all jitFactory classes
-//------------------------------------------------------------------------------
-
-// (c) Nvidia Corp. 2023 All rights reserved
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-// Common defines for all jitFactory classes:
-// iostream callback to deliver the buffer to jitify as if read from a file
-// compiler flags
-// Include this file along with any jitFactory you need.
-
-// NOTE: do not edit the GB_cuda_common_jitFactory.hpp directly.  It is
-// configured by cmake from the following file:
-// GraphBLAS/CUDA/Config/GB_cuda_common_jitFactory.hpp.in
-
-#ifndef GB_CUDA_COMMON_JITFACTORY_HPP
-#define GB_CUDA_COMMON_JITFACTORY_HPP
-
-#pragma once
-
-#include "GraphBLAS_cuda.h"
-
-extern "C"
-{
-    #include "GB.h"
-    #include "GB_stringify.h"
-}
-
-#include
-#include
-#include "GB_cuda_jitify_cache.h"
-#include "GB_cuda_jitify_launcher.h"
-#include "GB_cuda_mxm_factory.hpp"
-#include "GB_cuda_error.h"
-#include "../rmm_wrap/rmm_wrap.h"
-#include "GB_iceil.h"
-
-// amount of shared memory to use in CUDA kernel launches
-constexpr unsigned int SMEM = 0 ;
-
-#if 0
-
-static const std::vector<std::string> GB_jit_cuda_compiler_flags{   // OLD
-    "-std=c++17",
-    //"-G",
-    "-remove-unused-globals",
-    "-w",
-    "-D__CUDACC_RTC__",
-//  "-I" + jit::get_user_home_cache_dir(),  // FIXME: add +/cu/00
-//  "-I" + jit::get_user_home_cache_dir() + "/src",
-    "-I/usr/local/cuda/include",
-    // FIXME: add SUITESPARSE_CUDA_ARCHITECTURES here, via config
-};
-
-#endif
-
-inline std::vector<std::string> GB_cuda_jit_compiler_flags ( )
-{
-    return (
-        std::vector<std::string> (
-        {"-std=c++17",
-        //"-G",
-        "-remove-unused-globals",
-        "-w",
-        "-D__CUDACC_RTC__",
-        "-I" + jit::get_user_home_cache_dir(),  // FIXME: add +/cu/00
-        "-I" + jit::get_user_home_cache_dir() + "/src",
-        "-I/usr/local/cuda/include"
-        // FIXME: add SUITESPARSE_CUDA_ARCHITECTURES here, via config
-        })) ;
-} ;
-
-// FIXME: rename GB_jit_cuda_header_names or something
-static const std::vector<std::string> header_names ={};
-
-// FIXME: rename GB_jit_cuda_file_callback
-inline std::istream* (*file_callback)(std::string, std::iostream&);
-
-#endif
diff --git a/GraphBLAS/CUDA/Config/GraphBLAS_CUDA.pc.in b/GraphBLAS/CUDA/Config/GraphBLAS_CUDA.pc.in
index 2f5a31ea12..befb30bbe6 100644
--- a/GraphBLAS/CUDA/Config/GraphBLAS_CUDA.pc.in
+++ b/GraphBLAS/CUDA/Config/GraphBLAS_CUDA.pc.in
@@ -1,4 +1,4 @@
-# GraphBLAS_CUDA, Copyright (c) 2017-2023, Timothy A. Davis.
+# GraphBLAS_CUDA, Copyright (c) 2017-2024, FIXME
 # All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
 
diff --git a/GraphBLAS/CUDA/Config/GraphBLAS_CUDAConfig.cmake.in b/GraphBLAS/CUDA/Config/GraphBLAS_CUDAConfig.cmake.in
index 41db265312..6db344ffb7 100644
--- a/GraphBLAS/CUDA/Config/GraphBLAS_CUDAConfig.cmake.in
+++ b/GraphBLAS/CUDA/Config/GraphBLAS_CUDAConfig.cmake.in
@@ -4,7 +4,7 @@
 # The following copyright and license applies to just this file only, not to
 # the library itself:
-# GraphBLASConfig.cmake, Copyright (c) 2023, Timothy A. Davis.  All Rights Reserved.
+# GraphBLASConfig.cmake, Copyright (c) 2023-2024, FIXME
 # SPDX-License-Identifier: BSD-3-clause
 
 #-------------------------------------------------------------------------------
diff --git a/GraphBLAS/CUDA/GB_cuda.h b/GraphBLAS/CUDA/GB_cuda.h
deleted file mode 100644
index 3dac3a7c5e..0000000000
--- a/GraphBLAS/CUDA/GB_cuda.h
+++ /dev/null
@@ -1,139 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda.h
-//------------------------------------------------------------------------------
-
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-#ifndef GB_CUDA_H
-#define GB_CUDA_H
-
-extern "C"
-{
-    #include "GB_dev.h"
-    #include "GB_compiler.h"
-    #include "GB_cpu_features.h"
-    #include "GB_warnings.h"
-}
-
-#include "GraphBLAS_cuda.h"
-
-extern "C"
-{
-    #include
-    #include
-    #include "GB.h"
-}
-
-// Finally, include the CUDA definitions
-#include "cuda_runtime.h"
-#include "cuda.h"
-// #include "cub.cuh"
-#include "jitify.hpp"
-#include "GB_cuda_mxm_factory.hpp"
-
-#include
-
-#define CHECK_CUDA_SIMPLE(call)                                           \
-  do {                                                                    \
-    cudaError_t err = call;                                               \
-    if (err != cudaSuccess) {                                             \
-      const char* str = cudaGetErrorName( err);                           \
-      std::cout << "(CUDA runtime) returned " << str;                     \
-      std::cout << " (" << __FILE__ << ":" << __LINE__ << ":" << __func__ \
-                << "())" << std::endl;                                    \
-      return (GrB_PANIC) ;                                                \
-    }                                                                     \
-  } while (0)
-
-#define CU_OK(call) CHECK_CUDA_SIMPLE(call)
-
-//------------------------------------------------------------------------------
-// GB_CUDA_CATCH: catch error from a try { ... } region
-//------------------------------------------------------------------------------
-
-// #define GB_FREE_ALL { some macro to free all temporaries }
-// GrB_Info info ;
-// try { ... do stuff that can throw an exception }
-// GB_CUDA_CATCH (info) ;
-
-#define GB_CUDA_CATCH(info)                                 \
-    catch (std::exception& e)                               \
-    {                                                       \
-        printf ("CUDA error: %s\n", e.what ( )) ;           \
-        info = GrB_PANIC ;                                  \
-        /* out_of_memory : info = GrB_OUT_OF_MEMORY ; */    \
-        /* nulltpr: info = ... ; */                         \
-        /* no gpus here: info = GrB_PANIC ; */              \
-    }                                                       \
-    if (info != GrB_SUCCESS)                                \
-    {                                                       \
-        /* CUDA failed */                                   \
-        GB_FREE_ALL ;                                       \
-        return (info) ;                                     \
-    }
-
-// NBUCKETS buckets: computed by up to NBUCKETS-1 kernel launches (zombies need
-// no work...), using different kernels (with different configurations
-// depending on the bucket).
-
-#include "GB_cuda_buckets.h"
-
-extern "C"
-{
-    #include "GB_stringify.h"
-}
-
-//------------------------------------------------------------------------------
-// prefetch and memadvise
-//------------------------------------------------------------------------------
-
-// for the "which" parameter of GB_cuda_matrix_prefetch:
-// FIXME: rename this to GB_WHATEVER_P for GB_cuda_matrix_advise
-#define GB_PREFETCH_P 1
-#define GB_PREFETCH_H 2
-#define GB_PREFETCH_Y 4
-#define GB_PREFETCH_B 8
-#define GB_PREFETCH_I 16
-#define GB_PREFETCH_X 32
-#define GB_PREFETCH_PIX (GB_PREFETCH_P + GB_PREFETCH_I + GB_PREFETCH_X)
-#define GB_PREFETCH_PYI (GB_PREFETCH_P + GB_PREFETCH_Y + GB_PREFETCH_I)
-#define GB_PREFETCH_PYBI (GB_PREFETCH_PYI + GB_PREFETCH_B)
-#define GB_PREFETCH_PYBIX (GB_PREFETCH_PYBI + GB_PREFETCH_X)
-#define GB_PREFETCH_PHI (GB_PREFETCH_P + GB_PREFETCH_H + GB_PREFETCH_I)
-#define GB_PREFETCH_PHBI (GB_PREFETCH_PHI + GB_PREFETCH_B)
-#define GB_PREFETCH_PHBIX (GB_PREFETCH_PHBI + GB_PREFETCH_X)
-
-GrB_Info GB_cuda_matrix_prefetch
-(
-    GrB_Matrix A,
-    int which,              // which components to prefetch (phybix control)
-    int device,             // GPU device or cudaCpuDeviceId
-    cudaStream_t stream
-) ;
-
-#if 0
-// do we need this function too?
-GrB_Info GB_cuda_matrix_advise
-(
-    GrB_Matrix A,
-
-    p, h, y, b, i, x?   6 bools
-
-    what to do:  advise (prefer location?  access by)?  prefetch?  nothing?
-    avdice: enum (1 to 6)
-
-    int device,             // GPU device or cudaCpuDeviceId
-) ;
-#endif
-
-void GB_cuda_upscale_identity
-(
-    GB_void *identity_upscaled,     // output: at least sizeof (uint16_t)
-    GrB_Monoid monoid               // input: monoid to upscale
-) ;
-
-#endif
-
diff --git a/GraphBLAS/CUDA/GB_cuda.hpp b/GraphBLAS/CUDA/GB_cuda.hpp
new file mode 100644
index 0000000000..baa0bc23ee
--- /dev/null
+++ b/GraphBLAS/CUDA/GB_cuda.hpp
@@ -0,0 +1,97 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/GB_cuda.hpp: include file for host CUDA methods (not for JIT)
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+#ifndef GB_CUDA_HPP
+#define GB_CUDA_HPP
+
+extern "C"
+{
+    #include "GB_dev.h"
+    #include "GB_compiler.h"
+    #include "GB_cpu_features.h"
+    #include "GB_warnings.h"
+}
+
+#include "GraphBLAS_cuda.hpp"
+
+extern "C"
+{
+    #include
+    #include
+    #include "GB.h"
+    #include "GB_stringify.h"
+    #include "GB_iceil.h"
+}
+
+// Finally, include the CUDA definitions
+#include "cuda_runtime.h"
+#include "cuda.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "GB_cuda_error.hpp"
+#include "GB_cuda_timer.hpp"
+
+//------------------------------------------------------------------------------
+// prefetch and memadvise
+//------------------------------------------------------------------------------
+
+// for the "which" parameter of GB_cuda_matrix_prefetch:
+// FIXME: rename this to GB_WHATEVER_P for GB_cuda_matrix_advise
+
+#define GB_PREFETCH_P 1
+#define GB_PREFETCH_H 2
+#define GB_PREFETCH_Y 4
+#define GB_PREFETCH_B 8
+#define GB_PREFETCH_I 16
+#define GB_PREFETCH_X 32
+#define GB_PREFETCH_PIX (GB_PREFETCH_P + GB_PREFETCH_I + GB_PREFETCH_X)
+#define GB_PREFETCH_PYI (GB_PREFETCH_P + GB_PREFETCH_Y + GB_PREFETCH_I)
+#define GB_PREFETCH_PYBI (GB_PREFETCH_PYI + GB_PREFETCH_B)
+#define GB_PREFETCH_PYBIX (GB_PREFETCH_PYBI + GB_PREFETCH_X)
+#define GB_PREFETCH_PHI (GB_PREFETCH_P + GB_PREFETCH_H + GB_PREFETCH_I)
+#define GB_PREFETCH_PHBI (GB_PREFETCH_PHI + GB_PREFETCH_B)
+#define GB_PREFETCH_PHBIX (GB_PREFETCH_PHBI + GB_PREFETCH_X)
+
+GrB_Info GB_cuda_matrix_prefetch
+(
+    GrB_Matrix A,
+    int which,              // which components to prefetch (phybix control)
+    int device,             // GPU device or cudaCpuDeviceId
+    cudaStream_t stream
+) ;
+
+#if 0
+// do we need this function too?
+GrB_Info GB_cuda_matrix_advise
+(
+    GrB_Matrix A,
+
+    p, h, y, b, i, x?   6 bools
+
+    what to do:  advise (prefer location?  access by)?  prefetch?  nothing?
+    advice: enum (1 to 6)
+
+    int device,             // GPU device or cudaCpuDeviceId
+) ;
+#endif
+
+void GB_cuda_upscale_identity
+(
+    GB_void *identity_upscaled,     // output: at least sizeof (uint32_t)
+    GrB_Monoid monoid               // input: monoid to upscale
+) ;
+
+#endif
+
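The GB_PREFETCH_* masks defined in GB_cuda.hpp above are plain bit flags: each component of a matrix (p, h, Y, b, i, x) gets one bit, and the composite masks are sums of those bits. A minimal standalone sketch of how such a mask can drive cudaMemPrefetchAsync on managed arrays follows; the names prefetch_parts, PF_P, PF_I, and PF_X are hypothetical stand-ins, not part of this patch.

    #include <cuda_runtime.h>
    #include <cstdint>

    #define PF_P 1      // hypothetical stand-ins for GB_PREFETCH_P, _I, _X
    #define PF_I 16
    #define PF_X 32

    // prefetch only the selected components of a (p,i,x) sparse structure
    static cudaError_t prefetch_parts (int64_t *p, int64_t *i, double *x,
        size_t plen, size_t ilen, size_t xlen, int which, int device,
        cudaStream_t stream)
    {
        cudaError_t err = cudaSuccess ;
        if (err == cudaSuccess && (which & PF_P))
            err = cudaMemPrefetchAsync (p, plen * sizeof (int64_t), device, stream) ;
        if (err == cudaSuccess && (which & PF_I))
            err = cudaMemPrefetchAsync (i, ilen * sizeof (int64_t), device, stream) ;
        if (err == cudaSuccess && (which & PF_X))
            err = cudaMemPrefetchAsync (x, xlen * sizeof (double), device, stream) ;
        return (err) ;
    }
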
diff --git a/GraphBLAS/CUDA/GB_cuda_AxB.hpp b/GraphBLAS/CUDA/GB_cuda_AxB.hpp
new file mode 100644
index 0000000000..19a319777e
--- /dev/null
+++ b/GraphBLAS/CUDA/GB_cuda_AxB.hpp
@@ -0,0 +1,33 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/GB_cuda_AxB.hpp
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+#ifndef GB_CUDA_AXB_H
+#define GB_CUDA_AXB_H
+
+#include "GB_cuda.hpp"
+#include "GB_hash.h"
+
+GrB_Info GB_cuda_AxB_dot3_jit
+(
+    // input/output:
+    GrB_Matrix C,               // FIXME: allow iso for this kernel
+    // input:
+    const GrB_Matrix M, const bool Mask_struct,
+    const GrB_Matrix A,
+    const GrB_Matrix B,
+    const GrB_Semiring semiring,
+    const bool flipxy,
+    // CUDA stream, device, and # of SMs
+    cudaStream_t stream,
+    int device,
+    int number_of_sms
+) ;
+
+#endif
+
diff --git a/GraphBLAS/CUDA/GB_cuda_AxB_dot3.cpp b/GraphBLAS/CUDA/GB_cuda_AxB_dot3.cpp
new file mode 100644
index 0000000000..df14d833cf
--- /dev/null
+++ b/GraphBLAS/CUDA/GB_cuda_AxB_dot3.cpp
@@ -0,0 +1,257 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/GB_cuda_AxB_dot3: compute C = A'*B on GPU(s)
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+// This function computes C=A'*B on the GPUs.  The mask must be present,
+// sparse or hypersparse, and not complemented.  The mask is always applied.  A
+// and B can have any sparsity format.  C is computed as sparse or hypersparse,
+// with the same format as M.
+
+#define GB_FREE_WORKSPACE                                       \
+{                                                               \
+    /* FIXME: use a stream pool instead */                      \
+    if (stream != nullptr) cudaStreamDestroy (stream) ;         \
+    stream = nullptr ;                                          \
+}
+
+#define GB_FREE_ALL                                             \
+{                                                               \
+    GB_FREE_WORKSPACE ;                                         \
+    GB_phybix_free (C) ;                                        \
+}
+
+#include "GB_cuda_AxB.hpp"
+
+//------------------------------------------------------------------------------
+// GB_cuda_AxB_dot3
+//------------------------------------------------------------------------------
+
+GrB_Info GB_cuda_AxB_dot3           // C = A'*B using dot product method
+(
+    GrB_Matrix C,                   // output matrix
+    const GrB_Matrix M,             // mask matrix
+    const bool Mask_struct,         // if true, use only the structure of M
+    const GrB_Matrix A,             // input matrix
+    const GrB_Matrix B,             // input matrix
+    const GrB_Semiring semiring,    // semiring that defines C=A*B
+    const bool flipxy               // if true, do z=fmult(b,a) vs fmult(a,b)
+)
+{
+
+    cudaStream_t stream = nullptr ;
+
+    //--------------------------------------------------------------------------
+    // create the stream
+    //--------------------------------------------------------------------------
+
+    // FIXME: pass in a stream instead, or checkout a stream
+    CUDA_OK (cudaStreamCreate (&stream)) ;
+    GpuTimer kernel_timer;
+
+    //--------------------------------------------------------------------------
+    // check inputs
+    //--------------------------------------------------------------------------
+
+    // when CUDA is enabled, no static headers are used in all of GraphBLAS
+    GrB_Info info ;
+    ASSERT (C != NULL && !(C->static_header)) ;
+    ASSERT (M != NULL && !(M->static_header)) ;
+    ASSERT (A != NULL && !(A->static_header)) ;
+    ASSERT (B != NULL && !(B->static_header)) ;
+
+    ASSERT_MATRIX_OK (M, "M for dot3 cuda A'*B", GB0) ;
+    ASSERT_MATRIX_OK (A, "A for dot3 cuda A'*B", GB0) ;
+    ASSERT_MATRIX_OK (B, "B for dot3 cuda A'*B", GB0) ;
+
+    ASSERT (!GB_PENDING (M)) ;
+    ASSERT (GB_JUMBLED_OK (M)) ;
+    ASSERT (!GB_ZOMBIES (M)) ;
+
+    ASSERT (!GB_PENDING (A)) ;
+    ASSERT (!GB_JUMBLED (A)) ;
+    ASSERT (!GB_ZOMBIES (A)) ;
+
+    ASSERT (!GB_PENDING (B)) ;
+    ASSERT (!GB_ZOMBIES (B)) ;
+    ASSERT (!GB_JUMBLED (B)) ;
+
+    ASSERT_SEMIRING_OK (semiring, "semiring for dot3 numeric A'*B", GB0) ;
+
+    ASSERT (A->vlen == B->vlen) ;
+    GBURBLE ("(GPU dot3) ") ;
+
+    //--------------------------------------------------------------------------
+    // initializations
+    //--------------------------------------------------------------------------
+
+    int device = -1;
+
+    // FIXME: control the GPU to use via the descriptor
+    CUDA_OK (cudaSetDevice ( 0 )) ;
+    CUDA_OK (cudaGetDevice (&device)) ;
+    int number_of_sms = GB_Global_gpu_sm_get (0) ;
+
+    //--------------------------------------------------------------------------
+    // get M, A, and B
+    //--------------------------------------------------------------------------
+
+    const int64_t mvlen = M->vlen ;
+    const int64_t mvdim = M->vdim ;
+    const int64_t mnz = GB_nnz (M) ;
+    const int64_t mnvec = M->nvec ;
+    const bool M_is_hyper = GB_IS_HYPERSPARSE( M ) ;
+
+    const int64_t anz = GB_nnz (A) ;
+    const int64_t anvec = A->nvec ;
+    bool A_is_sparse = GB_IS_SPARSE (A) ;
+    bool A_is_hyper = GB_IS_HYPERSPARSE (A) ;
+    bool A_is_bitmap = GB_IS_BITMAP (A) ;
+    bool A_is_full = GB_IS_FULL (A) ;
+    bool A_is_sparse_or_hyper = A_is_sparse || A_is_hyper ;
+    bool A_is_bitmap_or_full = A_is_bitmap || A_is_full ;
+
+    const int64_t bnz = GB_nnz (B) ;
+    const int64_t bnvec = B->nvec ;
+    bool B_is_sparse = GB_IS_SPARSE (B) ;
+    bool B_is_hyper = GB_IS_HYPERSPARSE (B) ;
+    bool B_is_bitmap = GB_IS_BITMAP (B) ;
+    bool B_is_full = GB_IS_FULL (B) ;
+    bool B_is_sparse_or_hyper = B_is_sparse || B_is_hyper ;
+    bool B_is_bitmap_or_full = B_is_bitmap || B_is_full ;
+
+    //--------------------------------------------------------------------------
+    // get the semiring operators
+    //--------------------------------------------------------------------------
+
+    GrB_BinaryOp mult = semiring->multiply ;
+    GrB_Monoid add = semiring->add ;
+    ASSERT (mult->ztype == add->op->ztype) ;
+    GB_Opcode mult_opcode = mult->opcode ;
+    if (mult->xtype->code == GB_BOOL_code)
+    {
+        mult_opcode = GB_boolean_rename (mult_opcode) ;
+    }
+    bool A_is_pattern, B_is_pattern ;
+    GB_binop_pattern (&A_is_pattern, &B_is_pattern, flipxy, mult_opcode) ;
+
+    //--------------------------------------------------------------------------
+    // allocate C, the same size and # of entries as M
+    //--------------------------------------------------------------------------
+
+    // FUTURE: ctype need not be the op->ztype
+    GrB_Type ctype = add->op->ztype ;
+    int64_t cvlen = mvlen ;
+    int64_t cvdim = mvdim ;
+    int64_t cnz = mnz ;
+    int64_t cnvec = mnvec ;
+
+    int M_sparsity = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
+    int C_sparsity = M_sparsity ;
+    bool C_iso = false ;        // FIXME: pass in C_iso and cscalar
+    bool C_in_iso = false ;     // FIXME: pass in C_in_iso and cscalar
+
+    if (C_iso)
+    {
+        A_is_pattern = true ;
+        B_is_pattern = true ;
+    }
+
+    GB_OK (GB_new_bix (&C, // sparse or hyper (from M), existing header
+        ctype, cvlen, cvdim, GB_Ap_malloc, true,
+        M_sparsity, false, M->hyper_switch, cnvec,
+        cnz+1,  // add one to cnz for GB_cumsum of Cwork
+        true, C_iso)) ;
+
+    //--------------------------------------------------------------------------
+    // Pre-fetch arrays that will be used on the device
+    //--------------------------------------------------------------------------
+
+    // GB_cuda_matrix_advise (C, cnvec, cnz, which, what, device)
+    // advise C
+    CUDA_OK (cudaMemAdvise (C->p, (cnvec+1) * sizeof ( int64_t),
+        cudaMemAdviseSetPreferredLocation, device)) ;
+    if (M_is_hyper)
+    {
+        CUDA_OK (cudaMemAdvise (C->h, cnvec * sizeof ( int64_t),
+            cudaMemAdviseSetPreferredLocation, device)) ;
+    }
+    CUDA_OK (cudaMemAdvise (C->i, (cnz+1) * sizeof ( int64_t),
+        cudaMemAdviseSetPreferredLocation, device)) ;
+    if (!C_iso)
+    {
+        CUDA_OK (cudaMemAdvise (C->x, (cnz+1) * C->type->size ,
+            cudaMemAdviseSetPreferredLocation, device)) ;
+    }
+
+    // prefetch M (if M hypersparse: using M->h not M->Y)
+    GB_OK (GB_cuda_matrix_prefetch (M,
+        Mask_struct ? GB_PREFETCH_PHBI : GB_PREFETCH_PHBIX, device, stream)) ;
+
+    //--------------------------------------------------------------------------
+    // copy Mp and Mh into C
+    //--------------------------------------------------------------------------
+
+    // FIXME: use shallow?
+    CUDA_OK (cudaMemcpyAsync (C->p, M->p, (cnvec+1) * sizeof (int64_t),
+        cudaMemcpyDefault, stream)) ;
+    if (M_is_hyper)
+    {
+        CUDA_OK (cudaMemcpyAsync (C->h, M->h, cnvec * sizeof (int64_t),
+            cudaMemcpyDefault, stream)) ;
+    }
+
+    C->nvals = cnz ;
+    C->magic = GB_MAGIC ;
+    C->nvec_nonempty = M->nvec_nonempty ;
+    C->jumbled = GB_JUMBLED (M) ;   // C is jumbled if M is jumbled
+
+    GBURBLE ("(GPU C created and copied from M) ") ;
+
+    //--------------------------------------------------------------------------
+    // prefetch A and B
+    //--------------------------------------------------------------------------
+
+    // M might be very very sparse.  A(:,i) is not needed if M(:,i) is empty.
+    // Likewise, B(:,j) is not needed if M(:,j) is empty.  For now, try this
+    // heuristic:  if M is hypersparse, then do not prefetch A->b or A->x.
+
+    int prefetch_b = (M_is_hyper) ? 0 : GB_PREFETCH_B ;
+    int prefetch_x = (M_is_hyper) ? 0 : GB_PREFETCH_X ;
+    int prefetch_pybi = GB_PREFETCH_PYI + prefetch_b ;
+
+    // prefetch A (if A hypersparse: using A->Y)
+    GB_OK (GB_cuda_matrix_prefetch (A, prefetch_pybi +
+        (A_is_pattern ? 0 : prefetch_x), device, stream)) ;
+
+    // prefetch B (if B hypersparse: using B->Y)
+    GB_OK (GB_cuda_matrix_prefetch (B, prefetch_pybi +
+        (B_is_pattern ? 0 : prefetch_x), device, stream)) ;
+
+    //--------------------------------------------------------------------------
+    // C=A'*B on CUDA, in the JIT
+    //--------------------------------------------------------------------------
+
+// final call looks like this:
+//  GB_OK (GB_cuda_AxB_dot3_jit (C, M, Mask_struct, A, B, semiring, flipxy,
+//      stream, device, number_of_sms)) ;
+
+// debugging for now, to die early if the CUDA fails to compile, load, or run:
+    info = GB_cuda_AxB_dot3_jit (C, M, Mask_struct, A, B, semiring, flipxy,
+        stream, device, number_of_sms) ;
+    if (info == GrB_NO_VALUE) info = GrB_PANIC ;
+    GB_OK (info) ;
+
+    //--------------------------------------------------------------------------
+    // free workspace and return result
+    //--------------------------------------------------------------------------
+
+    GB_FREE_WORKSPACE ;
+    return GrB_SUCCESS;
+}
+
diff --git a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch.cpp b/GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch.cpp
index c69dc2132f..cac90233f2 100644
--- a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch.cpp
+++ b/GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch.cpp
@@ -2,20 +2,14 @@
 // GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch: decide to use GPU for dot3
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-// Decide branch direction for GPU use for the dot-product MxM
+// Decide branch direction for GPU use for the dot-product C=A'*B
 
-#include "GraphBLAS_cuda.h"
-
-extern "C"
-{
-    #include "GB_mxm.h"
-}
-#include "GB_cuda.h"
+#include "GB_cuda.hpp"
 
 #include
 
 bool GB_cuda_AxB_dot3_branch
@@ -36,8 +30,12 @@ bool GB_cuda_AxB_dot3_branch
         !GB_cuda_type_branch (semiring->multiply->ztype))
     {
         // one or more types are not yet supported on the GPU
-        // FIXME: remove debug output here:
-        std::cout << "Not using cuda path: type size not supported" << std::endl;
+        return (false) ;
+    }
+
+    if (A->vlen == 0)
+    {
+        // C has no entries: no need to compute it on the GPU
         return (false) ;
     }
 
@@ -46,9 +44,6 @@ bool GB_cuda_AxB_dot3_branch
     double bdeg = ((double) GB_nnz (B)) / ((double) GB_IMAX (1, B->nvec)) ;
     double work = GB_nnz (M) * GB_IMIN (adeg, bdeg) ;
 
-    // TODO if A or B are not accessed (first, 2nd, or pair ops)
-    // then the type if A can be user-defined here, for CUDA.
-
     int ngpus_to_use = GB_ngpus_to_use (work) ;
     GBURBLE (" work:%g GPUs:%d ", work, ngpus_to_use) ;
     if (ngpus_to_use > 0)
@@ -60,9 +55,8 @@ bool GB_cuda_AxB_dot3_branch
     }
     else
     {
-        // FIXME: remove debug output here:
-        std::cout << "Not using cuda path." << std::endl;
+//      std::cout << "Not using cuda path for dot3." << std::endl;
         return false ;
     }
-}
+}
+
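GB_cuda_AxB_dot3 and its branch predicate above are reached from GrB_mxm: the dot3 method applies when a mask is present and A arrives transposed. A minimal user-level sketch of such a call follows (assuming a CUDA-enabled build of SuiteSparse:GraphBLAS; the helper name masked_dot_product is hypothetical).

    #include "GraphBLAS.h"

    GrB_Info masked_dot_product (GrB_Matrix C, GrB_Matrix M,
        GrB_Matrix A, GrB_Matrix B)
    {
        // C<M> = A'*B with the PLUS_TIMES semiring; with a sparse mask and
        // GrB_DESC_T0 (transpose A), SuiteSparse can select the dot3 method,
        // and GB_cuda_AxB_dot3_branch decides whether it runs on the GPU.
        return (GrB_mxm (C, M, NULL, GrB_PLUS_TIMES_SEMIRING_FP64, A, B,
            GrB_DESC_T0)) ;
    }
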
diff --git a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit.cpp b/GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit.cpp
index 5933403dd9..fab1a7bdad 100644
--- a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit.cpp
+++ b/GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit.cpp
@@ -1,464 +1,65 @@
 //------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit: compute C = A'*B on GPU(s)
+// GB_cuda_AxB_dot3_jit: compute C=A'*B via the CUDA JIT
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-// This function computes C=A'*B on the GPUs.  The mask must be present,
-// sparse or hypersparse, and not complemented.  The mask is always applied.  A
-// and B can have any sparsity format.  C is computed as sparse or hypersparse,
-// with the same format as M.
+#include "GB_cuda_AxB.hpp"
 
-#include "GB_cuda.h"
 extern "C"
 {
-    #include "GB_mxm.h"
+    typedef GB_JIT_CUDA_KERNEL_DOT3_PROTO ((*GB_jit_dl_function)) ;
 }
 
-#include "GB_cuda_jitify_cache.h"
-#include "GB_cuda_common_jitFactory.hpp"
-#include "GB_cuda_reduce_jitFactory.hpp"
-#include "GB_cuda_mxm_dot3_jitFactory.hpp"
-#include "test/GpuTimer.h"
-
-/*
-template <typename I>
-void print_array(void *arr, I size, const char *name) {
-    std::cout << "Printing " << name << std::endl;
-    for(I i = 0; i < size; ++i) {
-        std::cout << static_cast<I*>(arr)[i] << ", ";
-    }
-    std::cout << std::endl << "Done." << std::endl;
-}
-*/
-
-#undef  GB_FREE_WORKSPACE
-#define GB_FREE_WORKSPACE                                   \
-{                                                           \
-    /* FIXME: use a stream pool instead */                  \
-    CU_OK (cudaStreamSynchronize(stream));                  \
-    CU_OK (cudaStreamDestroy(stream));                      \
-    GB_FREE_WORK (&Nanobuckets, Nb_size) ;                  \
-    GB_FREE_WORK (&Blockbucket, Bb_size) ;                  \
-    GB_FREE_WORK (&Bucketp, Bup_size) ;                     \
-    GB_FREE_WORK (&offset, O_size) ;                        \
-    GB_FREE_WORK (&Bucket, Bu_size) ;                       \
-}
-
-#undef  GB_FREE_ALL
-#define GB_FREE_ALL                                         \
-{                                                           \
-    GB_FREE_WORKSPACE ;                                     \
-    GB_phybix_free (C) ;                                    \
-}
-
-//------------------------------------------------------------------------------
-// GB_AxB_dot3_cuda
-//------------------------------------------------------------------------------
-
-GrB_Info GB_cuda_AxB_dot3_jit       // C = A'*B using dot product method
+GrB_Info GB_cuda_AxB_dot3_jit
 (
-    GrB_Matrix C,                   // output matrix
-    const GrB_Matrix M,             // mask matrix
-    const bool Mask_struct,         // if true, use only the structure of M
-    const GrB_Matrix A,             // input matrix
-    const GrB_Matrix B,             // input matrix
-    const GrB_Semiring semiring,    // semiring that defines C=A*B
-    const bool flipxy               // if true, do z=fmult(b,a) vs fmult(a,b)
+    // input/output:
+    GrB_Matrix C,               // FIXME: allow iso for this kernel
+    // input:
+    const GrB_Matrix M, const bool Mask_struct,
+    const GrB_Matrix A,
+    const GrB_Matrix B,
+    const GrB_Semiring semiring,
+    const bool flipxy,
+    // CUDA stream, device, and # of SMs
+    cudaStream_t stream,
+    int device,
+    int number_of_sms
 )
-{
-
-    // FIXME: pass in a stream instead, or checkout a stream
-    cudaStream_t stream = NULL ;
-    CU_OK (cudaStreamCreate(&stream));
-
-    GpuTimer kernel_timer;
-
-    //--------------------------------------------------------------------------
-    // check inputs
-    //--------------------------------------------------------------------------
-
-    // when CUDA is enabled, no static headers are used in all of GraphBLAS
-    GrB_Info info ;
-    ASSERT (C != NULL && !(C->static_header)) ;
-    ASSERT (M != NULL && !(M->static_header)) ;
-    ASSERT (A != NULL && !(A->static_header)) ;
-    ASSERT (B != NULL && !(B->static_header)) ;
-
-    ASSERT_MATRIX_OK (M, "M for dot3 cuda A'*B", GB0) ;
-    ASSERT_MATRIX_OK (A, "A for dot3 cuda A'*B", GB0) ;
-    ASSERT_MATRIX_OK (B, "B for dot3 cuda A'*B", GB0) ;
-
-    ASSERT (!GB_PENDING (M)) ;
-    ASSERT (GB_JUMBLED_OK (M)) ;
-    ASSERT (!GB_ZOMBIES (M)) ;
-
-    ASSERT (!GB_PENDING (A)) ;
-    ASSERT (!GB_JUMBLED (A)) ;
-    ASSERT (!GB_ZOMBIES (A)) ;
-
-    ASSERT (!GB_PENDING (B)) ;
-    ASSERT (!GB_ZOMBIES (B)) ;
-    ASSERT (!GB_JUMBLED (B)) ;
-
-    ASSERT_SEMIRING_OK (semiring, "semiring for dot3 numeric A'*B", GB0) ;
-
-    ASSERT (A->vlen == B->vlen) ;
-    GBURBLE ("(GPU dot3) ") ;
-    //printf ("\nM -------------\n") ; GxB_Matrix_fprint (M, "M", GxB_SHORT, stdout) ;
-    //printf ("\nA -------------\n") ; GxB_Matrix_fprint (A, "A", GxB_SHORT, stdout) ;
-    //printf ("\nB -------------\n") ; GxB_Matrix_fprint (B, "B", GxB_SHORT, stdout) ;
-
-    //--------------------------------------------------------------------------
-    // initializations
-    //--------------------------------------------------------------------------
-
-    int64_t *Nanobuckets = NULL ; size_t Nb_size = 0 ;
-    int64_t *Blockbucket = NULL ; size_t Bb_size = 0 ;
-    int64_t *Bucket = NULL ; size_t Bu_size = 0 ;
-    int64_t *Bucketp = NULL ; size_t Bup_size = 0 ;
-    int64_t *offset = NULL ; size_t O_size = 0 ;
-
-    int device = -1;
-
-    // FIXME: control the GPU to use via the descriptor
-    CU_OK (cudaSetDevice( 0 ));
-    CU_OK (cudaGetDevice(&device));
-
-    //--------------------------------------------------------------------------
-    // get M
-    //--------------------------------------------------------------------------
-
-    const int64_t mvlen = M->vlen ;
-    const int64_t mvdim = M->vdim ;
-    const int64_t mnz = GB_nnz (M) ;
-    const int64_t mnvec = M->nvec ;
-    const bool M_is_hyper = GB_IS_HYPERSPARSE( M ) ;
-
-    const int64_t anz = GB_nnz (A) ;
-    const int64_t anvec = A->nvec ;
-
-    const int64_t bnz = GB_nnz (B) ;
-    const int64_t bnvec = B->nvec ;
-
-    //--------------------------------------------------------------------------
-    // allocate C, the same size and # of entries as M
-    //--------------------------------------------------------------------------
-
-    // FUTURE: ctype need not be the op->ztype
-    GrB_Type ctype = semiring->add->op->ztype ;
-    int64_t cvlen = mvlen ;
-    int64_t cvdim = mvdim ;
-    int64_t cnz = mnz ;
-    int64_t cnvec = mnvec ;
-
-    int M_sparsity = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
-    int C_sparsity = M_sparsity ;
-    bool C_iso = false ;        // FIXME: pass in C_iso and cscalar
-    bool C_in_iso = false ;     // FIXME: pass in C_in_iso and cscalar
-    info = GB_new_bix (&C, // sparse or hyper (from M), existing header
-        ctype, cvlen, cvdim, GB_Ap_malloc, true,
-        M_sparsity, false, M->hyper_switch, cnvec,
-        cnz+1,  // add one to cnz for GB_cumsum of Cwork
-        true, C_iso) ;
-
-    if (info != GrB_SUCCESS)
-    {
-        // out of memory
-        GB_FREE_ALL ;
-        return (info) ;
-    }
-
-// try this with GB_Ap_null, above in GB_new_bix
-// C->p = M->p ; C->p_shallow = true ;
-// C->h = M->h ; C->h_shallow = true ;
-
-    //--------------------------------------------------------------------------
-    // Pre-fetch arrays that will be used on the device
-    //--------------------------------------------------------------------------
-
-    // GB_cuda_matrix_advise (C, cnvec, cnz, which, what, device)
-    // advise C
-    CU_OK (cudaMemAdvise (C->p, (cnvec+1) * sizeof ( int64_t),
-        cudaMemAdviseSetPreferredLocation, device)) ;
-    if (M_is_hyper)
-    {
-        CU_OK (cudaMemAdvise (C->h, cnvec * sizeof ( int64_t),
-            cudaMemAdviseSetPreferredLocation, device)) ;
-    }
-    CU_OK (cudaMemAdvise (C->i, (cnz+1) * sizeof ( int64_t),
-        cudaMemAdviseSetPreferredLocation, device)) ;
-    if (!C_iso)
-    {
-        CU_OK (cudaMemAdvise (C->x, (cnz+1) * C->type->size ,
-            cudaMemAdviseSetPreferredLocation, device)) ;
-    }
-
-    // prefetch M (if M hypersparse: using M->h not M->Y)
-    GB_OK (GB_cuda_matrix_prefetch (M,
-        Mask_struct ? GB_PREFETCH_PHBI : GB_PREFETCH_PHBIX, device, stream)) ;
-
-    //--------------------------------------------------------------------------
-    // copy Mp and Mh into C
-    //--------------------------------------------------------------------------
-
-    // FIXME: use shallow?
-    CU_OK (cudaMemcpyAsync (C->p, M->p, (cnvec+1) * sizeof (int64_t),
-        cudaMemcpyDefault, stream)) ;
-    if (M_is_hyper)
-    {
-        CU_OK (cudaMemcpyAsync (C->h, M->h, cnvec * sizeof (int64_t),
-            cudaMemcpyDefault, stream)) ;
-    }
-
-    C->nvals = cnz ;
-    C->magic = GB_MAGIC ;
-    C->nvec_nonempty = M->nvec_nonempty ;
-    C->jumbled = GB_JUMBLED (M) ;   // C is jumbled if M is jumbled
-
-    GBURBLE ("(GPU C created and copied from M) ") ;
+{
 
     //--------------------------------------------------------------------------
-    // stringify the semiring and the mask
+    // encodify the problem
     //--------------------------------------------------------------------------
 
-    GB_cuda_mxm_factory my_mxm_spec = GB_cuda_mxm_factory ( ) ;
-
-    // (1) create the mxm code and name
-    my_mxm_spec.mxm_factory ( C_iso, C_in_iso, C_sparsity, ctype,
+    GB_jit_encoding encoding ;
+    char *suffix ;
+    uint64_t hash = GB_encodify_mxm (&encoding, &suffix,
+        GB_JIT_CUDA_KERNEL_AXB_DOT3,
+        // FIXME: allow C to be iso
+        /* C->iso: */ false, false, GB_sparsity (C), C->type,
         M, Mask_struct, false, semiring, flipxy, A, B) ;
 
-    // (2) ensure the jitifier has "GB_mxm_[my_mxm_spec.sr_code].h"
-    jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
-    filecache.getFile (my_mxm_spec) ;
-
-    GBURBLE ("(GPU stringified srcode = %lu)\n", my_mxm_spec.sr_code) ;
-
-    //--------------------------------------------------------------------------
-    // get A and B
-    //--------------------------------------------------------------------------
-
-    // FIXME: add acode, bcode to the GB_cuda_mxm_factory object
-    int acode = GB_RSHIFT (my_mxm_spec.sr_code, 12, 4) ;    // if 0: A is pattern
-    int bcode = GB_RSHIFT (my_mxm_spec.sr_code,  8, 4) ;    // if 0: B is pattern
-
-    bool A_is_sparse = GB_IS_SPARSE (A) ;
-    bool A_is_hyper = GB_IS_HYPERSPARSE (A) ;
-    bool A_is_bitmap = GB_IS_BITMAP (A) ;
-    bool A_is_full = GB_IS_FULL (A) ;
-    bool A_is_sparse_or_hyper = A_is_sparse || A_is_hyper ;
-    bool A_is_bitmap_or_full = A_is_bitmap || A_is_full ;
-    bool A_is_pattern = (acode == 0) ;
-
-    bool B_is_sparse = GB_IS_SPARSE (B) ;
-    bool B_is_hyper = GB_IS_HYPERSPARSE (B) ;
-    bool B_is_bitmap = GB_IS_BITMAP (B) ;
-    bool B_is_full = GB_IS_FULL (B) ;
-    bool B_is_sparse_or_hyper = B_is_sparse || B_is_hyper ;
-    bool B_is_bitmap_or_full = B_is_bitmap || B_is_full ;
-    bool B_is_pattern = (bcode == 0) ;
-
-    // M might be very very sparse.  A(:,i) is not needed if M(:,i) is empty.
-    // Likewise, B(:,j) is not needed if M(:,j) is empty.  For now, try this
-    // heuristic:  if M is hypersparse, then do not prefetch A->b or A->x.
-
-    int prefetch_b = (M_is_hyper) ? 0 : GB_PREFETCH_B ;
-    int prefetch_x = (M_is_hyper) ? 0 : GB_PREFETCH_X ;
-    int prefetch_pybi = GB_PREFETCH_PYI + prefetch_b ;
-
-    // prefetch A (if A hypersparse: using A->Y)
-    GB_OK (GB_cuda_matrix_prefetch (A, prefetch_pybi +
-        (A_is_pattern ? 0 : prefetch_x), device, stream)) ;
-
-    // prefetch B (if B hypersparse: using B->Y)
-    GB_OK (GB_cuda_matrix_prefetch (B, prefetch_pybi +
-        (B_is_pattern ? 0 : prefetch_x), device, stream)) ;
-
     //--------------------------------------------------------------------------
-    // C=A'*B via jitified kernels
+    // get the kernel function pointer, loading or compiling it if needed
     //--------------------------------------------------------------------------
 
-    if (A_is_bitmap_or_full && B_is_bitmap_or_full)
-    {
-
-        //----------------------------------------------------------------------
-        // (full or bitmap) times (full or bitmap)
-        //----------------------------------------------------------------------
-
-        dense_phase1launchFactory dp1lf(my_mxm_spec);
-
-        GBURBLE ("(GPU dense phase1 start nblk = %d) ",
-            dp1lf.get_number_of_blocks(M)) ;
-        kernel_timer.Start();
-        dp1lf.jitGridBlockLaunch(C, M, A, B, stream);
-        CU_OK (cudaStreamSynchronize(stream));
-        kernel_timer.Stop();
-        GBURBLE ("(GPU phase1 done %12.6g ms )\n", kernel_timer.Elapsed()) ;
-
-        mxm_dense_launchFactory mdlf(my_mxm_spec);
-        GBURBLE ("(GPU Dense full x full launch ) ") ;
-        kernel_timer.Start();
-        mdlf.jitGridBlockLaunch( C, M, A, B, stream);
-        CU_OK (cudaStreamSynchronize(stream));  // only for timing
-        kernel_timer.Stop();
-        GBURBLE ("(GPU Dense full x full done %12.6g ms, rate=%12.6g)\n",
-            kernel_timer.Elapsed(), (mnvec)/(1000*kernel_timer.Elapsed())) ;
-
-    }
-    else
-    {
-
-        //----------------------------------------------------------------------
-        // (sparse or hyper) times (sparse or hyper)
-        // (sparse or hyper) times (bitmap or full)
-        // (bitmap or full) times (sparse or hyper)
-        //----------------------------------------------------------------------
-
-        //----------------------------------------------------------------------
-        // construct the tasks for phase1 and phase2
-        //----------------------------------------------------------------------
-
-        // on the CPU: nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;
-        // on the GPU:
-        phase1launchFactory p1lf(my_mxm_spec);
-        phase2launchFactory p2lf;
-        phase2endlaunchFactory p2elf;
-
-        // # of threads in phase1 and phase2 kernel launches are related
-        // # by the size of the warp.  ph2_task = ph1_task/32 for example
-        int nthrd = p2lf.get_threads_per_block();
-        int ntasks = p2elf.get_number_of_blocks(M);
-
-        int64_t nanobuckets_size = NBUCKETS * nthrd * ntasks;
-        int64_t blockbuckets_size = NBUCKETS * ntasks;
-
-        Nanobuckets = GB_MALLOC_WORK (nanobuckets_size, int64_t, &Nb_size) ;
-        Blockbucket = GB_MALLOC_WORK (blockbuckets_size, int64_t, &Bb_size) ;
-        Bucketp = GB_MALLOC_WORK (NBUCKETS+1, int64_t, &Bup_size) ;
-        offset = GB_MALLOC_WORK (NBUCKETS, int64_t, &O_size) ;
-        Bucket = GB_MALLOC_WORK (mnz, int64_t, &Bu_size) ;
-
-        if (Nanobuckets == NULL || Blockbucket == NULL || Bucketp == NULL
-            || Bucket == NULL || offset == NULL)
-        {
-            // out of memory
-            GB_FREE_ALL ;
-            return (GrB_OUT_OF_MEMORY) ;
-        }
-
-        // FIXME: do async with streams
-        // FIXME: do we need any of these?
-        //CU_OK (cudaMemsetAsync(Nanobuckets, 0,
-        //    nanobuckets_size * sizeof(int64_t), stream));
-        //CU_OK (cudaMemsetAsync(Blockbucket, 0,
-        //    blockbuckets_size * sizeof(int64_t), stream));
-        CU_OK (cudaMemsetAsync(Bucketp, 0,
-            (NBUCKETS+1) * sizeof(int64_t), stream));
-        CU_OK (cudaMemsetAsync(offset, 0,
-            NBUCKETS * sizeof(int64_t), stream));
-        //CU_OK (cudaMemsetAsync(Bucket, 0,
-        //    mnz * sizeof(int64_t), stream));
-
-        //----------------------------------------------------------------------
-        // phase1 and phase2: place each C(i,j) in a bucket
-        //----------------------------------------------------------------------
-
-        CU_OK (cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t),
-            cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
-        CU_OK (cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t),
-            cudaMemAdviseSetAccessedBy, device));
-
-        CU_OK (cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t),
-            cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
-        CU_OK (cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t),
-            cudaMemAdviseSetAccessedBy, device));
-
-        //----------------------------------------------------------------------
-        // phase1: assign each C(i,j) to a bucket, and count them
-        //----------------------------------------------------------------------
-
-        GBURBLE ("(GPU sparse phase1 start nblk = %d) ",
-            p1lf.get_number_of_blocks(M));
-        kernel_timer.Start();
-        p1lf.jitGridBlockLaunch(Nanobuckets, Blockbucket, C, M, A, B, stream);
-        CU_OK (cudaStreamSynchronize(stream));
-        kernel_timer.Stop();
-
-        GBURBLE ("(GPU phase1 done %12.6g ms )\n", kernel_timer.Elapsed()) ;
-
-        //----------------------------------------------------------------------
-        // phase2: cumsum across the blockbuckets, propagate to thread level
-        //----------------------------------------------------------------------
-
-        GBURBLE ("(GPU phase2 start nblk=%d ) ", ntasks) ;
-
-        kernel_timer.Start();
-        p2lf.jitGridBlockLaunch(Blockbucket, offset, M, stream);
-        kernel_timer.Stop();
-
-        CU_OK (cudaStreamSynchronize(stream));
-
-        int64_t s= offset[0];
-        C->nzombies = s;
-        bool all_in_one = false;
-        for ( int bucket = 1 ; bucket < NBUCKETS+1; ++bucket)
-        {
-            Bucketp[bucket] = s;
-            s += offset[bucket];
-            if ( (Bucketp[bucket] - Bucketp[bucket-1] ) == mnz )
-            {
-                all_in_one = true;
-            }
-        }
-
-        GBURBLE ("(GPU phase2 done %12.6g ms )\n", kernel_timer.Elapsed()) ;
-
-        if (!all_in_one)
-        {
-            GBURBLE ("(GPU phase2end start nblk=%d) ", ntasks) ;
-
-            kernel_timer.Start();
-            p2elf.jitGridBlockLaunch(Nanobuckets, Blockbucket,
-                Bucketp, Bucket, offset, C, M, stream);
-
-            CU_OK (cudaStreamSynchronize(stream));
-            kernel_timer.Stop();
-            GBURBLE ("(GPU phase2end done %12.6g ms)\n",kernel_timer.Elapsed());
-        }
-
-        //----------------------------------------------------------------------
-        // phase3: do the numerical work
-        //----------------------------------------------------------------------
-
-        for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket)
-        {
-            int64_t start = Bucketp[bucket];
-            int64_t end = Bucketp[bucket + 1 ];
-            if (end - start > 0)
-            {
-                // TODO: Use stream pool
-                phase3launchFactory p3lf(my_mxm_spec, (GB_bucket_code)bucket);
-                GBURBLE ("(GPU phase3 bucket %d launch ) ", bucket) ;
-                kernel_timer.Start();
-                p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket,
-                    C, M, A, B, stream);
-                CU_OK (cudaStreamSynchronize(stream));  // only for timing
-                kernel_timer.Stop();
-                GBURBLE ("(GPU phase3 bucket %d done %12.6g ms, rate=%12.6g)\n",
-                    bucket, kernel_timer.Elapsed(),
-                    (end-start)/(1000*kernel_timer.Elapsed())) ;
-            }
-        }
-    }
+    void *dl_function ;
+    GrB_Info info = GB_jitifyer_load (&dl_function,
+        GB_jit_mxm_family, "cuda_AxB_dot3",
+        hash, &encoding, suffix, semiring, NULL,
+        NULL, C->type, A->type, B->type) ;
+    if (info != GrB_SUCCESS) return (info) ;
 
     //--------------------------------------------------------------------------
-    // free workspace and return result
+    // call the jit kernel and return result
     //--------------------------------------------------------------------------
 
-    GB_FREE_WORKSPACE ;
-    return GrB_SUCCESS;
+    GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ;
+    return (GB_jit_kernel (C, M, A, B, stream, device, number_of_sms,
+        &GB_callback)) ;
 }
 
diff --git a/GraphBLAS/CUDA/GB_cuda_common_jitFactory.hpp b/GraphBLAS/CUDA/GB_cuda_common_jitFactory.hpp
deleted file mode 100644
index 5d7ad01e6f..0000000000
--- a/GraphBLAS/CUDA/GB_cuda_common_jitFactory.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_common_jitFactory.hpp: for all jitFactory classes
-//------------------------------------------------------------------------------
-
-// (c) Nvidia Corp. 2023 All rights reserved
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-// Common defines for all jitFactory classes:
-// iostream callback to deliver the buffer to jitify as if read from a file
-// compiler flags
-// Include this file along with any jitFactory you need.
-
-// NOTE: do not edit the GB_cuda_common_jitFactory.hpp directly.  It is
-// configured by cmake from the following file:
-// GraphBLAS/CUDA/Config/GB_cuda_common_jitFactory.hpp.in
-
-#ifndef GB_CUDA_COMMON_JITFACTORY_HPP
-#define GB_CUDA_COMMON_JITFACTORY_HPP
-
-#pragma once
-
-#include "GraphBLAS_cuda.h"
-
-extern "C"
-{
-    #include "GB.h"
-    #include "GB_stringify.h"
-}
-
-#include
-#include
-#include "GB_cuda_jitify_cache.h"
-#include "GB_cuda_jitify_launcher.h"
-#include "GB_cuda_mxm_factory.hpp"
-#include "GB_cuda_error.h"
-#include "../rmm_wrap/rmm_wrap.h"
-#include "GB_iceil.h"
-
-// amount of shared memory to use in CUDA kernel launches
-constexpr unsigned int SMEM = 0 ;
-
-#if 0
-
-static const std::vector<std::string> GB_jit_cuda_compiler_flags{   // OLD
-    "-std=c++17",
-    //"-G",
-    "-remove-unused-globals",
-    "-w",
-    "-D__CUDACC_RTC__",
-//  "-I" + jit::get_user_home_cache_dir(),  // FIXME: add +/cu/00
-//  "-I" + jit::get_user_home_cache_dir() + "/src",
-    "-I/usr/local/cuda/include",
-    // FIXME: add SUITESPARSE_CUDA_ARCHITECTURES here, via config
-};
-
-#endif
-
-inline std::vector<std::string> GB_cuda_jit_compiler_flags ( )
-{
-    return (
-        std::vector<std::string> (
-        {"-std=c++17",
-        //"-G",
-        "-remove-unused-globals",
-        "-w",
-        "-D__CUDACC_RTC__",
-        "-I" + jit::get_user_home_cache_dir(),  // FIXME: add +/cu/00
-        "-I" + jit::get_user_home_cache_dir() + "/src",
-        "-I/usr/local/cuda/include"
-        // FIXME: add SUITESPARSE_CUDA_ARCHITECTURES here, via config
-        })) ;
-} ;
-
-// FIXME: rename GB_jit_cuda_header_names or something
-static const std::vector<std::string> header_names ={};
-
-// FIXME: rename GB_jit_cuda_file_callback
-inline std::istream* (*file_callback)(std::string, std::iostream&);
-
-#endif
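The deleted jitFactory machinery above cached jitify programs and kernel instantiations by name; the replacement path in GB_cuda_AxB_dot3_jit instead encodes the problem, hashes the encoding, and asks GB_jitifyer_load for a ready function pointer. A generic sketch of that hash-keyed kernel-cache pattern follows (illustrative only; none of these names are GraphBLAS APIs).

    #include <cstdint>
    #include <unordered_map>
    #include <mutex>

    using kernel_fn = int (*) (void *args) ;    // generic kernel entry point

    static std::unordered_map<uint64_t, kernel_fn> cache ;
    static std::mutex cache_lock ;

    kernel_fn get_kernel (uint64_t hash, kernel_fn (*compile) (uint64_t))
    {
        std::lock_guard<std::mutex> guard (cache_lock) ;
        auto it = cache.find (hash) ;
        if (it != cache.end ( )) return (it->second) ;  // hit: reuse kernel
        kernel_fn f = compile (hash) ;                  // miss: compile it
        cache [hash] = f ;
        return (f) ;
    }
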
diff --git a/GraphBLAS/CUDA/GB_cuda_error.h b/GraphBLAS/CUDA/GB_cuda_error.h
deleted file mode 100644
index d9aec9b3ff..0000000000
--- a/GraphBLAS/CUDA/GB_cuda_error.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_error.h
-//------------------------------------------------------------------------------
-
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-/*
- * Copyright (c) 2023 NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#ifndef GB_CUDA_ERROR_H
-#define GB_CUDA_ERROR_H
-
-#include
-
-static const char *_cudaGetErrorEnum(cudaError_t error) {
-  return cudaGetErrorName(error);
-}
-
-template <typename T>
-void check(T result, char const *const func, const char *const file,
-           int const line) {
-  if (result) {
-    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
-            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
-    exit(EXIT_FAILURE);
-  }
-}
-
-#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
-
-// This will output the proper error string when calling cudaGetLastError
-#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
-
-inline void __getLastCudaError(const char *errorMessage, const char *file,
-                               const int line) {
-  cudaError_t err = cudaGetLastError();
-
-  if (cudaSuccess != err) {
-    fprintf(stderr,
-            "%s(%i) : getLastCudaError() CUDA error :"
-            " %s : (%d) %s.\n",
-            file, line, errorMessage, static_cast<int>(err),
-            cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
-}
-
-// This will only print the proper error string when calling cudaGetLastError
-// but not exit program incase error detected.
-#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
-
-inline void __printLastCudaError(const char *errorMessage, const char *file,
-                                 const int line) {
-  cudaError_t err = cudaGetLastError();
-
-  if (cudaSuccess != err) {
-    fprintf(stderr,
-            "%s(%i) : getLastCudaError() CUDA error :"
-            " %s : (%d) %s.\n",
-            file, line, errorMessage, static_cast<int>(err),
-            cudaGetErrorString(err));
-  }
-}
-#define CHECK_CUDA(call) checkCudaErrors( call )
-
-#endif
diff --git a/GraphBLAS/CUDA/GB_cuda_get_device_count.cu b/GraphBLAS/CUDA/GB_cuda_get_device_count.cu
index 7cad833fa1..3f0d074dd4 100644
--- a/GraphBLAS/CUDA/GB_cuda_get_device_count.cu
+++ b/GraphBLAS/CUDA/GB_cuda_get_device_count.cu
@@ -2,12 +2,13 @@
 // GraphBLAS/CUDA/GB_cuda_get_device_count.cu: find out how many GPUs exist
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-#include "GB_cuda.h"
+#include "GB_cuda.hpp"
 
 bool GB_cuda_get_device_count   // true if OK, false if failure
 (
diff --git a/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu b/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu
index 7bb7e1407f..daaac9a214 100644
--- a/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu
+++ b/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu
@@ -2,12 +2,17 @@
 // GraphBLAS/CUDA/GB_cuda_get_device_properties: get the properties of a GPU
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-#include "GB_cuda.h"
+#include "GB_cuda.hpp"
 
+#define CU_OK(cudaMethod)                               \
+{                                                       \
+    if ((cudaMethod) != cudaSuccess) return (false) ;   \
+}
 
 //------------------------------------------------------------------------------
 // GB_cuda_get_device: get the current GPU
@@ -20,7 +25,7 @@ bool GB_cuda_get_device (int &device)
         // invalid inputs
         return (false) ;
     }
-    CHECK_CUDA_SIMPLE (cudaGetDevice (&device)) ;
+    CU_OK (cudaGetDevice (&device)) ;
     return (true) ;
 }
 
@@ -35,7 +40,7 @@ bool GB_cuda_set_device (int device)
         // invalid inputs
         return (false) ;
     }
-    CHECK_CUDA_SIMPLE (cudaSetDevice (device)) ;
+    CU_OK (cudaSetDevice (device)) ;
    return (true) ;
 }
 
@@ -64,7 +69,7 @@ bool GB_cuda_get_device_properties  // true if OK, false if failure
     memset (prop, 0, sizeof (GB_cuda_device)) ;
     int old_device ;
-    CHECK_CUDA_SIMPLE ( cudaGetDevice( &old_device ) ) ;
+    CU_OK (cudaGetDevice (&old_device )) ;
 
     //--------------------------------------------------------------------------
     // get the properties
@@ -73,26 +78,24 @@ bool GB_cuda_get_device_properties  // true if OK, false if failure
 
     int num_sms, compute_capability_major, compute_capability_minor ;
     size_t memfree, memtotal ;
-    CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&num_sms,
-                                               cudaDevAttrMultiProcessorCount,
-                                               device) ) ;
-    CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&compute_capability_major,
-                                               cudaDevAttrComputeCapabilityMajor,
-                                               device) ) ;
-    CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&compute_capability_minor,
-                                               cudaDevAttrComputeCapabilityMajor,
-                                               device) ) ;
+    CU_OK (cudaDeviceGetAttribute (&num_sms,
+        cudaDevAttrMultiProcessorCount, device)) ;
+    CU_OK (cudaDeviceGetAttribute (&compute_capability_major,
+        cudaDevAttrComputeCapabilityMajor, device)) ;
+    CU_OK (cudaDeviceGetAttribute (&compute_capability_minor,
+        cudaDevAttrComputeCapabilityMinor, device)) ;
 
-    CHECK_CUDA_SIMPLE ( cudaSetDevice( device ) ) ;
-    CHECK_CUDA_SIMPLE ( cudaMemGetInfo( & memfree, &memtotal) ) ;
-    CHECK_CUDA_SIMPLE ( cudaSetDevice( old_device ) ) ;
+    CU_OK (cudaSetDevice (device )) ;
+    CU_OK (cudaMemGetInfo (&memfree, &memtotal)) ;
+    CU_OK (cudaSetDevice (old_device )) ;
 
     prop->total_global_memory = memtotal ;
     prop->number_of_sms = num_sms ;
     prop->compute_capability_major = compute_capability_major ;
     prop->compute_capability_minor = compute_capability_minor ;
 
-    printf ("Device: %d: memory: %ld SMs: %d compute: %d.%d\n",
+    // FIXME: remove this printf
+    printf ("\nDevice: %d: memory: %ld SMs: %d compute: %d.%d\n",
         device, prop->total_global_memory, prop->number_of_sms,
         prop->compute_capability_major, prop->compute_capability_minor) ;
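The rewritten GB_cuda_get_device_properties above also fixes a real bug: the old code queried cudaDevAttrComputeCapabilityMajor twice, so the minor version was never read. A standalone sketch of the same attribute queries (illustrative, not part of the patch):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main (void)
    {
        int device = 0, sms = 0, major = 0, minor = 0 ;
        // query SM count and compute capability for device 0
        if (cudaDeviceGetAttribute (&sms, cudaDevAttrMultiProcessorCount,
            device) != cudaSuccess) return (1) ;
        cudaDeviceGetAttribute (&major, cudaDevAttrComputeCapabilityMajor,
            device) ;
        cudaDeviceGetAttribute (&minor, cudaDevAttrComputeCapabilityMinor,
            device) ;
        printf ("device %d: %d SMs, compute %d.%d\n", device, sms, major,
            minor) ;
        return (0) ;
    }
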
prop->compute_capability_major, prop->compute_capability_minor) ; diff --git a/GraphBLAS/CUDA/GB_cuda_init.c b/GraphBLAS/CUDA/GB_cuda_init.c index 25dd233b88..ed920f9758 100644 --- a/GraphBLAS/CUDA/GB_cuda_init.c +++ b/GraphBLAS/CUDA/GB_cuda_init.c @@ -2,7 +2,8 @@ // GraphBLAS/CUDA/GB_cuda_init: initialize the GPUs for use by GraphBLAS //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -12,7 +13,6 @@ // assumed. Then each GPU is "warmed up" by allocating a small amount of // memory. -#undef GBCUDA_CPLUSPLUS #include "GB.h" GrB_Info GB_cuda_init (void) @@ -55,7 +55,6 @@ GrB_Info GB_cuda_init (void) GB_cuda_set_device (0) ; // make GPU 0 the default device GB_Context_gpu_id_set (NULL, 0) ; // set GxB_CONTEXT_WORLD->gpu_id to 0 - GB_Global_hack_set (2, 0) ; // gpu_hack default // also check for jit cache, pre-load library of common kernels ... return (GrB_SUCCESS) ; diff --git a/GraphBLAS/CUDA/GB_cuda_jitify_cache.cu b/GraphBLAS/CUDA/GB_cuda_jitify_cache.cu deleted file mode 100644 index 3e66d735b5..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_jitify_cache.cu +++ /dev/null @@ -1,233 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_jitify_cache.cu -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -/* - * Copyright (c) 2019,2023 NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-//------------------------------------------------------------------------------
-
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <cstdio>
-#include <cstring>
-#include <iostream>
-#include <mutex>
-#include <string>
-
-#include "GB_cuda_jitify_cache.h"
-#include "GraphBLAS_cuda.h"
-
-extern "C"
-{
-    #include "GB.h"
-    #include "GB_jitifyer.h"
-}
-
-namespace jit {
-
-// Get the directory in home to use for storing the cache
-    std::string get_user_home_cache_dir() {
-        const char *path = GB_jitifyer_get_cache_path ( ) ;
-        if (path == NULL)
-        {
-            return std::string ("") ;
-        }
-        else
-        {
-            return std::string (path) ;
-        }
-    }
-
-GBJitCache::GBJitCache() { }
-
-GBJitCache::~GBJitCache() { }
-
-
-std::mutex GBJitCache::_kernel_cache_mutex;
-std::mutex GBJitCache::_program_cache_mutex;
-
-std::string GBJitCache::getFile(
-    File_Desc &file_object )
-{
-    // Lock for thread safety
-    std::lock_guard<std::mutex> lock(_program_cache_mutex);
-
-    // Macrofied version
-    auto cached_file = getCachedFile( file_object, file_map );
-    return *std::get<1>( cached_file ).get();
-}
-
-named_prog<jitify::experimental::Program> GBJitCache::getProgram(
-    std::string const& prog_name,
-    std::string const& cuda_source,
-    std::vector<std::string> const& given_headers,
-    std::vector<std::string> const& given_options,
-    jitify::experimental::file_callback_type file_callback)
-{
-    // Lock for thread safety
-    std::lock_guard<std::mutex> lock(_program_cache_mutex);
-//  printf(" jit_cache get program %s\n", prog_name.c_str());
-
-    return getCached(prog_name, program_map,
-        [&](){
-            return jitify::experimental::Program(cuda_source,
-                                                 given_headers,
-                                                 given_options,
-                                                 file_callback);
-        }
-    );
-}
-
-named_prog<jitify::experimental::KernelInstantiation>
-GBJitCache::getKernelInstantiation(
-    std::string const& kern_name,
-    named_prog<jitify::experimental::Program> const& named_program,
-    std::vector<std::string> const& arguments)
-{
-    // Lock for thread safety
-    std::lock_guard<std::mutex> lock(_kernel_cache_mutex);
-
-    std::string prog_name = std::get<0>(named_program);
-    jitify::experimental::Program& program = *std::get<1>(named_program);
-
-    // Make instance name e.g. "prog_binop.kernel_v_v_int_int_long int_Add"
-    std::string kern_inst_name = kern_name;
-    for ( auto&& arg : arguments ) kern_inst_name += '_' + arg;
-
-    //printf("  got kernel instance %s\n",kern_inst_name.c_str());
-
-    return getCached(kern_inst_name, kernel_inst_map,
-        [&](){return program.kernel(kern_name)
-                            .instantiate(arguments);
-        }
-    );
-}
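getProgram and getKernelInstantiation above share one shape: take a static mutex, probe an unordered_map, and build the entry only on a miss. A generic sketch of that shape, assuming only the C++ standard library; `get_or_build` is a hypothetical name, not GraphBLAS API:

```c++
#include <mutex>
#include <string>
#include <unordered_map>

// Serialize all lookups with a lock_guard; construct the cached value only
// on a miss, mirroring the deleted getProgram/getKernelInstantiation above.
template <typename T, typename Build>
T& get_or_build (std::unordered_map<std::string, T> &map,
                 std::string const &key, Build build)
{
    static std::mutex lock ;                     // one lock per instantiation
    std::lock_guard<std::mutex> guard (lock) ;   // released on scope exit
    auto it = map.find (key) ;
    if (it == map.end ())
    {
        it = map.emplace (key, build ()).first ; // cache miss: build once
    }
    return it->second ;
}
```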
-
-// Another overload for getKernelInstantiation which might be useful to get
-// kernel instantiations in one step
-// ------------------------------------------------------------------------
-/*
-jitify::experimental::KernelInstantiation GBJitCache::getKernelInstantiation(
-    std::string const& kern_name,
-    std::string const& prog_name,
-    std::string const& cuda_source = "",
-    std::vector<std::string> const& given_headers = {},
-    std::vector<std::string> const& given_options = {},
-    file_callback_type file_callback = nullptr)
-{
-    auto program = getProgram(prog_name,
-                              cuda_source,
-                              given_headers,
-                              given_options,
-                              file_callback);
-    return getKernelInstantiation(kern_name, program);
-}
-*/
-
-GBJitCache::cacheFile::cacheFile(std::string file_name)
-    : _file_name{file_name}
-{ }
-
-GBJitCache::cacheFile::~cacheFile() { }
-
-std::string GBJitCache::cacheFile::read_file()
-{
-    // Open file (duh)
-    int fd = open ( _file_name.c_str(), O_RDWR );
-    if ( fd == -1 ) {
-        // TODO: connect errors to GrB_error result
-//      printf(" failed to open cache file %s\n",_file_name.c_str());
-        successful_read = false;
-        return std::string();
-    }
-
-    // Lock the file descriptor; we are now the only ones using the file
-    if ( lockf(fd, F_LOCK, 0) == -1 ) {
-        successful_read = false;
-        return std::string();
-    }
-
-    // Get file descriptor from file pointer
-    FILE *fp = fdopen( fd, "rb" );
-
-    // Get file length
-    fseek( fp , 0L , SEEK_END);
-    size_t file_size = ftell( fp );
-    rewind( fp );
-
-    // Allocate memory of file length size
-    std::string content;
-    content.resize(file_size);
-
-    char *buffer = content.data();
-
-    // Copy file into buffer
-    if( fread(buffer, file_size, 1, fp) != 1 ) {
-        //printf(" failed to read cache file %s\n",_file_name.c_str());
-        successful_read = false;
-        fclose(fp);
-//      free(buffer); FIXME: Shouldn't need to free buffer since it's RAII
-        return content; // FIXME: use unique_ptr here
-    }
-
-//  printf("about to close\n");
-    fclose(fp);
-    successful_read = true;
-//  printf(" read cache file %s\n",_file_name.c_str());
-
-    return content;
-}
-
-void GBJitCache::cacheFile::write(std::string content)
-{
-    // Open file and create if it doesn't exist, with access 0600
-    int fd = open ( _file_name.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR );
-    if ( fd == -1 ) {
-        //printf(" failed to open cache file for write %s\n",_file_name.c_str());
-        successful_write = false;
-        return;
-    }
-
-    // Lock the file descriptor; we are now the only ones using the file
-    if ( lockf(fd, F_LOCK, 0) == -1 ) {
-        successful_write = false;
-        return;
-    }
-
-    // Get file descriptor from file pointer
-    FILE *fp = fdopen( fd, "wb" );
-
-    // Copy string into file
-    if( fwrite(content.c_str(), content.length(), 1, fp) != 1 ) {
-        //printf(" failed to write cache file %s\n",_file_name.c_str());
-        successful_write = false;
-        fclose(fp);
-        return;
-    }
-    fclose(fp);
-
-    successful_write = true;
-    //printf(" wrote cache file %s\n",_file_name.c_str());
-
-    return;
-}
-
-} // namespace jit
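Both cacheFile methods above rely on lockf for inter-process exclusion, since (as the deleted header below notes) in-process mutexes cannot stop other processes from touching the cache file. A minimal sketch of the same read-under-lock discipline; `read_locked` is an illustrative name, not GraphBLAS API:

```c++
// lockf_read_sketch.cpp -- illustrative only, not part of this patch
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>
#include <string>

// Read a whole file while holding an exclusive POSIX lock, mirroring the
// lockf() discipline in cacheFile::read_file() above.
static bool read_locked (const std::string &path, std::string &content)
{
    int fd = open (path.c_str (), O_RDWR) ;
    if (fd == -1) return (false) ;              // cannot open
    if (lockf (fd, F_LOCK, 0) == -1)            // blocks until we own the lock
    {
        close (fd) ;
        return (false) ;
    }
    FILE *fp = fdopen (fd, "rb") ;              // fclose releases fd and lock
    fseek (fp, 0L, SEEK_END) ;
    size_t n = (size_t) ftell (fp) ;
    rewind (fp) ;
    content.resize (n) ;
    bool ok = (n == 0) || (fread (&content [0], n, 1, fp) == 1) ;
    fclose (fp) ;
    return (ok) ;
}
```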
diff --git a/GraphBLAS/CUDA/GB_cuda_jitify_cache.h b/GraphBLAS/CUDA/GB_cuda_jitify_cache.h
deleted file mode 100644
index 36124da469..0000000000
--- a/GraphBLAS/CUDA/GB_cuda_jitify_cache.h
+++ /dev/null
@@ -1,327 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_jitify_cache.h
-//------------------------------------------------------------------------------
-
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-/*
- * Copyright (c) 2019,2020 NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef GB_JIT_CACHE_H_
-#define GB_JIT_CACHE_H_
-
-#include <cstdio>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-
-#define JITIFY_USE_CACHE 1
-
-namespace jit {
-
-std::string get_user_home_cache_dir();
-
-template <typename T>
-using named_prog = std::pair<std::string, std::shared_ptr<T>>;
-
-// Basic file descriptor to enable file manipulation with caching
-class File_Desc
-{
-public:
-    virtual void open( const char *path_and_file, const char *mode) {}
-    virtual void close() {}
-    virtual void macrofy() {
-        printf("Uh oh. this isn't good\n");
-    }
-    std::string filename;
-};
-
-/**
- * @brief Get the string path to the JITIFY kernel cache directory.
- *
- * This path can be overridden at runtime by defining an environment variable
- * named `GB_CUDA_KERNEL_CACHE_PATH`. The value of this variable must be a path
- * under which the process' user has read/write privileges.
- *
- * This function returns a path to the cache directory, creating it if it
- * doesn't exist.
- *
- * The default cache directory is `~/.GraphBLAS_kernel_cache`.
- **/
-
-class GBJitCache
-{
-public:
-
-    /**---------------------------------------------------------------------------*
-     * @brief Get a process wide singleton cache object
-     *
-     *---------------------------------------------------------------------------**/
-    static GBJitCache& Instance() {
-        // Meyers' singleton is thread safe in C++11
-        // Link: https://stackoverflow.com/a/1661564
-        static GBJitCache cache;
-        return cache;
-    }
-
-    GBJitCache();
-    ~GBJitCache();
-
-    /**---------------------------------------------------------------------------*
-     * @brief Get the file object
-     *
-     * Searches an internal in-memory cache and file based cache for the file
-     * and if not found, opens the file, calls macrofy, closes the file
-     *
-     * @param file_desc [in] object representing file: open, macrofy, close
-     * @return string name of file, or 'error' if not able to create file
-     *---------------------------------------------------------------------------**/
-    std::string getFile( File_Desc & file_obj );
-
-    /**---------------------------------------------------------------------------*
-     * @brief Get the Kernel Instantiation object
-     *
-     * Searches an internal in-memory cache and file based cache for the kernel
-     * and if not found, JIT compiles and returns the kernel
-     *
-     * @param kern_name [in] name of kernel to return
-     * @param program   [in] Jitify preprocessed program to get the kernel from
-     * @param arguments [in] template arguments for kernel in vector of strings
-     * @return Pair of string kernel identifier and compiled kernel object
-     *---------------------------------------------------------------------------**/
-    named_prog<jitify::experimental::KernelInstantiation> getKernelInstantiation(
-        std::string const& kern_name,
-        named_prog<jitify::experimental::Program> const& program,
-        std::vector<std::string> const& arguments);
-
-    /**---------------------------------------------------------------------------*
-     * @brief Get the Jitify preprocessed Program object
-     *
-     * Searches an internal in-memory cache and file based cache for the Jitify
-     * pre-processed program and if not found, JIT processes and returns it
-     *
-     * @param prog_file_name [in] name of program to return
-     * @param cuda_source    [in] string source code of program to compile
-     * @param given_headers  [in] vector of strings representing source or names of
-     *  each header included in cuda_source
-     * @param given_options  [in] vector of strings options to pass to NVRTC
-     * @param file_callback  [in] pointer to callback function to call whenever a
-     *  header needs to be loaded
-     * @return named_prog<jitify::experimental::Program>
-     *---------------------------------------------------------------------------**/
-    named_prog<jitify::experimental::Program> getProgram(
-        std::string const& prog_file_name,
-        std::string const& cuda_source = "",
-        std::vector<std::string> const& given_headers = {},
-        std::vector<std::string> const& given_options = {},
-        jitify::experimental::file_callback_type file_callback = nullptr);
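Instance() above is a Meyers singleton: C++11 guarantees that initialization of a function-local static happens exactly once, even under concurrent first use, so no explicit locking is needed to create the process-wide cache. Stripped to its essentials (the class name is illustrative):

```c++
class Cache                             // stand-in for GBJitCache
{
public:
    // Meyers' singleton: the local static is constructed on first call,
    // and C++11 makes that construction thread safe.
    static Cache& Instance ()
    {
        static Cache cache ;
        return cache ;
    }
    Cache (const Cache &) = delete ;            // no copies of the singleton
    Cache& operator= (const Cache &) = delete ;
private:
    Cache () = default ;                        // only Instance() constructs
} ;
```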
-
-private:
-    template <typename T>
-    using umap_str_shptr = std::unordered_map<std::string, std::shared_ptr<T>>;
-
-    umap_str_shptr<std::string>                                 file_map;
-    umap_str_shptr<jitify::experimental::KernelInstantiation>  kernel_inst_map;
-    umap_str_shptr<jitify::experimental::Program>               program_map;
-
-    /*
-    Even though this class can be used as a non-singleton, the file cache
-    access should remain limited to one thread per process. The lockf locks can
-    prevent multiple processes from accessing the file but are ineffective in
-    preventing multiple threads from doing so as the lock is shared by the
-    entire process.
-    Therefore the mutexes are static.
-    */
-    static std::mutex _file_cache_mutex;
-    static std::mutex _kernel_cache_mutex;
-    static std::mutex _program_cache_mutex;
-
-private:
-    /**---------------------------------------------------------------------------*
-     * @brief Class to allow process wise exclusive access to cache files
-     *
-     *---------------------------------------------------------------------------**/
-    class cacheFile
-    {
-    private:
-        std::string _file_name ;
-        // FIXME this isn't used, is it?
-        std::string _dir_name = "~/.GraphBLAS_kernel_cache/"; // FIXME
-        bool successful_read = false;
-        bool successful_write = false;
-    public:
-        cacheFile(std::string file_name);
-        ~cacheFile();
-
-        /**---------------------------------------------------------------------------*
-         * @brief Read this file and return the contents as a std::string
-         *
-         *---------------------------------------------------------------------------**/
-        std::string read_file();
-
-        /**---------------------------------------------------------------------------*
-         * @brief Write the passed string to this file
-         *
-         *---------------------------------------------------------------------------**/
-        void write(std::string);
-
-        /**---------------------------------------------------------------------------*
-         * @brief Check whether the read() operation on the file completed successfully
-         *
-         * @return true  Read was successful. String returned by `read()` is valid
-         * @return false Read was unsuccessful. String returned by `read()` is empty
-         *---------------------------------------------------------------------------**/
-        bool is_read_successful() { return successful_read; }
-
-        /**---------------------------------------------------------------------------*
-         * @brief Check whether the write() operation on the file completed successfully
-         *
-         * @return true  Write was successful.
-         * @return false Write was unsuccessful. File state is undefined
-         *---------------------------------------------------------------------------**/
-        bool is_write_successful() { return successful_write; }
-    };
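getCachedFile and getCached (next) both walk the same two-level lookup: the in-memory map, then a serialized copy on disk, then a fallback JIT compile. A hedged, generic sketch of that flow, assuming a type T with serialize/deserialize methods; all names here are illustrative, and load_file/store_file stand in for the cacheFile reads and writes above:

```c++
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>

// Generic two-level memoization in the style of getCached() below:
// 1) in-memory map, 2) on-disk serialized copy, 3) fallback compile.
template <typename T, typename Fallback>
std::shared_ptr<T> get_cached (
    std::string const &name,
    std::unordered_map<std::string, std::shared_ptr<T>> &map,
    std::function<bool(std::string const&, std::string&)> load_file,
    std::function<void(std::string const&, std::string const&)> store_file,
    Fallback compile)
{
    auto it = map.find (name) ;
    if (it != map.end ()) return (it->second) ;     // level 1: memory hit
    std::string serialized ;
    if (!load_file (name, serialized))              // level 2: disk hit?
    {
        serialized = compile ().serialize () ;      // level 3: JIT compile
        store_file (name, serialized) ;             // persist for next process
    }
    auto obj = std::make_shared<T> (T::deserialize (serialized)) ;
    map [name] = obj ;                              // populate memory cache
    return (obj) ;
}
```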
-
-private:
-
-    template <typename FileDescType>
-    named_prog<std::string> getCachedFile(
-        FileDescType &file_object,
-        umap_str_shptr<std::string>& map )
-    {
-
-//      printf("INside get cached file\n");
-        std::string name = file_object.filename;
-
-        // Find memory cached T object
-        auto it = map.find(name);
-        if ( it != map.end()) {
-//          std::cout<<"found memory-cached file "<<name<<std::endl;
-            return std::make_pair(name, it->second);
-        }
-        else { // Find file cached T object
-            bool successful_read = false;
-            std::string serialized;
-            std::string cache_dir = get_user_home_cache_dir();
-            std::string file_name = cache_dir + "/" + name;
-            if (not cache_dir.empty() ) {
-                // TODO: Use OS-agnostic path separator here
-//              std::cout<<"looking for prog in file "<<file_name<<std::endl;
-                cacheFile file{file_name};
-                serialized = file.read_file();
-                successful_read = file.is_read_successful();
-            }
-            if (not successful_read) {
-                // File not in the cache; macrofy it and write it to the cache
-                file_object.open(file_name.c_str(), "w");
-                file_object.macrofy();
-                file_object.close();
-            }
-            // Add file to the memory cache and return
-            auto object = std::make_shared<std::string>(serialized);
-            //std::cout<<"storing file in memory "<<file_name<<std::endl;
-            map[name] = object;
-            return std::make_pair(name, object);
-        }
-    }
-
-    template <typename T, typename FallbackFunc>
-    named_prog<T> getCached(
-        std::string const& name,
-        umap_str_shptr<T>& map,
-        FallbackFunc func) {
-
-        // Find memory cached T object
-        auto it = map.find(name);
-        if ( it != map.end()) {
-//          std::cout<<"found memory-cached prog "<<name<<std::endl;
-            return std::make_pair(name, it->second);
-        }
-        else { // Find file cached T object
-            bool successful_read = false;
-            std::string serialized;
-            std::string cache_dir = get_user_home_cache_dir() ;
-            std::string file_name = cache_dir + "/" + name;
-            #if defined(JITIFY_USE_CACHE)
-                if (not cache_dir.empty() ) {
-                    // TODO: Use OS-agnostic path separator
-                    //std::cout<<"looking for prog in file "<<file_name<<std::endl;
-                    cacheFile file{file_name};
-                    serialized = file.read_file();
-                    successful_read = file.is_read_successful();
-                }
-            #endif
-            if (not successful_read) {
-                // JIT compile the program, then serialize it to the file cache
-                serialized = func().serialize();
-                #if defined(JITIFY_USE_CACHE)
-                    if (not cache_dir.empty() ) {
-                        cacheFile file{file_name};
-                        file.write(serialized);
-                    }
-                #endif
-            }
-            // Add deserialized T to cache and return
-            auto program = std::make_shared<T>(T::deserialize(serialized));
-            map[name] = program;
-            //std::cout<<"storing prog in memory "<<name<<std::endl;
-            return std::make_pair(name, program);
-        }
-    }
-};
-
-} // namespace jit
-
-#endif // GB_JIT_CACHE_H_
diff --git a/GraphBLAS/CUDA/GB_cuda_jitify_launcher.cu b/GraphBLAS/CUDA/GB_cuda_jitify_launcher.cu
deleted file mode 100644
--- a/GraphBLAS/CUDA/GB_cuda_jitify_launcher.cu
+++ /dev/null
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_jitify_launcher.cu
-//------------------------------------------------------------------------------
-
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-/*
- * Copyright (c) 2019,2023 NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "GB_cuda_jitify_launcher.h"
-#include <utility>
-
-namespace jit {
-
-    launcher::launcher(
-        const std::string& hash,
-        const std::string& cuda_source,
-        const std::vector<std::string>& header_names,
-        const std::vector<std::string>& compiler_flags,
-        jitify::experimental::file_callback_type file_callback,
-        cudaStream_t stream
-    )
-    : cache_instance{jit::GBJitCache::Instance()}
-    , stream(stream)
-    {
-        program = cache_instance.getProgram(
-            hash,
-            cuda_source.c_str(),
-            header_names,
-            compiler_flags,
-            file_callback
-        );
-    }
-
-    launcher::launcher(launcher&& launcher)
-    : program {std::move(launcher.program)}
-    , cache_instance {jit::GBJitCache::Instance()}
-    , kernel_inst {std::move(launcher.kernel_inst)}
-    , stream {launcher.stream}
-    { }
-
-} // namespace jit
diff --git a/GraphBLAS/CUDA/GB_cuda_jitify_launcher.h b/GraphBLAS/CUDA/GB_cuda_jitify_launcher.h
deleted file mode 100644
index 088a2bd77a..0000000000
--- a/GraphBLAS/CUDA/GB_cuda_jitify_launcher.h
+++ /dev/null
@@ -1,152 +0,0 @@
-//------------------------------------------------------------------------------
-// GraphBLAS/CUDA/GB_cuda_jitify_launcher.h
-//------------------------------------------------------------------------------
-
-// SPDX-License-Identifier: Apache-2.0
-
-//------------------------------------------------------------------------------
-
-/*
- * Copyright (c) 2019,2023 NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-//------------------------------------------------------------------------------
-
-// FIXME: rename .hpp?
-
-#ifndef GB_CUDA_JITIFY_LAUNCHER_H
-#define GB_CUDA_JITIFY_LAUNCHER_H
-
-#include "GB_cuda_jitify_cache.h"
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-#undef  JITIFY_PRINT_INSTANTIATION
-#define JITIFY_PRINT_INSTANTIATION 0
-#undef  JITIFY_PRINT_SOURCE
-#define JITIFY_PRINT_SOURCE 1
-#undef  JITIFY_PRINT_LOG
-#define JITIFY_PRINT_LOG 1
-#undef  JITIFY_PRINT_PTX
-#define JITIFY_PRINT_PTX 1
-#undef  JITIFY_PRINT_LINKER_LOG
-#define JITIFY_PRINT_LINKER_LOG 0
-#undef  JITIFY_PRINT_LAUNCH
-#define JITIFY_PRINT_LAUNCH 1
-#include "jitify.hpp"
-
-
-namespace jit {
-
-/**
- * @brief Class used to handle compilation and execution of JIT kernels
- *
- */
-class launcher {
- public:
-    launcher() = delete;
-
-    /**
-     * @brief C'tor of the launcher class
-     *
-     * Method to generate vector containing all template types for a JIT kernel.
-     * This vector is used to get the compiled kernel for one set of types and set
-     * it as the kernel to launch using this launcher.
-     *
-     * @param hash The hash to be used as the key for caching
-     * @param cuda_code The CUDA code that contains the kernel to be launched
-     * @param header_names Strings of header_names or strings that contain content
-     *  of the header files
-     * @param compiler_flags Strings of compiler flags
-     * @param file_callback a function that returns header file contents given header
-     *  file names.
-     * @param stream The non-owned stream to use for execution
-     */
-    launcher(
-        const std::string& hash,
-        const std::string& cuda_source,
-        const std::vector<std::string>& header_names,
-        const std::vector<std::string>& compiler_flags,
-        jitify::experimental::file_callback_type file_callback,
-        cudaStream_t stream = 0
-    );
-    launcher(launcher&&);
-    launcher(const launcher&) = delete;
-    launcher& operator=(launcher&&) = delete;
-    launcher& operator=(const launcher&) = delete;
-
-    /**
-     * @brief Sets the kernel to launch using this launcher
-     *
-     * Method to generate vector containing all template types for a JIT kernel.
-     * This vector is used to get the compiled kernel for one set of types and set
-     * it as the kernel to launch using this launcher.
-     *
-     * @param kernel_name The kernel to be launched
-     * @param arguments The template arguments to be used to instantiate the kernel
-     * @return launcher& ref to this launcher object
-     */
-    launcher& set_kernel_inst(
-        const std::string& kernel_name,
-        const std::vector<std::string>& arguments
-    )
-    {   // program is a member variable of the launcher
-        kernel_inst = cache_instance.getKernelInstantiation(kernel_name, program, arguments);
-        return *this;
-    }
-
-    /**
-     * @brief Handle the Jitify API to launch using information
-     *  contained in the members of `this`
-     *
-     * @param grid and block sizes
-     * @return Return launcher reference if successful
-     */
-    jitify::experimental::KernelLauncher configure( dim3 grid, dim3 block, unsigned int smem = 0, cudaStream_t stream = 0){
-        return get_kernel().configure( grid, block, smem, stream);
-        //return get_kernel().configure_1d_max_occupancy( max_block_size=block.x);
-    }
-
-    /**
-     * @brief Handle the Jitify API to launch using information
-     *  contained in the members of `this`
-     *
-     * @param args All parameters to launch the kernel
-     * @return Return GDF_SUCCESS if successful
-     */
-    template <typename... Args>
-    void launch(Args ... args) {
-        get_kernel().configure_1d_max_occupancy(32, 0, 0, stream).launch(args...);
-    }
-
- private:
-    jit::GBJitCache& cache_instance;
-    jit::named_prog<jitify::experimental::Program> program;
-    jit::named_prog<jitify::experimental::KernelInstantiation> kernel_inst;
-    cudaStream_t stream;
-
-    jitify::experimental::KernelInstantiation& get_kernel() { return *std::get<1>(kernel_inst); }
-};
-
-} // namespace jit
-
-#endif
diff --git a/GraphBLAS/CUDA/GB_cuda_matrix_prefetch.cpp b/GraphBLAS/CUDA/GB_cuda_matrix_prefetch.cpp
index c71dc0cd6c..d3e5710c33 100644
--- a/GraphBLAS/CUDA/GB_cuda_matrix_prefetch.cpp
+++ b/GraphBLAS/CUDA/GB_cuda_matrix_prefetch.cpp
@@ -2,12 +2,13 @@
 // GraphBLAS/CUDA/GB_cuda_matrix_prefetch: prefetch a matrix to a GPU or the CPU
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
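The matrix-prefetch hunks below wrap each cudaMemPrefetchAsync call in the CUDA_OK error macro, one call per matrix component (A->p, A->h, A->b, A->i, A->x). A stand-alone sketch of prefetching unified memory to a device, illustrative only; GraphBLAS adds the per-component logic shown below:

```c++
// prefetch_sketch.cu -- illustrative only, not part of this patch
#include <cuda_runtime.h>
#include <cstdio>

int main (void)
{
    int device = 0 ;
    cudaStream_t stream ;
    cudaStreamCreate (&stream) ;

    // allocate unified (managed) memory, visible to both CPU and GPU
    size_t n = 1 << 20 ;
    double *x = NULL ;
    cudaMallocManaged (&x, n * sizeof (double)) ;

    // migrate the pages to the GPU ahead of a kernel launch, as
    // GB_cuda_matrix_prefetch does for each component of a matrix
    cudaError_t err = cudaMemPrefetchAsync (x, n * sizeof (double),
        device, stream) ;
    if (err != cudaSuccess) printf ("%s\n", cudaGetErrorString (err)) ;

    cudaStreamSynchronize (stream) ;
    cudaFree (x) ;
    cudaStreamDestroy (stream) ;
    return (0) ;
}
```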
// SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ -#include "GB_cuda.h" +#include "GB_cuda.hpp" #define GB_FREE_ALL ; GrB_Info GB_cuda_matrix_prefetch @@ -25,33 +26,39 @@ GrB_Info GB_cuda_matrix_prefetch if (A->p != NULL && (which & GB_PREFETCH_P)) { - CU_OK (cudaMemPrefetchAsync (A->p, (anvec+1) * sizeof (int64_t), device, stream)) ; + CUDA_OK (cudaMemPrefetchAsync (A->p, (anvec+1) * sizeof (int64_t), + device, stream)) ; } if (A->h != NULL && (which & GB_PREFETCH_H)) { - CU_OK (cudaMemPrefetchAsync (A->h, anvec * sizeof (int64_t), device, stream)) ; + CUDA_OK (cudaMemPrefetchAsync (A->h, anvec * sizeof (int64_t), + device, stream)) ; } if (A->Y != NULL && (which & GB_PREFETCH_Y)) { // prefetch the hyper_hash: A->Y->p, A->Y->i, and A->Y->x - GB_OK (GB_cuda_matrix_prefetch (A->Y, GB_PREFETCH_PIX, device, stream)) ; + GB_OK (GB_cuda_matrix_prefetch (A->Y, GB_PREFETCH_PIX, + device, stream)) ; } if (A->b != NULL && (which & GB_PREFETCH_B)) { - CU_OK (cudaMemPrefetchAsync (A->b, anz * sizeof (int8_t), device, stream)) ; + CUDA_OK (cudaMemPrefetchAsync (A->b, anz * sizeof (int8_t), + device, stream)) ; } if (A->i != NULL && (which & GB_PREFETCH_I)) { - CU_OK (cudaMemPrefetchAsync (A->i, anz * sizeof (int64_t), device, stream)) ; + CUDA_OK (cudaMemPrefetchAsync (A->i, anz * sizeof (int64_t), + device, stream)) ; } if (A->x != NULL && (which & GB_PREFETCH_X)) { - CU_OK (cudaMemPrefetchAsync (A->x, (A->iso ? 1:anz) * A->type->size, device, stream)) ; + CUDA_OK (cudaMemPrefetchAsync (A->x, (A->iso ? 1:anz) * A->type->size, + device, stream)) ; } return (GrB_SUCCESS) ; diff --git a/GraphBLAS/CUDA/GB_cuda_mxm_dot3_jitFactory.hpp b/GraphBLAS/CUDA/GB_cuda_mxm_dot3_jitFactory.hpp deleted file mode 100644 index 3fce511b40..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_mxm_dot3_jitFactory.hpp +++ /dev/null @@ -1,832 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_mxm_dot3_jitFactory.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -/* - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef GB_MXM_DOT3_JITFACTORY_H -#define GB_MXM_DOT3_JITFACTORY_H - -#pragma once - -/** - * This file is responsible for picking all the parameters and what kernel - * variaiton we will use for a given instance - * - data types - * - semiring types - * - binary ops - * - monoids - * - * Kernel factory says "Here's the actual instance I want you to build with the - * given parameters" - */ - - -//AxB_dot3_phase1 kernel launchers -template class phase1launchFactory ; -template class dense_phase1launchFactory ; - -//AxB_dot3_phase3 kernel launchers - -//------------------------------------------------------------------------------ -// dot3: dense_phase1launchFactory -//------------------------------------------------------------------------------ - -// Handles full/bitmap cases, which means we don't need buckets and zombies. -// This is a much simpler kernel as a result, it only does the i,j lookup -// and stores the values in Mi and Ci. - - -template -class dense_phase1launchFactory -{ - // FIXME: this is the full name. Why? See below for partial name. - // Need to be consistent in naming schemes. - std::string kernel_name = "GB_cuda_jit_AxB_dot3_dense_phase1"; - - GB_cuda_mxm_factory &mxm_factory_; - -public: - - int get_number_of_blocks(GrB_Matrix M) { - int number_of_sms = GB_Global_gpu_sm_get (0); - int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size; - return GB_IMIN( nblks, chunk_size * number_of_sms); - } - - int get_threads_per_block() { - return threads_per_block; - } - - // This assumes the needed state on the GB_cuda_mxm_factory - // has already been populated - dense_phase1launchFactory(GB_cuda_mxm_factory &mxm_factory): mxm_factory_(mxm_factory){} - - bool jitGridBlockLaunch( GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, cudaStream_t stream = 0) { - - // Idea is to have each task work on a continguous block of columns of C - // Note: for small tests, mnz is small so ntasks is be governed by - // chunksize, not chunk_size*number_of_sms. For large problems in - // production, chunksize is less important since ntasks will likely be - // bounded by chunk_size*number_of_sms (say 128*80 = 10,240 on a V100, for - // the default chunk_size of 128). - - // Defining dummy instance only so we can introspect type -// // (1) create the mxm code and name - -// // (2) ensure the jitifier has "GB_mxm_[mymxm.sr_code].h" - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (mxm_factory_) ; - - uint64_t sr_code = mxm_factory_.sr_code ; - int mask_ecode = GB_RSHIFT (sr_code, 20, 4) ; - bool mask_no_type = (mask_ecode < 4) ; - auto sr_code_str = std::to_string(sr_code) ; - std::vector template_types = { - (mask_no_type) ? 
"bool" : M->type->name, sr_code_str }; - - std::stringstream string_to_be_jitted ; - - string_to_be_jitted << kernel_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << mxm_factory_.filename << R"(")" << std::endl << - R"(#include ")" << kernel_name << R"(.cuh")" << std::endl; - - bool result = false; - - dim3 grid(get_number_of_blocks(M)); - dim3 block(get_threads_per_block()); - - std::cout << "HERE I AM 7" << std::endl ; - jit::launcher( kernel_name + "_" + sr_code_str + ".jtfy", - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback /* FIXME: make NULL */) - .set_kernel_inst( kernel_name, template_types) - .configure(grid, block, SMEM, stream) - .launch( C, M); - - result = true; - - return result; - } -}; - -//------------------------------------------------------------------------------ -// dot3: phase1launchFactory -//------------------------------------------------------------------------------ - -// FIXME: We probably want to remove this type template altogether and provide -// a macro/function that can convert from a GrB_Type instance to the name of a -// type that the jitifier will accept. - -template -class phase1launchFactory -{ - std::string kernel_name = "GB_cuda_jit_AxB_dot3_phase1"; - - GB_cuda_mxm_factory &mxm_factory_; - -public: - - int get_number_of_blocks(GrB_Matrix M) { - int number_of_sms = GB_Global_gpu_sm_get (0); - int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size; - return GB_IMIN( nblks, chunk_size * number_of_sms); - } - - int get_threads_per_block() { - return threads_per_block; - } - - // This assumes the needed state on the GB_cuda_mxm_factory - // has already been populated - phase1launchFactory(GB_cuda_mxm_factory &mxm_factory): mxm_factory_(mxm_factory){} - - bool jitGridBlockLaunch(int64_t *nanobuckets, int64_t *blockBucket, - GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, cudaStream_t stream = 0) { - - // Idea is to have each task work on a continguous block of columns of C - // Note: for small tests, mnz is small so ntasks is be governed by - // chunksize, not chunk_size*number_of_sms. For large problems in - // production, chunksize is less important since ntasks will likely be - // bounded by chunk_size*number_of_sms (say 128*80 = 10,240 on a V100, for - // the default chunk_size of 128). - - // Defining dummy instance only so we can introspect type -// // (1) create the mxm code and name - -// // (2) ensure the jitifier has "GB_mxm_[mymxm.sr_code].h" - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (mxm_factory_) ; - - uint64_t sr_code = mxm_factory_.sr_code ; - int mask_ecode = GB_RSHIFT (sr_code, 20, 4) ; - bool mask_no_type = (mask_ecode < 4) ; - auto sr_code_str = std::to_string(sr_code) ; - std::vector template_types = { - (mask_no_type) ? 
"bool" : M->type->name, sr_code_str }; - - std::stringstream string_to_be_jitted ; - - string_to_be_jitted << kernel_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << mxm_factory_.filename << R"(")" << std::endl << - R"(#include ")" << kernel_name << R"(.cuh")" << std::endl; - - std::cout << "header names:" << std::endl ; -// std::cout << header_names << std::endl ; - for (std::string s : header_names) - { - std::cout << " " << s << std::endl ; - } -// std::cout << "string_to_be_jitted :" << std::endl ; -// std::cout << string_to_be_jitted << std::endl ; - std::cout << "GB_cuda_jit_compiler_flags ( ):" << std::endl ; - for (std::string s : GB_cuda_jit_compiler_flags ( )) - { - std::cout << " " << s << std::endl ; - } - std::cout << "kernel_name + sr_code_str .jtfy:" << std::endl ; - std::cout << kernel_name + "_" + sr_code_str + ".jtfy" << std::endl ; - std::cout << "jit::get_user_home_cache_dir ( ):" << std::endl ; - std::cout << jit::get_user_home_cache_dir ( ) << std::endl ; - - bool result = false; - - dim3 grid(get_number_of_blocks(M)); - dim3 block(get_threads_per_block()); - - std::cout << "HERE I AM 1" << std::endl ; - jit::launcher( kernel_name + "_" + sr_code_str + ".jtfy", - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst( kernel_name, template_types) - .configure(grid, block, SMEM, stream) - .launch( nanobuckets, blockBucket, C, M, A, B); - - result = true; - - return result; - } -}; - -//------------------------------------------------------------------------------ -// dot3: phase2launchFactory -//------------------------------------------------------------------------------ - -template -class phase2launchFactory -{ - - std::string base_name = "GB_cuda_jit"; - // FIXME: this is the partial name. Why? See above. 
- std::string kernel_name = "AxB_phase2"; - -public: - - int get_threads_per_block() { - return threads_per_block; - } - - int get_number_of_blocks(GrB_Matrix M) { - const int64_t mnz = GB_nnz (M) ; - int ntasks = ( mnz +chunk_size -1)/chunk_size; - // Idea is to have each task work on a continguous block of columns of C - ntasks = GB_IMIN( ntasks, chunk_size*GB_Global_gpu_sm_get (0)) ; // ntasks will be grid.x - return (ntasks + threads_per_block - 1) / threads_per_block ; - } - - int get_number_of_phase1_blocks( GrB_Matrix M){ - const int64_t mnz = GB_nnz (M) ; - int number_of_sms = GB_Global_gpu_sm_get (0); - int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size; - return GB_IMIN( nblks, chunk_size * number_of_sms); - } - - bool jitGridBlockLaunch(// parameters to AxB_phase2: - int64_t *blockBucket, int64_t *offset, GrB_Matrix M, cudaStream_t stream = 0) { - - bool result = false; - - dim3 grid(get_number_of_blocks(M)); - dim3 block(get_threads_per_block()); - - std::string hashable_name = base_name + "_" + kernel_name; - std::stringstream string_to_be_jitted ; - string_to_be_jitted << hashable_name << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - const int64_t mnz = GB_nnz (M) ; - std::cout << "HERE I AM 2" << std::endl ; - jit::launcher( hashable_name, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst( kernel_name, {}) - .configure(grid, block, SMEM, stream) - // parameters to AxB_phase2: - .launch( blockBucket, offset, get_number_of_phase1_blocks(M)); - - result= true; - - return result; - } - -}; - -//------------------------------------------------------------------------------ -// dot3: phase2endlaunchFactory -//------------------------------------------------------------------------------ - -template< int threads_per_block = 32, int chunk_size = 128> -class phase2endlaunchFactory -{ - - std::string base_name = "GB_cuda_jit"; - std::string kernel_name = "AxB_phase2end"; - -public: - - int get_threads_per_block() { - return threads_per_block; - } - - int get_number_of_blocks(GrB_Matrix M) { - const int64_t mnz = GB_nnz (M) ; - int ntasks = ( mnz +chunk_size -1)/chunk_size; - int number_of_sms = GB_Global_gpu_sm_get (0); - - // Idea is to have each task work on a continguous block of columns of C - return GB_IMIN( ntasks, chunk_size*number_of_sms) ; // ntasks will be grid.x - } - - bool jitGridBlockLaunch(int64_t *nanobuckets, int64_t *blockBucket, - int64_t *bucketp, int64_t *bucket, int64_t *offset, - GrB_Matrix C, GrB_Matrix M, cudaStream_t stream = 0) - { - - bool result = false; - - dim3 grid(get_number_of_blocks(M)); - dim3 block(get_threads_per_block()); - - std::string hashable_name = base_name + "_" + kernel_name; - std::stringstream string_to_be_jitted ; - string_to_be_jitted << hashable_name << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - std::cout << "HERE I AM 3" << std::endl ; - jit::launcher( hashable_name, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst( kernel_name , {}) - .configure(grid, block, SMEM, stream) - .launch( nanobuckets, blockBucket, bucketp, bucket, offset, C, GB_nnz (M)); - - result= true; - - return result; - } - -}; - - -//------------------------------------------------------------------------------ -// dot3: mxm_dense_launchFactory -//------------------------------------------------------------------------------ - -class mxm_dense_launchFactory -{ - 
std::string base_name = "GB_cuda_jit"; - std::string kernel_name = "AxB_dot3_phase3_dndn"; - - GB_cuda_mxm_factory &mxm_factory_; - -public: - - /** - * This assumes the needed state on the GB_cuda_mxm_factory has already been populated. - * The `bucket_code` determines which kernel is launched - */ - mxm_dense_launchFactory(GB_cuda_mxm_factory &mymxmfactory): - mxm_factory_(mymxmfactory) {} - - bool jitGridBlockLaunch( GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, - cudaStream_t stream = 0) { - - bool result = false; - - //---------------------------------------------------------------------- - // do the numerical work - //---------------------------------------------------------------------- - - const int64_t nz = GB_nnz(M); // number of dots in the mask - const int64_t mnvec = M->nvec ; - - int gridsz, blocksz; - - std::stringstream final_kernel_name_ss; - final_kernel_name_ss << kernel_name; - - /** - * Configure geometry and kernel function name based on sparsity of C and number of vectors in M - */ - configure( nz, mnvec, final_kernel_name_ss, blocksz, gridsz); - - auto sr_code = std::to_string(mxm_factory_.sr_code); // FIXME: make hexadecimal - - GrB_BinaryOp mult = mxm_factory_.semiring->multiply ; - - std::string hashable_name = base_name + "_" + final_kernel_name_ss.str(); - std::stringstream string_to_be_jitted ; - std::vector template_types = - { - C->type->name, A->type->name, B->type->name, - mult->ztype->name, mult->xtype->name, mult->ytype->name, - sr_code - }; - - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (mxm_factory_) ; - - string_to_be_jitted << hashable_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << mxm_factory_.filename << R"(")" << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - dim3 grid(gridsz); - dim3 block(blocksz); - - GBURBLE ("(GPU dot3 mxm dense launch nblocks,blocksize= %d,%d )\n", gridsz,blocksz) ; - std::cout << "HERE I AM 4" << std::endl ; - jit::launcher( hashable_name + "_" + sr_code, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst(final_kernel_name_ss.str(), template_types ) - // { C->type->name, - // A->type->name, - // B->type->name }) - .configure(grid, block, SMEM, stream) //if commented, use implicit 1D configure in launch - .launch( - C, // final output matrix - // inputs, not modified: - M, // Mi used for column index - A, // A matrix - B // B matrix - ); - - result= true; - - return result; - } - -private: - void configure(std::int64_t Cnz, std::int64_t mnvec, std::stringstream &opname, - int &blocksz, int &gridsz) { - int number_of_sms = GB_Global_gpu_sm_get (0) ; - - int work_per_thread; - - blocksz = 64; - work_per_thread = 8; - - if( Cnz > 1024){ - blocksz = 512; - work_per_thread = 64; - } - - // gridsz = ceiling (Cnz / work_per_thread*blocksz) - gridsz = GB_ICEIL (Cnz, work_per_thread*blocksz) ; - - } -}; - -//------------------------------------------------------------------------------ -// FIXME: rename GB_cuda_mxm_dot3_jitFactory_sparse_dense_launchFactory -//------------------------------------------------------------------------------ - -class mxm_sparse_dense_launchFactory -{ - std::string base_name = "GB_cuda_jit"; - std::string kernel_name = "AxB_dot3"; - - GB_cuda_mxm_factory &mxm_factory_; - -public: - - /** - * This assumes the needed state on the GB_cuda_mxm_factory has already been populated. 
- * The `bucket_code` determines which kernel is launched - */ - mxm_sparse_dense_launchFactory(GB_cuda_mxm_factory &mymxmfactory): - mxm_factory_(mymxmfactory) {} - - bool jitGridBlockLaunch( GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, - cudaStream_t stream = 0) { - - bool result = false; - - //---------------------------------------------------------------------- - // do the numerical work - //---------------------------------------------------------------------- - - const int64_t nz = GB_nnz(M); // number of dots in the mask - const int64_t mnvec = M->nvec ; - - int gridsz, blocksz; - - std::stringstream final_kernel_name_ss; - final_kernel_name_ss << kernel_name; - - /** - * Configure geometry and kernel function name based on sparsity of C and number of vectors in M - */ - configure( nz, mnvec, final_kernel_name_ss, blocksz, gridsz); - - auto sr_code = std::to_string(mxm_factory_.sr_code); - - GrB_BinaryOp mult = mxm_factory_.semiring->multiply ; - - std::string hashable_name = base_name + "_" + final_kernel_name_ss.str(); - std::stringstream string_to_be_jitted ; - std::vector template_types = - { - C->type->name, A->type->name, B->type->name, - mult->ztype->name, mult->xtype->name, mult->ytype->name, - sr_code - }; - - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (mxm_factory_) ; - - string_to_be_jitted << hashable_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << mxm_factory_.filename << R"(")" << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - dim3 grid(gridsz); - dim3 block(blocksz); - - GBURBLE ("(GPU dot3 mxm sparse_dense launch nblocks,blocksize= %d,%d )\n", gridsz,blocksz) ; - std::cout << "HERE I AM 5" << std::endl ; - jit::launcher( hashable_name + "_" + sr_code, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst(final_kernel_name_ss.str(), template_types ) - // { C->type->name, - // A->type->name, - // B->type->name }) - .configure(grid, block, SMEM, stream) //if commented, use implicit 1D configure in launch - .launch( - C, // final output matrix - // inputs, not modified: - M, // Mi used for column index - A, // A matrix - B // B matrix - ); - - result= true; - - return result; - } - -private: - void configure(std::int64_t Cnz, std::int64_t mnvec, std::stringstream &opname, - int &blocksz, int &gridsz) { - int number_of_sms = GB_Global_gpu_sm_get (0) ; - - int work_per_thread; - - blocksz = 64; - work_per_thread = 8; - - if( Cnz > 1024){ - blocksz = 512; - work_per_thread = 64; - } - - // gridsz = ceiling (Cnz / work_per_thread*blocksz) - gridsz = GB_ICEIL (Cnz, work_per_thread*blocksz) ; - - } -}; - -//------------------------------------------------------------------------------ -// dot3: phase3launchFactory -//------------------------------------------------------------------------------ - -class phase3launchFactory -{ - std::string base_name = "GB_cuda_jit"; - std::string kernel_name = "AxB_dot3"; - - GB_cuda_mxm_factory &mxm_factory_; - - GB_bucket_code bucket_code_; - -public: - - std::string Opname; - - /** - * This assumes the needed state on the GB_cuda_mxm_factory has already been populated. 
- * The `bucket_code` determines which kernel is launched - */ - phase3launchFactory(GB_cuda_mxm_factory &mymxmfactory, GB_bucket_code bucket_code): - mxm_factory_(mymxmfactory), bucket_code_(bucket_code) {} - - bool jitGridBlockLaunch(int64_t start, int64_t end, int64_t *bucketp, int64_t *bucket, - GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, - cudaStream_t stream = 0) { - - bool result = false; - - //---------------------------------------------------------------------- - // phase3: do the numerical work - //---------------------------------------------------------------------- - - const int64_t nz = end - start; // number of dots in this bucket - const int64_t mnvec = M->nvec ; - - int gridsz, blocksz, sz = 4; - - std::stringstream final_kernel_name_ss; - final_kernel_name_ss << kernel_name << "_"; - - /** - * Configure geometry and kernel function name based on sparsity of C and number of vectors in M - */ - auto sr_code = std::to_string(mxm_factory_.sr_code); - - configure2( nz, mnvec, final_kernel_name_ss, blocksz, gridsz, sz, mxm_factory_.sr_code); - - - GrB_BinaryOp mult = mxm_factory_.semiring->multiply ; - - std::string hashable_name = base_name + "_" + final_kernel_name_ss.str(); - std::stringstream string_to_be_jitted ; - std::vector template_types = - { - C->type->name, A->type->name, B->type->name, - mult->ztype->name, mult->xtype->name, mult->ytype->name, - sr_code - }; - - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (mxm_factory_) ; - - // FIXME: why is "hashable_name" used sometimes, and sometimes "kernel_name"? - string_to_be_jitted << hashable_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << mxm_factory_.filename << R"(")" << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - dim3 grid(gridsz); - dim3 block(blocksz); - - GBURBLE ("(GPU phase3 launch %s st,end=%ld,%ld nblocks,blocksize= %d,%d )\n", this->Opname.c_str(), - start,end,gridsz,blocksz) ; - std::cout << "HERE I AM 6" << std::endl ; - jit::launcher( hashable_name + "_" + sr_code, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) - .set_kernel_inst(final_kernel_name_ss.str(), template_types ) - // { C->type->name, - // A->type->name, - // B->type->name }) - .configure(grid, block, SMEM, stream) //if commented, use implicit 1D configure in launch - .launch( - start, // input/output: - end, // global bucket cumsum, of size NBUCKETS+1 - bucket, // global buckets, of size cnz (== mnz) - C, // final output matrix - // inputs, not modified: - M, // Mi used for column index - A, // A matrix - B, // B matrix - sz // only used for sparse-sparse cases - ); - - result= true; - - return result; - } - -private: - void configure2(std::int64_t Cnz, std::int64_t mnvec, std::stringstream &opname, - int &blocksz, int &gridsz, int &sz, uint64_t sr_code) { - int number_of_sms = GB_Global_gpu_sm_get (0) ; - - int work_per_thread; - - // 0:hyper, 1:sparse, 2:bitmap, 3:full - int asparsity = GB_RSHIFT (sr_code, 2, 2) ; - int bsparsity = GB_RSHIFT (sr_code, 0, 2) ; - - if (asparsity <= 1 && bsparsity <= 1) - { - // both A and B are sparse/hyper - switch (bucket_code_) - { - - //-------------------------------------------------------------- - // not a bucket ... 
bring out your dead: - //-------------------------------------------------------------- - - case GB_BUCKET_ZOMBIE : // C(i,j) is a zombie (not a bucket) - break ; - - //-------------------------------------------------------------- - // CUDA kernel: vsvs bucket: - //-------------------------------------------------------------- - - case GB_BUCKET_VSVS : - Opname = "phase3_vsvs" ; - blocksz = 256; - work_per_thread = 4; - - if( Cnz > (2<<12)){ - blocksz = 512; - work_per_thread = 4; - } - - // gridsz = ceiling (Cnz / work_per_thread*blocksz) - gridsz = GB_ICEIL (Cnz, work_per_thread*blocksz) ; - if (gridsz > 256*number_of_sms) gridsz = 256*number_of_sms; - break ; - - //-------------------------------------------------------------- - // CUDA kernel: mp, use the merge-path method: - //-------------------------------------------------------------- - - case GB_BUCKET_MERGEPATH : - Opname = "phase3_mp" ; - blocksz = 32; - work_per_thread = 256 ; - - if( Cnz > (2<<20)){ - work_per_thread = 1024; - } - gridsz = GB_ICEIL (Cnz, work_per_thread) ; - if ((gridsz < number_of_sms) && (Cnz > (2<<20))) - { - gridsz = number_of_sms; - } - if (gridsz > 256*number_of_sms) gridsz = 256*number_of_sms; - break ; - - default: - break ; - } - - } - else - { - // either A or B are bitmap/full - switch (bucket_code_) - { - - //-------------------------------------------------------------- - // not a bucket ... bring out your dead: - //-------------------------------------------------------------- - - case GB_BUCKET_ZOMBIE : // C(i,j) is a zombie (not a bucket) - break ; - - //-------------------------------------------------------------- - // CUDA kernel: vsdn bucket: one thread per C(i,j) dot product - //-------------------------------------------------------------- - - case GB_BUCKET_VSDN : - Opname = "phase3_vsdn" ; - - // FIXME: - blocksz = 256; - work_per_thread = 4; - - if( Cnz > (2<<12)){ - blocksz = 512; - work_per_thread = 4; - } - - // gridsz = ceiling (Cnz / work_per_thread*blocksz) - gridsz = GB_ICEIL (Cnz, work_per_thread*blocksz) ; - if (gridsz > 256*number_of_sms) gridsz = 256*number_of_sms; - break ; - - //-------------------------------------------------------------- - // CUDA kernel: spdn bucket: one warp per C(i,j) dot product - //-------------------------------------------------------------- - - case GB_BUCKET_SPDN : - Opname = "phase3_spdn" ; - - // FIXME: - blocksz = 32; - work_per_thread = 256 ; - - if( Cnz > (2<<20)){ - work_per_thread = 1024; - } - gridsz = GB_ICEIL (Cnz, work_per_thread) ; - if ((gridsz < number_of_sms) && (Cnz > (2<<20))) - { - gridsz = number_of_sms; - } - if (gridsz > 256*number_of_sms) gridsz = 256*number_of_sms; - break ; - - default: - break ; - } - - } - - opname << Opname; - } -}; - -#endif diff --git a/GraphBLAS/CUDA/GB_cuda_mxm_factory.hpp b/GraphBLAS/CUDA/GB_cuda_mxm_factory.hpp deleted file mode 100644 index 227b776f65..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_mxm_factory.hpp +++ /dev/null @@ -1,167 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_mxm_factory.hpp -//------------------------------------------------------------------------------ - -// (c) Nvidia Corp. 2023 All rights reserved -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// Class to manage both stringify functions from mxm, ops and monoids to a -// header file. 
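The configure/configure2 heuristics above, and the mxm factory that begins here, all key off sr_code: a single packed integer encoding the whole problem. Fields such as the mask ecode (4 bits at position 20) and the A/B sparsity (the low 4 bits, two bits each) are recovered by shift-and-mask, which is what the GB_RSHIFT uses above imply. A small sketch of that unpacking; the sample code value is made up for illustration:

```c++
#include <cstdint>
#include <cstdio>

// GB_RSHIFT-style field extraction: shift right, then mask off nbits
static inline uint64_t rshift (uint64_t code, int shift, int nbits)
{
    return ((code >> shift) & ((UINT64_C(1) << nbits) - 1)) ;
}

int main (void)
{
    uint64_t sr_code = 0x30000E ;   // hypothetical value, not a real encoding
    int mask_ecode = (int) rshift (sr_code, 20, 4) ;
    int asparsity  = (int) rshift (sr_code, 2, 2) ; // 0:hyper 1:sparse 2:bitmap 3:full
    int bsparsity  = (int) rshift (sr_code, 0, 2) ;
    printf ("mask %d, A %d, B %d\n", mask_ecode, asparsity, bsparsity) ;
    return (0) ;
}
```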
- -// Implementations of string callbacks -#pragma once - -// FIXME do we need the iostrean any more? -#include -#include -#include "GB_cuda_jitify_cache.h" - -extern "C" -{ - #include "GB.h" - #include "GB_binop.h" - #include "GB_stringify.h" -} - -// FIXME: do we need the file_callback method? -// Define function pointer we will use later -//std::istream* (*file_callback)(std::string, std::iostream&); - -//------------------------------------------------------------------------------ -// GB_cuda_mxm_factory -//------------------------------------------------------------------------------ - -// Define a factory class for building any mxm text definitions - -class GB_cuda_mxm_factory: public jit::File_Desc -{ - - //-------------------------------------------------------------------------- - // public members of the object - //-------------------------------------------------------------------------- - - public: - - uint64_t sr_code ; // unique 62-bit code for a GrB_mxm problem - GrB_Semiring semiring ; // the semiring for GrB_mxm - GrB_Type ctype, atype, btype ; // the types of C, A, and B - FILE *fp ; // file for GB_mxm_*.h header - - //-------------------------------------------------------------------------- - // open/close: access the GB_mxm_*.h header file for a specific instance - //-------------------------------------------------------------------------- - - void open (const char *path_and_file, const char *mode) - { - fp = fopen (path_and_file, mode) ; - } - - void close( ) - { - fclose (fp) ; - } - - //-------------------------------------------------------------------------- - // mxm_factory: create unique code for a GrB_mxm problem - //-------------------------------------------------------------------------- - - // mxm_factory takes a set of inputs describing and operation (semiring, - // mask, datatypes, sparsity formats, etc) and produces a numerical unique - // value for those. This allows rapid lookups to see if we have handled this - // case before, and avoids the need to generate and manage strings at this - // stage. 
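The factory classes here follow one protocol: enumify the problem into a scalar code, derive a header name such as GB_mxm_<code>.h, then open/macrofy/close to write the macro definitions. A hedged sketch of that generate-once flow; ensure_header and write_macros are illustrative stand-ins, not GraphBLAS functions:

```c++
#include <cstdint>
#include <cstdio>
#include <string>
#include <sys/stat.h>

// Derive a header name from the problem code and write it only if absent.
// write_macros stands in for GB_macrofy_mxm / GB_macrofy_reduce.
static bool ensure_header (const std::string &cache_dir, uint64_t code,
                           void (*write_macros) (FILE *, uint64_t))
{
    char name [64] ;
    snprintf (name, sizeof (name), "GB_mxm_%llu.h",
        (unsigned long long) code) ;
    std::string path = cache_dir + "/" + name ;
    struct stat st ;
    if (stat (path.c_str (), &st) == 0) return (true) ; // already cached
    FILE *fp = fopen (path.c_str (), "w") ;             // open
    if (fp == NULL) return (false) ;
    write_macros (fp, code) ;                           // macrofy
    fclose (fp) ;                                       // close
    return (true) ;
}
```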
- - // FIXME: pass in user's C_in matrix, in case C_in+=A*B can be done - // in-place - // FIXME: handle hypersparse case in dot3 - - void mxm_factory - ( - // C matrix: - bool C_iso, // true if C is iso-valued - bool C_in_iso, // C input iso status - int C_sparsity, // sparsity structure of C - GrB_Type ctype, // the type of C - // M matrix: - GrB_Matrix M, // may be NULL - bool Mask_struct, // mask is structural - bool Mask_comp, // mask is complemented - // semiring: - GrB_Semiring semiring, // the semiring to enumify - bool flipxy, // multiplier is: mult(a,b) or mult(b,a) - // A and B: - GrB_Matrix A, - GrB_Matrix B - ) - { - - if (C_iso) - { - // the kernel does not access any values of C, A, or B - semiring = GxB_ANY_PAIR_BOOL ; - flipxy = false ; - } - - uint64_t scode ; - - GB_enumify_mxm ( - // output: - &scode, // unique encoding of the entire semiring - // input: - C_iso, // true if C is iso-valued - C_in_iso, - C_sparsity, // sparsity structure of C - ctype, // the type of C - // M matrix: - M, - Mask_struct, // mask is structural - Mask_comp, // mask is complemented - // semiring: - semiring, // the semiring to enumify - flipxy, // multiplier is: mult(a,b) or mult(b,a) - // A and B: - A, - B - ) ; - - this->sr_code = scode; - this->semiring = semiring ; - this->atype = A->type ; - this->btype = B->type ; - this->ctype = ctype ; - - std::stringstream ss; - // FIXME: use same name scheme as the CPU jit - ss << "GB_mxm_" << this->sr_code << ".h"; - - std::string new_filename = ss.str(); - filename.resize(new_filename.size()); - strcpy(filename.data(), new_filename.data()); - - } - - //-------------------------------------------------------------------------- - // macrofy: create macros from sr_code and data types - //-------------------------------------------------------------------------- - - // macrofy takes a code and creates the corresponding string macros for - // operators, datatypes, sparsity formats and writes its results to a file. - - void macrofy ( ) override - { - GB_macrofy_mxm ( - // output to file : - fp, - // input: - this->sr_code, - this->semiring, - this->ctype, - this->atype, - this->btype - ) ; - } - -} ; // GB_cuda_mxm_factory - diff --git a/GraphBLAS/CUDA/GB_cuda_reduce.hpp b/GraphBLAS/CUDA/GB_cuda_reduce.hpp new file mode 100644 index 0000000000..3dfe07372f --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_reduce.hpp @@ -0,0 +1,30 @@ +//------------------------------------------------------------------------------ +// GB_cuda_reduce.hpp: CPU definitions for CUDA reductions +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#ifndef GB_CUDA_REDUCE_H +#define GB_CUDA_REDUCE_H + +#include "GB_cuda.hpp" + +GrB_Info GB_cuda_reduce_to_scalar_jit // z = reduce_to_scalar (A) via CUDA JIT +( + // output: + GB_void *z, // result if has_cheeseburger is true + GrB_Matrix V, // result if has_cheeseburger is false + // input: + const GrB_Monoid monoid, // monoid to do the reduction + const GrB_Matrix A, // matrix to reduce + // CUDA stream and launch parameters: + cudaStream_t stream, + int32_t gridsz, + int32_t blocksz +) ; + +#endif + diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_factory.hpp b/GraphBLAS/CUDA/GB_cuda_reduce_factory.hpp deleted file mode 100644 index e0f7aae75b..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_reduce_factory.hpp +++ /dev/null @@ -1,105 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_reduce_factory.hpp -//------------------------------------------------------------------------------ - -// (c) Nvidia Corp. 2023 All rights reserved -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// Class to manage both stringify functions from mxm, ops and monoids to a -// header file. - -// FIXME: does it? -// Also provides a iostream callback to deliver the buffer to jitify as -// if read from a file - -// Implementations of string callbacks -#pragma once - -// FIXME: do we use iostream? -#include -#include -#include "GB_cuda_jitify_cache.h" - -extern "C" -{ - #include "GB.h" - #include "GB_stringify.h" -} - -//------------------------------------------------------------------------------ -// GB_cuda_reduce_factory: construct code and header file for reduce jit kernel -//------------------------------------------------------------------------------ - -class GB_cuda_reduce_factory: public jit::File_Desc { - -public: - - uint64_t rcode ; // unique encoding from GB_enumify_reduce - GrB_Monoid monoid ; // monoid to perform the reduction - GrB_Type atype ; // input matrix data type - FILE *fp ; // file pointer for GB_reduce_*.h header file - - //-------------------------------------------------------------------------- - // open/close: access the GB_reduce_*.h header file for a specific instance - //-------------------------------------------------------------------------- - - void open (const char *path_and_file, const char *mode) - { - fp = fopen (path_and_file, mode) ; - } - - void close( ) - { - fclose (fp) ; - } - - //-------------------------------------------------------------------------- - // reduce_factory: encode the reduction problem into a scalar rcode - //-------------------------------------------------------------------------- - - void reduce_factory (GrB_Monoid monoid, GrB_Matrix A) - { - uint64_t rcode ; - - GB_enumify_reduce - ( - // output: - &rcode, // unique encoding of entire monoid - // input: - monoid, // monoid to use for the reduction - A // matrix to reduce - ) ; - - this->rcode = rcode ; - this->monoid = monoid ; - this->atype = A->type ; - - // FIXME: use same name scheme as the CPU jit - std::stringstream ss ; - ss << "GB_reduce_" << this->rcode << ".h"; - - std::string new_filename = ss.str() ; - filename.resize(new_filename.size()) ; - strcpy(filename.data(), new_filename.data()) ; - } - - //-------------------------------------------------------------------------- - // macrofy: construct a header file from the rcode and 
data types - //-------------------------------------------------------------------------- - - void macrofy ( ) override - { - GB_macrofy_reduce ( - // output to file : - fp, - // input: - this->rcode, - this->monoid, - this->atype - ) ; - } - -} ; // GB_cuda_reduce_factory - diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_jitFactory.hpp b/GraphBLAS/CUDA/GB_cuda_reduce_jitFactory.hpp deleted file mode 100644 index fd618bddf0..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_reduce_jitFactory.hpp +++ /dev/null @@ -1,254 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_reduce_jitFactory.hpp: kernel for reduction to scalar -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -/* - * Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -//------------------------------------------------------------------------------ - -// Constructs an instance of the template/GB_jit_reduce.cuh kernel to reduce -// a GrB_Matrix to a scalar. 
- -#ifndef GB_REDUCE_JITFACTORY_H -#define GB_REDUCE_JITFACTORY_H - -#pragma once -#include "GB_cuda_reduce_factory.hpp" - -/** - * This file is responsible for picking all the parameters and what kernel - * variaiton we will use for a given instance - * - data types - * - semiring types - * - binary ops - * - monoids - * - * Kernel factory says "Here's the actual instance I want you to build with the - * given parameters" - */ - -// Kernel jitifiers -class reduceFactory ; - -//------------------------------------------------------------------------------ -// reduceFactory -//------------------------------------------------------------------------------ - -class reduceFactory -{ - - //-------------------------------------------------------------------------- - // class properties - //-------------------------------------------------------------------------- - - std::string base_name = "GB_cuda_jit"; - std::string kernel_name = "reduce"; - - int threads_per_block = 320 ; - int work_per_thread = 256; -// int number_of_sms = GB_Global_gpu_sm_get (0) ; - - GB_cuda_reduce_factory &reduce_factory_ ; - - public: - - //-------------------------------------------------------------------------- - // class constructor - //-------------------------------------------------------------------------- - - reduceFactory (GB_cuda_reduce_factory &myreducefactory) : - reduce_factory_(myreducefactory) {} - - //-------------------------------------------------------------------------- - // GB_get_threads_per_block: determine # of threads in a threadBlock - //-------------------------------------------------------------------------- - - int GB_get_threads_per_block ( ) - { - return threads_per_block ; - } - - //-------------------------------------------------------------------------- - // GB_get_number_of_blocks: determine # of threadBlocks to use - //-------------------------------------------------------------------------- - - int GB_get_number_of_blocks - ( - int64_t anvals // # of entries in input matrix - ) - { - // FIXME: this is a lot of blocks. Use a smaller number (cap at, say, - // 64K), to simplify the non-atomic reductions - return (anvals + work_per_thread*threads_per_block - 1) / - (work_per_thread*threads_per_block) ; - } - - //-------------------------------------------------------------------------- - // jitGridBlockLaunch: construct and launch the GB_jit_reduce kernel - //-------------------------------------------------------------------------- - - // Note: this does assume the erased types are compatible w/ the monoid's - // ztype (an erased type is the type overwritten by a pun type). 
- - bool jitGridBlockLaunch // FIXME: return GrB_Info - ( - GrB_Matrix A, // matrix to reduce to a scalar - GB_void *output, // output scalar (static on CPU), of size zsize - GrB_Matrix *V_handle, // result of a partial reduction - GrB_Monoid monoid, // monoid to use for the reducution - cudaStream_t stream = 0 // stream to use, default stream 0 - ) - { - GBURBLE ("\n(launch reduce factory) \n") ; - - GrB_Type ztype = monoid->op->ztype ; - size_t zsize = ztype->size ; - - GB_void *zscalar = NULL ; - (*V_handle) = NULL ; - GrB_Matrix V = NULL ; - - jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (reduce_factory_) ; - - auto rcode = std::to_string(reduce_factory_.rcode); - bool has_cheeseburger = GB_RSHIFT (reduce_factory_.rcode, 27, 1) ; - GBURBLE ("has_cheeseburger %d\n", has_cheeseburger) ; - - std::string hashable_name = base_name + "_" + kernel_name; - std::stringstream string_to_be_jitted ; - string_to_be_jitted << hashable_name << std::endl << - R"(#include "GB_cuda_kernel.h")" << std::endl << - R"(#include ")" << reduce_factory_.filename << R"(")" << std::endl << - R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - - int64_t anvals = GB_nnz_held (A) ; - - // determine kernel launch geometry - int blocksz = GB_get_threads_per_block ( ) ; - int gridsz = GB_get_number_of_blocks (anvals) ; - dim3 grid (gridsz) ; - dim3 block (blocksz) ; - - // determine the kind of reduction: partial (to &V), or complete - // (to the scalar output) - if (has_cheeseburger) - { - // the kernel launch can reduce A to zscalar all by itself - // allocate and initialize zscalar (upscaling it to at least 32 bits) - size_t zscalar_size = GB_IMAX (zsize, sizeof (uint32_t)) ; - (GB_void *) rmm_wrap_malloc (zscalar_size) ; - zscalar = (GB_void *) rmm_wrap_malloc (zscalar_size) ; - if (zscalar == NULL) - { - // out of memory - return (GrB_OUT_OF_MEMORY) ; - } - GB_cuda_upscale_identity (zscalar, monoid) ; - } - else - { - // allocate a full GrB_Matrix V for the partial result, of size - // gridsz-by-1, and of type ztype. V is allocated but not - // initialized. - GrB_Info info = GB_new_bix (&V, ztype, gridsz, 1, GB_Ap_null, - true, GxB_FULL, false, 0, -1, gridsz, true, false) ; - if (info != GrB_SUCCESS) - { - // out of memory - return (info) ; - } - } - - GBURBLE ("(cuda reduce launch %d threads in %d blocks)", - blocksz, gridsz ) ; - - // construct and launch the kernel - // FIXME: use same name scheme as the CPU jit - // FIXME: where does it go if it fails? try/catch? - jit::launcher(hashable_name + "_" + rcode, - string_to_be_jitted.str(), - header_names, - GB_cuda_jit_compiler_flags ( ), - file_callback) // FIXME: where is file_callback defined? - .set_kernel_inst( hashable_name , - { A->type->name, monoid->op->ztype->name }) - .configure(grid, block, SMEM, stream) - .launch (A, zscalar, V, anvals) ; - - // synchronize before copying result to host - CHECK_CUDA (cudaStreamSynchronize (stream)) ; - - // FIXME: sometimes we use CHECK_CUDA, sometimes CU_OK. Need to - // be consistent. Also, if this method fails, zscalar - // must be freed: we can do this in the CU_OK or CHECK_CUDA macros. - // Or in a try/catch? 
-
-        if (has_cheeseburger)
-        {
-            // return the scalar result
-            // output = zscalar (but only the first zsize bytes of it)
-            memcpy (output, zscalar, zsize) ;
-            rmm_wrap_free (zscalar) ;
-        }
-        else
-        {
-            // return the partial reduction
-            (*V_handle) = V ;
-        }
-
-        return (GrB_SUCCESS) ;
-    }
-} ;
-
-//------------------------------------------------------------------------------
-// GB_cuda_reduce
-//------------------------------------------------------------------------------
-
-inline bool GB_cuda_reduce      // FIXME: return GrB_Info, not bool
-(
-    GB_cuda_reduce_factory &myreducefactory,    // reduction JIT factory
-    GrB_Matrix A,               // matrix to reduce
-    GB_void *output,            // result of size monoid->op->ztype->size
-    GrB_Matrix *V_handle,       // result of a partial reduction
-    GrB_Monoid monoid,          // monoid for the reduction
-    cudaStream_t stream = 0     // stream to use
-)
-{
-    reduceFactory rf(myreducefactory);
-    GBURBLE ("(starting cuda reduce)" ) ;
-    bool result = rf.jitGridBlockLaunch (A, output, V_handle, monoid, stream) ;
-    GBURBLE ("(ending cuda reduce)" ) ;
-    return (result) ;
-}
-
-#endif
-
diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar.cpp b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar.cpp
new file mode 100644
index 0000000000..e1da05383c
--- /dev/null
+++ b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar.cpp
@@ -0,0 +1,153 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/GB_cuda_reduce_to_scalar: reduce on the GPU with semiring
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+// Reduce a matrix A to a scalar s, or to a smaller matrix V if the GPU was
+// only able to do a partial reduction.  This case occurs if the GPU
+// cannot do an atomic update for the monoid.  To handle this case, the GPU
+// returns a full GrB_Matrix V, of size gridsize-by-1, with one entry per
+// threadblock.  Then GB_reduce_to_scalar on the CPU sees this V as the result,
+// and calls itself recursively to continue the reduction.
+
+#define GB_FREE_ALL                                         \
+{                                                           \
+    GB_FREE_WORK (&zscalar, zscalar_size) ;                 \
+    GB_Matrix_free (&V) ;                                   \
+    if (stream != nullptr) cudaStreamDestroy (stream) ;     \
+    stream = nullptr ;                                      \
+}
+
+#include "GB_cuda_reduce.hpp"
+
+GrB_Info GB_cuda_reduce_to_scalar
+(
+    // output:
+    GB_void *s,             // note: statically allocated on CPU stack; if
+                            // the result is in s then V is NULL.
+    GrB_Matrix *V_handle,   // partial result if unable to reduce to scalar;
+                            // NULL if result is in s.
+    // input:
+    const GrB_Monoid monoid,
+    const GrB_Matrix A
+)
+{
+
+    //--------------------------------------------------------------------------
+    // check inputs
+    //--------------------------------------------------------------------------
+
+    GB_void *zscalar = NULL ;
+    size_t zscalar_size = 0 ;
+    GrB_Matrix V = NULL ;
+    (*V_handle) = NULL ;
+    GrB_Info info = GrB_SUCCESS ;
+    cudaStream_t stream = nullptr ;
+
+    //--------------------------------------------------------------------------
+    // create the stream
+    //--------------------------------------------------------------------------
+
+    // FIXME: use the stream pool
+    CUDA_OK (cudaStreamCreate (&stream)) ;
+
+    //--------------------------------------------------------------------------
+    // determine problem characteristics and allocate workspace
+    //--------------------------------------------------------------------------
+
+    int threads_per_block = 320 ;
+    int work_per_thread = 256;
+//  int number_of_sms = GB_Global_gpu_sm_get (0) ;
+
+    GrB_Type ztype = monoid->op->ztype ;
+    size_t zsize = ztype->size ;
+
+    // determine kernel launch geometry
+    int64_t anvals = GB_nnz_held (A) ;
+    int blocksz = threads_per_block ;
+    int gridsz =
+        // FIXME: this is a lot of blocks.  Use a smaller number (cap at,
+        // say, 64K), to simplify the non-atomic reductions
+        (anvals + work_per_thread*threads_per_block - 1) /
+        (work_per_thread*threads_per_block) ;
+
+    // FIXME: GB_enumify_reduce is called twice: here (to get has_cheeseburger)
+    // and in GB_cuda_reduce_to_scalar_jit.  Can we just call it once?  One
+    // solution: The code from here to the call to GB_cuda_reduce_to_scalar_jit
+    // could be added to the GB_cuda_reduce_to_scalar_jit function itself.
+
+    uint64_t rcode ;
+    GB_enumify_reduce (&rcode, monoid, A) ;
+    bool has_cheeseburger = GB_RSHIFT (rcode, 27, 1) ;
+    GBURBLE ("has_cheeseburger %d\n", has_cheeseburger) ;
+
+    // determine the kind of reduction: partial (to &V), or complete
+    // (to the scalar output)
+    if (has_cheeseburger)
+    {
+        // the kernel launch can reduce A to zscalar all by itself
+        // allocate and initialize zscalar (upscaling it to at least 32 bits)
+        size_t zscalar_space = GB_IMAX (zsize, sizeof (uint32_t)) ;
+        zscalar = GB_MALLOC (zscalar_space, GB_void, &zscalar_size) ;
+        if (zscalar == NULL)
+        {
+            // out of memory
+            GB_FREE_ALL ;
+            return (GrB_OUT_OF_MEMORY) ;
+        }
+        GB_cuda_upscale_identity (zscalar, monoid) ;
+    }
+    else
+    {
+        // allocate a full GrB_Matrix V for the partial result, of size
+        // gridsz-by-1, and of type ztype.  V is allocated but not
+        // initialized.
+ GB_OK (GB_new_bix (&V, ztype, gridsz, 1, GB_Ap_null, + true, GxB_FULL, false, 0, -1, gridsz, true, false)) ; + } + + GBURBLE ("(cuda reduce launch %d threads in %d blocks)", + blocksz, gridsz ) ; + + //-------------------------------------------------------------------------- + // reduce C to a scalar via the CUDA JIT + //-------------------------------------------------------------------------- + +// final call looks like this: +// GB_OK (GB_cuda_reduce_to_scalar_jit (zscalar, V, monoid, A, +// stream, gridsz, blocksz)) ; + +// debugging for now, to die early if the CUDA fails to compile, load, or run: + info = (GB_cuda_reduce_to_scalar_jit (zscalar, V, monoid, A, + stream, gridsz, blocksz)) ; + if (info == GrB_NO_VALUE) info = GrB_PANIC ; + GB_OK (info) ; + + //-------------------------------------------------------------------------- + // return result and destroy the stream + //-------------------------------------------------------------------------- + + CUDA_OK (cudaStreamSynchronize (stream)) ; + + if (has_cheeseburger) + { + // return the scalar result + // s = zscalar (but only the first zsize bytes of it) + memcpy (s, zscalar, zsize) ; + GB_FREE_WORK (&zscalar, zscalar_size) ; + } + else + { + // return the partial reduction + (*V_handle) = V ; + } + + CUDA_OK (cudaStreamDestroy (stream)) ; + return (GrB_SUCCESS) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch.cpp b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch.cpp index f336ce8002..353201b65b 100644 --- a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch.cpp +++ b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch.cpp @@ -2,14 +2,14 @@ // GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch: decide to use GPU for reduce //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ // Decide branch direction for GPU use for the reduction to scalar -#include "GB_cuda.h" +#include "GB_cuda_reduce.hpp" bool GB_cuda_reduce_to_scalar_branch // return true to use the GPU ( @@ -22,9 +22,6 @@ bool GB_cuda_reduce_to_scalar_branch // return true to use the GPU !GB_cuda_type_branch (monoid->op->ztype)) { // one or more types are not yet supported on the GPU - // FIXME: remove debug output here: - std::cout << "Not using cuda path: type size not supported" - << std::endl ; return (false) ; } @@ -48,11 +45,11 @@ bool GB_cuda_reduce_to_scalar_branch // return true to use the GPU { // FIXME: gpu_id = GB_Context_gpu_id_get ( ) ; // cudaSetDevice (gpu_id) ; - return true; + return (true) ; } else { - return false; + return (false) ; } } diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit.cpp b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit.cpp index 0d7c6578b0..cc8603c708 100644 --- a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit.cpp +++ b/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit.cpp @@ -1,62 +1,62 @@ //------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit: reduce on the GPU with semiring +// GB_cuda_reduce_to_scalar_jit: reduce a matrix to a scalar, via the CUDA JIT //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. 
Davis, (c) 2017-2024, All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-// Reduce a matrix A to a scalar s, or to a smaller matrix V if the GPU was
-// only able to do a partial reduction.  This case occurs if the GPU does not
-// cannot do an atomic update for the monoid.  To handle this case, the GPU
-// returns a full GrB_Matrix V, of size gridsize-by-1, with one entry per
-// threadblock.  Then GB_reduce_to_scalar on the CPU sees this V as the result,
-// and calls itself recursively to continue the reduction.
-
-#include "GraphBLAS_cuda.h"
+#include "GB_cuda_reduce.hpp"
 
 extern "C"
 {
-    #include "GB_reduce.h"
+    typedef GB_JIT_CUDA_KERNEL_REDUCE_PROTO ((*GB_jit_dl_function)) ;
 }
 
-#include "GB_cuda.h"
-#include "GB_cuda_jitify_cache.h"
-#include "GB_cuda_common_jitFactory.hpp"
-#include "GB_cuda_reduce_jitFactory.hpp"
-
-GrB_Info GB_cuda_reduce_to_scalar_jit
+GrB_Info GB_cuda_reduce_to_scalar_jit  // z = reduce_to_scalar (A) via CUDA JIT
 (
     // output:
-    GB_void *s,             // note: statically allocated on CPU stack; if
-                            // the result is in s then V is NULL.
-    GrB_Matrix *V_handle,   // partial result if unable to reduce to scalar;
-                            // NULL if result is in s.
+    GB_void *z,             // result if has_cheeseburger is true
+    GrB_Matrix V,           // result if has_cheeseburger is false
     // input:
-    const GrB_Monoid monoid,
-    const GrB_Matrix A
+    const GrB_Monoid monoid,    // monoid to do the reduction
+    const GrB_Matrix A,         // matrix to reduce
+    // CUDA stream and launch parameters:
+    cudaStream_t stream,
+    int32_t gridsz,
+    int32_t blocksz
 )
-{
-
-    // FIXME: use the stream pool
-    cudaStream_t stream ;
-    CHECK_CUDA (cudaStreamCreate (&stream)) ;
+{
 
     //--------------------------------------------------------------------------
-    // reduce C to a scalar
+    // encodify the problem
    //--------------------------------------------------------------------------
 
-    // FIXME: check error conditions (out of memory, etc)
-    GB_cuda_reduce_factory myreducefactory ;
-    myreducefactory.reduce_factory (monoid, A) ;
+    GB_jit_encoding encoding ;
+    char *suffix ;
+    uint64_t hash = GB_encodify_reduce (&encoding, &suffix,
+        GB_JIT_CUDA_KERNEL_REDUCE, monoid, A) ;
+
+    // FIXME: could get has_cheeseburger here, and allocate zscalar
+    // and V accordingly.
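
[Annotation, not part of the patch: a worked example of the two result kinds. With the launch geometry used by GB_cuda_reduce_to_scalar above (threads_per_block = 320 and work_per_thread = 256, so 320*256 = 81,920 entries per threadblock), a matrix with anvals = 10,000,000 entries held gets gridsz = ceil (10,000,000 / 81,920) = 123 threadblocks. If has_cheeseburger is true the kernel reduces A all the way to the scalar z; otherwise it writes one partial result per threadblock into the 123-by-1 full matrix V, and GB_reduce_to_scalar on the CPU finishes the reduction recursively.]
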
- // FIXME: get GrB_Info result from GB_cuda_reduce - GB_cuda_reduce (myreducefactory, A, s, V_handle, monoid, stream) ; + //-------------------------------------------------------------------------- + // get the kernel function pointer, loading or compiling it if needed + //-------------------------------------------------------------------------- + + void *dl_function ; + GrB_Info info = GB_jitifyer_load (&dl_function, + GB_jit_reduce_family, "cuda_reduce", + hash, &encoding, suffix, NULL, monoid, + NULL, A->type, NULL, NULL) ; + if (info != GrB_SUCCESS) return (info) ; - CHECK_CUDA (cudaStreamSynchronize (stream)) ; - CHECK_CUDA (cudaStreamDestroy (stream)) ; + //-------------------------------------------------------------------------- + // call the jit kernel and return result + //-------------------------------------------------------------------------- - return (GrB_SUCCESS) ; + GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ; + return (GB_jit_kernel (z, V, A, stream, gridsz, blocksz)) ; } diff --git a/GraphBLAS/CUDA/GB_cuda_type_bits.c b/GraphBLAS/CUDA/GB_cuda_type_bits.c deleted file mode 100644 index 17de151f3f..0000000000 --- a/GraphBLAS/CUDA/GB_cuda_type_bits.c +++ /dev/null @@ -1,35 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GB_cuda_type_bits -//------------------------------------------------------------------------------ - -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#undef GBCUDA_CPLUSPLUS -#include "GB.h" - -size_t GB_cuda_type_bits (GB_Type_code); - -size_t GB_cuda_type_bits (GB_Type_code type_code) -{ - switch (type_code) - { - case GB_BOOL_code : return (8) ; - case GB_INT8_code : return (8) ; - case GB_INT16_code : return (16) ; - case GB_INT32_code : return (32) ; - case GB_INT64_code : return (64) ; - case GB_UINT8_code : return (8) ; - case GB_UINT16_code : return (16) ; - case GB_UINT32_code : return (32) ; - case GB_UINT64_code : return (64) ; - case GB_FP32_code : return (32) ; - case GB_FP64_code : return (64) ; -// case GB_FC32_code : return (64) ; -// case GB_FC64_code : return (128) ; - default : return (0) ; - } -} - diff --git a/GraphBLAS/CUDA/GB_cuda_type_branch.cpp b/GraphBLAS/CUDA/GB_cuda_type_branch.cpp index ba268b2a33..1debd0eb4b 100644 --- a/GraphBLAS/CUDA/GB_cuda_type_branch.cpp +++ b/GraphBLAS/CUDA/GB_cuda_type_branch.cpp @@ -2,7 +2,7 @@ // GraphBLAS/CUDA/GB_cuda_type_branch: decide if GPU can be used on a type //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -12,9 +12,11 @@ // bytes or less. If user-defined type has a different size, it cannot be done // on the GPU. +// FIXME: get the CUDA kernels to work on large types + // All built-in types pass this rule. 
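
[Annotation, not part of the patch: a minimal sketch of the size rule that GB_cuda_type_branch enforces, per the comments above. The helper name and the bound parameter are hypothetical; the actual byte bound is defined in this file outside the hunk shown here.]

// illustrative sketch only: type_fits_gpu and max_bytes are hypothetical
static bool type_fits_gpu (GrB_Type type, size_t max_bytes)
{
    // all built-in types pass this rule; a user-defined type passes only
    // if its size does not exceed the bound the CUDA kernels support
    return (type->code != GB_UDT_code || type->size <= max_bytes) ;
}
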
-#include "GB_cuda.h" +#include "GB_cuda.hpp" bool GB_cuda_type_branch // return true if the type is OK on GPU ( diff --git a/GraphBLAS/CUDA/GB_cuda_upscale_identity.cpp b/GraphBLAS/CUDA/GB_cuda_upscale_identity.cpp index d7d5ec8a9e..c034f4b4ad 100644 --- a/GraphBLAS/CUDA/GB_cuda_upscale_identity.cpp +++ b/GraphBLAS/CUDA/GB_cuda_upscale_identity.cpp @@ -2,7 +2,7 @@ // GraphBLAS/CUDA/GB_cuda_upscale_identity: return identity, >= 16 bits in size //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -11,7 +11,7 @@ // for 2-byte values. This method initializes the identity value of a monoid, // scaling up the 1-byte and 2-byte cases to 4-bytes. -#include "GB_cuda.h" +#include "GB_cuda.hpp" extern "C" { #include "GB_binop.h" diff --git a/GraphBLAS/CUDA/GB_cuda_warmup.cu b/GraphBLAS/CUDA/GB_cuda_warmup.cu index 4b8016e59c..8a7322b9f6 100644 --- a/GraphBLAS/CUDA/GB_cuda_warmup.cu +++ b/GraphBLAS/CUDA/GB_cuda_warmup.cu @@ -2,50 +2,55 @@ // GraphBLAS/CUDA/GB_cuda_warmup.cu: warmup the GPU //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ -#include "GB_cuda.h" +#include "GB_cuda.hpp" bool GB_cuda_warmup (int device) { - // allocate 'nothing' just to load the drivers. - // No need to free the result. - bool ok = GB_cuda_set_device( device ); - if (!ok) + + //-------------------------------------------------------------------------- + // set the device + //-------------------------------------------------------------------------- + + if (!GB_cuda_set_device (device)) { - printf ("invalid GPU: %d\n", device) ; + // invalid device return (false) ; } - double gpu_memory_size = GB_Global_gpu_memorysize_get (device); + // FIXME: why do we need this? + double gpu_memory_size = GB_Global_gpu_memorysize_get (device) ; + + //-------------------------------------------------------------------------- + // allocate two small blocks just to load the drivers + //-------------------------------------------------------------------------- size_t size = 0 ; void *p = GB_malloc_memory (1, 1, &size) ; if (p == NULL) { - printf ("Hey!! where's da memory???\n") ; + // no memory on the device return (false) ; } -// printf ("oooo nice block of memory of size %lu\n", size) ; - GB_free_memory ( &p, size) ; -// printf ("be free, block of memory of size %lu\n", size) ; + GB_free_memory (&p, size) ; -// printf ("good ol' cudaMalloc just to be sure\n"); - cudaMalloc ( &p, size ) ; + cudaMalloc (&p, size ) ; if (p == NULL) { - printf ("Hey!! where's da GPU???\n") ; + // no memory on the device return (false) ; } cudaFree (p) ; -// printf ("GPU %d nice and toasty now\n", device) ; - - // TODO check for jit cache? or in GB_init? 
+ //-------------------------------------------------------------------------- + // return result + //-------------------------------------------------------------------------- - return true; //(err == cudaSuccess) ; + return (true) ; } diff --git a/GraphBLAS/CUDA/GraphBLAS_cuda.h b/GraphBLAS/CUDA/GraphBLAS_cuda.hpp similarity index 87% rename from GraphBLAS/CUDA/GraphBLAS_cuda.h rename to GraphBLAS/CUDA/GraphBLAS_cuda.hpp index 4bb6872dc4..c3968e37a7 100644 --- a/GraphBLAS/CUDA/GraphBLAS_cuda.h +++ b/GraphBLAS/CUDA/GraphBLAS_cuda.hpp @@ -1,8 +1,8 @@ //------------------------------------------------------------------------------ -// GraphBLAS/CUDA/GraphBLAS_cuda.h +// GraphBLAS/CUDA/GraphBLAS_cuda.hpp //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_dense_phase1.cuh b/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_dense_phase1.cuh deleted file mode 100644 index 2d971a5ecc..0000000000 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_dense_phase1.cuh +++ /dev/null @@ -1,166 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_dense_phase1.cuh -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// phase1 for dot3, A and B are bitmap/full -// dense phase1: symbolic load balancing and data partition -// to assign work to different 'buckets' for later compute - -// This kernel scans the non-zero pattern in A and B, takes into account the -// mask and computes total work required to form C. Then it classifies each -// dot product into a set of buckets for efficient compute. - -#pragma once - -#include -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include "GB_cuda_buckets.h" -#include -#include - -using namespace cooperative_groups; - -//------------------------------------------------------------------------------ -// GB_jit_AxB_dot3_dense_phase1: lookup i,j pairs and store in Mi, Ci -//------------------------------------------------------------------------------ - -// GB_AxB_dense_phase1 is a CUDA kernel that scans all entries in M and -// assigns i,j coordinates for each entries and stores in Mi and Ci. 
- -template -__global__ void GB_jit_AxB_dot3_dense_phase1 -( - // input/output: - GrB_Matrix C, // final output matrix - const GrB_Matrix M // mask matrix -) -{ - - //-------------------------------------------------------------------------- - // get C, M, A, and B - //-------------------------------------------------------------------------- - - const int64_t *__restrict__ Mp = M->p ; - const int64_t *__restrict__ Mi = M->i ; - #if !GB_MASK_STRUCT - const GB_M_TYPE *__restrict__ Mx = (GB_M_TYPE *) M->x ; - #endif - const int64_t mnvec = M->nvec ; - const int64_t mvlen = M->vlen ; -// const int64_t mnz = GB_nnz(M) ; - const GB_M_NVALS (mnz) ; - - int64_t *__restrict__ Ci = C->i ; // for zombies, or bucket assignment - - // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a - // zombie, or (k << 4) + bucket otherwise, where C(:,j) is the kth vector - // of C (j = Ch [k] if hypersparse or j = k if standard sparse), and - // where bucket is the bucket assignment for C(i,j). - // bucket can be recovered from Ci by bucket = Ci & 0xF - - // ASSERT (mnz > 0) ; - // ASSERT (gridDim.x <= mnz) ; - - // shared cache used for coordinate search - __shared__ int64_t ks [chunk_size] ; - - //-------------------------------------------------------------------------- - // assign all entries of C to the buckets - //-------------------------------------------------------------------------- - - // all threads in this block will compute the same values for these: - int64_t pfirst, plast, kfirst, klast ; - - int64_t chunk_max = GB_ICEIL (mnz, chunk_size) ; - // (mnz + chunk_size -1)/chunk_size; - for ( int64_t chunk = blockIdx.x; - chunk < chunk_max; - chunk += gridDim.x ) - { - - //---------------------------------------------------------------------- - // determine the work done by this iteration, "chunk" - //---------------------------------------------------------------------- - - // The slice for each task contains entries pfirst:plast-1 of M and C. - // This iteration "chunk" computes Ci and Cx [pfirst...plast-1], using - // Mi and Mx [pfirst:plast-1]. All threads in the thread block are - // used for this "chunk". - pfirst = chunk_size * chunk ; - plast = pfirst + chunk_size ; - // plast = GB_IMIN (plast, mnz) ; - if (plast > mnz) plast = mnz ; - int64_t my_chunk_size = plast - pfirst ; - - // find the first vector of the slice for this chunk: the - // vector that owns the entry Mi [pfirst] and Mx [pfirst]. - kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec, mvlen) ; - - // find the last vector of the slice for task blockIdx.x: the - // vector that owns the entry Mi [plast-1] and Mx [plast-1]. - klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec, mvlen); - - // number of vectors in C and M for this "chunk" iteration, where - // Mp [kfirst:klast] will be operated on. 
- int64_t nk = klast - kfirst + 1 ; - - //---------------------------------------------------------------------- - // fill ks to find all indices - //---------------------------------------------------------------------- - - // search for k values for each entry pfirst:plast-1 - float slope = ((float) nk) / ((float) my_chunk_size) ; - int64_t mnvec1 = mnvec - 1 ; - for (int64_t kk = threadIdx.x ; kk < my_chunk_size ; kk += blockDim.x) - { - // get a rough estimate of k for the kkth entry in ks - int64_t k = kfirst + (int64_t) (slope * ((float) kk)) ; - // k cannot be smaller than kfirst, but might be bigger than - // mnvec-1, so ensure it is in the valid range, kfirst to mnvec-1 - // k = GB_IMIN (k, mnvec-1) ; - if (k > mnvec1) k = mnvec1 ; - // look for p in Mp, where p is in range pfirst:plast-1 - // where pfirst >= 0 and plast < mnz - int64_t p = kk + pfirst ; - // linear-time search for the k value of the pth entry - while ( Mp [ k + 1 ] <= p ) k++ ; - while ( Mp [ k ] > p ) k-- ; - ks [kk] = k ; - } - this_thread_block().sync(); - - //---------------------------------------------------------------------- - // assign entries in C(i,j) to the buckets - //---------------------------------------------------------------------- - - for ( int64_t pM = pfirst + threadIdx.x; - pM < pfirst + my_chunk_size; - pM += blockDim.x ) - { - int64_t k = ks [pM - pfirst] ; // get the k value of Mi,Mx [pM]. - // j = k or j = Mh [k] if C and M are hypersparse, but j is not - // needed here. - - #if GB_MASK_STRUCT - { - // no need to check the value of M(i,j); no prezombies - Ci[pM] = (k << 4) ; - } - #else - { - bool mij = (bool) GB_MCAST (Mx,pM,) ; - int64_t i = Mi [ pM ] ; - // FIXME: no need for k<<4, just place k or GB_FLIP(i) in Ci - Ci[pM] = (!mij) * ( GB_FLIP(i) << 4) - + mij * ((k<<4) ) ; - } - #endif - } - } -} - diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh b/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh deleted file mode 100644 index 80a4e2020b..0000000000 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh +++ /dev/null @@ -1,255 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// This CUDA kernel produces the semiring product of two -// dense matrices of types T_A and T_B and common index space size n, to a -// output matrix of type T_C. The matrices are dense, with uniform -// non-zeros and sparsity patterns. -// ie. we want to produce C = A'*B in the sense of the given semi-ring. - -// This version uses a simple warp-based dense dot product algorithm, when the -// vectors coming from both A and B are dense, for any size of N. - -// Both the grid and block are 1D, so blockDim.x is the # threads in a -// threadblock, and the # of threadblocks is grid.x - -// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number -// of active threads = min( min(nzA, nzB), 32) - -// Thus, threadblock b owns a semi-ring dot product on a pair of vectors. -// The work is to load the data, do the multiply and add work and finally -// reduce this data to a scalar, and write it to Cx[pair]. 
- -// int64_t start <- start of vector pairs for this kernel -// int64_t end <- end of vector pairs for this kernel -// int64_t *Bucket <- array of pair indices for all kernels -// GrB_Matrix C <- result matrix -// GrB_Matrix M <- mask matrix -// GrB_Matrix A <- input matrix A -// GrB_Matrix B <- input matrix B -// int sz <- size parameter (not used) - -/* FIXME: This kernel needs to be split into 4 methods: - - (A bitmap) * (B bitmap) - (A full ) * (B bitmap) - (A bitmap) * (B full) - (A full) * (B full) - - The buckets are not needed at all. A single pass can be done. - C and M would still be sparse or hypersparse. - - See also denseDotProduct.cu. -*/ - -#pragma once -#include -#include -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include - -// Using tile size fixed at compile time, we don't need shared memory -#define tile_sz 32 - -using namespace cooperative_groups; - -//------------------------------------------------------------------------------ -// warp_ReduceSum -//------------------------------------------------------------------------------ - -template< typename T_Z, int warp_sz> -__inline__ __device__ T_Z warp_ReduceSum(thread_block_tile g, T_Z val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - // FIXME: only works if sizeof(T_Z) <= 32 bytes - // FIXME: the ANY monoid needs the cij_exists for each thread - for (int i = g.size() / 2; i > 0; i /= 2) - { - T_Z next = g.shfl_down( val, i) ; - GB_ADD( val, val, next ); - } - return val; // note: only thread 0 will return full sum -} - -//------------------------------------------------------------------------------ -// AxB_dot3_phase3_dndn -//------------------------------------------------------------------------------ - -template< - typename T_C, typename T_A, typename T_B, - typename T_Z, typename T_X, typename T_Y, - uint64_t srcode> -__global__ void AxB_dot3_phase3_dndn -( - GrB_Matrix C, - GrB_Matrix M, - GrB_Matrix A, - GrB_Matrix B -) -{ - // TODO: Figure out how to use graphblas-specific INFINITY macro - #ifndef INFINITY - #define INFINITY std::numeric_limits::max() - #endif - - const T_A *__restrict__ Ax = (T_A *)A->x ; - const T_B *__restrict__ Bx = (T_B *)B->x ; - T_C *__restrict__ Cx = (T_C *)C->x ; - int64_t *__restrict__ Ci = C->i ; - const int64_t *__restrict__ Mi = M->i ; - #if GB_M_IS_HYPER - const int64_t *__restrict__ Mh = M->h ; - #endif - // A and B are either bitmap or full - #if GB_A_IS_BITMAP - const int8_t *__restrict__ Ab = A->b ; - #endif - #if GB_B_IS_BITMAP - const int8_t *__restrict__ Bb = B->b ; - #endif - - // zombie count - int64_t zc = 0; - - int64_t start = 0; - int64_t end = M->p[M->nvec]; - - // total items to be inspected - int64_t nnzA = A->vlen; - int64_t nnzB = B->vlen; - int s = blockDim.x; - - // Main loop over pairs - for ( int64_t pair_id = start + blockIdx.x; //warp per pair - pair_id < end; - pair_id += gridDim.x ) - { - - // get M(i,j) and C(i,j) - int64_t i = Mi[pair_id]; - int64_t kk = Ci[pair_id] >> 4; // FIXME: can remove ">> 4" - bool cij_exists = false ; - GB_DECLARE_IDENTITY (cij) ; // GB_Z_TYPE cij = identity - - // skip if C(i,j) is a prezombie - if (kk >= 0) - { - - // j = kk or j = Mh [kk] if C and M are hypersparse - int64_t j = GBH_M (Mh, kk) ; - - int64_t pA = (A->vlen)*i; - int64_t pA_end = pA +(A->vlen); - - int64_t pB = (B->vlen)*j; - int64_t pB_end = pB +(B->vlen); - - // if (threadIdx.x == 0 ){ - // printf("tid=%d, i,j = %d,%d nnzA= %d, nnzB=%d\n", - // threadIdx.x, 
(int)i,(int)j, (int)nnzA, (int)nnzB); - // } - // __syncthreads(); - - // convert global data pointer to the local pointer of this block - GB_DECLAREA (aki) ; - GB_DECLAREB (bkj) ; - - #if GB_A_IS_FULL && GB_B_IS_FULL - { - cij_exists = true ; - for (int64_t k = threadIdx.x ; k < nnzA ; k += s) - { - // cij += A(k,i) * B(k,j) - GB_GETA (aki, Ax, pA+k, ) ; // aki = A(k,i) - GB_GETB (bkj, Bx, pB+k, ) ; // bkj = B(k,j) - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj - } - } - #elif GB_A_IS_BITMAP && GB_B_IS_BITMAP - { - for ( int64_t k = threadIdx.x ; k < nnzA ; k += s) - { - GB_GETA (aki, Ax, pA+k, ) ; // aki = A(k,i) - GB_GETB (bkj, Bx, pB+k, ) ; // bkj = B(k,j) - int8_t b = (Ab [pA+k] && Bb [pB+k]) ; - cij_exists |= b ; - if (b) - { - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj - } - } - } - #elif GB_A_IS_FULL && GB_B_IS_BITMAP - { - for ( int64_t k = threadIdx.x ; k < nnzA ; k += s) - { - if (Bb [pB+k]) - { - GB_GETA (aki, Ax, pA+k, ) ; // aki = A(k,i) - GB_GETB (bkj, Bx, pB+k, ) ; // bkj = B(k,j) - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj - cij_exists = true ; - } - } - } - #elif GB_A_IS_BITMAP && GB_B_IS_FULL - { - for ( int64_t k = threadIdx.x ; k < nnzA ; k += s) - { - if (Ab [pB+k]) - { - GB_GETA (aki, Ax, pA+k, ) ; // aki = A(k,i) - GB_GETB (bkj, Bx, pB+k, ) ; // bkj = B(k,j) - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj - cij_exists = true ; - } - } - } - #endif - } - - //---------------------------------------------------------------------- - // reduce per-thread sums to a single scalar - //---------------------------------------------------------------------- - - // Do vote here for control. - thread_block_tile<32> tile = tiled_partition<32>( this_thread_block() ); - cij_exists = tile.any( cij_exists); - tile.sync(); - - #if !GB_C_ISO - // FIXME: the ANY monoid needs the cij_exists for each thread - cij = warp_ReduceSum ( tile, cij); - #endif - - // write result for this block to global mem - if (threadIdx.x == 0) - { - if (cij_exists) - { - GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (T_C) cij - Ci [pair_id] = i ; - } - else - { - // cij is a zombie - zc++; - Ci [pair_id] = GB_FLIP (i) ; - } - } - //__syncthreads ( ) ; - - if( threadIdx.x ==0 && zc > 0) - { - GB_cuda_atomic_add ( &(C->nzombies), zc) ; - } - } -} - diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_phase2.cuh b/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_phase2.cuh deleted file mode 100644 index 3d9f7d39cb..0000000000 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_phase2.cuh +++ /dev/null @@ -1,193 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_GB_AxB_phase2.cuh -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ -// fill the global buckets -//------------------------------------------------------------------------------ - -#pragma once -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include "GB_cuda_buckets.h" -#include -#include -#include - -using namespace cooperative_groups; - -//------------------------------------------------------------------------------ -// BlockPrefixCallbackOp -//------------------------------------------------------------------------------ - -// A stateful callback functor that maintains a running prefix to be applied -// during consecutive scan operations. 
-struct BlockPrefixCallbackOp -{ - // Running prefix - int64_t running_total; - // Constructor - __device__ BlockPrefixCallbackOp(int64_t running_total) : running_total(running_total) {} - - // Callback operator to be entered by the first warp of threads in the block. - // Thread-0 is responsible for returning a value for seeding the block-wide scan. - __device__ int64_t operator()(int64_t block_aggregate) - { - int64_t old_prefix = running_total; - running_total += block_aggregate; - return old_prefix; - } -}; - -//------------------------------------------------------------------------------ -// blockBucketExclusiveSum -//------------------------------------------------------------------------------ - -__inline__ -__device__ void blockBucketExclusiveSum(int bucketId, int64_t *d_data, int nblocks) -{ - #define blocksize 32 - - // Specialize BlockScan for a 1D block of 32 threads - typedef cub::BlockScan BlockScan; - - // Allocate shared memory for BlockScan - __shared__ typename BlockScan::TempStorage temp_storage; - - // Initialize running total - BlockPrefixCallbackOp prefix_op(0); - - // Have the block iterate over segments of items - int64_t data=0; - - int64_t *blockbucket= d_data; - - for (int block_id = 0; block_id < nblocks; block_id += blocksize) - { - // Load a segment of consecutive items that are blocked across threads - - //printf("block %d entering sum\n",blockIdx.x); - int loc = block_id + threadIdx.x; - if ( loc < nblocks) - { - //printf("block %di loading tid=%d\n",block_id,tid); - data = blockbucket[bucketId*nblocks +loc ] ; - } - this_thread_block().sync(); - - //printf("bb%d_%d s0 before prefix= %ld \n", block_id,bucketId, - // blockbucket[bucketId*nblocks +loc] ) ; - // Collectively compute the block-wide exclusive prefix sum - BlockScan(temp_storage).ExclusiveSum( data, data, prefix_op); - this_thread_block().sync(); - - if ( loc < nblocks) - { - blockbucket[bucketId*nblocks +loc ] = data ; - } - //this_thread_block().sync(); - - //printf("bb%d_%d = %ld \n", block_id, bucketId, blockbucket[bucketId*nblocks +loc] ) ; - - data = 0; - } -} - -//------------------------------------------------------------------------------ -// warp_ReduceSumPlus_uint64 -//------------------------------------------------------------------------------ - -template< int tile_sz> -__inline__ __device__ uint64_t warp_ReduceSumPlus_uint64( thread_block_tile tile, uint64_t val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = tile.size() / 2; i > 0; i /= 2) { - val += tile.shfl_down( val, i); - } - return val; // note: only thread 0 will return full sum -} - -//------------------------------------------------------------------------------ -// AxB_phase2 -//------------------------------------------------------------------------------ - -// GB_AxB_cuda_dot3_phase2 is a CUDA kernel that takes as input the -// nanobuckets and blockbucket arrays computed by the first phase kernel, -// GB_AxB_cuda_dot3_phase1. The launch geometry of this kernel must match the -// GB_AxB_cuda_dot3_phase1 kernel, with the same # of threads and threadblocks. 
- -__global__ void AxB_phase2 // FIXME rename -( - // input, not modified: - int64_t *__restrict__ blockbucket, // global bucket count, of size NBUCKETS*nblocks - // output: - int64_t *__restrict__ offset, // global offsets, for each bucket - // inputs, not modified: - const int nblocks // input number of blocks to reduce across, ie size of vector for 1 bucket -) -{ - - //-------------------------------------------------------------------------- - // sum up the bucket counts of prior threadblocks - //-------------------------------------------------------------------------- - - // blockbucket is an array of size NBUCKETS-by-nblocks, held by row. The - // entry blockbucket [bucket * nblocks + t] holds the # of entries - // in the bucket (in range 0 to NBUCKETS-1) found by threadblock t. - - //__shared__ uint64_t offset [NBUCKETS] ; - uint64_t s[NBUCKETS]; - - #pragma unroll - for(int b = 0; b < NBUCKETS; ++b){ - s[b] = 0; - } - - thread_block_tile<32> tile = tiled_partition<32>(this_thread_block() ); - - //printf("block %d,dim %d entering sum %d nblocks\n",blockIdx.x, blockDim.x, nblocks); - int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; - - - #pragma unroll - for(int b = 0; b < NBUCKETS; ++b) { - for( tid = threadIdx.x + blockIdx.x * blockDim.x; - tid < nblocks; - tid += blockDim.x*gridDim.x) { - s[b] += blockbucket[ b * nblocks +tid] ; - } - this_thread_block().sync(); - - s[b] = warp_ReduceSumPlus_uint64<32>( tile, s[b]); - } - - if (threadIdx.x ==0 ) - { - #pragma unroll - for(int b = 0; b < NBUCKETS; ++b) { - atomicAdd( (unsigned long long int*)&(offset[b]), s[b]); - } - } - this_thread_block().sync(); - - if( gridDim.x >= NBUCKETS) - { - // Cumulative sum across blocks for each bucket - if (blockIdx.x -//#include -//using namespace cooperative_groups; - -__global__ -void AxB_phase2end - ( - // input, not modified: - const int64_t *__restrict__ nanobuckets, // array of size NBUCKETS-blockDim.x-by-nblocks - const int64_t *__restrict__ blockbucket, // global bucket count, of size NBUCKETS*nblocks - // output: - const int64_t *__restrict__ bucketp, // global bucket cumsum, of size NBUCKETS+1 - int64_t *__restrict__ bucket, // global buckets, of size cnz (== mnz) - const int64_t *__restrict__ offset, // global offsets, for each bucket - // inputs, not modified: - const GrB_Matrix C, // output matrix - const int64_t cnz // number of entries in C and M - ) -{ - - //-------------------------------------------------------------------------- - // get C information - //-------------------------------------------------------------------------- - - // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a - // zombie, or (k << 4) + bucket otherwise, where C(:,j) is the kth vector - // of C (j = Ch [k] if hypersparse or j = k if standard sparse), and - // where bucket is the bucket assignment for C(i,j). This phase does not - // need k, just the bucket for each entry C(i,j). - - int64_t *__restrict__ Ci = C->i ; // for zombies, or bucket assignment - //int64_t *Mp = C->p ; // for offset calculations - //int64_t mnvec = C->nvec; - - //-------------------------------------------------------------------------- - // load and shift the nanobuckets for this thread block - //-------------------------------------------------------------------------- - - // The taskbucket for this threadblock is an array of size - // NBUCKETS-by-blockDim.x, held by row. It forms a 2D array within the 3D - // nanobuckets array. 
- const int64_t *taskbucket = nanobuckets + blockIdx.x * (NBUCKETS * blockDim.x) ; - - //printf("block%d thd%d blockbucket= %ld\n", blockIdx.x, threadIdx.x, - // blockbucket[blockIdx.x*gridDim.x+blockIdx.x]); - - // Each thread in this threadblock owns one column of this taskbucket, for - // its set of NBUCKETS nanobuckets. The nanobuckets are a column of length NBUCKETS, - // with stride equal to blockDim.x. - const int64_t *nanobucket = taskbucket + threadIdx.x; - - // Each thread loads its NBUCKETS nanobucket values into registers. - int64_t my_bucket[NBUCKETS]; - - #pragma unroll - for(int b = 0; b < NBUCKETS; ++b) { - my_bucket[b] = nanobucket [b * blockDim.x] - + blockbucket [b * gridDim.x + blockIdx.x] - + bucketp [b] ; - - //if(b==3) printf("blk:%d tid: %d my_buck[%d]=%lu \n", blockIdx.x, threadIdx.x, b, my_bucket[b]); - } - - // Now each thread has an index into the global set of NBUCKETS buckets, - // held in bucket, of where to place its own entries. - - //-------------------------------------------------------------------------- - // construct the global buckets - //-------------------------------------------------------------------------- - - // The slice for task blockIdx.x contains entries pfirst:plast-1 of M and - // C, which is the part of C operated on by this threadblock. - int64_t pfirst, plast ; - - __shared__ int64_t bucket_idx[chunksize]; - //__shared__ int64_t bucket_s[NBUCKETS][chunksize]; - - int chunk_max= (cnz + chunksize -1)/chunksize; - for ( int chunk = blockIdx.x; - chunk < chunk_max; - chunk += gridDim.x ) - { - - pfirst = chunksize * chunk ; - plast = GB_IMIN( chunksize * (chunk+1), cnz ) ; - - for ( int64_t p = pfirst + threadIdx.x; p < plast ; p += blockDim.x ) - { - // get the entry C(i,j), and extract its bucket. Then - // place the entry C(i,j) in the global bucket it belongs to. - int tid = p - pfirst; - - // TODO: these writes to global are not coalesced. Instead: each - // threadblock could buffer its writes to NBUCKETS buffers and when the - // buffers are full they can be written to global. 
- int ibucket = Ci[p] & 0xF; - //printf(" thd: %d p,Ci[p] = %ld,%ld,%d\n", threadIdx.x, p, Ci[p], irow ); - - //bucket[my_bucket[ibucket]++] = p; - //int idx = (my_bucket[ibucket] - pfirst); - //my_bucket[ibucket] += 1; //blockDim.x; - //int idx = (my_bucket[ibucket]++ - pfirst) & 0x7F; - //bucket_s[ibucket][ idx ] = p; - bucket_idx[tid] = my_bucket[ibucket]++; - Ci[p] = (ibucket==0) * (Ci[p] >> 4) + (ibucket > 0)* Ci[p]; - //if(ibucket == 0) { - //// bucket[my_bucket[0]++] = p; - // Ci[p] = Ci[p] >> 4; - //} else { - // bucket[my_bucket[ibucket]++] = p; - //} - } - - for ( int64_t p = pfirst + threadIdx.x; p < plast ; p+= blockDim.x ) - { - int tid = p - pfirst; - //int ibucket = Ci[p] & 0xF; - //bucket[ p ] = bucket_s[ibucket][tid]; - bucket [ bucket_idx[tid] ] = p; - //printf("ibucket = %d tid=%d p=%lu idx = %lu val = %lu \n",ibucket, threadIdx.x,p, tid, bucket_s[ibucket][tid]); - //printf("ibucket = %d tid=%d p=%lu idx = %lu \n",ibucket, threadIdx.x, p, bucket_idx[tid]); - } - } -} - diff --git a/GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_AxB_dot3.cu b/GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_AxB_dot3.cu new file mode 100644 index 0000000000..f515ef2177 --- /dev/null +++ b/GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_AxB_dot3.cu @@ -0,0 +1,552 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_AxB_dot3.cu +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// GB_jit_kernel_cuda_AxB_dot3: C=A'*B using the dot3 method on the GPU. 
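
[Annotation, not part of the patch: for orientation, the dot3 method computes C<M>=A'*B only at positions where the mask M has entries, so each C(i,j) is the dot product of A(:,i) with B(:,j). A minimal serial sketch of those semantics for the dense-dense case follows, assuming full column-major A and B, a sparse CSC mask (Mp, Mi), and the PLUS_TIMES semiring over double; the function name is hypothetical.]

// serial reference for the dense-dense dot3 semantics (illustrative only)
static void dot3_reference (double *Cx, const int64_t *Mp, const int64_t *Mi,
    int64_t mnvec, const double *Ax, const double *Bx, int64_t vlen)
{
    for (int64_t j = 0 ; j < mnvec ; j++)               // each vector M(:,j)
    {
        for (int64_t p = Mp [j] ; p < Mp [j+1] ; p++)   // each entry M(i,j)
        {
            int64_t i = Mi [p] ;
            double cij = 0 ;                        // PLUS monoid identity
            for (int64_t k = 0 ; k < vlen ; k++)    // cij = A(:,i)'*B(:,j)
            {
                cij += Ax [i*vlen + k] * Bx [j*vlen + k] ;
            }
            Cx [p] = cij ;                          // C has the pattern of M
        }
    }
}
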
+ +#define GB_FREE_ALL ; + +#if GB_C_ISO +// FIXME +#error "kernel undefined for C iso" +#endif + +// FIXME: Figure out how to use graphblas-specific INFINITY macro +#ifndef INFINITY +#define INFINITY std::numeric_limits::max() +#endif + +//------------------------------------------------------------------------------ +// kernel launch geometry +//------------------------------------------------------------------------------ + +// FIXME: some duplicates here +#define chunk_size 128 +#define log2_chunk_size 7 +#define tile_sz 32 +#define shared_vector_size 128 +#define blocksize 32 +#define threads_per_block 32 + +//------------------------------------------------------------------------------ +// operators +//------------------------------------------------------------------------------ + +#if GB_C_ISO + + #define GB_DOT_TERMINAL( c ) break + #define GB_DOT_MERGE(pA,pB) \ + { \ + cij_exists = true ; \ + } + #define GB_CIJ_EXIST_POSTCHECK + +#else + + #define GB_DOT_TERMINAL( c ) GB_IF_TERMINAL_BREAK ( c, zterminal ) + + #if GB_IS_PLUS_PAIR_REAL_SEMIRING + + // cij += A(k,i) * B(k,j), for merge operation (plus_pair_real semiring) + #if GB_Z_IGNORE_OVERFLOW + // plus_pair for int64, uint64, float, or double + #define GB_DOT_MERGE(pA,pB) cij++ ; + #define GB_CIJ_EXIST_POSTCHECK cij_exists = (cij != 0) ; + #else + // plus_pair semiring for small integers + #define GB_DOT_MERGE(pA,pB) \ + { \ + cij_exists = true ; \ + cij++ ; \ + } + #define GB_CIJ_EXIST_POSTCHECK + #endif + + #else + + // cij += A(k,i) * B(k,j), for merge operation (general case) + #define GB_DOT_MERGE(pA,pB) \ + { \ + GB_GETA ( aki, Ax, pA, ) ; /* aki = A(k,i) */ \ + GB_GETB ( bkj, Bx, pB, ) ; /* bkj = B(k,j) */ \ + cij_exists = true ; \ + GB_MULTADD ( cij, aki, bkj, i, k, j ) ; /* cij += aki * bkj */ \ + } + #define GB_CIJ_EXIST_POSTCHECK + + #endif + +#endif + +//------------------------------------------------------------------------------ +// dot3 buckets +//------------------------------------------------------------------------------ + +#define NBUCKETS 3 + +// NBUCKETS buckets: computed by up to NBUCKETS-1 kernel launches (zombies need +// no work...), each using different kernels (with different configurations +// depending on the bucket). 
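
[Annotation, not part of the patch: the bucket assignment travels in Ci itself. As the phase1 kernels removed above describe, a non-zombie entry C(i,j) stores (k << 4) + bucket in Ci [p], where C(:,j) is the kth vector of C, and a zombie stores GB_FLIP (i). A decode sketch, with hypothetical helper names:]

// decode the phase1 encoding of Ci (illustrative only)
static inline int     GB_bucket_of (int64_t ci) { return (int) (ci & 0xF) ; }
static inline int64_t GB_kvec_of   (int64_t ci) { return (ci >> 4) ; }
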
+ +// dot3: C=A'B, M is sparse or hyper, C is sparse or hyper +// 32 kernels A,B: (hyper,sparse,bitmap,full)^2 x (M and C are sparse/hyper) + +typedef enum +{ + GB_BUCKET_ZOMBIE = 0, // C(i,j) is a zombie (not a bucket) + // both A and B are sparse/hyper: + GB_BUCKET_VSVS = 1, // vsvs: both A(:,i) and B(:,j) are very sparse + GB_BUCKET_MERGEPATH = 2, // mp: use the merge-path method + // A is sparse/hyper and B is bitmap/full, or + // A is bitmap/full and B is sparse/hyper + GB_BUCKET_VSDN = 1, // vsdn: the sparse vector is very sparse + GB_BUCKET_SPDN = 2, // spdn: sparse vector has lots of entries; + // use a whole warp for each dot product +} +GB_bucket_code ; // FIXME: rename GB_dot3_bucket_code + +// These may use another bucket enum: + + // two full/(sparse,hyper) kernels: + // // CUDA kernel: spdn, handles 4 buckets: + // // A(:,i) is dense and B(:,j) is very sparse (< 256 entries) + // GB_BUCKET_DNVS = 2, + // // A(:,i) is dense and B(:,j) is sparse (>= 256 entries) + // GB_BUCKET_DNSP = 3, + + // a sparse/full kernel + // // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense + // GB_BUCKET_VSDN = 4, + // // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense + // GB_BUCKET_SPDN = 5, + + // a sparse/bitmap kernel + // a bitmap/bitmap kernel + // a bitmap/sparse kernel + // ... + +#include "GB_cuda_shfl_down.cuh" + +//------------------------------------------------------------------------------ +// CUDA device kernels for each case +//------------------------------------------------------------------------------ + +#include "GB_cuda_ek_slice.cuh" + +#if ((GB_A_IS_BITMAP || GB_A_IS_FULL) && (GB_B_IS_BITMAP || GB_B_IS_FULL)) + // dense-dense + #include "GB_cuda_jit_AxB_dot3_dense_phase1.cuh" + #include "GB_cuda_jit_AxB_dot3_phase3_dndn.cuh" +#else + // sparse-sparse, sparse-dense, or dense-sparse + + #undef GB_FREE_ALL + #define GB_FREE_ALL \ + { \ + GB_FREE_WORK (&Nanobuckets, Nb_size) ; \ + GB_FREE_WORK (&Blockbucket, Bb_size) ; \ + GB_FREE_WORK (&Bucketp, Bup_size) ; \ + GB_FREE_WORK (&offset, O_size) ; \ + GB_FREE_WORK (&Bucket, Bu_size) ; \ + } + + #include "GB_cuda_jit_AxB_dot3_phase1.cuh" + #include "GB_cuda_jit_AxB_dot3_phase2.cuh" + #include "GB_cuda_jit_AxB_dot3_phase2end.cuh" + #if ((GB_A_IS_SPARSE || GB_A_IS_HYPER) && \ + (GB_B_IS_SPARSE || GB_B_IS_HYPER)) + // sparse-sparse + #include "GB_cuda_jit_AxB_dot3_phase3_mp.cuh" + #include "GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh" + #else + // sparse-dense or dense-sparse + #include "GB_cuda_jit_AxB_dot3_phase3_spdn.cuh" + #include "GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh" + #endif +#endif + +//------------------------------------------------------------------------------ +// host function to launch the CUDA kernels for dot3 on the GPU +//------------------------------------------------------------------------------ + +// #include "GB_cuda_timer.hpp" + +extern "C" +{ + GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel) ; +} + +GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel) +{ + + // GpuTimer kernel_timer ; + + //-------------------------------------------------------------------------- + // get callback functions + //-------------------------------------------------------------------------- + + #ifdef GB_JIT_RUNTIME + // get callback functions + GB_free_memory_f GB_free_memory = my_callback->GB_free_memory_func ; + GB_malloc_memory_f GB_malloc_memory = my_callback->GB_malloc_memory_func ; + #endif + + //-------------------------------------------------------------------------- + // declare workspace + 
//--------------------------------------------------------------------------
+
+    #if ((GB_A_IS_BITMAP || GB_A_IS_FULL) && (GB_B_IS_BITMAP || GB_B_IS_FULL))
+    // the dense-dense case requires no workspace
+    #else
+    // sparse-sparse, sparse-dense, and dense-sparse require workspace
+    int64_t *Nanobuckets = NULL ; size_t Nb_size = 0 ;
+    int64_t *Blockbucket = NULL ; size_t Bb_size = 0 ;
+    int64_t *Bucket = NULL ;      size_t Bu_size = 0 ;
+    int64_t *Bucketp = NULL ;     size_t Bup_size = 0 ;
+    int64_t *offset = NULL ;      size_t O_size = 0 ;
+    #endif
+
+    //--------------------------------------------------------------------------
+    // get problem size
+    //--------------------------------------------------------------------------
+
+    const GB_M_NVALS (mnz) ;
+    int nblks_1 = (mnz + chunk_size - 1) / chunk_size ;
+    int number_of_blocks_1 = GB_IMIN (nblks_1, chunk_size * number_of_sms) ;
+
+    // most methods can use these launch geometries:
+    dim3 grid_1 (number_of_blocks_1) ;
+    dim3 block (threads_per_block) ;
+
+    //--------------------------------------------------------------------------
+    // C=A'*B via jitified kernels
+    //--------------------------------------------------------------------------
+
+    #if ((GB_A_IS_BITMAP || GB_A_IS_FULL) && (GB_B_IS_BITMAP || GB_B_IS_FULL))
+    {
+
+        //----------------------------------------------------------------------
+        // (full or bitmap) times (full or bitmap)
+        //----------------------------------------------------------------------
+
+        // full/bitmap cases, which means we don't need buckets and zombies.
+        // This is a much simpler kernel as a result; it only does the i,j
+        // lookup and stores the values in Mi and Ci.
+
+        // The idea is to have each task work on a contiguous block of columns
+        // of C.  Note: for small tests, mnz is small, so ntasks is governed by
+        // chunk_size, not chunk_size*number_of_sms.  For large problems in
+        // production, chunk_size is less important since ntasks will likely be
+        // bounded by chunk_size*number_of_sms (say 128*80 = 10,240 on a V100,
+        // for the default chunk_size of 128).
+
+        //----------------------------------------------------------------------
+        // dense case, phase 1
+        //----------------------------------------------------------------------
+
+        // kernel_timer.Start();
+        GB_cuda_AxB_dot3_dense_phase1_kernel <<<grid_1, block, 0, stream>>>
+            (C, M) ;
+
+        CUDA_OK (cudaStreamSynchronize(stream)) ;   // is this needed?
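//------------------------------------------------------------------------------
// aside: the grid-size arithmetic used by the launches above
//------------------------------------------------------------------------------

// A host-side restatement of how number_of_blocks_1 is chosen: one threadblock
// per chunk of chunk_size entries, capped at chunk_size * number_of_sms blocks
// (the 128*80 = 10,240 V100 figure quoted in the comment above).  This is a
// sketch for illustration only; the helper name is hypothetical:

#include <stdint.h>

#define CHUNK 128                                   // mirrors chunk_size above

static inline int blocks_for_phase1 (int64_t mnz, int number_of_sms)
{
    int64_t nblks = (mnz + CHUNK - 1) / CHUNK ;     // GB_ICEIL (mnz, CHUNK)
    int64_t cap = ((int64_t) CHUNK) * number_of_sms ;
    return ((int) ((nblks < cap) ? nblks : cap)) ;  // GB_IMIN (nblks, cap)
}

// e.g. mnz = 1,000,000 on an 80-SM V100: nblks = 7813, cap = 10240 -> 7813.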
+
+        // kernel_timer.Stop();
+        // printf ("(GPU phase1 %12.6g ms )\n", kernel_timer.Elapsed()) ;
+
+        //----------------------------------------------------------------------
+        // dense case, phase "3" (FIXME: rename to dense_phase2)
+        //----------------------------------------------------------------------
+
+        int work_per_thread = 8 ;
+        int blocksz = 64 ;
+        work_per_thread = 8 ;
+        if (mnz > 1024)
+        {
+            blocksz = 512 ;
+            work_per_thread = 64 ;
+        }
+        int gridsz = GB_ICEIL (mnz, work_per_thread*blocksz) ;
+        dim3 grid_2 (gridsz) ;
+
+        // kernel_timer.Start();
+
+        GB_cuda_AxB_dot3_phase3_dndn_kernel <<<grid_2, blocksz, 0, stream>>>
+            (C, M, A, B) ;
+
+    }
+    #else
+    {
+
+        //----------------------------------------------------------------------
+        // (sparse or hyper) times (sparse or hyper)
+        // (sparse or hyper) times (bitmap or full)
+        // (bitmap or full) times (sparse or hyper)
+        //----------------------------------------------------------------------
+
+        //----------------------------------------------------------------------
+        // construct the tasks for phase1 and phase2
+        //----------------------------------------------------------------------
+
+        // The # of threads in the phase1 and phase2 kernel launches are
+        // related by the size of the warp: ph2_task = ph1_task/32, for example.
+
+        int64_t blockbuckets_size = NBUCKETS * number_of_blocks_1 ;
+        int64_t nanobuckets_size = blockbuckets_size * threads_per_block ;
+
+        Nanobuckets = GB_MALLOC_WORK (nanobuckets_size, int64_t, &Nb_size) ;
+        Blockbucket = GB_MALLOC_WORK (blockbuckets_size, int64_t, &Bb_size) ;
+        Bucketp = GB_MALLOC_WORK (NBUCKETS+1, int64_t, &Bup_size) ;
+        offset = GB_MALLOC_WORK (NBUCKETS, int64_t, &O_size) ;
+        Bucket = GB_MALLOC_WORK (mnz, int64_t, &Bu_size) ;
+
+        if (Nanobuckets == NULL || Blockbucket == NULL || Bucketp == NULL
+            || Bucket == NULL || offset == NULL)
+        {
+            // out of memory
+            GB_FREE_ALL ;
+            return (GrB_OUT_OF_MEMORY) ;
+        }
+
+        // FIXME: do async with streams
+        // FIXME: do we need any of these?
+        //CUDA_OK (cudaMemsetAsync(Nanobuckets, 0,
+        //    nanobuckets_size * sizeof(int64_t), stream));
+        //CUDA_OK (cudaMemsetAsync(Blockbucket, 0,
+        //    blockbuckets_size * sizeof(int64_t), stream));
+        CUDA_OK (cudaMemsetAsync(Bucketp, 0,
+            (NBUCKETS+1) * sizeof(int64_t), stream));
+        CUDA_OK (cudaMemsetAsync(offset, 0,
+            NBUCKETS * sizeof(int64_t), stream));
+        //CUDA_OK (cudaMemsetAsync(Bucket, 0,
+        //    mnz * sizeof(int64_t), stream));
+
+        //----------------------------------------------------------------------
+        // phase1 and phase2: place each C(i,j) in a bucket
+        //----------------------------------------------------------------------
+
+        CUDA_OK (cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t),
+            cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
+        CUDA_OK (cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t),
+            cudaMemAdviseSetAccessedBy, device));
+
+        CUDA_OK (cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t),
+            cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
+        CUDA_OK (cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t),
+            cudaMemAdviseSetAccessedBy, device));
+
+        //----------------------------------------------------------------------
+        // phase1: assign each C(i,j) to a bucket, and count them
+        //----------------------------------------------------------------------
+
+        // kernel_timer.Start();
+
+        GB_jit_AxB_dot3_phase1_kernel <<<grid_1, block, 0, stream>>>
+            (Nanobuckets, Blockbucket, C, M, A, B) ;
+
+        CUDA_OK (cudaStreamSynchronize (stream)) ;
+
+        // kernel_timer.Stop();
+        // printf ("(GPU phase1 %12.6g ms )\n", kernel_timer.Elapsed()) ;
+
+        //----------------------------------------------------------------------
+        // phase2: cumsum across the blockbuckets, propagate to thread level
+        //----------------------------------------------------------------------
+
+        // # of blocks for phase2:
+        int number_of_blocks_2 = (number_of_blocks_1 + threads_per_block - 1)
+            / threads_per_block ;
+
+        dim3 grid_2 (number_of_blocks_2) ;
+
+        // kernel_timer.Start();
+
+        GB_cuda_AxB_dot3_phase2_kernel <<<grid_2, block, 0, stream>>>
+            (Blockbucket, offset, number_of_blocks_1) ;
+
+        CUDA_OK (cudaStreamSynchronize (stream)) ;
+
+        int64_t s = offset [0] ;
+        C->nzombies = s ;
+        bool all_in_one = false ;
+        for (int bucket = 1 ; bucket < NBUCKETS+1 ; bucket++)
+        {
+            Bucketp [bucket] = s ;
+            s += offset [bucket] ;
+            if ((Bucketp [bucket] - Bucketp [bucket-1]) == mnz)
+            {
+                all_in_one = true ;
+            }
+        }
+
+        // kernel_timer.Stop();
+        // printf ("(GPU phase2 %12.6g ms )\n", kernel_timer.Elapsed()) ;
+
+        //----------------------------------------------------------------------
+        // phase2end
+        //----------------------------------------------------------------------
+
+        if (!all_in_one)
+        {
+            // kernel_timer.Start();
+
+            GB_cuda_AxB_dot3_phase2end_kernel <<<grid_1, block, 0, stream>>>
+                (Nanobuckets, Blockbucket, Bucketp, Bucket, offset, C, mnz) ;
+
+            CUDA_OK (cudaStreamSynchronize (stream)) ;
+            // kernel_timer.Stop();
+            // printf ("(GPU phase2end %12.6g ms)\n",kernel_timer.Elapsed());
+        }
+
+        //----------------------------------------------------------------------
+        // phase3: do the numerical work
+        //----------------------------------------------------------------------
+
+        // kernel_timer.Start();
+
+        for (int bucket = 1 ; bucket < NBUCKETS ; bucket++)
+        {
+            int64_t start = Bucketp [bucket] ;
+            int64_t end = Bucketp [bucket + 1] ;
+            int64_t cnz_in_bucket = end - start ;
+            int gridsz, blocksz, work_per_thread ;
+            if (cnz_in_bucket > 0)
+            {
+
+                #if ((GB_A_IS_SPARSE || GB_A_IS_HYPER) && \
+                     (GB_B_IS_SPARSE || GB_B_IS_HYPER))
+
+                switch (bucket)
+                {
+
+                    //------------------------------------------------------
+                    // vsvs bucket: both vectors very sparse
+                    //------------------------------------------------------
+
+                    case GB_BUCKET_VSVS :
+                    {
+                        // FIXME: should be a function of cuda architecture
+                        blocksz = 256 ;
+                        work_per_thread = 4 ;
+                        if (cnz_in_bucket > (2<<12))
+                        {
+                            blocksz = 512 ;
+                        }
+                        gridsz = GB_ICEIL (cnz_in_bucket,
+                            work_per_thread*blocksz) ;
+                        gridsz = GB_IMIN (gridsz, 256*number_of_sms) ;
+                        dim3 grid_3 (gridsz) ;
+                        GB_cuda_AxB_dot3_phase3_vsvs_kernel
+                            <<<grid_3, blocksz, 0, stream>>>
+                            (start, end, Bucket, C, M, A, B) ;
+                    }
+                    break ;
+
+                    //------------------------------------------------------
+                    // mergepath bucket:
+                    //------------------------------------------------------
+
+                    case GB_BUCKET_MERGEPATH :
+                    {
+                        // FIXME: should be a function of cuda architecture
+                        blocksz = 32 ;
+                        work_per_thread = 256 ;
+                        if (cnz_in_bucket > (2<<20))
+                        {
+                            work_per_thread = 1024 ;
+                        }
+                        gridsz = GB_ICEIL (cnz_in_bucket, work_per_thread) ;
+                        if ((gridsz < number_of_sms) &&
+                            (cnz_in_bucket > (2<<20)))
+                        {
+                            gridsz = number_of_sms ;
+                        }
+                        gridsz = GB_IMIN (gridsz, 256*number_of_sms) ;
+                        dim3 grid_3 (gridsz) ;
+                        GB_cuda_AxB_dot3_phase3_mp_kernel
+                            <<<grid_3, blocksz, 0, stream>>>
+                            (start, end, Bucket, C, M, A, B) ;
+                    }
+                    break ;
+                }
+
+                #else
+
+                switch (bucket)
+                {
+
+                    //------------------------------------------------------
+                    // vsdn bucket: one thread per C(i,j) dot product
+                    //------------------------------------------------------
+
+                    case GB_BUCKET_VSDN :
+                    {
+                        // FIXME: should be a function of cuda architecture
+                        blocksz = 256 ;
+                        work_per_thread = 4 ;
+                        if (cnz_in_bucket > (2<<12))
+                        {
+                            blocksz = 512 ;
+                        }
+                        gridsz = GB_ICEIL (cnz_in_bucket,
+                            work_per_thread*blocksz) ;
+                        gridsz = GB_IMIN (gridsz, 256*number_of_sms) ;
+                        dim3 grid_3 (gridsz) ;
+                        GB_cuda_AxB_dot3_phase3_vsdn_kernel
+                            <<<grid_3, blocksz, 0, stream>>>
+                            (start, end, Bucket, C, M, A, B) ;
+                    }
+                    break ;
+
+                    //------------------------------------------------------
+                    // spdn bucket: one warp per C(i,j) dot product
+                    //------------------------------------------------------
+
+                    case GB_BUCKET_SPDN :
+                    {
+                        // FIXME: should be a function of cuda architecture
+                        blocksz = 32 ;
+                        work_per_thread = 256 ;
+                        if (cnz_in_bucket > (2<<20))
+                        {
+                            work_per_thread = 1024 ;
+                        }
+                        gridsz = GB_ICEIL (cnz_in_bucket, work_per_thread) ;
+                        if ((gridsz < number_of_sms) &&
+                            (cnz_in_bucket > (2<<20)))
+                        {
+                            gridsz = number_of_sms ;
+                        }
+                        gridsz = GB_IMIN (gridsz, 256*number_of_sms) ;
+                        dim3 grid_3 (gridsz) ;
+                        GB_cuda_AxB_dot3_phase3_spdn_kernel
+                            <<<grid_3, blocksz, 0, stream>>>
+                            (start, end, Bucket, C, M, A, B) ;
+                        break ;
+                    }
+                }
+                #endif
+            }
+        }
+    }
+    #endif
+
+    //--------------------------------------------------------------------------
+    // free workspace and return result
+    //--------------------------------------------------------------------------
+
+    CUDA_OK (cudaStreamSynchronize (stream)) ;
+
+    // kernel_timer.Stop();
+    // printf ("(GPU phase3 %12.6g ms, rate=%12.6g)\n",
+    //     kernel_timer.Elapsed(), mnz/(1000*kernel_timer.Elapsed())) ;
+
+    GB_FREE_ALL ;
+    return (GrB_SUCCESS) ;
+}
+
diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_reduce.cuh b/GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_reduce.cu
similarity index 71%
rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_reduce.cuh
rename to GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_reduce.cu
index 354e3b216c..94de78d706 100644
--- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_reduce.cuh
+++ b/GraphBLAS/CUDA/JitKernels/GB_jit_kernel_cuda_reduce.cu
@@ -1,14 +1,17 @@
 //------------------------------------------------------------------------------
-// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_reduce.cuh
+// GraphBLAS/CUDA/JitKernels/GB_jit_cuda_reduce.cu
 //------------------------------------------------------------------------------
 
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-// The GB_cuda_jit_reduce CUDA kernel reduces a GrB_Matrix A of any type T_A,
-// to a scalar of type T_Z.  Each threadblock (blockIdx.x) reduces its portion
-// of Ax to a single scalar, and then atomics are used across the threadblocks.
+// The GB_cuda_jit_reduce CUDA kernel reduces a GrB_Matrix A of any type
+// GB_A_TYPE, to a scalar of type GB_Z_TYPE.  Each threadblock (blockIdx.x)
+// reduces its portion of Ax to a single scalar, and then atomics are used
+// across the threadblocks.
 
 // Both the grid and block are 1D, so blockDim.x is the # threads in a
 // threadblock, and the # of threadblocks is grid.x
@@ -21,63 +24,33 @@
 
 // If the reduction is done on the GPU, A will never be iso-valued.
 
-#include <limits>
-#include <type_traits>
-#include "GB_cuda_kernel.h"
-#include "GB_monoid_shared_definitions.h"
-#include "GB_cuda_atomics.cuh"
-#include <cstdint>
-#include <cooperative_groups.h>
-
 #if GB_C_ISO
 #error "kernel undefined for C iso"
 #endif
 
-using namespace cooperative_groups;
-
-//------------------------------------------------------------------------------
-// GB_warp_Reduce: reduce all entries in a warp to a single scalar
-//------------------------------------------------------------------------------
-
-// GB_warp_Reduce assumes WARPSIZE is 32 threads.
+// FIXME: put these definitions in GB_cuda_kernel.h:
+#define tile_sz 32
+#define log2_tile_sz 5
 
-template <typename T_Z>
-__inline__ __device__
-T_Z GB_warp_Reduce( thread_block_tile<WARPSIZE> g, T_Z val)
-{
-    // Each iteration halves the number of active threads
-    // Each thread adds its partial val[k] to val[lane+k]
-
-    // FIXME: doesn't work unless sizeof(T_Z) <= 32 bytes
-
-    T_Z fold = g.shfl_down ( val, 16) ;
-    GB_ADD ( val, val, fold ) ;
-    fold = g.shfl_down ( val, 8) ;
-    GB_ADD ( val, val, fold ) ;
-    fold = g.shfl_down ( val, 4) ;
-    GB_ADD ( val, val, fold ) ;
-    fold = g.shfl_down ( val, 2) ;
-    GB_ADD ( val, val, fold ) ;
-    fold = g.shfl_down ( val, 1) ;
-    GB_ADD ( val, val, fold ) ;
-    return (val) ;      // note: only thread 0 will return full val
-}
+#include "GB_cuda_shfl_down.cuh"
 
 //------------------------------------------------------------------------------
 // GB_block_Reduce: reduce across all warps into a single scalar
 //------------------------------------------------------------------------------
 
-template <typename T_Z>
-__inline__ __device__
-T_Z GB_block_Reduce(thread_block g, T_Z val)
+__inline__ __device__ GB_Z_TYPE GB_block_Reduce
+(
+    thread_block g,
+    GB_Z_TYPE val
+)
 {
-    static __shared__ T_Z shared [WARPSIZE] ;
-    int lane = threadIdx.x & (WARPSIZE-1) ;
-    int wid = threadIdx.x >> LOG2_WARPSIZE ;
-    thread_block_tile<WARPSIZE> tile = tiled_partition<WARPSIZE>( g ) ;
+    static __shared__ GB_Z_TYPE shared [tile_sz] ;
+    int lane = threadIdx.x & (tile_sz-1) ;
+    int wid = threadIdx.x >> log2_tile_sz ;
+    thread_block_tile<tile_sz> tile = tiled_partition<tile_sz>( g ) ;
 
     // Each warp performs partial reduction
-    val = GB_warp_Reduce( tile, val) ;
+    val = GB_cuda_warp_reduce_ztype (tile, val) ;
 
     // Wait for all partial reductions
     if (lane == 0)
@@ -88,25 +61,25 @@ T_Z GB_block_Reduce(thread_block g, T_Z val)
 
     GB_DECLARE_IDENTITY_CONST (zid) ;   // const GB_Z_TYPE zid = identity ;
 
-    val = (threadIdx.x < (blockDim.x >> LOG2_WARPSIZE)) ?
-        shared [lane] : zid ;
+    val = (threadIdx.x < (blockDim.x >> LOG2_WARPSIZE)) ? shared [lane] : zid ;
 
     // Final reduce within first warp
-    val = GB_warp_Reduce( tile, val) ;
+    val = GB_cuda_warp_reduce_ztype (tile, val) ;
     return (val) ;
 }
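//------------------------------------------------------------------------------
// aside: the warp-level pattern behind GB_cuda_warp_reduce_ztype
//------------------------------------------------------------------------------

// The warp reduction that GB_block_Reduce now delegates to
// GB_cuda_shfl_down.cuh follows the standard shuffle-down ladder that the
// deleted GB_warp_Reduce spelled out.  A standalone sketch for a plain int
// sum (the real one folds GB_Z_TYPE values with GB_ADD):

#include <cooperative_groups.h>
using namespace cooperative_groups ;

__inline__ __device__ int warp_sum (thread_block_tile<32> g, int val)
{
    // each shuffle halves the number of lanes still holding a partial sum
    for (int offset = 16 ; offset > 0 ; offset >>= 1)
    {
        val += g.shfl_down (val, offset) ;
    }
    return (val) ;      // only lane 0 ends up with the full sum
}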
 
 //------------------------------------------------------------------------------
-// GB_jit_reduce: reduce all entries in a matrix to a single scalar
+// GB_cuda_reduce_kernel: reduce all entries in a matrix to a single scalar
 //------------------------------------------------------------------------------
 
-template< typename T_A, typename T_Z>
-__global__ void GB_jit_reduce  // FIXME rename
+__global__ void GB_cuda_reduce_kernel
 (
-    GrB_Matrix A,       // matrix to reduce
+    // output:
     void *zscalar,      // scalar result, at least sizeof (uint32_t)
     GrB_Matrix V,       // matrix result, for partial reduction (or NULL)
-    int64_t anz         // # of entries in A: anz = GB_nnz_held (A)
+    // input:
+    GrB_Matrix A,       // matrix to reduce
+    int64_t anz         // # of entries in A
 )
 {
 
@@ -114,19 +87,20 @@ __global__ void GB_jit_reduce  // FIXME rename
     // initializations
     //--------------------------------------------------------------------------
 
-    const T_A *__restrict__ Ax = (T_A *) A->x ;
+    const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *) A->x ;
 
-    // each thread reduces its result into zmine, of type T_Z
+    // each thread reduces its result into zmine, of type GB_Z_TYPE
     GB_DECLARE_IDENTITY (zmine) ;       // GB_Z_TYPE zmine = identity ;
 
     // On input, zscalar is already initialized to the monoid identity value.
-    // If T_Z has size less than 4 bytes, zscalar has been upscaled to 4 bytes.
+    // If GB_Z_TYPE has size less than 4 bytes, zscalar has been upscaled to 4
+    // bytes.
 
     //--------------------------------------------------------------------------
     // phase 1: each thread reduces a part of the matrix to its own scalar
     //--------------------------------------------------------------------------
 
-    #if GB_A_IS_SPARSE || GB_A_IS_HYPERSPARSE
+    #if GB_A_IS_SPARSE || GB_A_IS_HYPER
     {
 
         //----------------------------------------------------------------------
@@ -180,7 +154,7 @@ __global__ void GB_jit_reduce  // FIXME rename
         // A is bitmap
         //----------------------------------------------------------------------
 
-        const uint8_t *__restrict__ Ab = A->b ;
+        const int8_t *__restrict__ Ab = A->b ;
 
         for (int64_t p = blockIdx.x * blockDim.x + threadIdx.x ;
             p < anz ;
            p += blockDim.x * gridDim.x)
@@ -197,7 +171,7 @@ __global__ void GB_jit_reduce  // FIXME rename
     // phase 2: each threadblock reduces all threads into a scalar
     //--------------------------------------------------------------------------
 
-    zmine = GB_block_Reduce< T_Z >( this_thread_block(), zmine) ;
+    zmine = GB_block_Reduce( this_thread_block(), zmine) ;
     this_thread_block().sync() ;
 
    //--------------------------------------------------------------------------
@@ -229,3 +203,21 @@ __global__ void GB_jit_reduce  // FIXME rename
     }
 }
 
+//------------------------------------------------------------------------------
+// host function to launch the CUDA kernel
+//------------------------------------------------------------------------------
+
+extern "C"
+{
+    GB_JIT_CUDA_KERNEL_REDUCE_PROTO (GB_jit_kernel) ;
+}
+
+GB_JIT_CUDA_KERNEL_REDUCE_PROTO (GB_jit_kernel)
+{
+    dim3 grid (gridsz) ;
+    dim3 block (blocksz) ;
+    GB_A_NHELD (anz) ;      // anz = # of entries held in A
+    GB_cuda_reduce_kernel <<<grid, block, 0, stream>>> (zscalar, V, A, anz) ;
+    return (GrB_SUCCESS) ;
+}
+
diff --git a/GraphBLAS/CUDA/README.txt b/GraphBLAS/CUDA/README.txt
index 48ef3edd09..71d84593c4 100644
--- a/GraphBLAS/CUDA/README.txt
+++ b/GraphBLAS/CUDA/README.txt
@@ -2,11 +2,10 @@ GraphBLAS/CUDA: CUDA acceleration for SuiteSparse:GraphBLAS
 
 Dependencies:
     local_cub           BSD 3-clause, (c) NVIDIA (part of CUDA Toolkit)
-    rmm_wrap            Apache 2.0, (c) FIXME
+    rmm_wrap            Apache 2.0, (c) NVIDIA
     cuCollections       Apache 2.0, (c) NVIDIA Rapids
     cuco/cub            BSD 3-clause, (c) NVIDIA
     cuco/libcudacxx     BSD?, (c) NVIDIA
     cuco/libcxx         Apache 2.0, (c) NVIDIA
-    google-benchmark    ?, (c) Google
 
diff --git a/GraphBLAS/CUDA/TODO.txt b/GraphBLAS/CUDA/TODO.txt
index 4bc2524747..1b4fa1620e 100644
--- a/GraphBLAS/CUDA/TODO.txt
+++ b/GraphBLAS/CUDA/TODO.txt
@@ -1,3 +1,21 @@
+TODO (Mar 2024):
+
+    set/get cuda architectures
+    CUDA PreJIT kernels
+    GB_cuda_matrix_advise: write it
+    dot3: allow iso
+    use a stream pool (from RMM)
+    can rmm_wrap be thread safe?
+    # of threadblocks in reduce
+    reduce calls GB_enumify_reduce twice
+    set/get which GPU(s) to use
+    data types > 32 bytes
+    handling nvcc compiler errors
+    static device function for computing ks (acts like GB_ek_slice,
+        so call it GB_ek_slice_device)
+
+--------------------------------------------------------------------------------
+
 all the FIXMEs
 
 clean up comments and code style
 
diff --git a/GraphBLAS/CUDA/Template/GB_cuda_atomics.cuh b/GraphBLAS/CUDA/Template/GB_cuda_atomics.cuh
index 9eeddf43dc..6152d003e9 100644
--- a/GraphBLAS/CUDA/Template/GB_cuda_atomics.cuh
+++ b/GraphBLAS/CUDA/Template/GB_cuda_atomics.cuh
@@ -2,7 +2,13 @@
 // GraphBLAS/CUDA/Template/GB_cuda_atomics.cuh: CUDA atomics for GraphBLAS
 //------------------------------------------------------------------------------
 
-// yet still more stuff here
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+//------------------------------------------------------------------------------
+
+// Atomic device functions for CUDA JIT kernels.  Not used on the host.
 
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
@@ -31,41 +37,40 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * SPDX-License-Identifier: BSD-3-Clause
 */

//------------------------------------------------------------------------------
// Specializations for different atomic operations on different types
//------------------------------------------------------------------------------

-// No 1-byte methods are available (bool, uint8_t, int8_t), because CUDA does
-// not support atomicCAS for a single byte.  Instead, to compute a single byte
-// atomically, GraphBLAS must operate on a larger temporary type (typically
-// uint32_t, but it could also use a 16-bit type), and when all results are
-// computed and the kernel launch is done, the final value is copied to the
-// single byte result on the host.
+// No 1- or 2-byte methods are available (bool, uint8_t, int8_t, uint16_t,
+// int16_t), because CUDA does not support atomicCAS for just one or two bytes.
+// Instead, to compute one or two bytes atomically, GraphBLAS must operate on a
+// larger temporary type (typically uint32_t) and when all results are computed
+// and the kernel launch is done, the final value is copied to the one or two
+// bytes result on the host.
 //
 // The GxB_FC64_t type is supported only by GB_cuda_atomic_add.
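//------------------------------------------------------------------------------
// aside: the compare-and-swap loop shared by these atomics
//------------------------------------------------------------------------------

// The surviving 32- and 64-bit specializations below all use the same pattern
// the deleted 16-bit versions did: type-pun the target to an atomicCAS-capable
// integer, compute the new value from the assumed old one, and retry until no
// other thread intervened.  A standalone sketch for a hypothetical float
// "times" atomic (not one of the GB_cuda_atomic_* templates):

__device__ __inline__ void atomic_times_float (float *ptr, float val)
{
    unsigned int *p = (unsigned int *) ptr ;
    unsigned int assumed ;
    unsigned int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // compute the new value from the type-punned assumed value
        float new_value = __uint_as_float (assumed) * val ;
        // swap it in only if *ptr still holds the assumed value
        old = atomicCAS (p, assumed, __float_as_uint (new_value)) ;
    }
    while (assumed != old) ;
}

// The per-operation type lists continue below.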
// // GB_cuda_atomic_write, GB_cuda_atomic_times: // -// int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, +// int32_t, uint32_t, int64_t, uint64_t, // float, double, and GxB_FC32_t (not GxB_FC64_t). // // GB_cuda_atomic_min, GB_cuda_atomic_max: // -// int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, +// int32_t, uint32_t, int64_t, uint64_t, // float, and double (not GxB_FC32_t or GxB_FC64_t). // // GB_cuda_atomic_add: // -// int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, +// int32_t, uint32_t, int64_t, uint64_t, // float, double, GxB_FC32_t, and GxB_FC64_t. // // GB_cuda_atomic_bor, GB_cuda_atomic_band, // GB_cuda_atomic_bxor, GB_cuda_atomic_bxnor : // -// uint16_t, uint32_t, uint64_t +// uint32_t, uint64_t // // GB_cuda_atomic_lock, GB_cuda_atomic_unlock: // @@ -95,49 +100,9 @@ __device__ __inline__ void GB_cuda_unlock (uint32_t *mutex) ; // GB_cuda_atomic_write //------------------------------------------------------------------------------ -// atomic write (16, 32, and 64 bits) +// atomic write (32 and 64 bits) // no atomic write for GxB_FC64_t -template<> __device__ __inline__ void GB_cuda_atomic_write -( - int16_t *ptr, // target to modify - int16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = GB_PUN (unsigned short int, val) ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, v) ; - } - while (assumed != old) ; -} - -template<> __device__ __inline__ void GB_cuda_atomic_write -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, v) ; - } - while (assumed != old) ; -} - template<> __device__ __inline__ void GB_cuda_atomic_write ( int32_t *ptr, // target to modify @@ -217,48 +182,7 @@ template<> __device__ __inline__ void GB_cuda_atomic_write // GB_cuda_atomic_add for built-in types //------------------------------------------------------------------------------ -// types: int and uint [16,32,64], float, double, GxB_FC32_t, complex double - -template<> __device__ __inline__ void GB_cuda_atomic_add -( - int16_t *ptr, // target to modify - int16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value: - int16_t new_value = GB_PUN (int16_t, assumed) + val ; - // modify it atomically: - old = atomicCAS (p, assumed, GB_PUN (unsigned short int, new_value)) ; - } - while (assumed != old) ; -} - -template<> __device__ __inline__ void GB_cuda_atomic_add -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, assumed + v) ; - } - while (assumed != old) ; -} +// types: int and uint [32,64], float, double, GxB_FC32_t, complex double template<> __device__ 
__inline__ void GB_cuda_atomic_add ( @@ -351,50 +275,9 @@ template<> __device__ __inline__ void GB_cuda_atomic_add // GB_cuda_atomic_times for built-in types //------------------------------------------------------------------------------ -// types: int and uint [16,32,64], float, double, GxB_FC32_t +// types: int and uint [32,64], float, double, GxB_FC32_t // no GxB_FC64_t. -template<> __device__ __inline__ void GB_cuda_atomic_times -( - int16_t *ptr, // target to modify - int16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value: - int16_t new_value = GB_PUN (int16_t, assumed) * val ; - // modify it atomically: - old = atomicCAS (p, assumed, GB_PUN (unsigned short int, new_value)) ; - } - while (assumed != old) ; -} - -template<> __device__ __inline__ void GB_cuda_atomic_times -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, assumed * v) ; - } - while (assumed != old) ; -} - template<> __device__ __inline__ void GB_cuda_atomic_times ( int32_t *ptr, // target to modify @@ -546,53 +429,9 @@ template<> __device__ __inline__ void GB_cuda_atomic_times // GB_cuda_atomic_min //------------------------------------------------------------------------------ -// types: int and uint [16,32,64], float, and double +// types: int and uint [32,64], float, and double // no complex types -template<> __device__ __inline__ void GB_cuda_atomic_min -( - int16_t *ptr, // target to modify - int16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value - int16_t assumed_int16 = GB_PUN (int16_t, assumed) ; - int16_t new_value = GB_IMIN (assumed_int16, val) ; - // modify it atomically: - old = atomicCAS (p, assumed, GB_PUN (unsigned short int, new_value)) ; - } - while (assumed != old) ; -} - -template<> __device__ __inline__ void GB_cuda_atomic_min -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value - unsigned short int new_value = GB_IMIN (assumed, v) ; - // modify it atomically: - old = atomicCAS (p, assumed, new_value) ; - } - while (assumed != old) ; -} - template<> __device__ __inline__ void GB_cuda_atomic_min ( int32_t *ptr, // target to modify @@ -679,53 +518,9 @@ template<> __device__ __inline__ void GB_cuda_atomic_min // GB_cuda_atomic_max //------------------------------------------------------------------------------ -// types: int and uint [16,32,64], float, and double +// types: int and uint [32,64], float, and double // no complex types -template<> __device__ __inline__ void GB_cuda_atomic_max -( - int16_t *ptr, // target to modify - int16_t val // value to modify the target with -) -{ - unsigned short int *p = 
(unsigned short int *) ptr ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value - int16_t assumed_int16 = GB_PUN (int16_t, assumed) ; - int16_t new_value = GB_IMIN (assumed_int16, val) ; - // modify it atomically: - old = atomicCAS (p, assumed, GB_PUN (unsigned short int, new_value)) ; - } - while (assumed != old) ; -} - -template<> __device__ __inline__ void GB_cuda_atomic_max -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // compute the new value - unsigned short int new_value = GB_IMIN (assumed, v) ; - // modify it atomically: - old = atomicCAS (p, assumed, new_value) ; - } - while (assumed != old) ; -} - template<> __device__ __inline__ void GB_cuda_atomic_max ( int32_t *ptr, // target to modify @@ -812,27 +607,7 @@ template<> __device__ __inline__ void GB_cuda_atomic_max // GB_cuda_atomic_bor //------------------------------------------------------------------------------ -// bitwise: on uint [16,32,64] - -template<> __device__ __inline__ void GB_cuda_atomic_bor -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, assumed | v) ; - } - while (assumed != old) ; -} +// bitwise: on uint [32,64] template<> __device__ __inline__ void GB_cuda_atomic_bor ( @@ -858,27 +633,7 @@ template<> __device__ __inline__ void GB_cuda_atomic_bor // GB_cuda_atomic_band //------------------------------------------------------------------------------ -// bitwise: on uint [16,32,64] - -template<> __device__ __inline__ void GB_cuda_atomic_band -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, assumed & v) ; - } - while (assumed != old) ; -} +// bitwise: on uint [32,64] template<> __device__ __inline__ void GB_cuda_atomic_band ( @@ -904,27 +659,7 @@ template<> __device__ __inline__ void GB_cuda_atomic_band // GB_cuda_atomic_bxor //------------------------------------------------------------------------------ -// bitwise: on uint [16,32,64] - -template<> __device__ __inline__ void GB_cuda_atomic_bxor -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, assumed ^ v) ; - } - while (assumed != old) ; -} +// bitwise: on uint [32,64] template<> __device__ __inline__ void GB_cuda_atomic_bxor ( @@ -950,27 +685,7 @@ template<> __device__ __inline__ void GB_cuda_atomic_bxor // GB_cuda_atomic_bxnor 
//------------------------------------------------------------------------------ -// bitwise: on uint [16,32,64] - -template<> __device__ __inline__ void GB_cuda_atomic_bxnor -( - uint16_t *ptr, // target to modify - uint16_t val // value to modify the target with -) -{ - unsigned short int *p = (unsigned short int *) ptr ; - unsigned short int v = (unsigned short int) val ; - unsigned short int assumed ; - unsigned short int old = *p ; - do - { - // assume the old value - assumed = old ; - // modify it atomically: - old = atomicCAS (p, assumed, ~(assumed ^ v)) ; - } - while (assumed != old) ; -} +// bitwise: on uint [32,64] template<> __device__ __inline__ void GB_cuda_atomic_bxnor ( diff --git a/GraphBLAS/CUDA/Template/GB_cuda_buckets.h b/GraphBLAS/CUDA/Template/GB_cuda_buckets.h deleted file mode 100644 index 57cc9ebc4a..0000000000 --- a/GraphBLAS/CUDA/Template/GB_cuda_buckets.h +++ /dev/null @@ -1,59 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/Template/GB_cuda_buckets.h: bucket definitions for dot3 -//------------------------------------------------------------------------------ - -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -// FIXME: rename this file to GB_cuda_dot3_buckets.h (or .cuh? .hpp?) - -//------------------------------------------------------------------------------ - -#ifndef GB_CUDA_BUCKETS_H -#define GB_CUDA_BUCKETS_H - -#define NBUCKETS 3 - -// NBUCKETS buckets: computed by up to NBUCKETS-1 kernel launches (zombies need -// no work...), each using different kernels (with different configurations -// depending on the bucket). - -// dot3: C=A'B, M is sparse or hyper, C is sparse or hyper -// 32 kernels A,B: (hyper,sparse,bitmap,full)^2 x (M and C are sparse/hyper) - -// FIXME: rename enum values to GB_DOT3_BUCKET* -typedef enum -{ - GB_BUCKET_ZOMBIE = 0, // C(i,j) is a zombie (not a bucket) - // both A and B are sparse/hyper: - GB_BUCKET_VSVS = 1, // vsvs: both A(:,i) and B(:,j) are very sparse - GB_BUCKET_MERGEPATH = 2, // mp: use the merge-path method - // A is sparse/hyper and B is bitmap/full, or - // A is bitmap/full and B is sparse/hyper - GB_BUCKET_VSDN = 1, // vsdn: the sparse vector is very sparse - GB_BUCKET_SPDN = 2, // spdn: sparse vector has lots of entries; - // use a whole warp for each dot product -} -GB_bucket_code ; // FIXME: rename GB_dot3_bucket_code - -// These may use another bucket enum: - - // two full/(sparse,hyper) kernels: - // // CUDA kernel: spdn, handles 4 buckets: - // // A(:,i) is dense and B(:,j) is very sparse (< 256 entries) - // GB_BUCKET_DNVS = 2, - // // A(:,i) is dense and B(:,j) is sparse (>= 256 entries) - // GB_BUCKET_DNSP = 3, - - // a sparse/full kernel - // // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense - // GB_BUCKET_VSDN = 4, - // // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense - // GB_BUCKET_SPDN = 5, - - // a sparse/bitmap kernel - // a bitmap/bitmap kernel - // a bitmap/sparse kernel - // ... 
- -#endif diff --git a/GraphBLAS/CUDA/Template/GB_cuda_dot3_defn.h b/GraphBLAS/CUDA/Template/GB_cuda_dot3_defn.h deleted file mode 100644 index 651ec5c064..0000000000 --- a/GraphBLAS/CUDA/Template/GB_cuda_dot3_defn.h +++ /dev/null @@ -1,72 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/Template/GB_cuda_dot3_defn.h: defns just for dot3 -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// FIXME: rename this to .cuh? It is only #included by GB_cuda_jit* - -#pragma once - -//------------------------------------------------------------------------------ -// operators -//------------------------------------------------------------------------------ - -#if GB_C_ISO - -// GB_MULTADD now defined in header -// #define GB_MULTADD( c, a ,b, i, k, j) - #define GB_DOT_TERMINAL( c ) break - #define GB_DOT_MERGE(pA,pB) \ - { \ - cij_exists = true ; \ - } - #define GB_CIJ_EXIST_POSTCHECK - -#else - -// GB_MULTADD now defined in header -// #define GB_MULTADD( c, a, b, i, k, j ) \ -// { \ -// GB_Z_TYPE x_op_y ; \ -// GB_MULT ( x_op_y, a, b, i, k, j ) ; /* x_op_y = a*b */ \ -// GB_ADD ( c, c, x_op_y ) ; /* c += x_op_y */ \ -// } - - #define GB_DOT_TERMINAL( c ) GB_IF_TERMINAL_BREAK ( c, zterminal ) - - #if GB_IS_PLUS_PAIR_REAL_SEMIRING - - // cij += A(k,i) * B(k,j), for merge operation (plus_pair_real semiring) - #if GB_Z_IGNORE_OVERFLOW - // plus_pair for int64, uint64, float, or double - #define GB_DOT_MERGE(pA,pB) cij++ ; - #define GB_CIJ_EXIST_POSTCHECK cij_exists = (cij != 0) ; - #else - // plus_pair semiring for small integers - #define GB_DOT_MERGE(pA,pB) \ - { \ - cij_exists = true ; \ - cij++ ; \ - } - #define GB_CIJ_EXIST_POSTCHECK - #endif - - #else - - // cij += A(k,i) * B(k,j), for merge operation (general case) - #define GB_DOT_MERGE(pA,pB) \ - { \ - GB_GETA ( aki, Ax, pA, ) ; /* aki = A(k,i) */ \ - GB_GETB ( bkj, Bx, pB, ) ; /* bkj = B(k,j) */ \ - cij_exists = true ; \ - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; /* cij += aki * bkj */ \ - } - #define GB_CIJ_EXIST_POSTCHECK - - #endif - -#endif - diff --git a/GraphBLAS/CUDA/Template/GB_cuda_ek_slice.cuh b/GraphBLAS/CUDA/Template/GB_cuda_ek_slice.cuh new file mode 100644 index 0000000000..8b864a22fd --- /dev/null +++ b/GraphBLAS/CUDA/Template/GB_cuda_ek_slice.cuh @@ -0,0 +1,192 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/Template/GB_cuda_ek_slice.cuh +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+// GB_cuda_ek_slice_setup
+//------------------------------------------------------------------------------
+
+static __device__ __inline__ int64_t GB_cuda_ek_slice_setup
+(
+    // inputs, not modified:
+    const int64_t *Ap,          // array of size anvec+1
+    const int64_t anvec,        // # of vectors in the matrix A
+    const int64_t anz,          // # of entries in the sparse/hyper matrix A
+    const int64_t pfirst,       // first entry in A to find k
+    const int64_t max_pchunk,   // max # of entries in A to find k
+    // output:
+    int64_t *my_chunk_size,     // size of the chunk for this threadblock
+    int64_t *anvec1,            // anvec-1
+    float *slope                // slope of vectors from kfirst to klast
+)
+{
+
+    //--------------------------------------------------------------------------
+    // determine the range of entries pfirst:plast-1 for this chunk
+    //--------------------------------------------------------------------------
+
+    // The slice for each threadblock contains entries pfirst:plast-1 of A.
+    // The threadblock works on a chunk of entries in Ai/Ax [pfirst...plast-1].
+
+    ASSERT (pfirst < anz) ;
+    ASSERT (max_pchunk > 0) ;
+    int64_t plast = pfirst + max_pchunk ;
+    plast = GB_IMIN (plast, anz) ;
+    (*my_chunk_size) = plast - pfirst ;
+    ASSERT ((*my_chunk_size) > 0) ;
+
+    //--------------------------------------------------------------------------
+    // estimate the first and last vectors for this chunk
+    //--------------------------------------------------------------------------
+
+    // find kfirst, the first vector of the slice for this chunk.  kfirst is
+    // the vector that owns the entry Ai [pfirst] and Ax [pfirst].  The search
+    // does not need to be exact, so kfirst is an estimate.
+
+    int64_t kfirst = 0 ;
+    int64_t kright = anvec ;
+    GB_TRIM_BINARY_SEARCH (pfirst, Ap, kfirst, kright) ;
+
+    // find klast, the last vector of the slice for this chunk.  klast is the
+    // vector that owns the entry Ai [plast-1] and Ax [plast-1].  The search
+    // does not have to be exact, so klast is an estimate.
+
+    int64_t klast = kfirst ;
+    kright = anvec ;
+    GB_TRIM_BINARY_SEARCH (plast, Ap, klast, kright) ;
+
+    //--------------------------------------------------------------------------
+    // find slope of vectors in this chunk, and return result
+    //--------------------------------------------------------------------------
+
+    // number of vectors in A for this chunk, where
+    // Ap [kfirst:klast-1] will be searched.
+    int64_t nk = klast - kfirst + 1 ;
+
+    // slope is the estimated # of vectors in this chunk, divided by the
+    // chunk size.
+    (*slope) = ((float) nk) / ((float) (*my_chunk_size)) ;
+
+    (*anvec1) = anvec - 1 ;
+    return (kfirst) ;
+}
+
+//------------------------------------------------------------------------------
+// GB_cuda_ek_slice_entry
+//------------------------------------------------------------------------------
+
+// Let p = kk + pfirst, where kk ranges from 0:my_chunk_size-1, and so p ranges
+// from pfirst:(pfirst+my_chunk_size-1), and where my_chunk_size is normally of
+// size max_pchunk, unless this is the last chunk in the entire matrix.
+// GB_cuda_ek_slice_entry computes k for this entry, so that the kth vector
+// contains the entry aij with row index i = Ai [p] and value aij = Ax [p]
+// (assuming that A is a sparse or hypersparse matrix held by column).  That
+// is, Ap [k] <= p < Ap [k+1] will hold.  If A is sparse and held by column,
+// then aij is in column j = k.
If A is hypersparse, then aij is in column j = +// Ah [k]. + +// The method returns the index k of the vector in A that contains the pth +// entry in A, at position p = kk + pfirst. + +static __device__ __inline__ int64_t GB_cuda_ek_slice_entry +( + // inputs, not modified: + const int64_t kk, // find the k value of the kkth entry + const int64_t pfirst, // first entry in A to find k (for which kk=0) + const int64_t *Ap, // array of size anvec+1 + const int64_t anvec1, // anvec-1 + const int64_t kfirst, // estimate of first vector in the chunk + const float slope // estimate # vectors in chunk / my_chunk_size +) +{ + + // get a rough estimate of k for the kkth entry + int64_t k = kfirst + (int64_t) (slope * ((float) kk)) ; + + // The estimate of k cannot be smaller than kfirst, but it might be bigger + // than anvec-1, so ensure it is in the valid range, kfirst to anvec-1. + k = GB_IMIN (k, anvec1) ; + + // look for p in Ap, where p is in range pfirst:plast-1 + // where pfirst >= 0 and plast < anz + int64_t p = kk + pfirst ; + + // linear-time search for the k value of the pth entry + while (Ap [k+1] <= p) k++ ; + while (Ap [k ] > p) k-- ; + + // the pth entry of A is contained in the kth vector of A + ASSERT (Ap [k] <= p && p < Ap [k+1]) ; + + // return the result k + return (k) ; +} + +//------------------------------------------------------------------------------ +// GB_cuda_ek_slice +//------------------------------------------------------------------------------ + +// GB_cuda_ek_slice finds the vector k that owns each entry in the sparse or +// hypersparse matrix A, in Ai/Ax [pfirst:plast-1], where plast = min (anz, +// pfirst+max_pchunk). Returns my_chunk_size = plast - pfirst, which is the +// size of the chunk operated on by this threadblock. + +// The function GB_cuda_ek_slice behaves somewhat like GB_ek_slice used on the +// CPU. The latter is for OpenMP parallelism on the CPU only; it does not +// need to compute ks. 
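//------------------------------------------------------------------------------
// aside: the estimate-then-correct search, exercised on the host
//------------------------------------------------------------------------------

// The slope estimate plus the two linear-correction loops used by
// GB_cuda_ek_slice_entry can be tried out on the CPU with a toy Ap array.
// All values below are hypothetical; plain C so it runs anywhere:

#include <stdint.h>
#include <stdio.h>

int main (void)
{
    // 4 vectors, 10 entries: vector k owns entries Ap [k] .. Ap [k+1]-1
    const int64_t Ap [ ] = { 0, 3, 3, 8, 10 } ;
    const int64_t anvec1 = 3 ;              // anvec-1
    const float slope = 4.0f / 10.0f ;      // nk / my_chunk_size, kfirst = 0
    for (int64_t p = 0 ; p < 10 ; p++)
    {
        int64_t k = (int64_t) (slope * ((float) p)) ;   // rough estimate
        if (k > anvec1) k = anvec1 ;                    // clamp to valid range
        while (Ap [k+1] <= p) k++ ;                     // correct forwards ...
        while (Ap [k  ] >  p) k-- ;                     // ... or backwards
        printf ("entry %2d lives in vector %d\n", (int) p, (int) k) ;
    }
    return (0) ;
}

// The device version, GB_cuda_ek_slice, follows.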
+ +static __device__ __inline__ int64_t GB_cuda_ek_slice // returns my_chunk_size +( + // inputs, not modified: + const int64_t *Ap, // array of size anvec+1 + const int64_t anvec, // # of vectors in the matrix A + const int64_t anz, // # of entries in the sparse/hyper matrix A + const int64_t pfirst, // first entry in A to find k + const int64_t max_pchunk, // max # of entries in A to find k + // output: + int64_t *ks // k value for each pfirst:plast-1 +) +{ + + //-------------------------------------------------------------------------- + // determine the chunk for this threadblock and its slope + //-------------------------------------------------------------------------- + + int64_t my_chunk_size, anvec1 ; + float slope ; + int64_t kfirst = GB_cuda_ek_slice_setup (Ap, anvec, anz, pfirst, + max_pchunk, &my_chunk_size, &anvec1, &slope) ; + + //-------------------------------------------------------------------------- + // find the kth vector that contains each entry p = pfirst:plast-1 + //-------------------------------------------------------------------------- + + for (int64_t kk = threadIdx.x ; kk < my_chunk_size ; kk += blockDim.x) + { + + //---------------------------------------------------------------------- + // determine the kth vector that contains the pth entry + //---------------------------------------------------------------------- + + int64_t k = GB_cuda_ek_slice_entry (kk, pfirst, Ap, anvec1, kfirst, + slope) ; + + //---------------------------------------------------------------------- + // save the result in ks + //---------------------------------------------------------------------- + + ks [kk] = k ; + } + + //-------------------------------------------------------------------------- + // sync all threads and return result + //-------------------------------------------------------------------------- + + this_thread_block().sync() ; + return (my_chunk_size) ; +} + diff --git a/GraphBLAS/CUDA/Template/GB_cuda_error.hpp b/GraphBLAS/CUDA/Template/GB_cuda_error.hpp new file mode 100644 index 0000000000..fe7815c6c2 --- /dev/null +++ b/GraphBLAS/CUDA/Template/GB_cuda_error.hpp @@ -0,0 +1,37 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/GB_cuda_error.hpp: call a cuda method and check its result +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#ifndef GB_CUDA_ERROR_HPP +#define GB_CUDA_ERROR_HPP + +//------------------------------------------------------------------------------ +// CUDA_OK: like GB_OK but for calls to cuda* methods +//------------------------------------------------------------------------------ + +// FIXME: GrB_NO_VALUE means something in CUDA failed, and the caller will then +// do the computation on the CPU. Need to turn off the JIT for CUDA kernels +// (but not CPU kernels) if some CUDA error occurred. Current JIT control does +// not distinguish between CPU and CUDA failures. + +#define CUDA_OK(cudaMethod) \ +{ \ + cudaError_t cuda_error = cudaMethod ; \ + if (cuda_error != cudaSuccess) \ + { \ + GrB_Info info = (cuda_error == cudaErrorMemoryAllocation) ? 
\
+        GrB_OUT_OF_MEMORY : GrB_NO_VALUE ;                                  \
+        GBURBLE ("(cuda failed: %d:%s file:%s line:%d) ", (int) cuda_error, \
+            cudaGetErrorString (cuda_error), __FILE__, __LINE__) ;          \
+        GB_FREE_ALL ;                                                       \
+        return (info) ;                                                     \
+    }                                                                       \
+}
+
+#endif
+
diff --git a/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh
new file mode 100644
index 0000000000..4c202eeaf5
--- /dev/null
+++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh
@@ -0,0 +1,122 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_dense_phase1.cuh
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+// phase1 for dot3, A and B are bitmap/full.
+// dense phase1: symbolic load balancing and data partition.
+
+// This kernel scans the non-zero pattern in A and B, takes into account the
+// mask, and computes the total work required to form C.  Then it computes the
+// vector k that contains each entry C(i,j) that isn't a zombie, or sets C(i,j)
+// to its zombie status.
+
+//------------------------------------------------------------------------------
+// GB_cuda_AxB_dot3_dense_phase1_kernel: lookup i,k pairs and store in Ci
+//------------------------------------------------------------------------------
+
+// GB_cuda_AxB_dot3_dense_phase1_kernel is a CUDA kernel that scans all entries
+// in M, assigns i,j coordinates for each entry, and stores the result in Ci.
+// A and B are both bitmap/full.  C and M are sparse/hypersparse.
+
+__global__ void GB_cuda_AxB_dot3_dense_phase1_kernel
+(
+    // input/output:
+    GrB_Matrix C,           // final output matrix
+    const GrB_Matrix M      // mask matrix
+)
+{
+
+    //--------------------------------------------------------------------------
+    // get C, M, A, and B
+    //--------------------------------------------------------------------------
+
+    const int64_t *__restrict__ Mp = M->p ;
+    const int64_t *__restrict__ Mi = M->i ;
+    #if !GB_MASK_STRUCT
+    const GB_M_TYPE *__restrict__ Mx = (GB_M_TYPE *) M->x ;
+    #endif
+    const int64_t mnvec = M->nvec ;
+    const GB_M_NVALS (mnz) ;
+
+    int64_t *__restrict__ Ci = C->i ;   // for zombies, or vector k
+
+    // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a
+    // zombie, or k otherwise, where C(:,j) is the kth vector of C (j = Ch [k]
+    // if hypersparse or j = k if standard sparse).
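//------------------------------------------------------------------------------
// aside: the zombie encoding used in Ci
//------------------------------------------------------------------------------

// The GB_FLIP encoding mentioned above packs two cases into one int64_t: a
// nonnegative Ci [p] is the vector index k, while a negative value marks a
// zombie whose row index remains recoverable because the flip is its own
// inverse.  A sketch assuming the conventional GraphBLAS definition
// GB_FLIP(i) = -(i)-2:

#include <assert.h>
#include <stdint.h>

static inline int64_t flip (int64_t i) { return (-i - 2) ; }

int main (void)
{
    int64_t i = 42 ;
    int64_t z = flip (i) ;          // z = -44: C(i,j) is now a zombie
    assert (z < 0) ;                // zombies are recognized by their sign
    assert (flip (z) == i) ;        // flipping again restores the row index
    return (0) ;
}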
+ + //-------------------------------------------------------------------------- + // determine the vector k of all entries in C(i,j), one chunk at a time + //-------------------------------------------------------------------------- + +#if 0 + __shared__ int64_t ks [chunk_size] ; +#endif + +// int64_t chunk_max = GB_ICEIL (mnz, chunk_size) ; +// for (int64_t chunk = blockIdx.x ; chunk < chunk_max ; chunk += gridDim.x ) + + for (int64_t pfirst = blockIdx.x << log2_chunk_size ; + pfirst < mnz ; + pfirst += gridDim.x << log2_chunk_size) + { + + //---------------------------------------------------------------------- + // find the vector k that contains each entry C(i,j) in this chunk + //---------------------------------------------------------------------- + + // This threadblock works on Mi/Mx and Ci/Cx, in positions pfirst to + // pfirst + my_chunk_size - 1. + +#if 0 + int64_t my_chunk_size = GB_cuda_ek_slice (Mp, mnvec, mnz, pfirst, + chunk_size, /* output: */ ks) ; +#else + int64_t my_chunk_size, mnvec1 ; + float slope ; + int64_t kfirst = GB_cuda_ek_slice_setup (Mp, mnvec, mnz, pfirst, + chunk_size, &my_chunk_size, &mnvec1, &slope) ; +#endif + + //---------------------------------------------------------------------- + // assign entries in C(i,j): either its vector k or its zombie status + //---------------------------------------------------------------------- + +// for (int64_t pM = pfirst + threadIdx.x ; +// pM < pfirst + my_chunk_size ; +// pM += blockDim.x) + + for (int64_t kk = threadIdx.x ; kk < my_chunk_size ; kk += blockDim.x) + { + +#if 0 + int64_t k = ks [kk] ; // get the k value of Mi,Mx [pM]. +#else + int64_t k = GB_cuda_ek_slice_entry (kk, pfirst, Mp, mnvec1, kfirst, + slope) ; +#endif + + int64_t pM = kk + pfirst ; + + #if GB_MASK_STRUCT + { + // no need to check the value of M(i,j); no prezombies + Ci [pM] = k ; + } + #else + { + bool mij = (bool) GB_MCAST (Mx, pM, ) ; + int64_t i = Mi [pM] ; + Ci [pM] = (!mij) * (GB_FLIP (i)) + + mij * (k) ; + } + #endif + } + } +} + diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase1.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase1.cuh similarity index 63% rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase1.cuh rename to GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase1.cuh index 23b6b272aa..346b5de04a 100644 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase1.cuh +++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase1.cuh @@ -2,6 +2,8 @@ // GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase1.cuh //------------------------------------------------------------------------------ +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -11,27 +13,9 @@ // dot3, phase1: symbolic load balancing and data partition // to assign work to different 'buckets' for later compute -// This kernel scans the non-zero pattern in A and B, takes into account the -// mask and computes total work required to form C. Then it classifies each -// dot product into a set of buckets for efficient compute. 
-
-#pragma once
-
-#include <limits>
-#include <cstdint>
-#include "GB_cuda_kernel.h"
-#include "GB_mxm_shared_definitions.h"
-#include "GB_hash.h"
-#include "GB_hyper_hash_lookup.h"
-#include "GB_cuda_buckets.h"
-#include <cstdio>
-#include <cooperative_groups.h>
-
-using namespace cooperative_groups;
-
-//------------------------------------------------------------------------------
-// GB_jit_AxB_dot3_phase1: build nanobuckets, hunt for pre-zombies
-//------------------------------------------------------------------------------
+// This kernel scans the non-zero pattern in A and B, takes into account the
+// mask and computes total work required to form C.  Then it classifies each dot
+// product into a set of buckets for efficient compute.
 
 // GB_AxB_cuda_dot3_phase1 is a CUDA kernel that scans all entries in C and
 // assigns them to each of the NBUCKETS buckets.  The output is a
@@ -54,8 +38,7 @@ using namespace cooperative_groups;
 // FIXME: What if all entries are in one bucket;
 // can we skip the bucket creation?
 
-template <int threads_per_block, int chunk_size>
-__global__ void GB_jit_AxB_dot3_phase1
+__global__ void GB_jit_AxB_dot3_phase1_kernel
 (
     // outputs, preallocated in global memory:
     int64_t *nanobuckets,   // array of size NBUCKETS-blockDim.x-by-gridDim.x
@@ -73,30 +56,26 @@ __global__ void GB_jit_AxB_dot3_phase1
     // get C, M, A, and B
     //--------------------------------------------------------------------------
 
+    #if GB_M_IS_HYPER
     const int64_t *__restrict__ Mh = M->h ;
+    #endif
     const int64_t *__restrict__ Mp = M->p ;
     const int64_t *__restrict__ Mi = M->i ;
     #if !GB_MASK_STRUCT
     const GB_M_TYPE *__restrict__ Mx = (GB_M_TYPE *) M->x ;
     #endif
     const int64_t mnvec = M->nvec ;
-    const int64_t mvlen = M->vlen ;
-//  const int64_t mnz = GB_nnz(M) ;
+    // const int64_t mvlen = M->vlen ;
     const GB_M_NVALS (mnz) ;
-    const bool M_is_hyper = M->h != NULL ;
     ASSERT (GB_M_IS_SPARSE || GB_M_IS_HYPER) ;
 
+    #if GB_A_IS_SPARSE || GB_A_IS_HYPER
     const int64_t *__restrict__ Ap = A->p ;
-    const int64_t *__restrict__ Ai = A->i ;
-    const int64_t avlen = A->vlen ;
-//  const int64_t anz = GB_nnz(A) ;
-    const GB_A_NVALS (anz) ;
+    #endif
 
+    #if GB_B_IS_SPARSE || GB_B_IS_HYPER
     const int64_t *__restrict__ Bp = B->p ;
-    const int64_t *__restrict__ Bi = B->i ;
-    const int64_t bvlen = B->vlen ;
-//  const int64_t bnz = GB_nnz(B);
-    const GB_B_NVALS (bnz) ;
+    #endif
 
     #if GB_A_IS_HYPER
     const int64_t anvec = A->nvec ;
@@ -131,139 +110,124 @@ __global__ void GB_jit_AxB_dot3_phase1
     //--------------------------------------------------------------------------
     // clear the bucket counters
     //--------------------------------------------------------------------------
-    int64_t my_bucket[NBUCKETS];
-
-    // ASSERT (mnz > 0) ;
-    // ASSERT (gridDim.x <= mnz) ;
+    int64_t my_bucket [NBUCKETS] ;
 
     // each thread uses NBUCKETS bucket counters, held in register
     #pragma unroll
-    for(int b = 0; b < NBUCKETS; ++b) {
-        my_bucket[b] = 0;
+    for (int b = 0 ; b < NBUCKETS ; b++)
+    {
+        my_bucket [b] = 0 ;
     }
 
-    __shared__ int64_t ks [chunk_size] ;
-
     //--------------------------------------------------------------------------
-    // assign all entries of C to the buckets
+    // assign buckets to all entries in C(i,j), one chunk at a time
     //--------------------------------------------------------------------------
 
-    // all threads in this block will compute the same values for these:
-    int64_t pfirst, plast, kfirst, klast ;
+#if 0
+    // removing ks saves about 10% of the phase1 time
+    // (19.5 msec to 17.5 msec for the com-Orkut matrix)
+    __shared__ int64_t ks [chunk_size] ;
+#endif
 
-    int64_t chunk_max = GB_ICEIL (mnz, chunk_size) ;
chunk_size -1)/chunk_size; - for ( int64_t chunk = blockIdx.x; - chunk < chunk_max; - chunk += gridDim.x ) + for (int64_t pfirst = blockIdx.x << log2_chunk_size ; + pfirst < mnz ; + pfirst += gridDim.x << log2_chunk_size) { //---------------------------------------------------------------------- - // determine the work done by this iteration, "chunk" + // find the vector k that contains each entry C(i,j) in this chunk //---------------------------------------------------------------------- - // The slice for each task contains entries pfirst:plast-1 of M and C. - // This iteration "chunk" computes Ci and Cx [pfirst...plast-1], using - // Mi and Mx [pfirst:plast-1]. All threads in the thread block are - // used for this "chunk". - pfirst = chunk_size * chunk ; - plast = pfirst + chunk_size ; - // plast = GB_IMIN (plast, mnz) ; - if (plast > mnz) plast = mnz ; - int64_t my_chunk_size = plast - pfirst ; - - // find the first vector of the slice for this chunk: the - // vector that owns the entry Mi [pfirst] and Mx [pfirst]. - kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec, mvlen) ; - - // find the last vector of the slice for task blockIdx.x: the - // vector that owns the entry Mi [plast-1] and Mx [plast-1]. - klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec, mvlen); + // This threadblock works on Mi/Mx and Ci/Mx, in positions pfirst to + // pfirst + my_chunk_size - 1. - // number of vectors in C and M for this "chunk" iteration, where - // Mp [kfirst:klast] will be operated on. - int64_t nk = klast - kfirst + 1 ; +#if 0 + int64_t my_chunk_size = GB_cuda_ek_slice (Mp, mnvec, mnz, pfirst, + chunk_size, /* output: */ ks) ; +#else + int64_t my_chunk_size, mnvec1 ; + float slope ; + int64_t kfirst = GB_cuda_ek_slice_setup (Mp, mnvec, mnz, pfirst, + chunk_size, &my_chunk_size, &mnvec1, &slope) ; +#endif //---------------------------------------------------------------------- - // fill ks to find all indices + // assign entries in C(i,j) to the buckets //---------------------------------------------------------------------- - // search for k values for each entry pfirst:plast-1 - float slope = ((float) nk) / ((float) my_chunk_size) ; - int64_t mnvec1 = mnvec - 1 ; for (int64_t kk = threadIdx.x ; kk < my_chunk_size ; kk += blockDim.x) { - // get a rough estimate of k for the kkth entry in ks - int64_t k = kfirst + (int64_t) (slope * ((float) kk)) ; - // k cannot be smaller than kfirst, but might be bigger than - // mnvec-1, so ensure it is in the valid range, kfirst to mnvec-1 - // k = GB_IMIN (k, mnvec-1) ; - if (k > mnvec1) k = mnvec1 ; - // look for p in Mp, where p is in range pfirst:plast-1 - // where pfirst >= 0 and plast < mnz - int64_t p = kk + pfirst ; - // linear-time search for the k value of the pth entry - while ( Mp [ k + 1 ] <= p ) k++ ; - while ( Mp [ k ] > p ) k-- ; - ks [kk] = k ; - } - this_thread_block().sync(); - //---------------------------------------------------------------------- - // assign entries in C(i,j) to the buckets - //---------------------------------------------------------------------- + //------------------------------------------------------------------ + // determine the kth vector that contains the pth entry + //------------------------------------------------------------------ + +#if 0 + int64_t k = ks [kk] ; // get the k value of Mi,Mx [pM] +#else + int64_t k = GB_cuda_ek_slice_entry (kk, pfirst, Mp, mnvec1, kfirst, + slope) ; +#endif + + //------------------------------------------------------------------ + // get C(i,j): zombie 
if A(:,i) and B(:,j) are empty or M(i,j) false + //------------------------------------------------------------------ + + // C(i,j) is in the kth vector of C, where j == k if C is sparse, + // or j = Mh [k] if C is hypersparse - for ( int64_t pM = pfirst + threadIdx.x; - pM < pfirst + my_chunk_size; - pM += blockDim.x ) - { GB_bucket_code bucket = GB_BUCKET_ZOMBIE ; - int64_t k = ks [pM - pfirst] ; // get the k value of Mi,Mx [pM]. - int64_t i = Mi [ pM ] ; - int64_t j = GBH_M (Mh, k) ; // note that Ch and Mh are the same - if ( GB_MCAST ( Mx, pM, ) ) + int64_t pM = kk + pfirst ; + int64_t i = Mi [pM] ; + + if (GB_MCAST (Mx, pM, )) // if (M (i,j) is true): { //-------------------------------------------------------------- // get B(:,j) //-------------------------------------------------------------- - int64_t pB, pB_end ; + #if GB_B_IS_SPARSE || GB_B_IS_HYPER + int64_t j = GBH_M (Mh, k) ; // that Ch and Mh are the same + int64_t pB, pB_end, bjnz ; + #endif + #if GB_B_IS_HYPER GB_hyper_hash_lookup (Bh, bnvec, Bp, B_Yp, B_Yi, B_Yx, B_hash_bits, j, &pB, &pB_end) ; + bjnz = pB_end - pB ; + if (bjnz > 0) #elif GB_B_IS_SPARSE - pB = Bp[j] ; - pB_end = Bp[j+1] ; + pB = Bp [j] ; + pB_end = Bp [j+1] ; + bjnz = pB_end - pB ; // # of entries in B(:,j) + if (bjnz > 0) #else - // B is bitmap or full - pB = bvlen * j ; - pB_end = pB + j ; + // B is bitmap or full: no need to look up B(:,j) #endif - - int64_t bjnz = pB_end - pB ; - if (bjnz > 0) { //---------------------------------------------------------- // get A(:,i) //---------------------------------------------------------- - int64_t pA, pA_end ; + #if GB_A_IS_SPARSE || GB_A_IS_HYPER + int64_t pA, pA_end, ainz ; + #endif + #if GB_A_IS_HYPER GB_hyper_hash_lookup (Ah, anvec, Ap, A_Yp, A_Yi, A_Yx, A_hash_bits, i, &pA, &pA_end) ; + ainz = pA_end - pA ; + if (ainz > 0) #elif GB_A_IS_SPARSE - pA = Ap[i] ; - pA_end = Ap[i+1] ; + pA = Ap [i] ; + pA_end = Ap [i+1] ; + ainz = pA_end - pA ; // # of entries in A(:,i) + if (ainz > 0) #else - // A is bitmap or full - pA = avlen * i ; - pA_end = pA + i ; + // A is bitmap or full: no need to look up A(:,i) #endif - - int64_t ainz = pA_end - pA ; - if (ainz > 0) { // determine the bucket for C(i,j) #if (GB_A_IS_SPARSE || GB_A_IS_HYPER) && \ @@ -291,12 +255,20 @@ __global__ void GB_jit_AxB_dot3_phase1 } } - Ci[pM] = (bucket == GB_BUCKET_ZOMBIE) * ( GB_FLIP(i) << 4) - + (bucket != GB_BUCKET_ZOMBIE) * ((k<<4) + bucket) ; - my_bucket[bucket]++; + //------------------------------------------------------------------ + // assign C(i,j) to its bucket + //------------------------------------------------------------------ + + // encode the bucket or zombie status in the row index of C(i,j) + Ci [pM] = (bucket == GB_BUCKET_ZOMBIE) * ( GB_FLIP(i) << 4) + + (bucket != GB_BUCKET_ZOMBIE) * ((k<<4) + bucket) ; + + // each thread counts its own bucket sizes + my_bucket [bucket]++ ; } } - this_thread_block().sync(); + + this_thread_block().sync() ; //-------------------------------------------------------------------------- // cumulative sum of each bucket @@ -313,17 +285,17 @@ __global__ void GB_jit_AxB_dot3_phase1 nanobuckets + blockIdx.x * (NBUCKETS * blockDim.x) + threadIdx.x ; #pragma unroll - for (int b = 0; b < NBUCKETS; ++b) + for (int b = 0 ; b < NBUCKETS ; b++) { if ( threadIdx.x == blockDim.x-1) { blockbucket [blockIdx.x + b * gridDim.x] = my_bucket[b] ; } - this_thread_block().sync(); + this_thread_block().sync() ; BlockCumSum(temp_storage).ExclusiveSum( my_bucket[b], my_bucket[b]) ; - this_thread_block().sync(); + 
this_thread_block().sync() ;
 
         nanobucket [b * blockDim.x] = my_bucket[b] ;
     }
@@ -337,7 +309,7 @@ __global__ void GB_jit_AxB_dot3_phase1
     if (threadIdx.x == blockDim.x - 1 )
     {
         #pragma unroll
-        for(int b = 0; b < NBUCKETS; ++b)
+        for (int b = 0; b < NBUCKETS; ++b)
         {
             blockbucket [b * gridDim.x + blockIdx.x] += my_bucket[b];
         }
diff --git a/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase2.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase2.cuh
new file mode 100644
index 0000000000..171347573f
--- /dev/null
+++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase2.cuh
@@ -0,0 +1,176 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase2.cuh
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+// AxB_dot3_phase2: fill the global buckets
+
+//------------------------------------------------------------------------------
+// BlockPrefixCallbackOp
+//------------------------------------------------------------------------------
+
+// A stateful callback functor that maintains a running prefix to be applied
+// during consecutive scan operations.
+struct BlockPrefixCallbackOp
+{
+    // Running prefix
+    int64_t running_total ;
+
+    // Constructor
+    __device__ BlockPrefixCallbackOp (int64_t running_total) :
+        running_total(running_total) {}
+
+    // Callback operator to be entered by the first warp of threads in the
+    // block.  Thread-0 is responsible for returning a value for seeding the
+    // block-wide scan.
+ __device__ int64_t operator()(int64_t block_aggregate) + { + int64_t old_prefix = running_total ; + running_total += block_aggregate ; + return old_prefix ; + } +} ; + +//------------------------------------------------------------------------------ +// blockBucketExclusiveSum +//------------------------------------------------------------------------------ + +__inline__ __device__ void blockBucketExclusiveSum +( + int bucketId, + int64_t *d_data, + int nblocks +) +{ + + // Specialize BlockScan for a 1D block of 32 threads + typedef cub::BlockScan BlockScan ; + + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage ; + + // Initialize running total + BlockPrefixCallbackOp prefix_op (0) ; + + // Have the block iterate over segments of items + int64_t data = 0 ; + + int64_t *blockbucket = d_data ; + + for (int block_id = 0 ; block_id < nblocks ; block_id += blocksize) + { + // Load a segment of consecutive items that are blocked across threads + + int loc = block_id + threadIdx.x; + if (loc < nblocks) + { + data = blockbucket [bucketId*nblocks + loc] ; + } + this_thread_block().sync() ; + + // Collectively compute the block-wide exclusive prefix sum + BlockScan(temp_storage).ExclusiveSum (data, data, prefix_op) ; + this_thread_block().sync() ; + + if (loc < nblocks) + { + blockbucket [bucketId*nblocks + loc] = data ; + } + + // this_thread_block().sync(); + + data = 0 ; + } +} + +//------------------------------------------------------------------------------ +// GB_cuda_AxB_dot3_phase2_kernel +//------------------------------------------------------------------------------ + +// GB_cuda_AxB__dot3_phase2 is a CUDA kernel that takes as input the +// nanobuckets and blockbucket arrays computed by the first phase kernel, +// GB_cuda_AxB__dot3_phase1. The launch geometry of this kernel must match +// the GB_cuda_AxB_dot3_phase1 kernel, with the same # of threads and +// threadblocks. + +__global__ void GB_cuda_AxB_dot3_phase2_kernel +( + // input, not modified: + int64_t *__restrict__ blockbucket, // global bucket count, + // of size NBUCKETS*nblocks + // output: + int64_t *__restrict__ offset, // global offsets, for each bucket + // inputs, not modified: + const int nblocks // input number of blocks to reduce + // across, ie size of vector for 1 bucket +) +{ + + //-------------------------------------------------------------------------- + // sum up the bucket counts of prior threadblocks + //-------------------------------------------------------------------------- + + // blockbucket is an array of size NBUCKETS-by-nblocks, held by row. The + // entry blockbucket [bucket * nblocks + t] holds the # of entries + // in the bucket (in range 0 to NBUCKETS-1) found by threadblock t. 
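As an illustrative host-side model of what this kernel and blockBucketExclusiveSum above produce from that NBUCKETS-by-nblocks array (assumption: small made-up counts; the kernel itself uses warp reductions, atomicAdd, and cub::BlockScan):

#include <cstdint>
#include <cstdio>

#define NBUCKETS 3

int main (void)
{
    // blockbucket [b*nblocks + t] = # of C entries that threadblock t of
    // phase1 assigned to bucket b
    const int nblocks = 4 ;
    int64_t blockbucket [NBUCKETS * nblocks] =
    {
        5, 2, 0, 3,     // bucket 0 (zombies), per threadblock
        1, 4, 2, 0,     // bucket 1
        0, 1, 3, 2      // bucket 2
    } ;

    // offset [b] = total # of entries in bucket b (the atomicAdd's below)
    int64_t offset [NBUCKETS] = { 0, 0, 0 } ;
    for (int b = 0 ; b < NBUCKETS ; b++)
        for (int t = 0 ; t < nblocks ; t++)
            offset [b] += blockbucket [b * nblocks + t] ;

    // exclusive prefix sum within each bucket row, so that threadblock t
    // knows where its share of bucket b starts (blockBucketExclusiveSum)
    for (int b = 0 ; b < NBUCKETS ; b++)
    {
        int64_t running = 0 ;
        for (int t = 0 ; t < nblocks ; t++)
        {
            int64_t c = blockbucket [b * nblocks + t] ;
            blockbucket [b * nblocks + t] = running ;
            running += c ;
        }
    }

    printf ("bucket sizes: %lld %lld %lld\n", (long long) offset [0],
        (long long) offset [1], (long long) offset [2]) ;
    // prints 10 7 6; the rows become 0 5 7 7, 0 1 5 7, and 0 0 1 4
    return (0) ;
}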
+ + uint64_t s [NBUCKETS] ; + + #pragma unroll + for (int b = 0 ; b < NBUCKETS ; b++) + { + s [b] = 0 ; + } + + thread_block_tile<32> tile = tiled_partition<32>(this_thread_block() ); + + #pragma unroll + for (int b = 0 ; b < NBUCKETS ; b++) + { + for (int64_t tid = threadIdx.x + blockIdx.x * blockDim.x ; + tid < nblocks ; + tid += blockDim.x*gridDim.x) + { + s [b] += blockbucket [b * nblocks + tid] ; + } + this_thread_block().sync(); + + s [b] = GB_cuda_warp_sum_uint64 (tile, s [b]) ; + } + + if (threadIdx.x == 0) + { + #pragma unroll + for (int b = 0 ; b < NBUCKETS ; b++) + { + atomicAdd ((unsigned long long int*) &(offset [b]), s [b]) ; + } + } + this_thread_block().sync(); + + if (gridDim.x >= NBUCKETS) + { + // Cumulative sum across blocks for each bucket + if (blockIdx.x i ; // for zombies, or bucket assignment + //int64_t *Mp = C->p ; // for offset calculations + //int64_t mnvec = C->nvec; + + //-------------------------------------------------------------------------- + // load and shift the nanobuckets for this thread block + //-------------------------------------------------------------------------- + + // The taskbucket for this threadblock is an array of size + // NBUCKETS-by-blockDim.x, held by row. It forms a 2D array within the 3D + // nanobuckets array. + const int64_t *taskbucket = nanobuckets + + blockIdx.x * (NBUCKETS * blockDim.x) ; + + // Each thread in this threadblock owns one column of this taskbucket, for + // its set of NBUCKETS nanobuckets. The nanobuckets are a column of length + // NBUCKETS, with stride equal to blockDim.x. + + const int64_t *nanobucket = taskbucket + threadIdx.x ; + + // Each thread loads its NBUCKETS nanobucket values into registers. + int64_t my_bucket [NBUCKETS] ; + + #pragma unroll + for (int b = 0 ; b < NBUCKETS ; b++) + { + my_bucket [b] = nanobucket [b * blockDim.x] + + blockbucket [b * gridDim.x + blockIdx.x] + + bucketp [b] ; + } + + // Now each thread has an index into the global set of NBUCKETS buckets, + // held in bucket, of where to place its own entries. + + //-------------------------------------------------------------------------- + // construct the global buckets + //-------------------------------------------------------------------------- + + // The slice for task blockIdx.x contains entries pfirst:plast-1 of M and + // C, which is the part of C operated on by this threadblock. + + // FIXME: why is bucket_idx needed? + __shared__ int64_t bucket_idx [chunk_size] ; + +// int64_t chunk_max = (cnz + chunk_size -1) / chunk_size ; +// for (int64_t chunk = blockIdx.x ; chunk < chunk_max ; chunk += gridDim.x) + + for (int64_t pfirst = blockIdx.x << log2_chunk_size ; + pfirst < cnz ; + pfirst += gridDim.x << log2_chunk_size) + { + + // pfirst = chunk_size * chunk ; + // plast = GB_IMIN( chunk_size * (chunk+1), cnz ) ; + int64_t plast = pfirst + chunk_size ; + plast = GB_IMIN (plast, cnz) ; + + for (int64_t p = pfirst + threadIdx.x ; p < plast ; p += blockDim.x) + { + // get the entry C(i,j), and extract its bucket. Then + // place the entry C(i,j) in the global bucket it belongs to. + int tid = p - pfirst ; + + // TODO: these writes to global are not coalesced. Instead: each + // threadblock could buffer its writes to NBUCKETS buffers and when + // the buffers are full they can be written to global. 
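Each entry of Ci was packed by phase1 as (k << 4) + bucket, with zombies carrying (GB_FLIP(i) << 4) in bucket 0; the loop below unpacks that tag. A minimal round-trip sketch (GB_FLIP here is a simplified stand-in, assumed to be the usual GraphBLAS zombie involution -(i)-2, whose definition lies outside this patch):

#include <cstdint>
#include <cstdio>

#define GB_FLIP(i)          (-(i)-2)    // assumed zombie involution
#define GB_BUCKET_ZOMBIE    0

int main (void)
{
    int64_t i = 7, k = 3 ;              // C(i,j) lives in vector k
    int bucket = 2 ;                    // phase1 chose bucket 2

    // phase1: pack the vector index and bucket into Ci, branch-free
    int64_t Ci = (bucket == GB_BUCKET_ZOMBIE) * (GB_FLIP (i) << 4)
               + (bucket != GB_BUCKET_ZOMBIE) * ((k << 4) + bucket) ;

    // phase2end: unpack
    int ibucket = (int) (Ci & 0xF) ;    // low 4 bits hold the bucket
    // bucket 0 entries are zombies: clear the tag so Ci holds GB_FLIP(i);
    // nonzero buckets keep (k<<4)+bucket for the phase3 kernels
    Ci = (ibucket == 0) * (Ci >> 4) + (ibucket > 0) * Ci ;

    printf ("bucket %d, Ci = %lld (k = %lld)\n",
        ibucket, (long long) Ci, (long long) (Ci >> 4)) ;
    // prints: bucket 2, Ci = 50 (k = 3)
    return (0) ;
}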
+ + int ibucket = Ci [p] & 0xF; + + //bucket[my_bucket[ibucket]++] = p; + //int idx = (my_bucket[ibucket] - pfirst); + //my_bucket[ibucket] += 1; //blockDim.x ; + //int idx = (my_bucket[ibucket]++ - pfirst) & 0x7F; + //bucket_s[ibucket][ idx ] = p; + + bucket_idx [tid] = my_bucket [ibucket]++ ; + Ci [p] = (ibucket==0) * (Ci [p] >> 4) + (ibucket > 0) * Ci [p] ; + + //if(ibucket == 0) { + //// bucket[my_bucket[0]++] = p; + // Ci[p] = Ci[p] >> 4; + //} else { + // bucket[my_bucket[ibucket]++] = p; + //} + } + + // FIXME: can't this be merged with the loop above? Or is it a + // partial implementation of a coalesced write to the global bucket + // array? + + for (int64_t p = pfirst + threadIdx.x ; p < plast ; p += blockDim.x) + { + int tid = p - pfirst ; + bucket [bucket_idx [tid]] = p ; + } + } +} + diff --git a/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh new file mode 100644 index 0000000000..c36f35d0cc --- /dev/null +++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh @@ -0,0 +1,222 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// This CUDA kernel produces the semiring product of two dense matrices of +// types GB_A_TYPE and GB_B_TYPE and common index space size n, to an output +// matrix of type GB_C_TYPE. The matrices are dense, with uniform non-zeros and +// sparsity patterns. ie. we want to produce C = A'*B in the sense of the +// given semi-ring. + +// This version uses a simple warp-based dense dot product algorithm, when the +// vectors coming from both A and B are dense, for any size of N. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(nzA, nzB), 32) + +// Thus, threadblock b owns a semi-ring dot product on a pair of vectors. +// The work is to load the data, do the multiply and add work and finally +// reduce this data to a scalar, and write it to Cx[pair]. 
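A stripped-down, self-contained version of that per-pair warp dot product (assumptions: a single 32-thread block, plain float arithmetic in place of the JIT-generated GB_GETA/GB_GETB/GB_MULTADD macros, and a raw __shfl_down_sync reduction in place of GB_cuda_warp_reduce_ztype):

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_dense_dot (const float *x, const float *y, int64_t n,
    float *result)
{
    float cij = 0 ;                     // the monoid identity
    // strided loop: lane t handles entries t, t+32, t+64, ...
    for (int64_t k = threadIdx.x ; k < n ; k += blockDim.x)
    {
        cij += x [k] * y [k] ;          // cij += x(k) * y(k)
    }
    // warp shuffle reduction: each step halves the # of active lanes
    for (int offset = 16 ; offset > 0 ; offset >>= 1)
    {
        cij += __shfl_down_sync (0xFFFFFFFF, cij, offset) ;
    }
    if (threadIdx.x == 0) *result = cij ;   // lane 0 holds the full sum
}

int main (void)
{
    const int64_t n = 1000 ;
    float *x, *y, *r ;
    cudaMallocManaged (&x, n * sizeof (float)) ;
    cudaMallocManaged (&y, n * sizeof (float)) ;
    cudaMallocManaged (&r, sizeof (float)) ;
    for (int64_t k = 0 ; k < n ; k++) { x [k] = 1 ; y [k] = 2 ; }
    warp_dense_dot <<<1, 32>>> (x, y, n, r) ;
    cudaDeviceSynchronize ( ) ;
    printf ("dot = %g (expected 2000)\n", (double) *r) ;
    return (0) ;
}

The real kernel also loops over many C(i,j) pairs per threadblock and handles bitmap inputs and zombies; only the cooperative load/multiply/reduce skeleton is shown here.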
+
+//------------------------------------------------------------------------------
+// GB_cuda_AxB_dot3_phase3_dndn_kernel
+//------------------------------------------------------------------------------
+
+__global__ void GB_cuda_AxB_dot3_phase3_dndn_kernel
+(
+    GrB_Matrix C,   // result matrix
+    GrB_Matrix M,   // mask matrix
+    GrB_Matrix A,   // input matrix A
+    GrB_Matrix B    // input matrix B
+)
+{
+
+    //--------------------------------------------------------------------------
+    // get C, M, A, and B
+    //--------------------------------------------------------------------------
+
+    #if !GB_A_IS_PATTERN
+    const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *)A->x ;
+    #endif
+    #if !GB_B_IS_PATTERN
+    const GB_B_TYPE *__restrict__ Bx = (GB_B_TYPE *)B->x ;
+    #endif
+    GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *)C->x ;
+    int64_t *__restrict__ Ci = C->i ;
+    const int64_t *__restrict__ Mi = M->i ;
+    #if GB_M_IS_HYPER
+    const int64_t *__restrict__ Mh = M->h ;
+    #endif
+    // A and B are either bitmap or full
+    #if GB_A_IS_BITMAP
+    const int8_t *__restrict__ Ab = A->b ;
+    #endif
+    #if GB_B_IS_BITMAP
+    const int8_t *__restrict__ Bb = B->b ;
+    #endif
+
+    // zombie count
+    uint64_t zc = 0 ;
+
+    GB_M_NVALS (mnz) ;
+
+    // total items to be inspected
+    int64_t vlen = A->vlen ;
+    ASSERT (vlen == B->vlen) ;
+    ASSERT (vlen > 0) ;
+
+    //--------------------------------------------------------------------------
+    // compute C(i,j) = A(:,i)'*B(:,j) for each entry in M(i,j)
+    //--------------------------------------------------------------------------
+
+    for (int64_t pM = blockIdx.x ; pM < mnz ; pM += gridDim.x)
+    {
+
+        //----------------------------------------------------------------------
+        // get M(i,j) and C(i,j)
+        //----------------------------------------------------------------------
+
+        int64_t i = Mi [pM] ;
+        int64_t kth = Ci [pM] ;     // C(i,j) is in the kth vector of C
+        bool cij_exists = false ;
+        GB_DECLARE_IDENTITY (cij) ; // GB_Z_TYPE cij = identity
+
+        //----------------------------------------------------------------------
+        // The threadblock cooperates to compute a single entry C(i,j)
+        //----------------------------------------------------------------------
+
+        #ifndef GB_MASK_STRUCT
+        // skip if C(i,j) is a prezombie
+        if (kth >= 0)
+        #endif
+        {
+
+            // j = kth or j = Mh [kth] if C and M are hypersparse
+            int64_t j = GBH_M (Mh, kth) ;
+            int64_t pA = vlen * i ;
+            int64_t pB = vlen * j ;
+
+            GB_DECLAREA (aki) ;
+            GB_DECLAREB (bkj) ;
+
+            #if GB_A_IS_FULL && GB_B_IS_FULL
+            {
+                cij_exists = true ;
+                for (int64_t k = threadIdx.x ; k < vlen ; k += blockDim.x)
+                {
+                    // cij += A(k,i) * B(k,j)
+                    GB_GETA (aki, Ax, pA+k, ) ;             // aki = A(k,i)
+                    GB_GETB (bkj, Bx, pB+k, ) ;             // bkj = B(k,j)
+                    GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj
+                }
+            }
+            #elif GB_A_IS_BITMAP && GB_B_IS_BITMAP
+            {
+                for ( int64_t k = threadIdx.x ; k < vlen ; k += blockDim.x)
+                {
+                    GB_GETA (aki, Ax, pA+k, ) ;             // aki = A(k,i)
+                    GB_GETB (bkj, Bx, pB+k, ) ;             // bkj = B(k,j)
+                    int8_t b = (Ab [pA+k] && Bb [pB+k]) ;
+                    cij_exists |= b ;
+                    if (b)
+                    {
+                        // cij += aki * bkj
+                        GB_MULTADD ( cij, aki, bkj, i, k, j ) ;
+                    }
+                }
+            }
+            #elif GB_A_IS_FULL && GB_B_IS_BITMAP
+            {
+                for ( int64_t k = threadIdx.x ; k < vlen ; k += blockDim.x)
+                {
+                    if (Bb [pB+k])
+                    {
+                        GB_GETA (aki, Ax, pA+k, ) ;         // aki = A(k,i)
+                        GB_GETB (bkj, Bx, pB+k, ) ;         // bkj = B(k,j)
+                        // cij += aki * bkj
+                        GB_MULTADD ( cij, aki, bkj, i, k, j ) ;
+                        cij_exists = true ;
+                    }
+                }
+            }
+            #elif GB_A_IS_BITMAP && GB_B_IS_FULL
+            {
+                for ( int64_t k = threadIdx.x ; k < vlen ; k += blockDim.x)
+                {
+                    if (Ab [pA+k])
+                    {
+                        GB_GETA (aki, Ax, pA+k, ) ;         // aki = A(k,i)
+                        GB_GETB (bkj, Bx, pB+k, ) ;         // bkj = B(k,j)
+                        // cij += aki * bkj
+                        GB_MULTADD ( cij, aki, bkj, i, k, j ) ;
+                        cij_exists = true ;
+                    }
+                }
+            }
+            #endif
+        }
+
+        //----------------------------------------------------------------------
+        // reduce per-thread sums to a single scalar
+        //----------------------------------------------------------------------
+
+        // FIXME: no need to do this if C(i,j) is a zombie (cij_exists is
+        // always false), or if A and B are both full and C(i,j) is not a
+        // zombie (cij_exists is always true).
+
+        // FIXME: this only works if the size of the thread block is 32,
+        // right?
+
+        // Do vote here for control.
+        thread_block_tile<32> tile = tiled_partition<32> (this_thread_block()) ;
+
+        // FIXME: tile.any takes an int predicate, not bool.  How does this work?
+        cij_exists = tile.any (cij_exists) ;
+        tile.sync();
+
+        #if !GB_C_ISO
+        // FIXME: the ANY monoid needs the cij_exists for each thread
+        cij = GB_cuda_warp_reduce_ztype (tile, cij) ;
+        #endif
+
+        // FIXME: if A and B are full, and GB_MASK_STRUCT is true, cij_exists
+        // is always true because vlen > 0 always holds for this kernel.
+
+        // FIXME: if kth < 0, C(i,j) is a prezombie, and Ci [pM] already holds
+        // GB_FLIP (i).
+
+        // write result for this block to global mem
+        if (threadIdx.x == 0)
+        {
+            if (cij_exists)
+            {
+                // Cx [pM] = (GB_C_TYPE) cij
+                GB_PUTC (cij, Cx, pM) ;
+                Ci [pM] = i ;
+            }
+            else
+            {
+                // cij is a zombie
+                zc++ ;
+                Ci [pM] = GB_FLIP (i) ;
+            }
+        }
+
+        // __syncthreads ( ) ;
+    }
+
+    if (threadIdx.x == 0 && zc > 0)
+    {
+        // this threadblock accumulates its zombie count into the global
+        // zombie count, once, after the loop over all of its C(i,j) entries
+        GB_cuda_atomic_add (&(C->nzombies), zc) ;
+    }
+}
+
diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_mp.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
similarity index 64%
rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
rename to GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
index 838b7e4ccf..3fb4ead9e8 100644
--- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
+++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
@@ -2,88 +2,46 @@
 // GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_mp.cuh
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
 
-// This CUDA kernel produces the semi-ring product of two
-// sparse matrices of types T_A and T_B and common index space size n, to a
-// output matrix of type T_C. The matrices are sparse, with different numbers
-// of non-zeros and different sparsity patterns.
-// ie. we want to produce C = A'*B in the sense of the given semi-ring.
+// This CUDA kernel produces the semi-ring product of two sparse matrices of
+// types GB_A_TYPE and GB_B_TYPE and common index space size n, to an output
+// matrix of type GB_C_TYPE. The matrices are sparse, with different numbers of
+// non-zeros and different sparsity patterns; i.e., we want to produce C = A'*B
+// in the sense of the given semi-ring.
 
-// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are
-// relatively close in size, neither is very sparse nor dense, for any size of N.
-// Handles arbitrary sparsity patterns with guaranteed load balance. +// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are +// relatively close in size, neither is very sparse nor dense, for any size of +// N. Handles arbitrary sparsity patterns with guaranteed load balance. // Both the grid and block are 1D, so blockDim.x is the # threads in a // threadblock, and the # of threadblocks is grid.x -// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number -// of active threads = min( min(g_xnz, g_ynz), 32) +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number of +// active threads = min( min(g_xnz, g_ynz), 32) -// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. Its job -// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot -// product on those items in the intersection, and finally reduce this data to a scalar, -// on exit write it to g_odata [b]. +// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. +// Its job is to find the intersection of the index sets g_xi and g_yi, perform +// the semi-ring dot product on those items in the intersection, and finally +// reduce this data to a scalar, on exit write it to g_odata [b]. // int64_t start <- start of vector pairs for this kernel // int64_t end <- end of vector pairs for this kernel // int64_t *Bucket <- array of pair indices for all kernels -// matrix *C <- result matrix -// matrix *M <- mask matrix -// matrix *A <- input matrix A -// matrix *B <- input matrix B - -#pragma once - -#include -#include -#include -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include "GB_cuda_atomics.cuh" -#include "GB_hash.h" -#include "GB_hyper_hash_lookup.h" -#include "GB_cuda_dot3_defn.h" - -// Using tile size fixed at compile time, we don't need shared memory -#define tile_sz 32 - -using namespace cooperative_groups; - -//------------------------------------------------------------------------------ -// GB_reduce_sum -//------------------------------------------------------------------------------ - -template< typename T_Z, int warp_sz> -__device__ __inline__ -T_Z GB_reduce_sum(thread_block_tile g, T_Z val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - // Temporary T_Z is necessary to handle arbirary ops - // FIXME: only works if sizeof(T_Z) <= 32 bytes - // FIXME: the ANY monoid needs the cij_exists for each thread - #pragma unroll - for (int i = warp_sz >> 1; i > 0; i >>= 1) - { - T_Z next = g.shfl_down( val, i); - GB_ADD( val, val, next ); - } - return val; -} +// GrB_Matrix C <- result matrix +// GrB_Matrix M <- mask matrix +// GrB_Matrix A <- input matrix A +// GrB_Matrix B <- input matrix B //------------------------------------------------------------------------------ -// AxB_dot3_phase3_mp +// GB_cuda_AxB_dot3_phase3_mp_kernel //------------------------------------------------------------------------------ - -template< - typename T_C, typename T_A, typename T_B, - typename T_Z, typename T_X, typename T_Y, - uint64_t srcode> -__global__ void AxB_dot3_phase3_mp // FIXME rename + +__global__ void GB_cuda_AxB_dot3_phase3_mp_kernel ( int64_t start, int64_t end, @@ -91,19 +49,17 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, - GrB_Matrix B, - int sz + GrB_Matrix B ) { - // TODO: Figure out how to use graphblas-specific INFINITY macro - #ifndef INFINITY - 
#define INFINITY std::numeric_limits::max() + #if !GB_A_IS_PATTERN + const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *)A->x ; #endif - - const T_A *__restrict__ Ax = (T_A *)A->x ; - const T_B *__restrict__ Bx = (T_B *)B->x ; - T_C *__restrict__ Cx = (T_C *)C->x ; + #if !GB_B_IS_PATTERN + const GB_B_TYPE *__restrict__ Bx = (GB_B_TYPE *)B->x ; + #endif + GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *)C->x ; int64_t *__restrict__ Ci = C->i ; const int64_t *__restrict__ Mi = M->i ; #if GB_M_IS_HYPER @@ -141,18 +97,10 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename // zombie count int64_t zc = 0; - int64_t pair_id; - // set thread ID - int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; +// int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; int tid = threadIdx.x; - int b = blockIdx.x ; - - // total items to be inspected - int64_t ainz = 0; - int64_t bjnz = 0; - thread_block_tile tile = tiled_partition( this_thread_block()); int all_in_one = ( (end - start) == (M->p)[(M->nvec)] ) ; @@ -163,7 +111,7 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename kk += gridDim.x ) { - pair_id = all_in_one ? kk : Bucket [kk] ; + int64_t pair_id = all_in_one ? kk : Bucket [kk] ; int64_t i = Mi[pair_id]; int64_t k = Ci[pair_id] >> 4; @@ -180,7 +128,7 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename pA_end = Ap[i+1] ; #endif - ainz = pA_end - pA_start ; + int64_t ainz = pA_end - pA_start ; GB_DECLAREA (aki) ; GB_DECLAREB (bkj) ; @@ -188,7 +136,6 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename int cij_exists = 0 ; // FIXME: make a bool - #define shared_vector_size 128 __shared__ int64_t Ai_s[shared_vector_size]; int shared_steps_A = (ainz + shared_vector_size -1)/shared_vector_size; @@ -210,7 +157,7 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename pB_end = Bp[j+1] ; #endif - bjnz = pB_end - pB_start; // bjnz + int64_t bjnz = pB_end - pB_start; // bjnz int shared_steps_B = (bjnz + shared_vector_size -1)/shared_vector_size; __shared__ int64_t Bj_s[shared_vector_size]; @@ -221,14 +168,7 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename Bj_s[i] = Bi[ i + pB_start]; } this_thread_block().sync(); - - //if (threadIdx.x ==0 ) { - // printf("block %d doing dot %lld i,j= %lld,%lld\n", blockIdx.x, pair_id, i, j); - // printf("block %d doing dot %lld ainz,bjnz= %lld,%lld, A_steps=%d, B_steps=%d\n", - // blockIdx.x, pair_id, ainz, bjnz, shared_steps_A, shared_steps_B); - //} - //this_thread_block().sync(); - + //we want more than one intersection per thread while ( (shared_steps_A > 0) && (shared_steps_B > 0) ) { @@ -238,40 +178,28 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename if ( shared_steps_B > 1) bwork = shared_vector_size; int64_t nxy = awork + bwork; - int work_per_thread = (nxy + blockDim.x -1)/blockDim.x; // ceil Divide by 32 = blockDim.x + // ceil Divide by 32 = blockDim.x : + int work_per_thread = (nxy + blockDim.x -1)/blockDim.x; int diag = GB_IMIN( work_per_thread*tid, nxy); int diag_end = GB_IMIN( diag + work_per_thread, nxy); - //printf(" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\n",tid, blockDim.x, work_per_thread, diag, diag_end); - //if (1) //(threadIdx.x == 0) - //{ - // printf ("pair %ld tid%d work_per_thread %d nxy %ld parts %d diag %d diag_end %d Astep=%d, Bstep=%d\n", - // pair_id, threadIdx.x, work_per_thread, nxy, blockDim.x, diag, diag_end,shared_steps_A,shared_steps_B) ; - //} - //this_thread_block().sync(); + // bwork takes place of bjnz: + int x_min = GB_IMAX( (diag - bwork) , 0); - int x_min = GB_IMAX( (diag - bwork) , 0); //bwork takes place of bjnz - int 
x_max = GB_IMIN( diag, awork); //awork takes place of ainz + //awork takes place of ainz: + int x_max = GB_IMIN( diag, awork); while ( x_min < x_max) { //binary search for correct diag break int pivot = (x_min +x_max) >> 1; - //printf("start search thd%u piv=%u xmin,xmax = %u,%u diag_end=%d\n", tid_global, pivot, x_min, x_max, diag_end); int64_t Apiv = Ai_s[pivot] ; int64_t Bpiv = Bj_s[diag -pivot -1] ; - // if ( Apiv < Bpiv ) { - // x_min = pivot +1; - // } - // else { - // x_max = pivot; - // } - x_min = (pivot + 1)* (Apiv < Bpiv) + x_min * (1 - (Apiv < Bpiv)); - x_max = pivot * (1 - (Apiv < Bpiv)) + x_max * (Apiv < Bpiv); + x_min = (pivot + 1)* (Apiv < Bpiv) + x_min * (1 - (Apiv < Bpiv)); + x_max = pivot * (1 - (Apiv < Bpiv)) + x_max * (Apiv < Bpiv); } - //printf("start search thd%u xcoord= %u diag=%d, diag_end=%d\n", tid_global, x_min, diag, diag_end); int xcoord = x_min; int ycoord = diag -x_min -1; @@ -285,8 +213,6 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename int tx_start = xcoord; // +pA_start; int ty_start = diag -xcoord; // +pB_start; - //if (x_start != y_start) - // printf("start thd%u xs,ys = %i,%i\n", tid_global, x_start, y_start); x_min = GB_IMAX( (diag_end - bwork), 0); //bwork replace bjnz x_max = GB_IMIN( diag_end, awork); //awork replace ainz @@ -297,16 +223,10 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename int64_t Apiv = Ai_s[pivot] ; int64_t Bpiv = Bj_s[diag_end -pivot -1] ; - //if ( Apiv < Bpiv ) { - // x_min = pivot +1; - //} - //else { - // x_max = pivot; - //} x_min = (pivot + 1)* (Apiv < Bpiv) + x_min * (1 - (Apiv < Bpiv)); x_max = pivot * (1 - (Apiv < Bpiv)) + x_max * (Apiv < Bpiv); } - //printf("end search thd%u x_coord = %u diag=%d, diag_end=%d\n", tid_global, x_min, diag, diag_end); + xcoord = x_min; ycoord = diag_end -x_min -1; @@ -318,21 +238,6 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename int64_t pA = tx_start; // pA int64_t pB = ty_start; // pB - //if (1) // threadIdx.x == 0) - //{ - // printf ("%d tx_start %d\n", threadIdx.x, tx_start) ; - // printf ("%d tx_end %d\n", threadIdx.x, tx_end ) ; - // printf ("%d ty_start %d\n", threadIdx.x, ty_start) ; - // printf ("%d ty_end %d\n", threadIdx.x, ty_end ) ; - //} - //this_thread_block().sync(); - - // if(threadIdx.x == 0 ) { - // printf("blk%d, thd%d k=%d, l=%d, tx_start=%d, ty_start=%d, tx_end=%d, ty_end=%d\n", - // blockIdx.x, tid_global, k, l, tx_start, ty_start, tx_end, ty_end); - // } - // this_thread_block().sync(); - while ( pA < tx_end && pB < ty_end ) { int64_t Aind = Ai_s[pA] ; @@ -416,14 +321,6 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename // reduce sum per-thread values to a single scalar, get OR of flag //---------------------------------------------------------------------- - /* - if (tid == 0) - { - printf ("reduce %d : %d exists = %d\n", b, cij, cij_exists) ; - } - __syncthreads(); - */ - // Do vote here for control. 
cij_exists = tile.any (cij_exists) ; tile.sync ( ) ; @@ -432,7 +329,7 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename if (cij_exists) { // FIXME: the ANY monoid needs the cij_exists for each thread - cij = GB_reduce_sum( tile, cij ); + cij = GB_cuda_warp_reduce_ztype (tile, cij) ; } #endif @@ -441,7 +338,8 @@ __global__ void AxB_dot3_phase3_mp // FIXME rename { if (cij_exists) { - GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (T_C) cij + // Cx [pair_id] = (GB_C_TYPE) cij + GB_PUTC (cij, Cx, pair_id) ; Ci [pair_id] = i ; } else diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh similarity index 52% rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh rename to GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh index e8986d86cf..c0e04e9361 100644 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh +++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh @@ -2,96 +2,55 @@ // GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh //------------------------------------------------------------------------------ +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ -// This CUDA kernel produces the semi-ring product of two -// sparse matrices of types T_A and T_B and common index space size n, to a -// output matrix of type T_C. The matrices are sparse, with different numbers -// of non-zeros and different sparsity patterns. -// ie. we want to produce C = A'*B in the sense of the given semi-ring. +// This CUDA kernel produces the semi-ring product of two sparse matrices of +// types GB_A_TYPE and GB_B_TYPE and common index space size n, to an output +// matrix of type GB_C_TYPE. The matrices are sparse, with different numbers of +// non-zeros and different sparsity patterns. ie. we want to produce C = A'*B +// in the sense of the given semi-ring. // This version uses an entire threadblock to compute each C(i,j) dot product. 
// Both the grid and block are 1D, so blockDim.x is the # threads in a // threadblock, and the # of threadblocks is grid.x -// int64_t start <- start of vector pairs for this kernel -// int64_t end <- end of vector pairs for this kernel -// int64_t *Bucket <- array of pair indices for all kernels -// matrix *C <- result matrix -// matrix *M <- mask matrix -// matrix *A <- input matrix A -// matrix *B <- input matrix B - -#pragma once - -#include -#include -#include -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include "GB_hash.h" -#include "GB_hyper_hash_lookup.h" -#include "GB_cuda_dot3_defn.h" - -// Using tile size fixed at compile time, we don't need shared memory -#define tile_sz 32 - -using namespace cooperative_groups; - //------------------------------------------------------------------------------ -// GB_reduce_sum +// GB_cuda_AxB_dot3_phase3_spdn_kernel //------------------------------------------------------------------------------ -template< typename T_Z, int warp_sz> -__device__ __inline__ -T_Z GB_reduce_sum(thread_block_tile g, T_Z val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - // Temporary T_Z is necessary to handle arbirary ops - // FIXME: only works if sizeof(T_Z) <= 32 bytes - // FIXME: the ANY monoid needs the cij_exists for each thread - #pragma unroll - for (int i = warp_sz >> 1; i > 0; i >>= 1) - { - T_Z next = g.shfl_down( val, i); - GB_ADD( val, val, next ); - } - return val; -} - -//------------------------------------------------------------------------------ -// AxB_dot3_phase3_spdn -//------------------------------------------------------------------------------ - -template< - typename T_C, typename T_A, typename T_B, - typename T_Z, typename T_X, typename T_Y, - uint64_t srcode> -__global__ void AxB_dot3_phase3_spdn // FIXME rename +__global__ void GB_cuda_AxB_dot3_phase3_spdn_kernel ( - int64_t start, - int64_t end, + int64_t start, // start of vector pairs for this kernel + int64_t end, // end of vector pairs for this kernel int64_t *Bucket, // do the work in Bucket [start:end-1] - GrB_Matrix C, - GrB_Matrix M, - GrB_Matrix A, - GrB_Matrix B, - int sz // FIXME: unused + GrB_Matrix C, // result matrix + GrB_Matrix M, // mask matrix + GrB_Matrix A, // input matrix A + GrB_Matrix B // input matrix B ) { - // TODO: Figure out how to use graphblas-specific INFINITY macro - #ifndef INFINITY - #define INFINITY std::numeric_limits::max() + // sparse-times-dense or dense-times-sparse + #if !(((GB_A_IS_SPARSE || GB_A_IS_HYPER) && \ + (GB_B_IS_BITMAP || GB_B_IS_FULL)) \ + || \ + ((GB_B_IS_SPARSE || GB_B_IS_HYPER) && \ + (GB_A_IS_BITMAP || GB_A_IS_FULL))) + #error "spdn: for sparse-dense or dense-sparse cases only" #endif - const T_A *__restrict__ Ax = (T_A *)A->x ; - const T_B *__restrict__ Bx = (T_B *)B->x ; - T_C *__restrict__ Cx = (T_C *)C->x ; + #if !GB_A_IS_PATTERN + const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *)A->x ; + #endif + #if !GB_B_IS_PATTERN + const GB_B_TYPE *__restrict__ Bx = (GB_B_TYPE *)B->x ; + #endif + GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *)C->x ; int64_t *__restrict__ Ci = C->i ; const int64_t *__restrict__ Mi = M->i ; #if GB_M_IS_HYPER @@ -101,6 +60,8 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename #if GB_A_IS_HYPER || GB_A_IS_SPARSE const int64_t *__restrict__ Ai = A->i ; const int64_t *__restrict__ Ap = A->p ; + #else + const int64_t avlen = A->vlen ; #endif #if GB_A_IS_BITMAP @@ -110,6 +71,8 @@ __global__ void AxB_dot3_phase3_spdn // 
FIXME rename #if GB_B_IS_HYPER || GB_B_IS_SPARSE const int64_t *__restrict__ Bi = B->i ; const int64_t *__restrict__ Bp = B->p ; + #else + const int64_t bvlen = B->vlen ; #endif #if GB_B_IS_BITMAP @@ -136,87 +99,94 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename const int64_t B_hash_bits = (B->Y == NULL) ? 0 : (B->Y->vdim - 1) ; #endif - // zombie count - int64_t zc = 0; + // zombie count for this threadblock + uint64_t zc = 0 ; - int64_t pair_id; + thread_block_tile tile = + tiled_partition (this_thread_block()) ; - thread_block_tile tile = tiled_partition( this_thread_block()); - int all_in_one = ( (end - start) == (M->p)[(M->nvec)] ) ; + GB_M_NVALS (mnz) ; + ASSERT (GB_M_IS_SPARSE || GB_M_IS_HYPER) ; + int64_t cnz_in_bucket = end - start ; + int all_in_one = (cnz_in_bucket == mnz) ; // Main loop over pairs int64_t kk ; - for (kk = start+ blockIdx.x; // warp per C(i,j)=A(:,i)'*B(:,j) dot product - kk < end; - kk += gridDim.x ) + for (kk = start + blockIdx.x ; // warp per C(i,j)=A(:,i)'*B(:,j) dot product + kk < end ; + kk += gridDim.x) { - pair_id = all_in_one ? kk : Bucket [kk] ; - int64_t i = Mi[pair_id]; - int64_t k = Ci[pair_id] >> 4; + //---------------------------------------------------------------------- + // get M(i,j) and C(i,j) + //---------------------------------------------------------------------- + int64_t pair_id = all_in_one ? kk : Bucket [kk] ; + int64_t i = Mi [pair_id] ; + int64_t k = Ci [pair_id] >> 4 ; // j = k or j = Mh [k] if C and M are hypersparse int64_t j = GBH_M (Mh, k) ; - // find A(:,i) - int64_t pA, pA_end ; + //---------------------------------------------------------------------- + // get A(:,i) + //---------------------------------------------------------------------- + #if GB_A_IS_HYPER + int64_t pA, pA_end ; GB_hyper_hash_lookup (Ah, anvec, Ap, A_Yp, A_Yi, A_Yx, A_hash_bits, i, &pA, &pA_end) ; #elif GB_A_IS_SPARSE - pA = Ap[i] ; - pA_end = Ap[i+1] ; + int64_t pA = Ap [i] ; + int64_t pA_end = Ap [i+1] ; #else - // A is bitmap or full - pA = A->vlen * i ; - pA_end = pA + i ; + // A is bitmap or full: only pA is needed + int64_t pA = avlen * i ; #endif - GB_DECLAREA (aki) ; - GB_DECLAREB (bkj) ; - GB_DECLARE_IDENTITY (cij) ; // GB_Z_TYPE cij = identity - - int cij_exists = 0 ; // FIXME: make a bool + //---------------------------------------------------------------------- + // get B(:,j) + //---------------------------------------------------------------------- - // find B(:,j) - int64_t pB, pB_end ; #if GB_B_IS_HYPER + int64_t pB, pB_end ; GB_hyper_hash_lookup (Bh, bnvec, Bp, B_Yp, B_Yi, B_Yx, B_hash_bits, j, &pB, &pB_end) ; #elif GB_B_IS_SPARSE - pB = Bp[j] ; - pB_end = Bp[j+1] ; + int64_t pB = Bp [j] ; + int64_t pB_end = Bp [j+1] ; #else - // B is bitmap or full - pB = B->vlen * j ; - pB_end = pB + j ; + // B is bitmap or full: only pB is needed + int64_t pB = bvlen * j ; #endif //---------------------------------------------------------------------- - // compute C(i,j) = A(:,i)'*B(:,j) using the entire threadblock + // C(i,j) = A(:,i)'*B(:,j) using the entire threadblock //---------------------------------------------------------------------- + GB_DECLAREA (aki) ; + GB_DECLAREB (bkj) ; + GB_DECLARE_IDENTITY (cij) ; // GB_Z_TYPE cij = identity + int cij_exists = 0 ; + #if ( GB_A_IS_FULL ) { -// int64_t bjnz = pB_end - pB ; // bjnz = nnz (B (:,j)) -// if (bjnz > 0) // will always be >= 128 - { - //-------------------------------------------------------------- - // A is full and B is sparse/hyper - 
//-------------------------------------------------------------- + //------------------------------------------------------------------ + // A is full and B is sparse/hyper + //------------------------------------------------------------------ - cij_exists = true ; - for (int64_t p = pB + threadIdx.x ; p < pB_end ; p += blockDim.x) - { - int64_t k = Bi [p] ; // next row index of B(:,j) - // cij += A(k,i) * B(k,j) - GB_GETA ( aki, Ax, pA+k, ) ; // aki = A(k,i) - GB_GETB ( bkj, Bx, p, ) ; // bkj = B(k,j) - GB_MULTADD ( cij, aki, bkj, i, k, j ) ; // cij += aki * bkj - GB_DOT_TERMINAL (cij) ; // break if cij == terminal - } + cij_exists = true ; + for (int64_t p = pB + threadIdx.x ; p < pB_end ; p += blockDim.x) + { + int64_t k = Bi [p] ; // next row index of B(:,j) + // cij += A(k,i) * B(k,j) + GB_GETA ( aki, Ax, pA+k, ) ; // aki = A(k,i) + GB_GETB ( bkj, Bx, p, ) ; // bkj = B(k,j) + // cij += aki * bkj + GB_MULTADD ( cij, aki, bkj, i, k, j ) ; + GB_DOT_TERMINAL (cij) ; // break if cij == terminal } + } #elif ( GB_A_IS_BITMAP ) { @@ -237,25 +207,23 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename } #elif ( GB_B_IS_FULL ) { -// int64_t ainz = pA_end - pA ; // ainz = nnz (A (:,i)) -// if (ainz > 0) // will always be >= 128 - { - //-------------------------------------------------------------- - // A is sparse/hyper and B is full - //-------------------------------------------------------------- + //------------------------------------------------------------------ + // A is sparse/hyper and B is full + //------------------------------------------------------------------ - cij_exists = true ; - for (int64_t p = pA + threadIdx.x ; p < pA_end ; p += blockDim.x) - { - int64_t k = Ai [p] ; // next row index of A(:,i) - // cij += A(k,i) * B(k,j) - GB_GETA ( aki, Ax, p, ) ; // aki = A(i,k) - GB_GETB ( bkj, Bx, pB+k, ) ; // bkj = B(j,k) - GB_MULTADD ( cij, aki, bkj, i, k, j) ; // cij += aik * bjk - GB_DOT_TERMINAL (cij) ; // break if cij == terminal - } + cij_exists = true ; + for (int64_t p = pA + threadIdx.x ; p < pA_end ; p += blockDim.x) + { + int64_t k = Ai [p] ; // next row index of A(:,i) + // cij += A(k,i) * B(k,j) + GB_GETA ( aki, Ax, p, ) ; // aki = A(i,k) + GB_GETB ( bkj, Bx, pB+k, ) ; // bkj = B(j,k) + // cij += aik * bjk + GB_MULTADD ( cij, aki, bkj, i, k, j) ; + GB_DOT_TERMINAL (cij) ; // break if cij == terminal } + } #elif ( GB_B_IS_BITMAP ) { @@ -277,13 +245,17 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename } #endif + //---------------------------------------------------------------------- + // save C(i,j) or declare it a zombie + //---------------------------------------------------------------------- + GB_CIJ_EXIST_POSTCHECK //---------------------------------------------------------------------- // reduce sum per-thread values to a single scalar, get OR of flag //---------------------------------------------------------------------- - // Do vote here for control. 
+ // Do vote here for control cij_exists = tile.any (cij_exists) ; tile.sync ( ) ; @@ -291,7 +263,7 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename if (cij_exists) { // FIXME: the ANY monoid needs cij_exists for each thread - cij = GB_reduce_sum( tile, cij ); + cij = GB_cuda_warp_reduce_ztype (tile, cij) ; } #endif @@ -300,13 +272,14 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename { if (cij_exists) { - GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (T_C) cij + // Cx [pair_id] = (GB_C_TYPE) cij + GB_PUTC (cij, Cx, pair_id) ; Ci [pair_id] = i ; } else { // cij is a zombie - zc++; + zc++ ; Ci [pair_id] = GB_FLIP (i) ; } } @@ -319,7 +292,7 @@ __global__ void AxB_dot3_phase3_spdn // FIXME rename if (threadIdx.x == 0 && zc > 0) { - GB_cuda_atomic_add ( &(C->nzombies), zc) ; + GB_cuda_atomic_add (&(C->nzombies), zc) ; } } diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh similarity index 64% rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh rename to GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh index e5168527e0..018df8c1ae 100644 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh +++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh @@ -2,6 +2,8 @@ // GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh //------------------------------------------------------------------------------ +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -11,76 +13,46 @@ // Each thread in this kernel is responsible for m vector-pairs(x,y), // m = 256/sz, where sz is in {4, 16, 64, 256} // We know each non-zero on the sparse side will hit a dense value. -// Template on // Parameters: -// matrix *C <- C result matrix -// matrix *M <- Mask matrix -// matrix *A <- A matrix to multiply, sparse -// matrix *B <- B matrix to multiply, dense in sparse format? +// C <- C result matrix +// M <- Mask matrix +// A <- A matrix to multiply, sparse +// B <- B matrix to multiply, dense in sparse format? 
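Since vsdn gives each thread one whole dot product, the inner loop is just a serial sparse-times-dense merge; a host-side sketch of that per-thread loop (assumptions: CSC-style index/value arrays and float arithmetic standing in for the JIT macros):

#include <cstdint>
#include <cstdio>

// one dot product C(i,j) = A(:,i)'*B(:,j), with A sparse and B full:
// every stored entry of A(:,i) is guaranteed to hit a value of B(:,j)
static float vsdn_dot (const int64_t *Ai, const float *Ax,
    int64_t pA, int64_t pA_end,         // A(:,i) = Ai,Ax [pA:pA_end-1]
    const float *Bx, int64_t pB)        // B(:,j) starts at Bx [pB]
{
    float cij = 0 ;
    for (int64_t p = pA ; p < pA_end ; p++)
    {
        int64_t k = Ai [p] ;            // row index of A(k,i)
        cij += Ax [p] * Bx [pB + k] ;   // cij += A(k,i) * B(k,j)
    }
    return (cij) ;
}

int main (void)
{
    // A(:,0) has entries at rows 1 and 3 ; B(:,0) is full with vlen = 4
    const int64_t Ai [ ] = { 1, 3 } ;
    const float   Ax [ ] = { 2, 5 } ;
    const float   Bx [ ] = { 10, 20, 30, 40 } ;
    printf ("cij = %g (expected 2*20 + 5*40 = 240)\n",
        (double) vsdn_dot (Ai, Ax, 0, 2, Bx, 0)) ;
    return (0) ;
}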
//****************************************************************************** -#pragma once -#include -#include -#include -#include "GB_cuda_kernel.h" -#include "GB_mxm_shared_definitions.h" -#include "GB_hash.h" -#include "GB_hyper_hash_lookup.h" -#include -#define tile_sz 32 -//#include "local_cub/block/block_reduce.cuh" -#include -#include "GB_cuda_dot3_defn.h" - -using namespace cooperative_groups; - -//------------------------------------------------------------------------------ -// reduce_sum_int64 -//------------------------------------------------------------------------------ - -// for counting zombies only (always int64_t) -template< int warpSize > -__device__ int64_t reduce_sum_int64(thread_block_tile g, int64_t val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = g.size() / 2; i > 0; i /= 2) - { - val += g.shfl_down(val,i) ; - } - return val; // note: only thread 0 will return full sum -} - //------------------------------------------------------------------------------ -// AxB_dot3_phase3_vsdn +// GB_cuda_AxB_dot3_phase3_vsdn_kernel //------------------------------------------------------------------------------ -template< - typename T_C, typename T_A, typename T_B, - typename T_Z, typename T_X, typename T_Y, - uint64_t srcode> -__global__ void AxB_dot3_phase3_vsdn +__global__ void GB_cuda_AxB_dot3_phase3_vsdn_kernel ( - int64_t start, - int64_t end, - int64_t *Bucket, // do the work in Bucket [start:end-1] - GrB_Matrix C, - GrB_Matrix M, - GrB_Matrix A, - GrB_Matrix B, - int sz // unused (FIXME: remove this) + int64_t start, + int64_t end, + int64_t *Bucket, // do the work in Bucket [start:end-1] + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B ) { - // TODO: Figure out how to use graphblas-specific INFINITY macro - #ifndef INFINITY - #define INFINITY std::numeric_limits::max() + + // sparse-times-dense or dense-times-sparse + #if !(((GB_A_IS_SPARSE || GB_A_IS_HYPER) && \ + (GB_B_IS_BITMAP || GB_B_IS_FULL)) \ + || \ + ((GB_B_IS_SPARSE || GB_B_IS_HYPER) && \ + (GB_A_IS_BITMAP || GB_A_IS_FULL))) + #error "vsdn: for sparse-dense or dense-sparse cases only" #endif - const T_A *__restrict__ Ax = (T_A *)A->x ; - const T_B *__restrict__ Bx = (T_B *)B->x ; - T_C *__restrict__ Cx = (T_C *)C->x ; + #if !GB_A_IS_PATTERN + const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *)A->x ; + #endif + #if !GB_B_IS_PATTERN + const GB_B_TYPE *__restrict__ Bx = (GB_B_TYPE *)B->x ; + #endif + GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *)C->x ; int64_t *__restrict__ Ci = C->i ; const int64_t *__restrict__ Mi = M->i ; #if GB_M_IS_HYPER @@ -90,6 +62,8 @@ __global__ void AxB_dot3_phase3_vsdn #if GB_A_IS_HYPER || GB_A_IS_SPARSE const int64_t *__restrict__ Ai = A->i ; const int64_t *__restrict__ Ap = A->p ; + #else + const int64_t avlen = A->vlen ; #endif #if GB_A_IS_BITMAP @@ -99,6 +73,8 @@ __global__ void AxB_dot3_phase3_vsdn #if GB_B_IS_HYPER || GB_B_IS_SPARSE const int64_t *__restrict__ Bi = B->i ; const int64_t *__restrict__ Bp = B->p ; + #else + const int64_t bvlen = B->vlen ; #endif #if GB_B_IS_BITMAP @@ -125,73 +101,72 @@ __global__ void AxB_dot3_phase3_vsdn const int64_t B_hash_bits = (B->Y == NULL) ? 
0 : (B->Y->vdim - 1) ; #endif -// typedef cub::BlockReduce BlockReduce; -// __shared__ typename BlockReduce::TempStorage temp_storage; + uint64_t zc = 0 ; // zombie count -// if( threadIdx.x ==0) -// printf("thd:%d %d dots/thrd, nvec = %d blockDim=%d\n",threadIdx.x, sz, nvec, blockDim.x); -// __syncthreads(); + GB_M_NVALS (mnz) ; + ASSERT (GB_M_IS_SPARSE || GB_M_IS_HYPER) ; + int64_t cnz_in_bucket = end - start ; + int all_in_one = (cnz_in_bucket == mnz) ; - int64_t pair_id; - - int64_t zc = 0 ; - -// if (threadIdx.x ==0) -// printf("thd%u pi=%lld\n",tid, start+threadIdx.x); -// __syncthreads(); - - int all_in_one = ( (end - start) == (M->p)[(M->nvec)] ) ; - - for (int64_t kk = start +threadIdx.x +blockIdx.x*blockDim.x; - kk < end ; - kk += gridDim.x*blockDim.x ) + for (int64_t kk = start + threadIdx.x + blockIdx.x*blockDim.x ; + kk < end ; + kk += gridDim.x*blockDim.x) { + //---------------------------------------------------------------------- + // get the entry C(i,j) + //---------------------------------------------------------------------- + int64_t pair_id = all_in_one ? kk : Bucket[ kk ]; - int64_t i = Mi[pair_id]; // cols from mask + int64_t i = Mi [pair_id] ; - // FIXME: use another variable, not "k" here: - int64_t k = Ci[pair_id] >> 4; // vector of C encoded in phase1 + int64_t k = Ci [pair_id] >> 4; // vector of C encoded in phase1 // j = k or j = Mh [k] if C and M are hypersparse int64_t j = GBH_M (Mh, k) ; - // Prep row offsets for both A and B + //---------------------------------------------------------------------- + // get A(:,i) + //---------------------------------------------------------------------- - // find A(:,i) - int64_t pA, pA_end ; #if GB_A_IS_HYPER + int64_t pA, pA_end ; GB_hyper_hash_lookup (Ah, anvec, Ap, A_Yp, A_Yi, A_Yx, A_hash_bits, i, &pA, &pA_end) ; #elif GB_A_IS_SPARSE - pA = Ap[i] ; - pA_end = Ap[i+1] ; + int64_t pA = Ap[i] ; + int64_t pA_end = Ap[i+1] ; #else - // A is bitmap or full - pA = (A->vlen)*i; - pA_end = pA +(A->vlen); + // A is bitmap or full: only pA is needed + int64_t pA = avlen * i ; #endif - // find B(:,j) - int64_t pB, pB_end ; + //---------------------------------------------------------------------- + // get B(:,j) + //---------------------------------------------------------------------- + #if GB_B_IS_HYPER + int64_t pB, pB_end ; GB_hyper_hash_lookup (Bh, bnvec, Bp, B_Yp, B_Yi, B_Yx, B_hash_bits, j, &pB, &pB_end) ; #elif GB_B_IS_SPARSE - pB = Bp[j]; // col of C - pB_end = Bp[j+1]; + int64_t pB = Bp [j] ; + int64_t pB_end = Bp [j+1] ; #else - // B is bitmap or full - pB = (B->vlen)*j; - pB_end = pB +(B->vlen); + // B is bitmap or full: only pB is needed + int64_t pB = bvlen * j ; #endif + //---------------------------------------------------------------------- + // C(i,j) = A(:,i)'*B(:,j) + //---------------------------------------------------------------------- + GB_DECLAREA (aki) ; GB_DECLAREB (bkj) ; GB_DECLARE_IDENTITY (cij) ; // GB_Z_TYPE cij = identity bool cij_exists = false ; - int64_t my_nzombies = 0; + uint64_t my_nzombies = 0 ; #if ( GB_A_IS_FULL ) { @@ -204,7 +179,7 @@ __global__ void AxB_dot3_phase3_vsdn //-------------------------------------------------------------- cij_exists = true ; - for (int64_t p = pB ; p < pB_end ; ++p) + for (int64_t p = pB ; p < pB_end ; p++) { int64_t k = Bi [p] ; // next row index of B(:,j) // cij += A(k,i) * B(k,j) @@ -221,7 +196,7 @@ __global__ void AxB_dot3_phase3_vsdn // A is bitmap and B is sparse/hyper //------------------------------------------------------------------ - for 
(int64_t p = pB ; p < pB_end ; ++p) + for (int64_t p = pB ; p < pB_end ; p++) { int64_t k = Bi [p] ; // next row index of B(:,j) if (Ab [pA+k]) // check if A(k,i) exists @@ -243,7 +218,7 @@ __global__ void AxB_dot3_phase3_vsdn //-------------------------------------------------------------- cij_exists = true ; - for (int64_t p = pA ; p < pA_end ; ++p) + for (int64_t p = pA ; p < pA_end ; p++) { int64_t k = Ai [p] ; // next row index of A(:,i) // cij += A(k,i) * B(k,j) @@ -261,7 +236,7 @@ __global__ void AxB_dot3_phase3_vsdn // A is sparse/hyper and B is bitmap //------------------------------------------------------------------ - for (int64_t p = pA ; p < pA_end ; ++p) + for (int64_t p = pA ; p < pA_end ; p++) { int64_t k = Ai [p] ; // next row index of A(:,i) if (Bb [pB+k]) // check if B(k,j) exists @@ -274,10 +249,15 @@ __global__ void AxB_dot3_phase3_vsdn } #endif + //---------------------------------------------------------------------- + // save C(i,j) or declare it a zombie + //---------------------------------------------------------------------- + GB_CIJ_EXIST_POSTCHECK if (cij_exists) { - GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (T_C) cij + // Cx [pair_id] = (GB_C_TYPE) cij + GB_PUTC (cij, Cx, pair_id) ; Ci [pair_id] = i ; } else @@ -286,17 +266,17 @@ __global__ void AxB_dot3_phase3_vsdn Ci [pair_id] = GB_FLIP (i) ; } - // FIXME: use the same method as vsvs for counting zombies // sum up the zombie count: - thread_block_tile tile = tiled_partition( this_thread_block()); - zc += reduce_sum_int64(tile, my_nzombies); + thread_block_tile tile = + tiled_partition (this_thread_block ()) ; + zc += GB_cuda_warp_sum_uint64 (tile, my_nzombies) ; } - if(threadIdx.x == 0 && zc > 0) + if (threadIdx.x == 0 && zc > 0) { // this threadblock accumulates its zombie count into the global // zombie count - GB_cuda_atomic_add ( &(C->nzombies), zc) ; + GB_cuda_atomic_add ( &(C->nzombies), zc) ; } } diff --git a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh similarity index 53% rename from GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh rename to GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh index bcd0d4d25c..edf539634d 100644 --- a/GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh +++ b/GraphBLAS/CUDA/Template/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh @@ -2,6 +2,8 @@ // GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh //------------------------------------------------------------------------------ +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -13,144 +15,97 @@ // using a serial merge algorithm on the sparse vectors. // m = 256/sz, where sz is in {4, 16, 64, 256} // For a vector-pair, sz = xnz + ynz -// Template on // Parameters: // int64_t start <- start of vector pairs for this kernel // int64_t end <- end of vector pairs for this kernel // int64_t *Bucket <- array of pair indices for all kernels -// matrix *C <- result matrix -// matrix *M <- mask matrix -// matrix *A <- input matrix A -// matrix *B <- input matrix B -// int sz <- nnz of very sparse vectors +// C <- result matrix +// M <- mask matrix +// A <- input matrix A +// B <- input matrix B // Blocksize is 1024, uses warp and block reductions to count zombies produced. 
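The zombie counting mentioned here is a two-level sum: a shuffle reduction within each warp, then a shared-memory combine of the 32 per-warp partials, matching GB_cuda_warp_sum_uint64 and the GB_block_ReduceSum_uint64 defined below. A self-contained sketch under the stated 1024-thread blocksize (warp_sum here is an illustrative stand-in, not the library routine):

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__inline__ __device__ uint64_t warp_sum (uint64_t val)
{
    // each step halves the number of active lanes
    for (int offset = 16 ; offset > 0 ; offset >>= 1)
    {
        val += __shfl_down_sync (0xFFFFFFFF, val, offset) ;
    }
    return (val) ;      // only lane 0 holds the full warp sum
}

__global__ void block_sum (const uint64_t *in, uint64_t *out)
{
    __shared__ uint64_t partial [32] ;  // one slot per warp
    int lane = threadIdx.x & 31 ;
    int wid  = threadIdx.x >> 5 ;

    // level 1: each warp reduces its own 32 values
    uint64_t val = warp_sum (in [threadIdx.x]) ;
    if (lane == 0) partial [wid] = val ;
    __syncthreads ( ) ;

    // level 2: warp 0 reduces the per-warp partials
    val = (threadIdx.x < blockDim.x / 32) ? partial [lane] : 0 ;
    if (wid == 0) val = warp_sum (val) ;
    if (threadIdx.x == 0) *out = val ;
}

int main (void)
{
    uint64_t *in, *out ;
    cudaMallocManaged (&in, 1024 * sizeof (uint64_t)) ;
    cudaMallocManaged (&out, sizeof (uint64_t)) ;
    for (int t = 0 ; t < 1024 ; t++) in [t] = 1 ;
    block_sum <<<1, 1024>>> (in, out) ;
    cudaDeviceSynchronize ( ) ;
    printf ("sum = %llu (expected 1024)\n", (unsigned long long) *out) ;
    return (0) ;
}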
//******************************************************************************
-#pragma once
-#include
-#include
-#include
-#include
-#include
-#include "GB_cuda_kernel.h"
-#include "GB_mxm_shared_definitions.h"
-#include "GB_cuda_atomics.cuh"
-#include "GB_hash.h"
-#include "GB_hyper_hash_lookup.h"
-#include "GB_cuda_dot3_defn.h"
-
-using namespace cooperative_groups;
-
-//------------------------------------------------------------------------------
-// GB_warp_ReduceSumPlus_int64
-//------------------------------------------------------------------------------
-
-template< int tile_sz>
-__inline__ __device__
-int64_t GB_warp_ReduceSumPlus_int64( thread_block_tile<tile_sz> g, int64_t val)
-{
- // Each iteration halves the number of active threads
- // Each thread adds its partial sum[i] to sum[lane+i]
- /*
- #pragma unroll
- for (int i = tile_sz >> 1; i > 0; i >>= 1) {
- val += g.shfl_down( val, i);
- }
- */
- val += g.shfl_down( val, 16);
- val += g.shfl_down( val, 8);
- val += g.shfl_down( val, 4);
- val += g.shfl_down( val, 2);
- val += g.shfl_down( val, 1);
- return val; // note: only thread 0 will return full sum
-}
-
//------------------------------------------------------------------------------
-// GB_block_ReduceSum_int64
+// GB_block_ReduceSum_uint64
//------------------------------------------------------------------------------
-template
-__inline__ __device__
-int64_t GB_block_ReduceSum_int64(thread_block g, int64_t val)
+__inline__ __device__ uint64_t GB_block_ReduceSum_uint64
+(
+ thread_block g, // FIXME: g is used for thread_block_tile elsewhere;
+ // be consistent.
+ uint64_t val
+)
{
- static __shared__ int64_t shared[warpSize]; // Shared mem for 32 partial sums
+ // Shared mem for 32 partial sums
+ static __shared__ uint64_t shared [tile_sz] ;
- int lane = threadIdx.x & 31 ; // % warpSize;
- int wid = threadIdx.x >> 5 ; // / warpSize;
- thread_block_tile tile = tiled_partition( g );
+ // FIXME: assumes tile_sz is 32: (use an #if .. #else ... #endif)
+ int lane = threadIdx.x & 31 ; // % tile_sz;
+ int wid = threadIdx.x >> 5 ; // / tile_sz;
+ thread_block_tile<tile_sz> tile = tiled_partition<tile_sz> (g) ;
- // Each warp performs partial reduction
- val = GB_warp_ReduceSumPlus_int64( tile, val);
+ // Each warp performs partial reduction
+ val = GB_cuda_warp_sum_uint64 (tile, val) ;
- // Wait for all partial reductions
- if (lane==0) shared[wid]=val; // Write reduced value to shared memory
- g.sync(); // Wait for all partial reductions
+ // Wait for all partial reductions
+ if (lane == 0)
+ {
+ shared [wid] = val ; // Write reduced value to shared memory
+ }
- //if (wid > 0 ) return val;
+ g.sync(); // Wait for all partial reductions
- //read from shared memory only if that warp existed
- val = (threadIdx.x < (blockDim.x / warpSize ) ) ? shared[lane] : 0;
+ // read from shared memory only if that warp existed
+ val = (threadIdx.x < (blockDim.x / tile_sz ) ) ?
shared[lane] : 0; - // Final reduce within first warp - if (wid==0) val = GB_warp_ReduceSumPlus_int64( tile, val); + // Final reduce within first warp + if (wid == 0) + { + val = GB_cuda_warp_sum_uint64 (tile, val) ; + } - return val; + return (val) ; } //------------------------------------------------------------------------------ -// AxB_dot3_phase3_vsvs +// GB_cuda_AxB_dot3_phase3_vsvs_kernel //------------------------------------------------------------------------------ -template< - typename T_C, typename T_A, typename T_B, - typename T_Z, typename T_X, typename T_Y, uint64_t srcode> -__global__ void AxB_dot3_phase3_vsvs -( - int64_t start, - int64_t end, - int64_t *Bucket, // do the work in Bucket [start:end-1] - GrB_Matrix C, - GrB_Matrix M, - GrB_Matrix A, - GrB_Matrix B, - int sz // unused +__global__ void GB_cuda_AxB_dot3_phase3_vsvs_kernel +( + int64_t start, + int64_t end, + int64_t *Bucket, // do the work in Bucket [start:end-1] + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B ) { - // TODO: Figure out how to use graphblas-specific INFINITY macro - #ifndef INFINITY - #define INFINITY std::numeric_limits::max() + #if !GB_A_IS_PATTERN + const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *)A->x ; #endif - - int64_t dots = end - start; - // sz = expected non-zeros per dot -// /* -// int m = (gridDim.x*blockDim.x)*256/sz; -// int dpt = (nvecs+ gridDim.x*blockDim.x -1)/(gridDim.x*blockDim.x); -// m = dpt < m ? dpt : m; -// -// int dots = (nvecs +m -1)/m; -// */ - const T_A *__restrict__ Ax = (T_A *)A->x ; - const T_B *__restrict__ Bx = (T_B *)B->x ; - T_C *__restrict__ Cx = (T_C *)C->x ; + #if !GB_B_IS_PATTERN + const GB_B_TYPE *__restrict__ Bx = (GB_B_TYPE *)B->x ; + #endif + GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *)C->x ; int64_t *__restrict__ Ci = C->i ; const int64_t *__restrict__ Mi = M->i ; #if GB_M_IS_HYPER const int64_t *__restrict__ Mh = M->h ; #endif - #if GB_A_IS_HYPER || GB_A_IS_SPARSE + ASSERT (GB_A_IS_HYPER || GB_A_IS_SPARSE) ; const int64_t *__restrict__ Ai = A->i ; const int64_t *__restrict__ Ap = A->p ; - #endif - #if GB_B_IS_HYPER || GB_B_IS_SPARSE + ASSERT (GB_B_IS_HYPER || GB_B_IS_SPARSE) ; const int64_t *__restrict__ Bi = B->i ; const int64_t *__restrict__ Bp = B->p ; - #endif #if GB_A_IS_HYPER const int64_t anvec = A->nvec ; @@ -172,20 +127,14 @@ __global__ void AxB_dot3_phase3_vsvs const int64_t B_hash_bits = (B->Y == NULL) ? 0 : (B->Y->vdim - 1) ; #endif - //int64_t pfirst, plast; - - //GB_PARTITION (pfirst, plast, dots, blockIdx.x, gridDim.x ) ; - - int64_t my_nzombies = 0 ; + uint64_t my_nzombies = 0 ; - int all_in_one = ( (end - start) == (M->p)[(M->nvec)] ) ; + GB_M_NVALS (mnz) ; + int all_in_one = ( (end - start) == mnz ) ; - //for ( int64_t kk = pfirst+ threadIdx.x ; - // kk < plast; - // kk += blockDim.x ) - for ( int64_t kk = start+ threadIdx.x +blockDim.x*blockIdx.x ; - kk < end; - kk += blockDim.x*gridDim.x ) + for (int64_t kk = start + threadIdx.x + blockDim.x*blockIdx.x ; + kk < end ; + kk += blockDim.x*gridDim.x ) { int64_t pair_id = all_in_one ? 
kk : Bucket[ kk ]; @@ -201,8 +150,8 @@ __global__ void AxB_dot3_phase3_vsvs GB_hyper_hash_lookup (Ah, anvec, Ap, A_Yp, A_Yi, A_Yx, A_hash_bits, i, &pA, &pA_end) ; #else - pA = Ap[i] ; - pA_end = Ap[i+1] ; + pA = Ap [i] ; + pA_end = Ap [i+1] ; #endif // find B(:,j): B is always sparse or hypersparse @@ -211,8 +160,8 @@ __global__ void AxB_dot3_phase3_vsvs GB_hyper_hash_lookup (Bh, bnvec, Bp, B_Yp, B_Yi, B_Yx, B_hash_bits, j, &pB, &pB_end) ; #else - pB = Bp[j] ; - pB_end = Bp[j+1] ; + pB = Bp [j] ; + pB_end = Bp [j+1] ; #endif GB_DECLAREA (aki) ; @@ -243,7 +192,7 @@ __global__ void AxB_dot3_phase3_vsvs GB_CIJ_EXIST_POSTCHECK ; if (cij_exists) { - GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (T_C) cij + GB_PUTC (cij, Cx, pair_id) ; // Cx [pair_id] = (GB_C_TYPE) cij Ci [pair_id] = i ; } else @@ -257,12 +206,12 @@ __global__ void AxB_dot3_phase3_vsvs // FIXME: use this in spdn and vsdn: this_thread_block().sync(); - my_nzombies = GB_block_ReduceSum_int64<32>( this_thread_block(), my_nzombies); + my_nzombies = GB_block_ReduceSum_uint64 (this_thread_block(), my_nzombies) ; this_thread_block().sync(); if( threadIdx.x == 0 && my_nzombies > 0) { - GB_cuda_atomic_add ( &(C->nzombies), (uint64_t) my_nzombies) ; + GB_cuda_atomic_add ( &(C->nzombies), my_nzombies) ; } } diff --git a/GraphBLAS/CUDA/Template/GB_cuda_kernel.cuh b/GraphBLAS/CUDA/Template/GB_cuda_kernel.cuh new file mode 100644 index 0000000000..e37259530d --- /dev/null +++ b/GraphBLAS/CUDA/Template/GB_cuda_kernel.cuh @@ -0,0 +1,79 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/Template/GB_cuda_kernel.cuh: definitions for CUDA kernels +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// This file is #include'd into all device functions for CUDA JIT kernels for +// GraphBLAS. It provides a subset of GraphBLAS.h and GB.h, plus other +// definitions. It is not used on the host. 
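The zombie-count accumulation in the kernels above follows a common CUDA idiom: count privately per thread, combine within the warp (or block), and issue one atomic add per warp or block rather than one per thread. A self-contained sketch of the same idiom, using the cooperative-groups reduce that ships with CUDA 11 and later instead of GraphBLAS's own helpers (count_flagged, flag, and total are illustrative names, not part of GraphBLAS):

#include <cstdint>
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups ;

__global__ void count_flagged (const int *flag, int64_t n,
    unsigned long long *total)
{
    unsigned long long mine = 0 ;
    for (int64_t p = threadIdx.x + (int64_t) blockIdx.x * blockDim.x ;
         p < n ; p += (int64_t) gridDim.x * blockDim.x)
    {
        mine += (flag [p] != 0) ;   // each thread counts its own entries
    }
    // reduce across the 32-thread warp, then issue one atomic per warp
    auto tile = cg::tiled_partition<32> (cg::this_thread_block ()) ;
    unsigned long long warp_sum =
        cg::reduce (tile, mine, cg::plus<unsigned long long> ()) ;
    if (tile.thread_rank () == 0 && warp_sum > 0)
    {
        atomicAdd (total, warp_sum) ;
    }
}

Skipping the atomic when the partial sum is zero, as the kernels above also do for C->nzombies, avoids contention in the common case where a block produces no zombies.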
+ +#pragma once + +//------------------------------------------------------------------------------ +// C++ and CUDA #include files +//------------------------------------------------------------------------------ + +#include +#include +#include +#include +#include +#include +#include +using namespace cooperative_groups ; + +//------------------------------------------------------------------------------ +// CUDA kernel definitions +//------------------------------------------------------------------------------ + +#define GB_CUDA_KERNEL + +#undef ASSERT +#define ASSERT(x) + +//------------------------------------------------------------------------------ +// NVIDIA warp size +//------------------------------------------------------------------------------ + +#define WARPSIZE 32 +#define LOG2_WARPSIZE 5 + +//------------------------------------------------------------------------------ + +// for internal static inline functions +#undef GB_STATIC_INLINE +#define GB_STATIC_INLINE static __device__ __inline__ + +//------------------------------------------------------------------------------ +// subset of GraphBLAS.h +//------------------------------------------------------------------------------ + +#include "GraphBLAS_h_subset.cuh" + +//------------------------------------------------------------------------------ +// subset of GB.h +//------------------------------------------------------------------------------ + +#include "GB_h_subset.cuh" + +//------------------------------------------------------------------------------ +// final #include files +//------------------------------------------------------------------------------ + +#include "GB_cuda_error.hpp" +#include "GB_printf_kernels.h" +#include "GB_cuda_atomics.cuh" +#include "GB_hash.h" +#include "GB_hyper_hash_lookup.h" + +extern "C" +{ + #include "GB_werk.h" + #include "GB_callback.h" +} + diff --git a/GraphBLAS/CUDA/Template/GB_cuda_kernel.h b/GraphBLAS/CUDA/Template/GB_cuda_kernel.h deleted file mode 100644 index b4eb500a56..0000000000 --- a/GraphBLAS/CUDA/Template/GB_cuda_kernel.h +++ /dev/null @@ -1,263 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/Template/GB_cuda_kernel.h: definitions for CUDA kernels -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// This file is #include'd into all CUDA kernels for GraphBLAS. It provides -// a subset of GraphBLAS.h and GB.h, plus other definitions. - -// FIXME: rename to .cuh? 
- -#pragma once - -#define GB_CUDA_KERNEL - -#undef ASSERT -#define ASSERT(x) - -//------------------------------------------------------------------------------ -// TODO: this will be in the jit code: -#define chunksize 128 - -//------------------------------------------------------------------------------ -// NVIDIA warp size -//------------------------------------------------------------------------------ - -#define WARPSIZE 32 -#define LOG2_WARPSIZE 5 - -//------------------------------------------------------------------------------ - -#ifndef INFINITY -#define INFINITY (std::numeric_limits::max()) -#endif - -// for internal static inline functions -#undef GB_STATIC_INLINE -#define GB_STATIC_INLINE static __device__ __inline__ - -//------------------------------------------------------------------------------ -// subset of GraphBLAS.h -//------------------------------------------------------------------------------ - -#ifndef GRAPHBLAS_H -#define GRAPHBLAS_H - -#undef restrict -#undef GB_restrict -#define GB_restrict __restrict__ -#define restrict GB_restrict - -#include -//#include -#include -#include - -#undef GB_GLOBAL -#define GB_GLOBAL extern - -// GB_STR: convert the content of x into a string "x" -#define GB_XSTR(x) GB_STR(x) -#define GB_STR(x) #x - -#undef GxB_MAX_NAME_LEN -#define GxB_MAX_NAME_LEN 128 - -typedef uint64_t GrB_Index ; -typedef struct GB_Descriptor_opaque *GrB_Descriptor ; -typedef struct GB_Type_opaque *GrB_Type ; -typedef struct GB_UnaryOp_opaque *GrB_UnaryOp ; -typedef struct GB_BinaryOp_opaque *GrB_BinaryOp ; -typedef struct GB_IndexUnaryOp_opaque *GrB_IndexUnaryOp ; -typedef struct GB_Monoid_opaque *GrB_Monoid ; -typedef struct GB_Semiring_opaque *GrB_Semiring ; -typedef struct GB_Scalar_opaque *GrB_Scalar ; -typedef struct GB_Vector_opaque *GrB_Vector ; -typedef struct GB_Matrix_opaque *GrB_Matrix ; -typedef struct GB_Context_opaque *GxB_Context ; -typedef struct GB_Global_opaque *GrB_Global ; -typedef struct GB_Iterator_opaque *GxB_Iterator ; - -#define GxB_HYPERSPARSE 1 // store matrix in hypersparse form -#define GxB_SPARSE 2 // store matrix as sparse form (compressed vector) -#define GxB_BITMAP 4 // store matrix as a bitmap -#define GxB_FULL 8 // store matrix as full; all entries must be present - -typedef void (*GxB_unary_function) (void *, const void *) ; -typedef void (*GxB_binary_function) (void *, const void *, const void *) ; - -typedef bool (*GxB_select_function) // return true if A(i,j) is kept -( - GrB_Index i, // row index of A(i,j) - GrB_Index j, // column index of A(i,j) - const void *x, // value of A(i,j) - const void *thunk // optional input for select function -) ; - -typedef void (*GxB_index_unary_function) -( - void *z, // output value z, of type ztype - const void *x, // input value x of type xtype; value of v(i) or A(i,j) - GrB_Index i, // row index of A(i,j) - GrB_Index j, // column index of A(i,j), or zero for v(i) - const void *y // input scalar y -) ; - -#define GxB_GLOBAL_GPU_ID 26 - -typedef enum -{ - // for all GrB_Descriptor fields: - GxB_DEFAULT = 0, // default behavior of the method - - // for GrB_OUTP only: - GrB_REPLACE = 1, // clear the output before assigning new values to it - - // for GrB_MASK only: - GrB_COMP = 2, // use the structural complement of the input - GrB_SCMP = 2, // same as GrB_COMP (historical; use GrB_COMP instead) - GrB_STRUCTURE = 4, // use the only pattern of the mask, not its values - - // for GrB_INP0 and GrB_INP1 only: - GrB_TRAN = 3, // use the transpose of the input - - // for GxB_AxB_METHOD 
only: - GxB_AxB_GUSTAVSON = 1001, // gather-scatter saxpy method - GxB_AxB_DOT = 1003, // dot product - GxB_AxB_HASH = 1004, // hash-based saxpy method - GxB_AxB_SAXPY = 1005 // saxpy method (any kind) -} -GrB_Desc_Value ; - -#endif - -//------------------------------------------------------------------------------ -// subset of GB.h -//------------------------------------------------------------------------------ - -//#include GB_iceil.h -#define GB_ICEIL(a,b) (((a) + (b) - 1) / (b)) -//#include GB_imin.h -#define GB_IMAX(x,y) (((x) > (y)) ? (x) : (y)) -#define GB_IMIN(x,y) (((x) < (y)) ? (x) : (y)) -//#include GB_zombie.h -#define GB_FLIP(i) (-(i)-2) -#define GB_IS_FLIPPED(i) ((i) < 0) -#define GB_IS_ZOMBIE(i) ((i) < 0) -#define GB_IS_NOT_FLIPPED(i) ((i) >= 0) -#define GB_UNFLIP(i) (((i) < 0) ? GB_FLIP(i) : (i)) -#define GBI_UNFLIP(Ai,p,avlen) \ - ((Ai == NULL) ? ((p) % (avlen)) : GB_UNFLIP (Ai [p])) - -#include "GB_index.h" -#include "GB_partition.h" -#include "GB_pun.h" -#include "GB_opaque.h" -#include "GB_int64_mult.h" -#define GB_HAS_CMPLX_MACROS 1 -#include "GB_complex.h" - -// version for the GPU, with fewer branches -#define GB_TRIM_BINARY_SEARCH(i,X,pleft,pright) \ -{ \ - /* binary search of X [pleft ... pright] for integer i */ \ - while (pleft < pright) \ - { \ - int64_t pmiddle = (pleft + pright) >> 1 ; \ - bool less = (X [pmiddle] < i) ; \ - pleft = less ? (pmiddle+1) : pleft ; \ - pright = less ? pright : pmiddle ; \ - } \ - /* binary search is narrowed down to a single item */ \ - /* or it has found the list is empty */ \ - ASSERT (pleft == pright || pleft == pright + 1) ; \ -} - -#define GB_BINARY_SEARCH(i,X,pleft,pright,found) \ -{ \ - GB_TRIM_BINARY_SEARCH (i, X, pleft, pright) ; \ - found = (pleft == pright && X [pleft] == i) ; \ -} - -#define GB_SPLIT_BINARY_SEARCH(i,X,pleft,pright,found) \ -{ \ - GB_BINARY_SEARCH (i, X, pleft, pright, found) \ - if (!found && (pleft == pright)) \ - { \ - if (i > X [pleft]) \ - { \ - pleft++ ; \ - } \ - else \ - { \ - pright++ ; \ - } \ - } \ -} - -static __device__ __inline__ int64_t GB_search_for_vector_device -( - const int64_t p, // search for vector k that contains p - const int64_t *restrict Ap, // vector pointers to search - int64_t kleft, // left-most k to search - int64_t anvec, // Ap is of size anvec+1 - int64_t avlen // A->vlen -) -{ - - //-------------------------------------------------------------------------- - // check inputs - //-------------------------------------------------------------------------- - - if (Ap == NULL) - { - // A is full or bitmap - ASSERT (p >= 0 && p < avlen * anvec) ; - return ((avlen == 0) ? 0 : (p / avlen)) ; - } - - // A is sparse - ASSERT (p >= 0 && p < Ap [anvec]) ; - - //-------------------------------------------------------------------------- - // search for k - //-------------------------------------------------------------------------- - - int64_t k = kleft ; - int64_t kright = anvec ; - bool found ; - GB_SPLIT_BINARY_SEARCH (p, Ap, k, kright, found) ; - if (found) - { - // Ap [k] == p has been found, but if k is an empty vector, then the - // next vector will also contain the entry p. In that case, k needs to - // be incremented until finding the first non-empty vector for which - // Ap [k] == p. - ASSERT (Ap [k] == p) ; - while (k < anvec-1 && Ap [k+1] == p) - { - k++ ; - } - } - else - { - // p has not been found in Ap, so it appears in the middle of Ap [k-1] - // ... Ap [k], as computed by the binary search. 
This is the range of
- entries for the vector k-1, so k must be decremented.
- k-- ;
- }
-
- //--------------------------------------------------------------------------
- // return result
- //--------------------------------------------------------------------------
-
- // The entry p must reside in a non-empty vector.
- ASSERT (k >= 0 && k < anvec) ;
- ASSERT (Ap [k] <= p && p < Ap [k+1]) ;
-
- return (k) ;
-}
-
diff --git a/GraphBLAS/CUDA/Template/GB_cuda_shfl_down.cuh b/GraphBLAS/CUDA/Template/GB_cuda_shfl_down.cuh
new file mode 100644
index 0000000000..13b5c505f0
--- /dev/null
+++ b/GraphBLAS/CUDA/Template/GB_cuda_shfl_down.cuh
@@ -0,0 +1,384 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/Template/GB_cuda_shfl_down.cuh: warp-level reductions
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+//------------------------------------------------------------------------------
+
+// shfl_down is a method in the cooperative_groups namespace. It allows all
+// threads in a warp (or other thread partition) to work together in a
+// cooperative fashion.
+//
+// Suppose we have a tile that defines a single warp of 32 threads:
+//
+// #define tile_sz 32
+// thread_block_tile<tile_sz> tile =
+// tiled_partition<tile_sz> (this_thread_block()) ;
+//
+// Suppose each thread has two scalars dest and src of type T. Then:
+//
+// T dest, src ;
+// dest = tile.shfl_down (src, delta) ;
+//
+// performs the following computation for each thread i:
+//
+// if (i+delta < tile_sz)
+// {
+// dest = (the value of src on thread i+delta)
+// }
+//
+// where i ranges from 0 to tile_sz-1, and tile_sz is the size of the tile
+// (given by tile.num_threads() and by the #define'd value tile_sz; here the
+// warp size of 32). If i+delta >= tile_sz for the ith thread, then nothing
+// happens for that thread; it is inactive in the shuffle.
+//
+// Restrictions: tile_sz must be a power of 2, and it must be 32 or less for
+// tile.shfl_down(). The type T must be trivially copyable (that is,
+// is_trivially_copyable<T>::value must be true), and sizeof (T) <= 32 must
+// hold (that is, the size of T must be 32 bytes or less). Types larger than
+// 32 bytes are handled by GB_cuda_shfl_down_large_ztype, which uses repeated
+// calls to tile.shfl_down on 32-byte chunks.
+
+// FIXME for tile.shfl_down(...), delta is an int, so can it be negative?
+// For the __shfl_down warp shuffle function, delta is an unsigned int.
+
+//------------------------------------------------------------------------------
+// GB_cuda_warp_sum_uint64: reduce a uint64_t value across a single warp
+//------------------------------------------------------------------------------
+
+// On input, each thread in the tile holds a single uint64_t value. On output,
+// thread zero holds the sum of the values from all threads in the tile.
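The halving pattern that GB_cuda_warp_sum_uint64 hard-codes for tile_sz == 32 can also be written with the raw warp intrinsic: after the offset-16 step, each lane i < 16 holds v[i] + v[i+16]; after the offset-8 step, the sum of four lanes; and so on until lane 0 holds the sum of all 32. A minimal sketch assuming a full, converged 32-thread warp (illustrative only; this file uses cooperative groups instead):

#include <cstdint>

__device__ __forceinline__ uint64_t warp_sum_sketch (uint64_t value)
{
    for (int offset = 16 ; offset > 0 ; offset >>= 1)
    {
        // add the value held by the lane 'offset' positions higher
        value += __shfl_down_sync (0xFFFFFFFFu, value, offset) ;
    }
    return (value) ;    // only lane 0 holds the full 32-lane sum
}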
+
+__device__ __inline__ uint64_t GB_cuda_warp_sum_uint64
+(
+ thread_block_tile<tile_sz> tile,
+ uint64_t value
+)
+{
+
+ //--------------------------------------------------------------------------
+ // sum value on all threads to a single value
+ //--------------------------------------------------------------------------
+
+ #if (tile_sz == 32)
+ {
+ // this is the typical case
+ value += tile.shfl_down (value, 16) ;
+ value += tile.shfl_down (value, 8) ;
+ value += tile.shfl_down (value, 4) ;
+ value += tile.shfl_down (value, 2) ;
+ value += tile.shfl_down (value, 1) ;
+ }
+ #else
+ {
+ #pragma unroll
+ for (int i = tile_sz >> 1 ; i > 0 ; i >>= 1)
+ {
+ value += tile.shfl_down (value, i) ;
+ }
+ }
+ #endif
+
+ //--------------------------------------------------------------------------
+ // return result
+ //--------------------------------------------------------------------------
+
+ // Note that only thread 0 will have the full summation of all values in
+ // the tile. To broadcast it to all threads, use the following:
+
+ // value = tile.shfl (value, 0) ;
+
+ return (value) ;
+}
+
+#if 0
+
+//------------------------------------------------------------------------------
+// warp_ReduceSumPlus_uint64: for dot3_phase2
+//------------------------------------------------------------------------------
+
+__inline__ __device__ uint64_t warp_ReduceSumPlus_uint64
+(
+ thread_block_tile<tile_sz> tile,
+ uint64_t val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial sum[i] to sum[lane+i]
+ for (int i = tile.num_threads() / 2; i > 0; i /= 2)
+ {
+ val += tile.shfl_down (val, i) ;
+ }
+ return val; // note: only thread 0 will return full sum
+}
+
+//------------------------------------------------------------------------------
+// GB_warp_ReduceSumPlus_uint64_vsvs: for vsvs kernel
+//------------------------------------------------------------------------------
+
+__inline__ __device__ uint64_t GB_warp_ReduceSumPlus_uint64_vsvs
+(
+ thread_block_tile<tile_sz> g,
+ uint64_t val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial sum[i] to sum[lane+i]
+ /*
+ #pragma unroll
+ for (int i = tile_sz >> 1; i > 0; i >>= 1) {
+ val += g.shfl_down( val, i);
+ }
+ */
+ // assuming tile_sz is 32:
+ val += g.shfl_down( val, 16);
+ val += g.shfl_down( val, 8);
+ val += g.shfl_down( val, 4);
+ val += g.shfl_down( val, 2);
+ val += g.shfl_down( val, 1);
+ return val; // note: only thread 0 will return full sum
+}
+
+//------------------------------------------------------------------------------
+// reduce_sum_int64: for vsdn
+//------------------------------------------------------------------------------
+
+// for counting zombies only (always int64_t)
+__device__ int64_t reduce_sum_int64
+(
+ thread_block_tile<tile_sz> g,
+ int64_t val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial sum[i] to sum[lane+i]
+ for (int64_t i = g.num_threads() / 2; i > 0; i /= 2)
+ {
+ val += g.shfl_down(val,i) ;
+ }
+ return val; // note: only thread 0 will return full sum
+}
+
+#endif
+
+//------------------------------------------------------------------------------
+// GB_cuda_shfl_down_large_ztype: shfl_down a type larger than 32 bytes
+//------------------------------------------------------------------------------
+
+// This returns result = tile.shfl_down (value, delta), where value has type
+// GB_Z_TYPE, and sizeof (GB_Z_TYPE) > 32.
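As a worked example of the decomposition described here, suppose a hypothetical GB_Z_TYPE of 72 bytes: then GB_Z_NCHUNKS = 72/32 = 2 full 32-byte chunks, GB_Z_LEFTOVER = 72 - 2*32 = 8 bytes, and one logical shfl_down of such a value issues three hardware shuffles (two 32-byte chunks plus one 8-byte leftover struct). The same arithmetic, restated as a compile-time check (the _EXAMPLE names are illustrative, not part of this file):

// illustrative only: mirrors the arithmetic used below, for a 72-byte ztype
#define ZSIZE_EXAMPLE 72
#define NCHUNKS_EXAMPLE (ZSIZE_EXAMPLE / 32)                    // == 2
#define LEFTOVER_EXAMPLE (ZSIZE_EXAMPLE - NCHUNKS_EXAMPLE*32)   // == 8
static_assert (NCHUNKS_EXAMPLE == 2 && LEFTOVER_EXAMPLE == 8,
    "72 bytes = 2 chunks of 32 bytes + 8 leftover bytes") ;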
+
+#if ( GB_Z_SIZE > 32 )
+
+ // # of 32-byte chunks to hold a single GB_Z_TYPE, excluding leftover
+ // chunk; GB_Z_SIZE is sizeof (GB_Z_TYPE) as a hard-coded constant.
+ #define GB_Z_NCHUNKS ( GB_Z_SIZE / 32 )
+
+ // ztype_chunk is always 32 bytes in size
+ typedef struct { uint8_t bytes [32] ; } ztype_chunk ;
+
+ // size of the single leftover chunk of size 0 to < 32 bytes
+ #define GB_Z_LEFTOVER ( GB_Z_SIZE - ( GB_Z_NCHUNKS * 32 ) )
+
+ #if ( GB_Z_LEFTOVER > 0 )
+ // leftover chunk is not defined if GB_Z_SIZE is a multiple of 32
+ typedef struct { uint8_t bytes [GB_Z_LEFTOVER] ; } ztype_leftover ;
+ #endif
+
+ __device__ __inline__ void GB_cuda_shfl_down_large_ztype
+ (
+ GB_Z_TYPE *result,
+ thread_block_tile<tile_sz> tile,
+ GB_Z_TYPE *value,
+ int delta
+ )
+ {
+
+ // get pointers to value and result, as chunks of size 32 bytes
+ struct ztype_chunk *v = (struct ztype_chunk *) value ;
+ struct ztype_chunk *r = (struct ztype_chunk *) result ;
+
+ // shfl_down value into result, one chunk at a time
+ #pragma unroll
+ for (int chunk = 0 ; chunk < GB_Z_NCHUNKS ; chunk++, r++, v++)
+ {
+ (*r) = tile.shfl_down (*v, delta) ;
+ }
+
+ #if ( GB_Z_LEFTOVER > 0 )
+ // handle the leftover chunk, if it has nonzero size
+ struct ztype_leftover *v_leftover = (struct ztype_leftover *) v ;
+ struct ztype_leftover *r_leftover = (struct ztype_leftover *) r ;
+ (*r_leftover) = tile.shfl_down (*v_leftover, delta) ;
+ #endif
+ }
+
+#endif
+
+//------------------------------------------------------------------------------
+// GB_cuda_warp_reduce_ztype: reduce a ztype to a scalar, on a single warp
+//------------------------------------------------------------------------------
+
+// FIXME: make value parameter *value, and return type void?
+
+__device__ __inline__ GB_Z_TYPE GB_cuda_warp_reduce_ztype
+(
+ thread_block_tile<tile_sz> tile,
+ GB_Z_TYPE value
+)
+{
+
+ #if ( GB_Z_SIZE <= 32 )
+ {
+
+ //----------------------------------------------------------------------
+ // the GB_Z_TYPE reduction can be done with a single shfl_down
+ //----------------------------------------------------------------------
+
+ #if ( tile_sz == 32 )
+ {
+ // this is the typical case
+ GB_Z_TYPE next ;
+ next = tile.shfl_down (value, 16) ;
+ GB_ADD (value, value, next) ;
+ next = tile.shfl_down (value, 8) ;
+ GB_ADD (value, value, next) ;
+ next = tile.shfl_down (value, 4) ;
+ GB_ADD (value, value, next) ;
+ next = tile.shfl_down (value, 2) ;
+ GB_ADD (value, value, next) ;
+ next = tile.shfl_down (value, 1) ;
+ GB_ADD (value, value, next) ;
+ }
+ #else
+ {
+
+ #pragma unroll
+ for (int i = tile_sz >> 1 ; i > 0 ; i >>= 1)
+ {
+ GB_Z_TYPE next = tile.shfl_down (value, i) ;
+ GB_ADD (value, value, next) ;
+ }
+
+ }
+ #endif
+ }
+ #else
+ {
+
+ //----------------------------------------------------------------------
+ // sizeof (GB_Z_TYPE) is too large for a single shfl_down
+ //----------------------------------------------------------------------
+
+ #pragma unroll
+ for (int i = tile_sz >> 1 ; i > 0 ; i >>= 1)
+ {
+ GB_Z_TYPE next ;
+ GB_cuda_shfl_down_large_ztype (&next, tile, &value, i) ;
+ GB_ADD (value, value, next) ;
+ }
+ }
+ #endif
+
+ //--------------------------------------------------------------------------
+ // return result
+ //--------------------------------------------------------------------------
+
+ // Note that only thread 0 will have the full summation of all values in
+ // the tile.
To broadcast it to all threads, use the following:
+
+ // value = tile.shfl (value, 0) ;
+
+ // or if the ztype is large:
+ // GB_cuda_shfl_down_large_ztype (&value, tile, &value, 0) ;
+
+ return (value) ;
+}
+
+#if 0
+
+//------------------------------------------------------------------------------
+// warp_ReduceSum_dndn: for dndn kernel
+//------------------------------------------------------------------------------
+
+__inline__ __device__ GB_Z_TYPE warp_ReduceSum_dndn
+(
+ thread_block_tile<32> g,
+ GB_Z_TYPE val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial sum[i] to sum[lane+i]
+ // FIXME: only works if sizeof(GB_Z_TYPE) <= 32 bytes
+ // FIXME: the ANY monoid needs the cij_exists for each thread
+ for (int i = g.num_threads() / 2; i > 0; i /= 2)
+ {
+ GB_Z_TYPE next = g.shfl_down( val, i) ;
+ GB_ADD( val, val, next );
+ }
+ return val; // note: only thread 0 will return full sum
+}
+
+//------------------------------------------------------------------------------
+// GB_reduce_sum: for dot3 mp and spdn
+//------------------------------------------------------------------------------
+
+__device__ __inline__ GB_Z_TYPE GB_reduce_sum
+(
+ thread_block_tile<tile_sz> g,
+ GB_Z_TYPE val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial sum[i] to sum[lane+i]
+ // Temporary GB_Z_TYPE is necessary to handle arbitrary ops
+ // FIXME: only works if sizeof(GB_Z_TYPE) <= 32 bytes
+ // FIXME: the ANY monoid needs the cij_exists for each thread
+ #pragma unroll
+ for (int i = tile_sz >> 1 ; i > 0 ; i >>= 1)
+ {
+ GB_Z_TYPE next = g.shfl_down (val, i) ;
+ GB_ADD (val, val, next) ;
+ }
+ return val;
+}
+
+//------------------------------------------------------------------------------
+// GB_warp_Reduce: for cuda_reduce
+//------------------------------------------------------------------------------
+
+__device__ __inline__ GB_Z_TYPE GB_warp_Reduce
+(
+ thread_block_tile<tile_sz> g,
+ GB_Z_TYPE val
+)
+{
+ // Each iteration halves the number of active threads
+ // Each thread adds its partial val[k] to val[lane+k]
+
+ // FIXME: doesn't work unless sizeof(GB_Z_TYPE) <= 32 bytes
+
+#if ( GB_Z_SIZE <= 32 )
+ // assumes tile_sz is 32:
+ GB_Z_TYPE fold = g.shfl_down ( val, 16) ;
+ GB_ADD ( val, val, fold ) ;
+ fold = g.shfl_down ( val, 8) ;
+ GB_ADD ( val, val, fold ) ;
+ fold = g.shfl_down ( val, 4) ;
+ GB_ADD ( val, val, fold ) ;
+ fold = g.shfl_down ( val, 2) ;
+ GB_ADD ( val, val, fold ) ;
+ fold = g.shfl_down ( val, 1) ;
+ GB_ADD ( val, val, fold ) ;
#else
+ // use shared memory and do not use shfl_down?
+ // or use repeated calls to shfl_down, on chunks of 32 bytes each?
+ #error "not implemented yet"
+#endif
+
+ return (val) ; // note: only thread 0 will return full val
+}
+#endif
diff --git a/GraphBLAS/CUDA/Template/GB_cuda_timer.hpp b/GraphBLAS/CUDA/Template/GB_cuda_timer.hpp
new file mode 100644
index 0000000000..12a6e87d6b
--- /dev/null
+++ b/GraphBLAS/CUDA/Template/GB_cuda_timer.hpp
@@ -0,0 +1,52 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/Template/GB_cuda_timer.hpp
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
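A usage sketch for the GpuTimer that this new file defines, timing a hypothetical kernel launch (some_kernel is illustrative); Elapsed() synchronizes on the stop event and returns milliseconds:

// GpuTimer timer ;
// timer.Start () ;
// some_kernel <<< grid, block >>> (args) ;     // hypothetical kernel
// timer.Stop () ;
// printf ("kernel time: %g ms\n", timer.Elapsed ()) ;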
+// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#ifndef GB_CUDA_TIMER_HPP +#define GB_CUDA_TIMER_HPP + +#include +struct GpuTimer +{ + cudaEvent_t start; + cudaEvent_t stop; + + GpuTimer() + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + } + + ~GpuTimer() + { + cudaEventDestroy(start); + cudaEventDestroy(stop); + } + + void Start() + { + cudaEventRecord(start, 0); + } + + void Stop() + { + cudaEventRecord(stop, 0); + } + + float Elapsed() + { + float elapsed; + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed, start, stop); + return elapsed; + } +} ; + +#endif + diff --git a/GraphBLAS/CUDA/Template/GB_h_subset.cuh b/GraphBLAS/CUDA/Template/GB_h_subset.cuh new file mode 100644 index 0000000000..f371da6041 --- /dev/null +++ b/GraphBLAS/CUDA/Template/GB_h_subset.cuh @@ -0,0 +1,77 @@ +//------------------------------------------------------------------------------ +// GraphBLAS/CUDA/Template/GB_h_subset.cuh: subset of GB.h +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// Note the header guard is the same as GB.h: +#ifndef GB_H +#define GB_H + +// from GB_iceil.h: +#define GB_ICEIL(a,b) (((a) + (b) - 1) / (b)) +// from GB_imin.h: +#define GB_IMAX(x,y) (((x) > (y)) ? (x) : (y)) +#define GB_IMIN(x,y) (((x) < (y)) ? (x) : (y)) +// from GB_zombie.h: +#define GB_FLIP(i) (-(i)-2) +#define GB_IS_FLIPPED(i) ((i) < 0) +#define GB_IS_ZOMBIE(i) ((i) < 0) +#define GB_IS_NOT_FLIPPED(i) ((i) >= 0) +#define GB_UNFLIP(i) (((i) < 0) ? GB_FLIP(i) : (i)) +#define GBI_UNFLIP(Ai,p,avlen) \ + ((Ai == NULL) ? ((p) % (avlen)) : GB_UNFLIP (Ai [p])) + +#include "GB_index.h" +#include "GB_partition.h" +#include "GB_pun.h" +#include "GB_opaque.h" +#include "GB_int64_mult.h" +#define GB_HAS_CMPLX_MACROS 1 +#include "GB_complex.h" +#include "GB_memory_macros.h" + +// version for the GPU, with fewer branches +#define GB_TRIM_BINARY_SEARCH(i,X,pleft,pright) \ +{ \ + /* binary search of X [pleft ... pright] for integer i */ \ + while (pleft < pright) \ + { \ + int64_t pmiddle = (pleft + pright) >> 1 ; \ + bool less = (X [pmiddle] < i) ; \ + pleft = less ? (pmiddle+1) : pleft ; \ + pright = less ? 
pright : pmiddle ; \
} \
/* binary search is narrowed down to a single item */ \
/* or it has found the list is empty */ \
ASSERT (pleft == pright || pleft == pright + 1) ; \
+}
+
+#define GB_BINARY_SEARCH(i,X,pleft,pright,found) \
+{ \
+ GB_TRIM_BINARY_SEARCH (i, X, pleft, pright) ; \
+ found = (pleft == pright && X [pleft] == i) ; \
+}
+
+#define GB_SPLIT_BINARY_SEARCH(i,X,pleft,pright,found) \
+{ \
+ GB_BINARY_SEARCH (i, X, pleft, pright, found) \
+ if (!found && (pleft == pright)) \
+ { \
+ if (i > X [pleft]) \
+ { \
+ pleft++ ; \
+ } \
+ else \
+ { \
+ pright++ ; \
+ } \
+ } \
+}
+
+
+#endif
+
diff --git a/GraphBLAS/CUDA/Template/GraphBLAS_h_subset.cuh b/GraphBLAS/CUDA/Template/GraphBLAS_h_subset.cuh
new file mode 100644
index 0000000000..53085666b5
--- /dev/null
+++ b/GraphBLAS/CUDA/Template/GraphBLAS_h_subset.cuh
@@ -0,0 +1,135 @@
+//------------------------------------------------------------------------------
+// GraphBLAS/CUDA/Template/GraphBLAS_h_subset.cuh: subset of GraphBLAS.h
+//------------------------------------------------------------------------------
+
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+// Note the header guard is the same as GraphBLAS.h:
+#ifndef GRAPHBLAS_H
+#define GRAPHBLAS_H
+
+typedef enum
+{
+
+ GrB_SUCCESS = 0, // all is well
+
+ //--------------------------------------------------------------------------
+ // informational codes, not an error:
+ //--------------------------------------------------------------------------
+
+ GrB_NO_VALUE = 1, // A(i,j) requested but not there
+ GxB_EXHAUSTED = 7089, // iterator is exhausted
+
+ //--------------------------------------------------------------------------
+ // errors:
+ //--------------------------------------------------------------------------
+
+ GrB_UNINITIALIZED_OBJECT = -1, // object has not been initialized
+ GrB_NULL_POINTER = -2, // input pointer is NULL
+ GrB_INVALID_VALUE = -3, // generic error; some value is bad
+ GrB_INVALID_INDEX = -4, // row or column index is out of bounds
+ GrB_DOMAIN_MISMATCH = -5, // object domains are not compatible
+ GrB_DIMENSION_MISMATCH = -6, // matrix dimensions do not match
+ GrB_OUTPUT_NOT_EMPTY = -7, // output matrix already has values
+ GrB_NOT_IMPLEMENTED = -8, // method not implemented
+ GrB_ALREADY_SET = -9, // field already written to
+ GrB_PANIC = -101, // unknown error
+ GrB_OUT_OF_MEMORY = -102, // out of memory
+ GrB_INSUFFICIENT_SPACE = -103, // output array not large enough
+ GrB_INVALID_OBJECT = -104, // object is corrupted
+ GrB_INDEX_OUT_OF_BOUNDS = -105, // row or col index out of bounds
+ GrB_EMPTY_OBJECT = -106 // an object does not contain a value
+
+}
+GrB_Info ;
+
+#undef restrict
+#undef GB_restrict
+#define GB_restrict __restrict__
+#define restrict GB_restrict
+
+#include
+#include
+#include
+
+#undef GB_GLOBAL
+#define GB_GLOBAL extern
+
+// GB_STR: convert the content of x into a string "x"
+#define GB_XSTR(x) GB_STR(x)
+#define GB_STR(x) #x
+
+#undef GxB_MAX_NAME_LEN
+#define GxB_MAX_NAME_LEN 128
+
+typedef uint64_t GrB_Index ;
+typedef struct GB_Descriptor_opaque *GrB_Descriptor ;
+typedef struct GB_Type_opaque *GrB_Type ;
+typedef struct GB_UnaryOp_opaque *GrB_UnaryOp ;
+typedef struct GB_BinaryOp_opaque *GrB_BinaryOp ;
+typedef struct GB_IndexUnaryOp_opaque *GrB_IndexUnaryOp ;
+typedef struct GB_Monoid_opaque *GrB_Monoid ;
+typedef struct GB_Semiring_opaque
*GrB_Semiring ; +typedef struct GB_Scalar_opaque *GrB_Scalar ; +typedef struct GB_Vector_opaque *GrB_Vector ; +typedef struct GB_Matrix_opaque *GrB_Matrix ; +typedef struct GB_Context_opaque *GxB_Context ; +typedef struct GB_Global_opaque *GrB_Global ; +typedef struct GB_Iterator_opaque *GxB_Iterator ; + +#define GxB_HYPERSPARSE 1 // store matrix in hypersparse form +#define GxB_SPARSE 2 // store matrix as sparse form (compressed vector) +#define GxB_BITMAP 4 // store matrix as a bitmap +#define GxB_FULL 8 // store matrix as full; all entries must be present + +typedef void (*GxB_unary_function) (void *, const void *) ; +typedef void (*GxB_binary_function) (void *, const void *, const void *) ; + +typedef bool (*GxB_select_function) // return true if A(i,j) is kept +( + GrB_Index i, // row index of A(i,j) + GrB_Index j, // column index of A(i,j) + const void *x, // value of A(i,j) + const void *thunk // optional input for select function +) ; + +typedef void (*GxB_index_unary_function) +( + void *z, // output value z, of type ztype + const void *x, // input value x of type xtype; value of v(i) or A(i,j) + GrB_Index i, // row index of A(i,j) + GrB_Index j, // column index of A(i,j), or zero for v(i) + const void *y // input scalar y +) ; + +#define GxB_GLOBAL_GPU_ID 26 + +typedef enum +{ + // for all GrB_Descriptor fields: + GxB_DEFAULT = 0, // default behavior of the method + + // for GrB_OUTP only: + GrB_REPLACE = 1, // clear the output before assigning new values to it + + // for GrB_MASK only: + GrB_COMP = 2, // use the structural complement of the input + GrB_SCMP = 2, // same as GrB_COMP (historical; use GrB_COMP instead) + GrB_STRUCTURE = 4, // use the only pattern of the mask, not its values + + // for GrB_INP0 and GrB_INP1 only: + GrB_TRAN = 3, // use the transpose of the input + + // for GxB_AxB_METHOD only: + GxB_AxB_GUSTAVSON = 1001, // gather-scatter saxpy method + GxB_AxB_DOT = 1003, // dot product + GxB_AxB_HASH = 1004, // hash-based saxpy method + GxB_AxB_SAXPY = 1005 // saxpy method (any kind) +} +GrB_Desc_Value ; + +#endif + diff --git a/GraphBLAS/CUDA/go b/GraphBLAS/CUDA/go deleted file mode 100755 index 74b4fa0787..0000000000 --- a/GraphBLAS/CUDA/go +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# nuke the cached kernels and src -find ~/.SuiteSparse/GrB9.0.1 -mindepth 1 -delete - -# rebuild the JITpackage -( cd ../JITpackage ; make purge ; make ) - -# rebuild GraphBLAS -( cd .. ; make ) - -# run a demo -../build/wathen_demo - diff --git a/GraphBLAS/CUDA/jitify.hpp b/GraphBLAS/CUDA/jitify.hpp deleted file mode 100644 index 4dc3a9b9b6..0000000000 --- a/GraphBLAS/CUDA/jitify.hpp +++ /dev/null @@ -1,4196 +0,0 @@ -/* - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * SPDX-License-Identifier: BSD-3-Clause - */ - -/* - ----------- - Jitify 0.9 - ----------- - A C++ library for easy integration of CUDA runtime compilation into - existing codes. - - -------------- - How to compile - -------------- - Compiler dependencies: , -std=c++11 - Linker dependencies: dl cuda nvrtc - - -------------------------------------- - Embedding source files into executable - -------------------------------------- - g++ ... -ldl -rdynamic -DJITIFY_ENABLE_EMBEDDED_FILES=1 - -Wl,-b,binary,my_kernel.cu,include/my_header.cuh,-b,default nvcc ... -ldl - -Xcompiler "-rdynamic - -Wl\,-b\,binary\,my_kernel.cu\,include/my_header.cuh\,-b\,default" - JITIFY_INCLUDE_EMBEDDED_FILE(my_kernel_cu); - JITIFY_INCLUDE_EMBEDDED_FILE(include_my_header_cuh); - - ---- - TODO - ---- - Extract valid compile options and pass the rest to cuModuleLoadDataEx - See if can have stringified headers automatically looked-up - by having stringify add them to a (static) global map. - The global map can be updated by creating a static class instance - whose constructor performs the registration. - Can then remove all headers from JitCache constructor in example code - See other TODOs in code -*/ - -/*! \file jitify.hpp - * \brief The Jitify library header - */ - -/*! \mainpage Jitify - A C++ library that simplifies the use of NVRTC - * \p Use class jitify::JitCache to manage and launch JIT-compiled CUDA - * kernels. - * - * \p Use namespace jitify::reflection to reflect types and values into - * code-strings. - * - * \p Use JITIFY_INCLUDE_EMBEDDED_FILE() to declare files that have been - * embedded into the executable using the GCC linker. - * - * \p Use jitify::parallel_for and JITIFY_LAMBDA() to generate and launch - * simple kernels. - */ - -#pragma once - -#ifndef JITIFY_THREAD_SAFE -#define JITIFY_THREAD_SAFE 1 -#endif - -#if JITIFY_ENABLE_EMBEDDED_FILES -#include -#endif -#include -#include -#include -#include // For strtok_r etc. -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if JITIFY_THREAD_SAFE -#include -#endif - -#include -#include // For dim3, cudaStream_t -#if CUDA_VERSION >= 8000 -#define NVRTC_GET_TYPE_NAME 1 -#endif -#include - -// For use by get_current_executable_path(). 
-#ifdef __linux__ -#include // For PATH_MAX - -#include // For realpath -#define JITIFY_PATH_MAX PATH_MAX -#elif defined(_WIN32) || defined(_WIN64) -#include -#define JITIFY_PATH_MAX MAX_PATH -#else -#error "Unsupported platform" -#endif - -#ifdef _MSC_VER // MSVC compiler -#include // For UnDecorateSymbolName -#else -#include // For abi::__cxa_demangle -#endif - -#if defined(_WIN32) || defined(_WIN64) -// WAR for strtok_r being called strtok_s on Windows -#pragma push_macro("strtok_r") -#undef strtok_r -#define strtok_r strtok_s -// WAR for min and max possibly being macros defined by windows.h -#pragma push_macro("min") -#pragma push_macro("max") -#undef min -#undef max -#endif - -#ifndef JITIFY_PRINT_LOG -#define JITIFY_PRINT_LOG 1 -#endif - -#define JITIFY_PRINT_ALL 0 - -#if JITIFY_PRINT_ALL -#define JITIFY_PRINT_INSTANTIATION 1 -#define JITIFY_PRINT_SOURCE 1 -#define JITIFY_PRINT_LOG 1 -#define JITIFY_PRINT_PTX 1 -#define JITIFY_PRINT_LINKER_LOG 1 -#define JITIFY_PRINT_LAUNCH 1 -#define JITIFY_PRINT_HEADER_PATHS 1 -#endif - -#if JITIFY_ENABLE_EMBEDDED_FILES -#define JITIFY_FORCE_UNDEFINED_SYMBOL(x) void* x##_forced = (void*)&x -/*! Include a source file that has been embedded into the executable using the - * GCC linker. - * \param name The name of the source file (not as a string), which must - * be sanitized by replacing non-alpha-numeric characters with underscores. - * E.g., \code{.cpp}JITIFY_INCLUDE_EMBEDDED_FILE(my_header_h)\endcode will - * include the embedded file "my_header.h". - * \note Files declared with this macro can be referenced using - * their original (unsanitized) filenames when creating a \p - * jitify::Program instance. - */ -#define JITIFY_INCLUDE_EMBEDDED_FILE(name) \ - extern "C" uint8_t _jitify_binary_##name##_start[] asm("_binary_" #name \ - "_start"); \ - extern "C" uint8_t _jitify_binary_##name##_end[] asm("_binary_" #name \ - "_end"); \ - JITIFY_FORCE_UNDEFINED_SYMBOL(_jitify_binary_##name##_start); \ - JITIFY_FORCE_UNDEFINED_SYMBOL(_jitify_binary_##name##_end) -#endif // JITIFY_ENABLE_EMBEDDED_FILES - -/*! Jitify library namespace - */ -namespace jitify { - -/*! Source-file load callback. - * - * \param filename The name of the requested source file. - * \param tmp_stream A temporary stream that can be used to hold source code. - * \return A pointer to an input stream containing the source code, or NULL - * to defer loading of the file to Jitify's file-loading mechanisms. - */ -typedef std::istream* (*file_callback_type)(std::string filename, - std::iostream& tmp_stream); -// Exclude from Doxygen -//! 
\cond - -class JitCache; - -// Simple cache using LRU discard policy -template -class ObjectCache { - public: - typedef KeyType key_type; - typedef ValueType value_type; - - private: - typedef std::map object_map; - typedef std::deque key_rank; - typedef typename key_rank::iterator rank_iterator; - object_map _objects; - key_rank _ranked_keys; - size_t _capacity; - - inline void discard_old(size_t n = 0) { - if (n > _capacity) { - throw std::runtime_error("Insufficient capacity in cache"); - } - while (_objects.size() > _capacity - n) { - key_type discard_key = _ranked_keys.back(); - _ranked_keys.pop_back(); - _objects.erase(discard_key); - } - } - - public: - inline ObjectCache(size_t capacity = 8) : _capacity(capacity) {} - inline void resize(size_t capacity) { - _capacity = capacity; - this->discard_old(); - } - inline bool contains(const key_type& k) const { - return (bool)_objects.count(k); - } - inline void touch(const key_type& k) { - if (!this->contains(k)) { - throw std::runtime_error("Key not found in cache"); - } - rank_iterator rank = std::find(_ranked_keys.begin(), _ranked_keys.end(), k); - if (rank != _ranked_keys.begin()) { - // Move key to front of ranks - _ranked_keys.erase(rank); - _ranked_keys.push_front(k); - } - } - inline value_type& get(const key_type& k) { - if (!this->contains(k)) { - throw std::runtime_error("Key not found in cache"); - } - this->touch(k); - return _objects[k]; - } - inline value_type& insert(const key_type& k, - const value_type& v = value_type()) { - this->discard_old(1); - _ranked_keys.push_front(k); - return _objects.insert(std::make_pair(k, v)).first->second; - } - template - inline value_type& emplace(const key_type& k, Args&&... args) { - this->discard_old(1); - // Note: Use of piecewise_construct allows non-movable non-copyable types - auto iter = _objects - .emplace(std::piecewise_construct, std::forward_as_tuple(k), - std::forward_as_tuple(args...)) - .first; - _ranked_keys.push_front(iter->first); - return iter->second; - } -}; - -namespace detail { - -// Convenience wrapper for std::vector that provides handy constructors -template -class vector : public std::vector { - typedef std::vector super_type; - - public: - vector() : super_type() {} - vector(size_t n) : super_type(n) {} // Note: Not explicit, allows =0 - vector(std::vector const& vals) : super_type(vals) {} - template - vector(T const (&vals)[N]) : super_type(vals, vals + N) {} - vector(std::vector&& vals) : super_type(vals) {} - vector(std::initializer_list vals) : super_type(vals) {} -}; - -// Helper functions for parsing/manipulating source code - -inline std::string replace_characters(std::string str, - std::string const& oldchars, - char newchar) { - size_t i = str.find_first_of(oldchars); - while (i != std::string::npos) { - str[i] = newchar; - i = str.find_first_of(oldchars, i + 1); - } - return str; -} -inline std::string sanitize_filename(std::string name) { - return replace_characters(name, "/\\.-: ?%*|\"<>", '_'); -} - -#if JITIFY_ENABLE_EMBEDDED_FILES -class EmbeddedData { - void* _app; - EmbeddedData(EmbeddedData const&); - EmbeddedData& operator=(EmbeddedData const&); - - public: - EmbeddedData() { - _app = dlopen(NULL, RTLD_LAZY); - if (!_app) { - throw std::runtime_error(std::string("dlopen failed: ") + dlerror()); - } - dlerror(); // Clear any existing error - } - ~EmbeddedData() { - if (_app) { - dlclose(_app); - } - } - const uint8_t* operator[](std::string key) const { - key = sanitize_filename(key); - key = "_binary_" + key; - uint8_t const* data = 
(uint8_t const*)dlsym(_app, key.c_str()); - if (!data) { - throw std::runtime_error(std::string("dlsym failed: ") + dlerror()); - } - return data; - } - const uint8_t* begin(std::string key) const { - return (*this)[key + "_start"]; - } - const uint8_t* end(std::string key) const { return (*this)[key + "_end"]; } -}; -#endif // JITIFY_ENABLE_EMBEDDED_FILES - -inline bool is_tokenchar(char c) { - return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || c == '_'; -} -inline std::string replace_token(std::string src, std::string token, - std::string replacement) { - size_t i = src.find(token); - while (i != std::string::npos) { - if (i == 0 || i == src.size() - token.size() || - (!is_tokenchar(src[i - 1]) && !is_tokenchar(src[i + token.size()]))) { - src.replace(i, token.size(), replacement); - i += replacement.size(); - } else { - i += token.size(); - } - i = src.find(token, i); - } - return src; -} -inline std::string path_base(std::string p) { - // "/usr/local/myfile.dat" -> "/usr/local" - // "foo/bar" -> "foo" - // "foo/bar/" -> "foo/bar" -#if defined _WIN32 || defined _WIN64 - char sep = '\\'; -#else - char sep = '/'; -#endif - size_t i = p.find_last_of(sep); - if (i != std::string::npos) { - return p.substr(0, i); - } else { - return ""; - } -} -inline std::string path_join(std::string p1, std::string p2) { -#ifdef _WIN32 - char sep = '\\'; -#else - char sep = '/'; -#endif - if (p1.size() && p2.size() && p2[0] == sep) { - throw std::invalid_argument("Cannot join to absolute path"); - } - if (p1.size() && p1[p1.size() - 1] != sep) { - p1 += sep; - } - return p1 + p2; -} -// Elides "/." and "/.." tokens from path. -inline std::string path_simplify(const std::string& path) { - std::vector dirs; - std::string cur_dir; - bool after_slash = false; - for (int i = 0; i < (int)path.size(); ++i) { - if (path[i] == '/') { - if (after_slash) continue; // Ignore repeat slashes - after_slash = true; - if (cur_dir == ".." 
&& !dirs.empty() && dirs.back() != "..") { - if (dirs.size() == 1 && dirs.front().empty()) { - throw std::runtime_error( - "Invalid path: back-traversals exceed depth of absolute path"); - } - dirs.pop_back(); - } else if (cur_dir != ".") { // Ignore /./ - dirs.push_back(cur_dir); - } - cur_dir.clear(); - } else { - after_slash = false; - cur_dir.push_back(path[i]); - } - } - if (!after_slash) { - dirs.push_back(cur_dir); - } - std::stringstream ss; - for (int i = 0; i < (int)dirs.size() - 1; ++i) { - ss << dirs[i] << "/"; - } - if (!dirs.empty()) ss << dirs.back(); - if (after_slash) ss << "/"; - return ss.str(); -} -inline unsigned long long hash_larson64(const char* s, - unsigned long long seed = 0) { - unsigned long long hash = seed; - while (*s) { - hash = hash * 101 + *s++; - } - return hash; -} - -inline uint64_t hash_combine(uint64_t a, uint64_t b) { - // Note: The magic number comes from the golden ratio - return a ^ (0x9E3779B97F4A7C17ull + b + (b >> 2) + (a << 6)); -} - -inline bool extract_include_info_from_compile_error(std::string log, - std::string& name, - std::string& parent, - int& line_num) { - static const std::vector pattern = { - "could not open source file \"", "cannot open source file \""}; - - for (auto& p : pattern) { - size_t beg = log.find(p); - if (beg != std::string::npos) { - beg += p.size(); - size_t end = log.find("\"", beg); - name = log.substr(beg, end - beg); - - size_t line_beg = log.rfind("\n", beg); - if (line_beg == std::string::npos) { - line_beg = 0; - } else { - line_beg += 1; - } - - size_t split = log.find("(", line_beg); - parent = log.substr(line_beg, split - line_beg); - line_num = - atoi(log.substr(split + 1, log.find(")", split + 1) - (split + 1)) - .c_str()); - - return true; - } - } - - return false; -} - -inline bool is_include_directive_with_quotes(const std::string& source, - int line_num) { - // TODO: Check each find() for failure. 
- size_t beg = 0; - for (int i = 1; i < line_num; ++i) { - beg = source.find("\n", beg) + 1; - } - beg = source.find("include", beg) + 7; - beg = source.find_first_of("\"<", beg); - return source[beg] == '"'; -} - -inline std::string comment_out_code_line(int line_num, std::string source) { - size_t beg = 0; - for (int i = 1; i < line_num; ++i) { - beg = source.find("\n", beg) + 1; - } - return (source.substr(0, beg) + "//" + source.substr(beg)); -} - -inline void print_with_line_numbers(std::string const& source) { - int linenum = 1; - std::stringstream source_ss(source); - for (std::string line; std::getline(source_ss, line); ++linenum) { - std::cout << std::setfill(' ') << std::setw(3) << linenum << " " << line - << std::endl; - } -} - -inline void print_compile_log(std::string program_name, - std::string const& log) { - std::cout << "---------------------------------------------------" - << std::endl; - std::cout << "--- JIT compile log for " << program_name << " ---" - << std::endl; - std::cout << "---------------------------------------------------" - << std::endl; - std::cout << log << std::endl; - std::cout << "---------------------------------------------------" - << std::endl; -} - -inline std::vector split_string(std::string str, - long maxsplit = -1, - std::string delims = " \t") { - std::vector results; - if (maxsplit == 0) { - results.push_back(str); - return results; - } - // Note: +1 to include NULL-terminator - std::vector v_str(str.c_str(), str.c_str() + (str.size() + 1)); - char* c_str = v_str.data(); - char* saveptr = c_str; - char* token = nullptr; - for (long i = 0; i != maxsplit; ++i) { - token = ::strtok_r(c_str, delims.c_str(), &saveptr); - c_str = 0; - if (!token) { - return results; - } - results.push_back(token); - } - // Check if there's a final piece - token += ::strlen(token) + 1; - if (token - v_str.data() < (ptrdiff_t)str.size()) { - // Find the start of the final piece - token += ::strspn(token, delims.c_str()); - if (*token) { - results.push_back(token); - } - } - return results; -} - -static const std::map& get_jitsafe_headers_map(); - -inline bool load_source( - std::string filename, std::map& sources, - std::string current_dir = "", - std::vector include_paths = std::vector(), - file_callback_type file_callback = 0, - std::map* fullpaths = nullptr, - bool search_current_dir = true) { - std::istream* source_stream = 0; - std::stringstream string_stream; - std::ifstream file_stream; - // First detect direct source-code string ("my_program\nprogram_code...") - size_t newline_pos = filename.find("\n"); - if (newline_pos != std::string::npos) { - std::string source = filename.substr(newline_pos + 1); - filename = filename.substr(0, newline_pos); - string_stream << source; - source_stream = &string_stream; - } - if (sources.count(filename)) { - // Already got this one - return true; - } - if (!source_stream) { - std::string fullpath = path_join(current_dir, filename); - // Try loading from callback - if (!file_callback || - !(source_stream = file_callback(fullpath, string_stream))) { -#if JITIFY_ENABLE_EMBEDDED_FILES - // Try loading as embedded file - EmbeddedData embedded; - std::string source; - try { - source.assign(embedded.begin(fullpath), embedded.end(fullpath)); - string_stream << source; - source_stream = &string_stream; - } catch (std::runtime_error const&) -#endif // JITIFY_ENABLE_EMBEDDED_FILES - { - // Try loading from filesystem - bool found_file = false; - if (search_current_dir) { - file_stream.open(fullpath.c_str()); - if (file_stream) { - 
source_stream = &file_stream; - found_file = true; - } - } - // Search include directories - if (!found_file) { - for (int i = 0; i < (int)include_paths.size(); ++i) { - fullpath = path_join(include_paths[i], filename); - file_stream.open(fullpath.c_str()); - if (file_stream) { - source_stream = &file_stream; - found_file = true; - break; - } - } - if (!found_file) { - // Try loading from builtin headers - fullpath = path_join("__jitify_builtin", filename); - auto it = get_jitsafe_headers_map().find(filename); - if (it != get_jitsafe_headers_map().end()) { - string_stream << it->second; - source_stream = &string_stream; - } else { - return false; - } - } - } - } - } - if (fullpaths) { - // Record the full file path corresponding to this include name. - (*fullpaths)[filename] = path_simplify(fullpath); - } - } - sources[filename] = std::string(); - std::string& source = sources[filename]; - std::string line; - size_t linenum = 0; - unsigned long long hash = 0; - bool pragma_once = false; - bool remove_next_blank_line = false; - while (std::getline(*source_stream, line)) { - ++linenum; - - // HACK WAR for static variables not allowed on the device (unless - // __shared__) - // TODO: This breaks static member variables - // line = replace_token(line, "static const", "/*static*/ const"); - - // TODO: Need to watch out for /* */ comments too - std::string cleanline = - line.substr(0, line.find("//")); // Strip line comments - // if( cleanline.back() == "\r" ) { // Remove Windows line ending - // cleanline = cleanline.substr(0, cleanline.size()-1); - //} - // TODO: Should trim whitespace before checking .empty() - if (cleanline.empty() && remove_next_blank_line) { - remove_next_blank_line = false; - continue; - } - // Maintain a file hash for use in #pragma once WAR - hash = hash_larson64(line.c_str(), hash); - if (cleanline.find("#pragma once") != std::string::npos) { - pragma_once = true; - // Note: This is an attempt to recover the original line numbering, - // which otherwise gets off-by-one due to the include guard. - remove_next_blank_line = true; - // line = "//" + line; // Comment out the #pragma once line - continue; - } - - // HACK WAR for Thrust using "#define FOO #pragma bar" - size_t pragma_beg = cleanline.find("#pragma "); - if (pragma_beg != std::string::npos) { - std::string line_after_pragma = line.substr(pragma_beg); - std::vector pragma_split = - split_string(line_after_pragma, 2); - line = - (line.substr(0, pragma_beg) + "_Pragma(\"" + pragma_split[1] + "\")"); - if (pragma_split.size() == 3) { - line += " " + pragma_split[2]; - } - } - - source += line + "\n"; - } - // HACK TESTING (WAR for cub) - // source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source; - ////source = "cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }\n" + - /// source; - - // WAR for #pragma once causing problems when there are multiple inclusions - // of the same header from different paths. 
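-  // For illustration, a header containing "#pragma once" is rewritten as:
-  //   #ifndef _JITIFY_INCLUDE_GUARD_<8-hex-digit-hash>
-  //   #define _JITIFY_INCLUDE_GUARD_<8-hex-digit-hash>
-  //   <header body, with the #pragma once line removed>
-  //   #endif // _JITIFY_INCLUDE_GUARD_<8-hex-digit-hash>
-  // where the hash is the Larson hash of the file contents, so the same
-  // header reached via different paths still maps to one guard symbol.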
- if (pragma_once) { - std::stringstream ss; - ss << std::uppercase << std::hex << std::setw(8) << std::setfill('0') - << hash; - std::string include_guard_name = "_JITIFY_INCLUDE_GUARD_" + ss.str() + "\n"; - std::string include_guard_header; - include_guard_header += "#ifndef " + include_guard_name; - include_guard_header += "#define " + include_guard_name; - std::string include_guard_footer; - include_guard_footer += "#endif // " + include_guard_name; - source = include_guard_header + source + "\n" + include_guard_footer; - } - // return filename; - return true; -} - -} // namespace detail - -//! \endcond - -/*! Jitify reflection utilities namespace - */ -namespace reflection { - -// Provides type and value reflection via a function 'reflect': -// reflect() -> "Type" -// reflect(value) -> "(T)value" -// reflect() -> "VAL" -// reflect -> "VAL" -// reflect_template,char>() -> "" -// reflect_template({"float", "7", "char"}) -> "" - -/*! A wrapper class for non-type template parameters. - */ -template -struct NonType { - constexpr static T VALUE = VALUE_; -}; - -// Forward declaration -template -inline std::string reflect(T const& value); - -//! \cond - -namespace detail { - -template -inline std::string value_string(const T& x) { - std::stringstream ss; - ss << x; - return ss.str(); -} -// WAR for non-printable characters -template <> -inline std::string value_string(const char& x) { - std::stringstream ss; - ss << (int)x; - return ss.str(); -} -template <> -inline std::string value_string(const signed char& x) { - std::stringstream ss; - ss << (int)x; - return ss.str(); -} -template <> -inline std::string value_string(const unsigned char& x) { - std::stringstream ss; - ss << (int)x; - return ss.str(); -} -template <> -inline std::string value_string(const wchar_t& x) { - std::stringstream ss; - ss << (long)x; - return ss.str(); -} -// Specialisation for bool true/false literals -template <> -inline std::string value_string(const bool& x) { - return x ? "true" : "false"; -} - -// Removes all tokens that start with double underscores. -inline void strip_double_underscore_tokens(char* s) { - using jitify::detail::is_tokenchar; - char* w = s; - do { - if (*s == '_' && *(s + 1) == '_') { - while (is_tokenchar(*++s)) - ; - } - } while ((*w++ = *s++)); -} - -//#if CUDA_VERSION < 8000 -#ifdef _MSC_VER // MSVC compiler -inline std::string demangle_cuda_symbol(const char* mangled_name) { - // We don't have a way to demangle CUDA symbol names under MSVC. - return mangled_name; -} -inline std::string demangle_native_type(const std::type_info& typeinfo) { - // Get the decorated name and skip over the leading '.'. - const char* decorated_name = typeinfo.raw_name() + 1; - char undecorated_name[4096]; - if (UnDecorateSymbolName( - decorated_name, undecorated_name, - sizeof(undecorated_name) / sizeof(*undecorated_name), - UNDNAME_NO_ARGUMENTS | // Treat input as a type name - UNDNAME_NAME_ONLY // No "class" and "struct" prefixes - /*UNDNAME_NO_MS_KEYWORDS*/)) { // No "__cdecl", "__ptr64" etc. - // WAR for UNDNAME_NO_MS_KEYWORDS messing up function types. 
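-    // e.g. a function type may come back as "int __cdecl(int)"; the call
-    // below strips any token beginning with "__", leaving "int (int)".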
- strip_double_underscore_tokens(undecorated_name); - return undecorated_name; - } - throw std::runtime_error("UnDecorateSymbolName failed"); -} -#else // not MSVC -inline std::string demangle_cuda_symbol(const char* mangled_name) { - size_t bufsize = 0; - char* buf = nullptr; - std::string demangled_name; - int status; - auto demangled_ptr = std::unique_ptr( - abi::__cxa_demangle(mangled_name, buf, &bufsize, &status), free); - if (status == 0) { - demangled_name = demangled_ptr.get(); // all worked as expected - } else if (status == -2) { - demangled_name = mangled_name; // we interpret this as plain C name - } else if (status == -1) { - throw std::runtime_error( - std::string("memory allocation failure in __cxa_demangle")); - } else if (status == -3) { - throw std::runtime_error(std::string("invalid argument to __cxa_demangle")); - } - return demangled_name; -} -inline std::string demangle_native_type(const std::type_info& typeinfo) { - return demangle_cuda_symbol(typeinfo.name()); -} -#endif // not MSVC -//#endif // CUDA_VERSION < 8000 - -template -class JitifyTypeNameWrapper_ {}; - -template -struct type_reflection { - inline static std::string name() { - //#if CUDA_VERSION < 8000 - // TODO: Use nvrtcGetTypeName once it has the same behavior as this. - // WAR for typeid discarding cv qualifiers on value-types - // Wrap type in dummy template class to preserve cv-qualifiers, then strip - // off the wrapper from the resulting string. - std::string wrapped_name = - demangle_native_type(typeid(JitifyTypeNameWrapper_)); - // Note: The reflected name of this class also has namespace prefixes. - const std::string wrapper_class_name = "JitifyTypeNameWrapper_<"; - size_t start = wrapped_name.find(wrapper_class_name); - if (start == std::string::npos) { - throw std::runtime_error("Type reflection failed: " + wrapped_name); - } - start += wrapper_class_name.size(); - std::string name = - wrapped_name.substr(start, wrapped_name.size() - (start + 1)); - return name; - //#else - // std::string ret; - // nvrtcResult status = nvrtcGetTypeName(&ret); - // if( status != NVRTC_SUCCESS ) { - // throw std::runtime_error(std::string("nvrtcGetTypeName - // failed: - //")+ nvrtcGetErrorString(status)); - // } - // return ret; - //#endif - } -}; // namespace detail -template -struct type_reflection > { - inline static std::string name() { - return jitify::reflection::reflect(VALUE); - } -}; - -} // namespace detail - -//! \endcond - -/*! Create an Instance object that contains a const reference to the - * value. We use this to wrap abstract objects from which we want to extract - * their type at runtime (e.g., derived type). This is used to facilitate - * templating on derived type when all we know at compile time is abstract - * type. - */ -template -struct Instance { - const T& value; - Instance(const T& value) : value(value) {} -}; - -/*! Create an Instance object from which we can extract the value's run-time - * type. - * \param value The const value to be captured. - */ -template -inline Instance instance_of(T const& value) { - return Instance(value); -} - -/*! A wrapper used for representing types as values. - */ -template -struct Type {}; - -// Type reflection -// E.g., reflect() -> "float" -// Note: This strips trailing const and volatile qualifiers -/*! Generate a code-string for a type. - * \code{.cpp}reflect() --> "float"\endcode - */ -template -inline std::string reflect() { - return detail::type_reflection::name(); -} -// Value reflection -// E.g., reflect(3.14f) -> "(float)3.14" -/*! 
Generate a code-string for a value. - * \code{.cpp}reflect(3.14f) --> "(float)3.14"\endcode - */ -template -inline std::string reflect(T const& value) { - return "(" + reflect() + ")" + detail::value_string(value); -} -// Non-type template arg reflection (implicit conversion to int64_t) -// E.g., reflect<7>() -> "(int64_t)7" -/*! Generate a code-string for an integer non-type template argument. - * \code{.cpp}reflect<7>() --> "(int64_t)7"\endcode - */ -template -inline std::string reflect() { - return reflect >(); -} -// Non-type template arg reflection (explicit type) -// E.g., reflect() -> "(int)7" -/*! Generate a code-string for a generic non-type template argument. - * \code{.cpp} reflect() --> "(int)7" \endcode - */ -template -inline std::string reflect() { - return reflect >(); -} -// Type reflection via value -// E.g., reflect(Type()) -> "float" -/*! Generate a code-string for a type wrapped as a Type instance. - * \code{.cpp}reflect(Type()) --> "float"\endcode - */ -template -inline std::string reflect(jitify::reflection::Type) { - return reflect(); -} - -/*! Generate a code-string for a type wrapped as an Instance instance. - * \code{.cpp}reflect(Instance(3.1f)) --> "float"\endcode - * or more simply when passed to a instance_of helper - * \code{.cpp}reflect(instance_of(3.1f)) --> "float"\endcodei - * This is specifically for the case where we want to extract the run-time - * type, e.g., derived type, of an object pointer. - */ -template -inline std::string reflect(jitify::reflection::Instance& value) { - return detail::demangle_native_type(typeid(value.value)); -} - -// Type from value -// E.g., type_of(3.14f) -> Type() -/*! Create a Type object representing a value's type. - * \param value The value whose type is to be captured. - */ -template -inline Type type_of(T& value) { - return Type(); -} -/*! Create a Type object representing a value's type. - * \param value The const value whose type is to be captured. - */ -template -inline Type type_of(T const& value) { - return Type(); -} - -// Multiple value reflections one call, returning list of strings -template -inline std::vector reflect_all(Args... args) { - return {reflect(args)...}; -} - -inline std::string reflect_list(jitify::detail::vector const& args, - std::string opener = "", - std::string closer = "") { - std::stringstream ss; - ss << opener; - for (int i = 0; i < (int)args.size(); ++i) { - if (i > 0) ss << ","; - ss << args[i]; - } - ss << closer; - return ss.str(); -} - -// Template instantiation reflection -// inline std::string reflect_template(std::vector const& args) { -inline std::string reflect_template( - jitify::detail::vector const& args) { - // Note: The space in " >" is a WAR to avoid '>>' appearing - return reflect_list(args, "<", " >"); -} -// TODO: See if can make this evaluate completely at compile-time -template -inline std::string reflect_template() { - return reflect_template({reflect()...}); - // return reflect_template({reflect()...}); -} - -} // namespace reflection - -//! \cond - -namespace detail { - -// Demangles nested variable names using the PTX name mangling scheme -// (which follows the Itanium64 ABI). E.g., _ZN1a3Foo2bcE -> a::Foo::bc. -inline std::string demangle_ptx_variable_name(const char* name) { - std::stringstream ss; - const char* c = name; - if (*c++ != '_' || *c++ != 'Z') return name; // Non-mangled name - if (*c++ != 'N') return ""; // Not a nested name, unsupported - while (true) { - // Parse identifier length. 
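-    // Identifiers are length-prefixed per the Itanium ABI, e.g.
-    //   _ZN1a3Foo2bcE -> "1a" "3Foo" "2bc" -> a::Foo::bc
-    // so each pass reads a decimal length n, then n identifier characters.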
- int n = 0; - while (std::isdigit(*c)) { - n = n * 10 + (*c - '0'); - c++; - } - if (!n) return ""; // Invalid or unsupported mangled name - // Parse identifier. - const char* c0 = c; - while (n-- && *c) c++; - if (!*c) return ""; // Mangled name is truncated - std::string id(c0, c); - // Identifiers starting with "_GLOBAL" are anonymous namespaces. - ss << (id.substr(0, 7) == "_GLOBAL" ? "(anonymous namespace)" : id); - // Nested name specifiers end with 'E'. - if (*c == 'E') break; - // There are more identifiers to come, add join token. - ss << "::"; - } - return ss.str(); -} - -static const char* get_current_executable_path() { - static const char* path = []() -> const char* { - static char buffer[JITIFY_PATH_MAX] = {}; -#ifdef __linux__ - if (!::realpath("/proc/self/exe", buffer)) return nullptr; -#elif defined(_WIN32) || defined(_WIN64) - if (!GetModuleFileNameA(nullptr, buffer, JITIFY_PATH_MAX)) return nullptr; -#endif - return buffer; - }(); - return path; -} - -inline bool endswith(const std::string& str, const std::string& suffix) { - return str.size() >= suffix.size() && - str.substr(str.size() - suffix.size()) == suffix; -} - -// Infers the JIT input type from the filename suffix. If no known suffix is -// present, the filename is assumed to refer to a library, and the associated -// suffix (and possibly prefix) is automatically added to the filename. -inline CUjitInputType get_cuda_jit_input_type(std::string* filename) { - if (endswith(*filename, ".ptx")) { - return CU_JIT_INPUT_PTX; - } else if (endswith(*filename, ".cubin")) { - return CU_JIT_INPUT_CUBIN; - } else if (endswith(*filename, ".fatbin")) { - return CU_JIT_INPUT_FATBINARY; - } else if (endswith(*filename, -#if defined _WIN32 || defined _WIN64 - ".obj" -#else // Linux - ".o" -#endif - )) { - return CU_JIT_INPUT_OBJECT; - } else { // Assume library -#if defined _WIN32 || defined _WIN64 - if (!endswith(*filename, ".lib")) { - *filename += ".lib"; - } -#else // Linux - if (!endswith(*filename, ".a")) { - *filename = "lib" + *filename + ".a"; - } -#endif - return CU_JIT_INPUT_LIBRARY; - } -} - -class CUDAKernel { - std::vector _link_files; - std::vector _link_paths; - CUlinkState _link_state; - CUmodule _module; - CUfunction _kernel; - std::string _func_name; - std::string _ptx; - std::map _global_map; - std::vector _opts; - std::vector _optvals; -#ifdef JITIFY_PRINT_LINKER_LOG - static const unsigned int _log_size = 8192; - char _error_log[_log_size]; - char _info_log[_log_size]; -#endif - - inline void cuda_safe_call(CUresult res) const { - if (res != CUDA_SUCCESS) { - const char* msg; - cuGetErrorName(res, &msg); - throw std::runtime_error(msg); - } - } - inline void create_module(std::vector link_files, - std::vector link_paths) { - CUresult result; -#ifndef JITIFY_PRINT_LINKER_LOG - // WAR since linker log does not seem to be constructed using a single call - // to cuModuleLoadDataEx. - if (link_files.empty()) { - result = - cuModuleLoadDataEx(&_module, _ptx.c_str(), (unsigned)_opts.size(), - _opts.data(), _optvals.data()); - } else -#endif - { - cuda_safe_call(cuLinkCreate((unsigned)_opts.size(), _opts.data(), - _optvals.data(), &_link_state)); - cuda_safe_call(cuLinkAddData(_link_state, CU_JIT_INPUT_PTX, - (void*)_ptx.c_str(), _ptx.size(), - "jitified_source.ptx", 0, 0, 0)); - for (int i = 0; i < (int)link_files.size(); ++i) { - std::string link_file = link_files[i]; - CUjitInputType jit_input_type; - if (link_file == ".") { - // Special case for linking to current executable. 
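-          // A link file of "." names the running binary itself, letting
-          // JIT-linked PTX resolve device symbols that were compiled into
-          // the host executable.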
- link_file = get_current_executable_path(); - jit_input_type = CU_JIT_INPUT_OBJECT; - } else { - // Infer based on filename. - jit_input_type = get_cuda_jit_input_type(&link_file); - } - CUresult result = cuLinkAddFile(_link_state, jit_input_type, - link_file.c_str(), 0, 0, 0); - int path_num = 0; - while (result == CUDA_ERROR_FILE_NOT_FOUND && - path_num < (int)link_paths.size()) { - std::string filename = path_join(link_paths[path_num++], link_file); - result = cuLinkAddFile(_link_state, jit_input_type, filename.c_str(), - 0, 0, 0); - } -#if JITIFY_PRINT_LINKER_LOG - if (result == CUDA_ERROR_FILE_NOT_FOUND) { - std::cerr << "Linker error: Device library not found: " << link_file - << std::endl; - } else if (result != CUDA_SUCCESS) { - std::cerr << "Linker error: Failed to add file: " << link_file - << std::endl; - std::cerr << _error_log << std::endl; - } -#endif - cuda_safe_call(result); - } - size_t cubin_size; - void* cubin; - result = cuLinkComplete(_link_state, &cubin, &cubin_size); - if (result == CUDA_SUCCESS) { - result = cuModuleLoadData(&_module, cubin); - } - } -#ifdef JITIFY_PRINT_LINKER_LOG - std::cout << "---------------------------------------" << std::endl; - std::cout << "--- Linker for " - << reflection::detail::demangle_cuda_symbol(_func_name.c_str()) - << " ---" << std::endl; - std::cout << "---------------------------------------" << std::endl; - std::cout << _info_log << std::endl; - std::cout << std::endl; - std::cout << _error_log << std::endl; - std::cout << "---------------------------------------" << std::endl; -#endif - cuda_safe_call(result); - // Allow _func_name to be empty to support cases where we want to generate - // PTX containing extern symbol definitions but no kernels. - if (!_func_name.empty()) { - cuda_safe_call( - cuModuleGetFunction(&_kernel, _module, _func_name.c_str())); - } - } - inline void destroy_module() { - if (_link_state) { - cuda_safe_call(cuLinkDestroy(_link_state)); - } - _link_state = 0; - if (_module) { - cuModuleUnload(_module); - } - _module = 0; - } - - // create a map of __constant__ and __device__ variables in the ptx file - // mapping demangled to mangled name - inline void create_global_variable_map() { - size_t pos = 0; - while (pos < _ptx.size()) { - pos = std::min(_ptx.find(".const .align", pos), - _ptx.find(".global .align", pos)); - if (pos == std::string::npos) break; - size_t end = _ptx.find_first_of(";=", pos); - if (_ptx[end] == '=') --end; - std::string line = _ptx.substr(pos, end - pos); - pos = end; - size_t symbol_start = line.find_last_of(" ") + 1; - size_t symbol_end = line.find_last_of("["); - std::string entry = line.substr(symbol_start, symbol_end - symbol_start); - std::string key = detail::demangle_ptx_variable_name(entry.c_str()); - // Skip unsupported mangled names. E.g., a static variable defined inside - // a function (such variables are not directly addressable from outside - // the function, so skipping them is the correct behavior). 
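-      // For illustration, a PTX line such as
-      //   .global .align 4 .b8 _ZN3foo3barE[4];
-      // yields entry "_ZN3foo3barE" and demangled key "foo::bar", which
-      // get_global_ptr("foo::bar") later maps back to the mangled symbol.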
- if (key == "") continue; - _global_map[key] = entry; - } - } - - inline void set_linker_log() { -#ifdef JITIFY_PRINT_LINKER_LOG - _opts.push_back(CU_JIT_INFO_LOG_BUFFER); - _optvals.push_back((void*)_info_log); - _opts.push_back(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES); - _optvals.push_back((void*)(long)_log_size); - _opts.push_back(CU_JIT_ERROR_LOG_BUFFER); - _optvals.push_back((void*)_error_log); - _opts.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); - _optvals.push_back((void*)(long)_log_size); - _opts.push_back(CU_JIT_LOG_VERBOSE); - _optvals.push_back((void*)1); -#endif - } - - public: - inline CUDAKernel() : _link_state(0), _module(0), _kernel(0) {} - inline CUDAKernel(const CUDAKernel& other) = delete; - inline CUDAKernel& operator=(const CUDAKernel& other) = delete; - inline CUDAKernel(CUDAKernel&& other) = delete; - inline CUDAKernel& operator=(CUDAKernel&& other) = delete; - inline CUDAKernel(const char* func_name, const char* ptx, - std::vector link_files, - std::vector link_paths, unsigned int nopts = 0, - CUjit_option* opts = 0, void** optvals = 0) - : _link_files(link_files), - _link_paths(link_paths), - _link_state(0), - _module(0), - _kernel(0), - _func_name(func_name), - _ptx(ptx), - _opts(opts, opts + nopts), - _optvals(optvals, optvals + nopts) { - this->set_linker_log(); - this->create_module(link_files, link_paths); - this->create_global_variable_map(); - } - - inline CUDAKernel& set(const char* func_name, const char* ptx, - std::vector link_files, - std::vector link_paths, - unsigned int nopts = 0, CUjit_option* opts = 0, - void** optvals = 0) { - this->destroy_module(); - _func_name = func_name; - _ptx = ptx; - _link_files = link_files; - _link_paths = link_paths; - _opts.assign(opts, opts + nopts); - _optvals.assign(optvals, optvals + nopts); - this->set_linker_log(); - this->create_module(link_files, link_paths); - this->create_global_variable_map(); - return *this; - } - inline ~CUDAKernel() { this->destroy_module(); } - inline operator CUfunction() const { return _kernel; } - - inline CUresult launch(dim3 grid, dim3 block, unsigned int smem, - CUstream stream, std::vector arg_ptrs) const { - return cuLaunchKernel(_kernel, grid.x, grid.y, grid.z, block.x, block.y, - block.z, smem, stream, arg_ptrs.data(), NULL); - } - - inline CUdeviceptr get_global_ptr(const char* name, - size_t* size = nullptr) const { - CUdeviceptr global_ptr = 0; - auto global = _global_map.find(name); - if (global != _global_map.end()) { - cuda_safe_call(cuModuleGetGlobal(&global_ptr, size, _module, - global->second.c_str())); - } else { - throw std::runtime_error(std::string("failed to look up global ") + name); - } - return global_ptr; - } - - template - inline CUresult get_global_data(const char* name, T* data, size_t count, - CUstream stream = 0) const { - size_t size_bytes; - CUdeviceptr ptr = get_global_ptr(name, &size_bytes); - size_t given_size_bytes = count * sizeof(T); - if (given_size_bytes != size_bytes) { - throw std::runtime_error( - std::string("Value for global variable ") + name + - " has wrong size: got " + std::to_string(given_size_bytes) + - " bytes, expected " + std::to_string(size_bytes)); - } - return cuMemcpyDtoH(data, ptr, size_bytes); - } - - template - inline CUresult set_global_data(const char* name, const T* data, size_t count, - CUstream stream = 0) const { - size_t size_bytes; - CUdeviceptr ptr = get_global_ptr(name, &size_bytes); - size_t given_size_bytes = count * sizeof(T); - if (given_size_bytes != size_bytes) { - throw std::runtime_error( - std::string("Value 
for global variable ") + name + - " has wrong size: got " + std::to_string(given_size_bytes) + - " bytes, expected " + std::to_string(size_bytes)); - } - return cuMemcpyHtoD(ptr, data, size_bytes); - } - - const std::string& function_name() const { return _func_name; } - const std::string& ptx() const { return _ptx; } - const std::vector& link_files() const { return _link_files; } - const std::vector& link_paths() const { return _link_paths; } -}; - -static const char* jitsafe_header_preinclude_h = R"( -//// WAR for Thrust (which appears to have forgotten to include this in result_of_adaptable_function.h -//#include - -//// WAR for Thrust (which appear to have forgotten to include this in error_code.h) -//#include - -// WAR for Thrust (which only supports gnuc, clang or msvc) -#define __GNUC__ 4 - -// WAR for generics/shfl.h -#define THRUST_STATIC_ASSERT(x) - -// WAR for CUB -#ifdef __host__ -#undef __host__ -#endif -#define __host__ - -// WAR to allow exceptions to be parsed -#define try -#define catch(...) -)"; - - -static const char* jitsafe_header_float_h = R"( -#pragma once - -#define FLT_RADIX 2 -#define FLT_MANT_DIG 24 -#define DBL_MANT_DIG 53 -#define FLT_DIG 6 -#define DBL_DIG 15 -#define FLT_MIN_EXP -125 -#define DBL_MIN_EXP -1021 -#define FLT_MIN_10_EXP -37 -#define DBL_MIN_10_EXP -307 -#define FLT_MAX_EXP 128 -#define DBL_MAX_EXP 1024 -#define FLT_MAX_10_EXP 38 -#define DBL_MAX_10_EXP 308 -#define FLT_MAX 3.4028234e38f -#define DBL_MAX 1.7976931348623157e308 -#define FLT_EPSILON 1.19209289e-7f -#define DBL_EPSILON 2.220440492503130e-16 -#define FLT_MIN 1.1754943e-38f; -#define DBL_MIN 2.2250738585072013e-308 -#define FLT_ROUNDS 1 -#if defined __cplusplus && __cplusplus >= 201103L -#define FLT_EVAL_METHOD 0 -#define DECIMAL_DIG 21 -#endif -)"; - -static const char* jitsafe_header_limits_h = R"( -#pragma once - -#if defined _WIN32 || defined _WIN64 - #define __WORDSIZE 32 -#else - #if defined __x86_64__ && !defined __ILP32__ - #define __WORDSIZE 64 - #else - #define __WORDSIZE 32 - #endif -#endif -#define MB_LEN_MAX 16 -#define CHAR_BIT 8 -#define SCHAR_MIN (-128) -#define SCHAR_MAX 127 -#define UCHAR_MAX 255 -enum { - _JITIFY_CHAR_IS_UNSIGNED = (char)-1 >= 0, - CHAR_MIN = _JITIFY_CHAR_IS_UNSIGNED ? 0 : SCHAR_MIN, - CHAR_MAX = _JITIFY_CHAR_IS_UNSIGNED ? 
UCHAR_MAX : SCHAR_MAX, -}; -#define SHRT_MIN (-32768) -#define SHRT_MAX 32767 -#define USHRT_MAX 65535 -#define INT_MIN (-INT_MAX - 1) -#define INT_MAX 2147483647 -#define UINT_MAX 4294967295U -#if __WORDSIZE == 64 - # define LONG_MAX 9223372036854775807L -#else - # define LONG_MAX 2147483647L -#endif -#define LONG_MIN (-LONG_MAX - 1L) -#if __WORDSIZE == 64 - #define ULONG_MAX 18446744073709551615UL -#else - #define ULONG_MAX 4294967295UL -#endif -#define LLONG_MAX 9223372036854775807LL -#define LLONG_MIN (-LLONG_MAX - 1LL) -#define ULLONG_MAX 18446744073709551615ULL -)"; - -static const char* jitsafe_header_iterator = R"( -#pragma once - -namespace __jitify_iterator_ns { -struct output_iterator_tag {}; -struct input_iterator_tag {}; -struct forward_iterator_tag {}; -struct bidirectional_iterator_tag {}; -struct random_access_iterator_tag {}; -template -struct iterator_traits { - typedef typename Iterator::iterator_category iterator_category; - typedef typename Iterator::value_type value_type; - typedef typename Iterator::difference_type difference_type; - typedef typename Iterator::pointer pointer; - typedef typename Iterator::reference reference; -}; -template -struct iterator_traits { - typedef random_access_iterator_tag iterator_category; - typedef T value_type; - typedef ptrdiff_t difference_type; - typedef T* pointer; - typedef T& reference; -}; -template -struct iterator_traits { - typedef random_access_iterator_tag iterator_category; - typedef T value_type; - typedef ptrdiff_t difference_type; - typedef T const* pointer; - typedef T const& reference; -}; -} // namespace __jitify_iterator_ns -namespace std { using namespace __jitify_iterator_ns; } -using namespace __jitify_iterator_ns; -)"; - -// TODO: This is incomplete; need floating point limits -// Joe Eaton: added IEEE float and double types, none of the smaller types -// using type specific structs since we can't template on floats. 
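-// The string below is what JIT-compiled code sees for #include <limits>;
-// load_source() serves it from get_jitsafe_headers_map() when the real
-// header cannot be found. Its digits/exponent constants mirror IEEE-754
-// binary32 and binary64.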
-static const char* jitsafe_header_limits = R"( -#pragma once -#include -#include -// TODO: epsilon(), infinity(), etc -namespace __jitify_detail { -#if __cplusplus >= 201103L -#define JITIFY_CXX11_CONSTEXPR constexpr -#define JITIFY_CXX11_NOEXCEPT noexcept -#else -#define JITIFY_CXX11_CONSTEXPR -#define JITIFY_CXX11_NOEXCEPT -#endif - -struct FloatLimits { -#if __cplusplus >= 201103L - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - float lowest() JITIFY_CXX11_NOEXCEPT { return -FLT_MAX;} - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - float min() JITIFY_CXX11_NOEXCEPT { return FLT_MIN; } - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - float max() JITIFY_CXX11_NOEXCEPT { return FLT_MAX; } -#endif // __cplusplus >= 201103L - enum { - is_specialized = true, - is_signed = true, - is_integer = false, - is_exact = false, - has_infinity = true, - has_quiet_NaN = true, - has_signaling_NaN = true, - has_denorm = 1, - has_denorm_loss = true, - round_style = 1, - is_iec559 = true, - is_bounded = true, - is_modulo = false, - digits = 24, - digits10 = 6, - max_digits10 = 9, - radix = 2, - min_exponent = -125, - min_exponent10 = -37, - max_exponent = 128, - max_exponent10 = 38, - tinyness_before = false, - traps = false - }; -}; -struct DoubleLimits { -#if __cplusplus >= 201103L - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - double lowest() noexcept { return -DBL_MAX; } - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - double min() noexcept { return DBL_MIN; } - static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ - double max() noexcept { return DBL_MAX; } -#endif // __cplusplus >= 201103L - enum { - is_specialized = true, - is_signed = true, - is_integer = false, - is_exact = false, - has_infinity = true, - has_quiet_NaN = true, - has_signaling_NaN = true, - has_denorm = 1, - has_denorm_loss = true, - round_style = 1, - is_iec559 = true, - is_bounded = true, - is_modulo = false, - digits = 53, - digits10 = 15, - max_digits10 = 17, - radix = 2, - min_exponent = -1021, - min_exponent10 = -307, - max_exponent = 1024, - max_exponent10 = 308, - tinyness_before = false, - traps = false - }; -}; -template -struct IntegerLimits { - static inline __host__ __device__ T min() { return Min; } - static inline __host__ __device__ T max() { return Max; } -#if __cplusplus >= 201103L - static constexpr inline __host__ __device__ T lowest() noexcept { - return Min; - } -#endif // __cplusplus >= 201103L - enum { - is_specialized = true, - digits = (Digits == -1) ? 
(int)(sizeof(T)*8 - (Min != 0)) : Digits, - digits10 = (digits * 30103) / 100000, - is_signed = ((T)(-1)<0), - is_integer = true, - is_exact = true, - radix = 2, - is_bounded = true, - is_modulo = false - }; -}; -} // namespace __jitify_detail -namespace std { using namespace __jitify_detail; } -namespace __jitify_limits_ns { -template struct numeric_limits { - enum { is_specialized = false }; -}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits {}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits {}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits {}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::IntegerLimits -{}; -//template struct numeric_limits { static const bool -//is_signed = ((T)(-1)<0); }; -template<> struct numeric_limits : public -__jitify_detail::FloatLimits -{}; -template<> struct numeric_limits : public -__jitify_detail::DoubleLimits -{}; -} // namespace __jitify_limits_ns -namespace std { using namespace __jitify_limits_ns; } -using namespace __jitify_limits_ns; -)"; - -// TODO: This is highly incomplete -static const char* jitsafe_header_type_traits = R"( - #pragma once - #if __cplusplus >= 201103L - namespace __jitify_type_traits_ns { - - template struct enable_if {}; - template struct enable_if { typedef T type; }; - #if __cplusplus >= 201402L - template< bool B, class T = void > using enable_if_t = typename enable_if::type; - #endif - - struct true_type { - enum { value = true }; - operator bool() const { return true; } - }; - struct false_type { - enum { value = false }; - operator bool() const { return false; } - }; - - template struct is_floating_point : false_type {}; - template<> struct is_floating_point : true_type {}; - template<> struct is_floating_point : true_type {}; - template<> struct is_floating_point : true_type {}; - - template struct is_integral : false_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - template<> struct is_integral : true_type {}; - - template struct is_signed : false_type {}; - template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - 
template<> struct is_signed : true_type {}; - template<> struct is_signed : true_type {}; - - template struct is_unsigned : false_type {}; - template<> struct is_unsigned : true_type {}; - template<> struct is_unsigned : true_type {}; - template<> struct is_unsigned : true_type {}; - template<> struct is_unsigned : true_type {}; - template<> struct is_unsigned : true_type {}; - - template struct is_same : false_type {}; - template struct is_same : true_type {}; - - template struct is_array : false_type {}; - template struct is_array : true_type {}; - template struct is_array : true_type {}; - - //partial implementation only of is_function - template struct is_function : false_type { }; - template struct is_function : true_type {}; //regular - template struct is_function : true_type {}; // variadic - - template struct result_of; - template - struct result_of { - // TODO: This is a hack; a proper implem is quite complicated. - typedef typename F::result_type type; - }; - - template struct remove_reference { typedef T type; }; - template struct remove_reference { typedef T type; }; - template struct remove_reference { typedef T type; }; - #if __cplusplus >= 201402L - template< class T > using remove_reference_t = typename remove_reference::type; - #endif - - template struct remove_extent { typedef T type; }; - template struct remove_extent { typedef T type; }; - template struct remove_extent { typedef T type; }; - #if __cplusplus >= 201402L - template< class T > using remove_extent_t = typename remove_extent::type; - #endif - - template< class T > struct remove_const { typedef T type; }; - template< class T > struct remove_const { typedef T type; }; - template< class T > struct remove_volatile { typedef T type; }; - template< class T > struct remove_volatile { typedef T type; }; - template< class T > struct remove_cv { typedef typename remove_volatile::type>::type type; }; - #if __cplusplus >= 201402L - template< class T > using remove_cv_t = typename remove_cv::type; - template< class T > using remove_const_t = typename remove_const::type; - template< class T > using remove_volatile_t = typename remove_volatile::type; - #endif - - template struct conditional { typedef T type; }; - template struct conditional { typedef F type; }; - #if __cplusplus >= 201402L - template< bool B, class T, class F > using conditional_t = typename conditional::type; - #endif - - namespace __jitify_detail { - template< class T, bool is_function_type = false > struct add_pointer { using type = typename remove_reference::type*; }; - template< class T > struct add_pointer { using type = T; }; - template< class T, class... Args > struct add_pointer { using type = T(*)(Args...); }; - template< class T, class... 
Args > struct add_pointer { using type = T(*)(Args..., ...); }; - } - template< class T > struct add_pointer : __jitify_detail::add_pointer::value> {}; - #if __cplusplus >= 201402L - template< class T > using add_pointer_t = typename add_pointer::type; - #endif - - template< class T > struct decay { - private: - typedef typename remove_reference::type U; - public: - typedef typename conditional::value, typename remove_extent::type*, - typename conditional::value,typename add_pointer::type,typename remove_cv::type - >::type>::type type; - }; - #if __cplusplus >= 201402L - template< class T > using decay_t = typename decay::type; - #endif - - } // namespace __jtiify_type_traits_ns - namespace std { using namespace __jitify_type_traits_ns; } - using namespace __jitify_type_traits_ns; - #endif // c++11 -)"; - -// TODO: INT_FAST8_MAX et al. and a few other misc constants -static const char* jitsafe_header_stdint_h = - "#pragma once\n" - "#include \n" - "namespace __jitify_stdint_ns {\n" - "typedef signed char int8_t;\n" - "typedef signed short int16_t;\n" - "typedef signed int int32_t;\n" - "typedef signed long long int64_t;\n" - "typedef signed char int_fast8_t;\n" - "typedef signed short int_fast16_t;\n" - "typedef signed int int_fast32_t;\n" - "typedef signed long long int_fast64_t;\n" - "typedef signed char int_least8_t;\n" - "typedef signed short int_least16_t;\n" - "typedef signed int int_least32_t;\n" - "typedef signed long long int_least64_t;\n" - "typedef signed long long intmax_t;\n" - "typedef signed long intptr_t; //optional\n" - "typedef unsigned char uint8_t;\n" - "typedef unsigned short uint16_t;\n" - "typedef unsigned int uint32_t;\n" - "typedef unsigned long long uint64_t;\n" - "typedef unsigned char uint_fast8_t;\n" - "typedef unsigned short uint_fast16_t;\n" - "typedef unsigned int uint_fast32_t;\n" - "typedef unsigned long long uint_fast64_t;\n" - "typedef unsigned char uint_least8_t;\n" - "typedef unsigned short uint_least16_t;\n" - "typedef unsigned int uint_least32_t;\n" - "typedef unsigned long long uint_least64_t;\n" - "typedef unsigned long long uintmax_t;\n" - "typedef unsigned long uintptr_t; //optional\n" - "#define INT8_MIN SCHAR_MIN\n" - "#define INT16_MIN SHRT_MIN\n" - "#define INT32_MIN INT_MIN\n" - "#define INT64_MIN LLONG_MIN\n" - "#define INT8_MAX SCHAR_MAX\n" - "#define INT16_MAX SHRT_MAX\n" - "#define INT32_MAX INT_MAX\n" - "#define INT64_MAX LLONG_MAX\n" - "#define UINT8_MAX UCHAR_MAX\n" - "#define UINT16_MAX USHRT_MAX\n" - "#define UINT32_MAX UINT_MAX\n" - "#define UINT64_MAX ULLONG_MAX\n" - "#define INTPTR_MIN LONG_MIN\n" - "#define INTMAX_MIN LLONG_MIN\n" - "#define INTPTR_MAX LONG_MAX\n" - "#define INTMAX_MAX LLONG_MAX\n" - "#define UINTPTR_MAX ULONG_MAX\n" - "#define UINTMAX_MAX ULLONG_MAX\n" - "#define PTRDIFF_MIN INTPTR_MIN\n" - "#define PTRDIFF_MAX INTPTR_MAX\n" - "#define SIZE_MAX UINT64_MAX\n" - "} // namespace __jitify_stdint_ns\n" - "namespace std { using namespace __jitify_stdint_ns; }\n" - "using namespace __jitify_stdint_ns;\n"; - -// TODO: offsetof -static const char* jitsafe_header_stddef_h = - "#pragma once\n" - "#include \n" - "namespace __jitify_stddef_ns {\n" - "#if __cplusplus >= 201103L\n" - "typedef decltype(nullptr) nullptr_t;\n" - "#if defined(_MSC_VER)\n" - " typedef double max_align_t;\n" - "#elif defined(__APPLE__)\n" - " typedef long double max_align_t;\n" - "#else\n" - " // Define max_align_t to match the GCC definition.\n" - " typedef struct {\n" - " long long __jitify_max_align_nonce1\n" - " 
__attribute__((__aligned__(__alignof__(long long))));\n" - " long double __jitify_max_align_nonce2\n" - " __attribute__((__aligned__(__alignof__(long double))));\n" - " } max_align_t;\n" - "#endif\n" - "#endif // __cplusplus >= 201103L\n" - "#if __cplusplus >= 201703L\n" - "enum class byte : unsigned char {};\n" - "#endif // __cplusplus >= 201703L\n" - "} // namespace __jitify_stddef_ns\n" - "namespace std {\n" - " // NVRTC provides built-in definitions of ::size_t and ::ptrdiff_t.\n" - " using ::size_t;\n" - " using ::ptrdiff_t;\n" - " using namespace __jitify_stddef_ns;\n" - "} // namespace std\n" - "using namespace __jitify_stddef_ns;\n"; - -static const char* jitsafe_header_stdlib_h = - "#pragma once\n" - "#include \n"; -static const char* jitsafe_header_stdio_h = - "#pragma once\n" - "#include \n" - "#define FILE int\n" - "int fflush ( FILE * stream );\n" - "int fprintf ( FILE * stream, const char * format, ... );\n"; - -static const char* jitsafe_header_string_h = - "#pragma once\n" - "char* strcpy ( char * destination, const char * source );\n" - "int strcmp ( const char * str1, const char * str2 );\n" - "char* strerror( int errnum );\n"; - -static const char* jitsafe_header_cstring = - "#pragma once\n" - "\n" - "namespace __jitify_cstring_ns {\n" - "char* strcpy ( char * destination, const char * source );\n" - "int strcmp ( const char * str1, const char * str2 );\n" - "char* strerror( int errnum );\n" - "} // namespace __jitify_cstring_ns\n" - "namespace std { using namespace __jitify_cstring_ns; }\n" - "using namespace __jitify_cstring_ns;\n"; - -// HACK TESTING (WAR for cub) -static const char* jitsafe_header_iostream = - "#pragma once\n" - "#include \n" - "#include \n"; -// HACK TESTING (WAR for Thrust) -static const char* jitsafe_header_ostream = - "#pragma once\n" - "\n" - "namespace __jitify_ostream_ns {\n" - "template\n" // = std::char_traits - // >\n" - "struct basic_ostream {\n" - "};\n" - "typedef basic_ostream ostream;\n" - "ostream& endl(ostream& os);\n" - "ostream& operator<<( ostream&, ostream& (*f)( ostream& ) );\n" - "template< class CharT, class Traits > basic_ostream& endl( " - "basic_ostream& os );\n" - "template< class CharT, class Traits > basic_ostream& " - "operator<<( basic_ostream& os, const char* c );\n" - "#if __cplusplus >= 201103L\n" - "template< class CharT, class Traits, class T > basic_ostream& operator<<( basic_ostream&& os, const T& value );\n" - "#endif // __cplusplus >= 201103L\n" - "} // namespace __jitify_ostream_ns\n" - "namespace std { using namespace __jitify_ostream_ns; }\n" - "using namespace __jitify_ostream_ns;\n"; - -static const char* jitsafe_header_istream = - "#pragma once\n" - "\n" - "namespace __jitify_istream_ns {\n" - "template\n" // = std::char_traits - // >\n" - "struct basic_istream {\n" - "};\n" - "typedef basic_istream istream;\n" - "} // namespace __jitify_istream_ns\n" - "namespace std { using namespace __jitify_istream_ns; }\n" - "using namespace __jitify_istream_ns;\n"; - -static const char* jitsafe_header_sstream = - "#pragma once\n" - "#include \n" - "#include \n"; - -static const char* jitsafe_header_utility = - "#pragma once\n" - "namespace __jitify_utility_ns {\n" - "template\n" - "struct pair {\n" - " T1 first;\n" - " T2 second;\n" - " inline pair() {}\n" - " inline pair(T1 const& first_, T2 const& second_)\n" - " : first(first_), second(second_) {}\n" - " // TODO: Standard includes many more constructors...\n" - " // TODO: Comparison operators\n" - "};\n" - "template\n" - "pair make_pair(T1 const& first, T2 
const& second) {\n" - " return pair(first, second);\n" - "}\n" - "} // namespace __jitify_utility_ns\n" - "namespace std { using namespace __jitify_utility_ns; }\n" - "using namespace __jitify_utility_ns;\n"; - -// TODO: incomplete -static const char* jitsafe_header_vector = - "#pragma once\n" - "namespace __jitify_vector_ns {\n" - "template\n" // = std::allocator> \n" - "struct vector {\n" - "};\n" - "} // namespace __jitify_vector_ns\n" - "namespace std { using namespace __jitify_vector_ns; }\n" - "using namespace __jitify_vector_ns;\n"; - -// TODO: incomplete -static const char* jitsafe_header_string = - "#pragma once\n" - "namespace __jitify_string_ns {\n" - "template\n" - "struct basic_string {\n" - "basic_string();\n" - "basic_string( const CharT* s );\n" //, const Allocator& alloc = - // Allocator() );\n" - "const CharT* c_str() const;\n" - "bool empty() const;\n" - "void operator+=(const char *);\n" - "void operator+=(const basic_string &);\n" - "};\n" - "typedef basic_string string;\n" - "} // namespace __jitify_string_ns\n" - "namespace std { using namespace __jitify_string_ns; }\n" - "using namespace __jitify_string_ns;\n"; - -// TODO: incomplete -static const char* jitsafe_header_stdexcept = - "#pragma once\n" - "namespace __jitify_stdexcept_ns {\n" - "struct runtime_error {\n" - "explicit runtime_error( const std::string& what_arg );" - "explicit runtime_error( const char* what_arg );" - "virtual const char* what() const;\n" - "};\n" - "} // namespace __jitify_stdexcept_ns\n" - "namespace std { using namespace __jitify_stdexcept_ns; }\n" - "using namespace __jitify_stdexcept_ns;\n"; - -// TODO: incomplete -static const char* jitsafe_header_complex = - "#pragma once\n" - "namespace __jitify_complex_ns {\n" - "template\n" - "class complex {\n" - " T _real;\n" - " T _imag;\n" - "public:\n" - " complex() : _real(0), _imag(0) {}\n" - " complex(T const& real, T const& imag)\n" - " : _real(real), _imag(imag) {}\n" - " complex(T const& real)\n" - " : _real(real), _imag(static_cast(0)) {}\n" - " T const& real() const { return _real; }\n" - " T& real() { return _real; }\n" - " void real(const T &r) { _real = r; }\n" - " T const& imag() const { return _imag; }\n" - " T& imag() { return _imag; }\n" - " void imag(const T &i) { _imag = i; }\n" - " complex& operator+=(const complex z)\n" - " { _real += z.real(); _imag += z.imag(); return *this; }\n" - "};\n" - "template\n" - "complex operator*(const complex& lhs, const complex& rhs)\n" - " { return complex(lhs.real()*rhs.real()-lhs.imag()*rhs.imag(),\n" - " lhs.real()*rhs.imag()+lhs.imag()*rhs.real()); }\n" - "template\n" - "complex operator*(const complex& lhs, const T & rhs)\n" - " { return complexs(lhs.real()*rhs,lhs.imag()*rhs); }\n" - "template\n" - "complex operator*(const T& lhs, const complex& rhs)\n" - " { return complexs(rhs.real()*lhs,rhs.imag()*lhs); }\n" - "} // namespace __jitify_complex_ns\n" - "namespace std { using namespace __jitify_complex_ns; }\n" - "using namespace __jitify_complex_ns;\n"; - -// TODO: This is incomplete (missing binary and integer funcs, macros, -// constants, types) -static const char* jitsafe_header_math = - "#pragma once\n" - "namespace __jitify_math_ns {\n" - "#if __cplusplus >= 201103L\n" - "#define DEFINE_MATH_UNARY_FUNC_WRAPPER(f) \\\n" - " inline double f(double x) { return ::f(x); } \\\n" - " inline float f##f(float x) { return ::f(x); } \\\n" - " /*inline long double f##l(long double x) { return ::f(x); }*/ \\\n" - " inline float f(float x) { return ::f(x); } \\\n" - " /*inline long double 
f(long double x) { return ::f(x); }*/\n" - "#else\n" - "#define DEFINE_MATH_UNARY_FUNC_WRAPPER(f) \\\n" - " inline double f(double x) { return ::f(x); } \\\n" - " inline float f##f(float x) { return ::f(x); } \\\n" - " /*inline long double f##l(long double x) { return ::f(x); }*/\n" - "#endif\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(cos)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(sin)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(tan)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(acos)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(asin)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(atan)\n" - "template inline T atan2(T y, T x) { return ::atan2(y, x); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(cosh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(sinh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(tanh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(exp)\n" - "template inline T frexp(T x, int* exp) { return ::frexp(x, " - "exp); }\n" - "template inline T ldexp(T x, int exp) { return ::ldexp(x, " - "exp); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(log)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(log10)\n" - "template inline T modf(T x, T* intpart) { return ::modf(x, " - "intpart); }\n" - "template inline T pow(T x, T y) { return ::pow(x, y); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(sqrt)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(ceil)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(floor)\n" - "template inline T fmod(T n, T d) { return ::fmod(n, d); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(fabs)\n" - "template inline T abs(T x) { return ::abs(x); }\n" - "#if __cplusplus >= 201103L\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(acosh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(asinh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(atanh)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(exp2)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(expm1)\n" - "template inline int ilogb(T x) { return ::ilogb(x); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(log1p)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(log2)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(logb)\n" - "template inline T scalbn (T x, int n) { return ::scalbn(x, " - "n); }\n" - "template inline T scalbln(T x, long n) { return ::scalbn(x, " - "n); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(cbrt)\n" - "template inline T hypot(T x, T y) { return ::hypot(x, y); }\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(erf)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(erfc)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(tgamma)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(lgamma)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(trunc)\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(round)\n" - "template inline long lround(T x) { return ::lround(x); }\n" - "template inline long long llround(T x) { return ::llround(x); " - "}\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(rint)\n" - "template inline long lrint(T x) { return ::lrint(x); }\n" - "template inline long long llrint(T x) { return ::llrint(x); " - "}\n" - "DEFINE_MATH_UNARY_FUNC_WRAPPER(nearbyint)\n" - // TODO: remainder, remquo, copysign, nan, nextafter, nexttoward, fdim, - // fmax, fmin, fma - "#endif\n" - "#undef DEFINE_MATH_UNARY_FUNC_WRAPPER\n" - "} // namespace __jitify_math_ns\n" - "namespace std { using namespace __jitify_math_ns; }\n" - "#define M_PI 3.14159265358979323846\n" - // Note: Global namespace already includes CUDA math funcs - "//using namespace __jitify_math_ns;\n"; - -static const char* jitsafe_header_memory_h = R"( - #pragma once - #include - )"; - -// TODO: incomplete -static const char* jitsafe_header_mutex = R"( - #pragma once - #if __cplusplus >= 201103L - namespace __jitify_mutex_ns { - class mutex { - public: - void lock(); - bool try_lock(); - void unlock(); - }; - } // namespace __jitify_mutex_ns - namespace std { 
using namespace __jitify_mutex_ns; } - using namespace __jitify_mutex_ns; - #endif - )"; - -static const char* jitsafe_header_algorithm = R"( - #pragma once - #if __cplusplus >= 201103L - namespace __jitify_algorithm_ns { - - #if __cplusplus == 201103L - #define JITIFY_CXX14_CONSTEXPR - #else - #define JITIFY_CXX14_CONSTEXPR constexpr - #endif - - template JITIFY_CXX14_CONSTEXPR const T& max(const T& a, const T& b) - { - return (b > a) ? b : a; - } - template JITIFY_CXX14_CONSTEXPR const T& min(const T& a, const T& b) - { - return (b < a) ? b : a; - } - - } // namespace __jitify_algorithm_ns - namespace std { using namespace __jitify_algorithm_ns; } - using namespace __jitify_algorithm_ns; - #endif - )"; - -static const char* jitsafe_header_time_h = R"( - #pragma once - #define NULL 0 - #define CLOCKS_PER_SEC 1000000 - namespace __jitify_time_ns { - typedef long time_t; - struct tm { - int tm_sec; - int tm_min; - int tm_hour; - int tm_mday; - int tm_mon; - int tm_year; - int tm_wday; - int tm_yday; - int tm_isdst; - }; - #if __cplusplus >= 201703L - struct timespec { - time_t tv_sec; - long tv_nsec; - }; - #endif - } // namespace __jitify_time_ns - namespace std { - // NVRTC provides built-in definitions of ::size_t and ::clock_t. - using ::size_t; - using ::clock_t; - using namespace __jitify_time_ns; - } - using namespace __jitify_time_ns; - )"; - -// WAR: These need to be pre-included as a workaround for NVRTC implicitly using -// /usr/include as an include path. The other built-in headers will be included -// lazily as needed. -static const char* preinclude_jitsafe_header_names[] = { - "jitify_preinclude.h", - "limits.h", - "math.h", - "memory.h", - "stdint.h", - "stdlib.h", - "stdio.h", - "string.h", - "time.h", -}; - -template -int array_size(T (&)[N]) { - return N; -} -const int preinclude_jitsafe_headers_count = - array_size(preinclude_jitsafe_header_names); - -static const std::map& get_jitsafe_headers_map() { - static const std::map jitsafe_headers_map = { - {"jitify_preinclude.h", jitsafe_header_preinclude_h}, - {"float.h", jitsafe_header_float_h}, - {"cfloat", jitsafe_header_float_h}, - {"limits.h", jitsafe_header_limits_h}, - {"climits", jitsafe_header_limits_h}, - {"stdint.h", jitsafe_header_stdint_h}, - {"cstdint", jitsafe_header_stdint_h}, - {"stddef.h", jitsafe_header_stddef_h}, - {"cstddef", jitsafe_header_stddef_h}, - {"stdlib.h", jitsafe_header_stdlib_h}, - {"cstdlib", jitsafe_header_stdlib_h}, - {"stdio.h", jitsafe_header_stdio_h}, - {"cstdio", jitsafe_header_stdio_h}, - {"string.h", jitsafe_header_string_h}, - {"cstring", jitsafe_header_cstring}, - {"iterator", jitsafe_header_iterator}, - {"limits", jitsafe_header_limits}, - {"type_traits", jitsafe_header_type_traits}, - {"utility", jitsafe_header_utility}, - {"math.h", jitsafe_header_math}, - {"cmath", jitsafe_header_math}, - {"memory.h", jitsafe_header_memory_h}, - {"complex", jitsafe_header_complex}, - {"iostream", jitsafe_header_iostream}, - {"ostream", jitsafe_header_ostream}, - {"istream", jitsafe_header_istream}, - {"sstream", jitsafe_header_sstream}, - {"vector", jitsafe_header_vector}, - {"string", jitsafe_header_string}, - {"stdexcept", jitsafe_header_stdexcept}, - {"mutex", jitsafe_header_mutex}, - {"algorithm", jitsafe_header_algorithm}, - {"time.h", jitsafe_header_time_h}, - {"ctime", jitsafe_header_time_h}, - }; - return jitsafe_headers_map; -} - -inline void add_options_from_env(std::vector& options) { - // Add options from environment variable - const char* env_options = std::getenv("JITIFY_OPTIONS"); 
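-  // JITIFY_OPTIONS holds a whitespace-separated option list, e.g.
-  //   JITIFY_OPTIONS="-I/opt/myproj/include -std=c++14"
-  // (a hypothetical layout); the loop below tokenizes it into `options`.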
- if (env_options) { - std::stringstream ss; - ss << env_options; - std::string opt; - while (!(ss >> opt).fail()) { - options.push_back(opt); - } - } - // Add options from JITIFY_OPTIONS macro -#ifdef JITIFY_OPTIONS -#define JITIFY_TOSTRING_IMPL(x) #x -#define JITIFY_TOSTRING(x) JITIFY_TOSTRING_IMPL(x) - std::stringstream ss; - ss << JITIFY_TOSTRING(JITIFY_OPTIONS); - std::string opt; - while (!(ss >> opt).fail()) { - options.push_back(opt); - } -#undef JITIFY_TOSTRING -#undef JITIFY_TOSTRING_IMPL -#endif // JITIFY_OPTIONS -} - -inline void detect_and_add_cuda_arch(std::vector& options) { - for (int i = 0; i < (int)options.size(); ++i) { - // Note that this will also match the middle of "--gpu-architecture". - if (options[i].find("-arch") != std::string::npos) { - // Arch already specified in options - return; - } - } - // Use the compute capability of the current device - // TODO: Check these API calls for errors - cudaError_t status; - int device; - status = cudaGetDevice(&device); - if (status != cudaSuccess) { - throw std::runtime_error( - std::string( - "Failed to detect GPU architecture: cudaGetDevice failed: ") + - cudaGetErrorString(status)); - } - int cc_major; - cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device); - int cc_minor; - cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device); - int cc = cc_major * 10 + cc_minor; - // Note: We must limit the architecture to the max supported by the current - // version of NVRTC, otherwise newer hardware will cause errors - // on older versions of CUDA. - // TODO: It would be better to detect this somehow, rather than hard-coding it - - // Tegra chips do not have forwards compatibility so we need to special case - // them. - bool is_tegra = ((cc_major == 3 && cc_minor == 2) || // Logan - (cc_major == 5 && cc_minor == 3) || // Erista - (cc_major == 6 && cc_minor == 2) || // Parker - (cc_major == 7 && cc_minor == 2)); // Xavier - if (!is_tegra) { - // ensure that future CUDA versions just work (even if suboptimal) - const int cuda_major = std::min(10, CUDA_VERSION / 1000); - // clang-format off - switch (cuda_major) { - case 10: cc = std::min(cc, 75); break; // Turing - case 9: cc = std::min(cc, 70); break; // Volta - case 8: cc = std::min(cc, 61); break; // Pascal - case 7: cc = std::min(cc, 52); break; // Maxwell - default: - throw std::runtime_error("Unexpected CUDA major version " + - std::to_string(cuda_major)); - } - // clang-format on - } - - std::stringstream ss; - ss << cc; - options.push_back("-arch=compute_" + ss.str()); -} - -inline void detect_and_add_cxx11_flag(std::vector& options) { - // Reverse loop so we can erase on the fly. - for (int i = (int)options.size() - 1; i >= 0; --i) { - if (options[i].find("-std=c++98") != std::string::npos) { - // NVRTC doesn't support specifying c++98 explicitly, so we remove it. - options.erase(options.begin() + i); - return; - } else if (options[i].find("-std") != std::string::npos) { - // Some other standard was explicitly specified, don't change anything. - return; - } - } - // Jitify must be compiled with C++11 support, so we default to enabling it - // for the JIT-compiled code too. 
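-  // Net effect: an explicit "-std=c++98" is dropped (NVRTC cannot honor
-  // it), any other explicit "-std=..." is left alone, and with no "-std"
-  // at all the line below defaults the JIT compile to C++11.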
-  options.push_back("-std=c++11");
-}
-
-inline void split_compiler_and_linker_options(
-    std::vector<std::string> options,
-    std::vector<std::string>* compiler_options,
-    std::vector<std::string>* linker_files,
-    std::vector<std::string>* linker_paths) {
-  for (int i = 0; i < (int)options.size(); ++i) {
-    std::string opt = options[i];
-    std::string flag = opt.substr(0, 2);
-    std::string value = opt.substr(2);
-    if (flag == "-l") {
-      linker_files->push_back(value);
-    } else if (flag == "-L") {
-      linker_paths->push_back(value);
-    } else {
-      compiler_options->push_back(opt);
-    }
-  }
-}
-
-inline bool pop_remove_unused_globals_flag(std::vector<std::string>* options) {
-  auto it = std::remove_if(
-      options->begin(), options->end(), [](const std::string& opt) {
-        return opt.find("-remove-unused-globals") != std::string::npos;
-      });
-  if (it != options->end()) {
-    options->resize(it - options->begin());
-    return true;
-  }
-  return false;
-}
-
-inline std::string ptx_parse_decl_name(const std::string& line) {
-  size_t name_end = line.find_first_of("[;");
-  if (name_end == std::string::npos) {
-    throw std::runtime_error(
-        "Failed to parse .global/.const declaration in PTX: expected a "
-        "semicolon");
-  }
-  size_t name_start_minus1 = line.find_last_of(" \t", name_end);
-  if (name_start_minus1 == std::string::npos) {
-    throw std::runtime_error(
-        "Failed to parse .global/.const declaration in PTX: expected "
-        "whitespace");
-  }
-  size_t name_start = name_start_minus1 + 1;
-  std::string name = line.substr(name_start, name_end - name_start);
-  return name;
-}
-
-inline void ptx_remove_unused_globals(std::string* ptx) {
-  std::istringstream iss(*ptx);
-  std::vector<std::string> lines;
-  std::unordered_map<size_t, std::string> line_num_to_global_name;
-  std::unordered_set<std::string> name_set;
-  for (std::string line; std::getline(iss, line);) {
-    size_t line_num = lines.size();
-    lines.push_back(line);
-    auto terms = split_string(line);
-    if (terms.size() <= 1) continue;  // Ignore lines with no arguments
-    if (terms[0].substr(0, 2) == "//") continue;  // Ignore comment lines
-    if (terms[0].substr(0, 7) == ".global" ||
-        terms[0].substr(0, 6) == ".const") {
-      line_num_to_global_name.emplace(line_num, ptx_parse_decl_name(line));
-      continue;
-    }
-    if (terms[0][0] == '.') continue;  // Ignore .version, .reg, .param etc.
-    // Note: The first term will always be an instruction name; starting at 1
-    // also allows unchecked inspection of the previous term.
-    for (int i = 1; i < (int)terms.size(); ++i) {
-      if (terms[i].substr(0, 2) == "//") break;  // Ignore comments
-      // Note: The characters '.' and '%' are not treated as delimiters.
-      const char* token_delims = " \t()[]{},;+-*/~&|^?:=!<>\"'\\";
-      for (auto token : split_string(terms[i], -1, token_delims)) {
-        if (  // Ignore non-names
-            !(std::isalpha(token[0]) || token[0] == '_' || token[0] == '$') ||
-            token.find('.') != std::string::npos ||
-            // Ignore variable/parameter declarations
-            terms[i - 1][0] == '.' ||
-            // Ignore branch instructions
-            (token == "bra" && terms[i - 1][0] == '@') ||
-            // Ignore branch labels
-            (token.substr(0, 2) == "BB" &&
-             terms[i - 1].substr(0, 3) == "bra")) {
-          continue;
-        }
-        name_set.insert(token);
-      }
-    }
-  }
-  std::ostringstream oss;
-  for (size_t line_num = 0; line_num < lines.size(); ++line_num) {
-    auto it = line_num_to_global_name.find(line_num);
-    if (it != line_num_to_global_name.end()) {
-      const std::string& name = it->second;
-      if (!name_set.count(name)) {
-        continue;  // Remove unused .global declaration.
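#if 0  // For reference only: how split_compiler_and_linker_options (defined
       // above) partitions a mixed option list. Values are illustrative.
std::vector<std::string> opts = {"-lcudadevrt", "-L/usr/local/cuda/lib64",
                                 "-std=c++14"};
std::vector<std::string> compiler, lfiles, lpaths;
detail::split_compiler_and_linker_options(opts, &compiler, &lfiles, &lpaths);
// Result: compiler == {"-std=c++14"}, lfiles == {"cudadevrt"},
//         lpaths == {"/usr/local/cuda/lib64"} -- "-l"/"-L" prefixes stripped.
#endif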
-      }
-    }
-    oss << lines[line_num] << '\n';
-  }
-  *ptx = oss.str();
-}
-
-inline nvrtcResult compile_kernel(std::string program_name,
-                                  std::map<std::string, std::string> sources,
-                                  std::vector<std::string> options,
-                                  std::string instantiation = "",
-                                  std::string* log = 0, std::string* ptx = 0,
-                                  std::string* mangled_instantiation = 0) {
-  std::string program_source = sources[program_name];
-  // Build arrays of header names and sources
-  std::vector<const char*> header_names_c;
-  std::vector<const char*> header_sources_c;
-  int num_headers = (int)(sources.size() - 1);
-  header_names_c.reserve(num_headers);
-  header_sources_c.reserve(num_headers);
-  typedef std::map<std::string, std::string> source_map;
-  for (source_map::const_iterator iter = sources.begin(); iter != sources.end();
-       ++iter) {
-    std::string const& name = iter->first;
-    std::string const& code = iter->second;
-    if (name == program_name) {
-      continue;
-    }
-    header_names_c.push_back(name.c_str());
-    header_sources_c.push_back(code.c_str());
-  }
-
-  // TODO: This WAR is expected to be unnecessary as of CUDA > 10.2.
-  bool should_remove_unused_globals =
-      detail::pop_remove_unused_globals_flag(&options);
-
-  std::vector<const char*> options_c(options.size() + 2);
-  options_c[0] = "--device-as-default-execution-space";
-  options_c[1] = "--pre-include=jitify_preinclude.h";
-  for (int i = 0; i < (int)options.size(); ++i) {
-    options_c[i + 2] = options[i].c_str();
-  }
-
-#if CUDA_VERSION < 8000
-  std::string inst_dummy;
-  if (!instantiation.empty()) {
-    // WAR for no nvrtcAddNameExpression before CUDA 8.0
-    // Force template instantiation by adding dummy reference to kernel
-    inst_dummy = "__jitify_instantiation";
-    program_source +=
-        "\nvoid* " + inst_dummy + " = (void*)" + instantiation + ";\n";
-  }
-#endif
-
-#define CHECK_NVRTC(call)       \
-  do {                          \
-    nvrtcResult ret = call;     \
-    if (ret != NVRTC_SUCCESS) { \
-      return ret;               \
-    }                           \
-  } while (0)
-
-  nvrtcProgram nvrtc_program;
-  CHECK_NVRTC(nvrtcCreateProgram(
-      &nvrtc_program, program_source.c_str(), program_name.c_str(), num_headers,
-      header_sources_c.data(), header_names_c.data()));
-
-#if CUDA_VERSION >= 8000
-  if (!instantiation.empty()) {
-    CHECK_NVRTC(nvrtcAddNameExpression(nvrtc_program, instantiation.c_str()));
-  }
-#endif
-
-  nvrtcResult ret = nvrtcCompileProgram(nvrtc_program, (int)options_c.size(),
-                                        options_c.data());
-  if (log) {
-    size_t logsize;
-    CHECK_NVRTC(nvrtcGetProgramLogSize(nvrtc_program, &logsize));
-    std::vector<char> vlog(logsize, 0);
-    CHECK_NVRTC(nvrtcGetProgramLog(nvrtc_program, vlog.data()));
-    log->assign(vlog.data(), logsize);
-  }
-  if (ret != NVRTC_SUCCESS) {
-    return ret;
-  }
-
-  if (ptx) {
-    size_t ptxsize;
-    CHECK_NVRTC(nvrtcGetPTXSize(nvrtc_program, &ptxsize));
-    std::vector<char> vptx(ptxsize);
-    CHECK_NVRTC(nvrtcGetPTX(nvrtc_program, vptx.data()));
-    ptx->assign(vptx.data(), ptxsize);
-    if (should_remove_unused_globals) {
-      detail::ptx_remove_unused_globals(ptx);
-    }
-  }
-
-  if (!instantiation.empty() && mangled_instantiation) {
-#if CUDA_VERSION >= 8000
-    const char* mangled_instantiation_cstr;
-    // Note: The returned string pointer becomes invalid after
-    //       nvrtcDestroyProgram has been called, so we save it.
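#if 0  // For reference only: the bare NVRTC round trip that compile_kernel()
       // wraps, as a minimal sketch. Every call returns an nvrtcResult that
       // real code should check; the arch option is an assumption.
inline std::string compile_to_ptx_sketch(const char* source) {
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, source, "demo.cu", 0, nullptr, nullptr);
  const char* opts[] = {"--gpu-architecture=compute_70"};
  nvrtcCompileProgram(prog, 1, opts);
  size_t ptx_size = 0;
  nvrtcGetPTXSize(prog, &ptx_size);
  std::vector<char> ptx(ptx_size);
  nvrtcGetPTX(prog, ptx.data());
  nvrtcDestroyProgram(&prog);
  return std::string(ptx.data(), ptx_size);
}
#endif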
-    CHECK_NVRTC(nvrtcGetLoweredName(nvrtc_program, instantiation.c_str(),
-                                    &mangled_instantiation_cstr));
-    *mangled_instantiation = mangled_instantiation_cstr;
-#else
-    // Extract mangled kernel template instantiation from PTX
-    inst_dummy += " = ";  // Note: This must match how the PTX is generated
-    int mi_beg = ptx->find(inst_dummy) + inst_dummy.size();
-    int mi_end = ptx->find(";", mi_beg);
-    *mangled_instantiation = ptx->substr(mi_beg, mi_end - mi_beg);
-#endif
-  }
-
-  CHECK_NVRTC(nvrtcDestroyProgram(&nvrtc_program));
-#undef CHECK_NVRTC
-  return NVRTC_SUCCESS;
-}
-
-inline void load_program(std::string const& cuda_source,
-                         std::vector<std::string> const& headers,
-                         file_callback_type file_callback,
-                         std::vector<std::string>* include_paths,
-                         std::map<std::string, std::string>* program_sources,
-                         std::vector<std::string>* program_options,
-                         std::string* program_name) {
-  // Extract include paths from compile options
-  std::vector<std::string>::iterator iter = program_options->begin();
-  while (iter != program_options->end()) {
-    std::string const& opt = *iter;
-    if (opt.substr(0, 2) == "-I") {
-      include_paths->push_back(opt.substr(2));
-      iter = program_options->erase(iter);
-    } else {
-      ++iter;
-    }
-  }
-
-  // Load program source
-  if (!detail::load_source(cuda_source, *program_sources, "", *include_paths,
-                           file_callback)) {
-    throw std::runtime_error("Source not found: " + cuda_source);
-  }
-  *program_name = program_sources->begin()->first;
-
-  // Maps header include names to their full file paths.
-  std::map<std::string, std::string> header_fullpaths;
-
-  // Load header sources
-  for (std::string const& header : headers) {
-    if (!detail::load_source(header, *program_sources, "", *include_paths,
-                             file_callback, &header_fullpaths)) {
-      // **TODO: Deal with source not found
-      throw std::runtime_error("Source not found: " + header);
-    }
-  }
-
-#if JITIFY_PRINT_SOURCE
-  std::string& program_source = (*program_sources)[*program_name];
-  std::cout << "---------------------------------------" << std::endl;
-  std::cout << "--- Source of " << *program_name << " ---" << std::endl;
-  std::cout << "---------------------------------------" << std::endl;
-  detail::print_with_line_numbers(program_source);
-  std::cout << "---------------------------------------" << std::endl;
-#endif
-
-  std::vector<std::string> compiler_options, linker_files, linker_paths;
-  detail::split_compiler_and_linker_options(*program_options, &compiler_options,
-                                            &linker_files, &linker_paths);
-
-  // If no arch is specified at this point we use whatever the current
-  // context is. This ensures we pick up the correct internal headers
-  // for arch-dependent compilation, e.g., some intrinsics are only
-  // present for specific architectures.
-  detail::detect_and_add_cuda_arch(compiler_options);
-  detail::detect_and_add_cxx11_flag(compiler_options);
-
-  // Iteratively try to compile the sources, and use the resulting errors to
-  // identify missing headers.
-  std::string log;
-  nvrtcResult ret;
-  while ((ret = detail::compile_kernel(*program_name, *program_sources,
-                                       compiler_options, "", &log)) ==
-         NVRTC_ERROR_COMPILATION) {
-    std::string include_name;
-    std::string include_parent;
-    int line_num = 0;
-    if (!detail::extract_include_info_from_compile_error(
-            log, include_name, include_parent, line_num)) {
-#if JITIFY_PRINT_LOG
-      detail::print_compile_log(*program_name, log);
-#endif
-      // There was a non include-related compilation error
-      // TODO: How to handle error?
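#if 0  // For reference only: the retry loop below keys off the shape of
       // NVRTC's "file not found" diagnostics, roughly of the form
       //   parent.cu(3): catastrophic error: cannot open source file "hdr.h"
       // A hedged sketch of that extraction (the real
       // extract_include_info_from_compile_error handles more variants);
       // assumes <cstdlib> for std::atoi.
inline bool parse_missing_include_sketch(const std::string& log,
                                         std::string* name,
                                         std::string* parent, int* line) {
  const std::string tag = "cannot open source file \"";
  size_t q1 = log.find(tag);
  if (q1 == std::string::npos) return false;
  q1 += tag.size();
  size_t q2 = log.find('"', q1);
  *name = log.substr(q1, q2 - q1);             // e.g. "hdr.h"
  size_t paren = log.find('(');
  *parent = log.substr(0, paren);              // e.g. "parent.cu"
  *line = std::atoi(log.c_str() + paren + 1);  // e.g. 3
  return true;
}
#endif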
-      throw std::runtime_error("Runtime compilation failed");
-    }
-
-    bool is_included_with_quotes = false;
-    if (program_sources->count(include_parent)) {
-      const std::string& parent_source = (*program_sources)[include_parent];
-      is_included_with_quotes =
-          is_include_directive_with_quotes(parent_source, line_num);
-    }
-
-    // Try to load the new header
-    // Note: This fullpath lookup is needed because the compiler error
-    // messages have the include name of the header instead of its full path.
-    std::string include_parent_fullpath = header_fullpaths[include_parent];
-    std::string include_path = detail::path_base(include_parent_fullpath);
-    if (detail::load_source(include_name, *program_sources, include_path,
-                            *include_paths, file_callback, &header_fullpaths,
-                            is_included_with_quotes)) {
-#if JITIFY_PRINT_HEADER_PATHS
-      std::cout << "Found #include " << include_name << " from "
-                << include_parent << ":" << line_num << " ["
-                << include_parent_fullpath << "]"
-                << " at:\n " << header_fullpaths[include_name] << std::endl;
-#endif
-    } else {  // Failed to find header file.
-      // Comment-out the include line and print a warning
-      if (!program_sources->count(include_parent)) {
-        // ***TODO: Unless there's another mechanism (e.g., potentially
-        //          the parent path vs. filename problem), getting
-        //          here means include_parent was found automatically
-        //          in a system include path.
-        //          We need a WAR to zap it from *its parent*.
-
-        typedef std::map<std::string, std::string> source_map;
-        for (source_map::const_iterator it = program_sources->begin();
-             it != program_sources->end(); ++it) {
-          std::cout << "  " << it->first << std::endl;
-        }
-        throw std::out_of_range(include_parent +
-                                " not in loaded sources!"
-                                " This may be due to a header being loaded by"
-                                " NVRTC without Jitify's knowledge.");
-      }
-      std::string& parent_source = (*program_sources)[include_parent];
-      parent_source = detail::comment_out_code_line(line_num, parent_source);
-#if JITIFY_PRINT_LOG
-      std::cout << include_parent << "(" << line_num
-                << "): warning: " << include_name << ": [jitify] File not found"
-                << std::endl;
-#endif
-    }
-  }
-  if (ret != NVRTC_SUCCESS) {
-#if JITIFY_PRINT_LOG
-    if (ret == NVRTC_ERROR_INVALID_OPTION) {
-      std::cout << "Compiler options: ";
-      for (int i = 0; i < (int)compiler_options.size(); ++i) {
-        std::cout << compiler_options[i] << " ";
-      }
-      std::cout << std::endl;
-    }
-#endif
-    throw std::runtime_error(std::string("NVRTC error: ") +
-                             nvrtcGetErrorString(ret));
-  }
-}
-
-inline void instantiate_kernel(
-    std::string const& program_name,
-    std::map<std::string, std::string> const& program_sources,
-    std::string const& instantiation, std::vector<std::string> const& options,
-    std::string* log, std::string* ptx, std::string* mangled_instantiation,
-    std::vector<std::string>* linker_files,
-    std::vector<std::string>* linker_paths) {
-  std::vector<std::string> compiler_options;
-  detail::split_compiler_and_linker_options(options, &compiler_options,
-                                            linker_files, linker_paths);
-
-  std::cout << "About to compile kernel" << std::endl;
-  nvrtcResult ret =
-      detail::compile_kernel(program_name, program_sources, compiler_options,
-                             instantiation, log, ptx, mangled_instantiation);
-#if JITIFY_PRINT_LOG
-  if (log->size() > 1) {
-    detail::print_compile_log(program_name, *log);
-  }
-#endif
-  if (ret != NVRTC_SUCCESS) {
-    throw std::runtime_error(std::string("NVRTC error: ") +
-                             nvrtcGetErrorString(ret));
-  }
-  std::cout << "done compiling" << std::endl;
-
-#if JITIFY_PRINT_PTX
-  std::cout << "---------------------------------------" << std::endl;
-  std::cout << *mangled_instantiation << std::endl;
-  std::cout << "---------------------------------------" << std::endl;
"---------------------------------------" << std::endl; - std::cout << "--- PTX for " << mangled_instantiation << " in " << program_name - << " ---" << std::endl; - std::cout << "---------------------------------------" << std::endl; - std::cout << *ptx << std::endl; - std::cout << "---------------------------------------" << std::endl; -#endif -} - -inline void get_1d_max_occupancy(CUfunction func, - CUoccupancyB2DSize smem_callback, - unsigned int* smem, int max_block_size, - unsigned int flags, int* grid, int* block) { - if (!func) { - throw std::runtime_error( - "Kernel pointer is NULL; you may need to define JITIFY_THREAD_SAFE " - "1"); - } - CUresult res = cuOccupancyMaxPotentialBlockSizeWithFlags( - grid, block, func, smem_callback, *smem, max_block_size, flags); - if (res != CUDA_SUCCESS) { - const char* msg; - cuGetErrorName(res, &msg); - throw std::runtime_error(msg); - } - if (smem_callback) { - *smem = (unsigned int)smem_callback(*block); - } -} - -} // namespace detail - -//! \endcond - -class KernelInstantiation; -class Kernel; -class Program; -class JitCache; - -struct ProgramConfig { - std::vector options; - std::vector include_paths; - std::string name; - typedef std::map source_map; - source_map sources; -}; - -class JitCache_impl { - friend class Program_impl; - friend class KernelInstantiation_impl; - friend class KernelLauncher_impl; - typedef uint64_t key_type; - jitify::ObjectCache _kernel_cache; - jitify::ObjectCache _program_config_cache; - std::vector _options; -#if JITIFY_THREAD_SAFE - std::mutex _kernel_cache_mutex; - std::mutex _program_cache_mutex; -#endif - public: - inline JitCache_impl(size_t cache_size) - : _kernel_cache(cache_size), _program_config_cache(cache_size) { - detail::add_options_from_env(_options); - - // Bootstrap the cuda context to avoid errors - cudaFree(0); - } -}; - -class Program_impl { - // A friendly class - friend class Kernel_impl; - friend class KernelLauncher_impl; - friend class KernelInstantiation_impl; - // TODO: This can become invalid if JitCache is destroyed before the - // Program object is. However, this can't happen if JitCache - // instances are static. 
-  JitCache_impl& _cache;
-  uint64_t _hash;
-  ProgramConfig* _config;
-  void load_sources(std::string source, std::vector<std::string> headers,
-                    std::vector<std::string> options,
-                    file_callback_type file_callback);
-
- public:
-  inline Program_impl(JitCache_impl& cache, std::string source,
-                      jitify::detail::vector<std::string> headers = 0,
-                      jitify::detail::vector<std::string> options = 0,
-                      file_callback_type file_callback = 0);
-  inline Program_impl(Program_impl const&) = default;
-  inline Program_impl(Program_impl&&) = default;
-  inline std::vector<std::string> const& options() const {
-    return _config->options;
-  }
-  inline std::string const& name() const { return _config->name; }
-  inline ProgramConfig::source_map const& sources() const {
-    return _config->sources;
-  }
-  inline std::vector<std::string> const& include_paths() const {
-    return _config->include_paths;
-  }
-};
-
-class Kernel_impl {
-  friend class KernelLauncher_impl;
-  friend class KernelInstantiation_impl;
-  Program_impl _program;
-  std::string _name;
-  std::vector<std::string> _options;
-  uint64_t _hash;
-
- public:
-  inline Kernel_impl(Program_impl const& program, std::string name,
-                     jitify::detail::vector<std::string> options = 0);
-  inline Kernel_impl(Kernel_impl const&) = default;
-  inline Kernel_impl(Kernel_impl&&) = default;
-};
-
-class KernelInstantiation_impl {
-  friend class KernelLauncher_impl;
-  Kernel_impl _kernel;
-  uint64_t _hash;
-  std::string _template_inst;
-  std::vector<std::string> _options;
-  detail::CUDAKernel* _cuda_kernel;
-  inline void print() const;
-  void build_kernel();
-
- public:
-  inline KernelInstantiation_impl(
-      Kernel_impl const& kernel, std::vector<std::string> const& template_args);
-  inline KernelInstantiation_impl(KernelInstantiation_impl const&) = default;
-  inline KernelInstantiation_impl(KernelInstantiation_impl&&) = default;
-  detail::CUDAKernel const& cuda_kernel() const { return *_cuda_kernel; }
-};
-
-class KernelLauncher_impl {
-  KernelInstantiation_impl _kernel_inst;
-  dim3 _grid;
-  dim3 _block;
-  unsigned int _smem;
-  cudaStream_t _stream;
-
- public:
-  inline KernelLauncher_impl(KernelInstantiation_impl const& kernel_inst,
-                             dim3 grid, dim3 block, unsigned int smem = 0,
-                             cudaStream_t stream = 0)
-      : _kernel_inst(kernel_inst),
-        _grid(grid),
-        _block(block),
-        _smem(smem),
-        _stream(stream) {}
-  inline KernelLauncher_impl(KernelLauncher_impl const&) = default;
-  inline KernelLauncher_impl(KernelLauncher_impl&&) = default;
-  inline CUresult launch(
-      jitify::detail::vector<void*> arg_ptrs,
-      jitify::detail::vector<std::string> arg_types = 0) const;
-};
-
-/*! An object representing a configured and instantiated kernel ready
- *  for launching.
- */
-class KernelLauncher {
-  std::unique_ptr<KernelLauncher_impl> _impl;
-
- public:
-  inline KernelLauncher(KernelInstantiation const& kernel_inst, dim3 grid,
-                        dim3 block, unsigned int smem = 0,
-                        cudaStream_t stream = 0);
-
-  // Note: It's important that there is no implicit conversion required
-  //       for arg_ptrs, because otherwise the parameter pack version
-  //       below gets called instead (probably resulting in a segfault).
-  /*! Launch the kernel.
-   *
-   *  \param arg_ptrs  A vector of pointers to each function argument for the
-   *    kernel.
-   *  \param arg_types  A vector of function argument types represented
-   *    as code-strings. This parameter is optional and is only used to print
-   *    out the function signature.
-   */
-  inline CUresult launch(
-      std::vector<void*> arg_ptrs = std::vector<void*>(),
-      jitify::detail::vector<std::string> arg_types = 0) const {
-    return _impl->launch(arg_ptrs, arg_types);
-  }
-  // Regular function call syntax
-  /*! Launch the kernel.
-   *
-   *  \see launch
-   */
-  template <typename... ArgTypes>
-  inline CUresult operator()(ArgTypes... args) const {
-    return this->launch(args...);
-  }
-  /*! Launch the kernel.
-   *
-   *  \param args Function arguments for the kernel.
-   */
-  template <typename... ArgTypes>
-  inline CUresult launch(ArgTypes... args) const {
-    return this->launch(std::vector<void*>({(void*)&args...}),
-                        {reflection::reflect<ArgTypes>()...});
-  }
-};
-
-/*! An object representing a kernel instantiation made up of a Kernel and
- *  template arguments.
- */
-class KernelInstantiation {
-  friend class KernelLauncher;
-  std::unique_ptr<KernelInstantiation_impl> _impl;
-
- public:
-  inline KernelInstantiation(Kernel const& kernel,
-                             std::vector<std::string> const& template_args);
-
-  /*! Implicit conversion to the underlying CUfunction object.
-   *
-   * \note This allows use of CUDA APIs like
-   *   cuOccupancyMaxActiveBlocksPerMultiprocessor.
-   */
-  inline operator CUfunction() const { return _impl->cuda_kernel(); }
-
-  /*! Configure the kernel launch.
-   *
-   *  \see configure
-   */
-  inline KernelLauncher operator()(dim3 grid, dim3 block, unsigned int smem = 0,
-                                   cudaStream_t stream = 0) const {
-    return this->configure(grid, block, smem, stream);
-  }
-  /*! Configure the kernel launch.
-   *
-   *  \param grid   The thread grid dimensions for the launch.
-   *  \param block  The thread block dimensions for the launch.
-   *  \param smem   The amount of shared memory to dynamically allocate, in
-   *    bytes.
-   *  \param stream The CUDA stream to launch the kernel in.
-   */
-  inline KernelLauncher configure(dim3 grid, dim3 block, unsigned int smem = 0,
-                                  cudaStream_t stream = 0) const {
-    return KernelLauncher(*this, grid, block, smem, stream);
-  }
-  /*! Configure the kernel launch with a 1-dimensional block and grid chosen
-   *  automatically to maximise occupancy.
-   *
-   *  \param max_block_size  The upper limit on the block size, or 0 for no
-   *    limit.
-   *  \param smem  The amount of shared memory to dynamically allocate, in bytes.
-   *  \param smem_callback  A function returning smem for a given block size
-   *    (overrides \p smem).
-   *  \param stream The CUDA stream to launch the kernel in.
-   *  \param flags  The flags to pass to
-   *    cuOccupancyMaxPotentialBlockSizeWithFlags.
-   */
-  inline KernelLauncher configure_1d_max_occupancy(
-      int max_block_size = 0, unsigned int smem = 0,
-      CUoccupancyB2DSize smem_callback = 0, cudaStream_t stream = 0,
-      unsigned int flags = 0) const {
-    int grid;
-    int block;
-    CUfunction func = _impl->cuda_kernel();
-    detail::get_1d_max_occupancy(func, smem_callback, &smem, max_block_size,
-                                 flags, &grid, &block);
-    return this->configure(grid, block, smem, stream);
-  }
-
-  /*
-   * \deprecated Use \p get_global_ptr instead.
-   */
-  inline CUdeviceptr get_constant_ptr(const char* name,
-                                      size_t* size = nullptr) const {
-    return get_global_ptr(name, size);
-  }
-
-  /*
-   * Get a device pointer to a global __constant__ or __device__ variable using
-   * its un-mangled name. If provided, *size is set to the size of the variable
-   * in bytes.
-   */
-  inline CUdeviceptr get_global_ptr(const char* name,
-                                    size_t* size = nullptr) const {
-    return _impl->cuda_kernel().get_global_ptr(name, size);
-  }
-
-  /*
-   * Copy data from a global __constant__ or __device__ array to the host using
-   * its un-mangled name.
-   */
-  template <typename T>
-  inline CUresult get_global_array(const char* name, T* data, size_t count,
-                                   CUstream stream = 0) const {
-    return _impl->cuda_kernel().get_global_data(name, data, count, stream);
-  }
-
-  /*
-   * Copy a value from a global __constant__ or __device__ variable to the host
-   * using its un-mangled name.
-   */
-  template <typename T>
-  inline CUresult get_global_value(const char* name, T* value,
-                                   CUstream stream = 0) const {
-    return get_global_array(name, value, 1, stream);
-  }
-
-  /*
-   * Copy data from the host to a global __constant__ or __device__ array using
-   * its un-mangled name.
-   */
-  template <typename T>
-  inline CUresult set_global_array(const char* name, const T* data,
-                                   size_t count, CUstream stream = 0) const {
-    return _impl->cuda_kernel().set_global_data(name, data, count, stream);
-  }
-
-  /*
-   * Copy a value from the host to a global __constant__ or __device__ variable
-   * using its un-mangled name.
-   */
-  template <typename T>
-  inline CUresult set_global_value(const char* name, const T& value,
-                                   CUstream stream = 0) const {
-    return set_global_array(name, &value, 1, stream);
-  }
-
-  const std::string& mangled_name() const {
-    return _impl->cuda_kernel().function_name();
-  }
-
-  const std::string& ptx() const { return _impl->cuda_kernel().ptx(); }
-
-  const std::vector<std::string>& link_files() const {
-    return _impl->cuda_kernel().link_files();
-  }
-
-  const std::vector<std::string>& link_paths() const {
-    return _impl->cuda_kernel().link_paths();
-  }
-};
-
-/*! An object representing a kernel made up of a Program, a name and options.
- */
-class Kernel {
-  friend class KernelInstantiation;
-  std::unique_ptr<Kernel_impl> _impl;
-
- public:
-  Kernel(Program const& program, std::string name,
-         jitify::detail::vector<std::string> options = 0);
-
-  /*! Instantiate the kernel.
-   *
-   *  \param template_args A vector of template arguments represented as
-   *    code-strings. These can be generated using
-   *    \code{.cpp}jitify::reflection::reflect<type>()\endcode or
-   *    \code{.cpp}jitify::reflection::reflect(value)\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  // inline KernelInstantiation instantiate(std::vector<std::string> const&
-  //  template_args) const {
-  inline KernelInstantiation instantiate(
-      std::vector<std::string> const& template_args =
-          std::vector<std::string>()) const {
-    return KernelInstantiation(*this, template_args);
-  }
-
-  // Regular template instantiation syntax (note limited flexibility)
-  /*! Instantiate the kernel.
-   *
-   *  \note The template arguments specified on this function are
-   *    used to instantiate the kernel. Non-type template arguments must
-   *    be wrapped with
-   *    \code{.cpp}jitify::reflection::NonType<type,value>\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  template <typename... TemplateArgs>
-  inline KernelInstantiation instantiate() const {
-    return this->instantiate(
-        std::vector<std::string>({reflection::reflect<TemplateArgs>()...}));
-  }
-  // Template-like instantiation syntax
-  //   E.g., instantiate(myvar,Type<MyType>())(grid,block)
-  /*! Instantiate the kernel.
-   *
-   *  \param targs The template arguments for the kernel, represented as
-   *    values. Types must be wrapped with
-   *    \code{.cpp}jitify::reflection::Type<type>()\endcode or
-   *    \code{.cpp}jitify::reflection::type_of(value)\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  template <typename... TemplateArgs>
-  inline KernelInstantiation instantiate(TemplateArgs... targs) const {
-    return this->instantiate(
-        std::vector<std::string>({reflection::reflect(targs)...}));
-  }
-};
-
-/*! An object representing a program made up of source code, headers
- *  and options.
- */
-class Program {
-  friend class Kernel;
-  std::unique_ptr<Program_impl> _impl;
-
- public:
-  Program(JitCache& cache, std::string source,
-          jitify::detail::vector<std::string> headers = 0,
-          jitify::detail::vector<std::string> options = 0,
-          file_callback_type file_callback = 0);
-
-  /*! Select a kernel.
-   *
-   * \param name The name of the kernel (unmangled and without
-   *   template arguments).
-   * \param options A vector of options to be passed to the NVRTC
-   *   compiler when compiling this kernel.
-   */
-  inline Kernel kernel(std::string name,
-                       jitify::detail::vector<std::string> options = 0) const {
-    return Kernel(*this, name, options);
-  }
-  /*! Select a kernel.
-   *
-   *  \see kernel
-   */
-  inline Kernel operator()(
-      std::string name, jitify::detail::vector<std::string> options = 0) const {
-    return this->kernel(name, options);
-  }
-};
-
-/*! An object that manages a cache of JIT-compiled CUDA kernels.
- *
- */
-class JitCache {
-  friend class Program;
-  std::unique_ptr<JitCache_impl> _impl;
-
- public:
-  /*! JitCache constructor.
-   *  \param cache_size The number of kernels to hold in the cache
-   *    before overwriting the least-recently-used ones.
-   */
-  enum { DEFAULT_CACHE_SIZE = 128 };
-  JitCache(size_t cache_size = DEFAULT_CACHE_SIZE)
-      : _impl(new JitCache_impl(cache_size)) {}
-
-  /*! Create a program.
-   *
-   *  \param source A string containing either the source filename or
-   *    the source itself; in the latter case, the first line must be
-   *    the name of the program.
-   *  \param headers A vector of strings representing the source of
-   *    each header file required by the program. Each entry can be
-   *    either the header filename or the header source itself; in
-   *    the latter case, the first line must be the name of the header
-   *    (i.e., the name by which the header is #included).
-   *  \param options A vector of options to be passed to the
-   *    NVRTC compiler. Include paths specified with \p -I
-   *    are added to the search paths used by Jitify. The environment
-   *    variable JITIFY_OPTIONS can also be used to define additional
-   *    options.
-   *  \param file_callback A pointer to a callback function that is
-   *    invoked whenever a source file needs to be loaded. Inside this
-   *    function, the user can either load/specify the source themselves
-   *    or defer to Jitify's file-loading mechanisms.
-   *  \note Program or header source files referenced by filename are
-   *  looked-up using the following mechanisms (in this order):
-   *  \note 1) By calling file_callback.
-   *  \note 2) By looking for the file embedded in the executable via the GCC
-   *  linker.
-   *  \note 3) By looking for the file in the filesystem.
-   *
-   *  \note Jitify recursively scans all source files for \p #include
-   *  directives and automatically adds them to the set of headers needed
-   *  by the program.
-   *  If a \p #include directive references a header that cannot be found,
-   *  the directive is automatically removed from the source code to prevent
-   *  immediate compilation failure. This may result in compilation errors
-   *  if the header was required by the program.
-   *
-   *  \note Jitify automatically includes NVRTC-safe versions of some
-   *  standard library headers.
-   */
-  inline Program program(std::string source,
-                         jitify::detail::vector<std::string> headers = 0,
-                         jitify::detail::vector<std::string> options = 0,
-                         file_callback_type file_callback = 0) {
-    return Program(*this, source, headers, options, file_callback);
-  }
-};
-
-inline Program::Program(JitCache& cache, std::string source,
-                        jitify::detail::vector<std::string> headers,
-                        jitify::detail::vector<std::string> options,
-                        file_callback_type file_callback)
-    : _impl(new Program_impl(*cache._impl, source, headers, options,
-                             file_callback)) {}
-
-inline Kernel::Kernel(Program const& program, std::string name,
-                      jitify::detail::vector<std::string> options)
-    : _impl(new Kernel_impl(*program._impl, name, options)) {}
-
-inline KernelInstantiation::KernelInstantiation(
-    Kernel const& kernel, std::vector<std::string> const& template_args)
-    : _impl(new KernelInstantiation_impl(*kernel._impl, template_args)) {}
-
-inline KernelLauncher::KernelLauncher(KernelInstantiation const& kernel_inst,
-                                      dim3 grid, dim3 block, unsigned int smem,
-                                      cudaStream_t stream)
-    : _impl(new KernelLauncher_impl(*kernel_inst._impl, grid, block, smem,
-                                    stream)) {}
-
-inline std::ostream& operator<<(std::ostream& stream, dim3 d) {
-  if (d.y == 1 && d.z == 1) {
-    stream << d.x;
-  } else {
-    stream << "(" << d.x << "," << d.y << "," << d.z << ")";
-  }
-  return stream;
-}
-
-inline CUresult KernelLauncher_impl::launch(
-    jitify::detail::vector<void*> arg_ptrs,
-    jitify::detail::vector<std::string> arg_types) const {
-#if JITIFY_PRINT_LAUNCH
-  Kernel_impl const& kernel = _kernel_inst._kernel;
-  std::string arg_types_string =
-      (arg_types.empty() ? "..." : reflection::reflect_list(arg_types));
-  std::cout << "Launching " << kernel._name << _kernel_inst._template_inst
-            << "<<<" << _grid << "," << _block << "," << _smem << "," << _stream
-            << ">>>"
-            << "(" << arg_types_string << ")" << std::endl;
-#endif
-  if (!_kernel_inst._cuda_kernel) {
-    throw std::runtime_error(
-        "Kernel pointer is NULL; you may need to define JITIFY_THREAD_SAFE 1");
-  }
-  return _kernel_inst._cuda_kernel->launch(_grid, _block, _smem, _stream,
-                                           arg_ptrs);
-}
-
-inline KernelInstantiation_impl::KernelInstantiation_impl(
-    Kernel_impl const& kernel, std::vector<std::string> const& template_args)
-    : _kernel(kernel), _options(kernel._options) {
-  _template_inst =
-      (template_args.empty() ? ""
"" - : reflection::reflect_template(template_args)); - using detail::hash_combine; - using detail::hash_larson64; - _hash = _kernel._hash; - _hash = hash_combine(_hash, hash_larson64(_template_inst.c_str())); - JitCache_impl& cache = _kernel._program._cache; - uint64_t cache_key = _hash; -#if JITIFY_THREAD_SAFE - std::lock_guard lock(cache._kernel_cache_mutex); -#endif - if (cache._kernel_cache.contains(cache_key)) { -#if JITIFY_PRINT_INSTANTIATION - std::cout << "Found "; - this->print(); -#endif - _cuda_kernel = &cache._kernel_cache.get(cache_key); - } else { -#if JITIFY_PRINT_INSTANTIATION - std::cout << "Building "; - this->print(); -#endif - _cuda_kernel = &cache._kernel_cache.emplace(cache_key); - this->build_kernel(); - } -} - -inline void KernelInstantiation_impl::print() const { - std::string options_string = reflection::reflect_list(_options); - std::cout << _kernel._name << _template_inst << " [" << options_string << "]" - << std::endl; -} - -inline void KernelInstantiation_impl::build_kernel() { - Program_impl const& program = _kernel._program; - - std::string instantiation = _kernel._name + _template_inst; - - std::string log, ptx, mangled_instantiation; - std::vector linker_files, linker_paths; - detail::instantiate_kernel(program.name(), program.sources(), instantiation, - _options, &log, &ptx, &mangled_instantiation, - &linker_files, &linker_paths); - - _cuda_kernel->set(mangled_instantiation.c_str(), ptx.c_str(), linker_files, - linker_paths); -} - -Kernel_impl::Kernel_impl(Program_impl const& program, std::string name, - jitify::detail::vector options) - : _program(program), _name(name), _options(options) { - // Merge options from parent - _options.insert(_options.end(), _program.options().begin(), - _program.options().end()); - detail::detect_and_add_cuda_arch(_options); - detail::detect_and_add_cxx11_flag(_options); - std::string options_string = reflection::reflect_list(_options); - using detail::hash_combine; - using detail::hash_larson64; - _hash = _program._hash; - _hash = hash_combine(_hash, hash_larson64(_name.c_str())); - _hash = hash_combine(_hash, hash_larson64(options_string.c_str())); -} - -Program_impl::Program_impl(JitCache_impl& cache, std::string source, - jitify::detail::vector headers, - jitify::detail::vector options, - file_callback_type file_callback) - : _cache(cache) { - // Compute hash of source, headers and options - std::string options_string = reflection::reflect_list(options); - using detail::hash_combine; - using detail::hash_larson64; - _hash = hash_combine(hash_larson64(source.c_str()), - hash_larson64(options_string.c_str())); - for (size_t i = 0; i < headers.size(); ++i) { - _hash = hash_combine(_hash, hash_larson64(headers[i].c_str())); - } - _hash = hash_combine(_hash, (uint64_t)file_callback); - // Add pre-include built-in JIT-safe headers - for (int i = 0; i < detail::preinclude_jitsafe_headers_count; ++i) { - const char* hdr_name = detail::preinclude_jitsafe_header_names[i]; - const std::string& hdr_source = - detail::get_jitsafe_headers_map().at(hdr_name); - headers.push_back(std::string(hdr_name) + "\n" + hdr_source); - } - // Merge options from parent - options.insert(options.end(), _cache._options.begin(), _cache._options.end()); - // Load sources -#if JITIFY_THREAD_SAFE - std::lock_guard lock(cache._program_cache_mutex); -#endif - if (!cache._program_config_cache.contains(_hash)) { - _config = &cache._program_config_cache.insert(_hash); - this->load_sources(source, headers, options, file_callback); - } else { - _config = 
-  }
-}
-
-inline void Program_impl::load_sources(std::string source,
-                                       std::vector<std::string> headers,
-                                       std::vector<std::string> options,
-                                       file_callback_type file_callback) {
-  _config->options = options;
-  detail::load_program(source, headers, file_callback, &_config->include_paths,
-                       &_config->sources, &_config->options, &_config->name);
-}
-
-enum Location { HOST, DEVICE };
-
-/*! Specifies location and parameters for execution of an algorithm.
- *  \param stream The CUDA stream on which to execute.
- *  \param headers A vector of headers to include in the code.
- *  \param options Options to pass to the NVRTC compiler.
- *  \param file_callback See jitify::Program.
- *  \param block_size The size of the CUDA thread block with which to
- *    execute.
- *  \param cache_size The number of kernels to store in the cache
- *    before overwriting the least-recently-used ones.
- */
-struct ExecutionPolicy {
-  /*! Location (HOST or DEVICE) on which to execute.*/
-  Location location;
-  /*! List of headers to include when compiling the algorithm.*/
-  std::vector<std::string> headers;
-  /*! List of compiler options.*/
-  std::vector<std::string> options;
-  /*! Optional callback for loading source files.*/
-  file_callback_type file_callback;
-  /*! CUDA stream on which to execute.*/
-  cudaStream_t stream;
-  /*! CUDA device on which to execute.*/
-  int device;
-  /*! CUDA block size with which to execute.*/
-  int block_size;
-  /*! The number of instantiations to store in the cache before overwriting
-   *  the least-recently-used ones.*/
-  size_t cache_size;
-  ExecutionPolicy(Location location_ = DEVICE,
-                  jitify::detail::vector<std::string> headers_ = 0,
-                  jitify::detail::vector<std::string> options_ = 0,
-                  file_callback_type file_callback_ = 0,
-                  cudaStream_t stream_ = 0, int device_ = 0,
-                  int block_size_ = 256,
-                  size_t cache_size_ = JitCache::DEFAULT_CACHE_SIZE)
-      : location(location_),
-        headers(headers_),
-        options(options_),
-        file_callback(file_callback_),
-        stream(stream_),
-        device(device_),
-        block_size(block_size_),
-        cache_size(cache_size_) {}
-};
-
-template <typename T>
-class Lambda;
-
-/*! An object that captures a set of variables for use in a parallel_for
- *  expression. See JITIFY_CAPTURE().
- */
-class Capture {
- public:
-  std::vector<std::string> _arg_decls;
-  std::vector<void*> _arg_ptrs;
-
- public:
-  template <typename... Args>
-  inline Capture(std::vector<std::string> arg_names, Args const&... args)
-      : _arg_ptrs{(void*)&args...} {
-    std::vector<std::string> arg_types = {reflection::reflect<Args>()...};
-    _arg_decls.resize(arg_names.size());
-    for (int i = 0; i < (int)arg_names.size(); ++i) {
-      _arg_decls[i] = arg_types[i] + " " + arg_names[i];
-    }
-  }
-};
-
-/*! An object that captures the instantiated Lambda function for use
-    in a parallel_for expression and the function string for NVRTC
-    compilation
- */
-template <typename Func>
-class Lambda {
- public:
-  Capture _capture;
-  std::string _func_string;
-  Func _func;
-
- public:
-  inline Lambda(Capture const& capture, std::string func_string, Func func)
-      : _capture(capture), _func_string(func_string), _func(func) {}
-};
-
-template <typename T>
-inline Lambda<T> make_Lambda(Capture const& capture, std::string func,
-                             T lambda) {
-  return Lambda<T>(capture, func, lambda);
-}
-
-#define JITIFY_CAPTURE(...)                                            \
-  jitify::Capture(jitify::detail::split_string(#__VA_ARGS__, -1, ","), \
-                  __VA_ARGS__)
-
-#define JITIFY_MAKE_LAMBDA(capture, x, ...)               \
-  jitify::make_Lambda(capture, std::string(#__VA_ARGS__), \
-                      [x](int i) { __VA_ARGS__; })
-
-#define JITIFY_ARGS(...) __VA_ARGS__
-
-#define JITIFY_LAMBDA_(x, ...) \
-  JITIFY_MAKE_LAMBDA(JITIFY_CAPTURE(x), JITIFY_ARGS(x), __VA_ARGS__)
-
-// macro sequence to strip surrounding brackets
-#define JITIFY_STRIP_PARENS(X) X
-#define JITIFY_PASS_PARAMETERS(X) JITIFY_STRIP_PARENS(JITIFY_ARGS X)
-
-/*! Creates a Lambda object with captured variables and a function
- *  definition.
- *  \param capture A bracket-enclosed list of variables to capture.
- *  \param ... The function definition.
- *
- *  \code{.cpp}
- *  float* capture_me;
- *  int    capture_me_too;
- *  auto my_lambda = JITIFY_LAMBDA( (capture_me, capture_me_too),
- *                                  capture_me[i] = i*capture_me_too );
- *  \endcode
- */
-#define JITIFY_LAMBDA(capture, ...)                            \
-  JITIFY_LAMBDA_(JITIFY_ARGS(JITIFY_PASS_PARAMETERS(capture)), \
-                 JITIFY_ARGS(__VA_ARGS__))
-
-// TODO: Try to implement for_each that accepts iterators instead of indices
-//       Add compile guard for NOCUDA compilation
-/*! Call a function for a range of indices
- *
- *  \param policy Determines the location and device parameters for
- *    execution of the parallel_for.
- *  \param begin  The starting index.
- *  \param end    The ending index.
- *  \param lambda A Lambda object created using the JITIFY_LAMBDA() macro.
- *
- *  \code{.cpp}
- *  char const* in;
- *  float*      out;
- *  parallel_for(0, 100, JITIFY_LAMBDA( (in, out), {char x = in[i]; out[i] =
- *  x*x; } ); \endcode
- */
-template <typename IndexType, class Func>
-CUresult parallel_for(ExecutionPolicy policy, IndexType begin, IndexType end,
-                      Lambda<Func> const& lambda) {
-  using namespace jitify;
-
-  if (policy.location == HOST) {
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-    for (IndexType i = begin; i < end; i++) {
-      lambda._func(i);
-    }
-    return CUDA_SUCCESS;  // FIXME - replace with non-CUDA enum type?
-  }
-
-  thread_local static JitCache kernel_cache(policy.cache_size);
-
-  std::vector<std::string> arg_decls;
-  arg_decls.push_back("I begin, I end");
-  arg_decls.insert(arg_decls.end(), lambda._capture._arg_decls.begin(),
-                   lambda._capture._arg_decls.end());
-
-  std::stringstream source_ss;
-  source_ss << "parallel_for_program\n";
-  for (auto const& header : policy.headers) {
-    std::string header_name = header.substr(0, header.find("\n"));
-    source_ss << "#include <" << header_name << ">\n";
-  }
-  source_ss << "template<typename I>\n"
-               "__global__\n"
-               "void parallel_for_kernel("
-            << reflection::reflect_list(arg_decls)
-            << ") {\n"
-               "  I i0 = threadIdx.x + blockDim.x*blockIdx.x;\n"
-               "  for( I i=i0+begin; i<end; i+=blockDim.x*gridDim.x ) {\n"
-               "    " << lambda._func_string << ";\n"
-               "  }\n"
-               "}\n";
-
-  Program program = kernel_cache.program(source_ss.str(), policy.headers,
-                                         policy.options, policy.file_callback);
-
-  std::vector<void*> arg_ptrs;
-  arg_ptrs.push_back(&begin);
-  arg_ptrs.push_back(&end);
-  arg_ptrs.insert(arg_ptrs.end(), lambda._capture._arg_ptrs.begin(),
-                  lambda._capture._arg_ptrs.end());
-
-  size_t n = end - begin;
-  dim3 block(policy.block_size);
-  dim3 grid((unsigned int)std::min((n - 1) / block.x + 1, size_t(65535)));
-  cudaSetDevice(policy.device);
-  return program.kernel("parallel_for_kernel")
-      .instantiate<IndexType>()
-      .configure(grid, block, 0, policy.stream)
-      .launch(arg_ptrs);
-}
-
-namespace experimental {
-
-using jitify::file_callback_type;
-
-namespace serialization {
-
-namespace detail {
-
-// This should be incremented whenever the serialization format changes in any
-// incompatible way.
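#if 0  // For reference only: end-to-end use of parallel_for/JITIFY_LAMBDA as
       // defined above. Assumes `data` is a valid device allocation of 100
       // floats and that a CUDA context exists (illustrative host code).
void scale_on_device(float* data) {
  float scale = 2.0f;
  jitify::ExecutionPolicy policy;  // defaults: DEVICE, block_size = 256
  jitify::parallel_for(policy, 0, 100,
                       JITIFY_LAMBDA((data, scale), data[i] *= scale));
}
#endif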
-static constexpr const size_t kSerializationVersion = 1;
-
-inline void serialize(std::ostream& stream, size_t u) {
-  uint64_t u64 = u;
-  stream.write(reinterpret_cast<char const*>(&u64), sizeof(u64));
-}
-
-inline bool deserialize(std::istream& stream, size_t* size) {
-  uint64_t u64;
-  stream.read(reinterpret_cast<char*>(&u64), sizeof(u64));
-  *size = u64;
-  return stream.good();
-}
-
-inline void serialize(std::ostream& stream, std::string const& s) {
-  serialize(stream, s.size());
-  stream.write(s.data(), s.size());
-}
-
-inline bool deserialize(std::istream& stream, std::string* s) {
-  size_t size;
-  if (!deserialize(stream, &size)) return false;
-  s->resize(size);
-  if (s->size()) {
-    stream.read(&(*s)[0], s->size());
-  }
-  return stream.good();
-}
-
-inline void serialize(std::ostream& stream, std::vector<std::string> const& v) {
-  serialize(stream, v.size());
-  for (auto const& s : v) {
-    serialize(stream, s);
-  }
-}
-
-inline bool deserialize(std::istream& stream, std::vector<std::string>* v) {
-  size_t size;
-  if (!deserialize(stream, &size)) return false;
-  v->resize(size);
-  for (auto& s : *v) {
-    if (!deserialize(stream, &s)) return false;
-  }
-  return true;
-}
-
-inline void serialize(std::ostream& stream,
-                      std::map<std::string, std::string> const& m) {
-  serialize(stream, m.size());
-  for (auto const& kv : m) {
-    serialize(stream, kv.first);
-    serialize(stream, kv.second);
-  }
-}
-
-inline bool deserialize(std::istream& stream,
-                        std::map<std::string, std::string>* m) {
-  size_t size;
-  if (!deserialize(stream, &size)) return false;
-  for (size_t i = 0; i < size; ++i) {
-    std::string key;
-    if (!deserialize(stream, &key)) return false;
-    if (!deserialize(stream, &(*m)[key])) return false;
-  }
-  return true;
-}
-
-template <typename T, typename... Rest>
-inline void serialize(std::ostream& stream, T const& value, Rest... rest) {
-  serialize(stream, value);
-  serialize(stream, rest...);
-}
-
-template <typename T, typename... Rest>
-inline bool deserialize(std::istream& stream, T* value, Rest... rest) {
-  if (!deserialize(stream, value)) return false;
-  return deserialize(stream, rest...);
-}
-
-inline void serialize_magic_number(std::ostream& stream) {
-  stream.write("JTFY", 4);
-  serialize(stream, kSerializationVersion);
-}
-
-inline bool deserialize_magic_number(std::istream& stream) {
-  char magic_number[4] = {0, 0, 0, 0};
-  stream.read(&magic_number[0], 4);
-  if (!(magic_number[0] == 'J' && magic_number[1] == 'T' &&
-        magic_number[2] == 'F' && magic_number[3] == 'Y')) {
-    return false;
-  }
-  size_t serialization_version;
-  if (!deserialize(stream, &serialization_version)) return false;
-  return serialization_version == kSerializationVersion;
-}
-
-}  // namespace detail
-
-template <typename... Values>
-inline std::string serialize(Values const&... values) {
-  std::ostringstream ss(std::stringstream::out | std::stringstream::binary);
-  detail::serialize_magic_number(ss);
-  detail::serialize(ss, values...);
-  return ss.str();
-}
-
-template <typename... Values>
-inline bool deserialize(std::string const& serialized, Values*... values) {
-  std::istringstream ss(serialized,
-                        std::stringstream::in | std::stringstream::binary);
-  if (!detail::deserialize_magic_number(ss)) return false;
-  return detail::deserialize(ss, values...);
-}
-
-}  // namespace serialization
-
-class Program;
-class Kernel;
-class KernelInstantiation;
-class KernelLauncher;
-
-/*! An object representing a program made up of source code, headers
- *  and options.
- */
-class Program {
- private:
-  friend class KernelInstantiation;
-  std::string _name;
-  std::vector<std::string> _options;
-  std::map<std::string, std::string> _sources;
-
-  // Private constructor used by deserialize()
-  Program() {}
-
- public:
-  /*! Create a program.
-   *
-   *  \param source A string containing either the source filename or
-   *    the source itself; in the latter case, the first line must be
-   *    the name of the program.
-   *  \param headers A vector of strings representing the source of
-   *    each header file required by the program. Each entry can be
-   *    either the header filename or the header source itself; in
-   *    the latter case, the first line must be the name of the header
-   *    (i.e., the name by which the header is #included).
-   *  \param options A vector of options to be passed to the
-   *    NVRTC compiler. Include paths specified with \p -I
-   *    are added to the search paths used by Jitify. The environment
-   *    variable JITIFY_OPTIONS can also be used to define additional
-   *    options.
-   *  \param file_callback A pointer to a callback function that is
-   *    invoked whenever a source file needs to be loaded. Inside this
-   *    function, the user can either load/specify the source themselves
-   *    or defer to Jitify's file-loading mechanisms.
-   *  \note Program or header source files referenced by filename are
-   *  looked-up using the following mechanisms (in this order):
-   *  \note 1) By calling file_callback.
-   *  \note 2) By looking for the file embedded in the executable via the GCC
-   *  linker.
-   *  \note 3) By looking for the file in the filesystem.
-   *
-   *  \note Jitify recursively scans all source files for \p #include
-   *  directives and automatically adds them to the set of headers needed
-   *  by the program.
-   *  If a \p #include directive references a header that cannot be found,
-   *  the directive is automatically removed from the source code to prevent
-   *  immediate compilation failure. This may result in compilation errors
-   *  if the header was required by the program.
-   *
-   *  \note Jitify automatically includes NVRTC-safe versions of some
-   *  standard library headers.
-   */
-  Program(std::string const& cuda_source,
-          std::vector<std::string> const& given_headers = {},
-          std::vector<std::string> const& given_options = {},
-          file_callback_type file_callback = nullptr) {
-    // Add pre-include built-in JIT-safe headers
-    std::vector<std::string> headers = given_headers;
-    for (int i = 0; i < detail::preinclude_jitsafe_headers_count; ++i) {
-      const char* hdr_name = detail::preinclude_jitsafe_header_names[i];
-      const std::string& hdr_source =
-          detail::get_jitsafe_headers_map().at(hdr_name);
-      headers.push_back(std::string(hdr_name) + "\n" + hdr_source);
-    }
-
-    _options = given_options;
-    detail::add_options_from_env(_options);
-    std::vector<std::string> include_paths;
-    detail::load_program(cuda_source, headers, file_callback, &include_paths,
-                         &_sources, &_options, &_name);
-  }
-
-  /*! Restore a serialized program.
-   *
-   *  \param serialized_program The serialized program to restore.
-   *
-   *  \see serialize
-   */
-  static Program deserialize(std::string const& serialized_program) {
-    Program program;
-    if (!serialization::deserialize(serialized_program, &program._name,
-                                    &program._options, &program._sources)) {
-      throw std::runtime_error("Failed to deserialize program");
-    }
-    return program;
-  }
-
-  /*! Save the program.
-   *
-   *  \see deserialize
-   */
-  std::string serialize() const {
-    // Note: Must update kSerializationVersion if this is changed.
-    return serialization::serialize(_name, _options, _sources);
-  };
-
-  /*! Select a kernel.
-   *
-   * \param name The name of the kernel (unmangled and without
-   *   template arguments).
-   * \param options A vector of options to be passed to the NVRTC
-   *   compiler when compiling this kernel.
-   */
-  Kernel kernel(std::string const& name,
-                std::vector<std::string> const& options = {}) const;
-};
-
-class Kernel {
-  friend class KernelInstantiation;
-  Program const* _program;
-  std::string _name;
-  std::vector<std::string> _options;
-
- public:
-  Kernel(Program const* program, std::string const& name,
-         std::vector<std::string> const& options = {})
-      : _program(program), _name(name), _options(options) {}
-
-  /*! Instantiate the kernel.
-   *
-   *  \param template_args A vector of template arguments represented as
-   *    code-strings. These can be generated using
-   *    \code{.cpp}jitify::reflection::reflect<type>()\endcode or
-   *    \code{.cpp}jitify::reflection::reflect(value)\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  KernelInstantiation instantiate(
-      std::vector<std::string> const& template_args =
-          std::vector<std::string>()) const;
-
-  // Regular template instantiation syntax (note limited flexibility)
-  /*! Instantiate the kernel.
-   *
-   *  \note The template arguments specified on this function are
-   *    used to instantiate the kernel. Non-type template arguments must
-   *    be wrapped with
-   *    \code{.cpp}jitify::reflection::NonType<type,value>\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  template <typename... TemplateArgs>
-  KernelInstantiation instantiate() const;
-
-  // Template-like instantiation syntax
-  //   E.g., instantiate(myvar,Type<MyType>())(grid,block)
-  /*! Instantiate the kernel.
-   *
-   *  \param targs The template arguments for the kernel, represented as
-   *    values. Types must be wrapped with
-   *    \code{.cpp}jitify::reflection::Type<type>()\endcode or
-   *    \code{.cpp}jitify::reflection::type_of(value)\endcode
-   *
-   *  \note Template type deduction is not possible, so all types must be
-   *    explicitly specified.
-   */
-  template <typename... TemplateArgs>
-  KernelInstantiation instantiate(TemplateArgs... targs) const;
-};
-
-class KernelInstantiation {
-  friend class KernelLauncher;
-  std::unique_ptr<detail::CUDAKernel> _cuda_kernel;
-
-  // Private constructor used by deserialize()
-  KernelInstantiation(std::string const& func_name, std::string const& ptx,
-                      std::vector<std::string> const& link_files,
-                      std::vector<std::string> const& link_paths)
-      : _cuda_kernel(new detail::CUDAKernel(func_name.c_str(), ptx.c_str(),
-                                            link_files, link_paths)) {}
-
- public:
-  KernelInstantiation(Kernel const& kernel,
-                      std::vector<std::string> const& template_args) {
-    Program const* program = kernel._program;
-
-    std::string template_inst =
-        (template_args.empty() ? ""
-                               : reflection::reflect_template(template_args));
-    std::string instantiation = kernel._name + template_inst;
-
-    std::vector<std::string> options;
-    options.insert(options.begin(), program->_options.begin(),
-                   program->_options.end());
-    options.insert(options.begin(), kernel._options.begin(),
-                   kernel._options.end());
-    detail::detect_and_add_cuda_arch(options);
-    detail::detect_and_add_cxx11_flag(options);
-
-    std::string log, ptx, mangled_instantiation;
-    std::vector<std::string> linker_files, linker_paths;
-
-    std::cout << "About to instantiate kernel" << std::endl;
-    detail::instantiate_kernel(program->_name, program->_sources, instantiation,
-                               options, &log, &ptx, &mangled_instantiation,
-                               &linker_files, &linker_paths);
-
-    std::cout << "instantiated kernel" << std::endl;
-    _cuda_kernel.reset(new detail::CUDAKernel(mangled_instantiation.c_str(),
-                                              ptx.c_str(), linker_files,
-                                              linker_paths));
-  }
-
-  /*! Implicit conversion to the underlying CUfunction object.
-   *
-   * \note This allows use of CUDA APIs like
-   *   cuOccupancyMaxActiveBlocksPerMultiprocessor.
-   */
-  operator CUfunction() const { return *_cuda_kernel; }
-
-  /*! Restore a serialized kernel instantiation.
-   *
-   *  \param serialized_kernel_inst The serialized kernel instantiation to
-   *    restore.
-   *
-   *  \see serialize
-   */
-  static KernelInstantiation deserialize(
-      std::string const& serialized_kernel_inst) {
-    std::string func_name, ptx;
-    std::vector<std::string> link_files, link_paths;
-    if (!serialization::deserialize(serialized_kernel_inst, &func_name, &ptx,
-                                    &link_files, &link_paths)) {
-      throw std::runtime_error("Failed to deserialize kernel instantiation");
-    }
-    return KernelInstantiation(func_name, ptx, link_files, link_paths);
-  }
-
-  /*! Save the program.
-   *
-   *  \see deserialize
-   */
-  std::string serialize() const {
-    // Note: Must update kSerializationVersion if this is changed.
-
-    std::cout << "Inside serialize!!!!" << std::endl;
-    return serialization::serialize(
-        _cuda_kernel->function_name(), _cuda_kernel->ptx(),
-        _cuda_kernel->link_files(), _cuda_kernel->link_paths());
-  }
-
-  /*! Configure the kernel launch.
-   *
-   *  \param grid   The thread grid dimensions for the launch.
-   *  \param block  The thread block dimensions for the launch.
-   *  \param smem   The amount of shared memory to dynamically allocate, in
-   *    bytes.
-   *  \param stream The CUDA stream to launch the kernel in.
-   */
-  KernelLauncher configure(dim3 grid, dim3 block, unsigned int smem = 0,
-                           cudaStream_t stream = 0) const;
-
-  /*! Configure the kernel launch with a 1-dimensional block and grid chosen
-   *  automatically to maximise occupancy.
-   *
-   *  \param max_block_size  The upper limit on the block size, or 0 for no
-   *    limit.
-   *  \param smem  The amount of shared memory to dynamically allocate, in bytes.
-   *  \param smem_callback  A function returning smem for a given block size
-   *    (overrides \p smem).
-   *  \param stream The CUDA stream to launch the kernel in.
-   *  \param flags  The flags to pass to
-   *    cuOccupancyMaxPotentialBlockSizeWithFlags.
-   */
-  KernelLauncher configure_1d_max_occupancy(
-      int max_block_size = 0, unsigned int smem = 0,
-      CUoccupancyB2DSize smem_callback = 0, cudaStream_t stream = 0,
-      unsigned int flags = 0) const;
-
-  /*
-   * \deprecated Use \p get_global_ptr instead.
-   */
-  CUdeviceptr get_constant_ptr(const char* name, size_t* size = nullptr) const {
-    return get_global_ptr(name, size);
-  }
-
-  /*
-   * Get a device pointer to a global __constant__ or __device__ variable using
-   * its un-mangled name. If provided, *size is set to the size of the variable
-   * in bytes.
-   */
-  CUdeviceptr get_global_ptr(const char* name, size_t* size = nullptr) const {
-    return _cuda_kernel->get_global_ptr(name, size);
-  }
-
-  /*
-   * Copy data from a global __constant__ or __device__ array to the host using
-   * its un-mangled name.
-   */
-  template <typename T>
-  CUresult get_global_array(const char* name, T* data, size_t count,
-                            CUstream stream = 0) const {
-    return _cuda_kernel->get_global_data(name, data, count, stream);
-  }
-
-  /*
-   * Copy a value from a global __constant__ or __device__ variable to the host
-   * using its un-mangled name.
-   */
-  template <typename T>
-  CUresult get_global_value(const char* name, T* value,
-                            CUstream stream = 0) const {
-    return get_global_array(name, value, 1, stream);
-  }
-
-  /*
-   * Copy data from the host to a global __constant__ or __device__ array using
-   * its un-mangled name.
-   */
-  template <typename T>
-  CUresult set_global_array(const char* name, const T* data, size_t count,
-                            CUstream stream = 0) const {
-    return _cuda_kernel->set_global_data(name, data, count, stream);
-  }
-
-  /*
-   * Copy a value from the host to a global __constant__ or __device__ variable
-   * using its un-mangled name.
-   */
-  template <typename T>
-  CUresult set_global_value(const char* name, const T& value,
-                            CUstream stream = 0) const {
-    return set_global_array(name, &value, 1, stream);
-  }
-
-  const std::string& mangled_name() const {
-    return _cuda_kernel->function_name();
-  }
-
-  const std::string& ptx() const { return _cuda_kernel->ptx(); }
-
-  const std::vector<std::string>& link_files() const {
-    return _cuda_kernel->link_files();
-  }
-
-  const std::vector<std::string>& link_paths() const {
-    return _cuda_kernel->link_paths();
-  }
-};
-
-class KernelLauncher {
-  KernelInstantiation const* _kernel_inst;
-  dim3 _grid;
-  dim3 _block;
-  unsigned int _smem;
-  cudaStream_t _stream;
-
- public:
-  KernelLauncher(KernelInstantiation const* kernel_inst, dim3 grid, dim3 block,
-                 unsigned int smem = 0, cudaStream_t stream = 0)
-      : _kernel_inst(kernel_inst),
-        _grid(grid),
-        _block(block),
-        _smem(smem),
-        _stream(stream) {}
-
-  // Note: It's important that there is no implicit conversion required
-  //       for arg_ptrs, because otherwise the parameter pack version
-  //       below gets called instead (probably resulting in a segfault).
-  /*! Launch the kernel.
-   *
-   *  \param arg_ptrs  A vector of pointers to each function argument for the
-   *    kernel.
-   *  \param arg_types  A vector of function argument types represented
-   *    as code-strings. This parameter is optional and is only used to print
-   *    out the function signature.
-   */
-  CUresult launch(std::vector<void*> arg_ptrs = {},
-                  std::vector<std::string> arg_types = {}) const {
-#if JITIFY_PRINT_LAUNCH
-    std::string arg_types_string =
-        (arg_types.empty() ? "..." : reflection::reflect_list(arg_types));
-    std::cout << "Launching " << _kernel_inst->_cuda_kernel->function_name()
-              << "<<<" << _grid << "," << _block << "," << _smem << ","
-              << _stream << ">>>"
-              << "(" << arg_types_string << ")" << std::endl;
-#endif
-
-    return _kernel_inst->_cuda_kernel->launch(_grid, _block, _smem, _stream,
-                                              arg_ptrs);
-  }
-
-  /*! Launch the kernel.
-   *
-   *  \param args Function arguments for the kernel.
-   */
-  template <typename... ArgTypes>
-  CUresult launch(ArgTypes... args) const {
-    return this->launch(std::vector<void*>({(void*)&args...}),
-                        {reflection::reflect<ArgTypes>()...});
-  }
-};
-
-inline Kernel Program::kernel(std::string const& name,
-                              std::vector<std::string> const& options) const {
-  return Kernel(this, name, options);
-}
-
-inline KernelInstantiation Kernel::instantiate(
-    std::vector<std::string> const& template_args) const {
-  return KernelInstantiation(*this, template_args);
-}
-
-template <typename... TemplateArgs>
-inline KernelInstantiation Kernel::instantiate() const {
-  return this->instantiate(
-      std::vector<std::string>({reflection::reflect<TemplateArgs>()...}));
-}
-
-template <typename... TemplateArgs>
-inline KernelInstantiation Kernel::instantiate(TemplateArgs... targs) const {
targs) const { - return this->instantiate( - std::vector({reflection::reflect(targs)...})); -} - -inline KernelLauncher KernelInstantiation::configure( - dim3 grid, dim3 block, unsigned int smem, cudaStream_t stream) const { - return KernelLauncher(this, grid, block, smem, stream); -} - -inline KernelLauncher KernelInstantiation::configure_1d_max_occupancy( - int max_block_size, unsigned int smem, CUoccupancyB2DSize smem_callback, - cudaStream_t stream, unsigned int flags) const { - int grid; - int block; - CUfunction func = *_cuda_kernel; - detail::get_1d_max_occupancy(func, smem_callback, &smem, max_block_size, - flags, &grid, &block); - return this->configure(grid, block, smem, stream); -} - -} // namespace experimental - -} // namespace jitify - -#if defined(_WIN32) || defined(_WIN64) -#pragma pop_macro("max") -#pragma pop_macro("min") -#pragma pop_macro("strtok_r") -#endif diff --git a/GraphBLAS/CUDA/test/.gitignore b/GraphBLAS/CUDA/test/.gitignore deleted file mode 100644 index ab8dd2b30f..0000000000 --- a/GraphBLAS/CUDA/test/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -# Ignore these files: -graphblascuda_test - -# Do not ignore this file -!.gitignore - diff --git a/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp b/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp deleted file mode 100644 index ca7c5de350..0000000000 --- a/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// Test AxB_dot3_cuda kernels -// Using data generators and test classes, cover -// all NBUCKETS cases for the masked GEMM ( C, M, A, B) in GraphBLAS -// Tests Semirings, data types and a range of data input sizes and shapes -// Connects to the jitFactory for launches. - -#include -#include -#include -#include -#include - -//Test instances and groupings - diff --git a/GraphBLAS/CUDA/test/GB_cuda_type_wrap.hpp b/GraphBLAS/CUDA/test/GB_cuda_type_wrap.hpp deleted file mode 100644 index 3acf26553f..0000000000 --- a/GraphBLAS/CUDA/test/GB_cuda_type_wrap.hpp +++ /dev/null @@ -1,246 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/GB_cuda_type_wrap.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -/* - * Copyright (c) 2019,2020 NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once -#ifndef GB_CONV_TYPE_H -#define GB_CONV_TYPE_H - -extern "C" { -#include "GB.h" -}; -#include -#include -#include -#include -#include -#include - -/**---------------------------------------------------------------------------* - * @file type_convert.hpp - * @brief Defines the mapping between concrete C++ types and Grb types. - *---------------------------------------------------------------------------**/ -namespace cuda::jit { - -template -GrB_Type to_grb_type(); - -template<> inline GrB_Type to_grb_type() { return GrB_INT8; } -template<> inline GrB_Type to_grb_type() { return GrB_INT16; } -template<> inline GrB_Type to_grb_type() { return GrB_INT32; } -template<> inline GrB_Type to_grb_type() { return GrB_INT64; } -template<> inline GrB_Type to_grb_type() { return GrB_UINT8; } -template<> inline GrB_Type to_grb_type() { return GrB_UINT16; } -template<> inline GrB_Type to_grb_type() { return GrB_UINT32; } -template<> inline GrB_Type to_grb_type() { return GrB_UINT64; } -template<> inline GrB_Type to_grb_type() { return GrB_FP32; } -template<> inline GrB_Type to_grb_type() { return GrB_FP64; } -template<> inline GrB_Type to_grb_type() { return GrB_BOOL; } - - -template -void set_element(GrB_Matrix A, T x, int64_t i, int64_t j); - -template<> inline void set_element(GrB_Matrix A, int8_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_INT8(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, int16_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_INT16(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, int32_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_INT32(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, int64_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_INT64(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, uint8_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_UINT8(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, uint16_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_UINT16(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, uint32_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_UINT32(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, uint64_t x, int64_t i, int64_t j) { GrB_Matrix_setElement_UINT64(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, float x, int64_t i, int64_t j) { GrB_Matrix_setElement_FP32(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, double x, int64_t i, int64_t j) { GrB_Matrix_setElement_FP64(A, x, i, j); } -template<> inline void set_element(GrB_Matrix A, bool x, int64_t i, int64_t j) { GrB_Matrix_setElement_BOOL(A, x, i, j); } - - -template -void vector_set_element(GrB_Vector A, T x, int64_t i); - -template<> inline void vector_set_element(GrB_Vector A, int8_t x, int64_t i) { GrB_Vector_setElement_INT8(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, int16_t x, int64_t i) { GrB_Vector_setElement_INT16(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, int32_t x, int64_t i) { GrB_Vector_setElement_INT32(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, int64_t x, int64_t i) { GrB_Vector_setElement_INT64(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, uint8_t x, int64_t i) { GrB_Vector_setElement_UINT8(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, uint16_t x, int64_t i) { GrB_Vector_setElement_UINT16(A, x, i); } -template<> inline void vector_set_element(GrB_Vector A, uint32_t x, int64_t i) { 
GrB_Vector_setElement_UINT32(A, x, i); }
-template<> inline void vector_set_element(GrB_Vector A, uint64_t x, int64_t i) { GrB_Vector_setElement_UINT64(A, x, i); }
-template<> inline void vector_set_element(GrB_Vector A, float x, int64_t i) { GrB_Vector_setElement_FP32(A, x, i); }
-template<> inline void vector_set_element(GrB_Vector A, double x, int64_t i) { GrB_Vector_setElement_FP64(A, x, i); }
-template<> inline void vector_set_element(GrB_Vector A, bool x, int64_t i) { GrB_Vector_setElement_BOOL(A, x, i); }
-
-
- template<typename T>
- void scalar_set_element(GrB_Scalar A, T x);
-
- template<> inline void scalar_set_element(GrB_Scalar A, int8_t x) { GrB_Scalar_setElement_INT8(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, int16_t x) { GrB_Scalar_setElement_INT16(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, int32_t x) { GrB_Scalar_setElement_INT32(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, int64_t x) { GrB_Scalar_setElement_INT64(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, uint8_t x) { GrB_Scalar_setElement_UINT8(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, uint16_t x) { GrB_Scalar_setElement_UINT16(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, uint32_t x) { GrB_Scalar_setElement_UINT32(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, uint64_t x) { GrB_Scalar_setElement_UINT64(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, float x) { GrB_Scalar_setElement_FP32(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, double x) { GrB_Scalar_setElement_FP64(A, x); }
- template<> inline void scalar_set_element(GrB_Scalar A, bool x) { GrB_Scalar_setElement_BOOL(A, x); }
-
-
-template<typename T>
-GrB_Info vector_reduce(T *scalar, GrB_Vector A, GrB_Monoid op);
-
-template<> inline GrB_Info vector_reduce(int8_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_INT8(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(int16_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_INT16(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(int32_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_INT32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(int64_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_INT64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(uint8_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_UINT8(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(uint16_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_UINT16(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(uint32_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_UINT32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(uint64_t *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_UINT64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(float *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_FP32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(double *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_FP64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info vector_reduce(bool *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_BOOL(scalar, NULL, op, A, NULL); }
-
-/**
- * GxB_Matrix_reduce_FC32 // c = accum (c, reduce_to_scalar (A))
- (
- GxB_FC32_t *c, // result scalar
- const GrB_BinaryOp accum, // optional accum for c=accum(c,t)
- const GrB_Monoid monoid, // monoid to do the reduction
- const GrB_Matrix A, // matrix to reduce
- const GrB_Descriptor desc
-
- * @tparam T
- * @param scalar
- * @param A
- * @param op
- * @return
- */
-
-template<typename T>
-GrB_Info matrix_reduce(T *scalar, GrB_Matrix A, GrB_Monoid op);
-
-template<> inline GrB_Info matrix_reduce(int8_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT8(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(int16_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT16(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(int32_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(int64_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(uint8_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT8(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(uint16_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT16(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(uint32_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(uint64_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(float *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_FP32(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(double *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_FP64(scalar, NULL, op, A, NULL); }
-template<> inline GrB_Info matrix_reduce(bool *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_BOOL(scalar, NULL, op, A, NULL); }
-
-
-template<typename T>
-GrB_Info get_element(GrB_Matrix A, T* x, int64_t i, int64_t j);
-template<> inline GrB_Info get_element(GrB_Matrix A, int8_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_INT8(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, int16_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_INT16(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, int32_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_INT32(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, int64_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_INT64(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, uint8_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_UINT8(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, uint16_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_UINT16(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, uint32_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_UINT32(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, uint64_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_UINT64(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, float *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_FP32(x, A, i, j); }
-template<> inline GrB_Info get_element(GrB_Matrix A, double *x, int64_t i, int64_t j) { return
GrB_Matrix_extractElement_FP64(x, A, i, j); } -template<> inline GrB_Info get_element(GrB_Matrix A, bool *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_BOOL(x, A, i, j); } - - - - - -template -class type_name { -public: - static const char *name; -}; - -#define DECLARE_TYPE_NAME(x) template<> inline const char *type_name::name = #x; -#define GET_TYPE_NAME(x) (type_name::name) - - DECLARE_TYPE_NAME(int); - DECLARE_TYPE_NAME(int&); - DECLARE_TYPE_NAME(int*); - DECLARE_TYPE_NAME(int8_t); - DECLARE_TYPE_NAME(int8_t&); - DECLARE_TYPE_NAME(int8_t*); - DECLARE_TYPE_NAME(unsigned char); - DECLARE_TYPE_NAME(unsigned char&); - DECLARE_TYPE_NAME(unsigned char*); -// DECLARE_TYPE_NAME(unsigned int); -// DECLARE_TYPE_NAME(unsigned int&); -// DECLARE_TYPE_NAME(unsigned int*); -// DECLARE_TYPE_NAME(unsigned int32_t); -// DECLARE_TYPE_NAME(unsigned int32_t&); -// DECLARE_TYPE_NAME(unsigned int32_t*); - DECLARE_TYPE_NAME(unsigned int64_t); - DECLARE_TYPE_NAME(unsigned int64_t&); - DECLARE_TYPE_NAME(unsigned int64_t*); - DECLARE_TYPE_NAME(long); - DECLARE_TYPE_NAME(long&); - DECLARE_TYPE_NAME(long*); - DECLARE_TYPE_NAME(float); - DECLARE_TYPE_NAME(float&); - DECLARE_TYPE_NAME(float*); - DECLARE_TYPE_NAME(double); - DECLARE_TYPE_NAME(double&); - DECLARE_TYPE_NAME(double*); - DECLARE_TYPE_NAME(bool); - - - - inline const std::string grb_str_type(GB_Type_code grb_type_code) { - switch(grb_type_code) { - case GB_BOOL_code: - return "bool"; - case GB_INT8_code: - return "int8_t"; - case GB_UINT8_code: - return "uint8_t"; - case GB_INT16_code: - return "int16_t"; - case GB_UINT16_code: - return "uint16_t"; - case GB_INT32_code: - return "int32_t"; - case GB_UINT32_code: - return "uint32_t"; - case GB_INT64_code: - return "int64_t"; - case GB_UINT64_code: - return "uint64_t"; - case GB_FP32_code: - return "float"; - case GB_FP64_code: - return "double"; - default: - printf("Error: GrB_Type not supported.\n"); - exit(1); - } - } - - -} // namespace cuda::jit -#endif diff --git a/GraphBLAS/CUDA/test/GpuTimer.h b/GraphBLAS/CUDA/test/GpuTimer.h deleted file mode 100644 index ed5b52520d..0000000000 --- a/GraphBLAS/CUDA/test/GpuTimer.h +++ /dev/null @@ -1,49 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/GpuTimer.h -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#ifndef __GPU_TIMER_H__ -#define __GPU_TIMER_H__ - -#include -struct GpuTimer -{ - cudaEvent_t start; - cudaEvent_t stop; - - GpuTimer() - { - cudaEventCreate(&start); - cudaEventCreate(&stop); - } - - ~GpuTimer() - { - cudaEventDestroy(start); - cudaEventDestroy(stop); - } - - void Start() - { - cudaEventRecord(start, 0); - } - - void Stop() - { - cudaEventRecord(stop, 0); - } - - float Elapsed() - { - float elapsed; - cudaEventSynchronize(stop); - cudaEventElapsedTime(&elapsed, start, stop); - return elapsed; - } -}; - -#endif /* __GPU_TIMER_H__ */ diff --git a/GraphBLAS/CUDA/test/Makefile b/GraphBLAS/CUDA/test/Makefile deleted file mode 100644 index 412ac47294..0000000000 --- a/GraphBLAS/CUDA/test/Makefile +++ /dev/null @@ -1,133 +0,0 @@ -#------------------------------------------------------------------------------- -# GraphBLAS/CUDA/test/Makefile -#------------------------------------------------------------------------------- - -# cuda 10.1+ is assumed - -all: cudaTest - - -LIBS = -L/usr/local/cuda/lib64 
-L/usr/local/cuda/lib64/stubs -lpthreads -lcudadevrt -lcudart -lnvrtc -INC += -I$(CUDA_DIR)/include -I../ -I../../Source -I../../Include -I../../Source/Template -I$(TEMPLATE_DIR) -Igoogletest/include - -CUDA_OPTS = -O2 --cudart=shared --gpu-architecture=compute_75\ - --relocatable-device-code true --device-c\ - --std=c++17 -Xcompiler -fPIC - -%.o: %.cu - nvcc -c $(I) $(CUDA_OPTS) $(INC) -o $@ $< - -config: - nvidia-smi - nvcc --version - @echo " " - @echo "SO_NAME: " $(SO_NAME) - @echo "SO_OPTS: " $(SO_OPTS) - @echo "LIBS: " $(LIBS) - @echo "CUDA_OPTS: " $(CUDA_OPTS) - @echo "SRC: " $(SRC) - @echo "OBJ: " $(OBJ) - @echo "I: " $(I) - @echo " " - gcc --version - icc --version - -clean: - rm -f *.o - rm -f stringify - rm -f cudaTest - rm -f testJit -.PHONY: clean - -distclean: clean - rm -f *.so *.a - -purge: distclean - -################################################################################ - -GXX ?= g++ -GCC ?= gcc -DOXYGEN ?= doxygen -CXXFLAGS ?= -O3 -Wall -g -fmessage-length=80 -CFLAGS ?= -O2 -g -std=c11 - -CXX11 ?= 1 - -CUDA_DIR ?= /usr/local/cuda - -CXXFLAGS += -pthread - -ifeq ($(CXX11),1) - CXXFLAGS += -std=c++17 -endif - -EMBED_BEGIN = -rdynamic -Wl,-b,binary, -EMBED_END = ,-b,default - -UNAME_S := $(shell uname -s) -ifeq ($(UNAME_S),Linux) - CXXFLAGS += -D LINUX - CUDA_LIB_DIR = $(CUDA_DIR)/lib64 -else ifeq ($(UNAME_S),Darwin) - CUDA_LIB_DIR = $(CUDA_DIR)/lib -endif - -TEMPLATE_DIR ?= ../templates - -LIB += -ldl -L$(CUDA_LIB_DIR) -L$(CUDA_LIB_DIR)/stubs -lcuda -lcudadevrt -lcudart -lnvrtc - -# FIXME: file names in HEADERS are old -HEADERS = jitify.hpp dataFactory.hpp jitFactory.hpp jitTestFactory.hpp semiringFactory.hpp \ - ../type_name.hpp - -TEMPLATES := $(wildcard $(TEMPLATE_DIR)/*.cu) - -CU_OBJS := ../GB_cuda_jitify_cache.o ../GB_cuda_jitify_launcher.o - -CFILES := $(wildcard ../*.c) - -COBJS := $(patsubst %.c, %.o, $(CFILES) ) - -JIT_TEMP := $(patsubst %.cu, %.cu.jit, $(TEMPLATES)) - -GTEST_LIB := googletest/build/lib/libgtest.a googletest/build/lib/libgtest_main.a - -%.cu.jit: %.cu - ../stringify $? > $@ - -stringify: stringify.cpp - $(GXX) -o $@ $< -O3 -Wall - -%.o: %.c - $(GXX) -c -o $@ $< $(CFLAGS) $(INC) - -%.o: %.cpp - $(GXX) -c -o $@ $< $(CXXFLAGS) $(INC) - -cu_link.o: $(CU_OBJS) - nvcc --gpu-architecture=compute_75 --device-link $(CU_OBJS) --output-file cu_link.o - - -testJit: ../tofix/testJit.cpp $(OBJS) $(HEADERS) $(JIT_TEMP) - $(GXX) -o $@ $< $(CXXFLAGS) $(INC) $(OBJS) $(LIB) - -AxB_dot3_test_instances.hpp: testGen.py - python3 testGen.py - - -instances := AxB_dot3_test_instances.hpp - - -cudaTest: cudaTest.cpp.bak $(COBJS) $(OBJS) $(HEADERS) $(JIT_TEMP) cu_link.o AxB_dot3_cuda_tests.hpp.bak $(instances) - $(GXX) -o $@ $< $(CXXFLAGS) $(INC) $(COBJS) $(CU_OBJS) cu_link.o $(LIB) $(GTEST_LIB) - -%.cu: %.cutmp - cp $? 
$@ - - -doc: jitify.hpp Doxyfile - $(DOXYGEN) Doxyfile -.PHONY: doc - - diff --git a/GraphBLAS/CUDA/test/cuda_tests_template.cpp b/GraphBLAS/CUDA/test/cuda_tests_template.cpp deleted file mode 100644 index 340ded031b..0000000000 --- a/GraphBLAS/CUDA/test/cuda_tests_template.cpp +++ /dev/null @@ -1,25 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/cuda_tests_template.cpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// Test AxB_dot3_cuda kernels -// Using data generators and test classes, cover -// all NBUCKETS cases for the masked GEMM ( C, M, A, B) in GraphBLAS -// Tests Semirings, data types and a range of data input sizes and shapes -// Connects to the jitFactory for launches. - -#include -#include -#include -#include -#include -#include "problem_spec.hpp" -#include "jitTestFactory.hpp" -#include "../GB_cuda_buckets.h" - -//Test instances and groupings - diff --git a/GraphBLAS/CUDA/test/dataFactory.hpp b/GraphBLAS/CUDA/test/dataFactory.hpp deleted file mode 100644 index 5c6263c918..0000000000 --- a/GraphBLAS/CUDA/test/dataFactory.hpp +++ /dev/null @@ -1,382 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/dataFactory.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#pragma once - -#include -#include -#include -#include - -#include "GB.h" -#include "GB_cuda_type_wrap.hpp" -#include "test_utility.hpp" -#include "GB_cuda_error.h" - -// CAUTION: This assumes our indices are small enough to fit into a 32-bit int. 
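-// [Editorial note] gen_key packs j into the low 32 bits of the key, but the
-// unpacking code later in this file recovers j with `k & 0x0000ffff`, a
-// 16-bit mask, so column indices >= 2^16 would be silently corrupted. A
-// matching pack/unpack pair would look like this sketch (names hypothetical):
-//
-//   inline std::int64_t pack_key(std::int64_t i, std::int64_t j) {
-//       return (i << 32) | (j & 0xffffffffLL);  // i in high 32 bits, j in low 32
-//   }
-//   inline std::int64_t key_row(std::int64_t k) { return k >> 32; }          // recover i
-//   inline std::int64_t key_col(std::int64_t k) { return k & 0xffffffffLL; } // recover j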
-inline std::int64_t gen_key(std::int64_t i, std::int64_t j) { - return (std::int64_t) i << 32 | (std::int64_t) j; -} - -//Vector generators -template -void fillvector_linear( int N, T *vec, int start=0) { - for (int i = start; i< N+start; ++i) vec[i] = T(i); -} -template -void fillvector_constant( int N, T *vec, T val) { - for (int i = 0; i< N; ++i) vec[i] = val; -} - -// Mix-in class to enable unified memory -class Managed { -public: - void *operator new(size_t len) { - void *ptr = nullptr; - //std::cout<<"in new operator, alloc for "< -class matrix : public Managed { - int64_t nrows_; - int64_t ncols_; - - public: - GrB_Matrix mat; - - matrix(int64_t nrows, int64_t ncols): nrows_(nrows), ncols_(ncols) {} - - GrB_Matrix get_grb_matrix() { - return mat; - } - - ~matrix() { - if(mat != NULL) { - GrB_Matrix_free(&mat); - mat = NULL; - } - } - - uint64_t get_zombie_count() { return mat->nzombies;} - - void clear() { - GRB_TRY (GrB_Matrix_clear (mat)) ; - } - - void alloc() { - GrB_Type type = cuda::jit::to_grb_type(); - - GRB_TRY (GrB_Matrix_new (&mat, type, nrows_, ncols_)) ; - - // GxB_Matrix_Option_set (mat, GxB_SPARSITY_CONTROL, - // GxB_SPARSE) ; - // or: - // GxB_HYPERSPARSE, GxB_BITMAP, GxB_FULL - } - - - void fill_random( int64_t nnz, int gxb_sparsity_control, int gxb_format, std::int64_t seed = 12345ULL, T val_min = 0.0, T val_max = 2.0 , bool debug_print = false) { - -// std::cout << "inside fill_random, using seed "<< seed << std::endl; - alloc(); - - double inv_sparsity ; - if (nnz < 0) - { - // build a matrix with all entries present - inv_sparsity = 1 ; - } - else - { - inv_sparsity = ceil(((double)nrows_*ncols_)/nnz); //= values not taken per value occupied in index space - } -// -// std::cout<< "fill_random nrows="<< nrows_<<"ncols=" << ncols_ <<" need "<< nnz<<" values, invsparse = "< dis(0.0, 1.0); - - if (nnz < 0 || inv_sparsity == 1.) - { -// std::cout<<"filling dense"< (mat, x, i, j) ; - // A (j,i) = x - cuda::jit::set_element (mat, x, j, i) ; - } - else - { - // A (i,j) = x - cuda::jit::set_element (mat, x, i, j) ; - } - } - } - -// std::cout << "done." 
<< std::endl; - } - else - { -// std::cout<<"filling sparse"< row_lookup; - unordered_set key_lookup; - for ( int co = 0; co < 2*nrows_; co++ ) - { - GrB_Index i = ((GrB_Index) (dis(r) * nrows_)) % ((GrB_Index) nrows_) ; - - row_lookup.insert( i ); - } - int remain= nnz; //countdown to done - - while ( remain > 0) - { -// std::cout<< remain<<" nonzeroes left to fill.."< 0 ) - { - GrB_Index j = ((GrB_Index) (dis(r) * ncols_)) % ((GrB_Index) ncols_) ; - if (key_lookup.count( gen_key(i,j) ) == 1) continue; - if (no_self_edges && (i == j)) continue ; - - key_lookup.insert( gen_key(i, j) ); - col_guess--; - remain= (nnz- key_lookup.size() ); - if (remain <= 0) break; - if (make_symmetric) { - // A (j,i) = x - if (key_lookup.count( gen_key( j, i) ) == 0) - { - key_lookup.insert( gen_key( j, i) ) ; - col_guess--; - remain= (nnz- key_lookup.size() ); - } - } - if (remain <= 0) break; - } - if (remain <= 0) break; - //std::cout<< remain<<" nonzeroes left..."< 0 - /* - while(key_lookup.size() < nnz) { - GrB_Index i = ((GrB_Index) (dis(r) * nrows_)) % ((GrB_Index) nrows_) ; - GrB_Index j = ((GrB_Index) (dis(r) * ncols_)) % ((GrB_Index) ncols_) ; - - key_lookup.insert( gen_key(i, j) ); - if (make_symmetric) { - // A (j,i) = x - key_lookup.insert( gen_key( j, i) ) ; - } - } */ - - for (int64_t k : key_lookup) - { - GrB_Index i = k >> 32; - GrB_Index j = k & 0x0000ffff; - - T x = (T)val_min + (T)(dis(r) * (val_max - val_min)) ; - // A (i,j) = x - cuda::jit::set_element (mat, x, i, j) ; - if (make_symmetric) { - // A (j,i) = x - cuda::jit::set_element(mat, x, j, i) ; - } - } - } - - GRB_TRY (GrB_Matrix_wait (mat, GrB_MATERIALIZE)) ; - GB_convert_any_to_non_iso (mat, true) ; - // TODO: Need to specify these - GRB_TRY (GxB_Matrix_Option_set (mat, GxB_SPARSITY_CONTROL, gxb_sparsity_control)) ; - GRB_TRY (GxB_Matrix_Option_set(mat, GxB_FORMAT, gxb_format)); - GRB_TRY (GrB_Matrix_wait (mat, GrB_MATERIALIZE)) ; - GRB_TRY (GrB_Matrix_nvals ((GrB_Index *) &nnz, mat)) ; - //GRB_TRY (GxB_Matrix_fprint (mat, "my random mat", GxB_SHORT_VERBOSE, stdout)) ; - - bool iso ; - GRB_TRY (GxB_Matrix_iso (&iso, mat)) ; - if (iso) - { - printf ("Die! 
(cannot do iso)\n") ; - GRB_TRY (GrB_Matrix_free (&mat)) ; - } - - } - -}; - - - -template< typename T_C, typename T_M, typename T_A, typename T_B> -class SpGEMM_problem_generator { - - float Anzpercent,Bnzpercent,Mnzpercent; - int64_t Mnz; - int64_t *Bucket = nullptr; - - int64_t BucketStart[NBUCKETS+1]; - unsigned seed = 13372801; - bool ready = false; - - int64_t nrows_; - int64_t ncols_; - - public: - - matrix *C= nullptr; - matrix *M= nullptr; - matrix *A= nullptr; - matrix *B= nullptr; - - SpGEMM_problem_generator() {}; - - SpGEMM_problem_generator(int64_t nrows, int64_t ncols): nrows_(nrows), ncols_(ncols) { - - // Create sparse matrices - C = new matrix(nrows_, ncols_); - M = new matrix(nrows_, ncols_); - A = new matrix(nrows_, ncols_); - B = new matrix(nrows_, ncols_); - }; - - void initDim ( int64_t nrows, int64_t ncols){ - nrows_ = nrows; - ncols_ = ncols; - // Create sparse matrices - C = new matrix(nrows_, ncols_); - M = new matrix(nrows_, ncols_); - A = new matrix(nrows_, ncols_); - B = new matrix(nrows_, ncols_); - } - - matrix* getCptr(){ return C;} - matrix* getMptr(){ return M;} - matrix* getAptr(){ return A;} - matrix* getBptr(){ return B;} - - void init_A(std::int64_t Anz, int gxb_sparsity_control, int gxb_format, std::int64_t seed = 12345ULL, T_A min_val = 0.0, T_A max_val = 2.0) { - Anzpercent = float(Anz)/float(nrows_*ncols_); - A->fill_random(Anz, gxb_sparsity_control, gxb_format, seed, min_val, max_val); - } - - void init_B(std::int64_t Bnz, int gxb_sparsity_control, int gxb_format, std::int64_t seed = 54321ULL, T_B min_val = 0.0, T_B max_val = 2.0) { - Bnzpercent = float(Bnz)/float(nrows_*ncols_); - B->fill_random(Bnz, gxb_sparsity_control, gxb_format, seed, min_val, max_val); - } - - GrB_Matrix getC(){ return C->get_grb_matrix();} - GrB_Matrix getM(){ return M->get_grb_matrix();} - GrB_Matrix getA(){ return A->get_grb_matrix();} - GrB_Matrix getB(){ return B->get_grb_matrix();} - - int64_t* getBucket() { return Bucket;} - int64_t* getBucketStart(){ return BucketStart;} - - void init_C(float Mnzp, std::int64_t seed_c = 23456ULL, std::int64_t seed_m = 4567ULL){ - - // Get sizes relative to fully dense matrices - Mnzpercent = Mnzp; - Mnz = (int64_t)(Mnzp * nrows_ * ncols_); - - //Seed the generator - //std::cout<<"filling matrices"<fill_random(Mnz, GxB_SPARSE, GxB_BY_ROW, seed_m); - M->fill_random(Mnz, GxB_SPARSE, GxB_BY_ROW, seed_m); - - } - - void del(){ - C->clear(); - M->clear(); - A->clear(); - B->clear(); - //if (Bucket != nullptr) CHECK_CUDA( cudaFree(Bucket) ); - delete C; - delete M; - delete A; - delete B; - CHECK_CUDA( cudaDeviceSynchronize() ); - } - - // - void fill_buckets( int fill_bucket){ - - std::cout< fill_bucket) BucketStart[b] = Mnz; - //std::cout<< " one bucket "<< b<<"starts at "<\n", - "#include \n", - "\n", - "template\n", - "class TestData {\n", - "\n", - "public:\n", - " TestData( std::vector A_indptr_,\n", - " std::vector A_indices_,\n", - " std::vector A_data_,\n", - "\n", - " std::vector B_indptr_,\n", - " std::vector B_indices_,\n", - " std::vector B_data_,\n", - "\n", - "\n", - " std::vector C_indptr_,\n", - " std::vector C_indices_,\n", - " std::vector C_data_,\n", - "\n", - " std::vector M_indptr_,\n", - " std::vector M_indices_,\n", - " std::vector M_data_):\n", - " A_indptr(A_indptr_), A_indices(A_indices_), A_data(A_data_),\n", - " B_indptr(B_indptr_), B_indices(B_indices_), B_data(B_data_),\n", - " C_indptr(C_indptr_), C_indices(C_indices_), C_data(C_data_),\n", - " M_indptr(M_indptr_), M_indices(M_indices_), 
M_data(M_data_){}\n", - "\n", - "\n", - " std::vector A_indptr;\n", - " std::vector A_indices;\n", - " std::vector A_data;\n", - " \n", - " std::vector B_indptr;\n", - " std::vector B_indices;\n", - " std::vector B_data;\n", - " \n", - " \n", - " std::vector C_indptr;\n", - " std::vector C_indices;\n", - " std::vector C_data;\n", - "\n", - " std::vector M_indptr;\n", - " std::vector M_indices;\n", - " std::vector M_data;\n", - "\n", - "};\n", - "\n", - "template\n", - "std::unique_ptr> make_karate_tricount() {\n", - "\n", - " std::vector A_indptr = %s;\n", - " std::vector A_indices = %s;\n", - " std::vector A_data = %s;\n", - "\n", - " std::vector B_indptr = %s;\n", - " std::vector B_indices = %s;\n", - " std::vector B_data = %s;\n", - "\n", - " std::vector M_indptr = %s;\n", - " std::vector M_indices = %s;\n", - " std::vector M_data = %s;\n", - "\n", - " std::vector C_indptr = %s;\n", - " std::vector C_indices = %s;\n", - " std::vector C_data = %s;\n", - "\n", - " TestData karate_tricount(A_indptr, A_indices, A_data,\n", - " B_indptr, B_indices, B_data,\n", - " C_indptr, C_indices, C_data,\n", - " M_indptr, M_indices, M_data);\n", - "\n", - " return std::make_unique>(karate_tricount);\n", - "}\n", - "\n", - "\n", - "\n", - "TestData karate_tricount;\n", - "karate.A_indptr = %s;\n", - "karate.A_indices = %s;\n", - "karate.A_data = %s;\n", - "\n", - "karate.B_indptr = %s;\n", - "karate.B_indices = %s;\n", - "karate.B_data = %s;\n", - "\n", - "karate.M_indptr = %s;\n", - "karate.M_indices = %s;\n", - "karate.M_data = %s;\n", - "\n", - "karate.C_indptr = %s;\n", - "karate.C_indices = %s;\n", - "karate.C_data = %s;\n", - "\"\"\" % data" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "b8c2c497-7156-449e-9c44-6fa19bdedea3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\ntemplate\\nstruct TestData {\\n\\n std::vector A_indptr;\\n std::vector A_indices;\\n std::vector A_data;\\n \\n std::vector B_indptr;\\n std::vector B_indices;\\n std::vector B_data;\\n \\n \\n std::vector C_indptr;\\n std::vector C_indices;\\n std::vector C_data;\\n\\n std::vector M_indptr;\\n std::vector M_indices;\\n std::vector M_data;\\n\\n}\\n\\n\\nTestData karate_tricount;\\nkarate.A_indptr = { 0,16,24,32,35,37,40,41,41,44,45,45,45,45,46,48,50,50,50,52,53,55,55,57,\\n 62,65,66,68,69,71,73,75,77,78,78};\\nkarate.A_indices = { 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,17,19,21,31, 2, 3, 7,13,17,19,21,30,\\n 3, 7, 8, 9,13,27,28,32, 7,12,13, 6,10, 6,10,16,16,30,32,33,33,33,32,33,\\n 32,33,32,33,33,32,33,32,33,25,27,29,32,33,25,27,31,31,29,33,33,31,33,32,\\n 33,32,33,32,33,33};\\nkarate.A_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1};\\n\\nkarate.B_indptr = { 0, 0, 1, 3, 6, 7, 8,11,15,17,18,21,22,24,28,28,28,30,32,32,34,34,36,36,\\n 36,36,38,38,41,42,44,46,50,61,78};\\nkarate.B_indices = { 0, 0, 1, 3, 6, 7, 8,11,15,17,18,21,22,24,28,28,28,30,32,32,34,34,36,36,\\n 36,36,38,38,41,42,44,46,50,61,78};\\nkarate.B_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1};\\n\\nkarate.M_indptr = { 0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80,\\n 82, 84, 87, 89, 91, 93, 98,101,104,106,110,113,117,121,127,139,156};\\nkarate.M_indices = { 0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80,\\n 82, 84, 87, 89, 91, 
93, 98,101,104,106,110,113,117,121,127,139,156};\\nkarate.M_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\\n 1,1,1,1,1,1,1,1};\\n\\nkarate.C_indptr = { 0, 0, 7,12,17,19,21,24,27,29,29,31,31,32,35,35,35,36,37,37,38,38,39,39,\\n 39,39,40,40,41,41,43,45,47,51,56};\\nkarate.C_indices = { 0, 0, 7,12,17,19,21,24,27,29,29,31,31,32,35,35,35,36,37,37,38,38,39,39,\\n 39,39,40,40,41,41,43,45,47,51,56};\\nkarate.C_data = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1,\\n 1, 2, 3, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,\\n 1, 1,10, 1, 2, 1, 1,10};\\n'" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_str" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "f0c11d60-a542-41b7-afe6-38d93df094f7", - "metadata": {}, - "outputs": [], - "source": [ - "def store_file(output_string, filename = \"test_data.hpp\"):\n", - " with open(filename, 'w') as f:\n", - " f.write(output_string)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "bc22e70f-5373-4e80-88d5-c18e9c03cf76", - "metadata": {}, - "outputs": [], - "source": [ - "store_file(output_str)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "255965d0-b90b-4ad7-a53a-9f5eb4745906", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (cuml_2204_0222222_2)", - "language": "python", - "name": "cuml_2204_022222_2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/GraphBLAS/CUDA/test/jitTestFactory.hpp b/GraphBLAS/CUDA/test/jitTestFactory.hpp deleted file mode 100644 index 9e83ae4dd0..0000000000 --- a/GraphBLAS/CUDA/test/jitTestFactory.hpp +++ /dev/null @@ -1,916 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/jitTestFactory.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#pragma once - -#include -#include -#include -#include -#include -#include "GpuTimer.h" -#include "GB_cuda_buckets.h" -#include -#include "test_data.hpp" -#include "../rmm_wrap/rmm_wrap.hpp" -#include "problem_spec.hpp" - -extern "C" { - #include "GB.h" -} - -#include "GB_cuda_common_jitFactory.hpp" -#include "GB_cuda_mxm_dot3_jitFactory.hpp" -#include "GB_cuda_reduce_jitFactory.hpp" -#include "GB_cuda_reduce_factory.hpp" -#include "dataFactory.hpp" - -////Operations for test results on CPU -//template T myOP_plus( T a, T b) { return a + b;} -//template T myOP_min ( T a, T b) { return a < b ? a : b;} -//template T myOP_max ( T a, T b) { return a > b ? 
a : b;} -//template T myOP_first ( T a, T b) { return a ;} -//template T myOP_second ( T a, T b) { return b ;} -//template T myOP_times ( T a, T b) { return a * b ;} -// -//template T (*myOpPTR)(T a, T b); -//template T (*ADD_ptr)(T a, T b); -//template T (*MUL_ptr)(T a, T b); - -//AxB_dot3_phase1 kernels -template -bool test_AxB_phase1_factory( int64_t , int64_t , int64_t , int64_t ) ; - -//AxB_dot3_phase2 kernels -template -bool test_AxB_dot3_phase2_factory( int , int64_t , int64_t , int64_t, int64_t ) ; - -template -void make_grb_matrix(GrB_Matrix mat, int64_t n_rows, int64_t n_cols, - std::vector &indptr, - std::vector &indices, T *data, - int gxb_sparsity_control = GxB_SPARSE, - int gxb_format = GxB_BY_ROW) ; - -//Fixture to generate valid inputs and hold them for tests -class AxB_dot3_Test : public ::testing::Test -{ - void SetUp() {} - - void TearDown() {} -}; - -template -void print_array(void *arr, I size, const char *name) { - std::cout << "Printing " << name << std::endl; - for(I i = 0; i < size; ++i) { - std::cout << static_cast(arr)[i] << ", "; - } - std::cout << std::endl; -} - -//------------------------------------------------------------------------------ -// test_AxB_phase1_factory: test phase1 -//------------------------------------------------------------------------------ - -// Test generator code, to allow parameterized tests -// Uses jitFactory, dataFactory and GB_jit -template -bool test_AxB_phase1_factory(mxm_problem_spec &problem_spec) -{ - - cudaStream_t stream = (cudaStream_t)rmm_wrap_get_main_stream(); - - /******************** - * Launch kernel - */ - GB_cuda_mxm_factory mysemiringfactory = problem_spec.get_mxm_factory(); - phase1launchFactory p1lF(mysemiringfactory); - - GpuTimer kernTimer; - - int nthrd = p1lF.get_threads_per_block(); - int ntasks = p1lF.get_number_of_blocks(problem_spec.getM()); - - // TODO: Verify that RMM is checking and throwing exceptions - int nanobuckets_size = NBUCKETS * nthrd * ntasks; - int blockbuckets_size = NBUCKETS * ntasks; - - int64_t *Nanobuckets = (int64_t*)rmm_wrap_malloc(nanobuckets_size * sizeof (int64_t)); - int64_t *Blockbucket = (int64_t*)rmm_wrap_malloc(blockbuckets_size * sizeof (int64_t)); - - kernTimer.Start(); - p1lF.jitGridBlockLaunch(Nanobuckets, Blockbucket, - problem_spec.getC(), problem_spec.getM(), - problem_spec.getA(), problem_spec.getB(), stream); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - kernTimer.Stop(); - std::cout<<"returned from phase1 kernel "<(Nanobuckets, nanobuckets_size, "Nanobuckets"); -// print_array(Blockbucket, blockbuckets_size, "Blockbucket"); - std::cout<<"==== phase1 done=============================" < -bool test_AxB_dense_phase1_factory(mxm_problem_spec &problem_spec) -{ - cudaStream_t stream = (cudaStream_t)rmm_wrap_get_main_stream(); - - /******************** - * Launch kernel - */ - GB_cuda_mxm_factory mysemiringfactory = problem_spec.get_mxm_factory(); - dense_phase1launchFactory p1lF(mysemiringfactory); - p1lF.jitGridBlockLaunch(problem_spec.getC(), problem_spec.getM(), problem_spec.getA(), problem_spec.getB(), stream); - return true; -} - - -//------------------------------------------------------------------------------ -// test_AxB_phase2_factory: test phase2 and phase2end -//------------------------------------------------------------------------------ - -template -bool test_AxB_phase2_factory(mxm_problem_spec &problem_spec) -{ - cudaStream_t stream = (cudaStream_t)rmm_wrap_get_main_stream(); - - auto mymxm = problem_spec.get_mxm_factory(); - phase1launchFactory 
p1lF(mymxm); - phase2launchFactory p2lF; - phase2endlaunchFactory p2elF; - - GpuTimer kernTimer; - kernTimer.Start(); - - const int64_t mnz = GB_nnz (problem_spec.getM()) ; - - int nthrd = p2lF.get_threads_per_block(); - int ntasks = p2elF.get_number_of_blocks(problem_spec.getM()); - - // fabricate data as if it came from phase1: - int64_t *nanobuckets = (int64_t*)rmm_wrap_malloc(NBUCKETS * nthrd * ntasks * sizeof (int64_t)); - int64_t *blockbucket = (int64_t*)rmm_wrap_malloc(NBUCKETS * ntasks * sizeof (int64_t)); - int64_t *bucketp = (int64_t*)rmm_wrap_malloc((NBUCKETS+1) * sizeof (int64_t)); - int64_t *offset = (int64_t*)rmm_wrap_malloc(NBUCKETS * sizeof (int64_t)); - int64_t *bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t)); - - fillvector_constant(NBUCKETS, bucketp, (int64_t)0); - fillvector_constant(NBUCKETS, offset, (int64_t)0); - //fillvector_constant(problem_spec.getCnnz(), bucket, (int64_t)0); - - std::cout << "Running phase1 kernel" << std::endl; - kernTimer.Start(); - p1lF.jitGridBlockLaunch(nanobuckets, blockbucket, - problem_spec.getC(), problem_spec.getM(), - problem_spec.getA(), problem_spec.getB(), stream); - - - CHECK_CUDA(cudaStreamSynchronize(stream)); - kernTimer.Stop(); - - std::cout << " phase1 internal phase2 "<< kernTimer.Elapsed() <<"ms Done." << std::endl; - - // // launch phase2 (just with p2ntasks as the # of tasks) - kernTimer.Start(); - p2lF.jitGridBlockLaunch(blockbucket, offset, problem_spec.getM(), stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - kernTimer.Stop(); - std::cout << " phase2 kern "<< kernTimer.Elapsed() <<"ms Done." << std::endl; - -// -// // do the reduction between phase2 and phase2end - int64_t s= 0; - for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket) - { - bucketp[bucket] = s; - s+= offset[bucket]; - } - - // launch phase2end: note same # of tasks as phase1 - kernTimer.Start(); - p2elF.jitGridBlockLaunch( nanobuckets, blockbucket, - bucketp, bucket, offset, problem_spec.getC(), - problem_spec.getM(),stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - kernTimer.Stop(); - std::cout<<"returned from phase2end kernel "<(bucketp, NBUCKETS, "bucketp"); -// print_array(bucket, mnz, "bucket"); - std::cout<<"phase2 done =================="< -void make_grb_matrix(GrB_Matrix mat, int64_t n_rows, int64_t n_cols, - std::vector &indptr, - std::vector &indices, T *data, - int gxb_sparsity_control, - int gxb_format ) -{ - - GrB_Type type = cuda::jit::to_grb_type(); - - GRB_TRY (GrB_Matrix_new (&mat, type, n_rows, n_cols)) ; - - for(int64_t row = 0; row < n_rows; ++row) { - int64_t start = indptr[row]; - int64_t stop = indptr[row+1]; - - for(int64_t offset = start; offset < stop; ++offset) { - GrB_Index i = (GrB_Index) row; - GrB_Index j = (GrB_Index) indices[offset]; - T x = data[offset]; - - cuda::jit::set_element (mat, x, i, j) ; - } - } - - GRB_TRY (GrB_Matrix_wait (mat, GrB_MATERIALIZE)) ; - GRB_TRY (GB_convert_any_to_non_iso (mat, true)) ; - GRB_TRY (GxB_Matrix_Option_set (mat, GxB_SPARSITY_CONTROL, gxb_sparsity_control)) ; - GRB_TRY (GxB_Matrix_Option_set(mat, GxB_FORMAT, gxb_format)); - - -} - -template < - typename T_C, typename T_M, typename T_A,typename T_B, - typename T_X, typename T_Y, typename T_Z> -bool test_AxB_dot3_sparse_factory(mxm_problem_spec &problem_spec) { - - // FIXME: Allow the adaptive tests in this guy - std::cout << "sparse test ======================" << std::endl; - - GpuTimer kernTimer; - - cudaStream_t strm; - CHECK_CUDA(cudaStreamCreate(&strm)); - - std::cout << "sr_code: " << 
problem_spec.get_mxm_factory().sr_code << std::endl; - - bool result = false; - - int64_t N = problem_spec.getN(); - /** - * Run Phase 1, phase 2 and phase2end: Compute nanobuckets and blockbuckets - */ - - auto mymxm = problem_spec.get_mxm_factory(); - phase1launchFactory p1lF(mymxm); - phase2launchFactory p2lF; - phase2endlaunchFactory p2elF; - - GrB_Matrix C = problem_spec.getC(); - GrB_Matrix M = problem_spec.getM(); - GrB_Matrix A = problem_spec.getA(); - GrB_Matrix B = problem_spec.getB(); - - const int64_t mnz = GB_nnz (M) ; - const int64_t cnz = GB_nnz (C) ; - const int64_t cvlen = C->vlen ; - const int64_t cvdim = C->vdim ; - const int64_t cnvec = C->nvec ; - - bool C_iso = false ; - int C_sparsity = GB_sparsity (M) ; - int M_sparsity = GB_sparsity (M) ; - GrB_Type ctype = problem_spec.getBinaryOp()->ztype ; - - int nthrd = p2lF.get_threads_per_block(); - int ntasks = p2elF.get_number_of_blocks(M); - - // fabricate data as if it came from phase1: - int64_t *nanobuckets = (int64_t*)rmm_wrap_malloc(NBUCKETS * nthrd * ntasks * sizeof (int64_t)); - int64_t *blockbucket = (int64_t*)rmm_wrap_malloc(NBUCKETS * ntasks * sizeof (int64_t)); - int64_t *bucketp = (int64_t*)rmm_wrap_malloc((NBUCKETS+1) * sizeof (int64_t)); - int64_t *bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t)); - int64_t *offset = (int64_t*)rmm_wrap_malloc(NBUCKETS * sizeof (int64_t)); - - fillvector_constant(NBUCKETS, bucketp, (int64_t)0); - fillvector_constant(NBUCKETS, offset, (int64_t)0); - //fillvector_constant(problem_spec.getCnnz(), bucket, (int64_t)0); - - std::cout << "sparse phase1 kernel" << std::endl; - kernTimer.Start(); - p1lF.jitGridBlockLaunch(nanobuckets, blockbucket, - C, M, A, B, strm); - CHECK_CUDA(cudaStreamSynchronize(strm)); - kernTimer.Stop(); - std::cout<<"sparse test phase1 kernel "<nzombies += (bucketp[1]); //add pre-zombies to the count; - - GRB_TRY(GrB_Matrix_wait(C, GrB_MATERIALIZE)); - fflush(stdout); - - GrB_Matrix C_expected; - GrB_Type type = cuda::jit::to_grb_type(); - GRB_TRY (GrB_Matrix_new (&C_expected, type, N, N)) ; - - // ensure the GPU is not used - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, -1)) ; - GB_Global_hack_set (2, 2) ; // hack(2) = 2: never use the GPU - - // Use GrB_DESC_S for structural because dot3 mask will never be complemented - // The order of B and A is swapped to account for CSR vs CSC assumption - GRB_TRY (GrB_mxm(C_expected, problem_spec.getM(), NULL, problem_spec.get_semiring(), problem_spec.getB(), - problem_spec.getA(), problem_spec.get_mask_struct() ? 
GrB_DESC_ST1 : GrB_DESC_T1)); - - - GRB_TRY(GrB_Matrix_wait(C_expected, GrB_MATERIALIZE)); - - // compare - double tol = 0 ; - GrB_Index nvals1 = 0, nvals2 = 0 ; - GRB_TRY (GrB_Matrix_nvals (&nvals1, C)) ; - GRB_TRY (GrB_Matrix_nvals (&nvals2, C_expected)) ; - if (nvals1 != nvals2) { printf ("Wrong number of nonzeroes found, test fail!!!\n") ; ADD_FAILURE( ) ; } - GrB_Index nrows, ncols ; - GrB_Matrix_nrows (&nrows, C_expected) ; - GrB_Matrix_ncols (&ncols, C_expected) ; - - GrB_Matrix T; - - GRB_TRY (GrB_Matrix_new (&T, GrB_BOOL, nrows, ncols)) ; - GrB_BinaryOp op = NULL; - GrB_UnaryOp op_abs = NULL ; - if (type == GrB_BOOL ) op = GrB_EQ_BOOL ; - else if (type == GrB_INT8 ) op = GrB_EQ_INT8 ; - else if (type == GrB_INT16 ) op = GrB_EQ_INT16 ; - else if (type == GrB_INT32 ) op = GrB_EQ_INT32 ; - else if (type == GrB_INT64 ) op = GrB_EQ_INT64 ; - else if (type == GrB_UINT8 ) op = GrB_EQ_UINT8 ; - else if (type == GrB_UINT16) op = GrB_EQ_UINT16 ; - else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; - else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; - else if (type == GrB_FP32 ) - { tol = 1e-6; - op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; - op_abs = GrB_ABS_FP32 ; - } - else if (type == GrB_FP64 ) - { tol = 1e12; - op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; - op_abs = GrB_ABS_FP64 ; - } - else if (type == GxB_FC32 ) - { tol = 2e-6; - op = (tol == 0)? GxB_EQ_FC32 : GxB_MINUS_FC32 ; - op_abs = GxB_ABS_FC32 ; - } - else if (type == GxB_FC64 ) - { tol = 2e-12; - op = (tol == 0)? GxB_EQ_FC64 : GxB_MINUS_FC64 ; - op_abs = GxB_ABS_FC64 ; - } - - - - if (tol == 0) - { - // check for perfect equality - GRB_TRY (GrB_Matrix_eWiseMult_BinaryOp (T, NULL, NULL, op, C, C_expected, - NULL)) ; - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, T)) ; -// if (nvals1 != nvals3) { printf (" difference matrix wrong size, test fail!!\n") ; ADD_FAILURE( ) ; } - bool is_same = false ; - GRB_TRY (GrB_Matrix_reduce_BOOL (&is_same, NULL, GrB_LAND_MONOID_BOOL, - T, NULL)) ; - if (!is_same) { printf (" results don't match, test fail!!\n") ; ADD_FAILURE ( ) ; } - GRB_TRY (GrB_Matrix_free (&T)) ; - } - else - { - // TODO: check with roundoff - // Diff = C - C_expected - GrB_Matrix Diff ; - GRB_TRY (GrB_Matrix_new (&Diff, GrB_FP64, nrows, ncols)) ; - GRB_TRY (GrB_Matrix_apply (Diff, NULL, NULL, GrB_AINV_FP64, C_expected, NULL)) ; - GRB_TRY (GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_PLUS_FP64, - C, Diff, NULL)) ; - GRB_TRY( GrB_Matrix_apply( Diff, NULL, NULL, op_abs, Diff, NULL) ); - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, Diff)) ; - if (nvals1 != nvals3) { printf ("fp difference matrix wrong size, test fail!!\n") ; ADD_FAILURE( ) ; } - double is_same = false ; - GRB_TRY (GrB_Matrix_reduce_FP64 (&is_same, NULL, GrB_PLUS_MONOID_FP64, - Diff, NULL)) ; - printf("difference = %12.6g, rel_l1_err=%12.6g\n", is_same, is_same/nvals3 ); - EXPECT_LT( is_same/nvals3, tol); - GRB_TRY (GrB_Matrix_free (&Diff)) ; - - } - - // re-enable the GPU - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, 0)) ; - GB_Global_hack_set (2, 1) ; // hack(2) = 1: always use the GPU - - rmm_wrap_free(nanobuckets); - rmm_wrap_free(blockbucket); - rmm_wrap_free(bucketp); - rmm_wrap_free(bucket); - rmm_wrap_free(offset); - GRB_TRY(GrB_Matrix_free(&C_expected)); - CHECK_CUDA(cudaStreamDestroy(strm)); - - std::cout << "phase 3 test complete ======================" << std::endl; - return result; -} - -template < - typename T_C, typename T_M, typename T_A,typename T_B, - typename T_X, typename T_Y, typename T_Z> 
-bool test_AxB_dot3_dense_factory(mxm_problem_spec &problem_spec) { - - std::cout << "phase dense test ======================" << std::endl; - - GpuTimer kernTimer; - - cudaStream_t strm = (cudaStream_t)rmm_wrap_get_main_stream(); - - bool result = false; - - int64_t N = problem_spec.getN(); - - auto mymxm = problem_spec.get_mxm_factory(); - dense_phase1launchFactory p1lF(mymxm); - - GrB_Matrix C = problem_spec.getC(); - GrB_Matrix M = problem_spec.getM(); - GrB_Matrix A = problem_spec.getA(); - GrB_Matrix B = problem_spec.getB(); - - problem_spec.set_sparsity_control( A, GxB_FULL, GxB_BY_ROW); - problem_spec.set_sparsity_control( B, GxB_FULL, GxB_BY_ROW); - - const int64_t mnz = GB_nnz (M) ; - const int64_t cnz = GB_nnz (C) ; - const int64_t cvlen = C->vlen ; - const int64_t cvdim = C->vdim ; - const int64_t cnvec = C->nvec ; - - bool C_iso = false ; - GrB_Type ctype = problem_spec.getBinaryOp()->ztype ; - - std::cout << "Running phase1 kernel" << std::endl; - kernTimer.Start(); - p1lF.jitGridBlockLaunch(C, M, A, B, strm); - CHECK_CUDA(cudaStreamSynchronize(strm)); - kernTimer.Stop(); - std::cout<<"Dense internal phase1 kernel done "<(); - GRB_TRY (GrB_Matrix_new (&C_expected, type, N, N)) ; - - // ensure the GPU is not used - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, -1)) ; - GB_Global_hack_set (2, 2) ; // hack(2) = 2: never use the GPU - - // Use GrB_DESC_S for structural because dot3 mask will never be complemented - // The order of B and A is swapped to account for CSR vs CSC assumption - GRB_TRY (GrB_mxm(C_expected, problem_spec.getM(), NULL, problem_spec.get_semiring(), problem_spec.getB(), - problem_spec.getA(), problem_spec.get_mask_struct() ? GrB_DESC_ST1 : GrB_DESC_T1)); - - GRB_TRY(GrB_Matrix_wait(C_expected, GrB_MATERIALIZE)); - std::cout << "nnz: " << GB_nnz (C_expected) << std::endl ; - - // compare - double tol = 0 ; - GrB_Index nvals1 = 0, nvals2 = 0 ; - GRB_TRY (GrB_Matrix_nvals (&nvals1, C)) ; - GRB_TRY (GrB_Matrix_nvals (&nvals2, C_expected)) ; - if (nvals1 != nvals2) { printf ("Wrong number of nonzeroes found, test fail!!! nvals1=%lu, nvals2=%lu\n", nvals1, nvals2) ; ADD_FAILURE( ) ; } - GrB_Index nrows, ncols ; - GrB_Matrix_nrows (&nrows, C_expected) ; - GrB_Matrix_ncols (&ncols, C_expected) ; - - GrB_Matrix T; - - GRB_TRY (GrB_Matrix_new (&T, GrB_BOOL, nrows, ncols)) ; - GrB_BinaryOp op = NULL; - GrB_UnaryOp op_abs = NULL ; - if (type == GrB_BOOL ) op = GrB_EQ_BOOL ; - else if (type == GrB_INT8 ) op = GrB_EQ_INT8 ; - else if (type == GrB_INT16 ) op = GrB_EQ_INT16 ; - else if (type == GrB_INT32 ) op = GrB_EQ_INT32 ; - else if (type == GrB_INT64 ) op = GrB_EQ_INT64 ; - else if (type == GrB_UINT8 ) op = GrB_EQ_UINT8 ; - else if (type == GrB_UINT16) op = GrB_EQ_UINT16 ; - else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; - else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; - else if (type == GrB_FP32 ) - { tol = 5e-6; - op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; - op_abs = GrB_ABS_FP32 ; - } - else if (type == GrB_FP64 ) - { tol = 1e12; - op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; - op_abs = GrB_ABS_FP64 ; - } - else if (type == GxB_FC32 ) - { tol = 2e-6; - op = (tol == 0)? GxB_EQ_FC32 : GxB_MINUS_FC32 ; - op_abs = GxB_ABS_FC32 ; - } - else if (type == GxB_FC64 ) - { tol = 2e-12; - op = (tol == 0)? 
GxB_EQ_FC64 : GxB_MINUS_FC64 ; - op_abs = GxB_ABS_FC64 ; - } - - - - if (tol == 0) - { - // check for perfect equality - GRB_TRY (GrB_Matrix_eWiseMult_BinaryOp (T, NULL, NULL, op, C, C_expected, - NULL)) ; - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, T)) ; -// if (nvals1 != nvals3) { printf (" difference matrix wrong size, test fail!! nvals1=%ld nvals3=%ld\n", nvals1, nvals3) ; ADD_FAILURE( ) ; } - bool is_same = false ; - GRB_TRY (GrB_Matrix_reduce_BOOL (&is_same, NULL, GrB_LAND_MONOID_BOOL, - T, NULL)) ; - if (!is_same) { printf (" results don't match, test fail!!\n") ; ADD_FAILURE ( ) ; } - GRB_TRY (GrB_Matrix_free (&T)) ; - } - else - { - // TODO: check with roundoff - // Diff = C - C_expected - GrB_Matrix Diff ; - GRB_TRY (GrB_Matrix_new (&Diff, GrB_FP64, nrows, ncols)) ; - GRB_TRY (GrB_Matrix_apply (Diff, NULL, NULL, GrB_AINV_FP64, C_expected, NULL)) ; - GRB_TRY (GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_PLUS_FP64, - C, Diff, NULL)) ; - GRB_TRY( GrB_Matrix_apply( Diff, NULL, NULL, op_abs, Diff, NULL) ); - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, Diff)) ; - if (nvals1 != nvals3) { printf ("fp difference matrix wrong size, test fail!!\n") ; ADD_FAILURE( ) ; } - double is_same = false ; - GRB_TRY (GrB_Matrix_reduce_FP64 (&is_same, NULL, GrB_PLUS_MONOID_FP64, - Diff, NULL)) ; - printf("difference = %12.6g, rel_l1_err=%12.6g\n", is_same, is_same/nvals3 ); - EXPECT_LT( is_same/nvals3, tol); - GRB_TRY (GrB_Matrix_free (&Diff)) ; - - } - - // re-enable the GPU - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, 0)) ; - GB_Global_hack_set (2, 1) ; // hack(2) = 1: always use the GPU - - - GRB_TRY(GrB_Matrix_free(&C_expected)); - - std::cout << "phase 3 dense test complete ======================" << std::endl; - return result; -} - -template < - typename T_C, typename T_M, typename T_A,typename T_B, - typename T_X, typename T_Y, typename T_Z> -bool test_AxB_dot3_sparse_dense_factory(mxm_problem_spec &problem_spec) { - - std::cout << "sparse dense test ======================" << std::endl; - - GpuTimer kernTimer; - - cudaStream_t strm; - CHECK_CUDA(cudaStreamCreate(&strm)); - - bool result = false; - - int64_t N = problem_spec.getN(); - - GrB_Matrix C = problem_spec.getC(); - GrB_Matrix M = problem_spec.getM(); - GrB_Matrix A = problem_spec.getA(); - GrB_Matrix B = problem_spec.getB(); - - problem_spec.set_sparsity_control( A, GxB_SPARSE, GxB_BY_ROW); - - // TODO: Need to make sure the format itself is actually dense. 
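-    // [Editorial sketch] One way to resolve the TODO above: after the
-    // set_sparsity_control call below, wait for the conversion to complete
-    // and query the sparsity status, asserting that B really is stored as
-    // GxB_FULL (the conversion cannot happen if any entries are missing):
-    //
-    //   GRB_TRY (GrB_Matrix_wait (B, GrB_MATERIALIZE)) ;
-    //   int B_status = 0 ;
-    //   GRB_TRY (GxB_Matrix_Option_get (B, GxB_SPARSITY_STATUS, &B_status)) ;
-    //   ASSERT_EQ (B_status, GxB_FULL) ;  // gtest: fail fast if still sparse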
- problem_spec.set_sparsity_control( B, GxB_FULL, GxB_BY_ROW); - - auto mymxm = problem_spec.get_mxm_factory(); - dense_phase1launchFactory p1lF(mymxm); - - const int64_t mnz = GB_nnz (M) ; - const int64_t cnz = GB_nnz (C) ; - const int64_t cvlen = C->vlen ; - const int64_t cvdim = C->vdim ; - const int64_t cnvec = C->nvec ; - - bool C_iso = false ; - GrB_Type ctype = problem_spec.getBinaryOp()->ztype ; - - std::cout << "Running dense_phase1 kernel" << std::endl; - kernTimer.Start(); - p1lF.jitGridBlockLaunch(C, M, A, B, strm); - CHECK_CUDA(cudaStreamSynchronize(strm)); - kernTimer.Stop(); - std::cout<<"Dense internal phase1 kernel done "<(); - GRB_TRY (GrB_Matrix_new (&C_expected, type, N, N)) ; - - // ensure the GPU is not used - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, -1)) ; - GB_Global_hack_set (2, 2) ; // hack(2) = 2: never use the GPU - - // Use GrB_DESC_S for structural because dot3 mask will never be complemented - // The order of B and A is swapped to account for CSR vs CSC assumption - GRB_TRY (GrB_mxm(C_expected, problem_spec.getM(), NULL, problem_spec.get_semiring(), problem_spec.getB(), - problem_spec.getA(), problem_spec.get_mask_struct() ? GrB_DESC_ST1 : GrB_DESC_T1)); - - GRB_TRY(GrB_Matrix_wait(C_expected, GrB_MATERIALIZE)); - std::cout << "nnz: " << GB_nnz (C_expected) << std::endl ; - - // compare - double tol = 0 ; - GrB_Index nvals1 = 0, nvals2 = 0 ; - GRB_TRY (GrB_Matrix_nvals (&nvals1, C)) ; - GRB_TRY (GrB_Matrix_nvals (&nvals2, C_expected)) ; - if (nvals1 != nvals2) { printf ("Wrong number of nonzeroes found, test fail!!! nvals1=%lu, nvals2=%lu\n", nvals1, nvals2) ; ADD_FAILURE( ) ; } - GrB_Index nrows, ncols ; - GrB_Matrix_nrows (&nrows, C_expected) ; - GrB_Matrix_ncols (&ncols, C_expected) ; - - GrB_Matrix T; - - GRB_TRY (GrB_Matrix_new (&T, GrB_BOOL, nrows, ncols)) ; - GrB_BinaryOp op = NULL; - GrB_UnaryOp op_abs = NULL ; - if (type == GrB_BOOL ) op = GrB_EQ_BOOL ; - else if (type == GrB_INT8 ) op = GrB_EQ_INT8 ; - else if (type == GrB_INT16 ) op = GrB_EQ_INT16 ; - else if (type == GrB_INT32 ) op = GrB_EQ_INT32 ; - else if (type == GrB_INT64 ) op = GrB_EQ_INT64 ; - else if (type == GrB_UINT8 ) op = GrB_EQ_UINT8 ; - else if (type == GrB_UINT16) op = GrB_EQ_UINT16 ; - else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; - else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; - else if (type == GrB_FP32 ) - { tol = 5e-6; - op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; - op_abs = GrB_ABS_FP32 ; - } - else if (type == GrB_FP64 ) - { tol = 1e12; - op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; - op_abs = GrB_ABS_FP64 ; - } - else if (type == GxB_FC32 ) - { tol = 2e-6; - op = (tol == 0)? GxB_EQ_FC32 : GxB_MINUS_FC32 ; - op_abs = GxB_ABS_FC32 ; - } - else if (type == GxB_FC64 ) - { tol = 2e-12; - op = (tol == 0)? GxB_EQ_FC64 : GxB_MINUS_FC64 ; - op_abs = GxB_ABS_FC64 ; - } - - - - if (tol == 0) - { - // check for perfect equality - GRB_TRY (GrB_Matrix_eWiseMult_BinaryOp (T, NULL, NULL, op, C, C_expected, - NULL)) ; - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, T)) ; -// if (nvals1 != nvals3) { printf (" difference matrix wrong size, test fail!! 
nvals1=%ld nvals3=%ld\n", nvals1, nvals3) ; ADD_FAILURE( ) ; } - bool is_same = false ; - GRB_TRY (GrB_Matrix_reduce_BOOL (&is_same, NULL, GrB_LAND_MONOID_BOOL, - T, NULL)) ; - if (!is_same) { printf (" results don't match, test fail!!\n") ; ADD_FAILURE ( ) ; } - GRB_TRY (GrB_Matrix_free (&T)) ; - } - else - { - // TODO: check with roundoff - // Diff = C - C_expected - GrB_Matrix Diff ; - GRB_TRY (GrB_Matrix_new (&Diff, GrB_FP64, nrows, ncols)) ; - GRB_TRY (GrB_Matrix_apply (Diff, NULL, NULL, GrB_AINV_FP64, C_expected, NULL)) ; - GRB_TRY (GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_PLUS_FP64, - C, Diff, NULL)) ; - GRB_TRY( GrB_Matrix_apply( Diff, NULL, NULL, op_abs, Diff, NULL) ); - GrB_Index nvals3 = 1 ; - GRB_TRY (GrB_Matrix_nvals (&nvals3, Diff)) ; - if (nvals1 != nvals3) { printf ("fp difference matrix wrong size, test fail!!\n") ; ADD_FAILURE( ) ; } - double is_same = false ; - GRB_TRY (GrB_Matrix_reduce_FP64 (&is_same, NULL, GrB_PLUS_MONOID_FP64, - Diff, NULL)) ; - printf("difference = %12.6g, rel_l1_err=%12.6g\n", is_same, is_same/nvals3 ); - EXPECT_LT( is_same/nvals3, tol); - GRB_TRY (GrB_Matrix_free (&Diff)) ; - - } - - // re-enable the GPU - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, 0)) ; - GB_Global_hack_set (2, 1) ; // hack(2) = 1: always use the GPU - - - GRB_TRY(GrB_Matrix_free(&C_expected)); - CHECK_CUDA(cudaStreamDestroy(strm)); - - std::cout << "phase 3 dense test complete ======================" << std::endl; - return result; -} - - -template -bool test_reduce_factory(mxm_problem_spec &problem_spec) { - - std::cout << "reduce test ======================" << std::endl; - - // TODO: This test doesn't really fit the `mxm` category - GrB_Monoid monoid = problem_spec.getMonoid(); - int64_t N = problem_spec.getN(); - - GrB_Matrix A; - - // TODO: Using C here so that the reduced type matches - GrB_Matrix_dup(&A, problem_spec.getC()); - GrB_Type type = cuda::jit::to_grb_type(); - - A->i[0] = GB_FLIP(A->i[0]); // FIXME - A->i[1] = GB_FLIP(A->i[1]); // FIXME - A->nzombies = 2; // FIXME: use an opaque method to insert zombies into A - - //GRB_TRY (GxB_Matrix_fprint (A, "A", GxB_SHORT_VERBOSE, stdout)) ; - - GB_cuda_reduce_factory myreducefactory; - myreducefactory.reduce_factory(monoid, A); - - T_C actual; - GB_cuda_reduce(myreducefactory, A, &actual, monoid ); - - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, -1)) ; - GB_Global_hack_set (2, 2) ; // hack(2) = 2: never use the GPU - - T_C expected; - GRB_TRY(cuda::jit::matrix_reduce(&expected, A, monoid)); - - GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, 0)) ; - GB_Global_hack_set (2, 1) ; // hack(2) = 1: always use the GPU - - double tol = 0; - GrB_BinaryOp op = NULL; - GrB_UnaryOp op_abs = NULL ; - - if (type == GrB_BOOL ) op = GrB_EQ_BOOL ; - else if (type == GrB_INT8 ) op = GrB_EQ_INT8 ; - else if (type == GrB_INT16 ) op = GrB_EQ_INT16 ; - else if (type == GrB_INT32 ) op = GrB_EQ_INT32 ; - else if (type == GrB_INT64 ) op = GrB_EQ_INT64 ; - else if (type == GrB_UINT8 ) op = GrB_EQ_UINT8 ; - else if (type == GrB_UINT16) op = GrB_EQ_UINT16 ; - else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; - else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; - else if (type == GrB_FP32 ) - { tol = 1e-6; - op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; - op_abs = GrB_ABS_FP32 ; - } - else if (type == GrB_FP64 ) - { tol = 1e12; - op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; - op_abs = GrB_ABS_FP64 ; - } - else if (type == GxB_FC32 ) - { tol = 2e-6; - op = (tol == 0)? 
GxB_EQ_FC32 : GxB_MINUS_FC32 ; - op_abs = GxB_ABS_FC32 ; - } - else if (type == GxB_FC64 ) - { tol = 2e-12; - op = (tol == 0)? GxB_EQ_FC64 : GxB_MINUS_FC64 ; - op_abs = GxB_ABS_FC64 ; - } - - if(tol == 0) { - EXPECT_EQ( actual , expected); - //std::cout << "results do not match: reduced=" << expected << ", actual=" << actual << std::endl; - //exit(1); - } else if ( (tol > 0) && ( ( type ==GrB_FP32) || ( type ==GxB_FC32) - || ( type ==GrB_FP64) || ( type ==GxB_FC64) ) ){ - EXPECT_LT( abs((double)actual - (double)expected)/(abs((double)expected)+1.e-12), tol) ; - } - - std::cout<< expected<< " " << actual<< "reduce test complete ======================" << std::endl; - GRB_TRY(GrB_Matrix_free(&A)); - - return expected == actual; -} - diff --git a/GraphBLAS/CUDA/test/problem_spec.hpp b/GraphBLAS/CUDA/test/problem_spec.hpp deleted file mode 100644 index c1997771ab..0000000000 --- a/GraphBLAS/CUDA/test/problem_spec.hpp +++ /dev/null @@ -1,129 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/problem_spec.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#pragma once - -#include -#include -#include -#include -#include -#include "GpuTimer.h" -#include "GB_cuda_buckets.h" -#include "../../rmm_wrap/rmm_wrap.h" -#include -#include "test_data.hpp" -extern "C" { -#include "GB.h" -} - -#include "../GB_cuda_common_jitFactory.hpp" -#include "../GB_cuda_mxm_dot3_jitFactory.hpp" -#include "../GB_cuda_reduce_jitFactory.hpp" -#include "dataFactory.hpp" - -template -class mxm_problem_spec { - -public: - mxm_problem_spec(GrB_Monoid monoid_, GrB_BinaryOp binop_, int64_t N_, int64_t Annz_, int64_t Bnnz_, int64_t Cnnz_, - int sparsity_control_A_ = GxB_SPARSE, int sparsity_control_B_ = GxB_SPARSE) : - mysemiring(), binop(binop_), monoid(monoid_), N(N_), - G(N_, N_), Annz(Annz_), Bnnz(Bnnz_), Cnnz(Cnnz_), mask_struct(true), flipxy(false), mask_comp(false) { - - // FIXME: This should be getting set automatically somehow. 
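Note (for context, not part of this patch): the generated gtest suites produced by testGen_cmake.py later in this diff construct this class as follows; the types and sizes here are one representative instantiation (the "tinyxtiny" shape), not a fixed API:

    GrB_Monoid monoid = GrB_PLUS_MONOID_INT32 ;
    GrB_BinaryOp binop = GrB_TIMES_INT32 ;
    mxm_problem_spec<int32_t, int32_t, int32_t, int32_t>
        problem_spec (monoid, binop, 128, 1256, 1028, 1640,
                      GxB_SPARSE, GxB_SPARSE) ;
    test_AxB_dot3_sparse_factory<int32_t, int32_t, int32_t, int32_t,
        int32_t, int32_t, int32_t> (problem_spec) ;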
- float Cnzpercent = (float) Cnnz_/(N_*N_); - - // TODO: Allocate and fill arrays for buckets and nano buckets - G.init_A(Annz_, sparsity_control_A_, GxB_BY_ROW); - G.init_B(Bnnz_, sparsity_control_B_, GxB_BY_ROW); - G.init_C(Cnzpercent); -// G.fill_buckets( TB ); // all elements go to testbucket= TB - - /************************ - * Create mxm factory - */ - auto grb_info = GrB_Semiring_new(&mysemiring, monoid_, binop_); - GRB_TRY (grb_info) ; - GrB_Matrix A = G.getA(); - GrB_Matrix B = G.getB(); - //GRB_TRY (GxB_Matrix_fprint (A, "A", GxB_SHORT_VERBOSE, stdout)) ; - //GRB_TRY (GxB_Matrix_fprint (B, "B", GxB_SHORT_VERBOSE, stdout)) ; - } - - ~mxm_problem_spec() { - - std::cout << "Calling G.del()" << std::endl; - G.del(); - - } - - GrB_Matrix getC(){ return G.getC(); } - GrB_Matrix getM(){ return G.getM(); } - GrB_Matrix getA(){ return G.getA(); } - GrB_Matrix getB(){ return G.getB(); } - - GrB_Monoid getMonoid() { return monoid; } - GrB_BinaryOp getBinaryOp() { return binop; } - - int64_t getN() { return N; } - int64_t getAnnz() { return Annz; } - int64_t getBnnz() { return Bnnz; } - int64_t getCnnz() { return Cnnz; } - - auto &getG() { return G; } - - GB_cuda_mxm_factory &get_mxm_factory() { - - // Lazily create the mxm factory - if(!mymxmfactory.has_value()) { - - mymxmfactory.emplace(GB_cuda_mxm_factory()); - GrB_Matrix C = G.getC(); - GrB_Matrix M = G.getM(); - GrB_Matrix A = G.getA(); - GrB_Matrix B = G.getB(); - - bool C_iso = false ; - int C_sparsity = GB_sparsity (M) ; - GrB_Type ctype = binop->ztype ; - - (*mymxmfactory).mxm_factory ( - C_iso, C_sparsity, ctype, - M, mask_struct, mask_comp, - mysemiring, flipxy, - A, B) ; - } - return *mymxmfactory; - } - GrB_Semiring get_semiring() { return mysemiring; } - - void set_sparsity_control(GrB_Matrix mat, int gxb_sparsity_control, int gxb_format) { - GRB_TRY (GxB_Matrix_Option_set (mat, GxB_SPARSITY_CONTROL, gxb_sparsity_control)) ; - GRB_TRY (GxB_Matrix_Option_set(mat, GxB_FORMAT, gxb_format)); - GRB_TRY (GrB_Matrix_wait (mat, GrB_MATERIALIZE)) ; - } - - bool get_mask_struct() { return mask_struct; } - -private: - - bool mask_struct{false}; - bool flipxy{false}; - bool mask_comp{false}; - - int64_t Annz; - int64_t Bnnz; - int64_t Cnnz; - int64_t N; - GrB_BinaryOp binop; - GrB_Monoid monoid; - GrB_Semiring mysemiring; - std::optional mymxmfactory; - SpGEMM_problem_generator G; -}; diff --git a/GraphBLAS/CUDA/test/run_tests.cpp b/GraphBLAS/CUDA/test/run_tests.cpp deleted file mode 100644 index 2618b9f688..0000000000 --- a/GraphBLAS/CUDA/test/run_tests.cpp +++ /dev/null @@ -1,45 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/run_tests.cpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#include - -#include "GraphBLAS_cuda.h" -#include "rmm_wrap.h" - -#include "test_utility.hpp" - -int main(int argc, char **argv) { - - size_t init_size, max_size, stream_pool_size; - init_size = 256*(1ULL<<10); - max_size = 256*(1ULL<<20); - stream_pool_size = 1; - - printf(" pool init size %ld, max size %ld\n", init_size, max_size); - rmm_wrap_initialize_all_same( rmm_wrap_managed, init_size, max_size, stream_pool_size); - - GRB_TRY (GxB_init (GxB_NONBLOCKING_GPU, - rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, rmm_wrap_free)) ; - - std::cout << "Done initializing graphblas and rmm" << std::endl; - - GRB_TRY 
(GxB_Global_Option_set (GxB_GLOBAL_GPU_ID, 0)) ; - - size_t buff_size = (1ULL<<13)+152; - void *p = (void *)rmm_wrap_allocate( &buff_size ); - - ::testing::InitGoogleTest(&argc, argv); - auto r = RUN_ALL_TESTS(); - - rmm_wrap_deallocate( p, buff_size); - GRB_TRY (GrB_finalize()); - rmm_wrap_finalize(); - std::cout << "Tests complete" << std::endl; - - return r; -} diff --git a/GraphBLAS/CUDA/test/testGen_cmake.py b/GraphBLAS/CUDA/test/testGen_cmake.py deleted file mode 100644 index 52857f0bd0..0000000000 --- a/GraphBLAS/CUDA/test/testGen_cmake.py +++ /dev/null @@ -1,176 +0,0 @@ -#------------------------------------------------------------------------------- -# GraphBLAS/CUDA/test/testGen_cmake.py -#------------------------------------------------------------------------------- - -# SPDX-License-Identifier: Apache-2.0 - -#------------------------------------------------------------------------------- - -# Generate test instances from a large tensor product set of options - -GB_TYPE_PREFIX = "GrB" - -SUPPORTED_TYPES = { - "int32_t": "INT32", - "uint32_t": "UINT32", - "int64_t": "INT64", - "uint64_t": "UINT64", - "bool": "BOOL", - "float": "FP32", - "double": "FP64" -} - -DOT3_BUCKETS = [1, 2] # NBUCKETS, hard-coded - -DataShapes ={ - "nanoxnano": {'N':32, 'Anz':64, 'Bnz':56, 'Cnz': 256}, - "tinyxtiny": {'N':128, 'Anz':1256, 'Bnz':1028, 'Cnz': 1640}, - "smallxsmall": {'N':1024, 'Anz': 65_536, 'Bnz':65_536, 'Cnz': 10000}, - "ti_denxti_den": {'N':32, 'Anz':1024, 'Bnz':1024, 'Cnz': 1024}, - "ti_spaxti_den": {'N':32, 'Anz':256, 'Bnz':1024, 'Cnz': 1024}, - "medxmed": {'N':4096, 'Anz': 2**20, 'Bnz':2**20}, - "largexlarge": {'N':2**16, 'Anz': 64*2**20, 'Bnz':64*2**20} -} - -FORMATS = { "sparse": ["phase1", "phase2", "mxm_sparse"], - "dense": ["dense_phase1", "mxm_dense"], - "sparse_dense": ["dense_phase1", "mxm_sparse_dense"], - "reduce": ["reduce"]} - -FORMAT_INPUTS = { - "sparse": [("GxB_SPARSE", "GxB_SPARSE")], - "dense": [("GxB_FULL", "GxB_FULL"), ("GxB_BITMAP", "GxB_BITMAP")], - "sparse_dense": [("GxB_SPARSE", "GxB_FULL")], - "reduce": [("GxB_SPARSE", "GxB_SPARSE")] -} - -FORMAT_DATASETS = { - "sparse": ["nanoxnano", "tinyxtiny", "smallxsmall"], - "dense": ["ti_denxti_den"], - "sparse_dense": ["ti_spaxti_den"], - "reduce": ["nanoxnano", "smallxsmall", "ti_denxti_den", "ti_spaxti_den"] -} - -def std_type_to_gb_type(t): - return SUPPORTED_TYPES[t] - -def build_gb_monioid(t, m): - # Example: GrB_PLUS_MONIOD_UINT64 - gb_type = std_type_to_gb_type(t) - return f"{GB_TYPE_PREFIX}_{m}_MONOID_{gb_type}" - -def build_gb_binop(t, b): - # Example: GrB_TIMES_UINT64 - gb_type = std_type_to_gb_type(t) - return f"{GB_TYPE_PREFIX}_{b}_{gb_type}" - - - - -def buildTest(ts="TestsuiteName", ds="tiny-tiny", df=("GxB_SPARSE", "GxB_SPARSE"), - SUM="PLUS", PRODUCT="TIMES", - typeC="int32_t",typeM="int32_t", - typeA="int32_t",typeB="int32_t", - type_x="int32_t", type_y="int32_t",type_z="int32_t"): - - # build string interpolation from pieces - format_A, format_B = df - - Test_name = f"{ds}{SUM}_{PRODUCT}__{format_A}_{format_B}__C{typeC}M{typeM}A{typeA}B{typeB}X{type_x}Y{type_y}Z{type_z}" - Test_suite = f"{ts}" - - N = DataShapes[ds]['N'] - Anz = DataShapes[ds]['Anz'] - Bnz = DataShapes[ds]['Bnz'] - Cnz = DataShapes[ds]['Cnz'] - - gb_monoid = build_gb_monioid(typeC, SUM) - gb_binop = build_gb_binop(typeC, PRODUCT) - - TEST_HEAD = f""" - TEST( {Test_suite}, {Test_name}) {{ - - /************************** - * Create reference and input data - */ - GrB_Monoid monoid = {gb_monoid}; - GrB_BinaryOp binop = {gb_binop}; - - 
mxm_problem_spec<{typeC}, {typeM}, {typeA}, {typeB}> problem_spec(monoid, binop, {N}, {Anz}, {Bnz}, {Cnz}, - {format_A}, {format_B}); - """ - phase1_body= f""" test_AxB_phase1_factory< {typeC}, {typeM}, {typeA}, {typeB}>(problem_spec);""" - phase2_body= f""" test_AxB_phase2_factory< {typeC}, {typeM}, {typeA}, {typeB} >(problem_spec);""" - dense_phase1_body = f""" test_AxB_dense_phase1_factory<{typeC}, {typeM}, {typeA}, {typeB}>(problem_spec);""" - mxm_sparse_body = f""" test_AxB_dot3_sparse_factory< {typeC},{typeM},{typeA},{typeB},{type_x},{type_y},{type_z} > (problem_spec);\n""" - mxm_dense_body = f""" test_AxB_dot3_dense_factory< {typeC},{typeM},{typeA},{typeB},{type_x},{type_y},{type_z} > (problem_spec);\n""" - mxm_sparse_dense_body = f""" test_AxB_dot3_sparse_dense_factory< {typeC},{typeM},{typeA},{typeB},{type_x},{type_y},{type_z} > (problem_spec);\n""" - reduce_body = f""" test_reduce_factory<{typeC}, {typeM}, {typeA}, {typeB}>(problem_spec);""" - phasedict = { "phase1": phase1_body, - "phase2": phase2_body, - "mxm_sparse": mxm_sparse_body, - "mxm_dense": mxm_dense_body, - "mxm_sparse_dense": mxm_sparse_dense_body, - "reduce": reduce_body, - "dense_phase1": dense_phase1_body } - - return TEST_HEAD, phasedict - -def load_types(argv): - test_suite_name = argv[2] - Monoids = argv[3].split(";") - Binops = argv[4].split(";") - Semirings = argv[5] - DataTypes = argv[6].split(";") - - # Hard-coding data shapes for now - Kernels= argv[7] - - return argv[1], test_suite_name, Monoids, Binops, Semirings, DataTypes, DataShapes, Kernels - -def write_test_instances_header(test_suite_name, mat_format, tests, Monoids, Binops, Semirings, DataTypes, DataShapes, Kernels): - outfile = f'{test_suite_name}_{Semirings}_{mat_format}_test_instances.hpp' - with open(outfile, 'w') as fp: - fp.write("#pragma once\n#include \"problem_spec.hpp\"\n"); - m, b = Semirings.split("_") - Test_suite = f'{test_suite_name}_tests_{mat_format}_{m}_{b}' - for dtC in DataTypes: - dtX = dtC - dtY = dtC - dtZ = dtC - for dtM in ["bool", "int32_t", "int64_t", "float", "double"]: - for dtA in DataTypes: - for dtB in DataTypes: - for ds in FORMAT_DATASETS[mat_format]: - for df in FORMAT_INPUTS[mat_format]: - TEST_HEAD, TEST_BODY = buildTest( Test_suite, ds, df, m, b, - dtC, dtM, dtA, dtB, dtX, dtY, dtZ) - fp.write( TEST_HEAD) - for test in tests: - fp.write( TEST_BODY[test] ) - fp.write( "}\n") - -def write_cuda_test(source_dir, test_suite_name, mat_format, semiring, kernel): - import shutil - - shutil.copy(f"{source_dir}/test/cuda_tests_template.cpp", f"{test_suite_name}_{semiring}_{mat_format}_cuda_tests.cpp") - - with open(f"{test_suite_name}_{semiring}_{mat_format}_cuda_tests.cpp", "a") as file_object: - # Keeping this as a separate file for now to allow for further nesting - # of test instances for each test_suite_name - file_object.write(f"\n#include \"{test_suite_name}_{semiring}_{mat_format}_test_instances.hpp\"") - -if __name__ == "__main__": - import sys - - if(len(sys.argv) != 8): - raise ValueError("Expected 7 arguments but only got %s" % len(sys.argv)) - - """ - First load values - """ - source_dir, test_suite_name, Monoids, Binops, Semirings, DataTypes, DataShapes, Kernels = load_types(sys.argv) - - for mat_format, tests in FORMATS.items(): - write_test_instances_header(test_suite_name, mat_format, tests, Monoids, Binops, Semirings, DataTypes, DataShapes, DOT3_BUCKETS) - write_cuda_test(source_dir, test_suite_name, mat_format, Semirings, Kernels) diff --git a/GraphBLAS/CUDA/test/test_data.hpp 
b/GraphBLAS/CUDA/test/test_data.hpp deleted file mode 100644 index d6cca87d0e..0000000000 --- a/GraphBLAS/CUDA/test/test_data.hpp +++ /dev/null @@ -1,113 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/test_data.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#include -#include - -#pragma once - -template -class TestData { - -public: - TestData( std::vector A_indptr_, - std::vector A_indices_, - std::vector A_data_, - - std::vector B_indptr_, - std::vector B_indices_, - std::vector B_data_, - - - std::vector C_indptr_, - std::vector C_indices_, - std::vector C_data_, - - std::vector M_indptr_, - std::vector M_indices_, - std::vector M_data_): - A_indptr(A_indptr_), A_indices(A_indices_), A_data(A_data_), - B_indptr(B_indptr_), B_indices(B_indices_), B_data(B_data_), - C_indptr(C_indptr_), C_indices(C_indices_), C_data(C_data_), - M_indptr(M_indptr_), M_indices(M_indices_), M_data(M_data_){} - - - std::vector A_indptr; - std::vector A_indices; - std::vector A_data; - - std::vector B_indptr; - std::vector B_indices; - std::vector B_data; - - - std::vector C_indptr; - std::vector C_indices; - std::vector C_data; - - std::vector M_indptr; - std::vector M_indices; - std::vector M_data; - -}; - -template -std::unique_ptr> make_karate_tricount() { - - std::vector A_indptr = { 0,16,24,32,35,37,40,41,41,44,45,45,45,45,46,48,50,50,50,52,53,55,55,57, - 62,65,66,68,69,71,73,75,77,78,78}; - std::vector A_indices = { 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,17,19,21,31, 2, 3, 7,13,17,19,21,30, - 3, 7, 8, 9,13,27,28,32, 7,12,13, 6,10, 6,10,16,16,30,32,33,33,33,32,33, - 32,33,32,33,33,32,33,32,33,25,27,29,32,33,25,27,31,31,29,33,33,31,33,32, - 33,32,33,32,33,33}; - std::vector A_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1}; - - std::vector B_indptr = { 0, 0, 1, 3, 6, 7, 8,11,15,17,18,21,22,24,28,28,28,30,32,32,34,34,36,36, - 36,36,38,38,41,42,44,46,50,61,78}; - std::vector B_indices = { 0, 0, 1, 0, 1, 2, 0, 0, 0, 4, 5, 0, 1, 2, 3, 0, 2, 2, 0, 4, 5, 0, 0, 3, - 0, 1, 2, 3, 5, 6, 0, 1, 0, 1, 0, 1,23,24, 2,23,24, 2,23,26, 1, 8, 0,24, - 25,28, 2, 8,14,15,18,20,22,23,29,30,31, 8, 9,13,14,15,18,19,20,22,23,26, - 27,28,29,30,31,32}; - std::vector B_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1}; - - std::vector M_indptr = { 0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, - 82, 84, 87, 89, 91, 93, 98,101,104,106,110,113,117,121,127,139,156}; - std::vector M_indices = { 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,17,19,21,31, 0, 2, 3, 7,13,17,19,21, - 30, 0, 1, 3, 7, 8, 9,13,27,28,32, 0, 1, 2, 7,12,13, 0, 6,10, 0, 6,10,16, - 0, 4, 5,16, 0, 1, 2, 3, 0, 2,30,32,33, 2,33, 0, 4, 5, 0, 0, 3, 0, 1, 2, - 3,33,32,33,32,33, 5, 6, 0, 1,32,33, 0, 1,33,32,33, 0, 1,32,33,25,27,29, - 32,33,25,27,31,23,24,31,29,33, 2,23,24,33, 2,31,33,23,26,32,33, 1, 8,32, - 33, 0,24,25,28,32,33, 2, 8,14,15,18,20,22,23,29,30,31,33, 8, 9,13,14,15, - 18,19,20,22,23,26,27,28,29,30,31,32}; - std::vector M_data = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1}; - - std::vector C_indptr = { 0, 0, 7,12,17,19,21,24,27,29,29,31,31,32,35,35,35,36,37,37,38,38,39,39, - 39,39,40,40,41,41,43,45,47,51,56}; - std::vector C_indices = { 2, 3, 7,13,17,19,21, 1, 3, 7, 8,13, 1, 2, 7,12,13, 6,10, 6,10, 4, 5,16, - 1, 2, 3, 2,32, 4, 5, 3, 1, 2, 3, 6, 1, 1, 1,31,33,32,33,32,33,25,33, 8, - 29,30,33,27,29,30,31,32}; - std::vector C_data = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 3, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, - 1, 1,10, 1, 2, 1, 1,10}; - - TestData karate_tricount(A_indptr, A_indices, A_data, - B_indptr, B_indices, B_data, - C_indptr, C_indices, C_data, - M_indptr, M_indices, M_data); - - return std::make_unique>(karate_tricount); -} - diff --git a/GraphBLAS/CUDA/test/test_jitify.cpp b/GraphBLAS/CUDA/test/test_jitify.cpp deleted file mode 100644 index 30aab0adc2..0000000000 --- a/GraphBLAS/CUDA/test/test_jitify.cpp +++ /dev/null @@ -1,51 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/test_jitify.cpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#include "jitify.hpp" -#include "GB_cuda_jitify_launcher.h" - -int main(int argc, char **argv) { - -#if 0 - -BROKEN - - std::string named_program = "GB_jit_AxB_phase2"; - std::string kern_name = "AxB_phase2"; - - - jitify::experimental::Program& program = *std::get<1>(named_program); - auto instantiated_kernel = program.kernel(kern_name).instantiate({}); - - // hashable name is program name - // string to be jitted is the actual prgram - // - - dim3 grid(1); - dim3 block(1); - -// std::cout<< kernel_name<<" with types " <mat, cnz); - - -#endif - -} diff --git a/GraphBLAS/CUDA/test/test_utility.hpp b/GraphBLAS/CUDA/test/test_utility.hpp deleted file mode 100644 index 3eb6ba942e..0000000000 --- a/GraphBLAS/CUDA/test/test_utility.hpp +++ /dev/null @@ -1,22 +0,0 @@ -//------------------------------------------------------------------------------ -// GraphBLAS/CUDA/test/test_utility.hpp -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -#pragma once - -// try calling a GrB_method and check the result -#define GRB_TRY(GrB_method) \ -{ \ - GrB_Info GB_info_result = GrB_method ; \ - if (GB_info_result < GrB_SUCCESS) \ - { \ - printf ("test failure: file %s line %d status %d\n", \ - __FILE__, __LINE__, GB_info_result) ; \ - exit (EXIT_FAILURE) ; \ - } \ -} - diff --git a/GraphBLAS/CUDA/GB_cuda_cumsum.cu b/GraphBLAS/CUDA/unused/GB_cuda_cumsum.cu similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_cumsum.cu rename to GraphBLAS/CUDA/unused/GB_cuda_cumsum.cu index da6b32f504..8ed9726afa 100644 --- a/GraphBLAS/CUDA/GB_cuda_cumsum.cu +++ b/GraphBLAS/CUDA/unused/GB_cuda_cumsum.cu @@ -18,7 +18,7 @@ // sum (count [0..j-1]). count [n] is implicitly zero on input. // On output, count [n] is the total sum. 
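Note (sketch, not part of this patch): the renamed, now-unused GB_cuda_cumsum.cu keeps the contract stated in the comment above; a minimal host-side illustration of that contract:

    // on input, count [0..n-1] holds counts and count [n] is implicitly
    // zero; on output, count [j] = sum (count [0..j-1]) and count [n]
    // holds the total sum
    static void cumsum (int64_t *count, int64_t n)
    {
        int64_t s = 0 ;
        for (int64_t j = 0 ; j <= n ; j++)
        {
            int64_t c = (j < n) ? count [j] : 0 ;
            count [j] = s ;
            s += c ;
        }
    }
    // example: with n = 4, count = {3,1,0,2,*} becomes {0,3,4,4,6}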
-#include "GB_cuda.h" +#include "GB_cuda.hpp" // #include #include diff --git a/GraphBLAS/CUDA/unused/GB_search_for_vector_device.cuh b/GraphBLAS/CUDA/unused/GB_search_for_vector_device.cuh new file mode 100644 index 0000000000..6384c1c840 --- /dev/null +++ b/GraphBLAS/CUDA/unused/GB_search_for_vector_device.cuh @@ -0,0 +1,69 @@ +//------------------------------------------------------------------------------ +// GB_search_for_vector_device +//------------------------------------------------------------------------------ + +static __device__ __inline__ int64_t GB_search_for_vector_device +( + const int64_t p, // search for vector k that contains p + const int64_t *restrict Ap, // vector pointers to search + int64_t kleft, // left-most k to search + int64_t anvec, // Ap is of size anvec+1 + int64_t avlen // A->vlen +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + if (Ap == NULL) + { + // A is full or bitmap + ASSERT (p >= 0 && p < avlen * anvec) ; + return ((avlen == 0) ? 0 : (p / avlen)) ; + } + + // A is sparse or hypersparse + ASSERT (p >= 0 && p < Ap [anvec]) ; + + //-------------------------------------------------------------------------- + // search for k + //-------------------------------------------------------------------------- + + int64_t k = kleft ; + int64_t kright = anvec ; + bool found ; + GB_SPLIT_BINARY_SEARCH (p, Ap, k, kright, found) ; + + // FIXME: this is not needed if the search is approximate: + if (found) + { + // Ap [k] == p has been found, but if k is an empty vector, then the + // next vector will also contain the entry p. In that case, k needs to + // be incremented until finding the first non-empty vector for which + // Ap [k] == p. + ASSERT (Ap [k] == p) ; + while (k < anvec-1 && Ap [k+1] == p) + { + k++ ; + } + } + else + { + // p has not been found in Ap, so it appears in the middle of Ap [k-1] + // ... Ap [k], as computed by the binary search. This is the range of + // entries for the vector k-1, so k must be decremented. + k-- ; + } + + //-------------------------------------------------------------------------- + // return result + //-------------------------------------------------------------------------- + + // The entry p must reside in a non-empty vector. + ASSERT (k >= 0 && k < anvec) ; + ASSERT (Ap [k] <= p && p < Ap [k+1]) ; + + return (k) ; +} + diff --git a/GraphBLAS/Config/GB_config.h.in b/GraphBLAS/Config/GB_config.h.in index 3585ce4c20..513cc40f94 100644 --- a/GraphBLAS/Config/GB_config.h.in +++ b/GraphBLAS/Config/GB_config.h.in @@ -21,7 +21,7 @@ // GB_C_FLAGS: the C compiler flags used to compile GraphBLAS. 
Used // for compiling and linking: #ifndef GB_C_FLAGS -#define GB_C_FLAGS "@GB_C_FLAGS@" +#define GB_C_FLAGS "@GB_C_FLAGS@ @GB_OPENMP_C_FLAGS@" #endif // GB_C_LINK_FLAGS: the flags passed to the C compiler for the link phase: @@ -49,7 +49,7 @@ #define GB_OMP_INC "@GB_OMP_INC@" #endif -// GB_OMP_INC_DIRS: include directories OpenMP, if in use by GraphBLAS, +// GB_OMP_INC_DIRS: include directories for OpenMP, if in use by GraphBLAS, // for cmake: #ifndef GB_OMP_INC_DIRS #define GB_OMP_INC_DIRS "@GB_OMP_INC_DIRS@" @@ -65,5 +65,25 @@ #define GB_CMAKE_LIBRARIES "@GB_CMAKE_LIBRARIES@" #endif +// GB_CUDA_COMPILER: the CUDA compiler to compile CUDA JIT kernels: +#ifndef GB_CUDA_COMPILER +#define GB_CUDA_COMPILER "@GB_CUDA_COMPILER@" +#endif + +// GB_CUDA_FLAGS: the CUDA flags to compile CUDA JIT kernels: +#ifndef GB_CUDA_FLAGS +#define GB_CUDA_FLAGS "@GB_CUDA_FLAGS@" +#endif + +// GB_CUDA_INC: -I includes for CUDA JIT kernels: +#ifndef GB_CUDA_INC +#define GB_CUDA_INC "@GB_CUDA_INC@" +#endif + +// GB_CUDA_ARCHITECTURES: the CUDA ARCHITECTURES for CUDA JIT kernels: +#ifndef GB_CUDA_ARCHITECTURES +#define GB_CUDA_ARCHITECTURES "@GB_CUDA_ARCHITECTURES@" +#endif + #endif diff --git a/GraphBLAS/Config/GraphBLAS.h.in b/GraphBLAS/Config/GraphBLAS.h.in index e01f63c2c3..0dbdd56fe8 100644 --- a/GraphBLAS/Config/GraphBLAS.h.in +++ b/GraphBLAS/Config/GraphBLAS.h.in @@ -279,7 +279,7 @@ // The 'spec' string describes the GraphBLAS spec: #define GxB_SPEC_ABOUT \ "GraphBLAS C API, by Benjamin Brock, Aydin Buluc, Raye Kimmerer,\n" \ -"Jim Kitchen, Major Kumar, Timothy Mattson, Scott McMillan, Jose' Moreira,\n" \ +"Jim Kitchen, Manoj Kumar, Timothy Mattson, Scott McMillan, Jose' Moreira,\n" \ "Erik Welch, and Carl Yang. Based on 'GraphBLAS Mathematics by Jeremy\n" \ "Kepner. See also 'Graph Algorithms in the Language of Linear Algebra,'\n" \ "edited by J. Kepner and J. Gilbert, SIAM, 2011.\n" @@ -3772,6 +3772,8 @@ typedef enum // for global options or matrix options GxB_JIT_USE_CMAKE = 7032, // CPU JIT: use cmake or direct compile GxB_JIT_ERROR_LOG = 7033, // CPU JIT: error log file + GxB_JIT_CUDA_PREFACE = 7100, // CUDA JIT C++ preface + //------------------------------------------------------------ // GrB_get for GrB_Matrix: //------------------------------------------------------------ @@ -3973,7 +3975,7 @@ GrB_Info GxB_Context_get (GxB_Context, GxB_Context_Field, ...) ; _Generic \ ( \ (arg1), \ - default : GxB_Global_Option_set , \ + default: GxB_Global_Option_set , \ GxB_Option_Field : GxB_Global_Option_set , \ GrB_Vector : GxB_Vector_Option_set , \ GrB_Matrix : GxB_Matrix_Option_set , \ @@ -3986,7 +3988,7 @@ GrB_Info GxB_Context_get (GxB_Context, GxB_Context_Field, ...) 
; _Generic \ ( \ (arg1), \ - default : GxB_Global_Option_get , \ + default: GxB_Global_Option_get , \ GxB_Option_Field : GxB_Global_Option_get , \ GrB_Vector : GxB_Vector_Option_get , \ GrB_Matrix : GxB_Matrix_Option_get , \ diff --git a/GraphBLAS/Demo/Program/gauss_demo.c b/GraphBLAS/Demo/Program/gauss_demo.c index 3411a31a8a..b32ec76b06 100644 --- a/GraphBLAS/Demo/Program/gauss_demo.c +++ b/GraphBLAS/Demo/Program/gauss_demo.c @@ -7,7 +7,7 @@ //------------------------------------------------------------------------------ -#include "GraphBLAS.h" +#include "graphblas_demos.h" #undef I //------------------------------------------------------------------------------ diff --git a/GraphBLAS/Demo/Program/reduce_demo.c b/GraphBLAS/Demo/Program/reduce_demo.c index 492b51423d..303cca39a4 100644 --- a/GraphBLAS/Demo/Program/reduce_demo.c +++ b/GraphBLAS/Demo/Program/reduce_demo.c @@ -7,7 +7,7 @@ //------------------------------------------------------------------------------ -#include "GraphBLAS.h" +#include "graphblas_demos.h" #undef I #if defined ( _OPENMP ) #include diff --git a/GraphBLAS/Demo/Program/wildtype_demo.c b/GraphBLAS/Demo/Program/wildtype_demo.c index 8eb761ee95..36e312746e 100644 --- a/GraphBLAS/Demo/Program/wildtype_demo.c +++ b/GraphBLAS/Demo/Program/wildtype_demo.c @@ -10,7 +10,7 @@ // Each "scalar" entry of this type consists of a 4x4 matrix and a string of // length 64. -#include "GraphBLAS.h" +#include "graphblas_demos.h" #undef I #if defined __INTEL_COMPILER @@ -196,13 +196,7 @@ int main (void) { // start GraphBLAS - #if 1 GrB_init (GrB_NONBLOCKING) ; - #else - GxB_init (GxB_NONBLOCKING_GPU, NULL, NULL, NULL, NULL, NULL) ; - GxB_set (GxB_GPU_ID, 0) ; - GB_Global_hack_set (2, 1) ; // always use the GPU - #endif GxB_Global_Option_set (GxB_BURBLE, true) ; int nthreads ; diff --git a/GraphBLAS/Doc/ChangeLog b/GraphBLAS/Doc/ChangeLog index 15e87e79d8..e1ef26ee59 100644 --- a/GraphBLAS/Doc/ChangeLog +++ b/GraphBLAS/Doc/ChangeLog @@ -1,4 +1,4 @@ -Feb XX, 2024: version 9.1.0 +Mar 22, 2024: version 9.1.0 * minor updates to build system * C11 complex type detection: this is now detected and configured by @@ -9,6 +9,18 @@ Feb XX, 2024: version 9.1.0 GraphBLAS.h to indicate which kind of complex data types are available in C11 or MSVC. Contributed by Markus Mützel. * port to clang-cl: fixing the GxB_get and GxB_set macro + * (53) bug fix: eWiseAdd C=A+B when M, A, and B are all hypersparse; + access to M was incorrect (also affects C+=T for any operation, if + M and T are both hypersparse). + +Mar 1, 2024: version 9.0.3 + + * (52) performance bug fix: JIT kernels since v8.3.1 were not compiled with + OpenMP. + +Feb 26, 2024: version 9.0.2 + + * (51) bug fix: GraphBLAS/Makefile "make static" was incorrect. Jan 20, 2024: version 9.0.1 diff --git a/GraphBLAS/Doc/FUTURE.txt b/GraphBLAS/Doc/FUTURE.txt index 7327aa3ccf..79dd37efa8 100644 --- a/GraphBLAS/Doc/FUTURE.txt +++ b/GraphBLAS/Doc/FUTURE.txt @@ -12,9 +12,13 @@ CUDA: Future features: + cumulative sum (or other monoid) + pack/unpack COO kernel fusion CUDA kernels + distributed framework + fine-grain parallelism for dot-product based mxm, mxv, vxm, then add GxB_vxvt (outer product) and GxB_vtxv (inner product) (or call them GxB_outerProduct and GxB_innerProduct?) 
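Note (cross-reference, not part of this patch): bug (53) above is the aliasing error fixed later in this diff, in GB_add_phase0.c, where the hyper-hash index and value arrays of a hypersparse mask M were both read from the pointer array M->Y->p, so mask lookups consulted the wrong arrays:

    // before: M_Yi and M_Yx both alias the hyper_hash pointer array
    //   const int64_t *restrict M_Yi = (M->Y == NULL) ? NULL : M->Y->p ;
    //   const int64_t *restrict M_Yx = (M->Y == NULL) ? NULL : M->Y->p ;
    // after: indices come from M->Y->i and values from M->Y->x
    //   const int64_t *restrict M_Yi = (M->Y == NULL) ? NULL : M->Y->i ;
    //   const int64_t *restrict M_Yx = (M->Y == NULL) ? NULL : M->Y->x ;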
@@ -40,3 +44,21 @@ Future features:
     GrB_set (op, GxB_DEFN, "string"
     also for all ops
 
+    candidates for kernel fusion:
+    * triangle counting: mxm then reduce to scalar
+    * lcc: mxm then reduce to vector
+    * FusedMM: see https://arxiv.org/pdf/2011.06391.pdf
+
+    more:
+    * consider algorithms where fusion can occur
+    * performance monitor, or revised burble, to detect generic cases
+    * check if vectorization of GrB_mxm is effective when using clang
+    * see how HNSW vector search could be implemented in GraphBLAS
+
+CUDA JIT:
+
+    https://developer.nvidia.com/blog/cuda-12-0-compiler-support-for-runtime-lto-using-nvjitlink-library/
+    Developer webpage talking about ways to do nvJit with link time
+    optimization using CUDA 12.0.  Shows precompiled path and JIT path to
+    generate kernels.
+
diff --git a/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf b/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf
index 2560e6e7aa..15a176c960 100644
Binary files a/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf and b/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf differ
diff --git a/GraphBLAS/Doc/GraphBLAS_UserGuide.tex b/GraphBLAS/Doc/GraphBLAS_UserGuide.tex
index 07f87969a8..598b4c7484 100644
--- a/GraphBLAS/Doc/GraphBLAS_UserGuide.tex
+++ b/GraphBLAS/Doc/GraphBLAS_UserGuide.tex
@@ -14768,7 +14768,7 @@ \section{Release Notes}
 \begin{itemize}
-\item Feb XX, 2024: version 9.1.0 % FIXME for SuiteSparse 7.7.0
+\item Mar 22, 2024: version 9.1.0
     \begin{itemize}
     \item minor updates to build system
@@ -14780,6 +14780,23 @@ \section{Release Notes}
     \verb'GxB_HAVE_COMPLEX*' to GraphBLAS.h to indicate which kind of
     complex data types are available in C11 or MSVC.  Contributed by
     Markus M\"{u}tzel.
+    \item (53) bug fix: eWiseAdd \verb'C=A+B' when \verb'M', \verb'A',
+    and \verb'B' are all hypersparse; access to \verb'M' was incorrect
+    (also affects \verb'C+=T' for any operation, if \verb'M' and
+    \verb'T' are both hypersparse).
+    \end{itemize}
+
+\item Mar 1, 2024: version 9.0.3
+
+    \begin{itemize}
+    \item (52) performance bug fix: JIT kernels since v8.3.1 were not compiled
+    with OpenMP.
+    \end{itemize}
+
+\item Feb 26, 2024: version 9.0.2
+
+    \begin{itemize}
+    \item GraphBLAS/Makefile \verb"make static" was incorrect.
\end{itemize} \item Jan 20, 2024: version 9.0.1 diff --git a/GraphBLAS/Doc/GraphBLAS_version.tex b/GraphBLAS/Doc/GraphBLAS_version.tex index e970a35c25..c4b9b9002b 100644 --- a/GraphBLAS/Doc/GraphBLAS_version.tex +++ b/GraphBLAS/Doc/GraphBLAS_version.tex @@ -1,5 +1,5 @@ % version of SuiteSparse:GraphBLAS \date{VERSION 9.1.0, -Feb XX, 2024} +Mar 22, 2024} diff --git a/GraphBLAS/GraphBLAS/CMakeLists.txt b/GraphBLAS/GraphBLAS/CMakeLists.txt index e69ed35478..51156a96ea 100644 --- a/GraphBLAS/GraphBLAS/CMakeLists.txt +++ b/GraphBLAS/GraphBLAS/CMakeLists.txt @@ -42,6 +42,7 @@ include ( SuiteSparsePolicy ) # option ( GRAPHBLAS_USE_CUDA "ON (default): enable CUDA acceleration for GraphBLAS, OFF: do not use CUDA" ${SUITESPARSE_USE_CUDA} ) set ( GRAPHBLAS_HAS_CUDA OFF ) +message ( STATUS "GraphBLAS CUDA JIT: disabled for MATLAB" ) # check for strict usage if ( SUITESPARSE_USE_STRICT AND GRAPHBLAS_USE_CUDA AND NOT GRAPHBLAS_HAS_CUDA ) diff --git a/GraphBLAS/GraphBLAS/rename/GB_rename.h b/GraphBLAS/GraphBLAS/rename/GB_rename.h index a2a7b272ee..d3362f4ee5 100644 --- a/GraphBLAS/GraphBLAS/rename/GB_rename.h +++ b/GraphBLAS/GraphBLAS/rename/GB_rename.h @@ -535,6 +535,7 @@ #define GB_jitifyer_get_C_link_flags GM_jitifyer_get_C_link_flags #define GB_jitifyer_get_control GM_jitifyer_get_control #define GB_jitifyer_get_C_preface GM_jitifyer_get_C_preface +#define GB_jitifyer_get_CUDA_preface GM_jitifyer_get_CUDA_preface #define GB_jitifyer_get_error_log GM_jitifyer_get_error_log #define GB_jitifyer_get_use_cmake GM_jitifyer_get_use_cmake #define GB_jitifyer_hash_encoding GM_jitifyer_hash_encoding @@ -544,6 +545,7 @@ #define GB_jitifyer_load GM_jitifyer_load #define GB_jitifyer_load_worker GM_jitifyer_load_worker #define GB_jitifyer_lookup GM_jitifyer_lookup +#define GB_jitifyer_nvcc_compile GM_jitifyer_nvcc_compile #define GB_jitifyer_path_256 GM_jitifyer_path_256 #define GB_jitifyer_query GM_jitifyer_query #define GB_jitifyer_set_cache_path GM_jitifyer_set_cache_path @@ -561,6 +563,8 @@ #define GB_jitifyer_set_control GM_jitifyer_set_control #define GB_jitifyer_set_C_preface GM_jitifyer_set_C_preface #define GB_jitifyer_set_C_preface_worker GM_jitifyer_set_C_preface_worker +#define GB_jitifyer_set_CUDA_preface GM_jitifyer_set_CUDA_preface +#define GB_jitifyer_set_CUDA_preface_worker GM_jitifyer_set_CUDA_preface_worker #define GB_jitifyer_set_error_log GM_jitifyer_set_error_log #define GB_jitifyer_set_error_log_worker GM_jitifyer_set_error_log_worker #define GB_jitifyer_set_use_cmake GM_jitifyer_set_use_cmake @@ -696,8 +700,27 @@ #define GB_JITpackage_214 GM_JITpackage_214 #define GB_JITpackage_215 GM_JITpackage_215 #define GB_JITpackage_216 GM_JITpackage_216 +#define GB_JITpackage_217 GM_JITpackage_217 +#define GB_JITpackage_218 GM_JITpackage_218 +#define GB_JITpackage_219 GM_JITpackage_219 #define GB_JITpackage_21 GM_JITpackage_21 +#define GB_JITpackage_220 GM_JITpackage_220 +#define GB_JITpackage_221 GM_JITpackage_221 +#define GB_JITpackage_222 GM_JITpackage_222 +#define GB_JITpackage_223 GM_JITpackage_223 +#define GB_JITpackage_224 GM_JITpackage_224 +#define GB_JITpackage_225 GM_JITpackage_225 +#define GB_JITpackage_226 GM_JITpackage_226 +#define GB_JITpackage_227 GM_JITpackage_227 +#define GB_JITpackage_228 GM_JITpackage_228 +#define GB_JITpackage_229 GM_JITpackage_229 #define GB_JITpackage_22 GM_JITpackage_22 +#define GB_JITpackage_230 GM_JITpackage_230 +#define GB_JITpackage_231 GM_JITpackage_231 +#define GB_JITpackage_232 GM_JITpackage_232 +#define GB_JITpackage_233 GM_JITpackage_233 +#define 
GB_JITpackage_234 GM_JITpackage_234 +#define GB_JITpackage_235 GM_JITpackage_235 #define GB_JITpackage_23 GM_JITpackage_23 #define GB_JITpackage_24 GM_JITpackage_24 #define GB_JITpackage_25 GM_JITpackage_25 @@ -885,6 +908,7 @@ #define GB_macrofy_input GM_macrofy_input #define GB_macrofy_mask GM_macrofy_mask #define GB_macrofy_monoid GM_macrofy_monoid +#define GB_macrofy_multadd GM_macrofy_multadd #define GB_macrofy_mxm GM_macrofy_mxm #define GB_macrofy_name GM_macrofy_name #define GB_macrofy_nvals GM_macrofy_nvals diff --git a/GraphBLAS/Include/GraphBLAS.h b/GraphBLAS/Include/GraphBLAS.h index 6659650a9a..680ebdfd92 100644 --- a/GraphBLAS/Include/GraphBLAS.h +++ b/GraphBLAS/Include/GraphBLAS.h @@ -234,7 +234,7 @@ // The version of this implementation, and the GraphBLAS API version: #define GxB_IMPLEMENTATION_NAME "SuiteSparse:GraphBLAS" -#define GxB_IMPLEMENTATION_DATE "Feb XX, 2024" +#define GxB_IMPLEMENTATION_DATE "Mar 22, 2024" #define GxB_IMPLEMENTATION_MAJOR 9 #define GxB_IMPLEMENTATION_MINOR 1 #define GxB_IMPLEMENTATION_SUB 0 @@ -279,7 +279,7 @@ // The 'spec' string describes the GraphBLAS spec: #define GxB_SPEC_ABOUT \ "GraphBLAS C API, by Benjamin Brock, Aydin Buluc, Raye Kimmerer,\n" \ -"Jim Kitchen, Major Kumar, Timothy Mattson, Scott McMillan, Jose' Moreira,\n" \ +"Jim Kitchen, Manoj Kumar, Timothy Mattson, Scott McMillan, Jose' Moreira,\n" \ "Erik Welch, and Carl Yang. Based on 'GraphBLAS Mathematics by Jeremy\n" \ "Kepner. See also 'Graph Algorithms in the Language of Linear Algebra,'\n" \ "edited by J. Kepner and J. Gilbert, SIAM, 2011.\n" @@ -3772,6 +3772,8 @@ typedef enum // for global options or matrix options GxB_JIT_USE_CMAKE = 7032, // CPU JIT: use cmake or direct compile GxB_JIT_ERROR_LOG = 7033, // CPU JIT: error log file + GxB_JIT_CUDA_PREFACE = 7100, // CUDA JIT C++ preface + //------------------------------------------------------------ // GrB_get for GrB_Matrix: //------------------------------------------------------------ @@ -3973,7 +3975,7 @@ GrB_Info GxB_Context_get (GxB_Context, GxB_Context_Field, ...) ; _Generic \ ( \ (arg1), \ - default : GxB_Global_Option_set , \ + default: GxB_Global_Option_set , \ GxB_Option_Field : GxB_Global_Option_set , \ GrB_Vector : GxB_Vector_Option_set , \ GrB_Matrix : GxB_Matrix_Option_set , \ @@ -3986,7 +3988,7 @@ GrB_Info GxB_Context_get (GxB_Context, GxB_Context_Field, ...) ; _Generic \ ( \ (arg1), \ - default : GxB_Global_Option_get , \ + default: GxB_Global_Option_get , \ GxB_Option_Field : GxB_Global_Option_get , \ GrB_Vector : GxB_Vector_Option_get , \ GrB_Matrix : GxB_Matrix_Option_get , \ diff --git a/GraphBLAS/JITpackage/CMakeLists.txt b/GraphBLAS/JITpackage/CMakeLists.txt index 3b5d11ab68..3d3dab7563 100644 --- a/GraphBLAS/JITpackage/CMakeLists.txt +++ b/GraphBLAS/JITpackage/CMakeLists.txt @@ -78,8 +78,8 @@ if ( TARGET grb_jitpackage ) "../Include/GraphBLAS.h" "../Source/Template/*.[ch]" "../Source/JitKernels/*.[ch]" - "../CUDA/Template/*h" - "../CUDA/JitKernels/*h" + "../CUDA/Template/*" + "../CUDA/JitKernels/*" "../Source/Shared/*.h" ) add_custom_command ( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/GB_JITpackage.c diff --git a/GraphBLAS/Makefile b/GraphBLAS/Makefile index ff91ca7266..f1d5ecef93 100644 --- a/GraphBLAS/Makefile +++ b/GraphBLAS/Makefile @@ -80,7 +80,7 @@ setup: # build the static library static: - ( cd build && cmake $(CMAKE_OPTIONS) -DNSTATIC=0 .. && cmake --build . --config Release -j$(JOBS) ) + ( cd build && cmake $(CMAKE_OPTIONS) -DBUILD_STATIC_LIBS=ON -DBUILD_SHARED_LIBS=OFF .. && cmake --build . 
--config Release -j$(JOBS) ) # installs GraphBLAS to the install location defined by cmake, usually # /usr/local/lib and /usr/local/include diff --git a/GraphBLAS/README.md b/GraphBLAS/README.md index cbb8760b20..b2ce94c06e 100644 --- a/GraphBLAS/README.md +++ b/GraphBLAS/README.md @@ -4,7 +4,7 @@ SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. SPDX-License-Identifier: Apache-2.0 -VERSION 9.1.0, Feb XX, 2024 +VERSION 9.1.0, Mar 22, 2024 SuiteSparse:GraphBLAS is a complete implementation of the GraphBLAS standard, which defines a set of sparse matrix operations on an extended algebra of diff --git a/GraphBLAS/Source/Factories/GB_search_for_vector_template.c b/GraphBLAS/Source/Factories/GB_search_for_vector_template.c index 4a8ff482d1..fd7d8588ba 100644 --- a/GraphBLAS/Source/Factories/GB_search_for_vector_template.c +++ b/GraphBLAS/Source/Factories/GB_search_for_vector_template.c @@ -18,13 +18,11 @@ #ifdef GB_CUDA_KERNEL __device__ -static inline int64_t GB_search_for_vector_device -#else -static inline int64_t GB_search_for_vector // return vector k that contains p #endif +static inline int64_t GB_search_for_vector // return vector k that contains p ( const int64_t p, // search for vector k that contains p - const int64_t *restrict Ap, // vector pointers to search + const int64_t *restrict Ap, // vector pointers to search int64_t kleft, // left-most k to search int64_t anvec, // Ap is of size anvec+1 int64_t avlen // A->vlen @@ -42,7 +40,7 @@ static inline int64_t GB_search_for_vector // return vector k that contains p return ((avlen == 0) ? 0 : (p / avlen)) ; } - // A is sparse + // A is sparse or hypersparse ASSERT (p >= 0 && p < Ap [anvec]) ; //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Source/Factories/GB_twotype_factory.c b/GraphBLAS/Source/Factories/GB_twotype_factory.c index 3548761d68..d3cbeeec90 100644 --- a/GraphBLAS/Source/Factories/GB_twotype_factory.c +++ b/GraphBLAS/Source/Factories/GB_twotype_factory.c @@ -36,11 +36,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _bool, bool, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _bool, bool, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _bool, bool, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _bool, bool, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _bool, bool, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -62,11 +59,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _int8, int8_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _int8, int8_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _int8, int8_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _int8, int8_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _int8, int8_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -88,11 +82,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _int16, int16_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _int16, int16_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _int16, int16_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _int16, int16_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _int16, int16_t, _fc64, GxB_FC64_t) - 
#endif default: ; } break ; @@ -114,11 +105,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _int32, int32_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _int32, int32_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _int32, int32_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _int32, int32_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _int32, int32_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -140,11 +128,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _int64, int64_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _int64, int64_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _int64, int64_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _int64, int64_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _int64, int64_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -166,11 +151,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _uint8, uint8_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _uint8, uint8_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _uint8, uint8_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _uint8, uint8_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _uint8, uint8_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -192,11 +174,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _uint16, uint16_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _uint16, uint16_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _uint16, uint16_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _uint16, uint16_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _uint16, uint16_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -218,11 +197,8 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _uint32, uint32_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _uint32, uint32_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _uint32, uint32_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _uint32, uint32_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _uint32, uint32_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -244,11 +220,8 @@ switch (code1) #endif case GB_FP32_code : GB_WORKER (GB_OPNAME, _uint64, uint64_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _uint64, uint64_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _uint64, uint64_t, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _uint64, uint64_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -270,11 +243,8 @@ switch (code1) case GB_FP32_code : GB_WORKER (GB_OPNAME, _fp32, float, _fp32, float ) #endif case GB_FP64_code : GB_WORKER (GB_OPNAME, _fp32, float, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _fp32, float, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _fp32, float, _fc64, GxB_FC64_t) 
- #endif default: ; } break ; @@ -296,11 +266,8 @@ switch (code1) #if !defined ( GB_EXCLUDE_SAME_TYPES ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _fp64, double, _fp64, double ) #endif - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _fp64, double, _fc32, GxB_FC32_t) case GB_FC64_code : GB_WORKER (GB_OPNAME, _fp64, double, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -320,13 +287,10 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _fc32, GxB_FC32_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _fc32, GxB_FC32_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _fc32, GxB_FC32_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA #if !defined ( GB_EXCLUDE_SAME_TYPES ) case GB_FC32_code : GB_WORKER (GB_OPNAME, _fc32, GxB_FC32_t, _fc32, GxB_FC32_t) #endif case GB_FC64_code : GB_WORKER (GB_OPNAME, _fc32, GxB_FC32_t, _fc64, GxB_FC64_t) - #endif default: ; } break ; @@ -346,13 +310,10 @@ switch (code1) case GB_UINT64_code : GB_WORKER (GB_OPNAME, _fc64, GxB_FC64_t, _uint64, uint64_t ) case GB_FP32_code : GB_WORKER (GB_OPNAME, _fc64, GxB_FC64_t, _fp32, float ) case GB_FP64_code : GB_WORKER (GB_OPNAME, _fc64, GxB_FC64_t, _fp64, double ) - #if !defined ( GRAPHBLAS_HAS_CUDA ) - // TODO: does not yet work in CUDA case GB_FC32_code : GB_WORKER (GB_OPNAME, _fc64, GxB_FC64_t, _fc32, GxB_FC32_t) #if !defined ( GB_EXCLUDE_SAME_TYPES ) case GB_FC64_code : GB_WORKER (GB_OPNAME, _fc64, GxB_FC64_t, _fc64, GxB_FC64_t) #endif - #endif default: ; } break ; diff --git a/GraphBLAS/Source/FactoryKernels/GB_AxB__include2.h b/GraphBLAS/Source/FactoryKernels/GB_AxB__include2.h index a3ed1a352f..f662aa1138 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_AxB__include2.h +++ b/GraphBLAS/Source/FactoryKernels/GB_AxB__include2.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_AxB.h +#include "GB_math.h" GrB_Info GB (_Adot2B__plus_pair_int8) ( diff --git a/GraphBLAS/Source/FactoryKernels/GB_aop__include.h b/GraphBLAS/Source/FactoryKernels/GB_aop__include.h index a5a6add698..ee91742be7 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_aop__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_aop__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_aop.h +#include "GB_math.h" GrB_Info GB (_subassign_23__first_bool) ( diff --git a/GraphBLAS/Source/FactoryKernels/GB_as__include.h b/GraphBLAS/Source/FactoryKernels/GB_as__include.h index 89776b5481..e02fbfddf9 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_as__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_as__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_as.h +#include "GB_math.h" GrB_Info GB (_subassign_05d__bool) ( diff --git a/GraphBLAS/Source/FactoryKernels/GB_bld__include.h b/GraphBLAS/Source/FactoryKernels/GB_bld__include.h index ddf238d831..6197da7d5b 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_bld__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_bld__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_bld.h +#include "GB_math.h" GrB_Info GB (_bld__min_int8) ( diff --git a/GraphBLAS/Source/FactoryKernels/GB_ew__include.h b/GraphBLAS/Source/FactoryKernels/GB_ew__include.h index 630b328472..c3be2bc1ce 100644 --- 
a/GraphBLAS/Source/FactoryKernels/GB_ew__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_ew__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_ew.h +#include "GB_math.h" GrB_Info GB (_Cewise_fulln__first_bool) diff --git a/GraphBLAS/Source/FactoryKernels/GB_red__include.h b/GraphBLAS/Source/FactoryKernels/GB_red__include.h index 1dd63cdb28..ac6b8122b1 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_red__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_red__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_red.h +#include "GB_math.h" GrB_Info GB (_red__min_int8) ( diff --git a/GraphBLAS/Source/FactoryKernels/GB_sel__include.h b/GraphBLAS/Source/FactoryKernels/GB_sel__include.h index 2667deb07e..c495c3f5e9 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_sel__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_sel__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_sel.h +#include "GB_math.h" GrB_Info GB (_sel_phase2__nonzombie_bool) diff --git a/GraphBLAS/Source/FactoryKernels/GB_unop__include.h b/GraphBLAS/Source/FactoryKernels/GB_unop__include.h index edef1e6ddc..cef8329258 100644 --- a/GraphBLAS/Source/FactoryKernels/GB_unop__include.h +++ b/GraphBLAS/Source/FactoryKernels/GB_unop__include.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_unop.h +#include "GB_math.h" GrB_Info GB (_unop_tran__identity_bool_bool) diff --git a/GraphBLAS/Source/GB.h b/GraphBLAS/Source/GB.h index 580dee9cfe..1ee458b44f 100644 --- a/GraphBLAS/Source/GB.h +++ b/GraphBLAS/Source/GB.h @@ -19,8 +19,6 @@ #endif #include "GB_static_header.h" #include "GB_positional.h" -#include "GB_casting.h" -#include "GB_math.h" #include "GB_bitwise.h" #include "GB_check.h" #include "GB_nnz.h" @@ -53,7 +51,6 @@ #include "GB_cast.h" #include "GB_wait.h" #include "GB_convert.h" -#include "GB_ops.h" #include "GB_where.h" #include "GB_Context.h" #include "GB_cuda_gateway.h" diff --git a/GraphBLAS/Source/GB_AxB__include1.h b/GraphBLAS/Source/GB_AxB__include1.h index 22bec25456..03fdcc2f41 100644 --- a/GraphBLAS/Source/GB_AxB__include1.h +++ b/GraphBLAS/Source/GB_AxB__include1.h @@ -6,6 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 // This file has been automatically generated from Generator/GB_AxB.h +#include "GB_math.h" GrB_Info GB (_Adot2B__any_pair_iso) ( diff --git a/GraphBLAS/Source/GB_AxB_dot.c b/GraphBLAS/Source/GB_AxB_dot.c index f43a4dc80e..785091002a 100644 --- a/GraphBLAS/Source/GB_AxB_dot.c +++ b/GraphBLAS/Source/GB_AxB_dot.c @@ -187,10 +187,10 @@ GrB_Info GB_AxB_dot // dot product (multiple methods) GB_sparsity_char_matrix (B)) ; #if defined ( GRAPHBLAS_HAS_CUDA ) - if (!C_iso && // fixme for CUDA, remove and create C iso on output + if (!C_iso && // FIXME for CUDA, remove and create C iso on output GB_cuda_AxB_dot3_branch (M, Mask_struct, A, B, semiring, flipxy)) { - info = (GB_cuda_AxB_dot3_jit (C, M, Mask_struct, A, B, semiring, + info = (GB_cuda_AxB_dot3 (C, M, Mask_struct, A, B, semiring, flipxy)) ; } else diff --git a/GraphBLAS/Source/GB_AxB_dot3.c b/GraphBLAS/Source/GB_AxB_dot3.c index daf0cf0b68..8893196d8d 100644 --- a/GraphBLAS/Source/GB_AxB_dot3.c +++ b/GraphBLAS/Source/GB_AxB_dot3.c @@ -192,6 +192,7 @@ GrB_Info GB_AxB_dot3 // C = A'*B using dot product method // M is sparse or hypersparse; C is the same as 
M
 nthreads = GB_nthreads (cnvec, chunk, nthreads_max) ;
 
+    // TODO: try this with Cp and Ch shallow
     GB_memcpy (Cp, Mp, (cnvec+1) * sizeof (int64_t), nthreads) ;
     if (M_is_hyper)
@@ -304,6 +305,11 @@ GrB_Info GB_AxB_dot3 // C = A'*B using dot product method
         {
             #include "GB_AxB_factory.c"
         }
+
+        if (info == GrB_SUCCESS)
+        {
+            GBURBLE (" factory ") ;
+        }
     }
     #endif
 
diff --git a/GraphBLAS/Source/GB_AxB_saxpy.h b/GraphBLAS/Source/GB_AxB_saxpy.h
index 9f0df1790b..5a37f7d999 100644
--- a/GraphBLAS/Source/GB_AxB_saxpy.h
+++ b/GraphBLAS/Source/GB_AxB_saxpy.h
@@ -10,6 +10,7 @@
 #ifndef GB_AXB_SAXPY_H
 #define GB_AXB_SAXPY_H
 #include "GB.h"
+#include "GB_math.h"
 #include "GB_AxB_saxpy3.h"
 
 //------------------------------------------------------------------------------
diff --git a/GraphBLAS/Source/GB_AxB_saxpy3.h b/GraphBLAS/Source/GB_AxB_saxpy3.h
index d672effd33..6d23a54897 100644
--- a/GraphBLAS/Source/GB_AxB_saxpy3.h
+++ b/GraphBLAS/Source/GB_AxB_saxpy3.h
@@ -14,6 +14,7 @@
 #define GB_AXB_SAXPY3_H
 
 #include "GB.h"
+#include "GB_math.h"
 
 GrB_Info GB_AxB_saxpy3              // C = A*B using Gustavson+Hash
 (
diff --git a/GraphBLAS/Source/GB_add.h b/GraphBLAS/Source/GB_add.h
index c8b974c0c4..1f3a6152b8 100644
--- a/GraphBLAS/Source/GB_add.h
+++ b/GraphBLAS/Source/GB_add.h
@@ -10,6 +10,7 @@
 #ifndef GB_ADD_H
 #define GB_ADD_H
 #include "GB.h"
+#include "GB_math.h"
 
 GrB_Info GB_add             // C=A+B, C=A+B, or C=A+B
 (
diff --git a/GraphBLAS/Source/GB_add_phase0.c b/GraphBLAS/Source/GB_add_phase0.c
index 067eded6b7..a965bf5486 100644
--- a/GraphBLAS/Source/GB_add_phase0.c
+++ b/GraphBLAS/Source/GB_add_phase0.c
@@ -677,8 +677,8 @@ GrB_Info GB_add_phase0 // find vectors in C for C=A+B or C=A+B
         GB_OK (GB_hyper_hash_build (M, Werk)) ;
         const int64_t *restrict M_Yp = (M->Y == NULL) ? NULL : M->Y->p ;
-        const int64_t *restrict M_Yi = (M->Y == NULL) ? NULL : M->Y->p ;
-        const int64_t *restrict M_Yx = (M->Y == NULL) ? NULL : M->Y->p ;
+        const int64_t *restrict M_Yi = (M->Y == NULL) ? NULL : M->Y->i ;
+        const int64_t *restrict M_Yx = (M->Y == NULL) ? NULL : M->Y->x ;
         const int64_t M_hash_bits = (M->Y == NULL) ? 0 : (M->Y->vdim - 1) ;
 
         int64_t k ;
diff --git a/GraphBLAS/Source/GB_assign.h b/GraphBLAS/Source/GB_assign.h
index 3b891f33d0..c9ce0530fb 100644
--- a/GraphBLAS/Source/GB_assign.h
+++ b/GraphBLAS/Source/GB_assign.h
@@ -10,6 +10,7 @@
 #ifndef GB_ASSIGN_H
 #define GB_ASSIGN_H
 #include "GB.h"
+#include "GB_math.h"
 
 GrB_Info GB_assign                  // C(Rows,Cols) += A or A'
 (
diff --git a/GraphBLAS/Source/GB_bitmap_assign.h b/GraphBLAS/Source/GB_bitmap_assign.h
index f294e89cdf..a45a05bfbe 100644
--- a/GraphBLAS/Source/GB_bitmap_assign.h
+++ b/GraphBLAS/Source/GB_bitmap_assign.h
@@ -10,6 +10,7 @@
 #ifndef GB_BITMAP_ASSIGN_H
 #define GB_BITMAP_ASSIGN_H
 #include "GB.h"
+#include "GB_math.h"
 
 GrB_Info GB_bitmap_assign
 (
diff --git a/GraphBLAS/Source/GB_cast.h b/GraphBLAS/Source/GB_cast.h
index fbf3587465..5312d08c58 100644
--- a/GraphBLAS/Source/GB_cast.h
+++ b/GraphBLAS/Source/GB_cast.h
@@ -10,6 +10,18 @@
 #ifndef GB_CAST_H
 #define GB_CAST_H
 
+//------------------------------------------------------------------------------
+// pointer casting function, returned by GB_cast_factory.
+//------------------------------------------------------------------------------ + +typedef void (*GB_cast_function) (void *, const void *, size_t) ; + +GB_cast_function GB_cast_factory // returns pointer to function to cast x to z +( + const GB_Type_code code1, // the type of z, the output value + const GB_Type_code code2 // the type of x, the input value +) ; + //------------------------------------------------------------------------------ // GB_cast_scalar: typecast or copy a scalar //------------------------------------------------------------------------------ diff --git a/GraphBLAS/Source/GB_cast_factory.c b/GraphBLAS/Source/GB_cast_factory.c index c97e60cef5..67907852fa 100644 --- a/GraphBLAS/Source/GB_cast_factory.c +++ b/GraphBLAS/Source/GB_cast_factory.c @@ -16,6 +16,7 @@ // function GB_copy_user_user. #include "GB.h" +#include "GB_casting.h" GB_cast_function GB_cast_factory // returns pointer to function to cast x to z ( diff --git a/GraphBLAS/Source/GB_casting.c b/GraphBLAS/Source/GB_casting.c index 90def26d33..bac1c2d87c 100644 --- a/GraphBLAS/Source/GB_casting.c +++ b/GraphBLAS/Source/GB_casting.c @@ -8,6 +8,7 @@ //------------------------------------------------------------------------------ #include "GB.h" +#include "GB_casting.h" //------------------------------------------------------------------------------ // typecasting from double to integer diff --git a/GraphBLAS/Source/GB_casting.h b/GraphBLAS/Source/GB_casting.h index e6cc7b4edb..cfe83d362b 100644 --- a/GraphBLAS/Source/GB_casting.h +++ b/GraphBLAS/Source/GB_casting.h @@ -9,21 +9,13 @@ // The GJ_cast* methods are only used in JIT kernels. +#ifdef __cplusplus +#error "not used for C++" +#endif + #ifndef GB_CASTING_H #define GB_CASTING_H -//------------------------------------------------------------------------------ -// pointer casting function, returned by GB_cast_factory. 
-//------------------------------------------------------------------------------ - -typedef void (*GB_cast_function) (void *, const void *, size_t) ; - -GB_cast_function GB_cast_factory // returns pointer to function to cast x to z -( - const GB_Type_code code1, // the type of z, the output value - const GB_Type_code code2 // the type of x, the input value -) ; - //------------------------------------------------------------------------------ // typecasting from double to integer //------------------------------------------------------------------------------ @@ -215,16 +207,12 @@ GB_CAST_FUNCTION (bool , uint32_t ) GB_CAST_FUNCTION (bool , uint64_t ) GB_CAST_FUNCTION (bool , float ) GB_CAST_FUNCTION (bool , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) (GB_crealf (x) != 0 || GB_cimagf (x) != 0) GB_CAST_FUNCTION (bool , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) (GB_creal (x) != 0 || GB_cimag (x) != 0) GB_CAST_FUNCTION (bool , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to int8_t @@ -246,15 +234,11 @@ GB_CAST_FUNCTION (int8_t , uint64_t ) GB_CAST_FUNCTION (int8_t , float ) GB_CAST_FUNCTION (int8_t , double ) #undef GB_CAST - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #define GB_CAST(ztype,x) GB_cast_to_int8_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (int8_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int8_t (GB_creal (x)) GB_CAST_FUNCTION (int8_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to int16_t @@ -275,16 +259,12 @@ GB_CAST_FUNCTION (int16_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_int16_t ((double) x) GB_CAST_FUNCTION (int16_t , float ) GB_CAST_FUNCTION (int16_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int16_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (int16_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int16_t (GB_creal (x)) GB_CAST_FUNCTION (int16_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to int32_t @@ -305,16 +285,12 @@ GB_CAST_FUNCTION (int32_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_int32_t ((double) x) GB_CAST_FUNCTION (int32_t , float ) GB_CAST_FUNCTION (int32_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int32_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (int32_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int32_t (GB_creal (x)) GB_CAST_FUNCTION (int32_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to int64_t @@ -335,16 +311,12 @@ GB_CAST_FUNCTION (int64_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_int64_t ((double) x) GB_CAST_FUNCTION (int64_t , float ) GB_CAST_FUNCTION (int64_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int64_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (int64_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_int64_t (GB_creal (x)) GB_CAST_FUNCTION (int64_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to uint8_t @@ -365,16 
+337,12 @@ GB_CAST_FUNCTION (uint8_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_uint8_t ((double) x) GB_CAST_FUNCTION (uint8_t , float ) GB_CAST_FUNCTION (uint8_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint8_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (uint8_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint8_t (GB_creal (x)) GB_CAST_FUNCTION (uint8_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to uint16_t @@ -395,16 +363,12 @@ GB_CAST_FUNCTION (uint16_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_uint16_t ((double) x) GB_CAST_FUNCTION (uint16_t , float ) GB_CAST_FUNCTION (uint16_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint16_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (uint16_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint16_t (GB_creal (x)) GB_CAST_FUNCTION (uint16_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to uint32_t @@ -425,16 +389,12 @@ GB_CAST_FUNCTION (uint32_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_uint32_t ((double) x) GB_CAST_FUNCTION (uint32_t , float ) GB_CAST_FUNCTION (uint32_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint32_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (uint32_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint32_t (GB_creal (x)) GB_CAST_FUNCTION (uint32_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to uint64_t @@ -455,16 +415,12 @@ GB_CAST_FUNCTION (uint64_t , uint64_t ) #define GB_CAST(ztype,x) GB_cast_to_uint64_t ((double) x) GB_CAST_FUNCTION (uint64_t , float ) GB_CAST_FUNCTION (uint64_t , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint64_t ((double) GB_crealf (x)) GB_CAST_FUNCTION (uint64_t , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_cast_to_uint64_t (GB_creal (x)) GB_CAST_FUNCTION (uint64_t , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to float @@ -483,16 +439,12 @@ GB_CAST_FUNCTION (float , uint32_t ) GB_CAST_FUNCTION (float , uint64_t ) GB_CAST_FUNCTION (float , float ) GB_CAST_FUNCTION (float , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) GB_crealf (x) GB_CAST_FUNCTION (float , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) ((float) GB_creal (x)) GB_CAST_FUNCTION (float , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to double @@ -511,24 +463,17 @@ GB_CAST_FUNCTION (double , uint32_t ) GB_CAST_FUNCTION (double , uint64_t ) GB_CAST_FUNCTION (double , float ) GB_CAST_FUNCTION (double , double ) - -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA #undef GB_CAST #define GB_CAST(ztype,x) ((double) GB_crealf (x)) GB_CAST_FUNCTION (double , GxB_FC32_t) #undef GB_CAST #define GB_CAST(ztype,x) GB_creal (x) GB_CAST_FUNCTION (double , GxB_FC64_t) -#endif //------------------------------------------------------------------------------ // typecast to 
float complex //------------------------------------------------------------------------------ -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA - #undef GB_CAST #define GB_CAST(ztype,x) GB_CMPLX32 ((float) x, (float) 0) GB_CAST_FUNCTION (GxB_FC32_t, bool ) @@ -573,8 +518,6 @@ GB_CAST_FUNCTION (GxB_FC64_t, GxB_FC32_t) #define GB_CAST(ztype,x) x GB_CAST_FUNCTION (GxB_FC64_t, GxB_FC64_t) -#endif - #undef GB_CAST #undef GB_CAST_FUNCTION diff --git a/GraphBLAS/Source/GB_copy_user_user.c b/GraphBLAS/Source/GB_copy_user_user.c index 84c69a3a86..797f94e3d1 100644 --- a/GraphBLAS/Source/GB_copy_user_user.c +++ b/GraphBLAS/Source/GB_copy_user_user.c @@ -8,6 +8,7 @@ //------------------------------------------------------------------------------ #include "GB.h" +#include "GB_casting.h" void GB_copy_user_user (void *z, const void *x, size_t s) { diff --git a/GraphBLAS/Source/GB_cuda_gateway.h b/GraphBLAS/Source/GB_cuda_gateway.h index cfb00a72ed..72bd217d73 100644 --- a/GraphBLAS/Source/GB_cuda_gateway.h +++ b/GraphBLAS/Source/GB_cuda_gateway.h @@ -66,17 +66,15 @@ static inline int GB_ngpus_to_use // get # of GPUs available int gpu_count = GB_Global_gpu_count_get ( ) ; - if (gpu_hack == 2 || gpu_count == 0) + if (gpu_hack == 2 || gpu_count == 0 || work == 0) { // never use the GPU(s) - // printf ("(GPU: disabled, gpu_count: %d) ", gpu_count) ; return (0) ; } else if (gpu_hack == 1) { // always use all available GPU(s) // fixme for CUDA: allow 1 to gpu_count to be requested - // printf ("(using the GPU: %d) ", gpu_count) ; return (gpu_count) ; } else @@ -84,15 +82,12 @@ static inline int GB_ngpus_to_use // default: use no more than max_gpus_to_use double gpu_chunk = 2e6 ; double max_gpus_to_use = floor (work / gpu_chunk) ; - // printf ("(work %g gpu_chunk: %g max gpus to use: %g) ", - // work, gpu_chunk, max_gpus_to_use) ; // but use no more than the # of GPUs available if (max_gpus_to_use > gpu_count) return (gpu_count) ; return ((int) max_gpus_to_use) ; } } - //------------------------------------------------------------------------------ // GB_cuda_* gateway functions //------------------------------------------------------------------------------ @@ -116,13 +111,18 @@ bool GB_cuda_get_device_properties GB_cuda_device *prop ) ; +bool GB_cuda_type_branch // return true if the type is OK on GPU +( + const GrB_Type type // type to query +) ; + bool GB_cuda_reduce_to_scalar_branch // return true to use the GPU ( const GrB_Monoid monoid, // monoid to do the reduction const GrB_Matrix A // input matrix ) ; -GrB_Info GB_cuda_reduce_to_scalar_jit +GrB_Info GB_cuda_reduce_to_scalar ( // output: GB_void *s, // note: statically allocated on CPU stack; if @@ -134,12 +134,7 @@ GrB_Info GB_cuda_reduce_to_scalar_jit const GrB_Matrix A ) ; -bool GB_cuda_type_branch // return true if the type is OK on GPU -( - const GrB_Type type // type to query -) ; - -GrB_Info GB_cuda_AxB_dot3_jit // C = A'*B using dot product method +GrB_Info GB_cuda_AxB_dot3 // C = A'*B using dot product method ( GrB_Matrix C, // output matrix, static header const GrB_Matrix M, // mask matrix diff --git a/GraphBLAS/Source/GB_emult.h b/GraphBLAS/Source/GB_emult.h index c5661f9c00..7a9e7e2a03 100644 --- a/GraphBLAS/Source/GB_emult.h +++ b/GraphBLAS/Source/GB_emult.h @@ -10,6 +10,7 @@ #ifndef GB_EMULT_H #define GB_EMULT_H #include "GB.h" +#include "GB_math.h" #include "GB_bitmap_assign_methods.h" #define GB_EMULT_METHOD1_ADD 1 /* use GB_add instead of emult */ diff --git a/GraphBLAS/Source/GB_encodify_reduce.c 
b/GraphBLAS/Source/GB_encodify_reduce.c index 61fb3f5425..5112883246 100644 --- a/GraphBLAS/Source/GB_encodify_reduce.c +++ b/GraphBLAS/Source/GB_encodify_reduce.c @@ -17,6 +17,7 @@ uint64_t GB_encodify_reduce // encode a GrB_reduce problem // except for the suffix char **suffix, // suffix for user-defined kernel // input: + const GB_jit_kcode kcode, // kernel to encode GrB_Monoid monoid, // the monoid to enumify GrB_Matrix A // input matrix to reduce ) @@ -40,7 +41,7 @@ uint64_t GB_encodify_reduce // encode a GrB_reduce problem GB_enumify_reduce (&encoding->code, monoid, A) ; bool builtin = (monoid->hash == 0) ; - encoding->kcode = GB_JIT_KERNEL_REDUCE ; + encoding->kcode = kcode ; //-------------------------------------------------------------------------- // determine the suffix and its length diff --git a/GraphBLAS/Source/GB_enumify_cuda_atomic.c b/GraphBLAS/Source/GB_enumify_cuda_atomic.c index 315f78ed5c..e10b97dc39 100644 --- a/GraphBLAS/Source/GB_enumify_cuda_atomic.c +++ b/GraphBLAS/Source/GB_enumify_cuda_atomic.c @@ -37,12 +37,11 @@ bool GB_enumify_cuda_atomic { // user defined monoid: can apply GB_ADD via atomicCAS if the ztype has - // 16, 32, or 64 bits + // 32 or 64 bits case 0 : (*user_monoid_atomically) = - (zsize == sizeof (uint16_t) || - zsize == sizeof (uint32_t) || + (zsize == sizeof (uint32_t) || zsize == sizeof (uint64_t)) ; break ; @@ -234,14 +233,10 @@ bool GB_enumify_cuda_atomic { //---------------------------------------------------------------------- - // user-defined monoid with a type of 16, 32, or 64 bits + // user-defined monoid with a type of 32 or 64 bits //---------------------------------------------------------------------- - if (zsize == sizeof (uint16_t)) - { - (*cuda_type) = "unsigned short int" ; - } - else if (zsize == sizeof (uint32_t)) + if (zsize == sizeof (uint32_t)) { (*cuda_type) = "unsigned int" ; } @@ -261,7 +256,7 @@ bool GB_enumify_cuda_atomic //---------------------------------------------------------------------- // either built-in (GxB_ANY_FC64_MONOID or GxB_TIMES_FC64_MONOID), - // or user-defined where the type is not 16, 32, or 64 bits in size + // or user-defined where the type is not 32 or 64 bits in size has_cheeseburger = false ; diff --git a/GraphBLAS/Source/GB_ewise_kernels.h b/GraphBLAS/Source/GB_ewise_kernels.h index de2665caa1..b90a8a4f65 100644 --- a/GraphBLAS/Source/GB_ewise_kernels.h +++ b/GraphBLAS/Source/GB_ewise_kernels.h @@ -7,7 +7,6 @@ //------------------------------------------------------------------------------ -#include "GB.h" #include "GB_emult.h" #include "GB_ek_slice.h" #include "GB_bitmap_assign_methods.h" diff --git a/GraphBLAS/Source/GB_helper.h b/GraphBLAS/Source/GB_helper.h index 317b6ac3ba..5cb05a5c27 100644 --- a/GraphBLAS/Source/GB_helper.h +++ b/GraphBLAS/Source/GB_helper.h @@ -14,6 +14,7 @@ #define GB_HELPER_H #include "GB.h" +#include "GB_math.h" double GB_helper0 (void) ; diff --git a/GraphBLAS/Source/GB_init.c b/GraphBLAS/Source/GB_init.c index 7b55498369..c3d6e6d253 100644 --- a/GraphBLAS/Source/GB_init.c +++ b/GraphBLAS/Source/GB_init.c @@ -31,7 +31,8 @@ // The calloc function pointer is also optional and can be NULL. // If the mode is GxB_BLOCKING_GPU or GxB_NONBLOCKING_GPU, the 4 function -// pointers are ignored, and rmm_wrap_malloc/.../rmm_wrap_free are used instead. +// pointers are ignored, and rmm_wrap_malloc/.../rmm_wrap_free are used +// instead. 
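+// (So a call sketched as +// GB_init (GxB_NONBLOCKING_GPU, malloc, calloc, realloc, free, Werk) +// behaves as if rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, and +// rmm_wrap_free had been passed instead; see the body of GB_init below.)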
#define GB_FREE_ALL ; #include "GB.h" @@ -44,7 +45,7 @@ GrB_Info GB_init // start up GraphBLAS ( - const GrB_Mode mode, // blocking or non-blocking mode + GrB_Mode mode, // blocking or non-blocking mode // pointers to memory management functions. void * (* malloc_function ) (size_t), // required @@ -78,7 +79,10 @@ GrB_Info GB_init // start up GraphBLAS // establish malloc/calloc/realloc/free //-------------------------------------------------------------------------- + bool malloc_is_thread_safe = true ; + #if defined ( GRAPHBLAS_HAS_CUDA ) + mode = GxB_NONBLOCKING_GPU ; // HACK FIXME if (mode == GxB_NONBLOCKING_GPU || mode == GxB_BLOCKING_GPU) { // ignore the memory management function pointers and use rmm_wrap_* @@ -86,6 +90,8 @@ GrB_Info GB_init // start up GraphBLAS calloc_function = rmm_wrap_calloc ; realloc_function = rmm_wrap_realloc ; free_function = rmm_wrap_free ; + // the rmm_wrap methods are not thread-safe + malloc_is_thread_safe = false ; } #endif @@ -104,7 +110,7 @@ GrB_Info GB_init // start up GraphBLAS GB_Global_realloc_function_set (realloc_function) ; // ok if NULL GB_Global_free_function_set (free_function ) ; // cannot be NULL - GB_Global_malloc_is_thread_safe_set (true) ; // malloc must be thread-safe + GB_Global_malloc_is_thread_safe_set (malloc_is_thread_safe) ; GB_Global_memtable_clear ( ) ; GB_Global_malloc_tracking_set (false) ; @@ -182,7 +188,7 @@ GrB_Info GB_init // start up GraphBLAS GB_Global_timing_clear_all ( ) ; //-------------------------------------------------------------------------- - // set up the JIT folder locations and compiler flags + // set up the JIT settings and emit the source to the cache folder //-------------------------------------------------------------------------- GB_OK (GB_jitifyer_init ( )) ; @@ -192,6 +198,12 @@ GrB_Info GB_init // start up GraphBLAS //-------------------------------------------------------------------------- #pragma omp flush + #if defined ( GRAPHBLAS_HAS_CUDA ) +// this hack_get setting is used by GB_ngpus_to_use: +// GB_Global_hack_set (2,0) ; // HACK FIXME: default: GPU for big enough probs +// GB_Global_hack_set (2,1) ; // HACK FIXME: force the GPU always to be used +// GB_Global_hack_set (2,2) ; // HACK FIXME: force the GPU never to be used + #endif return (GrB_SUCCESS) ; } diff --git a/GraphBLAS/Source/GB_init.h b/GraphBLAS/Source/GB_init.h index 9bd15d962e..8664293dc3 100644 --- a/GraphBLAS/Source/GB_init.h +++ b/GraphBLAS/Source/GB_init.h @@ -12,7 +12,7 @@ GrB_Info GB_init // start up GraphBLAS ( - const GrB_Mode mode, // blocking or non-blocking mode + GrB_Mode mode, // blocking or non-blocking mode // pointers to memory management functions. void * (* malloc_function ) (size_t), // required diff --git a/GraphBLAS/Source/GB_jitifyer.c b/GraphBLAS/Source/GB_jitifyer.c index 7e96b03f04..d430977e57 100644 --- a/GraphBLAS/Source/GB_jitifyer.c +++ b/GraphBLAS/Source/GB_jitifyer.c @@ -1,5 +1,5 @@ //------------------------------------------------------------------------------ -// GB_jitifyer.c: CPU jitifyer +// GB_jitifyer.c: CPU / CUDA jitifyer //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. 
@@ -71,10 +71,14 @@ static size_t GB_jit_C_libraries_allocated = 0 ; static char *GB_jit_C_cmake_libs = NULL ; static size_t GB_jit_C_cmake_libs_allocated = 0 ; -// preface to add to each JIT kernel: +// preface to add to each CPU JIT kernel: static char *GB_jit_C_preface = NULL ; static size_t GB_jit_C_preface_allocated = 0 ; +// preface to add to each CUDA JIT kernel: +static char *GB_jit_CUDA_preface = NULL ; +static size_t GB_jit_CUDA_preface_allocated = 0 ; + // temporary workspace for filenames and system commands: static char *GB_jit_temp = NULL ; static size_t GB_jit_temp_allocated = 0 ; @@ -208,11 +212,12 @@ void GB_jitifyer_finalize (void) GB_FREE_STUFF (GB_jit_C_libraries) ; GB_FREE_STUFF (GB_jit_C_cmake_libs) ; GB_FREE_STUFF (GB_jit_C_preface) ; + GB_FREE_STUFF (GB_jit_CUDA_preface) ; GB_FREE_STUFF (GB_jit_temp) ; } //------------------------------------------------------------------------------ -// GB_jitifyer_init: initialize the CPU and CUDA JIT folders, flags, etc +// GB_jitifyer_init: initialize the JIT folders, flags, etc //------------------------------------------------------------------------------ // Returns GrB_SUCCESS or GrB_OUT_OF_MEMORY. If any other error occurs (such @@ -308,6 +313,7 @@ GrB_Info GB_jitifyer_init (void) GB_COPY_STUFF (GB_jit_C_libraries, GB_C_LIBRARIES) ; GB_COPY_STUFF (GB_jit_C_cmake_libs, GB_CMAKE_LIBRARIES) ; GB_COPY_STUFF (GB_jit_C_preface, "") ; + GB_COPY_STUFF (GB_jit_CUDA_preface, "") ; OK (GB_jitifyer_alloc_space ( )) ; //-------------------------------------------------------------------------- @@ -438,6 +444,7 @@ GrB_Info GB_jitifyer_init (void) else if (IS ("union" )) c = GB_JIT_KERNEL_UNION ; else if (IS ("user_op" )) c = GB_JIT_KERNEL_USEROP ; else if (IS ("user_type" )) c = GB_JIT_KERNEL_USERTYPE ; + else if (IS ("cuda_reduce" )) c = GB_JIT_CUDA_KERNEL_REDUCE ; else { // PreJIT error: kernel_name is invalid; ignore this kernel @@ -548,8 +555,6 @@ GrB_Info GB_jitifyer_establish_paths (GrB_Info error_condition) // construct the c, lib, and lock paths and their 256 subfolders ok = ok && GB_jitifyer_path_256 ("c") ; - ok = ok && GB_jitifyer_path_256 ("cu") ; - ok = ok && GB_jitifyer_path_256 ("libcu") ; ok = ok && GB_jitifyer_path_256 ("lib") ; ok = ok && GB_jitifyer_path_256 ("lock") ; @@ -558,7 +563,7 @@ GrB_Info GB_jitifyer_establish_paths (GrB_Info error_condition) ok = ok && GB_file_mkdir (GB_jit_temp) ; // construct the tmp path - snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/tmp", GB_jit_cache_path); + snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/tmp", GB_jit_cache_path) ; ok = ok && GB_file_mkdir (GB_jit_temp) ; //-------------------------------------------------------------------------- @@ -1230,7 +1235,6 @@ GrB_Info GB_jitifyer_set_C_cmake_libs_worker (const char *new_cmake_libs) return (GB_jitifyer_alloc_space ( )) ; } - //------------------------------------------------------------------------------ // GB_jitifyer_get_C_preface: return the current C preface //------------------------------------------------------------------------------ @@ -1286,6 +1290,61 @@ GrB_Info GB_jitifyer_set_C_preface_worker (const char *new_C_preface) return (GrB_SUCCESS) ; } +//------------------------------------------------------------------------------ +// GB_jitifyer_get_CUDA_preface: return the current CUDA preface +//------------------------------------------------------------------------------ + +const char *GB_jitifyer_get_CUDA_preface (void) +{ + const char *s ; + #pragma omp critical (GB_jitifyer_worker) + { + s = 
GB_jit_CUDA_preface ; + } + return (s) ; +} + +//------------------------------------------------------------------------------ +// GB_jitifyer_set_CUDA_preface: set new CUDA preface +//------------------------------------------------------------------------------ + +GrB_Info GB_jitifyer_set_CUDA_preface (const char *new_CUDA_preface) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + if (new_CUDA_preface == NULL) + { + return (GrB_NULL_POINTER) ; + } + + //-------------------------------------------------------------------------- + // set the CUDA preface in a critical section + //-------------------------------------------------------------------------- + + GrB_Info info ; + #pragma omp critical (GB_jitifyer_worker) + { + info = GB_jitifyer_set_CUDA_preface_worker (new_CUDA_preface) ; + } + return (info) ; +} + +//------------------------------------------------------------------------------ +// GB_jitifyer_set_CUDA_preface_worker: set CUDA preface in a critical section +//------------------------------------------------------------------------------ + +GrB_Info GB_jitifyer_set_CUDA_preface_worker (const char *new_CUDA_preface) +{ + // free the old CUDA preface string + GB_FREE_STUFF (GB_jit_CUDA_preface) ; + // allocate the new GB_jit_CUDA_preface + GB_COPY_STUFF (GB_jit_CUDA_preface, new_CUDA_preface) ; + return (GrB_SUCCESS) ; +} + //------------------------------------------------------------------------------ // GB_jitifyer_query: check if the type/op/monoid definitions match //------------------------------------------------------------------------------ @@ -1677,6 +1736,10 @@ GrB_Info GB_jitifyer_worker // lock the kernel //-------------------------------------------------------------------------- + // FIXME: add kernel_name to the lock filename. If the lock fails, + // sleep for 1 second and try again repeatedly, with a timeout limit of + // (say) 60 seconds. + uint32_t bucket = hash & 0xFF ; snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/lock/%02x/%016" PRIx64 "_lock", GB_jit_cache_path, bucket, hash) ; @@ -1797,15 +1860,19 @@ GrB_Info GB_jitifyer_load_worker //---------------------------------------------------------------------- GBURBLE ("(jit: compile and load) ") ; + GB_jit_kcode kcode = encoding->kcode ; + const char *kernel_filetype = + (kcode < GB_JIT_CUDA_KERNEL) ? 
"c" : "cu" ; // create (or recreate) the kernel source, compile it, and load it - snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/c/%02x/%s.c", - GB_jit_cache_path, bucket, kernel_name) ; + snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/c/%02x/%s.%s", + GB_jit_cache_path, bucket, kernel_name, kernel_filetype) ; FILE *fp = fopen (GB_jit_temp, "w") ; if (fp != NULL) { // create the preface - GB_macrofy_preface (fp, kernel_name, GB_jit_C_preface) ; + GB_macrofy_preface (fp, kernel_name, + GB_jit_C_preface, GB_jit_CUDA_preface, kcode) ; // macrofy the kernel operators, types, and matrix formats GB_macrofy_family (fp, family, encoding->code, semiring, monoid, op, type1, type2, type3) ; @@ -1814,12 +1881,14 @@ GrB_Info GB_jitifyer_load_worker "#define GB_jit_kernel %s\n" "#define GB_jit_query %s_query\n" "#endif\n" - "#include \"GB_jit_kernel_%s.c\"\n", - kernel_name, kernel_name, kname) ; + "#include \"GB_jit_kernel_%s.%s\"\n", + kernel_name, kernel_name, kname, + kernel_filetype) ; + // macrofy the query function bool builtin = (encoding->suffix_len == 0) ; GB_macrofy_query (fp, builtin, monoid, op1, op2, type1, type2, - type3, hash) ; + type3, hash, kcode) ; fclose (fp) ; } @@ -1827,16 +1896,22 @@ GrB_Info GB_jitifyer_load_worker // gracefully fail. // compile the kernel to get the lib*.so file - if (GB_jit_use_cmake) + if (kcode >= GB_JIT_CUDA_KERNEL) + { + // use NVCC to directly compile the CUDA kernel + GB_jitifyer_nvcc_compile (kernel_name, bucket) ; + } + else if (GB_jit_use_cmake) { - // use cmake to compile the kernel + // use cmake to compile the CPU kernel GB_jitifyer_cmake_compile (kernel_name, hash) ; } else { - // use the compiler to directly compile the kernel + // use the compiler to directly compile the CPU kernel GB_jitifyer_direct_compile (kernel_name, bucket) ; } + // load the kernel from the lib*.so file snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/lib/%02x/%s%s%s", GB_jit_cache_path, bucket, GB_LIB_PREFIX, kernel_name, @@ -1865,7 +1940,7 @@ GrB_Info GB_jitifyer_load_worker } //-------------------------------------------------------------------------- - // get the jit_kernel_function pointer + // get the GB_jit_kernel function pointer //-------------------------------------------------------------------------- (*dl_function) = GB_file_dlsym (dl_handle, "GB_jit_kernel") ; @@ -2314,6 +2389,87 @@ void GB_jitifyer_cmake_compile (char *kernel_name, uint64_t hash) #endif } +//------------------------------------------------------------------------------ +// GB_jitifyer_nvcc_compile: compile a CUDA kernel with NVRTC +//------------------------------------------------------------------------------ + +// Compiles a CUDA JIT kernel in a *.cu file, containing host code that +// launches one or more device kernels. + +// The input file has the form: +// +// %s/c/%02x/%s or [cache_path]/c/[bucket]/[kernel_name].cu +// +// and the libary file is linked as +// +// %s/lib/%02x/lib%s.so or [cache_path]/lib/[bucket]/lib[kernel_name].so +// +// All other temporary files (including *.o object files) are removed. + +void GB_jitifyer_nvcc_compile (char *kernel_name, uint32_t bucket) +{ + +#if defined ( GRAPHBLAS_HAS_CUDA ) && !defined ( NJIT ) + + char *burble_stdout = GB_Global_burble_get ( ) ? "" : GB_DEV_NULL ; + char *err_redirect = (strlen (GB_jit_error_log) > 0) ? 
" 2>> " : "" ; + + GBURBLE ("(jit compiling cuda with nvcc: %s/c/%02x/%s.cu) ", + GB_jit_cache_path, bucket, kernel_name) ; + + snprintf (GB_jit_temp, GB_jit_temp_allocated, + + // compile: + "sh -c \"" // execute with POSIX shell + "nvcc " // compiler command + "-forward-unknown-to-host-compiler " + "-DGB_JIT_RUNTIME=1 " // nvcc flags + "-I/usr/local/cuda/include -std=c++17 -arch=sm_60 -fPIC " + "-I%s/src " // include source directory + "-o %s/c/%02x/%s%s " // *.o output file + "-c %s/c/%02x/%s.cu " // *.cu input file + "%s " // burble stdout + "%s %s ; " // error log file + + // link: + "nvcc " // compiler + "-DGB_JIT_RUNTIME=1 " // nvcc flags + "-I/usr/local/cuda/include -std=c++17 -arch=sm_60 " + " -shared " + "-o %s/lib/%02x/%s%s%s " // lib*.so output file + "%s/c/%02x/%s%s " // *.o input file + " -cudart shared " +// "%s " // libraries to link with (any?) + "%s " // burble stdout + "%s %s\"", // error log file + + // compile: + GB_jit_cache_path, // include source directory (cache/src) + GB_jit_cache_path, bucket, kernel_name, GB_OBJ_SUFFIX, // *.o output file + GB_jit_cache_path, bucket, kernel_name, // *.cu input file + burble_stdout, // burble stdout + err_redirect, GB_jit_error_log, // error log file + + // link: + GB_jit_cache_path, bucket, + GB_LIB_PREFIX, kernel_name, GB_LIB_SUFFIX, // lib*.so file + GB_jit_cache_path, bucket, kernel_name, GB_OBJ_SUFFIX, // *.o input file +// GB_jit_C_libraries // libraries to link with + burble_stdout, // burble stdout + err_redirect, GB_jit_error_log) ; // error log file + + // compile the library and return result + GBURBLE ("\n(jit: %s) ", GB_jit_temp) ; + GB_jitifyer_command (GB_jit_temp) ; // OK: see security comment above + + // remove the *.o file + snprintf (GB_jit_temp, GB_jit_temp_allocated, "%s/c/%02x/%s%s", + GB_jit_cache_path, bucket, kernel_name, GB_OBJ_SUFFIX) ; + remove (GB_jit_temp) ; + +#endif +} + //------------------------------------------------------------------------------ // GB_jitifyer_direct_compile: compile a kernel with just the compiler //------------------------------------------------------------------------------ diff --git a/GraphBLAS/Source/GB_jitifyer.h b/GraphBLAS/Source/GB_jitifyer.h index 7aaefde893..cd75830a2b 100644 --- a/GraphBLAS/Source/GB_jitifyer.h +++ b/GraphBLAS/Source/GB_jitifyer.h @@ -169,6 +169,25 @@ typedef enum GB_JIT_KERNEL_CONVERTBITMAP = 85, // GB_convert_bitmap_worker GB_JIT_KERNEL_EXPANDISO = 86, // GB_expand_iso GB_JIT_KERNEL_SORT = 87, // GB_sort + + //-------------------------------------------------------------------------- + // future:: CUDA kernels + //-------------------------------------------------------------------------- + + GB_JIT_CUDA_KERNEL = 1000, // no CUDA kernel + + // reduce to scalar in CUDA + GB_JIT_CUDA_KERNEL_REDUCE = 1001, // GB_cuda_reduce_to_scalar + + // C = A*B, except for row/col scale (which are ewise methods) + // ... + GB_JIT_CUDA_KERNEL_AXB_DOT3 = 1004, // GB_cuda_AxB_dot3 + + // ewise methods: + // ... + GB_JIT_CUDA_KERNEL_ROWSCALE = 1011, + // ... 
+ } GB_jit_kcode ; @@ -326,6 +345,7 @@ bool GB_jitifyer_query void GB_jitifyer_cmake_compile (char *kernel_name, uint64_t hash) ; void GB_jitifyer_direct_compile (char *kernel_name, uint32_t bucket) ; +void GB_jitifyer_nvcc_compile (char *kernel_name, uint32_t bucket) ; GrB_Info GB_jitifyer_init (void) ; // initialize the JIT @@ -372,6 +392,10 @@ const char *GB_jitifyer_get_C_preface (void) ; GrB_Info GB_jitifyer_set_C_preface (const char *new_C_preface) ; GrB_Info GB_jitifyer_set_C_preface_worker (const char *new_C_preface) ; +const char *GB_jitifyer_get_CUDA_preface (void) ; +GrB_Info GB_jitifyer_set_CUDA_preface (const char *new_CUDA_preface) ; +GrB_Info GB_jitifyer_set_CUDA_preface_worker (const char *new_CUDA_preface) ; + const char *GB_jitifyer_get_error_log (void) ; GrB_Info GB_jitifyer_set_error_log (const char *new_error_log) ; GrB_Info GB_jitifyer_set_error_log_worker (const char *new_error_log) ; diff --git a/GraphBLAS/Source/GB_macrofy_assign.c b/GraphBLAS/Source/GB_macrofy_assign.c index 776faf2583..4517fa023c 100644 --- a/GraphBLAS/Source/GB_macrofy_assign.c +++ b/GraphBLAS/Source/GB_macrofy_assign.c @@ -176,7 +176,7 @@ void GB_macrofy_assign // construct all macros for GrB_assign { fprintf (fp, "\n// accum operator:\n") ; GB_macrofy_binop (fp, "GB_ACCUM_OP", false, true, false, accum_ecode, - C_iso, accum, NULL, NULL) ; + C_iso, accum, NULL, NULL, NULL) ; char *yname = "ywork" ; diff --git a/GraphBLAS/Source/GB_macrofy_binop.c b/GraphBLAS/Source/GB_macrofy_binop.c index 3d170bf2a8..08cb51d91a 100644 --- a/GraphBLAS/Source/GB_macrofy_binop.c +++ b/GraphBLAS/Source/GB_macrofy_binop.c @@ -8,6 +8,7 @@ //------------------------------------------------------------------------------ #include "GB.h" +#include "GB_math.h" #include "GB_stringify.h" #include @@ -25,8 +26,9 @@ void GB_macrofy_binop bool C_iso, // if true: C is iso GrB_BinaryOp op, // NULL if C is iso // output: - const char **f_handle, - const char **u_handle + const char **f_handle, // basic expression z=f(x,y) + const char **u_handle, // update z=f(z,y) for the CPU + const char **g_handle // update z=f(z,y) for the GPU (if different) ) { @@ -758,10 +760,11 @@ void GB_macrofy_binop } //-------------------------------------------------------------------------- - // return the u and f expressions + // return the u, f, and g expressions //-------------------------------------------------------------------------- if (u_handle != NULL) (*u_handle) = u ; if (f_handle != NULL) (*f_handle) = f ; + if (g_handle != NULL) (*g_handle) = g ; } diff --git a/GraphBLAS/Source/GB_macrofy_build.c b/GraphBLAS/Source/GB_macrofy_build.c index 1c824908d4..2f3b481283 100644 --- a/GraphBLAS/Source/GB_macrofy_build.c +++ b/GraphBLAS/Source/GB_macrofy_build.c @@ -85,7 +85,7 @@ void GB_macrofy_build // construct all macros for GB_build fprintf (fp, "\n// binary dup operator:\n") ; GB_macrofy_binop (fp, "GB_DUP", false, true, false, dup_ecode, false, dup, - NULL, NULL) ; + NULL, NULL, NULL) ; fprintf (fp, "\n// build copy/dup methods:\n") ; diff --git a/GraphBLAS/Source/GB_macrofy_cast_expression.c b/GraphBLAS/Source/GB_macrofy_cast_expression.c index 566ec1b071..001f26c9cf 100644 --- a/GraphBLAS/Source/GB_macrofy_cast_expression.c +++ b/GraphBLAS/Source/GB_macrofy_cast_expression.c @@ -10,6 +10,7 @@ // Return a typecast expression to cast from xtype to ztype. 
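// (For example, casting a double x to an int32_t z uses an expression like // GB_cast_to_int32_t (x), per the GB_CAST macros in GB_casting.h above.)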
#include "GB.h" +#include "GB_math.h" #include "GB_stringify.h" const char *GB_macrofy_cast_expression // return cast expression diff --git a/GraphBLAS/Source/GB_macrofy_ewise.c b/GraphBLAS/Source/GB_macrofy_ewise.c index 5f9ea7cb10..95168262a5 100644 --- a/GraphBLAS/Source/GB_macrofy_ewise.c +++ b/GraphBLAS/Source/GB_macrofy_ewise.c @@ -128,7 +128,7 @@ void GB_macrofy_ewise // construct all macros for GrB_eWise fprintf (fp, "\n// binary operator%s:\n", flipxy ? " (flipped)" : "") ; GB_macrofy_binop (fp, "GB_BINOP", flipxy, false, true, binop_ecode, C_iso, - binaryop, NULL, NULL) ; + binaryop, NULL, NULL, NULL) ; if (binaryop->opcode == GB_SECOND_binop_code) { diff --git a/GraphBLAS/Source/GB_macrofy_monoid.c b/GraphBLAS/Source/GB_macrofy_monoid.c index 46538ed278..a4fe05120b 100644 --- a/GraphBLAS/Source/GB_macrofy_monoid.c +++ b/GraphBLAS/Source/GB_macrofy_monoid.c @@ -24,7 +24,8 @@ void GB_macrofy_monoid // construct the macros for a monoid // semiring, times is normally a terminal monoid, but // it's not worth exploiting in GrB_mxm. // output: - const char **u_expression + const char **u_expression, + const char **g_expression ) { @@ -39,7 +40,7 @@ void GB_macrofy_monoid // construct the macros for a monoid //-------------------------------------------------------------------------- GB_macrofy_binop (fp, "GB_ADD", false, true, false, add_ecode, C_iso, - op, NULL, u_expression) ; + op, NULL, u_expression, g_expression) ; //-------------------------------------------------------------------------- // create macros for the identity value @@ -260,6 +261,7 @@ void GB_macrofy_monoid // construct the macros for a monoid // create macros for atomics on the CPU //-------------------------------------------------------------------------- + fprintf (fp, "#define GB_Z_SIZE %d\n", (int) zsize) ; fprintf (fp, "#define GB_Z_NBITS %d\n", 8 * (int) zsize) ; // atomic write diff --git a/GraphBLAS/Source/GB_macrofy_multadd.c b/GraphBLAS/Source/GB_macrofy_multadd.c new file mode 100644 index 0000000000..f739b65752 --- /dev/null +++ b/GraphBLAS/Source/GB_macrofy_multadd.c @@ -0,0 +1,47 @@ +//------------------------------------------------------------------------------ +// GB_macrofy_multadd: create a fused multiply-add operator +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#include "GB.h" +#include "GB_stringify.h" + +void GB_macrofy_multadd +( + FILE *fp, + const char *update_expression, // has the form "z = f(z,y)" + const char *multiply_expression, // has the form "z = mult(x,y)" + bool flipxy +) +{ + + // CPU kernels can use the fused multiply-add + if (flipxy) + { + fprintf (fp, "#define GB_MULTADD(z,y,x,j,k,i) ") ; + } + else + { + fprintf (fp, "#define GB_MULTADD(z,x,y,i,k,j) ") ; + } + for (const char *p = update_expression ; (*p) != '\0' ; p++) + { + // all update operators have a single 'y' + if ((*p) == 'y') + { + // inject the multiply operator; all have the form "z = ..." 
+ fprintf (fp, "%s", multiply_expression + 4) ; + } + else + { + // otherwise, print the update operator character + fprintf (fp, "%c", (*p)) ; + } + } + fprintf (fp, "\n") ; +} + diff --git a/GraphBLAS/Source/GB_macrofy_mxm.c b/GraphBLAS/Source/GB_macrofy_mxm.c index 58d917f3bd..919025e27c 100644 --- a/GraphBLAS/Source/GB_macrofy_mxm.c +++ b/GraphBLAS/Source/GB_macrofy_mxm.c @@ -2,7 +2,7 @@ // GB_macrofy_mxm: construct all macros for a semiring //------------------------------------------------------------------------------ -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -10,10 +10,14 @@ #include "GB.h" #include "GB_stringify.h" -void GB_macrofy_mxm // construct all macros for GrB_mxm +//------------------------------------------------------------------------------ +// GB_macrofy_mxm: create all macros for GrB_mxm +//------------------------------------------------------------------------------ + +void GB_macrofy_mxm // construct all macros for GrB_mxm ( // output: - FILE *fp, // target file to write, already open + FILE *fp, // target file to write, already open // input: uint64_t scode, GrB_Semiring semiring, // the semiring to macrofy @@ -109,10 +113,10 @@ void GB_macrofy_mxm // construct all macros for GrB_mxm bool is_positional = GB_IS_BINARYOP_CODE_POSITIONAL (mult->opcode) ; fprintf (fp, "\n// monoid:\n") ; - const char *u_expr ; + const char *u_expr, *g_expr ; GB_macrofy_type (fp, "Z", "_", (zcode == 0) ? "GB_void" : ztype->name) ; GB_macrofy_monoid (fp, add_ecode, id_ecode, term_ecode, C_iso, monoid, - is_positional, &u_expr) ; + is_positional, &u_expr, &g_expr) ; //-------------------------------------------------------------------------- // construct macros for the multiply operator @@ -122,7 +126,7 @@ void GB_macrofy_mxm // construct all macros for GrB_mxm flipxy ? " (flipped)" : "") ; const char *f_expr ; GB_macrofy_binop (fp, "GB_MULT", flipxy, false, false, mult_ecode, C_iso, - mult, &f_expr, NULL) ; + mult, &f_expr, NULL, NULL) ; //-------------------------------------------------------------------------- // multiply-add operator @@ -165,29 +169,20 @@ void GB_macrofy_mxm // construct all macros for GrB_mxm // Since GB_MULT is not used, the fused GB_MULTADD must handle flipxy. - if (flipxy) - { - fprintf (fp, "#define GB_MULTADD(z,y,x,j,k,i) ") ; + if (g_expr == NULL) + { + // the CPU and GPU use the same macro + GB_macrofy_multadd (fp, u_expr, f_expr, flipxy) ; } else - { - fprintf (fp, "#define GB_MULTADD(z,x,y,i,k,j) ") ; - } - for (const char *p = u_expr ; (*p) != '\0' ; p++) { - // all update operators have a single 'y' - if ((*p) == 'y') - { - // inject the multiply operator; all have the form "z = ..." 
- fprintf (fp, "%s", f_expr + 4) ; - } - else - { - // otherwise, print the update operator character - fprintf (fp, "%c", (*p)) ; - } + // the CPU uses u_expr, and GPU uses g_expr + fprintf (fp, "#ifdef GB_CUDA_KERNEL\n") ; + GB_macrofy_multadd (fp, g_expr, f_expr, flipxy) ; + fprintf (fp, "#else\n") ; + GB_macrofy_multadd (fp, u_expr, f_expr, flipxy) ; + fprintf (fp, "#endif\n") ; } - fprintf (fp, "\n") ; } else diff --git a/GraphBLAS/Source/GB_macrofy_preface.c b/GraphBLAS/Source/GB_macrofy_preface.c index 0ad00c6f07..25363e5b75 100644 --- a/GraphBLAS/Source/GB_macrofy_preface.c +++ b/GraphBLAS/Source/GB_macrofy_preface.c @@ -14,9 +14,11 @@ void GB_macrofy_preface ( FILE *fp, // target file to write, already open char *kernel_name, // name of the kernel - char *preface // user-provided preface + char *C_preface, // user-provided preface for CPU JIT kernels + char *CUDA_preface, // user-provided preface for CUDA JIT kernels + GB_jit_kcode kcode ) -{ +{ const char *date = GxB_IMPLEMENTATION_DATE ; int len = (int) strlen (date) ; @@ -32,14 +34,25 @@ void GB_macrofy_preface "// The above copyright and license do not apply to any\n" "// user-defined types and operators defined below.\n" "//--------------------------------------" - "----------------------------------------\n" - "%s\n" - "#include \"GB_jit_kernel.h\"\n\n", + "----------------------------------------\n", kernel_name, GxB_IMPLEMENTATION_MAJOR, GxB_IMPLEMENTATION_MINOR, GxB_IMPLEMENTATION_SUB, - date + GB_IMAX (0, len - 4), - preface) ; + date + GB_IMAX (0, len - 4)) ; + + if (kcode >= GB_JIT_CUDA_KERNEL) + { + // for CUDA JIT kernels + fprintf (fp, "#define GB_CUDA_KERNEL\n%s\n", CUDA_preface) ; + } + else + { + // CPU JIT kernels + fprintf (fp, "%s\n", C_preface) ; + } + + // for all kernels: CPU and CUDA + fprintf (fp, "#include \"GB_jit_kernel.h\"\n\n") ; } diff --git a/GraphBLAS/Source/GB_macrofy_query.c b/GraphBLAS/Source/GB_macrofy_query.c index 87680f6190..c6ed31bbe8 100644 --- a/GraphBLAS/Source/GB_macrofy_query.c +++ b/GraphBLAS/Source/GB_macrofy_query.c @@ -20,7 +20,8 @@ void GB_macrofy_query GrB_Type type0, GrB_Type type1, GrB_Type type2, - uint64_t hash // hash code for the kernel + uint64_t hash, // hash code for the kernel + GB_jit_kcode kcode ) { @@ -28,8 +29,21 @@ void GB_macrofy_query // create the function header, and query the version //-------------------------------------------------------------------------- - fprintf (fp, - "GB_JIT_GLOBAL GB_JIT_QUERY_PROTO (GB_jit_query) ;\n" + if (kcode >= GB_JIT_CUDA_KERNEL) + { + // ensure the query function can be called from a C function + fprintf (fp, "extern \"C\"\n{\n") ; + } + + fprintf (fp, + "GB_JIT_GLOBAL GB_JIT_QUERY_PROTO (GB_jit_query) ;\n") ; + + if (kcode >= GB_JIT_CUDA_KERNEL) + { + fprintf (fp, "}\n") ; + } + + fprintf (fp, "GB_JIT_GLOBAL GB_JIT_QUERY_PROTO (GB_jit_query)\n" "{\n" " (*hash) = 0x%016" PRIx64 " ;\n" diff --git a/GraphBLAS/Source/GB_macrofy_reduce.c b/GraphBLAS/Source/GB_macrofy_reduce.c index 8c3f225f56..3ca25ac043 100644 --- a/GraphBLAS/Source/GB_macrofy_reduce.c +++ b/GraphBLAS/Source/GB_macrofy_reduce.c @@ -63,7 +63,7 @@ void GB_macrofy_reduce // construct all macros for GrB_reduce to scalar fprintf (fp, "\n// monoid:\n") ; GB_macrofy_type (fp, "Z", "_", monoid->op->ztype->name) ; GB_macrofy_monoid (fp, red_ecode, id_ecode, term_ecode, false, monoid, - false, NULL) ; + false, NULL, NULL) ; fprintf (fp, "#define GB_GETA_AND_UPDATE(z,Ax,p)") ; if (atype == monoid->op->ztype) diff --git a/GraphBLAS/Source/GB_macrofy_unop.c 
b/GraphBLAS/Source/GB_macrofy_unop.c index 3f074e527e..2c8658c627 100644 --- a/GraphBLAS/Source/GB_macrofy_unop.c +++ b/GraphBLAS/Source/GB_macrofy_unop.c @@ -16,6 +16,7 @@ // #define GB_UNARYOP(z,x,j,i,y) z = f (x,i,j,y) #include "GB.h" +#include "GB_math.h" #include "GB_stringify.h" #include diff --git a/GraphBLAS/Source/GB_mask.h b/GraphBLAS/Source/GB_mask.h index 27c6338f77..5033419bb3 100644 --- a/GraphBLAS/Source/GB_mask.h +++ b/GraphBLAS/Source/GB_mask.h @@ -10,6 +10,7 @@ #ifndef GB_MASK_H #define GB_MASK_H #include "GB.h" +#include "GB_math.h" GrB_Info GB_mask // C = Z ( diff --git a/GraphBLAS/Source/GB_math.c b/GraphBLAS/Source/GB_math.c index e4a6c1cf3b..c34f57ee11 100644 --- a/GraphBLAS/Source/GB_math.c +++ b/GraphBLAS/Source/GB_math.c @@ -8,6 +8,7 @@ //------------------------------------------------------------------------------ #include "GB.h" +#include "GB_math.h" #if !GB_HAS_CMPLX_MACROS // complex constructors when the C compiler does not provide CMPLX and CMPLXF diff --git a/GraphBLAS/Source/GB_math.h b/GraphBLAS/Source/GB_math.h index 559cdf16e2..8720efd778 100644 --- a/GraphBLAS/Source/GB_math.h +++ b/GraphBLAS/Source/GB_math.h @@ -7,6 +7,8 @@ //------------------------------------------------------------------------------ +#include "GB_casting.h" + #ifndef GB_MATH_H #define GB_MATH_H @@ -296,9 +298,6 @@ inline uint64_t GB_idiv_uint64 (uint64_t x, uint64_t y) // Three cases below are from ACM Algo 116, R. L. Smith, 1962. -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA - inline GxB_FC64_t GB_FC64_div (GxB_FC64_t x, GxB_FC64_t y) { double xr = GB_creal (x) ; @@ -401,8 +400,6 @@ inline GxB_FC32_t GB_FC32_div (GxB_FC32_t x, GxB_FC32_t y) " return (GJ_CMPLX32 ((float) GB_creal(zz), (float) GB_cimag(zz))) ; \n" \ "}" -#endif - //------------------------------------------------------------------------------ // z = x^y: wrappers for pow, powf, cpow, and cpowf //------------------------------------------------------------------------------ @@ -484,9 +481,6 @@ inline double GB_pow (double x, double y) " return (pow (x, y)) ; \n" \ "}" -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA - inline GxB_FC32_t GB_FC32_pow (GxB_FC32_t x, GxB_FC32_t y) { float xr = GB_crealf (x) ; @@ -608,7 +602,6 @@ inline GxB_FC64_t GB_FC64_pow (GxB_FC64_t x, GxB_FC64_t y) " } \n" \ " return (GB_cpow (x, y)) ; \n" \ "}" -#endif inline int8_t GB_pow_int8 (int8_t x, int8_t y) { @@ -792,9 +785,6 @@ inline double GB_signum (double x) " return ((double) ((x < 0) ? (-1) : ((x > 0) ? 1 : 0))) ; \n" \ "}" -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA - inline GxB_FC32_t GB_csignumf (GxB_FC32_t x) { if (GB_crealf (x) == 0 && GB_cimagf (x) == 0) @@ -1258,5 +1248,3 @@ inline bool GB_cisfinite (GxB_FC64_t x) #endif -#endif - diff --git a/GraphBLAS/Source/GB_ops.c b/GraphBLAS/Source/GB_ops.c index ccd8749bb5..58c61e51ec 100644 --- a/GraphBLAS/Source/GB_ops.c +++ b/GraphBLAS/Source/GB_ops.c @@ -11,6 +11,8 @@ // operators, index_unary operators, binary operators, monoids, and semirings. 
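// (This file is now compiled only as C: GB_ops.h and GB_casting.h both // raise #error under __cplusplus, per the guards added in this patch, so the // gcc pragma below no longer needs its __cplusplus check.)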
#include "GB.h" +#include "GB_math.h" +#include "GB_ops.h" //------------------------------------------------------------------------------ // compiler flags @@ -22,9 +24,7 @@ #pragma warning (disable: 144 ) #elif GB_COMPILER_GCC // disable gcc warnings - #if !defined ( __cplusplus ) #pragma GCC diagnostic ignored "-Wincompatible-pointer-types" - #endif #elif GB_COMPILER_MSC // disable MS Visual Studio warnings GB_PRAGMA (warning (disable : 4146 )) diff --git a/GraphBLAS/Source/GB_ops.h b/GraphBLAS/Source/GB_ops.h index d938bda669..0745ec3a64 100644 --- a/GraphBLAS/Source/GB_ops.h +++ b/GraphBLAS/Source/GB_ops.h @@ -7,6 +7,10 @@ //------------------------------------------------------------------------------ +#ifdef __cplusplus +#error "not used for C++" +#endif + #ifndef GB_OPS_H #define GB_OPS_H @@ -106,9 +110,6 @@ inline void GB_nonzombie_func (bool *z, const void *x, #define GB_DOUBLE #include "GB_ops_template.h" -#if !defined ( GBCUDA_CPLUSPLUS ) -// TODO: does not yet work in CUDA - #define GB_TYPE GxB_FC32_t #define GB_XTYPE FC32 #define GB_BITS 64 @@ -127,5 +128,3 @@ inline void GB_nonzombie_func (bool *z, const void *x, #endif -#endif - diff --git a/GraphBLAS/Source/GB_reduce_to_scalar.c b/GraphBLAS/Source/GB_reduce_to_scalar.c index 838c5c5259..d743f6b9a8 100644 --- a/GraphBLAS/Source/GB_reduce_to_scalar.c +++ b/GraphBLAS/Source/GB_reduce_to_scalar.c @@ -114,7 +114,7 @@ GrB_Info GB_reduce_to_scalar // z = reduce_to_scalar (A) //---------------------------------------------------------------------- GrB_Matrix V = NULL ; - info = GB_cuda_reduce_to_scalar_jit (z, &V, monoid, A) ; + info = GB_cuda_reduce_to_scalar (z, &V, monoid, A) ; if (V != NULL) { @@ -137,12 +137,12 @@ GrB_Info GB_reduce_to_scalar // z = reduce_to_scalar (A) } } - // GB_cuda_reduce_to_scalar_jit may refuse to do the reduction and + // GB_cuda_reduce_to_scalar may refuse to do the reduction and // indicate this by returning GrB_NO_VALUE. If so, the CPU will do it // below. 
if (!(info == GrB_SUCCESS || info == GrB_NO_VALUE)) { - // GB_cuda_reduce_to_scalar_jit has returned an error + // GB_cuda_reduce_to_scalar has returned an error // (out of memory, or other error) return (info) ; } diff --git a/GraphBLAS/Source/GB_reduce_to_scalar_jit.c b/GraphBLAS/Source/GB_reduce_to_scalar_jit.c index 05ceaf6fb5..991c48b0e9 100644 --- a/GraphBLAS/Source/GB_reduce_to_scalar_jit.c +++ b/GraphBLAS/Source/GB_reduce_to_scalar_jit.c @@ -32,7 +32,8 @@ GrB_Info GB_reduce_to_scalar_jit // z = reduce_to_scalar (A) via the JIT GB_jit_encoding encoding ; char *suffix ; - uint64_t hash = GB_encodify_reduce (&encoding, &suffix, monoid, A) ; + uint64_t hash = GB_encodify_reduce (&encoding, &suffix, + GB_JIT_KERNEL_REDUCE, monoid, A) ; //-------------------------------------------------------------------------- // get the kernel function pointer, loading or compiling it if needed diff --git a/GraphBLAS/Source/GB_select.h b/GraphBLAS/Source/GB_select.h index 7c5e6598da..e049edd6af 100644 --- a/GraphBLAS/Source/GB_select.h +++ b/GraphBLAS/Source/GB_select.h @@ -10,6 +10,7 @@ #ifndef GB_SELECT_H #define GB_SELECT_H #include "GB.h" +#include "GB_math.h" #include "GB_is_nonzero.h" GrB_Info GB_select // C = accum (C, select(A,k)) or select(A',k) diff --git a/GraphBLAS/Source/GB_stringify.h b/GraphBLAS/Source/GB_stringify.h index 9b54b3d55d..630fcac01b 100644 --- a/GraphBLAS/Source/GB_stringify.h +++ b/GraphBLAS/Source/GB_stringify.h @@ -22,7 +22,9 @@ void GB_macrofy_preface ( FILE *fp, // target file to write, already open char *kernel_name, // name of the kernel - char *preface // user-provided preface + char *C_preface, // user-provided preface for CPU JIT kernels + char *CUDA_preface, // user-provided preface for CUDA JIT kernels + GB_jit_kcode kcode ) ; //------------------------------------------------------------------------------ @@ -73,6 +75,7 @@ uint64_t GB_encodify_reduce // encode a GrB_reduce problem // except for the suffix char **suffix, // suffix for user-defined kernel // input: + const GB_jit_kcode kcode, // kernel to encode GrB_Monoid monoid, // the monoid to enumify GrB_Matrix A // input matrix to reduce ) ; @@ -422,10 +425,10 @@ void GB_enumify_mxm // enumerate a GrB_mxm problem GrB_Matrix B ) ; -void GB_macrofy_mxm // construct all macros for GrB_mxm +void GB_macrofy_mxm // construct all macros for GrB_mxm ( // output: - FILE *fp, // target file to write, already open + FILE *fp, // target file to write, already open // input: uint64_t scode, GrB_Semiring semiring, // the semiring to macrofy @@ -434,6 +437,14 @@ void GB_macrofy_mxm // construct all macros for GrB_mxm GrB_Type btype ) ; +void GB_macrofy_multadd +( + FILE *fp, + const char *update_expression, // has the form "z = f(z,y)" + const char *multiply_expression, // has the form "z = mult(x,y)" + bool flipxy +) ; + GrB_Info GB_AxB_saxpy3_jit // C=A*B, saxpy3, via the JIT ( // input/output: @@ -639,7 +650,8 @@ void GB_macrofy_monoid // construct the macros for a monoid // semiring, times is normally a terminal monoid, but // it's not worth exploiting in GrB_mxm. 
// output: - const char **u_expression + const char **u_expression, + const char **g_expression ) ; bool GB_enumify_cuda_atomic // return true if CUDA can do it atomically @@ -665,7 +677,8 @@ void GB_macrofy_query GrB_Type type0, GrB_Type type1, GrB_Type type2, - uint64_t hash // hash code for the kernel + uint64_t hash, // hash code for the kernel + GB_jit_kcode kcode ) ; //------------------------------------------------------------------------------ @@ -687,16 +700,18 @@ void GB_macrofy_binop FILE *fp, // input: const char *macro_name, - bool flipxy, // if true: op is f(y,x) + bool flipxy, // if true: op is f(y,x) for a semiring bool is_monoid_or_build, // if true: additive operator for monoid, - // or binary op for GrB_Matrix_build + // or binary op for GrB_Matrix_build, or + // accum operator bool is_ewise, // if true: binop for ewise methods int ecode, bool C_iso, // if true: C is iso GrB_BinaryOp op, // NULL if C is iso // output: - const char **f_handle, - const char **u_handle + const char **f_handle, // basic expression z=f(x,y) + const char **u_handle, // update z=f(z,y) for the CPU + const char **g_handle // update z=f(z,y) for the GPU (if different) ) ; //------------------------------------------------------------------------------ diff --git a/GraphBLAS/Source/GrB_Global_get.c b/GraphBLAS/Source/GrB_Global_get.c index 31850b9e39..9bfb095a86 100644 --- a/GraphBLAS/Source/GrB_Global_get.c +++ b/GraphBLAS/Source/GrB_Global_get.c @@ -306,6 +306,11 @@ static GrB_Info GB_global_string_get (const char **value, int field) (*value) = GB_jitifyer_get_C_preface ( ) ; break ; + case GxB_JIT_CUDA_PREFACE : + + (*value) = GB_jitifyer_get_CUDA_preface ( ) ; + break ; + case GxB_JIT_ERROR_LOG : (*value) = GB_jitifyer_get_error_log ( ) ; diff --git a/GraphBLAS/Source/GrB_Global_set.c b/GraphBLAS/Source/GrB_Global_set.c index d8897cf680..540b69d06e 100644 --- a/GraphBLAS/Source/GrB_Global_set.c +++ b/GraphBLAS/Source/GrB_Global_set.c @@ -202,6 +202,10 @@ GrB_Info GrB_Global_set_String return (GB_jitifyer_set_C_preface (value)) ; + case GxB_JIT_CUDA_PREFACE : + + return (GB_jitifyer_set_CUDA_preface (value)) ; + case GxB_JIT_ERROR_LOG : return (GB_jitifyer_set_error_log (value)) ; diff --git a/GraphBLAS/Source/GrB_init.c b/GraphBLAS/Source/GrB_init.c index d0204449d4..c4fe7d0e51 100644 --- a/GraphBLAS/Source/GrB_init.c +++ b/GraphBLAS/Source/GrB_init.c @@ -8,7 +8,11 @@ //------------------------------------------------------------------------------ // GrB_init (or GxB_init) must be called before any other GraphBLAS operation. -// GrB_finalize must be called as the last GraphBLAS operation. +// GrB_finalize must be called as the last GraphBLAS operation. To use CUDA +// and its RMM memory manager: use a mode of GxB_BLOCKING_GPU or +// GxB_NONBLOCKING_GPU. + +// FIXME: rename GxB_*BLOCKING_GPU to GxB_*BLOCKING_CUDA. 
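+// (A sketch of the effect: calling GrB_init (GxB_NONBLOCKING_GPU) routes +// through GB_init with rmm_wrap_malloc/calloc/realloc/free in place of the +// C11 malloc/calloc/realloc/free, as the code below shows.)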
#include "GB.h" #include "GB_init.h" @@ -29,8 +33,17 @@ GrB_Info GrB_init // start up GraphBLAS // initialize GraphBLAS //-------------------------------------------------------------------------- - // default: use the C11 malloc memory manager, which is thread-safe - +#if defined ( GRAPHBLAS_HAS_CUDA ) + if (mode == GxB_BLOCKING_GPU || mode == GxB_NONBLOCKING_GPU) + { + return (GB_init (mode, // blocking or non-blocking mode + // RMM C memory management functions + rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, rmm_wrap_free, + Werk)) ; + } +#endif + + // default: use the C11 malloc memory manager, which is thread-safe return (GB_init (mode, // blocking or non-blocking mode malloc, calloc, realloc, free, // ANSI C memory management functions Werk)) ; diff --git a/GraphBLAS/Source/GxB_Global_Option_get.c b/GraphBLAS/Source/GxB_Global_Option_get.c index 20b41cef7b..226a315fb7 100644 --- a/GraphBLAS/Source/GxB_Global_Option_get.c +++ b/GraphBLAS/Source/GxB_Global_Option_get.c @@ -334,6 +334,11 @@ GrB_Info GxB_Global_Option_get_CHAR // gets the current global option (*value) = GB_jitifyer_get_C_preface ( ) ; break ; + case GxB_JIT_CUDA_PREFACE : + + (*value) = GB_jitifyer_get_CUDA_preface ( ) ; + break ; + case GxB_JIT_ERROR_LOG : (*value) = GB_jitifyer_get_error_log ( ) ; @@ -910,6 +915,17 @@ GrB_Info GxB_Global_Option_get // gets the current global option } break ; + case GxB_JIT_CUDA_PREFACE : + + { + va_start (ap, field) ; + const char **preface = va_arg (ap, const char **) ; + va_end (ap) ; + GB_RETURN_IF_NULL (preface) ; + (*preface) = GB_jitifyer_get_CUDA_preface ( ) ; + } + break ; + case GxB_JIT_C_CONTROL : { diff --git a/GraphBLAS/Source/GxB_Global_Option_set.c b/GraphBLAS/Source/GxB_Global_Option_set.c index ed47db4aeb..05905ddfc2 100644 --- a/GraphBLAS/Source/GxB_Global_Option_set.c +++ b/GraphBLAS/Source/GxB_Global_Option_set.c @@ -267,6 +267,10 @@ GrB_Info GxB_Global_Option_set_CHAR // set a global default option return (GB_jitifyer_set_C_preface (value)) ; + case GxB_JIT_CUDA_PREFACE : + + return (GB_jitifyer_set_CUDA_preface (value)) ; + case GxB_JIT_ERROR_LOG : return (GB_jitifyer_set_error_log (value)) ; @@ -544,6 +548,15 @@ GrB_Info GxB_Global_Option_set // set a global default option return (GB_jitifyer_set_C_preface (C_preface)) ; } + case GxB_JIT_CUDA_PREFACE : + + { + va_start (ap, field) ; + char *CUDA_preface = va_arg (ap, char *) ; + va_end (ap) ; + return (GB_jitifyer_set_CUDA_preface (CUDA_preface)) ; + } + case GxB_JIT_USE_CMAKE : { diff --git a/GraphBLAS/Source/GxB_init.c b/GraphBLAS/Source/GxB_init.c index d1d999caee..fb453ccf50 100644 --- a/GraphBLAS/Source/GxB_init.c +++ b/GraphBLAS/Source/GxB_init.c @@ -53,7 +53,7 @@ GrB_Info GxB_init // start up GraphBLAS and also define malloc, etc ( - GrB_Mode mode, // blocking or non-blocking mode, GPU or not + GrB_Mode mode, // blocking or non-blocking mode // pointers to memory management functions void * (* user_malloc_function ) (size_t), // required diff --git a/GraphBLAS/Source/JitKernels/GB_jit_kernel.h b/GraphBLAS/Source/JitKernels/GB_jit_kernel.h index 826a44cc70..f421485751 100644 --- a/GraphBLAS/Source/JitKernels/GB_jit_kernel.h +++ b/GraphBLAS/Source/JitKernels/GB_jit_kernel.h @@ -1,5 +1,5 @@ //------------------------------------------------------------------------------ -// GB_jit_kernel.h: JIT kernel #include for all kernels +// GB_jit_kernel.h: JIT kernel #include for all kernels (both CPU and CUDA) //------------------------------------------------------------------------------ // 
diff --git a/GraphBLAS/Source/JitKernels/GB_jit_kernel.h b/GraphBLAS/Source/JitKernels/GB_jit_kernel.h
index 826a44cc70..f421485751 100644
--- a/GraphBLAS/Source/JitKernels/GB_jit_kernel.h
+++ b/GraphBLAS/Source/JitKernels/GB_jit_kernel.h
@@ -1,5 +1,5 @@
 //------------------------------------------------------------------------------
-// GB_jit_kernel.h: JIT kernel #include for all kernels
+// GB_jit_kernel.h: JIT kernel #include for all kernels (both CPU and CUDA)
 //------------------------------------------------------------------------------
 
 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
@@ -13,10 +13,17 @@
 #define GB_JIT_KERNEL_H
 #define GB_JIT_KERNEL
 
-#include "GB_Template.h"
-#include "GB_jit_kernel_proto.h"
-// for JIT kernels
+#ifndef GB_CUDA_KERNEL
+    // for CPU JIT kernels:
+    #include "GB_Template.h"
+#else
+    // for CUDA JIT kernels:
+    #include "GB_cuda_kernel.cuh"
+#endif
+
+// for all JIT kernels
+#include "GB_jit_kernel_proto.h"
 
 #if defined (_MSC_VER) && !(defined (__INTEL_COMPILER) || defined(__INTEL_CLANG_COMPILER))
     #define GB_JIT_GLOBAL extern __declspec ( dllexport )
 #else
@@ -24,7 +31,7 @@
 #endif
 
 #ifndef GB_JIT_RUNTIME
-    // for PreJIT kernels
+    // for PreJIT kernels (CPU and CUDA)
     #include "GB_callbacks.h"
 #endif
diff --git a/GraphBLAS/Source/Template/GB_Template.h b/GraphBLAS/Source/Template/GB_Template.h
index c608047fa7..f89ae26041 100644
--- a/GraphBLAS/Source/Template/GB_Template.h
+++ b/GraphBLAS/Source/Template/GB_Template.h
@@ -27,6 +27,10 @@
 #include "GraphBLAS.h"
 #undef I
 
+#ifdef GBMATLAB
+#undef GRAPHBLAS_HAS_CUDA
+#endif
+
 //------------------------------------------------------------------------------
 // handle the restrict and 'static inline' keywords
 //------------------------------------------------------------------------------
diff --git a/GraphBLAS/Source/Template/GB_jit_kernel_proto.h b/GraphBLAS/Source/Template/GB_jit_kernel_proto.h
index 47b2fb6ed8..5c260789ab 100644
--- a/GraphBLAS/Source/Template/GB_jit_kernel_proto.h
+++ b/GraphBLAS/Source/Template/GB_jit_kernel_proto.h
@@ -15,7 +15,7 @@
 //------------------------------------------------------------------------------
 
 #define GB_JIT_QUERY_PROTO(query_func) \
-bool query_func (uint64_t *hash, int v [3], char *defn [5], \
+bool query_func (uint64_t *hash, int v [3], const char *defn [5], \
     void *id, void *term, size_t id_size, size_t term_size)
 
 #define GB_JIT_KERNEL_USER_OP_PROTO(GB_jit_kernel_user_op) \
@@ -557,6 +557,34 @@ GrB_Info GB_jit_kernel_union \
     const bool M_is_B \
 )
 
+//------------------------------------------------------------------------------
+// CUDA JIT prototypes
+//------------------------------------------------------------------------------
+
+#define GB_JIT_CUDA_KERNEL_REDUCE_PROTO(GB_jit_kernel_reduce) \
+GrB_Info GB_jit_kernel_reduce \
+( \
+    GB_void *zscalar, \
+    GrB_Matrix V, \
+    const GrB_Matrix A, \
+    cudaStream_t stream, \
+    int32_t gridsz, \
+    int32_t blocksz \
+)
+
+#define GB_JIT_CUDA_KERNEL_DOT3_PROTO(GB_jit_kernel_AxB_dot3) \
+GrB_Info GB_jit_kernel_AxB_dot3 \
+( \
+    GrB_Matrix C, \
+    const GrB_Matrix M, \
+    const GrB_Matrix A, \
+    const GrB_Matrix B, \
+    cudaStream_t stream, \
+    int device, \
+    int number_of_sms, \
+    const GB_callback_struct *restrict my_callback \
+)
+
 //------------------------------------------------------------------------------
 // shorthand macros for GB_prejit.c:
 //------------------------------------------------------------------------------
@@ -603,5 +631,12 @@ GrB_Info GB_jit_kernel_union \
 #define JIT_UTYP(g) GB_JIT_KERNEL_USER_TYPE_PROTO(g) ;
 #define JIT_Q(q) GB_JIT_QUERY_PROTO(q) ;
 
+//------------------------------------------------------------------------------
+// shorthand macros for GB_cuda_prejit.c:
+//------------------------------------------------------------------------------
+
+#define JIT_CUDA_RED(g) GB_JIT_CUDA_KERNEL_REDUCE_PROTO(g) ;
+#define JIT_CUDA_DOT3(g) GB_JIT_CUDA_KERNEL_DOT3_PROTO(g) ;
+
 #endif
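To see how these prototype macros are meant to be consumed, here is a sketch of a CUDA JIT kernel source file. The GB_jit_kernel entry-point name and the GB_CUDA_KERNEL define mirror the CPU JIT convention implied by GB_jit_kernel.h above; the body is elided, so this illustrates the structure only, not a working kernel:

// hypothetical generated file: GB_jit_kernel_cuda_reduce.cu
#define GB_CUDA_KERNEL
#include "GB_jit_kernel.h"

// declare, then define, the exported entry point
GB_JIT_GLOBAL GB_JIT_CUDA_KERNEL_REDUCE_PROTO (GB_jit_kernel) ;
GB_JIT_GLOBAL GB_JIT_CUDA_KERNEL_REDUCE_PROTO (GB_jit_kernel)
{
    // launch the device-side reduction of A on the given stream, with the
    // requested grid/block sizes, leaving the result in zscalar (or
    // partial results in V) ...
    return (GrB_SUCCESS) ;
}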
diff --git a/GraphBLAS/Source/codegen_aop.m b/GraphBLAS/Source/codegen_aop.m
index 5ebb329e63..709eaecbdb 100644
--- a/GraphBLAS/Source/codegen_aop.m
+++ b/GraphBLAS/Source/codegen_aop.m
@@ -17,7 +17,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_aop.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 % The ANY operator is not used as a binary operator in the generated functions.
diff --git a/GraphBLAS/Source/codegen_as.m b/GraphBLAS/Source/codegen_as.m
index 12796e3147..78a741395a 100644
--- a/GraphBLAS/Source/codegen_as.m
+++ b/GraphBLAS/Source/codegen_as.m
@@ -17,7 +17,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_as.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 codegen_as_template ('bool') ;
diff --git a/GraphBLAS/Source/codegen_axb.m b/GraphBLAS/Source/codegen_axb.m
index 354aed8efb..b0207b5290 100644
--- a/GraphBLAS/Source/codegen_axb.m
+++ b/GraphBLAS/Source/codegen_axb.m
@@ -28,7 +28,7 @@
     fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
     fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
     fprintf (fh, '// This file has been automatically generated from Generator/GB_AxB.h') ;
-    fprintf (fh, '\n\n') ;
+    fprintf (fh, '\n#include "GB_math.h"\n\n') ;
     fclose (fh) ;
 end
diff --git a/GraphBLAS/Source/codegen_ew.m b/GraphBLAS/Source/codegen_ew.m
index 93ea69ef85..8f705b9b84 100644
--- a/GraphBLAS/Source/codegen_ew.m
+++ b/GraphBLAS/Source/codegen_ew.m
@@ -17,7 +17,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_ew.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 % The ANY operator is not used as a binary operator in the generated functions.
diff --git a/GraphBLAS/Source/codegen_red.m b/GraphBLAS/Source/codegen_red.m
index 4b76152909..d48a24747d 100644
--- a/GraphBLAS/Source/codegen_red.m
+++ b/GraphBLAS/Source/codegen_red.m
@@ -17,7 +17,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_red.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 fh = fopen ('FactoryKernels/GB_bld__include.h', 'w') ;
@@ -28,7 +28,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_bld.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 %-------------------------------------------------------------------------------
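Each of these one-line codegen edits has the same effect: every generated FactoryKernels header now ends its preamble with an #include of GB_math.h instead of a blank line. Reconstructed from the fprintf calls above, the top of a regenerated header such as GB_aop__include.h reads:

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

// This file has been automatically generated from Generator/GB_aop.h
#include "GB_math.h"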
diff --git a/GraphBLAS/Source/codegen_sel.m b/GraphBLAS/Source/codegen_sel.m
index 2ff31c8e32..dd41aca256 100644
--- a/GraphBLAS/Source/codegen_sel.m
+++ b/GraphBLAS/Source/codegen_sel.m
@@ -18,7 +18,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_sel.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 % NONZOMBIE: name selector type
diff --git a/GraphBLAS/Source/codegen_unop.m b/GraphBLAS/Source/codegen_unop.m
index 9753df6449..45116d7ab1 100644
--- a/GraphBLAS/Source/codegen_unop.m
+++ b/GraphBLAS/Source/codegen_unop.m
@@ -17,7 +17,7 @@
 fprintf (fh, '// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.\n') ;
 fprintf (fh, '// SPDX-License-Identifier: Apache-2.0\n\n') ;
 fprintf (fh, '// This file has been automatically generated from Generator/GB_unop.h') ;
-fprintf (fh, '\n\n') ;
+fprintf (fh, '\n#include "GB_math.h"\n\n') ;
 fclose (fh) ;
 
 codegen_unop_identity ;
diff --git a/GraphBLAS/Tcov/.gitignore b/GraphBLAS/Tcov/.gitignore
new file mode 100644
index 0000000000..8faafc0802
--- /dev/null
+++ b/GraphBLAS/Tcov/.gitignore
@@ -0,0 +1,2 @@
+# ignore these files
+log_GB_mex_test21.txt
diff --git a/GraphBLAS/Tcov/log_GB_mex_test21.txt b/GraphBLAS/Tcov/log_GB_mex_test21.txt
deleted file mode 100644
index 369fe5ae07..0000000000
--- a/GraphBLAS/Tcov/log_GB_mex_test21.txt
+++ /dev/null
@@ -1,2574 +0,0 @@
-
-
-================================================================================
-GB_macrofy_cast_output, ztype NULL
-#define GB_PUTC(z,Cx,p)
-
-
-================================================================================
-GB_macrofy_cast_output, cast FC64 to bool
-#define GB_PUTC(z,Cx,p) Cx [p] = (GB_creal (z) != 0 || GB_cimag (z) != 0)
-
-
-================================================================================
-GB_assign_describe
-C = A
-
-
-================================================================================
-GB_enumify_ewise / GB_macrofy_ewise, C iso
-// op: symbolic only (C is iso)
-
-// binary operator types:
-#define GB_Z_TYPE void
-#define GB_X_TYPE void
-#define GB_Y_TYPE void
-
-// binary operator:
-#define GB_BINOP(z,x,y,i,j)
-#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso)
-#define GB_COPY_B_to_C(Cx,pC,Bx,pB,B_iso)
-
-// C matrix: sparse
-#define GB_C_IS_HYPER 0
-#define GB_C_IS_SPARSE 1
-#define GB_C_IS_BITMAP 0
-#define GB_C_IS_FULL 0
-#define GBP_C(Cp,k,vlen) Cp [k]
-#define GBH_C(Ch,k) (k)
-#define GBI_C(Ci,p,vlen) Ci [p]
-#define GBB_C(Cb,p) 1
-#define GB_C_NVALS(e) int64_t e = C->nvals
-#define GB_C_NHELD(e) GB_C_NVALS(e)
-#define GB_C_ISO 1
-#define GB_C_IN_ISO 0
-#define GB_C_TYPE void
-#define GB_PUTC(c,Cx,p)
-#define GB_EWISEOP(Cx,p,aij,bij,i,j)
-
-// M matrix: none
-#define GB_M_TYPE void
-#define GB_MCAST(Mx,p,msize) 1
-#define GB_MASK_STRUCT 1
-#define GB_MASK_COMP 0
-#define GB_NO_MASK 1
-
-// A matrix: hypersparse
-#define GB_A_IS_HYPER 1
-#define GB_A_IS_SPARSE 0
-#define GB_A_IS_BITMAP 0
-#define GB_A_IS_FULL 0
-#define GBP_A(Ap,k,vlen) Ap [k]
-#define GBH_A(Ah,k) Ah [k]
-#define GBI_A(Ai,p,vlen) Ai [p]
-#define GBB_A(Ab,p) 1
-#define GB_A_NVALS(e) int64_t e = A->nvals
-#define GB_A_NHELD(e) GB_A_NVALS(e)
-#define GB_A_ISO 0
-#define GB_A_TYPE bool
-#define GB_A2TYPE void
-#define GB_DECLAREA(a)
-#define GB_GETA(a,Ax,p,iso)
-
-// B matrix: hypersparse
-#define GB_B_IS_HYPER 1
-#define GB_B_IS_SPARSE 0
-#define GB_B_IS_BITMAP 0
-#define GB_B_IS_FULL 0
-#define GBP_B(Bp,k,vlen) Bp [k]
-#define GBH_B(Bh,k) Bh [k]
-#define GBI_B(Bi,p,vlen) Bi [p]
-#define GBB_B(Bb,p) 1
-#define GB_B_NVALS(e) int64_t e = B->nvals
-#define GB_B_NHELD(e)
GB_B_NVALS(e) -#define GB_B_ISO 0 -#define GB_B_TYPE bool -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_ewise_shared_definitions.h" - - -================================================================================ -GB_enumify_ewise / GB_macrofy_ewise, C non iso -// op: (and, bool) - -// binary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool - -// binary operator: -#define GB_BINOP(z,x,y,i,j) z = ((x) && (y)) -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_B_to_C(Cx,pC,Bx,pB,B_iso) Cx [pC] = Bx [pB] - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(c,Cx,p) Cx [p] = c -#define GB_EWISEOP(Cx,p,aij,bij,i,j) GB_BINOP (Cx [p], aij, bij, i, j) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE bool -#define GB_DECLAREA(a) bool a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 0 -#define GB_B_TYPE bool -#define GB_B2TYPE bool -#define GB_DECLAREB(b) bool b -#define GB_GETB(b,Bx,p,iso) b = Bx [p] - -#include "GB_ewise_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, C iso -// semiring: symbolic only (C is iso) - -// monoid: -#define GB_Z_TYPE void -#define GB_UPDATE(z,y) -#define GB_ADD(z,x,y) -#define GB_DECLARE_IDENTITY(z) -#define GB_DECLARE_IDENTITY_CONST(z) -#define GB_IS_ANY_MONOID 1 -#define GB_Z_IGNORE_OVERFLOW 1 -#define GB_Z_NBITS 0 -#define GB_Z_ATOMIC_BITS 0 - -// multiplicative operator (flipped): -#define GB_MULT(z,x,y,i,k,j) - -// multiply-add operator: -#define GB_MULTADD(z,x,y,i,k,j) - -// special cases: - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(c,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] 
-#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, any_pair, flipxy -// semiring: symbolic only (C is iso) - -// monoid: -#define GB_Z_TYPE void -#define GB_UPDATE(z,y) -#define GB_ADD(z,x,y) -#define GB_DECLARE_IDENTITY(z) -#define GB_DECLARE_IDENTITY_CONST(z) -#define GB_IS_ANY_MONOID 1 -#define GB_Z_NBITS 0 -#define GB_Z_ATOMIC_BITS 0 - -// multiplicative operator (flipped): -#define GB_MULT(z,x,y,i,k,j) - -// multiply-add operator: -#define GB_MULTADD(z,x,y,i,k,j) - -// special cases: -#define GB_IS_ANY_PAIR_SEMIRING 1 -#define GB_IS_PAIR_MULTIPLIER 1 - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE void -#define GB_PUTC(c,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, any_pair fp32 -// semiring: (any, pair (flipped), float) - -// monoid: -#define GB_Z_TYPE float -#define GB_ADD(z,x,y) z = y -#define GB_UPDATE(z,y) z = y -#define GB_DECLARE_IDENTITY(z) float z = 0 -#define GB_DECLARE_IDENTITY_CONST(z) const float z = 0 -#define GB_HAS_IDENTITY_BYTE 1 -#define GB_IDENTITY_BYTE 0x00 -#define GB_IS_ANY_MONOID 1 -#define GB_Z_NBITS 32 -#define GB_Z_ATOMIC_BITS 32 -#define 
GB_Z_HAS_ATOMIC_UPDATE 1 -#define GB_Z_HAS_OMP_ATOMIC_UPDATE 1 -#define GB_Z_HAS_CUDA_ATOMIC_BUILTIN 1 -#define GB_Z_CUDA_ATOMIC GB_cuda_atomic_write -#define GB_Z_CUDA_ATOMIC_TYPE float - -// multiplicative operator (flipped): -#define GB_MULT(z,y,x,j,k,i) z = 1 - -// multiply-add operator: -#define GB_MULTADD(z,y,x,j,k,i) z = 1 - -// special cases: -#define GB_IS_PAIR_MULTIPLIER 1 - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE float -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowindex -// op: (rowindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowindex -// op: (rowindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) + (y)) -#define 
GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colindex -// op: (colindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colindex -// op: (colindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diagindex -// op: (diagindex, 
GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) - ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diagindex -// op: (diagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) - ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: tril -// op: (tril, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) <= ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_TRIL_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include 
"GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: triu -// op: (triu, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) >= ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_TRIU_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diag -// op: (diag, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) == ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_DIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: offdiag -// op: (offdiag, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) != ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 
-#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colle -// op: (colle, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) <= (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLLE_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colgt -// op: (colgt, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) > (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLGT_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowle -// op: (rowle, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) <= (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWLE_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define 
GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowgt -// op: (rowgt, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) > (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWGT_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: flipdiagindex -// op: (flipdiagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) - ((j) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: flipdiagindex -// op: (flipdiagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) - ((j) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define 
GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: opi32 -// op: opi32func, ztype: GxB_FC32_t, xtype: GxB_FC32_t, ytype: GxB_FC32_t - -// unary operator types: -#define GB_Z_TYPE GxB_FC32_t -#define GB_X_TYPE GxB_FC32_t -#define GB_Y_TYPE GxB_FC32_t - -// index unary operator: -#ifndef GB_GUARD_opi32func_DEFINED -#define GB_GUARD_opi32func_DEFINED -GB_STATIC_INLINE -void opi32func (GxB_FC32_t *z, const GxB_FC32_t *x, GrB_Index i, GrB_Index j, - const GxB_FC32_t *y) -{ - (*z) = (*x) ; -} -#define GB_opi32func_USER_DEFN \ -"void opi32func (GxB_FC32_t *z, const GxB_FC32_t *x, GrB_Index i, GrB_Index j, \n" \ -" const GxB_FC32_t *y) \n" \ -"{ \n" \ -" (*z) = (*x) ; \n" \ -"}" -#endif -#define GB_IDXUNOP(z,x,i,j,y) opi32func (&(z), &(x), i, j, &(y)) -#define GB_DEPENDS_ON_X 1 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_DECLAREA (x) ; \ - GB_GETA (x, Ax, p, ) ; \ - GB_IDXUNOP (z, x, i, j, y) ; \ - bool keep = (GB_crealf (z) != 0 || GB_cimagf (z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE GxB_FC32_t - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE GxB_FC32_t -#define GB_A2TYPE GxB_FC32_t -#define GB_DECLAREA(a) GxB_FC32_t a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: one -// op: (one, void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) 
int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni -// op: (positioni, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], i, , ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni -// op: (positioni, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], i, , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni1 -// op: (positioni1, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE 
void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], i, , ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni1 -// op: (positioni1, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], i, , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj -// op: (positionj, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], , j, ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define 
GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj -// op: (positionj, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , j, ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj1 -// op: (positionj1, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], , j, ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define 
GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj1 -// op: (positionj1, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , j, ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: sqrt -// op: (sqrt, GxB_FC64_t) - -// unary operator types: -#define GB_Z_TYPE GxB_FC64_t -#define GB_X_TYPE GxB_FC64_t -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = GB_csqrt (x) -#define GB_DEPENDS_ON_X 1 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREA (aij) ; \ - GB_GETA (aij, Ax, pA, ) ; \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, , , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#ifndef GB_GUARD_GJ_cast_to_int32_DEFINED -#define GB_GUARD_GJ_cast_to_int32_DEFINED -GB_STATIC_INLINE -int32_t GJ_cast_to_int32 (double x) -{ - if (isnan (x)) return (0) ; - if (x <= (double) INT32_MIN) return (INT32_MIN) ; - if (x >= (double) INT32_MAX) return (INT32_MAX) ; - return ((int32_t) x) ; -} -#endif -#define GB_PUTC(c,Cx,p) Cx [p] = GJ_cast_to_int32 (GB_creal (c)) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define 
GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_TYPE int32_t -#define GB_A2TYPE GxB_FC64_t -#define GB_DECLAREA(a) GxB_FC64_t a -#define GB_GETA(a,Ax,p,iso) a = (GxB_FC64_t) (Ax [p]) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_build / GB_macrofy_build: times -// op: (times, GxB_FC32_t) - -// binary dup operator types: -#define GB_Z_TYPE GxB_FC32_t -#define GB_X_TYPE GxB_FC32_t -#define GB_Y_TYPE GxB_FC32_t - -// S and T data types: -#define GB_T_TYPE bool -#define GB_S_TYPE bool - -// binary dup operator: -#define GB_DUP(z,x,y) z = GB_FC32_mul (x,y) -#define GB_UPDATE(z,y) GB_DUP(z,z,y) - -// build copy/dup methods: -#define GB_BLD_COPY(Tx,p,Sx,k) Tx [p] = Sx [k] -#define GB_BLD_DUP(Tx,p,Sx,k) \ - GxB_FC32_t y = (GxB_FC32_t) Sx [k] ; \ - GxB_FC32_t x = (GxB_FC32_t) Tx [p] ; \ - GxB_FC32_t z ; \ - GB_DUP (z, x, y) ; \ - Tx [p] = (GB_crealf (z) != 0 || GB_cimagf (z) != 0) ; - -#include "GB_kernel_shared_definitions.h" - - -================================================================================ -GB_enumify_build / GB_macrofy_build: and -// op: (and, bool) - -// binary dup operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool - -// S and T data types: -#define GB_T_TYPE GxB_FC32_t -#define GB_S_TYPE GxB_FC32_t - -// binary dup operator: -#define GB_DUP(z,x,y) z = ((x) && (y)) -#define GB_UPDATE(z,y) z &= y - -// build copy/dup methods: -#define GB_BLD_COPY(Tx,p,Sx,k) Tx [p] = Sx [k] -#define GB_BLD_DUP(Tx,p,Sx,k) \ - bool y = (GB_crealf (Sx [k]) != 0 || GB_cimagf (Sx [k]) != 0) ; \ - bool x = (GB_crealf (Tx [p]) != 0 || GB_cimagf (Tx [p]) != 0) ; \ - bool z ; \ - GB_DUP (z, x, y) ; \ - Tx [p] = (GxB_FC32_t) z ; - -#include "GB_kernel_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)=A (assign) -// assign/subassign: C(I,J) = A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_assign_shared_definitions.h" - - 
-================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:s:hi,lo:s:hi)=A (assign) -// assign/subassign: C(I,J) = A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_STRIDE -#define GB_J_KIND GB_STRIDE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(i,J)=s (row assign) -// assign/subassign: C(i,J) = A -#define GB_ASSIGN_KIND GB_ROW_ASSIGN -#define GB_I_KIND GB_ALL -#define GB_J_KIND GB_LIST -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) Cx [pC] = cwork -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(I,j)=s (col assign) -// assign/subassign: C(I,j) = A -#define GB_ASSIGN_KIND GB_COL_ASSIGN -#define GB_I_KIND GB_LIST -#define GB_J_KIND GB_ALL -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define 
GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) Cx [pC] = cwork -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)=A (assign) -// assign/subassign: C(I,J) = scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(cwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)+=s (assign) -// assign/subassign: C(I,J) += scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_UPDATE(z,y) -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_scalar(Cx,pC,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool -#define GB_COPY_scalar_to_ywork(ywork,scalar) ywork = (float) (scalar) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)+=s (assign) -// assign/subassign: C(I,J) += 
scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_UPDATE(z,y) -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_scalar(Cx,pC,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool -#define GB_COPY_scalar_to_ywork(ywork,scalar) ywork = (float) (scalar) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)+=A (assign) -// assign/subassign: C(I,J) += A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = (x) + (y) -#define GB_UPDATE(z,y) z += y -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_DECLAREY (ywork) ; \ - GB_GETA (ywork, Ax, pA, ) ; \ - GB_DECLAREX (xwork) ; \ - GB_COPY_C_to_xwork (xwork, Cx, pC) ; \ - GB_DECLAREZ (zwork) ; \ - GB_ACCUM_OP (zwork, xwork, ywork) ; \ - GB_PUTC (zwork, Cx, pC) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = ((zwork) != 0) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = (float) (Cx [p]) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t 
e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)&=A (assign) -// assign/subassign: C(I,J) &= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (and, bool) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) bool xwork -#define GB_DECLAREY(ywork) bool ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = ((x) && (y)) -#define GB_UPDATE(z,y) z &= y -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_UPDATE (Cx [pC], Ax [pA]) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = zwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = Cx [p] - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE bool -#define GB_DECLAREA(a) bool a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)<=A (assign) -// assign/subassign: C(I,J) lt= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (lt, float) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = ((x) < (y)) -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_DECLAREY (ywork) ; \ - GB_GETA (ywork, Ax, pA, ) ; \ - GB_DECLAREX (xwork) ; \ - GB_COPY_C_to_xwork (xwork, Cx, pC) ; \ - GB_ACCUM_OP (Cx [pC], xwork, ywork) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] 
-#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = zwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = (float) (Cx [p]) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)<=H (assign) -// assign/subassign: C(I,J) lt= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (lt, float) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" diff --git a/GraphBLAS/Test/.gitignore b/GraphBLAS/Test/.gitignore new file mode 100644 index 
0000000000..8faafc0802 --- /dev/null +++ b/GraphBLAS/Test/.gitignore @@ -0,0 +1,2 @@ +# ignore these files +log_GB_mex_test21.txt diff --git a/GraphBLAS/Test/GB_mex_test11.c b/GraphBLAS/Test/GB_mex_test11.c index 6ee30630ae..cd33b115c3 100644 --- a/GraphBLAS/Test/GB_mex_test11.c +++ b/GraphBLAS/Test/GB_mex_test11.c @@ -235,6 +235,17 @@ if (jit_enabled) OK (GxB_Global_Option_get_CHAR (GxB_JIT_C_PREFACE, &t)) ; CHECK (MATCH (t, "// more stuff here")) ; + OK (GxB_get (GxB_JIT_CUDA_PREFACE, &s)) ; + printf ("default CUDA preface [%s]\n", s) ; + OK (GxB_set (GxB_JIT_CUDA_PREFACE, "// cuda stuff here")) ; + OK (GxB_get (GxB_JIT_CUDA_PREFACE, &s)) ; + CHECK (MATCH (s, "// cuda stuff here")) ; + OK (GxB_Global_Option_get_CHAR (GxB_JIT_CUDA_PREFACE, &t)) ; + CHECK (MATCH (t, "// cuda stuff here")) ; + OK (GxB_Global_Option_set_CHAR (GxB_JIT_CUDA_PREFACE, + "// more cuda stuff here")) ; + OK (GxB_Global_Option_get_CHAR (GxB_JIT_CUDA_PREFACE, &t)) ; + CHECK (MATCH (t, "// more cuda stuff here")) ; OK (GxB_Type_new (&MyType, 0, "mytype", "typedef double mytype ;")) ; OK (GxB_Type_size (&mysize, MyType)) ; diff --git a/GraphBLAS/Test/GB_mex_test13.c b/GraphBLAS/Test/GB_mex_test13.c index fa359450bb..1e7872a342 100644 --- a/GraphBLAS/Test/GB_mex_test13.c +++ b/GraphBLAS/Test/GB_mex_test13.c @@ -57,6 +57,7 @@ void mexFunction ERR (GxB_Global_Option_set_CHAR (GxB_JIT_C_COMPILER_FLAGS, NULL)) ; ERR (GxB_Global_Option_set_CHAR (GxB_JIT_C_LINKER_FLAGS, NULL)) ; ERR (GxB_Global_Option_set_CHAR (GxB_JIT_C_PREFACE, NULL)) ; + ERR (GxB_Global_Option_set_CHAR (GxB_JIT_CUDA_PREFACE, NULL)) ; OK (GxB_Global_Option_set_CHAR (GxB_JIT_ERROR_LOG, NULL)) ; //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Test/GB_mex_test16.c b/GraphBLAS/Test/GB_mex_test16.c index 21e955e111..998f111825 100644 --- a/GraphBLAS/Test/GB_mex_test16.c +++ b/GraphBLAS/Test/GB_mex_test16.c @@ -120,7 +120,7 @@ void mexFunction GrB_FP32, NULL, false, false, s, false, A, B) ; CHECK (code == UINT64_MAX) ; - code = GB_encodify_reduce (&e, &suffix, mon, A) ; + code = GB_encodify_reduce (&e, &suffix, GB_JIT_KERNEL_REDUCE, mon, A) ; CHECK (code == UINT64_MAX) ; code = GB_encodify_assign (&e, &suffix, 0, C, false, 0, 0, NULL, diff --git a/GraphBLAS/Test/GB_mex_test21.c b/GraphBLAS/Test/GB_mex_test21.c index 37a5bad838..ef7f4c58dc 100644 --- a/GraphBLAS/Test/GB_mex_test21.c +++ b/GraphBLAS/Test/GB_mex_test21.c @@ -74,7 +74,7 @@ void mexFunction const char *a, *cuda_type ; bool user_monoid_atomically ; bool has_cheeseburger = GB_enumify_cuda_atomic (&a, - &user_monoid_atomically, &cuda_type, NULL, 0, sizeof (uint16_t), 0) ; + &user_monoid_atomically, &cuda_type, NULL, 0, sizeof (uint32_t), 0) ; CHECK (!has_cheeseburger) ; CHECK (user_monoid_atomically) ; CHECK (cuda_type == NULL) ; diff --git a/GraphBLAS/Test/GB_mex_test29.c b/GraphBLAS/Test/GB_mex_test29.c index 0ec8a45325..14a5b748e5 100644 --- a/GraphBLAS/Test/GB_mex_test29.c +++ b/GraphBLAS/Test/GB_mex_test29.c @@ -347,6 +347,15 @@ void mexFunction OK (GrB_Global_get_String_ (GrB_GLOBAL, defn2, GxB_JIT_C_PREFACE)) ; CHECK (MATCH (defn2, defn)) ; + OK (GrB_Global_get_String_ (GrB_GLOBAL, defn, GxB_JIT_CUDA_PREFACE)) ; + printf ("JIT CUDA preface: [%s]\n", defn) ; + OK (GrB_Global_set_String_ (GrB_GLOBAL, "// cu", GxB_JIT_CUDA_PREFACE)) ; + OK (GrB_Global_get_String_ (GrB_GLOBAL, defn2, GxB_JIT_CUDA_PREFACE)) ; + CHECK (MATCH (defn2, "// cu")) ; + OK (GrB_Global_set_String_ (GrB_GLOBAL, defn, GxB_JIT_CUDA_PREFACE)) ; + OK (GrB_Global_get_String_ 
(GrB_GLOBAL, defn2, GxB_JIT_CUDA_PREFACE)) ; + CHECK (MATCH (defn2, defn)) ; + OK (GrB_Global_get_String_ (GrB_GLOBAL, defn, GxB_JIT_ERROR_LOG)) ; printf ("JIT error log: [%s]\n", defn) ; OK (GrB_Global_set_String_ (GrB_GLOBAL, "errlog.txt", GxB_JIT_ERROR_LOG)) ; diff --git a/GraphBLAS/Test/GB_mex_test9.c b/GraphBLAS/Test/GB_mex_test9.c index e4f164c6b8..1f5de9ce09 100644 --- a/GraphBLAS/Test/GB_mex_test9.c +++ b/GraphBLAS/Test/GB_mex_test9.c @@ -174,7 +174,7 @@ void mexFunction FILE *fp = fopen ("/tmp/GB_tcov_gunk.h", "w") ; GB_macrofy_binop (fp, "nothing", false, false, false, - 199, false, NULL, NULL, NULL) ; + 199, false, NULL, NULL, NULL, NULL) ; fclose (fp) ; //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Test/log_GB_mex_test21.txt b/GraphBLAS/Test/log_GB_mex_test21.txt deleted file mode 100644 index 369fe5ae07..0000000000 --- a/GraphBLAS/Test/log_GB_mex_test21.txt +++ /dev/null @@ -1,2574 +0,0 @@ - - -================================================================================ -GB_macrofy_cast_output, ztype NULL -#define GB_PUTC(z,Cx,p) - - -================================================================================ -GB_macrofy_cast_output, cast FC64 to bool -#define GB_PUTC(z,Cx,p) Cx [p] = (GB_creal (z) != 0 || GB_cimag (z) != 0) - - -================================================================================ -GB_assign_describe -C = A - - -================================================================================ -GB_enumify_ewise / GB_macrofy_ewise, C iso -// op: symbolic only (C is iso) - -// binary operator types: -#define GB_Z_TYPE void -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// binary operator: -#define GB_BINOP(z,x,y,i,j) -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) -#define GB_COPY_B_to_C(Cx,pC,Bx,pB,B_iso) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE void -#define GB_PUTC(c,Cx,p) -#define GB_EWISEOP(Cx,p,aij,bij,i,j) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 0 -#define GB_B_TYPE bool -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_ewise_shared_definitions.h" - - -================================================================================ -GB_enumify_ewise / GB_macrofy_ewise, C non iso -// op: (and, bool) - -// binary operator 
types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool - -// binary operator: -#define GB_BINOP(z,x,y,i,j) z = ((x) && (y)) -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_B_to_C(Cx,pC,Bx,pB,B_iso) Cx [pC] = Bx [pB] - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(c,Cx,p) Cx [p] = c -#define GB_EWISEOP(Cx,p,aij,bij,i,j) GB_BINOP (Cx [p], aij, bij, i, j) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE bool -#define GB_DECLAREA(a) bool a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 0 -#define GB_B_TYPE bool -#define GB_B2TYPE bool -#define GB_DECLAREB(b) bool b -#define GB_GETB(b,Bx,p,iso) b = Bx [p] - -#include "GB_ewise_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, C iso -// semiring: symbolic only (C is iso) - -// monoid: -#define GB_Z_TYPE void -#define GB_UPDATE(z,y) -#define GB_ADD(z,x,y) -#define GB_DECLARE_IDENTITY(z) -#define GB_DECLARE_IDENTITY_CONST(z) -#define GB_IS_ANY_MONOID 1 -#define GB_Z_IGNORE_OVERFLOW 1 -#define GB_Z_NBITS 0 -#define GB_Z_ATOMIC_BITS 0 - -// multiplicative operator (flipped): -#define GB_MULT(z,x,y,i,k,j) - -// multiply-add operator: -#define GB_MULTADD(z,x,y,i,k,j) - -// special cases: - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(c,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define 
GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, any_pair, flipxy -// semiring: symbolic only (C is iso) - -// monoid: -#define GB_Z_TYPE void -#define GB_UPDATE(z,y) -#define GB_ADD(z,x,y) -#define GB_DECLARE_IDENTITY(z) -#define GB_DECLARE_IDENTITY_CONST(z) -#define GB_IS_ANY_MONOID 1 -#define GB_Z_NBITS 0 -#define GB_Z_ATOMIC_BITS 0 - -// multiplicative operator (flipped): -#define GB_MULT(z,x,y,i,k,j) - -// multiply-add operator: -#define GB_MULTADD(z,x,y,i,k,j) - -// special cases: -#define GB_IS_ANY_PAIR_SEMIRING 1 -#define GB_IS_PAIR_MULTIPLIER 1 - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE void -#define GB_PUTC(c,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_mxm / GB_macrofy_mxm, any_pair fp32 -// semiring: (any, pair (flipped), float) - -// monoid: -#define GB_Z_TYPE float -#define GB_ADD(z,x,y) z = y -#define GB_UPDATE(z,y) z = y -#define GB_DECLARE_IDENTITY(z) float z = 0 -#define GB_DECLARE_IDENTITY_CONST(z) const float z = 0 -#define GB_HAS_IDENTITY_BYTE 1 -#define GB_IDENTITY_BYTE 0x00 -#define GB_IS_ANY_MONOID 1 -#define GB_Z_NBITS 32 -#define GB_Z_ATOMIC_BITS 32 -#define GB_Z_HAS_ATOMIC_UPDATE 1 -#define GB_Z_HAS_OMP_ATOMIC_UPDATE 1 -#define GB_Z_HAS_CUDA_ATOMIC_BUILTIN 1 -#define GB_Z_CUDA_ATOMIC GB_cuda_atomic_write -#define GB_Z_CUDA_ATOMIC_TYPE float - -// multiplicative operator (flipped): -#define GB_MULT(z,y,x,j,k,i) z = 1 - -// multiply-add operator: -#define GB_MULTADD(z,y,x,j,k,i) z = 1 - -// special cases: -#define 
GB_IS_PAIR_MULTIPLIER 1 - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE float -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 1 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -// B matrix: hypersparse -#define GB_B_IS_HYPER 1 -#define GB_B_IS_SPARSE 0 -#define GB_B_IS_BITMAP 0 -#define GB_B_IS_FULL 0 -#define GBP_B(Bp,k,vlen) Bp [k] -#define GBH_B(Bh,k) Bh [k] -#define GBI_B(Bi,p,vlen) Bi [p] -#define GBB_B(Bb,p) 1 -#define GB_B_NVALS(e) int64_t e = B->nvals -#define GB_B_NHELD(e) GB_B_NVALS(e) -#define GB_B_ISO 1 -#define GB_B_IS_PATTERN 1 -#define GB_B_TYPE void -#define GB_B2TYPE void -#define GB_DECLAREB(b) -#define GB_GETB(b,Bx,p,iso) - -#include "GB_mxm_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowindex -// op: (rowindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowindex -// op: (rowindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] 
- -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colindex -// op: (colindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colindex -// op: (colindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) + (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLINDEX_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diagindex -// op: (diagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) - ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be 
kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diagindex -// op: (diagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) - ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: tril -// op: (tril, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) <= ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_TRIL_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: triu -// op: (triu, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) >= ((i) + 
(y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_TRIU_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: diag -// op: (diag, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) == ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_DIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: offdiag -// op: (offdiag, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) != ((i) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_OFFDIAG_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colle -// op: (colle, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define 
GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) <= (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLLE_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: colgt -// op: (colgt, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((j) > (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_COLGT_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowle -// op: (rowle, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) <= (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWLE_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: rowgt 
-// op: (rowgt, GB_void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) > (y)) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_DEPENDS_ON_Y 1 -#define GB_ROWGT_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - bool keep ; \ - GB_IDXUNOP (keep, , i, j, y) ; - -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: flipdiagindex -// op: (flipdiagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE int32_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) - ((j) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: flipdiagindex -// op: (flipdiagindex, GB_void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE int64_t - -// index unary operator: -#define GB_IDXUNOP(z,x,i,j,y) z = ((i) - ((j) + (y))) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_IDXUNOP (z, , i, j, y) ; \ - bool keep = ((z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE bool - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define 
GB_GETA(a,Ax,p,iso) - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_select / GB_macrofy_select: opi32 -// op: opi32func, ztype: GxB_FC32_t, xtype: GxB_FC32_t, ytype: GxB_FC32_t - -// unary operator types: -#define GB_Z_TYPE GxB_FC32_t -#define GB_X_TYPE GxB_FC32_t -#define GB_Y_TYPE GxB_FC32_t - -// index unary operator: -#ifndef GB_GUARD_opi32func_DEFINED -#define GB_GUARD_opi32func_DEFINED -GB_STATIC_INLINE -void opi32func (GxB_FC32_t *z, const GxB_FC32_t *x, GrB_Index i, GrB_Index j, - const GxB_FC32_t *y) -{ - (*z) = (*x) ; -} -#define GB_opi32func_USER_DEFN \ -"void opi32func (GxB_FC32_t *z, const GxB_FC32_t *x, GrB_Index i, GrB_Index j, \n" \ -" const GxB_FC32_t *y) \n" \ -"{ \n" \ -" (*z) = (*x) ; \n" \ -"}" -#endif -#define GB_IDXUNOP(z,x,i,j,y) opi32func (&(z), &(x), i, j, &(y)) -#define GB_DEPENDS_ON_X 1 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 1 -#define GB_DEPENDS_ON_Y 1 -#define GB_ENTRY_SELECTOR - -// test if A(i,j) is to be kept: -#define GB_TEST_VALUE_OF_ENTRY(keep,p) \ - GB_Z_TYPE z ; \ - GB_DECLAREA (x) ; \ - GB_GETA (x, Ax, p, ) ; \ - GB_IDXUNOP (z, x, i, j, y) ; \ - bool keep = (GB_crealf (z) != 0 || GB_cimagf (z) != 0) -// copy A(i,j) to C(i,j): -#define GB_SELECT_ENTRY(Cx,pC,Ax,pA) Cx [pC] = Ax [pA] - -// C type: -#define GB_C_TYPE GxB_FC32_t - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE GxB_FC32_t -#define GB_A2TYPE GxB_FC32_t -#define GB_DECLAREA(a) GxB_FC32_t a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] - -#include "GB_select_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: one -// op: (one, void) - -// unary operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ 
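As a companion sketch for the position operators that follow (again assumed and simplified, not the actual generated code): a JIT apply kernel walks the entries of A, recovers the row index i from the sparse structure, and lets GB_UNOP expand to the operator. The macro definitions below are copied from the int32_t positioni dump that follows; the sparse-column data and the loop are invented.

/* toy driver: Cx = positioni (A) for one sparse column of length 4 */
#include <stdint.h>
#include <stdio.h>

#define GB_UNARYOP(z,x,i,j,y) z = (i)
#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], i, , )
#define GBP_A(Ap,k,vlen) Ap [k]
#define GBI_A(Ai,p,vlen) Ai [p]

int main (void)
{
    int64_t Ap [2] = { 0, 3 } ;        /* column pointers: one column, 3 entries */
    int64_t Ai [3] = { 0, 2, 3 } ;     /* row indices of the entries */
    int32_t Ax [3] = { 7, 8, 9 } ;     /* values (ignored by positioni) */
    int32_t Cx [3] ;
    for (int64_t k = 0 ; k < 1 ; k++)
    {
        for (int64_t p = GBP_A (Ap, k, 4) ; p < GBP_A (Ap, k+1, 4) ; p++)
        {
            int64_t i = GBI_A (Ai, p, 4) ;       /* row index of A(i,k) */
            GB_UNOP (Cx, p, Ax, p, 0, i, k, ) ;  /* expands to Cx [p] = (i) */
        }
    }
    for (int p = 0 ; p < 3 ; p++) printf ("%d ", Cx [p]) ;  /* prints 0 2 3 */
    printf ("\n") ;
    return (0) ;
}

Note the GB_DEPENDS_ON_I 1 flag in these dumps: it is what tells the kernel that i must actually be computed; for value-only operators the same loop can presumably skip the GBI_A lookup.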
-GB_enumify_apply / GB_macrofy_apply: positioni -// op: (positioni, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], i, , ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni -// op: (positioni, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], i, , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni1 -// op: (positioni1, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], i, , ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define 
GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positioni1 -// op: (positioni1, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (i) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 1 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], i, , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj -// op: (positionj, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], , j, ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] 
-#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj -// op: (positionj, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , j, ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: positionj1 -// op: (positionj1, void) - -// unary operator types: -#define GB_Z_TYPE int32_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) GB_UNARYOP (Cx [pC], Ax [pA], , j, ) - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = c - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / 
GB_macrofy_apply: positionj1 -// op: (positionj1, void) - -// unary operator types: -#define GB_Z_TYPE int64_t -#define GB_X_TYPE void -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = (j) + 1 -#define GB_DEPENDS_ON_X 0 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 1 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, Ax [pA], , j, ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#define GB_PUTC(c,Cx,p) Cx [p] = (int32_t) (c) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_IS_PATTERN 1 -#define GB_A_TYPE void -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_apply / GB_macrofy_apply: sqrt -// op: (sqrt, GxB_FC64_t) - -// unary operator types: -#define GB_Z_TYPE GxB_FC64_t -#define GB_X_TYPE GxB_FC64_t -#define GB_Y_TYPE void - -// unary operator: -#define GB_UNARYOP(z,x,i,j,y) z = GB_csqrt (x) -#define GB_DEPENDS_ON_X 1 -#define GB_DEPENDS_ON_Y 0 -#define GB_DEPENDS_ON_I 0 -#define GB_DEPENDS_ON_J 0 -#define GB_UNOP(Cx,pC,Ax,pA,A_iso,i,j,y) \ -{ \ - GB_DECLAREA (aij) ; \ - GB_GETA (aij, Ax, pA, ) ; \ - GB_DECLAREZ (z) ; \ - GB_UNARYOP (z, aij, , , ) ; \ - GB_PUTC (z, Cx, pC) ; \ -} - -// C matrix: sparse -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 1 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE int32_t -#ifndef GB_GUARD_GJ_cast_to_int32_DEFINED -#define GB_GUARD_GJ_cast_to_int32_DEFINED -GB_STATIC_INLINE -int32_t GJ_cast_to_int32 (double x) -{ - if (isnan (x)) return (0) ; - if (x <= (double) INT32_MIN) return (INT32_MIN) ; - if (x >= (double) INT32_MAX) return (INT32_MAX) ; - return ((int32_t) x) ; -} -#endif -#define GB_PUTC(c,Cx,p) Cx [p] = GJ_cast_to_int32 (GB_creal (c)) - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_HAS_ZOMBIES 0 -#define GB_A_TYPE int32_t -#define GB_A2TYPE GxB_FC64_t -#define GB_DECLAREA(a) GxB_FC64_t a -#define GB_GETA(a,Ax,p,iso) a = (GxB_FC64_t) (Ax [p]) - -#include "GB_apply_shared_definitions.h" - - -================================================================================ -GB_enumify_build / GB_macrofy_build: 
times -// op: (times, GxB_FC32_t) - -// binary dup operator types: -#define GB_Z_TYPE GxB_FC32_t -#define GB_X_TYPE GxB_FC32_t -#define GB_Y_TYPE GxB_FC32_t - -// S and T data types: -#define GB_T_TYPE bool -#define GB_S_TYPE bool - -// binary dup operator: -#define GB_DUP(z,x,y) z = GB_FC32_mul (x,y) -#define GB_UPDATE(z,y) GB_DUP(z,z,y) - -// build copy/dup methods: -#define GB_BLD_COPY(Tx,p,Sx,k) Tx [p] = Sx [k] -#define GB_BLD_DUP(Tx,p,Sx,k) \ - GxB_FC32_t y = (GxB_FC32_t) Sx [k] ; \ - GxB_FC32_t x = (GxB_FC32_t) Tx [p] ; \ - GxB_FC32_t z ; \ - GB_DUP (z, x, y) ; \ - Tx [p] = (GB_crealf (z) != 0 || GB_cimagf (z) != 0) ; - -#include "GB_kernel_shared_definitions.h" - - -================================================================================ -GB_enumify_build / GB_macrofy_build: and -// op: (and, bool) - -// binary dup operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool - -// S and T data types: -#define GB_T_TYPE GxB_FC32_t -#define GB_S_TYPE GxB_FC32_t - -// binary dup operator: -#define GB_DUP(z,x,y) z = ((x) && (y)) -#define GB_UPDATE(z,y) z &= y - -// build copy/dup methods: -#define GB_BLD_COPY(Tx,p,Sx,k) Tx [p] = Sx [k] -#define GB_BLD_DUP(Tx,p,Sx,k) \ - bool y = (GB_crealf (Sx [k]) != 0 || GB_cimagf (Sx [k]) != 0) ; \ - bool x = (GB_crealf (Tx [p]) != 0 || GB_cimagf (Tx [p]) != 0) ; \ - bool z ; \ - GB_DUP (z, x, y) ; \ - Tx [p] = (GxB_FC32_t) z ; - -#include "GB_kernel_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)=A (assign) -// assign/subassign: C(I,J) = A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:s:hi,lo:s:hi)=A (assign) -// assign/subassign: C(I,J) = A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_STRIDE -#define GB_J_KIND GB_STRIDE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 
-#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE void -#define GB_DECLAREA(a) -#define GB_GETA(a,Ax,p,iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(i,J)=s (row assign) -// assign/subassign: C(i,J) = A -#define GB_ASSIGN_KIND GB_ROW_ASSIGN -#define GB_I_KIND GB_ALL -#define GB_J_KIND GB_LIST -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) Cx [pC] = cwork -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(I,j)=s (col assign) -// assign/subassign: C(I,j) = A -#define GB_ASSIGN_KIND GB_COL_ASSIGN -#define GB_I_KIND GB_LIST -#define GB_J_KIND GB_ALL -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(cwork,Cx,p) Cx [p] = cwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) Cx [pC] = cwork -#define GB_COPY_C_to_xwork(xwork,Cx,p) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 
-#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)=A (assign) -// assign/subassign: C(I,J) = scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: not present - - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(cwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)+=s (assign) -// assign/subassign: C(I,J) += scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_UPDATE(z,y) -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_scalar(Cx,pC,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool -#define GB_COPY_scalar_to_ywork(ywork,scalar) ywork = (float) (scalar) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)+=s (assign) -// assign/subassign: C(I,J) += scalar -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// 
accum operator: -#define GB_UPDATE(z,y) -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_scalar(Cx,pC,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_scalar_to_cwork(cwork,scalar) cwork = scalar -#define GB_COPY_scalar_to_C(Cx,pC,cwork) -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// scalar: -#define GB_A_TYPE bool -#define GB_COPY_scalar_to_ywork(ywork,scalar) ywork = (float) (scalar) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)+=A (assign) -// assign/subassign: C(I,J) += A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (plus, float) - -// accum operator types: -#define GB_Z_TYPE float -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) float zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = (x) + (y) -#define GB_UPDATE(z,y) z += y -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_DECLAREY (ywork) ; \ - GB_GETA (ywork, Ax, pA, ) ; \ - GB_DECLAREX (xwork) ; \ - GB_COPY_C_to_xwork (xwork, Cx, pC) ; \ - GB_DECLAREZ (zwork) ; \ - GB_ACCUM_OP (zwork, xwork, ywork) ; \ - GB_PUTC (zwork, Cx, pC) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = ((zwork) != 0) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = (float) (Cx [p]) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - 
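The C(I,J)+=A dump above illustrates how the generated macros handle typecasting when C is bool but the accum operator is (plus, float): the A entry and the C entry are both promoted to float, combined, and the result is cast back to bool on output. A hand expansion of one GB_ACCUMULATE_aij step, assembled from the macros shown above for illustration only:

    // effect of GB_ACCUMULATE_aij (Cx, pC, Ax, pA, A_iso, ywork), expanded by hand:
    {
        float ywork = (float) (Ax [pA]) ;   // GB_DECLAREY ; GB_GETA
        float xwork = (float) (Cx [pC]) ;   // GB_DECLAREX ; GB_COPY_C_to_xwork
        float zwork = xwork + ywork ;       // GB_DECLAREZ ; GB_ACCUM_OP (plus)
        Cx [pC] = ((zwork) != 0) ;          // GB_PUTC: cast float back to bool
    }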
-================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)&=A (assign) -// assign/subassign: C(I,J) &= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (and, bool) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE bool -#define GB_Y_TYPE bool -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) bool xwork -#define GB_DECLAREY(ywork) bool ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = ((x) && (y)) -#define GB_UPDATE(z,y) z &= y -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_UPDATE (Cx [pC], Ax [pA]) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = zwork -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = Cx [p] - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE bool -#define GB_DECLAREA(a) bool a -#define GB_GETA(a,Ax,p,iso) a = Ax [p] -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C(lo:hi,lo:hi)<=A (assign) -// assign/subassign: C(I,J) lt= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (lt, float) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) z = ((x) < (y)) -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) \ -{ \ - GB_DECLAREY (ywork) ; \ - GB_GETA (ywork, Ax, pA, ) ; \ - GB_DECLAREX (xwork) ; \ - GB_COPY_C_to_xwork (xwork, Cx, pC) ; \ - GB_ACCUM_OP (Cx [pC], xwork, ywork) ; \ -} - -// C matrix: hypersparse -#define GB_C_IS_HYPER 1 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 0 -#define GBP_C(Cp,k,vlen) Cp [k] -#define GBH_C(Ch,k) Ch [k] -#define GBI_C(Ci,p,vlen) Ci [p] -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = C->nvals -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 0 -#define GB_C_IN_ISO 0 -#define GB_C_TYPE bool -#define GB_PUTC(zwork,Cx,p) Cx [p] = zwork -#define GB_DECLAREC(cwork) bool cwork -#define 
GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) Cx [pC] = Ax [pA] -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) \ - GB_COPY_A_to_C (Cx, pC, Ax, pA, A_iso) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,p) xwork = (float) (Cx [p]) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" - - -================================================================================ -GB_enumify_assign / GB_macrofy_assign: C_iso(lo:hi,lo:hi)<=H (assign) -// assign/subassign: C(I,J) lt= A -#define GB_ASSIGN_KIND GB_ASSIGN -#define GB_I_KIND GB_RANGE -#define GB_J_KIND GB_RANGE -#define GB_C_REPLACE 0 -// accum: (lt, float) - -// accum operator types: -#define GB_Z_TYPE bool -#define GB_X_TYPE float -#define GB_Y_TYPE float -#define GB_DECLAREZ(zwork) bool zwork -#define GB_DECLAREX(xwork) float xwork -#define GB_DECLAREY(ywork) float ywork - -// accum operator: -#define GB_ACCUM_OP(z,x,y) -#define GB_ACCUMULATE_aij(Cx,pC,Ax,pA,A_iso,ywork) - -// C matrix: full -#define GB_C_IS_HYPER 0 -#define GB_C_IS_SPARSE 0 -#define GB_C_IS_BITMAP 0 -#define GB_C_IS_FULL 1 -#define GBP_C(Cp,k,vlen) ((k) * (vlen)) -#define GBH_C(Ch,k) (k) -#define GBI_C(Ci,p,vlen) ((p) % (vlen)) -#define GBB_C(Cb,p) 1 -#define GB_C_NVALS(e) int64_t e = 0 ; GB_INT64_MULT (e, C->vlen, C->vdim) -#define GB_C_NHELD(e) GB_C_NVALS(e) -#define GB_C_ISO 1 -#define GB_C_IN_ISO 1 -#define GB_C_TYPE void -#define GB_PUTC(zwork,Cx,p) -#define GB_DECLAREC(cwork) bool cwork -#define GB_COPY_A_to_C(Cx,pC,Ax,pA,A_iso) -#define GB_COPY_aij_to_C(Cx,pC,Ax,pA,A_iso,cwork) -#define GB_COPY_aij_to_cwork(cwork,Ax,p,iso) cwork = Ax [p] -#define GB_COPY_C_to_xwork(xwork,Cx,pC) - -// M matrix: none -#define GB_M_TYPE void -#define GB_MCAST(Mx,p,msize) 1 -#define GB_MASK_STRUCT 1 -#define GB_MASK_COMP 0 -#define GB_NO_MASK 1 - -// A matrix: hypersparse -#define GB_A_IS_HYPER 1 -#define GB_A_IS_SPARSE 0 -#define GB_A_IS_BITMAP 0 -#define GB_A_IS_FULL 0 -#define GBP_A(Ap,k,vlen) Ap [k] -#define GBH_A(Ah,k) Ah [k] -#define GBI_A(Ai,p,vlen) Ai [p] -#define GBB_A(Ab,p) 1 -#define GB_A_NVALS(e) int64_t e = A->nvals -#define GB_A_NHELD(e) GB_A_NVALS(e) -#define GB_A_ISO 0 -#define GB_A_TYPE bool -#define GB_A2TYPE float -#define GB_DECLAREA(a) float a -#define GB_GETA(a,Ax,p,iso) a = (float) (Ax [p]) -#define GB_COPY_aij_to_ywork(ywork,Ax,pA,A_iso) GB_GETA (ywork, Ax, pA, A_iso) - -#include "GB_assign_shared_definitions.h" diff --git a/GraphBLAS/Test/test169.m b/GraphBLAS/Test/test169.m new file mode 100644 index 0000000000..33c5509b92 --- /dev/null +++ b/GraphBLAS/Test/test169.m @@ -0,0 +1,49 @@ +function test169 +%TEST169 C=A+B with different sparsity formats + +% SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. 
+% SPDX-License-Identifier: Apache-2.0 + +rng ('default') ; + +fprintf ('test169:\n') ; + +n = 50 ; + +desc = struct ('mask', 'complement') ; + +for trial = 1:5 + + C = GB_spec_random (n, n, 0.5, 1, 'double') ; + M = GB_spec_random (n, n, 0.2, 1, 'double') ; + A = GB_spec_random (n, n, 0.5, 1, 'double') ; + B = GB_spec_random (n, n, 0.5, 1, 'double') ; + + for C_sparsity = [1 2 4 8] + C.sparsity = C_sparsity ; + + for M_sparsity = [1 2 4 8] + M.sparsity = M_sparsity ; + + for A_sparsity = [1 2 4 8] + A.sparsity = A_sparsity ; + + for B_sparsity = [1 2 4 8] + B.sparsity = B_sparsity ; + + C1 = GB_spec_Matrix_eWiseAdd (C, M, [], 'plus', A, B, desc); + C2 = GB_mex_Matrix_eWiseAdd (C, M, [], 'plus', A, B, desc); + GB_spec_compare (C1, C2) ; + + C1 = GB_spec_Matrix_eWiseAdd (C, M, [], 'plus', A, B, [ ]) ; + C2 = GB_mex_Matrix_eWiseAdd (C, M, [], 'plus', A, B, [ ]) ; + GB_spec_compare (C1, C2) ; + end + end + end + fprintf ('.') ; + end +end + +fprintf ('\ntest169: all tests passed\n') ; + diff --git a/GraphBLAS/Test/testall.m b/GraphBLAS/Test/testall.m index 17204c2a56..50cec10f01 100644 --- a/GraphBLAS/Test/testall.m +++ b/GraphBLAS/Test/testall.m @@ -99,6 +99,7 @@ function testall (threads,longtests) % tests with high rates (over 100/sec) %---------------------------------------- +logstat ('test169' ,t, j0 , f1 ) ; % C=A+B with many formats logstat ('test250' ,t, j44 , f10 ) ; % JIT tests, set/get, other tests logstat ('test279' ,t, j0 , f1 ) ; % blob get/set logstat ('test278' ,t, j0 , f1 ) ; % descriptor get/set diff --git a/GraphBLAS/Test/unused/test169.m b/GraphBLAS/Test/unused/test169_orig.m similarity index 100% rename from GraphBLAS/Test/unused/test169.m rename to GraphBLAS/Test/unused/test169_orig.m diff --git a/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake b/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake index db377d925e..0ae9693a6b 100644 --- a/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake +++ b/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake @@ -114,7 +114,7 @@ if ( GRAPHBLAS_USE_JIT OR GRAPHBLAS_USE_CUDA ) message ( STATUS "------------------------------------------------------------------------" ) # one or both JITs are enabled; make sure the cache path exists message ( STATUS "JIT C compiler: ${GB_C_COMPILER}" ) - message ( STATUS "JIT C flags: ${GB_C_FLAGS}" ) + message ( STATUS "JIT C flags: ${GB_C_FLAGS} ${GB_OPENMP_C_FLAGS}" ) message ( STATUS "JIT link flags: ${GB_C_LINK_FLAGS}" ) message ( STATUS "JIT lib prefix: ${GB_LIB_PREFIX}" ) message ( STATUS "JIT lib suffix: ${GB_LIB_SUFFIX}" ) @@ -133,6 +133,4 @@ file ( MAKE_DIRECTORY "${GRAPHBLAS_CACHE_PATH}/lib" ) file ( MAKE_DIRECTORY "${GRAPHBLAS_CACHE_PATH}/tmp" ) file ( MAKE_DIRECTORY "${GRAPHBLAS_CACHE_PATH}/lock" ) file ( MAKE_DIRECTORY "${GRAPHBLAS_CACHE_PATH}/c" ) -file ( MAKE_DIRECTORY "${GRAPHBLAS_CACHE_PATH}/cu" ) - diff --git a/GraphBLAS/cmake_modules/GraphBLAS_PreJIT.cmake b/GraphBLAS/cmake_modules/GraphBLAS_PreJIT.cmake index ffc8aa851b..e9078c8b0f 100644 --- a/GraphBLAS/cmake_modules/GraphBLAS_PreJIT.cmake +++ b/GraphBLAS/cmake_modules/GraphBLAS_PreJIT.cmake @@ -7,7 +7,10 @@ #------------------------------------------------------------------------------- -# create a list of files +#------------------------------------------------------------------------------- +# create a list of files of CPU PreJIT kernels +#------------------------------------------------------------------------------- + file ( GLOB PRE1 "PreJIT/GB_jit_*.c" ) set ( PREJIT "" ) set ( PREPRO "" ) @@ -112,3 +115,18 @@ 
configure_file ( "Config/GB_prejit.c.in" "${PROJECT_SOURCE_DIR}/Config/GB_prejit.c" NEWLINE_STYLE LF ) +#------------------------------------------------------------------------------- +# create a list of files of CUDA PreJIT kernels +#------------------------------------------------------------------------------- + +# FIXME: add CUDA PreJIT kernels. For example: + +# ... +# elseif ( ${F} MATCHES "^GB_jit__cuda_reduce" ) +# list ( APPEND PREPRO "JIT_CUDA_RED (" ${F} ")\n" ) +# endif ( ) + +# configure_file ( "CUDA/Config/GB_prejit.c.in" +# "${PROJECT_SOURCE_DIR}/CUDA/Config/GB_prejit.c" +# NEWLINE_STYLE LF ) + diff --git a/GraphBLAS/cmake_modules/GraphBLAS_version.cmake b/GraphBLAS/cmake_modules/GraphBLAS_version.cmake index f5df5d16ae..a3eba6df7d 100644 --- a/GraphBLAS/cmake_modules/GraphBLAS_version.cmake +++ b/GraphBLAS/cmake_modules/GraphBLAS_version.cmake @@ -8,7 +8,7 @@ #------------------------------------------------------------------------------- # version of SuiteSparse:GraphBLAS -set ( GraphBLAS_DATE "Feb XX, 2024" ) # FIXME for SuiteSparse 7.7.0 +set ( GraphBLAS_DATE "Mar 22, 2024" ) set ( GraphBLAS_VERSION_MAJOR 9 CACHE STRING "" FORCE ) set ( GraphBLAS_VERSION_MINOR 1 CACHE STRING "" FORCE ) set ( GraphBLAS_VERSION_SUB 0 CACHE STRING "" FORCE ) diff --git a/GraphBLAS/rmm_wrap/README.md b/GraphBLAS/rmm_wrap/README.md index 1b66003598..f0120f4b3e 100644 --- a/GraphBLAS/rmm_wrap/README.md +++ b/GraphBLAS/rmm_wrap/README.md @@ -5,7 +5,9 @@ SPDX-License-Identifier: Apache-2.0 rmm_wrap defines a single global object, the RMM_Wrap_Handle that holds an RMM (Rapids Memory Manager) memory resource and a hash map (C++ std::unordered_map). This allows rmm_wrap to provide 7 functions to a C -application: +application. + +Note that the rmm_wrap functions are NOT thread safe. Create/destroy an RMM resource: diff --git a/GraphBLAS/rmm_wrap/rmm_wrap.cpp b/GraphBLAS/rmm_wrap/rmm_wrap.cpp index 7246baca51..22362b7c98 100644 --- a/GraphBLAS/rmm_wrap/rmm_wrap.cpp +++ b/GraphBLAS/rmm_wrap/rmm_wrap.cpp @@ -36,6 +36,8 @@ // RMM_Wrap_Handle: a global object containing the RMM context //------------------------------------------------------------------------------ +// NOTE: this is not thread-safe + // rmm_wrap_context is a pointer to an array of global RMM_Wrap_Handle objects // (one per GPU) that all methods in this file can access. The array of // objects cannot be accessed outside this file.
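Because the rmm_wrap functions are not thread-safe, a host application must serialize initialization, allocation, and teardown. A minimal single-threaded usage sketch, based only on the signatures that appear in this diff (the pool sizes and the allocation size are illustrative assumptions, not recommendations):

    // hypothetical caller of the rmm_wrap API; all sizes are illustrative
    size_t init_pool_size   = 256 * 1024 * 1024 ;   // initial pool: 256 MiB
    size_t max_pool_size    = 1024 * 1024 * 1024 ;  // maximum pool: 1 GiB
    size_t stream_pool_size = 4 ;                   // must be > 0
    if (rmm_wrap_initialize_all_same (rmm_wrap_managed,
        init_pool_size, max_pool_size, stream_pool_size) == 0)
    {
        std::size_t size = 1000 ;                   // rounded up internally
        void *p = rmm_wrap_allocate (&size) ;       // size reports the true size
        if (p != NULL) rmm_wrap_deallocate (p, size) ;
        rmm_wrap_finalize ( ) ;
    }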
@@ -81,7 +83,6 @@ inline auto make_cuda() inline auto make_managed() { - std::cout << "Inside make_managed" << std::endl; return std::make_shared<rmm::mr::managed_memory_resource>() ; } @@ -131,16 +132,14 @@ inline auto make_and_set_managed_pool std::size_t maximum_size ) { - std::cout<< " make_managed_pool called with init_size" - << initial_size << " max_size " << maximum_size << std::endl; auto resource = rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource> ( make_managed(), initial_size, maximum_size ) ; - std::cout << "Created resource" << std::endl; + // std::cout << "Created resource" << std::endl; rmm::mr::set_current_device_resource( resource.get()) ; - std::cout << "Set resource" << std::endl; + // std::cout << "Set resource" << std::endl; return resource; } @@ -185,7 +184,9 @@ void rmm_wrap_finalize (void) //------------------------------------------------------------------------------ // get_current_device: helper to get id for currently selected device //------------------------------------------------------------------------------ -int get_current_device(void) { + +int get_current_device(void) +{ int device_id; cudaGetDevice(&device_id); return device_id; } @@ -194,26 +195,29 @@ int get_current_device(void) { //------------------------------------------------------------------------------ // rmm_wrap_initialize: initialize rmm_wrap_context[device_id] //------------------------------------------------------------------------------ -int rmm_wrap_initialize // returns -1 on error, 0 on success + +int rmm_wrap_initialize // returns -1 on error, 0 on success ( - uint32_t device_id, // 2, 5, or 7 - RMM_MODE mode, // TODO: describe. Should we default this? - size_t init_pool_size, // TODO: describe. Should we default this? - size_t max_pool_size, // TODO: describe. Should we default this? - size_t stream_pool_size // TODO: describe. Should we default this? + uint32_t device_id, // 2, 5, or 7 + RMM_MODE mode, // TODO: describe. Should we default this? + size_t init_pool_size, // TODO: describe. Should we default this? + size_t max_pool_size, // TODO: describe. Should we default this? + size_t stream_pool_size // TODO: describe. Should we default this?
) { //-------------------------------------------------------------------------- // check inputs //-------------------------------------------------------------------------- - if(rmm_wrap_context[device_id] != NULL) { - return (-1); + + if (rmm_wrap_context[device_id] != NULL) + { + return (-1) ; } if(stream_pool_size <= 0) { - std::cout << "Stream pool size must be >=0" << std::endl; + // std::cout << "Stream pool size must be >=0" << std::endl; // failed to create the alloc_map return (-1) ; } @@ -230,7 +234,7 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success //-------------------------------------------------------------------------- // Set CUDA stream pool - std::cout << "Creating rmm_wrap stream pool" << std::endl; + // std::cout << "Creating rmm_wrap stream pool" << std::endl; rmm_wrap_context[device_id]->stream_pool = make_and_set_cuda_stream_pool(stream_pool_size); RMM_WRAP_CHECK_CUDA(cudaStreamCreate(&(rmm_wrap_context[device_id]->main_stream))); @@ -255,8 +259,9 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success } else if ( mode == rmm_wrap_managed ) { - std::cout << "Seting managed pool" << std::endl; - rmm_wrap_context[device_id]->resource = make_and_set_managed_pool( init_pool_size, max_pool_size); + // std::cout << "Seting managed pool" << std::endl; + rmm_wrap_context[device_id]->resource = + make_and_set_managed_pool( init_pool_size, max_pool_size); } else { @@ -264,7 +269,7 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success return (-1) ; } - std::cout << "Setting mode for rmm_wrap context" << std::endl; + // std::cout << "Setting mode for rmm_wrap context" << std::endl; // Mark down the mode for reference later rmm_wrap_context[device_id]->mode = mode; @@ -272,11 +277,11 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success // create size map to lookup size of each allocation //-------------------------------------------------------------------------- - std::cout << "Setting size_map for rmm_wrap context" << std::endl; + // std::cout << "Setting size_map for rmm_wrap context" << std::endl; rmm_wrap_context[device_id]->size_map = std::make_shared <alloc_map> () ; if (rmm_wrap_context[device_id]->size_map.get() == NULL) { - std::cout << "Failed to create size_map" << std::endl; + // std::cout << "Failed to create size_map" << std::endl; // failed to create the alloc_map return (-1) ; } @@ -291,12 +296,14 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success int rmm_wrap_initialize_all_same ( RMM_MODE mode, // TODO: describe. Should we default this? - size_t init_pool_size, // TODO: describe. Should we default this? + size_t init_pool_size, // TODO: describe. Should we default this? size_t max_pool_size, // TODO: describe. Should we default this? size_t stream_pool_size // TODO: describe. Should we default this?
-) { +) +{ - if(rmm_wrap_context != NULL) { + if (rmm_wrap_context != NULL) + { return (-1); } @@ -316,7 +323,7 @@ int rmm_wrap_initialize_all_same intermediate.erase(std::remove_if(intermediate.begin(), intermediate.end(), ::isspace), intermediate.end()); uint32_t device_id = static_cast<uint32_t>(stoi(intermediate)); - std::cout << "Found device_id " << device_id << std::endl; + // std::cout << "Found device_id " << device_id << std::endl; devices.push_back(device_id); } /** @@ -325,7 +332,7 @@ */ } else { devices.push_back(0); - std::cout << "Using default device_id 0" << std::endl; + // std::cout << "Using default device_id 0" << std::endl; } // Allocate rmm_wrap_contexts @@ -333,7 +340,7 @@ int rmm_wrap_initialize_all_same for(int i = 0; i < devices.size(); ++i) { rmm_wrap_context[i] = NULL; uint32_t device_id = devices[i]; - std::cout << "Creating rmm_wrap_context for device_id " << device_id << std::endl; + // std::cout << "Creating rmm_wrap_context for device_id " << device_id << std::endl; int ret = rmm_wrap_initialize(device_id, mode, init_pool_size, max_pool_size, stream_pool_size); if(ret < 0) { return ret; } @@ -347,7 +354,8 @@ // rmm_wrap_get_next_stream_from_pool: return the next available stream from the pool // Output is cudaStream_t //------------------------------------------------------------------------------ -void* rmm_wrap_get_next_stream_from_pool(void) { +void* rmm_wrap_get_next_stream_from_pool(void) +{ return rmm_wrap_context[get_current_device()]->stream_pool->get_stream(); } @@ -355,7 +363,8 @@ void* rmm_wrap_get_next_stream_from_pool(void) { //------------------------------------------------------------------------------ // rmm_wrap_get_stream_from_pool: return specific stream from the pool // Output is cudaStream_t //------------------------------------------------------------------------------ -void* rmm_wrap_get_stream_from_pool(std::size_t stream_id) { +void* rmm_wrap_get_stream_from_pool(std::size_t stream_id) +{ return rmm_wrap_context[get_current_device()]->stream_pool->get_stream(stream_id); } @@ -363,7 +372,8 @@ void* rmm_wrap_get_stream_from_pool(std::size_t stream_id) { //------------------------------------------------------------------------------ // rmm_wrap_get_main_stream: return the main cuda stream // Output is cudaStream_t //------------------------------------------------------------------------------ -void* rmm_wrap_get_main_stream(void) { +void* rmm_wrap_get_main_stream(void) +{ return rmm_wrap_context[get_current_device()]->main_stream; } //------------------------------------------------------------------------------ @@ -477,7 +487,10 @@ void rmm_wrap_free (void *p) void *rmm_wrap_allocate( std::size_t *size) { - if (rmm_wrap_context == NULL) return (NULL) ; + if (rmm_wrap_context == NULL) + { + return (NULL) ; + } uint32_t device_id = get_current_device(); @@ -499,9 +512,6 @@ void *rmm_wrap_allocate( std::size_t *size) *size += (256 - aligned) ; } -// printf(" rmm_wrap_alloc %ld bytes\n",*size) ; - - rmm::mr::device_memory_resource *memoryresource = rmm::mr::get_current_device_resource() ; void *p = memoryresource->allocate( *size ) ; @@ -525,7 +535,10 @@ void rmm_wrap_deallocate( void *p, std::size_t size) { - if (rmm_wrap_context == NULL) return ; + if (rmm_wrap_context == NULL) + { + return ; + } // Note: there are 3 PANIC cases below. The API of rmm_wrap_deallocate // does not allow an error condition to be returned. These PANICs could be @@ -543,6 +556,7 @@ void rmm_wrap_deallocate( void *p, std::size_t size) } uint32_t device_id = get_current_device(); + // check the size given.
If the input size is zero, then the // size is unknown (say rmm_wrap_free(p)). In that case, just trust the // hashmap. Otherwise, double-check to make sure the size is correct. @@ -560,7 +574,7 @@ void rmm_wrap_deallocate( void *p, std::size_t size) //actual_size = am->at( (std::size_t)(p) ) ; auto iter = am->find( (std::size_t)(p) ) ; if (iter != am->end() ) actual_size = iter->second; - else std::cout<< " rmm_wrap:: tried to free unallocated pointer ! " << p ; + // else std::cout<< " rmm_wrap:: tried to free unallocated pointer ! " << p ; } if (actual_size == 0)
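To summarize the bookkeeping shown in the hunks above: rmm_wrap_allocate rounds each request up to a multiple of 256 bytes and records the pointer and its true size in the per-device hashmap, while rmm_wrap_deallocate looks the pointer up, trusting the hashmap when the caller passes size 0 and treating a size mismatch as a caller bug. A simplified, self-contained sketch of that pattern, with malloc standing in for the RMM memory resource and all names hypothetical:

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>
    #include <unordered_map>

    // stand-in for rmm_wrap's pointer -> size hashmap (its alloc_map)
    static std::unordered_map<std::size_t, std::size_t> size_map ;

    void *tracked_allocate (std::size_t *size)
    {
        // round the request up to a multiple of 256 bytes, as rmm_wrap_allocate does
        std::size_t aligned = (*size) % 256 ;
        if (aligned > 0) *size += (256 - aligned) ;
        void *p = std::malloc (*size) ;                     // RMM resource stand-in
        if (p != NULL) size_map [(std::size_t) p] = *size ; // record the true size
        return (p) ;
    }

    void tracked_deallocate (void *p, std::size_t size)
    {
        auto iter = size_map.find ((std::size_t) p) ;
        if (iter == size_map.end ())
        {
            std::printf ("freeing an unallocated pointer\n") ;  // rmm_wrap warns here
            return ;
        }
        std::size_t actual_size = iter->second ;
        if (size > 0 && size != actual_size)
        {
            std::printf ("size mismatch\n") ;   // one of rmm_wrap's PANIC cases
            return ;
        }
        size_map.erase (iter) ;
        std::free (p) ;
    }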