I am facing a weird problem in our application: a Julia function calls a C function, which creates a pthread and calls back into a Julia CUDA kernel. I have created a minimal working example to illustrate and reproduce the problem.
The "main" Julia function calls the C function "call_on_thread". The C function creates a pthread; inside the pthread it allocates CUDA device pointers and calls the Julia function "callback" with those device pointers as arguments, and "callback" launches the Julia CUDA kernel "saxpy_kernel".
The code hangs when we call "main(true)" in Julia, but it works when we call "main(false)". The difference is that "main(false)" first calls "saxpy_kernel" without the pthread, and only then with it.
When I debug the hanging "main(true)" case, execution gets stuck in the code below when it calls "Core.Compiler.typeinf_type":
https://github.com/JuliaGPU/GPUCompiler.jl/blob/master/src/validation.jl#L17
function typeinf_type(mi::MethodInstance; interp::CC.AbstractInterpreter)
    ty = Core.Compiler.typeinf_type(interp, mi.def, mi.specTypes, mi.sparam_vals)
    return something(ty, Any)
end
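This matches the "main(false)" observation above: the hang disappears once the kernel has already been compiled on a Julia-managed thread. A possible workaround along those lines is to force compilation on the main thread before handing the callback to the pthread; a minimal sketch (the dummy arrays are placeholders I added, not part of the reproducer):

using CUDA

# Warm-up sketch: compile saxpy_kernel on the main Julia thread before the
# C pthread calls back in. launch=false compiles the kernel without running it.
A = CUDA.zeros(Float32, 8); B = CUDA.zeros(Float32, 8); C = CUDA.zeros(Float32, 8)
kernel = @cuda launch=false saxpy_kernel(A, B, C, 2.0f0)

Note this compiles a specialization for ordinary CuArrays; the unsafe_wrapped arrays inside "callback" may still trigger a separate compilation, so treat it purely as a sketch of the idea.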
Please help us resolve this issue; it is very important for us to make Julia more portable.
With Regards,
Narasinga Rao,
Group Lead,
Oak Ridge National Laboratory,
USA.
Here is the code:
Julia code: julia_cuda.jl
using CUDA

# Define the CUDA kernel for saxpy
function saxpy_kernel(A, B, C, alpha)
    i = threadIdx().x
    #i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(A)
        C[i] = alpha * A[i] + B[i]
    end
    return nothing
end
export callback
function callback(ctx::Ptr{Cvoid}, device::Cint, A::Ptr{Float32}, B::Ptr{Float32}, C::Ptr{Float32}, alpha::Cfloat, n::Cint)::Cvoid
    GC.gc()
    # Activate the CUDA context created on the C side
    cu_ctx = unsafe_load(reinterpret(Ptr{CuContext}, ctx))
    CUDA.context!(cu_ctx)
    CUDA.device!(device)
    size_dims = Tuple(Int64[n])
    nthreads = Tuple(Int64[n])
    # Wrap the raw device pointers as CuArrays without taking ownership
    A_array = unsafe_wrap(CuArray, reinterpret(CuPtr{Float32}, A), size_dims, own=false)
    B_array = unsafe_wrap(CuArray, reinterpret(CuPtr{Float32}, B), size_dims, own=false)
    C_array = unsafe_wrap(CuArray, reinterpret(CuPtr{Float32}, C), size_dims, own=false)
    println(Core.stdout, "CUDA.ctx:$cu_ctx Device:$device Before A: $A_array, B:$B_array, C:$C_array Alpha:$alpha")
    #CUDA.nonblocking_synchronize(CUDA.context())
    CUDA.@sync @cuda threads=nthreads saxpy_kernel(A_array, B_array, C_array, alpha)
    #CUDA.synchronize()
    println(Core.stdout, "After A: $A_array, B:$B_array, C:$C_array Alpha:$alpha")
    println(Core.stdout, "GC call done")
end
function main(disable)
    callback_ptr = @cfunction(callback, Cvoid, (Ptr{Cvoid}, Cint, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cfloat, Cint))
    if !disable
        # Warm-up path: invoke the callback on the main thread first
        ccall((:call_directly, "./wip.so"), Cvoid, (Ptr{Cvoid},), callback_ptr)
    end
    println()
    # Mark this thread GC-safe while it blocks in foreign code, so a GC
    # triggered from the pthread's callback cannot deadlock against it
    gc_state = @ccall(jl_gc_safe_enter()::Int8)
    ccall((:call_on_thread, "./wip.so"), Cvoid, (Ptr{Cvoid},), callback_ptr)
    @ccall(jl_gc_safe_leave(gc_state::Int8)::Cvoid)
    println("Done")
end
main(true)
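One small diagnostic that can be dropped into "callback" to see which thread Julia thinks it is running on (my addition, not part of the reproducer):

# Diagnostic sketch: print the Julia thread id inside callback. On a foreign
# (adopted) pthread this can differ from what the main thread reports.
println(Core.stdout, "threadid = ", Threads.threadid(), " of ", Threads.nthreads())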
C code: c_code.c
#include <julia.h>
#include <pthread.h>
#include <cuda_runtime.h>
#include <cuda.h>

typedef void (*julia_callback)(void *ctx, int device, float *A, float *B, float *C, float alpha, int n);

void call_saxpy(julia_callback callback) {
    printf("Calling Julia from C thread\n");
    int n = 8;
    float alpha = 2.0f;
    // Allocate device memory
    float *d_A, *d_B, *d_C;
    float A[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
    float B[8] = {2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f};
    float C[8] = {3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f};
    CUcontext cuContext;
    CUdevice cuDevice;
    cuInit(0);
    cuDeviceGet(&cuDevice, 0);
    cuCtxCreate(&cuContext, 0, cuDevice);
    cuCtxSetCurrent(cuContext);
    cudaMalloc((void**)&d_A, n * sizeof(float));
    cudaMalloc((void**)&d_B, n * sizeof(float));
    cudaMalloc((void**)&d_C, n * sizeof(float));
    // Copy data from host to device
    cudaMemcpy(d_A, A, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, C, n * sizeof(float), cudaMemcpyHostToDevice);
    // Call the Julia function
    callback((void *)&cuContext, (int)cuDevice, d_A, d_B, d_C, alpha, n);
    // Copy result from device to host
    cudaMemcpy(C, d_C, n * sizeof(float), cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

void call_directly(julia_callback callback) {
    printf("Calling Julia directly\n");
    call_saxpy(callback);
}

void *thread_function(void *callback) {
    call_saxpy((julia_callback)callback);
    return NULL;
}

void call_on_thread(julia_callback callback) {
    jl_init();
    printf("Creating thread\n");
    pthread_t thread;
    pthread_create(&thread, NULL, thread_function, callback);
    pthread_join(thread, NULL);
}
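The Makefile from the original report is not included here. As a rough sketch, the shared library ./wip.so could be built with something like the following (the gcc flags and the CUDA install path are assumptions; adjust for your system):

# Build sketch, run from the directory containing c_code.c.
# Assumption: CUDA is installed under /usr/local/cuda.
julia_inc = joinpath(Sys.BINDIR, "..", "include", "julia")
julia_lib = joinpath(Sys.BINDIR, "..", "lib")
cuda = "/usr/local/cuda"
run(`gcc -shared -fPIC c_code.c -o wip.so -I$julia_inc -I$cuda/include -L$julia_lib -L$cuda/lib64 -ljulia -lcudart -lcuda -lpthread`)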