diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 211dff5..92e88d0 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -3,9 +3,19 @@ steps: plugins: - JuliaCI/julia#v1: version: "1.10" - - JuliaCI/julia-test#v1: ~ - JuliaCI/julia-coverage#v1: codecov: true + commands: | + julia --project -e ' + using Pkg + + println("--- :julia: Instantiating project") + Pkg.develop(path="lib/intrinsics") + Pkg.instantiate() + + + println("+++ :julia: Running tests") + Pkg.test(; coverage=true)' agents: queue: "juliagpu" cuda: "*" diff --git a/codecov.yml b/.github/codecov.yml similarity index 57% rename from codecov.yml rename to .github/codecov.yml index 8f3b3ed..11b566a 100644 --- a/codecov.yml +++ b/.github/codecov.yml @@ -1,7 +1,8 @@ coverage: ignore: - - "lib/lib*.jl" - - "src/kernels" + - "lib/*/lib*.jl" + - "src/kernels/" + - "src/device/" status: patch: false project: false diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 5fce3b2..57ad5df 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -28,7 +28,7 @@ jobs: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - uses: julia-actions/cache@v2 - - uses: julia-actions/julia-buildpkg@v1 + - run: julia --project -e 'using Pkg; Pkg.develop(path="lib/intrinsics")' - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v4 diff --git a/Project.toml b/Project.toml index e614d80..b53059b 100644 --- a/Project.toml +++ b/Project.toml @@ -3,11 +3,17 @@ uuid = "08131aa3-fb12-5dee-8b74-c09406e224a2" version = "0.10.0" [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" +LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" +SPIRV_LLVM_Translator_unified_jll = "85f0d8ed-5b39-5caa-b1ae-7472de402361" [compat] +LLVM = "9.1" OpenCL_jll = "2024.5.8" julia = "1.10" diff --git a/README.md b/README.md index 8b906ac..a8d8d31 100644 --- a/README.md +++ b/README.md @@ -58,38 +58,64 @@ Available platforms: 3 ## Basic example: vector add +The traditional way of using OpenCL is by writing kernel source code in OpenCL C. For +example, a simple vector addition: + ```julia -using LinearAlgebra using OpenCL, pocl_jll -const sum_kernel = " - __kernel void sum(__global const float *a, - __global const float *b, - __global float *c) - { +const source = """ + __kernel void vadd(__global const float *a, + __global const float *b, + __global float *c) { int gid = get_global_id(0); c[gid] = a[gid] + b[gid]; - } -" + }""" + a = rand(Float32, 50_000) b = rand(Float32, 50_000) -a_buff = cl.Buffer(Float32, length(a), (:r, :copy), hostbuf=a) -b_buff = cl.Buffer(Float32, length(b), (:r, :copy), hostbuf=b) -c_buff = cl.Buffer(Float32, length(a), :w) +d_a = CLArray(a; access=:r) +d_b = CLArray(b; access=:r) +d_c = similar(d_a; access=:w) + +p = cl.Program(; source) |> cl.build! +k = cl.Kernel(p, "vadd") -p = cl.Program(source=sum_kernel) |> cl.build! 
-k = cl.Kernel(p, "sum") +clcall(k, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}}, + d_a, d_b, d_c; global_size=size(a)) -cl.call(k, a_buff, b_buff, c_buff; global_size=size(a)) +c = Array(d_c) + +@assert a + b ≈ c +``` -r = cl.read(c_buff) -if isapprox(norm(r - (a+b)), zero(Float32)) - @info "Success!" -else - @error "Norm should be 0.0f" +## Native example: vector add + +If your platform supports SPIR-V, it's possible to use Julia functions as kernels: + +```julia +using OpenCL, pocl_jll + +function vadd(a, b, c) + gid = get_global_id(1) + @inbounds c[gid] = a[gid] + b[gid] + return end + +a = rand(Float32, 50_000) +b = rand(Float32, 50_000) + +d_a = CLArray(a; access=:r) +d_b = CLArray(b; access=:r) +d_c = similar(d_a; access=:w) + +@opencl global_size=size(a) vadd(d_a, d_b, d_c) + +c = Array(d_c) + +@assert a + b ≈ c ``` diff --git a/examples/demo.jl b/examples/demo.jl deleted file mode 100644 index f79c2fc..0000000 --- a/examples/demo.jl +++ /dev/null @@ -1,36 +0,0 @@ -using OpenCL, LinearAlgebra - -const sum_kernel_src = " - __kernel void sum(__global const float *a, - __global const float *b, - __global float *c) - { - int gid = get_global_id(0); - c[gid] = a[gid] + b[gid]; - } -" -a = rand(Float32, 50_000) -b = rand(Float32, 50_000) - -# create opencl buffer objects -# copies to the device initiated when the kernel function is called -a_buff = cl.Buffer(Float32, length(a), (:r, :copy); hostbuf=a) -b_buff = cl.Buffer(Float32, length(b), (:r, :copy); hostbuf=b) -c_buff = cl.Buffer(Float32, length(a), :w) - -# build the program and construct a kernel object -p = cl.Program(source=sum_kernel_src) |> cl.build! -sum_kernel = cl.Kernel(p, "sum") - -# call the kernel object with global size set to the size our arrays -sum_kernel[size(a)](a_buff, b_buff, c_buff) - -# perform a blocking read of the result from the device -r = cl.read(c_buff) - -# check to see if our result is what we expect! -if isapprox(norm(r - (a+b)), zero(Float32)) - @info("Success!") -else - error("Norm should be 0.0f") -end diff --git a/examples/vadd.jl b/examples/vadd.jl new file mode 100644 index 0000000..d07851e --- /dev/null +++ b/examples/vadd.jl @@ -0,0 +1,27 @@ +using OpenCL, pocl_jll, Test + +const source = """ + __kernel void vadd(__global const float *a, + __global const float *b, + __global float *c) { + int i = get_global_id(0); + c[i] = a[i] + b[i]; + }""" + +dims = (2,) +a = round.(rand(Float32, dims) * 100) +b = round.(rand(Float32, dims) * 100) +c = similar(a) + +d_a = CLArray(a) +d_b = CLArray(b) +d_c = CLArray(c) + +prog = cl.Program(; source) |> cl.build! 
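+# look up the kernel by the name it was given in the OpenCL C source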
+kern = cl.Kernel(prog, "vadd") + +len = prod(dims) +clcall(kern, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}}, + d_a, d_b, d_c; global_size=(len,)) +c = Array(d_c) +@test a+b ≈ c diff --git a/examples/vadd_native.jl b/examples/vadd_native.jl new file mode 100644 index 0000000..972209b --- /dev/null +++ b/examples/vadd_native.jl @@ -0,0 +1,21 @@ +using OpenCL, pocl_jll, Test + +function vadd(a, b, c) + i = get_global_id() + @inbounds c[i] = a[i] + b[i] + return +end + +dims = (2,) +a = round.(rand(Float32, dims) * 100) +b = round.(rand(Float32, dims) * 100) +c = similar(a) + +d_a = CLArray(a) +d_b = CLArray(b) +d_c = CLArray(c) + +len = prod(dims) +@opencl global_size=len vadd(d_a, d_b, d_c) +c = Array(d_c) +@test a+b ≈ c diff --git a/lib/CL.jl b/lib/cl/CL.jl similarity index 100% rename from lib/CL.jl rename to lib/cl/CL.jl diff --git a/lib/api.jl b/lib/cl/api.jl similarity index 98% rename from lib/api.jl rename to lib/cl/api.jl index c3a52c5..4340876 100644 --- a/lib/api.jl +++ b/lib/cl/api.jl @@ -69,7 +69,7 @@ function retry_reclaim(f, isfailed) ret end -include("../lib/libopencl.jl") +include("libopencl.jl") # lazy initialization const initialized = Ref{Bool}(false) diff --git a/lib/buffer.jl b/lib/cl/buffer.jl similarity index 100% rename from lib/buffer.jl rename to lib/cl/buffer.jl diff --git a/lib/cmdqueue.jl b/lib/cl/cmdqueue.jl similarity index 100% rename from lib/cmdqueue.jl rename to lib/cl/cmdqueue.jl diff --git a/lib/context.jl b/lib/cl/context.jl similarity index 100% rename from lib/context.jl rename to lib/cl/context.jl diff --git a/lib/device.jl b/lib/cl/device.jl similarity index 100% rename from lib/device.jl rename to lib/cl/device.jl diff --git a/lib/error.jl b/lib/cl/error.jl similarity index 100% rename from lib/error.jl rename to lib/cl/error.jl diff --git a/lib/event.jl b/lib/cl/event.jl similarity index 100% rename from lib/event.jl rename to lib/cl/event.jl diff --git a/lib/kernel.jl b/lib/cl/kernel.jl similarity index 96% rename from lib/kernel.jl rename to lib/cl/kernel.jl index fb3273a..b06c3de 100644 --- a/lib/kernel.jl +++ b/lib/cl/kernel.jl @@ -181,8 +181,13 @@ function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing; end function call(k::Kernel, args...; global_size=(1,), local_size=nothing, - global_work_offset=nothing, wait_on::Vector{Event}=Event[]) + global_work_offset=nothing, wait_on::Vector{Event}=Event[], + svm_pointers::Vector{Ptr{Cvoid}}=Ptr{Cvoid}[]) set_args!(k, args...) + if !isempty(svm_pointers) + clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_SVM_PTRS, + sizeof(svm_pointers), svm_pointers) + end enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on) end @@ -215,8 +220,8 @@ clcall(f::F, types::Tuple, args::Vararg{Any,N}; kwargs...) where {N,F} = clcall(f, _to_tuple_type(types), args...; kwargs...) function clcall(k::Kernel, types::Type{T}, args::Vararg{Any,N}; kwargs...) where {T,N} - call_closure = function (pointers::Vararg{Any,N}) - call(k, pointers...; kwargs...) + call_closure = function (converted_args::Vararg{Any,N}) + call(k, converted_args...; kwargs...) end convert_arguments(call_closure, types, args...) 
end diff --git a/lib/libopencl.jl b/lib/cl/libopencl.jl similarity index 100% rename from lib/libopencl.jl rename to lib/cl/libopencl.jl diff --git a/lib/memory.jl b/lib/cl/memory.jl similarity index 100% rename from lib/memory.jl rename to lib/cl/memory.jl diff --git a/lib/platform.jl b/lib/cl/platform.jl similarity index 100% rename from lib/platform.jl rename to lib/cl/platform.jl diff --git a/lib/program.jl b/lib/cl/program.jl similarity index 100% rename from lib/program.jl rename to lib/cl/program.jl diff --git a/lib/state.jl b/lib/cl/state.jl similarity index 100% rename from lib/state.jl rename to lib/cl/state.jl diff --git a/lib/svm.jl b/lib/cl/svm.jl similarity index 100% rename from lib/svm.jl rename to lib/cl/svm.jl diff --git a/lib/intrinsics/Project.toml b/lib/intrinsics/Project.toml new file mode 100644 index 0000000..6d9ff96 --- /dev/null +++ b/lib/intrinsics/Project.toml @@ -0,0 +1,9 @@ +name = "SPIRVIntrinsics" +uuid = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" +authors = ["Tim Besard "] +version = "0.1.0" + +[deps] +ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" diff --git a/lib/intrinsics/src/SPIRVIntrinsics.jl b/lib/intrinsics/src/SPIRVIntrinsics.jl new file mode 100644 index 0000000..848de1a --- /dev/null +++ b/lib/intrinsics/src/SPIRVIntrinsics.jl @@ -0,0 +1,25 @@ +module SPIRVIntrinsics + +using LLVM, LLVM.Interop +using Core: LLVMPtr + +import ExprTools + +import SpecialFunctions + +include("pointer.jl") +include("utils.jl") + +# OpenCL intrinsics +# +# we currently don't implement SPIR-V intrinsics directly, but rely on +# the SPIR-V to LLVM translator supporting OpenCL intrinsics +include("work_item.jl") +include("synchronization.jl") +include("memory.jl") +include("printf.jl") +include("math.jl") +include("integer.jl") +include("atomic.jl") + +end diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl new file mode 100644 index 0000000..144026f --- /dev/null +++ b/lib/intrinsics/src/atomic.jl @@ -0,0 +1,264 @@ +# Atomic Functions + +# TODO: support for 64-bit atomics via atom_cmpxchg (from cl_khr_int64_base_atomics) + +# "atomic operations on 32-bit signed, unsigned integers and single precision +# floating-point to locations in __global or __local memory" + +const atomic_integer_types = [UInt32, Int32] +# TODO: 64-bit atomics with ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS +# TODO: additional floating-point atomics with ZE_extension_float_atomics +const atomic_memory_types = [AS.Local, AS.Global] + + +# generically typed + +for gentype in atomic_integer_types, as in atomic_memory_types +@eval begin + +@device_function atomic_add!(p::LLVMPtr{$gentype,$as}, val::$gentype) = + @builtin_ccall("atomic_add", $gentype, + (LLVMPtr{$gentype,$as}, $gentype), p, val) + +@device_function atomic_sub!(p::LLVMPtr{$gentype,$as}, val::$gentype) = + @builtin_ccall("atomic_sub", $gentype, + (LLVMPtr{$gentype,$as}, $gentype), p, val) + +@device_function atomic_inc!(p::LLVMPtr{$gentype,$as}) = + @builtin_ccall("atomic_inc", $gentype, (LLVMPtr{$gentype,$as},), p) + +@device_function atomic_dec!(p::LLVMPtr{$gentype,$as}) = + @builtin_ccall("atomic_dec", $gentype, (LLVMPtr{$gentype,$as},), p) + +@device_function atomic_min!(p::LLVMPtr{$gentype,$as}, val::$gentype) = + @builtin_ccall("atomic_min", $gentype, + (LLVMPtr{$gentype,$as}, $gentype), p, val) + +@device_function atomic_max!(p::LLVMPtr{$gentype,$as}, val::$gentype) = + @builtin_ccall("atomic_max", $gentype, + 
(LLVMPtr{$gentype,$as}, $gentype), p, val)
+
+@device_function atomic_and!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
+    @builtin_ccall("atomic_and", $gentype,
+                   (LLVMPtr{$gentype,$as}, $gentype), p, val)
+
+@device_function atomic_or!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
+    @builtin_ccall("atomic_or", $gentype,
+                   (LLVMPtr{$gentype,$as}, $gentype), p, val)
+
+@device_function atomic_xor!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
+    @builtin_ccall("atomic_xor", $gentype,
+                   (LLVMPtr{$gentype,$as}, $gentype), p, val)
+
+@device_function atomic_xchg!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
+    @builtin_ccall("atomic_xchg", $gentype,
+                   (LLVMPtr{$gentype,$as}, $gentype), p, val)
+
+@device_function atomic_cmpxchg!(p::LLVMPtr{$gentype,$as}, cmp::$gentype, val::$gentype) =
+    @builtin_ccall("atomic_cmpxchg", $gentype,
+                   (LLVMPtr{$gentype,$as}, $gentype, $gentype), p, cmp, val)
+
+end
+end
+
+
+# specifically typed
+
+for as in atomic_memory_types
+@eval begin
+
+@device_function atomic_xchg!(p::LLVMPtr{Float32,$as}, val::Float32) =
+    @builtin_ccall("atomic_xchg", Float32, (LLVMPtr{Float32,$as}, Float32,), p, val)
+
+# XXX: why is only xchg supported on floats? isn't it safe for cmpxchg too,
+#      which should only perform bitwise comparisons?
+@device_function atomic_cmpxchg!(p::LLVMPtr{Float32,$as}, cmp::Float32, val::Float32) =
+    reinterpret(Float32, atomic_cmpxchg!(reinterpret(LLVMPtr{UInt32,$as}, p),
+                                         reinterpret(UInt32, cmp),
+                                         reinterpret(UInt32, val)))
+
+end
+end
+
+
+
+# documentation
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old + val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_add!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old - val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_sub!
+
+"""
+Swaps the old value stored at location `p` with new value given by `val`.
+Returns old value.
+"""
+atomic_xchg!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute (`old` + 1) and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_inc!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute (`old` - 1) and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_dec!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `(old == cmp) ? val : old` and store result at location pointed by `p`.
+The function returns `old`.
+"""
+atomic_cmpxchg!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `min(old, val)` and store minimum value at location pointed by `p`. The
+function returns `old`.
+"""
+atomic_min!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `max(old, val)` and store maximum value at location pointed by `p`. The
+function returns `old`.
+"""
+atomic_max!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old & val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_and!
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old | val` and store result at location pointed by `p`. The function
+returns `old`.
+"""
+atomic_or!
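+
+# NOTE: hypothetical usage sketch (not part of this file's API): inside a kernel, an
+#       Int32 histogram could be updated with the intrinsics above, e.g.
+#
+#           function histogram!(bins, data)
+#               i = get_global_id()
+#               @inbounds atomic_add!(pointer(bins, data[i]), Int32(1))
+#               return
+#           end
+#
+#       assuming `data` holds valid 1-based bin indices.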
+
+"""
+Read the 32-bit value (referred to as `old`) stored at location pointed by `p`.
+Compute `old ⊻ val` (bitwise xor) and store result at location pointed by `p`.
+The function returns `old`.
+"""
+atomic_xor!
+
+
+
+#
+# High-level interface
+#
+
+# prototype of a high-level interface for performing atomic operations on arrays
+#
+# this design could be generalized by having atomic {field,array}{set,ref} accessors, as
+# well as acquire/release operations to implement the fallback functionality where any
+# operation can be applied atomically.
+
+const inplace_ops = Dict(
+    :(+=) => :(+),
+    :(-=) => :(-),
+    :(*=) => :(*),
+    :(/=) => :(/),
+    :(÷=) => :(÷),
+    :(&=) => :(&),
+    :(|=) => :(|),
+    :(⊻=) => :(⊻),
+)
+
+struct AtomicError <: Exception
+    msg::AbstractString
+end
+
+Base.showerror(io::IO, err::AtomicError) =
+    print(io, "AtomicError: ", err.msg)
+
+"""
+    @atomic a[I] = op(a[I], val)
+    @atomic a[I] ...= val
+
+Atomically perform a sequence of operations that loads an array element `a[I]`, performs the
+operation `op` on that value and a second value `val`, and writes the result back to the
+array. This sequence can be written out as a regular assignment, in which case the same
+array element should be used in the left and right hand side of the assignment, or as an
+in-place application of a known operator. In both cases, the array reference should be pure
+and not induce any side-effects.
+
+!!! warning
+    This interface is experimental, and might change without warning. Use the lower-level
+    `atomic_...!` functions for a stable API.
+"""
+macro atomic(ex)
+    # decode assignment and call
+    if ex.head == :(=)
+        ref = ex.args[1]
+        rhs = ex.args[2]
+        Meta.isexpr(rhs, :call) || throw(AtomicError("right-hand side of an @atomic assignment should be a call"))
+        op = rhs.args[1]
+        if rhs.args[2] != ref
+            throw(AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side"))
+        end
+        val = rhs.args[3]
+    elseif haskey(inplace_ops, ex.head)
+        op = inplace_ops[ex.head]
+        ref = ex.args[1]
+        val = ex.args[2]
+    else
+        throw(AtomicError("unknown @atomic expression"))
+    end
+
+    # decode array expression
+    Meta.isexpr(ref, :ref) || throw(AtomicError("@atomic should be applied to an array reference expression"))
+    array = ref.args[1]
+    indices = Expr(:tuple, ref.args[2:end]...)
+
+    esc(quote
+        $atomic_arrayset($array, $indices, $op, $val)
+    end)
+end
+
+# FIXME: make this respect the indexing style
+@inline atomic_arrayset(A::AbstractArray{T}, Is::Tuple, op::Function, val) where {T} =
+    atomic_arrayset(A, Base._to_linear_index(A, Is...), op, convert(T, val))
+
+# native atomics
+for (op,impl) in [(+)      => atomic_add!,
+                  (-)      => atomic_sub!,
+                  (&)      => atomic_and!,
+                  (|)      => atomic_or!,
+                  (⊻)      => atomic_xor!,
+                  Base.max => atomic_max!,
+                  Base.min => atomic_min!]
+ @eval @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, ::typeof($op), + val::T) where {T <: Union{Int32,UInt32}} = + $impl(pointer(A, I), val) +end + +# fallback using compare-and-swap +function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T} + ptr = pointer(A, I) + old = Base.unsafe_load(ptr, 1) + while true + cmp = old + new = convert(T, op(old, val)) + old = atomic_cmpxchg!(ptr, cmp, new) + (old == cmp) && return new + end +end diff --git a/lib/intrinsics/src/integer.jl b/lib/intrinsics/src/integer.jl new file mode 100644 index 0000000..edbd4bb --- /dev/null +++ b/lib/intrinsics/src/integer.jl @@ -0,0 +1,53 @@ +# Integer Functions + +# TODO: vector types +const generic_integer_types = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64] + + +# generically typed + +for gentype in generic_integer_types +@eval begin + +@device_override Base.abs(x::$gentype) = @builtin_ccall("abs", $gentype, ($gentype,), x) +@device_function abs_diff(x::$gentype, y::$gentype) = @builtin_ccall("abs_diff", $gentype, ($gentype, $gentype), x, y) + +@device_function add_sat(x::$gentype, y::$gentype) = @builtin_ccall("add_sat", $gentype, ($gentype, $gentype), x, y) +@device_function hadd(x::$gentype, y::$gentype) = @builtin_ccall("hadd", $gentype, ($gentype, $gentype), x, y) +@device_function rhadd(x::$gentype, y::$gentype) = @builtin_ccall("rhadd", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.clamp(x::$gentype, minval::$gentype, maxval::$gentype) = @builtin_ccall("clamp", $gentype, ($gentype, $gentype, $gentype), x, minval, maxval) + +@device_function clz(x::$gentype) = @builtin_ccall("clz", $gentype, ($gentype,), x) +@device_function ctz(x::$gentype) = @builtin_ccall("ctz", $gentype, ($gentype,), x) + +@device_function mad_hi(a::$gentype, b::$gentype, c::$gentype) = @builtin_ccall("mad_hi", $gentype, ($gentype, $gentype, $gentype), a, b, c) +@device_function mad_sat(a::$gentype, b::$gentype, c::$gentype) = @builtin_ccall("mad_sat", $gentype, ($gentype, $gentype, $gentype), a, b, c) + +# XXX: these definitions introduce ambiguities +#@device_override Base.max(x::$gentype, y::$gentype) = @builtin_ccall("max", $gentype, ($gentype, $gentype), x, y) +#@device_override Base.min(x::$gentype, y::$gentype) = @builtin_ccall("min", $gentype, ($gentype, $gentype), x, y) + +@device_function mul_hi(x::$gentype, y::$gentype) = @builtin_ccall("mul_hi", $gentype, ($gentype, $gentype), x, y) + +@device_function rotate(v::$gentype, i::$gentype) = @builtin_ccall("rotate", $gentype, ($gentype, $gentype), v, i) + +@device_function sub_sat(x::$gentype, y::$gentype) = @builtin_ccall("sub_sat", $gentype, ($gentype, $gentype), x, y) + +@device_function popcount(x::$gentype) = @builtin_ccall("popcount", $gentype, ($gentype,), x) + +@device_function mad24(x::$gentype, y::$gentype, z::$gentype) = @builtin_ccall("mad24", $gentype, ($gentype, $gentype, $gentype), x, y, z) +@device_function mul24(x::$gentype, y::$gentype) = @builtin_ccall("mul24", $gentype, ($gentype, $gentype), x, y) + +end +end + + +# specifically typed + +@device_function upsample(hi::Cchar, lo::Cuchar) = @builtin_ccall("upsample", Cshort, (Cchar, Cuchar), hi, lo) +upsample(hi::Cuchar, lo::Cuchar) = @builtin_ccall("upsample", Cushort, (Cuchar, Cuchar), hi, lo) +upsample(hi::Cshort, lo::Cushort) = @builtin_ccall("upsample", Cint, (Cshort, Cushort), hi, lo) +upsample(hi::Cushort, lo::Cushort) = @builtin_ccall("upsample", Cuint, (Cushort, Cushort), hi, lo) +upsample(hi::Cint, lo::Cuint) = 
@builtin_ccall("upsample", Clong, (Cint, Cuint), hi, lo) +upsample(hi::Cuint, lo::Cuint) = @builtin_ccall("upsample", Culong, (Cuint, Cuint), hi, lo) diff --git a/lib/intrinsics/src/math.jl b/lib/intrinsics/src/math.jl new file mode 100644 index 0000000..1e4c2a9 --- /dev/null +++ b/lib/intrinsics/src/math.jl @@ -0,0 +1,214 @@ +# Math Functions + +# TODO: vector types +const generic_types = [Float32,Float64] +const generic_types_float = [Float32] +const generic_types_double = [Float64] + + +# generically typed + +for gentype in generic_types +@eval begin + +@device_override Base.acos(x::$gentype) = @builtin_ccall("acos", $gentype, ($gentype,), x) +@device_override Base.acosh(x::$gentype) = @builtin_ccall("acosh", $gentype, ($gentype,), x) +@device_function acospi(x::$gentype) = @builtin_ccall("acospi", $gentype, ($gentype,), x) + +@device_override Base.asin(x::$gentype) = @builtin_ccall("asin", $gentype, ($gentype,), x) +@device_override Base.asinh(x::$gentype) = @builtin_ccall("asinh", $gentype, ($gentype,), x) +@device_function asinpi(x::$gentype) = @builtin_ccall("asinpi", $gentype, ($gentype,), x) + +@device_override Base.atan(y_over_x::$gentype) = @builtin_ccall("atan", $gentype, ($gentype,), y_over_x) +@device_override Base.atan(y::$gentype, x::$gentype) = @builtin_ccall("atan2", $gentype, ($gentype, $gentype), y, x) +@device_override Base.atanh(x::$gentype) = @builtin_ccall("atanh", $gentype, ($gentype,), x) +@device_function atanpi(x::$gentype) = @builtin_ccall("atanpi", $gentype, ($gentype,), x) +@device_function atanpi(y::$gentype, x::$gentype) = @builtin_ccall("atan2pi", $gentype, ($gentype, $gentype), y, x) + +@device_override Base.cbrt(x::$gentype) = @builtin_ccall("cbrt", $gentype, ($gentype,), x) + +@device_override Base.ceil(x::$gentype) = @builtin_ccall("ceil", $gentype, ($gentype,), x) + +@device_override Base.copysign(x::$gentype, y::$gentype) = @builtin_ccall("copysign", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.cos(x::$gentype) = @builtin_ccall("cos", $gentype, ($gentype,), x) +@device_override Base.cosh(x::$gentype) = @builtin_ccall("cosh", $gentype, ($gentype,), x) +@device_function cospi(x::$gentype) = @builtin_ccall("cospi", $gentype, ($gentype,), x) + +@device_override SpecialFunctions.erfc(x::$gentype) = @builtin_ccall("erfc", $gentype, ($gentype,), x) +@device_override SpecialFunctions.erf(x::$gentype) = @builtin_ccall("erf", $gentype, ($gentype,), x) + +@device_override Base.exp(x::$gentype) = @builtin_ccall("exp", $gentype, ($gentype,), x) +@device_override Base.exp2(x::$gentype) = @builtin_ccall("exp2", $gentype, ($gentype,), x) +@device_override Base.exp10(x::$gentype) = @builtin_ccall("exp10", $gentype, ($gentype,), x) +@device_override Base.expm1(x::$gentype) = @builtin_ccall("expm1", $gentype, ($gentype,), x) + +@device_override Base.abs(x::$gentype) = @builtin_ccall("fabs", $gentype, ($gentype,), x) + +@device_function dim(x::$gentype, y::$gentype) = @builtin_ccall("fdim", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.floor(x::$gentype) = @builtin_ccall("floor", $gentype, ($gentype,), x) + +@device_override Base.fma(a::$gentype, b::$gentype, c::$gentype) = @builtin_ccall("fma", $gentype, ($gentype, $gentype, $gentype), a, b, c) + +@device_override Base.max(x::$gentype, y::$gentype) = @builtin_ccall("fmax", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.min(x::$gentype, y::$gentype) = @builtin_ccall("fmin", $gentype, ($gentype, $gentype), x, y) + +# NOTE: Julia's mod behaves differently than fmod 
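+#       e.g. `mod(-1.0, 3.0) == 2.0` in Julia, whereas C's `fmod(-1.0, 3.0) == -1.0`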
+#@device_override Base.mod(x::$gentype, y::$gentype) = @builtin_ccall("fmod", $gentype, ($gentype, $gentype), x, y) +# fract(x::$gentype, $gentype *iptr) = @builtin_ccall("fract", $gentype, ($gentype, $gentype *), x, iptr) + +@device_override Base.hypot(x::$gentype, y::$gentype) = @builtin_ccall("hypot", $gentype, ($gentype, $gentype), x, y) + +@device_override SpecialFunctions.loggamma(x::$gentype) = @builtin_ccall("lgamma", $gentype, ($gentype,), x) + +@device_override Base.log(x::$gentype) = @builtin_ccall("log", $gentype, ($gentype,), x) +@device_override Base.log2(x::$gentype) = @builtin_ccall("log2", $gentype, ($gentype,), x) +@device_override Base.log10(x::$gentype) = @builtin_ccall("log10", $gentype, ($gentype,), x) +@device_override Base.log1p(x::$gentype) = @builtin_ccall("log1p", $gentype, ($gentype,), x) +@device_function logb(x::$gentype) = @builtin_ccall("logb", $gentype, ($gentype,), x) + +@device_function mad(a::$gentype, b::$gentype, c::$gentype) = @builtin_ccall("mad", $gentype, ($gentype, $gentype, $gentype), a, b, c) + +@device_function maxmag(x::$gentype, y::$gentype) = @builtin_ccall("maxmag", $gentype, ($gentype, $gentype), x, y) +@device_function minmag(x::$gentype, y::$gentype) = @builtin_ccall("minmag", $gentype, ($gentype, $gentype), x, y) + +# modf(x::$gentype, $gentype *iptr) = @builtin_ccall("modf", $gentype, ($gentype, $gentype *), x, iptr) + +@device_function nextafter(x::$gentype, y::$gentype) = @builtin_ccall("nextafter", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.:(^)(x::$gentype, y::$gentype) = @builtin_ccall("pow", $gentype, ($gentype, $gentype), x, y) +@device_function powr(x::$gentype, y::$gentype) = @builtin_ccall("powr", $gentype, ($gentype, $gentype), x, y) + +@device_override Base.rem(x::$gentype, y::$gentype) = @builtin_ccall("remainder", $gentype, ($gentype, $gentype), x, y) + +@device_function rint(x::$gentype) = @builtin_ccall("rint", $gentype, ($gentype,), x) + +@device_override Base.round(x::$gentype) = @builtin_ccall("round", $gentype, ($gentype,), x) + +@device_function rsqrt(x::$gentype) = @builtin_ccall("rsqrt", $gentype, ($gentype,), x) + +@device_override Base.sin(x::$gentype) = @builtin_ccall("sin", $gentype, ($gentype,), x) +@device_override function Base.sincos(x::$gentype) + cosval = Ref{$gentype}() + sinval = GC.@preserve cosval begin + ptr = Base.unsafe_convert(Ptr{$gentype}, cosval) + llvm_ptr = reinterpret(LLVMPtr{$gentype, AS.Private}, ptr) + @builtin_ccall("sincos", $gentype, ($gentype, LLVMPtr{$gentype, AS.Private}), x, llvm_ptr) + end + return sinval, cosval[] +end +@device_override Base.sinh(x::$gentype) = @builtin_ccall("sinh", $gentype, ($gentype,), x) +@device_function sinpi(x::$gentype) = @builtin_ccall("sinpi", $gentype, ($gentype,), x) + +@device_override Base.sqrt(x::$gentype) = @builtin_ccall("sqrt", $gentype, ($gentype,), x) + +@device_override Base.tan(x::$gentype) = @builtin_ccall("tan", $gentype, ($gentype,), x) +@device_override Base.tanh(x::$gentype) = @builtin_ccall("tanh", $gentype, ($gentype,), x) +@device_function tanpi(x::$gentype) = @builtin_ccall("tanpi", $gentype, ($gentype,), x) + +@device_override SpecialFunctions.gamma(x::$gentype) = @builtin_ccall("tgamma", $gentype, ($gentype,), x) + +@device_override Base.trunc(x::$gentype) = @builtin_ccall("trunc", $gentype, ($gentype,), x) + +end +end + + +# generically typed -- only floats + +for gentypef in generic_types_float + +if gentypef !== Float32 +@eval begin +@device_override Base.max(x::$gentypef, y::Float32) = 
@builtin_ccall("fmax", $gentypef, ($gentypef, Float32), x, y) +@device_override Base.min(x::$gentypef, y::Float32) = @builtin_ccall("fmin", $gentypef, ($gentypef, Float32), x, y) +end +end + +end + + +# generically typed -- only doubles + +for gentyped in generic_types_double + +if gentyped !== Float64 +@eval begin +@device_override Base.min(x::$gentyped, y::Float64) = @builtin_ccall("fmin", $gentyped, ($gentyped, Float64), x, y) +@device_override Base.max(x::$gentyped, y::Float64) = @builtin_ccall("fmax", $gentyped, ($gentyped, Float64), x, y) +end +end + +end + + +# specifically typed + +# frexp(x::Float32{n}, Int32{n} *exp) = @builtin_ccall("frexp", Float32{n}, (Float32{n}, Int32{n} *), x, exp) +# frexp(x::Float32, Int32 *exp) = @builtin_ccall("frexp", Float32, (Float32, Int32 *), x, exp) +# frexp(x::Float64{n}, Int32{n} *exp) = @builtin_ccall("frexp", Float64{n}, (Float64{n}, Int32{n} *), x, exp) +# frexp(x::Float64, Int32 *exp) = @builtin_ccall("frexp", Float64, (Float64, Int32 *), x, exp) + +# ilogb(x::Float32{n}) = @builtin_ccall("ilogb", Int32{n}, (Float32{n},), x) +@device_function ilogb(x::Float32) = @builtin_ccall("ilogb", Int32, (Float32,), x) +# ilogb(x::Float64{n}) = @builtin_ccall("ilogb", Int32{n}, (Float64{n},), x) +@device_function ilogb(x::Float64) = @builtin_ccall("ilogb", Int32, (Float64,), x) + +# ldexp(x::Float32{n}, k::Int32{n}) = @builtin_ccall("ldexp", Float32{n}, (Float32{n}, Int32{n}), x, k) +# ldexp(x::Float32{n}, k::Int32) = @builtin_ccall("ldexp", Float32{n}, (Float32{n}, Int32), x, k) +@device_override Base.ldexp(x::Float32, k::Int32) = @builtin_ccall("ldexp", Float32, (Float32, Int32), x, k) +# ldexp(x::Float64{n}, k::Int32{n}) = @builtin_ccall("ldexp", Float64{n}, (Float64{n}, Int32{n}), x, k) +# ldexp(x::Float64{n}, k::Int32) = @builtin_ccall("ldexp", Float64{n}, (Float64{n}, Int32), x, k) +@device_override Base.ldexp(x::Float64, k::Int32) = @builtin_ccall("ldexp", Float64, (Float64, Int32), x, k) + +# lgamma_r(x::Float32{n}, Int32{n} *signp) = @builtin_ccall("lgamma_r", Float32{n}, (Float32{n}, Int32{n} *), x, signp) +# lgamma_r(x::Float32, Int32 *signp) = @builtin_ccall("lgamma_r", Float32, (Float32, Int32 *), x, signp) +# lgamma_r(x::Float64{n}, Int32{n} *signp) = @builtin_ccall("lgamma_r", Float64{n}, (Float64{n}, Int32{n} *), x, signp) +# Float64 lgamma_r(x::Float64, Int32 *signp) = @builtin_ccall("lgamma_r", Float64, (Float64, Int32 *), x, signp) + +# nan(nancode::uintn) = @builtin_ccall("nan", Float32{n}, (uintn,), nancode) +@device_function nan(nancode::UInt32) = @builtin_ccall("nan", Float32, (UInt32,), nancode) +# nan(nancode::UInt64{n}) = @builtin_ccall("nan", Float64{n}, (UInt64{n},), nancode) +@device_function nan(nancode::UInt64) = @builtin_ccall("nan", Float64, (UInt64,), nancode) + +# pown(x::Float32{n}, y::Int32{n}) = @builtin_ccall("pown", Float32{n}, (Float32{n}, Int32{n}), x, y) +@device_override Base.:(^)(x::Float32, y::Int32) = @builtin_ccall("pown", Float32, (Float32, Int32), x, y) +# pown(x::Float64{n}, y::Int32{n}) = @builtin_ccall("pown", Float64{n}, (Float64{n}, Int32{n}), x, y) +@device_override Base.:(^)(x::Float64, y::Int32) = @builtin_ccall("pown", Float64, (Float64, Int32), x, y) + +# remquo(x::Float32{n}, y::Float32{n}, Int32{n} *quo) = @builtin_ccall("remquo", Float32{n}, (Float32{n}, Float32{n}, Int32{n} *), x, y, quo) +# remquo(x::Float32, y::Float32, Int32 *quo) = @builtin_ccall("remquo", Float32, (Float32, Float32, Int32 *), x::Float32, y, quo) +# remquo(x::Float64{n}, y::Float64{n}, Int32{n} *quo) = 
@builtin_ccall("remquo", Float64{n}, (Float64{n}, Float64{n}, Int32{n} *), x, y, quo) +# remquo(x::Float64, y::Float64, Int32 *quo) = @builtin_ccall("remquo", Float64, (Float64, Float64, Int32 *), x, y, quo) + +# rootn(x::Float32{n}, y::Int32{n}) = @builtin_ccall("rootn", Float32{n}, (Float32{n}, Int32{n}), x, y) +@device_function rootn(x::Float32, y::Int32) = @builtin_ccall("rootn", Float32, (Float32, Int32), x, y) +# rootn(x::Float64{n}, y::Int32{n}) = @builtin_ccall("rootn", Float64{n}, (Float64{n}, Int32{n}), x, y) +# rootn(x::Float64, y::Int32) = @builtin_ccall("rootn", Float64{n}, (Float64, Int32), x, y) + + +# TODO: half and native + +function _mulhi(a::Int64, b::Int64) + shift = sizeof(a) * 4 + mask = typemax(UInt32) + a1, a2 = (a >> shift), a & mask + b1, b2 = (b >> shift), b & mask + a1b1, a1b2, a2b1 = a1*b1, a1*b2, a2*b1 + t1 = a1b2 + _mulhi(a2 % UInt32, b2 % UInt32) + t2 = a2b1 + (t1 & mask) + a1b1 + (t1 >> shift) + (t2 >> shift) +end +@static if isdefined(Base.MultiplicativeInverses, :_mul_high) + _mulhi(a::T, b::T) where {T<:Union{Signed, Unsigned}} = Base.MultiplicativeInverses._mul_high(a, b) + @device_override Base.MultiplicativeInverses._mul_high(a::Int64, b::Int64) = _mulhi(a, b) +else + _mulhi(a::T, b::T) where {T<:Union{Signed, Unsigned}} = ((widen(a)*b) >>> (sizeof(a)*8)) % T + @device_override function Base.div(a::Int64, b::Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64}) + x = _mulhi(a, b.multiplier) + x += (a*b.addmul) % Int64 + ifelse(abs(b.divisor) == 1, a*b.divisor, (signbit(x) + (x >> b.shift)) % Int64) + end +end diff --git a/lib/intrinsics/src/memory.jl b/lib/intrinsics/src/memory.jl new file mode 100644 index 0000000..d5ffe26 --- /dev/null +++ b/lib/intrinsics/src/memory.jl @@ -0,0 +1,38 @@ +# local memory + +# get a pointer to local memory, with known (static) or zero length (dynamic) +@generated function emit_localmemory(::Type{T}, ::Val{len}=Val(0)) where {T,len} + Context() do ctx + # XXX: as long as LLVMPtr is emitted as i8*, it doesn't make sense to type the GV + eltyp = convert(LLVMType, LLVM.Int8Type()) + T_ptr = convert(LLVMType, LLVMPtr{T,AS.Local}) + + # create a function + llvm_f, _ = create_function(T_ptr) + + # create the global variable + mod = LLVM.parent(llvm_f) + gv_typ = LLVM.ArrayType(eltyp, len * sizeof(T)) + gv = GlobalVariable(mod, gv_typ, "local_memory", AS.Local) + if len > 0 + linkage!(gv, LLVM.API.LLVMInternalLinkage) + initializer!(gv, null(gv_typ)) + end + # TODO: Make the alignment configurable + alignment!(gv, Base.datatype_alignment(T)) + + # generate IR + IRBuilder() do builder + entry = BasicBlock(llvm_f, "entry") + position!(builder, entry) + + ptr = gep!(builder, gv_typ, gv, [ConstantInt(0), ConstantInt(0)]) + + untyped_ptr = bitcast!(builder, ptr, T_ptr) + + ret!(builder, untyped_ptr) + end + + call_function(llvm_f, LLVMPtr{T,AS.Local}) + end +end diff --git a/lib/intrinsics/src/pointer.jl b/lib/intrinsics/src/pointer.jl new file mode 100644 index 0000000..228740e --- /dev/null +++ b/lib/intrinsics/src/pointer.jl @@ -0,0 +1,16 @@ +## adrspace aliases + +export AS + +module AS + +const Private = 0 +const Global = 1 +const Constant = 2 +const Local = 3 +const Generic = 4 +const Input = 5 +const Output = 6 +const Count = 7 + +end diff --git a/lib/intrinsics/src/printf.jl b/lib/intrinsics/src/printf.jl new file mode 100644 index 0000000..f7429f8 --- /dev/null +++ b/lib/intrinsics/src/printf.jl @@ -0,0 +1,205 @@ +# printf + +# Formatted Output (B.17) + +@generated function promote_c_argument(arg) + # > When a 
function with a variable-length argument list is called, the variable
+    # > arguments are passed using C's old ``default argument promotions.'' These say that
+    # > types char and short int are automatically promoted to int, and type float is
+    # > automatically promoted to double. Therefore, varargs functions will never receive
+    # > arguments of type char, short int, or float.
+
+    if arg == Cchar || arg == Cshort || arg == Cuchar || arg == Cushort
+        return :(Cint(arg))
+    elseif arg == Cfloat
+        return :(Cdouble(arg))
+    else
+        return :(arg)
+    end
+end
+
+macro printf(fmt::String, args...)
+    fmt_val = Val(Symbol(fmt))
+
+    return :(emit_printf($fmt_val, $(map(arg -> :(promote_c_argument($arg)), esc.(args))...)))
+end
+
+@generated function emit_printf(::Val{fmt}, argspec...) where {fmt}
+    arg_exprs = [:( argspec[$i] ) for i in 1:length(argspec)]
+    arg_types = [argspec...]
+
+    Context() do ctx
+        T_void = LLVM.VoidType()
+        T_int32 = LLVM.Int32Type()
+        T_pint8 = LLVM.PointerType(LLVM.Int8Type(), AS.Constant)
+
+        # create functions
+        param_types = LLVMType[convert(LLVMType, typ) for typ in arg_types]
+        llvm_f, _ = create_function(T_int32, param_types)
+        mod = LLVM.parent(llvm_f)
+
+        # generate IR
+        IRBuilder() do builder
+            entry = BasicBlock(llvm_f, "entry")
+            position!(builder, entry)
+
+            str = globalstring_ptr!(builder, String(fmt); addrspace=AS.Constant)
+
+            # invoke printf and return
+            printf_typ = LLVM.FunctionType(T_int32, [T_pint8]; vararg=true)
+            printf = LLVM.Function(mod, "printf", printf_typ)
+            push!(function_attributes(printf), EnumAttribute("nobuiltin"))
+            chars = call!(builder, printf_typ, printf, [str, parameters(llvm_f)...])
+
+            ret!(builder, chars)
+        end
+
+        call_function(llvm_f, Int32, Tuple{arg_types...}, arg_exprs...)
+    end
+end
+
+
+## print-like functionality
+
+# simple conversions, defining an expression and the resulting argument type. nothing fancy,
+# `@print` pretty directly maps to `@printf`; we should just support `write(::IO)`.
+const print_conversions = Dict(
+    Float32    => (x->:(Float64($x)), Float64),
+    Ptr{<:Any} => (x->:(convert(Ptr{Cvoid}, $x)), Ptr{Cvoid}),
+    Bool       => (x->:(Int32($x)), Int32),
+)
+
+# format specifiers
+const print_specifiers = Dict(
+    # integers
+    Int16  => "%hd",
+    Int32  => "%d",
+    Int64  => Sys.iswindows() ? "%lld" : "%ld",
+    UInt16 => "%hu",
+    UInt32 => "%u",
+    UInt64 => Sys.iswindows() ? "%llu" : "%lu",
+
+    # floating-point
+    Float64 => "%f",
+
+    # other
+    Cchar => "%c",
+    Ptr{Cvoid} => "%p",
+)
+
+@generated function _print(parts...)
+    fmt = ""
+    args = Expr[]
+
+    for i in 1:length(parts)
+        part = :(parts[$i])
+        T = parts[i]
+
+        # put literals directly in the format string
+        if T <: Val
+            fmt *= string(T.parameters[1])
+            continue
+        end
+
+        # try to convert arguments if they are not supported directly
+        if !haskey(print_specifiers, T)
+            for Tmatch in keys(print_conversions)
+                if T <: Tmatch
+                    conv, T = print_conversions[Tmatch]
+                    part = conv(part)
+                    break
+                end
+            end
+        end
+
+        # render the argument
+        if haskey(print_specifiers, T)
+            fmt *= print_specifiers[T]
+            push!(args, part)
+        elseif T <: String
+            @error("@print does not support non-literal strings")
+        else
+            @error("@print does not support values of type $T")
+        end
+    end
+
+    quote
+        Base.@_inline_meta
+        @printf($fmt, $(args...))
+    end
+end
+
+"""
+    @print(xs...)
+    @println(xs...)
+
+Print a textual representation of values `xs` to standard output from the GPU. The
+functionality builds on `@printf`, and is intended as a more user-friendly alternative to
+that API. However, that also means there's only limited support for argument types, handling
+16/32/64-bit signed and unsigned integers, 32- and 64-bit floating-point numbers, `Cchar`s
+and pointers. For more complex output, use `@printf` directly.
+
+Limited string interpolation is also possible:
+
+```julia
+    @print("Hello, World ", 42, "\\n")
+    @print "Hello, World \$(42)\\n"
+```
+"""
+macro print(parts...)
+    args = Union{Val,Expr,Symbol}[]
+
+    parts = [parts...]
+    while true
+        isempty(parts) && break
+
+        part = popfirst!(parts)
+
+        # handle string interpolation
+        if isa(part, Expr) && part.head == :string
+            parts = vcat(part.args, parts)
+            continue
+        end
+
+        # expose literals to the generator by using Val types
+        if isbits(part) # literal numbers, etc
+            push!(args, Val(part))
+        elseif isa(part, QuoteNode) # literal symbols
+            push!(args, Val(part.value))
+        elseif isa(part, String) # literal strings need to be interned
+            push!(args, Val(Symbol(part)))
+        else # actual values that will be passed to printf
+            push!(args, part)
+        end
+    end
+
+    quote
+        _print($(map(esc, args)...))
+    end
+end
+
+@doc (@doc @print) ->
+macro println(parts...)
+    esc(quote
+        SPIRVIntrinsics.@print($(parts...), "\n")
+    end)
+end
+
+"""
+    @show(ex)
+
+GPU analog of `Base.@show`. It comes with the same type restrictions as [`@printf`](@ref).
+
+```julia
+@show get_global_id()
+```
+"""
+macro show(exs...)
+    blk = Expr(:block)
+    for ex in exs
+        push!(blk.args, :(SPIRVIntrinsics.@println($(sprint(Base.show_unquoted,ex)*" = "),
+                                                   begin local value = $(esc(ex)) end)))
+    end
+    isempty(exs) || push!(blk.args, :value)
+    blk
+end
diff --git a/lib/intrinsics/src/synchronization.jl b/lib/intrinsics/src/synchronization.jl
new file mode 100644
index 0000000..0d6c413
--- /dev/null
+++ b/lib/intrinsics/src/synchronization.jl
@@ -0,0 +1,21 @@
+# Synchronization Functions
+
+export barrier
+
+const cl_mem_fence_flags = UInt32
+const CLK_LOCAL_MEM_FENCE = cl_mem_fence_flags(1)
+const CLK_GLOBAL_MEM_FENCE = cl_mem_fence_flags(2)
+
+#barrier(flags=0) = @builtin_ccall("barrier", Cvoid, (UInt32,), flags)
+@device_function barrier(flags=0) = Base.llvmcall(("""
+    declare void @_Z7barrierj(i32) #0
+    define void @entry(i32 %0) #1 {
+        call void @_Z7barrierj(i32 %0)
+        ret void
+    }
+    attributes #0 = { convergent }
+    attributes #1 = { alwaysinline }
+    """, "entry"),
+    Cvoid, Tuple{Int32}, convert(Int32, flags))
+push!(opencl_builtins, "_Z7barrierj")
+# TODO: add support for attributes to @builtin_ccall/LLVM.@typed_ccall
diff --git a/lib/intrinsics/src/utils.jl b/lib/intrinsics/src/utils.jl
new file mode 100644
index 0000000..0c69b80
--- /dev/null
+++ b/lib/intrinsics/src/utils.jl
@@ -0,0 +1,109 @@
+const opencl_builtins = String["printf"]
+
+# OpenCL functions need to be mangled according to the C++ Itanium spec. We implement a very
+# limited version of that spec here, just enough to support OpenCL built-ins.
+#
+# This macro also keeps track of called builtins, generating `ccall("extern...", llvmcall)`
+# expressions for them (so that we can exclude them during IR verification).
+macro builtin_ccall(name, ret, argtypes, args...)
+    @assert Meta.isexpr(argtypes, :tuple)
+    argtypes = argtypes.args
+
+    function mangle(T::Type)
+        if T == Cint
+            "i"
+        elseif T == Cuint
+            "j"
+        elseif T == Clong
+            "l"
+        elseif T == Culong
+            "m"
+        elseif T == Clonglong
+            "x"
+        elseif T == Culonglong
+            "y"
+        elseif T == Cshort
+            "s"
+        elseif T == Cushort
+            "t"
+        elseif T == Cchar
+            "c"
+        elseif T == Cuchar
+            "h"
+        elseif T == Cfloat
+            "f"
+        elseif T == Cdouble
+            "d"
+        elseif T <: LLVMPtr
+            elt, as = T.parameters
+
+            # mangle address space
+            ASstr = if as == AS.Global
+                "CLglobal"
+            #elseif as == AS.Global_device
+            #    "CLdevice"
+            #elseif as == AS.Global_host
+            #    "CLhost"
+            elseif as == AS.Local
+                "CLlocal"
+            elseif as == AS.Constant
+                "CLconstant"
+            elseif as == AS.Private
+                "CLprivate"
+            elseif as == AS.Generic
+                "CLgeneric"
+            else
+                error("Unknown address space $as")
+            end
+
+            # encode as vendor qualifier
+            ASstr = "U" * string(length(ASstr)) * ASstr
+
+            # XXX: where does the V come from?
+            "P" * ASstr * "V" * mangle(elt)
+        else
+            error("Unknown type $T")
+        end
+    end
+
+    # C++-style mangling; very limited, just enough to support these intrinsics
+    # TODO: generalize for use with other intrinsics? do we need to mangle those?
+    mangled = "_Z$(length(name))$name"
+    for t in argtypes
+        # with `@eval @builtin_ccall`, we get actual types in the AST, otherwise symbols
+        t = (isa(t, Symbol) || isa(t, Expr)) ? eval(t) : t
+        mangled *= mangle(t)
+    end
+
+    push!(opencl_builtins, mangled)
+    esc(quote
+        @typed_ccall($mangled, llvmcall, $ret, ($(argtypes...),), $(args...))
+    end)
+end
+
+
+## device overrides
+
+# local method table for device functions
+Base.Experimental.@MethodTable(method_table)
+
+macro device_override(ex)
+    esc(quote
+        Base.Experimental.@overlay($method_table, $ex)
+    end)
+end
+
+macro device_function(ex)
+    ex = macroexpand(__module__, ex)
+    def = ExprTools.splitdef(ex)
+
+    # generate a function that errors
+    def[:body] = quote
+        error("This function is not intended for use on the CPU")
+    end
+
+    esc(quote
+        $(ExprTools.combinedef(def))
+        @device_override $ex
+    end)
+end
diff --git a/lib/intrinsics/src/work_item.jl b/lib/intrinsics/src/work_item.jl
new file mode 100644
index 0000000..ee3cf73
--- /dev/null
+++ b/lib/intrinsics/src/work_item.jl
@@ -0,0 +1,28 @@
+# Work-Item Functions
+
+export get_work_dim,
+       get_global_size, get_global_id,
+       get_local_size, get_enqueued_local_size, get_local_id,
+       get_num_groups, get_group_id,
+       get_global_offset,
+       get_global_linear_id, get_local_linear_id
+
+# NOTE: these functions now unsafely truncate to Int to avoid top bit checks.
+#       we should probably use range metadata instead.
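+#
+# NOTE: hypothetical usage sketch: these wrappers use 1-based indices, unlike OpenCL C,
+#       so a typical bounds-checked kernel body looks like
+#
+#           i = get_global_id(1)        # equals OpenCL C's get_global_id(0) + 1
+#           i > length(a) && return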
+ +@device_function get_work_dim() = @builtin_ccall("get_work_dim", UInt32, ()) % Int + +@device_function get_global_size(dimindx::Integer=1) = @builtin_ccall("get_global_size", UInt, (UInt32,), dimindx-1) % Int +@device_function get_global_id(dimindx::Integer=1) = @builtin_ccall("get_global_id", UInt, (UInt32,), dimindx-1) % Int + 1 + +@device_function get_local_size(dimindx::Integer=1) = @builtin_ccall("get_local_size", UInt, (UInt32,), dimindx-1) % Int +@device_function get_enqueued_local_size(dimindx::Integer=1) = @builtin_ccall("get_enqueued_local_size", UInt, (UInt32,), dimindx-1) % Int +@device_function get_local_id(dimindx::Integer=1) = @builtin_ccall("get_local_id", UInt, (UInt32,), dimindx-1) % Int + 1 + +@device_function get_num_groups(dimindx::Integer=1) = @builtin_ccall("get_num_groups", UInt, (UInt32,), dimindx-1) % Int +@device_function get_group_id(dimindx::Integer=1) = @builtin_ccall("get_group_id", UInt, (UInt32,), dimindx-1) % Int + 1 + +@device_function get_global_offset(dimindx::Integer=1) = @builtin_ccall("get_global_offset", UInt, (UInt32,), dimindx-1) % Int + 1 + +@device_function get_global_linear_id() = @builtin_ccall("get_global_linear_id", UInt, ()) % Int + 1 +@device_function get_local_linear_id() = @builtin_ccall("get_local_linear_id", UInt, ()) % Int + 1 diff --git a/src/OpenCL.jl b/src/OpenCL.jl index e69faed..70e19b9 100644 --- a/src/OpenCL.jl +++ b/src/OpenCL.jl @@ -1,12 +1,45 @@ module OpenCL +using GPUCompiler +using LLVM, LLVM.Interop +using SPIRV_LLVM_Translator_unified_jll +using Adapt using Reexport +using Core: LLVMPtr + # library wrappers -include("../lib/CL.jl") +include("../lib/cl/CL.jl") @reexport using .cl export cl +# device functionality +include("device/runtime.jl") +import SPIRVIntrinsics +let + # re-export functionality from SPIRVIntrinsics + for name in names(SPIRVIntrinsics) + name == :SPIRVIntrinsics && continue + @eval export $name + end + + # import all the others so that the user can refer to them through the OpenCL module + for name in names(SPIRVIntrinsics; all=true) + # bring all the names of this module in scope + name in (:SPIRVIntrinsics, :eval, :include) && continue + startswith(string(name), "#") && continue + @eval begin + using .SPIRVIntrinsics: $name + end + end +end +include("device/array.jl") + +# compiler implementation +include("compiler/compilation.jl") +include("compiler/execution.jl") +include("compiler/reflection.jl") + # high-level functionality include("util.jl") include("array.jl") diff --git a/src/array.jl b/src/array.jl index bd1a73b..5be7542 100644 --- a/src/array.jl +++ b/src/array.jl @@ -93,6 +93,11 @@ function Base.cconvert(::Type{Ptr{T}}, A::CLArray{T}) where T buffer(A) end +function Adapt.adapt_storage(to::KernelAdaptor, xs::CLArray{T,N}) where {T,N} + ptr = adapt(to, buffer(xs)) + CLDeviceArray{T,N,AS.Global}(size(xs), reinterpret(LLVMPtr{T,AS.Global}, ptr)) +end + ## utilities diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl new file mode 100644 index 0000000..423439c --- /dev/null +++ b/src/compiler/compilation.jl @@ -0,0 +1,82 @@ +## gpucompiler interface + +struct OpenCLCompilerParams <: AbstractCompilerParams end +const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams} +const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams} + +GPUCompiler.runtime_module(::CompilerJob{<:Any,OpenCLCompilerParams}) = OpenCL + +GPUCompiler.method_table(::OpenCLCompilerJob) = method_table + +# filter out OpenCL built-ins +# TODO: eagerly 
lower these using the translator API +GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) = + invoke(GPUCompiler.isintrinsic, + Tuple{CompilerJob{SPIRVCompilerTarget}, typeof(fn)}, + job, fn) || + in(fn, opencl_builtins) + + +## compiler implementation (cache, configure, compile, and link) + +# cache of compilation caches, per context +const _compiler_caches = Dict{cl.Context, Dict{Any, Any}}() +function compiler_cache(ctx::cl.Context) + cache = get(_compiler_caches, ctx, nothing) + if cache === nothing + cache = Dict{Any, Any}() + _compiler_caches[ctx] = cache + end + return cache +end + +# cache of compiler configurations, per device (but additionally configurable via kwargs) +const _toolchain = Ref{Any}() +const _compiler_configs = Dict{UInt, OpenCLCompilerConfig}() +function compiler_config(dev::cl.Device; kwargs...) + h = hash(dev, hash(kwargs)) + config = get(_compiler_configs, h, nothing) + if config === nothing + config = _compiler_config(dev; kwargs...) + _compiler_configs[h] = config + end + return config +end +@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, kwargs...) + supports_fp16 = "cl_khr_fp16" in dev.extensions + supports_fp64 = "cl_khr_fp64" in dev.extensions + + # create GPUCompiler objects + target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...) + params = OpenCLCompilerParams() + CompilerConfig(target, params; kernel, name, always_inline) +end + +# compile to executable machine code +function compile(@nospecialize(job::CompilerJob)) + # TODO: this creates a context; cache those. + obj, meta = JuliaContext() do ctx + GPUCompiler.compile(:obj, job) + end + + (obj, entry=LLVM.name(meta.entry)) +end + +# link into an executable kernel +function link(@nospecialize(job::CompilerJob), compiled) + prog = if "cl_khr_il_program" in cl.device().extensions + cl.Program(; il=compiled.obj) + else + error("Your device does not support SPIR-V, which is currently required for native execution.") + # XXX: kpet/spirv2clc#87, caused by KhronosGroup/SPIRV-LLVM-Translator#2029 + source = mktempdir() do dir + il = joinpath(dir, "kernel.spv") + write(il, compiled.obj) + cmd = `spirv2clc $il` + read(cmd, String) + end + cl.Program(; source) + end + cl.build!(prog) + cl.Kernel(prog, compiled.entry) +end diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl new file mode 100644 index 0000000..235f70e --- /dev/null +++ b/src/compiler/execution.jl @@ -0,0 +1,211 @@ +export @opencl, clfunction, clconvert + + +## high-level @opencl interface + +const MACRO_KWARGS = [:launch] +const COMPILER_KWARGS = [:kernel, :name, :always_inline] +const LAUNCH_KWARGS = [:global_size, :local_size, :queue] + +macro opencl(ex...) 
+    call = ex[end]
+    kwargs = map(ex[1:end-1]) do kwarg
+        if kwarg isa Symbol
+            :($kwarg = $kwarg)
+        elseif Meta.isexpr(kwarg, :(=))
+            kwarg
+        else
+            throw(ArgumentError("Invalid keyword argument '$kwarg'"))
+        end
+    end
+
+    # destructure the kernel call
+    Meta.isexpr(call, :call) || throw(ArgumentError("final argument to @opencl should be a function call"))
+    f = call.args[1]
+    args = call.args[2:end]
+
+    code = quote end
+    vars, var_exprs = assign_args!(code, args)
+
+    # group keyword arguments
+    macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs =
+        split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS)
+    if !isempty(other_kwargs)
+        key,val = first(other_kwargs).args
+        throw(ArgumentError("Unsupported keyword argument '$key'"))
+    end
+
+    # handle keyword arguments that influence the macro's behavior
+    launch = true
+    for kwarg in macro_kwargs
+        key,val = kwarg.args
+        if key == :launch
+            isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @opencl should be a constant value"))
+            launch = val::Bool
+        else
+            throw(ArgumentError("Unsupported keyword argument '$key'"))
+        end
+    end
+    if !launch && !isempty(call_kwargs)
+        error("@opencl with launch=false does not support launch-time keyword arguments; use them when calling the kernel")
+    end
+
+    # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5)
+    #        we esc() the whole thing now, necessitating gensyms...
+    @gensym f_var kernel_f kernel_args kernel_tt kernel
+
+    # convert the arguments, call the compiler and launch the kernel
+    # while keeping the original arguments alive
+    push!(code.args,
+        quote
+            $f_var = $f
+            GC.@preserve $(vars...) $f_var begin
+                $kernel_f = $clconvert($f_var)
+                $kernel_args = map($clconvert, ($(var_exprs...),))
+                $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...}
+                $kernel = $clfunction($kernel_f, $kernel_tt; $(compiler_kwargs...))
+                if $launch
+                    $kernel($(var_exprs...); $(call_kwargs...))
+                end
+                $kernel
+            end
+        end)
+
+    return esc(quote
+        let
+            $code
+        end
+    end)
+end
+
+
+## argument conversion
+
+struct KernelAdaptor
+    svm_pointers::Vector{Ptr{Cvoid}}
+end
+
+# assume directly-passed pointers are SVM pointers
+function Adapt.adapt_storage(to::KernelAdaptor, ptr::Ptr{T}) where {T}
+    push!(to.svm_pointers, ptr)
+    return ptr
+end
+
+# convert SVM buffers to their GPU address
+function Adapt.adapt_storage(to::KernelAdaptor, buf::cl.SVMBuffer)
+    ptr = pointer(buf)
+    push!(to.svm_pointers, ptr)
+    return ptr
+end
+
+# Base.RefValue isn't GPU compatible, so provide a compatible alternative
+# TODO: port improvements from CUDA.jl
+struct CLRefValue{T} <: Ref{T}
+    x::T
+end
+Base.getindex(r::CLRefValue) = r.x
+Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue) = CLRefValue(adapt(to, r[]))
+
+# broadcast sometimes passes a ref(type), resulting in a GPU-incompatible DataType box.
+# avoid that by using a special kind of ref that knows about the boxed type.
+struct CLRefType{T} <: Ref{DataType} end
+Base.getindex(r::CLRefType{T}) where T = T
+Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue{<:Union{DataType,Type}}) =
+    CLRefType{r[]}()
+
+# case where type is the function being broadcasted
+Adapt.adapt_structure(to::KernelAdaptor,
+                      bc::Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} =
+    Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes)
+
+"""
+    clconvert(x, [pointers])
+
+This function is called for every argument to be passed to a kernel, allowing it to be
+converted to a GPU-friendly format. By default, the function does nothing and returns the
+input object `x` as-is.
+
+Do not add methods to this function, but instead extend the underlying Adapt.jl package and
+register methods for the `OpenCL.KernelAdaptor` type.
+
+The `pointers` argument is used to collect pointers to indirect SVM buffers, which need to
+be registered with OpenCL before invoking the kernel.
+"""
+function clconvert(arg, pointers::Vector{Ptr{Cvoid}}=Ptr{Cvoid}[])
+    adapt(KernelAdaptor(pointers), arg)
+end
+
+
+
+## abstract kernel functionality
+
+abstract type AbstractKernel{F,TT} end
+
+@inline @generated function (kernel::AbstractKernel{F,TT})(args...;
+                                                           call_kwargs...) where {F,TT}
+    sig = Tuple{F, TT.parameters...}    # Base.signature_type with a function type
+    args = (:(kernel.f), (:( clconvert(args[$i], svm_pointers) ) for i in 1:length(args))...)
+
+    # filter out ghost arguments that shouldn't be passed
+    predicate = dt -> isghosttype(dt) || Core.Compiler.isconstType(dt)
+    to_pass = map(!predicate, sig.parameters)
+    call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]]
+    call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]]
+
+    # replace non-isbits arguments (they should be unused, or compilation would have failed)
+    for (i,dt) in enumerate(call_t)
+        if !isbitstype(dt)
+            call_t[i] = Ptr{Any}
+            call_args[i] = :C_NULL
+        end
+    end
+
+    # finalize types
+    call_tt = Base.to_tuple_type(call_t)
+
+    quote
+        svm_pointers = Ptr{Cvoid}[]
+        clcall(kernel.fun, $call_tt, $(call_args...); svm_pointers, call_kwargs...)
+    end
+end
+
+
+
+## host-side kernels
+
+struct HostKernel{F,TT} <: AbstractKernel{F,TT}
+    f::F
+    fun::cl.Kernel
+end
+
+
+## host-side API
+
+const clfunction_lock = ReentrantLock()
+
+function clfunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
+    ctx = cl.context()
+    dev = cl.device()
+
+    Base.@lock clfunction_lock begin
+        # compile the function
+        cache = compiler_cache(ctx)
+        source = methodinstance(F, tt)
+        config = compiler_config(dev; kwargs...)::OpenCLCompilerConfig
+        fun = GPUCompiler.cached_compilation(cache, source, config, compile, link)
+
+        # create a callable object that captures the function instance. we don't need to think
+        # about world age here, as GPUCompiler already does and will return a different object
+        h = hash(fun, hash(f, hash(tt)))
+        kernel = get(_kernel_instances, h, nothing)
+        if kernel === nothing
+            # create the kernel state object
+            kernel = HostKernel{F,tt}(f, fun)
+            _kernel_instances[h] = kernel
+        end
+        return kernel::HostKernel{F,tt}
+    end
+end
+
+# cache of kernel instances
+const _kernel_instances = Dict{UInt, Any}()
diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl
new file mode 100644
index 0000000..a8bb714
--- /dev/null
+++ b/src/compiler/reflection.jl
@@ -0,0 +1,74 @@
+# code reflection entry-points
+
+#
+# code_* replacements
+#
+
+# function to split off certain kwargs for selective forwarding, at run time.
+# `@opencl` does something similar at parse time, using `GPUCompiler.split_kwargs`.
+function split_kwargs_runtime(kwargs, wanted::Vector{Symbol})
+    remaining = Dict{Symbol, Any}()
+    extracted = Dict{Symbol, Any}()
+    for (key, value) in kwargs
+        if key in wanted
+            extracted[key] = value
+        else
+            remaining[key] = value
+        end
+    end
+    return extracted, remaining
+end
+
+for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
+    # only code_typed doesn't take an io argument
+    args = method == :code_typed ? (:job,) : (:io, :job)
diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl
new file mode 100644
index 0000000..a8bb714
--- /dev/null
+++ b/src/compiler/reflection.jl
@@ -0,0 +1,74 @@
+# code reflection entry-points
+
+#
+# code_* replacements
+#
+
+# function to split off certain kwargs for selective forwarding, at run time.
+# `@opencl` does something similar at parse time, using `GPUCompiler.split_kwargs`.
+function split_kwargs_runtime(kwargs, wanted::Vector{Symbol})
+    remaining = Dict{Symbol, Any}()
+    extracted = Dict{Symbol, Any}()
+    for (key, value) in kwargs
+        if key in wanted
+            extracted[key] = value
+        else
+            remaining[key] = value
+        end
+    end
+    return extracted, remaining
+end
+
+for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
+    # only code_typed doesn't take an io argument
+    args = method == :code_typed ? (:job,) : (:io, :job)
+
+    @eval begin
+        function $method(io::IO, @nospecialize(func), @nospecialize(types);
+                         kernel::Bool=false, kwargs...)
+            compiler_kwargs, kwargs = split_kwargs_runtime(kwargs, COMPILER_KWARGS)
+            source = methodinstance(typeof(func), Base.to_tuple_type(types))
+            config = compiler_config(cl.device(); kernel, compiler_kwargs...)
+            job = CompilerJob(source, config)
+            GPUCompiler.$method($(args...); kwargs...)
+        end
+        $method(@nospecialize(func), @nospecialize(types); kwargs...) =
+            $method(stdout, func, types; kwargs...)
+    end
+end
+
+
+
+#
+# @device_code_* functions
+#
+
+export @device_code_lowered, @device_code_typed, @device_code_warntype, @device_code_llvm,
+       @device_code_native, @device_code
+
+# forward to GPUCompiler
+@eval $(Symbol("@device_code_lowered")) = $(getfield(GPUCompiler, Symbol("@device_code_lowered")))
+@eval $(Symbol("@device_code_typed")) = $(getfield(GPUCompiler, Symbol("@device_code_typed")))
+@eval $(Symbol("@device_code_warntype")) = $(getfield(GPUCompiler, Symbol("@device_code_warntype")))
+@eval $(Symbol("@device_code_llvm")) = $(getfield(GPUCompiler, Symbol("@device_code_llvm")))
+@eval $(Symbol("@device_code_native")) = $(getfield(GPUCompiler, Symbol("@device_code_native")))
+@eval $(Symbol("@device_code")) = $(getfield(GPUCompiler, Symbol("@device_code")))
+
+
+#
+# other
+#
+
+"""
+    OpenCL.return_type(f, tt) -> r::Type
+
+Return a type `r` such that `f(args...)::r` where `args::tt`.
+"""
+function return_type(@nospecialize(func), @nospecialize(tt))
+    source = methodinstance(typeof(func), tt)
+    config = compiler_config(cl.device())
+    job = CompilerJob(source, config)
+    interp = GPUCompiler.get_interpreter(job)
+    sig = Base.signature_type(func, tt)
+    Core.Compiler.return_type(interp, sig)
+end
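+
+# For example (consistent with the tests in test/execution.jl):
+#
+#     OpenCL.return_type(identity, Tuple{Int})    # === Int
+#     OpenCL.return_type(sin, Tuple{Float32})     # === Float32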
diff --git a/src/device/array.jl b/src/device/array.jl
new file mode 100644
index 0000000..24ae808
--- /dev/null
+++ b/src/device/array.jl
@@ -0,0 +1,239 @@
+# Contiguous on-device arrays
+
+export CLDeviceArray, CLDeviceVector, CLDeviceMatrix
+
+
+## construction
+
+# NOTE: we can't support the typical `tuple or series of integers`-style construction,
+#       because we currently require a trailing pointer argument.
+
+struct CLDeviceArray{T,N,A} <: DenseArray{T,N}
+    ptr::LLVMPtr{T,A}
+    maxsize::Int
+
+    dims::Dims{N}
+    len::Int
+
+    # inner constructors, fully parameterized, exact types (i.e. Int, not <:Integer)
+    # TODO: deprecate; put `ptr` first like oneArray
+    CLDeviceArray{T,N,A}(dims::Dims{N}, ptr::LLVMPtr{T,A},
+                         maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} =
+        new(ptr, maxsize, dims, prod(dims))
+end
+
+const CLDeviceVector = CLDeviceArray{T,1,A} where {T,A}
+const CLDeviceMatrix = CLDeviceArray{T,2,A} where {T,A}
+
+# outer constructors, non-parameterized
+CLDeviceArray(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(dims, p)
+CLDeviceArray(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((len,), p)
+
+# outer constructors, partially parameterized
+CLDeviceArray{T}(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(dims, p)
+CLDeviceArray{T}(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((len,), p)
+CLDeviceArray{T,N}(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(dims, p)
+CLDeviceVector{T}(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((len,), p)
+
+# outer constructors, fully parameterized
+CLDeviceArray{T,N,A}(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(Int.(dims), p)
+CLDeviceVector{T,A}(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((Int(len),), p)
+
+
+## array interface
+
+Base.elsize(::Type{<:CLDeviceArray{T}}) where {T} = sizeof(T)
+
+Base.size(g::CLDeviceArray) = g.dims
+Base.sizeof(x::CLDeviceArray) = Base.elsize(x) * length(x)
+
+# we store the array length too; computing prod(size) is expensive
+Base.length(g::CLDeviceArray) = g.len
+
+Base.pointer(x::CLDeviceArray{T,<:Any,A}) where {T,A} = Base.unsafe_convert(LLVMPtr{T,A}, x)
+@inline function Base.pointer(x::CLDeviceArray{T,<:Any,A}, i::Integer) where {T,A}
+    Base.unsafe_convert(LLVMPtr{T,A}, x) + Base._memory_offset(x, i)
+end
+
+typetagdata(a::CLDeviceArray{<:Any,<:Any,A}, i=1) where {A} =
+    reinterpret(LLVMPtr{UInt8,A}, a.ptr + a.maxsize) + i - one(i)
+
+
+## conversions
+
+Base.unsafe_convert(::Type{LLVMPtr{T,A}}, x::CLDeviceArray{T,<:Any,A}) where {T,A} =
+    x.ptr
+
+
+## indexing intrinsics
+
+# TODO: how are allocations aligned by the OpenCL API? keep track of this
+#       because it enables optimizations like Load Store Vectorization
+#       (cf. shared memory and its wider-than-datatype alignment)
+
+@generated function alignment(::CLDeviceArray{T}) where {T}
+    if Base.isbitsunion(T)
+        _, sz, al = Base.uniontype_layout(T)
+        al
+    else
+        Base.datatype_alignment(T)
+    end
+end
+
+@device_function @inline function arrayref(A::CLDeviceArray{T}, index::Integer) where {T}
+    @boundscheck checkbounds(A, index)
+    if isbitstype(T)
+        arrayref_bits(A, index)
+    else #if isbitsunion(T)
+        arrayref_union(A, index)
+    end
+end
+
+@inline function arrayref_bits(A::CLDeviceArray{T}, index::Integer) where {T}
+    align = alignment(A)
+    unsafe_load(pointer(A), index, Val(align))
+end
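+
+# For an element type like Union{Int32,Float32}, the branch chain generated below behaves
+# roughly like this (illustrative sketch, not the literal expansion):
+#
+#     if selector == 0
+#         unsafe_load(reinterpret(LLVMPtr{Int32,AS}, data_ptr), 1, Val(align))
+#     elseif selector == 1
+#         unsafe_load(reinterpret(LLVMPtr{Float32,AS}, data_ptr), 1, Val(align))
+#     else
+#         unreachable
+#     end
+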
+@inline @generated function arrayref_union(A::CLDeviceArray{T,<:Any,AS}, index::Integer) where {T,AS}
+    typs = Base.uniontypes(T)
+
+    # generate code that conditionally loads a value based on the selector value.
+    # lacking noreturn, we return T to avoid inference thinking this can return Nothing.
+    ex = :(Base.llvmcall("unreachable", $T, Tuple{}))
+    for (sel, typ) in Iterators.reverse(enumerate(typs))
+        ex = quote
+            if selector == $(sel-1)
+                ptr = reinterpret(LLVMPtr{$typ,AS}, data_ptr)
+                unsafe_load(ptr, 1, Val(align))
+            else
+                $ex
+            end
+        end
+    end
+
+    quote
+        selector_ptr = typetagdata(A, index)
+        selector = unsafe_load(selector_ptr)
+
+        align = alignment(A)
+        data_ptr = pointer(A, index)
+
+        return $ex
+    end
+end
+
+@device_function @inline function arrayset(A::CLDeviceArray{T}, x::T, index::Integer) where {T}
+    @boundscheck checkbounds(A, index)
+    if isbitstype(T)
+        arrayset_bits(A, x, index)
+    else #if isbitsunion(T)
+        arrayset_union(A, x, index)
+    end
+    return A
+end
+
+@inline function arrayset_bits(A::CLDeviceArray{T}, x::T, index::Integer) where {T}
+    align = alignment(A)
+    unsafe_store!(pointer(A), x, index, Val(align))
+end
+
+@inline @generated function arrayset_union(A::CLDeviceArray{T,<:Any,AS}, x::T, index::Integer) where {T,AS}
+    typs = Base.uniontypes(T)
+    sel = findfirst(isequal(x), typs)
+
+    quote
+        selector_ptr = typetagdata(A, index)
+        unsafe_store!(selector_ptr, $(UInt8(sel-1)))
+
+        align = alignment(A)
+        data_ptr = pointer(A, index)
+
+        unsafe_store!(reinterpret(LLVMPtr{$x,AS}, data_ptr), x, 1, Val(align))
+        return
+    end
+end
+
+@device_function @inline function const_arrayref(A::CLDeviceArray{T}, index::Integer) where {T}
+    @boundscheck checkbounds(A, index)
+    align = alignment(A)
+    unsafe_cached_load(pointer(A), index, Val(align))
+end
+
+
+## indexing
+
+Base.IndexStyle(::Type{<:CLDeviceArray}) = Base.IndexLinear()
+
+Base.@propagate_inbounds Base.getindex(A::CLDeviceArray{T}, i1::Integer) where {T} =
+    arrayref(A, i1)
+Base.@propagate_inbounds Base.setindex!(A::CLDeviceArray{T}, x, i1::Integer) where {T} =
+    arrayset(A, convert(T,x)::T, i1)
+
+# preserve the specific integer type when indexing device arrays,
+# to avoid extending 32-bit hardware indices to 64-bit.
+Base.to_index(::CLDeviceArray, i::Integer) = i
+
+# Base doesn't like Integer indices, so we need our own ND get and setindex! routines.
+# See also: https://github.com/JuliaLang/julia/pull/42289
+Base.@propagate_inbounds Base.getindex(A::CLDeviceArray,
+                                       I::Union{Integer, CartesianIndex}...) =
+    A[Base._to_linear_index(A, to_indices(A, I)...)]
+Base.@propagate_inbounds Base.setindex!(A::CLDeviceArray, x,
+                                        I::Union{Integer, CartesianIndex}...) =
+    A[Base._to_linear_index(A, to_indices(A, I)...)] = x
+
+
+## const indexing
+
+"""
+    Const(A::CLDeviceArray)
+
+Mark a `CLDeviceArray` as constant/read-only. The invariant guaranteed is that you will not
+modify the array for the duration of the current kernel.
+
+!!! warning
+    Experimental API. Subject to change without deprecation.
+"""
+struct Const{T,N,AS} <: DenseArray{T,N}
+    a::CLDeviceArray{T,N,AS}
+end
+Base.Experimental.Const(A::CLDeviceArray) = Const(A)
+
+Base.IndexStyle(::Type{<:Const}) = IndexLinear()
+Base.size(C::Const) = size(C.a)
+Base.axes(C::Const) = axes(C.a)
+Base.@propagate_inbounds Base.getindex(A::Const, i1::Integer) = const_arrayref(A.a, i1)
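+
+# Usage sketch (inside a kernel body, `A` being a CLDeviceArray argument):
+#
+#     B = Base.Experimental.Const(A)
+#     val = B[get_global_id(1)]    # reads dispatch to const_arrayref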
+""" +struct Const{T,N,AS} <: DenseArray{T,N} + a::CLDeviceArray{T,N,AS} +end +Base.Experimental.Const(A::CLDeviceArray) = Const(A) + +Base.IndexStyle(::Type{<:Const}) = IndexLinear() +Base.size(C::Const) = size(C.a) +Base.axes(C::Const) = axes(C.a) +Base.@propagate_inbounds Base.getindex(A::Const, i1::Integer) = const_arrayref(A.a, i1) + + +## other + +Base.show(io::IO, a::CLDeviceVector) = + print(io, "$(length(a))-element device array at $(pointer(a))") +Base.show(io::IO, a::CLDeviceArray) = + print(io, "$(join(a.shape, '×')) device array at $(pointer(a))") + +Base.show(io::IO, mime::MIME"text/plain", a::CLDeviceArray) = show(io, a) + +@inline function Base.iterate(A::CLDeviceArray, i=1) + if (i % UInt) - 1 < length(A) + (@inbounds A[i], i + 1) + else + nothing + end +end + +function Base.reinterpret(::Type{T}, a::CLDeviceArray{S,N,A}) where {T,S,N,A} + err = _reinterpret_exception(T, a) + err === nothing || throw(err) + + if sizeof(T) == sizeof(S) # fast case + return CLDeviceArray{T,N,A}(size(a), reinterpret(LLVMPtr{T,A}, a.ptr), a.maxsize) + end + + isize = size(a) + size1 = div(isize[1]*sizeof(S), sizeof(T)) + osize = tuple(size1, Base.tail(isize)...) + return CLDeviceArray{T,N,A}(osize, reinterpret(LLVMPtr{T,A}, a.ptr), a.maxsize) +end diff --git a/src/device/runtime.jl b/src/device/runtime.jl new file mode 100644 index 0000000..cdd146a --- /dev/null +++ b/src/device/runtime.jl @@ -0,0 +1,11 @@ +signal_exception() = return + +malloc(sz) = C_NULL + +report_oom(sz) = return + +report_exception(ex) = return + +report_exception_name(ex) = return + +report_exception_frame(idx, func, file, line) = return diff --git a/test/Project.toml b/test/Project.toml index 382d28b..ba8f907 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd" diff --git a/test/execution.jl b/test/execution.jl new file mode 100644 index 0000000..a64b542 --- /dev/null +++ b/test/execution.jl @@ -0,0 +1,124 @@ +if !in("cl_khr_il_program", cl.device().extensions) +@warn "Skipping execution tests on $(cl.platform().name)" +else + +@testset "execution" begin + +@testset "@opencl" begin + +dummy() = nothing + +@test_throws UndefVarError @opencl undefined() +@test_throws MethodError @opencl dummy(1) + + +@testset "launch configuration" begin + @opencl dummy() + + global_size = 1 + @opencl global_size dummy() + @opencl global_size=1 dummy() + @opencl global_size=(1,1) dummy() + @opencl global_size=(1,1,1) dummy() + + local_size = 1 + @opencl global_size local_size dummy() + @opencl global_size=1 local_size=1 dummy() + @opencl global_size=(1,1) local_size=(1,1) dummy() + @opencl global_size=(1,1,1) local_size=(1,1,1) dummy() + + @test_throws ArgumentError @opencl global_size=(1,) local_size=(1,1) dummy() + @test_throws InexactError @opencl global_size=(-2) dummy() + @test_throws InexactError @opencl local_size=(-2) dummy() +end + +@testset "launch=false" begin + # XXX: how are svm_pointers handled here? 
+    k = @opencl launch=false dummy()
+    k()
+    k(; global_size=1)
+end
+
+@testset "inference" begin
+    foo() = @opencl dummy()
+    @inferred foo()
+
+    # with arguments, we call clconvert
+    kernel(a) = return
+    bar(a) = @opencl kernel(a)
+    @inferred bar(CLArray([1]))
+end
+
+
+@testset "reflection" begin
+    OpenCL.code_lowered(dummy, Tuple{})
+    OpenCL.code_typed(dummy, Tuple{})
+    OpenCL.code_warntype(devnull, dummy, Tuple{})
+    OpenCL.code_llvm(devnull, dummy, Tuple{})
+    OpenCL.code_native(devnull, dummy, Tuple{})
+
+    @device_code_lowered @opencl dummy()
+    @device_code_typed @opencl dummy()
+    @device_code_warntype io=devnull @opencl dummy()
+    @device_code_llvm io=devnull @opencl dummy()
+    @device_code_native io=devnull @opencl dummy()
+
+    mktempdir() do dir
+        @device_code dir=dir @opencl dummy()
+    end
+
+    @test_throws ErrorException @device_code_lowered nothing
+
+    # make sure kernel name aliases are preserved in the generated code
+    @test occursin("dummy", sprint(io->(@device_code_llvm io=io optimize=false @opencl dummy())))
+    @test occursin("dummy", sprint(io->(@device_code_llvm io=io @opencl dummy())))
+    @test occursin("dummy", sprint(io->(@device_code_native io=io @opencl dummy())))
+
+    # make sure invalid kernels can be partially reflected upon
+    let
+        invalid_kernel() = throw()
+        @test_throws OpenCL.InvalidIRError @opencl invalid_kernel()
+        @test_throws OpenCL.InvalidIRError IOCapture.capture() do
+            @device_code_warntype @opencl invalid_kernel()
+        end
+        c = IOCapture.capture() do
+            try
+                @device_code_warntype @opencl invalid_kernel()
+            catch
+            end
+        end
+        @test occursin("Body::Union{}", c.output)
+    end
+
+    # set name of kernel
+    @test occursin("mykernel", sprint(io->(@device_code_llvm io=io begin
+        @opencl name="mykernel" dummy()
+    end)))
+
+    @test OpenCL.return_type(identity, Tuple{Int}) === Int
+    @test OpenCL.return_type(sin, Tuple{Float32}) === Float32
+    @test OpenCL.return_type(getindex, Tuple{CLDeviceArray{Float32,1,AS.Global},Int32}) === Float32
+    @test OpenCL.return_type(getindex, Tuple{Base.RefValue{Integer}}) === Integer
+end
+
+end
+
+###############################################################################
+
+@testset "argument passing" begin
+
+function memset(a, val)
+    gid = get_global_id(1)
+    @inbounds a[gid] = val
+    return
+end
+
+a = CLArray{Int}(undef, 10)
+@opencl global_size=length(a) memset(a, 42)
+@test all(Array(a) .== 42)
+
+end
+
+end
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index c41a005..bacb223 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,16 +2,19 @@ using Test
 using OpenCL
 using pocl_jll
 
+using IOCapture
+
 @info "System information:\n" * sprint(io->OpenCL.versioninfo(io))
 
 @testset "OpenCL.jl" begin
 
 @testset "$(platform.name): $(device.name)" for platform in cl.platforms(),
-    device in cl.devices(platform)
+        device in cl.devices(platform)
 
 cl.platform!(platform)
 cl.device!(device)
 
+# libopencl wrappers
 include("platform.jl")
 include("context.jl")
 include("device.jl")
@@ -22,7 +25,9 @@ include("kernel.jl")
 include("behaviour.jl")
 include("memory.jl")
 include("buffer.jl")
+include("array.jl")
 
+include("execution.jl")
 
 end