Merge pull request #233 from JuliaGPU/tb/kernelabstractions
Initial KernelAbstractions.jl integration.
maleadt committed Sep 17, 2024
2 parents daaf3cf + d677ee2 commit 9c1db8b
Showing 7 changed files with 224 additions and 5 deletions.
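
For orientation, this is roughly how the new backend is driven from KernelAbstractions.jl once this commit is in place. A hedged sketch, not part of the diff; the kernel, array sizes, and values are made up for illustration:

    using OpenCL, KernelAbstractions

    # a trivial element-wise kernel, written against the generic KA API
    @kernel function vadd!(c, @Const(a), @Const(b))
        i = @index(Global)
        @inbounds c[i] = a[i] + b[i]
    end

    backend = OpenCLBackend()                             # exported by OpenCL.jl after this change
    a = KernelAbstractions.ones(backend, Float32, 1024)   # CLArray-backed allocations
    b = KernelAbstractions.ones(backend, Float32, 1024)
    c = KernelAbstractions.zeros(backend, Float32, 1024)
    vadd!(backend)(c, a, b; ndrange = length(c))          # compiled and launched via @opencl
    KernelAbstractions.synchronize(backend)
    @assert Array(c) == Array(a) .+ Array(b)
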
2 changes: 2 additions & 0 deletions Project.toml
@@ -6,13 +6,15 @@ version = "0.10.0"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
SPIRV_LLVM_Translator_unified_jll = "85f0d8ed-5b39-5caa-b1ae-7472de402361"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[compat]
LLVM = "9.1"
4 changes: 4 additions & 0 deletions src/OpenCL.jl
@@ -46,4 +46,8 @@ include("util.jl")
include("array.jl")
include("gpuarrays.jl")

include("OpenCLKernels.jl")
import .OpenCLKernels: OpenCLBackend
export OpenCLBackend

end
179 changes: 179 additions & 0 deletions src/OpenCLKernels.jl
@@ -0,0 +1,179 @@
module OpenCLKernels

using ..OpenCL
using ..OpenCL: @device_override, SPIRVIntrinsics

import KernelAbstractions as KA

import StaticArrays

import Adapt


## Back-end Definition

export OpenCLBackend

struct OpenCLBackend <: KA.GPU
end

KA.allocate(::OpenCLBackend, ::Type{T}, dims::Tuple) where T = CLArray{T}(undef, dims)
KA.zeros(::OpenCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.zeros(T, dims)
KA.ones(::OpenCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.ones(T, dims)

KA.get_backend(::CLArray) = OpenCLBackend()
KA.synchronize(::OpenCLBackend) = cl.device_synchronize()
KA.supports_float64(::OpenCLBackend) = false # XXX: this is platform/device dependent

Adapt.adapt_storage(::OpenCLBackend, a::Array) = Adapt.adapt(CLArray, a)
Adapt.adapt_storage(::OpenCLBackend, a::CLArray) = a
Adapt.adapt_storage(::KA.CPU, a::CLArray) = convert(Array, a)


## Memory Operations

function KA.copyto!(::OpenCLBackend, A, B)
    copyto!(A, B)
    # TODO: device-to-host copies in Julia are currently synchronizing; address this
end


## Kernel Launch

function KA.mkcontext(kernel::KA.Kernel{OpenCLBackend}, _ndrange, iterspace)
    KA.CompilerMetadata{KA.ndrange(kernel), KA.DynamicCheck}(_ndrange, iterspace)
end
function KA.mkcontext(kernel::KA.Kernel{OpenCLBackend}, I, _ndrange, iterspace,
                      ::Dynamic) where Dynamic
    KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
end

function KA.launch_config(kernel::KA.Kernel{OpenCLBackend}, ndrange, workgroupsize)
    if ndrange isa Integer
        ndrange = (ndrange,)
    end
    if workgroupsize isa Integer
        workgroupsize = (workgroupsize, )
    end

    # partition checked that the ndranges agreed
    if KA.ndrange(kernel) <: KA.StaticSize
        ndrange = nothing
    end

    iterspace, dynamic = if KA.workgroupsize(kernel) <: KA.DynamicSize &&
                            workgroupsize === nothing
        # use ndrange as preliminary workgroupsize for autotuning
        KA.partition(kernel, ndrange, ndrange)
    else
        KA.partition(kernel, ndrange, workgroupsize)
    end

    return ndrange, workgroupsize, iterspace, dynamic
end

function threads_to_workgroupsize(threads, ndrange)
    total = 1
    return map(ndrange) do n
        x = min(div(threads, total), n)
        total *= x
        return x
    end
end
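
# Editor's note, not part of the committed file: this helper greedily fills
# dimensions from left to right, so threads_to_workgroupsize(256, (1000, 1000))
# returns (256, 1) and threads_to_workgroupsize(64, (8, 4, 100)) returns (8, 4, 2).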

function (obj::KA.Kernel{OpenCLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
    ndrange, workgroupsize, iterspace, dynamic =
        KA.launch_config(obj, ndrange, workgroupsize)

    # this might not be the final context, since we may tune the workgroupsize
    ctx = KA.mkcontext(obj, ndrange, iterspace)
    kernel = @opencl launch=false obj.f(ctx, args...)

    # figure out the optimal workgroupsize automatically
    if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
        wg_info = cl.work_group_info(kernel.fun, cl.device())
        wg_size_nd = threads_to_workgroupsize(wg_info.size, ndrange)
        iterspace, dynamic = KA.partition(obj, ndrange, wg_size_nd)
        ctx = KA.mkcontext(obj, ndrange, iterspace)
    end

    groups = length(KA.blocks(iterspace))
    items = length(KA.workitems(iterspace))

    if groups == 0
        return nothing
    end

    # Launch kernel
    global_size = groups * items
    local_size = items
    kernel(ctx, args...; global_size, local_size)

    return nothing
end


## Indexing Functions

@device_override @inline function KA.__index_Local_Linear(ctx)
    return get_local_id(1)
end

@device_override @inline function KA.__index_Group_Linear(ctx)
    return get_group_id(1)
end

@device_override @inline function KA.__index_Global_Linear(ctx)
    return get_global_id(1)
end

@device_override @inline function KA.__index_Local_Cartesian(ctx)
    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
end

@device_override @inline function KA.__index_Group_Cartesian(ctx)
    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
end

@device_override @inline function KA.__index_Global_Cartesian(ctx)
    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
end

@device_override @inline function KA.__validindex(ctx)
    if KA.__dynamic_checkbounds(ctx)
        I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
        return I in KA.__ndrange(ctx)
    else
        return true
    end
end


## Shared and Scratch Memory

@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
    ptr = SPIRVIntrinsics.emit_localmemory(T, Val(prod(Dims)))
    CLDeviceArray(Dims, ptr)
end

@device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
    StaticArrays.MArray{KA.__size(Dims), T}(undef)
end


## Synchronization and Printing

@device_override @inline function KA.__synchronize()
    barrier()
end

@device_override @inline function KA.__print(args...)
    SPIRVIntrinsics._print(args...)
end


## Other

KA.argconvert(::KA.Kernel{OpenCLBackend}, arg) = clconvert(arg)

end
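
As a side note on the overrides above: KA.SharedMemory, KA.Scratchpad, and KA.__synchronize are what back KernelAbstractions' @localmem, @private, and @synchronize on this backend. A minimal sketch of a kernel exercising local memory, modeled on the KernelAbstractions.jl test suite (not part of the commit; names and sizes are illustrative, and @localmem requires a statically known group size):

    using OpenCL, KernelAbstractions

    @kernel function reverse_within_group!(A)
        N = @uniform prod(@groupsize())
        I = @index(Global, Linear)
        i = @index(Local, Linear)
        lmem = @localmem eltype(A) (N,)   # lowered to SPIR-V local memory via KA.SharedMemory
        @inbounds begin
            lmem[i] = A[I]
            @synchronize                  # lowered to barrier() via KA.__synchronize
            A[I] = lmem[N - i + 1]
        end
    end

    backend = OpenCLBackend()
    A = KernelAbstractions.ones(backend, Float32, 64)
    reverse_within_group!(backend, 16)(A; ndrange = length(A))   # static group size of 16
    KernelAbstractions.synchronize(backend)

Note also that the launch code above passes a total global size (groups * items) rather than a group count, matching the semantics of OpenCL's clEnqueueNDRangeKernel.
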
22 changes: 17 additions & 5 deletions src/array.jl
@@ -76,10 +76,7 @@ Base.sizeof(x::CLArray) = Base.elsize(x) * length(x)
Base.unsafe_convert(::Type{Ptr{T}}, x::CLArray{T}) where {T} =
    convert(Ptr{T}, pointer(x.data[])) + x.offset*Base.elsize(x)

-# XXX: this is wrong
-Base.:(==)(A:: CLArray, B:: CLArray) = buffer(A) == buffer(B) && size(A) == size(B)

+Base.:(==)(A::CLArray, B::CLArray) = Array(A) == Array(B)


## derived types
@@ -151,7 +148,7 @@ end
fill(x, dims...) = fill(x, (dims...,))

function Base.fill!(A::CLArray{T}, x::T) where {T}
-    cl.enqueue_svm_fill(pointer(A), x, length(A))
+    isempty(A) || cl.enqueue_svm_fill(pointer(A), x, length(A))
    A
end

@@ -237,3 +234,18 @@ BroadcastStyle(::Type{<:AnyCLArray{T,N}}) where {T,N} = CLArrayStyle{N}()
# allocation of output arrays
Base.similar(bc::Broadcasted{CLArrayStyle{N}}, ::Type{T}, dims) where {T,N} =
    similar(CLArray{T}, dims)


## regular gpu array adaptor

# We don't convert isbits types in `adapt`, since they are already
# considered GPU-compatible.

Adapt.adapt_storage(::Type{CLArray}, xs::AT) where {AT<:AbstractArray} =
    isbitstype(AT) ? xs : convert(CLArray, xs)

# if specific type parameters are specified, preserve those
Adapt.adapt_storage(::Type{<:CLArray{T}}, xs::AT) where {T, AT<:AbstractArray} =
    isbitstype(AT) ? xs : convert(CLArray{T}, xs)
Adapt.adapt_storage(::Type{<:CLArray{T, N}}, xs::AT) where {T, N, AT<:AbstractArray} =
    isbitstype(AT) ? xs : convert(CLArray{T,N}, xs)
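
For reference, the adaptor rules added here behave roughly as follows. A sketch, assuming a working OpenCL device; the input arrays are made up:

    using Adapt, OpenCL

    a = rand(Float32, 4)
    Adapt.adapt(CLArray, a)           # Array is not isbits, so this converts to a CLArray{Float32, 1}
    Adapt.adapt(CLArray{Float64}, a)  # requested element type is preserved: CLArray{Float64, 1}
    # isbits containers (e.g. a StaticArrays.SVector) would be returned as-is,
    # since they are already GPU-compatible.
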
7 changes: 7 additions & 0 deletions test/Project.toml
@@ -1,5 +1,12 @@
[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd"
14 changes: 14 additions & 0 deletions test/kernelabstractions.jl
@@ -0,0 +1,14 @@
if !in("cl_khr_il_program", cl.device().extensions)
@warn "Skipping KernelAbstractions.jl tests on $(cl.platform().name)"
else

import KernelAbstractions
include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))

skip_tests=Set([
"sparse",
"Convert", # Need to opt out of i128
])
Testsuite.testsuite(OpenCLBackend, "OpenCL", OpenCL, CLArray, CLDeviceArray; skip_tests)

end
1 change: 1 addition & 0 deletions test/runtests.jl
@@ -28,6 +28,7 @@ include("buffer.jl")

include("array.jl")
include("execution.jl")
include("kernelabstractions.jl")

end
