Merge pull request #233 from JuliaGPU/tb/kernelabstractions
Initial KernelAbstractions.jl integration.
maleadt committed Sep 17, 2024
2 parents daaf3cf + d677ee2 commit 9c1db8b
Showing 7 changed files with 224 additions and 5 deletions.
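
For orientation, this is roughly how the new backend is driven from KernelAbstractions.jl once this commit is in place. A hedged sketch, not part of the diff; the kernel, array sizes, and values are made up for illustration:

    using OpenCL, KernelAbstractions

    # a trivial element-wise kernel, written against the generic KA API
    @kernel function vadd!(c, @Const(a), @Const(b))
        i = @index(Global)
        @inbounds c[i] = a[i] + b[i]
    end

    backend = OpenCLBackend()                             # exported by OpenCL.jl after this change
    a = KernelAbstractions.ones(backend, Float32, 1024)   # CLArray-backed allocations
    b = KernelAbstractions.ones(backend, Float32, 1024)
    c = KernelAbstractions.zeros(backend, Float32, 1024)
    vadd!(backend)(c, a, b; ndrange = length(c))          # compiled and launched via @opencl
    KernelAbstractions.synchronize(backend)
    @assert Array(c) == Array(a) .+ Array(b)
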
2 changes: 2 additions & 0 deletions Project.toml
@@ -6,13 +6,15 @@ version = "0.10.0"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
SPIRV_LLVM_Translator_unified_jll = "85f0d8ed-5b39-5caa-b1ae-7472de402361"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[compat]
LLVM = "9.1"
4 changes: 4 additions & 0 deletions src/OpenCL.jl
@@ -46,4 +46,8 @@ include("util.jl")
include("array.jl")
include("gpuarrays.jl")

include("OpenCLKernels.jl")
import .OpenCLKernels: OpenCLBackend
export OpenCLBackend

end
179 changes: 179 additions & 0 deletions src/OpenCLKernels.jl
@@ -0,0 +1,179 @@
module OpenCLKernels

using ..OpenCL
using ..OpenCL: @device_override, SPIRVIntrinsics

import KernelAbstractions as KA

import StaticArrays

import Adapt


## Back-end Definition

export OpenCLBackend

struct OpenCLBackend <: KA.GPU
end

KA.allocate(::OpenCLBackend, ::Type{T}, dims::Tuple) where T = CLArray{T}(undef, dims)
KA.zeros(::OpenCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.zeros(T, dims)
KA.ones(::OpenCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.ones(T, dims)

KA.get_backend(::CLArray) = OpenCLBackend()
KA.synchronize(::OpenCLBackend) = cl.device_synchronize()
KA.supports_float64(::OpenCLBackend) = false # XXX: this is platform/device dependent

Adapt.adapt_storage(::OpenCLBackend, a::Array) = Adapt.adapt(CLArray, a)
Adapt.adapt_storage(::OpenCLBackend, a::CLArray) = a
Adapt.adapt_storage(::KA.CPU, a::CLArray) = convert(Array, a)


## Memory Operations

function KA.copyto!(::OpenCLBackend, A, B)
    copyto!(A, B)
    # TODO: device-to-host copies in Julia are currently synchronizing; address this
end


## Kernel Launch

function KA.mkcontext(kernel::KA.Kernel{OpenCLBackend}, _ndrange, iterspace)
    KA.CompilerMetadata{KA.ndrange(kernel), KA.DynamicCheck}(_ndrange, iterspace)
end
function KA.mkcontext(kernel::KA.Kernel{OpenCLBackend}, I, _ndrange, iterspace,
                      ::Dynamic) where Dynamic
    KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
end

function KA.launch_config(kernel::KA.Kernel{OpenCLBackend}, ndrange, workgroupsize)
    if ndrange isa Integer
        ndrange = (ndrange,)
    end
    if workgroupsize isa Integer
        workgroupsize = (workgroupsize, )
    end

    # partition checked that the ndranges agreed
    if KA.ndrange(kernel) <: KA.StaticSize
        ndrange = nothing
    end

    iterspace, dynamic = if KA.workgroupsize(kernel) <: KA.DynamicSize &&
                            workgroupsize === nothing
        # use ndrange as preliminary workgroupsize for autotuning
        KA.partition(kernel, ndrange, ndrange)
    else
        KA.partition(kernel, ndrange, workgroupsize)
    end

    return ndrange, workgroupsize, iterspace, dynamic
end

function threads_to_workgroupsize(threads, ndrange)
    total = 1
    return map(ndrange) do n
        x = min(div(threads, total), n)
        total *= x
        return x
    end
end
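
# Editor's note, not part of the committed file: this helper greedily fills
# dimensions from left to right, so threads_to_workgroupsize(256, (1000, 1000))
# returns (256, 1) and threads_to_workgroupsize(64, (8, 4, 100)) returns (8, 4, 2).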

function (obj::KA.Kernel{OpenCLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
    ndrange, workgroupsize, iterspace, dynamic =
        KA.launch_config(obj, ndrange, workgroupsize)

    # this might not be the final context, since we may tune the workgroupsize
    ctx = KA.mkcontext(obj, ndrange, iterspace)
    kernel = @opencl launch=false obj.f(ctx, args...)

    # figure out the optimal workgroupsize automatically
    if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
        wg_info = cl.work_group_info(kernel.fun, cl.device())
        wg_size_nd = threads_to_workgroupsize(wg_info.size, ndrange)
        iterspace, dynamic = KA.partition(obj, ndrange, wg_size_nd)
        ctx = KA.mkcontext(obj, ndrange, iterspace)
    end

    groups = length(KA.blocks(iterspace))
    items = length(KA.workitems(iterspace))

    if groups == 0
        return nothing
    end

    # Launch kernel
    global_size = groups * items
    local_size = items
    kernel(ctx, args...; global_size, local_size)

    return nothing
end


## Indexing Functions

@device_override @inline function KA.__index_Local_Linear(ctx)
    return get_local_id(1)
end

@device_override @inline function KA.__index_Group_Linear(ctx)
    return get_group_id(1)
end

@device_override @inline function KA.__index_Global_Linear(ctx)
    return get_global_id(1)
end

@device_override @inline function KA.__index_Local_Cartesian(ctx)
    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
end

@device_override @inline function KA.__index_Group_Cartesian(ctx)
    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
end

@device_override @inline function KA.__index_Global_Cartesian(ctx)
    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
end

@device_override @inline function KA.__validindex(ctx)
    if KA.__dynamic_checkbounds(ctx)
        I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
        return I in KA.__ndrange(ctx)
    else
        return true
    end
end


## Shared and Scratch Memory

@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
    ptr = SPIRVIntrinsics.emit_localmemory(T, Val(prod(Dims)))
    CLDeviceArray(Dims, ptr)
end

@device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
    StaticArrays.MArray{KA.__size(Dims), T}(undef)
end


## Synchronization and Printing

@device_override @inline function KA.__synchronize()
    barrier()
end

@device_override @inline function KA.__print(args...)
    SPIRVIntrinsics._print(args...)
end


## Other

KA.argconvert(::KA.Kernel{OpenCLBackend}, arg) = clconvert(arg)

end
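
As a side note on the overrides above: KA.SharedMemory, KA.Scratchpad, and KA.__synchronize are what back KernelAbstractions' @localmem, @private, and @synchronize on this backend. A minimal sketch of a kernel exercising local memory, modeled on the KernelAbstractions.jl test suite (not part of the commit; names and sizes are illustrative, and @localmem requires a statically known group size):

    using OpenCL, KernelAbstractions

    @kernel function reverse_within_group!(A)
        N = @uniform prod(@groupsize())
        I = @index(Global, Linear)
        i = @index(Local, Linear)
        lmem = @localmem eltype(A) (N,)   # lowered to SPIR-V local memory via KA.SharedMemory
        @inbounds begin
            lmem[i] = A[I]
            @synchronize                  # lowered to barrier() via KA.__synchronize
            A[I] = lmem[N - i + 1]
        end
    end

    backend = OpenCLBackend()
    A = KernelAbstractions.ones(backend, Float32, 64)
    reverse_within_group!(backend, 16)(A; ndrange = length(A))   # static group size of 16
    KernelAbstractions.synchronize(backend)

Note also that the launch code above passes a total global size (groups * items) rather than a group count, matching the semantics of OpenCL's clEnqueueNDRangeKernel.
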
22 changes: 17 additions & 5 deletions src/array.jl
@@ -76,10 +76,7 @@ Base.sizeof(x::CLArray) = Base.elsize(x) * length(x)
Base.unsafe_convert(::Type{Ptr{T}}, x::CLArray{T}) where {T} =
    convert(Ptr{T}, pointer(x.data[])) + x.offset*Base.elsize(x)

-# XXX: this is wrong
-Base.:(==)(A:: CLArray, B:: CLArray) = buffer(A) == buffer(B) && size(A) == size(B)

+Base.:(==)(A::CLArray, B::CLArray) = Array(A) == Array(B)


## derived types
@@ -151,7 +148,7 @@ end
fill(x, dims...) = fill(x, (dims...,))

function Base.fill!(A::CLArray{T}, x::T) where {T}
-    cl.enqueue_svm_fill(pointer(A), x, length(A))
+    isempty(A) || cl.enqueue_svm_fill(pointer(A), x, length(A))
    A
end

@@ -237,3 +234,18 @@ BroadcastStyle(::Type{<:AnyCLArray{T,N}}) where {T,N} = CLArrayStyle{N}()
# allocation of output arrays
Base.similar(bc::Broadcasted{CLArrayStyle{N}}, ::Type{T}, dims) where {T,N} =
    similar(CLArray{T}, dims)


## regular gpu array adaptor

# We don't convert isbits types in `adapt`, since they are already
# considered GPU-compatible.

Adapt.adapt_storage(::Type{CLArray}, xs::AT) where {AT<:AbstractArray} =
    isbitstype(AT) ? xs : convert(CLArray, xs)

# if specific type parameters are specified, preserve those
Adapt.adapt_storage(::Type{<:CLArray{T}}, xs::AT) where {T, AT<:AbstractArray} =
    isbitstype(AT) ? xs : convert(CLArray{T}, xs)
Adapt.adapt_storage(::Type{<:CLArray{T, N}}, xs::AT) where {T, N, AT<:AbstractArray} =
    isbitstype(AT) ? xs : convert(CLArray{T,N}, xs)
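
For reference, the adaptor rules added here behave roughly as follows. A sketch, assuming a working OpenCL device; the input arrays are made up:

    using Adapt, OpenCL

    a = rand(Float32, 4)
    Adapt.adapt(CLArray, a)           # Array is not isbits, so this converts to a CLArray{Float32, 1}
    Adapt.adapt(CLArray{Float64}, a)  # requested element type is preserved: CLArray{Float64, 1}
    # isbits containers (e.g. a StaticArrays.SVector) would be returned as-is,
    # since they are already GPU-compatible.
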
7 changes: 7 additions & 0 deletions test/Project.toml
@@ -1,5 +1,12 @@
[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd"
14 changes: 14 additions & 0 deletions test/kernelabstractions.jl
@@ -0,0 +1,14 @@
if !in("cl_khr_il_program", cl.device().extensions)
@warn "Skipping KernelAbstractions.jl tests on $(cl.platform().name)"
else

import KernelAbstractions
include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))

skip_tests=Set([
"sparse",
"Convert", # Need to opt out of i128
])
Testsuite.testsuite(OpenCLBackend, "OpenCL", OpenCL, CLArray, CLDeviceArray; skip_tests)

end
1 change: 1 addition & 0 deletions test/runtests.jl
@@ -28,6 +28,7 @@ include("buffer.jl")

include("array.jl")
include("execution.jl")
include("kernelabstractions.jl")

end
