Commit

Merge branch 'main' into he/gitignore-vscode
Sbozzolo authored Oct 22, 2024
2 parents c142cf9 + 96bb8f4 commit 066b459
Showing 15 changed files with 437 additions and 148 deletions.
4 changes: 4 additions & 0 deletions .buildkite/pipeline.yml
@@ -86,6 +86,10 @@ steps:
key: unit_data_copyto
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_copyto.jl"

- label: "Unit: cartesian_field_index"
key: unit_data_cartesian_field_index
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_cartesian_field_index.jl"

- label: "Unit: mapreduce"
key: unit_data_mapreduce
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_mapreduce.jl"
38 changes: 15 additions & 23 deletions ext/cuda/data_layouts_mapreduce.jl
@@ -1,3 +1,4 @@
import ClimaCore.DataLayouts: AbstractDataSingleton
# To implement a single flexible mapreduce, let's define
# a `OnesArray` that stores no data and always returns 1:
struct OnesArray{T, N} <: AbstractArray{T, N} end
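
Aside: the `OnesArray` trick lets the weighted and unweighted reductions share one kernel, since unit weights leave a reduction unchanged. A minimal stand-alone sketch of the idea (a plain `ones` vector stands in for `OnesArray`; values are illustrative):

data = rand(10)
wt = ones(10)                        # stand-in for a OnesArray
@assert sum(data .* wt) ≈ sum(data)  # unit weights change nothing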
@@ -38,6 +39,8 @@ function mapreduce_cuda(
n_ops_on_load = cld(nitems, nthreads) == 1 ? 0 : 7
effective_blksize = nthreads * (n_ops_on_load + 1)
nblocks = cld(nitems, effective_blksize)
s = DataLayouts.singleton(data)
us = DataLayouts.UniversalSize(data)

reduce_cuda = CuArray{T}(undef, nblocks, Nf)
shmemsize = nthreads
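
For intuition, the launch-size arithmetic above works out as in this sketch (`nitems` and `nthreads` are assumed, illustrative values):

nitems = 5_000                                       # assumed item count
nthreads = 256                                       # assumed threads per block
n_ops_on_load = cld(nitems, nthreads) == 1 ? 0 : 7   # -> 7
effective_blksize = nthreads * (n_ops_on_load + 1)   # -> 2048 items per block
nblocks = cld(nitems, effective_blksize)             # -> 3 blocks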
@@ -49,6 +52,8 @@
pdata,
pwt,
n_ops_on_load,
s,
us,
Val(shmemsize),
)
# reduce block data
@@ -71,19 +76,22 @@ function mapreduce_cuda_kernel!(
pdata::AbstractArray{T, N},
pwt::AbstractArray{T, N},
n_ops_on_load::Int,
s::AbstractDataSingleton,
us::DataLayouts.UniversalSize,
::Val{shmemsize},
) where {T, N, shmemsize}
blksize = blockDim().x
nblk = gridDim().x
tidx = threadIdx().x
bidx = blockIdx().x
fidx = blockIdx().y
dataview = _dataview(pdata, fidx)
dataview = _dataview(pdata, s, fidx)
effective_blksize = blksize * (n_ops_on_load + 1)
gidx = _get_gidx(tidx, bidx, effective_blksize)
reduction = CUDA.CuStaticSharedArray(T, shmemsize)
reduction[tidx] = 0
(Nv, Nij, Nf, Nh) = _get_dims(dataview)
(Nij, _, _, Nv, Nh) = DataLayouts.universal_size(us)
Nf = 1 # a view at a single field index `fidx` always has Nf = 1
nitems = Nv * Nij * Nij * Nf * Nh

# load shmem
@@ -107,29 +115,13 @@ end
@inline function _get_gidx(tidx, bidx, effective_blksize)
return tidx + (bidx - 1) * effective_blksize
end
# for VF DataLayout
@inline function _get_dims(pdata::AbstractArray{FT, 2}) where {FT}
(Nv, Nf) = size(pdata)
return (Nv, 1, Nf, 1)
end
@inline _dataview(pdata::AbstractArray{FT, 2}, fidx) where {FT} =
view(pdata, :, fidx:fidx)

# for IJFH DataLayout
@inline function _get_dims(pdata::AbstractArray{FT, 4}) where {FT}
(Nij, _, Nf, Nh) = size(pdata)
return (1, Nij, Nf, Nh)
end
@inline _dataview(pdata::AbstractArray{FT, 4}, fidx) where {FT} =
view(pdata, :, :, fidx:fidx, :)

# for VIJFH DataLayout
@inline function _get_dims(pdata::AbstractArray{FT, 5}) where {FT}
(Nv, Nij, _, Nf, Nh) = size(pdata)
return (Nv, Nij, Nf, Nh)
@inline function _dataview(pdata::AbstractArray, s::AbstractDataSingleton, fidx)
fdim = DataLayouts.field_dim(s)
Ipre = ntuple(i -> Colon(), Val(fdim - 1))
Ipost = ntuple(i -> Colon(), Val(ndims(pdata) - fdim))
return @inbounds view(pdata, Ipre..., fidx:fidx, Ipost...)
end
@inline _dataview(pdata::AbstractArray{FT, 5}, fidx) where {FT} =
view(pdata, :, :, :, fidx:fidx, :)
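
The dimension-generic `_dataview` above replaces the three per-layout methods by taking the field dimension from the singleton. A stand-alone sketch of the same indexing trick (the array shape and `fdim` are assumptions for a VIJFH-like layout):

pdata = rand(4, 3, 3, 2, 8)               # assumed (Nv, Ni, Nj, Nf, Nh) shape
fdim = 4                                  # assumed field dimension
fidx = 2                                  # field to select
Ipre = ntuple(_ -> Colon(), fdim - 1)
Ipost = ntuple(_ -> Colon(), ndims(pdata) - fdim)
fview = view(pdata, Ipre..., fidx:fidx, Ipost...)
@assert size(fview) == (4, 3, 3, 1, 8)    # field dim collapses to size 1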

@inline function _cuda_reduce!(op, reduction, tidx, reduction_size, N)
if reduction_size > N
14 changes: 14 additions & 0 deletions ext/cuda/data_layouts_threadblock.jl
@@ -170,6 +170,20 @@ end
##### Custom partitions
#####

##### linear partition
@inline function linear_partition(nitems::Integer, n_max_threads::Integer)
threads = min(nitems, n_max_threads)
blocks = cld(nitems, threads)
return (; threads, blocks)
end
@inline function linear_universal_index(us::UniversalSize)
inds = DataLayouts.universal_size(us)
CI = CartesianIndices(map(x -> Base.OneTo(x), inds))
return CI
end
@inline linear_is_valid_index(i::Integer, us::UniversalSize) =
1 ≤ i ≤ DataLayouts.get_N(us)
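
A quick CPU-side sketch of how the linear partition covers all items (made-up sizes):

nitems = 10_000                        # assumed total item count
n_max_threads = 256                    # assumed max threads per block
threads = min(nitems, n_max_threads)   # -> 256 threads per block
blocks = cld(nitems, threads)          # -> 40 blocks
@assert threads * blocks >= nitems     # every item is covered

The final block may overshoot `nitems`, which is why each linear index is guarded by `linear_is_valid_index`.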

##### Column-wise
@inline function columnwise_partition(
us::DataLayouts.UniversalSize,