Commit

Merge branch 'main' into he/gitignore-vscode
Sbozzolo authored Oct 22, 2024
2 parents c142cf9 + 96bb8f4 commit 066b459
Showing 15 changed files with 437 additions and 148 deletions.
4 changes: 4 additions & 0 deletions .buildkite/pipeline.yml
@@ -86,6 +86,10 @@ steps:
key: unit_data_copyto
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_copyto.jl"

- label: "Unit: cartesian_field_index"
key: unit_data_cartesian_field_index
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_cartesian_field_index.jl"

- label: "Unit: mapreduce"
key: unit_data_mapreduce
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_mapreduce.jl"
38 changes: 15 additions & 23 deletions ext/cuda/data_layouts_mapreduce.jl
@@ -1,3 +1,4 @@
import ClimaCore.DataLayouts: AbstractDataSingleton
# To implement a single flexible mapreduce, let's define
# a `OnesArray` that stores no data and always returns 1:
struct OnesArray{T, N} <: AbstractArray{T, N} end
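
Aside: the `OnesArray` trick lets the weighted and unweighted reductions share one kernel, since unit weights leave a reduction unchanged. A minimal stand-alone sketch of the idea (a plain `ones` vector stands in for `OnesArray`; values are illustrative):

data = rand(10)
wt = ones(10)                        # stand-in for a OnesArray
@assert sum(data .* wt) ≈ sum(data)  # unit weights change nothing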
@@ -38,6 +39,8 @@ function mapreduce_cuda(
n_ops_on_load = cld(nitems, nthreads) == 1 ? 0 : 7
effective_blksize = nthreads * (n_ops_on_load + 1)
nblocks = cld(nitems, effective_blksize)
s = DataLayouts.singleton(data)
us = DataLayouts.UniversalSize(data)

reduce_cuda = CuArray{T}(undef, nblocks, Nf)
shmemsize = nthreads
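
For intuition, the launch-size arithmetic above works out as in this sketch (`nitems` and `nthreads` are assumed, illustrative values):

nitems = 5_000                                       # assumed item count
nthreads = 256                                       # assumed threads per block
n_ops_on_load = cld(nitems, nthreads) == 1 ? 0 : 7   # -> 7
effective_blksize = nthreads * (n_ops_on_load + 1)   # -> 2048 items per block
nblocks = cld(nitems, effective_blksize)             # -> 3 blocks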
@@ -49,6 +52,8 @@
pdata,
pwt,
n_ops_on_load,
s,
us,
Val(shmemsize),
)
# reduce block data
@@ -71,19 +76,22 @@ function mapreduce_cuda_kernel!(
pdata::AbstractArray{T, N},
pwt::AbstractArray{T, N},
n_ops_on_load::Int,
s::AbstractDataSingleton,
us::DataLayouts.UniversalSize,
::Val{shmemsize},
) where {T, N, shmemsize}
blksize = blockDim().x
nblk = gridDim().x
tidx = threadIdx().x
bidx = blockIdx().x
fidx = blockIdx().y
dataview = _dataview(pdata, fidx)
dataview = _dataview(pdata, s, fidx)
effective_blksize = blksize * (n_ops_on_load + 1)
gidx = _get_gidx(tidx, bidx, effective_blksize)
reduction = CUDA.CuStaticSharedArray(T, shmemsize)
reduction[tidx] = 0
(Nv, Nij, Nf, Nh) = _get_dims(dataview)
(Nij, _, _, Nv, Nh) = DataLayouts.universal_size(us)
Nf = 1 # a view at a single field index `fidx` always has Nf = 1
nitems = Nv * Nij * Nij * Nf * Nh

# load shmem
@@ -107,29 +115,13 @@ end
@inline function _get_gidx(tidx, bidx, effective_blksize)
return tidx + (bidx - 1) * effective_blksize
end
# for VF DataLayout
@inline function _get_dims(pdata::AbstractArray{FT, 2}) where {FT}
(Nv, Nf) = size(pdata)
return (Nv, 1, Nf, 1)
end
@inline _dataview(pdata::AbstractArray{FT, 2}, fidx) where {FT} =
view(pdata, :, fidx:fidx)

# for IJFH DataLayout
@inline function _get_dims(pdata::AbstractArray{FT, 4}) where {FT}
(Nij, _, Nf, Nh) = size(pdata)
return (1, Nij, Nf, Nh)
end
@inline _dataview(pdata::AbstractArray{FT, 4}, fidx) where {FT} =
view(pdata, :, :, fidx:fidx, :)

# for VIJFH DataLayout
@inline function _get_dims(pdata::AbstractArray{FT, 5}) where {FT}
(Nv, Nij, _, Nf, Nh) = size(pdata)
return (Nv, Nij, Nf, Nh)
@inline function _dataview(pdata::AbstractArray, s::AbstractDataSingleton, fidx)
fdim = DataLayouts.field_dim(s)
Ipre = ntuple(i -> Colon(), Val(fdim - 1))
Ipost = ntuple(i -> Colon(), Val(ndims(pdata) - fdim))
return @inbounds view(pdata, Ipre..., fidx:fidx, Ipost...)
end
@inline _dataview(pdata::AbstractArray{FT, 5}, fidx) where {FT} =
view(pdata, :, :, :, fidx:fidx, :)
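
The dimension-generic `_dataview` above replaces the three per-layout methods by taking the field dimension from the singleton. A stand-alone sketch of the same indexing trick (the array shape and `fdim` are assumptions for a VIJFH-like layout):

pdata = rand(4, 3, 3, 2, 8)               # assumed (Nv, Ni, Nj, Nf, Nh) shape
fdim = 4                                  # assumed field dimension
fidx = 2                                  # field to select
Ipre = ntuple(_ -> Colon(), fdim - 1)
Ipost = ntuple(_ -> Colon(), ndims(pdata) - fdim)
fview = view(pdata, Ipre..., fidx:fidx, Ipost...)
@assert size(fview) == (4, 3, 3, 1, 8)    # field dim collapses to size 1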

@inline function _cuda_reduce!(op, reduction, tidx, reduction_size, N)
if reduction_size > N
14 changes: 14 additions & 0 deletions ext/cuda/data_layouts_threadblock.jl
@@ -170,6 +170,20 @@ end
##### Custom partitions
#####

##### linear partition
@inline function linear_partition(nitems::Integer, n_max_threads::Integer)
threads = min(nitems, n_max_threads)
blocks = cld(nitems, threads)
return (; threads, blocks)
end
@inline function linear_universal_index(us::UniversalSize)
inds = DataLayouts.universal_size(us)
CI = CartesianIndices(map(x -> Base.OneTo(x), inds))
return CI
end
@inline linear_is_valid_index(i::Integer, us::UniversalSize) =
1 ≤ i ≤ DataLayouts.get_N(us)
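
A quick CPU-side sketch of how the linear partition covers all items (made-up sizes):

nitems = 10_000                        # assumed total item count
n_max_threads = 256                    # assumed max threads per block
threads = min(nitems, n_max_threads)   # -> 256 threads per block
blocks = cld(nitems, threads)          # -> 40 blocks
@assert threads * blocks >= nitems     # every item is covered

The final block may overshoot `nitems`, which is why each linear index is guarded by `linear_is_valid_index`.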

##### Column-wise
@inline function columnwise_partition(
us::DataLayouts.UniversalSize,