Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Drop field dimension, support linear indexing and empty fields #1953

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -478,15 +478,15 @@ steps:
key: unit_field
command:
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/unit_field.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
# - "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/convergence_field_integrals.jl"

- label: "Unit: field cuda"
key: unit_field_cuda
command:
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/unit_field.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
# - "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/convergence_field_integrals.jl"
env:
CLIMACOMMS_DEVICE: "CUDA"
Expand Down Expand Up @@ -727,7 +727,7 @@ steps:
command:
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/unit_thomas_algorithm.jl"

- label: "Unit: Thomas Algorithm"
- label: "Unit: Thomas Algorithm (CUDA)"
key: "gpu_thomas_algorithm"
command:
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/unit_thomas_algorithm.jl"
Expand Down
24 changes: 13 additions & 11 deletions benchmarks/scripts/thermo_bench_bw.jl
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,8 @@ using Test
)
x = fill((; ts = zero(TBB.PhaseEquil{FT}), nt_core...), cspace)
xv = fill((; ts = nt_ts, nt_core...), cspace)
(_, Nij, _, Nv, Nh) = size(Fields.field_values(x.ts))
fv_ts = Fields.field_values(x.ts)
(_, Nij, _, Nv, Nh) = size(fv_ts)
us = TBB.UniversalSizesStatic(Nv, Nij, Nh)
function to_vec(ξ)
pns = propertynames(ξ)
Expand All @@ -191,7 +192,7 @@ using Test
end
return (; zip(propertynames(ξ), dl_vals)...)
end
x_vec = to_vec(xv)
# x_vec = to_vec(xv)

x_aos = fill((; ρ_read = FT(0), ρ_write = FT(0)), cspace)
x_soa = (;
Expand All @@ -204,20 +205,21 @@ using Test
@. x_aos.ρ_write = 7
TBB.singlefield_bc!(x_soa, us; nreps=1, n_trials = 1)
TBB.singlefield_bc!(x_aos, us; nreps=1, n_trials = 1)

TBB.thermo_func_bc!(x, us; nreps=1, n_trials = 1)
TBB.thermo_func_sol!(x_vec, us; nreps=1, n_trials = 1)
# TBB.thermo_func_sol!(x_vec, us; nreps=1, n_trials = 1)

rc = Fields.rcompare(x_vec, to_vec(x))
rc || Fields.@rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
@test rc # test correctness
# rc = Fields.rcompare(x_vec, to_vec(x))
# rc || Fields.@rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
# @test rc # test correctness

TBB.singlefield_bc!(x_soa, us; nreps=100, bm)
TBB.singlefield_bc!(x_aos, us; nreps=100, bm)
# TBB.singlefield_bc!(x_soa, us; nreps=100, bm)
# TBB.singlefield_bc!(x_aos, us; nreps=100, bm)
TBB.thermo_func_bc!(x, us; nreps=100, bm)
TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)
@info "Success!"
# TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)

TBB.tabulate_benchmark(bm)

end
# end
#! format: on
2 changes: 2 additions & 0 deletions ext/ClimaCoreCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import ClimaCore.Utilities: cart_ind, linear_ind
import ClimaCore.RecursiveApply:
⊠, ⊞, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
import ClimaCore.DataLayouts: universal_size
import ClimaCore.DataLayouts: ArraySize
import ClimaCore.DataLayouts: UniversalSize

include(joinpath("cuda", "cuda_utils.jl"))
Expand Down
69 changes: 69 additions & 0 deletions ext/cuda/data_layouts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@ import CUDA
parent_array_type(::Type{<:CUDA.CuArray{T, N, B} where {N}}) where {T, B} =
CUDA.CuArray{T, N, B} where {N}

# Can we remove this?
# parent_array_type(
# ::Type{<:CUDA.CuArray{T, N, B} where {N}},
# ::Val{ND},
# ) where {T, B, ND} = CUDA.CuArray{T, ND, B}

parent_array_type(
::Type{<:CUDA.CuArray{T, N, B} where {N}},
as::ArraySize,
) where {T, B} = CUDA.CuArray{T, ndims(as), B}

# Ensure that both parent array types have the same memory buffer type.
promote_parent_array_type(
::Type{CUDA.CuArray{T1, N, B} where {N}},
Expand Down Expand Up @@ -54,3 +65,61 @@ function Adapt.adapt_structure(
end,
)
end

import Adapt
import CUDA
function Adapt.adapt_structure(
to::CUDA.KernelAdaptor,
bc::DataLayouts.NonExtrudedBroadcasted{Style},
) where {Style}
DataLayouts.NonExtrudedBroadcasted{Style}(
adapt_f(to, bc.f),
Adapt.adapt(to, bc.args),
Adapt.adapt(to, bc.axes),
)
end

import ClimaCore.DataLayouts as DL
import CUDA
function CUDA.CuArray(fa::DL.FieldArray{FD}) where {FD}
arrays = ntuple(Val(DL.ncomponents(fa))) do f
CUDA.CuArray(fa.arrays[f])
end
return DL.FieldArray{FD}(arrays)
end

DL.field_array(array::CUDA.CuArray, as::ArraySize) =
CUDA.CuArray(DL.field_array(Array(array), as))


# TODO: this could be improved, but it's not typically used at runtime
function copyto_field_array_knl!(x::DL.FieldArray{FD}, y) where {FD}
gidx =
CUDA.threadIdx().x + (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x
I = cart_ind(size(y), gidx)
x[I] = y[I]
return nothing
end

@inline function Base.copyto!(
x::DL.FieldArray{FD, NT},
y::CUDA.CuArray,
) where {FD, NT <: NTuple}
if ndims(eltype(NT)) == ndims(y)
@inbounds for i in 1:DL.tuple_length(NT)
Base.copyto!(x.arrays[i], y)
end
elseif ndims(eltype(NT)) + 1 == ndims(y)
n = prod(size(y))
kernel =
CUDA.@cuda always_inline = true launch = false copyto_field_array_knl!(
x,
y,
)
config = CUDA.launch_configuration(kernel.fun)
threads = min(n, config.threads)
blocks = cld(n, threads)
kernel(x, y; threads, blocks)
end
x
end
84 changes: 70 additions & 14 deletions ext/cuda/data_layouts_copyto.jl
Original file line number Diff line number Diff line change
@@ -1,27 +1,83 @@
import ClimaCore.DataLayouts:
to_non_extruded_broadcasted, has_uniform_datalayouts
DataLayouts._device_dispatch(x::CUDA.CuArray) = ToCUDA()

function knl_copyto!(dest, src, us)
I = universal_index(dest)
if is_valid_index(dest, I, us)
@inbounds dest[I] = src[I]
# function Base.copyto!(
# dest::VIJFH{S, Nv, Nij, Nh},
# bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
# ::ToCUDA,
# ) where {S, Nv, Nij, Nh}
# if Nv > 0 && Nh > 0
# us = DataLayouts.UniversalSize(dest)
# n = prod(DataLayouts.universal_size(us))
# if has_uniform_datalayouts(bc)
# bc′ = to_non_extruded_broadcasted(bc)
# auto_launch!(knl_copyto_linear!, (dest, bc′, us), n; auto = true)
# else
# auto_launch!(knl_copyto_cart!, (dest, bc, us), n; auto = true)
# end
# end
# return dest
# end
function knl_copyto_linear!(dest::AbstractData, bc, us)
@inbounds begin
tidx = thread_index()
if tidx ≤ get_N(us)
dest[tidx] = bc[tidx]
end
end
return nothing
end

function knl_copyto_linear!(dest::DataF{S},bc,us) where {S}
@inbounds begin
tidx = thread_index()
if tidx ≤ get_N(us)
dest[] = bc[tidx]
end
end
return nothing
end

function knl_copyto_flat!(dest::AbstractData, bc, us)
@inbounds begin
tidx = thread_index()
if tidx ≤ get_N(us)
n = size(dest)
I = kernel_indexes(tidx, n)
dest[I] = bc[I]
end
end
return nothing
end

function knl_copyto_flat!(
dest::DataF{S},
bc::DataLayouts.BroadcastedUnionDataF{S},
us,
) where {S}
@inbounds begin
tidx = thread_index()
if tidx ≤ get_N(us)
n = size(dest)
# I = kernel_indexes(tidx, n)
dest[] = bc[]
end
end
return nothing
end

function cuda_copyto!(dest::AbstractData, bc)
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
us = DataLayouts.UniversalSize(dest)
n = prod(DataLayouts.universal_size(us))
if Nv > 0 && Nh > 0
args = (dest, bc, us)
threads = threads_via_occupancy(knl_copyto!, args)
n_max_threads = min(threads, get_N(us))
p = partition(dest, n_max_threads)
auto_launch!(
knl_copyto!,
args;
threads_s = p.threads,
blocks_s = p.blocks,
)
if has_uniform_datalayouts(bc)
bc′ = to_non_extruded_broadcasted(bc)
auto_launch!(knl_copyto_linear!, (dest, bc′, us), n; auto = true)
else
auto_launch!(knl_copyto_flat!, (dest, bc, us), n; auto = true)
end
end
return dest
end
Expand Down
23 changes: 13 additions & 10 deletions ext/cuda/topologies_dss.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ function dss_load_perimeter_data_kernel!(
if gidx ≤ prod(sizep)
(level, p, fidx, elem) = cart_ind(sizep, gidx).I
(ip, jp) = perimeter[p]
data_idx = linear_ind(sized, (level, ip, jp, fidx, elem))
pperimeter_data[level, p, fidx, elem] = pdata[data_idx]
data_idx = linear_ind(sized, (level, ip, jp, elem))
pperimeter_data.arrays[fidx][level, p, elem] =
pdata.arrays[fidx][data_idx]
end
return nothing
end
Expand Down Expand Up @@ -89,7 +90,8 @@ function dss_unload_perimeter_data_kernel!(
(level, p, fidx, elem) = cart_ind(sizep, gidx).I
(ip, jp) = perimeter[p]
data_idx = linear_ind(sized, (level, ip, jp, fidx, elem))
pdata[data_idx] = pperimeter_data[level, p, fidx, elem]
pdata.arrays[fidx][data_idx] =
pperimeter_data.arrays[fidx][level, p, elem]
end
return nothing
end
Expand Down Expand Up @@ -148,12 +150,12 @@ function dss_local_kernel!(
for idx in st:(en - 1)
(lidx, vert) = local_vertices[idx]
ip = perimeter_vertex_node_index(vert)
sum_data += pperimeter_data[level, ip, fidx, lidx]
sum_data += pperimeter_data.arrays[fidx][level, ip, lidx]
end
for idx in st:(en - 1)
(lidx, vert) = local_vertices[idx]
ip = perimeter_vertex_node_index(vert)
pperimeter_data[level, ip, fidx, lidx] = sum_data
pperimeter_data.arrays[fidx][level, ip, lidx] = sum_data
end
elseif gidx ≤ nlevels * nfidx * (nlocalvertices + nlocalfaces) # interior faces
nfacedof = div(nperimeter - 4, 4)
Expand All @@ -169,10 +171,10 @@ function dss_local_kernel!(
ip1 = inc1 == 1 ? first1 + i - 1 : first1 - i + 1
ip2 = inc2 == 1 ? first2 + i - 1 : first2 - i + 1
val =
pperimeter_data[level, ip1, fidx, lidx1] +
pperimeter_data[level, ip2, fidx, lidx2]
pperimeter_data[level, ip1, fidx, lidx1] = val
pperimeter_data[level, ip2, fidx, lidx2] = val
pperimeter_data.arrays[fidx][level, ip1, lidx1] +
pperimeter_data.arrays[fidx][level, ip2, lidx2]
pperimeter_data.arrays[fidx][level, ip1, lidx1] = val
pperimeter_data.arrays[fidx][level, ip2, lidx2] = val
end
end

Expand Down Expand Up @@ -456,7 +458,8 @@ function load_from_recv_buffer_kernel!(
lidx = recv_buf_idx[irecv, 1]
ip = recv_buf_idx[irecv, 2]
idx = level + ((fidx - 1) + (irecv - 1) * nfid) * nlevels
CUDA.@atomic pperimeter_data[level, ip, fidx, lidx] += recv_data[idx]
CUDA.@atomic pperimeter_data.arrays[fidx][level, ip, lidx] +=
recv_data[idx]
end
return nothing
end
Expand Down
Loading
Loading