Skip to content

Commit

Permalink
Add HF datalayouts
Browse files Browse the repository at this point in the history
Fix some bugs

Widen benchmarks, fix CPU-GPU dispatch

Fix code loading

Specialize in gpu pointwise kernels

Fix adapt call

Ensure has_uniform_datalayouts for cuda copyto

Dont limit recursion in get_struct_linear

Bump allocation limit, add mem to benchmark stencil job

Specialize HF for fused kernels

Simplify n_dofs

Add some docs and change HorizontalLayout to horizontal_layout_type

Apply formatter

Revert n_ndofs simplification

Add some unit tests, and fix empty field edge cases

Add some comments, use todata for some broadcasted objects

Forward todata, extend onExtrudedBroadcasted

Apply formatter

Use more non-specific DimensionMismatch error

Add a new unit test

Add more unit tests with edge case fixes

Fix more edge cases
  • Loading branch information
charleskawczynski committed Oct 26, 2024
1 parent 7297f5d commit c450698
Show file tree
Hide file tree
Showing 51 changed files with 2,471 additions and 292 deletions.
31 changes: 31 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ steps:
key: unit_data_cartesian_field_index
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_cartesian_field_index.jl"

- label: "Unit: non_extruded_broadcast"
key: unit_non_extruded_broadcast
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_non_extruded_broadcast.jl"

- label: "Unit: mapreduce"
key: unit_data_mapreduce
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_mapreduce.jl"
Expand Down Expand Up @@ -1352,6 +1356,8 @@ steps:
- label: "Perf: FD operator stencil benchmarks"
key: "perf_fd_ops"
command: "julia --color=yes --project=.buildkite test/Operators/finitedifference/benchmark_stencils.jl"
agents:
slurm_mem: 20GB

- label: "Perf: GPU FD operator stencil benchmarks"
key: "gpu_perf_fd_ops"
Expand Down Expand Up @@ -1601,6 +1607,20 @@ steps:
agents:
slurm_gpus: 1

- label: ":computer: Float 32 3D sphere baroclinic wave (ρe) HF datalayout GPU"
key: "gpu_baroclinic_wave_rho_e_float32_hf"
command:
- "julia --color=yes --project=.buildkite examples/hybrid/driver.jl"
artifact_paths:
- "examples/hybrid/sphere/output/baroclinic_wave_rhoe_hf/Float32/*"
env:
TEST_NAME: "sphere/baroclinic_wave_rhoe_hf"
FLOAT_TYPE: "Float32"
horizontal_layout_type: "IJHF"
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus: 1

- label: ":computer: 3D Box limiters advection slotted spheres"
key: "cpu_box_advection_limiter_slotted_spheres"
command:
Expand Down Expand Up @@ -1870,6 +1890,17 @@ steps:
TEST_NAME: "sphere/baroclinic_wave_rhoe"
FLOAT_TYPE: "Float64"

- label: ":computer: Float 64 3D sphere baroclinic wave (ρe) HF datalayout"
key: "cpu_baroclinic_wave_rho_e_float64_hf"
command:
- "julia --color=yes --project=.buildkite examples/hybrid/driver.jl"
artifact_paths:
- "examples/hybrid/sphere/output/baroclinic_wave_rhoe_hf/Float64/*"
env:
TEST_NAME: "sphere/baroclinic_wave_rhoe_hf"
FLOAT_TYPE: "Float64"
horizontal_layout_type: "IJHF"

- label: ":computer: 3D sphere baroclinic wave (ρe)"
key: "cpu_baroclinic_wave_rho_e"
command:
Expand Down
6 changes: 5 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,15 @@ ClimaCore.jl Release Notes
main
-------

- Fixed world-age issue on Julia 1.11 issue [Julia#54780](https://github.com/JuliaLang/julia/issues/54780), PR [#2034](https://github.com/CliMA/ClimaCore.jl/pull/2034).
- We've added new datalayouts: `VIJHF`,`IJHF`,`IHF`,`VIHF`, to explore their performance compared to our existing datalayouts: `VIJFH`,`IJFH`,`IFH`,`VIFH`. PR [#2055](https://github.com/CliMA/ClimaCore.jl/pull/2053), PR [#2052](https://github.com/CliMA/ClimaCore.jl/pull/2055).
- We've refactored some modules to use less internals. PR [#2053](https://github.com/CliMA/ClimaCore.jl/pull/2053), PR [#2052](https://github.com/CliMA/ClimaCore.jl/pull/2052), [#2051](https://github.com/CliMA/ClimaCore.jl/pull/2051), [#2049](https://github.com/CliMA/ClimaCore.jl/pull/2049).
- Some work was done in attempt to reduce specializations and compile time. PR [#2042](https://github.com/CliMA/ClimaCore.jl/pull/2042), [#2041](https://github.com/CliMA/ClimaCore.jl/pull/2041)

v0.14.19
-------

- Fixed world-age issue on Julia 1.11 issue [Julia#54780](https://github.com/JuliaLang/julia/issues/54780), PR [#2034](https://github.com/CliMA/ClimaCore.jl/pull/2034).

### ![][badge-🐛bugfix] Fix undefined behavior in `DataLayout`s

PR [#2034](https://github.com/CliMA/ClimaCore.jl/pull/2034) fixes some undefined
Expand Down
4 changes: 4 additions & 0 deletions docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ DataLayouts.IFH
DataLayouts.IJFH
DataLayouts.VIFH
DataLayouts.VIJFH
DataLayouts.IHF
DataLayouts.IJHF
DataLayouts.VIHF
DataLayouts.VIJHF
```

## Geometry
Expand Down
14 changes: 12 additions & 2 deletions examples/common_spaces.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,19 @@ function make_horizontal_space(
mesh,
npoly,
context::ClimaComms.SingletonCommsContext,
horizontal_layout_type = DataLayouts.IJFH,
)
quad = Quadratures.GLL{npoly + 1}()
if mesh isa Meshes.AbstractMesh1D
topology = Topologies.IntervalTopology(ClimaComms.device(context), mesh)
space = Spaces.SpectralElementSpace1D(topology, quad)
elseif mesh isa Meshes.AbstractMesh2D
topology = Topologies.Topology2D(context, mesh)
space = Spaces.SpectralElementSpace2D(topology, quad)
space = Spaces.SpectralElementSpace2D(
topology,
quad;
horizontal_layout_type,
)
end
return space
end
Expand All @@ -51,13 +56,18 @@ function make_horizontal_space(
mesh,
npoly,
comms_ctx::ClimaComms.MPICommsContext,
horizontal_layout_type = DataLayouts.IJFH,
)
quad = Quadratures.GLL{npoly + 1}()
if mesh isa Meshes.AbstractMesh1D
error("Distributed mode does not work with 1D horizontal spaces.")
elseif mesh isa Meshes.AbstractMesh2D
topology = Topologies.Topology2D(comms_ctx, mesh)
space = Spaces.SpectralElementSpace2D(topology, quad)
space = Spaces.SpectralElementSpace2D(
topology,
quad;
horizontal_layout_type,
)
end
return space
end
Expand Down
18 changes: 16 additions & 2 deletions examples/hybrid/driver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ ClimaComms.@import_required_backends
import SciMLBase
const comms_ctx = ClimaComms.context()
is_distributed = comms_ctx isa ClimaComms.MPICommsContext
using ClimaCore: DataLayouts

using Logging

Expand Down Expand Up @@ -91,7 +92,17 @@ if haskey(ENV, "RESTART_FILE")
ᶠlocal_geometry = Fields.local_geometry_field(Y.f)
else
t_start = FT(0)
h_space = make_horizontal_space(horizontal_mesh, npoly, comms_ctx)
horizontal_layout_types = Dict()
horizontal_layout_types["IJFH"] = DataLayouts.IJFH
horizontal_layout_types["IJHF"] = DataLayouts.IJHF
horizontal_layout_type =
horizontal_layout_types[get(ENV, "horizontal_layout_type", "IJFH")]
h_space = make_horizontal_space(
horizontal_mesh,
npoly,
comms_ctx,
horizontal_layout_type,
)
center_space, face_space =
make_hybrid_spaces(h_space, z_max, z_elem; z_stretch)
ᶜlocal_geometry = Fields.local_geometry_field(center_space)
Expand Down Expand Up @@ -231,5 +242,8 @@ end
if !is_distributed || ClimaComms.iamroot(comms_ctx)
println("Walltime = $walltime seconds")
ENV["GKSwstype"] = "nul" # avoid displaying plots
postprocessing(sol, output_dir)
# https://github.com/CliMA/ClimaCore.jl/issues/2058
if !(Fields.field_values(sol.u[1].c) isa DataLayouts.VIJHF)
postprocessing(sol, output_dir)
end
end
40 changes: 40 additions & 0 deletions examples/hybrid/sphere/baroclinic_wave_rhoe_hf.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
using ClimaCorePlots, Plots
using ClimaCore.DataLayouts

include("baroclinic_wave_utilities.jl")

const sponge = false

# Variables required for driver.jl (modify as needed)
horizontal_mesh = cubed_sphere_mesh(; radius = R, h_elem = 4)
npoly = 4
z_max = FT(30e3)
z_elem = 10
t_end = FT(60 * 60 * 24 * 10)
dt = FT(400)
dt_save_to_sol = FT(60 * 60 * 24)
dt_save_to_disk = FT(0) # 0 means don't save to disk
ode_algorithm = CTS.SSP333
jacobian_flags = (; ∂ᶜ𝔼ₜ∂ᶠ𝕄_mode = :no_∂ᶜp∂ᶜK, ∂ᶠ𝕄ₜ∂ᶜρ_mode = :exact)

additional_cache(ᶜlocal_geometry, ᶠlocal_geometry, dt) = merge(
hyperdiffusion_cache(ᶜlocal_geometry, ᶠlocal_geometry; κ₄ = FT(2e17)),
sponge ? rayleigh_sponge_cache(ᶜlocal_geometry, ᶠlocal_geometry, dt) : (;),
)
function additional_tendency!(Yₜ, Y, p, t)
hyperdiffusion_tendency!(Yₜ, Y, p, t)
sponge && rayleigh_sponge_tendency!(Yₜ, Y, p, t)
end

center_initial_condition(local_geometry) =
center_initial_condition(local_geometry, Val(:ρe))
function postprocessing(sol, output_dir)
@info "L₂ norm of ρe at t = $(sol.t[1]): $(norm(sol.u[1].c.ρe))"
@info "L₂ norm of ρe at t = $(sol.t[end]): $(norm(sol.u[end].c.ρe))"

anim = Plots.@animate for Y in sol.u
ᶜv = Geometry.UVVector.(Y.c.uₕ).components.data.:2
Plots.plot(ᶜv, level = 3, clim = (-6, 6))
end
Plots.mp4(anim, joinpath(output_dir, "v.mp4"), fps = 5)
end
35 changes: 19 additions & 16 deletions ext/cuda/data_layouts.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@

import ClimaCore.DataLayouts: AbstractData
import ClimaCore.DataLayouts: FusedMultiBroadcast
import ClimaCore.DataLayouts: IJKFVH, IJFH, VIJFH, VIFH, IFH, IJF, IF, VF, DataF
import ClimaCore.DataLayouts:
IJKFVH, IJFH, IJHF, VIJFH, VIJHF, VIFH, VIHF, IFH, IHF, IJF, IF, VF, DataF
import ClimaCore.DataLayouts: IJFHStyle, VIJFHStyle, VFStyle, DataFStyle
import ClimaCore.DataLayouts: IJHFStyle, VIJHFStyle
import ClimaCore.DataLayouts: promote_parent_array_type
import ClimaCore.DataLayouts: parent_array_type
import ClimaCore.DataLayouts: isascalar
Expand Down Expand Up @@ -34,23 +36,24 @@ include("data_layouts_threadblock.jl")
adapt_f(to, f::F) where {F} = Adapt.adapt(to, f)
adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...)

function Adapt.adapt_structure(
to::CUDA.KernelAdaptor,
bc::DataLayouts.NonExtrudedBroadcasted{Style},
) where {Style}
DataLayouts.NonExtrudedBroadcasted{Style}(
adapt_f(to, bc.f),
Adapt.adapt(to, bc.args),
Adapt.adapt(to, bc.axes),
)
end

function Adapt.adapt_structure(
to::CUDA.KernelAdaptor,
fmbc::FusedMultiBroadcast,
)
FusedMultiBroadcast(
map(fmbc.pairs) do pair
dest = pair.first
bc = pair.second
Pair(
Adapt.adapt(to, dest),
Base.Broadcast.Broadcasted(
bc.style,
adapt_f(to, bc.f),
Adapt.adapt(to, bc.args),
Adapt.adapt(to, bc.axes),
),
)
end,
)
FusedMultiBroadcast(map(fmbc.pairs) do pair
dest = pair.first
bc = pair.second
Pair(Adapt.adapt(to, dest), Adapt.adapt(to, bc))
end)
end
82 changes: 67 additions & 15 deletions ext/cuda/data_layouts_copyto.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,74 @@ function knl_copyto!(dest, src, us)
return nothing
end

function Base.copyto!(dest::AbstractData, bc, ::ToCUDA)
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
us = DataLayouts.UniversalSize(dest)
if Nv > 0 && Nh > 0
args = (dest, bc, us)
threads = threads_via_occupancy(knl_copyto!, args)
n_max_threads = min(threads, get_N(us))
p = partition(dest, n_max_threads)
auto_launch!(
knl_copyto!,
args;
threads_s = p.threads,
blocks_s = p.blocks,
)
function knl_copyto_linear!(dest, src, us)
i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
if linear_is_valid_index(i, us)
@inbounds dest[i] = src[i]
end
return nothing
end

if VERSION v"1.11.0-beta"
# https://github.com/JuliaLang/julia/issues/56295
# Julia 1.11's Base.Broadcast currently requires
# multiple integer indexing, wheras Julia 1.10 did not.
# This means that we cannot reserve linear indexing to
# special-case fixes for https://github.com/JuliaLang/julia/issues/28126
# (including the GPU-variant related issue resolution efforts:
# JuliaGPU/GPUArrays.jl#454, JuliaGPU/GPUArrays.jl#464).
function Base.copyto!(dest::AbstractData, bc, ::ToCUDA)
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
us = DataLayouts.UniversalSize(dest)
if Nv > 0 && Nh > 0
args = (dest, bc, us)
threads = threads_via_occupancy(knl_copyto!, args)
n_max_threads = min(threads, get_N(us))
p = partition(dest, n_max_threads)
auto_launch!(
knl_copyto!,
args;
threads_s = p.threads,
blocks_s = p.blocks,
)
end
return dest
end
else
function Base.copyto!(dest::AbstractData, bc, ::ToCUDA)
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
us = DataLayouts.UniversalSize(dest)
if Nv > 0 && Nh > 0
if DataLayouts.has_uniform_datalayouts(bc) &&
dest isa DataLayouts.EndsWithField
bc′ = Base.Broadcast.instantiate(
DataLayouts.to_non_extruded_broadcasted(bc),
)
args = (dest, bc′, us)
threads = threads_via_occupancy(knl_copyto_linear!, args)
n_max_threads = min(threads, get_N(us))
p = linear_partition(prod(size(dest)), n_max_threads)
auto_launch!(
knl_copyto_linear!,
args;
threads_s = p.threads,
blocks_s = p.blocks,
)
else
args = (dest, bc, us)
threads = threads_via_occupancy(knl_copyto!, args)
n_max_threads = min(threads, get_N(us))
p = partition(dest, n_max_threads)
auto_launch!(
knl_copyto!,
args;
threads_s = p.threads,
blocks_s = p.blocks,
)
end
end
return dest
end
return dest
end

# broadcasting scalar assignment
Expand Down
40 changes: 30 additions & 10 deletions ext/cuda/data_layouts_fill.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,40 @@ function knl_fill!(dest, val, us)
return nothing
end

function knl_fill_linear!(dest, val, us)
i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
if linear_is_valid_index(i, us)
@inbounds dest[i] = val
end
return nothing
end

function Base.fill!(dest::AbstractData, bc, ::ToCUDA)
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
us = DataLayouts.UniversalSize(dest)
args = (dest, bc, us)
if Nv > 0 && Nh > 0
args = (dest, bc, us)
threads = threads_via_occupancy(knl_fill!, args)
n_max_threads = min(threads, get_N(us))
p = partition(dest, n_max_threads)
auto_launch!(
knl_fill!,
args;
threads_s = p.threads,
blocks_s = p.blocks,
)
if !(VERSION v"1.11.0-beta") && dest isa DataLayouts.EndsWithField
threads = threads_via_occupancy(knl_fill_linear!, args)
n_max_threads = min(threads, get_N(us))
p = linear_partition(prod(size(dest)), n_max_threads)
auto_launch!(
knl_fill_linear!,
args;
threads_s = p.threads,
blocks_s = p.blocks,
)
else
threads = threads_via_occupancy(knl_fill!, args)
n_max_threads = min(threads, get_N(us))
p = partition(dest, n_max_threads)
auto_launch!(
knl_fill!,
args;
threads_s = p.threads,
blocks_s = p.blocks,
)
end
end
return dest
end
Loading

0 comments on commit c450698

Please sign in to comment.