Add HF datalayouts

Fix some bugs Widen benchmarks, fix CPU-GPU dispatch Fix code loading Specialize in gpu pointwise kernels Fix adapt call Ensure has_uniform_datalayouts for cuda copyto Dont limit recursion in get_struct_linear Bump allocation limit, add mem to benchmark stencil job Specialize HF for fused kernels Simplify n_dofs Add some docs and change HorizontalLayout to horizontal_layout_type Apply formatter Revert n_ndofs simplification Add some unit tests, and fix empty field edge cases Add some comments, use todata for some broadcasted objects Forward todata, extend onExtrudedBroadcasted Apply formatter Use more non-specific DimensionMismatch error Add a new unit test Add more unit tests with edge case fixes Fix more edge cases
CliMA · Oct 26, 2024 · c450698 · c450698
1 parent 7297f5d
commit c450698
Show file tree

Hide file tree

Showing 51 changed files with 2,471 additions and 292 deletions.
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -90,6 +90,10 @@ steps:
         key: unit_data_cartesian_field_index
         command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_cartesian_field_index.jl"
 
+      - label: "Unit: non_extruded_broadcast"
+        key: unit_non_extruded_broadcast
+        command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_non_extruded_broadcast.jl"
+
       - label: "Unit: mapreduce"
         key: unit_data_mapreduce
         command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_mapreduce.jl"
@@ -1352,6 +1356,8 @@ steps:
       - label: "Perf: FD operator stencil benchmarks"
         key: "perf_fd_ops"
         command: "julia --color=yes --project=.buildkite test/Operators/finitedifference/benchmark_stencils.jl"
+        agents:
+          slurm_mem: 20GB
 
       - label: "Perf: GPU FD operator stencil benchmarks"
         key: "gpu_perf_fd_ops"
@@ -1601,6 +1607,20 @@ steps:
         agents:
           slurm_gpus: 1
 
+      - label: ":computer: Float 32 3D sphere baroclinic wave (ρe) HF datalayout GPU"
+        key: "gpu_baroclinic_wave_rho_e_float32_hf"
+        command:
+          - "julia --color=yes --project=.buildkite examples/hybrid/driver.jl"
+        artifact_paths:
+          - "examples/hybrid/sphere/output/baroclinic_wave_rhoe_hf/Float32/*"
+        env:
+          TEST_NAME: "sphere/baroclinic_wave_rhoe_hf"
+          FLOAT_TYPE: "Float32"
+          horizontal_layout_type: "IJHF"
+          CLIMACOMMS_DEVICE: "CUDA"
+        agents:
+          slurm_gpus: 1
+
       - label: ":computer: 3D Box limiters advection slotted spheres"
         key: "cpu_box_advection_limiter_slotted_spheres"
         command:
@@ -1870,6 +1890,17 @@ steps:
           TEST_NAME: "sphere/baroclinic_wave_rhoe"
           FLOAT_TYPE: "Float64"
 
+      - label: ":computer: Float 64 3D sphere baroclinic wave (ρe) HF datalayout"
+        key: "cpu_baroclinic_wave_rho_e_float64_hf"
+        command:
+          - "julia --color=yes --project=.buildkite examples/hybrid/driver.jl"
+        artifact_paths:
+          - "examples/hybrid/sphere/output/baroclinic_wave_rhoe_hf/Float64/*"
+        env:
+          TEST_NAME: "sphere/baroclinic_wave_rhoe_hf"
+          FLOAT_TYPE: "Float64"
+          horizontal_layout_type: "IJHF"
+
       - label: ":computer: 3D sphere baroclinic wave (ρe)"
         key: "cpu_baroclinic_wave_rho_e"
         command:

diff --git a/NEWS.md b/NEWS.md
@@ -4,11 +4,15 @@ ClimaCore.jl Release Notes
 main
 -------
 
- - Fixed world-age issue on Julia 1.11 issue [Julia#54780](https://github.com/JuliaLang/julia/issues/54780), PR [#2034](https://github.com/CliMA/ClimaCore.jl/pull/2034).
+ - We've added new datalayouts: `VIJHF`,`IJHF`,`IHF`,`VIHF`, to explore their performance compared to our existing datalayouts: `VIJFH`,`IJFH`,`IFH`,`VIFH`. PR [#2055](https://github.com/CliMA/ClimaCore.jl/pull/2053), PR [#2052](https://github.com/CliMA/ClimaCore.jl/pull/2055).
+ - We've refactored some modules to use less internals. PR [#2053](https://github.com/CliMA/ClimaCore.jl/pull/2053), PR [#2052](https://github.com/CliMA/ClimaCore.jl/pull/2052), [#2051](https://github.com/CliMA/ClimaCore.jl/pull/2051), [#2049](https://github.com/CliMA/ClimaCore.jl/pull/2049).
+ - Some work was done in attempt to reduce specializations and compile time. PR [#2042](https://github.com/CliMA/ClimaCore.jl/pull/2042), [#2041](https://github.com/CliMA/ClimaCore.jl/pull/2041)
 
 v0.14.19
 -------
 
+ - Fixed world-age issue on Julia 1.11 issue [Julia#54780](https://github.com/JuliaLang/julia/issues/54780), PR [#2034](https://github.com/CliMA/ClimaCore.jl/pull/2034).
+
 ### ![][badge-🐛bugfix] Fix undefined behavior in `DataLayout`s
 
 PR [#2034](https://github.com/CliMA/ClimaCore.jl/pull/2034) fixes some undefined

diff --git a/docs/src/api.md b/docs/src/api.md
@@ -32,6 +32,10 @@ DataLayouts.IFH
 DataLayouts.IJFH
 DataLayouts.VIFH
 DataLayouts.VIJFH
+DataLayouts.IHF
+DataLayouts.IJHF
+DataLayouts.VIHF
+DataLayouts.VIJHF
 ```
 
 ## Geometry

diff --git a/examples/common_spaces.jl b/examples/common_spaces.jl
@@ -35,14 +35,19 @@ function make_horizontal_space(
     mesh,
     npoly,
     context::ClimaComms.SingletonCommsContext,
+    horizontal_layout_type = DataLayouts.IJFH,
 )
     quad = Quadratures.GLL{npoly + 1}()
     if mesh isa Meshes.AbstractMesh1D
         topology = Topologies.IntervalTopology(ClimaComms.device(context), mesh)
         space = Spaces.SpectralElementSpace1D(topology, quad)
     elseif mesh isa Meshes.AbstractMesh2D
         topology = Topologies.Topology2D(context, mesh)
-        space = Spaces.SpectralElementSpace2D(topology, quad)
+        space = Spaces.SpectralElementSpace2D(
+            topology,
+            quad;
+            horizontal_layout_type,
+        )
     end
     return space
 end
@@ -51,13 +56,18 @@ function make_horizontal_space(
     mesh,
     npoly,
     comms_ctx::ClimaComms.MPICommsContext,
+    horizontal_layout_type = DataLayouts.IJFH,
 )
     quad = Quadratures.GLL{npoly + 1}()
     if mesh isa Meshes.AbstractMesh1D
         error("Distributed mode does not work with 1D horizontal spaces.")
     elseif mesh isa Meshes.AbstractMesh2D
         topology = Topologies.Topology2D(comms_ctx, mesh)
-        space = Spaces.SpectralElementSpace2D(topology, quad)
+        space = Spaces.SpectralElementSpace2D(
+            topology,
+            quad;
+            horizontal_layout_type,
+        )
     end
     return space
 end

diff --git a/examples/hybrid/driver.jl b/examples/hybrid/driver.jl
@@ -30,6 +30,7 @@ ClimaComms.@import_required_backends
 import SciMLBase
 const comms_ctx = ClimaComms.context()
 is_distributed = comms_ctx isa ClimaComms.MPICommsContext
+using ClimaCore: DataLayouts
 
 using Logging
 
@@ -91,7 +92,17 @@ if haskey(ENV, "RESTART_FILE")
     ᶠlocal_geometry = Fields.local_geometry_field(Y.f)
 else
     t_start = FT(0)
-    h_space = make_horizontal_space(horizontal_mesh, npoly, comms_ctx)
+    horizontal_layout_types = Dict()
+    horizontal_layout_types["IJFH"] = DataLayouts.IJFH
+    horizontal_layout_types["IJHF"] = DataLayouts.IJHF
+    horizontal_layout_type =
+        horizontal_layout_types[get(ENV, "horizontal_layout_type", "IJFH")]
+    h_space = make_horizontal_space(
+        horizontal_mesh,
+        npoly,
+        comms_ctx,
+        horizontal_layout_type,
+    )
     center_space, face_space =
         make_hybrid_spaces(h_space, z_max, z_elem; z_stretch)
     ᶜlocal_geometry = Fields.local_geometry_field(center_space)
@@ -231,5 +242,8 @@ end
 if !is_distributed || ClimaComms.iamroot(comms_ctx)
     println("Walltime = $walltime seconds")
     ENV["GKSwstype"] = "nul" # avoid displaying plots
-    postprocessing(sol, output_dir)
+    # https://github.com/CliMA/ClimaCore.jl/issues/2058
+    if !(Fields.field_values(sol.u[1].c) isa DataLayouts.VIJHF)
+        postprocessing(sol, output_dir)
+    end
 end
diff --git a/examples/hybrid/sphere/baroclinic_wave_rhoe_hf.jl b/examples/hybrid/sphere/baroclinic_wave_rhoe_hf.jl
@@ -0,0 +1,40 @@
+using ClimaCorePlots, Plots
+using ClimaCore.DataLayouts
+
+include("baroclinic_wave_utilities.jl")
+
+const sponge = false
+
+# Variables required for driver.jl (modify as needed)
+horizontal_mesh = cubed_sphere_mesh(; radius = R, h_elem = 4)
+npoly = 4
+z_max = FT(30e3)
+z_elem = 10
+t_end = FT(60 * 60 * 24 * 10)
+dt = FT(400)
+dt_save_to_sol = FT(60 * 60 * 24)
+dt_save_to_disk = FT(0) # 0 means don't save to disk
+ode_algorithm = CTS.SSP333
+jacobian_flags = (; ∂ᶜ𝔼ₜ∂ᶠ𝕄_mode = :no_∂ᶜp∂ᶜK, ∂ᶠ𝕄ₜ∂ᶜρ_mode = :exact)
+
+additional_cache(ᶜlocal_geometry, ᶠlocal_geometry, dt) = merge(
+    hyperdiffusion_cache(ᶜlocal_geometry, ᶠlocal_geometry; κ₄ = FT(2e17)),
+    sponge ? rayleigh_sponge_cache(ᶜlocal_geometry, ᶠlocal_geometry, dt) : (;),
+)
+function additional_tendency!(Yₜ, Y, p, t)
+    hyperdiffusion_tendency!(Yₜ, Y, p, t)
+    sponge && rayleigh_sponge_tendency!(Yₜ, Y, p, t)
+end
+
+center_initial_condition(local_geometry) =
+    center_initial_condition(local_geometry, Val(:ρe))
+function postprocessing(sol, output_dir)
+    @info "L₂ norm of ρe at t = $(sol.t[1]): $(norm(sol.u[1].c.ρe))"
+    @info "L₂ norm of ρe at t = $(sol.t[end]): $(norm(sol.u[end].c.ρe))"
+
+    anim = Plots.@animate for Y in sol.u
+        ᶜv = Geometry.UVVector.(Y.c.uₕ).components.data.:2
+        Plots.plot(ᶜv, level = 3, clim = (-6, 6))
+    end
+    Plots.mp4(anim, joinpath(output_dir, "v.mp4"), fps = 5)
+end
diff --git a/ext/cuda/data_layouts.jl b/ext/cuda/data_layouts.jl
@@ -1,8 +1,10 @@
 
 import ClimaCore.DataLayouts: AbstractData
 import ClimaCore.DataLayouts: FusedMultiBroadcast
-import ClimaCore.DataLayouts: IJKFVH, IJFH, VIJFH, VIFH, IFH, IJF, IF, VF, DataF
+import ClimaCore.DataLayouts:
+    IJKFVH, IJFH, IJHF, VIJFH, VIJHF, VIFH, VIHF, IFH, IHF, IJF, IF, VF, DataF
 import ClimaCore.DataLayouts: IJFHStyle, VIJFHStyle, VFStyle, DataFStyle
+import ClimaCore.DataLayouts: IJHFStyle, VIJHFStyle
 import ClimaCore.DataLayouts: promote_parent_array_type
 import ClimaCore.DataLayouts: parent_array_type
 import ClimaCore.DataLayouts: isascalar
@@ -34,23 +36,24 @@ include("data_layouts_threadblock.jl")
 adapt_f(to, f::F) where {F} = Adapt.adapt(to, f)
 adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...)
 
+function Adapt.adapt_structure(
+    to::CUDA.KernelAdaptor,
+    bc::DataLayouts.NonExtrudedBroadcasted{Style},
+) where {Style}
+    DataLayouts.NonExtrudedBroadcasted{Style}(
+        adapt_f(to, bc.f),
+        Adapt.adapt(to, bc.args),
+        Adapt.adapt(to, bc.axes),
+    )
+end
+
 function Adapt.adapt_structure(
     to::CUDA.KernelAdaptor,
     fmbc::FusedMultiBroadcast,
 )
-    FusedMultiBroadcast(
-        map(fmbc.pairs) do pair
-            dest = pair.first
-            bc = pair.second
-            Pair(
-                Adapt.adapt(to, dest),
-                Base.Broadcast.Broadcasted(
-                    bc.style,
-                    adapt_f(to, bc.f),
-                    Adapt.adapt(to, bc.args),
-                    Adapt.adapt(to, bc.axes),
-                ),
-            )
-        end,
-    )
+    FusedMultiBroadcast(map(fmbc.pairs) do pair
+        dest = pair.first
+        bc = pair.second
+        Pair(Adapt.adapt(to, dest), Adapt.adapt(to, bc))
+    end)
 end
diff --git a/ext/cuda/data_layouts_copyto.jl b/ext/cuda/data_layouts_copyto.jl
@@ -8,22 +8,74 @@ function knl_copyto!(dest, src, us)
     return nothing
 end
 
-function Base.copyto!(dest::AbstractData, bc, ::ToCUDA)
-    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
-    us = DataLayouts.UniversalSize(dest)
-    if Nv > 0 && Nh > 0
-        args = (dest, bc, us)
-        threads = threads_via_occupancy(knl_copyto!, args)
-        n_max_threads = min(threads, get_N(us))
-        p = partition(dest, n_max_threads)
-        auto_launch!(
-            knl_copyto!,
-            args;
-            threads_s = p.threads,
-            blocks_s = p.blocks,
-        )
+function knl_copyto_linear!(dest, src, us)
+    i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    if linear_is_valid_index(i, us)
+        @inbounds dest[i] = src[i]
+    end
+    return nothing
+end
+
+if VERSION ≥ v"1.11.0-beta"
+    # https://github.com/JuliaLang/julia/issues/56295
+    # Julia 1.11's Base.Broadcast currently requires
+    # multiple integer indexing, wheras Julia 1.10 did not.
+    # This means that we cannot reserve linear indexing to
+    # special-case fixes for https://github.com/JuliaLang/julia/issues/28126
+    # (including the GPU-variant related issue resolution efforts:
+    # JuliaGPU/GPUArrays.jl#454, JuliaGPU/GPUArrays.jl#464).
+    function Base.copyto!(dest::AbstractData, bc, ::ToCUDA)
+        (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
+        us = DataLayouts.UniversalSize(dest)
+        if Nv > 0 && Nh > 0
+            args = (dest, bc, us)
+            threads = threads_via_occupancy(knl_copyto!, args)
+            n_max_threads = min(threads, get_N(us))
+            p = partition(dest, n_max_threads)
+            auto_launch!(
+                knl_copyto!,
+                args;
+                threads_s = p.threads,
+                blocks_s = p.blocks,
+            )
+        end
+        return dest
+    end
+else
+    function Base.copyto!(dest::AbstractData, bc, ::ToCUDA)
+        (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
+        us = DataLayouts.UniversalSize(dest)
+        if Nv > 0 && Nh > 0
+            if DataLayouts.has_uniform_datalayouts(bc) &&
+               dest isa DataLayouts.EndsWithField
+                bc′ = Base.Broadcast.instantiate(
+                    DataLayouts.to_non_extruded_broadcasted(bc),
+                )
+                args = (dest, bc′, us)
+                threads = threads_via_occupancy(knl_copyto_linear!, args)
+                n_max_threads = min(threads, get_N(us))
+                p = linear_partition(prod(size(dest)), n_max_threads)
+                auto_launch!(
+                    knl_copyto_linear!,
+                    args;
+                    threads_s = p.threads,
+                    blocks_s = p.blocks,
+                )
+            else
+                args = (dest, bc, us)
+                threads = threads_via_occupancy(knl_copyto!, args)
+                n_max_threads = min(threads, get_N(us))
+                p = partition(dest, n_max_threads)
+                auto_launch!(
+                    knl_copyto!,
+                    args;
+                    threads_s = p.threads,
+                    blocks_s = p.blocks,
+                )
+            end
+        end
+        return dest
     end
-    return dest
 end
 
 # broadcasting scalar assignment

diff --git a/ext/cuda/data_layouts_fill.jl b/ext/cuda/data_layouts_fill.jl
@@ -6,20 +6,40 @@ function knl_fill!(dest, val, us)
     return nothing
 end
 
+function knl_fill_linear!(dest, val, us)
+    i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    if linear_is_valid_index(i, us)
+        @inbounds dest[i] = val
+    end
+    return nothing
+end
+
 function Base.fill!(dest::AbstractData, bc, ::ToCUDA)
     (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
+    args = (dest, bc, us)
     if Nv > 0 && Nh > 0
-        args = (dest, bc, us)
-        threads = threads_via_occupancy(knl_fill!, args)
-        n_max_threads = min(threads, get_N(us))
-        p = partition(dest, n_max_threads)
-        auto_launch!(
-            knl_fill!,
-            args;
-            threads_s = p.threads,
-            blocks_s = p.blocks,
-        )
+        if !(VERSION ≥ v"1.11.0-beta") && dest isa DataLayouts.EndsWithField
+            threads = threads_via_occupancy(knl_fill_linear!, args)
+            n_max_threads = min(threads, get_N(us))
+            p = linear_partition(prod(size(dest)), n_max_threads)
+            auto_launch!(
+                knl_fill_linear!,
+                args;
+                threads_s = p.threads,
+                blocks_s = p.blocks,
+            )
+        else
+            threads = threads_via_occupancy(knl_fill!, args)
+            n_max_threads = min(threads, get_N(us))
+            p = partition(dest, n_max_threads)
+            auto_launch!(
+                knl_fill!,
+                args;
+                threads_s = p.threads,
+                blocks_s = p.blocks,
+            )
+        end
     end
     return dest
 end