diff --git a/examples/hybrid/tuning/mwe_tune_ke.jl b/examples/hybrid/tuning/mwe_tune_ke.jl new file mode 100644 index 0000000000..30440d4133 --- /dev/null +++ b/examples/hybrid/tuning/mwe_tune_ke.jl @@ -0,0 +1,163 @@ +using CUDA +using ClimaComms +using ClimaCore +using LinearAlgebra +using NVTX, Colors + +import ClimaCore: + Domains, + Fields, + Geometry, + Meshes, + Operators, + Spaces, + Topologies, + DataLayouts + +const C1 = ClimaCore.Geometry.Covariant1Vector +const C2 = ClimaCore.Geometry.Covariant2Vector +const C3 = ClimaCore.Geometry.Covariant3Vector +const C12 = ClimaCore.Geometry.Covariant12Vector +const C123 = ClimaCore.Geometry.Covariant123Vector +const CT123 = Geometry.Contravariant123Vector +const ᶜinterp = Operators.InterpolateF2C() +const ᶠinterp = Operators.InterpolateC2F() + +init_uθ(ϕ, z, R) = 1.0 / R +init_vθ(ϕ, z, R) = 1.0 / R +init_w(ϕ, z) = 1.0 + +function center_initial_condition(ᶜlocal_geometry, R) + (; lat, long, z) = ᶜlocal_geometry.coordinates + u₀ = @. init_uθ(lat, z, R) + v₀ = @. init_vθ(lat, z, R) + ᶜuₕ_local = @. Geometry.UVVector(u₀, v₀) + ᶜuₕ = @. Geometry.Covariant12Vector(ᶜuₕ_local, ᶜlocal_geometry) + return ᶜuₕ +end + +function face_initial_condition(local_geometry) + (; lat, long, z) = local_geometry.coordinates + w = @. Geometry.Covariant3Vector(init_w(lat, z)) + return w +end + +# initialize a scalar field (for KE) +function init_scalar_field(space) + Y = map(Fields.local_geometry_field(space)) do local_geometry + h = 0.0 + return h + end + return Y +end + +function compute_kinetic_ca!( + κ::Fields.Field, + uₕ::Fields.Field, + uᵥ::Fields.Field, +) + @assert eltype(uₕ) <: Union{C1, C2, C12} + @assert eltype(uᵥ) <: C3 + #NVTX.@range "compute_kinetic! kernel" color = colorant"brown" begin + @. κ = + 1 / 2 * ( + dot(C123(uₕ), CT123(uₕ)) + + ᶜinterp(dot(C123(uᵥ), CT123(uᵥ))) + + 2 * dot(CT123(uₕ), ᶜinterp(C123(uᵥ))) + ) + #end +end + +function initialize_mwe() + + FT = Float64 + + context = ClimaComms.SingletonCommsContext(ClimaComms.CUDADevice()) + context_cpu = + ClimaComms.SingletonCommsContext(ClimaComms.CPUSingleThreaded()) # CPU context for comparison + R = FT(6.371229e6) + + npoly = 3 + z_max = FT(30e3) + z_elem = 10 + h_elem = 12 + println( + "running KE tuning test on $(context.device); h_elem = $h_elem; z_elem = $z_elem; npoly = $npoly; R = $R; z_max = $z_max; FT = $FT", + ) + # horizontal space + domain = Domains.SphereDomain(R) + horizontal_mesh = Meshes.EquiangularCubedSphere(domain, h_elem) + horizontal_topology = Topologies.Topology2D( + context, + horizontal_mesh, + Topologies.spacefillingcurve(horizontal_mesh), + ) + horizontal_topology_cpu = Topologies.Topology2D( + context_cpu, + horizontal_mesh, + Topologies.spacefillingcurve(horizontal_mesh), + ) + quad = Spaces.Quadratures.GLL{npoly + 1}() + h_space = Spaces.SpectralElementSpace2D(horizontal_topology, quad) + h_space_cpu = Spaces.SpectralElementSpace2D(horizontal_topology_cpu, quad) + + # vertical space + z_domain = Domains.IntervalDomain( + Geometry.ZPoint(zero(z_max)), + Geometry.ZPoint(z_max); + boundary_tags = (:bottom, :top), + ) + z_mesh = Meshes.IntervalMesh(z_domain, nelems = z_elem) + z_topology = Topologies.IntervalTopology(context, z_mesh) + z_topology_cpu = Topologies.IntervalTopology(context_cpu, z_mesh) + + z_center_space = Spaces.CenterFiniteDifferenceSpace(z_topology) + z_center_space_cpu = Spaces.CenterFiniteDifferenceSpace(z_topology_cpu) + + z_face_space = Spaces.FaceFiniteDifferenceSpace(z_topology) + z_face_space_cpu = Spaces.FaceFiniteDifferenceSpace(z_topology_cpu) + + hv_center_space = + Spaces.ExtrudedFiniteDifferenceSpace(h_space, z_center_space) + hv_face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(hv_center_space) + + hv_center_space_cpu = + Spaces.ExtrudedFiniteDifferenceSpace(h_space_cpu, z_center_space_cpu) + hv_face_space_cpu = + Spaces.FaceExtrudedFiniteDifferenceSpace(hv_center_space_cpu) + + # CPU + ᶜlocal_geometry_cpu = Fields.local_geometry_field(hv_center_space_cpu) + ᶠlocal_geometry_cpu = Fields.local_geometry_field(hv_face_space_cpu) + uₕ_cpu = center_initial_condition(ᶜlocal_geometry_cpu, R) + uᵥ_cpu = face_initial_condition(ᶠlocal_geometry_cpu) + κ_cpu = init_scalar_field(hv_center_space_cpu) + + # GPU + ᶜlocal_geometry = Fields.local_geometry_field(hv_center_space) + ᶠlocal_geometry = Fields.local_geometry_field(hv_face_space) + uₕ = center_initial_condition(ᶜlocal_geometry, R) + uᵥ = face_initial_condition(ᶠlocal_geometry) + κ = init_scalar_field(hv_center_space) + + return (; κ = κ, uₕ = uₕ, uᵥ = uᵥ, κ_cpu, uₕ_cpu = uₕ_cpu, uᵥ_cpu = uᵥ_cpu) +end + +function profile_compute_kinetic() + κ, uₕ, uᵥ, κ_cpu, uₕ_cpu, uᵥ_cpu = initialize_mwe() + # compute kinetic energy + κ = compute_kinetic_ca!(κ, uₕ, uᵥ) + κ_cpu = compute_kinetic_ca!(κ_cpu, uₕ_cpu, uᵥ_cpu) + + @show Array(parent(κ)) ≈ parent(κ_cpu) + + nreps = 10 + + for i in 1:nreps + NVTX.@range "compute_kinetic_ca!" color = colorant"blue" payload = i begin + κ = compute_kinetic_ca!(κ, uₕ, uᵥ) + end + end +end + +profile_compute_kinetic() diff --git a/src/Operators/finitedifference.jl b/src/Operators/finitedifference.jl index a15cb9cf7e..41a2d427e3 100644 --- a/src/Operators/finitedifference.jl +++ b/src/Operators/finitedifference.jl @@ -3438,22 +3438,32 @@ function Base.copyto!( Nh = 1 end bounds = window_bounds(space, bc) + + max_threads = 256 + nitems = Nq * Nq * Nh + nthreads = min(max_threads, nitems) + nblocks = cld(nitems, nthreads) # executed - @cuda threads = (Nq, Nq) blocks = (Nh,) copyto_stencil_kernel!( + @cuda threads = (nthreads) blocks = (nblocks,) copyto_stencil_kernel!( strip_space(out, space), strip_space(bc, space), axes(out), bounds, + Nq, + Nh, ) return out end -function copyto_stencil_kernel!(out, bc, space, bds) - i = threadIdx().x - j = threadIdx().y - h = blockIdx().x - hidx = (i, j, h) - apply_stencil!(space, out, bc, hidx, bds) +function copyto_stencil_kernel!(out, bc, space, bds, Nq, Nh) + gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x + if gid ≤ Nq * Nq * Nh + h = cld(gid, Nq * Nq) + j = cld(gid - (h - 1) * Nq * Nq, Nq) + i = gid - (h - 1) * Nq * Nq - (j - 1) * Nq + hidx = (i, j, h) + apply_stencil!(space, out, bc, hidx, bds) + end return nothing end