Skip to content

Commit

Permalink
Add a MWE for tuning compute_kinetic! function.
Browse files Browse the repository at this point in the history
Try a different launch configuration for copyto_stencil_kernel
  • Loading branch information
sriharshakandala committed Jul 19, 2023
1 parent 0e7a5f9 commit be26e08
Show file tree
Hide file tree
Showing 2 changed files with 180 additions and 7 deletions.
163 changes: 163 additions & 0 deletions examples/hybrid/tuning/mwe_tune_ke.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
using CUDA
using ClimaComms
using ClimaCore
using LinearAlgebra
using NVTX, Colors

import ClimaCore:
Domains,
Fields,
Geometry,
Meshes,
Operators,
Spaces,
Topologies,
DataLayouts

const C1 = ClimaCore.Geometry.Covariant1Vector
const C2 = ClimaCore.Geometry.Covariant2Vector
const C3 = ClimaCore.Geometry.Covariant3Vector
const C12 = ClimaCore.Geometry.Covariant12Vector
const C123 = ClimaCore.Geometry.Covariant123Vector
const CT123 = Geometry.Contravariant123Vector
const ᶜinterp = Operators.InterpolateF2C()
const ᶠinterp = Operators.InterpolateC2F()

init_uθ(ϕ, z, R) = 1.0 / R
init_vθ(ϕ, z, R) = 1.0 / R
init_w(ϕ, z) = 1.0

function center_initial_condition(ᶜlocal_geometry, R)
(; lat, long, z) = ᶜlocal_geometry.coordinates
u₀ = @. init_uθ(lat, z, R)
v₀ = @. init_vθ(lat, z, R)
ᶜuₕ_local = @. Geometry.UVVector(u₀, v₀)
ᶜuₕ = @. Geometry.Covariant12Vector(ᶜuₕ_local, ᶜlocal_geometry)
return ᶜuₕ
end

function face_initial_condition(local_geometry)
(; lat, long, z) = local_geometry.coordinates
w = @. Geometry.Covariant3Vector(init_w(lat, z))
return w
end

# initialize a scalar field (for KE)
function init_scalar_field(space)
Y = map(Fields.local_geometry_field(space)) do local_geometry
h = 0.0
return h
end
return Y
end

function compute_kinetic_ca!(
κ::Fields.Field,
uₕ::Fields.Field,
uᵥ::Fields.Field,
)
@assert eltype(uₕ) <: Union{C1, C2, C12}
@assert eltype(uᵥ) <: C3
#NVTX.@range "compute_kinetic! kernel" color = colorant"brown" begin
@. κ =
1 / 2 * (
dot(C123(uₕ), CT123(uₕ)) +
ᶜinterp(dot(C123(uᵥ), CT123(uᵥ))) +
2 * dot(CT123(uₕ), ᶜinterp(C123(uᵥ)))
)
#end
end

function initialize_mwe()

FT = Float64

context = ClimaComms.SingletonCommsContext(ClimaComms.CUDADevice())
context_cpu =
ClimaComms.SingletonCommsContext(ClimaComms.CPUSingleThreaded()) # CPU context for comparison
R = FT(6.371229e6)

npoly = 3
z_max = FT(30e3)
z_elem = 10
h_elem = 12
println(
"running KE tuning test on $(context.device); h_elem = $h_elem; z_elem = $z_elem; npoly = $npoly; R = $R; z_max = $z_max; FT = $FT",
)
# horizontal space
domain = Domains.SphereDomain(R)
horizontal_mesh = Meshes.EquiangularCubedSphere(domain, h_elem)
horizontal_topology = Topologies.Topology2D(
context,
horizontal_mesh,
Topologies.spacefillingcurve(horizontal_mesh),
)
horizontal_topology_cpu = Topologies.Topology2D(
context_cpu,
horizontal_mesh,
Topologies.spacefillingcurve(horizontal_mesh),
)
quad = Spaces.Quadratures.GLL{npoly + 1}()
h_space = Spaces.SpectralElementSpace2D(horizontal_topology, quad)
h_space_cpu = Spaces.SpectralElementSpace2D(horizontal_topology_cpu, quad)

# vertical space
z_domain = Domains.IntervalDomain(
Geometry.ZPoint(zero(z_max)),
Geometry.ZPoint(z_max);
boundary_tags = (:bottom, :top),
)
z_mesh = Meshes.IntervalMesh(z_domain, nelems = z_elem)
z_topology = Topologies.IntervalTopology(context, z_mesh)
z_topology_cpu = Topologies.IntervalTopology(context_cpu, z_mesh)

z_center_space = Spaces.CenterFiniteDifferenceSpace(z_topology)
z_center_space_cpu = Spaces.CenterFiniteDifferenceSpace(z_topology_cpu)

z_face_space = Spaces.FaceFiniteDifferenceSpace(z_topology)
z_face_space_cpu = Spaces.FaceFiniteDifferenceSpace(z_topology_cpu)

hv_center_space =
Spaces.ExtrudedFiniteDifferenceSpace(h_space, z_center_space)
hv_face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(hv_center_space)

hv_center_space_cpu =
Spaces.ExtrudedFiniteDifferenceSpace(h_space_cpu, z_center_space_cpu)
hv_face_space_cpu =
Spaces.FaceExtrudedFiniteDifferenceSpace(hv_center_space_cpu)

# CPU
ᶜlocal_geometry_cpu = Fields.local_geometry_field(hv_center_space_cpu)
ᶠlocal_geometry_cpu = Fields.local_geometry_field(hv_face_space_cpu)
uₕ_cpu = center_initial_condition(ᶜlocal_geometry_cpu, R)
uᵥ_cpu = face_initial_condition(ᶠlocal_geometry_cpu)
κ_cpu = init_scalar_field(hv_center_space_cpu)

# GPU
ᶜlocal_geometry = Fields.local_geometry_field(hv_center_space)
ᶠlocal_geometry = Fields.local_geometry_field(hv_face_space)
uₕ = center_initial_condition(ᶜlocal_geometry, R)
uᵥ = face_initial_condition(ᶠlocal_geometry)
κ = init_scalar_field(hv_center_space)

return (; κ = κ, uₕ = uₕ, uᵥ = uᵥ, κ_cpu, uₕ_cpu = uₕ_cpu, uᵥ_cpu = uᵥ_cpu)
end

function profile_compute_kinetic()
κ, uₕ, uᵥ, κ_cpu, uₕ_cpu, uᵥ_cpu = initialize_mwe()
# compute kinetic energy
κ = compute_kinetic_ca!(κ, uₕ, uᵥ)
κ_cpu = compute_kinetic_ca!(κ_cpu, uₕ_cpu, uᵥ_cpu)

@show Array(parent(κ)) parent(κ_cpu)

nreps = 10

for i in 1:nreps
NVTX.@range "compute_kinetic_ca!" color = colorant"blue" payload = i begin
κ = compute_kinetic_ca!(κ, uₕ, uᵥ)
end
end
end

profile_compute_kinetic()
24 changes: 17 additions & 7 deletions src/Operators/finitedifference.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3438,22 +3438,32 @@ function Base.copyto!(
Nh = 1
end
bounds = window_bounds(space, bc)

max_threads = 256
nitems = Nq * Nq * Nh
nthreads = min(max_threads, nitems)
nblocks = cld(nitems, nthreads)
# executed
@cuda threads = (Nq, Nq) blocks = (Nh,) copyto_stencil_kernel!(
@cuda threads = (nthreads) blocks = (nblocks,) copyto_stencil_kernel!(
strip_space(out, space),
strip_space(bc, space),
axes(out),
bounds,
Nq,
Nh,
)
return out
end

function copyto_stencil_kernel!(out, bc, space, bds)
i = threadIdx().x
j = threadIdx().y
h = blockIdx().x
hidx = (i, j, h)
apply_stencil!(space, out, bc, hidx, bds)
function copyto_stencil_kernel!(out, bc, space, bds, Nq, Nh)
gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
if gid Nq * Nq * Nh
h = cld(gid, Nq * Nq)
j = cld(gid - (h - 1) * Nq * Nq, Nq)
i = gid - (h - 1) * Nq * Nq - (j - 1) * Nq
hidx = (i, j, h)
apply_stencil!(space, out, bc, hidx, bds)
end
return nothing
end

Expand Down

0 comments on commit be26e08

Please sign in to comment.