From 606e364bf2cb0ea6fd2fbb0cd2c20b850f25ca08 Mon Sep 17 00:00:00 2001 From: sriharshakandala Date: Thu, 22 Jun 2023 09:31:36 -0700 Subject: [PATCH] Add support for distributed CUDA mapreduce --- .buildkite/pipeline.yml | 13 +- .../hybrid/box/bubble_3d_invariant_rhoe.jl | 34 +-- src/Fields/mapreduce_cuda.jl | 64 ++++-- test/Fields/reduction_cuda.jl | 47 +---- test/Fields/reduction_cuda_distributed.jl | 198 ++++++++++++++++++ test/Fields/reduction_cuda_utils.jl | 47 +++++ 6 files changed, 319 insertions(+), 84 deletions(-) create mode 100644 test/Fields/reduction_cuda_distributed.jl create mode 100644 test/Fields/reduction_cuda_utils.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 4018292bfe..3defbf13c4 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -334,6 +334,15 @@ steps: agents: slurm_gpus: 1 + - label: "Unit: distributed reduction cuda" + key: unit_distributed_reduction_cuda + command: + - "julia --project -e 'using CUDA; CUDA.versioninfo()'" + - "srun julia --color=yes --check-bounds=yes --project=test test/Fields/reduction_cuda_distributed.jl" + agents: + slurm_gpus_per_task: 1 + slurm_ntasks: 2 + - group: "Unit: Operators" steps: @@ -912,7 +921,7 @@ steps: command: - "nsys profile --trace=nvtx,cuda --output=output/$$BUILDKITE_STEP_KEY/report julia --color=yes --project=examples examples/hybrid/box/bubble_3d_invariant_rhoe.jl" artifact_paths: - - "examples/hybrid/box/output/gpu_bubble_3d_invariant_rhoe/*" + - "examples/hybrid/box/output/gpu_bubble_3d_invariant_rhoe/*_low_*" agents: slurm_gpus: 1 @@ -921,7 +930,7 @@ steps: command: - "nsys profile --trace=nvtx,cuda --output=output/$$BUILDKITE_STEP_KEY/report julia --color=yes --project=examples examples/hybrid/box/bubble_3d_invariant_rhoe.jl Float64 custom 1000 1000 4 16 3 0.05 700.0" artifact_paths: - - "examples/hybrid/box/output/gpu_bubble_3d_invariant_rhoe_custom/*" + - "examples/hybrid/box/output/gpu_bubble_3d_invariant_rhoe/*_custom_*" agents: slurm_gpus: 1 diff --git a/examples/hybrid/box/bubble_3d_invariant_rhoe.jl b/examples/hybrid/box/bubble_3d_invariant_rhoe.jl index 991f680263..90cb0922f8 100644 --- a/examples/hybrid/box/bubble_3d_invariant_rhoe.jl +++ b/examples/hybrid/box/bubble_3d_invariant_rhoe.jl @@ -199,7 +199,8 @@ end function rhs_invariant!(dY, Y, ghost_buffer, t) (; C_p, C_v, MSLP, grav, R_d, T_0) = ghost_buffer.params - (; z, κ₄, cω³, fω¹², fu¹², cuvw, cE, ce, cI, cT, cp, ch_tot) = ghost_buffer + (; z, κ₄, cω³, fω¹², fu¹², fu³, cuvw, cE, ce, cI, cT, cp, ch_tot) = + ghost_buffer cρ = Y.Yc.ρ # scalar on centers fw = Y.w # Covariant3Vector on faces cuₕ = Y.uₕ # Covariant12Vector on centers @@ -301,7 +302,7 @@ function rhs_invariant!(dY, Y, ghost_buffer, t) Geometry.Contravariant12Vector.( Geometry.Covariant123Vector.(Ic2f.(cuₕ)), ) # Contravariant12Vector in 3D - fu³ = Geometry.Contravariant3Vector.(Geometry.Covariant123Vector.(fw)) + fu³ .= Geometry.Contravariant3Vector.(Geometry.Covariant123Vector.(fw)) @. dw -= fω¹² × fu¹² # Covariant3Vector on faces @. duₕ -= If2c(fω¹² × fu³) @@ -370,11 +371,11 @@ function bubble_3d_invariant_ρe(ARGS, comms_ctx, ::Type{FT}) where {FT} args = () end - if ClimaComms.iamroot(comms_ctx) - @info "Context information" device = comms_ctx.device context = - comms_ctx nprocs = ClimaComms.nprocs(comms_ctx) Float_type = FT resolution = - resolution - end + logger_stream = ClimaComms.iamroot(comms_ctx) ? stderr : devnull + prev_logger = global_logger(ConsoleLogger(logger_stream, Logging.Info)) + + @info "Context information" device = comms_ctx.device context = comms_ctx nprocs = + ClimaComms.nprocs(comms_ctx) Float_type = FT resolution = resolution sim_params = SimulationParameters(FT, resolution, args...) (; lxy, lz) = sim_params @@ -395,12 +396,11 @@ function bubble_3d_invariant_ρe(ARGS, comms_ctx, ::Type{FT}) where {FT} w = map(_ -> Geometry.Covariant3Vector(0.0), face_coords) Y = Fields.FieldVector(Yc = Yc, uₕ = uₕ, w = w) - energy_0 = ClimaComms.reduce(comms_ctx, sum(Y.Yc.ρe), +) - mass_0 = ClimaComms.reduce(comms_ctx, sum(Y.Yc.ρ), +) + energy_0 = sum(Y.Yc.ρe) + mass_0 = sum(Y.Yc.ρ) κ₄ = compute_κ₄(sim_params) - @info "summary" energy_0 mass_0 κ₄ - + @info "Initial condition" energy_0 mass_0 κ₄ If2c = Operators.InterpolateF2C() Ic2f = Operators.InterpolateC2F( bottom = Operators.Extrapolate(), @@ -429,6 +429,7 @@ function bubble_3d_invariant_ρe(ARGS, comms_ctx, ::Type{FT}) where {FT} fu¹² = Geometry.Contravariant12Vector.( Geometry.Covariant123Vector.(Ic2f.(Y.uₕ)) ), + fu³ = Geometry.Contravariant3Vector.(Geometry.Covariant123Vector.(Y.w)), cuvw = cuvw, cE = cE, ce = ce, @@ -466,20 +467,23 @@ function bubble_3d_invariant_ρe(ARGS, comms_ctx, ::Type{FT}) where {FT} Es = FT[] Mass = FT[] for sol_step in sol_invariant.u - Es_step = ClimaComms.reduce(comms_ctx, sum(sol_step.Yc.ρe), +) - Mass_step = ClimaComms.reduce(comms_ctx, sum(sol_step.Yc.ρ), +) + Es_step = sum(sol_step.Yc.ρe) + Mass_step = sum(sol_step.Yc.ρ) if ClimaComms.iamroot(comms_ctx) push!(Es, Es_step) push!(Mass, Mass_step) end end + @info "summary" Es[end] Mass[end] #----------------------------------- ENV["GKSwstype"] = "nul" Plots.GRBackend() - dir = "bubble_3d_invariant_rhoe" + dir = + comms_ctx.device isa ClimaComms.CPUDevice ? "bubble_3d_invariant_rhoe" : + "gpu_bubble_3d_invariant_rhoe" path = joinpath(@__DIR__, "output", dir) mkpath(path) @@ -539,4 +543,4 @@ end comms_ctx = ClimaComms.context() ClimaComms.init(comms_ctx) -bubble_3d_invariant_ρe(ARGS, comms_ctx, FloatType) +sol_invariant = bubble_3d_invariant_ρe(ARGS, comms_ctx, FloatType) diff --git a/src/Fields/mapreduce_cuda.jl b/src/Fields/mapreduce_cuda.jl index e3be163493..03ab059be0 100644 --- a/src/Fields/mapreduce_cuda.jl +++ b/src/Fields/mapreduce_cuda.jl @@ -1,31 +1,56 @@ -Base.sum( +function Base.sum( field::Union{Field, Base.Broadcast.Broadcasted{<:FieldStyle}}, ::ClimaComms.CUDADevice, -) = mapreduce_cuda(identity, +, field, weighting = true) #TODO: distributed support to be added +) + context = ClimaComms.context(axes(field)) + localsum = mapreduce_cuda(identity, +, field, weighting = true) + ClimaComms.allreduce!(context, parent(localsum), +) + return localsum[] +end -Base.sum(fn, field::Field, ::ClimaComms.CUDADevice) = - mapreduce_cuda(fn, +, field, weighting = true) #TODO: distributed support to be added +function Base.sum(fn, field::Field, ::ClimaComms.CUDADevice) + context = ClimaComms.context(axes(field)) + localsum = mapreduce_cuda(fn, +, field, weighting = true) + ClimaComms.allreduce!(context, parent(localsum), +) + return localsum[] +end -Base.maximum(fn, field::Field, ::ClimaComms.CUDADevice) = - mapreduce_cuda(fn, max, field) #TODO: distributed support to be added +function Base.maximum(fn, field::Field, ::ClimaComms.CUDADevice) + context = ClimaComms.context(axes(field)) + localmax = mapreduce_cuda(fn, max, field) + ClimaComms.allreduce!(context, parent(localmax), max) + return localmax[] +end -Base.maximum(field::Field, ::ClimaComms.CUDADevice) = - mapreduce_cuda(identity, max, field) #TODO: distributed support to be added +function Base.maximum(field::Field, ::ClimaComms.CUDADevice) + context = ClimaComms.context(axes(field)) + localmax = mapreduce_cuda(identity, max, field) + ClimaComms.allreduce!(context, parent(localmax), max) + return localmax[] +end -Base.minimum(fn, field::Field, ::ClimaComms.CUDADevice) = - mapreduce_cuda(fn, min, field) #TODO: distributed support to be added +function Base.minimum(fn, field::Field, ::ClimaComms.CUDADevice) + context = ClimaComms.context(axes(field)) + localmin = mapreduce_cuda(fn, min, field) + ClimaComms.allreduce!(context, parent(localmin), min) + return localmin[] +end -Base.minimum(field::Field, ::ClimaComms.CUDADevice) = - mapreduce_cuda(identity, min, field) #TODO: distributed support to be added +function Base.minimum(field::Field, ::ClimaComms.CUDADevice) + context = ClimaComms.context(axes(field)) + localmin = mapreduce_cuda(identity, min, field) + ClimaComms.allreduce!(context, parent(localmin), min) + return localmin[] +end Statistics.mean( field::Union{Field, Base.Broadcast.Broadcasted{<:FieldStyle}}, ::ClimaComms.CUDADevice, -) = Base.sum(field) ./ Base.sum(ones(axes(field))) #TODO: distributed support to be added +) = Base.sum(field) ./ Base.sum(ones(axes(field))) Statistics.mean(fn, field::Field, ::ClimaComms.CUDADevice) = - Base.sum(fn, field) ./ Base.sum(ones(axes(field))) #TODO: distributed support to be added + Base.sum(fn, field) ./ Base.sum(ones(axes(field))) function LinearAlgebra.norm( field::Field, @@ -35,10 +60,11 @@ function LinearAlgebra.norm( ) if p == 2 # currently only one which supports structured types + # TODO: perform map without allocation new field if normalize - sqrt.(Statistics.mean(LinearAlgebra.norm_sqr, field)) + sqrt.(Statistics.mean(LinearAlgebra.norm_sqr.(field))) else - sqrt.(sum(LinearAlgebra.norm_sqr, field)) + sqrt.(sum(LinearAlgebra.norm_sqr.(field))) end elseif p == 1 if normalize @@ -106,11 +132,7 @@ function mapreduce_cuda( Val(shmemsize), ) end - if Nf == 1 - return Array(reduce_cuda)[1, 1] - else - return DataLayouts.DataF{S}(CuArray(view(reduce_cuda, 1, :)[:])) - end + return DataLayouts.DataF{S}(Array(Array(reduce_cuda)[1, :])) end function mapreduce_cuda_kernel!( diff --git a/test/Fields/reduction_cuda.jl b/test/Fields/reduction_cuda.jl index 4612eb334e..4870b6eef7 100644 --- a/test/Fields/reduction_cuda.jl +++ b/test/Fields/reduction_cuda.jl @@ -14,52 +14,7 @@ import ClimaCore: Topologies, DataLayouts -# set initial condition for steady-state test -function set_initial_condition(space) - Y = map(Fields.local_geometry_field(space)) do local_geometry - h = 1.0 - return h - end - return Y -end - -# set simple field -function set_simple_field(space) - α0 = 45.0 - h0 = 1.0 - Y = map(Fields.local_geometry_field(space)) do local_geometry - coord = local_geometry.coordinates - ϕ = coord.lat - λ = coord.long - z = coord.z - h = h0 * z * (cosd(α0) * cosd(ϕ) + sind(α0) * cosd(λ) * sind(ϕ)) - return h - end - return Y -end - -function set_elevation(space, h₀) - Y = map(Fields.local_geometry_field(space)) do local_geometry - coord = local_geometry.coordinates - - ϕ = coord.lat - λ = coord.long - FT = eltype(λ) - ϕₘ = FT(0) # degrees (equator) - λₘ = FT(3 / 2 * 180) # degrees - rₘ = - FT(acos(sind(ϕₘ) * sind(ϕ) + cosd(ϕₘ) * cosd(ϕ) * cosd(λ - λₘ))) # Great circle distance (rads) - Rₘ = FT(3π / 4) # Moutain radius - ζₘ = FT(π / 16) # Mountain oscillation half-width - zₛ = ifelse( - rₘ < Rₘ, - FT(h₀ / 2) * (1 + cospi(rₘ / Rₘ)) * (cospi(rₘ / ζₘ))^2, - FT(0), - ) - return zₛ - end - return Y -end +include("reduction_cuda_utils.jl") @testset "test cuda reduction op on surface of sphere" begin FT = Float64 diff --git a/test/Fields/reduction_cuda_distributed.jl b/test/Fields/reduction_cuda_distributed.jl new file mode 100644 index 0000000000..f020c75043 --- /dev/null +++ b/test/Fields/reduction_cuda_distributed.jl @@ -0,0 +1,198 @@ +using Test +using CUDA +using ClimaComms +using Statistics +using LinearAlgebra +using Logging + +import ClimaCore: + Domains, + Fields, + Geometry, + Meshes, + Operators, + Spaces, + Topologies, + DataLayouts + +include("reduction_cuda_utils.jl") + +@testset "test distributed cuda reduction op on surface of sphere" begin + FT = Float64 + context = ClimaComms.context() + context_cpu = ClimaComms.context(ClimaComms.CPUDevice()) # CPU context for comparison + ClimaComms.init(context) + ClimaComms.init(context_cpu) + + logger_stream = ClimaComms.iamroot(context) ? stderr : devnull + prev_logger = global_logger(ConsoleLogger(logger_stream, Logging.Info)) + # Set up discretization + ne = 72 + Nq = 4 + ndof = ne * ne * 6 * Nq * Nq + @info "Configuration" device = context.device nprocs = + ClimaComms.nprocs(context) Ne = ne Nq = Nq ndof = ndof FT = FT + R = FT(6.37122e6) # radius of earth + domain = Domains.SphereDomain(R) + mesh = Meshes.EquiangularCubedSphere(domain, ne) + quad = Spaces.Quadratures.GLL{Nq}() + grid_topology = Topologies.Topology2D(context, mesh) + grid_topology_cpu = Topologies.Topology2D(context_cpu, mesh) + space = Spaces.SpectralElementSpace2D(grid_topology, quad) + space_cpu = Spaces.SpectralElementSpace2D(grid_topology_cpu, quad) + + Y = set_initial_condition(space) + Y_cpu = set_initial_condition(space_cpu) + + h₀ = FT(200) + Z = set_elevation(space, h₀) + Z_cpu = set_elevation(space_cpu, h₀) + + result = Base.sum(Y) + result_cpu = Base.sum(Y_cpu) + + local_max = Base.maximum(identity, Z) + local_max_cpu = Base.maximum(identity, Z_cpu) + + local_min = Base.minimum(identity, Z) + local_min_cpu = Base.minimum(identity, Z_cpu) + # test weighted sum + @test result ≈ 4 * pi * R^2 rtol = 1e-5 + @test result ≈ result_cpu + # test maximum + @test local_max ≈ h₀ + @test local_max ≈ local_max_cpu + # test minimum + @test local_min ≈ FT(0) + @test local_min ≈ local_min_cpu + # testing mean + meanz = Statistics.mean(Z) + meanz_cpu = Statistics.mean(Z_cpu) + @test meanz ≈ meanz_cpu + # testing norm + norm1z = LinearAlgebra.norm(Z, 1) + norm1z_cpu = LinearAlgebra.norm(Z_cpu, 1) + @test norm1z ≈ norm1z_cpu + + norm2z = LinearAlgebra.norm(Z, 2) + norm2z_cpu = LinearAlgebra.norm(Z_cpu, 2) + @test norm2z ≈ norm2z_cpu + + norm3z = LinearAlgebra.norm(Z, 3) + norm3z_cpu = LinearAlgebra.norm(Z_cpu, 3) + @test norm3z ≈ norm3z_cpu + + norminfz = LinearAlgebra.norm(Z, Inf) + norminfz_cpu = LinearAlgebra.norm(Z_cpu, Inf) + @test norminfz ≈ norminfz_cpu +end + +@testset "test cuda reduction op for extruded 3D domain (hollow sphere)" begin + FT = Float64 + context = ClimaComms.context() + context_cpu = ClimaComms.context(ClimaComms.CPUDevice()) # CPU context for comparison + ClimaComms.init(context) + ClimaComms.init(context_cpu) + + vcontext = ClimaComms.SingletonCommsContext(context.device) + vcontext_cpu = ClimaComms.SingletonCommsContext(context_cpu.device) + + logger_stream = ClimaComms.iamroot(context) ? stderr : devnull + prev_logger = global_logger(ConsoleLogger(logger_stream, Logging.Info)) + + R = FT(6.371229e6) + + npoly = 4 + z_max = FT(30e3) + z_elem = 10 + h_elem = 4 + @info "Configuration" device = context.device nprocs = + ClimaComms.nprocs(context) h_elem = h_elem z_elem = z_elem npoly = npoly R = + R z_max = z_max FT = FT + # horizontal space + domain = Domains.SphereDomain(R) + horizontal_mesh = Meshes.EquiangularCubedSphere(domain, h_elem) + horizontal_topology = Topologies.Topology2D( + context, + horizontal_mesh, + Topologies.spacefillingcurve(horizontal_mesh), + ) + horizontal_topology_cpu = Topologies.Topology2D( + context_cpu, + horizontal_mesh, + Topologies.spacefillingcurve(horizontal_mesh), + ) + quad = Spaces.Quadratures.GLL{npoly + 1}() + h_space = Spaces.SpectralElementSpace2D(horizontal_topology, quad) + h_space_cpu = Spaces.SpectralElementSpace2D(horizontal_topology_cpu, quad) + + # vertical space + z_domain = Domains.IntervalDomain( + Geometry.ZPoint(zero(z_max)), + Geometry.ZPoint(z_max); + boundary_tags = (:bottom, :top), + ) + z_mesh = Meshes.IntervalMesh(z_domain, nelems = z_elem) + z_topology = Topologies.IntervalTopology(vcontext, z_mesh) + z_topology_cpu = Topologies.IntervalTopology(vcontext_cpu, z_mesh) + + z_center_space = Spaces.CenterFiniteDifferenceSpace(z_topology) + z_center_space_cpu = Spaces.CenterFiniteDifferenceSpace(z_topology_cpu) + + z_face_space = Spaces.FaceFiniteDifferenceSpace(z_topology) + z_face_space_cpu = Spaces.FaceFiniteDifferenceSpace(z_topology_cpu) + + hv_center_space = + Spaces.ExtrudedFiniteDifferenceSpace(h_space, z_center_space) + hv_face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(hv_center_space) + + hv_center_space_cpu = + Spaces.ExtrudedFiniteDifferenceSpace(h_space_cpu, z_center_space_cpu) + hv_face_space_cpu = + Spaces.FaceExtrudedFiniteDifferenceSpace(hv_center_space_cpu) + + Yc = set_initial_condition(hv_center_space) + Yf = set_initial_condition(hv_face_space) + + Yc_cpu = set_initial_condition(hv_center_space_cpu) + Yf_cpu = set_initial_condition(hv_face_space_cpu) + + resultc = Base.sum(Yc) + resultc_cpu = Base.sum(Yc_cpu) + + resultf = Base.sum(Yf) + resultf_cpu = Base.sum(Yf_cpu) + + @test resultc_cpu ≈ resultc + @test resultf_cpu ≈ resultf + + @test resultc ≈ (4 / 3) * π * ((R + z_max)^3 - R^3) rtol = 1e-2 + @test resultf ≈ (4 / 3) * π * ((R + z_max)^3 - R^3) rtol = 1e-2 + + + Yc = set_simple_field(hv_center_space) + Yc_cpu = set_simple_field(hv_center_space_cpu) + + @test Base.maximum(identity, Yc) ≈ Base.maximum(identity, Yc_cpu) + @test Base.minimum(identity, Yc) ≈ Base.minimum(identity, Yc_cpu) + + @test Statistics.mean(Yc) ≈ Statistics.mean(Yc_cpu) + + @test LinearAlgebra.norm(Yc, 1) ≈ LinearAlgebra.norm(Yc_cpu, 1) + @test LinearAlgebra.norm(Yc, 2) ≈ LinearAlgebra.norm(Yc_cpu, 2) + @test LinearAlgebra.norm(Yc, 3) ≈ LinearAlgebra.norm(Yc_cpu, 3) + @test LinearAlgebra.norm(Yc, Inf) ≈ LinearAlgebra.norm(Yc_cpu, Inf) + + Yf = set_simple_field(hv_face_space) + Yf_cpu = set_simple_field(hv_face_space_cpu) + + @test Base.maximum(identity, Yf) ≈ Base.maximum(identity, Yf_cpu) + @test Base.minimum(identity, Yf) ≈ Base.minimum(identity, Yf_cpu) + + @test Statistics.mean(Yf) ≈ Statistics.mean(Yf_cpu) + + @test LinearAlgebra.norm(Yf, 1) ≈ LinearAlgebra.norm(Yf_cpu, 1) + @test LinearAlgebra.norm(Yf, 2) ≈ LinearAlgebra.norm(Yf_cpu, 2) + @test LinearAlgebra.norm(Yf, 3) ≈ LinearAlgebra.norm(Yf_cpu, 3) + @test LinearAlgebra.norm(Yf, Inf) ≈ LinearAlgebra.norm(Yf_cpu, Inf) +end diff --git a/test/Fields/reduction_cuda_utils.jl b/test/Fields/reduction_cuda_utils.jl new file mode 100644 index 0000000000..256de14b70 --- /dev/null +++ b/test/Fields/reduction_cuda_utils.jl @@ -0,0 +1,47 @@ + +# set initial condition for steady-state test +function set_initial_condition(space) + Y = map(Fields.local_geometry_field(space)) do local_geometry + h = 1.0 + return h + end + return Y +end + +# set simple field +function set_simple_field(space) + α0 = 45.0 + h0 = 1.0 + Y = map(Fields.local_geometry_field(space)) do local_geometry + coord = local_geometry.coordinates + ϕ = coord.lat + λ = coord.long + z = coord.z + h = h0 * z * (cosd(α0) * cosd(ϕ) + sind(α0) * cosd(λ) * sind(ϕ)) + return h + end + return Y +end + +function set_elevation(space, h₀) + Y = map(Fields.local_geometry_field(space)) do local_geometry + coord = local_geometry.coordinates + + ϕ = coord.lat + λ = coord.long + FT = eltype(λ) + ϕₘ = FT(0) # degrees (equator) + λₘ = FT(3 / 2 * 180) # degrees + rₘ = + FT(acos(sind(ϕₘ) * sind(ϕ) + cosd(ϕₘ) * cosd(ϕ) * cosd(λ - λₘ))) # Great circle distance (rads) + Rₘ = FT(3π / 4) # Moutain radius + ζₘ = FT(π / 16) # Mountain oscillation half-width + zₛ = ifelse( + rₘ < Rₘ, + FT(h₀ / 2) * (1 + cospi(rₘ / Rₘ)) * (cospi(rₘ / ζₘ))^2, + FT(0), + ) + return zₛ + end + return Y +end