diff --git a/benchmarks/scripts/benchmark_offset.jl b/benchmarks/scripts/benchmark_offset.jl index 5304c01007..9944ff07e4 100644 --- a/benchmarks/scripts/benchmark_offset.jl +++ b/benchmarks/scripts/benchmark_offset.jl @@ -14,32 +14,33 @@ using Revise; include(joinpath("benchmarks", "scripts", "benchmark_offset.jl")) Clima A100: ``` [ Info: ArrayType = CuArray -Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039 -┌────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ -│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ -├────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 834 nanoseconds │ 57.7908 │ 1178.35 │ 4 │ 100 │ -│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 153 nanoseconds │ 68.4046 │ 1394.77 │ 4 │ 100 │ -│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 576 nanoseconds │ 70.3113 │ 1433.65 │ 4 │ 100 │ -│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 185 nanoseconds │ 59.2089 │ 1207.27 │ 4 │ 100 │ -└────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ +Problem size: (63, 4, 4, 1, 5400), N reads-writes: 4, N-reps: 100, Float_type = Float32, Device_bandwidth_GBs=2039 +┌────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ +├────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┤ +│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 84 
microseconds, 726 nanoseconds │ 46.9507 │ 957.324 │ +│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 102 nanoseconds │ 68.4649 │ 1396.0 │ +│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 331 nanoseconds │ 70.618 │ 1439.9 │ +│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 390 nanoseconds │ 59.029 │ 1203.6 │ +└────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┘ [ Info: ArrayType = CuArray -Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039 -┌────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ -│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ -├────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 967 nanoseconds │ 57.6793 │ 1176.08 │ 4 │ 100 │ -│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 82 nanoseconds │ 68.489 │ 1396.49 │ 4 │ 100 │ -│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 597 nanoseconds │ 70.2858 │ 1433.13 │ 4 │ 100 │ -│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 288 nanoseconds │ 59.1188 │ 1205.43 │ 4 │ 100 │ -└────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ +Problem size: (63, 4, 4, 1, 5400), N reads-writes: 4, N-reps: 100, Float_type = Float64, Device_bandwidth_GBs=2039 +┌────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ 
+├────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┤ +│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 107 microseconds, 387 nanoseconds │ 74.086 │ 1510.61 │ +│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 105 microseconds, 42 nanoseconds │ 75.7399 │ 1544.34 │ +│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 102 microseconds, 636 nanoseconds │ 77.5157 │ 1580.54 │ +│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 106 microseconds, 896 nanoseconds │ 74.4266 │ 1517.56 │ +└────────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┘ ``` =# #! format: off module BenchmarkOffset +import CUDA include("benchmark_utils.jl") add3(x1, x2, x3) = x1 + x2 + x3 @@ -76,7 +77,7 @@ function aos_cart_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30) e = min(e, et) end end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4) return nothing end; function aos_cart_offset_kernel!(X, Y, us) @@ -131,7 +132,7 @@ function aos_lin_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30) e = min(e, et) end end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4) return nothing end; function aos_lin_offset_kernel!(X, Y, us) @@ -184,7 +185,7 @@ function soa_cart_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30) e = min(e, et) end end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4) return nothing end; function soa_cart_index_kernel!(X, Y, us) @@ -229,7 +230,7 @@ function 
soa_linear_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30) e = min(e, et) end end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4) return nothing end; function soa_linear_index_kernel!(X, Y, us) @@ -258,9 +259,10 @@ end using CUDA using Test @testset "Offset benchmark" begin - bm = BO.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32) # size(problem_size, 4) == 1 to avoid double counting reads/writes ArrayType = CUDA.CuArray; # ArrayType = Base.identity; + device_name = CUDA.name(CUDA.device()) + bm = BO.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=Float32) # size(problem_size, 4) == 1 to avoid double counting reads/writes arr(float_type, problem_size, T) = T(zeros(float_type, problem_size...)) FT = Float64; diff --git a/benchmarks/scripts/benchmark_utils.jl b/benchmarks/scripts/benchmark_utils.jl index dc52b25582..cc410de6f4 100644 --- a/benchmarks/scripts/benchmark_utils.jl +++ b/benchmarks/scripts/benchmark_utils.jl @@ -1,4 +1,5 @@ -import CUDA +# import CUDA +import ClimaComms using BenchmarkTools, Dates using LazyBroadcast: @lazy @@ -14,21 +15,40 @@ macro caller_name(f) end end +""" + device_info(device_name::String) + +Call with `device_info(CUDA.name(CUDA.device()))` +""" +function device_info(device_name) + device_specs = Dict( + "NVIDIA A100-SXM4-80GB" => (; device_bandwidth_GBs = 2_039), + "Tesla P100-PCIE-16GB" => (; device_bandwidth_GBs = 732), + ) + is_cuda = ClimaComms.device() isa ClimaComms.CUDADevice + if is_cuda && haskey(device_specs, device_name) + (; device_bandwidth_GBs) = device_specs[device_name] + return (; device_bandwidth_GBs, exists = true, name = device_name) + else + return (; device_bandwidth_GBs = 1, exists = false, name = device_name) + end +end + Base.@kwdef mutable struct Benchmark - problem_size::Tuple + problem_size = nothing float_type::Type - 
device_bandwidth_GBs::Int = 2_039 # (A100 SXM4 80GB) data::Vector = [] + unfound_device::Bool = false + unfound_device_name::String = "" + device_name::String = "" end -function perf_stats(; bm::Benchmark, kernel_time_s, n_reads_writes) - N = prod(bm.problem_size) - GB = N * n_reads_writes * sizeof(bm.float_type) / 1024^3 - achieved_bandwidth_GBs = GB / kernel_time_s - bandwidth_efficiency = - achieved_bandwidth_GBs / bm.device_bandwidth_GBs * 100 - return (; N, GB, achieved_bandwidth_GBs, bandwidth_efficiency) -end; +function print_unfound_devices(bm::Benchmark) + bm.unfound_device || return nothing + println("\nUnfound device: $(bm.unfound_device_name). Please") + println("look up specs and add to device_bandwidth() in") + println("$(@__FILE__).\n") +end time_and_units_str(x::Real) = trunc_time(string(compound_period(x, Dates.Second))) @@ -51,46 +71,98 @@ get_Nh(us::UniversalSizesCC) = us.Nh get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = prod((Nv, Nij, Nij, 1, get_Nh(us))) +Base.size(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = + (Nv, Nij, Nij, 1, get_Nh(us)) UniversalSizesCC(Nv, Nij, Nh) = UniversalSizesCC{Nv, Nij}(Nh) UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}() import PrettyTables function tabulate_benchmark(bm) - funcs = map(x -> x.caller, bm.data) + funcs = map(x -> strip(x.caller), bm.data) timings = map(x -> time_and_units_str(x.kernel_time_s), bm.data) n_reads_writes = map(x -> x.n_reads_writes, bm.data) nreps = map(x -> x.nreps, bm.data) + dinfo = device_info(bm.device_name) achieved_bandwidth_GBs = map(x -> x.achieved_bandwidth_GBs, bm.data) - bandwidth_efficiency = map(x -> x.bandwidth_efficiency, bm.data) + bandwidth_efficiency = if dinfo.exists + map(x -> x / dinfo.device_bandwidth_GBs * 100, achieved_bandwidth_GBs) + else + () + end + problem_size = map(x -> x.problem_size, bm.data) + # if we specify the problem size up front, then 
make + # sure that there is no variation when collecting: + if !isnothing(bm.problem_size) + @assert all(prod.(problem_size) .== prod(bm.problem_size)) + end + N = map(x -> prod(x), problem_size) + no_bw_efficiency = length(bandwidth_efficiency) == 0 header = [ "funcs", "time per call", - "bw %", + (no_bw_efficiency ? () : ("bw %",))..., "achieved bw", - "n-reads/writes", - "n-reps", + (allequal(n_reads_writes) ? () : ("N reads-writes",))..., + (allequal(N) ? () : ("problem size",))..., + (allequal(nreps) ? () : ("n-reps",))..., ] - data = hcat( + args = ( funcs, timings, - bandwidth_efficiency, + (no_bw_efficiency ? () : (bandwidth_efficiency,))..., achieved_bandwidth_GBs, - n_reads_writes, - nreps, + (allequal(n_reads_writes) ? () : (n_reads_writes,))..., + (allequal(N) ? () : (problem_size,))..., + (allequal(nreps) ? () : (nreps,))..., + ) + data = hcat(args...) + n_reads_writes_str = + allequal(n_reads_writes) ? "N reads-writes: $(n_reads_writes[1]), " : "" + problem_size_str = allequal(N) ? "Problem size: $(problem_size[1]), " : "" + nreps_str = allequal(nreps) ? "N-reps: $(nreps[1]), " : "" + device_bandwidth_GBs_str = + dinfo.exists ? 
"Device_bandwidth_GBs=$(dinfo.device_bandwidth_GBs)" : "" + print_unfound_devices(bm) + title = strip( + "$problem_size_str$n_reads_writes_str$nreps_str Float_type = $(bm.float_type), $device_bandwidth_GBs_str", ) - title = "Problem size: $(bm.problem_size), float_type = $(bm.float_type), device_bandwidth_GBs=$(bm.device_bandwidth_GBs)" PrettyTables.pretty_table(data; title, header, alignment = :l, crop = :none) end -push_info(bm::Nothing; e, nreps, caller, n_reads_writes) = nothing -function push_info(bm; e, nreps, caller, n_reads_writes) - kernel_time_s = e / nreps +push_info( + bm::Nothing; + kernel_time_s, + nreps, + caller, + n_reads_writes, + problem_size, +) = nothing +function push_info( + bm; + kernel_time_s, + nreps, + caller, + n_reads_writes, + problem_size, +) + N = prod(problem_size) + GB = N * n_reads_writes * sizeof(bm.float_type) / 1024^3 + achieved_bandwidth_GBs = GB / kernel_time_s + dinfo = device_info(bm.device_name) + if !dinfo.exists + bm.unfound_device = true + bm.unfound_device_name = dinfo.name + end + nt = (; caller, kernel_time_s, n_reads_writes, nreps, - perf_stats(; bm, kernel_time_s, n_reads_writes)..., + problem_size, + N, + GB, + achieved_bandwidth_GBs, ) push!(bm.data, nt) end diff --git a/benchmarks/scripts/index_swapping.jl b/benchmarks/scripts/index_swapping.jl index 20410f7d68..d926904542 100644 --- a/benchmarks/scripts/index_swapping.jl +++ b/benchmarks/scripts/index_swapping.jl @@ -24,31 +24,22 @@ In particular, Clima A100 ``` [ Info: ArrayType = CuArray -Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039 -┌──────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ -│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ -├──────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ 
BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) │ 34 microseconds, 617 nanoseconds │ 57.4574 │ 1171.56 │ 2 │ 1000 │ -│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) │ 60 microseconds, 384 nanoseconds │ 32.939 │ 671.627 │ 2 │ 1000 │ -│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) │ 68 microseconds, 108 nanoseconds │ 29.2034 │ 595.458 │ 2 │ 1000 │ -│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) │ 60 microseconds, 395 nanoseconds │ 32.9329 │ 671.502 │ 2 │ 1000 │ -└──────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ -[ Info: ArrayType = CuArray -Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039 -┌──────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ -│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ -├──────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) │ 59 microseconds, 558 nanoseconds │ 66.791 │ 1361.87 │ 2 │ 1000 │ -│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) │ 63 microseconds, 238 nanoseconds │ 62.905 │ 1282.63 │ 2 │ 1000 │ -│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) │ 80 microseconds, 502 nanoseconds │ 49.4142 │ 1007.56 │ 2 │ 1000 │ -│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) │ 63 microseconds, 228 nanoseconds │ 62.9142 │ 1282.82 │ 2 │ 1000 │ -└──────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ +Problem size: (5443200,), N reads-writes: 2, N-reps: 1000, Float_type = Float32, 
Device_bandwidth_GBs=2039 +┌──────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ +├──────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┤ +│ BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) │ 34 microseconds, 738 nanoseconds │ 57.2576 │ 1167.48 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) │ 60 microseconds, 528 nanoseconds │ 32.8605 │ 670.025 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) │ 68 microseconds, 147 nanoseconds │ 29.1867 │ 595.118 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) │ 60 microseconds, 524 nanoseconds │ 32.8627 │ 670.07 │ +└──────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┘ ``` =# #! format: off module IndexSwapBench +import CUDA include("benchmark_utils.jl") foo(x1, x2, x3) = x1 @@ -65,7 +56,7 @@ function at_dot_call!(X, Y; nreps = 1, print_info = true, bm=nothing, n_trials = end e = min(e, et) end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=2) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(X.x1),n_reads_writes=2) return nothing end; @@ -104,7 +95,7 @@ function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap=0, printtb=false end e = min(e, et) end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=2) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(X.x1),n_reads_writes=2) return nothing end; @@ -174,8 +165,8 @@ end # module import .IndexSwapBench as BIS using CUDA -bm = BIS.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32) -# bm = BIS.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float64) +device_name = 
CUDA.name(CUDA.device()) +bm = BIS.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=Float32) ArrayType = CUDA.CuArray; # ArrayType = identity; arr(bm, T) = T(zeros(bm.float_type, bm.problem_size...)) diff --git a/benchmarks/scripts/indexing_and_static_ndranges.jl b/benchmarks/scripts/indexing_and_static_ndranges.jl index 0b2757282c..5f625e821b 100644 --- a/benchmarks/scripts/indexing_and_static_ndranges.jl +++ b/benchmarks/scripts/indexing_and_static_ndranges.jl @@ -107,6 +107,7 @@ Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=20 module IndexStaticRangeBench +import CUDA include("benchmark_utils.jl") # ============================================================ Non-extruded broadcast (start) @@ -253,7 +254,7 @@ function at_dot_call!(X, Y; nreps = 1, bm=nothing, n_trials = 30) end e = min(e, et) end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=1) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(X.x1),n_reads_writes=1) return nothing end; @@ -280,7 +281,7 @@ function custom_sol_kernel!(X, Y, ::Val{N}; nreps = 1, bm=nothing, n_trials = 30 end e = min(e, et) end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=1) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(X.x1),n_reads_writes=1) return nothing end; @@ -346,7 +347,7 @@ function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=false, use_ e = min(e, et) end end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=1) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(X.x1),n_reads_writes=1) return nothing end; @inline get_cart_lin_index(bc, n, I) = I @@ -369,7 +370,8 @@ import .IndexStaticRangeBench as BSR using CUDA using Test -bm = BSR.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32) +device_name = CUDA.name(CUDA.device()) +bm = 
BSR.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=Float32) # bm = BSR.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float64) ArrayType = CUDA.CuArray; # ArrayType = Base.identity; diff --git a/benchmarks/scripts/thermo_bench.jl b/benchmarks/scripts/thermo_bench.jl index 70c27d1652..0dab71ed7e 100644 --- a/benchmarks/scripts/thermo_bench.jl +++ b/benchmarks/scripts/thermo_bench.jl @@ -10,21 +10,22 @@ to be in your local environment to run. Clima A100: ``` [ Info: device = ClimaComms.CUDADevice() -Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039 -┌──────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ -│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ -├──────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm) │ 586 microseconds, 517 nanoseconds │ 15.2602 │ 311.155 │ 9 │ 100 │ -│ TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 292 microseconds, 178 nanoseconds │ 30.6332 │ 624.611 │ 9 │ 100 │ -│ TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm) │ 586 microseconds, 988 nanoseconds │ 15.2479 │ 310.905 │ 9 │ 100 │ -│ TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 292 microseconds, 178 nanoseconds │ 30.6332 │ 624.611 │ 9 │ 100 │ -└──────────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ +Problem size: (4, 4, 1, 63, 5400), N reads-writes: 9, N-reps: 100, Float_type = Float32, Device_bandwidth_GBs=2039 +┌──────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ 
+├──────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┤ +│ TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm) │ 586 microseconds, 353 nanoseconds │ 15.2644 │ 311.242 │ +│ TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 293 microseconds, 796 nanoseconds │ 30.4645 │ 621.171 │ +│ TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm) │ 586 microseconds, 138 nanoseconds │ 15.27 │ 311.356 │ +│ TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 293 microseconds, 755 nanoseconds │ 30.4687 │ 621.258 │ +└──────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┘ ``` =# #! format: off module ThermoBench +import CUDA include("benchmark_utils.jl") import ClimaCore @@ -55,7 +56,8 @@ function thermo_func_bc!(x, thermo_params, us; nreps = 1, bm=nothing, n_trials = end e = min(e, et) end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=5+4) # TODO: verify this + s = size(Fields.field_values(x.ρ)) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=5+4) # TODO: verify this return nothing end @@ -75,7 +77,8 @@ function thermo_func_sol!(x, thermo_params, us::UniversalSizesStatic; nreps = 1, end e = min(e, et) end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=5+4) # TODO: verify this + s = size(x.ρ) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=5+4) # TODO: verify this return nothing end @@ -123,7 +126,8 @@ import .TestUtilities as TU; using Test @testset "Thermo state" begin FT = Float32 - bm = TB.Benchmark(;problem_size=(63,4,4,1,5400), float_type=FT) + device_name = CUDA.name(CUDA.device()) + bm = TB.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=FT) device = ClimaComms.device() context = ClimaComms.context(device) cspace = 
TU.CenterExtrudedFiniteDifferenceSpace( diff --git a/benchmarks/scripts/thermo_bench_bw.jl b/benchmarks/scripts/thermo_bench_bw.jl index 26863d662a..65b9a896d6 100644 --- a/benchmarks/scripts/thermo_bench_bw.jl +++ b/benchmarks/scripts/thermo_bench_bw.jl @@ -14,32 +14,33 @@ using Revise; include(joinpath("benchmarks", "scripts", "thermo_bench_bw.jl")) Clima A100: ``` [ Info: device = ClimaComms.CUDADevice() -Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039 -┌────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ -│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ -├────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ TBB.singlefield_bc!(x_soa, us; nreps=100, bm) │ 67 microseconds, 554 nanoseconds │ 29.4429 │ 600.341 │ 2 │ 100 │ -│ TBB.singlefield_bc!(x_aos, us; nreps=100, bm) │ 69 microseconds, 653 nanoseconds │ 28.5556 │ 582.248 │ 2 │ 100 │ -│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 796 microseconds, 877 nanoseconds │ 12.4798 │ 254.462 │ 10 │ 100 │ -│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 131 microseconds, 72 nanoseconds │ 75.873 │ 1547.05 │ 10 │ 100 │ -└────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ +Problem size: (4, 4, 1, 63, 5400), N-reps: 100, Float_type = Float32, Device_bandwidth_GBs=2039 +┌────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ N reads-writes │ +├────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┤ +│ TBB.singlefield_bc!(x_soa, us; nreps=100, bm) │ 62 microseconds, 864 nanoseconds │ 31.6395 │ 645.129 │ 2 │ +│ 
TBB.singlefield_bc!(x_aos, us; nreps=100, bm) │ 69 microseconds, 858 nanoseconds │ 28.4718 │ 580.541 │ 2 │ +│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 794 microseconds, 225 nanoseconds │ 12.5214 │ 255.312 │ 10 │ +│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 133 microseconds, 530 nanoseconds │ 74.4766 │ 1518.58 │ 10 │ +└────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┘ [ Info: device = ClimaComms.CUDADevice() -Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039 -┌────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ -│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ -├────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ TBB.singlefield_bc!(x_soa, us; nreps=100, bm) │ 108 microseconds, 790 nanoseconds │ 36.5653 │ 745.567 │ 2 │ 100 │ -│ TBB.singlefield_bc!(x_aos, us; nreps=100, bm) │ 123 microseconds, 730 nanoseconds │ 32.1501 │ 655.541 │ 2 │ 100 │ -│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 1 millisecond, 43 microseconds │ 19.0568 │ 388.569 │ 10 │ 100 │ -│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 256 microseconds, 717 nanoseconds │ 77.477 │ 1579.76 │ 10 │ 100 │ -└────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ +Problem size: (4, 4, 1, 63, 5400), N-reps: 100, Float_type = Float64, Device_bandwidth_GBs=2039 +┌────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ N reads-writes │ +├────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┤ +│ TBB.singlefield_bc!(x_soa, us; 
nreps=100, bm) │ 108 microseconds, 514 nanoseconds │ 36.6585 │ 747.466 │ 2 │ +│ TBB.singlefield_bc!(x_aos, us; nreps=100, bm) │ 118 microseconds, 989 nanoseconds │ 33.4311 │ 681.661 │ 2 │ +│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 1 millisecond, 44 microseconds │ 19.0376 │ 388.177 │ 10 │ +│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 257 microseconds, 680 nanoseconds │ 77.1876 │ 1573.86 │ 10 │ +└────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┘ ``` =# #! format: off module ThermoBenchBandwidth +import CUDA include("benchmark_utils.jl") import ClimaCore @@ -74,7 +75,8 @@ function singlefield_bc!(x, us; nreps = 1, bm=nothing, n_trials = 30) end e = min(e, et) end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=2) + s = size(Fields.field_values(x.ρ_read)) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=2) return nothing end @@ -89,7 +91,8 @@ function thermo_func_bc!(x, us; nreps = 1, bm=nothing, n_trials = 30) end e = min(e, et) end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=10) + s = size(Fields.field_values(x.ρ)) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=10) return nothing end @@ -109,7 +112,8 @@ function thermo_func_sol!(x, us::UniversalSizesStatic; nreps = 1, bm=nothing, n_ end e = min(e, et) end - push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=10) + s = size(x.ρ) + push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=10) return nothing end @@ -151,8 +155,9 @@ import .TestUtilities as TU; using Test @testset "Thermo state" begin - FT = Float32 - bm = TBB.Benchmark(;problem_size=(63,4,4,1,5400), float_type=FT) + FT = Float64 + device_name = CUDA.name(CUDA.device()) + bm = TBB.Benchmark(;problem_size=(63,4,4,1,5400), 
device_name, float_type=FT) device = ClimaComms.device() context = ClimaComms.context(device) cspace = TU.CenterExtrudedFiniteDifferenceSpace( diff --git a/test/DataLayouts/benchmark_copyto.jl b/test/DataLayouts/benchmark_copyto.jl index c435800b47..2ad9a1c467 100644 --- a/test/DataLayouts/benchmark_copyto.jl +++ b/test/DataLayouts/benchmark_copyto.jl @@ -6,24 +6,36 @@ using Test using ClimaCore.DataLayouts using BenchmarkTools import ClimaComms +import ClimaCore @static pkgversion(ClimaComms) >= v"0.6" && ClimaComms.@import_required_backends +if ClimaComms.device() isa ClimaComms.CUDADevice + import CUDA + device_name = CUDA.name(CUDA.device()) # Move to ClimaComms +else + device_name = "CPU" +end + +include(joinpath(pkgdir(ClimaCore), "benchmarks/scripts/benchmark_utils.jl")) -function benchmarkcopyto!(device, data, val, name) +function benchmarkcopyto!(bm, device, data, val) + caller = string(nameof(typeof(data))) + @info "Benchmarking $caller..." data_rhs = similar(data) fill!(data_rhs, val) - println("Benchmarking ClimaCore copyto! for $name DataLayout") bc = Base.Broadcast.broadcasted(identity, data_rhs) bcp = Base.Broadcast.broadcasted(identity, parent(data_rhs)) trial = @benchmark ClimaComms.@cuda_sync $device Base.copyto!($data, $bc) - show(stdout, MIME("text/plain"), trial) - println() - println("Benchmarking array copyto! for $name DataLayout") - trial = @benchmark ClimaComms.@cuda_sync $device Base.copyto!( - $(parent(data)), - $bcp, + t_min = minimum(trial.times) * 1e-9 # to seconds + nreps = length(trial.times) + n_reads_writes = DataLayouts.ncomponents(data) * 2 + push_info( + bm; + kernel_time_s = t_min, + nreps = nreps, + caller, + problem_size = size(data), + n_reads_writes, ) - show(stdout, MIME("text/plain"), trial) - println() end @testset "copyto! with Nf = 1" begin @@ -36,18 +48,20 @@ end Nij = 4 Nh = 30 * 30 * 6 Nk = 6 + bm = Benchmark(; float_type = FT, device_name) #! 
format: off - data = DataF{S}(device_zeros(FT,Nf)); benchmarkcopyto!(device, data, 3, "DataF" ); @test all(parent(data) .== 3) - data = IJFH{S, Nij, Nh}(device_zeros(FT,Nij,Nij,Nf,Nh)); benchmarkcopyto!(device, data, 3, "IJFH" ); @test all(parent(data) .== 3) - data = IFH{S, Nij, Nh}(device_zeros(FT,Nij,Nf,Nh)); benchmarkcopyto!(device, data, 3, "IFH" ); @test all(parent(data) .== 3) - # The parent array of IJF and IF datalayouts are MArrays, and can therefore not be passed into CUDA kernels on the RHS. - # data = IJF{S, Nij}(device_zeros(FT,Nij,Nij,Nf)); benchmarkcopyto!(device, data, 3, "IJF" ); @test all(parent(data) .== 3) - # data = IF{S, Nij}(device_zeros(FT,Nij,Nf)); benchmarkcopyto!(device, data, 3, "IF" ); @test all(parent(data) .== 3) - data = VF{S, Nv}(device_zeros(FT,Nv,Nf)); benchmarkcopyto!(device, data, 3, "VF" ); @test all(parent(data) .== 3) - data = VIJFH{S,Nv,Nij,Nh}(device_zeros(FT,Nv,Nij,Nij,Nf,Nh));benchmarkcopyto!(device, data, 3, "VIJFH" ); @test all(parent(data) .== 3) - data = VIFH{S, Nv, Nij, Nh}(device_zeros(FT,Nv,Nij,Nf,Nh)); benchmarkcopyto!(device, data, 3, "VIFH" ); @test all(parent(data) .== 3) + data = DataF{S}(device_zeros(FT,Nf)); benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) + data = IJFH{S, Nij, Nh}(device_zeros(FT,Nij,Nij,Nf,Nh)); benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) + data = IFH{S, Nij, Nh}(device_zeros(FT,Nij,Nf,Nh)); benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) + # The parent array of IJF and IF datalayouts are MArrays, and can therefore not be passed into CUDA kernels on the RHS. 
+ # data = IJF{S, Nij}(device_zeros(FT,Nij,Nij,Nf)); benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) + # data = IF{S, Nij}(device_zeros(FT,Nij,Nf)); benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) + data = VF{S, Nv}(device_zeros(FT,Nv,Nf)); benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) + data = VIJFH{S,Nv,Nij,Nh}(device_zeros(FT,Nv,Nij,Nij,Nf,Nh));benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) + data = VIFH{S, Nv, Nij, Nh}(device_zeros(FT,Nv,Nij,Nf,Nh)); benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) #! format: on - # data = IJKFVH{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); benchmarkcopyto!(device, data, 3); @test all(parent(data) .== 3) # TODO: test - # data = IH1JH2{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); benchmarkcopyto!(device, data, 3); @test all(parent(data) .== 3) # TODO: test + # data = IJKFVH{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) # TODO: test + # data = IH1JH2{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) # TODO: test + tabulate_benchmark(bm) end