diff --git a/benchmarks/scripts/benchmark_offset.jl b/benchmarks/scripts/benchmark_offset.jl
index 5304c01007..9944ff07e4 100644
--- a/benchmarks/scripts/benchmark_offset.jl
+++ b/benchmarks/scripts/benchmark_offset.jl
@@ -14,32 +14,33 @@ using Revise; include(joinpath("benchmarks", "scripts", "benchmark_offset.jl"))
 Clima A100:
 ```
 [ Info: ArrayType = CuArray
-Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
-┌────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
-│ funcs                                                              │ time per call                    │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
-├────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
-│     BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 834 nanoseconds │ 57.7908 │ 1178.35     │ 4              │ 100    │
-│     BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100)          │ 58 microseconds, 153 nanoseconds │ 68.4046 │ 1394.77     │ 4              │ 100    │
-│     BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100)        │ 56 microseconds, 576 nanoseconds │ 70.3113 │ 1433.65     │ 4              │ 100    │
-│     BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100)          │ 67 microseconds, 185 nanoseconds │ 59.2089 │ 1207.27     │ 4              │ 100    │
-└────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
+Problem size: (63, 4, 4, 1, 5400), N reads-writes: 4, N-reps: 100,  Float_type = Float32, Device_bandwidth_GBs=2039
+┌────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┐
+│ funcs                                                          │ time per call                    │ bw %    │ achieved bw │
+├────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┤
+│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 84 microseconds, 726 nanoseconds │ 46.9507 │ 957.324     │
+│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100)          │ 58 microseconds, 102 nanoseconds │ 68.4649 │ 1396.0      │
+│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100)        │ 56 microseconds, 331 nanoseconds │ 70.618  │ 1439.9      │
+│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100)          │ 67 microseconds, 390 nanoseconds │ 59.029  │ 1203.6      │
+└────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┘
 
 [ Info: ArrayType = CuArray
-Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
-┌────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
-│ funcs                                                              │ time per call                    │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
-├────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
-│     BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 967 nanoseconds │ 57.6793 │ 1176.08     │ 4              │ 100    │
-│     BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100)          │ 58 microseconds, 82 nanoseconds  │ 68.489  │ 1396.49     │ 4              │ 100    │
-│     BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100)        │ 56 microseconds, 597 nanoseconds │ 70.2858 │ 1433.13     │ 4              │ 100    │
-│     BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100)          │ 67 microseconds, 288 nanoseconds │ 59.1188 │ 1205.43     │ 4              │ 100    │
-└────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
+Problem size: (63, 4, 4, 1, 5400), N reads-writes: 4, N-reps: 100,  Float_type = Float64, Device_bandwidth_GBs=2039
+┌────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┐
+│ funcs                                                          │ time per call                     │ bw %    │ achieved bw │
+├────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┤
+│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 107 microseconds, 387 nanoseconds │ 74.086  │ 1510.61     │
+│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100)          │ 105 microseconds, 42 nanoseconds  │ 75.7399 │ 1544.34     │
+│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100)        │ 102 microseconds, 636 nanoseconds │ 77.5157 │ 1580.54     │
+│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100)          │ 106 microseconds, 896 nanoseconds │ 74.4266 │ 1517.56     │
+└────────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┘
 ```
 =#
 
 #! format: off
 module BenchmarkOffset
 
+import CUDA
 include("benchmark_utils.jl")
 
 add3(x1, x2, x3) = x1 + x2 + x3
@@ -76,7 +77,7 @@ function aos_cart_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
             e = min(e, et)
         end
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
     return nothing
 end;
 function aos_cart_offset_kernel!(X, Y, us)
@@ -131,7 +132,7 @@ function aos_lin_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
             e = min(e, et)
         end
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
     return nothing
 end;
 function aos_lin_offset_kernel!(X, Y, us)
@@ -184,7 +185,7 @@ function soa_cart_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
             e = min(e, et)
         end
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
     return nothing
 end;
 function soa_cart_index_kernel!(X, Y, us)
@@ -229,7 +230,7 @@ function soa_linear_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
             e = min(e, et)
         end
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
     return nothing
 end;
 function soa_linear_index_kernel!(X, Y, us)
@@ -258,9 +259,10 @@ end
 using CUDA
 using Test
 @testset "Offset benchmark" begin
-    bm = BO.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32) # size(problem_size, 4) == 1 to avoid double counting reads/writes
     ArrayType = CUDA.CuArray;
     # ArrayType = Base.identity;
+    device_name = CUDA.name(CUDA.device())
+    bm = BO.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=Float32) # size(problem_size, 4) == 1 to avoid double counting reads/writes
     arr(float_type, problem_size, T) = T(zeros(float_type, problem_size...))
 
     FT = Float64;
diff --git a/benchmarks/scripts/benchmark_utils.jl b/benchmarks/scripts/benchmark_utils.jl
index dc52b25582..cc410de6f4 100644
--- a/benchmarks/scripts/benchmark_utils.jl
+++ b/benchmarks/scripts/benchmark_utils.jl
@@ -1,4 +1,5 @@
-import CUDA
+# CUDA is not imported here; scripts that include this file import it themselves.
+import ClimaComms
 using BenchmarkTools, Dates
 using LazyBroadcast: @lazy
 
@@ -14,21 +15,40 @@ macro caller_name(f)
     end
 end
 
+"""
+    device_info(device_name::String)
+
+Look up specs for a device; call as `device_info(CUDA.name(CUDA.device()))`.
+"""
+function device_info(device_name)
+    known_specs = Dict(
+        "NVIDIA A100-SXM4-80GB" => (; device_bandwidth_GBs = 2_039),
+        "Tesla P100-PCIE-16GB" => (; device_bandwidth_GBs = 732),
+    )
+    on_cuda = ClimaComms.device() isa ClimaComms.CUDADevice
+    # unknown device (or non-CUDA run): flag it and hand back a placeholder bandwidth
+    if !(on_cuda && haskey(known_specs, device_name))
+        return (; device_bandwidth_GBs = 1, exists = false, name = device_name)
+    end
+    spec = known_specs[device_name]
+    return (; device_bandwidth_GBs = spec.device_bandwidth_GBs, exists = true, name = device_name)
+end
+
 Base.@kwdef mutable struct Benchmark
-    problem_size::Tuple
+    problem_size = nothing # optional; when set, tabulate_benchmark asserts each recorded size has the same element count
     float_type::Type
-    device_bandwidth_GBs::Int = 2_039 # (A100 SXM4 80GB)
     data::Vector = []
+    unfound_device::Bool = false # set to true by push_info when device_info does not know the device
+    unfound_device_name::String = "" # device name stored by push_info together with unfound_device
+    device_name::String = "" # name handed to device_info by push_info and tabulate_benchmark
 end
 
-function perf_stats(; bm::Benchmark, kernel_time_s, n_reads_writes)
-    N = prod(bm.problem_size)
-    GB = N * n_reads_writes * sizeof(bm.float_type) / 1024^3
-    achieved_bandwidth_GBs = GB / kernel_time_s
-    bandwidth_efficiency =
-        achieved_bandwidth_GBs / bm.device_bandwidth_GBs * 100
-    return (; N, GB, achieved_bandwidth_GBs, bandwidth_efficiency)
-end;
+function print_unfound_devices(bm::Benchmark) # print a hint when push_info flagged a device missing from device_info's table
+    bm.unfound_device || return nothing # nothing to report
+    println("\nUnfound device: $(bm.unfound_device_name). Please")
+    println("look up specs and add to device_info() in") # device_info holds the spec table
+    println("$(@__FILE__).\n")
+end
 
 time_and_units_str(x::Real) =
     trunc_time(string(compound_period(x, Dates.Second)))
@@ -51,46 +71,98 @@ get_Nh(us::UniversalSizesCC) = us.Nh
 get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh
 get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} =
     prod((Nv, Nij, Nij, 1, get_Nh(us)))
+Base.size(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} =
+    (Nv, Nij, Nij, 1, get_Nh(us)) # the same five extents that get_N multiplies together
 UniversalSizesCC(Nv, Nij, Nh) = UniversalSizesCC{Nv, Nij}(Nh)
 UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}()
 
 import PrettyTables
 function tabulate_benchmark(bm)
-    funcs = map(x -> x.caller, bm.data)
+    funcs = map(x -> strip(x.caller), bm.data) # one table row per entry that push_info stored in bm.data
     timings = map(x -> time_and_units_str(x.kernel_time_s), bm.data)
     n_reads_writes = map(x -> x.n_reads_writes, bm.data)
     nreps = map(x -> x.nreps, bm.data)
+    dinfo = device_info(bm.device_name)
     achieved_bandwidth_GBs = map(x -> x.achieved_bandwidth_GBs, bm.data)
-    bandwidth_efficiency = map(x -> x.bandwidth_efficiency, bm.data)
+    bandwidth_efficiency = if dinfo.exists
+        map(x -> x / dinfo.device_bandwidth_GBs * 100, achieved_bandwidth_GBs)
+    else
+        ()
+    end
+    problem_size = map(x -> x.problem_size, bm.data)
+    # if we specify the problem size up front, then make
+    # sure that there is no variation when collecting:
+    if !isnothing(bm.problem_size)
+        @assert all(prod.(problem_size) .== prod(bm.problem_size))
+    end
+    N = map(x -> prod(x), problem_size)
+    no_bw_efficiency = length(bandwidth_efficiency) == 0
     header = [
         "funcs",
         "time per call",
-        "bw %",
+        (no_bw_efficiency ? () : ("bw %",))...,
         "achieved bw",
-        "n-reads/writes",
-        "n-reps",
+        (allequal(n_reads_writes) ? () : ("N reads-writes",))...,
+        (allequal(N) ? () : ("problem size",))...,
+        (allequal(nreps) ? () : ("n-reps",))...,
     ]
-    data = hcat(
+    args = (
         funcs,
         timings,
-        bandwidth_efficiency,
+        (no_bw_efficiency ? () : (bandwidth_efficiency,))...,
         achieved_bandwidth_GBs,
-        n_reads_writes,
-        nreps,
+        (allequal(n_reads_writes) ? () : (n_reads_writes,))...,
+        (allequal(N) ? () : (problem_size,))...,
+        (allequal(nreps) ? () : (nreps,))...,
+    )
+    data = hcat(args...)
+    n_reads_writes_str =
+        allequal(n_reads_writes) ? "N reads-writes: $(n_reads_writes[1]), " : ""
+    problem_size_str = allequal(N) ? "Problem size: $(problem_size[1]), " : ""
+    nreps_str = allequal(nreps) ? "N-reps: $(nreps[1]), " : ""
+    device_bandwidth_GBs_str =
+        dinfo.exists ? ", Device_bandwidth_GBs=$(dinfo.device_bandwidth_GBs)" : "" # owns its ", " so an unknown device leaves no dangling comma
+    print_unfound_devices(bm)
+    title = strip(
+        "$problem_size_str$n_reads_writes_str$nreps_str Float_type = $(bm.float_type)$device_bandwidth_GBs_str",
     )
-    title = "Problem size: $(bm.problem_size), float_type = $(bm.float_type), device_bandwidth_GBs=$(bm.device_bandwidth_GBs)"
     PrettyTables.pretty_table(data; title, header, alignment = :l, crop = :none)
 end
 
-push_info(bm::Nothing; e, nreps, caller, n_reads_writes) = nothing
-function push_info(bm; e, nreps, caller, n_reads_writes)
-    kernel_time_s = e / nreps
+push_info( # no-op method: nothing is recorded when `bm` is `nothing`
+    bm::Nothing;
+    kernel_time_s,
+    nreps,
+    caller,
+    n_reads_writes,
+    problem_size,
+) = nothing
+function push_info( # append one timing entry, plus derived bandwidth figures, to bm.data
+    bm;
+    kernel_time_s,
+    nreps,
+    caller,
+    n_reads_writes,
+    problem_size,
+)
+    N = prod(problem_size)
+    GB = N * n_reads_writes * sizeof(bm.float_type) / 1024^3 # NOTE(review): 1024^3 yields GiB; confirm device_bandwidth_GBs in device_info uses the same unit
+    achieved_bandwidth_GBs = GB / kernel_time_s
+    dinfo = device_info(bm.device_name)
+    if !dinfo.exists # remember the unknown device so print_unfound_devices can report it
+        bm.unfound_device = true
+        bm.unfound_device_name = dinfo.name
+    end
+
     nt = (;
         caller,
         kernel_time_s,
         n_reads_writes,
         nreps,
-        perf_stats(; bm, kernel_time_s, n_reads_writes)...,
+        problem_size,
+        N,
+        GB,
+        achieved_bandwidth_GBs,
     )
     push!(bm.data, nt)
 end
diff --git a/benchmarks/scripts/index_swapping.jl b/benchmarks/scripts/index_swapping.jl
index 20410f7d68..d926904542 100644
--- a/benchmarks/scripts/index_swapping.jl
+++ b/benchmarks/scripts/index_swapping.jl
@@ -24,31 +24,22 @@ In particular,
 Clima A100
 ```
 [ Info: ArrayType = CuArray
-Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
-┌──────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
-│ funcs                                                                │ time per call                    │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
-├──────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
-│ BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm)                 │ 34 microseconds, 617 nanoseconds │ 57.4574 │ 1171.56     │ 2              │ 1000   │
-│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) │ 60 microseconds, 384 nanoseconds │ 32.939  │ 671.627     │ 2              │ 1000   │
-│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) │ 68 microseconds, 108 nanoseconds │ 29.2034 │ 595.458     │ 2              │ 1000   │
-│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) │ 60 microseconds, 395 nanoseconds │ 32.9329 │ 671.502     │ 2              │ 1000   │
-└──────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
-[ Info: ArrayType = CuArray
-Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039
-┌──────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
-│ funcs                                                                │ time per call                    │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
-├──────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
-│ BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm)                 │ 59 microseconds, 558 nanoseconds │ 66.791  │ 1361.87     │ 2              │ 1000   │
-│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) │ 63 microseconds, 238 nanoseconds │ 62.905  │ 1282.63     │ 2              │ 1000   │
-│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) │ 80 microseconds, 502 nanoseconds │ 49.4142 │ 1007.56     │ 2              │ 1000   │
-│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) │ 63 microseconds, 228 nanoseconds │ 62.9142 │ 1282.82     │ 2              │ 1000   │
-└──────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
+Problem size: (5443200,), N reads-writes: 2, N-reps: 1000,  Float_type = Float32, Device_bandwidth_GBs=2039
+┌──────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┐
+│ funcs                                                                │ time per call                    │ bw %    │ achieved bw │
+├──────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┤
+│ BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm)                 │ 34 microseconds, 738 nanoseconds │ 57.2576 │ 1167.48     │
+│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) │ 60 microseconds, 528 nanoseconds │ 32.8605 │ 670.025     │
+│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) │ 68 microseconds, 147 nanoseconds │ 29.1867 │ 595.118     │
+│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) │ 60 microseconds, 524 nanoseconds │ 32.8627 │ 670.07      │
+└──────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┘
 ```
 =#
 
 #! format: off
 module IndexSwapBench
 
+import CUDA
 include("benchmark_utils.jl")
 
 foo(x1, x2, x3) = x1
@@ -65,7 +56,7 @@ function at_dot_call!(X, Y; nreps = 1, print_info = true, bm=nothing, n_trials =
         end
         e = min(e, et)
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=2)
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(X.x1),n_reads_writes=2)
     return nothing
 end;
 
@@ -104,7 +95,7 @@ function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap=0, printtb=false
         end
         e = min(e, et)
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=2)
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(X.x1),n_reads_writes=2)
     return nothing
 end;
 
@@ -174,8 +165,8 @@ end # module
 import .IndexSwapBench as BIS
 
 using CUDA
-bm = BIS.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32)
-# bm = BIS.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float64)
+device_name = CUDA.name(CUDA.device())
+bm = BIS.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=Float32)
 ArrayType = CUDA.CuArray;
 # ArrayType = identity;
 arr(bm, T) = T(zeros(bm.float_type, bm.problem_size...))
diff --git a/benchmarks/scripts/indexing_and_static_ndranges.jl b/benchmarks/scripts/indexing_and_static_ndranges.jl
index 0b2757282c..5f625e821b 100644
--- a/benchmarks/scripts/indexing_and_static_ndranges.jl
+++ b/benchmarks/scripts/indexing_and_static_ndranges.jl
@@ -107,6 +107,7 @@ Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=20
 
 module IndexStaticRangeBench
 
+import CUDA
 include("benchmark_utils.jl")
 
 # ============================================================ Non-extruded broadcast (start)
@@ -253,7 +254,7 @@ function at_dot_call!(X, Y; nreps = 1, bm=nothing, n_trials = 30)
         end
         e = min(e, et)
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=1)
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(X.x1),n_reads_writes=1)
     return nothing
 end;
 
@@ -280,7 +281,7 @@ function custom_sol_kernel!(X, Y, ::Val{N}; nreps = 1, bm=nothing, n_trials = 30
         end
         e = min(e, et)
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=1)
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(X.x1),n_reads_writes=1)
 
     return nothing
 end;
@@ -346,7 +347,7 @@ function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=false, use_
             e = min(e, et)
         end
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=1)
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(X.x1),n_reads_writes=1)
     return nothing
 end;
 @inline get_cart_lin_index(bc, n, I) = I
@@ -369,7 +370,8 @@ import .IndexStaticRangeBench as BSR
 
 using CUDA
 using Test
-bm = BSR.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32)
+device_name = CUDA.name(CUDA.device())
+bm = BSR.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=Float32)
 # bm = BSR.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float64)
 ArrayType = CUDA.CuArray;
 # ArrayType = Base.identity;
diff --git a/benchmarks/scripts/thermo_bench.jl b/benchmarks/scripts/thermo_bench.jl
index 70c27d1652..0dab71ed7e 100644
--- a/benchmarks/scripts/thermo_bench.jl
+++ b/benchmarks/scripts/thermo_bench.jl
@@ -10,21 +10,22 @@ to be in your local environment to run.
 Clima A100:
 ```
 [ Info: device = ClimaComms.CUDADevice()
-Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
-┌──────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
-│ funcs                                                            │ time per call                     │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
-├──────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
-│     TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm)      │ 586 microseconds, 517 nanoseconds │ 15.2602 │ 311.155     │ 9              │ 100    │
-│     TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 292 microseconds, 178 nanoseconds │ 30.6332 │ 624.611     │ 9              │ 100    │
-│     TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm)      │ 586 microseconds, 988 nanoseconds │ 15.2479 │ 310.905     │ 9              │ 100    │
-│     TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 292 microseconds, 178 nanoseconds │ 30.6332 │ 624.611     │ 9              │ 100    │
-└──────────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
+Problem size: (4, 4, 1, 63, 5400), N reads-writes: 9, N-reps: 100,  Float_type = Float32, Device_bandwidth_GBs=2039
+┌──────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┐
+│ funcs                                                        │ time per call                     │ bw %    │ achieved bw │
+├──────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┤
+│ TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm)      │ 586 microseconds, 353 nanoseconds │ 15.2644 │ 311.242     │
+│ TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 293 microseconds, 796 nanoseconds │ 30.4645 │ 621.171     │
+│ TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm)      │ 586 microseconds, 138 nanoseconds │ 15.27   │ 311.356     │
+│ TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 293 microseconds, 755 nanoseconds │ 30.4687 │ 621.258     │
+└──────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┘
 ```
 =#
 
 #! format: off
 module ThermoBench
 
+import CUDA
 include("benchmark_utils.jl")
 
 import ClimaCore
@@ -55,7 +56,8 @@ function thermo_func_bc!(x, thermo_params, us; nreps = 1, bm=nothing, n_trials =
         end
         e = min(e, et)
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=5+4) # TODO: verify this
+    s = size(Fields.field_values(x.ρ))
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=5+4) # TODO: verify this
     return nothing
 end
 
@@ -75,7 +77,8 @@ function thermo_func_sol!(x, thermo_params, us::UniversalSizesStatic; nreps = 1,
         end
         e = min(e, et)
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=5+4) # TODO: verify this
+    s = size(x.ρ)
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=5+4) # TODO: verify this
     return nothing
 end
 
@@ -123,7 +126,8 @@ import .TestUtilities as TU;
 using Test
 @testset "Thermo state" begin
     FT = Float32
-    bm = TB.Benchmark(;problem_size=(63,4,4,1,5400), float_type=FT)
+    device_name = CUDA.name(CUDA.device())
+    bm = TB.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=FT)
     device = ClimaComms.device()
     context = ClimaComms.context(device)
     cspace = TU.CenterExtrudedFiniteDifferenceSpace(
diff --git a/benchmarks/scripts/thermo_bench_bw.jl b/benchmarks/scripts/thermo_bench_bw.jl
index 26863d662a..65b9a896d6 100644
--- a/benchmarks/scripts/thermo_bench_bw.jl
+++ b/benchmarks/scripts/thermo_bench_bw.jl
@@ -14,32 +14,33 @@ using Revise; include(joinpath("benchmarks", "scripts", "thermo_bench_bw.jl"))
 Clima A100:
 ```
 [ Info: device = ClimaComms.CUDADevice()
-Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
-┌────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
-│ funcs                                              │ time per call                     │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
-├────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
-│     TBB.singlefield_bc!(x_soa, us; nreps=100, bm)  │ 67 microseconds, 554 nanoseconds  │ 29.4429 │ 600.341     │ 2              │ 100    │
-│     TBB.singlefield_bc!(x_aos, us; nreps=100, bm)  │ 69 microseconds, 653 nanoseconds  │ 28.5556 │ 582.248     │ 2              │ 100    │
-│     TBB.thermo_func_bc!(x, us; nreps=100, bm)      │ 796 microseconds, 877 nanoseconds │ 12.4798 │ 254.462     │ 10             │ 100    │
-│     TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 131 microseconds, 72 nanoseconds  │ 75.873  │ 1547.05     │ 10             │ 100    │
-└────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
+Problem size: (4, 4, 1, 63, 5400), N-reps: 100,  Float_type = Float32, Device_bandwidth_GBs=2039
+┌────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┐
+│ funcs                                          │ time per call                     │ bw %    │ achieved bw │ N reads-writes │
+├────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┤
+│ TBB.singlefield_bc!(x_soa, us; nreps=100, bm)  │ 62 microseconds, 864 nanoseconds  │ 31.6395 │ 645.129     │ 2              │
+│ TBB.singlefield_bc!(x_aos, us; nreps=100, bm)  │ 69 microseconds, 858 nanoseconds  │ 28.4718 │ 580.541     │ 2              │
+│ TBB.thermo_func_bc!(x, us; nreps=100, bm)      │ 794 microseconds, 225 nanoseconds │ 12.5214 │ 255.312     │ 10             │
+│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 133 microseconds, 530 nanoseconds │ 74.4766 │ 1518.58     │ 10             │
+└────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┘
 
 [ Info: device = ClimaComms.CUDADevice()
-Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039
-┌────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
-│ funcs                                              │ time per call                     │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
-├────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
-│     TBB.singlefield_bc!(x_soa, us; nreps=100, bm)  │ 108 microseconds, 790 nanoseconds │ 36.5653 │ 745.567     │ 2              │ 100    │
-│     TBB.singlefield_bc!(x_aos, us; nreps=100, bm)  │ 123 microseconds, 730 nanoseconds │ 32.1501 │ 655.541     │ 2              │ 100    │
-│     TBB.thermo_func_bc!(x, us; nreps=100, bm)      │ 1 millisecond, 43 microseconds    │ 19.0568 │ 388.569     │ 10             │ 100    │
-│     TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 256 microseconds, 717 nanoseconds │ 77.477  │ 1579.76     │ 10             │ 100    │
-└────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
+Problem size: (4, 4, 1, 63, 5400), N-reps: 100,  Float_type = Float64, Device_bandwidth_GBs=2039
+┌────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┐
+│ funcs                                          │ time per call                     │ bw %    │ achieved bw │ N reads-writes │
+├────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┤
+│ TBB.singlefield_bc!(x_soa, us; nreps=100, bm)  │ 108 microseconds, 514 nanoseconds │ 36.6585 │ 747.466     │ 2              │
+│ TBB.singlefield_bc!(x_aos, us; nreps=100, bm)  │ 118 microseconds, 989 nanoseconds │ 33.4311 │ 681.661     │ 2              │
+│ TBB.thermo_func_bc!(x, us; nreps=100, bm)      │ 1 millisecond, 44 microseconds    │ 19.0376 │ 388.177     │ 10             │
+│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 257 microseconds, 680 nanoseconds │ 77.1876 │ 1573.86     │ 10             │
+└────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┘
 ```
 =#
 
 #! format: off
 module ThermoBenchBandwidth
 
+import CUDA
 include("benchmark_utils.jl")
 
 import ClimaCore
@@ -74,7 +75,8 @@ function singlefield_bc!(x, us; nreps = 1, bm=nothing, n_trials = 30)
         end
         e = min(e, et)
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=2)
+    s = size(Fields.field_values(x.ρ_read))
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=2)
     return nothing
 end
 
@@ -89,7 +91,8 @@ function thermo_func_bc!(x, us; nreps = 1, bm=nothing, n_trials = 30)
         end
         e = min(e, et)
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=10)
+    s = size(Fields.field_values(x.ρ))
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=10)
     return nothing
 end
 
@@ -109,7 +112,8 @@ function thermo_func_sol!(x, us::UniversalSizesStatic; nreps = 1, bm=nothing, n_
         end
         e = min(e, et)
     end
-    push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=10)
+    s = size(x.ρ)
+    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=10)
     return nothing
 end
 
@@ -151,8 +155,9 @@ import .TestUtilities as TU;
 
 using Test
 @testset "Thermo state" begin
-    FT = Float32
-    bm = TBB.Benchmark(;problem_size=(63,4,4,1,5400), float_type=FT)
+    FT = Float64
+    device_name = CUDA.name(CUDA.device())
+    bm = TBB.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=FT)
     device = ClimaComms.device()
     context = ClimaComms.context(device)
     cspace = TU.CenterExtrudedFiniteDifferenceSpace(
diff --git a/test/DataLayouts/benchmark_copyto.jl b/test/DataLayouts/benchmark_copyto.jl
index c435800b47..2ad9a1c467 100644
--- a/test/DataLayouts/benchmark_copyto.jl
+++ b/test/DataLayouts/benchmark_copyto.jl
@@ -6,24 +6,36 @@ using Test
 using ClimaCore.DataLayouts
 using BenchmarkTools
 import ClimaComms
+import ClimaCore
 @static pkgversion(ClimaComms) >= v"0.6" && ClimaComms.@import_required_backends
+if ClimaComms.device() isa ClimaComms.CUDADevice
+    import CUDA
+    device_name = CUDA.name(CUDA.device()) # TODO: move this device-name lookup into ClimaComms
+else
+    device_name = "CPU" # label used whenever the device is not a CUDADevice
+end
+
+include(joinpath(pkgdir(ClimaCore), "benchmarks/scripts/benchmark_utils.jl"))
 
-function benchmarkcopyto!(device, data, val, name)
+function benchmarkcopyto!(bm, device, data, val) # time copyto! into `data`; record it in `bm`
+    caller = string(nameof(typeof(data))) # label the entry with the name of data's type
+    @info "Benchmarking $caller..."
     data_rhs = similar(data)
     fill!(data_rhs, val)
-    println("Benchmarking ClimaCore copyto! for $name DataLayout")
     bc = Base.Broadcast.broadcasted(identity, data_rhs)
     bcp = Base.Broadcast.broadcasted(identity, parent(data_rhs))
     trial = @benchmark ClimaComms.@cuda_sync $device Base.copyto!($data, $bc)
-    show(stdout, MIME("text/plain"), trial)
-    println()
-    println("Benchmarking array copyto! for $name DataLayout")
-    trial = @benchmark ClimaComms.@cuda_sync $device Base.copyto!(
-        $(parent(data)),
-        $bcp,
+    t_min = minimum(trial.times) * 1e-9 # fastest sample; BenchmarkTools times are in ns -> s
+    nreps = length(trial.times) # number of samples collected by @benchmark
+    n_reads_writes = DataLayouts.ncomponents(data) * 2 # presumably 1 read + 1 write per component; TODO confirm
+    push_info(
+        bm;
+        kernel_time_s = t_min,
+        nreps = nreps,
+        caller,
+        problem_size = size(data),
+        n_reads_writes,
     )
-    show(stdout, MIME("text/plain"), trial)
-    println()
 end
 
 @testset "copyto! with Nf = 1" begin
@@ -36,18 +48,20 @@ end
     Nij = 4
     Nh = 30 * 30 * 6
     Nk = 6
+    bm = Benchmark(; float_type = FT, device_name)
 #! format: off
-    data = DataF{S}(device_zeros(FT,Nf));                        benchmarkcopyto!(device, data, 3, "DataF" ); @test all(parent(data) .== 3)
-    data = IJFH{S, Nij, Nh}(device_zeros(FT,Nij,Nij,Nf,Nh));     benchmarkcopyto!(device, data, 3, "IJFH"  ); @test all(parent(data) .== 3)
-    data = IFH{S, Nij, Nh}(device_zeros(FT,Nij,Nf,Nh));          benchmarkcopyto!(device, data, 3, "IFH"   ); @test all(parent(data) .== 3)
-    # The parent array of IJF and IF datalayouts are MArrays, and can therefore not be passed into CUDA kernels on the RHS.
-    # data = IJF{S, Nij}(device_zeros(FT,Nij,Nij,Nf));             benchmarkcopyto!(device, data, 3, "IJF"   ); @test all(parent(data) .== 3)
-    # data = IF{S, Nij}(device_zeros(FT,Nij,Nf));                  benchmarkcopyto!(device, data, 3, "IF"    ); @test all(parent(data) .== 3)
-    data = VF{S, Nv}(device_zeros(FT,Nv,Nf));                    benchmarkcopyto!(device, data, 3, "VF"    ); @test all(parent(data) .== 3)
-    data = VIJFH{S,Nv,Nij,Nh}(device_zeros(FT,Nv,Nij,Nij,Nf,Nh));benchmarkcopyto!(device, data, 3, "VIJFH" ); @test all(parent(data) .== 3)
-    data = VIFH{S, Nv, Nij, Nh}(device_zeros(FT,Nv,Nij,Nf,Nh));  benchmarkcopyto!(device, data, 3, "VIFH"  ); @test all(parent(data) .== 3)
+    data = DataF{S}(device_zeros(FT,Nf));                        benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3)
+    data = IJFH{S, Nij, Nh}(device_zeros(FT,Nij,Nij,Nf,Nh));     benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3)
+    data = IFH{S, Nij, Nh}(device_zeros(FT,Nij,Nf,Nh));          benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3)
+    # The parent arrays of the IJF and IF datalayouts are MArrays, and can therefore not be passed into CUDA kernels on the RHS.
+    # data = IJF{S, Nij}(device_zeros(FT,Nij,Nij,Nf));             benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3)
+    # data = IF{S, Nij}(device_zeros(FT,Nij,Nf));                  benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3)
+    data = VF{S, Nv}(device_zeros(FT,Nv,Nf));                    benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3)
+    data = VIJFH{S,Nv,Nij,Nh}(device_zeros(FT,Nv,Nij,Nij,Nf,Nh));benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3)
+    data = VIFH{S, Nv, Nij, Nh}(device_zeros(FT,Nv,Nij,Nf,Nh));  benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3)
 #! format: on
 
-    # data = IJKFVH{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); benchmarkcopyto!(device, data, 3); @test all(parent(data) .== 3) # TODO: test
-    # data = IH1JH2{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); benchmarkcopyto!(device, data, 3); @test all(parent(data) .== 3) # TODO: test
+    # data = IJKFVH{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) # TODO: test
+    # data = IH1JH2{S}(device_zeros(FT,Nij,Nij,Nk,Nf,Nh)); benchmarkcopyto!(bm, device, data, 3); @test all(parent(data) .== 3) # TODO: test
+    tabulate_benchmark(bm)
 end