diff --git a/test/MatrixFields/band_matrix_row.jl b/test/MatrixFields/band_matrix_row.jl
index 0077948b4a..b48bec969f 100644
--- a/test/MatrixFields/band_matrix_row.jl
+++ b/test/MatrixFields/band_matrix_row.jl
@@ -1,18 +1,6 @@
-using Test
-using JET
 using LinearAlgebra: I
 
-using ClimaCore.MatrixFields
-import ClimaCore: Geometry
-
-macro test_all(expression)
-    return quote
-        local test_func() = $(esc(expression))
-        @test test_func()                   # correctness
-        @test (@allocated test_func()) == 0 # allocations
-        @test_opt test_func()               # type instabilities
-    end
-end
+include("matrix_field_test_utils.jl")
 
 @testset "BandMatrixRow Unit Tests" begin
     @test_all DiagonalMatrixRow(1) ==
@@ -41,14 +29,14 @@ end
               TridiagonalMatrixRow(1, 0, 1) / 2 - I ==
               zero(PentadiagonalMatrixRow{Int})
 
-    T(value) = (; a = (), b = value, c = (value, (; d = (value,)), (;)))
-    @test_all QuaddiagonalMatrixRow(T(0.5), T(1), T(1), T(1 // 2)) +
-              BidiagonalMatrixRow(T(-0.5), T(-1 // 2)) ==
-              QuaddiagonalMatrixRow(T(1), T(1), T(1), T(1)) / 2
-    @test_all PentadiagonalMatrixRow(T(0), T(0.5), T(1), T(1 // 2), T(0)) -
-              TridiagonalMatrixRow(T(1), T(0), T(1)) / 2 -
-              0.5 * DiagonalMatrixRow(T(2)) ==
-              PentadiagonalMatrixRow(T(0), T(0), T(0), T(0), T(0))
+    NT = nested_type
+    @test_all QuaddiagonalMatrixRow(NT(0.5), NT(1), NT(1), NT(1 // 2)) +
+              BidiagonalMatrixRow(NT(-0.5), NT(-1 // 2)) ==
+              QuaddiagonalMatrixRow(NT(1), NT(1), NT(1), NT(1)) / 2
+    @test_all PentadiagonalMatrixRow(NT(0), NT(0.5), NT(1), NT(1 // 2), NT(0)) -
+              TridiagonalMatrixRow(NT(1), NT(0), NT(1)) / 2 -
+              0.5 * DiagonalMatrixRow(NT(2)) ==
+              PentadiagonalMatrixRow(NT(0), NT(0), NT(0), NT(0), NT(0))
 
     @test_throws "Cannot promote" BidiagonalMatrixRow(1, 1) + I
     @test_throws "Cannot promote" BidiagonalMatrixRow(1, 1) +
diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl
index d60f96fcb1..72e4944432 100644
--- a/test/MatrixFields/matrix_field_broadcasting.jl
+++ b/test/MatrixFields/matrix_field_broadcasting.jl
@@ -1,292 +1,80 @@
-using Test
-using JET
-import CUDA
 import BandedMatrices: band
 import LinearAlgebra: I, mul!
-import Random: seed!
 
-using ClimaCore.MatrixFields
-import ClimaCore:
-    Geometry, Domains, Meshes, Topologies, Hypsography, Spaces, Fields
-import ClimaComms
-
-# Using @benchmark from BenchmarkTools is extremely slow; it appears to keep
-# triggering recompilations and allocating a lot of memory in the process.
-# This macro returns the minimum time (in seconds) required to run the
-# expression after it has been compiled.
-macro benchmark(expression)
-    return quote
-        $(esc(expression)) # Compile the expression first. Use esc for hygiene.
-        best_time = Inf
-        start_time = time_ns()
-        while time_ns() - start_time < 1e8 # Benchmark for 0.1 s (1e8 ns).
-            best_time = min(best_time, @elapsed $(esc(expression)))
-        end
-        best_time
-    end
-end
-
-# This function is used for benchmarking ref_set_result!.
-function call_array_func(
-    ref_set_result!::F,
-    ref_result_arrays,
-    inputs_arrays,
-    temp_values_arrays,
-) where {F}
-    for arrays in
-        zip(ref_result_arrays, inputs_arrays..., temp_values_arrays...)
-        ref_set_result!(arrays...)
-    end
-end
-
-function test_matrix_broadcast_against_array_reference(;
-    test_name,
-    inputs,
-    get_result::F1,
-    set_result!::F2,
-    get_temp_values::F3 = (_...) -> (),
-    ref_set_result!::F4,
-    time_ratio_limit = 10,
-    max_eps_error_limit = 7,
-    test_broken_with_cuda = false,
-) where {F1, F2, F3, F4}
-    @testset "$test_name" begin
-        is_using_cuda = ClimaComms.device(inputs[1]) isa ClimaComms.CUDADevice
-        ignore_cuda = (AnyFrameModule(CUDA),)
-
-        if test_broken_with_cuda && is_using_cuda
-            @test_throws CUDA.InvalidIRError get_result(inputs...)
-            @warn "$test_name:\n\tCUDA.InvalidIRError"
-            return
-        end
-
-        result = get_result(inputs...)
-        temp_values = get_temp_values(inputs...)
-
-        # Fill all output fields with NaNs for testing correctness.
-        result .*= NaN
-        for temp_value in temp_values
-            temp_value .*= NaN
-        end
-
-        ref_result_arrays = MatrixFields.field2arrays(result)
-        inputs_arrays = map(MatrixFields.field2arrays, inputs)
-        temp_values_arrays = map(MatrixFields.field2arrays, temp_values)
-
-        best_time = @benchmark set_result!(result, inputs...)
-        best_ref_time = @benchmark call_array_func(
-            ref_set_result!,
-            ref_result_arrays,
-            inputs_arrays,
-            temp_values_arrays,
-        )
-
-        # Compute the maximum error as an integer multiple of machine epsilon.
-        result_arrays = MatrixFields.field2arrays(result)
-        max_error =
-            maximum(zip(result_arrays, ref_result_arrays)) do (array, ref_array)
-                maximum(abs.(array .- ref_array))
-            end
-        max_eps_error = ceil(Int, max_error / eps(typeof(max_error)))
-
-        @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \
-               = $best_ref_time s\n\tMaximum Error = $max_eps_error eps"
-
-        # Test that set_result! is performant compared to ref_set_result!.
-        @test best_time / best_ref_time < time_ratio_limit
-
-        # Test set_result! for correctness, allocations, and type instabilities.
-        # Ignore the type instabilities in CUDA and the allocations they incur.
-        @test max_eps_error < max_eps_error_limit
-        @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0
-        @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...)
-
-        # Test ref_set_result! for allocations and type instabilities. This is
-        # helpful for ensuring that the performance comparison is fair.
-        @test (@allocated call_array_func(
-            ref_set_result!,
-            ref_result_arrays,
-            inputs_arrays,
-            temp_values_arrays,
-        )) == 0
-        @test_opt call_array_func(
-            ref_set_result!,
-            ref_result_arrays,
-            inputs_arrays,
-            temp_values_arrays,
-        )
-
-        # Test get_result (the allocating version of set_result!) for type
-        # instabilities. Ignore the type instabilities in CUDA.
-        @test_opt ignored_modules = ignore_cuda get_result(inputs...)
-    end
-end
-
-function test_matrix_broadcast_against_reference(;
-    test_name,
-    inputs,
-    get_result::F1,
-    set_result!::F2,
-    ref_inputs,
-    ref_set_result!::F3,
-) where {F1, F2, F3}
-    @testset "$test_name" begin
-        is_using_cuda = ClimaComms.device(inputs[1]) isa ClimaComms.CUDADevice
-        ignore_cuda = (AnyFrameModule(CUDA),)
-
-        result = get_result(inputs...)
-
-        # Fill the output field with NaNs for testing correctness.
-        result .*= NaN
-
-        ref_result = copy(result)
-
-        best_time = @benchmark set_result!(result, inputs...)
-        best_ref_time = @benchmark ref_set_result!(ref_result, ref_inputs...)
-
-        @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \
-               = $best_ref_time s"
-
-        # Test that set_result! is performant compared to ref_set_result!.
-        @test best_time < best_ref_time
-
-        # Test set_result! for correctness, allocations, and type instabilities.
-        # Ignore the type instabilities in CUDA and the allocations they incur.
-        # Account for the roundoff errors that CUDA introduces.
-        @test if is_using_cuda
-            max_error = maximum(abs.(parent(result) .- parent(ref_result)))
-            max_error < eps(typeof(max_error))
-        else
-            result == ref_result
-        end
-        @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0
-        @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...)
-
-        # Test ref_set_result! for allocations and type instabilities. This is
-        # helpful for ensuring that the performance comparison is fair. Ignore
-        # the type instabilities in CUDA and the allocations they incur.
-        @test is_using_cuda ||
-              (@allocated ref_set_result!(ref_result, ref_inputs...)) == 0
-        @test_opt ignored_modules = ignore_cuda ref_set_result!(
-            ref_result,
-            ref_inputs...,
-        )
-
-        # Test get_result (the allocating version of set_result!) for type
-        # instabilities. Ignore the type instabilities in CUDA.
-        @test_opt ignored_modules = ignore_cuda get_result(inputs...)
-    end
-end
-
-function random_test_fields(::Type{FT}) where {FT}
-    velem = 20 # This should be big enough to test high-bandwidth matrices.
-    helem = npoly = 1 # These should be small enough for the tests to be fast.
-
-    comms_ctx = ClimaComms.SingletonCommsContext()
-    hdomain = Domains.SphereDomain(FT(10))
-    hmesh = Meshes.EquiangularCubedSphere(hdomain, helem)
-    htopology = Topologies.Topology2D(comms_ctx, hmesh)
-    quad = Spaces.Quadratures.GLL{npoly + 1}()
-    hspace = Spaces.SpectralElementSpace2D(htopology, quad)
-    vdomain = Domains.IntervalDomain(
-        Geometry.ZPoint(FT(0)),
-        Geometry.ZPoint(FT(10));
-        boundary_tags = (:bottom, :top),
-    )
-    vmesh = Meshes.IntervalMesh(vdomain, nelems = velem)
-    vtopology = Topologies.IntervalTopology(comms_ctx, vmesh)
-    vspace = Spaces.CenterFiniteDifferenceSpace(vtopology)
-    sfc_coord = Fields.coordinate_field(hspace)
-    hypsography =
-        comms_ctx.device isa ClimaComms.CUDADevice ? Hypsography.Flat() :
-        Hypsography.LinearAdaption(
-            @. cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1
-        ) # TODO: FD operators don't currently work with hypsography on GPUs.
-    center_space =
-        Spaces.ExtrudedFiniteDifferenceSpace(hspace, vspace, hypsography)
-    face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(center_space)
-    ᶜcoord = Fields.coordinate_field(center_space)
-    ᶠcoord = Fields.coordinate_field(face_space)
-
-    seed!(1) # ensures reproducibility
-    ᶜᶜmat = map(c -> DiagonalMatrixRow(ntuple(_ -> rand(FT), 1)...), ᶜcoord)
-    ᶜᶠmat = map(c -> BidiagonalMatrixRow(ntuple(_ -> rand(FT), 2)...), ᶜcoord)
-    ᶠᶠmat = map(c -> TridiagonalMatrixRow(ntuple(_ -> rand(FT), 3)...), ᶠcoord)
-    ᶠᶜmat = map(c -> QuaddiagonalMatrixRow(ntuple(_ -> rand(FT), 4)...), ᶠcoord)
-    ᶜvec = map(c -> rand(FT), ᶜcoord)
-    ᶠvec = map(c -> rand(FT), ᶠcoord)
-
-    return ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec
-end
+include("matrix_field_test_utils.jl")
 
 @testset "Scalar Matrix Field Broadcasting" begin
-    ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec = random_test_fields(Float64)
+    FT = Float64
+    center_space, face_space = test_spaces(FT)
 
-    test_matrix_broadcast_against_array_reference(;
+    seed!(1) # ensures reproducibility
+    ᶜvec = random_field(FT, center_space)
+    ᶠvec = random_field(FT, face_space)
+    ᶜᶜmat = random_field(DiagonalMatrixRow{FT}, center_space)
+    ᶜᶠmat = random_field(BidiagonalMatrixRow{FT}, center_space)
+    ᶠᶠmat = random_field(TridiagonalMatrixRow{FT}, face_space)
+    ᶠᶜmat = random_field(QuaddiagonalMatrixRow{FT}, face_space)
+
+    test_field_broadcast_against_array_reference(;
         test_name = "diagonal matrix times vector",
-        inputs = (ᶜᶜmat, ᶜvec),
-        get_result = (ᶜᶜmat, ᶜvec) -> (@. ᶜᶜmat ⋅ ᶜvec),
-        set_result! = (result, ᶜᶜmat, ᶜvec) -> (@. result = ᶜᶜmat ⋅ ᶜvec),
+        get_result = () -> (@. ᶜᶜmat ⋅ ᶜvec),
+        set_result! = result -> (@. result = ᶜᶜmat ⋅ ᶜvec),
+        input_fields = (ᶜᶜmat, ᶜvec),
         ref_set_result! = (_result, _ᶜᶜmat, _ᶜvec) ->
             mul!(_result, _ᶜᶜmat, _ᶜvec),
-        time_ratio_limit = 10, # This case's ref function is fast on Buildkite.
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "tri-diagonal matrix times vector",
-        inputs = (ᶠᶠmat, ᶠvec),
-        get_result = (ᶠᶠmat, ᶠvec) -> (@. ᶠᶠmat ⋅ ᶠvec),
-        set_result! = (result, ᶠᶠmat, ᶠvec) -> (@. result = ᶠᶠmat ⋅ ᶠvec),
+        get_result = () -> (@. ᶠᶠmat ⋅ ᶠvec),
+        set_result! = result -> (@. result = ᶠᶠmat ⋅ ᶠvec),
+        input_fields = (ᶠᶠmat, ᶠvec),
         ref_set_result! = (_result, _ᶠᶠmat, _ᶠvec) ->
             mul!(_result, _ᶠᶠmat, _ᶠvec),
-        time_ratio_limit = 10, # This case's ref function is fast on Buildkite.
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "quad-diagonal matrix times vector",
-        inputs = (ᶠᶜmat, ᶜvec),
-        get_result = (ᶠᶜmat, ᶜvec) -> (@. ᶠᶜmat ⋅ ᶜvec),
-        set_result! = (result, ᶠᶜmat, ᶜvec) -> (@. result = ᶠᶜmat ⋅ ᶜvec),
+        get_result = () -> (@. ᶠᶜmat ⋅ ᶜvec),
+        set_result! = result -> (@. result = ᶠᶜmat ⋅ ᶜvec),
+        input_fields = (ᶠᶜmat, ᶜvec),
         ref_set_result! = (_result, _ᶠᶜmat, _ᶜvec) ->
             mul!(_result, _ᶠᶜmat, _ᶜvec),
-        time_ratio_limit = 10, # This case's ref function is fast on Buildkite.
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "diagonal matrix times bi-diagonal matrix",
-        inputs = (ᶜᶜmat, ᶜᶠmat),
-        get_result = (ᶜᶜmat, ᶜᶠmat) -> (@. ᶜᶜmat ⋅ ᶜᶠmat),
-        set_result! = (result, ᶜᶜmat, ᶜᶠmat) -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat),
+        get_result = () -> (@. ᶜᶜmat ⋅ ᶜᶠmat),
+        set_result! = result -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat),
+        input_fields = (ᶜᶜmat, ᶜᶠmat),
         ref_set_result! = (_result, _ᶜᶜmat, _ᶜᶠmat) ->
             mul!(_result, _ᶜᶜmat, _ᶜᶠmat),
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "tri-diagonal matrix times tri-diagonal matrix",
-        inputs = (ᶠᶠmat,),
-        get_result = (ᶠᶠmat,) -> (@. ᶠᶠmat ⋅ ᶠᶠmat),
-        set_result! = (result, ᶠᶠmat) -> (@. result = ᶠᶠmat ⋅ ᶠᶠmat),
+        get_result = () -> (@. ᶠᶠmat ⋅ ᶠᶠmat),
+        set_result! = result -> (@. result = ᶠᶠmat ⋅ ᶠᶠmat),
+        input_fields = (ᶠᶠmat,),
         ref_set_result! = (_result, _ᶠᶠmat) -> mul!(_result, _ᶠᶠmat, _ᶠᶠmat),
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "quad-diagonal matrix times diagonal matrix",
-        inputs = (ᶠᶜmat, ᶜᶜmat),
-        get_result = (ᶠᶜmat, ᶜᶜmat) -> (@. ᶠᶜmat ⋅ ᶜᶜmat),
-        set_result! = (result, ᶠᶜmat, ᶜᶜmat) -> (@. result = ᶠᶜmat ⋅ ᶜᶜmat),
+        get_result = () -> (@. ᶠᶜmat ⋅ ᶜᶜmat),
+        set_result! = result -> (@. result = ᶠᶜmat ⋅ ᶜᶜmat),
+        input_fields = (ᶠᶜmat, ᶜᶜmat),
         ref_set_result! = (_result, _ᶠᶜmat, _ᶜᶜmat) ->
             mul!(_result, _ᶠᶜmat, _ᶜᶜmat),
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "diagonal matrix times bi-diagonal matrix times \
                      tri-diagonal matrix times quad-diagonal matrix",
-        inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
-        get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) ->
-            (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat),
-        set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) ->
-            (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat),
-        get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) ->
+        get_result = () -> (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat),
+        set_result! = result -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat),
+        input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
+        get_temp_value_fields = () ->
             ((@. ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat)),
         ref_set_result! = (
             _result,
@@ -303,16 +91,14 @@ end
         end,
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "diagonal matrix times bi-diagonal matrix times \
                      tri-diagonal matrix times quad-diagonal matrix, but with \
                      forced right-associativity",
-        inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
-        get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) ->
-            (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))),
-        set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) ->
-            (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))),
-        get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) ->
+        get_result = () -> (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))),
+        set_result! = result -> (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))),
+        input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
+        get_temp_value_fields = () ->
             ((@. ᶠᶠmat ⋅ ᶠᶜmat), (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))),
         ref_set_result! = (
             _result,
@@ -330,16 +116,15 @@ end
         test_broken_with_cuda = true, # TODO: Fix this.
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "diagonal matrix times bi-diagonal matrix times \
                      tri-diagonal matrix times quad-diagonal matrix times \
                      vector",
-        inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec),
-        get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) ->
-            (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec),
-        set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) ->
+        get_result = () -> (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec),
+        set_result! = result ->
             (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec),
-        get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> (
+        input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec),
+        get_temp_value_fields = () -> (
             (@. ᶜᶜmat ⋅ ᶜᶠmat),
             (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat),
             (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat),
@@ -362,16 +147,15 @@ end
         end,
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "diagonal matrix times bi-diagonal matrix times \
                      tri-diagonal matrix times quad-diagonal matrix times \
                      vector, but with forced right-associativity",
-        inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec),
-        get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) ->
-            (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))),
-        set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) ->
+        get_result = () -> (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))),
+        set_result! = result ->
             (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))),
-        get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> (
+        input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec),
+        get_temp_value_fields = () -> (
             (@. ᶠᶜmat ⋅ ᶜvec),
             (@. ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)),
             (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec))),
@@ -396,14 +180,14 @@ end
         test_broken_with_cuda = true, # TODO: Fix this.
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "linear combination of matrix products and LinearAlgebra.I",
-        inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
-        get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) ->
+        get_result = () ->
             (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)),
-        set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) ->
+        set_result! = result ->
             (@. result = 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)),
-        get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (
+        input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
+        get_temp_value_fields = () -> (
             (@. 2 * ᶠᶜmat),
             (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat),
             (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat),
@@ -429,15 +213,15 @@ end
         end,
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "another linear combination of matrix products and \
                      LinearAlgebra.I",
-        inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
-        get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) ->
+        get_result = () ->
             (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)),
-        set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result =
+        set_result! = result -> (@. result =
             ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)),
-        get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (
+        input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
+        get_temp_value_fields = () -> (
             (@. ᶠᶜmat ⋅ ᶜᶜmat),
             (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat),
             (@. ᶠᶠmat / 3),
@@ -463,14 +247,14 @@ end
         end,
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "matrix times linear combination",
-        inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
-        get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. ᶜᶠmat ⋅
+        get_result = () -> (@. ᶜᶠmat ⋅
             (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))),
-        set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result =
+        set_result! = result -> (@. result =
             ᶜᶠmat ⋅ (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))),
-        get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (
+        input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
+        get_temp_value_fields = () -> (
             (@. 2 * ᶠᶜmat),
             (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat),
             (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat),
@@ -499,16 +283,16 @@ end
         end,
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "linear combination times another linear combination",
-        inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
-        get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) ->
+        get_result = () ->
             (@. (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅
                 (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))),
-        set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result =
+        set_result! = result -> (@. result =
             (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅
             (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))),
-        get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (
+        input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
+        get_temp_value_fields = () -> (
             (@. 2 * ᶠᶜmat),
             (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat),
             (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat),
@@ -554,22 +338,22 @@ end
         max_eps_error_limit = 30, # This case's roundoff error is large on GPUs.
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "matrix times matrix times linear combination times matrix \
                      times another linear combination times matrix",
-        inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
-        get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅
+        get_result = () -> (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅
             (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅
             ᶠᶠmat ⋅
             (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)) ⋅
             ᶠᶠmat),
-        set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result =
+        set_result! = result -> (@. result =
             ᶠᶜmat ⋅ ᶜᶠmat ⋅
             (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅
             ᶠᶠmat ⋅
             (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)) ⋅
             ᶠᶠmat),
-        get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (
+        input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat),
+        get_temp_value_fields = () -> (
             (@. ᶠᶜmat ⋅ ᶜᶠmat),
             (@. 2 * ᶠᶜmat),
             (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat),
@@ -630,23 +414,21 @@ end
             mul!(_temp14, _temp8, _temp13)
             mul!(_result, _temp14, _ᶠᶠmat)
         end,
-        time_ratio_limit = 10, # This case's ref function is fast on Buildkite.
         max_eps_error_limit = 70, # This case's roundoff error is large on GPUs.
     )
 
-    test_matrix_broadcast_against_array_reference(;
+    test_field_broadcast_against_array_reference(;
         test_name = "matrix constructions and multiplications",
-        inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec),
-        get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) ->
+        get_result = () ->
             (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅
                 TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅
                 DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)),
-        set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) ->
-            (@. result =
-                BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅
-                TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅
-                DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)),
-        get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) -> (
+        set_result! = result -> (@. result =
+            BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅
+            TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅
+            DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)),
+        input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec),
+        get_temp_value_fields = () -> (
             (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec)),
             (@. TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1)),
             (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅
@@ -686,15 +468,21 @@ end
 end
 
 @testset "Non-scalar Matrix Field Broadcasting" begin
-    ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec = random_test_fields(Float64)
+    FT = Float64
+    center_space, face_space = test_spaces(FT)
 
-    ᶜlg = Fields.local_geometry_field(ᶜvec)
-    ᶠlg = Fields.local_geometry_field(ᶠvec)
+    ᶜlg = Fields.local_geometry_field(center_space)
+    ᶠlg = Fields.local_geometry_field(face_space)
 
-    ᶜᶠmat2 = map(row -> map(sin, row), ᶜᶠmat)
-    ᶜᶠmat3 = map(row -> map(cos, row), ᶜᶠmat)
-    ᶠᶜmat2 = map(row -> map(sin, row), ᶠᶜmat)
-    ᶠᶜmat3 = map(row -> map(cos, row), ᶠᶜmat)
+    seed!(1) # ensures reproducibility
+    ᶜvec = random_field(FT, center_space)
+    ᶠvec = random_field(FT, face_space)
+    ᶜᶠmat = random_field(BidiagonalMatrixRow{FT}, center_space)
+    ᶜᶠmat2 = random_field(BidiagonalMatrixRow{FT}, center_space)
+    ᶜᶠmat3 = random_field(BidiagonalMatrixRow{FT}, center_space)
+    ᶠᶜmat = random_field(QuaddiagonalMatrixRow{FT}, face_space)
+    ᶠᶜmat2 = random_field(QuaddiagonalMatrixRow{FT}, face_space)
+    ᶠᶜmat3 = random_field(QuaddiagonalMatrixRow{FT}, face_space)
 
     ᶜᶠmat_AC1 = map(row -> map(adjoint ∘ Geometry.Covariant1Vector, row), ᶜᶠmat)
     ᶜᶠmat_C12 = map(
@@ -709,48 +497,26 @@ end
         ᶠᶜmat3,
     )
 
-    test_matrix_broadcast_against_reference(;
+    test_field_broadcast(;
         test_name = "matrix of covectors times matrix of vectors",
-        inputs = (ᶜᶠmat_AC1, ᶠᶜmat_C12),
-        get_result = (ᶜᶠmat_AC1, ᶠᶜmat_C12) -> (@. ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12),
-        set_result! = (result, ᶜᶠmat_AC1, ᶠᶜmat_C12) ->
-            (@. result = ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12),
-        ref_inputs = (ᶜᶠmat, ᶠᶜmat2, ᶠᶜmat3, ᶠlg),
-        ref_set_result! = (result, ᶜᶠmat, ᶠᶜmat2, ᶠᶜmat3, ᶠlg) -> (@. result =
+        get_result = () -> (@. ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12),
+        set_result! = result -> (@. result = ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12),
+        ref_set_result! = result -> (@. result =
             ᶜᶠmat ⋅ (
                 DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 +
                 DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3
             )),
     )
 
-    test_matrix_broadcast_against_reference(;
+    test_field_broadcast(;
         test_name = "matrix of covectors times matrix of vectors times matrix \
                      of numbers times matrix of covectors times matrix of \
                      vectors",
-        inputs = (ᶜᶠmat_AC1, ᶠᶜmat_C12, ᶜᶠmat, ᶠᶜmat_AC1, ᶜᶠmat_C12),
-        get_result = (ᶜᶠmat_AC1, ᶠᶜmat_C12, ᶜᶠmat, ᶠᶜmat_AC1, ᶜᶠmat_C12) ->
+        get_result = () ->
             (@. ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12 ⋅ ᶜᶠmat ⋅ ᶠᶜmat_AC1 ⋅ ᶜᶠmat_C12),
-        set_result! = (
-            result,
-            ᶜᶠmat_AC1,
-            ᶠᶜmat_C12,
-            ᶜᶠmat,
-            ᶠᶜmat_AC1,
-            ᶜᶠmat_C12,
-        ) ->
+        set_result! = result ->
             (@. result = ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12 ⋅ ᶜᶠmat ⋅ ᶠᶜmat_AC1 ⋅ ᶜᶠmat_C12),
-        ref_inputs = (ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3, ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3, ᶜlg, ᶠlg),
-        ref_set_result! = (
-            result,
-            ᶜᶠmat,
-            ᶜᶠmat2,
-            ᶜᶠmat3,
-            ᶠᶜmat,
-            ᶠᶜmat2,
-            ᶠᶜmat3,
-            ᶜlg,
-            ᶠlg,
-        ) -> (@. result =
+        ref_set_result! = result -> (@. result =
             ᶜᶠmat ⋅ (
                 DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 +
                 DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3
@@ -767,43 +533,15 @@ end
     ᶠᶜmat_C12_AC1 =
         map((row1, row2) -> map(tuple, row1, row2), ᶠᶜmat_C12, ᶠᶜmat_AC1)
 
-    test_matrix_broadcast_against_reference(;
+    test_field_broadcast(;
         test_name = "matrix of covectors and numbers times matrix of vectors \
                      and covectors times matrix of numbers and vectors times \
                      vector of numbers",
-        inputs = (ᶜᶠmat_AC1_num, ᶠᶜmat_C12_AC1, ᶜᶠmat_num_C12, ᶠvec),
-        get_result = (ᶜᶠmat_AC1_num, ᶠᶜmat_C12_AC1, ᶜᶠmat_num_C12, ᶠvec) ->
+        get_result = () ->
             (@. ᶜᶠmat_AC1_num ⋅ ᶠᶜmat_C12_AC1 ⋅ ᶜᶠmat_num_C12 ⋅ ᶠvec),
-        set_result! = (
-            result,
-            ᶜᶠmat_AC1_num,
-            ᶠᶜmat_C12_AC1,
-            ᶜᶠmat_num_C12,
-            ᶠvec,
-        ) -> (@. result = ᶜᶠmat_AC1_num ⋅ ᶠᶜmat_C12_AC1 ⋅ ᶜᶠmat_num_C12 ⋅ ᶠvec),
-        ref_inputs = (
-            ᶜᶠmat,
-            ᶜᶠmat2,
-            ᶜᶠmat3,
-            ᶠᶜmat,
-            ᶠᶜmat2,
-            ᶠᶜmat3,
-            ᶠvec,
-            ᶜlg,
-            ᶠlg,
-        ),
-        ref_set_result! = (
-            result,
-            ᶜᶠmat,
-            ᶜᶠmat2,
-            ᶜᶠmat3,
-            ᶠᶜmat,
-            ᶠᶜmat2,
-            ᶠᶜmat3,
-            ᶠvec,
-            ᶜlg,
-            ᶠlg,
-        ) -> (@. result = tuple(
+        set_result! = result ->
+            (@. result = ᶜᶠmat_AC1_num ⋅ ᶠᶜmat_C12_AC1 ⋅ ᶜᶠmat_num_C12 ⋅ ᶠvec),
+        ref_set_result! = result -> (@. result = tuple(
             ᶜᶠmat ⋅ (
                 DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 +
                 DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3
@@ -815,32 +553,20 @@ end
         )),
     )
 
-    T(value1, value2, value3) =
-        (; a = (), b = value1, c = (value2, (; d = (value3,)), (;)))
-    ᶜᶠmat_T = map((rows...) -> map(T, rows...), ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3)
-    ᶠᶜmat_T = map((rows...) -> map(T, rows...), ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3)
-    ᶜvec_T = @. T(ᶜvec, ᶜvec, ᶜvec)
+    ᶜvec_NT = @. nested_type(ᶜvec, ᶜvec, ᶜvec)
+    ᶜᶠmat_NT =
+        map((rows...) -> map(nested_type, rows...), ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3)
+    ᶠᶜmat_NT =
+        map((rows...) -> map(nested_type, rows...), ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3)
 
-    test_matrix_broadcast_against_reference(;
+    test_field_broadcast(;
         test_name = "matrix of nested values times matrix of nested values \
                      times matrix of numbers times matrix of numbers times \
-                     times vector of nested values",
-        inputs = (ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T),
-        get_result = (ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T) ->
-            (@. ᶜᶠmat_T ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_T ⋅ ᶜvec_T),
-        set_result! = (result, ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T) ->
-            (@. result = ᶜᶠmat_T ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_T ⋅ ᶜvec_T),
-        ref_inputs = (ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3, ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3, ᶜvec),
-        ref_set_result! = (
-            result,
-            ᶜᶠmat,
-            ᶜᶠmat2,
-            ᶜᶠmat3,
-            ᶠᶜmat,
-            ᶠᶜmat2,
-            ᶠᶜmat3,
-            ᶜvec,
-        ) -> (@. result = T(
+                     vector of nested values",
+        get_result = () -> (@. ᶜᶠmat_NT ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_NT ⋅ ᶜvec_NT),
+        set_result! = result ->
+            (@. result = ᶜᶠmat_NT ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_NT ⋅ ᶜvec_NT),
+        ref_set_result! = result -> (@. result = nested_type(
             ᶜᶠmat ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec,
             ᶜᶠmat2 ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat2 ⋅ ᶜvec,
             ᶜᶠmat3 ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat3 ⋅ ᶜvec,
diff --git a/test/MatrixFields/matrix_field_test_utils.jl b/test/MatrixFields/matrix_field_test_utils.jl
new file mode 100644
index 0000000000..555b9fc3a2
--- /dev/null
+++ b/test/MatrixFields/matrix_field_test_utils.jl
@@ -0,0 +1,235 @@
+using Test
+using JET
+import CUDA
+import Random: seed!
+
+import ClimaComms
+import ClimaCore:
+    Geometry, Domains, Meshes, Topologies, Hypsography, Spaces, Fields
+using ClimaCore.MatrixFields
+
+# Test that an expression is true and that it is also type-stable.
+macro test_all(expression)
+    return quote
+        local test_func() = $(esc(expression))
+        @test test_func()                   # correctness
+        @test (@allocated test_func()) == 0 # allocations
+        @test_opt test_func()               # type instabilities
+    end
+end
+
+# Compute the minimum time (in seconds) required to run an expression after it 
+# has been compiled. This macro is used instead of @benchmark from
+# BenchmarkTools.jl because the latter is extremely slow (it appears to keep
+# triggering recompilations and allocating a lot of memory in the process).
+macro benchmark(expression)
+    return quote
+        $(esc(expression)) # Compile the expression first. Use esc for hygiene.
+        best_time = Inf
+        start_time = time_ns()
+        while time_ns() - start_time < 1e8 # Benchmark for 0.1 s (1e8 ns).
+            best_time = min(best_time, @elapsed $(esc(expression)))
+        end
+        best_time
+    end
+end
+
+const ignore_cuda = (AnyFrameModule(CUDA),)
+
+is_using_cuda() = ClimaComms.device() isa ClimaComms.CUDADevice
+
+# Test the allocating and non-allocating versions of a field broadcast against
+# a reference non-allocating implementation. Ensure that they are performant,
+# correct, and type-stable, and print some useful information. If a reference
+# implementation is not available, the performance and correctness checks are
+# skipped.
+function test_field_broadcast(;
+    test_name,
+    get_result::F1,
+    set_result!::F2,
+    ref_set_result!::F3 = nothing,
+    time_ratio_limit = 10,
+    max_eps_error_limit = 10,
+    test_broken_with_cuda = false,
+) where {F1, F2, F3}
+    @testset "$test_name" begin
+        if test_broken_with_cuda && is_using_cuda()
+            @test_throws CUDA.InvalidIRError get_result()
+            @warn "$test_name:\n\tCUDA.InvalidIRError"
+            return
+        end
+
+        result = get_result()
+        time = @benchmark set_result!(result)
+        time_rounded = round(time; sigdigits = 2)
+
+        if isnothing(ref_set_result!)
+            @info "$test_name:\n\tTime = $time_rounded s (reference \
+                   implementation unavailable)"
+        else
+            ref_result = similar(result)
+            ref_time = @benchmark ref_set_result!(ref_result)
+            ref_time_rounded = round(ref_time; sigdigits = 2)
+            time_ratio = time / ref_time
+            time_ratio_rounded = round(time_ratio; sigdigits = 2)
+            max_error = maximum(abs.(parent(result) .- parent(ref_result)))
+            max_eps_error = ceil(Int, max_error / eps(typeof(max_error)))
+
+            @info "$test_name:\n\tTime Ratio = $time_ratio_rounded \
+                   ($time_rounded s vs. $ref_time_rounded s for reference) \
+                   \n\tMaximum Error = $max_eps_error eps"
+
+            # Test that set_result! is performant and correct when compared
+            # against ref_set_result!.
+            @test time / ref_time <= time_ratio_limit
+            @test max_eps_error <= max_eps_error_limit
+        end
+
+        # Test get_result and set_result! for type instabilities, and test
+        # set_result! for allocations. Ignore the type instabilities in CUDA and
+        # the allocations they incur.
+        @test_opt ignored_modules = ignore_cuda get_result()
+        @test_opt ignored_modules = ignore_cuda set_result!(result)
+        @test is_using_cuda() || (@allocated set_result!(result)) == 0
+
+        if !isnothing(ref_set_result!)
+            # Test ref_set_result! for type instabilities and allocations to
+            # ensure that the performance comparison is fair.
+            @test_opt ignored_modules = ignore_cuda ref_set_result!(ref_result)
+            @test is_using_cuda() ||
+                  (@allocated ref_set_result!(ref_result)) == 0
+        end
+    end
+end
+
+# Test the allocating and non-allocating versions of a field broadcast against
+# a reference array-based non-allocating implementation. Ensure that they are
+# performant, correct, and type-stable, and print some useful information. In
+# order for the input arrays and temporary scratch arrays used by the reference
+# implementation to be generated automatically, the corresponding fields must be
+# passed to this function.
+function test_field_broadcast_against_array_reference(;
+    test_name,
+    get_result::F1,
+    set_result!::F2,
+    input_fields,
+    get_temp_value_fields = () -> (),
+    ref_set_result!::F3,
+    time_ratio_limit = 10,
+    max_eps_error_limit = 10,
+    test_broken_with_cuda = false,
+) where {F1, F2, F3}
+    @testset "$test_name" begin
+        if test_broken_with_cuda && is_using_cuda()
+            @test_throws CUDA.InvalidIRError get_result()
+            @warn "$test_name:\n\tCUDA.InvalidIRError"
+            return
+        end
+
+        result = get_result()
+        time = @benchmark set_result!(result)
+        time_rounded = round(time; sigdigits = 2)
+
+        ref_result = similar(result)
+        temp_value_fields = map(similar, get_temp_value_fields())
+
+        result_arrays = MatrixFields.field2arrays(result)
+        ref_result_arrays = MatrixFields.field2arrays(ref_result)
+        inputs_arrays = map(MatrixFields.field2arrays, input_fields)
+        temp_values_arrays = map(MatrixFields.field2arrays, temp_value_fields)
+
+        function call_ref_set_result!()
+            for arrays in
+                zip(ref_result_arrays, inputs_arrays..., temp_values_arrays...)
+                ref_set_result!(arrays...)
+            end
+        end
+
+        ref_time = @benchmark call_ref_set_result!()
+        ref_time_rounded = round(ref_time; sigdigits = 2)
+        time_ratio = time / ref_time
+        time_ratio_rounded = round(time_ratio; sigdigits = 2)
+        max_error =
+            maximum(zip(result_arrays, ref_result_arrays)) do (array, ref_array)
+                maximum(abs.(array .- ref_array))
+            end
+        max_eps_error = ceil(Int, max_error / eps(typeof(max_error)))
+
+        @info "$test_name:\n\tTime Ratio = $time_ratio_rounded ($time_rounded \
+               s vs. $ref_time_rounded s for reference)\n\tMaximum Error = \
+               $max_eps_error eps"
+
+        # Test that set_result! is performant and correct when compared against
+        # ref_set_result!.
+        @test time / ref_time <= time_ratio_limit
+        @test max_eps_error <= max_eps_error_limit
+
+        # Test get_result and set_result! for type instabilities, and test
+        # set_result! for allocations. Ignore the type instabilities in CUDA and
+        # the allocations they incur.
+        @test_opt ignored_modules = ignore_cuda get_result()
+        @test_opt ignored_modules = ignore_cuda set_result!(result)
+        @test is_using_cuda() || (@allocated set_result!(result)) == 0
+
+        # Test ref_set_result! for type instabilities and allocations to ensure
+        # that the performance comparison is fair.
+        @test_opt ignored_modules = ignore_cuda call_ref_set_result!()
+        @test is_using_cuda() || (@allocated call_ref_set_result!()) == 0
+    end
+end
+
+# Generate extruded finite difference spaces for testing. Include topography
+# when possible.
+function test_spaces(::Type{FT}) where {FT}
+    velem = 20 # This should be big enough to test high-bandwidth matrices.
+    helem = npoly = 1 # These should be small enough for the tests to be fast.
+
+    comms_ctx = ClimaComms.SingletonCommsContext()
+    hdomain = Domains.SphereDomain(FT(10))
+    hmesh = Meshes.EquiangularCubedSphere(hdomain, helem)
+    htopology = Topologies.Topology2D(comms_ctx, hmesh)
+    quad = Spaces.Quadratures.GLL{npoly + 1}()
+    hspace = Spaces.SpectralElementSpace2D(htopology, quad)
+    vdomain = Domains.IntervalDomain(
+        Geometry.ZPoint(FT(0)),
+        Geometry.ZPoint(FT(10));
+        boundary_tags = (:bottom, :top),
+    )
+    vmesh = Meshes.IntervalMesh(vdomain, nelems = velem)
+    vtopology = Topologies.IntervalTopology(comms_ctx, vmesh)
+    vspace = Spaces.CenterFiniteDifferenceSpace(vtopology)
+    sfc_coord = Fields.coordinate_field(hspace)
+    hypsography =
+        is_using_cuda() ? Hypsography.Flat() :
+        Hypsography.LinearAdaption(
+            @. cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1
+        ) # TODO: FD operators don't currently work with hypsography on GPUs.
+    center_space =
+        Spaces.ExtrudedFiniteDifferenceSpace(hspace, vspace, hypsography)
+    face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(center_space)
+
+    return center_space, face_space
+end
+
+# Generate a random field with elements of type T.
+function random_field(::Type{T}, space) where {T}
+    FT = Spaces.undertype(space)
+    field = Fields.Field(T, space)
+    parent(field) .= rand.(FT)
+    return field
+end
+
+# Construct a highly nested type for testing integration with RecursiveApply.
+nested_type(value) = nested_type(value, value, value)
+nested_type(value1, value2, value3) =
+    (; a = (), b = value1, c = (value2, (; d = (value3,)), (;)))
+
+# A shorthand for typeof(nested_type(::FT)).
+const NestedType{FT} = NamedTuple{
+    (:a, :b, :c),
+    Tuple{
+        Tuple{},
+        FT,
+        Tuple{FT, NamedTuple{(:d,), Tuple{Tuple{FT}}}, NamedTuple{(), Tuple{}}},
+    },
+}
diff --git a/test/MatrixFields/rmul_with_projection.jl b/test/MatrixFields/rmul_with_projection.jl
index acc701649f..6bcb95c2a9 100644
--- a/test/MatrixFields/rmul_with_projection.jl
+++ b/test/MatrixFields/rmul_with_projection.jl
@@ -1,11 +1,9 @@
-using Test
-using JET
-using Random: seed!
 using StaticArrays: @SMatrix
 
-import ClimaCore: Geometry
 import ClimaCore.MatrixFields: rmul_with_projection, rmul_return_type
 
+include("matrix_field_test_utils.jl")
+
 function test_rmul_with_projection(x::X, y::Y, lg, expected_result) where {X, Y}
     result = rmul_with_projection(x, y, lg)
     result_type = rmul_return_type(X, Y)
@@ -73,8 +71,7 @@ end
     test_rmul_with_projection(cotensor, cotensor, lg, cotensor * cotensor)
 
     # Test some combinations of complicated nested values.
-    T(value1, value2, value3) =
-        (; a = (), b = value1, c = (value2, (; d = (value3,)), (;)))
+    T = nested_type
     test_rmul_with_projection(
         number,
         T(covector, vector, tensor),