From 3e0ee6aa00fa838cb36a123f6de851b1ea7453ab Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 26 May 2023 18:15:17 -0700 Subject: [PATCH 01/10] Add new MatrixFields module, along with unit tests and performance tests --- .buildkite/pipeline.yml | 19 + Project.toml | 2 + benchmarks/bickleyjet/Manifest.toml | 32 +- docs/Manifest.toml | 42 +- examples/Manifest.toml | 28 +- perf/Manifest.toml | 28 +- src/ClimaCore.jl | 1 + src/Fields/mapreduce.jl | 2 +- src/Geometry/axistensors.jl | 23 +- src/MatrixFields/MatrixFields.jl | 26 + src/MatrixFields/band_matrix_row.jl | 138 +++ src/MatrixFields/matrix_field_utils.jl | 139 +++ src/MatrixFields/matrix_multiplication.jl | 381 ++++++++ src/MatrixFields/rmul_with_projection.jl | 32 + src/RecursiveApply/RecursiveApply.jl | 18 + test/Geometry/axistensors.jl | 6 + test/MatrixFields/band_matrix_row.jl | 66 ++ test/MatrixFields/field2arrays.jl | 82 ++ .../MatrixFields/matrix_field_broadcasting.jl | 816 ++++++++++++++++++ test/MatrixFields/rmul_with_projection.jl | 123 +++ test/Project.toml | 1 + test/runtests.jl | 6 + 22 files changed, 1931 insertions(+), 80 deletions(-) create mode 100644 src/MatrixFields/MatrixFields.jl create mode 100644 src/MatrixFields/band_matrix_row.jl create mode 100644 src/MatrixFields/matrix_field_utils.jl create mode 100644 src/MatrixFields/matrix_multiplication.jl create mode 100644 src/MatrixFields/rmul_with_projection.jl create mode 100644 test/MatrixFields/band_matrix_row.jl create mode 100644 test/MatrixFields/field2arrays.jl create mode 100644 test/MatrixFields/matrix_field_broadcasting.jl create mode 100644 test/MatrixFields/rmul_with_projection.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index b060f6b934..950581d805 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -494,6 +494,25 @@ steps: agents: slurm_gpus: 1 + - group: "Unit: MatrixFields" + steps: + + - label: "Unit: BandMatrixRow" + key: unit_band_matrix_row + command: "julia --color=yes 
--check-bounds=yes --project=test test/MatrixFields/band_matrix_row.jl" + + - label: "Unit: rmul_with_projection" + key: unit_rmul_with_projection + command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/rmul_with_projection.jl" + + - label: "Unit: field2arrays" + key: unit_field2arrays + command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/field2arrays.jl" + + - label: "Unit: matrix field broadcasting" + key: unit_matrix_field_broadcasting + command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + - group: "Unit: Hypsography" steps: diff --git a/Project.toml b/Project.toml index f2896a5c86..3edc5686e7 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.10.43" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +BandedMatrices = "aae01518-5342-5314-be14-df237901396f" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ClimaComms = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" @@ -31,6 +32,7 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" [compat] Adapt = "3" +BandedMatrices = "0.17" BlockArrays = "0.16" ClimaComms = "0.5" CUDA = "3, 4.2.0" diff --git a/benchmarks/bickleyjet/Manifest.toml b/benchmarks/bickleyjet/Manifest.toml index 264b71fda0..1ac7bac797 100644 --- a/benchmarks/bickleyjet/Manifest.toml +++ b/benchmarks/bickleyjet/Manifest.toml @@ -52,6 +52,12 @@ git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" version = "0.4.2" +[[deps.BandedMatrices]] +deps = ["ArrayLayouts", "FillArrays", "LinearAlgebra", "PrecompileTools", "SparseArrays"] +git-tree-sha1 = "9ad46355045491b12eab409dee73e9de46293aa2" +uuid = "aae01518-5342-5314-be14-df237901396f" +version = "0.17.28" + [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -150,10 +156,10 @@ uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" version = "0.4.2" [[deps.ClimaCore]] -deps = 
["Adapt", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "Rotations", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] +deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] path = "../.." uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.10.39" +version = "0.10.40" [[deps.CloseOpenIntervals]] deps = ["Static", "StaticArrayInterface"] @@ -773,9 +779,9 @@ version = "0.1.8" [[deps.MPItrampoline_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"] -git-tree-sha1 = "b3dcf8e1c610a10458df3c62038c8cc3a4d6291d" +git-tree-sha1 = "228d5366a7c89b3c81469592b6f4c612db693d50" uuid = "f1f71cc9-e9ae-5b93-9b94-4fe0e1ad3748" -version = "5.3.0+0" +version = "5.3.0+1" [[deps.MacroTools]] deps = ["Markdown", "Random"] @@ -1012,12 +1018,6 @@ git-tree-sha1 = "6ec7ac8412e83d57e313393220879ede1740f9ee" uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" version = "2.8.2" -[[deps.Quaternions]] -deps = ["LinearAlgebra", "Random", "RealDot"] -git-tree-sha1 = "da095158bdc8eaccb7890f9884048555ab771019" -uuid = "94ee1d12-ae83-5a48-8b1c-48b8ff168ae0" -version = "0.7.4" - [[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" @@ -1038,12 +1038,6 @@ git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" 
-[[deps.RealDot]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "9f0a1b71baaf7650f4fa8a1d168c7fb6ee41f0c9" -uuid = "c1ae055f-0cd5-4b69-90a6-9a35b1a98df9" -version = "0.1.0" - [[deps.RecipesBase]] deps = ["PrecompileTools"] git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff" @@ -1097,12 +1091,6 @@ git-tree-sha1 = "9fb3462240d2898be5d4acf8925e47f70ec64d07" uuid = "7181ea78-2dcb-4de3-ab41-2b8ab5a31e74" version = "0.3.5" -[[deps.Rotations]] -deps = ["LinearAlgebra", "Quaternions", "Random", "StaticArrays"] -git-tree-sha1 = "54ccb4dbab4b1f69beb255a2c0ca5f65a9c82f08" -uuid = "6038ab10-8711-5258-84ad-4b1120ba62dc" -version = "1.5.1" - [[deps.RuntimeGeneratedFunctions]] deps = ["ExprTools", "SHA", "Serialization"] git-tree-sha1 = "237edc1563bbf078629b4f8d194bd334e97907cf" diff --git a/docs/Manifest.toml b/docs/Manifest.toml index 574c2c2747..d8a61cb4ca 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -98,6 +98,12 @@ git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" version = "0.4.2" +[[deps.BandedMatrices]] +deps = ["ArrayLayouts", "FillArrays", "LinearAlgebra", "PrecompileTools", "SparseArrays"] +git-tree-sha1 = "9ad46355045491b12eab409dee73e9de46293aa2" +uuid = "aae01518-5342-5314-be14-df237901396f" +version = "0.17.28" + [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -216,10 +222,10 @@ uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" version = "0.5.0" [[deps.ClimaCore]] -deps = ["Adapt", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] +deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", 
"GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] path = ".." uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.10.41" +version = "0.10.42" [[deps.ClimaCoreMakie]] deps = ["ClimaCore", "Makie"] @@ -312,9 +318,9 @@ version = "0.3.0" [[deps.Compat]] deps = ["Dates", "LinearAlgebra", "UUIDs"] -git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957" +git-tree-sha1 = "4e88377ae7ebeaf29a047aa1ee40826e0b708a5d" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "4.6.1" +version = "4.7.0" [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] @@ -620,9 +626,9 @@ version = "3.3.8+0" [[deps.GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] -git-tree-sha1 = "745847e65e72a475716952f0a8a7258d42338ce9" +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.8.0" +version = "8.8.1" [[deps.GPUArraysCore]] deps = ["Adapt"] @@ -1047,10 +1053,10 @@ uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9" version = "2.35.0+0" [[deps.Libtiff_jll]] -deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"] -git-tree-sha1 = "2da088d113af58221c52828a80378e16be7d037a" +deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "Pkg", "Zlib_jll", "Zstd_jll"] +git-tree-sha1 = "3eb79b0ca5764d4799c06699573fd8f533259713" uuid = "89763e89-9b03-5906-acba-b20f662cd828" -version = "4.5.1+1" +version = "4.4.0+0" [[deps.Libuuid_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -1103,9 +1109,9 @@ version = "1.0.0" [[deps.LoopVectorization]] deps = ["ArrayInterface", "ArrayInterfaceCore", "CPUSummary", "ChainRulesCore", "CloseOpenIntervals", "DocStringExtensions", "ForwardDiff", 
"HostCPUFeatures", "IfElse", "LayoutPointers", "LinearAlgebra", "OffsetArrays", "PolyesterWeave", "PrecompileTools", "SIMDTypes", "SLEEFPirates", "SpecialFunctions", "Static", "StaticArrayInterface", "ThreadingUtilities", "UnPack", "VectorizationBase"] -git-tree-sha1 = "cdd207c26d86952949f1ac65b21baf078cc1937a" +git-tree-sha1 = "e4eed22d70ac91d7a4bf9e0f6902383061d17105" uuid = "bdcacae8-1622-11e9-2a5c-532679323890" -version = "0.12.161" +version = "0.12.162" [[deps.MKL_jll]] deps = ["Artifacts", "IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] @@ -1759,9 +1765,9 @@ version = "0.3.9" [[deps.SpecialFunctions]] deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880" +git-tree-sha1 = "7beb031cf8145577fbccacd94b8a8f4ce78428d3" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.2.0" +version = "2.3.0" [[deps.StableHashTraits]] deps = ["CRC32c", "Compat", "Dates", "SHA", "Tables", "TupleTools", "UUIDs"] @@ -1877,9 +1883,9 @@ version = "1.10.1" [[deps.TaylorSeries]] deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"] -git-tree-sha1 = "c274151bde5a608bb329d76160a9344af707bfb4" +git-tree-sha1 = "50718b4fc1ce20cecf28d85215028c78b4d875c2" uuid = "6aa5eb33-94cf-58f4-a9d0-e4b2c4fc25ea" -version = "0.15.1" +version = "0.15.2" [[deps.TempestRemap_jll]] deps = ["Artifacts", "HDF5_jll", "JLLWrappers", "Libdl", "NetCDF_jll", "OpenBLAS32_jll", "Pkg"] @@ -2052,12 +2058,6 @@ git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" uuid = "aed1982a-8fda-507f-9586-7b0439959a61" version = "1.1.34+0" -[[deps.XZ_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "8abe223c2549ea70be752b20a53aa236a7868eb0" -uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.4.3+0" - [[deps.Xorg_libX11_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] git-tree-sha1 = 
"5be649d550f3f4b95308bf0183b82e2582876527" diff --git a/examples/Manifest.toml b/examples/Manifest.toml index efd09f6939..8392af555f 100644 --- a/examples/Manifest.toml +++ b/examples/Manifest.toml @@ -208,10 +208,10 @@ uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" version = "0.5.0" [[deps.ClimaCore]] -deps = ["Adapt", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] +deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] path = ".." 
uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.10.41" +version = "0.10.42" [[deps.ClimaCorePlots]] deps = ["ClimaCore", "RecipesBase", "StaticArrays", "TriplotBase"] @@ -593,9 +593,9 @@ version = "3.3.8+0" [[deps.GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] -git-tree-sha1 = "745847e65e72a475716952f0a8a7258d42338ce9" +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.8.0" +version = "8.8.1" [[deps.GPUArraysCore]] deps = ["Adapt"] @@ -950,10 +950,10 @@ uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9" version = "2.35.0+0" [[deps.Libtiff_jll]] -deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"] -git-tree-sha1 = "2da088d113af58221c52828a80378e16be7d037a" +deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "Pkg", "Zlib_jll", "Zstd_jll"] +git-tree-sha1 = "3eb79b0ca5764d4799c06699573fd8f533259713" uuid = "89763e89-9b03-5906-acba-b20f662cd828" -version = "4.5.1+1" +version = "4.4.0+0" [[deps.Libuuid_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -1530,9 +1530,9 @@ version = "1.21.0" [[deps.SpecialFunctions]] deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880" +git-tree-sha1 = "7beb031cf8145577fbccacd94b8a8f4ce78428d3" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.2.0" +version = "2.3.0" [[deps.SplittablesBase]] deps = ["Setfield", "Test"] @@ -1613,9 +1613,9 @@ version = "1.10.1" [[deps.TaylorSeries]] deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"] -git-tree-sha1 = "c274151bde5a608bb329d76160a9344af707bfb4" +git-tree-sha1 = "50718b4fc1ce20cecf28d85215028c78b4d875c2" uuid = "6aa5eb33-94cf-58f4-a9d0-e4b2c4fc25ea" -version = "0.15.1" +version = "0.15.2" [[deps.TempestRemap_jll]] 
deps = ["Artifacts", "HDF5_jll", "JLLWrappers", "Libdl", "NetCDF_jll", "OpenBLAS32_jll", "Pkg"] @@ -1778,12 +1778,6 @@ git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" uuid = "aed1982a-8fda-507f-9586-7b0439959a61" version = "1.1.34+0" -[[deps.XZ_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "8abe223c2549ea70be752b20a53aa236a7868eb0" -uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.4.3+0" - [[deps.Xorg_libX11_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" diff --git a/perf/Manifest.toml b/perf/Manifest.toml index dd7e9e54f5..78846156db 100644 --- a/perf/Manifest.toml +++ b/perf/Manifest.toml @@ -203,10 +203,10 @@ uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" version = "0.5.0" [[deps.ClimaCore]] -deps = ["Adapt", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] +deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] path = ".." 
uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.10.41" +version = "0.10.42" [[deps.ClimaCorePlots]] deps = ["ClimaCore", "RecipesBase", "StaticArrays", "TriplotBase"] @@ -623,9 +623,9 @@ version = "3.3.8+0" [[deps.GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] -git-tree-sha1 = "745847e65e72a475716952f0a8a7258d42338ce9" +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.8.0" +version = "8.8.1" [[deps.GPUArraysCore]] deps = ["Adapt"] @@ -1003,10 +1003,10 @@ uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9" version = "2.35.0+0" [[deps.Libtiff_jll]] -deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"] -git-tree-sha1 = "2da088d113af58221c52828a80378e16be7d037a" +deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "Pkg", "Zlib_jll", "Zstd_jll"] +git-tree-sha1 = "3eb79b0ca5764d4799c06699573fd8f533259713" uuid = "89763e89-9b03-5906-acba-b20f662cd828" -version = "4.5.1+1" +version = "4.4.0+0" [[deps.Libuuid_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -1641,9 +1641,9 @@ version = "1.21.0" [[deps.SpecialFunctions]] deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880" +git-tree-sha1 = "7beb031cf8145577fbccacd94b8a8f4ce78428d3" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.2.0" +version = "2.3.0" [[deps.Static]] deps = ["IfElse"] @@ -1729,9 +1729,9 @@ version = "1.10.1" [[deps.TaylorSeries]] deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"] -git-tree-sha1 = "c274151bde5a608bb329d76160a9344af707bfb4" +git-tree-sha1 = "50718b4fc1ce20cecf28d85215028c78b4d875c2" uuid = "6aa5eb33-94cf-58f4-a9d0-e4b2c4fc25ea" -version = "0.15.1" +version = "0.15.2" [[deps.TempestRemap_jll]] deps = 
["Artifacts", "HDF5_jll", "JLLWrappers", "Libdl", "NetCDF_jll", "OpenBLAS32_jll", "Pkg"] @@ -1894,12 +1894,6 @@ git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" uuid = "aed1982a-8fda-507f-9586-7b0439959a61" version = "1.1.34+0" -[[deps.XZ_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "8abe223c2549ea70be752b20a53aa236a7868eb0" -uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.4.3+0" - [[deps.Xorg_libX11_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" diff --git a/src/ClimaCore.jl b/src/ClimaCore.jl index b37bd8f397..72ada48e34 100644 --- a/src/ClimaCore.jl +++ b/src/ClimaCore.jl @@ -14,6 +14,7 @@ include("Topologies/Topologies.jl") include("Spaces/Spaces.jl") include("Fields/Fields.jl") include("Operators/Operators.jl") +include("MatrixFields/MatrixFields.jl") include("Hypsography/Hypsography.jl") include("Limiters/Limiters.jl") include("InputOutput/InputOutput.jl") diff --git a/src/Fields/mapreduce.jl b/src/Fields/mapreduce.jl index c7b1e23b12..27e15d7b46 100644 --- a/src/Fields/mapreduce.jl +++ b/src/Fields/mapreduce.jl @@ -1,4 +1,4 @@ -Base.map(fn, field::Field) = Base.broadcast(fn, field) +Base.map(fn, fields::Field...) = Base.broadcast(fn, fields...) 
""" Fields.local_sum(v::Field) diff --git a/src/Geometry/axistensors.jl b/src/Geometry/axistensors.jl index a66b723708..0b4b70ed9e 100644 --- a/src/Geometry/axistensors.jl +++ b/src/Geometry/axistensors.jl @@ -269,10 +269,30 @@ Base.propertynames(x::AxisVector) = symbols(axes(x, 1)) end end +const AdjointAxisTensor{T, N, A, S} = Adjoint{T, AxisTensor{T, N, A, S}} + +Base.show(io::IO, a::AdjointAxisTensor{T, N, A, S}) where {T, N, A, S} = + print(io, "adjoint($(a'))") + +components(a::AdjointAxisTensor) = components(parent(a))' + +Base.zero(a::AdjointAxisTensor) = zero(typeof(a)) +Base.zero(::Type{AdjointAxisTensor{T, N, A, S}}) where {T, N, A, S} = + zero(AxisTensor{T, N, A, S})' + +@inline +(a::AdjointAxisTensor) = (+a')' +@inline -(a::AdjointAxisTensor) = (-a')' +@inline +(a::AdjointAxisTensor, b::AdjointAxisTensor) = (a' + b')' +@inline -(a::AdjointAxisTensor, b::AdjointAxisTensor) = (a' - b')' +@inline *(a::Number, b::AdjointAxisTensor) = (a * b')' +@inline *(a::AdjointAxisTensor, b::Number) = (a' * b)' +@inline /(a::AdjointAxisTensor, b::Number) = (a' / b)' +@inline \(a::Number, b::AdjointAxisTensor) = (a \ b')' + +@inline (==)(a::AdjointAxisTensor, b::AdjointAxisTensor) = a' == b' const AdjointAxisVector{T, A1, S} = Adjoint{T, AxisVector{T, A1, S}} -components(va::AdjointAxisVector) = components(parent(va))' Base.@propagate_inbounds Base.getindex(va::AdjointAxisVector, i::Int) = getindex(components(va), i) Base.@propagate_inbounds Base.getindex(va::AdjointAxisVector, i::Int, j::Int) = @@ -286,7 +306,6 @@ Axis2Tensor( ) = AxisTensor(axes, components) const AdjointAxis2Tensor{T, A, S} = Adjoint{T, Axis2Tensor{T, A, S}} -components(va::AdjointAxis2Tensor) = components(parent(va))' const Axis2TensorOrAdj{T, A, S} = Union{Axis2Tensor{T, A, S}, AdjointAxis2Tensor{T, A, S}} diff --git a/src/MatrixFields/MatrixFields.jl b/src/MatrixFields/MatrixFields.jl new file mode 100644 index 0000000000..6ed80c32e3 --- /dev/null +++ b/src/MatrixFields/MatrixFields.jl @@ -0,0 
+1,26 @@ +module MatrixFields + +import LinearAlgebra: UniformScaling, Adjoint +import StaticArrays: SArray, SMatrix, SVector +import BandedMatrices: BandedMatrix, band, _BandedMatrix +import ..Utilities: PlusHalf, half +import ..RecursiveApply: + rmap, rpromote_type, rzero, rconvert, radd, rsub, rmul, rdiv +import ..Geometry +import ..Spaces +import ..Fields +import ..Operators + +export ⋅ +export DiagonalMatrixRow, + BidiagonalMatrixRow, + TridiagonalMatrixRow, + QuaddiagonalMatrixRow, + PentadiagonalMatrixRow + +include("band_matrix_row.jl") +include("rmul_with_projection.jl") +include("matrix_multiplication.jl") +include("matrix_field_utils.jl") + +end diff --git a/src/MatrixFields/band_matrix_row.jl b/src/MatrixFields/band_matrix_row.jl new file mode 100644 index 0000000000..d637ef089c --- /dev/null +++ b/src/MatrixFields/band_matrix_row.jl @@ -0,0 +1,138 @@ +""" + BandMatrixRow{ld}(entries...) + +Stores the nonzero entries in a row of a band matrix, starting with the lowest +diagonal, which has index `ld`. Supported operations include accessing the entry +on the diagonal with index `d` by calling `row[d]`, taking linear combinations +with other band matrix rows (and with `LinearAlgebra.I`), and checking for +equality with other band matrix rows (and with `LinearAlgebra.I`). There are +several aliases defined for commonly-used subtypes of `BandMatrixRow` (with `T` +denoting the type of the row's entries): +- `DiagonalMatrixRow{T}` +- `BidiagonalMatrixRow{T}` +- `TridiagonalMatrixRow{T}` +- `QuaddiagonalMatrixRow{T}` +- `PentadiagonalMatrixRow{T}` +""" +struct BandMatrixRow{ld, bw, T} # bw is the bandwidth (the number of diagonals) + entries::NTuple{bw, T} + BandMatrixRow{ld, bw, T}(entries::NTuple{bw, Any}) where {ld, bw, T} = + new{ld, bw, T}(rconvert(NTuple{bw, T}, entries)) + # TODO: Remove this inner constructor once Julia's default convert function + # is type-stable for nested Tuple/NamedTuple types. 
+end +BandMatrixRow{ld}(entries::Vararg{Any, bw}) where {ld, bw} = + BandMatrixRow{ld, bw}(entries...) +BandMatrixRow{ld, bw}(entries::Vararg{Any, bw}) where {ld, bw} = + BandMatrixRow{ld, bw, rpromote_type(map(typeof, entries)...)}(entries) + +const DiagonalMatrixRow{T} = BandMatrixRow{0, 1, T} +const BidiagonalMatrixRow{T} = BandMatrixRow{-1 + half, 2, T} +const TridiagonalMatrixRow{T} = BandMatrixRow{-1, 3, T} +const QuaddiagonalMatrixRow{T} = BandMatrixRow{-2 + half, 4, T} +const PentadiagonalMatrixRow{T} = BandMatrixRow{-2, 5, T} + +""" + outer_diagonals(::Type{<:BandMatrixRow}) + +Gets the indices of the lower and upper diagonals, `ld` and `ud`, of the given +subtype of `BandMatrixRow`. +""" +outer_diagonals(::Type{<:BandMatrixRow{ld, bw}}) where {ld, bw} = + (ld, ld + bw - 1) + +""" + band_matrix_row_type(ld, ud, T) + +A shorthand for getting the subtype of `BandMatrixRow` that has entries of type +`T` on the diagonals with indices in the range `ld:ud`. +""" +band_matrix_row_type(ld, ud, T) = BandMatrixRow{ld, ud - ld + 1, T} + +Base.eltype(::Type{BandMatrixRow{ld, bw, T}}) where {ld, bw, T} = T + +Base.zero(::Type{BandMatrixRow{ld, bw, T}}) where {ld, bw, T} = + BandMatrixRow{ld}(ntuple(_ -> rzero(T), Val(bw))...) + +Base.map(f::F, rows::BandMatrixRow{ld}...) where {F, ld} = + BandMatrixRow{ld}(map(f, map(row -> row.entries, rows)...)...) + +Base.@propagate_inbounds Base.getindex(row::BandMatrixRow{ld}, d) where {ld} = + row.entries[d - ld + 1] + +function Base.promote_rule( + ::Type{BMR1}, + ::Type{BMR2}, +) where {BMR1 <: BandMatrixRow, BMR2 <: BandMatrixRow} + ld1, ud1 = outer_diagonals(BMR1) + ld2, ud2 = outer_diagonals(BMR2) + typeof(ld1) == typeof(ld2) || error( + "Cannot promote the $(ld1 isa PlusHalf ? "non-" : "")square matrix \ + row type $BMR1 and the $(ld2 isa PlusHalf ? 
"non-" : "")square matrix \ + row type $BMR2 to a common type", + ) + T = rpromote_type(eltype(BMR1), eltype(BMR2)) + return band_matrix_row_type(min(ld1, ld2), max(ud1, ud2), T) +end + +Base.promote_rule( + ::Type{BMR}, + ::Type{US}, +) where {BMR <: BandMatrixRow, US <: UniformScaling} = + promote_rule(BMR, DiagonalMatrixRow{eltype(US)}) + +function Base.convert( + ::Type{BMR}, + row::BandMatrixRow, +) where {BMR <: BandMatrixRow} + old_ld, old_ud = outer_diagonals(typeof(row)) + new_ld, new_ud = outer_diagonals(BMR) + typeof(old_ld) == typeof(new_ld) || error( + "Cannot convert a $(old_ld isa PlusHalf ? "non-" : "")square matrix \ + row of type $(typeof(row)) to the \ + $(new_ld isa PlusHalf ? "non-" : "")square matrix row type $BMR", + ) + new_ld <= old_ld && new_ud >= old_ud || error( + "Cannot convert a $(typeof(row)) to a $BMR, since that would require \ + dropping potentially non-zero row entries", + ) + first_zeros = ntuple(_ -> rzero(eltype(BMR)), Val(old_ld - new_ld)) + last_zeros = ntuple(_ -> rzero(eltype(BMR)), Val(new_ud - old_ud)) + return BMR((first_zeros..., row.entries..., last_zeros...)) +end + +Base.convert(::Type{BMR}, row::UniformScaling) where {BMR <: BandMatrixRow} = + convert(BMR, DiagonalMatrixRow(row.λ)) + +Base.:(==)(row1::BMR, row2::BMR) where {BMR <: BandMatrixRow} = + row1.entries == row2.entries +Base.:(==)(row1::BandMatrixRow, row2::BandMatrixRow) = + ==(promote(row1, row2)...) +Base.:(==)(row1::BandMatrixRow, row2::UniformScaling) = + ==(promote(row1, row2)...) +Base.:(==)(row1::UniformScaling, row2::BandMatrixRow) = + ==(promote(row1, row2)...) + +Base.:+(row::BandMatrixRow) = map(radd, row) +Base.:+(row1::BandMatrixRow, row2::BandMatrixRow) = + map(radd, promote(row1, row2)...) +Base.:+(row1::BandMatrixRow, row2::UniformScaling) = + map(radd, promote(row1, row2)...) +Base.:+(row1::UniformScaling, row2::BandMatrixRow) = + map(radd, promote(row1, row2)...) 
+ +Base.:-(row::BandMatrixRow) = map(rsub, row) +Base.:-(row1::BandMatrixRow, row2::BandMatrixRow) = + map(rsub, promote(row1, row2)...) +Base.:-(row1::BandMatrixRow, row2::UniformScaling) = + map(rsub, promote(row1, row2)...) +Base.:-(row1::UniformScaling, row2::BandMatrixRow) = + map(rsub, promote(row1, row2)...) + +Base.:*(row::BandMatrixRow, value::Number) = + map(entry -> rmul(entry, value), row) +Base.:*(value::Number, row::BandMatrixRow) = + map(entry -> rmul(value, entry), row) + +Base.:/(row::BandMatrixRow, value::Number) = + map(entry -> rdiv(entry, value), row) diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl new file mode 100644 index 0000000000..706366eba2 --- /dev/null +++ b/src/MatrixFields/matrix_field_utils.jl @@ -0,0 +1,139 @@ +function banded_matrix_info(field) + space = axes(field) + field_ld, field_ud = outer_diagonals(eltype(field)) + + # Find the diagonal index of the value that ends up in the bottom-right + # corner of the matrix, as well as the amount by which the field's diagonal + # indices get shifted when it is converted into a matrix. 
+ bottom_corner_matrix_d, matrix_d_minus_field_d = if field_ld isa PlusHalf + if space.staggering isa Spaces.CellCenter + 1, half # field is a face-to-center matrix + else + -1, -half # field is a center-to-face matrix + end + else + 0, 0 # field is either a center-to-center or face-to-face matrix + end + + n_rows = Spaces.nlevels(space) + n_cols = n_rows + bottom_corner_matrix_d + matrix_ld = field_ld + matrix_d_minus_field_d + matrix_ud = field_ud + matrix_d_minus_field_d + matrix_ld <= 0 && matrix_ud >= 0 || + error("BandedMatrices.jl does not yet support matrices that have \ + diagonals with indices in the range $matrix_ld:$matrix_ud") + + return n_rows, n_cols, matrix_ld, matrix_ud +end + +""" + column_field2array(field) + +Converts a field defined on a `FiniteDifferenceSpace` into a `Vector` or a +`BandedMatrix`, depending on whether or not the elements of the field are +`BandMatrixRow`s. This involves copying the data stored in the field. +""" +function column_field2array(field) + space = axes(field) + space isa Spaces.FiniteDifferenceSpace || + error("column_field2array requires a field on a FiniteDifferenceSpace") + if eltype(field) <: BandMatrixRow # field represents a matrix + n_rows, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) + matrix = BandedMatrix{eltype(eltype(field))}( + undef, + (n_rows, n_cols), + (-matrix_ld, matrix_ud), + ) + for (index_of_field_entry, matrix_d) in enumerate(matrix_ld:matrix_ud) + # Find the rows for which field_diagonal[row] is inside the matrix. + # Note: The matrix index (1, 1) corresponds to the diagonal index 0, + # and the matrix index (n_rows, n_cols) corresponds to the diagonal + # index n_cols - n_rows. + first_row = matrix_d < 0 ? 1 - matrix_d : 1 + last_row = matrix_d < n_cols - n_rows ? n_rows : n_cols - matrix_d + + # Copy the value in each row from field_diagonal to matrix_diagonal. 
+ field_diagonal = field.entries.:($index_of_field_entry) + matrix_diagonal = view(matrix, band(matrix_d)) + for (index_along_diagonal, row) in enumerate(first_row:last_row) + matrix_diagonal[index_along_diagonal] = + Fields.field_values(field_diagonal)[row] + end + end + return matrix + else # field represents a vector + n_rows = Spaces.nlevels(space) + return map(row -> Fields.field_values(field)[row], 1:n_rows) + end +end + +""" + field2arrays(field) + +Converts a field defined on a `FiniteDifferenceSpace` or on an +`ExtrudedFiniteDifferenceSpace` into a tuple of arrays, each of which +corresponds to a column of the field. This is done by calling +`column_field2array` on each of the field's columns. +""" +function field2arrays(field) + space = axes(field) + column_indices = if space isa Spaces.FiniteDifferenceSpace + (((1, 1), 1),) + elseif space isa Spaces.ExtrudedFiniteDifferenceSpace + (Spaces.all_nodes(Spaces.horizontal_space(space))...,) + else + error("Invalid space type: $(typeof(space).name.wrapper)") + end + return map(column_indices) do ((i, j), h) + column_field2array(Spaces.column(field, i, j, h)) + end +end + +""" + column_field2array_view(field) + +Similar to `column_field2array(field)`, except that this version avoids copying +the data stored in the field. +""" +function column_field2array_view(field) + space = axes(field) + space isa Spaces.FiniteDifferenceSpace || + error("column_field2array_view requires a field on a \ + FiniteDifferenceSpace") + if eltype(field) <: BandMatrixRow # field represents a matrix + n_rows, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) + data_transpose = reinterpret(eltype(eltype(field)), parent(field)') + matrix_transpose = + _BandedMatrix(data_transpose, n_cols, matrix_ud, -matrix_ld) + return permutedims(matrix_transpose) + # TODO: Despite not copying any data, this function still allocates a + # small amount of memory because of _BandedMatrix and permutedims. 
+ else # field represents a vector + return vec(reinterpret(eltype(field), parent(field)')) + end +end + +function Base.show( + io::IO, + field::Fields.Field{<:Fields.AbstractData{<:BandMatrixRow}}, +) + print(io, eltype(field), "-valued Field") + if eltype(eltype(field)) <: Number + if axes(field) isa Spaces.FiniteDifferenceSpace + println(io, " that corresponds to the matrix") + else + println(io, " whose first column corresponds to the matrix") + end + column_field = Fields.column(field, 1, 1, 1) + Base.print_array(io, column_field2array_view(column_field)) + else + # When a BandedMatrix with non-number entries is printed, it currently + # either prints in an illegible format (e.g., if it has AxisTensor or + # AdjointAxisTensor entries) or crashes during the evaluation of + # isassigned (e.g., if it has Tuple or NamedTuple entries). So, for + # matrix fields with non-number entries, we fall back to the default + # function for printing fields. + print(io, ":") + Fields._show_compact_field(io, field, " ", true) + end +end diff --git a/src/MatrixFields/matrix_multiplication.jl b/src/MatrixFields/matrix_multiplication.jl new file mode 100644 index 0000000000..7f9144d623 --- /dev/null +++ b/src/MatrixFields/matrix_multiplication.jl @@ -0,0 +1,381 @@ +""" + MultiplyColumnwiseBandMatrixField + +An operator that multiplies a columnwise band matrix field (a field of +`BandMatrixRow`s) by a regular field or by another columnwise band matrix field, +i.e., matrix-vector or matrix-matrix multiplication. The `⋅` symbol is an alias +for `MultiplyColumnwiseBandMatrixField()`. +""" +struct MultiplyColumnwiseBandMatrixField <: Operators.FiniteDifferenceOperator end +const ⋅ = MultiplyColumnwiseBandMatrixField() + +#= +TODO: Rewrite the following derivation in LaTeX and move it into the ClimaCore +documentation. + +Notation: + +For any single-column field F, let F[idx] denote the value of F at level idx. 
+For any single-column BandMatrixRow field M, let + M[idx, idx′] = M[idx][idx′ - idx]. +If there are multiple columns, the following equations apply per column. + +Matrix-Vector Multiplication: + +Consider a BandMatrixRow field M and a scalar (non-BandMatrixRow) field V. +From the definition of matrix-vector multiplication, + (M ⋅ V)[idx] = ∑_{idx′} M[idx, idx′] * V[idx′]. +If V[idx] is only defined when left_idx ≤ idx ≤ right_idx, this becomes + (M ⋅ V)[idx] = ∑_{idx′ ∈ left_idx:right_idx} M[idx, idx′] * V[idx′]. +If M[idx, idx′] is only defined when idx + ld ≤ idx′ ≤ idx + ud, this becomes + (M ⋅ V)[idx] = + ∑_{idx′ ∈ max(left_idx, idx + ld):min(right_idx, idx + ud)} + M[idx, idx′] * V[idx′]. +Replacing the variable idx′ with the variable d = idx′ - idx gives us + (M ⋅ V)[idx] = + ∑_{d ∈ max(left_idx - idx, ld):min(right_idx - idx, ud)} + M[idx, idx + d] * V[idx + d]. +This can be rewritten using the standard indexing notation as + (M ⋅ V)[idx] = + ∑_{d ∈ max(left_idx - idx, ld):min(right_idx - idx, ud)} + M[idx][d] * V[idx + d]. +Finally, we can express this in terms of left/right boundaries and an interior: + (M ⋅ V)[idx] = + ∑_{ + d ∈ + if idx < left_idx - ld + (left_idx - idx):ud + elseif idx > right_idx - ud + ld:(right_idx - idx) + else + ld:ud + end + } M[idx][d] * V[idx + d]. + +Matrix-Matrix Multiplication: + +Consider a BandMatrixRow field M1 and another BandMatrixRow field M2. +From the definition of matrix-matrix multiplication, + (M1 ⋅ M2)[idx, idx′] = ∑_{idx′′} M1[idx, idx′′] * M2[idx′′, idx′]. +If M2[idx′′] is only defined when left_idx ≤ idx′′ ≤ right_idx, this becomes + (M1 ⋅ M2)[idx, idx′] = + ∑_{idx′′ ∈ left_idx:right_idx} M1[idx, idx′′] * M2[idx′′, idx′]. +If M1[idx, idx′′] is only defined when idx + ld1 ≤ idx′′ ≤ idx + ud1, this becomes + (M1 ⋅ M2)[idx, idx′] = + ∑_{idx′′ ∈ max(left_idx, idx + ld1):min(right_idx, idx + ud1)} + M1[idx, idx′′] * M2[idx′′, idx′]. 
+If M2[idx′′, idx′] is only defined when idx′′ + ld2 ≤ idx′ ≤ idx′′ + ud2, or, +equivalently, when idx′ - ud2 ≤ idx′′ ≤ idx′ - ld2, this becomes + (M1 ⋅ M2)[idx, idx′] = + ∑_{ + idx′′ ∈ + max(left_idx, idx + ld1, idx′ - ud2): + min(right_idx, idx + ud1, idx′ - ld2) + } M1[idx, idx′′] * M2[idx′′, idx′]. +Replacing the variable idx′ with the variable prod_d = idx′ - idx gives us + (M1 ⋅ M2)[idx, idx + prod_d] = + ∑_{ + idx′′ ∈ + max(left_idx, idx + ld1, idx + prod_d - ud2): + min(right_idx, idx + ud1, idx + prod_d - ld2) + } M1[idx, idx′′] * M2[idx′′, idx + prod_d]. +Replacing the variable idx′′ with the variable d = idx′′ - idx gives us + (M1 ⋅ M2)[idx, idx + prod_d] = + ∑_{ + d ∈ + max(left_idx - idx, ld1, prod_d - ud2): + min(right_idx - idx, ud1, prod_d - ld2) + } M1[idx, idx + d] * M2[idx + d, idx + prod_d]. +This can be rewritten using the standard indexing notation as + (M1 ⋅ M2)[idx][prod_d] = + ∑_{ + d ∈ + max(left_idx - idx, ld1, prod_d - ud2): + min(right_idx - idx, ud1, prod_d - ld2) + } M1[idx][d] * M2[idx + d][prod_d - d]. +Finally, we can express this in terms of left/right boundaries and an interior: + (M1 ⋅ M2)[idx][prod_d] = + ∑_{ + d ∈ + if idx < left_idx - ld1 + max(left_idx - idx, prod_d - ud2):min(ud1, prod_d - ld2) + elseif idx > right_idx - ud1 + max(ld1, prod_d - ud2):min(right_idx - idx, prod_d - ld2) + else + max(ld1, prod_d - ud2):min(ud1, prod_d - ld2) + end + } M1[idx][d] * M2[idx + d][prod_d - d]. + +We only need to define (M1 ⋅ M2)[idx][prod_d] when it has a nonzero value in the +interior, which will be the case when + max(ld1, prod_d - ud2) ≤ min(ud1, prod_d - ld2). +This can be rewritten as a system of four inequalities: + ld1 ≤ ud1, + ld1 ≤ prod_d - ld2, + prod_d - ud2 ≤ ud1, and + prod_d - ud2 ≤ prod_d - ld2. +By definition, ld1 ≤ ud1 and ld2 ≤ ud2, so the first and last inequality are +always true. Rearranging the remaining two inequalities gives us + ld1 + ld2 ≤ prod_d ≤ ud1 + ud2. 
+=# + +struct TopLeftMatrixCorner <: Operators.AbstractBoundaryCondition end +struct BottomRightMatrixCorner <: Operators.AbstractBoundaryCondition end + +Operators.has_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::Operators.LeftBoundaryWindow{name}, +) where {name} = true +Operators.has_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::Operators.RightBoundaryWindow{name}, +) where {name} = true + +Operators.get_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::Operators.LeftBoundaryWindow{name}, +) where {name} = TopLeftMatrixCorner() +Operators.get_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::Operators.RightBoundaryWindow{name}, +) where {name} = BottomRightMatrixCorner() + +Operators.stencil_interior_width( + ::MultiplyColumnwiseBandMatrixField, + matrix1, + arg, +) = ((0, 0), outer_diagonals(eltype(matrix1))) + +# Interior indices of a center-to-center or face-to-face matrix. +matrix_left_interior_index(space, lbw::Integer) = + Operators.left_idx(space) - lbw +matrix_right_interior_index(space, rbw::Integer) = + Operators.right_idx(space) - rbw + +# Interior indices of a face-to-center matrix. +matrix_left_interior_index( + space::Union{ + Spaces.CenterFiniteDifferenceSpace, + Spaces.CenterExtrudedFiniteDifferenceSpace, + }, + lbw::PlusHalf, +) = Operators.left_idx(space) - lbw - half +matrix_right_interior_index( + space::Union{ + Spaces.CenterFiniteDifferenceSpace, + Spaces.CenterExtrudedFiniteDifferenceSpace, + }, + rbw::PlusHalf, +) = Operators.right_idx(space) - rbw + half + +# Interior indices of a center-to-face matrix. 
+matrix_left_interior_index( + space::Union{ + Spaces.FaceFiniteDifferenceSpace, + Spaces.FaceExtrudedFiniteDifferenceSpace, + }, + lbw::PlusHalf, +) = Operators.left_idx(space) - lbw + half +matrix_right_interior_index( + space::Union{ + Spaces.FaceFiniteDifferenceSpace, + Spaces.FaceExtrudedFiniteDifferenceSpace, + }, + rbw::PlusHalf, +) = Operators.right_idx(space) - rbw - half + +Operators.left_interior_idx( + space::Spaces.AbstractSpace, + ::MultiplyColumnwiseBandMatrixField, + ::TopLeftMatrixCorner, + matrix1, + arg, +) = matrix_left_interior_index(space, outer_diagonals(eltype(matrix1))[1]) +Operators.right_interior_idx( + space::Spaces.AbstractSpace, + ::MultiplyColumnwiseBandMatrixField, + ::BottomRightMatrixCorner, + matrix1, + arg, +) = matrix_right_interior_index(space, outer_diagonals(eltype(matrix1))[2]) + +function rmul_type(::Type{T1}, ::Type{T2}, ::Type{LG}) where {T1, T2, LG} + type = Base._return_type(rmul_with_projection, Tuple{T1, T2, LG}) + type == Union{} && + error("Unable to infer result type: Calling rmul_with_projection with \ + arguments of types $T1 and $T2 will cause it to throw an error") + return type +end + +function Operators.return_eltype( + ::MultiplyColumnwiseBandMatrixField, + matrix1, + arg, +) + eltype(matrix1) <: BandMatrixRow || error( + "The first argument of ⋅ must have elements of type BandMatrixRow, but \ + the given argument has elements of type $(eltype(matrix1))", + ) + lg_type = eltype(Fields.local_geometry_field(axes(arg))) + if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication + matrix2 = arg + ld1, ud1 = outer_diagonals(eltype(matrix1)) + ld2, ud2 = outer_diagonals(eltype(matrix2)) + prod_ld, prod_ud = ld1 + ld2, ud1 + ud2 + prod_value_type = + rmul_type(eltype(eltype(matrix1)), eltype(eltype(matrix2)), lg_type) + return band_matrix_row_type(prod_ld, prod_ud, prod_value_type) + else # matrix-vector multiplication + vector = arg + return rmul_type(eltype(eltype(matrix1)), eltype(vector), lg_type) + end 
+end + +Operators.return_space(::MultiplyColumnwiseBandMatrixField, space1, space2) = + space1 + +# TODO: Use @propagate_inbounds here, and remove @inbounds from this function. +# As of Julia 1.8, doing this increases compilation time by more than an order +# of magnitude, and it also makes type inference fail for some complicated +# matrix field broadcast expressions (in particular, those that involve products +# of linear combinations of matrix fields). Not using @propagate_inbounds causes +# matrix field broadcast expressions to take roughly 3 or 4 times longer to +# evaluate, but this is less significant than the decrease in compilation time. +function multiply_matrix_at_index( + loc, + space, + idx, + hidx, + matrix1, + arg, + boundary_ld1 = nothing, + boundary_ud1 = nothing, +) + matrix1_row = Operators.getidx(space, matrix1, loc, idx, hidx) + ld1, ud1 = outer_diagonals(eltype(matrix1)) + ld1_or_boundary_ld1 = isnothing(boundary_ld1) ? ld1 : boundary_ld1 + ud1_or_boundary_ud1 = isnothing(boundary_ud1) ? ud1 : boundary_ud1 + prod_type = Operators.return_eltype(⋅, matrix1, arg) + if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication + matrix2 = arg + matrix2_rows = map((ld1:ud1...,)) do d + # TODO: Use @propagate_inbounds_meta instead of @inline_meta. + Base.@_inline_meta + if ( + (isnothing(boundary_ld1) || d >= boundary_ld1) && + (isnothing(boundary_ud1) || d <= boundary_ud1) + ) + @inbounds Operators.getidx(space, matrix2, loc, idx + d, hidx) + else + rzero(eltype(matrix2)) # This value is never used. + end + end # The rows are precomputed to avoid recomputing them multiple times. + matrix2_rows_wrapper = BandMatrixRow{ld1}(matrix2_rows...) + ld2, ud2 = outer_diagonals(eltype(matrix2)) + prod_ld, prod_ud = outer_diagonals(prod_type) + zero_value = rzero(eltype(prod_type)) + prod_entries = map((prod_ld:prod_ud...,)) do prod_d + # TODO: Use @propagate_inbounds_meta instead of @inline_meta. 
+            Base.@_inline_meta
+            min_d = max(ld1_or_boundary_ld1, prod_d - ud2)
+            max_d = min(ud1_or_boundary_ud1, prod_d - ld2)
+            # Note: If min_d:max_d is an empty range, then the current entry
+            # lies outside of the product matrix, so it should never be used in
+            # any computations. By initializing prod_entry to zero_value, we are
+            # implicitly setting all such entries to 0. We could alternatively
+            # set all such entries to NaN (in order to more easily catch user
+            # errors that involve accidentally using these entries), but that
+            # would not generalize to non-floating-point types like Int or Bool.
+            prod_entry = zero_value
+            @inbounds for d in min_d:max_d
+                value1 = matrix1_row[d]
+                value2 = matrix2_rows_wrapper[d][prod_d - d]
+                value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
+                prod_entry = radd(
+                    prod_entry,
+                    rmul_with_projection(value1, value2, value2_lg),
+                )
+            end # Using this for-loop is currently faster than using mapreduce.
+            prod_entry
+        end
+        return BandMatrixRow{prod_ld}(prod_entries...)
+    else # matrix-vector multiplication
+        vector = arg
+        prod_value = rzero(prod_type)
+        @inbounds for d in ld1_or_boundary_ld1:ud1_or_boundary_ud1
+            value1 = matrix1_row[d]
+            value2 = Operators.getidx(space, vector, loc, idx + d, hidx)
+            value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
+            prod_value = radd(
+                prod_value,
+                rmul_with_projection(value1, value2, value2_lg),
+            )
+        end # Using this for-loop is currently faster than using mapreduce.
+ return prod_value + end +end + +Base.@propagate_inbounds Operators.stencil_interior( + ::MultiplyColumnwiseBandMatrixField, + loc, + space, + idx, + hidx, + matrix1, + arg, +) = multiply_matrix_at_index(loc, space, idx, hidx, matrix1, arg) + +Base.@propagate_inbounds Operators.stencil_left_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::TopLeftMatrixCorner, + loc, + space, + idx, + hidx, + matrix1, + arg, +) = multiply_matrix_at_index( + loc, + space, + idx, + hidx, + matrix1, + arg, + Operators.left_idx( + Operators.reconstruct_placeholder_space(axes(arg), space), + ) - idx, + nothing, +) + +Base.@propagate_inbounds Operators.stencil_right_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::BottomRightMatrixCorner, + loc, + space, + idx, + hidx, + matrix1, + arg, +) = multiply_matrix_at_index( + loc, + space, + idx, + hidx, + matrix1, + arg, + nothing, + Operators.right_idx( + Operators.reconstruct_placeholder_space(axes(arg), space), + ) - idx, +) + +# For matrix field broadcast expressions involving 4 or more matrices, we +# sometimes hit a recursion limit and de-optimize. +# We know that the recursion will terminate due to the fact that broadcast +# expressions are not self-referential. +if hasfield(Method, :recursion_relation) + dont_limit = (args...) 
-> true
+    for m in methods(multiply_matrix_at_index)
+        m.recursion_relation = dont_limit
+    end
+end
diff --git a/src/MatrixFields/rmul_with_projection.jl b/src/MatrixFields/rmul_with_projection.jl
new file mode 100644
index 0000000000..76f9846b68
--- /dev/null
+++ b/src/MatrixFields/rmul_with_projection.jl
@@ -0,0 +1,32 @@
+const SingleValue =
+    Union{Number, Geometry.AxisTensor, Geometry.AdjointAxisTensor}
+
+mul_with_projection(x, y, _) = x * y
+mul_with_projection(x::Geometry.AdjointAxisVector, y::Geometry.AxisTensor, lg) =
+    x * Geometry.project(Geometry.dual(axes(x, 2)), y, lg)
+mul_with_projection(::Geometry.AdjointAxisTensor, ::Geometry.AxisTensor, _) =
+    error("mul_with_projection is currently only implemented for covectors, \
+           and higher-order cotensors are not supported")
+# We should add methods for other cotensors (e.g., AdjointAxis2Tensor) when they
+# are needed (e.g., when we need to support matrices that represent the
+# divergence of higher-order tensors).
+
+"""
+    rmul_with_projection(x, y, lg)
+
+Similar to `rmul(x, y)`, but with automatic projection of `y` when `x` contains
+a covector (i.e., an `AdjointAxisVector`). For example, if `x` is a covector
+along the `Covariant3Axis` (e.g., `Covariant3Vector(1)'`), then `y` (or each
+element of `y`) will be projected onto the `Contravariant3Axis`. In general,
+each covector in `x` will cause `y` (or each corresponding element of `y`) to
+be projected onto the dual axis of the covector. In the future, we may extend
+this behavior to higher-order cotensors.
+""" +rmul_with_projection(x, y, lg) = + rmap((x′, y′) -> mul_with_projection(x′, y′, lg), x, y) +rmul_with_projection(x::SingleValue, y, lg) = + rmap(y′ -> mul_with_projection(x, y′, lg), y) +rmul_with_projection(x, y::SingleValue, lg) = + rmap(x′ -> mul_with_projection(x′, y, lg), x) +rmul_with_projection(x::SingleValue, y::SingleValue, lg) = + mul_with_projection(x, y, lg) diff --git a/src/RecursiveApply/RecursiveApply.jl b/src/RecursiveApply/RecursiveApply.jl index 2309cfa0ce..dfe43ed074 100755 --- a/src/RecursiveApply/RecursiveApply.jl +++ b/src/RecursiveApply/RecursiveApply.jl @@ -92,6 +92,13 @@ rmaptype( T2 <: NamedTuple{names, Tup2}, } = NamedTuple{names, rmaptype(fn, Tup1, Tup2)} +""" + rpromote_type(Ts...) + +Recursively apply `promote_type` to the input types. +""" +rpromote_type(Ts...) = reduce((T1, T2) -> rmaptype(promote_type, T1, T2), Ts) + """ rzero(T) @@ -105,6 +112,17 @@ rzero(::Type{T}) where {T <: Tuple} = rzero(::Type{Tup}) where {names, T, Tup <: NamedTuple{names, T}} = NamedTuple{names}(rzero(T)) +""" + rconvert(T, X) + +Identical to `convert(T, X)`, but with improved type stability for nested types. +""" +rconvert(::Type{T}, X::T) where {T} = X +rconvert(::Type{T}, X) where {T} = + rmap((zero_value, x) -> convert(typeof(zero_value), x), rzero(T), X) +# TODO: Remove this function once Julia's default convert function is +# type-stable for nested Tuple/NamedTuple types. 
+ """ rmul(X, Y) X ⊠ Y diff --git a/test/Geometry/axistensors.jl b/test/Geometry/axistensors.jl index de8c46c061..93bde79693 100644 --- a/test/Geometry/axistensors.jl +++ b/test/Geometry/axistensors.jl @@ -27,6 +27,12 @@ ClimaCore.Geometry.assert_exact_transform() = true @test M[:, 1] == Geometry.Cartesian12Vector(1.0, 0.5) @test M[1, :] == Geometry.Covariant12Vector(1.0, 0.0) + @test x + zero(x) == x + @test x' + zero(x') == x' + + @test -x + x * 2 - x / 2 == -x + 2 * x - 2 \ x == x / 2 + @test -x' + x' * 2 - x' / 2 == -x' + 2 * x' - 2 \ x' == (x / 2)' + @test x * y' == x ⊗ y == Geometry.AxisTensor( diff --git a/test/MatrixFields/band_matrix_row.jl b/test/MatrixFields/band_matrix_row.jl new file mode 100644 index 0000000000..0077948b4a --- /dev/null +++ b/test/MatrixFields/band_matrix_row.jl @@ -0,0 +1,66 @@ +using Test +using JET +using LinearAlgebra: I + +using ClimaCore.MatrixFields +import ClimaCore: Geometry + +macro test_all(expression) + return quote + local test_func() = $(esc(expression)) + @test test_func() # correctness + @test (@allocated test_func()) == 0 # allocations + @test_opt test_func() # type instabilities + end +end + +@testset "BandMatrixRow Unit Tests" begin + @test_all DiagonalMatrixRow(1) == + DiagonalMatrixRow(0.5) + DiagonalMatrixRow(1 // 2) == + DiagonalMatrixRow(1.5) - DiagonalMatrixRow(1 // 2) == + DiagonalMatrixRow(0.5) * 2 == + 0.5 * DiagonalMatrixRow(2) == + DiagonalMatrixRow(2) / 2 == + I + + @test_all DiagonalMatrixRow(1 // 2) + 0.5 * I === DiagonalMatrixRow(1.0) + @test_all BidiagonalMatrixRow(1 // 2, 0.5) === BidiagonalMatrixRow(1, 1) / 2 + + @test_all convert(TridiagonalMatrixRow{Int}, DiagonalMatrixRow(1)) === + convert(TridiagonalMatrixRow{Int}, I) === + TridiagonalMatrixRow(0, 1, 0) + + @test_all QuaddiagonalMatrixRow(0.5, 1, 1, 1 // 2) + + BidiagonalMatrixRow(-0.5, -1 // 2) == + QuaddiagonalMatrixRow(1, 1, 1, 1) / 2 + @test_all PentadiagonalMatrixRow(0, 0.5, 1, 1 // 2, 0) - + TridiagonalMatrixRow(1, 0, 1) / 2 - 0.5 * 
DiagonalMatrixRow(2) == + PentadiagonalMatrixRow(0, 0, 0, 0, 0) + + @test_all PentadiagonalMatrixRow(0, 0.5, 1, 1 // 2, 0) - + TridiagonalMatrixRow(1, 0, 1) / 2 - I == + zero(PentadiagonalMatrixRow{Int}) + + T(value) = (; a = (), b = value, c = (value, (; d = (value,)), (;))) + @test_all QuaddiagonalMatrixRow(T(0.5), T(1), T(1), T(1 // 2)) + + BidiagonalMatrixRow(T(-0.5), T(-1 // 2)) == + QuaddiagonalMatrixRow(T(1), T(1), T(1), T(1)) / 2 + @test_all PentadiagonalMatrixRow(T(0), T(0.5), T(1), T(1 // 2), T(0)) - + TridiagonalMatrixRow(T(1), T(0), T(1)) / 2 - + 0.5 * DiagonalMatrixRow(T(2)) == + PentadiagonalMatrixRow(T(0), T(0), T(0), T(0), T(0)) + + @test_throws "Cannot promote" BidiagonalMatrixRow(1, 1) + I + @test_throws "Cannot promote" BidiagonalMatrixRow(1, 1) + + DiagonalMatrixRow(1) + + @test_throws "Cannot convert" convert(BidiagonalMatrixRow{Int}, I) + @test_throws "Cannot convert" convert( + BidiagonalMatrixRow{Int}, + DiagonalMatrixRow(1), + ) + @test_throws "Cannot convert" convert( + TridiagonalMatrixRow{Int}, + PentadiagonalMatrixRow(0, 0, 1, 0, 0), + ) +end diff --git a/test/MatrixFields/field2arrays.jl b/test/MatrixFields/field2arrays.jl new file mode 100644 index 0000000000..c47595f08a --- /dev/null +++ b/test/MatrixFields/field2arrays.jl @@ -0,0 +1,82 @@ +using Test +using JET + +import ClimaCore: Geometry, Domains, Meshes, Spaces, Fields, MatrixFields + +@testset "field2arrays Unit Tests" begin + FT = Float64 + domain = Domains.IntervalDomain( + Geometry.ZPoint(FT(1)), + Geometry.ZPoint(FT(4)); + boundary_tags = (:bottom, :top), + ) + mesh = Meshes.IntervalMesh(domain, nelems = 3) + center_space = Spaces.CenterFiniteDifferenceSpace(mesh) + face_space = Spaces.FaceFiniteDifferenceSpace(center_space) + ᶜz = Fields.coordinate_field(center_space).z + ᶠz = Fields.coordinate_field(face_space).z + + ᶜᶜmat = map(z -> MatrixFields.TridiagonalMatrixRow(2 * z, 4 * z, 8 * z), ᶜz) + ᶜᶠmat = map(z -> MatrixFields.BidiagonalMatrixRow(2 * z, 4 * z), ᶜz) + ᶠᶠmat = 
map(z -> MatrixFields.TridiagonalMatrixRow(2 * z, 4 * z, 8 * z), ᶠz) + ᶠᶜmat = map(z -> MatrixFields.BidiagonalMatrixRow(2 * z, 4 * z), ᶠz) + + @test MatrixFields.column_field2array(ᶜz) == + MatrixFields.column_field2array_view(ᶜz) == + [1.5, 2.5, 3.5] + + @test MatrixFields.column_field2array(ᶠz) == + MatrixFields.column_field2array_view(ᶠz) == + [1, 2, 3, 4] + + @test MatrixFields.column_field2array(ᶜᶜmat) == + MatrixFields.column_field2array_view(ᶜᶜmat) == + [ + 6 12 0 + 5 10 20 + 0 7 14 + ] + + @test MatrixFields.column_field2array(ᶜᶠmat) == + MatrixFields.column_field2array_view(ᶜᶠmat) == + [ + 3 6 0 0 + 0 5 10 0 + 0 0 7 14 + ] + + @test MatrixFields.column_field2array(ᶠᶠmat) == + MatrixFields.column_field2array_view(ᶠᶠmat) == + [ + 4 8 0 0 + 4 8 16 0 + 0 6 12 24 + 0 0 8 16 + ] + + @test MatrixFields.column_field2array(ᶠᶜmat) == + MatrixFields.column_field2array_view(ᶠᶜmat) == + [ + 4 0 0 + 4 8 0 + 0 6 12 + 0 0 8 + ] + + ᶜᶜmat_array_not_view = MatrixFields.column_field2array(ᶜᶜmat) + ᶜᶜmat_array_view = MatrixFields.column_field2array_view(ᶜᶜmat) + ᶜᶜmat .*= 2 + @test ᶜᶜmat_array_not_view == MatrixFields.column_field2array(ᶜᶜmat) ./ 2 + @test ᶜᶜmat_array_view == MatrixFields.column_field2array(ᶜᶜmat) + + @test MatrixFields.field2arrays(ᶜᶜmat) == + (MatrixFields.column_field2array(ᶜᶜmat),) + + # Check for type instabilities. + @test_opt MatrixFields.column_field2array(ᶜᶜmat) + @test_opt MatrixFields.column_field2array_view(ᶜᶜmat) + @test_opt MatrixFields.field2arrays(ᶜᶜmat) + + # Because this test is broken, printing matrix fields allocates some memory. + @test_broken (@allocated MatrixFields.column_field2array_view(ᶜᶜmat)) == 0 +end diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl new file mode 100644 index 0000000000..b18585c0b1 --- /dev/null +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -0,0 +1,816 @@ +using Test +using JET +using Random: seed! +using LinearAlgebra: I, mul! 
+using BandedMatrices: band + +using ClimaCore.MatrixFields +import ClimaCore: + Geometry, Domains, Meshes, Topologies, Hypsography, Spaces, Fields +import ClimaComms + +# Using @benchmark from BenchmarkTools is extremely slow; it appears to keep +# triggering recompilations and allocating a lot of memory in the process. +# This macro returns the minimum time (in seconds) required to run the +# expression after it has been compiled. +macro benchmark(expression) + return quote + $(esc(expression)) # Compile the expression first. Use esc for hygiene. + best_time = Inf + start_time = time_ns() + while time_ns() - start_time < 1e8 # Benchmark for 0.1 s (1e8 ns). + best_time = min(best_time, @elapsed $(esc(expression))) + end + best_time + end +end + +# This has to be its own function in order to correctly measure its allocations. +call_array_func( + ref_set_result!::F, + ref_result_arrays, + inputs_arrays, + temp_values_arrays, +) where {F} = foreach( + ref_set_result!, + ref_result_arrays, + inputs_arrays..., + temp_values_arrays..., +) + +function test_matrix_broadcast_against_array_reference(; + test_name, + inputs, + get_result::F1, + set_result!::F2, + temp_values = (), + ref_set_result!::F3, + time_ratio_limit = 1, + max_error_limit = 3, +) where {F1, F2, F3} + @testset "$test_name" begin + result = get_result(inputs...) + + # Fill all output fields with NaNs for testing correctness. + result .*= NaN + for temp_value in temp_values + temp_value .*= NaN + end + + ref_result_arrays = MatrixFields.field2arrays(result) + inputs_arrays = map(MatrixFields.field2arrays, inputs) + temp_values_arrays = map(MatrixFields.field2arrays, temp_values) + + best_time = @benchmark set_result!(result, inputs...) + best_ref_time = @benchmark call_array_func( + ref_set_result!, + ref_result_arrays, + inputs_arrays, + temp_values_arrays, + ) + + # Compute the maximum error as an integer multiple of machine epsilon. 
+ result_arrays = MatrixFields.field2arrays(result) + max_error = + maximum(zip(result_arrays, ref_result_arrays)) do (array, ref_array) + maximum(zip(array, ref_array)) do (value, ref_value) + Int(abs(value - ref_value) / eps(ref_value)) + end + end + + @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \ + = $best_ref_time s\n\tMaximum Error = $max_error eps" + + # Test that set_result! is performant compared to ref_set_result!. + @test best_time < time_ratio_limit * best_ref_time + + # Test set_result! for correctness, allocations, and type instabilities. + @test max_error <= max_error_limit + @test (@allocated set_result!(result, inputs...)) == 0 + @test_opt set_result!(result, inputs...) + + # Test ref_set_result! for allocations and type instabilities. This is + # helpful for ensuring that the performance comparison is fair. + @test (@allocated call_array_func( + ref_set_result!, + ref_result_arrays, + inputs_arrays, + temp_values_arrays, + )) == 0 + @test_opt call_array_func( + ref_set_result!, + ref_result_arrays, + inputs_arrays, + temp_values_arrays, + ) + + # Test get_result (the allocating version of set_result!) for type + # instabilities. + @test_opt get_result(inputs...) + end +end + +function test_matrix_broadcast_against_reference(; + test_name, + inputs, + get_result::F1, + set_result!::F2, + ref_inputs, + ref_set_result!::F3, +) where {F1, F2, F3} + @testset "$test_name" begin + result = get_result(inputs...) + + # Fill the output field with NaNs for testing correctness. + result .*= NaN + + ref_result = copy(result) + + best_time = @benchmark set_result!(result, inputs...) + best_ref_time = @benchmark ref_set_result!(ref_result, ref_inputs...) + + @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \ + = $best_ref_time s" + + # Test that set_result! is performant compared to ref_set_result!. + @test best_time < best_ref_time + + # Test set_result! for correctness, allocations, and type instabilities. 
+ @test result == ref_result + @test (@allocated set_result!(result, inputs...)) == 0 + @test_opt set_result!(result, inputs...) + + # Test ref_set_result! for allocations and type instabilities. This is + # helpful for ensuring that the performance comparison is fair. + @test (@allocated ref_set_result!(ref_result, ref_inputs...)) == 0 + @test_opt ref_set_result!(ref_result, ref_inputs...) + + # Test get_result (the allocating version of set_result!) for type + # instabilities. + @test_opt get_result(inputs...) + end +end + +function random_test_fields(::Type{FT}) where {FT} + velem = 20 # This should be big enough to test high-bandwidth matrices. + helem = npoly = 1 # These should be small enough for the tests to be fast. + + hdomain = Domains.SphereDomain(FT(10)) + hmesh = Meshes.EquiangularCubedSphere(hdomain, helem) + htopology = Topologies.Topology2D(ClimaComms.SingletonCommsContext(), hmesh) + quad = Spaces.Quadratures.GLL{npoly + 1}() + hspace = Spaces.SpectralElementSpace2D(htopology, quad) + vdomain = Domains.IntervalDomain( + Geometry.ZPoint(FT(0)), + Geometry.ZPoint(FT(10)); + boundary_tags = (:bottom, :top), + ) + vmesh = Meshes.IntervalMesh(vdomain, nelems = velem) + vspace = Spaces.CenterFiniteDifferenceSpace(vmesh) + sfc_coord = Fields.coordinate_field(hspace) + hypsography = Hypsography.LinearAdaption( + @. 
cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1 + ) + center_space = + Spaces.ExtrudedFiniteDifferenceSpace(hspace, vspace, hypsography) + face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(center_space) + ᶜcoord = Fields.coordinate_field(center_space) + ᶠcoord = Fields.coordinate_field(face_space) + + seed!(1) # ensures reproducibility + ᶜᶜmat = map(_ -> DiagonalMatrixRow(rand(FT, 1)...), ᶜcoord) + ᶜᶠmat = map(_ -> BidiagonalMatrixRow(rand(FT, 2)...), ᶜcoord) + ᶠᶠmat = map(_ -> TridiagonalMatrixRow(rand(FT, 3)...), ᶠcoord) + ᶠᶜmat = map(_ -> QuaddiagonalMatrixRow(rand(FT, 4)...), ᶠcoord) + ᶜvec = map(_ -> rand(FT), ᶜcoord) + ᶠvec = map(_ -> rand(FT), ᶠcoord) + + return ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec +end + +@testset "Scalar Matrix Field Broadcasting" begin + ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec = random_test_fields(Float64) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times vector", + inputs = (ᶜᶜmat, ᶜvec), + get_result = (ᶜᶜmat, ᶜvec) -> (@. ᶜᶜmat ⋅ ᶜvec), + set_result! = (result, ᶜᶜmat, ᶜvec) -> (@. result = ᶜᶜmat ⋅ ᶜvec), + ref_set_result! = (_result, _ᶜᶜmat, _ᶜvec) -> + mul!(_result, _ᶜᶜmat, _ᶜvec), + time_ratio_limit = 2, # This case's ref function is fast on Buildkite. + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "tri-diagonal matrix times vector", + inputs = (ᶠᶠmat, ᶠvec), + get_result = (ᶠᶠmat, ᶠvec) -> (@. ᶠᶠmat ⋅ ᶠvec), + set_result! = (result, ᶠᶠmat, ᶠvec) -> (@. result = ᶠᶠmat ⋅ ᶠvec), + ref_set_result! = (_result, _ᶠᶠmat, _ᶠvec) -> + mul!(_result, _ᶠᶠmat, _ᶠvec), + time_ratio_limit = 2, # This case's ref function is fast on Buildkite. + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "quad-diagonal matrix times vector", + inputs = (ᶠᶜmat, ᶜvec), + get_result = (ᶠᶜmat, ᶜvec) -> (@. ᶠᶜmat ⋅ ᶜvec), + set_result! = (result, ᶠᶜmat, ᶜvec) -> (@. result = ᶠᶜmat ⋅ ᶜvec), + ref_set_result! 
= (_result, _ᶠᶜmat, _ᶜvec) -> + mul!(_result, _ᶠᶜmat, _ᶜvec), + time_ratio_limit = 5, # This case's ref function is fast on Buildkite. + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times bi-diagonal matrix", + inputs = (ᶜᶜmat, ᶜᶠmat), + get_result = (ᶜᶜmat, ᶜᶠmat) -> (@. ᶜᶜmat ⋅ ᶜᶠmat), + set_result! = (result, ᶜᶜmat, ᶜᶠmat) -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat), + ref_set_result! = (_result, _ᶜᶜmat, _ᶜᶠmat) -> + mul!(_result, _ᶜᶜmat, _ᶜᶠmat), + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "tri-diagonal matrix times tri-diagonal matrix", + inputs = (ᶠᶠmat,), + get_result = (ᶠᶠmat,) -> (@. ᶠᶠmat ⋅ ᶠᶠmat), + set_result! = (result, ᶠᶠmat) -> (@. result = ᶠᶠmat ⋅ ᶠᶠmat), + ref_set_result! = (_result, _ᶠᶠmat) -> mul!(_result, _ᶠᶠmat, _ᶠᶠmat), + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "quad-diagonal matrix times diagonal matrix", + inputs = (ᶠᶜmat, ᶜᶜmat), + get_result = (ᶠᶜmat, ᶜᶜmat) -> (@. ᶠᶜmat ⋅ ᶜᶜmat), + set_result! = (result, ᶠᶜmat, ᶜᶜmat) -> (@. result = ᶠᶜmat ⋅ ᶜᶜmat), + ref_set_result! = (_result, _ᶠᶜmat, _ᶜᶜmat) -> + mul!(_result, _ᶠᶜmat, _ᶜᶜmat), + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times bi-diagonal matrix times \ + tri-diagonal matrix times quad-diagonal matrix", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), + temp_values = ((@. ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat)), + ref_set_result! 
= ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + ) -> begin + mul!(_temp1, _ᶜᶜmat, _ᶜᶠmat) + mul!(_temp2, _temp1, _ᶠᶠmat) + mul!(_result, _temp2, _ᶠᶜmat) + end, + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times bi-diagonal matrix times \ + tri-diagonal matrix times quad-diagonal matrix, but with \ + forced right-associativity", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), + temp_values = ((@. ᶠᶠmat ⋅ ᶠᶜmat), (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + ) -> begin + mul!(_temp1, _ᶠᶠmat, _ᶠᶜmat) + mul!(_temp2, _ᶜᶠmat, _temp1) + mul!(_result, _ᶜᶜmat, _temp2) + end, + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times bi-diagonal matrix times \ + tri-diagonal matrix times quad-diagonal matrix times \ + vector", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> + (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> + (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec), + temp_values = ( + (@. ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat), + (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), + ), + ref_set_result! 
= ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _ᶜvec, + _temp1, + _temp2, + _temp3, + ) -> begin + mul!(_temp1, _ᶜᶜmat, _ᶜᶠmat) + mul!(_temp2, _temp1, _ᶠᶠmat) + mul!(_temp3, _temp2, _ᶠᶜmat) + mul!(_result, _temp3, _ᶜvec) + end, + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times bi-diagonal matrix times \ + tri-diagonal matrix times quad-diagonal matrix times \ + vector, but with forced right-associativity", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> + (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> + (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))), + temp_values = ( + (@. ᶠᶜmat ⋅ ᶜvec), + (@. ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)), + (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec))), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _ᶜvec, + _temp1, + _temp2, + _temp3, + ) -> begin + mul!(_temp1, _ᶠᶜmat, _ᶜvec) + mul!(_temp2, _ᶠᶠmat, _temp1) + mul!(_temp3, _ᶜᶠmat, _temp2) + mul!(_result, _ᶜᶜmat, _temp3) + end, + time_ratio_limit = 15, # This case's ref function is fast on Buildkite. + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "linear combination of matrix products and LinearAlgebra.I", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. result = 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), + temp_values = ( + (@. 2 * ᶠᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat ⋅ ᶠᶠmat), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + _temp3, + _temp4, + ) -> begin + @. _temp1 = 0 + 2 * _ᶠᶜmat # This allocates without the `0 + `. 
+ mul!(_temp2, _temp1, _ᶜᶜmat) + mul!(_temp3, _temp2, _ᶜᶠmat) + mul!(_temp4, _ᶠᶠmat, _ᶠᶠmat) + copyto!(_result, 4I) # We can't directly use I in array broadcasts. + @. _result = _temp3 + _temp4 / 3 - _result + end, + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "another linear combination of matrix products and \ + LinearAlgebra.I", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), + temp_values = ( + (@. ᶠᶜmat ⋅ ᶜᶜmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat / 3), + (@. (ᶠᶠmat / 3) ⋅ ᶠᶠmat), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + _temp3, + _temp4, + ) -> begin + mul!(_temp1, _ᶠᶜmat, _ᶜᶜmat) + mul!(_temp2, _temp1, _ᶜᶠmat) + @. _temp3 = 0 + _ᶠᶠmat / 3 # This allocates without the `0 + `. + mul!(_temp4, _temp3, _ᶠᶠmat) + copyto!(_result, 4I) # We can't directly use I in array broadcasts. + @. _result = _temp2 * 2 - _temp4 + _result + end, + max_error_limit = 512, # Why is there a 512 * eps error on Buildkite? + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "matrix times linear combination", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + ᶜᶠmat ⋅ (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), + temp_values = ( + (@. 2 * ᶠᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat ⋅ ᶠᶠmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + _temp3, + _temp4, + _temp5, + ) -> begin + @. 
_temp1 = 0 + 2 * _ᶠᶜmat # This allocates without the `0 + `. + mul!(_temp2, _temp1, _ᶜᶜmat) + mul!(_temp3, _temp2, _ᶜᶠmat) + mul!(_temp4, _ᶠᶠmat, _ᶠᶠmat) + copyto!(_temp5, 4I) # We can't directly use I in array broadcasts. + @. _temp5 = _temp3 + _temp4 / 3 - _temp5 + mul!(_result, _ᶜᶠmat, _temp5) + end, + max_error_limit = 276, # Why is there a 276 * eps error on Buildkite? + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "linear combination times another linear combination", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))), + temp_values = ( + (@. 2 * ᶠᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat ⋅ ᶠᶠmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), + (@. ᶠᶜmat ⋅ ᶜᶜmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat / 3), + (@. (ᶠᶠmat / 3) ⋅ ᶠᶠmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + _temp3, + _temp4, + _temp5, + _temp6, + _temp7, + _temp8, + _temp9, + _temp10, + ) -> begin + @. _temp1 = 0 + 2 * _ᶠᶜmat # This allocates without the `0 + `. + mul!(_temp2, _temp1, _ᶜᶜmat) + mul!(_temp3, _temp2, _ᶜᶠmat) + mul!(_temp4, _ᶠᶠmat, _ᶠᶠmat) + copyto!(_temp5, 4I) # We can't directly use I in array broadcasts. + @. _temp5 = _temp3 + _temp4 / 3 - _temp5 + mul!(_temp6, _ᶠᶜmat, _ᶜᶜmat) + mul!(_temp7, _temp6, _ᶜᶠmat) + @. _temp8 = 0 + _ᶠᶠmat / 3 # This allocates without the `0 + `. + mul!(_temp9, _temp8, _ᶠᶠmat) + copyto!(_temp10, 4I) # We can't directly use I in array broadcasts. + @. 
_temp10 = _temp7 * 2 - _temp9 + _temp10 + mul!(_result, _temp5, _temp10) + end, + max_error_limit = 44, # Why is there a 44 * eps error on Buildkite? + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "matrix times matrix times linear combination times matrix \ + times another linear combination times matrix", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + ᶠᶠmat ⋅ + (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)) ⋅ + ᶠᶠmat), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + ᶠᶜmat ⋅ ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + ᶠᶠmat ⋅ + (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)) ⋅ + ᶠᶠmat), + temp_values = ( + (@. ᶠᶜmat ⋅ ᶜᶠmat), + (@. 2 * ᶠᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat ⋅ ᶠᶠmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), + (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), + (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + ᶠᶠmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat / 3), + (@. (ᶠᶠmat / 3) ⋅ ᶠᶠmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), + (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + ᶠᶠmat ⋅ + (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + _temp3, + _temp4, + _temp5, + _temp6, + _temp7, + _temp8, + _temp9, + _temp10, + _temp11, + _temp12, + _temp13, + _temp14, + ) -> begin + mul!(_temp1, _ᶠᶜmat, _ᶜᶠmat) + @. _temp2 = 0 + 2 * _ᶠᶜmat # This allocates without the `0 + `. + mul!(_temp3, _temp2, _ᶜᶜmat) + mul!(_temp4, _temp3, _ᶜᶠmat) + mul!(_temp5, _ᶠᶠmat, _ᶠᶠmat) + copyto!(_temp6, 4I) # We can't directly use I in array broadcasts. 
+ @. _temp6 = _temp4 + _temp5 / 3 - _temp6 + mul!(_temp7, _temp1, _temp6) + mul!(_temp8, _temp7, _ᶠᶠmat) + mul!(_temp9, _ᶠᶜmat, _ᶜᶜmat) + mul!(_temp10, _temp9, _ᶜᶠmat) + @. _temp11 = 0 + _ᶠᶠmat / 3 # This allocates without the `0 + `. + mul!(_temp12, _temp11, _ᶠᶠmat) + copyto!(_temp13, 4I) # We can't directly use I in array broadcasts. + @. _temp13 = _temp10 * 2 - _temp12 + _temp13 + mul!(_temp14, _temp8, _temp13) + mul!(_result, _temp14, _ᶠᶠmat) + end, + time_ratio_limit = 2, # This case's ref function is fast on Buildkite. + max_error_limit = 3038, # Why is there a 3038 * eps error on Buildkite? + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "matrix constructions and multiplications", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) -> + (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ + TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅ + DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) -> + (@. result = + BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ + TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅ + DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)), + temp_values = ( + (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec)), + (@. TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1)), + (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ + TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1)), + (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ + TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat), + (@. DiagonalMatrixRow(ᶠvec)), + (@. DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)), + ), + ref_set_result! 
= ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _ᶜvec, + _ᶠvec, + _temp1, + _temp2, + _temp3, + _temp4, + _temp5, + _temp6, + ) -> begin + mul!(view(_temp1, band(0)), _ᶜᶠmat, _ᶠvec) + mul!(view(_temp1, band(1)), _ᶜᶜmat, _ᶜvec) + copyto!(view(_temp2, band(-1)), 1, _ᶠvec, 2) + mul!(view(_temp2, band(0)), _ᶠᶜmat, _ᶜvec) + fill!(view(_temp2, band(1)), 1) + mul!(_temp3, _temp1, _temp2) + mul!(_temp4, _temp3, _ᶠᶠmat) + copyto!(view(_temp5, band(0)), 1, _ᶠvec, 1) + mul!(view(_temp6, band(0)), _temp5, _ᶠvec) + mul!(_result, _temp4, _temp6) + end, + ) +end + +@testset "Non-scalar Matrix Field Broadcasting" begin + ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec = random_test_fields(Float64) + + ᶜlg = Fields.local_geometry_field(ᶜvec) + ᶠlg = Fields.local_geometry_field(ᶠvec) + + ᶜᶠmat2 = map(row -> map(sin, row), ᶜᶠmat) + ᶜᶠmat3 = map(row -> map(cos, row), ᶜᶠmat) + ᶠᶜmat2 = map(row -> map(sin, row), ᶠᶜmat) + ᶠᶜmat3 = map(row -> map(cos, row), ᶠᶜmat) + + ᶜᶠmat_AC1 = map(row -> map(adjoint ∘ Geometry.Covariant1Vector, row), ᶜᶠmat) + ᶜᶠmat_C12 = map( + (row1, row2) -> map(Geometry.Covariant12Vector, row1, row2), + ᶜᶠmat2, + ᶜᶠmat3, + ) + ᶠᶜmat_AC1 = map(row -> map(adjoint ∘ Geometry.Covariant1Vector, row), ᶠᶜmat) + ᶠᶜmat_C12 = map( + (row1, row2) -> map(Geometry.Covariant12Vector, row1, row2), + ᶠᶜmat2, + ᶠᶜmat3, + ) + + test_matrix_broadcast_against_reference(; + test_name = "matrix of covectors times matrix of vectors", + inputs = (ᶜᶠmat_AC1, ᶠᶜmat_C12), + get_result = (ᶜᶠmat_AC1, ᶠᶜmat_C12) -> (@. ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12), + set_result! = (result, ᶜᶠmat_AC1, ᶠᶜmat_C12) -> + (@. result = ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12), + ref_inputs = (ᶜᶠmat, ᶠᶜmat2, ᶠᶜmat3, ᶠlg), + ref_set_result! = (result, ᶜᶠmat, ᶠᶜmat2, ᶠᶜmat3, ᶠlg) -> (@. 
result = + ᶜᶠmat ⋅ ( + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 + + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3 + )), + ) + + test_matrix_broadcast_against_reference(; + test_name = "matrix of covectors times matrix of vectors times matrix \ + of numbers times matrix of covectors times matrix of \ + vectors", + inputs = (ᶜᶠmat_AC1, ᶠᶜmat_C12, ᶜᶠmat, ᶠᶜmat_AC1, ᶜᶠmat_C12), + get_result = (ᶜᶠmat_AC1, ᶠᶜmat_C12, ᶜᶠmat, ᶠᶜmat_AC1, ᶜᶠmat_C12) -> + (@. ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12 ⋅ ᶜᶠmat ⋅ ᶠᶜmat_AC1 ⋅ ᶜᶠmat_C12), + set_result! = ( + result, + ᶜᶠmat_AC1, + ᶠᶜmat_C12, + ᶜᶠmat, + ᶠᶜmat_AC1, + ᶜᶠmat_C12, + ) -> + (@. result = ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12 ⋅ ᶜᶠmat ⋅ ᶠᶜmat_AC1 ⋅ ᶜᶠmat_C12), + ref_inputs = (ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3, ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3, ᶜlg, ᶠlg), + ref_set_result! = ( + result, + ᶜᶠmat, + ᶜᶠmat2, + ᶜᶠmat3, + ᶠᶜmat, + ᶠᶜmat2, + ᶠᶜmat3, + ᶜlg, + ᶠlg, + ) -> (@. result = + ᶜᶠmat ⋅ ( + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 + + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3 + ) ⋅ ᶜᶠmat ⋅ ᶠᶜmat ⋅ ( + DiagonalMatrixRow(ᶜlg.gⁱʲ.components.data.:1) ⋅ ᶜᶠmat2 + + DiagonalMatrixRow(ᶜlg.gⁱʲ.components.data.:2) ⋅ ᶜᶠmat3 + )), + ) + + ᶜᶠmat_AC1_num = + map((row1, row2) -> map(tuple, row1, row2), ᶜᶠmat_AC1, ᶜᶠmat) + ᶜᶠmat_num_C12 = + map((row1, row2) -> map(tuple, row1, row2), ᶜᶠmat, ᶜᶠmat_C12) + ᶠᶜmat_C12_AC1 = + map((row1, row2) -> map(tuple, row1, row2), ᶠᶜmat_C12, ᶠᶜmat_AC1) + + test_matrix_broadcast_against_reference(; + test_name = "matrix of covectors and numbers times matrix of vectors \ + and covectors times matrix of numbers and vectors times \ + vector of numbers", + inputs = (ᶜᶠmat_AC1_num, ᶠᶜmat_C12_AC1, ᶜᶠmat_num_C12, ᶠvec), + get_result = (ᶜᶠmat_AC1_num, ᶠᶜmat_C12_AC1, ᶜᶠmat_num_C12, ᶠvec) -> + (@. ᶜᶠmat_AC1_num ⋅ ᶠᶜmat_C12_AC1 ⋅ ᶜᶠmat_num_C12 ⋅ ᶠvec), + set_result! = ( + result, + ᶜᶠmat_AC1_num, + ᶠᶜmat_C12_AC1, + ᶜᶠmat_num_C12, + ᶠvec, + ) -> (@. 
result = ᶜᶠmat_AC1_num ⋅ ᶠᶜmat_C12_AC1 ⋅ ᶜᶠmat_num_C12 ⋅ ᶠvec), + ref_inputs = ( + ᶜᶠmat, + ᶜᶠmat2, + ᶜᶠmat3, + ᶠᶜmat, + ᶠᶜmat2, + ᶠᶜmat3, + ᶠvec, + ᶜlg, + ᶠlg, + ), + ref_set_result! = ( + result, + ᶜᶠmat, + ᶜᶠmat2, + ᶜᶠmat3, + ᶠᶜmat, + ᶠᶜmat2, + ᶠᶜmat3, + ᶠvec, + ᶜlg, + ᶠlg, + ) -> (@. result = tuple( + ᶜᶠmat ⋅ ( + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 + + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3 + ) ⋅ ᶜᶠmat ⋅ ᶠvec, + ᶜᶠmat ⋅ ᶠᶜmat ⋅ ( + DiagonalMatrixRow(ᶜlg.gⁱʲ.components.data.:1) ⋅ ᶜᶠmat2 + + DiagonalMatrixRow(ᶜlg.gⁱʲ.components.data.:2) ⋅ ᶜᶠmat3 + ) ⋅ ᶠvec, + )), + ) + + T(value1, value2, value3) = + (; a = (), b = value1, c = (value2, (; d = (value3,)), (;))) + ᶜᶠmat_T = map((rows...) -> map(T, rows...), ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3) + ᶠᶜmat_T = map((rows...) -> map(T, rows...), ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3) + ᶜvec_T = @. T(ᶜvec, ᶜvec, ᶜvec) + + test_matrix_broadcast_against_reference(; + test_name = "matrix of nested values times matrix of nested values \ + times matrix of numbers times matrix of numbers times \ + times vector of nested values", + inputs = (ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T), + get_result = (ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T) -> + (@. ᶜᶠmat_T ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_T ⋅ ᶜvec_T), + set_result! = (result, ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T) -> + (@. result = ᶜᶠmat_T ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_T ⋅ ᶜvec_T), + ref_inputs = (ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3, ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3, ᶜvec), + ref_set_result! = ( + result, + ᶜᶠmat, + ᶜᶠmat2, + ᶜᶠmat3, + ᶠᶜmat, + ᶠᶜmat2, + ᶠᶜmat3, + ᶜvec, + ) -> (@. result = T( + ᶜᶠmat ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec, + ᶜᶠmat2 ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat2 ⋅ ᶜvec, + ᶜᶠmat3 ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat3 ⋅ ᶜvec, + )), + ) +end diff --git a/test/MatrixFields/rmul_with_projection.jl b/test/MatrixFields/rmul_with_projection.jl new file mode 100644 index 0000000000..d7f4855865 --- /dev/null +++ b/test/MatrixFields/rmul_with_projection.jl @@ -0,0 +1,123 @@ +using Test +using JET +using Random: seed! 
+using StaticArrays: @SMatrix + +import ClimaCore: Geometry +import ClimaCore.MatrixFields: rmul_with_projection + +function test_rmul_with_projection(x, y, lg, expected_result) + result = rmul_with_projection(x, y, lg) + + # Compute the maximum error as an integer multiple of machine epsilon. + FT = Geometry.undertype(typeof(lg)) + object2tuple(obj) = + reinterpret(NTuple{sizeof(obj) ÷ sizeof(FT), FT}, [obj])[1] + max_error = maximum( + ((value, expected_value),) -> + Int(abs(value - expected_value) / eps(expected_value)), + zip(object2tuple(result), object2tuple(expected_result)), + ) + + @test max_error <= 1 # correctness + @test (@allocated rmul_with_projection(x, y, lg)) == 0 # allocations + @test_opt rmul_with_projection(x, y, lg) # type instabilities +end + +@testset "rmul_with_projection Unit Tests" begin + seed!(1) # ensures reproducibility + + FT = Float64 + coord = Geometry.LatLongZPoint(rand(FT), rand(FT), rand(FT)) + ∂x∂ξ = Geometry.AxisTensor( + (Geometry.LocalAxis{(1, 2, 3)}(), Geometry.CovariantAxis{(1, 2, 3)}()), + (@SMatrix rand(FT, 3, 3)), + ) + lg = Geometry.LocalGeometry(coord, rand(FT), rand(FT), ∂x∂ξ) + + number = rand(FT) + covector = Geometry.Covariant12Vector(rand(FT), rand(FT))' + vector = Geometry.Covariant123Vector(rand(FT), rand(FT), rand(FT)) + tensor = vector * vector' + projected_vector = + Geometry.project(Geometry.Contravariant12Axis(), vector, lg) + projected_tensor = + Geometry.project(Geometry.Contravariant12Axis(), tensor, lg) + + # Test all required combinations of single values. 
+ test_rmul_with_projection(number, number, lg, number * number) + test_rmul_with_projection(number, covector, lg, number * covector) + test_rmul_with_projection(number, vector, lg, number * vector) + test_rmul_with_projection(number, tensor, lg, number * tensor) + test_rmul_with_projection(covector, number, lg, covector * number) + test_rmul_with_projection(vector, number, lg, vector * number) + test_rmul_with_projection(tensor, number, lg, tensor * number) + test_rmul_with_projection(covector, vector, lg, covector * projected_vector) + test_rmul_with_projection(covector, tensor, lg, covector * projected_tensor) + + # Test some combinations of complicated nested values. + T(value1, value2, value3) = + (; a = (), b = value1, c = (value2, (; d = (value3,)), (;))) + test_rmul_with_projection( + number, + T(covector, vector, tensor), + lg, + T(number * covector, number * vector, number * tensor), + ) + test_rmul_with_projection( + T(covector, vector, tensor), + number, + lg, + T(covector * number, vector * number, tensor * number), + ) + test_rmul_with_projection( + vector, + T(number, number, number), + lg, + T(vector * number, vector * number, vector * number), + ) + test_rmul_with_projection( + T(number, number, number), + covector, + lg, + T(number * covector, number * covector, number * covector), + ) + test_rmul_with_projection( + T(number, vector, number), + T(covector, number, tensor), + lg, + T(number * covector, vector * number, number * tensor), + ) + test_rmul_with_projection( + T(covector, number, tensor), + T(number, vector, number), + lg, + T(covector * number, number * vector, tensor * number), + ) + test_rmul_with_projection( + covector, + T(vector, number, tensor), + lg, + T( + covector * projected_vector, + covector * number, + covector * projected_tensor, + ), + ) + test_rmul_with_projection( + T(covector, number, covector), + vector, + lg, + T( + covector * projected_vector, + number * vector, + covector * projected_vector, + ), + ) + 
test_rmul_with_projection( + T(covector, number, covector), + T(number, vector, tensor), + lg, + T(covector * number, number * vector, covector * projected_tensor), + ) +end diff --git a/test/Project.toml b/test/Project.toml index 2df49394e3..9351af89f0 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -2,6 +2,7 @@ Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" AssociatedLegendrePolynomials = "2119f1ac-fb78-50f5-8cc0-dda848ebdb19" +BandedMatrices = "aae01518-5342-5314-be14-df237901396f" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" diff --git a/test/runtests.jl b/test/runtests.jl index f04ac240fd..28967cdd3c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -75,6 +75,12 @@ if !Sys.iswindows() @safetestset "Hybrid - dss opt" begin @time include("Operators/hybrid/dss_opt.jl") end @safetestset "Hybrid - opt" begin @time include("Operators/hybrid/opt.jl") end + @safetestset "MatrixFields - BandMatrixRow" begin @time include("MatrixFields/band_matrix_row.jl") end + @safetestset "MatrixFields - rmul_with_projection" begin @time include("MatrixFields/rmul_with_projection.jl") end + @safetestset "MatrixFields - field2arrays" begin @time include("MatrixFields/field2arrays.jl") end + # now part of buildkite + # @safetestset "MatrixFields - matrix field broadcasting" begin @time include("MatrixFields/matrix_field_broadcasting.jl") end + @safetestset "Hypsography - 2d" begin @time include("Hypsography/2d.jl") end @safetestset "Hypsography - 3d sphere" begin @time include("Hypsography/3dsphere.jl") end From d9337b9f44430591ec52e94a0f28f9b98feaac78 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Tue, 27 Jun 2023 16:43:02 -0700 Subject: [PATCH 02/10] Add GPU unit test --- .buildkite/pipeline.yml | 10 ++++++++-- test/MatrixFields/matrix_field_broadcasting.jl | 6 ++++-- 2 files changed, 12 insertions(+), 4 
deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 950581d805..75bc995f53 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -509,10 +509,16 @@ steps: key: unit_field2arrays command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/field2arrays.jl" - - label: "Unit: matrix field broadcasting" - key: unit_matrix_field_broadcasting + - label: "Unit: matrix field broadcasting (CPU)" + key: unit_matrix_field_broadcasting_cpu command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + - label: "Unit: matrix field broadcasting (GPU)" + key: unit_matrix_field_broadcasting_gpu + command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + agents: + slurm_gpus: 1 + - group: "Unit: Hypsography" steps: diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl index b18585c0b1..714dbaa1c6 100644 --- a/test/MatrixFields/matrix_field_broadcasting.jl +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -155,9 +155,10 @@ function random_test_fields(::Type{FT}) where {FT} velem = 20 # This should be big enough to test high-bandwidth matrices. helem = npoly = 1 # These should be small enough for the tests to be fast. 
+    comms_ctx = ClimaComms.SingletonCommsContext()
     hdomain = Domains.SphereDomain(FT(10))
     hmesh = Meshes.EquiangularCubedSphere(hdomain, helem)
-    htopology = Topologies.Topology2D(ClimaComms.SingletonCommsContext(), hmesh)
+    htopology = Topologies.Topology2D(comms_ctx, hmesh)
     quad = Spaces.Quadratures.GLL{npoly + 1}()
     hspace = Spaces.SpectralElementSpace2D(htopology, quad)
     vdomain = Domains.IntervalDomain(
@@ -166,7 +167,8 @@ function random_test_fields(::Type{FT}) where {FT}
         boundary_tags = (:bottom, :top),
     )
     vmesh = Meshes.IntervalMesh(vdomain, nelems = velem)
-    vspace = Spaces.CenterFiniteDifferenceSpace(vmesh)
+    vtopology = Topologies.IntervalTopology(comms_ctx, vmesh)
+    vspace = Spaces.CenterFiniteDifferenceSpace(vtopology)
     sfc_coord = Fields.coordinate_field(hspace)
     hypsography = Hypsography.LinearAdaption(
         @. cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1

From 3b495d777929767f56b4174f4683a18918162090 Mon Sep 17 00:00:00 2001
From: Dennis Yatunin
Date: Thu, 6 Jul 2023 19:42:09 -0700
Subject: [PATCH 03/10] Fix GPU-related bugs

---
 src/MatrixFields/MatrixFields.jl              |   1 +
 src/MatrixFields/matrix_field_utils.jl        |  94 +++++++-------
 src/MatrixFields/matrix_multiplication.jl     |   2 +-
 src/Operators/common.jl                       |  32 +++--
 src/Operators/finitedifference.jl             |   2 +-
 src/Operators/spectralelement.jl              |   2 +-
 .../MatrixFields/matrix_field_broadcasting.jl | 115 +++++++++++-------
 7 files changed, 148 insertions(+), 100 deletions(-)

diff --git a/src/MatrixFields/MatrixFields.jl b/src/MatrixFields/MatrixFields.jl
index 6ed80c32e3..0a0f241acd 100644
--- a/src/MatrixFields/MatrixFields.jl
+++ b/src/MatrixFields/MatrixFields.jl
@@ -1,5 +1,6 @@
 module MatrixFields
 
+import CUDA: @allowscalar
 import LinearAlgebra: UniformScaling, Adjoint
 import StaticArrays: SArray, SMatrix, SVector
 import BandedMatrices: BandedMatrix, band, _BandedMatrix
diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl
index 706366eba2..4a4d4c350e 100644
--- 
a/src/MatrixFields/matrix_field_utils.jl +++ b/src/MatrixFields/matrix_field_utils.jl @@ -31,12 +31,11 @@ end Converts a field defined on a `FiniteDifferenceSpace` into a `Vector` or a `BandedMatrix`, depending on whether or not the elements of the field are -`BandMatrixRow`s. This involves copying the data stored in the field. +`BandMatrixRow`s. This involves copying the data stored in the field. Because +`BandedMatrix` does not currently support operations with `CuArray`s, all GPU +data is copied to the CPU. """ -function column_field2array(field) - space = axes(field) - space isa Spaces.FiniteDifferenceSpace || - error("column_field2array requires a field on a FiniteDifferenceSpace") +function column_field2array(field::Fields.FiniteDifferenceField) if eltype(field) <: BandMatrixRow # field represents a matrix n_rows, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) matrix = BandedMatrix{eltype(eltype(field))}( @@ -45,47 +44,24 @@ function column_field2array(field) (-matrix_ld, matrix_ud), ) for (index_of_field_entry, matrix_d) in enumerate(matrix_ld:matrix_ud) - # Find the rows for which field_diagonal[row] is inside the matrix. + matrix_diagonal = view(matrix, band(matrix_d)) + field_diagonal = field.entries.:($index_of_field_entry) + field_diagonal_data = + vec(reinterpret(eltype(eltype(field)), parent(field_diagonal)')) + + # Find the rows for which field_diagonal_data[row] is in the matrix. # Note: The matrix index (1, 1) corresponds to the diagonal index 0, # and the matrix index (n_rows, n_cols) corresponds to the diagonal # index n_cols - n_rows. first_row = matrix_d < 0 ? 1 - matrix_d : 1 last_row = matrix_d < n_cols - n_rows ? n_rows : n_cols - matrix_d - # Copy the value in each row from field_diagonal to matrix_diagonal. 
- field_diagonal = field.entries.:($index_of_field_entry) - matrix_diagonal = view(matrix, band(matrix_d)) - for (index_along_diagonal, row) in enumerate(first_row:last_row) - matrix_diagonal[index_along_diagonal] = - Fields.field_values(field_diagonal)[row] - end + @allowscalar matrix_diagonal .= + view(field_diagonal_data, first_row:last_row) end return matrix else # field represents a vector - n_rows = Spaces.nlevels(space) - return map(row -> Fields.field_values(field)[row], 1:n_rows) - end -end - -""" - field2arrays(field) - -Converts a field defined on a `FiniteDifferenceSpace` or on an -`ExtrudedFiniteDifferenceSpace` into a tuple of arrays, each of which -corresponds to a column of the field. This is done by calling -`column_field2array` on each of the field's columns. -""" -function field2arrays(field) - space = axes(field) - column_indices = if space isa Spaces.FiniteDifferenceSpace - (((1, 1), 1),) - elseif space isa Spaces.ExtrudedFiniteDifferenceSpace - (Spaces.all_nodes(Spaces.horizontal_space(space))...,) - else - error("Invalid space type: $(typeof(space).name.wrapper)") - end - return map(column_indices) do ((i, j), h) - column_field2array(Spaces.column(field, i, j, h)) + return @allowscalar Array(column_field2array_view(field)) end end @@ -95,16 +71,13 @@ end Similar to `column_field2array(field)`, except that this version avoids copying the data stored in the field. 
""" -function column_field2array_view(field) - space = axes(field) - space isa Spaces.FiniteDifferenceSpace || - error("column_field2array_view requires a field on a \ - FiniteDifferenceSpace") +function column_field2array_view(field::Fields.FiniteDifferenceField) if eltype(field) <: BandMatrixRow # field represents a matrix n_rows, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) - data_transpose = reinterpret(eltype(eltype(field)), parent(field)') + field_data_transpose = + reinterpret(eltype(eltype(field)), parent(field)') matrix_transpose = - _BandedMatrix(data_transpose, n_cols, matrix_ud, -matrix_ld) + _BandedMatrix(field_data_transpose, n_cols, matrix_ud, -matrix_ld) return permutedims(matrix_transpose) # TODO: Despite not copying any data, this function still allocates a # small amount of memory because of _BandedMatrix and permutedims. @@ -113,6 +86,36 @@ function column_field2array_view(field) end end +column_indices(field::Fields.FiniteDifferenceField) = (((1, 1), 1),) +column_indices(field::Fields.ExtrudedFiniteDifferenceField) = + (Spaces.all_nodes(Spaces.horizontal_space(axes(field)))...,) +column_indices(field) = + error("Invalid space type: $(typeof(axes(field)).name.wrapper)") + +""" + field2arrays(field) + +Converts a field defined on a `FiniteDifferenceSpace` or on an +`ExtrudedFiniteDifferenceSpace` into a tuple of arrays, each of which +corresponds to a column of the field. This is done by calling +`column_field2array` on each of the field's columns. +""" +field2arrays(field) = + map(column_indices(field)) do ((i, j), h) + column_field2array(Spaces.column(field, i, j, h)) + end + +""" + field2arrays_view(field) + +Similar to `field2arrays(field)`, except that this version calls +`column_field2array_view` instead of `column_field2array`. 
+""" +field2arrays_view(field) = + map(column_indices(field)) do ((i, j), h) + column_field2array_view(Spaces.column(field, i, j, h)) + end + function Base.show( io::IO, field::Fields.Field{<:Fields.AbstractData{<:BandMatrixRow}}, @@ -125,7 +128,8 @@ function Base.show( println(io, " whose first column corresponds to the matrix") end column_field = Fields.column(field, 1, 1, 1) - Base.print_array(io, column_field2array_view(column_field)) + io = IOContext(io, :compact => true, :limit => true) + @allowscalar Base.print_array(io, column_field2array_view(column_field)) else # When a BandedMatrix with non-number entries is printed, it currently # either prints in an illegible format (e.g., if it has AxisTensor or diff --git a/src/MatrixFields/matrix_multiplication.jl b/src/MatrixFields/matrix_multiplication.jl index 7f9144d623..45ddaba9bc 100644 --- a/src/MatrixFields/matrix_multiplication.jl +++ b/src/MatrixFields/matrix_multiplication.jl @@ -216,7 +216,7 @@ function Operators.return_eltype( "The first argument of ⋅ must have elements of type BandMatrixRow, but \ the given argument has elements of type $(eltype(matrix1))", ) - lg_type = eltype(Fields.local_geometry_field(axes(arg))) + lg_type = Operators.local_geometry_type(axes(arg)) if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication matrix2 = arg ld1, ud1 = outer_diagonals(eltype(matrix1)) diff --git a/src/Operators/common.jl b/src/Operators/common.jl index 8251ee62b3..07315befdc 100644 --- a/src/Operators/common.jl +++ b/src/Operators/common.jl @@ -65,33 +65,34 @@ end # when sending Broadcasted objects to the GPU, we strip out the space # information at each level of the broadcast tree and Fields, replacing with # PlaceholderSpace. this reduces the amount runtime parameter data we send to -# the GPU, which is quite limited (~4kB). +# the GPU, which is quite limited (~4kB). 
However, we need to keep the eltype of +# the local geometry data, since it may needed in order to infer the return +# eltype of a Field operation. # Functions for CUDASpectralStyle -struct PlaceholderSpace <: Spaces.AbstractSpace end -struct CenterPlaceholderSpace <: Spaces.AbstractSpace end -struct FacePlaceholderSpace <: Spaces.AbstractSpace end - +struct PlaceholderSpace{LG} <: Spaces.AbstractSpace end +struct CenterPlaceholderSpace{LG} <: Spaces.AbstractSpace end +struct FacePlaceholderSpace{LG} <: Spaces.AbstractSpace end placeholder_space(current_space::T, parent_space::T) where {T} = - PlaceholderSpace() + PlaceholderSpace{local_geometry_type(current_space)}() placeholder_space(current_space, parent_space) = current_space placeholder_space( current_space::Spaces.CenterFiniteDifferenceSpace, parent_space::Spaces.FaceFiniteDifferenceSpace, -) = CenterPlaceholderSpace() +) = CenterPlaceholderSpace{local_geometry_type(current_space)}() placeholder_space( current_space::Spaces.CenterExtrudedFiniteDifferenceSpace, parent_space::Spaces.FaceExtrudedFiniteDifferenceSpace, -) = CenterPlaceholderSpace() +) = CenterPlaceholderSpace{local_geometry_type(current_space)}() placeholder_space( current_space::Spaces.FaceFiniteDifferenceSpace, parent_space::Spaces.CenterFiniteDifferenceSpace, -) = FacePlaceholderSpace() +) = FacePlaceholderSpace{local_geometry_type(current_space)}() placeholder_space( current_space::Spaces.FaceExtrudedFiniteDifferenceSpace, parent_space::Spaces.CenterExtrudedFiniteDifferenceSpace, -) = FacePlaceholderSpace() +) = FacePlaceholderSpace{local_geometry_type(current_space)}() @inline reconstruct_placeholder_space(::PlaceholderSpace, parent_space) = parent_space @@ -114,6 +115,11 @@ placeholder_space( @inline reconstruct_placeholder_space(current_space, parent_space) = current_space +local_geometry_type(space::Spaces.AbstractSpace) = + eltype(Spaces.local_geometry_data(space)) +local_geometry_type(::PlaceholderSpace{LG}) where {LG} = LG 
+local_geometry_type(::CenterPlaceholderSpace{LG}) where {LG} = LG +local_geometry_type(::FacePlaceholderSpace{LG}) where {LG} = LG strip_space(obj, parent_space) = obj @@ -131,7 +137,11 @@ function strip_space( new_space = placeholder_space(current_space, parent_space) return Base.Broadcast.Broadcasted{Style}( bc.f, - map(arg -> strip_space(arg, current_space), bc.args), + strip_space_args(bc.args, current_space), new_space, ) end + +strip_space_args(::Tuple{}, space) = () +strip_space_args(args::Tuple, space) = + (strip_space(args[1], space), strip_space_args(Base.tail(args), space)...) diff --git a/src/Operators/finitedifference.jl b/src/Operators/finitedifference.jl index a15cb9cf7e..79b4d0c599 100644 --- a/src/Operators/finitedifference.jl +++ b/src/Operators/finitedifference.jl @@ -3415,7 +3415,7 @@ function strip_space(bc::StencilBroadcasted{Style}, parent_space) where {Style} new_space = placeholder_space(current_space, parent_space) return StencilBroadcasted{Style}( bc.op, - map(arg -> strip_space(arg, current_space), bc.args), + strip_space_args(bc.args, current_space), new_space, ) end diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl index aad2f26f0a..88df089300 100644 --- a/src/Operators/spectralelement.jl +++ b/src/Operators/spectralelement.jl @@ -236,7 +236,7 @@ function strip_space(bc::SpectralBroadcasted{Style}, parent_space) where {Style} new_space = placeholder_space(current_space, parent_space) return SpectralBroadcasted{Style}( bc.op, - map(arg -> strip_space(arg, current_space), bc.args), + strip_space_args(bc.args, current_space), new_space, ) end diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl index 714dbaa1c6..66a084c60b 100644 --- a/test/MatrixFields/matrix_field_broadcasting.jl +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -1,8 +1,9 @@ using Test using JET -using Random: seed! -using LinearAlgebra: I, mul! 
-using BandedMatrices: band +import CUDA +import BandedMatrices: band +import LinearAlgebra: I, mul! +import Random: seed! using ClimaCore.MatrixFields import ClimaCore: @@ -43,13 +44,23 @@ function test_matrix_broadcast_against_array_reference(; inputs, get_result::F1, set_result!::F2, - temp_values = (), - ref_set_result!::F3, + get_temp_values::F3 = (_...) -> (), + ref_set_result!::F4, time_ratio_limit = 1, - max_error_limit = 3, -) where {F1, F2, F3} + max_error_limit = 4, + test_broken_with_cuda = false, +) where {F1, F2, F3, F4} @testset "$test_name" begin + is_using_cuda = ClimaComms.device(inputs[1]) isa ClimaComms.CUDADevice + ignore_cuda = (AnyFrameModule(CUDA),) + + if test_broken_with_cuda && is_using_cuda + @test_throws CUDA.InvalidIRError get_result(inputs...) + return + end + result = get_result(inputs...) + temp_values = get_temp_values(inputs...) # Fill all output fields with NaNs for testing correctness. result .*= NaN @@ -85,9 +96,10 @@ function test_matrix_broadcast_against_array_reference(; @test best_time < time_ratio_limit * best_ref_time # Test set_result! for correctness, allocations, and type instabilities. + # Ignore the type instabilities in CUDA and the allocations they incur. @test max_error <= max_error_limit - @test (@allocated set_result!(result, inputs...)) == 0 - @test_opt set_result!(result, inputs...) + @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0 + @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...) # Test ref_set_result! for allocations and type instabilities. This is # helpful for ensuring that the performance comparison is fair. @@ -105,8 +117,8 @@ function test_matrix_broadcast_against_array_reference(; ) # Test get_result (the allocating version of set_result!) for type - # instabilities. - @test_opt get_result(inputs...) + # instabilities. Ignore the type instabilities in CUDA. + @test_opt ignored_modules = ignore_cuda get_result(inputs...) 
end end @@ -119,6 +131,9 @@ function test_matrix_broadcast_against_reference(; ref_set_result!::F3, ) where {F1, F2, F3} @testset "$test_name" begin + is_using_cuda = ClimaComms.device(inputs[1]) isa ClimaComms.CUDADevice + ignore_cuda = (AnyFrameModule(CUDA),) + result = get_result(inputs...) # Fill the output field with NaNs for testing correctness. @@ -136,18 +151,30 @@ function test_matrix_broadcast_against_reference(; @test best_time < best_ref_time # Test set_result! for correctness, allocations, and type instabilities. - @test result == ref_result - @test (@allocated set_result!(result, inputs...)) == 0 - @test_opt set_result!(result, inputs...) + # Ignore the type instabilities in CUDA and the allocations they incur. + # Account for the roundoff errors that CUDA introduces. + @test if is_using_cuda + error_array = abs.(parent(result) .- parent(ref_result)) + CUDA.@allowscalar maximum(error_array) < eps(eltype(error_array)) + else + result == ref_result + end + @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0 + @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...) # Test ref_set_result! for allocations and type instabilities. This is - # helpful for ensuring that the performance comparison is fair. - @test (@allocated ref_set_result!(ref_result, ref_inputs...)) == 0 - @test_opt ref_set_result!(ref_result, ref_inputs...) + # helpful for ensuring that the performance comparison is fair. Ignore + # the type instabilities in CUDA and the allocations they incur. + @test is_using_cuda || + (@allocated ref_set_result!(ref_result, ref_inputs...)) == 0 + @test_opt ignored_modules = ignore_cuda ref_set_result!( + ref_result, + ref_inputs..., + ) # Test get_result (the allocating version of set_result!) for type - # instabilities. - @test_opt get_result(inputs...) + # instabilities. Ignore the type instabilities in CUDA. + @test_opt ignored_modules = ignore_cuda get_result(inputs...) 
end end @@ -167,12 +194,14 @@ function random_test_fields(::Type{FT}) where {FT} boundary_tags = (:bottom, :top), ) vmesh = Meshes.IntervalMesh(vdomain, nelems = velem) - vtopology = Tpologies.IntervalTopology(comms_ctx, vmesh) + vtopology = Topologies.IntervalTopology(comms_ctx, vmesh) vspace = Spaces.CenterFiniteDifferenceSpace(vtopology) sfc_coord = Fields.coordinate_field(hspace) - hypsography = Hypsography.LinearAdaption( - @. cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1 - ) + hypsography = + comms_ctx.device isa ClimaComms.CUDADevice ? Hypsography.Flat() : + Hypsography.LinearAdaption( + @. cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1 + ) # TODO: FD operators don't currently work with hypsography on GPUs. center_space = Spaces.ExtrudedFiniteDifferenceSpace(hspace, vspace, hypsography) face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(center_space) @@ -180,12 +209,12 @@ function random_test_fields(::Type{FT}) where {FT} ᶠcoord = Fields.coordinate_field(face_space) seed!(1) # ensures reproducibility - ᶜᶜmat = map(_ -> DiagonalMatrixRow(rand(FT, 1)...), ᶜcoord) - ᶜᶠmat = map(_ -> BidiagonalMatrixRow(rand(FT, 2)...), ᶜcoord) - ᶠᶠmat = map(_ -> TridiagonalMatrixRow(rand(FT, 3)...), ᶠcoord) - ᶠᶜmat = map(_ -> QuaddiagonalMatrixRow(rand(FT, 4)...), ᶠcoord) - ᶜvec = map(_ -> rand(FT), ᶜcoord) - ᶠvec = map(_ -> rand(FT), ᶠcoord) + ᶜᶜmat = map(c -> DiagonalMatrixRow(ntuple(_ -> rand(FT), 1)...), ᶜcoord) + ᶜᶠmat = map(c -> BidiagonalMatrixRow(ntuple(_ -> rand(FT), 2)...), ᶜcoord) + ᶠᶠmat = map(c -> TridiagonalMatrixRow(ntuple(_ -> rand(FT), 3)...), ᶠcoord) + ᶠᶜmat = map(c -> QuaddiagonalMatrixRow(ntuple(_ -> rand(FT), 4)...), ᶠcoord) + ᶜvec = map(c -> rand(FT), ᶜcoord) + ᶠvec = map(c -> rand(FT), ᶠcoord) return ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec end @@ -257,7 +286,8 @@ end (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), - temp_values = ((@. ᶜᶜmat ⋅ ᶜᶠmat), (@. 
ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat)), + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + ((@. ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat)), ref_set_result! = ( _result, _ᶜᶜmat, @@ -282,7 +312,8 @@ end (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), - temp_values = ((@. ᶠᶠmat ⋅ ᶠᶜmat), (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + ((@. ᶠᶠmat ⋅ ᶠᶜmat), (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), ref_set_result! = ( _result, _ᶜᶜmat, @@ -296,6 +327,7 @@ end mul!(_temp2, _ᶜᶠmat, _temp1) mul!(_result, _ᶜᶜmat, _temp2) end, + test_broken_with_cuda = true, # TODO: Fix this. ) test_matrix_broadcast_against_array_reference(; @@ -307,7 +339,7 @@ end (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> ( (@. ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), @@ -339,7 +371,7 @@ end (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> ( (@. ᶠᶜmat ⋅ ᶜvec), (@. ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)), (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec))), @@ -361,6 +393,7 @@ end mul!(_result, _ᶜᶜmat, _temp3) end, time_ratio_limit = 15, # This case's ref function is fast on Buildkite. + test_broken_with_cuda = true, # TODO: Fix this. ) test_matrix_broadcast_against_array_reference(; @@ -370,7 +403,7 @@ end (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( (@. 2 * ᶠᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), (@. 
2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), @@ -404,7 +437,7 @@ end (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( (@. ᶠᶜmat ⋅ ᶜᶜmat), (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶠᶠmat / 3), @@ -438,7 +471,7 @@ end (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = ᶜᶠmat ⋅ (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( (@. 2 * ᶠᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), @@ -465,7 +498,7 @@ end @. _temp5 = _temp3 + _temp4 / 3 - _temp5 mul!(_result, _ᶜᶠmat, _temp5) end, - max_error_limit = 276, # Why is there a 276 * eps error on Buildkite? + max_error_limit = 296, # Why is there a 296 * eps error on Buildkite? ) test_matrix_broadcast_against_array_reference(; @@ -477,7 +510,7 @@ end set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( (@. 2 * ᶠᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), @@ -520,7 +553,7 @@ end @. _temp10 = _temp7 * 2 - _temp9 + _temp10 mul!(_result, _temp5, _temp10) end, - max_error_limit = 44, # Why is there a 44 * eps error on Buildkite? + max_error_limit = 2496, # Why is there a 2496 * eps error on Buildkite? ) test_matrix_broadcast_against_array_reference(; @@ -538,7 +571,7 @@ end ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)) ⋅ ᶠᶠmat), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( (@. ᶠᶜmat ⋅ ᶜᶠmat), (@. 2 * ᶠᶜmat), (@. 
2 * ᶠᶜmat ⋅ ᶜᶜmat), @@ -615,7 +648,7 @@ end BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅ DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) -> ( (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec)), (@. TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1)), (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ From 60e729f5ef49ea1adba82456ab7ab19ac3ea7039 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 11:25:22 -0700 Subject: [PATCH 04/10] Additional fixes --- src/MatrixFields/matrix_field_utils.jl | 40 +++++++++---------- .../MatrixFields/matrix_field_broadcasting.jl | 26 ++++++------ 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl index 4a4d4c350e..7883634892 100644 --- a/src/MatrixFields/matrix_field_utils.jl +++ b/src/MatrixFields/matrix_field_utils.jl @@ -45,19 +45,19 @@ function column_field2array(field::Fields.FiniteDifferenceField) ) for (index_of_field_entry, matrix_d) in enumerate(matrix_ld:matrix_ud) matrix_diagonal = view(matrix, band(matrix_d)) - field_diagonal = field.entries.:($index_of_field_entry) - field_diagonal_data = - vec(reinterpret(eltype(eltype(field)), parent(field_diagonal)')) + diagonal_field = field.entries.:($index_of_field_entry) + diagonal_data = + vec(reinterpret(eltype(eltype(field)), parent(diagonal_field)')) - # Find the rows for which field_diagonal_data[row] is in the matrix. + # Find the rows for which diagonal_data[row] is in the matrix. # Note: The matrix index (1, 1) corresponds to the diagonal index 0, # and the matrix index (n_rows, n_cols) corresponds to the diagonal # index n_cols - n_rows. first_row = matrix_d < 0 ? 1 - matrix_d : 1 last_row = matrix_d < n_cols - n_rows ? 
n_rows : n_cols - matrix_d - @allowscalar matrix_diagonal .= - view(field_diagonal_data, first_row:last_row) + diagonal_data_view = view(diagonal_data, first_row:last_row) + @allowscalar copyto!(matrix_diagonal, diagonal_data_view) end return matrix else # field represents a vector @@ -73,7 +73,7 @@ the data stored in the field. """ function column_field2array_view(field::Fields.FiniteDifferenceField) if eltype(field) <: BandMatrixRow # field represents a matrix - n_rows, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) + _, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) field_data_transpose = reinterpret(eltype(eltype(field)), parent(field)') matrix_transpose = @@ -86,11 +86,12 @@ function column_field2array_view(field::Fields.FiniteDifferenceField) end end -column_indices(field::Fields.FiniteDifferenceField) = (((1, 1), 1),) -column_indices(field::Fields.ExtrudedFiniteDifferenceField) = +all_columns(::Fields.FiniteDifferenceField) = (((1, 1), 1),) +all_columns(field::Fields.ExtrudedFiniteDifferenceField) = (Spaces.all_nodes(Spaces.horizontal_space(axes(field)))...,) -column_indices(field) = - error("Invalid space type: $(typeof(axes(field)).name.wrapper)") + +column_map(f::F, field) where {F} = + map((((i, j), h),) -> f(Spaces.column(field, i, j, h)), all_columns(field)) """ field2arrays(field) @@ -100,10 +101,7 @@ Converts a field defined on a `FiniteDifferenceSpace` or on an corresponds to a column of the field. This is done by calling `column_field2array` on each of the field's columns. """ -field2arrays(field) = - map(column_indices(field)) do ((i, j), h) - column_field2array(Spaces.column(field, i, j, h)) - end +field2arrays(field) = column_map(column_field2array, field) """ field2arrays_view(field) @@ -111,18 +109,18 @@ field2arrays(field) = Similar to `field2arrays(field)`, except that this version calls `column_field2array_view` instead of `column_field2array`. 
""" -field2arrays_view(field) = - map(column_indices(field)) do ((i, j), h) - column_field2array_view(Spaces.column(field, i, j, h)) - end +field2arrays_view(field) = column_map(column_field2array_view, field) function Base.show( io::IO, field::Fields.Field{<:Fields.AbstractData{<:BandMatrixRow}}, ) + is_matrix_field = + field isa Fields.FiniteDifferenceField || + field isa Fields.ExtrudedFiniteDifferenceField print(io, eltype(field), "-valued Field") - if eltype(eltype(field)) <: Number - if axes(field) isa Spaces.FiniteDifferenceSpace + if is_matrix_field && eltype(eltype(field)) <: Number + if field isa Fields.FiniteDifferenceField println(io, " that corresponds to the matrix") else println(io, " whose first column corresponds to the matrix") diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl index 66a084c60b..7c1465bdfc 100644 --- a/test/MatrixFields/matrix_field_broadcasting.jl +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -2,7 +2,7 @@ using Test using JET import CUDA import BandedMatrices: band -import LinearAlgebra: I, mul! +import LinearAlgebra: I, mul!, norm import Random: seed! using ClimaCore.MatrixFields @@ -47,7 +47,7 @@ function test_matrix_broadcast_against_array_reference(; get_temp_values::F3 = (_...) -> (), ref_set_result!::F4, time_ratio_limit = 1, - max_error_limit = 4, + max_eps_error_limit = 6, test_broken_with_cuda = false, ) where {F1, F2, F3, F4} @testset "$test_name" begin @@ -56,6 +56,7 @@ function test_matrix_broadcast_against_array_reference(; if test_broken_with_cuda && is_using_cuda @test_throws CUDA.InvalidIRError get_result(inputs...) 
+ @warn "$test_name:\n\tCUDA.InvalidIRError" return end @@ -84,20 +85,19 @@ function test_matrix_broadcast_against_array_reference(; result_arrays = MatrixFields.field2arrays(result) max_error = maximum(zip(result_arrays, ref_result_arrays)) do (array, ref_array) - maximum(zip(array, ref_array)) do (value, ref_value) - Int(abs(value - ref_value) / eps(ref_value)) - end + maximum(abs.(array .- ref_array)) end + max_eps_error = Int(max_error / eps(typeof(max_error))) @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \ - = $best_ref_time s\n\tMaximum Error = $max_error eps" + = $best_ref_time s\n\tMaximum Error = $max_eps_error eps" # Test that set_result! is performant compared to ref_set_result!. - @test best_time < time_ratio_limit * best_ref_time + @test best_time / best_ref_time < time_ratio_limit # Test set_result! for correctness, allocations, and type instabilities. # Ignore the type instabilities in CUDA and the allocations they incur. - @test max_error <= max_error_limit + @test max_eps_error <= max_eps_error_limit @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0 @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...) @@ -154,8 +154,8 @@ function test_matrix_broadcast_against_reference(; # Ignore the type instabilities in CUDA and the allocations they incur. # Account for the roundoff errors that CUDA introduces. @test if is_using_cuda - error_array = abs.(parent(result) .- parent(ref_result)) - CUDA.@allowscalar maximum(error_array) < eps(eltype(error_array)) + max_error = maximum(abs.(parent(result) .- parent(ref_result))) + max_error < eps(typeof(max_error)) else result == ref_result end @@ -461,7 +461,6 @@ end copyto!(_result, 4I) # We can't directly use I in array broadcasts. @. _result = _temp2 * 2 - _temp4 + _result end, - max_error_limit = 512, # Why is there a 512 * eps error on Buildkite? ) test_matrix_broadcast_against_array_reference(; @@ -498,7 +497,6 @@ end @. 
_temp5 = _temp3 + _temp4 / 3 - _temp5 mul!(_result, _ᶜᶠmat, _temp5) end, - max_error_limit = 296, # Why is there a 296 * eps error on Buildkite? ) test_matrix_broadcast_against_array_reference(; @@ -553,7 +551,7 @@ end @. _temp10 = _temp7 * 2 - _temp9 + _temp10 mul!(_result, _temp5, _temp10) end, - max_error_limit = 2496, # Why is there a 2496 * eps error on Buildkite? + max_eps_error_limit = 30, # This case's roundoff error is large on GPUs. ) test_matrix_broadcast_against_array_reference(; @@ -633,7 +631,7 @@ end mul!(_result, _temp14, _ᶠᶠmat) end, time_ratio_limit = 2, # This case's ref function is fast on Buildkite. - max_error_limit = 3038, # Why is there a 3038 * eps error on Buildkite? + max_eps_error_limit = 70, # This case's roundoff error is large on GPUs. ) test_matrix_broadcast_against_array_reference(; From 4ce2749c66ea58e23548e2aa2429e03a5a64a1ed Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 13:36:55 -0700 Subject: [PATCH 05/10] More fixes --- test/MatrixFields/matrix_field_broadcasting.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl index 7c1465bdfc..3650bfe9f0 100644 --- a/test/MatrixFields/matrix_field_broadcasting.jl +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -2,7 +2,7 @@ using Test using JET import CUDA import BandedMatrices: band -import LinearAlgebra: I, mul!, norm +import LinearAlgebra: I, mul! import Random: seed! using ClimaCore.MatrixFields @@ -47,7 +47,7 @@ function test_matrix_broadcast_against_array_reference(; get_temp_values::F3 = (_...) 
-> (), ref_set_result!::F4, time_ratio_limit = 1, - max_eps_error_limit = 6, + max_eps_error_limit = 7, test_broken_with_cuda = false, ) where {F1, F2, F3, F4} @testset "$test_name" begin @@ -87,7 +87,7 @@ function test_matrix_broadcast_against_array_reference(; maximum(zip(result_arrays, ref_result_arrays)) do (array, ref_array) maximum(abs.(array .- ref_array)) end - max_eps_error = Int(max_error / eps(typeof(max_error))) + max_eps_error = ceil(Int, max_error / eps(typeof(max_error))) @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \ = $best_ref_time s\n\tMaximum Error = $max_eps_error eps" @@ -97,7 +97,7 @@ function test_matrix_broadcast_against_array_reference(; # Test set_result! for correctness, allocations, and type instabilities. # Ignore the type instabilities in CUDA and the allocations they incur. - @test max_eps_error <= max_eps_error_limit + @test max_eps_error < max_eps_error_limit @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0 @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...) From 851e5a9f7393a1356c5adc7629715e5283914525 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 15:36:06 -0700 Subject: [PATCH 06/10] Address comments --- src/MatrixFields/matrix_field_utils.jl | 38 +++++++++---------- .../MatrixFields/matrix_field_broadcasting.jl | 16 ++++---- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl index 7883634892..56de58e185 100644 --- a/src/MatrixFields/matrix_field_utils.jl +++ b/src/MatrixFields/matrix_field_utils.jl @@ -1,28 +1,26 @@ +# Finds the diagonal index of the value that ends up in the bottom-right corner +# of the BandedMatrix generated from a matrix field. 
+bottom_corner_matrix_d(_, ::Int) = 0 # center-to-center or face-to-face matrix +bottom_corner_matrix_d(::Spaces.CellCenter, ::PlusHalf) = 1 # face-to-center +bottom_corner_matrix_d(::Spaces.CellFace, ::PlusHalf) = -1 # center-to-face + +# Finds the amount by which the field's diagonal indices get shifted when it is +# converted into a BandedMatrix (whose diagonal indices have to be integers). +matrix_d_minus_field_d(_, ::Int) = 0 # center-to-center or face-to-face matrix +matrix_d_minus_field_d(::Spaces.CellCenter, ::PlusHalf) = half # face-to-center +matrix_d_minus_field_d(::Spaces.CellFace, ::PlusHalf) = -half # center-to-face + function banded_matrix_info(field) - space = axes(field) field_ld, field_ud = outer_diagonals(eltype(field)) - - # Find the diagonal index of the value that ends up in the bottom-right - # corner of the matrix, as well as the amount by which the field's diagonal - # indices get shifted when it is converted into a matrix. - bottom_corner_matrix_d, matrix_d_minus_field_d = if field_ld isa PlusHalf - if space.staggering isa Spaces.CellCenter - 1, half # field is a face-to-center matrix - else - -1, -half # field is a center-to-face matrix - end - else - 0, 0 # field is either a center-to-center or face-to-face matrix - end - + space = axes(field) + staggering = space.staggering n_rows = Spaces.nlevels(space) - n_cols = n_rows + bottom_corner_matrix_d - matrix_ld = field_ld + matrix_d_minus_field_d - matrix_ud = field_ud + matrix_d_minus_field_d + n_cols = n_rows + bottom_corner_matrix_d(staggering, field_ld) + matrix_ld = field_ld + matrix_d_minus_field_d(staggering, field_ld) + matrix_ud = field_ud + matrix_d_minus_field_d(staggering, field_ld) matrix_ld <= 0 && matrix_ud >= 0 || error("BandedMatrices.jl does not yet support matrices that have \ diagonals with indices in the range $matrix_ld:$matrix_ud") - return n_rows, n_cols, matrix_ld, matrix_ud end @@ -88,7 +86,7 @@ end all_columns(::Fields.FiniteDifferenceField) = (((1, 1), 1),) 
all_columns(field::Fields.ExtrudedFiniteDifferenceField) = - (Spaces.all_nodes(Spaces.horizontal_space(axes(field)))...,) + Spaces.all_nodes(Spaces.horizontal_space(axes(field))) column_map(f::F, field) where {F} = map((((i, j), h),) -> f(Spaces.column(field, i, j, h)), all_columns(field)) diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl index 3650bfe9f0..8f68b0a600 100644 --- a/test/MatrixFields/matrix_field_broadcasting.jl +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -26,18 +26,18 @@ macro benchmark(expression) end end -# This has to be its own function in order to correctly measure its allocations. -call_array_func( +# This function is used for benchmarking ref_set_result!. +function call_array_func( ref_set_result!::F, ref_result_arrays, inputs_arrays, temp_values_arrays, -) where {F} = foreach( - ref_set_result!, - ref_result_arrays, - inputs_arrays..., - temp_values_arrays..., -) +) where {F} + for arrays in + zip(ref_result_arrays, inputs_arrays..., temp_values_arrays...) + ref_set_result!(arrays...) 
+ end +end function test_matrix_broadcast_against_array_reference(; test_name, From 3ce6f45976db1a08bc93517b4c4dd9edb0295f13 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 15:37:14 -0700 Subject: [PATCH 07/10] Try removing check-bounds from GPU job to see if that fixes the PTX error --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 75bc995f53..45617e88f1 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -515,7 +515,7 @@ steps: - label: "Unit: matrix field broadcasting (GPU)" key: unit_matrix_field_broadcasting_gpu - command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + command: "julia --color=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" agents: slurm_gpus: 1 From 0c61d69860662f9d43f944edcbd638fa5ed45cd7 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 16:08:45 -0700 Subject: [PATCH 08/10] Try allocating more memory for GPU job instead --- .buildkite/pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 45617e88f1..8fb892dd49 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -515,9 +515,10 @@ steps: - label: "Unit: matrix field broadcasting (GPU)" key: unit_matrix_field_broadcasting_gpu - command: "julia --color=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" agents: slurm_gpus: 1 + slurm_mem: 20GB - group: "Unit: Hypsography" steps: From 22d0e466c94d939997f67f4d92d2b593af12dc97 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 16:53:30 -0700 Subject: [PATCH 09/10] Remove check-bounds again and use more memory --- .buildkite/pipeline.yml | 4 ++-- 
src/MatrixFields/matrix_field_utils.jl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8fb892dd49..42a855062b 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -515,10 +515,10 @@ steps: - label: "Unit: matrix field broadcasting (GPU)" key: unit_matrix_field_broadcasting_gpu - command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + command: "julia --color=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" agents: slurm_gpus: 1 - slurm_mem: 20GB + slurm_mem: 80GB - group: "Unit: Hypsography" steps: diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl index 56de58e185..0d8470d6a3 100644 --- a/src/MatrixFields/matrix_field_utils.jl +++ b/src/MatrixFields/matrix_field_utils.jl @@ -95,7 +95,7 @@ column_map(f::F, field) where {F} = field2arrays(field) Converts a field defined on a `FiniteDifferenceSpace` or on an -`ExtrudedFiniteDifferenceSpace` into a tuple of arrays, each of which +`ExtrudedFiniteDifferenceSpace` into a collection of arrays, each of which corresponds to a column of the field. This is done by calling `column_field2array` on each of the field's columns. 
""" From c40b2df9b03f498547bca9e388297d619249e1c7 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Wed, 12 Jul 2023 17:24:43 -0700 Subject: [PATCH 10/10] Address comments --- .buildkite/pipeline.yml | 2 +- src/MatrixFields/MatrixFields.jl | 6 +- src/MatrixFields/matrix_field_utils.jl | 13 +- src/MatrixFields/matrix_multiplication.jl | 13 +- src/MatrixFields/rmul_with_projection.jl | 153 +++++++++++++++++++--- src/Operators/common.jl | 26 ++-- test/MatrixFields/rmul_with_projection.jl | 9 +- 7 files changed, 171 insertions(+), 51 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 42a855062b..ff02381cdb 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -518,7 +518,7 @@ steps: command: "julia --color=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" agents: slurm_gpus: 1 - slurm_mem: 80GB + slurm_mem: 20GB - group: "Unit: Hypsography" steps: diff --git a/src/MatrixFields/MatrixFields.jl b/src/MatrixFields/MatrixFields.jl index 0a0f241acd..17214e97f6 100644 --- a/src/MatrixFields/MatrixFields.jl +++ b/src/MatrixFields/MatrixFields.jl @@ -1,12 +1,12 @@ module MatrixFields import CUDA: @allowscalar -import LinearAlgebra: UniformScaling, Adjoint -import StaticArrays: SArray, SMatrix, SVector +import LinearAlgebra: UniformScaling, Adjoint, AdjointAbsVec +import StaticArrays: SMatrix, SVector import BandedMatrices: BandedMatrix, band, _BandedMatrix import ..Utilities: PlusHalf, half import ..RecursiveApply: - rmap, rpromote_type, rzero, rconvert, radd, rsub, rmul, rdiv + rmap, rmaptype, rpromote_type, rzero, rconvert, radd, rsub, rmul, rdiv import ..Geometry import ..Spaces import ..Fields diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl index 0d8470d6a3..d73e1f218e 100644 --- a/src/MatrixFields/matrix_field_utils.jl +++ b/src/MatrixFields/matrix_field_utils.jl @@ -84,9 +84,16 @@ function column_field2array_view(field::Fields.FiniteDifferenceField) end end 
-all_columns(::Fields.FiniteDifferenceField) = (((1, 1), 1),) -all_columns(field::Fields.ExtrudedFiniteDifferenceField) = - Spaces.all_nodes(Spaces.horizontal_space(axes(field))) +all_columns(::Fields.ColumnField) = (((1, 1), 1),) +all_columns(field) = all_columns(axes(field)) +all_columns(space::Spaces.ExtrudedFiniteDifferenceSpace) = + Spaces.all_nodes(Spaces.horizontal_space(space)) + +# TODO: Unify FiniteDifferenceField and ColumnField so that we can use this +# version instead. +# all_columns(::Fields.FiniteDifferenceField) = (((1, 1), 1),) +# all_columns(field::Fields.ExtrudedFiniteDifferenceField) = +# Spaces.all_nodes(Spaces.horizontal_space(axes(field))) column_map(f::F, field) where {F} = map((((i, j), h),) -> f(Spaces.column(field, i, j, h)), all_columns(field)) diff --git a/src/MatrixFields/matrix_multiplication.jl b/src/MatrixFields/matrix_multiplication.jl index 45ddaba9bc..08d0bed830 100644 --- a/src/MatrixFields/matrix_multiplication.jl +++ b/src/MatrixFields/matrix_multiplication.jl @@ -199,14 +199,6 @@ Operators.right_interior_idx( arg, ) = matrix_right_interior_index(space, outer_diagonals(eltype(matrix1))[2]) -function rmul_type(::Type{T1}, ::Type{T2}, ::Type{LG}) where {T1, T2, LG} - type = Base._return_type(rmul_with_projection, Tuple{T1, T2, LG}) - type == Union{} && - error("Unable to infer result type: Calling rmul_with_projection with \ - arguments of types $T1 and $T2 will cause it to throw an error") - return type -end - function Operators.return_eltype( ::MultiplyColumnwiseBandMatrixField, matrix1, @@ -216,18 +208,17 @@ function Operators.return_eltype( "The first argument of ⋅ must have elements of type BandMatrixRow, but \ the given argument has elements of type $(eltype(matrix1))", ) - lg_type = Operators.local_geometry_type(axes(arg)) if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication matrix2 = arg ld1, ud1 = outer_diagonals(eltype(matrix1)) ld2, ud2 = outer_diagonals(eltype(matrix2)) prod_ld, prod_ud = ld1 + ld2, ud1 + 
ud2 prod_value_type = - rmul_type(eltype(eltype(matrix1)), eltype(eltype(matrix2)), lg_type) + rmul_return_type(eltype(eltype(matrix1)), eltype(eltype(matrix2))) return band_matrix_row_type(prod_ld, prod_ud, prod_value_type) else # matrix-vector multiplication vector = arg - return rmul_type(eltype(eltype(matrix1)), eltype(vector), lg_type) + return rmul_return_type(eltype(eltype(matrix1)), eltype(vector)) end end diff --git a/src/MatrixFields/rmul_with_projection.jl b/src/MatrixFields/rmul_with_projection.jl index 76f9846b68..9cc449583f 100644 --- a/src/MatrixFields/rmul_with_projection.jl +++ b/src/MatrixFields/rmul_with_projection.jl @@ -1,26 +1,27 @@ const SingleValue = Union{Number, Geometry.AxisTensor, Geometry.AdjointAxisTensor} +""" + mul_with_projection(x, y, lg) + +Similar to `x * y`, except that this version automatically projects `y` to avoid +`DimensionMismatch` errors for `AxisTensor`s. For example, if `x` is a covector +along the `Covariant3Axis` (e.g., `Covariant3Vector(1)'`), then `y` will be +projected onto the `Contravariant3Axis`. In general, the first axis of `y` will +be projected onto the dual of the last axis of `x`. +""" mul_with_projection(x, y, _) = x * y -mul_with_projection(x::Geometry.AdjointAxisVector, y::Geometry.AxisTensor, lg) = - x * Geometry.project(Geometry.dual(axes(x, 2)), y, lg) -mul_with_projection(::Geometry.AdjointAxisTensor, ::Geometry.AxisTensor, _) = - error("mul_with_projection is currently only implemented for covectors, \ - and higher-order cotensors are not supported") -# We should add methods for other cotensors (e.g., AdjointAxis2Tensor) when they -# are needed (e.g., when we need to support matrices that represent the -# divergence of higher-order tensors). 
+mul_with_projection( + x::Union{Geometry.AdjointAxisVector, Geometry.Axis2TensorOrAdj}, + y::Geometry.AxisTensor, + lg, +) = x * Geometry.project(Geometry.dual(axes(x)[2]), y, lg) """ rmul_with_projection(x, y, lg) -Similar to `rmul(x, y)`, but with automatic projection of `y` when `x` contains -a covector (i.e, an `AdjointAxisVector`). For example, if `x` is a covector -along the `Covariant3Axis` (e.g., `Covariant3Vector(1)'`), then `y` (or each -element of `y`) will be projected onto the `Contravariant3Axis`. In general, -each covector in `x` will cause `y` (or each corresponding element of `y`) to -be projected onto the dual axis of the covector. In the future, we may extend -this behavior to higher-order cotensors. +Similar to `rmul(x, y)`, except that this version calls `mul_with_projection` +instead of `*`. """ rmul_with_projection(x, y, lg) = rmap((x′, y′) -> mul_with_projection(x′, y′, lg), x, y) @@ -30,3 +31,125 @@ rmul_with_projection(x, y::SingleValue, lg) = rmap(x′ -> mul_with_projection(x′, y, lg), x) rmul_with_projection(x::SingleValue, y::SingleValue, lg) = mul_with_projection(x, y, lg) + +axis_tensor_type(::Type{T}, ::Type{Tuple{A1}}) where {T, A1} = + Geometry.AxisVector{T, A1, SVector{length(A1.instance), T}} +function axis_tensor_type(::Type{T}, ::Type{Tuple{A1, A2}}) where {T, A1, A2} + N1 = length(A1.instance) + N2 = length(A2.instance) + S = SMatrix{N1, N2, T, N1 * N2} + return Geometry.Axis2Tensor{T, Tuple{A1, A2}, S} +end + +adjoint_type(::Type{A}) where {A} = Adjoint{eltype(A), A} +adjoint_type(::Type{A}) where {T, S, A <: Adjoint{T, S}} = S + +""" + mul_return_type(X, Y) + +Computes the return type of `mul_with_projection(x, y, lg)`, where `x isa X` +and `y isa Y`. This can also be used to obtain the return type of `x * y`, +although `x * y` will throw an error when projection is necessary. 
+ +Note: this should be equivalent to calling the internal function `_return_type`: +`Base._return_type(mul_with_projection, Tuple{X, Y, LG})`, where `lg isa LG`. +""" +mul_return_type(::Type{X}, ::Type{Y}) where {X, Y} = error( + "Unable to infer return type: Multiplying an object of type $X with an \ + object of type $Y will result in a method error", +) +# Note: If the behavior of * changes for any relevant types, the corresponding +# methods below should be updated. + +# Methods from Base: +mul_return_type(::Type{X}, ::Type{Y}) where {X <: Number, Y <: Number} = + promote_type(X, Y) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where {X <: AdjointAbsVec, Y <: AbstractMatrix} = + adjoint_type(mul_return_type(adjoint_type(Y), adjoint_type(X))) + +# Methods from ClimaCore: +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where {T, N, A, X <: Number, Y <: Geometry.AxisTensor{T, N, A}} = + axis_tensor_type(promote_type(X, T), A) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where {T, N, A, X <: Geometry.AxisTensor{T, N, A}, Y <: Number} = + axis_tensor_type(promote_type(T, Y), A) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where {T, N, A, X <: Number, Y <: Geometry.AdjointAxisTensor{T, N, A}} = + adjoint_type(axis_tensor_type(promote_type(X, T), A)) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where {T, N, A, X <: Geometry.AdjointAxisTensor{T, N, A}, Y <: Number} = + adjoint_type(axis_tensor_type(promote_type(T, Y), A)) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where { + T1, + T2, + X <: Geometry.AdjointAxisVector{T1}, + Y <: Geometry.AxisVector{T2}, +} = promote_type(T1, T2) # This comes from the definition of dot. 
+mul_return_type( + ::Type{X}, + ::Type{Y}, +) where { + T1, + T2, + A1, + A2, + X <: Geometry.AxisVector{T1, A1}, + Y <: Geometry.AdjointAxisVector{T2, A2}, +} = axis_tensor_type(promote_type(T1, T2), Tuple{A1, A2}) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where { + T1, + T2, + A1, + X <: Geometry.Axis2TensorOrAdj{T1, <:Tuple{A1, Any}}, + Y <: Geometry.AxisVector{T2}, +} = axis_tensor_type(promote_type(T1, T2), Tuple{A1}) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where { + T1, + T2, + A1, + A2, + X <: Geometry.Axis2TensorOrAdj{T1, <:Tuple{A1, Any}}, + Y <: Geometry.Axis2TensorOrAdj{T2, <:Tuple{Any, A2}}, +} = axis_tensor_type(promote_type(T1, T2), Tuple{A1, A2}) + +""" + rmul_return_type(X, Y) + +Computes the return type of `rmul_with_projection(x, y, lg)`, where `x isa X` +and `y isa Y`. This can also be used to obtain the return type of `rmul(x, y)`, +although `rmul(x, y)` will throw an error when projection is necessary. + +Note: this should be equivalent to calling the internal function `_return_type`: +`Base._return_type(rmul_with_projection, Tuple{X, Y, LG})`, where `lg isa LG`. +""" +rmul_return_type(::Type{X}, ::Type{Y}) where {X, Y} = + rmaptype((X′, Y′) -> mul_return_type(X′, Y′), X, Y) +rmul_return_type(::Type{X}, ::Type{Y}) where {X <: SingleValue, Y} = + rmaptype(Y′ -> mul_return_type(X, Y′), Y) +rmul_return_type(::Type{X}, ::Type{Y}) where {X, Y <: SingleValue} = + rmaptype(X′ -> mul_return_type(X′, Y), X) +rmul_return_type( + ::Type{X}, + ::Type{Y}, +) where {X <: SingleValue, Y <: SingleValue} = mul_return_type(X, Y) diff --git a/src/Operators/common.jl b/src/Operators/common.jl index 07315befdc..44a5ddef17 100644 --- a/src/Operators/common.jl +++ b/src/Operators/common.jl @@ -65,34 +65,33 @@ end # when sending Broadcasted objects to the GPU, we strip out the space # information at each level of the broadcast tree and Fields, replacing with # PlaceholderSpace. 
this reduces the amount runtime parameter data we send to -# the GPU, which is quite limited (~4kB). However, we need to keep the eltype of -# the local geometry data, since it may needed in order to infer the return -# eltype of a Field operation. +# the GPU, which is quite limited (~4kB). # Functions for CUDASpectralStyle -struct PlaceholderSpace{LG} <: Spaces.AbstractSpace end -struct CenterPlaceholderSpace{LG} <: Spaces.AbstractSpace end -struct FacePlaceholderSpace{LG} <: Spaces.AbstractSpace end +struct PlaceholderSpace <: Spaces.AbstractSpace end +struct CenterPlaceholderSpace <: Spaces.AbstractSpace end +struct FacePlaceholderSpace <: Spaces.AbstractSpace end + placeholder_space(current_space::T, parent_space::T) where {T} = - PlaceholderSpace{local_geometry_type(current_space)}() + PlaceholderSpace() placeholder_space(current_space, parent_space) = current_space placeholder_space( current_space::Spaces.CenterFiniteDifferenceSpace, parent_space::Spaces.FaceFiniteDifferenceSpace, -) = CenterPlaceholderSpace{local_geometry_type(current_space)}() +) = CenterPlaceholderSpace() placeholder_space( current_space::Spaces.CenterExtrudedFiniteDifferenceSpace, parent_space::Spaces.FaceExtrudedFiniteDifferenceSpace, -) = CenterPlaceholderSpace{local_geometry_type(current_space)}() +) = CenterPlaceholderSpace() placeholder_space( current_space::Spaces.FaceFiniteDifferenceSpace, parent_space::Spaces.CenterFiniteDifferenceSpace, -) = FacePlaceholderSpace{local_geometry_type(current_space)}() +) = FacePlaceholderSpace() placeholder_space( current_space::Spaces.FaceExtrudedFiniteDifferenceSpace, parent_space::Spaces.CenterExtrudedFiniteDifferenceSpace, -) = FacePlaceholderSpace{local_geometry_type(current_space)}() +) = FacePlaceholderSpace() @inline reconstruct_placeholder_space(::PlaceholderSpace, parent_space) = parent_space @@ -115,11 +114,6 @@ placeholder_space( @inline reconstruct_placeholder_space(current_space, parent_space) = current_space 
-local_geometry_type(space::Spaces.AbstractSpace) = - eltype(Spaces.local_geometry_data(space)) -local_geometry_type(::PlaceholderSpace{LG}) where {LG} = LG -local_geometry_type(::CenterPlaceholderSpace{LG}) where {LG} = LG -local_geometry_type(::FacePlaceholderSpace{LG}) where {LG} = LG strip_space(obj, parent_space) = obj diff --git a/test/MatrixFields/rmul_with_projection.jl b/test/MatrixFields/rmul_with_projection.jl index d7f4855865..44b610f9cc 100644 --- a/test/MatrixFields/rmul_with_projection.jl +++ b/test/MatrixFields/rmul_with_projection.jl @@ -4,10 +4,11 @@ using Random: seed! using StaticArrays: @SMatrix import ClimaCore: Geometry -import ClimaCore.MatrixFields: rmul_with_projection +import ClimaCore.MatrixFields: rmul_with_projection, rmul_return_type -function test_rmul_with_projection(x, y, lg, expected_result) +function test_rmul_with_projection(x::X, y::Y, lg, expected_result) where {X, Y} result = rmul_with_projection(x, y, lg) + result_type = rmul_return_type(X, Y) # Compute the maximum error as an integer multiple of machine epsilon. FT = Geometry.undertype(typeof(lg)) @@ -22,6 +23,10 @@ function test_rmul_with_projection(x, y, lg, expected_result) @test max_error <= 1 # correctness @test (@allocated rmul_with_projection(x, y, lg)) == 0 # allocations @test_opt rmul_with_projection(x, y, lg) # type instabilities + + @test result_type == typeof(result) # correctness + @test (@allocated rmul_return_type(X, Y)) == 0 # allocations + @test_opt rmul_return_type(X, Y) # type instabilities end @testset "rmul_with_projection Unit Tests" begin