From 3e0ee6aa00fa838cb36a123f6de851b1ea7453ab Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 26 May 2023 18:15:17 -0700 Subject: [PATCH 01/10] Add new MatrixFields module, along with unit tests and performance tests --- .buildkite/pipeline.yml | 19 + Project.toml | 2 + benchmarks/bickleyjet/Manifest.toml | 32 +- docs/Manifest.toml | 42 +- examples/Manifest.toml | 28 +- perf/Manifest.toml | 28 +- src/ClimaCore.jl | 1 + src/Fields/mapreduce.jl | 2 +- src/Geometry/axistensors.jl | 23 +- src/MatrixFields/MatrixFields.jl | 26 + src/MatrixFields/band_matrix_row.jl | 138 +++ src/MatrixFields/matrix_field_utils.jl | 139 +++ src/MatrixFields/matrix_multiplication.jl | 381 ++++++++ src/MatrixFields/rmul_with_projection.jl | 32 + src/RecursiveApply/RecursiveApply.jl | 18 + test/Geometry/axistensors.jl | 6 + test/MatrixFields/band_matrix_row.jl | 66 ++ test/MatrixFields/field2arrays.jl | 82 ++ .../MatrixFields/matrix_field_broadcasting.jl | 816 ++++++++++++++++++ test/MatrixFields/rmul_with_projection.jl | 123 +++ test/Project.toml | 1 + test/runtests.jl | 6 + 22 files changed, 1931 insertions(+), 80 deletions(-) create mode 100644 src/MatrixFields/MatrixFields.jl create mode 100644 src/MatrixFields/band_matrix_row.jl create mode 100644 src/MatrixFields/matrix_field_utils.jl create mode 100644 src/MatrixFields/matrix_multiplication.jl create mode 100644 src/MatrixFields/rmul_with_projection.jl create mode 100644 test/MatrixFields/band_matrix_row.jl create mode 100644 test/MatrixFields/field2arrays.jl create mode 100644 test/MatrixFields/matrix_field_broadcasting.jl create mode 100644 test/MatrixFields/rmul_with_projection.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index b060f6b934..950581d805 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -494,6 +494,25 @@ steps: agents: slurm_gpus: 1 + - group: "Unit: MatrixFields" + steps: + + - label: "Unit: BandMatrixRow" + key: unit_band_matrix_row + command: "julia --color=yes 
--check-bounds=yes --project=test test/MatrixFields/band_matrix_row.jl" + + - label: "Unit: rmul_with_projection" + key: unit_rmul_with_projection + command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/rmul_with_projection.jl" + + - label: "Unit: field2arrays" + key: unit_field2arrays + command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/field2arrays.jl" + + - label: "Unit: matrix field broadcasting" + key: unit_matrix_field_broadcasting + command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + - group: "Unit: Hypsography" steps: diff --git a/Project.toml b/Project.toml index f2896a5c86..3edc5686e7 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.10.43" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +BandedMatrices = "aae01518-5342-5314-be14-df237901396f" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ClimaComms = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" @@ -31,6 +32,7 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" [compat] Adapt = "3" +BandedMatrices = "0.17" BlockArrays = "0.16" ClimaComms = "0.5" CUDA = "3, 4.2.0" diff --git a/benchmarks/bickleyjet/Manifest.toml b/benchmarks/bickleyjet/Manifest.toml index 264b71fda0..1ac7bac797 100644 --- a/benchmarks/bickleyjet/Manifest.toml +++ b/benchmarks/bickleyjet/Manifest.toml @@ -52,6 +52,12 @@ git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" version = "0.4.2" +[[deps.BandedMatrices]] +deps = ["ArrayLayouts", "FillArrays", "LinearAlgebra", "PrecompileTools", "SparseArrays"] +git-tree-sha1 = "9ad46355045491b12eab409dee73e9de46293aa2" +uuid = "aae01518-5342-5314-be14-df237901396f" +version = "0.17.28" + [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -150,10 +156,10 @@ uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" version = "0.4.2" [[deps.ClimaCore]] -deps = 
["Adapt", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "Rotations", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] +deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] path = "../.." uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.10.39" +version = "0.10.40" [[deps.CloseOpenIntervals]] deps = ["Static", "StaticArrayInterface"] @@ -773,9 +779,9 @@ version = "0.1.8" [[deps.MPItrampoline_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"] -git-tree-sha1 = "b3dcf8e1c610a10458df3c62038c8cc3a4d6291d" +git-tree-sha1 = "228d5366a7c89b3c81469592b6f4c612db693d50" uuid = "f1f71cc9-e9ae-5b93-9b94-4fe0e1ad3748" -version = "5.3.0+0" +version = "5.3.0+1" [[deps.MacroTools]] deps = ["Markdown", "Random"] @@ -1012,12 +1018,6 @@ git-tree-sha1 = "6ec7ac8412e83d57e313393220879ede1740f9ee" uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" version = "2.8.2" -[[deps.Quaternions]] -deps = ["LinearAlgebra", "Random", "RealDot"] -git-tree-sha1 = "da095158bdc8eaccb7890f9884048555ab771019" -uuid = "94ee1d12-ae83-5a48-8b1c-48b8ff168ae0" -version = "0.7.4" - [[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" @@ -1038,12 +1038,6 @@ git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" 
-[[deps.RealDot]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "9f0a1b71baaf7650f4fa8a1d168c7fb6ee41f0c9" -uuid = "c1ae055f-0cd5-4b69-90a6-9a35b1a98df9" -version = "0.1.0" - [[deps.RecipesBase]] deps = ["PrecompileTools"] git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff" @@ -1097,12 +1091,6 @@ git-tree-sha1 = "9fb3462240d2898be5d4acf8925e47f70ec64d07" uuid = "7181ea78-2dcb-4de3-ab41-2b8ab5a31e74" version = "0.3.5" -[[deps.Rotations]] -deps = ["LinearAlgebra", "Quaternions", "Random", "StaticArrays"] -git-tree-sha1 = "54ccb4dbab4b1f69beb255a2c0ca5f65a9c82f08" -uuid = "6038ab10-8711-5258-84ad-4b1120ba62dc" -version = "1.5.1" - [[deps.RuntimeGeneratedFunctions]] deps = ["ExprTools", "SHA", "Serialization"] git-tree-sha1 = "237edc1563bbf078629b4f8d194bd334e97907cf" diff --git a/docs/Manifest.toml b/docs/Manifest.toml index 574c2c2747..d8a61cb4ca 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -98,6 +98,12 @@ git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" version = "0.4.2" +[[deps.BandedMatrices]] +deps = ["ArrayLayouts", "FillArrays", "LinearAlgebra", "PrecompileTools", "SparseArrays"] +git-tree-sha1 = "9ad46355045491b12eab409dee73e9de46293aa2" +uuid = "aae01518-5342-5314-be14-df237901396f" +version = "0.17.28" + [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -216,10 +222,10 @@ uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" version = "0.5.0" [[deps.ClimaCore]] -deps = ["Adapt", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] +deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", 
"GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] path = ".." uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.10.41" +version = "0.10.42" [[deps.ClimaCoreMakie]] deps = ["ClimaCore", "Makie"] @@ -312,9 +318,9 @@ version = "0.3.0" [[deps.Compat]] deps = ["Dates", "LinearAlgebra", "UUIDs"] -git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957" +git-tree-sha1 = "4e88377ae7ebeaf29a047aa1ee40826e0b708a5d" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "4.6.1" +version = "4.7.0" [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] @@ -620,9 +626,9 @@ version = "3.3.8+0" [[deps.GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] -git-tree-sha1 = "745847e65e72a475716952f0a8a7258d42338ce9" +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.8.0" +version = "8.8.1" [[deps.GPUArraysCore]] deps = ["Adapt"] @@ -1047,10 +1053,10 @@ uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9" version = "2.35.0+0" [[deps.Libtiff_jll]] -deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"] -git-tree-sha1 = "2da088d113af58221c52828a80378e16be7d037a" +deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "Pkg", "Zlib_jll", "Zstd_jll"] +git-tree-sha1 = "3eb79b0ca5764d4799c06699573fd8f533259713" uuid = "89763e89-9b03-5906-acba-b20f662cd828" -version = "4.5.1+1" +version = "4.4.0+0" [[deps.Libuuid_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -1103,9 +1109,9 @@ version = "1.0.0" [[deps.LoopVectorization]] deps = ["ArrayInterface", "ArrayInterfaceCore", "CPUSummary", "ChainRulesCore", "CloseOpenIntervals", "DocStringExtensions", "ForwardDiff", 
"HostCPUFeatures", "IfElse", "LayoutPointers", "LinearAlgebra", "OffsetArrays", "PolyesterWeave", "PrecompileTools", "SIMDTypes", "SLEEFPirates", "SpecialFunctions", "Static", "StaticArrayInterface", "ThreadingUtilities", "UnPack", "VectorizationBase"] -git-tree-sha1 = "cdd207c26d86952949f1ac65b21baf078cc1937a" +git-tree-sha1 = "e4eed22d70ac91d7a4bf9e0f6902383061d17105" uuid = "bdcacae8-1622-11e9-2a5c-532679323890" -version = "0.12.161" +version = "0.12.162" [[deps.MKL_jll]] deps = ["Artifacts", "IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] @@ -1759,9 +1765,9 @@ version = "0.3.9" [[deps.SpecialFunctions]] deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880" +git-tree-sha1 = "7beb031cf8145577fbccacd94b8a8f4ce78428d3" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.2.0" +version = "2.3.0" [[deps.StableHashTraits]] deps = ["CRC32c", "Compat", "Dates", "SHA", "Tables", "TupleTools", "UUIDs"] @@ -1877,9 +1883,9 @@ version = "1.10.1" [[deps.TaylorSeries]] deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"] -git-tree-sha1 = "c274151bde5a608bb329d76160a9344af707bfb4" +git-tree-sha1 = "50718b4fc1ce20cecf28d85215028c78b4d875c2" uuid = "6aa5eb33-94cf-58f4-a9d0-e4b2c4fc25ea" -version = "0.15.1" +version = "0.15.2" [[deps.TempestRemap_jll]] deps = ["Artifacts", "HDF5_jll", "JLLWrappers", "Libdl", "NetCDF_jll", "OpenBLAS32_jll", "Pkg"] @@ -2052,12 +2058,6 @@ git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" uuid = "aed1982a-8fda-507f-9586-7b0439959a61" version = "1.1.34+0" -[[deps.XZ_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "8abe223c2549ea70be752b20a53aa236a7868eb0" -uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.4.3+0" - [[deps.Xorg_libX11_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] git-tree-sha1 = 
"5be649d550f3f4b95308bf0183b82e2582876527" diff --git a/examples/Manifest.toml b/examples/Manifest.toml index efd09f6939..8392af555f 100644 --- a/examples/Manifest.toml +++ b/examples/Manifest.toml @@ -208,10 +208,10 @@ uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" version = "0.5.0" [[deps.ClimaCore]] -deps = ["Adapt", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] +deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] path = ".." 
uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.10.41" +version = "0.10.42" [[deps.ClimaCorePlots]] deps = ["ClimaCore", "RecipesBase", "StaticArrays", "TriplotBase"] @@ -593,9 +593,9 @@ version = "3.3.8+0" [[deps.GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] -git-tree-sha1 = "745847e65e72a475716952f0a8a7258d42338ce9" +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.8.0" +version = "8.8.1" [[deps.GPUArraysCore]] deps = ["Adapt"] @@ -950,10 +950,10 @@ uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9" version = "2.35.0+0" [[deps.Libtiff_jll]] -deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"] -git-tree-sha1 = "2da088d113af58221c52828a80378e16be7d037a" +deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "Pkg", "Zlib_jll", "Zstd_jll"] +git-tree-sha1 = "3eb79b0ca5764d4799c06699573fd8f533259713" uuid = "89763e89-9b03-5906-acba-b20f662cd828" -version = "4.5.1+1" +version = "4.4.0+0" [[deps.Libuuid_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -1530,9 +1530,9 @@ version = "1.21.0" [[deps.SpecialFunctions]] deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880" +git-tree-sha1 = "7beb031cf8145577fbccacd94b8a8f4ce78428d3" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.2.0" +version = "2.3.0" [[deps.SplittablesBase]] deps = ["Setfield", "Test"] @@ -1613,9 +1613,9 @@ version = "1.10.1" [[deps.TaylorSeries]] deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"] -git-tree-sha1 = "c274151bde5a608bb329d76160a9344af707bfb4" +git-tree-sha1 = "50718b4fc1ce20cecf28d85215028c78b4d875c2" uuid = "6aa5eb33-94cf-58f4-a9d0-e4b2c4fc25ea" -version = "0.15.1" +version = "0.15.2" [[deps.TempestRemap_jll]] 
deps = ["Artifacts", "HDF5_jll", "JLLWrappers", "Libdl", "NetCDF_jll", "OpenBLAS32_jll", "Pkg"] @@ -1778,12 +1778,6 @@ git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" uuid = "aed1982a-8fda-507f-9586-7b0439959a61" version = "1.1.34+0" -[[deps.XZ_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "8abe223c2549ea70be752b20a53aa236a7868eb0" -uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.4.3+0" - [[deps.Xorg_libX11_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" diff --git a/perf/Manifest.toml b/perf/Manifest.toml index dd7e9e54f5..78846156db 100644 --- a/perf/Manifest.toml +++ b/perf/Manifest.toml @@ -203,10 +203,10 @@ uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" version = "0.5.0" [[deps.ClimaCore]] -deps = ["Adapt", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] +deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DiffEqBase", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "Requires", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "UnPack"] path = ".." 
uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.10.41" +version = "0.10.42" [[deps.ClimaCorePlots]] deps = ["ClimaCore", "RecipesBase", "StaticArrays", "TriplotBase"] @@ -623,9 +623,9 @@ version = "3.3.8+0" [[deps.GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] -git-tree-sha1 = "745847e65e72a475716952f0a8a7258d42338ce9" +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.8.0" +version = "8.8.1" [[deps.GPUArraysCore]] deps = ["Adapt"] @@ -1003,10 +1003,10 @@ uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9" version = "2.35.0+0" [[deps.Libtiff_jll]] -deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"] -git-tree-sha1 = "2da088d113af58221c52828a80378e16be7d037a" +deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "Pkg", "Zlib_jll", "Zstd_jll"] +git-tree-sha1 = "3eb79b0ca5764d4799c06699573fd8f533259713" uuid = "89763e89-9b03-5906-acba-b20f662cd828" -version = "4.5.1+1" +version = "4.4.0+0" [[deps.Libuuid_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -1641,9 +1641,9 @@ version = "1.21.0" [[deps.SpecialFunctions]] deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880" +git-tree-sha1 = "7beb031cf8145577fbccacd94b8a8f4ce78428d3" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.2.0" +version = "2.3.0" [[deps.Static]] deps = ["IfElse"] @@ -1729,9 +1729,9 @@ version = "1.10.1" [[deps.TaylorSeries]] deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"] -git-tree-sha1 = "c274151bde5a608bb329d76160a9344af707bfb4" +git-tree-sha1 = "50718b4fc1ce20cecf28d85215028c78b4d875c2" uuid = "6aa5eb33-94cf-58f4-a9d0-e4b2c4fc25ea" -version = "0.15.1" +version = "0.15.2" [[deps.TempestRemap_jll]] deps = 
["Artifacts", "HDF5_jll", "JLLWrappers", "Libdl", "NetCDF_jll", "OpenBLAS32_jll", "Pkg"] @@ -1894,12 +1894,6 @@ git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" uuid = "aed1982a-8fda-507f-9586-7b0439959a61" version = "1.1.34+0" -[[deps.XZ_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "8abe223c2549ea70be752b20a53aa236a7868eb0" -uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.4.3+0" - [[deps.Xorg_libX11_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" diff --git a/src/ClimaCore.jl b/src/ClimaCore.jl index b37bd8f397..72ada48e34 100644 --- a/src/ClimaCore.jl +++ b/src/ClimaCore.jl @@ -14,6 +14,7 @@ include("Topologies/Topologies.jl") include("Spaces/Spaces.jl") include("Fields/Fields.jl") include("Operators/Operators.jl") +include("MatrixFields/MatrixFields.jl") include("Hypsography/Hypsography.jl") include("Limiters/Limiters.jl") include("InputOutput/InputOutput.jl") diff --git a/src/Fields/mapreduce.jl b/src/Fields/mapreduce.jl index c7b1e23b12..27e15d7b46 100644 --- a/src/Fields/mapreduce.jl +++ b/src/Fields/mapreduce.jl @@ -1,4 +1,4 @@ -Base.map(fn, field::Field) = Base.broadcast(fn, field) +Base.map(fn, fields::Field...) = Base.broadcast(fn, fields...) 
""" Fields.local_sum(v::Field) diff --git a/src/Geometry/axistensors.jl b/src/Geometry/axistensors.jl index a66b723708..0b4b70ed9e 100644 --- a/src/Geometry/axistensors.jl +++ b/src/Geometry/axistensors.jl @@ -269,10 +269,30 @@ Base.propertynames(x::AxisVector) = symbols(axes(x, 1)) end end +const AdjointAxisTensor{T, N, A, S} = Adjoint{T, AxisTensor{T, N, A, S}} + +Base.show(io::IO, a::AdjointAxisTensor{T, N, A, S}) where {T, N, A, S} = + print(io, "adjoint($(a'))") + +components(a::AdjointAxisTensor) = components(parent(a))' + +Base.zero(a::AdjointAxisTensor) = zero(typeof(a)) +Base.zero(::Type{AdjointAxisTensor{T, N, A, S}}) where {T, N, A, S} = + zero(AxisTensor{T, N, A, S})' + +@inline +(a::AdjointAxisTensor) = (+a')' +@inline -(a::AdjointAxisTensor) = (-a')' +@inline +(a::AdjointAxisTensor, b::AdjointAxisTensor) = (a' + b')' +@inline -(a::AdjointAxisTensor, b::AdjointAxisTensor) = (a' - b')' +@inline *(a::Number, b::AdjointAxisTensor) = (a * b')' +@inline *(a::AdjointAxisTensor, b::Number) = (a' * b)' +@inline /(a::AdjointAxisTensor, b::Number) = (a' / b)' +@inline \(a::Number, b::AdjointAxisTensor) = (a \ b')' + +@inline (==)(a::AdjointAxisTensor, b::AdjointAxisTensor) = a' == b' const AdjointAxisVector{T, A1, S} = Adjoint{T, AxisVector{T, A1, S}} -components(va::AdjointAxisVector) = components(parent(va))' Base.@propagate_inbounds Base.getindex(va::AdjointAxisVector, i::Int) = getindex(components(va), i) Base.@propagate_inbounds Base.getindex(va::AdjointAxisVector, i::Int, j::Int) = @@ -286,7 +306,6 @@ Axis2Tensor( ) = AxisTensor(axes, components) const AdjointAxis2Tensor{T, A, S} = Adjoint{T, Axis2Tensor{T, A, S}} -components(va::AdjointAxis2Tensor) = components(parent(va))' const Axis2TensorOrAdj{T, A, S} = Union{Axis2Tensor{T, A, S}, AdjointAxis2Tensor{T, A, S}} diff --git a/src/MatrixFields/MatrixFields.jl b/src/MatrixFields/MatrixFields.jl new file mode 100644 index 0000000000..6ed80c32e3 --- /dev/null +++ b/src/MatrixFields/MatrixFields.jl @@ -0,0 
+1,26 @@ +module MatrixFields + +import LinearAlgebra: UniformScaling, Adjoint +import StaticArrays: SArray, SMatrix, SVector +import BandedMatrices: BandedMatrix, band, _BandedMatrix +import ..Utilities: PlusHalf, half +import ..RecursiveApply: + rmap, rpromote_type, rzero, rconvert, radd, rsub, rmul, rdiv +import ..Geometry +import ..Spaces +import ..Fields +import ..Operators + +export ⋅ +export DiagonalMatrixRow, + BidiagonalMatrixRow, + TridiagonalMatrixRow, + QuaddiagonalMatrixRow, + PentadiagonalMatrixRow + +include("band_matrix_row.jl") +include("rmul_with_projection.jl") +include("matrix_multiplication.jl") +include("matrix_field_utils.jl") + +end diff --git a/src/MatrixFields/band_matrix_row.jl b/src/MatrixFields/band_matrix_row.jl new file mode 100644 index 0000000000..d637ef089c --- /dev/null +++ b/src/MatrixFields/band_matrix_row.jl @@ -0,0 +1,138 @@ +""" + BandMatrixRow{ld}(entries...) + +Stores the nonzero entries in a row of a band matrix, starting with the lowest +diagonal, which has index `ld`. Supported operations include accessing the entry +on the diagonal with index `d` by calling `row[d]`, taking linear combinations +with other band matrix rows (and with `LinearAlgebra.I`), and checking for +equality with other band matrix rows (and with `LinearAlgebra.I`). There are +several aliases defined for commonly-used subtypes of `BandMatrixRow` (with `T` +denoting the type of the row's entries): +- `DiagonalMatrixRow{T}` +- `BidiagonalMatrixRow{T}` +- `TridiagonalMatrixRow{T}` +- `QuaddiagonalMatrixRow{T}` +- `PentadiagonalMatrixRow{T}` +""" +struct BandMatrixRow{ld, bw, T} # bw is the bandwidth (the number of diagonals) + entries::NTuple{bw, T} + BandMatrixRow{ld, bw, T}(entries::NTuple{bw, Any}) where {ld, bw, T} = + new{ld, bw, T}(rconvert(NTuple{bw, T}, entries)) + # TODO: Remove this inner constructor once Julia's default convert function + # is type-stable for nested Tuple/NamedTuple types. 
+end +BandMatrixRow{ld}(entries::Vararg{Any, bw}) where {ld, bw} = + BandMatrixRow{ld, bw}(entries...) +BandMatrixRow{ld, bw}(entries::Vararg{Any, bw}) where {ld, bw} = + BandMatrixRow{ld, bw, rpromote_type(map(typeof, entries)...)}(entries) + +const DiagonalMatrixRow{T} = BandMatrixRow{0, 1, T} +const BidiagonalMatrixRow{T} = BandMatrixRow{-1 + half, 2, T} +const TridiagonalMatrixRow{T} = BandMatrixRow{-1, 3, T} +const QuaddiagonalMatrixRow{T} = BandMatrixRow{-2 + half, 4, T} +const PentadiagonalMatrixRow{T} = BandMatrixRow{-2, 5, T} + +""" + outer_diagonals(::Type{<:BandMatrixRow}) + +Gets the indices of the lower and upper diagonals, `ld` and `ud`, of the given +subtype of `BandMatrixRow`. +""" +outer_diagonals(::Type{<:BandMatrixRow{ld, bw}}) where {ld, bw} = + (ld, ld + bw - 1) + +""" + band_matrix_row_type(ld, ud, T) + +A shorthand for getting the subtype of `BandMatrixRow` that has entries of type +`T` on the diagonals with indices in the range `ld:ud`. +""" +band_matrix_row_type(ld, ud, T) = BandMatrixRow{ld, ud - ld + 1, T} + +Base.eltype(::Type{BandMatrixRow{ld, bw, T}}) where {ld, bw, T} = T + +Base.zero(::Type{BandMatrixRow{ld, bw, T}}) where {ld, bw, T} = + BandMatrixRow{ld}(ntuple(_ -> rzero(T), Val(bw))...) + +Base.map(f::F, rows::BandMatrixRow{ld}...) where {F, ld} = + BandMatrixRow{ld}(map(f, map(row -> row.entries, rows)...)...) + +Base.@propagate_inbounds Base.getindex(row::BandMatrixRow{ld}, d) where {ld} = + row.entries[d - ld + 1] + +function Base.promote_rule( + ::Type{BMR1}, + ::Type{BMR2}, +) where {BMR1 <: BandMatrixRow, BMR2 <: BandMatrixRow} + ld1, ud1 = outer_diagonals(BMR1) + ld2, ud2 = outer_diagonals(BMR2) + typeof(ld1) == typeof(ld2) || error( + "Cannot promote the $(ld1 isa PlusHalf ? "non-" : "")square matrix \ + row type $BMR1 and the $(ld2 isa PlusHalf ? 
"non-" : "")square matrix \ + row type $BMR2 to a common type", + ) + T = rpromote_type(eltype(BMR1), eltype(BMR2)) + return band_matrix_row_type(min(ld1, ld2), max(ud1, ud2), T) +end + +Base.promote_rule( + ::Type{BMR}, + ::Type{US}, +) where {BMR <: BandMatrixRow, US <: UniformScaling} = + promote_rule(BMR, DiagonalMatrixRow{eltype(US)}) + +function Base.convert( + ::Type{BMR}, + row::BandMatrixRow, +) where {BMR <: BandMatrixRow} + old_ld, old_ud = outer_diagonals(typeof(row)) + new_ld, new_ud = outer_diagonals(BMR) + typeof(old_ld) == typeof(new_ld) || error( + "Cannot convert a $(old_ld isa PlusHalf ? "non-" : "")square matrix \ + row of type $(typeof(row)) to the \ + $(new_ld isa PlusHalf ? "non-" : "")square matrix row type $BMR", + ) + new_ld <= old_ld && new_ud >= old_ud || error( + "Cannot convert a $(typeof(row)) to a $BMR, since that would require \ + dropping potentially non-zero row entries", + ) + first_zeros = ntuple(_ -> rzero(eltype(BMR)), Val(old_ld - new_ld)) + last_zeros = ntuple(_ -> rzero(eltype(BMR)), Val(new_ud - old_ud)) + return BMR((first_zeros..., row.entries..., last_zeros...)) +end + +Base.convert(::Type{BMR}, row::UniformScaling) where {BMR <: BandMatrixRow} = + convert(BMR, DiagonalMatrixRow(row.λ)) + +Base.:(==)(row1::BMR, row2::BMR) where {BMR <: BandMatrixRow} = + row1.entries == row2.entries +Base.:(==)(row1::BandMatrixRow, row2::BandMatrixRow) = + ==(promote(row1, row2)...) +Base.:(==)(row1::BandMatrixRow, row2::UniformScaling) = + ==(promote(row1, row2)...) +Base.:(==)(row1::UniformScaling, row2::BandMatrixRow) = + ==(promote(row1, row2)...) + +Base.:+(row::BandMatrixRow) = map(radd, row) +Base.:+(row1::BandMatrixRow, row2::BandMatrixRow) = + map(radd, promote(row1, row2)...) +Base.:+(row1::BandMatrixRow, row2::UniformScaling) = + map(radd, promote(row1, row2)...) +Base.:+(row1::UniformScaling, row2::BandMatrixRow) = + map(radd, promote(row1, row2)...) 
+ +Base.:-(row::BandMatrixRow) = map(rsub, row) +Base.:-(row1::BandMatrixRow, row2::BandMatrixRow) = + map(rsub, promote(row1, row2)...) +Base.:-(row1::BandMatrixRow, row2::UniformScaling) = + map(rsub, promote(row1, row2)...) +Base.:-(row1::UniformScaling, row2::BandMatrixRow) = + map(rsub, promote(row1, row2)...) + +Base.:*(row::BandMatrixRow, value::Number) = + map(entry -> rmul(entry, value), row) +Base.:*(value::Number, row::BandMatrixRow) = + map(entry -> rmul(value, entry), row) + +Base.:/(row::BandMatrixRow, value::Number) = + map(entry -> rdiv(entry, value), row) diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl new file mode 100644 index 0000000000..706366eba2 --- /dev/null +++ b/src/MatrixFields/matrix_field_utils.jl @@ -0,0 +1,139 @@ +function banded_matrix_info(field) + space = axes(field) + field_ld, field_ud = outer_diagonals(eltype(field)) + + # Find the diagonal index of the value that ends up in the bottom-right + # corner of the matrix, as well as the amount by which the field's diagonal + # indices get shifted when it is converted into a matrix. 
+ bottom_corner_matrix_d, matrix_d_minus_field_d = if field_ld isa PlusHalf + if space.staggering isa Spaces.CellCenter + 1, half # field is a face-to-center matrix + else + -1, -half # field is a center-to-face matrix + end + else + 0, 0 # field is either a center-to-center or face-to-face matrix + end + + n_rows = Spaces.nlevels(space) + n_cols = n_rows + bottom_corner_matrix_d + matrix_ld = field_ld + matrix_d_minus_field_d + matrix_ud = field_ud + matrix_d_minus_field_d + matrix_ld <= 0 && matrix_ud >= 0 || + error("BandedMatrices.jl does not yet support matrices that have \ + diagonals with indices in the range $matrix_ld:$matrix_ud") + + return n_rows, n_cols, matrix_ld, matrix_ud +end + +""" + column_field2array(field) + +Converts a field defined on a `FiniteDifferenceSpace` into a `Vector` or a +`BandedMatrix`, depending on whether or not the elements of the field are +`BandMatrixRow`s. This involves copying the data stored in the field. +""" +function column_field2array(field) + space = axes(field) + space isa Spaces.FiniteDifferenceSpace || + error("column_field2array requires a field on a FiniteDifferenceSpace") + if eltype(field) <: BandMatrixRow # field represents a matrix + n_rows, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) + matrix = BandedMatrix{eltype(eltype(field))}( + undef, + (n_rows, n_cols), + (-matrix_ld, matrix_ud), + ) + for (index_of_field_entry, matrix_d) in enumerate(matrix_ld:matrix_ud) + # Find the rows for which field_diagonal[row] is inside the matrix. + # Note: The matrix index (1, 1) corresponds to the diagonal index 0, + # and the matrix index (n_rows, n_cols) corresponds to the diagonal + # index n_cols - n_rows. + first_row = matrix_d < 0 ? 1 - matrix_d : 1 + last_row = matrix_d < n_cols - n_rows ? n_rows : n_cols - matrix_d + + # Copy the value in each row from field_diagonal to matrix_diagonal. 
+ field_diagonal = field.entries.:($index_of_field_entry) + matrix_diagonal = view(matrix, band(matrix_d)) + for (index_along_diagonal, row) in enumerate(first_row:last_row) + matrix_diagonal[index_along_diagonal] = + Fields.field_values(field_diagonal)[row] + end + end + return matrix + else # field represents a vector + n_rows = Spaces.nlevels(space) + return map(row -> Fields.field_values(field)[row], 1:n_rows) + end +end + +""" + field2arrays(field) + +Converts a field defined on a `FiniteDifferenceSpace` or on an +`ExtrudedFiniteDifferenceSpace` into a tuple of arrays, each of which +corresponds to a column of the field. This is done by calling +`column_field2array` on each of the field's columns. +""" +function field2arrays(field) + space = axes(field) + column_indices = if space isa Spaces.FiniteDifferenceSpace + (((1, 1), 1),) + elseif space isa Spaces.ExtrudedFiniteDifferenceSpace + (Spaces.all_nodes(Spaces.horizontal_space(space))...,) + else + error("Invalid space type: $(typeof(space).name.wrapper)") + end + return map(column_indices) do ((i, j), h) + column_field2array(Spaces.column(field, i, j, h)) + end +end + +""" + column_field2array_view(field) + +Similar to `column_field2array(field)`, except that this version avoids copying +the data stored in the field. +""" +function column_field2array_view(field) + space = axes(field) + space isa Spaces.FiniteDifferenceSpace || + error("column_field2array_view requires a field on a \ + FiniteDifferenceSpace") + if eltype(field) <: BandMatrixRow # field represents a matrix + n_rows, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) + data_transpose = reinterpret(eltype(eltype(field)), parent(field)') + matrix_transpose = + _BandedMatrix(data_transpose, n_cols, matrix_ud, -matrix_ld) + return permutedims(matrix_transpose) + # TODO: Despite not copying any data, this function still allocates a + # small amount of memory because of _BandedMatrix and permutedims. 
+ else # field represents a vector + return vec(reinterpret(eltype(field), parent(field)')) + end +end + +function Base.show( + io::IO, + field::Fields.Field{<:Fields.AbstractData{<:BandMatrixRow}}, +) + print(io, eltype(field), "-valued Field") + if eltype(eltype(field)) <: Number + if axes(field) isa Spaces.FiniteDifferenceSpace + println(io, " that corresponds to the matrix") + else + println(io, " whose first column corresponds to the matrix") + end + column_field = Fields.column(field, 1, 1, 1) + Base.print_array(io, column_field2array_view(column_field)) + else + # When a BandedMatrix with non-number entries is printed, it currently + # either prints in an illegible format (e.g., if it has AxisTensor or + # AdjointAxisTensor entries) or crashes during the evaluation of + # isassigned (e.g., if it has Tuple or NamedTuple entries). So, for + # matrix fields with non-number entries, we fall back to the default + # function for printing fields. + print(io, ":") + Fields._show_compact_field(io, field, " ", true) + end +end diff --git a/src/MatrixFields/matrix_multiplication.jl b/src/MatrixFields/matrix_multiplication.jl new file mode 100644 index 0000000000..7f9144d623 --- /dev/null +++ b/src/MatrixFields/matrix_multiplication.jl @@ -0,0 +1,381 @@ +""" + MultiplyColumnwiseBandMatrixField + +An operator that multiplies a columnwise band matrix field (a field of +`BandMatrixRow`s) by a regular field or by another columnwise band matrix field, +i.e., matrix-vector or matrix-matrix multiplication. The `⋅` symbol is an alias +for `MultiplyColumnwiseBandMatrixField()`. +""" +struct MultiplyColumnwiseBandMatrixField <: Operators.FiniteDifferenceOperator end +const ⋅ = MultiplyColumnwiseBandMatrixField() + +#= +TODO: Rewrite the following derivation in LaTeX and move it into the ClimaCore +documentation. + +Notation: + +For any single-column field F, let F[idx] denote the value of F at level idx. 
+For any single-column BandMatrixRow field M, let + M[idx, idx′] = M[idx][idx′ - idx]. +If there are multiple columns, the following equations apply per column. + +Matrix-Vector Multiplication: + +Consider a BandMatrixRow field M and a scalar (non-BandMatrixRow) field V. +From the definition of matrix-vector multiplication, + (M ⋅ V)[idx] = ∑_{idx′} M[idx, idx′] * V[idx′]. +If V[idx] is only defined when left_idx ≤ idx ≤ right_idx, this becomes + (M ⋅ V)[idx] = ∑_{idx′ ∈ left_idx:right_idx} M[idx, idx′] * V[idx′]. +If M[idx, idx′] is only defined when idx + ld ≤ idx′ ≤ idx + ud, this becomes + (M ⋅ V)[idx] = + ∑_{idx′ ∈ max(left_idx, idx + ld):min(right_idx, idx + ud)} + M[idx, idx′] * V[idx′]. +Replacing the variable idx′ with the variable d = idx′ - idx gives us + (M ⋅ V)[idx] = + ∑_{d ∈ max(left_idx - idx, ld):min(right_idx - idx, ud)} + M[idx, idx + d] * V[idx + d]. +This can be rewritten using the standard indexing notation as + (M ⋅ V)[idx] = + ∑_{d ∈ max(left_idx - idx, ld):min(right_idx - idx, ud)} + M[idx][d] * V[idx + d]. +Finally, we can express this in terms of left/right boundaries and an interior: + (M ⋅ V)[idx] = + ∑_{ + d ∈ + if idx < left_idx - ld + (left_idx - idx):ud + elseif idx > right_idx - ud + ld:(right_idx - idx) + else + ld:ud + end + } M[idx][d] * V[idx + d]. + +Matrix-Matrix Multiplication: + +Consider a BandMatrixRow field M1 and another BandMatrixRow field M2. +From the definition of matrix-matrix multiplication, + (M1 ⋅ M2)[idx, idx′] = ∑_{idx′′} M1[idx, idx′′] * M2[idx′′, idx′]. +If M2[idx′′] is only defined when left_idx ≤ idx′′ ≤ right_idx, this becomes + (M1 ⋅ M2)[idx, idx′] = + ∑_{idx′′ ∈ left_idx:right_idx} M1[idx, idx′′] * M2[idx′′, idx′]. +If M1[idx, idx′′] is only defined when idx + ld1 ≤ idx′′ ≤ idx + ud1, this becomes + (M1 ⋅ M2)[idx, idx′] = + ∑_{idx′′ ∈ max(left_idx, idx + ld1):min(right_idx, idx + ud1)} + M1[idx, idx′′] * M2[idx′′, idx′]. 
+If M2[idx′′, idx′] is only defined when idx′′ + ld2 ≤ idx′ ≤ idx′′ + ud2, or, +equivalently, when idx′ - ud2 ≤ idx′′ ≤ idx′ - ld2, this becomes + (M1 ⋅ M2)[idx, idx′] = + ∑_{ + idx′′ ∈ + max(left_idx, idx + ld1, idx′ - ud2): + min(right_idx, idx + ud1, idx′ - ld2) + } M1[idx, idx′′] * M2[idx′′, idx′]. +Replacing the variable idx′ with the variable prod_d = idx′ - idx gives us + (M1 ⋅ M2)[idx, idx + prod_d] = + ∑_{ + idx′′ ∈ + max(left_idx, idx + ld1, idx + prod_d - ud2): + min(right_idx, idx + ud1, idx + prod_d - ld2) + } M1[idx, idx′′] * M2[idx′′, idx + prod_d]. +Replacing the variable idx′′ with the variable d = idx′′ - idx gives us + (M1 ⋅ M2)[idx, idx + prod_d] = + ∑_{ + d ∈ + max(left_idx - idx, ld1, prod_d - ud2): + min(right_idx - idx, ud1, prod_d - ld2) + } M1[idx, idx + d] * M2[idx + d, idx + prod_d]. +This can be rewritten using the standard indexing notation as + (M1 ⋅ M2)[idx][prod_d] = + ∑_{ + d ∈ + max(left_idx - idx, ld1, prod_d - ud2): + min(right_idx - idx, ud1, prod_d - ld2) + } M1[idx][d] * M2[idx + d][prod_d - d]. +Finally, we can express this in terms of left/right boundaries and an interior: + (M1 ⋅ M2)[idx][prod_d] = + ∑_{ + d ∈ + if idx < left_idx - ld1 + max(left_idx - idx, prod_d - ud2):min(ud1, prod_d - ld2) + elseif idx > right_idx - ud1 + max(ld1, prod_d - ud2):min(right_idx - idx, prod_d - ld2) + else + max(ld1, prod_d - ud2):min(ud1, prod_d - ld2) + end + } M1[idx][d] * M2[idx + d][prod_d - d]. + +We only need to define (M1 ⋅ M2)[idx][prod_d] when it has a nonzero value in the +interior, which will be the case when + max(ld1, prod_d - ud2) ≤ min(ud1, prod_d - ld2). +This can be rewritten as a system of four inequalities: + ld1 ≤ ud1, + ld1 ≤ prod_d - ld2, + prod_d - ud2 ≤ ud1, and + prod_d - ud2 ≤ prod_d - ld2. +By definition, ld1 ≤ ud1 and ld2 ≤ ud2, so the first and last inequality are +always true. Rearranging the remaining two inequalities gives us + ld1 + ld2 ≤ prod_d ≤ ud1 + ud2. 
+=# + +struct TopLeftMatrixCorner <: Operators.AbstractBoundaryCondition end +struct BottomRightMatrixCorner <: Operators.AbstractBoundaryCondition end + +Operators.has_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::Operators.LeftBoundaryWindow{name}, +) where {name} = true +Operators.has_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::Operators.RightBoundaryWindow{name}, +) where {name} = true + +Operators.get_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::Operators.LeftBoundaryWindow{name}, +) where {name} = TopLeftMatrixCorner() +Operators.get_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::Operators.RightBoundaryWindow{name}, +) where {name} = BottomRightMatrixCorner() + +Operators.stencil_interior_width( + ::MultiplyColumnwiseBandMatrixField, + matrix1, + arg, +) = ((0, 0), outer_diagonals(eltype(matrix1))) + +# Interior indices of a center-to-center or face-to-face matrix. +matrix_left_interior_index(space, lbw::Integer) = + Operators.left_idx(space) - lbw +matrix_right_interior_index(space, rbw::Integer) = + Operators.right_idx(space) - rbw + +# Interior indices of a face-to-center matrix. +matrix_left_interior_index( + space::Union{ + Spaces.CenterFiniteDifferenceSpace, + Spaces.CenterExtrudedFiniteDifferenceSpace, + }, + lbw::PlusHalf, +) = Operators.left_idx(space) - lbw - half +matrix_right_interior_index( + space::Union{ + Spaces.CenterFiniteDifferenceSpace, + Spaces.CenterExtrudedFiniteDifferenceSpace, + }, + rbw::PlusHalf, +) = Operators.right_idx(space) - rbw + half + +# Interior indices of a center-to-face matrix. 
+matrix_left_interior_index( + space::Union{ + Spaces.FaceFiniteDifferenceSpace, + Spaces.FaceExtrudedFiniteDifferenceSpace, + }, + lbw::PlusHalf, +) = Operators.left_idx(space) - lbw + half +matrix_right_interior_index( + space::Union{ + Spaces.FaceFiniteDifferenceSpace, + Spaces.FaceExtrudedFiniteDifferenceSpace, + }, + rbw::PlusHalf, +) = Operators.right_idx(space) - rbw - half + +Operators.left_interior_idx( + space::Spaces.AbstractSpace, + ::MultiplyColumnwiseBandMatrixField, + ::TopLeftMatrixCorner, + matrix1, + arg, +) = matrix_left_interior_index(space, outer_diagonals(eltype(matrix1))[1]) +Operators.right_interior_idx( + space::Spaces.AbstractSpace, + ::MultiplyColumnwiseBandMatrixField, + ::BottomRightMatrixCorner, + matrix1, + arg, +) = matrix_right_interior_index(space, outer_diagonals(eltype(matrix1))[2]) + +function rmul_type(::Type{T1}, ::Type{T2}, ::Type{LG}) where {T1, T2, LG} + type = Base._return_type(rmul_with_projection, Tuple{T1, T2, LG}) + type == Union{} && + error("Unable to infer result type: Calling rmul_with_projection with \ + arguments of types $T1 and $T2 will cause it to throw an error") + return type +end + +function Operators.return_eltype( + ::MultiplyColumnwiseBandMatrixField, + matrix1, + arg, +) + eltype(matrix1) <: BandMatrixRow || error( + "The first argument of ⋅ must have elements of type BandMatrixRow, but \ + the given argument has elements of type $(eltype(matrix1))", + ) + lg_type = eltype(Fields.local_geometry_field(axes(arg))) + if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication + matrix2 = arg + ld1, ud1 = outer_diagonals(eltype(matrix1)) + ld2, ud2 = outer_diagonals(eltype(matrix2)) + prod_ld, prod_ud = ld1 + ld2, ud1 + ud2 + prod_value_type = + rmul_type(eltype(eltype(matrix1)), eltype(eltype(matrix2)), lg_type) + return band_matrix_row_type(prod_ld, prod_ud, prod_value_type) + else # matrix-vector multiplication + vector = arg + return rmul_type(eltype(eltype(matrix1)), eltype(vector), lg_type) + end 
+end + +Operators.return_space(::MultiplyColumnwiseBandMatrixField, space1, space2) = + space1 + +# TODO: Use @propagate_inbounds here, and remove @inbounds from this function. +# As of Julia 1.8, doing this increases compilation time by more than an order +# of magnitude, and it also makes type inference fail for some complicated +# matrix field broadcast expressions (in particular, those that involve products +# of linear combinations of matrix fields). Not using @propagate_inbounds causes +# matrix field broadcast expressions to take roughly 3 or 4 times longer to +# evaluate, but this is less significant than the decrease in compilation time. +function multiply_matrix_at_index( + loc, + space, + idx, + hidx, + matrix1, + arg, + boundary_ld1 = nothing, + boundary_ud1 = nothing, +) + matrix1_row = Operators.getidx(space, matrix1, loc, idx, hidx) + ld1, ud1 = outer_diagonals(eltype(matrix1)) + ld1_or_boundary_ld1 = isnothing(boundary_ld1) ? ld1 : boundary_ld1 + ud1_or_boundary_ud1 = isnothing(boundary_ud1) ? ud1 : boundary_ud1 + prod_type = Operators.return_eltype(⋅, matrix1, arg) + if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication + matrix2 = arg + matrix2_rows = map((ld1:ud1...,)) do d + # TODO: Use @propagate_inbounds_meta instead of @inline_meta. + Base.@_inline_meta + if ( + (isnothing(boundary_ld1) || d >= boundary_ld1) && + (isnothing(boundary_ud1) || d <= boundary_ud1) + ) + @inbounds Operators.getidx(space, matrix2, loc, idx + d, hidx) + else + rzero(eltype(matrix2)) # This value is never used. + end + end # The rows are precomputed to avoid recomputing them multiple times. + matrix2_rows_wrapper = BandMatrixRow{ld1}(matrix2_rows...) + ld2, ud2 = outer_diagonals(eltype(matrix2)) + prod_ld, prod_ud = outer_diagonals(prod_type) + zero_value = rzero(eltype(prod_type)) + prod_entries = map((prod_ld:prod_ud...,)) do prod_d + # TODO: Use @propagate_inbounds_meta instead of @inline_meta. 
+            Base.@_inline_meta
+            min_d = max(ld1_or_boundary_ld1, prod_d - ud2)
+            max_d = min(ud1_or_boundary_ud1, prod_d - ld2)
+            # Note: If min_d:max_d is an empty range, then the current entry
+            # lies outside of the product matrix, so it should never be used in
+            # any computations. By initializing prod_entry to zero_value, we are
+            # implicitly setting all such entries to 0. We could alternatively
+            # set all such entries to NaN (in order to more easily catch user
+            # errors that involve accidentally using these entries), but that
+            # would not generalize to non-floating-point types like Int or Bool.
+            prod_entry = zero_value
+            @inbounds for d in min_d:max_d
+                value1 = matrix1_row[d]
+                value2 = matrix2_rows_wrapper[d][prod_d - d]
+                value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
+                prod_entry = radd(
+                    prod_entry,
+                    rmul_with_projection(value1, value2, value2_lg),
+                )
+            end # Using this for-loop is currently faster than using mapreduce.
+            prod_entry
+        end
+        return BandMatrixRow{prod_ld}(prod_entries...)
+    else # matrix-vector multiplication
+        vector = arg
+        prod_value = rzero(prod_type)
+        @inbounds for d in ld1_or_boundary_ld1:ud1_or_boundary_ud1
+            value1 = matrix1_row[d]
+            value2 = Operators.getidx(space, vector, loc, idx + d, hidx)
+            value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
+            prod_value = radd(
+                prod_value,
+                rmul_with_projection(value1, value2, value2_lg),
+            )
+        end # Using this for-loop is currently faster than using mapreduce.
+ return prod_value + end +end + +Base.@propagate_inbounds Operators.stencil_interior( + ::MultiplyColumnwiseBandMatrixField, + loc, + space, + idx, + hidx, + matrix1, + arg, +) = multiply_matrix_at_index(loc, space, idx, hidx, matrix1, arg) + +Base.@propagate_inbounds Operators.stencil_left_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::TopLeftMatrixCorner, + loc, + space, + idx, + hidx, + matrix1, + arg, +) = multiply_matrix_at_index( + loc, + space, + idx, + hidx, + matrix1, + arg, + Operators.left_idx( + Operators.reconstruct_placeholder_space(axes(arg), space), + ) - idx, + nothing, +) + +Base.@propagate_inbounds Operators.stencil_right_boundary( + ::MultiplyColumnwiseBandMatrixField, + ::BottomRightMatrixCorner, + loc, + space, + idx, + hidx, + matrix1, + arg, +) = multiply_matrix_at_index( + loc, + space, + idx, + hidx, + matrix1, + arg, + nothing, + Operators.right_idx( + Operators.reconstruct_placeholder_space(axes(arg), space), + ) - idx, +) + +# For matrix field broadcast expressions involving 4 or more matrices, we +# sometimes hit a recursion limit and de-optimize. +# We know that the recursion will terminate due to the fact that broadcast +# expressions are not self-referential. +if hasfield(Method, :recursion_relation) + dont_limit = (args...) 
-> true
+    for m in methods(multiply_matrix_at_index)
+        m.recursion_relation = dont_limit
+    end
+end
diff --git a/src/MatrixFields/rmul_with_projection.jl b/src/MatrixFields/rmul_with_projection.jl
new file mode 100644
index 0000000000..76f9846b68
--- /dev/null
+++ b/src/MatrixFields/rmul_with_projection.jl
@@ -0,0 +1,32 @@
+const SingleValue =
+    Union{Number, Geometry.AxisTensor, Geometry.AdjointAxisTensor}
+
+mul_with_projection(x, y, _) = x * y
+mul_with_projection(x::Geometry.AdjointAxisVector, y::Geometry.AxisTensor, lg) =
+    x * Geometry.project(Geometry.dual(axes(x, 2)), y, lg)
+mul_with_projection(::Geometry.AdjointAxisTensor, ::Geometry.AxisTensor, _) =
+    error("mul_with_projection is currently only implemented for covectors, \
+           and higher-order cotensors are not supported")
+# We should add methods for other cotensors (e.g., AdjointAxis2Tensor) when they
+# are needed (e.g., when we need to support matrices that represent the
+# divergence of higher-order tensors).
+
+"""
+    rmul_with_projection(x, y, lg)
+
+Similar to `rmul(x, y)`, but with automatic projection of `y` when `x` contains
+a covector (i.e., an `AdjointAxisVector`). For example, if `x` is a covector
+along the `Covariant3Axis` (e.g., `Covariant3Vector(1)'`), then `y` (or each
+element of `y`) will be projected onto the `Contravariant3Axis`. In general,
+each covector in `x` will cause `y` (or each corresponding element of `y`) to
+be projected onto the dual axis of the covector. In the future, we may extend
+this behavior to higher-order cotensors.
+""" +rmul_with_projection(x, y, lg) = + rmap((x′, y′) -> mul_with_projection(x′, y′, lg), x, y) +rmul_with_projection(x::SingleValue, y, lg) = + rmap(y′ -> mul_with_projection(x, y′, lg), y) +rmul_with_projection(x, y::SingleValue, lg) = + rmap(x′ -> mul_with_projection(x′, y, lg), x) +rmul_with_projection(x::SingleValue, y::SingleValue, lg) = + mul_with_projection(x, y, lg) diff --git a/src/RecursiveApply/RecursiveApply.jl b/src/RecursiveApply/RecursiveApply.jl index 2309cfa0ce..dfe43ed074 100755 --- a/src/RecursiveApply/RecursiveApply.jl +++ b/src/RecursiveApply/RecursiveApply.jl @@ -92,6 +92,13 @@ rmaptype( T2 <: NamedTuple{names, Tup2}, } = NamedTuple{names, rmaptype(fn, Tup1, Tup2)} +""" + rpromote_type(Ts...) + +Recursively apply `promote_type` to the input types. +""" +rpromote_type(Ts...) = reduce((T1, T2) -> rmaptype(promote_type, T1, T2), Ts) + """ rzero(T) @@ -105,6 +112,17 @@ rzero(::Type{T}) where {T <: Tuple} = rzero(::Type{Tup}) where {names, T, Tup <: NamedTuple{names, T}} = NamedTuple{names}(rzero(T)) +""" + rconvert(T, X) + +Identical to `convert(T, X)`, but with improved type stability for nested types. +""" +rconvert(::Type{T}, X::T) where {T} = X +rconvert(::Type{T}, X) where {T} = + rmap((zero_value, x) -> convert(typeof(zero_value), x), rzero(T), X) +# TODO: Remove this function once Julia's default convert function is +# type-stable for nested Tuple/NamedTuple types. 
+ """ rmul(X, Y) X ⊠ Y diff --git a/test/Geometry/axistensors.jl b/test/Geometry/axistensors.jl index de8c46c061..93bde79693 100644 --- a/test/Geometry/axistensors.jl +++ b/test/Geometry/axistensors.jl @@ -27,6 +27,12 @@ ClimaCore.Geometry.assert_exact_transform() = true @test M[:, 1] == Geometry.Cartesian12Vector(1.0, 0.5) @test M[1, :] == Geometry.Covariant12Vector(1.0, 0.0) + @test x + zero(x) == x + @test x' + zero(x') == x' + + @test -x + x * 2 - x / 2 == -x + 2 * x - 2 \ x == x / 2 + @test -x' + x' * 2 - x' / 2 == -x' + 2 * x' - 2 \ x' == (x / 2)' + @test x * y' == x ⊗ y == Geometry.AxisTensor( diff --git a/test/MatrixFields/band_matrix_row.jl b/test/MatrixFields/band_matrix_row.jl new file mode 100644 index 0000000000..0077948b4a --- /dev/null +++ b/test/MatrixFields/band_matrix_row.jl @@ -0,0 +1,66 @@ +using Test +using JET +using LinearAlgebra: I + +using ClimaCore.MatrixFields +import ClimaCore: Geometry + +macro test_all(expression) + return quote + local test_func() = $(esc(expression)) + @test test_func() # correctness + @test (@allocated test_func()) == 0 # allocations + @test_opt test_func() # type instabilities + end +end + +@testset "BandMatrixRow Unit Tests" begin + @test_all DiagonalMatrixRow(1) == + DiagonalMatrixRow(0.5) + DiagonalMatrixRow(1 // 2) == + DiagonalMatrixRow(1.5) - DiagonalMatrixRow(1 // 2) == + DiagonalMatrixRow(0.5) * 2 == + 0.5 * DiagonalMatrixRow(2) == + DiagonalMatrixRow(2) / 2 == + I + + @test_all DiagonalMatrixRow(1 // 2) + 0.5 * I === DiagonalMatrixRow(1.0) + @test_all BidiagonalMatrixRow(1 // 2, 0.5) === BidiagonalMatrixRow(1, 1) / 2 + + @test_all convert(TridiagonalMatrixRow{Int}, DiagonalMatrixRow(1)) === + convert(TridiagonalMatrixRow{Int}, I) === + TridiagonalMatrixRow(0, 1, 0) + + @test_all QuaddiagonalMatrixRow(0.5, 1, 1, 1 // 2) + + BidiagonalMatrixRow(-0.5, -1 // 2) == + QuaddiagonalMatrixRow(1, 1, 1, 1) / 2 + @test_all PentadiagonalMatrixRow(0, 0.5, 1, 1 // 2, 0) - + TridiagonalMatrixRow(1, 0, 1) / 2 - 0.5 * 
DiagonalMatrixRow(2) == + PentadiagonalMatrixRow(0, 0, 0, 0, 0) + + @test_all PentadiagonalMatrixRow(0, 0.5, 1, 1 // 2, 0) - + TridiagonalMatrixRow(1, 0, 1) / 2 - I == + zero(PentadiagonalMatrixRow{Int}) + + T(value) = (; a = (), b = value, c = (value, (; d = (value,)), (;))) + @test_all QuaddiagonalMatrixRow(T(0.5), T(1), T(1), T(1 // 2)) + + BidiagonalMatrixRow(T(-0.5), T(-1 // 2)) == + QuaddiagonalMatrixRow(T(1), T(1), T(1), T(1)) / 2 + @test_all PentadiagonalMatrixRow(T(0), T(0.5), T(1), T(1 // 2), T(0)) - + TridiagonalMatrixRow(T(1), T(0), T(1)) / 2 - + 0.5 * DiagonalMatrixRow(T(2)) == + PentadiagonalMatrixRow(T(0), T(0), T(0), T(0), T(0)) + + @test_throws "Cannot promote" BidiagonalMatrixRow(1, 1) + I + @test_throws "Cannot promote" BidiagonalMatrixRow(1, 1) + + DiagonalMatrixRow(1) + + @test_throws "Cannot convert" convert(BidiagonalMatrixRow{Int}, I) + @test_throws "Cannot convert" convert( + BidiagonalMatrixRow{Int}, + DiagonalMatrixRow(1), + ) + @test_throws "Cannot convert" convert( + TridiagonalMatrixRow{Int}, + PentadiagonalMatrixRow(0, 0, 1, 0, 0), + ) +end diff --git a/test/MatrixFields/field2arrays.jl b/test/MatrixFields/field2arrays.jl new file mode 100644 index 0000000000..c47595f08a --- /dev/null +++ b/test/MatrixFields/field2arrays.jl @@ -0,0 +1,82 @@ +using Test +using JET + +import ClimaCore: Geometry, Domains, Meshes, Spaces, Fields, MatrixFields + +@testset "field2arrays Unit Tests" begin + FT = Float64 + domain = Domains.IntervalDomain( + Geometry.ZPoint(FT(1)), + Geometry.ZPoint(FT(4)); + boundary_tags = (:bottom, :top), + ) + mesh = Meshes.IntervalMesh(domain, nelems = 3) + center_space = Spaces.CenterFiniteDifferenceSpace(mesh) + face_space = Spaces.FaceFiniteDifferenceSpace(center_space) + ᶜz = Fields.coordinate_field(center_space).z + ᶠz = Fields.coordinate_field(face_space).z + + ᶜᶜmat = map(z -> MatrixFields.TridiagonalMatrixRow(2 * z, 4 * z, 8 * z), ᶜz) + ᶜᶠmat = map(z -> MatrixFields.BidiagonalMatrixRow(2 * z, 4 * z), ᶜz) + ᶠᶠmat = 
map(z -> MatrixFields.TridiagonalMatrixRow(2 * z, 4 * z, 8 * z), ᶠz) + ᶠᶜmat = map(z -> MatrixFields.BidiagonalMatrixRow(2 * z, 4 * z), ᶠz) + + @test MatrixFields.column_field2array(ᶜz) == + MatrixFields.column_field2array_view(ᶜz) == + [1.5, 2.5, 3.5] + + @test MatrixFields.column_field2array(ᶠz) == + MatrixFields.column_field2array_view(ᶠz) == + [1, 2, 3, 4] + + @test MatrixFields.column_field2array(ᶜᶜmat) == + MatrixFields.column_field2array_view(ᶜᶜmat) == + [ + 6 12 0 + 5 10 20 + 0 7 14 + ] + + @test MatrixFields.column_field2array(ᶜᶠmat) == + MatrixFields.column_field2array_view(ᶜᶠmat) == + [ + 3 6 0 0 + 0 5 10 0 + 0 0 7 14 + ] + + @test MatrixFields.column_field2array(ᶠᶠmat) == + MatrixFields.column_field2array_view(ᶠᶠmat) == + [ + 4 8 0 0 + 4 8 16 0 + 0 6 12 24 + 0 0 8 16 + ] + + @test MatrixFields.column_field2array(ᶠᶜmat) == + MatrixFields.column_field2array_view(ᶠᶜmat) == + [ + 4 0 0 + 4 8 0 + 0 6 12 + 0 0 8 + ] + + ᶜᶜmat_array_not_view = MatrixFields.column_field2array(ᶜᶜmat) + ᶜᶜmat_array_view = MatrixFields.column_field2array_view(ᶜᶜmat) + ᶜᶜmat .*= 2 + @test ᶜᶜmat_array_not_view == MatrixFields.column_field2array(ᶜᶜmat) ./ 2 + @test ᶜᶜmat_array_view == MatrixFields.column_field2array(ᶜᶜmat) + + @test MatrixFields.field2arrays(ᶜᶜmat) == + (MatrixFields.column_field2array(ᶜᶜmat),) + + # Check for type instabilities. + @test_opt MatrixFields.column_field2array(ᶜᶜmat) + @test_opt MatrixFields.column_field2array_view(ᶜᶜmat) + @test_opt MatrixFields.field2arrays(ᶜᶜmat) + + # Because this test is broken, printing matrix fields allocates some memory. + @test_broken (@allocated MatrixFields.column_field2array_view(ᶜᶜmat)) == 0 +end diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl new file mode 100644 index 0000000000..b18585c0b1 --- /dev/null +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -0,0 +1,816 @@ +using Test +using JET +using Random: seed! +using LinearAlgebra: I, mul! 
+using BandedMatrices: band + +using ClimaCore.MatrixFields +import ClimaCore: + Geometry, Domains, Meshes, Topologies, Hypsography, Spaces, Fields +import ClimaComms + +# Using @benchmark from BenchmarkTools is extremely slow; it appears to keep +# triggering recompilations and allocating a lot of memory in the process. +# This macro returns the minimum time (in seconds) required to run the +# expression after it has been compiled. +macro benchmark(expression) + return quote + $(esc(expression)) # Compile the expression first. Use esc for hygiene. + best_time = Inf + start_time = time_ns() + while time_ns() - start_time < 1e8 # Benchmark for 0.1 s (1e8 ns). + best_time = min(best_time, @elapsed $(esc(expression))) + end + best_time + end +end + +# This has to be its own function in order to correctly measure its allocations. +call_array_func( + ref_set_result!::F, + ref_result_arrays, + inputs_arrays, + temp_values_arrays, +) where {F} = foreach( + ref_set_result!, + ref_result_arrays, + inputs_arrays..., + temp_values_arrays..., +) + +function test_matrix_broadcast_against_array_reference(; + test_name, + inputs, + get_result::F1, + set_result!::F2, + temp_values = (), + ref_set_result!::F3, + time_ratio_limit = 1, + max_error_limit = 3, +) where {F1, F2, F3} + @testset "$test_name" begin + result = get_result(inputs...) + + # Fill all output fields with NaNs for testing correctness. + result .*= NaN + for temp_value in temp_values + temp_value .*= NaN + end + + ref_result_arrays = MatrixFields.field2arrays(result) + inputs_arrays = map(MatrixFields.field2arrays, inputs) + temp_values_arrays = map(MatrixFields.field2arrays, temp_values) + + best_time = @benchmark set_result!(result, inputs...) + best_ref_time = @benchmark call_array_func( + ref_set_result!, + ref_result_arrays, + inputs_arrays, + temp_values_arrays, + ) + + # Compute the maximum error as an integer multiple of machine epsilon. 
+ result_arrays = MatrixFields.field2arrays(result) + max_error = + maximum(zip(result_arrays, ref_result_arrays)) do (array, ref_array) + maximum(zip(array, ref_array)) do (value, ref_value) + Int(abs(value - ref_value) / eps(ref_value)) + end + end + + @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \ + = $best_ref_time s\n\tMaximum Error = $max_error eps" + + # Test that set_result! is performant compared to ref_set_result!. + @test best_time < time_ratio_limit * best_ref_time + + # Test set_result! for correctness, allocations, and type instabilities. + @test max_error <= max_error_limit + @test (@allocated set_result!(result, inputs...)) == 0 + @test_opt set_result!(result, inputs...) + + # Test ref_set_result! for allocations and type instabilities. This is + # helpful for ensuring that the performance comparison is fair. + @test (@allocated call_array_func( + ref_set_result!, + ref_result_arrays, + inputs_arrays, + temp_values_arrays, + )) == 0 + @test_opt call_array_func( + ref_set_result!, + ref_result_arrays, + inputs_arrays, + temp_values_arrays, + ) + + # Test get_result (the allocating version of set_result!) for type + # instabilities. + @test_opt get_result(inputs...) + end +end + +function test_matrix_broadcast_against_reference(; + test_name, + inputs, + get_result::F1, + set_result!::F2, + ref_inputs, + ref_set_result!::F3, +) where {F1, F2, F3} + @testset "$test_name" begin + result = get_result(inputs...) + + # Fill the output field with NaNs for testing correctness. + result .*= NaN + + ref_result = copy(result) + + best_time = @benchmark set_result!(result, inputs...) + best_ref_time = @benchmark ref_set_result!(ref_result, ref_inputs...) + + @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \ + = $best_ref_time s" + + # Test that set_result! is performant compared to ref_set_result!. + @test best_time < best_ref_time + + # Test set_result! for correctness, allocations, and type instabilities. 
+ @test result == ref_result + @test (@allocated set_result!(result, inputs...)) == 0 + @test_opt set_result!(result, inputs...) + + # Test ref_set_result! for allocations and type instabilities. This is + # helpful for ensuring that the performance comparison is fair. + @test (@allocated ref_set_result!(ref_result, ref_inputs...)) == 0 + @test_opt ref_set_result!(ref_result, ref_inputs...) + + # Test get_result (the allocating version of set_result!) for type + # instabilities. + @test_opt get_result(inputs...) + end +end + +function random_test_fields(::Type{FT}) where {FT} + velem = 20 # This should be big enough to test high-bandwidth matrices. + helem = npoly = 1 # These should be small enough for the tests to be fast. + + hdomain = Domains.SphereDomain(FT(10)) + hmesh = Meshes.EquiangularCubedSphere(hdomain, helem) + htopology = Topologies.Topology2D(ClimaComms.SingletonCommsContext(), hmesh) + quad = Spaces.Quadratures.GLL{npoly + 1}() + hspace = Spaces.SpectralElementSpace2D(htopology, quad) + vdomain = Domains.IntervalDomain( + Geometry.ZPoint(FT(0)), + Geometry.ZPoint(FT(10)); + boundary_tags = (:bottom, :top), + ) + vmesh = Meshes.IntervalMesh(vdomain, nelems = velem) + vspace = Spaces.CenterFiniteDifferenceSpace(vmesh) + sfc_coord = Fields.coordinate_field(hspace) + hypsography = Hypsography.LinearAdaption( + @. 
cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1 + ) + center_space = + Spaces.ExtrudedFiniteDifferenceSpace(hspace, vspace, hypsography) + face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(center_space) + ᶜcoord = Fields.coordinate_field(center_space) + ᶠcoord = Fields.coordinate_field(face_space) + + seed!(1) # ensures reproducibility + ᶜᶜmat = map(_ -> DiagonalMatrixRow(rand(FT, 1)...), ᶜcoord) + ᶜᶠmat = map(_ -> BidiagonalMatrixRow(rand(FT, 2)...), ᶜcoord) + ᶠᶠmat = map(_ -> TridiagonalMatrixRow(rand(FT, 3)...), ᶠcoord) + ᶠᶜmat = map(_ -> QuaddiagonalMatrixRow(rand(FT, 4)...), ᶠcoord) + ᶜvec = map(_ -> rand(FT), ᶜcoord) + ᶠvec = map(_ -> rand(FT), ᶠcoord) + + return ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec +end + +@testset "Scalar Matrix Field Broadcasting" begin + ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec = random_test_fields(Float64) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times vector", + inputs = (ᶜᶜmat, ᶜvec), + get_result = (ᶜᶜmat, ᶜvec) -> (@. ᶜᶜmat ⋅ ᶜvec), + set_result! = (result, ᶜᶜmat, ᶜvec) -> (@. result = ᶜᶜmat ⋅ ᶜvec), + ref_set_result! = (_result, _ᶜᶜmat, _ᶜvec) -> + mul!(_result, _ᶜᶜmat, _ᶜvec), + time_ratio_limit = 2, # This case's ref function is fast on Buildkite. + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "tri-diagonal matrix times vector", + inputs = (ᶠᶠmat, ᶠvec), + get_result = (ᶠᶠmat, ᶠvec) -> (@. ᶠᶠmat ⋅ ᶠvec), + set_result! = (result, ᶠᶠmat, ᶠvec) -> (@. result = ᶠᶠmat ⋅ ᶠvec), + ref_set_result! = (_result, _ᶠᶠmat, _ᶠvec) -> + mul!(_result, _ᶠᶠmat, _ᶠvec), + time_ratio_limit = 2, # This case's ref function is fast on Buildkite. + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "quad-diagonal matrix times vector", + inputs = (ᶠᶜmat, ᶜvec), + get_result = (ᶠᶜmat, ᶜvec) -> (@. ᶠᶜmat ⋅ ᶜvec), + set_result! = (result, ᶠᶜmat, ᶜvec) -> (@. result = ᶠᶜmat ⋅ ᶜvec), + ref_set_result! 
= (_result, _ᶠᶜmat, _ᶜvec) -> + mul!(_result, _ᶠᶜmat, _ᶜvec), + time_ratio_limit = 5, # This case's ref function is fast on Buildkite. + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times bi-diagonal matrix", + inputs = (ᶜᶜmat, ᶜᶠmat), + get_result = (ᶜᶜmat, ᶜᶠmat) -> (@. ᶜᶜmat ⋅ ᶜᶠmat), + set_result! = (result, ᶜᶜmat, ᶜᶠmat) -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat), + ref_set_result! = (_result, _ᶜᶜmat, _ᶜᶠmat) -> + mul!(_result, _ᶜᶜmat, _ᶜᶠmat), + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "tri-diagonal matrix times tri-diagonal matrix", + inputs = (ᶠᶠmat,), + get_result = (ᶠᶠmat,) -> (@. ᶠᶠmat ⋅ ᶠᶠmat), + set_result! = (result, ᶠᶠmat) -> (@. result = ᶠᶠmat ⋅ ᶠᶠmat), + ref_set_result! = (_result, _ᶠᶠmat) -> mul!(_result, _ᶠᶠmat, _ᶠᶠmat), + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "quad-diagonal matrix times diagonal matrix", + inputs = (ᶠᶜmat, ᶜᶜmat), + get_result = (ᶠᶜmat, ᶜᶜmat) -> (@. ᶠᶜmat ⋅ ᶜᶜmat), + set_result! = (result, ᶠᶜmat, ᶜᶜmat) -> (@. result = ᶠᶜmat ⋅ ᶜᶜmat), + ref_set_result! = (_result, _ᶠᶜmat, _ᶜᶜmat) -> + mul!(_result, _ᶠᶜmat, _ᶜᶜmat), + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times bi-diagonal matrix times \ + tri-diagonal matrix times quad-diagonal matrix", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), + temp_values = ((@. ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat)), + ref_set_result! 
= ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + ) -> begin + mul!(_temp1, _ᶜᶜmat, _ᶜᶠmat) + mul!(_temp2, _temp1, _ᶠᶠmat) + mul!(_result, _temp2, _ᶠᶜmat) + end, + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times bi-diagonal matrix times \ + tri-diagonal matrix times quad-diagonal matrix, but with \ + forced right-associativity", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), + temp_values = ((@. ᶠᶠmat ⋅ ᶠᶜmat), (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + ) -> begin + mul!(_temp1, _ᶠᶠmat, _ᶠᶜmat) + mul!(_temp2, _ᶜᶠmat, _temp1) + mul!(_result, _ᶜᶜmat, _temp2) + end, + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times bi-diagonal matrix times \ + tri-diagonal matrix times quad-diagonal matrix times \ + vector", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> + (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> + (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec), + temp_values = ( + (@. ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat), + (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), + ), + ref_set_result! 
= ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _ᶜvec, + _temp1, + _temp2, + _temp3, + ) -> begin + mul!(_temp1, _ᶜᶜmat, _ᶜᶠmat) + mul!(_temp2, _temp1, _ᶠᶠmat) + mul!(_temp3, _temp2, _ᶠᶜmat) + mul!(_result, _temp3, _ᶜvec) + end, + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "diagonal matrix times bi-diagonal matrix times \ + tri-diagonal matrix times quad-diagonal matrix times \ + vector, but with forced right-associativity", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> + (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> + (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))), + temp_values = ( + (@. ᶠᶜmat ⋅ ᶜvec), + (@. ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)), + (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec))), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _ᶜvec, + _temp1, + _temp2, + _temp3, + ) -> begin + mul!(_temp1, _ᶠᶜmat, _ᶜvec) + mul!(_temp2, _ᶠᶠmat, _temp1) + mul!(_temp3, _ᶜᶠmat, _temp2) + mul!(_result, _ᶜᶜmat, _temp3) + end, + time_ratio_limit = 15, # This case's ref function is fast on Buildkite. + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "linear combination of matrix products and LinearAlgebra.I", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. result = 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), + temp_values = ( + (@. 2 * ᶠᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat ⋅ ᶠᶠmat), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + _temp3, + _temp4, + ) -> begin + @. _temp1 = 0 + 2 * _ᶠᶜmat # This allocates without the `0 + `. 
+ mul!(_temp2, _temp1, _ᶜᶜmat) + mul!(_temp3, _temp2, _ᶜᶠmat) + mul!(_temp4, _ᶠᶠmat, _ᶠᶠmat) + copyto!(_result, 4I) # We can't directly use I in array broadcasts. + @. _result = _temp3 + _temp4 / 3 - _result + end, + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "another linear combination of matrix products and \ + LinearAlgebra.I", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), + temp_values = ( + (@. ᶠᶜmat ⋅ ᶜᶜmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat / 3), + (@. (ᶠᶠmat / 3) ⋅ ᶠᶠmat), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + _temp3, + _temp4, + ) -> begin + mul!(_temp1, _ᶠᶜmat, _ᶜᶜmat) + mul!(_temp2, _temp1, _ᶜᶠmat) + @. _temp3 = 0 + _ᶠᶠmat / 3 # This allocates without the `0 + `. + mul!(_temp4, _temp3, _ᶠᶠmat) + copyto!(_result, 4I) # We can't directly use I in array broadcasts. + @. _result = _temp2 * 2 - _temp4 + _result + end, + max_error_limit = 512, # Why is there a 512 * eps error on Buildkite? + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "matrix times linear combination", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + ᶜᶠmat ⋅ (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), + temp_values = ( + (@. 2 * ᶠᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat ⋅ ᶠᶠmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + _temp3, + _temp4, + _temp5, + ) -> begin + @. 
_temp1 = 0 + 2 * _ᶠᶜmat # This allocates without the `0 + `. + mul!(_temp2, _temp1, _ᶜᶜmat) + mul!(_temp3, _temp2, _ᶜᶠmat) + mul!(_temp4, _ᶠᶠmat, _ᶠᶠmat) + copyto!(_temp5, 4I) # We can't directly use I in array broadcasts. + @. _temp5 = _temp3 + _temp4 / 3 - _temp5 + mul!(_result, _ᶜᶠmat, _temp5) + end, + max_error_limit = 276, # Why is there a 276 * eps error on Buildkite? + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "linear combination times another linear combination", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + (@. (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))), + temp_values = ( + (@. 2 * ᶠᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat ⋅ ᶠᶠmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), + (@. ᶠᶜmat ⋅ ᶜᶜmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat / 3), + (@. (ᶠᶠmat / 3) ⋅ ᶠᶠmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + _temp3, + _temp4, + _temp5, + _temp6, + _temp7, + _temp8, + _temp9, + _temp10, + ) -> begin + @. _temp1 = 0 + 2 * _ᶠᶜmat # This allocates without the `0 + `. + mul!(_temp2, _temp1, _ᶜᶜmat) + mul!(_temp3, _temp2, _ᶜᶠmat) + mul!(_temp4, _ᶠᶠmat, _ᶠᶠmat) + copyto!(_temp5, 4I) # We can't directly use I in array broadcasts. + @. _temp5 = _temp3 + _temp4 / 3 - _temp5 + mul!(_temp6, _ᶠᶜmat, _ᶜᶜmat) + mul!(_temp7, _temp6, _ᶜᶠmat) + @. _temp8 = 0 + _ᶠᶠmat / 3 # This allocates without the `0 + `. + mul!(_temp9, _temp8, _ᶠᶠmat) + copyto!(_temp10, 4I) # We can't directly use I in array broadcasts. + @. 
_temp10 = _temp7 * 2 - _temp9 + _temp10 + mul!(_result, _temp5, _temp10) + end, + max_error_limit = 44, # Why is there a 44 * eps error on Buildkite? + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "matrix times matrix times linear combination times matrix \ + times another linear combination times matrix", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + ᶠᶠmat ⋅ + (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)) ⋅ + ᶠᶠmat), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + ᶠᶜmat ⋅ ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + ᶠᶠmat ⋅ + (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)) ⋅ + ᶠᶠmat), + temp_values = ( + (@. ᶠᶜmat ⋅ ᶜᶠmat), + (@. 2 * ᶠᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat ⋅ ᶠᶠmat), + (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), + (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), + (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + ᶠᶠmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), + (@. ᶠᶠmat / 3), + (@. (ᶠᶠmat / 3) ⋅ ᶠᶠmat), + (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), + (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅ + (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ + ᶠᶠmat ⋅ + (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))), + ), + ref_set_result! = ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _temp1, + _temp2, + _temp3, + _temp4, + _temp5, + _temp6, + _temp7, + _temp8, + _temp9, + _temp10, + _temp11, + _temp12, + _temp13, + _temp14, + ) -> begin + mul!(_temp1, _ᶠᶜmat, _ᶜᶠmat) + @. _temp2 = 0 + 2 * _ᶠᶜmat # This allocates without the `0 + `. + mul!(_temp3, _temp2, _ᶜᶜmat) + mul!(_temp4, _temp3, _ᶜᶠmat) + mul!(_temp5, _ᶠᶠmat, _ᶠᶠmat) + copyto!(_temp6, 4I) # We can't directly use I in array broadcasts. 
+ @. _temp6 = _temp4 + _temp5 / 3 - _temp6 + mul!(_temp7, _temp1, _temp6) + mul!(_temp8, _temp7, _ᶠᶠmat) + mul!(_temp9, _ᶠᶜmat, _ᶜᶜmat) + mul!(_temp10, _temp9, _ᶜᶠmat) + @. _temp11 = 0 + _ᶠᶠmat / 3 # This allocates without the `0 + `. + mul!(_temp12, _temp11, _ᶠᶠmat) + copyto!(_temp13, 4I) # We can't directly use I in array broadcasts. + @. _temp13 = _temp10 * 2 - _temp12 + _temp13 + mul!(_temp14, _temp8, _temp13) + mul!(_result, _temp14, _ᶠᶠmat) + end, + time_ratio_limit = 2, # This case's ref function is fast on Buildkite. + max_error_limit = 3038, # Why is there a 3038 * eps error on Buildkite? + ) + + test_matrix_broadcast_against_array_reference(; + test_name = "matrix constructions and multiplications", + inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec), + get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) -> + (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ + TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅ + DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)), + set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) -> + (@. result = + BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ + TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅ + DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)), + temp_values = ( + (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec)), + (@. TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1)), + (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ + TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1)), + (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ + TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat), + (@. DiagonalMatrixRow(ᶠvec)), + (@. DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)), + ), + ref_set_result! 
= ( + _result, + _ᶜᶜmat, + _ᶜᶠmat, + _ᶠᶠmat, + _ᶠᶜmat, + _ᶜvec, + _ᶠvec, + _temp1, + _temp2, + _temp3, + _temp4, + _temp5, + _temp6, + ) -> begin + mul!(view(_temp1, band(0)), _ᶜᶠmat, _ᶠvec) + mul!(view(_temp1, band(1)), _ᶜᶜmat, _ᶜvec) + copyto!(view(_temp2, band(-1)), 1, _ᶠvec, 2) + mul!(view(_temp2, band(0)), _ᶠᶜmat, _ᶜvec) + fill!(view(_temp2, band(1)), 1) + mul!(_temp3, _temp1, _temp2) + mul!(_temp4, _temp3, _ᶠᶠmat) + copyto!(view(_temp5, band(0)), 1, _ᶠvec, 1) + mul!(view(_temp6, band(0)), _temp5, _ᶠvec) + mul!(_result, _temp4, _temp6) + end, + ) +end + +@testset "Non-scalar Matrix Field Broadcasting" begin + ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec = random_test_fields(Float64) + + ᶜlg = Fields.local_geometry_field(ᶜvec) + ᶠlg = Fields.local_geometry_field(ᶠvec) + + ᶜᶠmat2 = map(row -> map(sin, row), ᶜᶠmat) + ᶜᶠmat3 = map(row -> map(cos, row), ᶜᶠmat) + ᶠᶜmat2 = map(row -> map(sin, row), ᶠᶜmat) + ᶠᶜmat3 = map(row -> map(cos, row), ᶠᶜmat) + + ᶜᶠmat_AC1 = map(row -> map(adjoint ∘ Geometry.Covariant1Vector, row), ᶜᶠmat) + ᶜᶠmat_C12 = map( + (row1, row2) -> map(Geometry.Covariant12Vector, row1, row2), + ᶜᶠmat2, + ᶜᶠmat3, + ) + ᶠᶜmat_AC1 = map(row -> map(adjoint ∘ Geometry.Covariant1Vector, row), ᶠᶜmat) + ᶠᶜmat_C12 = map( + (row1, row2) -> map(Geometry.Covariant12Vector, row1, row2), + ᶠᶜmat2, + ᶠᶜmat3, + ) + + test_matrix_broadcast_against_reference(; + test_name = "matrix of covectors times matrix of vectors", + inputs = (ᶜᶠmat_AC1, ᶠᶜmat_C12), + get_result = (ᶜᶠmat_AC1, ᶠᶜmat_C12) -> (@. ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12), + set_result! = (result, ᶜᶠmat_AC1, ᶠᶜmat_C12) -> + (@. result = ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12), + ref_inputs = (ᶜᶠmat, ᶠᶜmat2, ᶠᶜmat3, ᶠlg), + ref_set_result! = (result, ᶜᶠmat, ᶠᶜmat2, ᶠᶜmat3, ᶠlg) -> (@. 
result = + ᶜᶠmat ⋅ ( + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 + + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3 + )), + ) + + test_matrix_broadcast_against_reference(; + test_name = "matrix of covectors times matrix of vectors times matrix \ + of numbers times matrix of covectors times matrix of \ + vectors", + inputs = (ᶜᶠmat_AC1, ᶠᶜmat_C12, ᶜᶠmat, ᶠᶜmat_AC1, ᶜᶠmat_C12), + get_result = (ᶜᶠmat_AC1, ᶠᶜmat_C12, ᶜᶠmat, ᶠᶜmat_AC1, ᶜᶠmat_C12) -> + (@. ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12 ⋅ ᶜᶠmat ⋅ ᶠᶜmat_AC1 ⋅ ᶜᶠmat_C12), + set_result! = ( + result, + ᶜᶠmat_AC1, + ᶠᶜmat_C12, + ᶜᶠmat, + ᶠᶜmat_AC1, + ᶜᶠmat_C12, + ) -> + (@. result = ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12 ⋅ ᶜᶠmat ⋅ ᶠᶜmat_AC1 ⋅ ᶜᶠmat_C12), + ref_inputs = (ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3, ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3, ᶜlg, ᶠlg), + ref_set_result! = ( + result, + ᶜᶠmat, + ᶜᶠmat2, + ᶜᶠmat3, + ᶠᶜmat, + ᶠᶜmat2, + ᶠᶜmat3, + ᶜlg, + ᶠlg, + ) -> (@. result = + ᶜᶠmat ⋅ ( + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 + + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3 + ) ⋅ ᶜᶠmat ⋅ ᶠᶜmat ⋅ ( + DiagonalMatrixRow(ᶜlg.gⁱʲ.components.data.:1) ⋅ ᶜᶠmat2 + + DiagonalMatrixRow(ᶜlg.gⁱʲ.components.data.:2) ⋅ ᶜᶠmat3 + )), + ) + + ᶜᶠmat_AC1_num = + map((row1, row2) -> map(tuple, row1, row2), ᶜᶠmat_AC1, ᶜᶠmat) + ᶜᶠmat_num_C12 = + map((row1, row2) -> map(tuple, row1, row2), ᶜᶠmat, ᶜᶠmat_C12) + ᶠᶜmat_C12_AC1 = + map((row1, row2) -> map(tuple, row1, row2), ᶠᶜmat_C12, ᶠᶜmat_AC1) + + test_matrix_broadcast_against_reference(; + test_name = "matrix of covectors and numbers times matrix of vectors \ + and covectors times matrix of numbers and vectors times \ + vector of numbers", + inputs = (ᶜᶠmat_AC1_num, ᶠᶜmat_C12_AC1, ᶜᶠmat_num_C12, ᶠvec), + get_result = (ᶜᶠmat_AC1_num, ᶠᶜmat_C12_AC1, ᶜᶠmat_num_C12, ᶠvec) -> + (@. ᶜᶠmat_AC1_num ⋅ ᶠᶜmat_C12_AC1 ⋅ ᶜᶠmat_num_C12 ⋅ ᶠvec), + set_result! = ( + result, + ᶜᶠmat_AC1_num, + ᶠᶜmat_C12_AC1, + ᶜᶠmat_num_C12, + ᶠvec, + ) -> (@. 
result = ᶜᶠmat_AC1_num ⋅ ᶠᶜmat_C12_AC1 ⋅ ᶜᶠmat_num_C12 ⋅ ᶠvec), + ref_inputs = ( + ᶜᶠmat, + ᶜᶠmat2, + ᶜᶠmat3, + ᶠᶜmat, + ᶠᶜmat2, + ᶠᶜmat3, + ᶠvec, + ᶜlg, + ᶠlg, + ), + ref_set_result! = ( + result, + ᶜᶠmat, + ᶜᶠmat2, + ᶜᶠmat3, + ᶠᶜmat, + ᶠᶜmat2, + ᶠᶜmat3, + ᶠvec, + ᶜlg, + ᶠlg, + ) -> (@. result = tuple( + ᶜᶠmat ⋅ ( + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 + + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3 + ) ⋅ ᶜᶠmat ⋅ ᶠvec, + ᶜᶠmat ⋅ ᶠᶜmat ⋅ ( + DiagonalMatrixRow(ᶜlg.gⁱʲ.components.data.:1) ⋅ ᶜᶠmat2 + + DiagonalMatrixRow(ᶜlg.gⁱʲ.components.data.:2) ⋅ ᶜᶠmat3 + ) ⋅ ᶠvec, + )), + ) + + T(value1, value2, value3) = + (; a = (), b = value1, c = (value2, (; d = (value3,)), (;))) + ᶜᶠmat_T = map((rows...) -> map(T, rows...), ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3) + ᶠᶜmat_T = map((rows...) -> map(T, rows...), ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3) + ᶜvec_T = @. T(ᶜvec, ᶜvec, ᶜvec) + + test_matrix_broadcast_against_reference(; + test_name = "matrix of nested values times matrix of nested values \ + times matrix of numbers times matrix of numbers times \ + times vector of nested values", + inputs = (ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T), + get_result = (ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T) -> + (@. ᶜᶠmat_T ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_T ⋅ ᶜvec_T), + set_result! = (result, ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T) -> + (@. result = ᶜᶠmat_T ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_T ⋅ ᶜvec_T), + ref_inputs = (ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3, ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3, ᶜvec), + ref_set_result! = ( + result, + ᶜᶠmat, + ᶜᶠmat2, + ᶜᶠmat3, + ᶠᶜmat, + ᶠᶜmat2, + ᶠᶜmat3, + ᶜvec, + ) -> (@. result = T( + ᶜᶠmat ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec, + ᶜᶠmat2 ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat2 ⋅ ᶜvec, + ᶜᶠmat3 ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat3 ⋅ ᶜvec, + )), + ) +end diff --git a/test/MatrixFields/rmul_with_projection.jl b/test/MatrixFields/rmul_with_projection.jl new file mode 100644 index 0000000000..d7f4855865 --- /dev/null +++ b/test/MatrixFields/rmul_with_projection.jl @@ -0,0 +1,123 @@ +using Test +using JET +using Random: seed! 
+using StaticArrays: @SMatrix + +import ClimaCore: Geometry +import ClimaCore.MatrixFields: rmul_with_projection + +function test_rmul_with_projection(x, y, lg, expected_result) + result = rmul_with_projection(x, y, lg) + + # Compute the maximum error as an integer multiple of machine epsilon. + FT = Geometry.undertype(typeof(lg)) + object2tuple(obj) = + reinterpret(NTuple{sizeof(obj) ÷ sizeof(FT), FT}, [obj])[1] + max_error = maximum( + ((value, expected_value),) -> + Int(abs(value - expected_value) / eps(expected_value)), + zip(object2tuple(result), object2tuple(expected_result)), + ) + + @test max_error <= 1 # correctness + @test (@allocated rmul_with_projection(x, y, lg)) == 0 # allocations + @test_opt rmul_with_projection(x, y, lg) # type instabilities +end + +@testset "rmul_with_projection Unit Tests" begin + seed!(1) # ensures reproducibility + + FT = Float64 + coord = Geometry.LatLongZPoint(rand(FT), rand(FT), rand(FT)) + ∂x∂ξ = Geometry.AxisTensor( + (Geometry.LocalAxis{(1, 2, 3)}(), Geometry.CovariantAxis{(1, 2, 3)}()), + (@SMatrix rand(FT, 3, 3)), + ) + lg = Geometry.LocalGeometry(coord, rand(FT), rand(FT), ∂x∂ξ) + + number = rand(FT) + covector = Geometry.Covariant12Vector(rand(FT), rand(FT))' + vector = Geometry.Covariant123Vector(rand(FT), rand(FT), rand(FT)) + tensor = vector * vector' + projected_vector = + Geometry.project(Geometry.Contravariant12Axis(), vector, lg) + projected_tensor = + Geometry.project(Geometry.Contravariant12Axis(), tensor, lg) + + # Test all required combinations of single values. 
+ test_rmul_with_projection(number, number, lg, number * number) + test_rmul_with_projection(number, covector, lg, number * covector) + test_rmul_with_projection(number, vector, lg, number * vector) + test_rmul_with_projection(number, tensor, lg, number * tensor) + test_rmul_with_projection(covector, number, lg, covector * number) + test_rmul_with_projection(vector, number, lg, vector * number) + test_rmul_with_projection(tensor, number, lg, tensor * number) + test_rmul_with_projection(covector, vector, lg, covector * projected_vector) + test_rmul_with_projection(covector, tensor, lg, covector * projected_tensor) + + # Test some combinations of complicated nested values. + T(value1, value2, value3) = + (; a = (), b = value1, c = (value2, (; d = (value3,)), (;))) + test_rmul_with_projection( + number, + T(covector, vector, tensor), + lg, + T(number * covector, number * vector, number * tensor), + ) + test_rmul_with_projection( + T(covector, vector, tensor), + number, + lg, + T(covector * number, vector * number, tensor * number), + ) + test_rmul_with_projection( + vector, + T(number, number, number), + lg, + T(vector * number, vector * number, vector * number), + ) + test_rmul_with_projection( + T(number, number, number), + covector, + lg, + T(number * covector, number * covector, number * covector), + ) + test_rmul_with_projection( + T(number, vector, number), + T(covector, number, tensor), + lg, + T(number * covector, vector * number, number * tensor), + ) + test_rmul_with_projection( + T(covector, number, tensor), + T(number, vector, number), + lg, + T(covector * number, number * vector, tensor * number), + ) + test_rmul_with_projection( + covector, + T(vector, number, tensor), + lg, + T( + covector * projected_vector, + covector * number, + covector * projected_tensor, + ), + ) + test_rmul_with_projection( + T(covector, number, covector), + vector, + lg, + T( + covector * projected_vector, + number * vector, + covector * projected_vector, + ), + ) + 
test_rmul_with_projection( + T(covector, number, covector), + T(number, vector, tensor), + lg, + T(covector * number, number * vector, covector * projected_tensor), + ) +end diff --git a/test/Project.toml b/test/Project.toml index 2df49394e3..9351af89f0 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -2,6 +2,7 @@ Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" AssociatedLegendrePolynomials = "2119f1ac-fb78-50f5-8cc0-dda848ebdb19" +BandedMatrices = "aae01518-5342-5314-be14-df237901396f" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" diff --git a/test/runtests.jl b/test/runtests.jl index f04ac240fd..28967cdd3c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -75,6 +75,12 @@ if !Sys.iswindows() @safetestset "Hybrid - dss opt" begin @time include("Operators/hybrid/dss_opt.jl") end @safetestset "Hybrid - opt" begin @time include("Operators/hybrid/opt.jl") end + @safetestset "MatrixFields - BandMatrixRow" begin @time include("MatrixFields/band_matrix_row.jl") end + @safetestset "MatrixFields - rmul_with_projection" begin @time include("MatrixFields/rmul_with_projection.jl") end + @safetestset "MatrixFields - field2arrays" begin @time include("MatrixFields/field2arrays.jl") end + # now part of buildkite + # @safetestset "MatrixFields - matrix field broadcasting" begin @time include("MatrixFields/matrix_field_broadcasting.jl") end + @safetestset "Hypsography - 2d" begin @time include("Hypsography/2d.jl") end @safetestset "Hypsography - 3d sphere" begin @time include("Hypsography/3dsphere.jl") end From d9337b9f44430591ec52e94a0f28f9b98feaac78 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Tue, 27 Jun 2023 16:43:02 -0700 Subject: [PATCH 02/10] Add GPU unit test --- .buildkite/pipeline.yml | 10 ++++++++-- test/MatrixFields/matrix_field_broadcasting.jl | 6 ++++-- 2 files changed, 12 insertions(+), 4 
deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 950581d805..75bc995f53 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -509,10 +509,16 @@ steps: key: unit_field2arrays command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/field2arrays.jl" - - label: "Unit: matrix field broadcasting" - key: unit_matrix_field_broadcasting + - label: "Unit: matrix field broadcasting (CPU)" + key: unit_matrix_field_broadcasting_cpu command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + - label: "Unit: matrix field broadcasting (GPU)" + key: unit_matrix_field_broadcasting_gpu + command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + agents: + slurm_gpus: 1 + - group: "Unit: Hypsography" steps: diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl index b18585c0b1..714dbaa1c6 100644 --- a/test/MatrixFields/matrix_field_broadcasting.jl +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -155,9 +155,10 @@ function random_test_fields(::Type{FT}) where {FT} velem = 20 # This should be big enough to test high-bandwidth matrices. helem = npoly = 1 # These should be small enough for the tests to be fast. 
+    comms_ctx = ClimaComms.SingletonCommsContext()
     hdomain = Domains.SphereDomain(FT(10))
     hmesh = Meshes.EquiangularCubedSphere(hdomain, helem)
-    htopology = Topologies.Topology2D(ClimaComms.SingletonCommsContext(), hmesh)
+    htopology = Topologies.Topology2D(comms_ctx, hmesh)
     quad = Spaces.Quadratures.GLL{npoly + 1}()
     hspace = Spaces.SpectralElementSpace2D(htopology, quad)
     vdomain = Domains.IntervalDomain(
@@ -166,7 +167,8 @@ function random_test_fields(::Type{FT}) where {FT}
         boundary_tags = (:bottom, :top),
     )
     vmesh = Meshes.IntervalMesh(vdomain, nelems = velem)
-    vspace = Spaces.CenterFiniteDifferenceSpace(vmesh)
+    vtopology = Topologies.IntervalTopology(comms_ctx, vmesh)
+    vspace = Spaces.CenterFiniteDifferenceSpace(vtopology)
     sfc_coord = Fields.coordinate_field(hspace)
     hypsography = Hypsography.LinearAdaption(
         @. cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1

From 3b495d777929767f56b4174f4683a18918162090 Mon Sep 17 00:00:00 2001
From: Dennis Yatunin
Date: Thu, 6 Jul 2023 19:42:09 -0700
Subject: [PATCH 03/10] Fix GPU-related bugs

---
 src/MatrixFields/MatrixFields.jl              |   1 +
 src/MatrixFields/matrix_field_utils.jl        |  94 +++++++-------
 src/MatrixFields/matrix_multiplication.jl     |   2 +-
 src/Operators/common.jl                       |  32 +++--
 src/Operators/finitedifference.jl             |   2 +-
 src/Operators/spectralelement.jl              |   2 +-
 .../MatrixFields/matrix_field_broadcasting.jl | 115 +++++++++++-------
 7 files changed, 148 insertions(+), 100 deletions(-)

diff --git a/src/MatrixFields/MatrixFields.jl b/src/MatrixFields/MatrixFields.jl
index 6ed80c32e3..0a0f241acd 100644
--- a/src/MatrixFields/MatrixFields.jl
+++ b/src/MatrixFields/MatrixFields.jl
@@ -1,5 +1,6 @@
 module MatrixFields
 
+import CUDA: @allowscalar
 import LinearAlgebra: UniformScaling, Adjoint
 import StaticArrays: SArray, SMatrix, SVector
 import BandedMatrices: BandedMatrix, band, _BandedMatrix
diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl
index 706366eba2..4a4d4c350e 100644
--- 
a/src/MatrixFields/matrix_field_utils.jl +++ b/src/MatrixFields/matrix_field_utils.jl @@ -31,12 +31,11 @@ end Converts a field defined on a `FiniteDifferenceSpace` into a `Vector` or a `BandedMatrix`, depending on whether or not the elements of the field are -`BandMatrixRow`s. This involves copying the data stored in the field. +`BandMatrixRow`s. This involves copying the data stored in the field. Because +`BandedMatrix` does not currently support operations with `CuArray`s, all GPU +data is copied to the CPU. """ -function column_field2array(field) - space = axes(field) - space isa Spaces.FiniteDifferenceSpace || - error("column_field2array requires a field on a FiniteDifferenceSpace") +function column_field2array(field::Fields.FiniteDifferenceField) if eltype(field) <: BandMatrixRow # field represents a matrix n_rows, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) matrix = BandedMatrix{eltype(eltype(field))}( @@ -45,47 +44,24 @@ function column_field2array(field) (-matrix_ld, matrix_ud), ) for (index_of_field_entry, matrix_d) in enumerate(matrix_ld:matrix_ud) - # Find the rows for which field_diagonal[row] is inside the matrix. + matrix_diagonal = view(matrix, band(matrix_d)) + field_diagonal = field.entries.:($index_of_field_entry) + field_diagonal_data = + vec(reinterpret(eltype(eltype(field)), parent(field_diagonal)')) + + # Find the rows for which field_diagonal_data[row] is in the matrix. # Note: The matrix index (1, 1) corresponds to the diagonal index 0, # and the matrix index (n_rows, n_cols) corresponds to the diagonal # index n_cols - n_rows. first_row = matrix_d < 0 ? 1 - matrix_d : 1 last_row = matrix_d < n_cols - n_rows ? n_rows : n_cols - matrix_d - # Copy the value in each row from field_diagonal to matrix_diagonal. 
- field_diagonal = field.entries.:($index_of_field_entry) - matrix_diagonal = view(matrix, band(matrix_d)) - for (index_along_diagonal, row) in enumerate(first_row:last_row) - matrix_diagonal[index_along_diagonal] = - Fields.field_values(field_diagonal)[row] - end + @allowscalar matrix_diagonal .= + view(field_diagonal_data, first_row:last_row) end return matrix else # field represents a vector - n_rows = Spaces.nlevels(space) - return map(row -> Fields.field_values(field)[row], 1:n_rows) - end -end - -""" - field2arrays(field) - -Converts a field defined on a `FiniteDifferenceSpace` or on an -`ExtrudedFiniteDifferenceSpace` into a tuple of arrays, each of which -corresponds to a column of the field. This is done by calling -`column_field2array` on each of the field's columns. -""" -function field2arrays(field) - space = axes(field) - column_indices = if space isa Spaces.FiniteDifferenceSpace - (((1, 1), 1),) - elseif space isa Spaces.ExtrudedFiniteDifferenceSpace - (Spaces.all_nodes(Spaces.horizontal_space(space))...,) - else - error("Invalid space type: $(typeof(space).name.wrapper)") - end - return map(column_indices) do ((i, j), h) - column_field2array(Spaces.column(field, i, j, h)) + return @allowscalar Array(column_field2array_view(field)) end end @@ -95,16 +71,13 @@ end Similar to `column_field2array(field)`, except that this version avoids copying the data stored in the field. 
""" -function column_field2array_view(field) - space = axes(field) - space isa Spaces.FiniteDifferenceSpace || - error("column_field2array_view requires a field on a \ - FiniteDifferenceSpace") +function column_field2array_view(field::Fields.FiniteDifferenceField) if eltype(field) <: BandMatrixRow # field represents a matrix n_rows, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) - data_transpose = reinterpret(eltype(eltype(field)), parent(field)') + field_data_transpose = + reinterpret(eltype(eltype(field)), parent(field)') matrix_transpose = - _BandedMatrix(data_transpose, n_cols, matrix_ud, -matrix_ld) + _BandedMatrix(field_data_transpose, n_cols, matrix_ud, -matrix_ld) return permutedims(matrix_transpose) # TODO: Despite not copying any data, this function still allocates a # small amount of memory because of _BandedMatrix and permutedims. @@ -113,6 +86,36 @@ function column_field2array_view(field) end end +column_indices(field::Fields.FiniteDifferenceField) = (((1, 1), 1),) +column_indices(field::Fields.ExtrudedFiniteDifferenceField) = + (Spaces.all_nodes(Spaces.horizontal_space(axes(field)))...,) +column_indices(field) = + error("Invalid space type: $(typeof(axes(field)).name.wrapper)") + +""" + field2arrays(field) + +Converts a field defined on a `FiniteDifferenceSpace` or on an +`ExtrudedFiniteDifferenceSpace` into a tuple of arrays, each of which +corresponds to a column of the field. This is done by calling +`column_field2array` on each of the field's columns. +""" +field2arrays(field) = + map(column_indices(field)) do ((i, j), h) + column_field2array(Spaces.column(field, i, j, h)) + end + +""" + field2arrays_view(field) + +Similar to `field2arrays(field)`, except that this version calls +`column_field2array_view` instead of `column_field2array`. 
+""" +field2arrays_view(field) = + map(column_indices(field)) do ((i, j), h) + column_field2array_view(Spaces.column(field, i, j, h)) + end + function Base.show( io::IO, field::Fields.Field{<:Fields.AbstractData{<:BandMatrixRow}}, @@ -125,7 +128,8 @@ function Base.show( println(io, " whose first column corresponds to the matrix") end column_field = Fields.column(field, 1, 1, 1) - Base.print_array(io, column_field2array_view(column_field)) + io = IOContext(io, :compact => true, :limit => true) + @allowscalar Base.print_array(io, column_field2array_view(column_field)) else # When a BandedMatrix with non-number entries is printed, it currently # either prints in an illegible format (e.g., if it has AxisTensor or diff --git a/src/MatrixFields/matrix_multiplication.jl b/src/MatrixFields/matrix_multiplication.jl index 7f9144d623..45ddaba9bc 100644 --- a/src/MatrixFields/matrix_multiplication.jl +++ b/src/MatrixFields/matrix_multiplication.jl @@ -216,7 +216,7 @@ function Operators.return_eltype( "The first argument of ⋅ must have elements of type BandMatrixRow, but \ the given argument has elements of type $(eltype(matrix1))", ) - lg_type = eltype(Fields.local_geometry_field(axes(arg))) + lg_type = Operators.local_geometry_type(axes(arg)) if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication matrix2 = arg ld1, ud1 = outer_diagonals(eltype(matrix1)) diff --git a/src/Operators/common.jl b/src/Operators/common.jl index 8251ee62b3..07315befdc 100644 --- a/src/Operators/common.jl +++ b/src/Operators/common.jl @@ -65,33 +65,34 @@ end # when sending Broadcasted objects to the GPU, we strip out the space # information at each level of the broadcast tree and Fields, replacing with # PlaceholderSpace. this reduces the amount runtime parameter data we send to -# the GPU, which is quite limited (~4kB). +# the GPU, which is quite limited (~4kB). 
However, we need to keep the eltype of +# the local geometry data, since it may needed in order to infer the return +# eltype of a Field operation. # Functions for CUDASpectralStyle -struct PlaceholderSpace <: Spaces.AbstractSpace end -struct CenterPlaceholderSpace <: Spaces.AbstractSpace end -struct FacePlaceholderSpace <: Spaces.AbstractSpace end - +struct PlaceholderSpace{LG} <: Spaces.AbstractSpace end +struct CenterPlaceholderSpace{LG} <: Spaces.AbstractSpace end +struct FacePlaceholderSpace{LG} <: Spaces.AbstractSpace end placeholder_space(current_space::T, parent_space::T) where {T} = - PlaceholderSpace() + PlaceholderSpace{local_geometry_type(current_space)}() placeholder_space(current_space, parent_space) = current_space placeholder_space( current_space::Spaces.CenterFiniteDifferenceSpace, parent_space::Spaces.FaceFiniteDifferenceSpace, -) = CenterPlaceholderSpace() +) = CenterPlaceholderSpace{local_geometry_type(current_space)}() placeholder_space( current_space::Spaces.CenterExtrudedFiniteDifferenceSpace, parent_space::Spaces.FaceExtrudedFiniteDifferenceSpace, -) = CenterPlaceholderSpace() +) = CenterPlaceholderSpace{local_geometry_type(current_space)}() placeholder_space( current_space::Spaces.FaceFiniteDifferenceSpace, parent_space::Spaces.CenterFiniteDifferenceSpace, -) = FacePlaceholderSpace() +) = FacePlaceholderSpace{local_geometry_type(current_space)}() placeholder_space( current_space::Spaces.FaceExtrudedFiniteDifferenceSpace, parent_space::Spaces.CenterExtrudedFiniteDifferenceSpace, -) = FacePlaceholderSpace() +) = FacePlaceholderSpace{local_geometry_type(current_space)}() @inline reconstruct_placeholder_space(::PlaceholderSpace, parent_space) = parent_space @@ -114,6 +115,11 @@ placeholder_space( @inline reconstruct_placeholder_space(current_space, parent_space) = current_space +local_geometry_type(space::Spaces.AbstractSpace) = + eltype(Spaces.local_geometry_data(space)) +local_geometry_type(::PlaceholderSpace{LG}) where {LG} = LG 
+local_geometry_type(::CenterPlaceholderSpace{LG}) where {LG} = LG +local_geometry_type(::FacePlaceholderSpace{LG}) where {LG} = LG strip_space(obj, parent_space) = obj @@ -131,7 +137,11 @@ function strip_space( new_space = placeholder_space(current_space, parent_space) return Base.Broadcast.Broadcasted{Style}( bc.f, - map(arg -> strip_space(arg, current_space), bc.args), + strip_space_args(bc.args, current_space), new_space, ) end + +strip_space_args(::Tuple{}, space) = () +strip_space_args(args::Tuple, space) = + (strip_space(args[1], space), strip_space_args(Base.tail(args), space)...) diff --git a/src/Operators/finitedifference.jl b/src/Operators/finitedifference.jl index a15cb9cf7e..79b4d0c599 100644 --- a/src/Operators/finitedifference.jl +++ b/src/Operators/finitedifference.jl @@ -3415,7 +3415,7 @@ function strip_space(bc::StencilBroadcasted{Style}, parent_space) where {Style} new_space = placeholder_space(current_space, parent_space) return StencilBroadcasted{Style}( bc.op, - map(arg -> strip_space(arg, current_space), bc.args), + strip_space_args(bc.args, current_space), new_space, ) end diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl index aad2f26f0a..88df089300 100644 --- a/src/Operators/spectralelement.jl +++ b/src/Operators/spectralelement.jl @@ -236,7 +236,7 @@ function strip_space(bc::SpectralBroadcasted{Style}, parent_space) where {Style} new_space = placeholder_space(current_space, parent_space) return SpectralBroadcasted{Style}( bc.op, - map(arg -> strip_space(arg, current_space), bc.args), + strip_space_args(bc.args, current_space), new_space, ) end diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl index 714dbaa1c6..66a084c60b 100644 --- a/test/MatrixFields/matrix_field_broadcasting.jl +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -1,8 +1,9 @@ using Test using JET -using Random: seed! -using LinearAlgebra: I, mul! 
-using BandedMatrices: band +import CUDA +import BandedMatrices: band +import LinearAlgebra: I, mul! +import Random: seed! using ClimaCore.MatrixFields import ClimaCore: @@ -43,13 +44,23 @@ function test_matrix_broadcast_against_array_reference(; inputs, get_result::F1, set_result!::F2, - temp_values = (), - ref_set_result!::F3, + get_temp_values::F3 = (_...) -> (), + ref_set_result!::F4, time_ratio_limit = 1, - max_error_limit = 3, -) where {F1, F2, F3} + max_error_limit = 4, + test_broken_with_cuda = false, +) where {F1, F2, F3, F4} @testset "$test_name" begin + is_using_cuda = ClimaComms.device(inputs[1]) isa ClimaComms.CUDADevice + ignore_cuda = (AnyFrameModule(CUDA),) + + if test_broken_with_cuda && is_using_cuda + @test_throws CUDA.InvalidIRError get_result(inputs...) + return + end + result = get_result(inputs...) + temp_values = get_temp_values(inputs...) # Fill all output fields with NaNs for testing correctness. result .*= NaN @@ -85,9 +96,10 @@ function test_matrix_broadcast_against_array_reference(; @test best_time < time_ratio_limit * best_ref_time # Test set_result! for correctness, allocations, and type instabilities. + # Ignore the type instabilities in CUDA and the allocations they incur. @test max_error <= max_error_limit - @test (@allocated set_result!(result, inputs...)) == 0 - @test_opt set_result!(result, inputs...) + @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0 + @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...) # Test ref_set_result! for allocations and type instabilities. This is # helpful for ensuring that the performance comparison is fair. @@ -105,8 +117,8 @@ function test_matrix_broadcast_against_array_reference(; ) # Test get_result (the allocating version of set_result!) for type - # instabilities. - @test_opt get_result(inputs...) + # instabilities. Ignore the type instabilities in CUDA. + @test_opt ignored_modules = ignore_cuda get_result(inputs...) 
end end @@ -119,6 +131,9 @@ function test_matrix_broadcast_against_reference(; ref_set_result!::F3, ) where {F1, F2, F3} @testset "$test_name" begin + is_using_cuda = ClimaComms.device(inputs[1]) isa ClimaComms.CUDADevice + ignore_cuda = (AnyFrameModule(CUDA),) + result = get_result(inputs...) # Fill the output field with NaNs for testing correctness. @@ -136,18 +151,30 @@ function test_matrix_broadcast_against_reference(; @test best_time < best_ref_time # Test set_result! for correctness, allocations, and type instabilities. - @test result == ref_result - @test (@allocated set_result!(result, inputs...)) == 0 - @test_opt set_result!(result, inputs...) + # Ignore the type instabilities in CUDA and the allocations they incur. + # Account for the roundoff errors that CUDA introduces. + @test if is_using_cuda + error_array = abs.(parent(result) .- parent(ref_result)) + CUDA.@allowscalar maximum(error_array) < eps(eltype(error_array)) + else + result == ref_result + end + @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0 + @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...) # Test ref_set_result! for allocations and type instabilities. This is - # helpful for ensuring that the performance comparison is fair. - @test (@allocated ref_set_result!(ref_result, ref_inputs...)) == 0 - @test_opt ref_set_result!(ref_result, ref_inputs...) + # helpful for ensuring that the performance comparison is fair. Ignore + # the type instabilities in CUDA and the allocations they incur. + @test is_using_cuda || + (@allocated ref_set_result!(ref_result, ref_inputs...)) == 0 + @test_opt ignored_modules = ignore_cuda ref_set_result!( + ref_result, + ref_inputs..., + ) # Test get_result (the allocating version of set_result!) for type - # instabilities. - @test_opt get_result(inputs...) + # instabilities. Ignore the type instabilities in CUDA. + @test_opt ignored_modules = ignore_cuda get_result(inputs...) 
end end @@ -167,12 +194,14 @@ function random_test_fields(::Type{FT}) where {FT} boundary_tags = (:bottom, :top), ) vmesh = Meshes.IntervalMesh(vdomain, nelems = velem) - vtopology = Tpologies.IntervalTopology(comms_ctx, vmesh) + vtopology = Topologies.IntervalTopology(comms_ctx, vmesh) vspace = Spaces.CenterFiniteDifferenceSpace(vtopology) sfc_coord = Fields.coordinate_field(hspace) - hypsography = Hypsography.LinearAdaption( - @. cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1 - ) + hypsography = + comms_ctx.device isa ClimaComms.CUDADevice ? Hypsography.Flat() : + Hypsography.LinearAdaption( + @. cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1 + ) # TODO: FD operators don't currently work with hypsography on GPUs. center_space = Spaces.ExtrudedFiniteDifferenceSpace(hspace, vspace, hypsography) face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(center_space) @@ -180,12 +209,12 @@ function random_test_fields(::Type{FT}) where {FT} ᶠcoord = Fields.coordinate_field(face_space) seed!(1) # ensures reproducibility - ᶜᶜmat = map(_ -> DiagonalMatrixRow(rand(FT, 1)...), ᶜcoord) - ᶜᶠmat = map(_ -> BidiagonalMatrixRow(rand(FT, 2)...), ᶜcoord) - ᶠᶠmat = map(_ -> TridiagonalMatrixRow(rand(FT, 3)...), ᶠcoord) - ᶠᶜmat = map(_ -> QuaddiagonalMatrixRow(rand(FT, 4)...), ᶠcoord) - ᶜvec = map(_ -> rand(FT), ᶜcoord) - ᶠvec = map(_ -> rand(FT), ᶠcoord) + ᶜᶜmat = map(c -> DiagonalMatrixRow(ntuple(_ -> rand(FT), 1)...), ᶜcoord) + ᶜᶠmat = map(c -> BidiagonalMatrixRow(ntuple(_ -> rand(FT), 2)...), ᶜcoord) + ᶠᶠmat = map(c -> TridiagonalMatrixRow(ntuple(_ -> rand(FT), 3)...), ᶠcoord) + ᶠᶜmat = map(c -> QuaddiagonalMatrixRow(ntuple(_ -> rand(FT), 4)...), ᶠcoord) + ᶜvec = map(c -> rand(FT), ᶜcoord) + ᶠvec = map(c -> rand(FT), ᶠcoord) return ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec end @@ -257,7 +286,8 @@ end (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), - temp_values = ((@. ᶜᶜmat ⋅ ᶜᶠmat), (@. 
ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat)), + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + ((@. ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat)), ref_set_result! = ( _result, _ᶜᶜmat, @@ -282,7 +312,8 @@ end (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), - temp_values = ((@. ᶠᶠmat ⋅ ᶠᶜmat), (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + ((@. ᶠᶠmat ⋅ ᶠᶜmat), (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), ref_set_result! = ( _result, _ᶜᶜmat, @@ -296,6 +327,7 @@ end mul!(_temp2, _ᶜᶠmat, _temp1) mul!(_result, _ᶜᶜmat, _temp2) end, + test_broken_with_cuda = true, # TODO: Fix this. ) test_matrix_broadcast_against_array_reference(; @@ -307,7 +339,7 @@ end (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> ( (@. ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), @@ -339,7 +371,7 @@ end (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> ( (@. ᶠᶜmat ⋅ ᶜvec), (@. ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)), (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec))), @@ -361,6 +393,7 @@ end mul!(_result, _ᶜᶜmat, _temp3) end, time_ratio_limit = 15, # This case's ref function is fast on Buildkite. + test_broken_with_cuda = true, # TODO: Fix this. ) test_matrix_broadcast_against_array_reference(; @@ -370,7 +403,7 @@ end (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( (@. 2 * ᶠᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), (@. 
2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), @@ -404,7 +437,7 @@ end (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( (@. ᶠᶜmat ⋅ ᶜᶜmat), (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶠᶠmat / 3), @@ -438,7 +471,7 @@ end (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = ᶜᶠmat ⋅ (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( (@. 2 * ᶠᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), @@ -465,7 +498,7 @@ end @. _temp5 = _temp3 + _temp4 / 3 - _temp5 mul!(_result, _ᶜᶠmat, _temp5) end, - max_error_limit = 276, # Why is there a 276 * eps error on Buildkite? + max_error_limit = 296, # Why is there a 296 * eps error on Buildkite? ) test_matrix_broadcast_against_array_reference(; @@ -477,7 +510,7 @@ end set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( (@. 2 * ᶠᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), @@ -520,7 +553,7 @@ end @. _temp10 = _temp7 * 2 - _temp9 + _temp10 mul!(_result, _temp5, _temp10) end, - max_error_limit = 44, # Why is there a 44 * eps error on Buildkite? + max_error_limit = 2496, # Why is there a 2496 * eps error on Buildkite? ) test_matrix_broadcast_against_array_reference(; @@ -538,7 +571,7 @@ end ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)) ⋅ ᶠᶠmat), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( (@. ᶠᶜmat ⋅ ᶜᶠmat), (@. 2 * ᶠᶜmat), (@. 
2 * ᶠᶜmat ⋅ ᶜᶜmat), @@ -615,7 +648,7 @@ end BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅ DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)), - temp_values = ( + get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) -> ( (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec)), (@. TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1)), (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ From 60e729f5ef49ea1adba82456ab7ab19ac3ea7039 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 11:25:22 -0700 Subject: [PATCH 04/10] Additional fixes --- src/MatrixFields/matrix_field_utils.jl | 40 +++++++++---------- .../MatrixFields/matrix_field_broadcasting.jl | 26 ++++++------ 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl index 4a4d4c350e..7883634892 100644 --- a/src/MatrixFields/matrix_field_utils.jl +++ b/src/MatrixFields/matrix_field_utils.jl @@ -45,19 +45,19 @@ function column_field2array(field::Fields.FiniteDifferenceField) ) for (index_of_field_entry, matrix_d) in enumerate(matrix_ld:matrix_ud) matrix_diagonal = view(matrix, band(matrix_d)) - field_diagonal = field.entries.:($index_of_field_entry) - field_diagonal_data = - vec(reinterpret(eltype(eltype(field)), parent(field_diagonal)')) + diagonal_field = field.entries.:($index_of_field_entry) + diagonal_data = + vec(reinterpret(eltype(eltype(field)), parent(diagonal_field)')) - # Find the rows for which field_diagonal_data[row] is in the matrix. + # Find the rows for which diagonal_data[row] is in the matrix. # Note: The matrix index (1, 1) corresponds to the diagonal index 0, # and the matrix index (n_rows, n_cols) corresponds to the diagonal # index n_cols - n_rows. first_row = matrix_d < 0 ? 1 - matrix_d : 1 last_row = matrix_d < n_cols - n_rows ? 
n_rows : n_cols - matrix_d - @allowscalar matrix_diagonal .= - view(field_diagonal_data, first_row:last_row) + diagonal_data_view = view(diagonal_data, first_row:last_row) + @allowscalar copyto!(matrix_diagonal, diagonal_data_view) end return matrix else # field represents a vector @@ -73,7 +73,7 @@ the data stored in the field. """ function column_field2array_view(field::Fields.FiniteDifferenceField) if eltype(field) <: BandMatrixRow # field represents a matrix - n_rows, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) + _, n_cols, matrix_ld, matrix_ud = banded_matrix_info(field) field_data_transpose = reinterpret(eltype(eltype(field)), parent(field)') matrix_transpose = @@ -86,11 +86,12 @@ function column_field2array_view(field::Fields.FiniteDifferenceField) end end -column_indices(field::Fields.FiniteDifferenceField) = (((1, 1), 1),) -column_indices(field::Fields.ExtrudedFiniteDifferenceField) = +all_columns(::Fields.FiniteDifferenceField) = (((1, 1), 1),) +all_columns(field::Fields.ExtrudedFiniteDifferenceField) = (Spaces.all_nodes(Spaces.horizontal_space(axes(field)))...,) -column_indices(field) = - error("Invalid space type: $(typeof(axes(field)).name.wrapper)") + +column_map(f::F, field) where {F} = + map((((i, j), h),) -> f(Spaces.column(field, i, j, h)), all_columns(field)) """ field2arrays(field) @@ -100,10 +101,7 @@ Converts a field defined on a `FiniteDifferenceSpace` or on an corresponds to a column of the field. This is done by calling `column_field2array` on each of the field's columns. """ -field2arrays(field) = - map(column_indices(field)) do ((i, j), h) - column_field2array(Spaces.column(field, i, j, h)) - end +field2arrays(field) = column_map(column_field2array, field) """ field2arrays_view(field) @@ -111,18 +109,18 @@ field2arrays(field) = Similar to `field2arrays(field)`, except that this version calls `column_field2array_view` instead of `column_field2array`. 
""" -field2arrays_view(field) = - map(column_indices(field)) do ((i, j), h) - column_field2array_view(Spaces.column(field, i, j, h)) - end +field2arrays_view(field) = column_map(column_field2array_view, field) function Base.show( io::IO, field::Fields.Field{<:Fields.AbstractData{<:BandMatrixRow}}, ) + is_matrix_field = + field isa Fields.FiniteDifferenceField || + field isa Fields.ExtrudedFiniteDifferenceField print(io, eltype(field), "-valued Field") - if eltype(eltype(field)) <: Number - if axes(field) isa Spaces.FiniteDifferenceSpace + if is_matrix_field && eltype(eltype(field)) <: Number + if field isa Fields.FiniteDifferenceField println(io, " that corresponds to the matrix") else println(io, " whose first column corresponds to the matrix") diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl index 66a084c60b..7c1465bdfc 100644 --- a/test/MatrixFields/matrix_field_broadcasting.jl +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -2,7 +2,7 @@ using Test using JET import CUDA import BandedMatrices: band -import LinearAlgebra: I, mul! +import LinearAlgebra: I, mul!, norm import Random: seed! using ClimaCore.MatrixFields @@ -47,7 +47,7 @@ function test_matrix_broadcast_against_array_reference(; get_temp_values::F3 = (_...) -> (), ref_set_result!::F4, time_ratio_limit = 1, - max_error_limit = 4, + max_eps_error_limit = 6, test_broken_with_cuda = false, ) where {F1, F2, F3, F4} @testset "$test_name" begin @@ -56,6 +56,7 @@ function test_matrix_broadcast_against_array_reference(; if test_broken_with_cuda && is_using_cuda @test_throws CUDA.InvalidIRError get_result(inputs...) 
+ @warn "$test_name:\n\tCUDA.InvalidIRError" return end @@ -84,20 +85,19 @@ function test_matrix_broadcast_against_array_reference(; result_arrays = MatrixFields.field2arrays(result) max_error = maximum(zip(result_arrays, ref_result_arrays)) do (array, ref_array) - maximum(zip(array, ref_array)) do (value, ref_value) - Int(abs(value - ref_value) / eps(ref_value)) - end + maximum(abs.(array .- ref_array)) end + max_eps_error = Int(max_error / eps(typeof(max_error))) @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \ - = $best_ref_time s\n\tMaximum Error = $max_error eps" + = $best_ref_time s\n\tMaximum Error = $max_eps_error eps" # Test that set_result! is performant compared to ref_set_result!. - @test best_time < time_ratio_limit * best_ref_time + @test best_time / best_ref_time < time_ratio_limit # Test set_result! for correctness, allocations, and type instabilities. # Ignore the type instabilities in CUDA and the allocations they incur. - @test max_error <= max_error_limit + @test max_eps_error <= max_eps_error_limit @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0 @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...) @@ -154,8 +154,8 @@ function test_matrix_broadcast_against_reference(; # Ignore the type instabilities in CUDA and the allocations they incur. # Account for the roundoff errors that CUDA introduces. @test if is_using_cuda - error_array = abs.(parent(result) .- parent(ref_result)) - CUDA.@allowscalar maximum(error_array) < eps(eltype(error_array)) + max_error = maximum(abs.(parent(result) .- parent(ref_result))) + max_error < eps(typeof(max_error)) else result == ref_result end @@ -461,7 +461,6 @@ end copyto!(_result, 4I) # We can't directly use I in array broadcasts. @. _result = _temp2 * 2 - _temp4 + _result end, - max_error_limit = 512, # Why is there a 512 * eps error on Buildkite? ) test_matrix_broadcast_against_array_reference(; @@ -498,7 +497,6 @@ end @. 
_temp5 = _temp3 + _temp4 / 3 - _temp5 mul!(_result, _ᶜᶠmat, _temp5) end, - max_error_limit = 296, # Why is there a 296 * eps error on Buildkite? ) test_matrix_broadcast_against_array_reference(; @@ -553,7 +551,7 @@ end @. _temp10 = _temp7 * 2 - _temp9 + _temp10 mul!(_result, _temp5, _temp10) end, - max_error_limit = 2496, # Why is there a 2496 * eps error on Buildkite? + max_eps_error_limit = 30, # This case's roundoff error is large on GPUs. ) test_matrix_broadcast_against_array_reference(; @@ -633,7 +631,7 @@ end mul!(_result, _temp14, _ᶠᶠmat) end, time_ratio_limit = 2, # This case's ref function is fast on Buildkite. - max_error_limit = 3038, # Why is there a 3038 * eps error on Buildkite? + max_eps_error_limit = 70, # This case's roundoff error is large on GPUs. ) test_matrix_broadcast_against_array_reference(; From 4ce2749c66ea58e23548e2aa2429e03a5a64a1ed Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 13:36:55 -0700 Subject: [PATCH 05/10] More fixes --- test/MatrixFields/matrix_field_broadcasting.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl index 7c1465bdfc..3650bfe9f0 100644 --- a/test/MatrixFields/matrix_field_broadcasting.jl +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -2,7 +2,7 @@ using Test using JET import CUDA import BandedMatrices: band -import LinearAlgebra: I, mul!, norm +import LinearAlgebra: I, mul! import Random: seed! using ClimaCore.MatrixFields @@ -47,7 +47,7 @@ function test_matrix_broadcast_against_array_reference(; get_temp_values::F3 = (_...) 
-> (), ref_set_result!::F4, time_ratio_limit = 1, - max_eps_error_limit = 6, + max_eps_error_limit = 7, test_broken_with_cuda = false, ) where {F1, F2, F3, F4} @testset "$test_name" begin @@ -87,7 +87,7 @@ function test_matrix_broadcast_against_array_reference(; maximum(zip(result_arrays, ref_result_arrays)) do (array, ref_array) maximum(abs.(array .- ref_array)) end - max_eps_error = Int(max_error / eps(typeof(max_error))) + max_eps_error = ceil(Int, max_error / eps(typeof(max_error))) @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \ = $best_ref_time s\n\tMaximum Error = $max_eps_error eps" @@ -97,7 +97,7 @@ function test_matrix_broadcast_against_array_reference(; # Test set_result! for correctness, allocations, and type instabilities. # Ignore the type instabilities in CUDA and the allocations they incur. - @test max_eps_error <= max_eps_error_limit + @test max_eps_error < max_eps_error_limit @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0 @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...) From 851e5a9f7393a1356c5adc7629715e5283914525 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 15:36:06 -0700 Subject: [PATCH 06/10] Address comments --- src/MatrixFields/matrix_field_utils.jl | 38 +++++++++---------- .../MatrixFields/matrix_field_broadcasting.jl | 16 ++++---- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl index 7883634892..56de58e185 100644 --- a/src/MatrixFields/matrix_field_utils.jl +++ b/src/MatrixFields/matrix_field_utils.jl @@ -1,28 +1,26 @@ +# Finds the diagonal index of the value that ends up in the bottom-right corner +# of the BandedMatrix generated from a matrix field. 
+bottom_corner_matrix_d(_, ::Int) = 0 # center-to-center or face-to-face matrix +bottom_corner_matrix_d(::Spaces.CellCenter, ::PlusHalf) = 1 # face-to-center +bottom_corner_matrix_d(::Spaces.CellFace, ::PlusHalf) = -1 # center-to-face + +# Finds the amount by which the field's diagonal indices get shifted when it is +# converted into a BandedMatrix (whose diagonal indices have to be integers). +matrix_d_minus_field_d(_, ::Int) = 0 # center-to-center or face-to-face matrix +matrix_d_minus_field_d(::Spaces.CellCenter, ::PlusHalf) = half # face-to-center +matrix_d_minus_field_d(::Spaces.CellFace, ::PlusHalf) = -half # center-to-face + function banded_matrix_info(field) - space = axes(field) field_ld, field_ud = outer_diagonals(eltype(field)) - - # Find the diagonal index of the value that ends up in the bottom-right - # corner of the matrix, as well as the amount by which the field's diagonal - # indices get shifted when it is converted into a matrix. - bottom_corner_matrix_d, matrix_d_minus_field_d = if field_ld isa PlusHalf - if space.staggering isa Spaces.CellCenter - 1, half # field is a face-to-center matrix - else - -1, -half # field is a center-to-face matrix - end - else - 0, 0 # field is either a center-to-center or face-to-face matrix - end - + space = axes(field) + staggering = space.staggering n_rows = Spaces.nlevels(space) - n_cols = n_rows + bottom_corner_matrix_d - matrix_ld = field_ld + matrix_d_minus_field_d - matrix_ud = field_ud + matrix_d_minus_field_d + n_cols = n_rows + bottom_corner_matrix_d(staggering, field_ld) + matrix_ld = field_ld + matrix_d_minus_field_d(staggering, field_ld) + matrix_ud = field_ud + matrix_d_minus_field_d(staggering, field_ld) matrix_ld <= 0 && matrix_ud >= 0 || error("BandedMatrices.jl does not yet support matrices that have \ diagonals with indices in the range $matrix_ld:$matrix_ud") - return n_rows, n_cols, matrix_ld, matrix_ud end @@ -88,7 +86,7 @@ end all_columns(::Fields.FiniteDifferenceField) = (((1, 1), 1),) 
all_columns(field::Fields.ExtrudedFiniteDifferenceField) = - (Spaces.all_nodes(Spaces.horizontal_space(axes(field)))...,) + Spaces.all_nodes(Spaces.horizontal_space(axes(field))) column_map(f::F, field) where {F} = map((((i, j), h),) -> f(Spaces.column(field, i, j, h)), all_columns(field)) diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl index 3650bfe9f0..8f68b0a600 100644 --- a/test/MatrixFields/matrix_field_broadcasting.jl +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -26,18 +26,18 @@ macro benchmark(expression) end end -# This has to be its own function in order to correctly measure its allocations. -call_array_func( +# This function is used for benchmarking ref_set_result!. +function call_array_func( ref_set_result!::F, ref_result_arrays, inputs_arrays, temp_values_arrays, -) where {F} = foreach( - ref_set_result!, - ref_result_arrays, - inputs_arrays..., - temp_values_arrays..., -) +) where {F} + for arrays in + zip(ref_result_arrays, inputs_arrays..., temp_values_arrays...) + ref_set_result!(arrays...) 
+ end +end function test_matrix_broadcast_against_array_reference(; test_name, From 3ce6f45976db1a08bc93517b4c4dd9edb0295f13 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 15:37:14 -0700 Subject: [PATCH 07/10] Try removing check-bounds from GPU job to see if that fixes the PTX error --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 75bc995f53..45617e88f1 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -515,7 +515,7 @@ steps: - label: "Unit: matrix field broadcasting (GPU)" key: unit_matrix_field_broadcasting_gpu - command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + command: "julia --color=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" agents: slurm_gpus: 1 From 0c61d69860662f9d43f944edcbd638fa5ed45cd7 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 16:08:45 -0700 Subject: [PATCH 08/10] Try allocating more memory for GPU job instead --- .buildkite/pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 45617e88f1..8fb892dd49 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -515,9 +515,10 @@ steps: - label: "Unit: matrix field broadcasting (GPU)" key: unit_matrix_field_broadcasting_gpu - command: "julia --color=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" agents: slurm_gpus: 1 + slurm_mem: 20GB - group: "Unit: Hypsography" steps: From 22d0e466c94d939997f67f4d92d2b593af12dc97 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 7 Jul 2023 16:53:30 -0700 Subject: [PATCH 09/10] Remove check-bounds again and use more memory --- .buildkite/pipeline.yml | 4 ++-- 
src/MatrixFields/matrix_field_utils.jl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8fb892dd49..42a855062b 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -515,10 +515,10 @@ steps: - label: "Unit: matrix field broadcasting (GPU)" key: unit_matrix_field_broadcasting_gpu - command: "julia --color=yes --check-bounds=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" + command: "julia --color=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" agents: slurm_gpus: 1 - slurm_mem: 20GB + slurm_mem: 80GB - group: "Unit: Hypsography" steps: diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl index 56de58e185..0d8470d6a3 100644 --- a/src/MatrixFields/matrix_field_utils.jl +++ b/src/MatrixFields/matrix_field_utils.jl @@ -95,7 +95,7 @@ column_map(f::F, field) where {F} = field2arrays(field) Converts a field defined on a `FiniteDifferenceSpace` or on an -`ExtrudedFiniteDifferenceSpace` into a tuple of arrays, each of which +`ExtrudedFiniteDifferenceSpace` into a collection of arrays, each of which corresponds to a column of the field. This is done by calling `column_field2array` on each of the field's columns. 
""" From c40b2df9b03f498547bca9e388297d619249e1c7 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Wed, 12 Jul 2023 17:24:43 -0700 Subject: [PATCH 10/10] Address comments --- .buildkite/pipeline.yml | 2 +- src/MatrixFields/MatrixFields.jl | 6 +- src/MatrixFields/matrix_field_utils.jl | 13 +- src/MatrixFields/matrix_multiplication.jl | 13 +- src/MatrixFields/rmul_with_projection.jl | 153 +++++++++++++++++++--- src/Operators/common.jl | 26 ++-- test/MatrixFields/rmul_with_projection.jl | 9 +- 7 files changed, 171 insertions(+), 51 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 42a855062b..ff02381cdb 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -518,7 +518,7 @@ steps: command: "julia --color=yes --project=test test/MatrixFields/matrix_field_broadcasting.jl" agents: slurm_gpus: 1 - slurm_mem: 80GB + slurm_mem: 20GB - group: "Unit: Hypsography" steps: diff --git a/src/MatrixFields/MatrixFields.jl b/src/MatrixFields/MatrixFields.jl index 0a0f241acd..17214e97f6 100644 --- a/src/MatrixFields/MatrixFields.jl +++ b/src/MatrixFields/MatrixFields.jl @@ -1,12 +1,12 @@ module MatrixFields import CUDA: @allowscalar -import LinearAlgebra: UniformScaling, Adjoint -import StaticArrays: SArray, SMatrix, SVector +import LinearAlgebra: UniformScaling, Adjoint, AdjointAbsVec +import StaticArrays: SMatrix, SVector import BandedMatrices: BandedMatrix, band, _BandedMatrix import ..Utilities: PlusHalf, half import ..RecursiveApply: - rmap, rpromote_type, rzero, rconvert, radd, rsub, rmul, rdiv + rmap, rmaptype, rpromote_type, rzero, rconvert, radd, rsub, rmul, rdiv import ..Geometry import ..Spaces import ..Fields diff --git a/src/MatrixFields/matrix_field_utils.jl b/src/MatrixFields/matrix_field_utils.jl index 0d8470d6a3..d73e1f218e 100644 --- a/src/MatrixFields/matrix_field_utils.jl +++ b/src/MatrixFields/matrix_field_utils.jl @@ -84,9 +84,16 @@ function column_field2array_view(field::Fields.FiniteDifferenceField) end end 
-all_columns(::Fields.FiniteDifferenceField) = (((1, 1), 1),) -all_columns(field::Fields.ExtrudedFiniteDifferenceField) = - Spaces.all_nodes(Spaces.horizontal_space(axes(field))) +all_columns(::Fields.ColumnField) = (((1, 1), 1),) +all_columns(field) = all_columns(axes(field)) +all_columns(space::Spaces.ExtrudedFiniteDifferenceSpace) = + Spaces.all_nodes(Spaces.horizontal_space(space)) + +# TODO: Unify FiniteDifferenceField and ColumnField so that we can use this +# version instead. +# all_columns(::Fields.FiniteDifferenceField) = (((1, 1), 1),) +# all_columns(field::Fields.ExtrudedFiniteDifferenceField) = +# Spaces.all_nodes(Spaces.horizontal_space(axes(field))) column_map(f::F, field) where {F} = map((((i, j), h),) -> f(Spaces.column(field, i, j, h)), all_columns(field)) diff --git a/src/MatrixFields/matrix_multiplication.jl b/src/MatrixFields/matrix_multiplication.jl index 45ddaba9bc..08d0bed830 100644 --- a/src/MatrixFields/matrix_multiplication.jl +++ b/src/MatrixFields/matrix_multiplication.jl @@ -199,14 +199,6 @@ Operators.right_interior_idx( arg, ) = matrix_right_interior_index(space, outer_diagonals(eltype(matrix1))[2]) -function rmul_type(::Type{T1}, ::Type{T2}, ::Type{LG}) where {T1, T2, LG} - type = Base._return_type(rmul_with_projection, Tuple{T1, T2, LG}) - type == Union{} && - error("Unable to infer result type: Calling rmul_with_projection with \ - arguments of types $T1 and $T2 will cause it to throw an error") - return type -end - function Operators.return_eltype( ::MultiplyColumnwiseBandMatrixField, matrix1, @@ -216,18 +208,17 @@ function Operators.return_eltype( "The first argument of ⋅ must have elements of type BandMatrixRow, but \ the given argument has elements of type $(eltype(matrix1))", ) - lg_type = Operators.local_geometry_type(axes(arg)) if eltype(arg) <: BandMatrixRow # matrix-matrix multiplication matrix2 = arg ld1, ud1 = outer_diagonals(eltype(matrix1)) ld2, ud2 = outer_diagonals(eltype(matrix2)) prod_ld, prod_ud = ld1 + ld2, ud1 + 
ud2 prod_value_type = - rmul_type(eltype(eltype(matrix1)), eltype(eltype(matrix2)), lg_type) + rmul_return_type(eltype(eltype(matrix1)), eltype(eltype(matrix2))) return band_matrix_row_type(prod_ld, prod_ud, prod_value_type) else # matrix-vector multiplication vector = arg - return rmul_type(eltype(eltype(matrix1)), eltype(vector), lg_type) + return rmul_return_type(eltype(eltype(matrix1)), eltype(vector)) end end diff --git a/src/MatrixFields/rmul_with_projection.jl b/src/MatrixFields/rmul_with_projection.jl index 76f9846b68..9cc449583f 100644 --- a/src/MatrixFields/rmul_with_projection.jl +++ b/src/MatrixFields/rmul_with_projection.jl @@ -1,26 +1,27 @@ const SingleValue = Union{Number, Geometry.AxisTensor, Geometry.AdjointAxisTensor} +""" + mul_with_projection(x, y, lg) + +Similar to `x * y`, except that this version automatically projects `y` to avoid +`DimensionMismatch` errors for `AxisTensor`s. For example, if `x` is a covector +along the `Covariant3Axis` (e.g., `Covariant3Vector(1)'`), then `y` will be +projected onto the `Contravariant3Axis`. In general, the first axis of `y` will +be projected onto the dual of the last axis of `x`. +""" mul_with_projection(x, y, _) = x * y -mul_with_projection(x::Geometry.AdjointAxisVector, y::Geometry.AxisTensor, lg) = - x * Geometry.project(Geometry.dual(axes(x, 2)), y, lg) -mul_with_projection(::Geometry.AdjointAxisTensor, ::Geometry.AxisTensor, _) = - error("mul_with_projection is currently only implemented for covectors, \ - and higher-order cotensors are not supported") -# We should add methods for other cotensors (e.g., AdjointAxis2Tensor) when they -# are needed (e.g., when we need to support matrices that represent the -# divergence of higher-order tensors). 
+mul_with_projection( + x::Union{Geometry.AdjointAxisVector, Geometry.Axis2TensorOrAdj}, + y::Geometry.AxisTensor, + lg, +) = x * Geometry.project(Geometry.dual(axes(x)[2]), y, lg) """ rmul_with_projection(x, y, lg) -Similar to `rmul(x, y)`, but with automatic projection of `y` when `x` contains -a covector (i.e, an `AdjointAxisVector`). For example, if `x` is a covector -along the `Covariant3Axis` (e.g., `Covariant3Vector(1)'`), then `y` (or each -element of `y`) will be projected onto the `Contravariant3Axis`. In general, -each covector in `x` will cause `y` (or each corresponding element of `y`) to -be projected onto the dual axis of the covector. In the future, we may extend -this behavior to higher-order cotensors. +Similar to `rmul(x, y)`, except that this version calls `mul_with_projection` +instead of `*`. """ rmul_with_projection(x, y, lg) = rmap((x′, y′) -> mul_with_projection(x′, y′, lg), x, y) @@ -30,3 +31,125 @@ rmul_with_projection(x, y::SingleValue, lg) = rmap(x′ -> mul_with_projection(x′, y, lg), x) rmul_with_projection(x::SingleValue, y::SingleValue, lg) = mul_with_projection(x, y, lg) + +axis_tensor_type(::Type{T}, ::Type{Tuple{A1}}) where {T, A1} = + Geometry.AxisVector{T, A1, SVector{length(A1.instance), T}} +function axis_tensor_type(::Type{T}, ::Type{Tuple{A1, A2}}) where {T, A1, A2} + N1 = length(A1.instance) + N2 = length(A2.instance) + S = SMatrix{N1, N2, T, N1 * N2} + return Geometry.Axis2Tensor{T, Tuple{A1, A2}, S} +end + +adjoint_type(::Type{A}) where {A} = Adjoint{eltype(A), A} +adjoint_type(::Type{A}) where {T, S, A <: Adjoint{T, S}} = S + +""" + mul_return_type(X, Y) + +Computes the return type of `mul_with_projection(x, y, lg)`, where `x isa X` +and `y isa Y`. This can also be used to obtain the return type of `x * y`, +although `x * y` will throw an error when projection is necessary. 
+ +Note: this should be equivalent to calling the internal function `_return_type`: +`Base._return_type(mul_with_projection, Tuple{X, Y, LG})`, where `lg isa LG`. +""" +mul_return_type(::Type{X}, ::Type{Y}) where {X, Y} = error( + "Unable to infer return type: Multiplying an object of type $X with an \ + object of type $Y will result in a method error", +) +# Note: If the behavior of * changes for any relevant types, the corresponding +# methods below should be updated. + +# Methods from Base: +mul_return_type(::Type{X}, ::Type{Y}) where {X <: Number, Y <: Number} = + promote_type(X, Y) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where {X <: AdjointAbsVec, Y <: AbstractMatrix} = + adjoint_type(mul_return_type(adjoint_type(Y), adjoint_type(X))) + +# Methods from ClimaCore: +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where {T, N, A, X <: Number, Y <: Geometry.AxisTensor{T, N, A}} = + axis_tensor_type(promote_type(X, T), A) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where {T, N, A, X <: Geometry.AxisTensor{T, N, A}, Y <: Number} = + axis_tensor_type(promote_type(T, Y), A) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where {T, N, A, X <: Number, Y <: Geometry.AdjointAxisTensor{T, N, A}} = + adjoint_type(axis_tensor_type(promote_type(X, T), A)) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where {T, N, A, X <: Geometry.AdjointAxisTensor{T, N, A}, Y <: Number} = + adjoint_type(axis_tensor_type(promote_type(T, Y), A)) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where { + T1, + T2, + X <: Geometry.AdjointAxisVector{T1}, + Y <: Geometry.AxisVector{T2}, +} = promote_type(T1, T2) # This comes from the definition of dot. 
+mul_return_type( + ::Type{X}, + ::Type{Y}, +) where { + T1, + T2, + A1, + A2, + X <: Geometry.AxisVector{T1, A1}, + Y <: Geometry.AdjointAxisVector{T2, A2}, +} = axis_tensor_type(promote_type(T1, T2), Tuple{A1, A2}) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where { + T1, + T2, + A1, + X <: Geometry.Axis2TensorOrAdj{T1, <:Tuple{A1, Any}}, + Y <: Geometry.AxisVector{T2}, +} = axis_tensor_type(promote_type(T1, T2), Tuple{A1}) +mul_return_type( + ::Type{X}, + ::Type{Y}, +) where { + T1, + T2, + A1, + A2, + X <: Geometry.Axis2TensorOrAdj{T1, <:Tuple{A1, Any}}, + Y <: Geometry.Axis2TensorOrAdj{T2, <:Tuple{Any, A2}}, +} = axis_tensor_type(promote_type(T1, T2), Tuple{A1, A2}) + +""" + rmul_return_type(X, Y) + +Computes the return type of `rmul_with_projection(x, y, lg)`, where `x isa X` +and `y isa Y`. This can also be used to obtain the return type of `rmul(x, y)`, +although `rmul(x, y)` will throw an error when projection is necessary. + +Note: this should be equivalent to calling the internal function `_return_type`: +`Base._return_type(rmul_with_projection, Tuple{X, Y, LG})`, where `lg isa LG`. +""" +rmul_return_type(::Type{X}, ::Type{Y}) where {X, Y} = + rmaptype((X′, Y′) -> mul_return_type(X′, Y′), X, Y) +rmul_return_type(::Type{X}, ::Type{Y}) where {X <: SingleValue, Y} = + rmaptype(Y′ -> mul_return_type(X, Y′), Y) +rmul_return_type(::Type{X}, ::Type{Y}) where {X, Y <: SingleValue} = + rmaptype(X′ -> mul_return_type(X′, Y), X) +rmul_return_type( + ::Type{X}, + ::Type{Y}, +) where {X <: SingleValue, Y <: SingleValue} = mul_return_type(X, Y) diff --git a/src/Operators/common.jl b/src/Operators/common.jl index 07315befdc..44a5ddef17 100644 --- a/src/Operators/common.jl +++ b/src/Operators/common.jl @@ -65,34 +65,33 @@ end # when sending Broadcasted objects to the GPU, we strip out the space # information at each level of the broadcast tree and Fields, replacing with # PlaceholderSpace. 
this reduces the amount runtime parameter data we send to -# the GPU, which is quite limited (~4kB). However, we need to keep the eltype of -# the local geometry data, since it may needed in order to infer the return -# eltype of a Field operation. +# the GPU, which is quite limited (~4kB). # Functions for CUDASpectralStyle -struct PlaceholderSpace{LG} <: Spaces.AbstractSpace end -struct CenterPlaceholderSpace{LG} <: Spaces.AbstractSpace end -struct FacePlaceholderSpace{LG} <: Spaces.AbstractSpace end +struct PlaceholderSpace <: Spaces.AbstractSpace end +struct CenterPlaceholderSpace <: Spaces.AbstractSpace end +struct FacePlaceholderSpace <: Spaces.AbstractSpace end + placeholder_space(current_space::T, parent_space::T) where {T} = - PlaceholderSpace{local_geometry_type(current_space)}() + PlaceholderSpace() placeholder_space(current_space, parent_space) = current_space placeholder_space( current_space::Spaces.CenterFiniteDifferenceSpace, parent_space::Spaces.FaceFiniteDifferenceSpace, -) = CenterPlaceholderSpace{local_geometry_type(current_space)}() +) = CenterPlaceholderSpace() placeholder_space( current_space::Spaces.CenterExtrudedFiniteDifferenceSpace, parent_space::Spaces.FaceExtrudedFiniteDifferenceSpace, -) = CenterPlaceholderSpace{local_geometry_type(current_space)}() +) = CenterPlaceholderSpace() placeholder_space( current_space::Spaces.FaceFiniteDifferenceSpace, parent_space::Spaces.CenterFiniteDifferenceSpace, -) = FacePlaceholderSpace{local_geometry_type(current_space)}() +) = FacePlaceholderSpace() placeholder_space( current_space::Spaces.FaceExtrudedFiniteDifferenceSpace, parent_space::Spaces.CenterExtrudedFiniteDifferenceSpace, -) = FacePlaceholderSpace{local_geometry_type(current_space)}() +) = FacePlaceholderSpace() @inline reconstruct_placeholder_space(::PlaceholderSpace, parent_space) = parent_space @@ -115,11 +114,6 @@ placeholder_space( @inline reconstruct_placeholder_space(current_space, parent_space) = current_space 
-local_geometry_type(space::Spaces.AbstractSpace) = - eltype(Spaces.local_geometry_data(space)) -local_geometry_type(::PlaceholderSpace{LG}) where {LG} = LG -local_geometry_type(::CenterPlaceholderSpace{LG}) where {LG} = LG -local_geometry_type(::FacePlaceholderSpace{LG}) where {LG} = LG strip_space(obj, parent_space) = obj diff --git a/test/MatrixFields/rmul_with_projection.jl b/test/MatrixFields/rmul_with_projection.jl index d7f4855865..44b610f9cc 100644 --- a/test/MatrixFields/rmul_with_projection.jl +++ b/test/MatrixFields/rmul_with_projection.jl @@ -4,10 +4,11 @@ using Random: seed! using StaticArrays: @SMatrix import ClimaCore: Geometry -import ClimaCore.MatrixFields: rmul_with_projection +import ClimaCore.MatrixFields: rmul_with_projection, rmul_return_type -function test_rmul_with_projection(x, y, lg, expected_result) +function test_rmul_with_projection(x::X, y::Y, lg, expected_result) where {X, Y} result = rmul_with_projection(x, y, lg) + result_type = rmul_return_type(X, Y) # Compute the maximum error as an integer multiple of machine epsilon. FT = Geometry.undertype(typeof(lg)) @@ -22,6 +23,10 @@ function test_rmul_with_projection(x, y, lg, expected_result) @test max_error <= 1 # correctness @test (@allocated rmul_with_projection(x, y, lg)) == 0 # allocations @test_opt rmul_with_projection(x, y, lg) # type instabilities + + @test result_type == typeof(result) # correctness + @test (@allocated rmul_return_type(X, Y)) == 0 # allocations + @test_opt rmul_return_type(X, Y) # type instabilities end @testset "rmul_with_projection Unit Tests" begin