diff --git a/test/MatrixFields/band_matrix_row.jl b/test/MatrixFields/band_matrix_row.jl index 0077948b4a..b48bec969f 100644 --- a/test/MatrixFields/band_matrix_row.jl +++ b/test/MatrixFields/band_matrix_row.jl @@ -1,18 +1,6 @@ -using Test -using JET using LinearAlgebra: I -using ClimaCore.MatrixFields -import ClimaCore: Geometry - -macro test_all(expression) - return quote - local test_func() = $(esc(expression)) - @test test_func() # correctness - @test (@allocated test_func()) == 0 # allocations - @test_opt test_func() # type instabilities - end -end +include("matrix_field_test_utils.jl") @testset "BandMatrixRow Unit Tests" begin @test_all DiagonalMatrixRow(1) == @@ -41,14 +29,14 @@ end TridiagonalMatrixRow(1, 0, 1) / 2 - I == zero(PentadiagonalMatrixRow{Int}) - T(value) = (; a = (), b = value, c = (value, (; d = (value,)), (;))) - @test_all QuaddiagonalMatrixRow(T(0.5), T(1), T(1), T(1 // 2)) + - BidiagonalMatrixRow(T(-0.5), T(-1 // 2)) == - QuaddiagonalMatrixRow(T(1), T(1), T(1), T(1)) / 2 - @test_all PentadiagonalMatrixRow(T(0), T(0.5), T(1), T(1 // 2), T(0)) - - TridiagonalMatrixRow(T(1), T(0), T(1)) / 2 - - 0.5 * DiagonalMatrixRow(T(2)) == - PentadiagonalMatrixRow(T(0), T(0), T(0), T(0), T(0)) + NT = nested_type + @test_all QuaddiagonalMatrixRow(NT(0.5), NT(1), NT(1), NT(1 // 2)) + + BidiagonalMatrixRow(NT(-0.5), NT(-1 // 2)) == + QuaddiagonalMatrixRow(NT(1), NT(1), NT(1), NT(1)) / 2 + @test_all PentadiagonalMatrixRow(NT(0), NT(0.5), NT(1), NT(1 // 2), NT(0)) - + TridiagonalMatrixRow(NT(1), NT(0), NT(1)) / 2 - + 0.5 * DiagonalMatrixRow(NT(2)) == + PentadiagonalMatrixRow(NT(0), NT(0), NT(0), NT(0), NT(0)) @test_throws "Cannot promote" BidiagonalMatrixRow(1, 1) + I @test_throws "Cannot promote" BidiagonalMatrixRow(1, 1) + diff --git a/test/MatrixFields/matrix_field_broadcasting.jl b/test/MatrixFields/matrix_field_broadcasting.jl index d60f96fcb1..72e4944432 100644 --- a/test/MatrixFields/matrix_field_broadcasting.jl +++ b/test/MatrixFields/matrix_field_broadcasting.jl @@ -1,292 +1,80 @@ -using Test -using JET -import CUDA import BandedMatrices: band import LinearAlgebra: I, mul! -import Random: seed! -using ClimaCore.MatrixFields -import ClimaCore: - Geometry, Domains, Meshes, Topologies, Hypsography, Spaces, Fields -import ClimaComms - -# Using @benchmark from BenchmarkTools is extremely slow; it appears to keep -# triggering recompilations and allocating a lot of memory in the process. -# This macro returns the minimum time (in seconds) required to run the -# expression after it has been compiled. -macro benchmark(expression) - return quote - $(esc(expression)) # Compile the expression first. Use esc for hygiene. - best_time = Inf - start_time = time_ns() - while time_ns() - start_time < 1e8 # Benchmark for 0.1 s (1e8 ns). - best_time = min(best_time, @elapsed $(esc(expression))) - end - best_time - end -end - -# This function is used for benchmarking ref_set_result!. -function call_array_func( - ref_set_result!::F, - ref_result_arrays, - inputs_arrays, - temp_values_arrays, -) where {F} - for arrays in - zip(ref_result_arrays, inputs_arrays..., temp_values_arrays...) - ref_set_result!(arrays...) - end -end - -function test_matrix_broadcast_against_array_reference(; - test_name, - inputs, - get_result::F1, - set_result!::F2, - get_temp_values::F3 = (_...) -> (), - ref_set_result!::F4, - time_ratio_limit = 10, - max_eps_error_limit = 7, - test_broken_with_cuda = false, -) where {F1, F2, F3, F4} - @testset "$test_name" begin - is_using_cuda = ClimaComms.device(inputs[1]) isa ClimaComms.CUDADevice - ignore_cuda = (AnyFrameModule(CUDA),) - - if test_broken_with_cuda && is_using_cuda - @test_throws CUDA.InvalidIRError get_result(inputs...) - @warn "$test_name:\n\tCUDA.InvalidIRError" - return - end - - result = get_result(inputs...) - temp_values = get_temp_values(inputs...) - - # Fill all output fields with NaNs for testing correctness. - result .*= NaN - for temp_value in temp_values - temp_value .*= NaN - end - - ref_result_arrays = MatrixFields.field2arrays(result) - inputs_arrays = map(MatrixFields.field2arrays, inputs) - temp_values_arrays = map(MatrixFields.field2arrays, temp_values) - - best_time = @benchmark set_result!(result, inputs...) - best_ref_time = @benchmark call_array_func( - ref_set_result!, - ref_result_arrays, - inputs_arrays, - temp_values_arrays, - ) - - # Compute the maximum error as an integer multiple of machine epsilon. - result_arrays = MatrixFields.field2arrays(result) - max_error = - maximum(zip(result_arrays, ref_result_arrays)) do (array, ref_array) - maximum(abs.(array .- ref_array)) - end - max_eps_error = ceil(Int, max_error / eps(typeof(max_error))) - - @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \ - = $best_ref_time s\n\tMaximum Error = $max_eps_error eps" - - # Test that set_result! is performant compared to ref_set_result!. - @test best_time / best_ref_time < time_ratio_limit - - # Test set_result! for correctness, allocations, and type instabilities. - # Ignore the type instabilities in CUDA and the allocations they incur. - @test max_eps_error < max_eps_error_limit - @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0 - @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...) - - # Test ref_set_result! for allocations and type instabilities. This is - # helpful for ensuring that the performance comparison is fair. - @test (@allocated call_array_func( - ref_set_result!, - ref_result_arrays, - inputs_arrays, - temp_values_arrays, - )) == 0 - @test_opt call_array_func( - ref_set_result!, - ref_result_arrays, - inputs_arrays, - temp_values_arrays, - ) - - # Test get_result (the allocating version of set_result!) for type - # instabilities. Ignore the type instabilities in CUDA. - @test_opt ignored_modules = ignore_cuda get_result(inputs...) - end -end - -function test_matrix_broadcast_against_reference(; - test_name, - inputs, - get_result::F1, - set_result!::F2, - ref_inputs, - ref_set_result!::F3, -) where {F1, F2, F3} - @testset "$test_name" begin - is_using_cuda = ClimaComms.device(inputs[1]) isa ClimaComms.CUDADevice - ignore_cuda = (AnyFrameModule(CUDA),) - - result = get_result(inputs...) - - # Fill the output field with NaNs for testing correctness. - result .*= NaN - - ref_result = copy(result) - - best_time = @benchmark set_result!(result, inputs...) - best_ref_time = @benchmark ref_set_result!(ref_result, ref_inputs...) - - @info "$test_name:\n\tBest Time = $best_time s\n\tBest Reference Time \ - = $best_ref_time s" - - # Test that set_result! is performant compared to ref_set_result!. - @test best_time < best_ref_time - - # Test set_result! for correctness, allocations, and type instabilities. - # Ignore the type instabilities in CUDA and the allocations they incur. - # Account for the roundoff errors that CUDA introduces. - @test if is_using_cuda - max_error = maximum(abs.(parent(result) .- parent(ref_result))) - max_error < eps(typeof(max_error)) - else - result == ref_result - end - @test is_using_cuda || (@allocated set_result!(result, inputs...)) == 0 - @test_opt ignored_modules = ignore_cuda set_result!(result, inputs...) - - # Test ref_set_result! for allocations and type instabilities. This is - # helpful for ensuring that the performance comparison is fair. Ignore - # the type instabilities in CUDA and the allocations they incur. - @test is_using_cuda || - (@allocated ref_set_result!(ref_result, ref_inputs...)) == 0 - @test_opt ignored_modules = ignore_cuda ref_set_result!( - ref_result, - ref_inputs..., - ) - - # Test get_result (the allocating version of set_result!) for type - # instabilities. Ignore the type instabilities in CUDA. - @test_opt ignored_modules = ignore_cuda get_result(inputs...) - end -end - -function random_test_fields(::Type{FT}) where {FT} - velem = 20 # This should be big enough to test high-bandwidth matrices. - helem = npoly = 1 # These should be small enough for the tests to be fast. - - comms_ctx = ClimaComms.SingletonCommsContext() - hdomain = Domains.SphereDomain(FT(10)) - hmesh = Meshes.EquiangularCubedSphere(hdomain, helem) - htopology = Topologies.Topology2D(comms_ctx, hmesh) - quad = Spaces.Quadratures.GLL{npoly + 1}() - hspace = Spaces.SpectralElementSpace2D(htopology, quad) - vdomain = Domains.IntervalDomain( - Geometry.ZPoint(FT(0)), - Geometry.ZPoint(FT(10)); - boundary_tags = (:bottom, :top), - ) - vmesh = Meshes.IntervalMesh(vdomain, nelems = velem) - vtopology = Topologies.IntervalTopology(comms_ctx, vmesh) - vspace = Spaces.CenterFiniteDifferenceSpace(vtopology) - sfc_coord = Fields.coordinate_field(hspace) - hypsography = - comms_ctx.device isa ClimaComms.CUDADevice ? Hypsography.Flat() : - Hypsography.LinearAdaption( - @. cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1 - ) # TODO: FD operators don't currently work with hypsography on GPUs. - center_space = - Spaces.ExtrudedFiniteDifferenceSpace(hspace, vspace, hypsography) - face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(center_space) - ᶜcoord = Fields.coordinate_field(center_space) - ᶠcoord = Fields.coordinate_field(face_space) - - seed!(1) # ensures reproducibility - ᶜᶜmat = map(c -> DiagonalMatrixRow(ntuple(_ -> rand(FT), 1)...), ᶜcoord) - ᶜᶠmat = map(c -> BidiagonalMatrixRow(ntuple(_ -> rand(FT), 2)...), ᶜcoord) - ᶠᶠmat = map(c -> TridiagonalMatrixRow(ntuple(_ -> rand(FT), 3)...), ᶠcoord) - ᶠᶜmat = map(c -> QuaddiagonalMatrixRow(ntuple(_ -> rand(FT), 4)...), ᶠcoord) - ᶜvec = map(c -> rand(FT), ᶜcoord) - ᶠvec = map(c -> rand(FT), ᶠcoord) - - return ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec -end +include("matrix_field_test_utils.jl") @testset "Scalar Matrix Field Broadcasting" begin - ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec = random_test_fields(Float64) + FT = Float64 + center_space, face_space = test_spaces(FT) - test_matrix_broadcast_against_array_reference(; + seed!(1) # ensures reproducibility + ᶜvec = random_field(FT, center_space) + ᶠvec = random_field(FT, face_space) + ᶜᶜmat = random_field(DiagonalMatrixRow{FT}, center_space) + ᶜᶠmat = random_field(BidiagonalMatrixRow{FT}, center_space) + ᶠᶠmat = random_field(TridiagonalMatrixRow{FT}, face_space) + ᶠᶜmat = random_field(QuaddiagonalMatrixRow{FT}, face_space) + + test_field_broadcast_against_array_reference(; test_name = "diagonal matrix times vector", - inputs = (ᶜᶜmat, ᶜvec), - get_result = (ᶜᶜmat, ᶜvec) -> (@. ᶜᶜmat ⋅ ᶜvec), - set_result! = (result, ᶜᶜmat, ᶜvec) -> (@. result = ᶜᶜmat ⋅ ᶜvec), + get_result = () -> (@. ᶜᶜmat ⋅ ᶜvec), + set_result! = result -> (@. result = ᶜᶜmat ⋅ ᶜvec), + input_fields = (ᶜᶜmat, ᶜvec), ref_set_result! = (_result, _ᶜᶜmat, _ᶜvec) -> mul!(_result, _ᶜᶜmat, _ᶜvec), - time_ratio_limit = 10, # This case's ref function is fast on Buildkite. ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "tri-diagonal matrix times vector", - inputs = (ᶠᶠmat, ᶠvec), - get_result = (ᶠᶠmat, ᶠvec) -> (@. ᶠᶠmat ⋅ ᶠvec), - set_result! = (result, ᶠᶠmat, ᶠvec) -> (@. result = ᶠᶠmat ⋅ ᶠvec), + get_result = () -> (@. ᶠᶠmat ⋅ ᶠvec), + set_result! = result -> (@. result = ᶠᶠmat ⋅ ᶠvec), + input_fields = (ᶠᶠmat, ᶠvec), ref_set_result! = (_result, _ᶠᶠmat, _ᶠvec) -> mul!(_result, _ᶠᶠmat, _ᶠvec), - time_ratio_limit = 10, # This case's ref function is fast on Buildkite. ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "quad-diagonal matrix times vector", - inputs = (ᶠᶜmat, ᶜvec), - get_result = (ᶠᶜmat, ᶜvec) -> (@. ᶠᶜmat ⋅ ᶜvec), - set_result! = (result, ᶠᶜmat, ᶜvec) -> (@. result = ᶠᶜmat ⋅ ᶜvec), + get_result = () -> (@. ᶠᶜmat ⋅ ᶜvec), + set_result! = result -> (@. result = ᶠᶜmat ⋅ ᶜvec), + input_fields = (ᶠᶜmat, ᶜvec), ref_set_result! = (_result, _ᶠᶜmat, _ᶜvec) -> mul!(_result, _ᶠᶜmat, _ᶜvec), - time_ratio_limit = 10, # This case's ref function is fast on Buildkite. ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "diagonal matrix times bi-diagonal matrix", - inputs = (ᶜᶜmat, ᶜᶠmat), - get_result = (ᶜᶜmat, ᶜᶠmat) -> (@. ᶜᶜmat ⋅ ᶜᶠmat), - set_result! = (result, ᶜᶜmat, ᶜᶠmat) -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat), + get_result = () -> (@. ᶜᶜmat ⋅ ᶜᶠmat), + set_result! = result -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat), + input_fields = (ᶜᶜmat, ᶜᶠmat), ref_set_result! = (_result, _ᶜᶜmat, _ᶜᶠmat) -> mul!(_result, _ᶜᶜmat, _ᶜᶠmat), ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "tri-diagonal matrix times tri-diagonal matrix", - inputs = (ᶠᶠmat,), - get_result = (ᶠᶠmat,) -> (@. ᶠᶠmat ⋅ ᶠᶠmat), - set_result! = (result, ᶠᶠmat) -> (@. result = ᶠᶠmat ⋅ ᶠᶠmat), + get_result = () -> (@. ᶠᶠmat ⋅ ᶠᶠmat), + set_result! = result -> (@. result = ᶠᶠmat ⋅ ᶠᶠmat), + input_fields = (ᶠᶠmat,), ref_set_result! = (_result, _ᶠᶠmat) -> mul!(_result, _ᶠᶠmat, _ᶠᶠmat), ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "quad-diagonal matrix times diagonal matrix", - inputs = (ᶠᶜmat, ᶜᶜmat), - get_result = (ᶠᶜmat, ᶜᶜmat) -> (@. ᶠᶜmat ⋅ ᶜᶜmat), - set_result! = (result, ᶠᶜmat, ᶜᶜmat) -> (@. result = ᶠᶜmat ⋅ ᶜᶜmat), + get_result = () -> (@. ᶠᶜmat ⋅ ᶜᶜmat), + set_result! = result -> (@. result = ᶠᶜmat ⋅ ᶜᶜmat), + input_fields = (ᶠᶜmat, ᶜᶜmat), ref_set_result! = (_result, _ᶠᶜmat, _ᶜᶜmat) -> mul!(_result, _ᶠᶜmat, _ᶜᶜmat), ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "diagonal matrix times bi-diagonal matrix times \ tri-diagonal matrix times quad-diagonal matrix", - inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), - get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> - (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), - set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> - (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), - get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + get_result = () -> (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), + set_result! = result -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), + input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_temp_value_fields = () -> ((@. ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat)), ref_set_result! = ( _result, @@ -303,16 +91,14 @@ end end, ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "diagonal matrix times bi-diagonal matrix times \ tri-diagonal matrix times quad-diagonal matrix, but with \ forced right-associativity", - inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), - get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> - (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), - set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> - (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), - get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + get_result = () -> (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), + set_result! = result -> (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), + input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_temp_value_fields = () -> ((@. ᶠᶠmat ⋅ ᶠᶜmat), (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ ᶠᶜmat))), ref_set_result! = ( _result, @@ -330,16 +116,15 @@ end test_broken_with_cuda = true, # TODO: Fix this. ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "diagonal matrix times bi-diagonal matrix times \ tri-diagonal matrix times quad-diagonal matrix times \ vector", - inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec), - get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> - (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec), - set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> + get_result = () -> (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec), + set_result! = result -> (@. result = ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec), - get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> ( + input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec), + get_temp_value_fields = () -> ( (@. ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat), (@. ᶜᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶠmat ⋅ ᶠᶜmat), @@ -362,16 +147,15 @@ end end, ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "diagonal matrix times bi-diagonal matrix times \ tri-diagonal matrix times quad-diagonal matrix times \ vector, but with forced right-associativity", - inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec), - get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> - (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))), - set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> + get_result = () -> (@. ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))), + set_result! = result -> (@. result = ᶜᶜmat ⋅ (ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)))), - get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec) -> ( + input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec), + get_temp_value_fields = () -> ( (@. ᶠᶜmat ⋅ ᶜvec), (@. ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec)), (@. ᶜᶠmat ⋅ (ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜvec))), @@ -396,14 +180,14 @@ end test_broken_with_cuda = true, # TODO: Fix this. ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "linear combination of matrix products and LinearAlgebra.I", - inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), - get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + get_result = () -> (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), - set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + set_result! = result -> (@. result = 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)), - get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( + input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_temp_value_fields = () -> ( (@. 2 * ᶠᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), @@ -429,15 +213,15 @@ end end, ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "another linear combination of matrix products and \ LinearAlgebra.I", - inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), - get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + get_result = () -> (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), - set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + set_result! = result -> (@. result = ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)), - get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( + input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_temp_value_fields = () -> ( (@. ᶠᶜmat ⋅ ᶜᶜmat), (@. ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), (@. ᶠᶠmat / 3), @@ -463,14 +247,14 @@ end end, ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "matrix times linear combination", - inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), - get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. ᶜᶠmat ⋅ + get_result = () -> (@. ᶜᶠmat ⋅ (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), - set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + set_result! = result -> (@. result = ᶜᶠmat ⋅ (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,))), - get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( + input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_temp_value_fields = () -> ( (@. 2 * ᶠᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), @@ -499,16 +283,16 @@ end end, ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "linear combination times another linear combination", - inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), - get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> + get_result = () -> (@. (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))), - set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + set_result! = result -> (@. result = (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,))), - get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( + input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_temp_value_fields = () -> ( (@. 2 * ᶠᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat), @@ -554,22 +338,22 @@ end max_eps_error_limit = 30, # This case's roundoff error is large on GPUs. ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "matrix times matrix times linear combination times matrix \ times another linear combination times matrix", - inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), - get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅ + get_result = () -> (@. ᶠᶜmat ⋅ ᶜᶠmat ⋅ (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)) ⋅ ᶠᶠmat), - set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> (@. result = + set_result! = result -> (@. result = ᶠᶜmat ⋅ ᶜᶠmat ⋅ (2 * ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat + ᶠᶠmat ⋅ ᶠᶠmat / 3 - (4I,)) ⋅ ᶠᶠmat ⋅ (ᶠᶜmat ⋅ ᶜᶜmat ⋅ ᶜᶠmat * 2 - (ᶠᶠmat / 3) ⋅ ᶠᶠmat + (4I,)) ⋅ ᶠᶠmat), - get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat) -> ( + input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat), + get_temp_value_fields = () -> ( (@. ᶠᶜmat ⋅ ᶜᶠmat), (@. 2 * ᶠᶜmat), (@. 2 * ᶠᶜmat ⋅ ᶜᶜmat), @@ -630,23 +414,21 @@ end mul!(_temp14, _temp8, _temp13) mul!(_result, _temp14, _ᶠᶠmat) end, - time_ratio_limit = 10, # This case's ref function is fast on Buildkite. max_eps_error_limit = 70, # This case's roundoff error is large on GPUs. ) - test_matrix_broadcast_against_array_reference(; + test_field_broadcast_against_array_reference(; test_name = "matrix constructions and multiplications", - inputs = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec), - get_result = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) -> + get_result = () -> (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅ DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)), - set_result! = (result, ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) -> - (@. result = - BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ - TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅ - DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)), - get_temp_values = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec) -> ( + set_result! = result -> (@. result = + BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ + TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1) ⋅ ᶠᶠmat ⋅ + DiagonalMatrixRow(DiagonalMatrixRow(ᶠvec) ⋅ ᶠvec)), + input_fields = (ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec), + get_temp_value_fields = () -> ( (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec)), (@. TridiagonalMatrixRow(ᶠvec, ᶠᶜmat ⋅ ᶜvec, 1)), (@. BidiagonalMatrixRow(ᶜᶠmat ⋅ ᶠvec, ᶜᶜmat ⋅ ᶜvec) ⋅ @@ -686,15 +468,21 @@ end end @testset "Non-scalar Matrix Field Broadcasting" begin - ᶜᶜmat, ᶜᶠmat, ᶠᶠmat, ᶠᶜmat, ᶜvec, ᶠvec = random_test_fields(Float64) + FT = Float64 + center_space, face_space = test_spaces(FT) - ᶜlg = Fields.local_geometry_field(ᶜvec) - ᶠlg = Fields.local_geometry_field(ᶠvec) + ᶜlg = Fields.local_geometry_field(center_space) + ᶠlg = Fields.local_geometry_field(face_space) - ᶜᶠmat2 = map(row -> map(sin, row), ᶜᶠmat) - ᶜᶠmat3 = map(row -> map(cos, row), ᶜᶠmat) - ᶠᶜmat2 = map(row -> map(sin, row), ᶠᶜmat) - ᶠᶜmat3 = map(row -> map(cos, row), ᶠᶜmat) + seed!(1) # ensures reproducibility + ᶜvec = random_field(FT, center_space) + ᶠvec = random_field(FT, face_space) + ᶜᶠmat = random_field(BidiagonalMatrixRow{FT}, center_space) + ᶜᶠmat2 = random_field(BidiagonalMatrixRow{FT}, center_space) + ᶜᶠmat3 = random_field(BidiagonalMatrixRow{FT}, center_space) + ᶠᶜmat = random_field(QuaddiagonalMatrixRow{FT}, face_space) + ᶠᶜmat2 = random_field(QuaddiagonalMatrixRow{FT}, face_space) + ᶠᶜmat3 = random_field(QuaddiagonalMatrixRow{FT}, face_space) ᶜᶠmat_AC1 = map(row -> map(adjoint ∘ Geometry.Covariant1Vector, row), ᶜᶠmat) ᶜᶠmat_C12 = map( @@ -709,48 +497,26 @@ end ᶠᶜmat3, ) - test_matrix_broadcast_against_reference(; + test_field_broadcast(; test_name = "matrix of covectors times matrix of vectors", - inputs = (ᶜᶠmat_AC1, ᶠᶜmat_C12), - get_result = (ᶜᶠmat_AC1, ᶠᶜmat_C12) -> (@. ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12), - set_result! = (result, ᶜᶠmat_AC1, ᶠᶜmat_C12) -> - (@. result = ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12), - ref_inputs = (ᶜᶠmat, ᶠᶜmat2, ᶠᶜmat3, ᶠlg), - ref_set_result! = (result, ᶜᶠmat, ᶠᶜmat2, ᶠᶜmat3, ᶠlg) -> (@. result = + get_result = () -> (@. ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12), + set_result! = result -> (@. result = ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12), + ref_set_result! = result -> (@. result = ᶜᶠmat ⋅ ( DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3 )), ) - test_matrix_broadcast_against_reference(; + test_field_broadcast(; test_name = "matrix of covectors times matrix of vectors times matrix \ of numbers times matrix of covectors times matrix of \ vectors", - inputs = (ᶜᶠmat_AC1, ᶠᶜmat_C12, ᶜᶠmat, ᶠᶜmat_AC1, ᶜᶠmat_C12), - get_result = (ᶜᶠmat_AC1, ᶠᶜmat_C12, ᶜᶠmat, ᶠᶜmat_AC1, ᶜᶠmat_C12) -> + get_result = () -> (@. ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12 ⋅ ᶜᶠmat ⋅ ᶠᶜmat_AC1 ⋅ ᶜᶠmat_C12), - set_result! = ( - result, - ᶜᶠmat_AC1, - ᶠᶜmat_C12, - ᶜᶠmat, - ᶠᶜmat_AC1, - ᶜᶠmat_C12, - ) -> + set_result! = result -> (@. result = ᶜᶠmat_AC1 ⋅ ᶠᶜmat_C12 ⋅ ᶜᶠmat ⋅ ᶠᶜmat_AC1 ⋅ ᶜᶠmat_C12), - ref_inputs = (ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3, ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3, ᶜlg, ᶠlg), - ref_set_result! = ( - result, - ᶜᶠmat, - ᶜᶠmat2, - ᶜᶠmat3, - ᶠᶜmat, - ᶠᶜmat2, - ᶠᶜmat3, - ᶜlg, - ᶠlg, - ) -> (@. result = + ref_set_result! = result -> (@. result = ᶜᶠmat ⋅ ( DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3 @@ -767,43 +533,15 @@ end ᶠᶜmat_C12_AC1 = map((row1, row2) -> map(tuple, row1, row2), ᶠᶜmat_C12, ᶠᶜmat_AC1) - test_matrix_broadcast_against_reference(; + test_field_broadcast(; test_name = "matrix of covectors and numbers times matrix of vectors \ and covectors times matrix of numbers and vectors times \ vector of numbers", - inputs = (ᶜᶠmat_AC1_num, ᶠᶜmat_C12_AC1, ᶜᶠmat_num_C12, ᶠvec), - get_result = (ᶜᶠmat_AC1_num, ᶠᶜmat_C12_AC1, ᶜᶠmat_num_C12, ᶠvec) -> + get_result = () -> (@. ᶜᶠmat_AC1_num ⋅ ᶠᶜmat_C12_AC1 ⋅ ᶜᶠmat_num_C12 ⋅ ᶠvec), - set_result! = ( - result, - ᶜᶠmat_AC1_num, - ᶠᶜmat_C12_AC1, - ᶜᶠmat_num_C12, - ᶠvec, - ) -> (@. result = ᶜᶠmat_AC1_num ⋅ ᶠᶜmat_C12_AC1 ⋅ ᶜᶠmat_num_C12 ⋅ ᶠvec), - ref_inputs = ( - ᶜᶠmat, - ᶜᶠmat2, - ᶜᶠmat3, - ᶠᶜmat, - ᶠᶜmat2, - ᶠᶜmat3, - ᶠvec, - ᶜlg, - ᶠlg, - ), - ref_set_result! = ( - result, - ᶜᶠmat, - ᶜᶠmat2, - ᶜᶠmat3, - ᶠᶜmat, - ᶠᶜmat2, - ᶠᶜmat3, - ᶠvec, - ᶜlg, - ᶠlg, - ) -> (@. result = tuple( + set_result! = result -> + (@. result = ᶜᶠmat_AC1_num ⋅ ᶠᶜmat_C12_AC1 ⋅ ᶜᶠmat_num_C12 ⋅ ᶠvec), + ref_set_result! = result -> (@. result = tuple( ᶜᶠmat ⋅ ( DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:1) ⋅ ᶠᶜmat2 + DiagonalMatrixRow(ᶠlg.gⁱʲ.components.data.:2) ⋅ ᶠᶜmat3 @@ -815,32 +553,20 @@ end )), ) - T(value1, value2, value3) = - (; a = (), b = value1, c = (value2, (; d = (value3,)), (;))) - ᶜᶠmat_T = map((rows...) -> map(T, rows...), ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3) - ᶠᶜmat_T = map((rows...) -> map(T, rows...), ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3) - ᶜvec_T = @. T(ᶜvec, ᶜvec, ᶜvec) + ᶜvec_NT = @. nested_type(ᶜvec, ᶜvec, ᶜvec) + ᶜᶠmat_NT = + map((rows...) -> map(nested_type, rows...), ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3) + ᶠᶜmat_NT = + map((rows...) -> map(nested_type, rows...), ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3) - test_matrix_broadcast_against_reference(; + test_field_broadcast(; test_name = "matrix of nested values times matrix of nested values \ times matrix of numbers times matrix of numbers times \ - times vector of nested values", - inputs = (ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T), - get_result = (ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T) -> - (@. ᶜᶠmat_T ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_T ⋅ ᶜvec_T), - set_result! = (result, ᶜᶠmat_T, ᶠᶜmat, ᶜᶠmat, ᶠᶜmat_T, ᶜvec_T) -> - (@. result = ᶜᶠmat_T ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_T ⋅ ᶜvec_T), - ref_inputs = (ᶜᶠmat, ᶜᶠmat2, ᶜᶠmat3, ᶠᶜmat, ᶠᶜmat2, ᶠᶜmat3, ᶜvec), - ref_set_result! = ( - result, - ᶜᶠmat, - ᶜᶠmat2, - ᶜᶠmat3, - ᶠᶜmat, - ᶠᶜmat2, - ᶠᶜmat3, - ᶜvec, - ) -> (@. result = T( + vector of nested values", + get_result = () -> (@. ᶜᶠmat_NT ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_NT ⋅ ᶜvec_NT), + set_result! = result -> + (@. result = ᶜᶠmat_NT ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat_NT ⋅ ᶜvec_NT), + ref_set_result! = result -> (@. result = nested_type( ᶜᶠmat ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat ⋅ ᶜvec, ᶜᶠmat2 ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat2 ⋅ ᶜvec, ᶜᶠmat3 ⋅ ᶠᶜmat ⋅ ᶜᶠmat ⋅ ᶠᶜmat3 ⋅ ᶜvec, diff --git a/test/MatrixFields/matrix_field_test_utils.jl b/test/MatrixFields/matrix_field_test_utils.jl new file mode 100644 index 0000000000..555b9fc3a2 --- /dev/null +++ b/test/MatrixFields/matrix_field_test_utils.jl @@ -0,0 +1,235 @@ +using Test +using JET +import CUDA +import Random: seed! + +import ClimaComms +import ClimaCore: + Geometry, Domains, Meshes, Topologies, Hypsography, Spaces, Fields +using ClimaCore.MatrixFields + +# Test that an expression is true and that it is also type-stable. +macro test_all(expression) + return quote + local test_func() = $(esc(expression)) + @test test_func() # correctness + @test (@allocated test_func()) == 0 # allocations + @test_opt test_func() # type instabilities + end +end + +# Compute the minimum time (in seconds) required to run an expression after it +# has been compiled. This macro is used instead of @benchmark from +# BenchmarkTools.jl because the latter is extremely slow (it appears to keep +# triggering recompilations and allocating a lot of memory in the process). +macro benchmark(expression) + return quote + $(esc(expression)) # Compile the expression first. Use esc for hygiene. + best_time = Inf + start_time = time_ns() + while time_ns() - start_time < 1e8 # Benchmark for 0.1 s (1e8 ns). + best_time = min(best_time, @elapsed $(esc(expression))) + end + best_time + end +end + +const ignore_cuda = (AnyFrameModule(CUDA),) + +is_using_cuda() = ClimaComms.device() isa ClimaComms.CUDADevice + +# Test the allocating and non-allocating versions of a field broadcast against +# a reference non-allocating implementation. Ensure that they are performant, +# correct, and type-stable, and print some useful information. If a reference +# implementation is not available, the performance and correctness checks are +# skipped. +function test_field_broadcast(; + test_name, + get_result::F1, + set_result!::F2, + ref_set_result!::F3 = nothing, + time_ratio_limit = 10, + max_eps_error_limit = 10, + test_broken_with_cuda = false, +) where {F1, F2, F3} + @testset "$test_name" begin + if test_broken_with_cuda && is_using_cuda() + @test_throws CUDA.InvalidIRError get_result() + @warn "$test_name:\n\tCUDA.InvalidIRError" + return + end + + result = get_result() + time = @benchmark set_result!(result) + time_rounded = round(time; sigdigits = 2) + + if isnothing(ref_set_result!) + @info "$test_name:\n\tTime = $time_rounded s (reference \ + implementation unavailable)" + else + ref_result = similar(result) + ref_time = @benchmark ref_set_result!(ref_result) + ref_time_rounded = round(ref_time; sigdigits = 2) + time_ratio = time / ref_time + time_ratio_rounded = round(time_ratio; sigdigits = 2) + max_error = maximum(abs.(parent(result) .- parent(ref_result))) + max_eps_error = ceil(Int, max_error / eps(typeof(max_error))) + + @info "$test_name:\n\tTime Ratio = $time_ratio_rounded \ + ($time_rounded s vs. $ref_time_rounded s for reference) \ + \n\tMaximum Error = $max_eps_error eps" + + # Test that set_result! is performant and correct when compared + # against ref_set_result!. + @test time / ref_time <= time_ratio_limit + @test max_eps_error <= max_eps_error_limit + end + + # Test get_result and set_result! for type instabilities, and test + # set_result! for allocations. Ignore the type instabilities in CUDA and + # the allocations they incur. + @test_opt ignored_modules = ignore_cuda get_result() + @test_opt ignored_modules = ignore_cuda set_result!(result) + @test is_using_cuda() || (@allocated set_result!(result)) == 0 + + if !isnothing(ref_set_result!) + # Test ref_set_result! for type instabilities and allocations to + # ensure that the performance comparison is fair. + @test_opt ignored_modules = ignore_cuda ref_set_result!(ref_result) + @test is_using_cuda() || + (@allocated ref_set_result!(ref_result)) == 0 + end + end +end + +# Test the allocating and non-allocating versions of a field broadcast against +# a reference array-based non-allocating implementation. Ensure that they are +# performant, correct, and type-stable, and print some useful information. In +# order for the input arrays and temporary scratch arrays used by the reference +# implementation to be generated automatically, the corresponding fields must be +# passed to this function. +function test_field_broadcast_against_array_reference(; + test_name, + get_result::F1, + set_result!::F2, + input_fields, + get_temp_value_fields = () -> (), + ref_set_result!::F3, + time_ratio_limit = 10, + max_eps_error_limit = 10, + test_broken_with_cuda = false, +) where {F1, F2, F3} + @testset "$test_name" begin + if test_broken_with_cuda && is_using_cuda() + @test_throws CUDA.InvalidIRError get_result() + @warn "$test_name:\n\tCUDA.InvalidIRError" + return + end + + result = get_result() + time = @benchmark set_result!(result) + time_rounded = round(time; sigdigits = 2) + + ref_result = similar(result) + temp_value_fields = map(similar, get_temp_value_fields()) + + result_arrays = MatrixFields.field2arrays(result) + ref_result_arrays = MatrixFields.field2arrays(ref_result) + inputs_arrays = map(MatrixFields.field2arrays, input_fields) + temp_values_arrays = map(MatrixFields.field2arrays, temp_value_fields) + + function call_ref_set_result!() + for arrays in + zip(ref_result_arrays, inputs_arrays..., temp_values_arrays...) + ref_set_result!(arrays...) + end + end + + ref_time = @benchmark call_ref_set_result!() + ref_time_rounded = round(ref_time; sigdigits = 2) + time_ratio = time / ref_time + time_ratio_rounded = round(time_ratio; sigdigits = 2) + max_error = + maximum(zip(result_arrays, ref_result_arrays)) do (array, ref_array) + maximum(abs.(array .- ref_array)) + end + max_eps_error = ceil(Int, max_error / eps(typeof(max_error))) + + @info "$test_name:\n\tTime Ratio = $time_ratio_rounded ($time_rounded \ + s vs. $ref_time_rounded s for reference)\n\tMaximum Error = \ + $max_eps_error eps" + + # Test that set_result! is performant and correct when compared against + # ref_set_result!. + @test time / ref_time <= time_ratio_limit + @test max_eps_error <= max_eps_error_limit + + # Test get_result and set_result! for type instabilities, and test + # set_result! for allocations. Ignore the type instabilities in CUDA and + # the allocations they incur. + @test_opt ignored_modules = ignore_cuda get_result() + @test_opt ignored_modules = ignore_cuda set_result!(result) + @test is_using_cuda() || (@allocated set_result!(result)) == 0 + + # Test ref_set_result! for type instabilities and allocations to ensure + # that the performance comparison is fair. + @test_opt ignored_modules = ignore_cuda call_ref_set_result!() + @test is_using_cuda() || (@allocated call_ref_set_result!()) == 0 + end +end + +# Generate extruded finite difference spaces for testing. Include topography +# when possible. +function test_spaces(::Type{FT}) where {FT} + velem = 20 # This should be big enough to test high-bandwidth matrices. + helem = npoly = 1 # These should be small enough for the tests to be fast. + + comms_ctx = ClimaComms.SingletonCommsContext() + hdomain = Domains.SphereDomain(FT(10)) + hmesh = Meshes.EquiangularCubedSphere(hdomain, helem) + htopology = Topologies.Topology2D(comms_ctx, hmesh) + quad = Spaces.Quadratures.GLL{npoly + 1}() + hspace = Spaces.SpectralElementSpace2D(htopology, quad) + vdomain = Domains.IntervalDomain( + Geometry.ZPoint(FT(0)), + Geometry.ZPoint(FT(10)); + boundary_tags = (:bottom, :top), + ) + vmesh = Meshes.IntervalMesh(vdomain, nelems = velem) + vtopology = Topologies.IntervalTopology(comms_ctx, vmesh) + vspace = Spaces.CenterFiniteDifferenceSpace(vtopology) + sfc_coord = Fields.coordinate_field(hspace) + hypsography = + is_using_cuda() ? Hypsography.Flat() : + Hypsography.LinearAdaption( + @. cosd(sfc_coord.lat) + cosd(sfc_coord.long) + 1 + ) # TODO: FD operators don't currently work with hypsography on GPUs. + center_space = + Spaces.ExtrudedFiniteDifferenceSpace(hspace, vspace, hypsography) + face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(center_space) + + return center_space, face_space +end + +# Generate a random field with elements of type T. +function random_field(::Type{T}, space) where {T} + FT = Spaces.undertype(space) + field = Fields.Field(T, space) + parent(field) .= rand.(FT) + return field +end + +# Construct a highly nested type for testing integration with RecursiveApply. +nested_type(value) = nested_type(value, value, value) +nested_type(value1, value2, value3) = + (; a = (), b = value1, c = (value2, (; d = (value3,)), (;))) + +# A shorthand for typeof(nested_type(::FT)). +const NestedType{FT} = NamedTuple{ + (:a, :b, :c), + Tuple{ + Tuple{}, + FT, + Tuple{FT, NamedTuple{(:d,), Tuple{Tuple{FT}}}, NamedTuple{(), Tuple{}}}, + }, +} diff --git a/test/MatrixFields/rmul_with_projection.jl b/test/MatrixFields/rmul_with_projection.jl index acc701649f..6bcb95c2a9 100644 --- a/test/MatrixFields/rmul_with_projection.jl +++ b/test/MatrixFields/rmul_with_projection.jl @@ -1,11 +1,9 @@ -using Test -using JET -using Random: seed! using StaticArrays: @SMatrix -import ClimaCore: Geometry import ClimaCore.MatrixFields: rmul_with_projection, rmul_return_type +include("matrix_field_test_utils.jl") + function test_rmul_with_projection(x::X, y::Y, lg, expected_result) where {X, Y} result = rmul_with_projection(x, y, lg) result_type = rmul_return_type(X, Y) @@ -73,8 +71,7 @@ end test_rmul_with_projection(cotensor, cotensor, lg, cotensor * cotensor) # Test some combinations of complicated nested values. - T(value1, value2, value3) = - (; a = (), b = value1, c = (value2, (; d = (value3,)), (;))) + T = nested_type test_rmul_with_projection( number, T(covector, vector, tensor),