Skip to content

Commit

Permalink
Merge pull request #1732 from CliMA/ck/specialize_multiple_field_solve
Browse files Browse the repository at this point in the history
Specialize cases in run_field_matrix_solver, add debug info
  • Loading branch information
charleskawczynski authored May 17, 2024
2 parents a9d80a4 + 5f2e58e commit db8780f
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 61 deletions.
54 changes: 36 additions & 18 deletions ext/cuda/cuda_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ get_n_items(arr::AbstractArray) = prod(size(parent(arr)))
get_n_items(tup::Tuple) = prod(tup)

const reported_stats = Dict()
# Call via ClimaCore.DataLayouts.empty_kernel_stats()
empty_kernel_stats(::ClimaComms.CUDADevice) = empty!(reported_stats)
collect_kernel_stats() = false

Expand All @@ -22,58 +23,75 @@ collect_kernel_stats() = false
AbstractData,
Field,
};
auto = false,
threads_s,
blocks_s,
always_inline = true
)
Launch a cuda kernel, using `CUDA.launch_configuration`
Launch a cuda kernel, using `CUDA.launch_configuration` (if `auto=true`)
to determine the number of threads/blocks.
Suggested threads and blocks (`threads_s`, `blocks_s`) can be given
to benchmark compare against auto-determined threads/blocks.
to benchmark compare against auto-determined threads/blocks (if `auto=false`).
"""
function auto_launch!(
f!::F!,
args,
data;
threads_s,
blocks_s,
auto = false,
threads_s = nothing,
blocks_s = nothing,
always_inline = true,
caller = :unknown,
) where {F!}
nitems = get_n_items(data)
# For now, we'll simply use the
# suggested threads and blocks:
kernel =
CUDA.@cuda always_inline = always_inline threads = threads_s blocks =
blocks_s f!(args...)
if auto
nitems = get_n_items(data)
kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
config = CUDA.launch_configuration(kernel.fun)
threads = min(nitems, config.threads)
blocks = cld(nitems, threads)
kernel(args...; threads, blocks) # This knows to use always_inline from above.
else
kernel =
CUDA.@cuda always_inline = always_inline threads = threads_s blocks =
blocks_s f!(args...)
end

if collect_kernel_stats() # only for development use
key = (F!, typeof(args))
key = (F!, typeof(args), CUDA.registers(kernel))
# CUDA.registers(kernel) > 50 || return nothing # for debugging
# occursin("single_field_solve_kernel", string(nameof(F!))) || return nothing
if !haskey(reported_stats, key)
nitems = get_n_items(data)
kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
config = CUDA.launch_configuration(kernel.fun)
threads = min(nitems, config.threads)
blocks = cld(nitems, threads)
# For now, let's just collect info, later we can benchmark
#! format: off
s = ""
s *= "Launching kernel $f! with following config:\n"
s *= " nitems: $(nitems)\n"
s *= " threads_s: $(threads_s)\n"
s *= " blocks_s: $(blocks_s)\n"
isnothing(threads_s) || (s *= " threads_s: $(threads_s)\n")
isnothing(blocks_s) || (s *= " blocks_s: $(blocks_s)\n")
s *= " threads: $(threads)\n"
s *= " blocks: $(blocks)\n"
s *= " Δthreads: $(threads - prod(threads_s))\n"
s *= " Δblocks: $(blocks - prod(blocks_s))\n"
isnothing(threads_s) || (s *= " Δthreads: $(threads - prod(threads_s))\n")
isnothing(blocks_s) || (s *= " Δblocks: $(blocks - prod(blocks_s))\n")
s *= " maxthreads: $(CUDA.maxthreads(kernel))\n"
s *= " registers: $(CUDA.registers(kernel))\n"
s *= " threads_s_frac: $(prod(threads_s)/CUDA.maxthreads(kernel))\n"
isnothing(threads_s) || ( s *= " threads_s_frac: $(prod(threads_s)/CUDA.maxthreads(kernel))\n")
s *= " memory: $(CUDA.memory(kernel))\n"
@info s
#! format: on
reported_stats[key] = true
# error("Oops") # for debugging
# Main.Infiltrator.@exfiltrate # for debugging/performance optimization
end
# For now, let's just collect info, later we can benchmark
# kernel(args...; threads, blocks) # This knows to use always_inline from above.
# end
end
return nothing
end

"""
Expand Down
35 changes: 9 additions & 26 deletions ext/cuda/matrix_fields_multiple_field_solve.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,25 +50,15 @@ NVTX.@annotate function multiple_field_solve!(
)
end

column_A(A::UniformScaling, i, j, h) = A
column_A(A, i, j, h) = Spaces.column(A, i, j, h)

function get_ijhn(Ni, Nj, Nh, Nnames, blockIdx, threadIdx, blockDim, gridDim)
tidx = (blockIdx.x - 1) * blockDim.x + threadIdx.x
(i, j, h, n) = if 1 tidx prod((Ni, Nj, Nh, Nnames))
CartesianIndices((1:Ni, 1:Nj, 1:Nh, 1:Nnames))[tidx].I
else
(-1, -1, -1, -1)
end
return (i, j, h, n)
end
Base.@propagate_inbounds column_A(A::UniformScaling, i, j, h) = A
Base.@propagate_inbounds column_A(A, i, j, h) = Spaces.column(A, i, j, h)

@generated function generated_single_field_solve!(
device,
caches,
xs,
As,
bs,
device,
i,
j,
h,
Expand All @@ -78,11 +68,11 @@ end
return quote
Base.Cartesian.@nif $Nnames ξ -> (iname == ξ) ξ -> begin
_single_field_solve!(
device,
column_A(caches[ξ], i, j, h),
column_A(xs[ξ], i, j, h),
column_A(As[ξ], i, j, h),
column_A(bs[ξ], i, j, h),
device,
)
end
end
Expand All @@ -99,23 +89,16 @@ function multiple_field_solve_kernel!(
) where {Nnames}
@inbounds begin
Ni, Nj, _, _, Nh = size(Fields.field_values(x1))
(i, j, h, iname) = get_ijhn(
Ni,
Nj,
Nh,
Nnames,
CUDA.blockIdx(),
CUDA.threadIdx(),
CUDA.blockDim(),
CUDA.gridDim(),
)
if 1 i <= Ni && 1 j Nj && 1 h Nh && 1 iname Nnames
tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
if 1 tidx prod((Ni, Nj, Nh, Nnames))
(i, j, h, iname) =
CartesianIndices((1:Ni, 1:Nj, 1:Nh, 1:Nnames))[tidx].I
generated_single_field_solve!(
device,
caches,
xs,
As,
bs,
device,
i,
j,
h,
Expand Down
43 changes: 34 additions & 9 deletions ext/cuda/matrix_fields_single_field_solve.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,44 @@ import LinearAlgebra: UniformScaling
import ClimaCore.Operators
import ClimaCore.Fields: Field
import ClimaCore.Fields
import ClimaCore.Spaces
import ClimaCore.Topologies
import ClimaCore.MatrixFields: single_field_solve!
import ClimaCore.MatrixFields: _single_field_solve!
import ClimaCore.MatrixFields: band_matrix_solve!, unzip_tuple_field_values
import ClimaCore.RecursiveApply: , , , rmap, rzero, rdiv

# called by TuplesOfNTuples.jl's `inner_dispatch`:
# which requires a particular argument order:
_single_field_solve!(
cache::Fields.Field,
x::Fields.Field,
A::Union{Fields.Field, UniformScaling},
b::Fields.Field,
dev::ClimaComms.CUDADevice,
) = _single_field_solve!(dev, cache, x, A, b)
function single_field_solve!(device::ClimaComms.CUDADevice, cache, x, A, b)
Ni, Nj, _, _, Nh = size(Fields.field_values(A))
Ni, Nj, _, _, Nh = size(Fields.field_values(A))
nitems = Ni * Nj * Nh
nthreads = min(256, nitems)
nblocks = cld(nitems, nthreads)
args = (device, cache, x, A, b)
auto_launch!(
single_field_solve_kernel!,
args,
x;
threads_s = nthreads,
blocks_s = nblocks,
)
end

function single_field_solve_kernel!(device, cache, x, A, b)
idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
Ni, Nj, _, _, Nh = size(Fields.field_values(A))
if idx <= Ni * Nj * Nh
i, j, h = Topologies._get_idx((Ni, Nj, Nh), idx)
_single_field_solve!(
device,
Spaces.column(cache, i, j, h),
Spaces.column(x, i, j, h),
Spaces.column(A, i, j, h),
Spaces.column(b, i, j, h),
)
end
return nothing
end

function _single_field_solve!(
::ClimaComms.CUDADevice,
Expand Down
26 changes: 18 additions & 8 deletions src/MatrixFields/field_matrix_solver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -247,14 +247,24 @@ function check_field_matrix_solver(::BlockDiagonalSolve, _, A, b)
end
end

NVTX.@annotate run_field_matrix_solver!(::BlockDiagonalSolve, cache, x, A, b) =
multiple_field_solve!(cache, x, A, b)

# This may be helpful for debugging:
# run_field_matrix_solver!(::BlockDiagonalSolve, cache, x, A, b) =
# foreach(matrix_row_keys(keys(A))) do name
# single_field_solve!(cache[name], x[name], A[name, name], b[name])
# end
NVTX.@annotate function run_field_matrix_solver!(
::BlockDiagonalSolve,
cache,
x,
A,
b,
)
names = matrix_row_keys(keys(A))
if length(names) == 1 ||
all(name -> A[name, name] isa UniformScaling, names.values)
foreach(names) do name
single_field_solve!(cache[name], x[name], A[name, name], b[name])
end
else
multiple_field_solve!(cache, x, A, b)
end
return nothing
end

"""
BlockLowerTriangularSolve(names₁...; [alg₁], [alg₂])
Expand Down

0 comments on commit db8780f

Please sign in to comment.