Skip to content

Commit

Permalink
Extend tests for restarts
Browse files Browse the repository at this point in the history
  • Loading branch information
Sbozzolo committed Sep 20, 2024
1 parent 448d485 commit e154aa9
Show file tree
Hide file tree
Showing 4 changed files with 352 additions and 234 deletions.
49 changes: 38 additions & 11 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,44 @@ steps:
--job_id sphere_ssp_baroclinic_wave_rhoe_equilmoist_earth
artifact_paths: "sphere_ssp_baroclinic_wave_rhoe_equilmoist_earth/output_active/*"

# Restart tests: the same script (test/restart.jl) is run under several
# combinations of the CLIMACOMMS_DEVICE / CLIMACOMMS_CONTEXT environment
# variables and Slurm resources.
- group: "Restarting"
steps:

# No CLIMACOMMS_* overrides and no GPU requested for this step.
- label: ":computer: test restart"
command: >
julia --color=yes --project=examples test/restart.jl
agents:
slurm_mem: 16GB

# Single task with CLIMACOMMS_DEVICE set to CUDA and one GPU requested.
- label: ":computer: test restart GPU"
command: >
julia --color=yes --project=examples test/restart.jl
env:
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus: 1
slurm_mem: 16G

# Launched through srun with two Slurm tasks and CLIMACOMMS_CONTEXT set to MPI.
- label: ":computer: test restart MPI"
command: >
srun julia --color=yes --project=examples test/restart.jl
env:
CLIMACOMMS_CONTEXT: "MPI"
agents:
slurm_ntasks: 2
slurm_mem: 16G

# MPI + CUDA: two Slurm tasks launched through srun, one GPU per task.
- label: ":computer: test restart GPU MPI"
command: >
srun julia --color=yes --project=examples test/restart.jl
env:
CLIMACOMMS_CONTEXT: "MPI"
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus_per_task: 1
slurm_ntasks: 2
slurm_mem: 16G


- group: "MPI Examples"
steps:

Expand Down Expand Up @@ -447,17 +485,6 @@ steps:
#retry:
# automatic: true

- label: ":computer: MPI GPU test restart"
command: >
srun julia --color=yes --project=examples test/restart.jl
env:
CLIMACOMMS_CONTEXT: "MPI"
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus_per_task: 1
slurm_ntasks: 2
slurm_mem: 16GB

- label: ":computer: MPI no lim aquaplanet (ρe) equilmoist clearsky radiation"
command: >
srun julia --color=yes --project=examples examples/hybrid/driver.jl
Expand Down
4 changes: 2 additions & 2 deletions examples/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -336,9 +336,9 @@ weakdeps = ["CUDA", "MPI"]

[[deps.ClimaCore]]
deps = ["Adapt", "BandedMatrices", "BlockArrays", "ClimaComms", "CubedSphere", "DataStructures", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LinearAlgebra", "MultiBroadcastFusion", "NVTX", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "StaticArrays", "Statistics", "Unrolled"]
git-tree-sha1 = "806e8490ff1aa664ca579544d798f8addfa1b07d"
git-tree-sha1 = "527b11c35f00db0064b77a25fc881f2a2982abda"
uuid = "d414da3d-4745-48bb-8d80-42e94e092884"
version = "0.14.15"
version = "0.14.16"
weakdeps = ["CUDA", "Krylov"]

[deps.ClimaCore.extensions]
Expand Down
85 changes: 41 additions & 44 deletions src/solver/type_getters.jl
Original file line number Diff line number Diff line change
Expand Up @@ -301,9 +301,7 @@ function get_state_restart(config::AtmosConfig, restart_file, atmos_model_hash)
atmos_model_hash_in_restart =
InputOutput.HDF5.read_attribute(reader.file, "atmos_model_hash")
if atmos_model_hash_in_restart != atmos_model_hash
error(
"Restart file $(restart_file) was constructed with a different AtmosModel",
)
@warn "Restart file $(restart_file) was constructed with a different AtmosModel, no consistency check was performed"
end
return (Y, t_start)
end
Expand Down Expand Up @@ -484,6 +482,42 @@ thermo_state_type(::EquilMoistModel, ::Type{FT}) where {FT} = TD.PhaseEquil{FT}
# Thermodynamic state type used with a `NonEquilMoistModel`, parameterized by
# the floating-point type `FT`.
function thermo_state_type(::NonEquilMoistModel, ::Type{FT}) where {FT}
    return TD.PhaseNonEquil{FT}
end

"""
    get_restart_file(config::AtmosConfig, base_output_dir)

Return the path of the file the simulation should restart from, or `nothing`
when there is none.

When `config.parsed_args["detect_restart_file"]` is `true` and
`base_output_dir` is an existing directory, look inside `base_output_dir` for
folders named `output_XXXX` (four digits), pick the one with the highest
counter, and select among its `day<N>.<word>.hdf5` files the last one returned
by `CA.sort_files_by_time`. If that folder contains no such file, nothing is
detected (older folders are not searched).

An explicitly passed `parsed_args["restart_file"]` always takes precedence over
whatever was detected.

Throws an error if automatic detection is requested with an
`output_dir_style` other than `ActiveLink` (case-insensitive).
"""
function get_restart_file(config::AtmosConfig, base_output_dir)
    restart_file = nothing
    (; parsed_args) = config

    if parsed_args["detect_restart_file"] && isdir(base_output_dir)
        lowercase(parsed_args["output_dir_style"]) == "activelink" ||
            error("detect_restart_file works only with ActiveLink")
        # Output folders look like ABC/DEF/output_1234. The pattern is anchored
        # so that look-alikes such as `output_0001_backup` or `output_0003.tar`
        # (which would sort after the real folders) are never picked up.
        name_rx = r"^output_\d\d\d\d$"
        restart_file_rx = r"day\d+\.\w+\.hdf5"

        # Keep only actual directories: `readdir` below would throw on a
        # regular file that happens to carry a matching name.
        existing_outputs = filter(readdir(base_output_dir)) do name
            occursin(name_rx, name) && isdir(joinpath(base_output_dir, name))
        end

        if !isempty(existing_outputs)
            # Counters are zero-padded to a fixed width, so the lexicographic
            # maximum is also the numerically latest folder.
            latest_output = maximum(existing_outputs)
            previous_folder = joinpath(base_output_dir, latest_output)
            possible_restart_files = filter(
                f -> occursin(restart_file_rx, f),
                readdir(previous_folder),
            )
            if !isempty(possible_restart_files)
                restart_file_name =
                    last(CA.sort_files_by_time(possible_restart_files))
                restart_file = joinpath(previous_folder, restart_file_name)
            end
        end
    end

    # If a restart file was passed, override what we detected automatically
    if !isnothing(parsed_args["restart_file"])
        restart_file = parsed_args["restart_file"]
    end

    return restart_file
end

function get_sim_info(config::AtmosConfig)
(; parsed_args) = config
FT = eltype(config)
Expand All @@ -503,53 +537,16 @@ function get_sim_info(config::AtmosConfig)
haskey(allowed_dir_styles, lowercase(requested_style)) ||
error("output_dir_style $(requested_style) not available")

# We look for a restart before creating a new output dir because we want to
# look for previous folders
restart_file = get_restart_file(config, base_output_dir)

output_dir = OutputPathGenerator.generate_output_path(
base_output_dir;
context = config.comms_ctx,
style = allowed_dir_styles[lowercase(requested_style)],
)

restart_file = nothing

if parsed_args["detect_restart_file"]
lowercase(parsed_args["output_dir_style"]) == "activelink" ||
error("detect_restart_file works only with ActiveLink")
# output_dir will be something like ......./output_1234
name_rx = r"output_(\d\d\d\d)"
restart_file_rx = r"day\d+\.\w+\.hdf5"
counter_match = match(name_rx, output_dir)
if !isnothing(counter_match)
counter = parse(Int, counter_match[1])
if counter != 0
# As implemented in ClimaUtilities.OutputPathGenerator
previous_counter = counter - 1
previous_counter_str = lpad(previous_counter, 4, "0")
counter_str = lpad(counter, 4, "0")

previous_folder =
replace(output_dir, counter_str => previous_counter_str)

isdir(previous_folder) ||
error("Could not find a folder in $(previous_folder)")

possible_restart_files = filter(
f -> occursin(restart_file_rx, f),
readdir(previous_folder),
)
if !isempty(possible_restart_files)
restart_file_name =
last(CA.sort_files_by_time(possible_restart_files))
restart_file = joinpath(previous_folder, restart_file_name)
end
end
end
end

# If a restart file was passed, override what we detected automatically
if !isnothing(parsed_args["restart_file"])
restart_file = parsed_args["restart_file"]
end

isnothing(restart_file) ||
@info "Restarting simulation from file $restart_file"

Expand Down
Loading

0 comments on commit e154aa9

Please sign in to comment.