diff --git a/src/solver/solve.jl b/src/solver/solve.jl index 6fd13e9bec..ea84ce9cf7 100644 --- a/src/solver/solve.jl +++ b/src/solver/solve.jl @@ -68,8 +68,13 @@ function solve_atmos!(simulation) return AtmosSolveResults(sol, :success, walltime) end catch ret_code - CA.save_restart_func(integrator, simulation.output_dir) - CA.save_to_disk_func(integrator, simulation.output_dir) + if !CA.is_distributed(comms_ctx) + # We can only save when not distributed because we don't have a way to sync the + # MPI processes (maybe just one MPI rank crashes, leading to a hanging + # simulation) + CA.save_restart_func(integrator, simulation.output_dir) + CA.save_to_disk_func(integrator, simulation.output_dir) + end @error "ClimaAtmos simulation crashed. Stacktrace for failed simulation" exception = (ret_code, catch_backtrace()) return AtmosSolveResults(nothing, :simulation_crashed, nothing)