Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Error handling on failed writes #904

Draft
wants to merge 10 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion iowrite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1557,6 +1557,7 @@ bool writeRestart(
fname.width(7);
fname.fill('0');
fname << fileIndex << "." << currentDate << ".vlsv";
P::lastRestart = fname.str();

phiprof::Timer openTimer {"open"};
//Open the file with vlsvWriter:
Expand Down Expand Up @@ -1588,7 +1589,9 @@ bool writeRestart(
MPI_Info_set(MPIinfo, factor, stripeChar);
}

if( vlsvWriter.open( fname.str(), MPI_COMM_WORLD, masterProcessId, MPIinfo ) == false) return false;
if (vlsvWriter.open( fname.str(), MPI_COMM_WORLD, masterProcessId, MPIinfo ) == false) {
return false;
}

if( MPIinfo != MPI_INFO_NULL ) {
MPI_Info_free(&MPIinfo);
Expand Down
2 changes: 2 additions & 0 deletions parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,8 @@ std::array<FsGridTools::Task_t,3> P::overrideReadFsGridDecomposition = {0,0,0};
// NOTE(review): unlike the neighbouring definitions this one has no P:: qualifier, so it
// defines a namespace-scope variable rather than a Parameters static member — confirm intended.
std::string tracerString; /*!< Fieldline tracer to use for coupling ionosphere and magnetosphere */
bool P::computeCurvature; /*!< Boolean flag, if true the curvature of magnetic field is computed. */

/*! Name of the restart file most recently targeted by writeRestart().
 * Starts out empty; main() clears it again after removing a restart file whose write failed.
 */
std::string P::lastRestart;

bool P::addParameters() {
typedef Readparameters RP;
// the other default parameters we read through the add/get interface
Expand Down
2 changes: 2 additions & 0 deletions parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ struct Parameters {

static bool computeCurvature; /*!< Boolean flag, if true the curvature of magnetic field is computed. */

static std::string lastRestart; /*!< Name of the last restart file written; empty if none has been written yet or after a failed restart file has been removed. */

/*! \brief Add the global parameters.
*
* This function adds all the parameters that are loaded at a global level.
Expand Down
15 changes: 12 additions & 3 deletions vlasiator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,8 @@ int main(int argn,char* args[]) {
writeGhosts
) == false
) {
cerr << "FAILED TO WRITE GRID AT " << __FILE__ << " " << __LINE__ << endl;
// TODO make this std::format when we get C++20
abort_mpi(std::string(__FILE__) + ":" + std::to_string(__LINE__) + ": FAILED TO WRITE GRID", 1);
}

phiprof::stop("Initialization");
Expand Down Expand Up @@ -729,7 +730,8 @@ int main(int argn,char* args[]) {
writeGhosts
) == false
) {
cerr << "FAILED TO WRITE GRID AT " << __FILE__ << " " << __LINE__ << endl;
// TODO make this std::format when we get C++20
abort_mpi(std::string(__FILE__) + ":" + std::to_string(__LINE__) + ": FAILED TO WRITE GRID", 1);
}

P::systemWriteDistributionWriteStride.pop_back();
Expand Down Expand Up @@ -946,7 +948,8 @@ int main(int argn,char* args[]) {
writeGhosts
) == false
) {
cerr << "FAILED TO WRITE GRID AT" << __FILE__ << " " << __LINE__ << endl;
// TODO make this std::format when we get C++20
abort_mpi(std::string(__FILE__) + ":" + std::to_string(__LINE__) + ": FAILED TO WRITE GRID", 1);
}
P::systemWrites[i]++;
// Special case for large timesteps
Expand Down Expand Up @@ -1029,6 +1032,12 @@ int main(int argn,char* args[]) {
version,
config,
outputReducer,"restart",(uint)P::t,P::restartStripeFactor) == false ) {
// If restart write fails, remove the malformed file and hope someone clears space soon
MPI_Barrier(MPI_COMM_WORLD);
if(!P::lastRestart.empty()) {
std::remove(P::lastRestart.c_str());
P::lastRestart.clear();
}
logFile << "(IO): ERROR Failed to write restart!" << endl << writeVerbose;
cerr << "FAILED TO WRITE RESTART" << endl;
}
Expand Down
Loading