From 4f728c49fbce2396b9db8d69cb8524ba0be19b27 Mon Sep 17 00:00:00 2001 From: "Eric T. Johnson" Date: Sun, 3 Dec 2023 09:01:42 -0800 Subject: [PATCH] Add a Python parser for diag.out files and fix header formatting (#2666) This PR adds some Python code that can parse the files and sanitize any duplicate entries (which often occurs when a run is restarted from an earlier checkpoint). It also fixes some mistakes in the gravity_diag.out header formatting, and cleans up some of the other header-writing code. N.B.: Standard streams in C++ are right-justified by default (compared to Python, where numbers are right-justified and strings are left-justified). * update the header printing code for gravity_diag.out to match the others (the column numbers were 26 characters long instead of 25, and number 8 was missing) * simplify species formatting for species_diag.out (the previous code was just adding leading spaces) * update the header field widths for amr_diag.out to match what's used for the data --- Source/driver/sum_integrated_quantities.cpp | 61 ++++++------- Util/scripts/diag_parser.py | 99 +++++++++++++++++++++ 2 files changed, 125 insertions(+), 35 deletions(-) create mode 100644 Util/scripts/diag_parser.py diff --git a/Source/driver/sum_integrated_quantities.cpp b/Source/driver/sum_integrated_quantities.cpp index 0542c4e7b9..6dc1be4004 100644 --- a/Source/driver/sum_integrated_quantities.cpp +++ b/Source/driver/sum_integrated_quantities.cpp @@ -359,7 +359,7 @@ Castro::sum_integrated_quantities () header << std::endl; data_log1 << std::setw(intwidth) << "# COLUMN 1"; - data_log1 << std::setw(fixwidth) << " 2"; + data_log1 << std::setw(fixwidth) << 2; for (int icol = 3; icol <= n; ++icol) { data_log1 << std::setw(datwidth) << icol; @@ -482,28 +482,29 @@ Castro::sum_integrated_quantities () if (time == 0.0) { - log << std::setw(intwidth) << "# COLUMN 1"; - log << std::setw(fixwidth) << " 2"; - log << std::setw(fixwidth) << " 3"; - log << std::setw(fixwidth) << " 4"; - 
log << std::setw(fixwidth) << " 5"; - log << std::setw(fixwidth) << " 6"; - log << std::setw(fixwidth) << " 7"; + int n = 0; std::ostringstream header; - header << std::setw(intwidth) << "# TIMESTEP"; - header << std::setw(fixwidth) << " TIME"; + header << std::setw(intwidth) << "# TIMESTEP"; ++n; + header << std::setw(fixwidth) << " TIME"; ++n; - header << std::setw(datwidth) << " h_+ (x)"; - header << std::setw(datwidth) << " h_x (x)"; - header << std::setw(datwidth) << " h_+ (y)"; - header << std::setw(datwidth) << " h_x (y)"; - header << std::setw(datwidth) << " h_+ (z)"; - header << std::setw(datwidth) << " h_x (z)"; + header << std::setw(datwidth) << " h_+ (x)"; ++n; + header << std::setw(datwidth) << " h_x (x)"; ++n; + header << std::setw(datwidth) << " h_+ (y)"; ++n; + header << std::setw(datwidth) << " h_x (y)"; ++n; + header << std::setw(datwidth) << " h_+ (z)"; ++n; + header << std::setw(datwidth) << " h_x (z)"; ++n; header << std::endl; + log << std::setw(intwidth) << "# COLUMN 1"; + log << std::setw(fixwidth) << 2; + + for (int i = 3; i <= n; ++i) { + log << std::setw(datwidth) << i; + } + log << std::endl; log << header.str(); @@ -589,27 +590,17 @@ Castro::sum_integrated_quantities () std::ostringstream header; - header << std::setw(intwidth) << "# TIMESTEP"; ++n; - header << std::setw(fixwidth) << " TIME"; ++n; - - // We need to be careful here since the species names have differing numbers of characters + header << std::setw(intwidth) << "# TIMESTEP"; ++n; + header << std::setw(fixwidth) << " TIME"; ++n; for (int i = 0; i < NumSpec; i++) { - std::string outString{}; - std::string massString{"Mass "}; - std::string specString{species_names[i]}; - while (static_cast(outString.length() + specString.length() + massString.length()) < datwidth) { - outString += " "; - } - outString += massString; - outString += specString; - header << std::setw(datwidth) << outString; ++n; + header << std::setw(datwidth) << ("Mass " + species_names[i]); ++n; } header << 
std::endl; log << std::setw(intwidth) << "# COLUMN 1"; - log << std::setw(fixwidth) << " 2"; + log << std::setw(fixwidth) << 2; for (int i = 3; i <= n; ++i) { log << std::setw(datwidth) << i; @@ -694,19 +685,19 @@ Castro::sum_integrated_quantities () header << std::setw(fixwidth) << " DT"; ++n; header << std::setw(intwidth) << " FINEST LEV"; ++n; header << std::setw(fixwidth) << " MAX NUMBER OF SUBCYCLES"; ++n; - header << std::setw(fixwidth) << " COARSE TIMESTEP WALLTIME"; ++n; + header << std::setw(datwidth) << " COARSE TIMESTEP WALLTIME"; ++n; #ifdef AMREX_USE_GPU - header << std::setw(fixwidth) << " MAXIMUM GPU MEMORY USED"; ++n; - header << std::setw(fixwidth) << " MINIMUM GPU MEMORY FREE"; ++n; + header << std::setw(datwidth) << " MAXIMUM GPU MEMORY USED"; ++n; + header << std::setw(datwidth) << " MINIMUM GPU MEMORY FREE"; ++n; #endif header << std::endl; log << std::setw(intwidth) << "# COLUMN 1"; - log << std::setw(fixwidth) << " 2"; + log << std::setw(fixwidth) << 2; for (int i = 3; i < 4; ++i) { - log << std::setw(datwidth) << i; + log << std::setw(fixwidth) << i; } log << std::setw(intwidth) << 4; // Handle the finest lev column diff --git a/Util/scripts/diag_parser.py b/Util/scripts/diag_parser.py new file mode 100644 index 0000000000..aef233cc44 --- /dev/null +++ b/Util/scripts/diag_parser.py @@ -0,0 +1,99 @@ +"""Helper functions for working with Castro diagnostic files (*_diag.out) + +To use these in a standalone script, you can do one of the following: + +* append $CASTRO_HOME/Util/scripts to sys.path at the top of your script: + sys.path.append("/Util/scripts") + +* add a symlink to this file in the same directory as your script: + $ ln -s "$CASTRO_HOME/Util/scripts/diag_parser.py" . + +* copy this file into the same directory as your script + +Then you can do `from diag_parser import deduplicate, read_diag_file`. 
from pathlib import Path

import numpy as np

# Format notes
# ------------
# The files are opened in Castro.cpp; the data is written in
# sum_integrated_quantities.cpp.
#
# data_logs[0]: grid_diag.out
#     intwidth, fixwidth, datwidth*
#
# data_logs[1]: gravity_diag.out
#     - this was previously missing the last column number (8), which we
#       handle for backwards compatibility
#     intwidth, fixwidth, datwidth, datwidth, datwidth, datwidth, datwidth,
#     datwidth
#
# data_logs[2]: species_diag.out
#     intwidth, fixwidth, datwidth*
#
# data_logs[3]: amr_diag.out
#     - if compiled with GPU support, this will have two additional integer
#       fields at the end with size `datwidth` for the GPU memory usage
#     - column 5 (max number of subcycles) is an integer
#     intwidth, fixwidth, fixwidth, intwidth, fixwidth, datwidth

datwidth = 25  # Floating point data in scientific notation
fixwidth = 25  # Floating point data not in scientific notation
intwidth = 12  # Integer data

# Leading column widths for each supported file.  Any additional columns after
# these are assumed to be floating point values in scientific notation
# (amr_diag.out gets special handling in read_diag_file).
FIELD_WIDTHS = {
    "grid_diag.out": [intwidth, fixwidth],
    "gravity_diag.out": [intwidth, fixwidth] + [datwidth] * 6,
    "species_diag.out": [intwidth, fixwidth],
    "amr_diag.out": [intwidth, fixwidth, fixwidth, intwidth, fixwidth, datwidth],
}


def read_diag_file(file_path):
    """Reads a Castro diagnostic file into a numpy structured array.

    Currently only supports the default files that Castro generates; the file
    type is selected by the file's base name, which must be a key of
    FIELD_WIDTHS.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Location of the ``*_diag.out`` file.

    Returns
    -------
    numpy.ndarray
        The result of ``np.genfromtxt`` with one field per column, named after
        the second header line of the file.

    Raises
    ------
    ValueError
        If the base name of `file_path` is not a key of FIELD_WIDTHS.
    """
    file_path = Path(file_path)
    filetype = file_path.name
    if filetype not in FIELD_WIDTHS:
        raise ValueError("Unsupported file name")
    # Copy the template: the list is padded below, and padding the shared
    # FIELD_WIDTHS entry in place would leak this file's column count into
    # every later call for the same file type.
    widths = list(FIELD_WIDTHS[filetype])
    with open(file_path, "r") as f:
        # try getting the number of columns from the first line
        first_line = f.readline().rstrip("\n")
        if filetype == "gravity_diag.out":
            # older gravity_diag.out files are missing the last column
            # number, but the file fortunately has a fixed number of columns
            num_columns = 8
        else:
            num_columns = int(first_line.split()[-1])
        # pad out the widths list on the right if necessary (a non-positive
        # count yields an empty list, so this never shrinks the list)
        widths.extend([datwidth] * (num_columns - len(widths)))
        # infer datatypes from the widths
        dtypes = [int if w == intwidth else float for w in widths]
        # amr_diag.out has several integer columns with long names
        if filetype == "amr_diag.out":
            dtypes[4] = int  # max number of subcycles
            if num_columns >= 8:
                dtypes[6] = int  # maximum gpu memory used
                dtypes[7] = int  # minimum gpu memory free
        # already read the first header line, so we don't need to skip any
        # rows: the next line holds the column names
        data = np.genfromtxt(
            f, delimiter=widths, comments="#", dtype=dtypes, names=True
        )
    return data


def deduplicate(data):
    """Deduplicate based on the timestep, keeping only the last occurrence.

    Parameters
    ----------
    data : numpy.ndarray
        Structured array with a "TIMESTEP" field, e.g. the return value of
        read_diag_file.

    Returns
    -------
    numpy.ndarray
        The selected rows of `data`, ordered by increasing timestep.
    """
    # get the unique indices into the reversed timestep array, so we find the
    # final occurrence of each timestep
    _, rev_indices = np.unique(data["TIMESTEP"][::-1], return_index=True)
    # np.unique() sorts by value, so we don't need to un-reverse rev_indices;
    # we only map each reversed position back onto the original ordering
    unique_indices = data.shape[0] - rev_indices - 1
    return data[unique_indices]