From 4f728c49fbce2396b9db8d69cb8524ba0be19b27 Mon Sep 17 00:00:00 2001
From: "Eric T. Johnson" <yut23@users.noreply.github.com>
Date: Sun, 3 Dec 2023 09:01:42 -0800
Subject: [PATCH] Add a Python parser for diag.out files and fix header
 formatting (#2666)

This PR adds some Python code that can parse the files and sanitize any
duplicate entries (which often occur when a run is restarted from an earlier
checkpoint). It also fixes some mistakes in the gravity_diag.out header
formatting, and cleans up some of the other header-writing code.

N.B.: Standard streams in C++ are right-justified by default (compared to
Python, where numbers are right-justified and strings are left-justified).

* update the header printing code for gravity_diag.out to match the others
   (the column numbers were 26 characters long instead of 25, and number 8
   was missing)

* simplify species formatting for species_diag.out (the previous code was just
   adding leading spaces)

* update the header field widths for amr_diag.out to match what's used for
   the data
---
 Source/driver/sum_integrated_quantities.cpp | 61 ++++++-------
 Util/scripts/diag_parser.py                 | 99 +++++++++++++++++++++
 2 files changed, 125 insertions(+), 35 deletions(-)
 create mode 100644 Util/scripts/diag_parser.py

diff --git a/Source/driver/sum_integrated_quantities.cpp b/Source/driver/sum_integrated_quantities.cpp
index 0542c4e7b9..6dc1be4004 100644
--- a/Source/driver/sum_integrated_quantities.cpp
+++ b/Source/driver/sum_integrated_quantities.cpp
@@ -359,7 +359,7 @@ Castro::sum_integrated_quantities ()
                    header << std::endl;
 
                    data_log1 << std::setw(intwidth) << "#   COLUMN 1";
-                   data_log1 << std::setw(fixwidth) << "                        2";
+                   data_log1 << std::setw(fixwidth) << 2;
 
                    for (int icol = 3; icol <= n; ++icol) {
                        data_log1 << std::setw(datwidth) << icol;
@@ -482,28 +482,29 @@ Castro::sum_integrated_quantities ()
 
             if (time == 0.0) {
 
-                log << std::setw(intwidth) << "#   COLUMN 1";
-                log << std::setw(fixwidth) << "                         2";
-                log << std::setw(fixwidth) << "                         3";
-                log << std::setw(fixwidth) << "                         4";
-                log << std::setw(fixwidth) << "                         5";
-                log << std::setw(fixwidth) << "                         6";
-                log << std::setw(fixwidth) << "                         7";
+                int n = 0;
 
                 std::ostringstream header;
 
-                header << std::setw(intwidth) << "#   TIMESTEP";
-                header << std::setw(fixwidth) << "                     TIME";
+                header << std::setw(intwidth) << "#   TIMESTEP";              ++n;
+                header << std::setw(fixwidth) << "                     TIME"; ++n;
 
-                header << std::setw(datwidth) << "             h_+ (x)";
-                header << std::setw(datwidth) << "             h_x (x)";
-                header << std::setw(datwidth) << "             h_+ (y)";
-                header << std::setw(datwidth) << "             h_x (y)";
-                header << std::setw(datwidth) << "             h_+ (z)";
-                header << std::setw(datwidth) << "             h_x (z)";
+                header << std::setw(datwidth) << "                  h_+ (x)"; ++n;
+                header << std::setw(datwidth) << "                  h_x (x)"; ++n;
+                header << std::setw(datwidth) << "                  h_+ (y)"; ++n;
+                header << std::setw(datwidth) << "                  h_x (y)"; ++n;
+                header << std::setw(datwidth) << "                  h_+ (z)"; ++n;
+                header << std::setw(datwidth) << "                  h_x (z)"; ++n;
 
                 header << std::endl;
 
+                log << std::setw(intwidth) << "#   COLUMN 1";
+                log << std::setw(fixwidth) << 2;
+
+                for (int i = 3; i <= n; ++i) {
+                    log << std::setw(datwidth) << i;
+                }
+
                 log << std::endl;
 
                 log << header.str();
@@ -589,27 +590,17 @@ Castro::sum_integrated_quantities ()
 
                 std::ostringstream header;
 
-                header << std::setw(intwidth) << "#   TIMESTEP";           ++n;
-                header << std::setw(fixwidth) << "                  TIME"; ++n;
-
-                // We need to be careful here since the species names have differing numbers of characters
+                header << std::setw(intwidth) << "#   TIMESTEP";              ++n;
+                header << std::setw(fixwidth) << "                     TIME"; ++n;
 
                 for (int i = 0; i < NumSpec; i++) {
-                    std::string outString{};
-                    std::string massString{"Mass "};
-                    std::string specString{species_names[i]};
-                    while (static_cast<int>(outString.length() + specString.length() + massString.length()) < datwidth) {
-                        outString += " ";
-                    }
-                    outString += massString;
-                    outString += specString;
-                    header << std::setw(datwidth) << outString; ++n;
+                    header << std::setw(datwidth) << ("Mass " + species_names[i]); ++n;
                 }
 
                 header << std::endl;
 
                 log << std::setw(intwidth) << "#   COLUMN 1";
-                log << std::setw(fixwidth) << "                        2";
+                log << std::setw(fixwidth) << 2;
 
                 for (int i = 3; i <= n; ++i) {
                     log << std::setw(datwidth) << i;
@@ -694,19 +685,19 @@ Castro::sum_integrated_quantities ()
                 header << std::setw(fixwidth) << "                       DT"; ++n;
                 header << std::setw(intwidth) << "  FINEST LEV";              ++n;
                 header << std::setw(fixwidth) << "  MAX NUMBER OF SUBCYCLES"; ++n;
-                header << std::setw(fixwidth) << " COARSE TIMESTEP WALLTIME"; ++n;
+                header << std::setw(datwidth) << " COARSE TIMESTEP WALLTIME"; ++n;
 #ifdef AMREX_USE_GPU
-                header << std::setw(fixwidth) << "  MAXIMUM GPU MEMORY USED"; ++n;
-                header << std::setw(fixwidth) << "  MINIMUM GPU MEMORY FREE"; ++n;
+                header << std::setw(datwidth) << "  MAXIMUM GPU MEMORY USED"; ++n;
+                header << std::setw(datwidth) << "  MINIMUM GPU MEMORY FREE"; ++n;
 #endif
 
                 header << std::endl;
 
                 log << std::setw(intwidth) << "#   COLUMN 1";
-                log << std::setw(fixwidth) << "                        2";
+                log << std::setw(fixwidth) << 2;
 
                 for (int i = 3; i < 4; ++i) {
-                    log << std::setw(datwidth) << i;
+                    log << std::setw(fixwidth) << i;
                 }
 
                 log << std::setw(intwidth) << 4; // Handle the finest lev column
diff --git a/Util/scripts/diag_parser.py b/Util/scripts/diag_parser.py
new file mode 100644
index 0000000000..aef233cc44
--- /dev/null
+++ b/Util/scripts/diag_parser.py
@@ -0,0 +1,99 @@
+"""Helper functions for working with Castro diagnostic files (*_diag.out)
+
+To use these in a standalone script, you can do one of the following:
+
+* append $CASTRO_HOME/Util/scripts to sys.path at the top of your script:
+    sys.path.append("<path to Castro>/Util/scripts")
+
+* add a symlink to this file in the same directory as your script:
+    $ ln -s "$CASTRO_HOME/Util/scripts/diag_parser.py" .
+
+* copy this file into the same directory as your script
+
+Then you can do `from diag_parser import deduplicate, read_diag_file`.
+"""
+
+from pathlib import Path
+
+import numpy as np
+
+""" Format notes
+files are opened in Castro.cpp, data is written in sum_integrated_quantities.cpp
+
+data_logs[0]: grid_diag.out
+intwidth, fixwidth, datwidth*
+
+data_logs[1]: gravity_diag.out
+- this was previously missing the last column number (8), which we handle for
+  backwards compatibility
+intwidth, fixwidth, datwidth, datwidth, datwidth, datwidth, datwidth, datwidth
+
+data_logs[2]: species_diag.out
+intwidth, fixwidth, datwidth*
+
+data_logs[3]: amr_diag.out
+- if compiled with GPU support, this will have two additional integer fields at
+  the end with size `datwidth` for the GPU memory usage
+- column 5 (max number of subcycles) is an integer
+intwidth, fixwidth, fixwidth, intwidth, fixwidth, datwidth
+"""
+
+datwidth = 25  # Floating point data in scientific notation
+fixwidth = 25  # Floating point data not in scientific notation
+intwidth = 12  # Integer data
+
+# Widths of the leading columns for each supported file name. Any additional
+# columns are assumed to be scientific-notation floats (amr_diag.out is special)
+FIELD_WIDTHS = {
+    "grid_diag.out": [intwidth, fixwidth],
+    "gravity_diag.out": [intwidth, fixwidth] + [datwidth] * 6,
+    "species_diag.out": [intwidth, fixwidth],
+    "amr_diag.out": [intwidth, fixwidth, fixwidth, intwidth, fixwidth, datwidth],
+}
+
+
+def read_diag_file(file_path):
+    """Reads a Castro diagnostic file into a numpy structured array.
+
+    Currently only supports the default files that Castro generates.
+    """
+    if not isinstance(file_path, Path):
+        file_path = Path(file_path)
+    filetype = file_path.name
+    if filetype not in FIELD_WIDTHS:
+        raise ValueError("Unsupported file name")
+    widths = FIELD_WIDTHS[filetype]
+    with open(file_path, "r") as f:
+        # try getting the number of columns from the first line
+        first_line = f.readline().rstrip("\n")
+        if filetype == "gravity_diag.out":
+            # gravity_diag.out is missing the last column number, but it
+            # fortunately has a fixed number of columns
+            num_columns = 8
+        else:
+            num_columns = int(first_line.split()[-1])
+        # pad out the widths list on the right if necessary
+        widths.extend([datwidth] * (num_columns - len(widths)))
+        # infer datatypes from the widths
+        dtypes = [int if w == intwidth else float for w in widths]
+        # amr_diag.out has several integer columns with long names
+        if filetype == "amr_diag.out":
+            dtypes[4] = int  # max number of subcycles
+            if num_columns >= 8:
+                dtypes[6] = int  # maximum gpu memory used
+                dtypes[7] = int  # minimum gpu memory free
+        # already read the first header line, so we don't need to skip any rows
+        data = np.genfromtxt(
+            f, delimiter=widths, comments="#", dtype=dtypes, names=True
+        )
+    return data
+
+
+def deduplicate(data):
+    """Deduplicate based on the timestep, keeping only the last occurrence."""
+    # get the unique indices into the reversed timestep array, so we find the
+    # final occurrence of each timestep
+    _, rev_indices = np.unique(data["TIMESTEP"][::-1], return_index=True)
+    # np.unique() sorts by value, so we don't need to un-reverse rev_indices
+    unique_indices = data.shape[0] - rev_indices - 1
+    return data[unique_indices]