Merge pull request #10 from QuantumPioneer/feat/mulliken
Add Parsing of Mulliken Charges + Next Alpha Release
oscarwumit authored Apr 22, 2024
2 parents 943436e + 8ca3611 commit 27f38ce
Showing 5 changed files with 102 additions and 10 deletions.
47 changes: 39 additions & 8 deletions README.md
Install with `pip install fastlogfileparser` or `conda` (forthcoming!).
4. Retrieves values at every step, not just convergence

## Usage
The best way to see how `fastlogfileparser` works is to check out the [tests](./test/gaussian_test.py)!
They show the syntax for importing, calling, and then accessing the values.
A brief summary of overall workflow and usage is provided below.

### Design
There is a single function `fast_{software}_logfile_parser` inside `fastlogfileparser.{software}` (where `{software}` is the name of the corresponding package like `gaussian` or `orca`) which reads log files and returns the result as a [namedtuple](https://docs.python.org/3/library/collections.html#collections.namedtuple) (which prevents accidentally changing the values and allows using `.` syntax to access them).

### Usage Example

```python
from fastlogfileparser.gaussian import fast_gaussian_logfile_parser as fglp
print(job_1.frequency_modes)
print(job_1._fields)

# can also be accessed via
from fastlogfileparser.gaussian import ALL_FIELDS
```

`FastLogfileParser` is fastest when you ask it to retrieve only the fields you want, e.g.:
```python
job_1, job_2, job_3 = fglp(FNAME, get=("gibbs", "scf"))
```
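Because the return type is a namedtuple, results are read-only; a minimal standalone sketch (the field names mirror the parser's, but the values here are invented) shows the access pattern without needing a log file:

```python
from collections import namedtuple

# stand-in for the parser's actual return type, which has many more fields
GaussianResult = namedtuple("GaussianResult", ["gibbs", "scf", "normal_termination"])
job_1 = GaussianResult(gibbs=-154.7, scf=[-154.1, -154.6, -154.7], normal_termination=True)

print(job_1.gibbs)    # dot-syntax access
print(job_1._fields)  # every available field name

# namedtuples are immutable, so values cannot be changed accidentally
try:
    job_1.gibbs = 0.0
except AttributeError:
    print("fields are read-only")
```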

### Retrieved Values

#### Gaussian

| Quantity | Key | Type | Frequency |
| -------- | --- | ---- | --------- |
| Route Section | `route_section` | str | 1/job |
| Normal Termination | `normal_termination` | boolean | 1/job |
| Error | `error_string` | str | 1/job |
| Maximum Allowed Steps | `max_steps` | int | 1/job |
| CPU Time | `cpu_time` | float | 1/job |
| Wall Time | `wall_time` | float | 1/job |
| Gibbs free energy at 298K | `gibbs` | float | 1/job |
| Gibbs free energy at 0K | `e0_zpe` | float | 1/job |
| Enthalpy at 298K | `e0_h` | float | 1/job |
| E0 $^1$ | `E0` | float | 1/job |
| HF $^1$ | `hf` | float | 1/job |
| Per-atom Zero Point Energy | `zpe_per_atom` | float | 1/job |
| Wavefunction Energy $^3$ | `wavefunction_energy` | float | 1/job |
| SCF Energy | `scf` | list[float] | 1/job |
| Vibrational Frequencies | `frequencies` | list[float] | 1/job |
| Frequency Modes | `frequency_modes` | list[list[float]] | 1/job |
| Standardized xyz coords | `std_xyz` | list[list[float]] | 1/step/job |
| Input xyz coords | `xyz` | list[list[float]] | 1/step/job |
| Standardized forces | `std_forces` | list[list[float]] | 1/step/job |
| Mulliken Charges (Summed into Heavy) | `mulliken_charges_summed` | list[list[float]] | 2/job |
| Charge and Multiplicity | `charge_and_multiplicity` | list[int] | 1/job |
| Number of Atoms $^2$ | `number_of_atoms` | int | 1/job |
| Number of Optimization Steps $^2$ | `number_of_optimization_steps` | int | 1/job |

$^1$ equals E0 only for non-wavefunction methods <br>
$^2$ requires `std_xyz` to be parsed to find these values <br>
$^3$ E0 for wavefunction methods <br>

#### Orca

| Quantity | Key | Type | Frequency |
| -------- | --- | ---- | --------- |
| Route Section | `route_section` | str | 1/job |
| Total Run Time $^1$ | `run_time` | float | 1/job |
| Charge and Multiplicity | `charge_and_multiplicity` | list[int] | 1/job |
| Final Single Point Energy | `energy` | float | 1/job |
| Input xyz coords | `input_coordinates` | list[list[float]] | 1/job |

$^1$ ignores milliseconds <br>

## How much fast-ly-er?
`FastLogfileParser` uses REGEX and only REGEX to retrieve data from logfiles, spending as much time in Python's excellent C-based REGEX library as possible.
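That design choice can be sketched with a toy comparison; the `SCF Done` line below imitates Gaussian output, but the energy value and the timings are invented for illustration:

```python
import re
import timeit

# toy "logfile": many lines, a few of which carry the value we want
log = ("junk line\n" * 50 + " SCF Done:  E(RB3LYP) =  -154.7 A.U.\n") * 200

pattern = re.compile(r" SCF Done:\s+E\(\S+\) =\s+(-?\d+\.\d+)")

def with_regex():
    return [float(m) for m in pattern.findall(log)]

def line_by_line():
    out = []
    for line in log.split("\n"):
        if line.startswith(" SCF Done"):
            out.append(float(line.split("=")[1].split()[0]))
    return out

assert with_regex() == line_by_line()  # same answer...
# ...but the regex version spends its time in the C-based regex engine,
# not in a Python-level loop
print(timeit.timeit(with_regex, number=20))
print(timeit.timeit(line_by_line, number=20))
```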
13 changes: 13 additions & 0 deletions fastlogfileparser/gaussian/utils/postprocessing.py
_unix_time_to_seconds,
)


def _mulliken(in_list):
    """Convert each raw Mulliken block into a list of [atom index, charge] pairs."""
    out = []
    for block in in_list:
        inner_out = []
        for row in block.split(sep="\n"):
            # each row holds: atom index, element symbol, charge, spin density
            atom_idx, _, mulliken_charge, _ = row.split()
            inner_out.append([int(atom_idx), float(mulliken_charge)])
        out.append(inner_out)
    return out


POSTPROCESSING_FUNCTIONS = {
"cpu_time": _unix_time_to_seconds,
"wall_time": _unix_time_to_seconds,
"xyz": _columns_to_floats,
"route_section": lambda in_list: in_list[0],
"charge_and_multiplicity": _charge_and_multiplicity,
"mulliken_charges_summed": _mulliken,
}
8 changes: 7 additions & 1 deletion fastlogfileparser/gaussian/utils/regexes.py
r"([\s+\d+\s+\d\s+-?\d\.\d\d\s+-?\d\.\d\d\s+-?\d\.\d\d\s+-?\d\.\d\d\s+-?\d\.\d\d\s+-?\d\.\d\d\s+-?\d\.\d\d\s+-?\d\.\d\d\s+-?\d\.\d\d]+)\n"
r"(?:\s+\d+\s+\d+\s+\d+)?\n"
),
"mulliken_charges_summed": (
r" Mulliken charges and spin densities with hydrogens summed into heavy atoms:\n"
r" 1 2\n"
r"((?:\s+\d+\s+[a-zA-Z]{1,3}\s+-?\d+\.\d+\s+-?\d+\.\d+)+)\n"
r" APT charges:"
),
"charge_and_multiplicity": r" Charge = {1,2}(-?\d) Multiplicity = (\d)",
}

RETRIEVAL_PATTERNS = {**DATA, **METADATA}

# other options:
# homo-lumo gap, polarizability, dipole moment, APT partial charges, occupancy


COMPILED_PATTERNS = {pattern_name: re.compile(pattern) for (pattern_name, pattern) in RETRIEVAL_PATTERNS.items()}
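As a sanity check, the `charge_and_multiplicity` pattern can be exercised against hand-written lines in the shape Gaussian emits (the exact spacing in these samples is an assumption):

```python
import re

pattern = re.compile(r" Charge = {1,2}(-?\d) Multiplicity = (\d)")

# ` {1,2}` absorbs the extra pad space printed before non-negative charges
for line in (" Charge =  0 Multiplicity = 2", " Charge = -1 Multiplicity = 1"):
    charge, multiplicity = (int(g) for g in pattern.search(line).groups())
    print(charge, multiplicity)
```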
2 changes: 1 addition & 1 deletion pyproject.toml
build-backend = "setuptools.build_meta"

[project]
name = "fastlogfileparser"
version = "1.0.0a7"
authors = [
{ name = "Jackson Burns" },
]
42 changes: 42 additions & 0 deletions test/gaussian_test.py
def test_duplicated_frequencies_handling():
assert result.frequencies == [355.51, 1029.1913, 1349.894, 1491.2134, 3851.6427, 3853.9729]


@pytest.mark.dependency(**pytest_dep_args)
def test_mulliken_charges():
"""
Mulliken charges summed into heavy atoms.
"""

file = os.path.join(os.path.dirname(__file__), "data", "rxn_233.log")
result, _, _ = fast_gaussian_logfile_parser(file)
assert result.mulliken_charges_summed == [
[
[2, -0.022831],
[3, 0.023347],
[13, 0.264456],
[14, 0.279877],
[15, 0.244798],
[16, 0.135469],
[17, 0.178793],
[18, 0.254435],
[19, -0.293571],
[20, -0.302496],
[21, -0.278843],
[22, -0.245202],
[23, -0.238233],
],
[
[2, -0.050449],
[3, 0.010743],
[13, 0.283808],
[14, 0.299583],
[15, 0.220744],
[16, 0.157101],
[17, 0.138796],
[18, 0.243924],
[19, -0.225737],
[20, -0.287798],
[21, -0.292178],
[22, -0.25534],
[23, -0.243197],
],
]


@pytest.mark.dependency(**pytest_dep_args)
def test_fast_gaussian_logfile_parser():
"""
