From 8bc011e15f7ca1459207464581e44fa98231d96b Mon Sep 17 00:00:00 2001
From: Matthew Evans <git@ml-evs.science>
Date: Thu, 24 Aug 2023 15:30:41 +0100
Subject: [PATCH] Clean up NMR utils and add tests for Bruker reader

---
 pydatalab/pydatalab/apps/nmr/utils.py | 34 +++++++++++------
 pydatalab/tests/apps/test_nmr.py      | 54 +++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 11 deletions(-)
 create mode 100644 pydatalab/tests/apps/test_nmr.py

diff --git a/pydatalab/pydatalab/apps/nmr/utils.py b/pydatalab/pydatalab/apps/nmr/utils.py
index b2288f424..89ec8e41f 100644
--- a/pydatalab/pydatalab/apps/nmr/utils.py
+++ b/pydatalab/pydatalab/apps/nmr/utils.py
@@ -1,6 +1,7 @@
 import itertools
 import os
 import re
+from pathlib import Path
 
 import matplotlib.pyplot as plt
 import nmrglue as ng
@@ -13,15 +14,27 @@
 ######################################################################################
 
 
-def read_bruker_1d(data, process_number=1, verbose=True, sample_mass_mg=None):
+def read_bruker_1d(
+    data: Path | pd.DataFrame,
+    process_number: int = 1,
+    verbose: bool = False,
+    sample_mass_mg: float | None = None,
+) -> tuple[pd.DataFrame | None, dict, str | None, tuple[int, ...]]:
     """Read a 1D bruker nmr spectrum and return it as a df.
 
-    arguments:
+    Parameters:
+        data: The directory of the full bruker data file, or a pandas DataFrame which
+            will be returned without further processing.
+        process_number: The process number of the processed data you want to plot [default: 1].
+        verbose: Whether to print information such as the spectrum title to stdout.
+        sample_mass_mg: The (optional) sample mass. If provided, the resulting DataFrame will have an "intensity_per_scan_per_gram" column.
+
+    Returns:
+        df: A pandas DataFrame containing the spectrum data, or None if the processed data has more than one dimension.
+        a_dic: A dictionary containing the acquisition parameters.
+        topspin_title: The title of the spectrum, as stored in the topspin "title" file, or None if it could not be read.
+        shape: The shape of the spectrum data array.
 
-    data: The directory of the full bruker data file. You may also supply a df as this argument. In this case, the df is returned as is.
-    process_number: The process number of the processed data you want to plot [default 1]
-    verbose: Whether to print information such as the spectrum title to stdout (default True)
-    sample_mass_mg: The (optional) sample mass. If provided, the resulting DataFrame will have a "intensity_per_scan_per_gram" column.
     """
 
     # if df is provided, just return it as-is. This functionality is provided to make functions calling read_bruker_1d flexible by default.
@@ -32,12 +45,12 @@ def read_bruker_1d(data, process_number=1, verbose=True, sample_mass_mg=None):
             print("data frame provided to read_bruker_1d(). Returning it as is.")
         return data
     else:
-        data_dir = data
+        data_dir = Path(data)
 
-    processed_data_dir = os.path.join(data_dir, "pdata", str(process_number))
+    processed_data_dir = data_dir / "pdata" / str(process_number)
 
-    a_dic, a_data = ng.fileio.bruker.read(data_dir)  # aquisition_data
-    p_dic, p_data = ng.fileio.bruker.read_pdata(processed_data_dir)  # processing data
+    a_dic, a_data = ng.fileio.bruker.read(str(data_dir))  # aquisition_data
+    p_dic, p_data = ng.fileio.bruker.read_pdata(str(processed_data_dir))  # processing data
 
     try:
         with open(os.path.join(processed_data_dir, "title"), "r") as f:
@@ -46,7 +59,6 @@ def read_bruker_1d(data, process_number=1, verbose=True, sample_mass_mg=None):
         topspin_title = None
 
     if len(p_data.shape) > 1:
-        print("data is more than one dimensional - read failed")
         return None, a_dic, topspin_title, p_data.shape
 
     nscans = a_dic["acqus"]["NS"]
diff --git a/pydatalab/tests/apps/test_nmr.py b/pydatalab/tests/apps/test_nmr.py
new file mode 100644
index 000000000..f9be2b30f
--- /dev/null
+++ b/pydatalab/tests/apps/test_nmr.py
@@ -0,0 +1,54 @@
+import zipfile
+from pathlib import Path
+
+import pytest
+
+from pydatalab.apps.nmr.utils import read_bruker_1d
+
+
+def _extract_example(filename, dir):
+    with zipfile.ZipFile(filename, "r") as zip_ref:
+        zip_ref.extractall(dir)
+    return Path(dir) / filename.stem
+
+
+@pytest.fixture(scope="function")
+def nmr_1d_solution_example(tmpdir):
+    zip_path = Path(__file__).parent.parent.parent / "example_data" / "NMR" / "1.zip"
+    return _extract_example(zip_path, tmpdir)
+
+
+@pytest.fixture(scope="function")
+def nmr_1d_solid_example(tmpdir):
+    zip_path = Path(__file__).parent.parent.parent / "example_data" / "NMR" / "71.zip"
+    return _extract_example(zip_path, tmpdir)
+
+
+@pytest.fixture(scope="function")
+def nmr_2d_matpass_example(tmpdir):
+    zip_path = Path(__file__).parent.parent.parent / "example_data" / "NMR" / "72.zip"
+    return _extract_example(zip_path, tmpdir)
+
+
+def test_bruker_reader_solution(nmr_1d_solution_example):
+    df, a_dic, topspin_title, shape = read_bruker_1d(nmr_1d_solution_example)
+    assert df is not None
+    assert a_dic
+    assert topspin_title
+    assert shape == (4096,)
+
+
+def test_bruker_reader_solid(nmr_1d_solid_example):
+    df, a_dic, topspin_title, shape = read_bruker_1d(nmr_1d_solid_example)
+    assert df is not None
+    assert a_dic
+    assert topspin_title
+    assert shape == (9984,)
+
+
+def test_bruker_reader_2D(nmr_2d_matpass_example):
+    df, a_dic, topspin_title, shape = read_bruker_1d(nmr_2d_matpass_example)
+    assert df is None
+    assert a_dic
+    assert topspin_title
+    assert shape == (8, 4096)