From 5119a327c7b5fd6e29c603ec08ff1b05e0a12f72 Mon Sep 17 00:00:00 2001
From: Franck Albinet <franckalbinet@gmail.com>
Date: Tue, 24 Sep 2024 09:57:55 +0200
Subject: [PATCH] add callback to remove nan values in data provider submitted
 dataset

---
 marisco/_modidx.py        |    6 +
 marisco/callbacks.py      |   22 +-
 nbs/api/callbacks.ipynb   |   22 +
 nbs/handlers/_ospar.ipynb | 1236 ++++++++++++++++++-------------------
 4 files changed, 655 insertions(+), 631 deletions(-)

diff --git a/marisco/_modidx.py b/marisco/_modidx.py
index 7884151..bed3ac5 100644
--- a/marisco/_modidx.py
+++ b/marisco/_modidx.py
@@ -42,6 +42,12 @@
                                                                                     'marisco/callbacks.py'),
                                    'marisco.callbacks.LowerStripNameCB._safe_transform': ( 'api/callbacks.html#lowerstripnamecb._safe_transform',
                                                                                            'marisco/callbacks.py'),
+                                   'marisco.callbacks.RemoveAllNAValuesCB': ( 'api/callbacks.html#removeallnavaluescb',
+                                                                              'marisco/callbacks.py'),
+                                   'marisco.callbacks.RemoveAllNAValuesCB.__call__': ( 'api/callbacks.html#removeallnavaluescb.__call__',
+                                                                                       'marisco/callbacks.py'),
+                                   'marisco.callbacks.RemoveAllNAValuesCB.__init__': ( 'api/callbacks.html#removeallnavaluescb.__init__',
+                                                                                       'marisco/callbacks.py'),
                                    'marisco.callbacks.ReshapeLongToWide': ('api/callbacks.html#reshapelongtowide', 'marisco/callbacks.py'),
                                    'marisco.callbacks.ReshapeLongToWide.__call__': ( 'api/callbacks.html#reshapelongtowide.__call__',
                                                                                      'marisco/callbacks.py'),
diff --git a/marisco/callbacks.py b/marisco/callbacks.py
index eba96f5..572582c 100644
--- a/marisco/callbacks.py
+++ b/marisco/callbacks.py
@@ -4,7 +4,7 @@
 
 # %% auto 0
 __all__ = ['Callback', 'run_cbs', 'Transformer', 'SanitizeLonLatCB', 'AddSampleTypeIdColumnCB', 'AddNuclideIdColumnCB',
-           'LowerStripNameCB', 'ReshapeLongToWide', 'CompareDfsAndTfmCB', 'EncodeTimeCB']
+           'LowerStripNameCB', 'RemoveAllNAValuesCB', 'ReshapeLongToWide', 'CompareDfsAndTfmCB', 'EncodeTimeCB']
 
 # %% ../nbs/api/callbacks.ipynb 2
 import copy
@@ -128,7 +128,21 @@ def __call__(self, tfm):
         for key in tfm.dfs.keys():
             tfm.dfs[key][self.col_dst] = tfm.dfs[key][self.col_src].apply(self._safe_transform)
 
-# %% ../nbs/api/callbacks.ipynb 32
+# %% ../nbs/api/callbacks.ipynb 31
+class RemoveAllNAValuesCB(Callback):
+    "Remove rows with all NA values."
+    def __init__(self, 
+                 cols_to_check:dict # A dictionary with the sample type as key and the list of column names to check as value
+                ):
+        fc.store_attr()
+
+    def __call__(self, tfm):
+        for k in tfm.dfs.keys():
+            col_to_check = self.cols_to_check[k]
+            mask = tfm.dfs[k][col_to_check].isnull().all(axis=1)
+            tfm.dfs[k] = tfm.dfs[k][~mask]
+
+# %% ../nbs/api/callbacks.ipynb 33
 class ReshapeLongToWide(Callback):
     def __init__(self, columns=['nuclide'], values=['value'], 
                  num_fill_value=-999, str_fill_value='STR FILL VALUE'):
@@ -182,7 +196,7 @@ def __call__(self, tfm):
             tfm.dfs[grp] = self.pivot(tfm.dfs[grp])
             tfm.dfs[grp].columns = self.renamed_cols(tfm.dfs[grp].columns)
 
-# %% ../nbs/api/callbacks.ipynb 34
+# %% ../nbs/api/callbacks.ipynb 35
 class CompareDfsAndTfmCB(Callback):
     def __init__(self, dfs: Dict[str, pd.DataFrame]): 
         "Create a dataframe of dropped data. Data included in the `dfs` not in the `tfm`."
@@ -219,7 +233,7 @@ def _compute_stats(self,
             'Number of rows in tfm.dfs + Number of dropped rows': len(tfm.dfs[grp].index) + len(tfm.dfs_dropped[grp].index)
         }
 
-# %% ../nbs/api/callbacks.ipynb 39
+# %% ../nbs/api/callbacks.ipynb 40
 class EncodeTimeCB(Callback):
     "Encode time as `int` representing seconds since xxx"    
     def __init__(self, cfg , verbose=False): fc.store_attr()
diff --git a/nbs/api/callbacks.ipynb b/nbs/api/callbacks.ipynb
index a89ac51..6efa888 100644
--- a/nbs/api/callbacks.ipynb
+++ b/nbs/api/callbacks.ipynb
@@ -464,6 +464,28 @@
     "## Change structure"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "701be72f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| exports\n",
+    "class RemoveAllNAValuesCB(Callback):\n",
+    "    \"Remove rows with all NA values.\"\n",
+    "    def __init__(self, \n",
+    "                 cols_to_check:dict # A dictionary with the sample type as key and the list of column names to check as value\n",
+    "                ):\n",
+    "        fc.store_attr()\n",
+    "\n",
+    "    def __call__(self, tfm):\n",
+    "        for k in tfm.dfs.keys():\n",
+    "            col_to_check = self.cols_to_check[k]\n",
+    "            mask = tfm.dfs[k][col_to_check].isnull().all(axis=1)\n",
+    "            tfm.dfs[k] = tfm.dfs[k][~mask]"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "3a32c3bc",
diff --git a/nbs/handlers/_ospar.ipynb b/nbs/handlers/_ospar.ipynb
index 6f60866..f772a76 100644
--- a/nbs/handlers/_ospar.ipynb
+++ b/nbs/handlers/_ospar.ipynb
@@ -57,7 +57,16 @@
    "execution_count": null,
    "id": "f69f5756",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
    "source": [
     "#| hide\n",
     "%load_ext autoreload\n",
@@ -82,18 +91,57 @@
     "from collections import OrderedDict, defaultdict\n",
     "import re\n",
     "\n",
-    "from marisco.utils import (has_valid_varname, match_worms, Remapper, ddmm_to_dd,\n",
-    "                           match_maris_lut, Match, get_unique_across_dfs)\n",
-    "from marisco.callbacks import (Callback, Transformer, EncodeTimeCB, AddSampleTypeIdColumnCB,\n",
-    "                               AddNuclideIdColumnCB, LowerStripNameCB, SanitizeLonLatCB, \n",
-    "                               ReshapeLongToWide, CompareDfsAndTfmCB)\n",
-    "from marisco.metadata import (GlobAttrsFeeder, BboxCB, DepthRangeCB, \n",
-    "                              TimeRangeCB, ZoteroCB, KeyValuePairCB)\n",
-    "from marisco.configs import (nuc_lut_path, nc_tpl_path, cfg, cache_path, \n",
-    "                             cdl_cfg, Enums, lut_path, species_lut_path, \n",
-    "                             sediments_lut_path, bodyparts_lut_path, \n",
-    "                             detection_limit_lut_path, filtered_lut_path, \n",
-    "                             area_lut_path, get_lut, unit_lut_path)\n",
+    "from marisco.utils import (\n",
+    "    has_valid_varname, \n",
+    "    match_worms, \n",
+    "    Remapper, \n",
+    "    ddmm_to_dd,\n",
+    "    match_maris_lut, \n",
+    "    Match, \n",
+    "    get_unique_across_dfs\n",
+    "    )\n",
+    "\n",
+    "from marisco.callbacks import (\n",
+    "    Callback, \n",
+    "    Transformer, \n",
+    "    RemoveAllNAValuesCB,\n",
+    "    EncodeTimeCB, \n",
+    "    AddSampleTypeIdColumnCB,\n",
+    "    AddNuclideIdColumnCB, \n",
+    "    LowerStripNameCB, \n",
+    "    SanitizeLonLatCB, \n",
+    "    ReshapeLongToWide, \n",
+    "    CompareDfsAndTfmCB,\n",
+    "    RemoveAllNAValuesCB\n",
+    "    )\n",
+    "\n",
+    "from marisco.metadata import (\n",
+    "    GlobAttrsFeeder, \n",
+    "    BboxCB, \n",
+    "    DepthRangeCB, \n",
+    "    TimeRangeCB, \n",
+    "    ZoteroCB, \n",
+    "    KeyValuePairCB\n",
+    "    )\n",
+    "\n",
+    "from marisco.configs import (\n",
+    "    nuc_lut_path, \n",
+    "    nc_tpl_path, \n",
+    "    cfg, \n",
+    "    cache_path, \n",
+    "    cdl_cfg, \n",
+    "    Enums, \n",
+    "    lut_path, \n",
+    "    species_lut_path, \n",
+    "    sediments_lut_path, \n",
+    "    bodyparts_lut_path, \n",
+    "    detection_limit_lut_path, \n",
+    "    filtered_lut_path, \n",
+    "    area_lut_path,\n",
+    "    get_lut,\n",
+    "    unit_lut_path\n",
+    "    )\n",
+    "\n",
     "from marisco.serializers import NetCDFEncoder,  OpenRefineCsvEncoder\n",
     "\n",
     "import warnings\n",
@@ -309,6 +357,95 @@
     "    print(f'{key} columns: ', dfs[key].columns)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "aaa55881",
+   "metadata": {},
+   "source": [
+    "## Remove missing data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d3936614",
+   "metadata": {},
+   "source": [
+    ":::{.callout-tip}\n",
+    "\n",
+    "**FEEDBACK TO DATA PROVIDER**: The `Seawater` dataset contains 538 rows with all NA values as shown below.\n",
+    "\n",
+    ":::"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fc0e120f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "seawater: 538 rows with all NA values\n",
+      "biota: 0 rows with all NA values\n"
+     ]
+    }
+   ],
+   "source": [
+    "#| eval: false\n",
+    "dfs = load_data(fname_in)\n",
+    "for key in dfs.keys():\n",
+    "    cols_to_check = dfs[key].columns[1:]\n",
+    "    mask = dfs[key][cols_to_check].isnull().all(axis=1)\n",
+    "    print(f'{key}: {mask.sum()} rows with all NA values')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89d292e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| exports\n",
+    "common_cols = [\n",
+    "    'Contracting Party', 'RSC Sub-division', 'Station ID', 'Sample ID',\n",
+    "    'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM', 'LongS', 'LongDir',\n",
+    "    'Sample type', 'Sampling date', 'Nuclide', 'Value type', 'Activity or MDA',\n",
+    "    'Uncertainty', 'Unit', 'Data provider', 'Measurement Comment',\n",
+    "    'Sample Comment', 'Reference Comment'\n",
+    "]\n",
+    "\n",
+    "cols_to_check = {\n",
+    "    'seawater': common_cols + ['Sampling depth'],\n",
+    "    'biota': common_cols + ['Biological group', 'Species', 'Body Part']\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "97cb5905",
+   "metadata": {},
+   "source": [
+    "Let's use the `RemoveAllNAValuesCB` callback to remove the rows in which all the checked columns are NA."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5beea658",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| eval: false\n",
+    "dfs = load_data(fname_in)\n",
+    "tfm = Transformer(dfs, cbs=[RemoveAllNAValuesCB(cols_to_check)])\n",
+    "\n",
+    "# Test that no row with all NA values in the checked columns remains\n",
+    "fc.test_eq(tfm()['seawater'][cols_to_check['seawater']].isnull().all(axis=1).sum(), 0)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "8326e234",
@@ -379,31 +516,53 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b5d8e77b",
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "52c9d0fe",
+   "metadata": {},
+   "source": [
+    "### Remap nuclide names to MARIS data formats"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9ff7a3f",
+   "metadata": {},
+   "source": [
+    "Below, we map the nuclide names used by OSPAR to the MARIS standard nuclide names. \n",
+    "\n",
+    "Remapping data provider nomenclatures into MARIS standards is a recurrent operation and is done in a semi-automated manner according to the following pattern:\n",
+    "\n",
+    "1. **Inspect** data provider nomenclature;\n",
+    "2. **Match** automatically against MARIS nomenclature (using a fuzzy matching algorithm); \n",
+    "3. **Fix** potential mismatches; \n",
+    "4. **Apply** the lookup table to the dataframe.\n",
+    "\n",
+    "From now on, we will use this pattern to remap the OSPAR data provider nomenclatures into MARIS standards and, for the sake of brevity, call it **IMFA** (**I**nspect, **M**atch, **F**ix, **A**pply)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "abd510d4",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array(['137Cs', '239,240Pu', '226Ra', '228Ra', '99Tc', '3H', '210Po',\n",
-       "       '210Pb', nan, 'RA-226', 'RA-228'], dtype=object)"
-      ]
-     },
-     "execution_count": null,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
-    "load_data(fname_in)['seawater']['Nuclide'].unique()"
+    ":::{.callout-tip}\n",
+    "\n",
+    "**FEEDBACK TO DATA PROVIDER**: The `Nuclide` column has inconsistent naming. E.g:\n",
+    "\n",
+    "- `Cs-137`,  `137Cs` or `CS-137`\n",
+    "- `239, 240 pu` or `239,240 pu`\n",
+    "- `ra-226` and `226ra` \n",
+    "\n",
+    "See below:\n",
+    "\n",
+    ":::"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1f324e4a",
+   "id": "9691ccab",
    "metadata": {},
    "outputs": [
     {
@@ -429,161 +588,123 @@
        "      <th></th>\n",
        "      <th>index</th>\n",
        "      <th>value</th>\n",
-       "      <th>n_chars</th>\n",
-       "      <th>stripped_chars</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>0</td>\n",
-       "      <td>239,240Pu</td>\n",
-       "      <td>9.0</td>\n",
-       "      <td>9.0</td>\n",
+       "      <td>239, 240 Pu</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>1</td>\n",
-       "      <td>210Po</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>5.0</td>\n",
+       "      <td>137Cs</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>2</td>\n",
-       "      <td>RA-228</td>\n",
-       "      <td>6.0</td>\n",
-       "      <td>6.0</td>\n",
+       "      <td>CS-137</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>3</td>\n",
-       "      <td>Cs-134</td>\n",
-       "      <td>6.0</td>\n",
-       "      <td>6.0</td>\n",
+       "      <td>Cs-137</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>4</td>\n",
-       "      <td>239, 240 Pu</td>\n",
-       "      <td>11.0</td>\n",
-       "      <td>9.0</td>\n",
+       "      <td>Cs-134</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
        "      <td>5</td>\n",
-       "      <td>238Pu</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>5.0</td>\n",
+       "      <td>210Po</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
        "      <td>6</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>239,240Pu</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
        "      <td>7</td>\n",
-       "      <td>226Ra</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>5.0</td>\n",
+       "      <td>228Ra</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
        "      <td>8</td>\n",
-       "      <td>Cs-137</td>\n",
-       "      <td>6.0</td>\n",
-       "      <td>6.0</td>\n",
+       "      <td>210Pb</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9</th>\n",
        "      <td>9</td>\n",
-       "      <td>210Pb</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>5.0</td>\n",
+       "      <td>RA-228</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10</th>\n",
        "      <td>10</td>\n",
-       "      <td>3H</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>2.0</td>\n",
+       "      <td>238Pu</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>11</th>\n",
        "      <td>11</td>\n",
-       "      <td>241Am</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>5.0</td>\n",
+       "      <td>3H</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>12</th>\n",
        "      <td>12</td>\n",
-       "      <td>228Ra</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>5.0</td>\n",
+       "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>13</th>\n",
        "      <td>13</td>\n",
-       "      <td>137Cs</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>5.0</td>\n",
+       "      <td>RA-226</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>14</th>\n",
        "      <td>14</td>\n",
-       "      <td>CS-137</td>\n",
-       "      <td>6.0</td>\n",
-       "      <td>6.0</td>\n",
+       "      <td>99Tc</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>15</th>\n",
        "      <td>15</td>\n",
-       "      <td>RA-226</td>\n",
-       "      <td>6.0</td>\n",
-       "      <td>6.0</td>\n",
+       "      <td>241Am</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>16</th>\n",
        "      <td>16</td>\n",
        "      <td>CS-134</td>\n",
-       "      <td>6.0</td>\n",
-       "      <td>6.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>17</th>\n",
        "      <td>17</td>\n",
-       "      <td>99Tc</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>4.0</td>\n",
+       "      <td>226Ra</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "    index        value  n_chars  stripped_chars\n",
-       "0       0    239,240Pu      9.0             9.0\n",
-       "1       1        210Po      5.0             5.0\n",
-       "2       2       RA-228      6.0             6.0\n",
-       "3       3       Cs-134      6.0             6.0\n",
-       "4       4  239, 240 Pu     11.0             9.0\n",
-       "5       5        238Pu      5.0             5.0\n",
-       "6       6          NaN      NaN             NaN\n",
-       "7       7        226Ra      5.0             5.0\n",
-       "8       8       Cs-137      6.0             6.0\n",
-       "9       9        210Pb      5.0             5.0\n",
-       "10     10           3H      2.0             2.0\n",
-       "11     11        241Am      5.0             5.0\n",
-       "12     12        228Ra      5.0             5.0\n",
-       "13     13        137Cs      5.0             5.0\n",
-       "14     14       CS-137      6.0             6.0\n",
-       "15     15       RA-226      6.0             6.0\n",
-       "16     16       CS-134      6.0             6.0\n",
-       "17     17         99Tc      4.0             4.0"
+       "    index        value\n",
+       "0       0  239, 240 Pu\n",
+       "1       1        137Cs\n",
+       "2       2       CS-137\n",
+       "3       3       Cs-137\n",
+       "4       4       Cs-134\n",
+       "5       5        210Po\n",
+       "6       6    239,240Pu\n",
+       "7       7        228Ra\n",
+       "8       8        210Pb\n",
+       "9       9       RA-228\n",
+       "10     10        238Pu\n",
+       "11     11           3H\n",
+       "12     12          NaN\n",
+       "13     13       RA-226\n",
+       "14     14         99Tc\n",
+       "15     15        241Am\n",
+       "16     16       CS-134\n",
+       "17     17        226Ra"
       ]
      },
      "execution_count": null,
@@ -593,324 +714,54 @@
    ],
    "source": [
     "#| eval: false\n",
-    "df = get_unique_across_dfs(load_data(fname_in), 'Nuclide', as_df=True, include_nchars=True)\n",
-    "df['stripped_chars'] = df['value'].str.strip().str.replace(' ', '').str.len()\n",
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7444a821",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "df['stripped_chars'] = df['value'].str.strip().str.replace(' ', '').str.len()\n",
-    "print(df[df['n_chars'] != df['stripped_chars']])"
+    "dfs = load_data(fname_in)\n",
+    "get_unique_across_dfs(dfs, col_name='Nuclide', as_df=True)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "f5befd9b",
+   "id": "f6879cf4",
    "metadata": {},
    "source": [
-    "#### Lower & strip nuclide names"
+    "Let's now create an instance of a fuzzy matching algorithm `Remapper`:"
    ]
   },
   {
-   "attachments": {},
-   "cell_type": "markdown",
-   "id": "8a2311cd",
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "582e03a6",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "We use the `LowerStripRdnNameCB` callback. For each dataframe in the dictionary of dataframes, it corrects the nuclide name by converting it lowercase, striping any leading or trailing whitespace(s) and ensuring the number comes before letters (e.g. `137cs`)."
+    "#| eval: false\n",
+    "remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='Nuclide', as_df=True),\n",
+    "                    maris_lut_fn=nuc_lut_path,\n",
+    "                    maris_col_id='nuclide_id',\n",
+    "                    maris_col_name='nc_name',\n",
+    "                    provider_col_to_match='value',\n",
+    "                    provider_col_key='value',\n",
+    "                    fname_cache='nuclides_ospar.pkl')"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "86cdc845",
+   "id": "857f4cb6",
    "metadata": {},
    "source": [
-    "For instance:"
+    "And try to match OSPAR to MARIS nuclide names as automatically as possible. The `match_score` column allows us to assess the results:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8a3fa068",
+   "id": "34f3a398",
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "seawater nuclides: \n",
-      "['137cs' '239,240pu' '226ra' '228ra' '99tc' '3h' '210po' '210pb' nan\n",
-      " 'ra-226' 'ra-228']\n",
-      "biota nuclides: \n",
-      "['239,240pu' '99tc' '137cs' '226ra' '228ra' '238pu' '239, 240 pu' '241am'\n",
-      " 'cs-137' 'cs-134' '3h' '210pb' '210po']\n"
-     ]
-    }
-   ],
-   "source": [
-    "#|eval: false\n",
-    "dfs = load_data(fname_in)\n",
-    "tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(col_src='Nuclide', col_dst='NUCLIDE')])\n",
-    "print('seawater nuclides: ')\n",
-    "print(tfm()['seawater']['NUCLIDE'].unique())\n",
-    "print('biota nuclides: ')\n",
-    "print(tfm()['biota']['NUCLIDE'].unique())"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "id": "52c9d0fe",
-   "metadata": {},
-   "source": [
-    "### Remap nuclide names to MARIS data formats"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d9ff7a3f",
-   "metadata": {},
-   "source": [
-    "We below map nuclide names used by OSPAR to the MARIS standard nuclide names. \n",
-    "\n",
-    "Remapping data provider nomenclatures into MARIS standards is one recurrent operation and is done in a semi-automated manner according to the following pattern:\n",
-    "\n",
-    "1. **Inspect** data provider nomenclature:\n",
-    "2. **Match** automatically against MARIS nomenclature (using a fuzzy matching algorithm); \n",
-    "3. **Fix** potential mismatches; \n",
-    "4. **Apply** the lookup table to the dataframe.\n",
-    "\n",
-    "As now on, we will use this pattern to remap the OSPAR data provider nomenclatures into MARIS standards and name it for the sake of brevity **IMFA** (**I**nspect, **M**atch, **F**ix, **A**pply)."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "abd510d4",
-   "metadata": {},
-   "source": [
-    ":::{.callout-tip}\n",
-    "\n",
-    "**FEEDBACK TO DATA PROVIDER**: The `Nuclide` column has inconsistent naming. E.g:\n",
-    "\n",
-    "- `Cs-137`,  `137Cs` or `CS-137`\n",
-    "- `239, 240 pu` or `239,240 pu`\n",
-    "- `ra-226` and `226ra` \n",
-    "\n",
-    "See below:\n",
-    "\n",
-    ":::"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9691ccab",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>index</th>\n",
-       "      <th>value</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0</td>\n",
-       "      <td>239,240Pu</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1</td>\n",
-       "      <td>210Po</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2</td>\n",
-       "      <td>RA-228</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>3</td>\n",
-       "      <td>Cs-134</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>4</td>\n",
-       "      <td>239, 240 Pu</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>5</td>\n",
-       "      <td>238Pu</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>6</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>7</td>\n",
-       "      <td>226Ra</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>8</td>\n",
-       "      <td>Cs-137</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>9</td>\n",
-       "      <td>210Pb</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>10</td>\n",
-       "      <td>3H</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>11</th>\n",
-       "      <td>11</td>\n",
-       "      <td>241Am</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12</th>\n",
-       "      <td>12</td>\n",
-       "      <td>228Ra</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>13</th>\n",
-       "      <td>13</td>\n",
-       "      <td>137Cs</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14</th>\n",
-       "      <td>14</td>\n",
-       "      <td>CS-137</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15</th>\n",
-       "      <td>15</td>\n",
-       "      <td>RA-226</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>16</th>\n",
-       "      <td>16</td>\n",
-       "      <td>CS-134</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>17</th>\n",
-       "      <td>17</td>\n",
-       "      <td>99Tc</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "    index        value\n",
-       "0       0    239,240Pu\n",
-       "1       1        210Po\n",
-       "2       2       RA-228\n",
-       "3       3       Cs-134\n",
-       "4       4  239, 240 Pu\n",
-       "5       5        238Pu\n",
-       "6       6          NaN\n",
-       "7       7        226Ra\n",
-       "8       8       Cs-137\n",
-       "9       9        210Pb\n",
-       "10     10           3H\n",
-       "11     11        241Am\n",
-       "12     12        228Ra\n",
-       "13     13        137Cs\n",
-       "14     14       CS-137\n",
-       "15     15       RA-226\n",
-       "16     16       CS-134\n",
-       "17     17         99Tc"
-      ]
-     },
-     "execution_count": null,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#| eval: false\n",
-    "dfs = load_data(fname_in)\n",
-    "get_unique_across_dfs(dfs, col_name='Nuclide', as_df=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f6879cf4",
-   "metadata": {},
-   "source": [
-    "Let's now create an instance of a fuzzy matching algorithm `Remapper`:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "582e03a6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| eval: false\n",
-    "remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='Nuclide', as_df=True),\n",
-    "                    maris_lut_fn=nuc_lut_path,\n",
-    "                    maris_col_id='nuclide_id',\n",
-    "                    maris_col_name='nc_name',\n",
-    "                    provider_col_to_match='value',\n",
-    "                    provider_col_key='value',\n",
-    "                    fname_cache='nuclides_ospar.pkl')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "857f4cb6",
-   "metadata": {},
-   "source": [
-    "And try to match HELCOM to MARIS nuclide names as automatically as possible. The `match_score` column allows to assess the results:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "34f3a398",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Processing: 100%|██████████| 18/18 [00:00<00:00, 51.23it/s]\n"
+      "Processing: 100%|██████████| 18/18 [00:00<00:00, 41.59it/s]\n"
      ]
     },
     {
@@ -959,9 +810,9 @@
        "      <td>6</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>210Po</th>\n",
-       "      <td>ag106m</td>\n",
-       "      <td>210Po</td>\n",
+       "      <th>241Am</th>\n",
+       "      <td>pu241</td>\n",
+       "      <td>241Am</td>\n",
        "      <td>4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -977,15 +828,15 @@
        "      <td>4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>241Am</th>\n",
-       "      <td>pu241</td>\n",
-       "      <td>241Am</td>\n",
+       "      <th>210Pb</th>\n",
+       "      <td>ag106m</td>\n",
+       "      <td>210Pb</td>\n",
        "      <td>4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>210Pb</th>\n",
+       "      <th>210Po</th>\n",
        "      <td>ag106m</td>\n",
-       "      <td>210Pb</td>\n",
+       "      <td>210Po</td>\n",
        "      <td>4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1013,27 +864,21 @@
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>Cs-137</th>\n",
-       "      <td>cs137</td>\n",
-       "      <td>Cs-137</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
        "      <th>Cs-134</th>\n",
        "      <td>cs134</td>\n",
        "      <td>Cs-134</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>RA-228</th>\n",
-       "      <td>ra228</td>\n",
-       "      <td>RA-228</td>\n",
+       "      <th>Cs-137</th>\n",
+       "      <td>cs137</td>\n",
+       "      <td>Cs-137</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>CS-137</th>\n",
-       "      <td>cs137</td>\n",
-       "      <td>CS-137</td>\n",
+       "      <th>RA-228</th>\n",
+       "      <td>ra228</td>\n",
+       "      <td>RA-228</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1043,6 +888,12 @@
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>CS-137</th>\n",
+       "      <td>cs137</td>\n",
+       "      <td>CS-137</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
        "      <th>CS-134</th>\n",
        "      <td>cs134</td>\n",
        "      <td>CS-134</td>\n",
@@ -1057,20 +908,20 @@
        "source_key                                              \n",
        "239, 240 Pu              pu240  239, 240 Pu            8\n",
        "239,240Pu                pu240    239,240Pu            6\n",
-       "210Po                   ag106m        210Po            4\n",
+       "241Am                    pu241        241Am            4\n",
        "137Cs                       h3        137Cs            4\n",
        "228Ra                     u238        228Ra            4\n",
-       "241Am                    pu241        241Am            4\n",
        "210Pb                   ag106m        210Pb            4\n",
+       "210Po                   ag106m        210Po            4\n",
        "226Ra                     u235        226Ra            4\n",
        "238Pu                     u238        238Pu            3\n",
        "99Tc                        tu         99Tc            3\n",
        "3H                          h3           3H            2\n",
-       "Cs-137                   cs137       Cs-137            1\n",
        "Cs-134                   cs134       Cs-134            1\n",
+       "Cs-137                   cs137       Cs-137            1\n",
        "RA-228                   ra228       RA-228            1\n",
-       "CS-137                   cs137       CS-137            1\n",
        "RA-226                   ra226       RA-226            1\n",
+       "CS-137                   cs137       CS-137            1\n",
        "CS-134                   cs134       CS-134            1"
       ]
      },
@@ -1133,7 +984,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Processing: 100%|██████████| 18/18 [00:00<00:00, 53.16it/s]\n"
+      "Processing: 100%|██████████| 18/18 [00:00<00:00, 50.88it/s]\n"
      ]
     },
     {
@@ -1176,15 +1027,9 @@
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>RA-228</th>\n",
-       "      <td>ra228</td>\n",
-       "      <td>RA-228</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>Cs-134</th>\n",
-       "      <td>cs134</td>\n",
-       "      <td>Cs-134</td>\n",
+       "      <th>CS-137</th>\n",
+       "      <td>cs137</td>\n",
+       "      <td>CS-137</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1194,9 +1039,15 @@
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>CS-137</th>\n",
-       "      <td>cs137</td>\n",
-       "      <td>CS-137</td>\n",
+       "      <th>Cs-134</th>\n",
+       "      <td>cs134</td>\n",
+       "      <td>Cs-134</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>RA-228</th>\n",
+       "      <td>ra228</td>\n",
+       "      <td>RA-228</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1219,10 +1070,10 @@
        "           matched_maris_name source_name  match_score\n",
        "source_key                                            \n",
        "3H                         h3          3H            2\n",
-       "RA-228                  ra228      RA-228            1\n",
-       "Cs-134                  cs134      Cs-134            1\n",
-       "Cs-137                  cs137      Cs-137            1\n",
        "CS-137                  cs137      CS-137            1\n",
+       "Cs-137                  cs137      Cs-137            1\n",
+       "Cs-134                  cs134      Cs-134            1\n",
+       "RA-228                  ra228      RA-228            1\n",
        "RA-226                  ra226      RA-226            1\n",
        "CS-134                  cs134      CS-134            1"
       ]
@@ -1323,34 +1174,202 @@
     "    print(f'{key} NUCLIDE unique: ', dfs_out[key]['NUCLIDE'].unique())"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "a54f21ff",
+   "metadata": {},
+   "source": [
+    "### Add Nuclide Id column"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0deedefa",
+   "metadata": {},
+   "source": [
+    "The `nuclide_id` column is added to the dataframe for legacy reasons (again, to match the Open Refine output)."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2c5be367",
+   "id": "635c8f39",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>NUCLIDE</th>\n",
+       "      <th>nuclide_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>pu239_240_tot</td>\n",
+       "      <td>77</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>tc99</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>pu239_240_tot</td>\n",
+       "      <td>77</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>pu239_240_tot</td>\n",
+       "      <td>77</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>tc99</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15309</th>\n",
+       "      <td>tc99</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15310</th>\n",
+       "      <td>pu239_240_tot</td>\n",
+       "      <td>77</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15311</th>\n",
+       "      <td>cs137</td>\n",
+       "      <td>33</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15312</th>\n",
+       "      <td>cs137</td>\n",
+       "      <td>33</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15313</th>\n",
+       "      <td>tc99</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>15314 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             NUCLIDE  nuclide_id\n",
+       "0      pu239_240_tot          77\n",
+       "1               tc99          15\n",
+       "2      pu239_240_tot          77\n",
+       "3      pu239_240_tot          77\n",
+       "4               tc99          15\n",
+       "...              ...         ...\n",
+       "15309           tc99          15\n",
+       "15310  pu239_240_tot          77\n",
+       "15311          cs137          33\n",
+       "15312          cs137          33\n",
+       "15313           tc99          15\n",
+       "\n",
+       "[15314 rows x 2 columns]"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#| eval: false\n",
+    "dfs = load_data(fname_in)\n",
+    "tfm = Transformer(dfs, cbs=[RemapNuclideNameCB(lut_nuclides),\n",
+    "                            AddNuclideIdColumnCB(col_value='NUCLIDE')\n",
+    "                            ])\n",
+    "dfs_out = tfm()\n",
+    "\n",
+    "# For instance\n",
+    "dfs_out['biota'][['NUCLIDE', 'nuclide_id']]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ba5585f",
+   "metadata": {},
+   "source": [
+    "## Standardize Time"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c7bce267",
+   "metadata": {},
+   "source": [
+    "#### Parse time"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0e5455ec",
+   "metadata": {},
+   "source": [
+    "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variable: `time`.*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9e0b1805",
+   "metadata": {},
+   "source": [
+    "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variables: `begperiod`.*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c807cd86",
+   "metadata": {},
+   "source": [
+    "Create a callback that remaps the time format in the dictionary of dataframes (i.e. `%m/%d/%y %H:%M:%S`):"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "44976e05",
+   "id": "166fb92c",
    "metadata": {},
    "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c4b7ef1b",
-   "metadata": {},
    "source": [
-    "Many entries of OSPAR Nuclide are NAN. "
+    "# TODO"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "dd7d2c53",
+   "id": "101d4fb0",
    "metadata": {},
    "outputs": [
     {
@@ -1399,8 +1418,8 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>16799</th>\n",
-       "      <td>97147</td>\n",
+       "      <th>0</th>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1423,8 +1442,8 @@
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>16800</th>\n",
-       "      <td>97148</td>\n",
+       "      <th>1</th>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1447,8 +1466,8 @@
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>16801</th>\n",
-       "      <td>97149</td>\n",
+       "      <th>2</th>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1471,8 +1490,8 @@
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>16802</th>\n",
-       "      <td>97150</td>\n",
+       "      <th>3</th>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1495,8 +1514,8 @@
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>16803</th>\n",
-       "      <td>97151</td>\n",
+       "      <th>4</th>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1543,17 +1562,17 @@
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>18474</th>\n",
-       "      <td>120366</td>\n",
-       "      <td>Ireland</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>N8</td>\n",
+       "      <th>18851</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>53.0</td>\n",
-       "      <td>39.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>N</td>\n",
-       "      <td>5.0</td>\n",
        "      <td>...</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1562,22 +1581,22 @@
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>2021 data</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>18475</th>\n",
-       "      <td>120367</td>\n",
-       "      <td>Ireland</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>N9</td>\n",
+       "      <th>18852</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>53.0</td>\n",
-       "      <td>53.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>N</td>\n",
-       "      <td>5.0</td>\n",
        "      <td>...</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1586,22 +1605,22 @@
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>2021 data</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>18476</th>\n",
-       "      <td>120368</td>\n",
-       "      <td>Ireland</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>N10</td>\n",
+       "      <th>18853</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>53.0</td>\n",
-       "      <td>52.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>N</td>\n",
-       "      <td>5.0</td>\n",
        "      <td>...</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1610,22 +1629,22 @@
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>2021 data</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>18477</th>\n",
-       "      <td>120369</td>\n",
-       "      <td>Ireland</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>Salthill</td>\n",
+       "      <th>18854</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>53.0</td>\n",
-       "      <td>15.0</td>\n",
-       "      <td>40.0</td>\n",
-       "      <td>N</td>\n",
-       "      <td>9.0</td>\n",
        "      <td>...</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1634,22 +1653,22 @@
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>2021 data</td>\n",
-       "      <td>Woodstown (County Waterford) and Salthill (Cou...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>18478</th>\n",
-       "      <td>120370</td>\n",
-       "      <td>Ireland</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>Woodstown</td>\n",
+       "      <th>18855</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>52.0</td>\n",
-       "      <td>11.0</td>\n",
-       "      <td>55.0</td>\n",
-       "      <td>N</td>\n",
-       "      <td>6.0</td>\n",
        "      <td>...</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
@@ -1664,63 +1683,63 @@
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>546 rows × 25 columns</p>\n",
+       "<p>18318 rows × 25 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "           ID Contracting Party  RSC Sub-division Station ID Sample ID  LatD  \\\n",
-       "16799   97147               NaN               NaN        NaN       NaN   NaN   \n",
-       "16800   97148               NaN               NaN        NaN       NaN   NaN   \n",
-       "16801   97149               NaN               NaN        NaN       NaN   NaN   \n",
-       "16802   97150               NaN               NaN        NaN       NaN   NaN   \n",
-       "16803   97151               NaN               NaN        NaN       NaN   NaN   \n",
-       "...       ...               ...               ...        ...       ...   ...   \n",
-       "18474  120366           Ireland               4.0         N8       NaN  53.0   \n",
-       "18475  120367           Ireland               4.0         N9       NaN  53.0   \n",
-       "18476  120368           Ireland               4.0        N10       NaN  53.0   \n",
-       "18477  120369           Ireland               1.0   Salthill       NaN  53.0   \n",
-       "18478  120370           Ireland               1.0  Woodstown       NaN  52.0   \n",
+       "       ID Contracting Party  RSC Sub-division Station ID Sample ID  LatD  \\\n",
+       "0     NaN               NaN               NaN        NaN       NaN   NaN   \n",
+       "1     NaN               NaN               NaN        NaN       NaN   NaN   \n",
+       "2     NaN               NaN               NaN        NaN       NaN   NaN   \n",
+       "3     NaN               NaN               NaN        NaN       NaN   NaN   \n",
+       "4     NaN               NaN               NaN        NaN       NaN   NaN   \n",
+       "...    ..               ...               ...        ...       ...   ...   \n",
+       "18851 NaN               NaN               NaN        NaN       NaN   NaN   \n",
+       "18852 NaN               NaN               NaN        NaN       NaN   NaN   \n",
+       "18853 NaN               NaN               NaN        NaN       NaN   NaN   \n",
+       "18854 NaN               NaN               NaN        NaN       NaN   NaN   \n",
+       "18855 NaN               NaN               NaN        NaN       NaN   NaN   \n",
        "\n",
        "       LatM  LatS LatDir  LongD  ...  Sampling date  Nuclide Value type  \\\n",
-       "16799   NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
-       "16800   NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
-       "16801   NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
-       "16802   NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
-       "16803   NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
+       "0       NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
+       "1       NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
+       "2       NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
+       "3       NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
+       "4       NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
        "...     ...   ...    ...    ...  ...            ...      ...        ...   \n",
-       "18474  39.0   0.0      N    5.0  ...            NaN      NaN        NaN   \n",
-       "18475  53.0   0.0      N    5.0  ...            NaN      NaN        NaN   \n",
-       "18476  52.0   0.0      N    5.0  ...            NaN      NaN        NaN   \n",
-       "18477  15.0  40.0      N    9.0  ...            NaN      NaN        NaN   \n",
-       "18478  11.0  55.0      N    6.0  ...            NaN      NaN        NaN   \n",
+       "18851   NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
+       "18852   NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
+       "18853   NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
+       "18854   NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
+       "18855   NaN   NaN    NaN    NaN  ...            NaN      NaN        NaN   \n",
        "\n",
        "      Activity or MDA  Uncertainty Unit Data provider Measurement Comment  \\\n",
-       "16799             NaN          NaN  NaN           NaN                 NaN   \n",
-       "16800             NaN          NaN  NaN           NaN                 NaN   \n",
-       "16801             NaN          NaN  NaN           NaN                 NaN   \n",
-       "16802             NaN          NaN  NaN           NaN                 NaN   \n",
-       "16803             NaN          NaN  NaN           NaN                 NaN   \n",
+       "0                 NaN          NaN  NaN           NaN                 NaN   \n",
+       "1                 NaN          NaN  NaN           NaN                 NaN   \n",
+       "2                 NaN          NaN  NaN           NaN                 NaN   \n",
+       "3                 NaN          NaN  NaN           NaN                 NaN   \n",
+       "4                 NaN          NaN  NaN           NaN                 NaN   \n",
        "...               ...          ...  ...           ...                 ...   \n",
-       "18474             NaN          NaN  NaN           NaN           2021 data   \n",
-       "18475             NaN          NaN  NaN           NaN           2021 data   \n",
-       "18476             NaN          NaN  NaN           NaN           2021 data   \n",
-       "18477             NaN          NaN  NaN           NaN           2021 data   \n",
-       "18478             NaN          NaN  NaN           NaN                 NaN   \n",
+       "18851             NaN          NaN  NaN           NaN                 NaN   \n",
+       "18852             NaN          NaN  NaN           NaN                 NaN   \n",
+       "18853             NaN          NaN  NaN           NaN                 NaN   \n",
+       "18854             NaN          NaN  NaN           NaN                 NaN   \n",
+       "18855             NaN          NaN  NaN           NaN                 NaN   \n",
        "\n",
-       "                                          Sample Comment  Reference Comment  \n",
-       "16799                                                NaN                NaN  \n",
-       "16800                                                NaN                NaN  \n",
-       "16801                                                NaN                NaN  \n",
-       "16802                                                NaN                NaN  \n",
-       "16803                                                NaN                NaN  \n",
-       "...                                                  ...                ...  \n",
-       "18474                                                NaN                NaN  \n",
-       "18475                                                NaN                NaN  \n",
-       "18476                                                NaN                NaN  \n",
-       "18477  Woodstown (County Waterford) and Salthill (Cou...                NaN  \n",
-       "18478                                                NaN                NaN  \n",
+       "       Sample Comment  Reference Comment  \n",
+       "0                 NaN                NaN  \n",
+       "1                 NaN                NaN  \n",
+       "2                 NaN                NaN  \n",
+       "3                 NaN                NaN  \n",
+       "4                 NaN                NaN  \n",
+       "...               ...                ...  \n",
+       "18851             NaN                NaN  \n",
+       "18852             NaN                NaN  \n",
+       "18853             NaN                NaN  \n",
+       "18854             NaN                NaN  \n",
+       "18855             NaN                NaN  \n",
        "\n",
-       "[546 rows x 25 columns]"
+       "[18318 rows x 25 columns]"
       ]
      },
      "execution_count": null,
@@ -1729,55 +1748,35 @@
     }
    ],
    "source": [
-    "dfs['seawater'][dfs['seawater']['Nuclide'].isna()]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4a33dd67",
-   "metadata": {},
-   "source": [
-    "***"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2ba5585f",
-   "metadata": {},
-   "source": [
-    "### Standardize Time"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c7bce267",
-   "metadata": {},
-   "source": [
-    "#### Parse time"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0e5455ec",
-   "metadata": {},
-   "source": [
-    "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variable: `time`.*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9e0b1805",
-   "metadata": {},
-   "source": [
-    "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variables: `begperiod` "
+    "dfs = load_data(fname_in)\n",
+    "tfm = Transformer(dfs, cbs=[\n",
+    "    RemoveAllNAValuesCB(cols_to_check),\n",
+    "    ])\n",
+    "# Run the transform, then select the rows whose 'Sampling date' is NaN (1-D boolean mask filters rows)\n",
+    "dfs_test = tfm()\n",
+    "mask = dfs_test['seawater']['Sampling date'].isna()\n",
+    "dfs_test['seawater'][mask]"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "c807cd86",
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4292c628",
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(18318, 25)"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "Create a callback that remaps the time format in the dictionary of dataframes (i.e. `%m/%d/%y %H:%M:%S`):"
+    "dfs_test['seawater'].shape"
    ]
   },
   {
@@ -1789,9 +1788,9 @@
    "source": [
     "#| export\n",
     "class ParseTimeCB(Callback):\n",
+    "    \"Parse the time format in the dataframe.\"\n",
     "    def __init__(self):\n",
     "        fc.store_attr()\n",
-    "            \n",
     "        \n",
     "    def __call__(self, tfm):\n",
     "        for grp in tfm.dfs.keys():\n",
@@ -1801,12 +1800,6 @@
     "            self._remove_nan(df)\n",
     "\n",
     "    def _process_dates(self, df: pd.DataFrame):\n",
-    "        \"\"\"\n",
-    "        Process and correct date and time information in the DataFrame.\n",
-    "\n",
-    "        Args:\n",
-    "            df (pd.DataFrame): DataFrame containing the 'Sampling date' column.\n",
-    "        \"\"\"\n",
     "        if 'Sampling date' in df.columns:\n",
     "            # Convert 'Sampling date' to datetime, ignoring errors to avoid NaNs\n",
     "            df['time'] = pd.to_datetime(df['Sampling date'], format='%d/%m/%Y', errors='coerce')\n",
@@ -1815,23 +1808,10 @@
     "            df['time'] = pd.NaT                \n",
     "                    \n",
     "    def _define_beg_period(self, df: pd.DataFrame):\n",
-    "        \"\"\"\n",
-    "        Create a standardized date representation for Open Refine.\n",
-    "        \n",
-    "        Args:\n",
-    "            df (pd.DataFrame): DataFrame containing the 'time' column.\n",
-    "        \"\"\"\n",
     "        df['begperiod'] = df['time']\n",
     "\n",
     "    def _remove_nan(self, df: pd.DataFrame):\n",
-    "        \"\"\"\n",
-    "        Remove rows with NaN entries in the 'time' column.\n",
-    "        \n",
-    "        Args:\n",
-    "            df (pd.DataFrame): DataFrame containing the 'time' column.\n",
-    "        \"\"\"\n",
-    "        df.dropna(subset=['time'], inplace=True)\n",
-    "\n"
+    "        df.dropna(subset=['time'], inplace=True)"
    ]
   },
   {
@@ -1878,9 +1858,11 @@
    "source": [
     "#|eval: false\n",
     "dfs = load_data(fname_in)\n",
-    "tfm = Transformer(dfs, cbs=[ParseTimeCB(),\n",
-    "                            CompareDfsAndTfmCB(dfs)\n",
-    "                            ])\n",
+    "tfm = Transformer(dfs, cbs=[\n",
+    "    RemoveAllNAValuesCB(cols_to_check),\n",
+    "    ParseTimeCB(),\n",
+    "    CompareDfsAndTfmCB(dfs)\n",
+    "    ])\n",
     "tfm()\n",
     "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n",
     "print(tfm.dfs['seawater'][['begperiod','time']])"