From 3285c9a8f72710a797f677cee2a4d00bb5a998fc Mon Sep 17 00:00:00 2001
From: Franck Albinet <franckalbinet@gmail.com>
Date: Thu, 26 Sep 2024 10:02:47 +0200
Subject: [PATCH] sanitize uncertainties + fix measurements + match biota
 species in ospar handler

---
 marisco/handlers/helcom.py |    3 +-
 marisco/utils.py           |   31 +-
 nbs/api/utils.ipynb        |   19 +
 nbs/handlers/_ospar.ipynb  | 1046 ++++++++++++++++++++++++++++++------
 nbs/handlers/helcom.ipynb  |   21 +-
 5 files changed, 924 insertions(+), 196 deletions(-)

diff --git a/marisco/handlers/helcom.py b/marisco/handlers/helcom.py
index 3b96b9e..b712b07 100644
--- a/marisco/handlers/helcom.py
+++ b/marisco/handlers/helcom.py
@@ -149,8 +149,7 @@ def _define_beg_period(self, df):
         "Create a standardized date representation for Open Refine."
         df['begperiod'] = df['time']
 
-# %% ../../nbs/handlers/helcom.ipynb 58
-# Columns of interest
+# %% ../../nbs/handlers/helcom.ipynb 59
 coi_val = {'seawater' : {'val': 'VALUE_Bq/m³'},
            'biota':  {'val': 'VALUE_Bq/kg'},
            'sediment': {'val': 'VALUE_Bq/kg'}}
diff --git a/marisco/utils.py b/marisco/utils.py
index 5747ed3..9874774 100644
--- a/marisco/utils.py
+++ b/marisco/utils.py
@@ -3,7 +3,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/api/utils.ipynb.
 
 # %% auto 0
-__all__ = ['get_unique_across_dfs', 'Remapper', 'has_valid_varname', 'get_bbox', 'ddmm_to_dd', 'download_files_in_folder',
+__all__ = ['NA', 'get_unique_across_dfs', 'Remapper', 'has_valid_varname', 'get_bbox', 'ddmm_to_dd', 'download_files_in_folder',
            'download_file', 'match_worms', 'Match', 'match_maris_lut', 'test_dfs']
 
 # %% ../nbs/api/utils.ipynb 2
@@ -26,7 +26,10 @@
 import jellyfish as jf
 from collections.abc import Callable
 
-# %% ../nbs/api/utils.ipynb 6
+# %% ../nbs/api/utils.ipynb 5
+NA = 'Not available'
+
+# %% ../nbs/api/utils.ipynb 8
 def get_unique_across_dfs(dfs:dict,  # Dictionary of dataframes
                           col_name:str='NUCLIDE', # Column name to extract unique values from
                           as_df:bool=False, # Return a DataFrame of unique values
@@ -41,7 +44,7 @@ def get_unique_across_dfs(dfs:dict,  # Dictionary of dataframes
         if include_nchars: df_uniques['n_chars'] = df_uniques['value'].str.len()
         return df_uniques
 
-# %% ../nbs/api/utils.ipynb 12
+# %% ../nbs/api/utils.ipynb 14
 class Remapper():
     "Remap a data provider lookup table to a MARIS lookup table using fuzzy matching."
     def __init__(self,
@@ -104,7 +107,7 @@ def _format_output(self):
         return df_lut.sort_values(by='match_score', ascending=False)
 
 
-# %% ../nbs/api/utils.ipynb 14
+# %% ../nbs/api/utils.ipynb 16
 def has_valid_varname(
     var_names:list, # variable names
     cdl_path:str, # Path to MARIS CDL file (point of truth)
@@ -137,7 +140,7 @@ def has_valid_varname(
                 print(f'"{name}" variable name not found in MARIS CDL')
     return has_valid  
 
-# %% ../nbs/api/utils.ipynb 18
+# %% ../nbs/api/utils.ipynb 20
 def get_bbox(df,
              coord_cols=('lon', 'lat')
             ):
@@ -146,7 +149,7 @@ def get_bbox(df,
     arr = [(row[x], row[y]) for _, row in df.iterrows()]
     return MultiPoint(arr).envelope
 
-# %% ../nbs/api/utils.ipynb 24
+# %% ../nbs/api/utils.ipynb 26
 def ddmm_to_dd(
     ddmmmm:float # Coordinates in degrees/minutes decimal format
     ) -> float: # Coordinates in degrees decimal format
@@ -155,7 +158,7 @@ def ddmm_to_dd(
     mins = mins * 100
     return round(int(degs) + (mins / 60), 6)
 
-# %% ../nbs/api/utils.ipynb 27
+# %% ../nbs/api/utils.ipynb 29
 def download_files_in_folder(owner:str, 
                              repo:str, 
                              src_dir:str, 
@@ -189,7 +192,7 @@ def download_file(owner, repo, src_dir, dest_dir, fname):
     else:
         print(f"Error: {response.status_code}")
 
-# %% ../nbs/api/utils.ipynb 29
+# %% ../nbs/api/utils.ipynb 31
 def match_worms(
     name:str # Name of species to look up in WoRMS
     ):
@@ -212,7 +215,7 @@ def match_worms(
     else:
         return -1
 
-# %% ../nbs/api/utils.ipynb 34
+# %% ../nbs/api/utils.ipynb 36
 @dataclass
 class Match:
     "Match between a data provider name and a MARIS lookup table."
@@ -221,7 +224,7 @@ class Match:
     source_name: str
     match_score: int
 
-# %% ../nbs/api/utils.ipynb 35
+# %% ../nbs/api/utils.ipynb 37
 def match_maris_lut(
     lut_path: str, # Path to MARIS species authoritative species look-up table
     data_provider_name: str, # Name of data provider nomenclature item to look up 
@@ -238,7 +241,7 @@ def match_maris_lut(
     df = df.sort_values(by='score', ascending=True)[:nresults]
     return df[[maris_id, maris_name, 'score']]
 
-# %% ../nbs/api/utils.ipynb 42
+# %% ../nbs/api/utils.ipynb 44
 def get_bbox(df,
              coord_cols=('lon', 'lat')
             ):
@@ -246,7 +249,7 @@ def get_bbox(df,
     arr = [(row[x], row[y]) for _, row in df.iterrows()]
     return MultiPoint(arr).envelope
 
-# %% ../nbs/api/utils.ipynb 49
+# %% ../nbs/api/utils.ipynb 51
 def download_files_in_folder(owner:str, 
                              repo:str, 
                              src_dir:str, 
@@ -280,7 +283,7 @@ def download_file(owner, repo, src_dir, dest_dir, fname):
     else:
         print(f"Error: {response.status_code}")
 
-# %% ../nbs/api/utils.ipynb 51
+# %% ../nbs/api/utils.ipynb 53
 def match_worms(
     name:str # Name of species to look up in WoRMS
     ):
@@ -303,7 +306,7 @@ def match_worms(
     else:
         return -1
 
-# %% ../nbs/api/utils.ipynb 56
+# %% ../nbs/api/utils.ipynb 58
 def test_dfs(
     dfs1:dict, # First dictionary of DataFrames to compare 
     dfs2:dict # Second dictionary of DataFrames to compare
diff --git a/nbs/api/utils.ipynb b/nbs/api/utils.ipynb
index 40ff7c5..cdbde2b 100644
--- a/nbs/api/utils.ipynb
+++ b/nbs/api/utils.ipynb
@@ -59,6 +59,25 @@
     "import pandas as pd"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "fb15839e",
+   "metadata": {},
+   "source": [
+    "Below we define constants that are used throughout the package."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6dff2deb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| exports\n",
+    "NA = 'Not available'"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "7ddde356",
diff --git a/nbs/handlers/_ospar.ipynb b/nbs/handlers/_ospar.ipynb
index f381711..5da9120 100644
--- a/nbs/handlers/_ospar.ipynb
+++ b/nbs/handlers/_ospar.ipynb
@@ -142,6 +142,7 @@
     "    unit_lut_path\n",
     "    )\n",
     "\n",
+    "from marisco.utils import NA\n",
     "from marisco.serializers import NetCDFEncoder,  OpenRefineCsvEncoder\n",
     "\n",
     "import warnings\n",
@@ -1547,28 +1548,10 @@
   },
   {
    "cell_type": "markdown",
-   "id": "02decc88",
+   "id": "0fe8ab6e",
    "metadata": {},
    "source": [
-    "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variable: ``value``.*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e44772c2",
-   "metadata": {},
-   "source": [
-    "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variables: ``activity``.*"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a2b34550",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# TO BE REFACTORED"
+    "We copy the measurement column (`Activity or MDA` by default) into a single `value` column and drop the rows where that measurement is missing."
    ]
   },
   {
@@ -1578,39 +1561,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# | export\n",
+    "# | exports\n",
     "class SanitizeValue(Callback):\n",
-    "    \"Sanitize value by removing blank entries.\"\n",
-    "\n",
-    "    def __init__(self):\n",
-    "        \"\"\"\n",
-    "        Initialize the SanitizeValue callback.\n",
-    "        \"\"\"\n",
+    "    \"Sanitize value by removing blank entries and populating `value` column.\"\n",
+    "    def __init__(self, \n",
+    "                 value_col: str='Activity or MDA' # Column name to sanitize\n",
+    "                 ):\n",
     "        fc.store_attr()\n",
     "\n",
     "    def __call__(self, tfm):\n",
-    "        \"\"\"\n",
-    "        Sanitize the DataFrames in the transformer by removing rows with blank values in specified columns.\n",
-    "        Args:\n",
-    "            tfm (Transformer): The transformer object containing DataFrames.\n",
-    "        \"\"\"\n",
-    "        for grp in tfm.dfs.keys():\n",
-    "            self._sanitize_dataframe(tfm.dfs[grp], grp)\n",
-    "\n",
-    "\n",
-    "    def _sanitize_dataframe(self, df: pd.DataFrame, grp: str):\n",
-    "        \"\"\"\n",
-    "        Remove rows where value column (i.e. 'Activity or MDA') is blank and remap to 'value' column.\n",
-    "\n",
-    "        Args:\n",
-    "            df (pd.DataFrame): DataFrame to sanitize.\n",
-    "            grp (str): Group name to determine column names.\n",
-    "        \"\"\"\n",
-    "        value_col = 'Activity or MDA'\n",
-    "        if value_col in df.columns:\n",
-    "            df.dropna(subset=[value_col], inplace=True)\n",
-    "            df['value'] = df[value_col]\n",
-    "            "
+    "        for df in tfm.dfs.values():\n",
+    "            df.dropna(subset=[self.value_col], inplace=True)\n",
+    "            df['value'] = df[self.value_col]"
    ]
   },
   {
@@ -1620,35 +1582,74 @@
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "                                                    seawater  biota\n",
-      "Number of rows in dfs                                  18856  15314\n",
-      "Number of rows in tfm.dfs                              18308  15314\n",
-      "Number of dropped rows                                   548      0\n",
-      "Number of rows in tfm.dfs + Number of dropped rows     18856  15314 \n",
-      "\n"
-     ]
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>value</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.26</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.25</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.20</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   value\n",
+       "0   0.20\n",
+       "1   0.27\n",
+       "2   0.26\n",
+       "3   0.25\n",
+       "4   0.20"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
     "#|eval: false\n",
     "dfs = load_data(fname_in)\n",
-    "tfm = Transformer(dfs, cbs=[SanitizeValue(),\n",
-    "                            CompareDfsAndTfmCB(dfs)\n",
-    "                            ])\n",
+    "tfm = Transformer(dfs, cbs=[SanitizeValue()])\n",
     "\n",
-    "tfm()\n",
-    "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "197588fb",
-   "metadata": {},
-   "source": [
-    "***"
+    "tfm()['seawater'][['value']].head()"
    ]
   },
   {
@@ -1656,23 +1657,7 @@
    "id": "7c83412b",
    "metadata": {},
    "source": [
-    "### Normalize uncertainty"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1ceaf5b7",
-   "metadata": {},
-   "source": [
-    "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variable: ``uncertainty``.*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "41097d83",
-   "metadata": {},
-   "source": [
-    "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variable: `Uncertainty`.*"
+    "## Normalize uncertainty"
    ]
   },
   {
@@ -1680,11 +1665,9 @@
    "id": "13a44f1a",
    "metadata": {},
    "source": [
-    "For each sample type in the OSPAR dataset, the reported uncertainty is given as an expanded uncertainty with a coverage factor 𝑘=2\n",
-    "\n",
-    "For further details, refer to the [OSPAR reporting guidelines](https://mcc.jrc.ec.europa.eu/documents/OSPAR/Guidelines_forestimationof_a_%20measurefor_uncertainty_in_OSPARmonitoring.pdf)\n",
+    "For each sample type in the OSPAR dataset, the reported uncertainty is given as an expanded uncertainty with a coverage factor `𝑘=2`. For further details, refer to the [OSPAR reporting guidelines](https://mcc.jrc.ec.europa.eu/documents/OSPAR/Guidelines_forestimationof_a_%20measurefor_uncertainty_in_OSPARmonitoring.pdf).\n",
     "\n",
-    "Note: The OSPAR uncertainty values are normalized to standard uncertainty with a coverage factor \n",
+    "**Note**: The OSPAR uncertainty values are normalized to standard uncertainty with a coverage factor \n",
     "𝑘=1."
    ]
   },
@@ -1693,82 +1676,39 @@
    "id": "97a933ab",
    "metadata": {},
    "source": [
-    "NormalizeUncCB callback normalizes the uncertainty"
+    "The `NormalizeUncCB` callback normalizes the uncertainty using the following `lambda` function:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "87265333",
+   "id": "d6c84351",
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| export\n",
-    "# Make measurement and uncertainty units consistent\n",
-    "def unc_exp2stan(df: pd.DataFrame, unc_col: str) -> pd.Series:\n",
-    "    \"\"\"\n",
-    "    Convert expanded uncertainty (k=2) to standard uncertainty (k=1).\n",
-    "\n",
-    "    Args:\n",
-    "        df (pd.DataFrame): DataFrame containing uncertainty values.\n",
-    "        unc_col (str): Column name of the uncertainty values to be converted.\n",
-    "\n",
-    "    Returns:\n",
-    "        pd.Series: Series of standard uncertainty values.\n",
-    "    \"\"\"\n",
-    "    k = 2\n",
-    "    return df[unc_col] / k"
+    "#| exports\n",
+    "unc_exp2stan = lambda df, unc_col: df[unc_col] / 2"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8ad1b008",
+   "id": "ecb2866d",
    "metadata": {},
    "outputs": [],
    "source": [
+    "#| exports\n",
     "class NormalizeUncCB(Callback):\n",
-    "    \"\"\"Callback to normalize uncertainty values in DataFrames. This callback applies a conversion function to standardize the uncertainty values in each DataFrame.\"\"\"\n",
-    "\n",
-    "    def __init__(self, fn_convert_unc: Callable[[pd.DataFrame, str], pd.Series]):\n",
-    "        \"\"\"\n",
-    "        Initialize the NormalizeUncCB with a conversion function.\n",
-    "\n",
-    "        Args:\n",
-    "            fn_convert_unc (Callable[[pd.DataFrame, str], pd.Series]): \n",
-    "                Function that takes a DataFrame and a column name, and returns a Series of converted uncertainty values.\n",
-    "        \"\"\"\n",
+    "    \"\"\"Normalize uncertainty values in DataFrames.\"\"\"\n",
+    "    def __init__(self, \n",
+    "                 col_unc: str='Uncertainty', # Column name to normalize\n",
+    "                 fn_convert_unc: Callable=unc_exp2stan, # Function correcting coverage factor\n",
+    "                 ): \n",
     "        fc.store_attr()\n",
     "\n",
-    "    def __call__(self, tfm: 'Transformer'):\n",
-    "        \"\"\"\n",
-    "        Apply the conversion function to the 'Uncertainty' column in each DataFrame within the transformer.\n",
-    "\n",
-    "        Args:\n",
-    "            tfm (Transformer): The transformer object containing DataFrames.\n",
-    "        \"\"\"\n",
-    "        for grp, df in tfm.dfs.items():\n",
-    "            df['uncertainty'] = self._convert_uncertainty(df)\n",
-    "\n",
-    "    def _convert_uncertainty(self, df: pd.DataFrame) -> pd.Series:\n",
-    "        \"\"\"\n",
-    "        Convert the uncertainty values in the DataFrame using the provided conversion function.\n",
-    "\n",
-    "        Args:\n",
-    "            df (pd.DataFrame): DataFrame containing the 'Uncertainty' column.\n",
-    "\n",
-    "        Returns:\n",
-    "            pd.Series: Converted uncertainty values.\n",
-    "        \"\"\"\n",
-    "        return self.fn_convert_unc(df, 'Uncertainty')\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "fd159712",
-   "metadata": {},
-   "source": [
-    "Apply the transformer for callback NormalizeUncCB(). Then, print the value (i.e. activity per unit ) and standard uncertainty for each sample type."
+    "    def __call__(self, tfm):\n",
+    "        for df in tfm.dfs.values():\n",
+    "            df['uncertainty'] = self.fn_convert_unc(df, self.col_unc)"
    ]
   },
   {
@@ -1781,18 +1721,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "                                                    seawater  biota\n",
-      "Number of rows in dfs                                  18856  15314\n",
-      "Number of rows in tfm.dfs                              18308  15314\n",
-      "Number of dropped rows                                   548      0\n",
-      "Number of rows in tfm.dfs + Number of dropped rows     18856  15314 \n",
       "\n",
+      "seawater:\n",
       "   value  uncertainty\n",
       "0   0.20          NaN\n",
       "1   0.27          NaN\n",
       "2   0.26          NaN\n",
       "3   0.25          NaN\n",
       "4   0.20          NaN\n",
+      "\n",
+      "biota:\n",
       "     value  uncertainty\n",
       "0   0.3510        0.033\n",
       "1  39.0000        7.500\n",
@@ -1807,32 +1745,806 @@
     "dfs = load_data(fname_in)\n",
     "tfm = Transformer(dfs, cbs=[       \n",
     "                            SanitizeValue(),               \n",
-    "                            NormalizeUncCB(unc_exp2stan),\n",
-    "                            CompareDfsAndTfmCB(dfs)\n",
+    "                            NormalizeUncCB()\n",
     "                            ])\n",
-    "\n",
-    "\n",
     "tfm()\n",
-    "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n",
     "\n",
-    "print(tfm.dfs['seawater'][['value', 'uncertainty']][:5])\n",
-    "print(tfm.dfs['biota'][['value', 'uncertainty']][:5])\n"
+    "for grp in ['seawater', 'biota']:\n",
+    "    print(f'\\n{grp}:')\n",
+    "    print(tfm.dfs[grp][['value', 'uncertainty']].head())"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "61c204ca",
+   "id": "96d25e19",
    "metadata": {},
    "source": [
-    "***"
+    "## Remap Biota species"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "96d25e19",
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9dcc466",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>index</th>\n",
+       "      <th>value</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Unknown</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Homarus gammarus</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>SPRATTUS SPRATTUS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>Anarhichas denticulatus</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>MOLVA MOLVA</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>151</th>\n",
+       "      <td>151</td>\n",
+       "      <td>MELANOGRAMMUS AEGLEFINUS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>152</th>\n",
+       "      <td>152</td>\n",
+       "      <td>MERLUCCIUS MERLUCCIUS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>153</th>\n",
+       "      <td>153</td>\n",
+       "      <td>PECTEN MAXIMUS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>154</th>\n",
+       "      <td>154</td>\n",
+       "      <td>LITTORINA LITTOREA</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>155</th>\n",
+       "      <td>155</td>\n",
+       "      <td>Pleuronectes platessa</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>156 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     index                     value\n",
+       "0        0                   Unknown\n",
+       "1        1          Homarus gammarus\n",
+       "2        2         SPRATTUS SPRATTUS\n",
+       "3        3   Anarhichas denticulatus\n",
+       "4        4               MOLVA MOLVA\n",
+       "..     ...                       ...\n",
+       "151    151  MELANOGRAMMUS AEGLEFINUS\n",
+       "152    152     MERLUCCIUS MERLUCCIUS\n",
+       "153    153            PECTEN MAXIMUS\n",
+       "154    154        LITTORINA LITTOREA\n",
+       "155    155     Pleuronectes platessa\n",
+       "\n",
+       "[156 rows x 2 columns]"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dfs = load_data(fname_in)\n",
+    "get_unique_across_dfs(dfs, col_name='Species', as_df=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f6e4a3d",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "### Lookup transformations "
+    "#| eval: false\n",
+    "remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='Species', as_df=True),\n",
+    "                     maris_lut_fn=species_lut_path,\n",
+    "                    maris_col_id='species_id',\n",
+    "                    maris_col_name='species',\n",
+    "                    provider_col_to_match='value',\n",
+    "                    provider_col_key='value',\n",
+    "                    fname_cache='species_ospar.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cb98f5b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processing: 100%|██████████| 156/156 [00:23<00:00,  6.65it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>matched_maris_name</th>\n",
+       "      <th>source_name</th>\n",
+       "      <th>match_score</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>source_key</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>RHODYMENIA PSEUDOPALAMATA &amp; PALMARIA PALMATA</th>\n",
+       "      <td>Lomentaria catenata</td>\n",
+       "      <td>RHODYMENIA PSEUDOPALAMATA &amp; PALMARIA PALMATA</td>\n",
+       "      <td>31</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Mixture of green, red and brown algae</th>\n",
+       "      <td>Mercenaria mercenaria</td>\n",
+       "      <td>Mixture of green, red and brown algae</td>\n",
+       "      <td>26</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Solea solea (S.vulgaris)</th>\n",
+       "      <td>Loligo vulgaris</td>\n",
+       "      <td>Solea solea (S.vulgaris)</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>SOLEA SOLEA (S.VULGARIS)</th>\n",
+       "      <td>Loligo vulgaris</td>\n",
+       "      <td>SOLEA SOLEA (S.VULGARIS)</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>CERASTODERMA (CARDIUM) EDULE</th>\n",
+       "      <td>Cerastoderma edule</td>\n",
+       "      <td>CERASTODERMA (CARDIUM) EDULE</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Cerastoderma (Cardium) Edule</th>\n",
+       "      <td>Cerastoderma edule</td>\n",
+       "      <td>Cerastoderma (Cardium) Edule</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NUCELLA LAPILLUS</th>\n",
+       "      <td>Mugil cephalus</td>\n",
+       "      <td>NUCELLA LAPILLUS</td>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>DICENTRARCHUS (MORONE) LABRAX</th>\n",
+       "      <td>Dicentrarchus labrax</td>\n",
+       "      <td>DICENTRARCHUS (MORONE) LABRAX</td>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MONODONTA LINEATA</th>\n",
+       "      <td>Ophiothrix lineata</td>\n",
+       "      <td>MONODONTA LINEATA</td>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Pleuronectiformes [order]</th>\n",
+       "      <td>Pleuronectiformes</td>\n",
+       "      <td>Pleuronectiformes [order]</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>RAJIDAE/BATOIDEA</th>\n",
+       "      <td>Batoidea</td>\n",
+       "      <td>RAJIDAE/BATOIDEA</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PALMARIA PALMATA</th>\n",
+       "      <td>Alaria marginata</td>\n",
+       "      <td>PALMARIA PALMATA</td>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Rhodymenia spp.</th>\n",
+       "      <td>Rhodymenia</td>\n",
+       "      <td>Rhodymenia spp.</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Sepia spp.</th>\n",
+       "      <td>Sepia</td>\n",
+       "      <td>Sepia spp.</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>unknown</th>\n",
+       "      <td>Plankton</td>\n",
+       "      <td>unknown</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>RAJA DIPTURUS BATIS</th>\n",
+       "      <td>Dipturus batis</td>\n",
+       "      <td>RAJA DIPTURUS BATIS</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Flatfish</th>\n",
+       "      <td>Lambia</td>\n",
+       "      <td>Flatfish</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Unknown</th>\n",
+       "      <td>Plankton</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>FUCUS SPP.</th>\n",
+       "      <td>Fucus</td>\n",
+       "      <td>FUCUS SPP.</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Tapes sp.</th>\n",
+       "      <td>Tapes</td>\n",
+       "      <td>Tapes sp.</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Fucus sp.</th>\n",
+       "      <td>Fucus</td>\n",
+       "      <td>Fucus sp.</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Patella sp.</th>\n",
+       "      <td>Patella aspera</td>\n",
+       "      <td>Patella sp.</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>FUCUS spp</th>\n",
+       "      <td>Fucus</td>\n",
+       "      <td>FUCUS spp</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Gadus sp.</th>\n",
+       "      <td>Gadus</td>\n",
+       "      <td>Gadus sp.</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>RHODYMENIA spp</th>\n",
+       "      <td>Rhodymenia</td>\n",
+       "      <td>RHODYMENIA spp</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Thunnus sp.</th>\n",
+       "      <td>Thunnus</td>\n",
+       "      <td>Thunnus sp.</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PECTINIDAE</th>\n",
+       "      <td>Buccinidae</td>\n",
+       "      <td>PECTINIDAE</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Gaidropsarus argenteus</th>\n",
+       "      <td>Gaidropsarus argentatus</td>\n",
+       "      <td>Gaidropsarus argenteus</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PLUERONECTES PLATESSA</th>\n",
+       "      <td>Pleuronectes platessa</td>\n",
+       "      <td>PLUERONECTES PLATESSA</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ASCOPHYLLUN NODOSUM</th>\n",
+       "      <td>Ascophyllum nodosum</td>\n",
+       "      <td>ASCOPHYLLUN NODOSUM</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Sebastes vivipares</th>\n",
+       "      <td>Sebastes viviparus</td>\n",
+       "      <td>Sebastes vivipares</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                   matched_maris_name  \\\n",
+       "source_key                                                              \n",
+       "RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA      Lomentaria catenata   \n",
+       "Mixture of green, red and brown algae           Mercenaria mercenaria   \n",
+       "Solea solea (S.vulgaris)                              Loligo vulgaris   \n",
+       "SOLEA SOLEA (S.VULGARIS)                              Loligo vulgaris   \n",
+       "CERASTODERMA (CARDIUM) EDULE                       Cerastoderma edule   \n",
+       "Cerastoderma (Cardium) Edule                       Cerastoderma edule   \n",
+       "NUCELLA LAPILLUS                                       Mugil cephalus   \n",
+       "DICENTRARCHUS (MORONE) LABRAX                    Dicentrarchus labrax   \n",
+       "MONODONTA LINEATA                                  Ophiothrix lineata   \n",
+       "Pleuronectiformes [order]                           Pleuronectiformes   \n",
+       "RAJIDAE/BATOIDEA                                             Batoidea   \n",
+       "PALMARIA PALMATA                                     Alaria marginata   \n",
+       "Rhodymenia spp.                                            Rhodymenia   \n",
+       "Sepia spp.                                                      Sepia   \n",
+       "unknown                                                      Plankton   \n",
+       "RAJA DIPTURUS BATIS                                    Dipturus batis   \n",
+       "Flatfish                                                       Lambia   \n",
+       "Unknown                                                      Plankton   \n",
+       "FUCUS SPP.                                                      Fucus   \n",
+       "Tapes sp.                                                       Tapes   \n",
+       "Fucus sp.                                                       Fucus   \n",
+       "Patella sp.                                            Patella aspera   \n",
+       "FUCUS spp                                                       Fucus   \n",
+       "Gadus sp.                                                       Gadus   \n",
+       "RHODYMENIA spp                                             Rhodymenia   \n",
+       "Thunnus sp.                                                   Thunnus   \n",
+       "PECTINIDAE                                                 Buccinidae   \n",
+       "Gaidropsarus argenteus                        Gaidropsarus argentatus   \n",
+       "PLUERONECTES PLATESSA                           Pleuronectes platessa   \n",
+       "ASCOPHYLLUN NODOSUM                               Ascophyllum nodosum   \n",
+       "Sebastes vivipares                                 Sebastes viviparus   \n",
+       "\n",
+       "                                                                               source_name  \\\n",
+       "source_key                                                                                   \n",
+       "RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA  RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA   \n",
+       "Mixture of green, red and brown algae                Mixture of green, red and brown algae   \n",
+       "Solea solea (S.vulgaris)                                          Solea solea (S.vulgaris)   \n",
+       "SOLEA SOLEA (S.VULGARIS)                                          SOLEA SOLEA (S.VULGARIS)   \n",
+       "CERASTODERMA (CARDIUM) EDULE                                  CERASTODERMA (CARDIUM) EDULE   \n",
+       "Cerastoderma (Cardium) Edule                                  Cerastoderma (Cardium) Edule   \n",
+       "NUCELLA LAPILLUS                                                          NUCELLA LAPILLUS   \n",
+       "DICENTRARCHUS (MORONE) LABRAX                                DICENTRARCHUS (MORONE) LABRAX   \n",
+       "MONODONTA LINEATA                                                        MONODONTA LINEATA   \n",
+       "Pleuronectiformes [order]                                        Pleuronectiformes [order]   \n",
+       "RAJIDAE/BATOIDEA                                                          RAJIDAE/BATOIDEA   \n",
+       "PALMARIA PALMATA                                                          PALMARIA PALMATA   \n",
+       "Rhodymenia spp.                                                            Rhodymenia spp.   \n",
+       "Sepia spp.                                                                      Sepia spp.   \n",
+       "unknown                                                                            unknown   \n",
+       "RAJA DIPTURUS BATIS                                                    RAJA DIPTURUS BATIS   \n",
+       "Flatfish                                                                          Flatfish   \n",
+       "Unknown                                                                            Unknown   \n",
+       "FUCUS SPP.                                                                      FUCUS SPP.   \n",
+       "Tapes sp.                                                                        Tapes sp.   \n",
+       "Fucus sp.                                                                        Fucus sp.   \n",
+       "Patella sp.                                                                    Patella sp.   \n",
+       "FUCUS spp                                                                        FUCUS spp   \n",
+       "Gadus sp.                                                                        Gadus sp.   \n",
+       "RHODYMENIA spp                                                              RHODYMENIA spp   \n",
+       "Thunnus sp.                                                                    Thunnus sp.   \n",
+       "PECTINIDAE                                                                      PECTINIDAE   \n",
+       "Gaidropsarus argenteus                                              Gaidropsarus argenteus   \n",
+       "PLUERONECTES PLATESSA                                                PLUERONECTES PLATESSA   \n",
+       "ASCOPHYLLUN NODOSUM                                                    ASCOPHYLLUN NODOSUM   \n",
+       "Sebastes vivipares                                                      Sebastes vivipares   \n",
+       "\n",
+       "                                              match_score  \n",
+       "source_key                                                 \n",
+       "RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA           31  \n",
+       "Mixture of green, red and brown algae                  26  \n",
+       "Solea solea (S.vulgaris)                               12  \n",
+       "SOLEA SOLEA (S.VULGARIS)                               12  \n",
+       "CERASTODERMA (CARDIUM) EDULE                           10  \n",
+       "Cerastoderma (Cardium) Edule                           10  \n",
+       "NUCELLA LAPILLUS                                        9  \n",
+       "DICENTRARCHUS (MORONE) LABRAX                           9  \n",
+       "MONODONTA LINEATA                                       9  \n",
+       "Pleuronectiformes [order]                               8  \n",
+       "RAJIDAE/BATOIDEA                                        8  \n",
+       "PALMARIA PALMATA                                        7  \n",
+       "Rhodymenia spp.                                         5  \n",
+       "Sepia spp.                                              5  \n",
+       "unknown                                                 5  \n",
+       "RAJA DIPTURUS BATIS                                     5  \n",
+       "Flatfish                                                5  \n",
+       "Unknown                                                 5  \n",
+       "FUCUS SPP.                                              5  \n",
+       "Tapes sp.                                               4  \n",
+       "Fucus sp.                                               4  \n",
+       "Patella sp.                                             4  \n",
+       "FUCUS spp                                               4  \n",
+       "Gadus sp.                                               4  \n",
+       "RHODYMENIA spp                                          4  \n",
+       "Thunnus sp.                                             4  \n",
+       "PECTINIDAE                                              3  \n",
+       "Gaidropsarus argenteus                                  2  \n",
+       "PLUERONECTES PLATESSA                                   2  \n",
+       "ASCOPHYLLUN NODOSUM                                     1  \n",
+       "Sebastes vivipares                                      1  "
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "remapper.generate_lookup_table(as_df=True)\n",
+    "remapper.select_match(match_score_threshold=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f073cc1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#|exports\n",
+    "fixes_biota_species = {\n",
+    "    'PECTINIDAE': NA, # Dropped. In WoRMS, PECTINIDAE is a family.\n",
+    "    'Unknown': NA,\n",
+    "    'unknown': NA,\n",
+    "    'PALMARIA PALMATA': NA, # Dropped. In WoRMS as 'Palmaria palmata (Linnaeus) F.Weber & D.Mohr, 1805'.\n",
+    "    'RAJIDAE/BATOIDEA': NA, # Mix \n",
+    "    'MONODONTA LINEATA': 'Phorcus lineatus',\n",
+    "    'NUCELLA LAPILLUS': NA, # Dropped. In WoRMS as 'Nucella lapillus (Linnaeus, 1758)'.\n",
+    "    'SOLEA SOLEA (S.VULGARIS)': 'Solea solea',\n",
+    "    'Solea solea (S.vulgaris)': 'Solea solea',\n",
+    "    'Mixture of green, red and brown algae': NA, # Mix \n",
+    "    'RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA': NA, # Mix\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fdc3e95a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processing: 100%|██████████| 156/156 [00:23<00:00,  6.66it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>matched_maris_name</th>\n",
+       "      <th>source_name</th>\n",
+       "      <th>match_score</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>source_key</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>CERASTODERMA (CARDIUM) EDULE</th>\n",
+       "      <td>Cerastoderma edule</td>\n",
+       "      <td>CERASTODERMA (CARDIUM) EDULE</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Cerastoderma (Cardium) Edule</th>\n",
+       "      <td>Cerastoderma edule</td>\n",
+       "      <td>Cerastoderma (Cardium) Edule</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>DICENTRARCHUS (MORONE) LABRAX</th>\n",
+       "      <td>Dicentrarchus labrax</td>\n",
+       "      <td>DICENTRARCHUS (MORONE) LABRAX</td>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Pleuronectiformes [order]</th>\n",
+       "      <td>Pleuronectiformes</td>\n",
+       "      <td>Pleuronectiformes [order]</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>FUCUS SPP.</th>\n",
+       "      <td>Fucus</td>\n",
+       "      <td>FUCUS SPP.</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Flatfish</th>\n",
+       "      <td>Lambia</td>\n",
+       "      <td>Flatfish</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Sepia spp.</th>\n",
+       "      <td>Sepia</td>\n",
+       "      <td>Sepia spp.</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Rhodymenia spp.</th>\n",
+       "      <td>Rhodymenia</td>\n",
+       "      <td>Rhodymenia spp.</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>RAJA DIPTURUS BATIS</th>\n",
+       "      <td>Dipturus batis</td>\n",
+       "      <td>RAJA DIPTURUS BATIS</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>RHODYMENIA spp</th>\n",
+       "      <td>Rhodymenia</td>\n",
+       "      <td>RHODYMENIA spp</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Thunnus sp.</th>\n",
+       "      <td>Thunnus</td>\n",
+       "      <td>Thunnus sp.</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>FUCUS spp</th>\n",
+       "      <td>Fucus</td>\n",
+       "      <td>FUCUS spp</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Gadus sp.</th>\n",
+       "      <td>Gadus</td>\n",
+       "      <td>Gadus sp.</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Fucus sp.</th>\n",
+       "      <td>Fucus</td>\n",
+       "      <td>Fucus sp.</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Tapes sp.</th>\n",
+       "      <td>Tapes</td>\n",
+       "      <td>Tapes sp.</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Patella sp.</th>\n",
+       "      <td>Patella aspera</td>\n",
+       "      <td>Patella sp.</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Gaidropsarus argenteus</th>\n",
+       "      <td>Gaidropsarus argentatus</td>\n",
+       "      <td>Gaidropsarus argenteus</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PLUERONECTES PLATESSA</th>\n",
+       "      <td>Pleuronectes platessa</td>\n",
+       "      <td>PLUERONECTES PLATESSA</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ASCOPHYLLUN NODOSUM</th>\n",
+       "      <td>Ascophyllum nodosum</td>\n",
+       "      <td>ASCOPHYLLUN NODOSUM</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Sebastes vivipares</th>\n",
+       "      <td>Sebastes viviparus</td>\n",
+       "      <td>Sebastes vivipares</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                    matched_maris_name  \\\n",
+       "source_key                                               \n",
+       "CERASTODERMA (CARDIUM) EDULE        Cerastoderma edule   \n",
+       "Cerastoderma (Cardium) Edule        Cerastoderma edule   \n",
+       "DICENTRARCHUS (MORONE) LABRAX     Dicentrarchus labrax   \n",
+       "Pleuronectiformes [order]            Pleuronectiformes   \n",
+       "FUCUS SPP.                                       Fucus   \n",
+       "Flatfish                                        Lambia   \n",
+       "Sepia spp.                                       Sepia   \n",
+       "Rhodymenia spp.                             Rhodymenia   \n",
+       "RAJA DIPTURUS BATIS                     Dipturus batis   \n",
+       "RHODYMENIA spp                              Rhodymenia   \n",
+       "Thunnus sp.                                    Thunnus   \n",
+       "FUCUS spp                                        Fucus   \n",
+       "Gadus sp.                                        Gadus   \n",
+       "Fucus sp.                                        Fucus   \n",
+       "Tapes sp.                                        Tapes   \n",
+       "Patella sp.                             Patella aspera   \n",
+       "Gaidropsarus argenteus         Gaidropsarus argentatus   \n",
+       "PLUERONECTES PLATESSA            Pleuronectes platessa   \n",
+       "ASCOPHYLLUN NODOSUM                Ascophyllum nodosum   \n",
+       "Sebastes vivipares                  Sebastes viviparus   \n",
+       "\n",
+       "                                                 source_name  match_score  \n",
+       "source_key                                                                 \n",
+       "CERASTODERMA (CARDIUM) EDULE    CERASTODERMA (CARDIUM) EDULE           10  \n",
+       "Cerastoderma (Cardium) Edule    Cerastoderma (Cardium) Edule           10  \n",
+       "DICENTRARCHUS (MORONE) LABRAX  DICENTRARCHUS (MORONE) LABRAX            9  \n",
+       "Pleuronectiformes [order]          Pleuronectiformes [order]            8  \n",
+       "FUCUS SPP.                                        FUCUS SPP.            5  \n",
+       "Flatfish                                            Flatfish            5  \n",
+       "Sepia spp.                                        Sepia spp.            5  \n",
+       "Rhodymenia spp.                              Rhodymenia spp.            5  \n",
+       "RAJA DIPTURUS BATIS                      RAJA DIPTURUS BATIS            5  \n",
+       "RHODYMENIA spp                                RHODYMENIA spp            4  \n",
+       "Thunnus sp.                                      Thunnus sp.            4  \n",
+       "FUCUS spp                                          FUCUS spp            4  \n",
+       "Gadus sp.                                          Gadus sp.            4  \n",
+       "Fucus sp.                                          Fucus sp.            4  \n",
+       "Tapes sp.                                          Tapes sp.            4  \n",
+       "Patella sp.                                      Patella sp.            4  \n",
+       "Gaidropsarus argenteus                Gaidropsarus argenteus            2  \n",
+       "PLUERONECTES PLATESSA                  PLUERONECTES PLATESSA            2  \n",
+       "ASCOPHYLLUN NODOSUM                      ASCOPHYLLUN NODOSUM            1  \n",
+       "Sebastes vivipares                        Sebastes vivipares            1  "
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#| eval: false\n",
+    "remapper.generate_lookup_table(fixes=fixes_biota_species)\n",
+    "remapper.select_match(match_score_threshold=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a70cafab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TO BE DONE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3a74941",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| exports\n",
+    "# class RemapBiotaSpeciesCB(Callback):\n",
+    "#     \"Biota species standardized to MARIS format.\"\n",
+    "#     def __init__(self, \n",
+    "#                  fn_lut:Callable # Function that returns the lookup table dictionary\n",
+    "#                 ):\n",
+    "#         fc.store_attr()\n",
+    "\n",
+    "#     def __call__(self, tfm):\n",
+    "#         \"Remap biota species names in the DataFrame using the lookup table and print unmatched RUBIN values.\"\n",
+    "#         lut = self.fn_lut()\n",
+    "#         tfm.dfs['biota']['species'] = tfm.dfs['biota']['RUBIN'].apply(lambda x: self._get_species(x, lut))\n",
+    "\n",
+    "#     def _get_species(self, \n",
+    "#                      rubin_value:str, # The RUBIN value from the DataFrame\n",
+    "#                      lut:dict # The lookup table dictionary\n",
+    "#                     ):\n",
+    "#         \"Get the matched_id from the lookup table and print RUBIN if the matched_id is -1.\"\n",
+    "#         match = lut.get(rubin_value.strip(), Match(-1, None, None, None))\n",
+    "#         if match.matched_id == -1:\n",
+    "#             self.print_unmatched_rubin(rubin_value)\n",
+    "#         return match.matched_id\n",
+    "\n",
+    "#     def print_unmatched_rubin(self, \n",
+    "#                               rubin_value: str # The RUBIN value from the DataFrame\n",
+    "#                              ):\n",
+    "#         \"Print the RUBIN value if the matched_id is -1.\"\n",
+    "#         print(f\"Unmatched RUBIN: {rubin_value}\")"
    ]
   },
   {
diff --git a/nbs/handlers/helcom.ipynb b/nbs/handlers/helcom.ipynb
index f87518d..6842dbc 100644
--- a/nbs/handlers/helcom.ipynb
+++ b/nbs/handlers/helcom.ipynb
@@ -1700,6 +1700,14 @@
     "## Sanitize value"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "6de49e39",
+   "metadata": {},
+   "source": [
+    "We consolidate the columns containing measurement values (named differently across sample types) into a single `value` column and remove NA values where needed."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1708,24 +1716,11 @@
    "outputs": [],
    "source": [
     "#| exports\n",
-    "# Columns of interest\n",
     "coi_val = {'seawater' : {'val': 'VALUE_Bq/m³'},\n",
     "           'biota':  {'val': 'VALUE_Bq/kg'},\n",
     "           'sediment': {'val': 'VALUE_Bq/kg'}}"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "18c59ae1-c523-4aa6-bc04-e824390bf06d",
-   "metadata": {},
-   "source": [
-    "**Comment (FA)**: Those lines can be simplified I think:\n",
-    "```\n",
-    "value_col = self.coi.get(grp, {}).get('val')\n",
-    "if value_col and value_col in df.columns:\n",
-    "```"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,