From 5119a327c7b5fd6e29c603ec08ff1b05e0a12f72 Mon Sep 17 00:00:00 2001 From: Franck Albinet Date: Tue, 24 Sep 2024 09:57:55 +0200 Subject: [PATCH] add callback to remove nan values in data provider submitted dataset --- marisco/_modidx.py | 6 + marisco/callbacks.py | 22 +- nbs/api/callbacks.ipynb | 22 + nbs/handlers/_ospar.ipynb | 1236 ++++++++++++++++++------------------- 4 files changed, 655 insertions(+), 631 deletions(-) diff --git a/marisco/_modidx.py b/marisco/_modidx.py index 7884151..bed3ac5 100644 --- a/marisco/_modidx.py +++ b/marisco/_modidx.py @@ -42,6 +42,12 @@ 'marisco/callbacks.py'), 'marisco.callbacks.LowerStripNameCB._safe_transform': ( 'api/callbacks.html#lowerstripnamecb._safe_transform', 'marisco/callbacks.py'), + 'marisco.callbacks.RemoveAllNAValuesCB': ( 'api/callbacks.html#removeallnavaluescb', + 'marisco/callbacks.py'), + 'marisco.callbacks.RemoveAllNAValuesCB.__call__': ( 'api/callbacks.html#removeallnavaluescb.__call__', + 'marisco/callbacks.py'), + 'marisco.callbacks.RemoveAllNAValuesCB.__init__': ( 'api/callbacks.html#removeallnavaluescb.__init__', + 'marisco/callbacks.py'), 'marisco.callbacks.ReshapeLongToWide': ('api/callbacks.html#reshapelongtowide', 'marisco/callbacks.py'), 'marisco.callbacks.ReshapeLongToWide.__call__': ( 'api/callbacks.html#reshapelongtowide.__call__', 'marisco/callbacks.py'), diff --git a/marisco/callbacks.py b/marisco/callbacks.py index eba96f5..572582c 100644 --- a/marisco/callbacks.py +++ b/marisco/callbacks.py @@ -4,7 +4,7 @@ # %% auto 0 __all__ = ['Callback', 'run_cbs', 'Transformer', 'SanitizeLonLatCB', 'AddSampleTypeIdColumnCB', 'AddNuclideIdColumnCB', - 'LowerStripNameCB', 'ReshapeLongToWide', 'CompareDfsAndTfmCB', 'EncodeTimeCB'] + 'LowerStripNameCB', 'RemoveAllNAValuesCB', 'ReshapeLongToWide', 'CompareDfsAndTfmCB', 'EncodeTimeCB'] # %% ../nbs/api/callbacks.ipynb 2 import copy @@ -128,7 +128,21 @@ def __call__(self, tfm): for key in tfm.dfs.keys(): tfm.dfs[key][self.col_dst] = 
tfm.dfs[key][self.col_src].apply(self._safe_transform) -# %% ../nbs/api/callbacks.ipynb 32 +# %% ../nbs/api/callbacks.ipynb 31 +class RemoveAllNAValuesCB(Callback): + "Remove rows with all NA values." + def __init__(self, + cols_to_check:dict # A dictionary with the sample type as key and the column name to check as value + ): + fc.store_attr() + + def __call__(self, tfm): + for k in tfm.dfs.keys(): + col_to_check = self.cols_to_check[k] + mask = tfm.dfs[k][col_to_check].isnull().all(axis=1) + tfm.dfs[k] = tfm.dfs[k][~mask] + +# %% ../nbs/api/callbacks.ipynb 33 class ReshapeLongToWide(Callback): def __init__(self, columns=['nuclide'], values=['value'], num_fill_value=-999, str_fill_value='STR FILL VALUE'): @@ -182,7 +196,7 @@ def __call__(self, tfm): tfm.dfs[grp] = self.pivot(tfm.dfs[grp]) tfm.dfs[grp].columns = self.renamed_cols(tfm.dfs[grp].columns) -# %% ../nbs/api/callbacks.ipynb 34 +# %% ../nbs/api/callbacks.ipynb 35 class CompareDfsAndTfmCB(Callback): def __init__(self, dfs: Dict[str, pd.DataFrame]): "Create a dataframe of dropped data. Data included in the `dfs` not in the `tfm`." 
@@ -219,7 +233,7 @@ def _compute_stats(self, 'Number of rows in tfm.dfs + Number of dropped rows': len(tfm.dfs[grp].index) + len(tfm.dfs_dropped[grp].index) } -# %% ../nbs/api/callbacks.ipynb 39 +# %% ../nbs/api/callbacks.ipynb 40 class EncodeTimeCB(Callback): "Encode time as `int` representing seconds since xxx" def __init__(self, cfg , verbose=False): fc.store_attr() diff --git a/nbs/api/callbacks.ipynb b/nbs/api/callbacks.ipynb index a89ac51..6efa888 100644 --- a/nbs/api/callbacks.ipynb +++ b/nbs/api/callbacks.ipynb @@ -464,6 +464,28 @@ "## Change structure" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "701be72f", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class RemoveAllNAValuesCB(Callback):\n", + " \"Remove rows with all NA values.\"\n", + " def __init__(self, \n", + " cols_to_check:dict # A dictionary with the sample type as key and the column name to check as value\n", + " ):\n", + " fc.store_attr()\n", + "\n", + " def __call__(self, tfm):\n", + " for k in tfm.dfs.keys():\n", + " col_to_check = self.cols_to_check[k]\n", + " mask = tfm.dfs[k][col_to_check].isnull().all(axis=1)\n", + " tfm.dfs[k] = tfm.dfs[k][~mask]" + ] + }, { "cell_type": "markdown", "id": "3a32c3bc", diff --git a/nbs/handlers/_ospar.ipynb b/nbs/handlers/_ospar.ipynb index 6f60866..f772a76 100644 --- a/nbs/handlers/_ospar.ipynb +++ b/nbs/handlers/_ospar.ipynb @@ -57,7 +57,16 @@ "execution_count": null, "id": "f69f5756", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "#| hide\n", "%load_ext autoreload\n", @@ -82,18 +91,57 @@ "from collections import OrderedDict, defaultdict\n", "import re\n", "\n", - "from marisco.utils import (has_valid_varname, match_worms, Remapper, ddmm_to_dd,\n", - " match_maris_lut, Match, get_unique_across_dfs)\n", - "from marisco.callbacks import (Callback, Transformer, EncodeTimeCB, AddSampleTypeIdColumnCB,\n", - " AddNuclideIdColumnCB, LowerStripNameCB, SanitizeLonLatCB, \n", - " ReshapeLongToWide, CompareDfsAndTfmCB)\n", - "from marisco.metadata import (GlobAttrsFeeder, BboxCB, DepthRangeCB, \n", - " TimeRangeCB, ZoteroCB, KeyValuePairCB)\n", - "from marisco.configs import (nuc_lut_path, nc_tpl_path, cfg, cache_path, \n", - " cdl_cfg, Enums, lut_path, species_lut_path, \n", - " sediments_lut_path, bodyparts_lut_path, \n", - " detection_limit_lut_path, filtered_lut_path, \n", - " area_lut_path, get_lut, unit_lut_path)\n", + "from marisco.utils import (\n", + " has_valid_varname, \n", + " match_worms, \n", + " Remapper, \n", + " ddmm_to_dd,\n", + " match_maris_lut, \n", + " Match, \n", + " get_unique_across_dfs\n", + " )\n", + "\n", + "from marisco.callbacks import (\n", + " Callback, \n", + " Transformer, \n", + " RemoveAllNAValuesCB,\n", + " EncodeTimeCB, \n", + " AddSampleTypeIdColumnCB,\n", + " AddNuclideIdColumnCB, \n", + " LowerStripNameCB, \n", + " SanitizeLonLatCB, \n", + " ReshapeLongToWide, \n", + " CompareDfsAndTfmCB,\n", + " RemoveAllNAValuesCB\n", + " )\n", + "\n", + "from marisco.metadata import (\n", + " GlobAttrsFeeder, \n", + " BboxCB, \n", + " DepthRangeCB, \n", + " TimeRangeCB, \n", + " ZoteroCB, \n", + " KeyValuePairCB\n", + " )\n", + "\n", + "from marisco.configs import (\n", + " nuc_lut_path, \n", + " nc_tpl_path, \n", + " cfg, \n", + " cache_path, \n", + " cdl_cfg, \n", + " Enums, \n", + " lut_path, \n", + " species_lut_path, \n", + " sediments_lut_path, \n", + " bodyparts_lut_path, \n", + " 
detection_limit_lut_path, \n", + " filtered_lut_path, \n", + " area_lut_path,\n", + " get_lut,\n", + " unit_lut_path\n", + " )\n", + "\n", "from marisco.serializers import NetCDFEncoder, OpenRefineCsvEncoder\n", "\n", "import warnings\n", @@ -309,6 +357,95 @@ " print(f'{key} columns: ', dfs[key].columns)" ] }, + { + "cell_type": "markdown", + "id": "aaa55881", + "metadata": {}, + "source": [ + "## Remove missing data" + ] + }, + { + "cell_type": "markdown", + "id": "d3936614", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: The `Seawater` dataset contains 538 rows with all NA values as shown below.\n", + "\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc0e120f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "seawater: 538 rows with all NA values\n", + "biota: 0 rows with all NA values\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "for key in dfs.keys():\n", + " cols_to_check = dfs[key].columns[1:]\n", + " mask = dfs[key][cols_to_check].isnull().all(axis=1)\n", + " print(f'{key}: {mask.sum()} rows with all NA values')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89d292e2", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "common_cols = [\n", + " 'Contracting Party', 'RSC Sub-division', 'Station ID', 'Sample ID',\n", + " 'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM', 'LongS', 'LongDir',\n", + " 'Sample type', 'Sampling date', 'Nuclide', 'Value type', 'Activity or MDA',\n", + " 'Uncertainty', 'Unit', 'Data provider', 'Measurement Comment',\n", + " 'Sample Comment', 'Reference Comment'\n", + "]\n", + "\n", + "cols_to_check = {\n", + " 'seawater': common_cols + ['Sampling depth'],\n", + " 'biota': common_cols + ['Biological group', 'Species', 'Body Part']\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "97cb5905", + "metadata": 
{}, + "source": [ + "Let's use the `RemoveAllNAValuesCB` callback to remove all rows with all NA values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5beea658", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[RemoveAllNAValuesCB(cols_to_check)])\n", + "\n", + "# Test that all NA values have been removed\n", + "fc.test_eq(tfm()['seawater'][cols_to_check['seawater']].isnull().all(axis=1).sum(), 0)" + ] + }, { "cell_type": "markdown", "id": "8326e234", @@ -379,31 +516,53 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "b5d8e77b", + "attachments": {}, + "cell_type": "markdown", + "id": "52c9d0fe", + "metadata": {}, + "source": [ + "### Remap nuclide names to MARIS data formats" + ] + }, + { + "cell_type": "markdown", + "id": "d9ff7a3f", + "metadata": {}, + "source": [ + "Below, we map the nuclide names used by OSPAR to the MARIS standard nuclide names. \n", + "\n", + "Remapping data provider nomenclatures into MARIS standards is a recurrent operation and is done in a semi-automated manner according to the following pattern:\n", + "\n", + "1. **Inspect** data provider nomenclature;\n", + "2. **Match** automatically against MARIS nomenclature (using a fuzzy matching algorithm); \n", + "3. **Fix** potential mismatches; \n", + "4. **Apply** the lookup table to the dataframe.\n", + "\n", + "From now on, we will use this pattern to remap the OSPAR data provider nomenclatures into MARIS standards and, for the sake of brevity, refer to it as **IMFA** (**I**nspect, **M**atch, **F**ix, **A**pply)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "abd510d4", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['137Cs', '239,240Pu', '226Ra', '228Ra', '99Tc', '3H', '210Po',\n", - " '210Pb', nan, 'RA-226', 'RA-228'], dtype=object)" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "load_data(fname_in)['seawater']['Nuclide'].unique()" + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: The `Nuclide` column has inconsistent naming. E.g:\n", + "\n", + "- `Cs-137`, `137Cs` or `CS-137`\n", + "- `239, 240 pu` or `239,240 pu`\n", + "- `ra-226` and `226ra` \n", + "\n", + "See below:\n", + "\n", + ":::" ] }, { "cell_type": "code", "execution_count": null, - "id": "1f324e4a", + "id": "9691ccab", "metadata": {}, "outputs": [ { @@ -429,161 +588,123 @@ " \n", " index\n", " value\n", - " n_chars\n", - " stripped_chars\n", " \n", " \n", " \n", " \n", " 0\n", " 0\n", - " 239,240Pu\n", - " 9.0\n", - " 9.0\n", + " 239, 240 Pu\n", " \n", " \n", " 1\n", " 1\n", - " 210Po\n", - " 5.0\n", - " 5.0\n", + " 137Cs\n", " \n", " \n", " 2\n", " 2\n", - " RA-228\n", - " 6.0\n", - " 6.0\n", + " CS-137\n", " \n", " \n", " 3\n", " 3\n", - " Cs-134\n", - " 6.0\n", - " 6.0\n", + " Cs-137\n", " \n", " \n", " 4\n", " 4\n", - " 239, 240 Pu\n", - " 11.0\n", - " 9.0\n", + " Cs-134\n", " \n", " \n", " 5\n", " 5\n", - " 238Pu\n", - " 5.0\n", - " 5.0\n", + " 210Po\n", " \n", " \n", " 6\n", " 6\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 239,240Pu\n", " \n", " \n", " 7\n", " 7\n", - " 226Ra\n", - " 5.0\n", - " 5.0\n", + " 228Ra\n", " \n", " \n", " 8\n", " 8\n", - " Cs-137\n", - " 6.0\n", - " 6.0\n", + " 210Pb\n", " \n", " \n", " 9\n", " 9\n", - " 210Pb\n", - " 5.0\n", - " 5.0\n", + " RA-228\n", " \n", " \n", " 10\n", " 10\n", - " 3H\n", - " 2.0\n", - " 2.0\n", + " 238Pu\n", " \n", " \n", " 11\n", " 11\n", - " 241Am\n", - " 5.0\n", - " 5.0\n", + " 3H\n", " \n", " \n", " 12\n", " 12\n", - " 228Ra\n", - " 5.0\n", 
- " 5.0\n", + " NaN\n", " \n", " \n", " 13\n", " 13\n", - " 137Cs\n", - " 5.0\n", - " 5.0\n", + " RA-226\n", " \n", " \n", " 14\n", " 14\n", - " CS-137\n", - " 6.0\n", - " 6.0\n", + " 99Tc\n", " \n", " \n", " 15\n", " 15\n", - " RA-226\n", - " 6.0\n", - " 6.0\n", + " 241Am\n", " \n", " \n", " 16\n", " 16\n", " CS-134\n", - " 6.0\n", - " 6.0\n", " \n", " \n", " 17\n", " 17\n", - " 99Tc\n", - " 4.0\n", - " 4.0\n", + " 226Ra\n", " \n", " \n", "\n", "" ], "text/plain": [ - " index value n_chars stripped_chars\n", - "0 0 239,240Pu 9.0 9.0\n", - "1 1 210Po 5.0 5.0\n", - "2 2 RA-228 6.0 6.0\n", - "3 3 Cs-134 6.0 6.0\n", - "4 4 239, 240 Pu 11.0 9.0\n", - "5 5 238Pu 5.0 5.0\n", - "6 6 NaN NaN NaN\n", - "7 7 226Ra 5.0 5.0\n", - "8 8 Cs-137 6.0 6.0\n", - "9 9 210Pb 5.0 5.0\n", - "10 10 3H 2.0 2.0\n", - "11 11 241Am 5.0 5.0\n", - "12 12 228Ra 5.0 5.0\n", - "13 13 137Cs 5.0 5.0\n", - "14 14 CS-137 6.0 6.0\n", - "15 15 RA-226 6.0 6.0\n", - "16 16 CS-134 6.0 6.0\n", - "17 17 99Tc 4.0 4.0" + " index value\n", + "0 0 239, 240 Pu\n", + "1 1 137Cs\n", + "2 2 CS-137\n", + "3 3 Cs-137\n", + "4 4 Cs-134\n", + "5 5 210Po\n", + "6 6 239,240Pu\n", + "7 7 228Ra\n", + "8 8 210Pb\n", + "9 9 RA-228\n", + "10 10 238Pu\n", + "11 11 3H\n", + "12 12 NaN\n", + "13 13 RA-226\n", + "14 14 99Tc\n", + "15 15 241Am\n", + "16 16 CS-134\n", + "17 17 226Ra" ] }, "execution_count": null, @@ -593,324 +714,54 @@ ], "source": [ "#| eval: false\n", - "df = get_unique_across_dfs(load_data(fname_in), 'Nuclide', as_df=True, include_nchars=True)\n", - "df['stripped_chars'] = df['value'].str.strip().str.replace(' ', '').str.len()\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7444a821", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "df['stripped_chars'] = df['value'].str.strip().str.replace(' ', '').str.len()\n", - "print(df[df['n_chars'] != df['stripped_chars']])" + "dfs = load_data(fname_in)\n", + "get_unique_across_dfs(dfs, col_name='Nuclide', as_df=True)" ] }, { 
"cell_type": "markdown", - "id": "f5befd9b", + "id": "f6879cf4", "metadata": {}, "source": [ - "#### Lower & strip nuclide names" + "Let's now create an instance of a fuzzy matching algorithm `Remapper`:" ] }, { - "attachments": {}, - "cell_type": "markdown", - "id": "8a2311cd", + "cell_type": "code", + "execution_count": null, + "id": "582e03a6", "metadata": {}, + "outputs": [], "source": [ - "We use the `LowerStripRdnNameCB` callback. For each dataframe in the dictionary of dataframes, it corrects the nuclide name by converting it lowercase, striping any leading or trailing whitespace(s) and ensuring the number comes before letters (e.g. `137cs`)." + "#| eval: false\n", + "remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='Nuclide', as_df=True),\n", + " maris_lut_fn=nuc_lut_path,\n", + " maris_col_id='nuclide_id',\n", + " maris_col_name='nc_name',\n", + " provider_col_to_match='value',\n", + " provider_col_key='value',\n", + " fname_cache='nuclides_ospar.pkl')" ] }, { "cell_type": "markdown", - "id": "86cdc845", + "id": "857f4cb6", "metadata": {}, "source": [ - "For instance:" + "And try to match OSPAR to MARIS nuclide names as automatically as possible. 
The `match_score` column allows to assess the results:" ] }, { "cell_type": "code", "execution_count": null, - "id": "8a3fa068", + "id": "34f3a398", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "seawater nuclides: \n", - "['137cs' '239,240pu' '226ra' '228ra' '99tc' '3h' '210po' '210pb' nan\n", - " 'ra-226' 'ra-228']\n", - "biota nuclides: \n", - "['239,240pu' '99tc' '137cs' '226ra' '228ra' '238pu' '239, 240 pu' '241am'\n", - " 'cs-137' 'cs-134' '3h' '210pb' '210po']\n" - ] - } - ], - "source": [ - "#|eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(col_src='Nuclide', col_dst='NUCLIDE')])\n", - "print('seawater nuclides: ')\n", - "print(tfm()['seawater']['NUCLIDE'].unique())\n", - "print('biota nuclides: ')\n", - "print(tfm()['biota']['NUCLIDE'].unique())" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "52c9d0fe", - "metadata": {}, - "source": [ - "### Remap nuclide names to MARIS data formats" - ] - }, - { - "cell_type": "markdown", - "id": "d9ff7a3f", - "metadata": {}, - "source": [ - "We below map nuclide names used by OSPAR to the MARIS standard nuclide names. \n", - "\n", - "Remapping data provider nomenclatures into MARIS standards is one recurrent operation and is done in a semi-automated manner according to the following pattern:\n", - "\n", - "1. **Inspect** data provider nomenclature:\n", - "2. **Match** automatically against MARIS nomenclature (using a fuzzy matching algorithm); \n", - "3. **Fix** potential mismatches; \n", - "4. **Apply** the lookup table to the dataframe.\n", - "\n", - "As now on, we will use this pattern to remap the OSPAR data provider nomenclatures into MARIS standards and name it for the sake of brevity **IMFA** (**I**nspect, **M**atch, **F**ix, **A**pply)." 
- ] - }, - { - "cell_type": "markdown", - "id": "abd510d4", - "metadata": {}, - "source": [ - ":::{.callout-tip}\n", - "\n", - "**FEEDBACK TO DATA PROVIDER**: The `Nuclide` column has inconsistent naming. E.g:\n", - "\n", - "- `Cs-137`, `137Cs` or `CS-137`\n", - "- `239, 240 pu` or `239,240 pu`\n", - "- `ra-226` and `226ra` \n", - "\n", - "See below:\n", - "\n", - ":::" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9691ccab", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexvalue
00239,240Pu
11210Po
22RA-228
33Cs-134
44239, 240 Pu
55238Pu
66NaN
77226Ra
88Cs-137
99210Pb
10103H
1111241Am
1212228Ra
1313137Cs
1414CS-137
1515RA-226
1616CS-134
171799Tc
\n", - "
" - ], - "text/plain": [ - " index value\n", - "0 0 239,240Pu\n", - "1 1 210Po\n", - "2 2 RA-228\n", - "3 3 Cs-134\n", - "4 4 239, 240 Pu\n", - "5 5 238Pu\n", - "6 6 NaN\n", - "7 7 226Ra\n", - "8 8 Cs-137\n", - "9 9 210Pb\n", - "10 10 3H\n", - "11 11 241Am\n", - "12 12 228Ra\n", - "13 13 137Cs\n", - "14 14 CS-137\n", - "15 15 RA-226\n", - "16 16 CS-134\n", - "17 17 99Tc" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "get_unique_across_dfs(dfs, col_name='Nuclide', as_df=True)" - ] - }, - { - "cell_type": "markdown", - "id": "f6879cf4", - "metadata": {}, - "source": [ - "Let's now create an instance of a fuzzy matching algorithm `Remapper`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "582e03a6", - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='Nuclide', as_df=True),\n", - " maris_lut_fn=nuc_lut_path,\n", - " maris_col_id='nuclide_id',\n", - " maris_col_name='nc_name',\n", - " provider_col_to_match='value',\n", - " provider_col_key='value',\n", - " fname_cache='nuclides_ospar.pkl')" - ] - }, - { - "cell_type": "markdown", - "id": "857f4cb6", - "metadata": {}, - "source": [ - "And try to match HELCOM to MARIS nuclide names as automatically as possible. 
The `match_score` column allows to assess the results:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "34f3a398", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing: 100%|██████████| 18/18 [00:00<00:00, 51.23it/s]\n" + "Processing: 100%|██████████| 18/18 [00:00<00:00, 41.59it/s]\n" ] }, { @@ -959,9 +810,9 @@ " 6\n", " \n", " \n", - " 210Po\n", - " ag106m\n", - " 210Po\n", + " 241Am\n", + " pu241\n", + " 241Am\n", " 4\n", " \n", " \n", @@ -977,15 +828,15 @@ " 4\n", " \n", " \n", - " 241Am\n", - " pu241\n", - " 241Am\n", + " 210Pb\n", + " ag106m\n", + " 210Pb\n", " 4\n", " \n", " \n", - " 210Pb\n", + " 210Po\n", " ag106m\n", - " 210Pb\n", + " 210Po\n", " 4\n", " \n", " \n", @@ -1013,27 +864,21 @@ " 2\n", " \n", " \n", - " Cs-137\n", - " cs137\n", - " Cs-137\n", - " 1\n", - " \n", - " \n", " Cs-134\n", " cs134\n", " Cs-134\n", " 1\n", " \n", " \n", - " RA-228\n", - " ra228\n", - " RA-228\n", + " Cs-137\n", + " cs137\n", + " Cs-137\n", " 1\n", " \n", " \n", - " CS-137\n", - " cs137\n", - " CS-137\n", + " RA-228\n", + " ra228\n", + " RA-228\n", " 1\n", " \n", " \n", @@ -1043,6 +888,12 @@ " 1\n", " \n", " \n", + " CS-137\n", + " cs137\n", + " CS-137\n", + " 1\n", + " \n", + " \n", " CS-134\n", " cs134\n", " CS-134\n", @@ -1057,20 +908,20 @@ "source_key \n", "239, 240 Pu pu240 239, 240 Pu 8\n", "239,240Pu pu240 239,240Pu 6\n", - "210Po ag106m 210Po 4\n", + "241Am pu241 241Am 4\n", "137Cs h3 137Cs 4\n", "228Ra u238 228Ra 4\n", - "241Am pu241 241Am 4\n", "210Pb ag106m 210Pb 4\n", + "210Po ag106m 210Po 4\n", "226Ra u235 226Ra 4\n", "238Pu u238 238Pu 3\n", "99Tc tu 99Tc 3\n", "3H h3 3H 2\n", - "Cs-137 cs137 Cs-137 1\n", "Cs-134 cs134 Cs-134 1\n", + "Cs-137 cs137 Cs-137 1\n", "RA-228 ra228 RA-228 1\n", - "CS-137 cs137 CS-137 1\n", "RA-226 ra226 RA-226 1\n", + "CS-137 cs137 CS-137 1\n", "CS-134 cs134 CS-134 1" ] }, @@ -1133,7 +984,7 @@ "name": "stderr", "output_type": "stream", "text": [ - 
"Processing: 100%|██████████| 18/18 [00:00<00:00, 53.16it/s]\n" + "Processing: 100%|██████████| 18/18 [00:00<00:00, 50.88it/s]\n" ] }, { @@ -1176,15 +1027,9 @@ " 2\n", " \n", " \n", - " RA-228\n", - " ra228\n", - " RA-228\n", - " 1\n", - " \n", - " \n", - " Cs-134\n", - " cs134\n", - " Cs-134\n", + " CS-137\n", + " cs137\n", + " CS-137\n", " 1\n", " \n", " \n", @@ -1194,9 +1039,15 @@ " 1\n", " \n", " \n", - " CS-137\n", - " cs137\n", - " CS-137\n", + " Cs-134\n", + " cs134\n", + " Cs-134\n", + " 1\n", + " \n", + " \n", + " RA-228\n", + " ra228\n", + " RA-228\n", " 1\n", " \n", " \n", @@ -1219,10 +1070,10 @@ " matched_maris_name source_name match_score\n", "source_key \n", "3H h3 3H 2\n", - "RA-228 ra228 RA-228 1\n", - "Cs-134 cs134 Cs-134 1\n", - "Cs-137 cs137 Cs-137 1\n", "CS-137 cs137 CS-137 1\n", + "Cs-137 cs137 Cs-137 1\n", + "Cs-134 cs134 Cs-134 1\n", + "RA-228 ra228 RA-228 1\n", "RA-226 ra226 RA-226 1\n", "CS-134 cs134 CS-134 1" ] @@ -1323,34 +1174,202 @@ " print(f'{key} NUCLIDE unique: ', dfs_out[key]['NUCLIDE'].unique())" ] }, + { + "cell_type": "markdown", + "id": "a54f21ff", + "metadata": {}, + "source": [ + "### Add Nuclide Id column" + ] + }, + { + "cell_type": "markdown", + "id": "0deedefa", + "metadata": {}, + "source": [ + "The `nuclide_id` column is added to the dataframe for legacy reasons (again Open Refine output)." + ] + }, { "cell_type": "code", "execution_count": null, - "id": "2c5be367", + "id": "635c8f39", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NUCLIDEnuclide_id
0pu239_240_tot77
1tc9915
2pu239_240_tot77
3pu239_240_tot77
4tc9915
.........
15309tc9915
15310pu239_240_tot77
15311cs13733
15312cs13733
15313tc9915
\n", + "

15314 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " NUCLIDE nuclide_id\n", + "0 pu239_240_tot 77\n", + "1 tc99 15\n", + "2 pu239_240_tot 77\n", + "3 pu239_240_tot 77\n", + "4 tc99 15\n", + "... ... ...\n", + "15309 tc99 15\n", + "15310 pu239_240_tot 77\n", + "15311 cs137 33\n", + "15312 cs137 33\n", + "15313 tc99 15\n", + "\n", + "[15314 rows x 2 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE')\n", + " ])\n", + "dfs_out = tfm()\n", + "\n", + "# For instance\n", + "dfs_out['biota'][['NUCLIDE', 'nuclide_id']]" + ] + }, + { + "cell_type": "markdown", + "id": "2ba5585f", + "metadata": {}, + "source": [ + "## Standardize Time" + ] + }, + { + "cell_type": "markdown", + "id": "c7bce267", + "metadata": {}, + "source": [ + "#### Parse time" + ] + }, + { + "cell_type": "markdown", + "id": "0e5455ec", + "metadata": {}, + "source": [ + "     *NetCDF format variable: `time`.*" + ] + }, + { + "cell_type": "markdown", + "id": "9e0b1805", + "metadata": {}, + "source": [ + "     *Open Refine format variables: `begperiod` " + ] + }, + { + "cell_type": "markdown", + "id": "c807cd86", + "metadata": {}, + "source": [ + "Create a callback that remaps the time format in the dictionary of dataframes (i.e. `%m/%d/%y %H:%M:%S`):" + ] }, { "cell_type": "code", "execution_count": null, - "id": "44976e05", + "id": "166fb92c", "metadata": {}, "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "c4b7ef1b", - "metadata": {}, "source": [ - "Many entries of OSPAR Nuclide are NAN. 
" + "# TODO" ] }, { "cell_type": "code", "execution_count": null, - "id": "dd7d2c53", + "id": "101d4fb0", "metadata": {}, "outputs": [ { @@ -1399,8 +1418,8 @@ " \n", " \n", " \n", - " 16799\n", - " 97147\n", + " 0\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", @@ -1423,8 +1442,8 @@ " NaN\n", " \n", " \n", - " 16800\n", - " 97148\n", + " 1\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", @@ -1447,8 +1466,8 @@ " NaN\n", " \n", " \n", - " 16801\n", - " 97149\n", + " 2\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", @@ -1471,8 +1490,8 @@ " NaN\n", " \n", " \n", - " 16802\n", - " 97150\n", + " 3\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", @@ -1495,8 +1514,8 @@ " NaN\n", " \n", " \n", - " 16803\n", - " 97151\n", + " 4\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", @@ -1543,17 +1562,17 @@ " ...\n", " \n", " \n", - " 18474\n", - " 120366\n", - " Ireland\n", - " 4.0\n", - " N8\n", + " 18851\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " 53.0\n", - " 39.0\n", - " 0.0\n", - " N\n", - " 5.0\n", " ...\n", " NaN\n", " NaN\n", @@ -1562,22 +1581,22 @@ " NaN\n", " NaN\n", " NaN\n", - " 2021 data\n", + " NaN\n", " NaN\n", " NaN\n", " \n", " \n", - " 18475\n", - " 120367\n", - " Ireland\n", - " 4.0\n", - " N9\n", + " 18852\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " 53.0\n", - " 53.0\n", - " 0.0\n", - " N\n", - " 5.0\n", " ...\n", " NaN\n", " NaN\n", @@ -1586,22 +1605,22 @@ " NaN\n", " NaN\n", " NaN\n", - " 2021 data\n", + " NaN\n", " NaN\n", " NaN\n", " \n", " \n", - " 18476\n", - " 120368\n", - " Ireland\n", - " 4.0\n", - " N10\n", + " 18853\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " 53.0\n", - " 52.0\n", - " 0.0\n", - " N\n", - " 5.0\n", " ...\n", " NaN\n", " NaN\n", @@ -1610,22 +1629,22 @@ " NaN\n", " NaN\n", " NaN\n", - " 2021 data\n", + " 
NaN\n", " NaN\n", " NaN\n", " \n", " \n", - " 18477\n", - " 120369\n", - " Ireland\n", - " 1.0\n", - " Salthill\n", + " 18854\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " 53.0\n", - " 15.0\n", - " 40.0\n", - " N\n", - " 9.0\n", " ...\n", " NaN\n", " NaN\n", @@ -1634,22 +1653,22 @@ " NaN\n", " NaN\n", " NaN\n", - " 2021 data\n", - " Woodstown (County Waterford) and Salthill (Cou...\n", + " NaN\n", + " NaN\n", " NaN\n", " \n", " \n", - " 18478\n", - " 120370\n", - " Ireland\n", - " 1.0\n", - " Woodstown\n", + " 18855\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " 52.0\n", - " 11.0\n", - " 55.0\n", - " N\n", - " 6.0\n", " ...\n", " NaN\n", " NaN\n", @@ -1664,63 +1683,63 @@ " \n", " \n", "\n", - "

546 rows × 25 columns

\n", + "

18318 rows × 25 columns

\n", "" ], "text/plain": [ - " ID Contracting Party RSC Sub-division Station ID Sample ID LatD \\\n", - "16799 97147 NaN NaN NaN NaN NaN \n", - "16800 97148 NaN NaN NaN NaN NaN \n", - "16801 97149 NaN NaN NaN NaN NaN \n", - "16802 97150 NaN NaN NaN NaN NaN \n", - "16803 97151 NaN NaN NaN NaN NaN \n", - "... ... ... ... ... ... ... \n", - "18474 120366 Ireland 4.0 N8 NaN 53.0 \n", - "18475 120367 Ireland 4.0 N9 NaN 53.0 \n", - "18476 120368 Ireland 4.0 N10 NaN 53.0 \n", - "18477 120369 Ireland 1.0 Salthill NaN 53.0 \n", - "18478 120370 Ireland 1.0 Woodstown NaN 52.0 \n", + " ID Contracting Party RSC Sub-division Station ID Sample ID LatD \\\n", + "0 NaN NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN NaN \n", + "... .. ... ... ... ... ... \n", + "18851 NaN NaN NaN NaN NaN NaN \n", + "18852 NaN NaN NaN NaN NaN NaN \n", + "18853 NaN NaN NaN NaN NaN NaN \n", + "18854 NaN NaN NaN NaN NaN NaN \n", + "18855 NaN NaN NaN NaN NaN NaN \n", "\n", " LatM LatS LatDir LongD ... Sampling date Nuclide Value type \\\n", - "16799 NaN NaN NaN NaN ... NaN NaN NaN \n", - "16800 NaN NaN NaN NaN ... NaN NaN NaN \n", - "16801 NaN NaN NaN NaN ... NaN NaN NaN \n", - "16802 NaN NaN NaN NaN ... NaN NaN NaN \n", - "16803 NaN NaN NaN NaN ... NaN NaN NaN \n", + "0 NaN NaN NaN NaN ... NaN NaN NaN \n", + "1 NaN NaN NaN NaN ... NaN NaN NaN \n", + "2 NaN NaN NaN NaN ... NaN NaN NaN \n", + "3 NaN NaN NaN NaN ... NaN NaN NaN \n", + "4 NaN NaN NaN NaN ... NaN NaN NaN \n", "... ... ... ... ... ... ... ... ... \n", - "18474 39.0 0.0 N 5.0 ... NaN NaN NaN \n", - "18475 53.0 0.0 N 5.0 ... NaN NaN NaN \n", - "18476 52.0 0.0 N 5.0 ... NaN NaN NaN \n", - "18477 15.0 40.0 N 9.0 ... NaN NaN NaN \n", - "18478 11.0 55.0 N 6.0 ... NaN NaN NaN \n", + "18851 NaN NaN NaN NaN ... NaN NaN NaN \n", + "18852 NaN NaN NaN NaN ... NaN NaN NaN \n", + "18853 NaN NaN NaN NaN ... NaN NaN NaN \n", + "18854 NaN NaN NaN NaN ... 
NaN NaN NaN \n", + "18855 NaN NaN NaN NaN ... NaN NaN NaN \n", "\n", " Activity or MDA Uncertainty Unit Data provider Measurement Comment \\\n", - "16799 NaN NaN NaN NaN NaN \n", - "16800 NaN NaN NaN NaN NaN \n", - "16801 NaN NaN NaN NaN NaN \n", - "16802 NaN NaN NaN NaN NaN \n", - "16803 NaN NaN NaN NaN NaN \n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN \n", "... ... ... ... ... ... \n", - "18474 NaN NaN NaN NaN 2021 data \n", - "18475 NaN NaN NaN NaN 2021 data \n", - "18476 NaN NaN NaN NaN 2021 data \n", - "18477 NaN NaN NaN NaN 2021 data \n", - "18478 NaN NaN NaN NaN NaN \n", + "18851 NaN NaN NaN NaN NaN \n", + "18852 NaN NaN NaN NaN NaN \n", + "18853 NaN NaN NaN NaN NaN \n", + "18854 NaN NaN NaN NaN NaN \n", + "18855 NaN NaN NaN NaN NaN \n", "\n", - " Sample Comment Reference Comment \n", - "16799 NaN NaN \n", - "16800 NaN NaN \n", - "16801 NaN NaN \n", - "16802 NaN NaN \n", - "16803 NaN NaN \n", - "... ... ... \n", - "18474 NaN NaN \n", - "18475 NaN NaN \n", - "18476 NaN NaN \n", - "18477 Woodstown (County Waterford) and Salthill (Cou... NaN \n", - "18478 NaN NaN \n", + " Sample Comment Reference Comment \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... 
\n", + "18851 NaN NaN \n", + "18852 NaN NaN \n", + "18853 NaN NaN \n", + "18854 NaN NaN \n", + "18855 NaN NaN \n", "\n", - "[546 rows x 25 columns]" + "[18318 rows x 25 columns]" ] }, "execution_count": null, @@ -1729,55 +1748,35 @@ } ], "source": [ - "dfs['seawater'][dfs['seawater']['Nuclide'].isna()]" - ] - }, - { - "cell_type": "markdown", - "id": "4a33dd67", - "metadata": {}, - "source": [ - "***" - ] - }, - { - "cell_type": "markdown", - "id": "2ba5585f", - "metadata": {}, - "source": [ - "### Standardize Time" - ] - }, - { - "cell_type": "markdown", - "id": "c7bce267", - "metadata": {}, - "source": [ - "#### Parse time" - ] - }, - { - "cell_type": "markdown", - "id": "0e5455ec", - "metadata": {}, - "source": [ - "     *NetCDF format variable: `time`.*" - ] - }, - { - "cell_type": "markdown", - "id": "9e0b1805", - "metadata": {}, - "source": [ - "     *Open Refine format variables: `begperiod` " + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " RemoveAllNAValuesCB(cols_to_check),\n", + " ])\n", + "\n", + "dfs_test = tfm()\n", + "mask = dfs_test['seawater'][['Sampling date']].isna()\n", + "dfs_test['seawater'][mask]" ] }, { - "cell_type": "markdown", - "id": "c807cd86", + "cell_type": "code", + "execution_count": null, + "id": "4292c628", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(18318, 25)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Create a callback that remaps the time format in the dictionary of dataframes (i.e. 
`%m/%d/%y %H:%M:%S`):" + "dfs_test['seawater'].shape" ] }, { @@ -1789,9 +1788,9 @@ "source": [ "#| export\n", "class ParseTimeCB(Callback):\n", + " \"Parse the time format in the dataframe.\"\n", " def __init__(self):\n", " fc.store_attr()\n", - " \n", " \n", " def __call__(self, tfm):\n", " for grp in tfm.dfs.keys():\n", @@ -1801,12 +1800,6 @@ " self._remove_nan(df)\n", "\n", " def _process_dates(self, df: pd.DataFrame):\n", - " \"\"\"\n", - " Process and correct date and time information in the DataFrame.\n", - "\n", - " Args:\n", - " df (pd.DataFrame): DataFrame containing the 'Sampling date' column.\n", - " \"\"\"\n", " if 'Sampling date' in df.columns:\n", " # Convert 'Sampling date' to datetime, ignoring errors to avoid NaNs\n", " df['time'] = pd.to_datetime(df['Sampling date'], format='%d/%m/%Y', errors='coerce')\n", @@ -1815,23 +1808,10 @@ " df['time'] = pd.NaT \n", " \n", " def _define_beg_period(self, df: pd.DataFrame):\n", - " \"\"\"\n", - " Create a standardized date representation for Open Refine.\n", - " \n", - " Args:\n", - " df (pd.DataFrame): DataFrame containing the 'time' column.\n", - " \"\"\"\n", " df['begperiod'] = df['time']\n", "\n", " def _remove_nan(self, df: pd.DataFrame):\n", - " \"\"\"\n", - " Remove rows with NaN entries in the 'time' column.\n", - " \n", - " Args:\n", - " df (pd.DataFrame): DataFrame containing the 'time' column.\n", - " \"\"\"\n", - " df.dropna(subset=['time'], inplace=True)\n", - "\n" + " df.dropna(subset=['time'], inplace=True)" ] }, { @@ -1878,9 +1858,11 @@ "source": [ "#|eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[ParseTimeCB(),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", + "tfm = Transformer(dfs, cbs=[\n", + " RemoveAllNAValuesCB(cols_to_check),\n", + " ParseTimeCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", "tfm()\n", "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", "print(tfm.dfs['seawater'][['begperiod','time']])"