From 3285c9a8f72710a797f677cee2a4d00bb5a998fc Mon Sep 17 00:00:00 2001 From: Franck Albinet Date: Thu, 26 Sep 2024 10:02:47 +0200 Subject: [PATCH] sanitize uncertainties + fix measurements + match biota species in ospar handler --- marisco/handlers/helcom.py | 3 +- marisco/utils.py | 31 +- nbs/api/utils.ipynb | 19 + nbs/handlers/_ospar.ipynb | 1046 ++++++++++++++++++++++++++++++------ nbs/handlers/helcom.ipynb | 21 +- 5 files changed, 924 insertions(+), 196 deletions(-) diff --git a/marisco/handlers/helcom.py b/marisco/handlers/helcom.py index 3b96b9e..b712b07 100644 --- a/marisco/handlers/helcom.py +++ b/marisco/handlers/helcom.py @@ -149,8 +149,7 @@ def _define_beg_period(self, df): "Create a standardized date representation for Open Refine." df['begperiod'] = df['time'] -# %% ../../nbs/handlers/helcom.ipynb 58 -# Columns of interest +# %% ../../nbs/handlers/helcom.ipynb 59 coi_val = {'seawater' : {'val': 'VALUE_Bq/m³'}, 'biota': {'val': 'VALUE_Bq/kg'}, 'sediment': {'val': 'VALUE_Bq/kg'}} diff --git a/marisco/utils.py b/marisco/utils.py index 5747ed3..9874774 100644 --- a/marisco/utils.py +++ b/marisco/utils.py @@ -3,7 +3,7 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/api/utils.ipynb. # %% auto 0 -__all__ = ['get_unique_across_dfs', 'Remapper', 'has_valid_varname', 'get_bbox', 'ddmm_to_dd', 'download_files_in_folder', +__all__ = ['NA', 'get_unique_across_dfs', 'Remapper', 'has_valid_varname', 'get_bbox', 'ddmm_to_dd', 'download_files_in_folder', 'download_file', 'match_worms', 'Match', 'match_maris_lut', 'test_dfs'] # %% ../nbs/api/utils.ipynb 2 @@ -26,7 +26,10 @@ import jellyfish as jf from collections.abc import Callable -# %% ../nbs/api/utils.ipynb 6 +# %% ../nbs/api/utils.ipynb 5 +NA = 'Not available' + +# %% ../nbs/api/utils.ipynb 8 def get_unique_across_dfs(dfs:dict, # Dictionary of dataframes col_name:str='NUCLIDE', # Column name to extract unique values from as_df:bool=False, # Return a DataFrame of unique values @@ -41,7 +44,7 @@ def get_unique_across_dfs(dfs:dict, # Dictionary of dataframes if include_nchars: df_uniques['n_chars'] = df_uniques['value'].str.len() return df_uniques -# %% ../nbs/api/utils.ipynb 12 +# %% ../nbs/api/utils.ipynb 14 class Remapper(): "Remap a data provider lookup table to a MARIS lookup table using fuzzy matching." def __init__(self, @@ -104,7 +107,7 @@ def _format_output(self): return df_lut.sort_values(by='match_score', ascending=False) -# %% ../nbs/api/utils.ipynb 14 +# %% ../nbs/api/utils.ipynb 16 def has_valid_varname( var_names:list, # variable names cdl_path:str, # Path to MARIS CDL file (point of truth) @@ -137,7 +140,7 @@ def has_valid_varname( print(f'"{name}" variable name not found in MARIS CDL') return has_valid -# %% ../nbs/api/utils.ipynb 18 +# %% ../nbs/api/utils.ipynb 20 def get_bbox(df, coord_cols=('lon', 'lat') ): @@ -146,7 +149,7 @@ def get_bbox(df, arr = [(row[x], row[y]) for _, row in df.iterrows()] return MultiPoint(arr).envelope -# %% ../nbs/api/utils.ipynb 24 +# %% ../nbs/api/utils.ipynb 26 def ddmm_to_dd( ddmmmm:float # Coordinates in degrees/minutes decimal format ) -> float: # Coordinates in degrees decimal format @@ -155,7 +158,7 @@ def ddmm_to_dd( mins = mins * 100 return round(int(degs) + (mins / 60), 6) -# %% ../nbs/api/utils.ipynb 27 +# %% ../nbs/api/utils.ipynb 29 def download_files_in_folder(owner:str, repo:str, src_dir:str, @@ -189,7 +192,7 @@ def download_file(owner, repo, src_dir, dest_dir, fname): else: print(f"Error: {response.status_code}") -# %% ../nbs/api/utils.ipynb 29 +# %% ../nbs/api/utils.ipynb 31 def match_worms( name:str # Name of species to look up in WoRMS ): @@ -212,7 +215,7 @@ def match_worms( else: return -1 -# %% ../nbs/api/utils.ipynb 34 +# %% ../nbs/api/utils.ipynb 36 @dataclass class Match: "Match between a data provider name and a MARIS lookup table." @@ -221,7 +224,7 @@ class Match: source_name: str match_score: int -# %% ../nbs/api/utils.ipynb 35 +# %% ../nbs/api/utils.ipynb 37 def match_maris_lut( lut_path: str, # Path to MARIS species authoritative species look-up table data_provider_name: str, # Name of data provider nomenclature item to look up @@ -238,7 +241,7 @@ def match_maris_lut( df = df.sort_values(by='score', ascending=True)[:nresults] return df[[maris_id, maris_name, 'score']] -# %% ../nbs/api/utils.ipynb 42 +# %% ../nbs/api/utils.ipynb 44 def get_bbox(df, coord_cols=('lon', 'lat') ): @@ -246,7 +249,7 @@ def get_bbox(df, arr = [(row[x], row[y]) for _, row in df.iterrows()] return MultiPoint(arr).envelope -# %% ../nbs/api/utils.ipynb 49 +# %% ../nbs/api/utils.ipynb 51 def download_files_in_folder(owner:str, repo:str, src_dir:str, @@ -280,7 +283,7 @@ def download_file(owner, repo, src_dir, dest_dir, fname): else: print(f"Error: {response.status_code}") -# %% ../nbs/api/utils.ipynb 51 +# %% ../nbs/api/utils.ipynb 53 def match_worms( name:str # Name of species to look up in WoRMS ): @@ -303,7 +306,7 @@ def match_worms( else: return -1 -# %% ../nbs/api/utils.ipynb 56 +# %% ../nbs/api/utils.ipynb 58 def test_dfs( dfs1:dict, # First dictionary of DataFrames to compare dfs2:dict # Second dictionary of DataFrames to compare diff --git a/nbs/api/utils.ipynb b/nbs/api/utils.ipynb index 40ff7c5..cdbde2b 100644 --- a/nbs/api/utils.ipynb +++ b/nbs/api/utils.ipynb @@ -59,6 +59,25 @@ "import pandas as pd" ] }, + { + "cell_type": "markdown", + "id": "fb15839e", + "metadata": {}, + "source": [ + "We define below useful constants throughout the package." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dff2deb", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "NA = 'Not available'" + ] + }, { "cell_type": "markdown", "id": "7ddde356", diff --git a/nbs/handlers/_ospar.ipynb b/nbs/handlers/_ospar.ipynb index f381711..5da9120 100644 --- a/nbs/handlers/_ospar.ipynb +++ b/nbs/handlers/_ospar.ipynb @@ -142,6 +142,7 @@ " unit_lut_path\n", " )\n", "\n", + "from marisco.utils import NA\n", "from marisco.serializers import NetCDFEncoder, OpenRefineCsvEncoder\n", "\n", "import warnings\n", @@ -1547,28 +1548,10 @@ }, { "cell_type": "markdown", - "id": "02decc88", + "id": "0fe8ab6e", "metadata": {}, "source": [ - "     *NetCDF format variable: ``value``.*" - ] - }, - { - "cell_type": "markdown", - "id": "e44772c2", - "metadata": {}, - "source": [ - "     *Open Refine format variables: ``activity``.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2b34550", - "metadata": {}, - "outputs": [], - "source": [ - "# TO BE REFACTORED" + "We allocate each column containing measurement values into a single column `value` and remove `NA` where needed." ] }, { @@ -1578,39 +1561,18 @@ "metadata": {}, "outputs": [], "source": [ - "# | export\n", + "# | exports\n", "class SanitizeValue(Callback):\n", - " \"Sanitize value by removing blank entries.\"\n", - "\n", - " def __init__(self):\n", - " \"\"\"\n", - " Initialize the SanitizeValue callback.\n", - " \"\"\"\n", + " \"Sanitize value by removing blank entries and populating `value` column.\"\n", + " def __init__(self, \n", + " value_col: str='Activity or MDA' # Column name to sanitize\n", + " ):\n", " fc.store_attr()\n", "\n", " def __call__(self, tfm):\n", - " \"\"\"\n", - " Sanitize the DataFrames in the transformer by removing rows with blank values in specified columns.\n", - " Args:\n", - " tfm (Transformer): The transformer object containing DataFrames.\n", - " \"\"\"\n", - " for grp in tfm.dfs.keys():\n", - " self._sanitize_dataframe(tfm.dfs[grp], grp)\n", - "\n", - "\n", - " def _sanitize_dataframe(self, df: pd.DataFrame, grp: str):\n", - " \"\"\"\n", - " Remove rows where value column (i.e. 'Activity or MDA') is blank and remap to 'value' column.\n", - "\n", - " Args:\n", - " df (pd.DataFrame): DataFrame to sanitize.\n", - " grp (str): Group name to determine column names.\n", - " \"\"\"\n", - " value_col = 'Activity or MDA'\n", - " if value_col in df.columns:\n", - " df.dropna(subset=[value_col], inplace=True)\n", - " df['value'] = df[value_col]\n", - " " + " for df in tfm.dfs.values():\n", + " df.dropna(subset=[self.value_col], inplace=True)\n", + " df['value'] = df[self.value_col]" ] }, { @@ -1620,35 +1582,74 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " seawater biota\n", - "Number of rows in dfs 18856 15314\n", - "Number of rows in tfm.dfs 18308 15314\n", - "Number of dropped rows 548 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 18856 15314 \n", - "\n" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value
00.20
10.27
20.26
30.25
40.20
\n", + "
" + ], + "text/plain": [ + " value\n", + "0 0.20\n", + "1 0.27\n", + "2 0.26\n", + "3 0.25\n", + "4 0.20" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "#|eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[SanitizeValue(),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", + "tfm = Transformer(dfs, cbs=[SanitizeValue()])\n", "\n", - "tfm()\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" - ] - }, - { - "cell_type": "markdown", - "id": "197588fb", - "metadata": {}, - "source": [ - "***" + "tfm()['seawater'][['value']].head()" ] }, { @@ -1656,23 +1657,7 @@ "id": "7c83412b", "metadata": {}, "source": [ - "### Normalize uncertainty" - ] - }, - { - "cell_type": "markdown", - "id": "1ceaf5b7", - "metadata": {}, - "source": [ - "     *NetCDF format variable: ``uncertainty``.*" - ] - }, - { - "cell_type": "markdown", - "id": "41097d83", - "metadata": {}, - "source": [ - "     *Open Refine format variable: `Uncertainty`.*" + "## Normalize uncertainty" ] }, { @@ -1680,11 +1665,9 @@ "id": "13a44f1a", "metadata": {}, "source": [ - "For each sample type in the OSPAR dataset, the reported uncertainty is given as an expanded uncertainty with a coverage factor 𝑘=2\n", - "\n", - "For further details, refer to the [OSPAR reporting guidelines](https://mcc.jrc.ec.europa.eu/documents/OSPAR/Guidelines_forestimationof_a_%20measurefor_uncertainty_in_OSPARmonitoring.pdf)\n", + "For each sample type in the OSPAR dataset, the reported uncertainty is given as an expanded uncertainty with a coverage factor `𝑘=2`. For further details, refer to the [OSPAR reporting guidelines](https://mcc.jrc.ec.europa.eu/documents/OSPAR/Guidelines_forestimationof_a_%20measurefor_uncertainty_in_OSPARmonitoring.pdf).\n", "\n", - "Note: The OSPAR uncertainty values are normalized to standard uncertainty with a coverage factor \n", + "**Note**: The OSPAR uncertainty values are normalized to standard uncertainty with a coverage factor \n", "𝑘=1." ] }, @@ -1693,82 +1676,39 @@ "id": "97a933ab", "metadata": {}, "source": [ - "NormalizeUncCB callback normalizes the uncertainty" + "`NormalizeUncCB` callback normalizes the uncertainty using the following `lambda` function:" ] }, { "cell_type": "code", "execution_count": null, - "id": "87265333", + "id": "d6c84351", "metadata": {}, "outputs": [], "source": [ - "#| export\n", - "# Make measurement and uncertainty units consistent\n", - "def unc_exp2stan(df: pd.DataFrame, unc_col: str) -> pd.Series:\n", - " \"\"\"\n", - " Convert expanded uncertainty (k=2) to standard uncertainty (k=1).\n", - "\n", - " Args:\n", - " df (pd.DataFrame): DataFrame containing uncertainty values.\n", - " unc_col (str): Column name of the uncertainty values to be converted.\n", - "\n", - " Returns:\n", - " pd.Series: Series of standard uncertainty values.\n", - " \"\"\"\n", - " k = 2\n", - " return df[unc_col] / k" + "#| exports\n", + "unc_exp2stan = lambda df, unc_col: df[unc_col] / 2" ] }, { "cell_type": "code", "execution_count": null, - "id": "8ad1b008", + "id": "ecb2866d", "metadata": {}, "outputs": [], "source": [ + "#| exports\n", "class NormalizeUncCB(Callback):\n", - " \"\"\"Callback to normalize uncertainty values in DataFrames. This callback applies a conversion function to standardize the uncertainty values in each DataFrame.\"\"\"\n", - "\n", - " def __init__(self, fn_convert_unc: Callable[[pd.DataFrame, str], pd.Series]):\n", - " \"\"\"\n", - " Initialize the NormalizeUncCB with a conversion function.\n", - "\n", - " Args:\n", - " fn_convert_unc (Callable[[pd.DataFrame, str], pd.Series]): \n", - " Function that takes a DataFrame and a column name, and returns a Series of converted uncertainty values.\n", - " \"\"\"\n", + " \"\"\"Normalize uncertainty values in DataFrames.\"\"\"\n", + " def __init__(self, \n", + " col_unc: str='Uncertainty', # Column name to normalize\n", + " fn_convert_unc: Callable=unc_exp2stan, # Function correcting coverage factor\n", + " ): \n", " fc.store_attr()\n", "\n", - " def __call__(self, tfm: 'Transformer'):\n", - " \"\"\"\n", - " Apply the conversion function to the 'Uncertainty' column in each DataFrame within the transformer.\n", - "\n", - " Args:\n", - " tfm (Transformer): The transformer object containing DataFrames.\n", - " \"\"\"\n", - " for grp, df in tfm.dfs.items():\n", - " df['uncertainty'] = self._convert_uncertainty(df)\n", - "\n", - " def _convert_uncertainty(self, df: pd.DataFrame) -> pd.Series:\n", - " \"\"\"\n", - " Convert the uncertainty values in the DataFrame using the provided conversion function.\n", - "\n", - " Args:\n", - " df (pd.DataFrame): DataFrame containing the 'Uncertainty' column.\n", - "\n", - " Returns:\n", - " pd.Series: Converted uncertainty values.\n", - " \"\"\"\n", - " return self.fn_convert_unc(df, 'Uncertainty')\n" - ] - }, - { - "cell_type": "markdown", - "id": "fd159712", - "metadata": {}, - "source": [ - "Apply the transformer for callback NormalizeUncCB(). Then, print the value (i.e. activity per unit ) and standard uncertainty for each sample type." + " def __call__(self, tfm):\n", + " for df in tfm.dfs.values():\n", + " df['uncertainty'] = self.fn_convert_unc(df, self.col_unc)" ] }, { @@ -1781,18 +1721,16 @@ "name": "stdout", "output_type": "stream", "text": [ - " seawater biota\n", - "Number of rows in dfs 18856 15314\n", - "Number of rows in tfm.dfs 18308 15314\n", - "Number of dropped rows 548 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 18856 15314 \n", "\n", + "seawater:\n", " value uncertainty\n", "0 0.20 NaN\n", "1 0.27 NaN\n", "2 0.26 NaN\n", "3 0.25 NaN\n", "4 0.20 NaN\n", + "\n", + "biota:\n", " value uncertainty\n", "0 0.3510 0.033\n", "1 39.0000 7.500\n", @@ -1807,32 +1745,806 @@ "dfs = load_data(fname_in)\n", "tfm = Transformer(dfs, cbs=[ \n", " SanitizeValue(), \n", - " NormalizeUncCB(unc_exp2stan),\n", - " CompareDfsAndTfmCB(dfs)\n", + " NormalizeUncCB()\n", " ])\n", - "\n", - "\n", "tfm()\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", "\n", - "print(tfm.dfs['seawater'][['value', 'uncertainty']][:5])\n", - "print(tfm.dfs['biota'][['value', 'uncertainty']][:5])\n" + "for grp in ['seawater', 'biota']:\n", + " print(f'\\n{grp}:')\n", + " print(tfm.dfs[grp][['value', 'uncertainty']].head())" ] }, { "cell_type": "markdown", - "id": "61c204ca", + "id": "96d25e19", "metadata": {}, "source": [ - "***" + "## Remap Biota species" ] }, { - "cell_type": "markdown", - "id": "96d25e19", + "cell_type": "code", + "execution_count": null, + "id": "e9dcc466", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexvalue
00Unknown
11Homarus gammarus
22SPRATTUS SPRATTUS
33Anarhichas denticulatus
44MOLVA MOLVA
.........
151151MELANOGRAMMUS AEGLEFINUS
152152MERLUCCIUS MERLUCCIUS
153153PECTEN MAXIMUS
154154LITTORINA LITTOREA
155155Pleuronectes platessa
\n", + "

156 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " index value\n", + "0 0 Unknown\n", + "1 1 Homarus gammarus\n", + "2 2 SPRATTUS SPRATTUS\n", + "3 3 Anarhichas denticulatus\n", + "4 4 MOLVA MOLVA\n", + ".. ... ...\n", + "151 151 MELANOGRAMMUS AEGLEFINUS\n", + "152 152 MERLUCCIUS MERLUCCIUS\n", + "153 153 PECTEN MAXIMUS\n", + "154 154 LITTORINA LITTOREA\n", + "155 155 Pleuronectes platessa\n", + "\n", + "[156 rows x 2 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs = load_data(fname_in)\n", + "get_unique_across_dfs(dfs, col_name='Species', as_df=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f6e4a3d", "metadata": {}, + "outputs": [], "source": [ - "### Lookup transformations " + "#| eval: false\n", + "remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='Species', as_df=True),\n", + " maris_lut_fn=species_lut_path,\n", + " maris_col_id='species_id',\n", + " maris_col_name='species',\n", + " provider_col_to_match='value',\n", + " provider_col_key='value',\n", + " fname_cache='species_ospar.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cb98f5b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 156/156 [00:23<00:00, 6.65it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
matched_maris_namesource_namematch_score
source_key
RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATALomentaria catenataRHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA31
Mixture of green, red and brown algaeMercenaria mercenariaMixture of green, red and brown algae26
Solea solea (S.vulgaris)Loligo vulgarisSolea solea (S.vulgaris)12
SOLEA SOLEA (S.VULGARIS)Loligo vulgarisSOLEA SOLEA (S.VULGARIS)12
CERASTODERMA (CARDIUM) EDULECerastoderma eduleCERASTODERMA (CARDIUM) EDULE10
Cerastoderma (Cardium) EduleCerastoderma eduleCerastoderma (Cardium) Edule10
NUCELLA LAPILLUSMugil cephalusNUCELLA LAPILLUS9
DICENTRARCHUS (MORONE) LABRAXDicentrarchus labraxDICENTRARCHUS (MORONE) LABRAX9
MONODONTA LINEATAOphiothrix lineataMONODONTA LINEATA9
Pleuronectiformes [order]PleuronectiformesPleuronectiformes [order]8
RAJIDAE/BATOIDEABatoideaRAJIDAE/BATOIDEA8
PALMARIA PALMATAAlaria marginataPALMARIA PALMATA7
Rhodymenia spp.RhodymeniaRhodymenia spp.5
Sepia spp.SepiaSepia spp.5
unknownPlanktonunknown5
RAJA DIPTURUS BATISDipturus batisRAJA DIPTURUS BATIS5
FlatfishLambiaFlatfish5
UnknownPlanktonUnknown5
FUCUS SPP.FucusFUCUS SPP.5
Tapes sp.TapesTapes sp.4
Fucus sp.FucusFucus sp.4
Patella sp.Patella asperaPatella sp.4
FUCUS sppFucusFUCUS spp4
Gadus sp.GadusGadus sp.4
RHODYMENIA sppRhodymeniaRHODYMENIA spp4
Thunnus sp.ThunnusThunnus sp.4
PECTINIDAEBuccinidaePECTINIDAE3
Gaidropsarus argenteusGaidropsarus argentatusGaidropsarus argenteus2
PLUERONECTES PLATESSAPleuronectes platessaPLUERONECTES PLATESSA2
ASCOPHYLLUN NODOSUMAscophyllum nodosumASCOPHYLLUN NODOSUM1
Sebastes viviparesSebastes viviparusSebastes vivipares1
\n", + "
" + ], + "text/plain": [ + " matched_maris_name \\\n", + "source_key \n", + "RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA Lomentaria catenata \n", + "Mixture of green, red and brown algae Mercenaria mercenaria \n", + "Solea solea (S.vulgaris) Loligo vulgaris \n", + "SOLEA SOLEA (S.VULGARIS) Loligo vulgaris \n", + "CERASTODERMA (CARDIUM) EDULE Cerastoderma edule \n", + "Cerastoderma (Cardium) Edule Cerastoderma edule \n", + "NUCELLA LAPILLUS Mugil cephalus \n", + "DICENTRARCHUS (MORONE) LABRAX Dicentrarchus labrax \n", + "MONODONTA LINEATA Ophiothrix lineata \n", + "Pleuronectiformes [order] Pleuronectiformes \n", + "RAJIDAE/BATOIDEA Batoidea \n", + "PALMARIA PALMATA Alaria marginata \n", + "Rhodymenia spp. Rhodymenia \n", + "Sepia spp. Sepia \n", + "unknown Plankton \n", + "RAJA DIPTURUS BATIS Dipturus batis \n", + "Flatfish Lambia \n", + "Unknown Plankton \n", + "FUCUS SPP. Fucus \n", + "Tapes sp. Tapes \n", + "Fucus sp. Fucus \n", + "Patella sp. Patella aspera \n", + "FUCUS spp Fucus \n", + "Gadus sp. Gadus \n", + "RHODYMENIA spp Rhodymenia \n", + "Thunnus sp. Thunnus \n", + "PECTINIDAE Buccinidae \n", + "Gaidropsarus argenteus Gaidropsarus argentatus \n", + "PLUERONECTES PLATESSA Pleuronectes platessa \n", + "ASCOPHYLLUN NODOSUM Ascophyllum nodosum \n", + "Sebastes vivipares Sebastes viviparus \n", + "\n", + " source_name \\\n", + "source_key \n", + "RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA \n", + "Mixture of green, red and brown algae Mixture of green, red and brown algae \n", + "Solea solea (S.vulgaris) Solea solea (S.vulgaris) \n", + "SOLEA SOLEA (S.VULGARIS) SOLEA SOLEA (S.VULGARIS) \n", + "CERASTODERMA (CARDIUM) EDULE CERASTODERMA (CARDIUM) EDULE \n", + "Cerastoderma (Cardium) Edule Cerastoderma (Cardium) Edule \n", + "NUCELLA LAPILLUS NUCELLA LAPILLUS \n", + "DICENTRARCHUS (MORONE) LABRAX DICENTRARCHUS (MORONE) LABRAX \n", + "MONODONTA LINEATA MONODONTA LINEATA \n", + "Pleuronectiformes [order] Pleuronectiformes [order] \n", + "RAJIDAE/BATOIDEA RAJIDAE/BATOIDEA \n", + "PALMARIA PALMATA PALMARIA PALMATA \n", + "Rhodymenia spp. Rhodymenia spp. \n", + "Sepia spp. Sepia spp. \n", + "unknown unknown \n", + "RAJA DIPTURUS BATIS RAJA DIPTURUS BATIS \n", + "Flatfish Flatfish \n", + "Unknown Unknown \n", + "FUCUS SPP. FUCUS SPP. \n", + "Tapes sp. Tapes sp. \n", + "Fucus sp. Fucus sp. \n", + "Patella sp. Patella sp. \n", + "FUCUS spp FUCUS spp \n", + "Gadus sp. Gadus sp. \n", + "RHODYMENIA spp RHODYMENIA spp \n", + "Thunnus sp. Thunnus sp. \n", + "PECTINIDAE PECTINIDAE \n", + "Gaidropsarus argenteus Gaidropsarus argenteus \n", + "PLUERONECTES PLATESSA PLUERONECTES PLATESSA \n", + "ASCOPHYLLUN NODOSUM ASCOPHYLLUN NODOSUM \n", + "Sebastes vivipares Sebastes vivipares \n", + "\n", + " match_score \n", + "source_key \n", + "RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA 31 \n", + "Mixture of green, red and brown algae 26 \n", + "Solea solea (S.vulgaris) 12 \n", + "SOLEA SOLEA (S.VULGARIS) 12 \n", + "CERASTODERMA (CARDIUM) EDULE 10 \n", + "Cerastoderma (Cardium) Edule 10 \n", + "NUCELLA LAPILLUS 9 \n", + "DICENTRARCHUS (MORONE) LABRAX 9 \n", + "MONODONTA LINEATA 9 \n", + "Pleuronectiformes [order] 8 \n", + "RAJIDAE/BATOIDEA 8 \n", + "PALMARIA PALMATA 7 \n", + "Rhodymenia spp. 5 \n", + "Sepia spp. 5 \n", + "unknown 5 \n", + "RAJA DIPTURUS BATIS 5 \n", + "Flatfish 5 \n", + "Unknown 5 \n", + "FUCUS SPP. 5 \n", + "Tapes sp. 4 \n", + "Fucus sp. 4 \n", + "Patella sp. 4 \n", + "FUCUS spp 4 \n", + "Gadus sp. 4 \n", + "RHODYMENIA spp 4 \n", + "Thunnus sp. 4 \n", + "PECTINIDAE 3 \n", + "Gaidropsarus argenteus 2 \n", + "PLUERONECTES PLATESSA 2 \n", + "ASCOPHYLLUN NODOSUM 1 \n", + "Sebastes vivipares 1 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "remapper.generate_lookup_table(as_df=True)\n", + "remapper.select_match(match_score_threshold=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f073cc1b", + "metadata": {}, + "outputs": [], + "source": [ + "#|exports\n", + "fixes_biota_species = {\n", + " 'PECTINIDAE': NA, # Dropped. In Worms as PECTINIDAE is a family.\n", + " 'Unknown': NA,\n", + " 'unknown': NA,\n", + " 'PALMARIA PALMATA': NA, # Dropped. In Worms 'Palmaria palmata (Linnaeus) F.Weber & D.Mohr, 1805',\n", + " 'RAJIDAE/BATOIDEA': NA, # Mix \n", + " 'MONODONTA LINEATA': 'Phorcus lineatus',\n", + " 'NUCELLA LAPILLUS': NA, # Dropped. In Worms 'Nucella lapillus (Linnaeus, 1758)', \n", + " 'SOLEA SOLEA (S.VULGARIS)': 'Solea solea',\n", + " 'Solea solea (S.vulgaris)': 'Solea solea',\n", + " 'Mixture of green, red and brown algae': NA, # Mix \n", + " 'RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA': NA, # Mix\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdc3e95a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 156/156 [00:23<00:00, 6.66it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
matched_maris_namesource_namematch_score
source_key
CERASTODERMA (CARDIUM) EDULECerastoderma eduleCERASTODERMA (CARDIUM) EDULE10
Cerastoderma (Cardium) EduleCerastoderma eduleCerastoderma (Cardium) Edule10
DICENTRARCHUS (MORONE) LABRAXDicentrarchus labraxDICENTRARCHUS (MORONE) LABRAX9
Pleuronectiformes [order]PleuronectiformesPleuronectiformes [order]8
FUCUS SPP.FucusFUCUS SPP.5
FlatfishLambiaFlatfish5
Sepia spp.SepiaSepia spp.5
Rhodymenia spp.RhodymeniaRhodymenia spp.5
RAJA DIPTURUS BATISDipturus batisRAJA DIPTURUS BATIS5
RHODYMENIA sppRhodymeniaRHODYMENIA spp4
Thunnus sp.ThunnusThunnus sp.4
FUCUS sppFucusFUCUS spp4
Gadus sp.GadusGadus sp.4
Fucus sp.FucusFucus sp.4
Tapes sp.TapesTapes sp.4
Patella sp.Patella asperaPatella sp.4
Gaidropsarus argenteusGaidropsarus argentatusGaidropsarus argenteus2
PLUERONECTES PLATESSAPleuronectes platessaPLUERONECTES PLATESSA2
ASCOPHYLLUN NODOSUMAscophyllum nodosumASCOPHYLLUN NODOSUM1
Sebastes viviparesSebastes viviparusSebastes vivipares1
\n", + "
" + ], + "text/plain": [ + " matched_maris_name \\\n", + "source_key \n", + "CERASTODERMA (CARDIUM) EDULE Cerastoderma edule \n", + "Cerastoderma (Cardium) Edule Cerastoderma edule \n", + "DICENTRARCHUS (MORONE) LABRAX Dicentrarchus labrax \n", + "Pleuronectiformes [order] Pleuronectiformes \n", + "FUCUS SPP. Fucus \n", + "Flatfish Lambia \n", + "Sepia spp. Sepia \n", + "Rhodymenia spp. Rhodymenia \n", + "RAJA DIPTURUS BATIS Dipturus batis \n", + "RHODYMENIA spp Rhodymenia \n", + "Thunnus sp. Thunnus \n", + "FUCUS spp Fucus \n", + "Gadus sp. Gadus \n", + "Fucus sp. Fucus \n", + "Tapes sp. Tapes \n", + "Patella sp. Patella aspera \n", + "Gaidropsarus argenteus Gaidropsarus argentatus \n", + "PLUERONECTES PLATESSA Pleuronectes platessa \n", + "ASCOPHYLLUN NODOSUM Ascophyllum nodosum \n", + "Sebastes vivipares Sebastes viviparus \n", + "\n", + " source_name match_score \n", + "source_key \n", + "CERASTODERMA (CARDIUM) EDULE CERASTODERMA (CARDIUM) EDULE 10 \n", + "Cerastoderma (Cardium) Edule Cerastoderma (Cardium) Edule 10 \n", + "DICENTRARCHUS (MORONE) LABRAX DICENTRARCHUS (MORONE) LABRAX 9 \n", + "Pleuronectiformes [order] Pleuronectiformes [order] 8 \n", + "FUCUS SPP. FUCUS SPP. 5 \n", + "Flatfish Flatfish 5 \n", + "Sepia spp. Sepia spp. 5 \n", + "Rhodymenia spp. Rhodymenia spp. 5 \n", + "RAJA DIPTURUS BATIS RAJA DIPTURUS BATIS 5 \n", + "RHODYMENIA spp RHODYMENIA spp 4 \n", + "Thunnus sp. Thunnus sp. 4 \n", + "FUCUS spp FUCUS spp 4 \n", + "Gadus sp. Gadus sp. 4 \n", + "Fucus sp. Fucus sp. 4 \n", + "Tapes sp. Tapes sp. 4 \n", + "Patella sp. Patella sp. 4 \n", + "Gaidropsarus argenteus Gaidropsarus argenteus 2 \n", + "PLUERONECTES PLATESSA PLUERONECTES PLATESSA 2 \n", + "ASCOPHYLLUN NODOSUM ASCOPHYLLUN NODOSUM 1 \n", + "Sebastes vivipares Sebastes vivipares 1 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "remapper.generate_lookup_table(fixes=fixes_biota_species)\n", + "remapper.select_match(match_score_threshold=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a70cafab", + "metadata": {}, + "outputs": [], + "source": [ + "# TO BE DONE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3a74941", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "# class RemapBiotaSpeciesCB(Callback):\n", + "# \"Biota species standardized to MARIS format.\"\n", + "# def __init__(self, \n", + "# fn_lut:Callable # Function that returns the lookup table dictionary\n", + "# ):\n", + "# fc.store_attr()\n", + "\n", + "# def __call__(self, tfm):\n", + "# \"Remap biota species names in the DataFrame using the lookup table and print unmatched RUBIN values.\"\n", + "# lut = self.fn_lut()\n", + "# tfm.dfs['biota']['species'] = tfm.dfs['biota']['RUBIN'].apply(lambda x: self._get_species(x, lut))\n", + "\n", + "# def _get_species(self, \n", + "# rubin_value:str, # The RUBIN value from the DataFrame\n", + "# lut:dict # The lookup table dictionary\n", + "# ):\n", + "# \"Get the matched_id from the lookup table and print RUBIN if the matched_id is -1.\"\n", + "# match = lut.get(rubin_value.strip(), Match(-1, None, None, None))\n", + "# if match.matched_id == -1:\n", + "# self.print_unmatched_rubin(rubin_value)\n", + "# return match.matched_id\n", + "\n", + "# def print_unmatched_rubin(self, \n", + "# rubin_value: str # The RUBIN value from the DataFrame\n", + "# ):\n", + "# \"Print the RUBIN value if the matched_id is -1.\"\n", + "# print(f\"Unmatched RUBIN: {rubin_value}\")" ] }, { diff --git a/nbs/handlers/helcom.ipynb b/nbs/handlers/helcom.ipynb index f87518d..6842dbc 100644 --- a/nbs/handlers/helcom.ipynb +++ b/nbs/handlers/helcom.ipynb @@ -1700,6 +1700,14 @@ "## Sanitize value" ] }, + { + "cell_type": "markdown", + "id": "6de49e39", + "metadata": {}, + "source": [ + "We allocate each column containing measurement values (named differently across sample types) into a single column `value` and remove NA where needed." + ] + }, { "cell_type": "code", "execution_count": null, @@ -1708,24 +1716,11 @@ "outputs": [], "source": [ "#| exports\n", - "# Columns of interest\n", "coi_val = {'seawater' : {'val': 'VALUE_Bq/m³'},\n", " 'biota': {'val': 'VALUE_Bq/kg'},\n", " 'sediment': {'val': 'VALUE_Bq/kg'}}" ] }, - { - "cell_type": "markdown", - "id": "18c59ae1-c523-4aa6-bc04-e824390bf06d", - "metadata": {}, - "source": [ - "**Comment (FA)**: Those lines can be simplified I think:\n", - "```\n", - "value_col = self.coi.get(grp, {}).get('val')\n", - "if value_col and value_col in df.columns:\n", - "```" - ] - }, { "cell_type": "code", "execution_count": null,