From a030739e536a4decfd5519b268d1449e4e3413a5 Mon Sep 17 00:00:00 2001 From: niallmurphy93 Date: Thu, 10 Oct 2024 12:32:24 +0100 Subject: [PATCH 1/9] Update inout module: Add warning for None values in configs - Modified write_toml function to warn about None values without removing them - Added flatten_dict helper function - Updated both inout.ipynb and generated inout.py - Kept None values to ensure failure if configs are not properly updated --- marisco/inout.py | 22 ++++++++++++++++++++-- nbs/api/inout.ipynb | 30 ++++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/marisco/inout.py b/marisco/inout.py index aef8b4a..2ccab9f 100644 --- a/marisco/inout.py +++ b/marisco/inout.py @@ -3,20 +3,38 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/api/inout.ipynb. # %% auto 0 -__all__ = ['write_toml', 'read_toml'] +__all__ = ['write_toml', 'flatten_dict', 'read_toml'] # %% ../nbs/api/inout.ipynb 2 import tomli_w import tomli +from typing import Dict, Any + # %% ../nbs/api/inout.ipynb 3 -def write_toml(fname, cfg): +def write_toml(fname: str, cfg: Dict[str, Any]): "Write a TOML file from a dictionary." + none_keys = [k for k, v in flatten_dict(cfg).items() if v is None] + if none_keys: + print(f"Warning: The following config keys have None values: {', '.join(none_keys)}") + print(f'Creating {fname}') with open(fname, "wb") as f: tomli_w.dump(cfg, f) # %% ../nbs/api/inout.ipynb 4 +def flatten_dict(d: Dict[str, Any], parent_key: str = '', sep: str = '.') -> Dict[str, Any]: + """Flatten a nested dictionary.""" + items = [] + for k, v in d.items(): + new_key = f"{parent_key}{sep}{k}" if parent_key else k + if isinstance(v, dict): + items.extend(flatten_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + +# %% ../nbs/api/inout.ipynb 5 def read_toml(fname): "Read a TOML file into a dictionary." 
with open(fname, "rb") as f: diff --git a/nbs/api/inout.ipynb b/nbs/api/inout.ipynb index 651d1f6..0e5c7dc 100644 --- a/nbs/api/inout.ipynb +++ b/nbs/api/inout.ipynb @@ -29,7 +29,8 @@ "source": [ "#| export\n", "import tomli_w\n", - "import tomli" + "import tomli\n", + "from typing import Dict, Any\n" ] }, { @@ -40,13 +41,38 @@ "outputs": [], "source": [ "#| exports\n", - "def write_toml(fname, cfg):\n", + "def write_toml(fname: str, cfg: Dict[str, Any]):\n", " \"Write a TOML file from a dictionary.\"\n", + " none_keys = [k for k, v in flatten_dict(cfg).items() if v is None]\n", + " if none_keys:\n", + " print(f\"Warning: The following config keys have None values: {', '.join(none_keys)}\")\n", + " \n", " print(f'Creating {fname}')\n", " with open(fname, \"wb\") as f:\n", " tomli_w.dump(cfg, f)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "72ceebef", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "\n", + "def flatten_dict(d: Dict[str, Any], parent_key: str = '', sep: str = '.') -> Dict[str, Any]:\n", + " \"\"\"Flatten a nested dictionary.\"\"\"\n", + " items = []\n", + " for k, v in d.items():\n", + " new_key = f\"{parent_key}{sep}{k}\" if parent_key else k\n", + " if isinstance(v, dict):\n", + " items.extend(flatten_dict(v, new_key, sep=sep).items())\n", + " else:\n", + " items.append((new_key, v))\n", + " return dict(items)" + ] + }, { "cell_type": "code", "execution_count": null, From 1067c4ffb0422c198acf04852fb5363b280ca92f Mon Sep 17 00:00:00 2001 From: niallmurphy93 Date: Thu, 10 Oct 2024 13:01:41 +0100 Subject: [PATCH 2/9] Include _modidx.py in commit - Added _modidx.py to version control - Note: Previous commit failed to include this file - This commit ensures _modidx.py is properly tracked --- marisco/_modidx.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/marisco/_modidx.py b/marisco/_modidx.py index d9b8d51..a35ae1b 100644 --- a/marisco/_modidx.py +++ b/marisco/_modidx.py @@ -606,7 
+606,8 @@ 'marisco.handlers.ospar.load_data': ('handlers/ospar.html#load_data', 'marisco/handlers/ospar.py'), 'marisco.handlers.ospar.unc_exp2stan': ( 'handlers/ospar.html#unc_exp2stan', 'marisco/handlers/ospar.py')}, - 'marisco.inout': { 'marisco.inout.read_toml': ('api/inout.html#read_toml', 'marisco/inout.py'), + 'marisco.inout': { 'marisco.inout.flatten_dict': ('api/inout.html#flatten_dict', 'marisco/inout.py'), + 'marisco.inout.read_toml': ('api/inout.html#read_toml', 'marisco/inout.py'), 'marisco.inout.write_toml': ('api/inout.html#write_toml', 'marisco/inout.py')}, 'marisco.metadata': { 'marisco.metadata.BboxCB': ('api/metadata.html#bboxcb', 'marisco/metadata.py'), 'marisco.metadata.BboxCB.__call__': ('api/metadata.html#bboxcb.__call__', 'marisco/metadata.py'), From 6b015ab367417e2eb0e50688d545f21dc301574f Mon Sep 17 00:00:00 2001 From: niallmurphy93 Date: Mon, 14 Oct 2024 11:25:31 +0100 Subject: [PATCH 3/9] Improve sediment sample depth note - Clarify note applies to sediment samples - Use correct variable name 'sliceup' - Explain implications of missing depth info - Suggest flagging data for review - Provide rationale for excluding incomplete data This improves clarity --- install_configure_guide/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/install_configure_guide/README.md b/install_configure_guide/README.md index 483fad2..187b528 100644 --- a/install_configure_guide/README.md +++ b/install_configure_guide/README.md @@ -828,6 +828,10 @@ An integer value (the 'bodypar_id' defined in the LUT). ### Description: Top of sediment core interval relative to the water-sediment interface (cm). +**_NOTE:_** +For sediment samples, if the top slice depth is missing (i.e., sliceup = -1), there should not be multiple grab samples for the same latitude, longitude, and time. Multiple samples at the same location and time likely indicate a core was taken, but slice top and bottom information is missing. 
In such cases, these records should be flagged for review and excluded from analysis until complete depth information is provided. + + ### Lookup Table (LUT) in use: No From 2acac05a99dc62d2a3778e52d5e0bb067fb52234 Mon Sep 17 00:00:00 2001 From: Franck Albinet Date: Mon, 14 Oct 2024 12:35:56 +0200 Subject: [PATCH 4/9] add metadata documentation - sample uniquenes and others --- marisco/configs.py | 18 +- marisco/handlers/helcom.py | 21 +- marisco/utils.py | 22 +- nbs/api/configs.ipynb | 18 +- nbs/api/utils.ipynb | 16 + nbs/handlers/_geotraces.ipynb | 10531 ++++++++++++++-- .../_helcom-investigation-uniqueness.ipynb | 9716 ++++++++++++++ nbs/handlers/helcom.ipynb | 469 +- nbs/metadata/.notest | 0 nbs/metadata/field-definition.ipynb | 1471 +++ nbs/metadata/sample-uniqueness.ipynb | 184 + nbs/sidebar.yml | 5 +- 12 files changed, 21356 insertions(+), 1115 deletions(-) create mode 100644 nbs/handlers/_helcom-investigation-uniqueness.ipynb create mode 100644 nbs/metadata/.notest create mode 100644 nbs/metadata/field-definition.ipynb create mode 100644 nbs/metadata/sample-uniqueness.ipynb diff --git a/marisco/configs.py b/marisco/configs.py index 18b5781..ac812a0 100644 --- a/marisco/configs.py +++ b/marisco/configs.py @@ -264,7 +264,23 @@ def cache_path(): 'standard_name': 'sediment_type_tbd' }, 'dtype': 'sed_type_t' - } + }, + 'top': { + 'name': 'top', + 'attrs': { + 'long_name': 'Top depth of sediment layer', + 'standard_name': 'top_depth_of_sediment_layer_tbd' + }, + 'dtype': 'f4' + }, + 'bottom': { + 'name': 'bottom', + 'attrs': { + 'long_name': 'Bottom depth of sediment layer', + 'standard_name': 'bottom_depth_of_sediment_layer_tbd' + }, + 'dtype': 'f4' + }, }, 'suffixes': { 'uncertainty': { diff --git a/marisco/handlers/helcom.py b/marisco/handlers/helcom.py index bb8aabe..cd62c25 100644 --- a/marisco/handlers/helcom.py +++ b/marisco/handlers/helcom.py @@ -525,7 +525,7 @@ def _apply_dry_wet_ratio(self, df: pd.DataFrame) -> None: df.loc[df['dry_wet_ratio'] == 0, 
'dry_wet_ratio'] = np.NaN -# %% ../../nbs/handlers/helcom.ipynb 172 +# %% ../../nbs/handlers/helcom.ipynb 173 class ParseCoordinates(Callback): """ Get geographical coordinates from columns expressed in degrees decimal format @@ -575,13 +575,14 @@ def _safe_convert(self, value) -> str: print(f"Error converting value {value}: {e}") return value -# %% ../../nbs/handlers/helcom.ipynb 183 +# %% ../../nbs/handlers/helcom.ipynb 184 def get_common_rules( vars: dict, # Configuration dictionary encoding_type: str # Encoding type (`netcdf` or `openrefine`) ) -> dict: # Common renaming rules for NetCDF and OpenRefine. "Get common renaming rules for NetCDF and OpenRefine." common = { + 'KEY': 'key', 'lat': 'latitude' if encoding_type == 'openrefine' else vars['defaults']['lat']['name'], 'lon': 'longitude' if encoding_type == 'openrefine' else vars['defaults']['lon']['name'], 'time': 'begperiod' if encoding_type == 'openrefine' else vars['defaults']['time']['name'], @@ -615,7 +616,7 @@ def get_common_rules( return common -# %% ../../nbs/handlers/helcom.ipynb 184 +# %% ../../nbs/handlers/helcom.ipynb 185 def get_specific_rules( vars: dict, # Configuration dictionary encoding_type: str # Encoding type (`netcdf` or `openrefine`) @@ -630,6 +631,8 @@ def get_specific_rules( }, 'sediment': { 'sed_type': vars['sed']['sed_type']['name'], + 'top': vars['sed']['top']['name'], + 'bottom': vars['sed']['bottom']['name'], } } elif encoding_type == 'openrefine': @@ -654,7 +657,7 @@ def get_specific_rules( } } -# %% ../../nbs/handlers/helcom.ipynb 185 +# %% ../../nbs/handlers/helcom.ipynb 186 def get_renaming_rules( encoding_type: str = 'netcdf' # Encoding type (`netcdf` or `openrefine`) ) -> dict: # Renaming rules for NetCDF and OpenRefine. 
@@ -674,7 +677,7 @@ def get_renaming_rules( return dict(rules) -# %% ../../nbs/handlers/helcom.ipynb 186 +# %% ../../nbs/handlers/helcom.ipynb 187 class SelectAndRenameColumnCB(Callback): "Select and rename columns in a DataFrame based on renaming rules for a specified encoding type." def __init__(self, @@ -745,7 +748,7 @@ def _apply_renaming(self, return df, not_found_keys -# %% ../../nbs/handlers/helcom.ipynb 195 +# %% ../../nbs/handlers/helcom.ipynb 197 kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides', 'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure', 'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments', @@ -757,7 +760,7 @@ def _apply_renaming(self, 'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans', 'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)'] -# %% ../../nbs/handlers/helcom.ipynb 196 +# %% ../../nbs/handlers/helcom.ipynb 198 def get_attrs( tfm: Transformer, # Transformer object zotero_key: str, # Zotero dataset record key @@ -773,7 +776,7 @@ def get_attrs( KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)) ])() -# %% ../../nbs/handlers/helcom.ipynb 198 +# %% ../../nbs/handlers/helcom.ipynb 200 def enums_xtra( tfm: Transformer, # Transformer object vars: list # List of variables to extract from the transformer @@ -787,7 +790,7 @@ def enums_xtra( xtras[f'{var}_t'] = enums.filter(f'{var}_t', unique_vals) return xtras -# %% ../../nbs/handlers/helcom.ipynb 200 +# %% ../../nbs/handlers/helcom.ipynb 202 def encode( fname_in: str, # Input file name fname_out_nc: str, # Output file name diff --git a/marisco/utils.py b/marisco/utils.py index 9874774..8a8d77f 100644 --- a/marisco/utils.py +++ b/marisco/utils.py @@ -107,7 +107,7 @@ def _format_output(self): return df_lut.sort_values(by='match_score', ascending=False) -# %% ../nbs/api/utils.ipynb 16 +# %% 
../nbs/api/utils.ipynb 17 def has_valid_varname( var_names:list, # variable names cdl_path:str, # Path to MARIS CDL file (point of truth) @@ -140,7 +140,7 @@ def has_valid_varname( print(f'"{name}" variable name not found in MARIS CDL') return has_valid -# %% ../nbs/api/utils.ipynb 20 +# %% ../nbs/api/utils.ipynb 21 def get_bbox(df, coord_cols=('lon', 'lat') ): @@ -149,7 +149,7 @@ def get_bbox(df, arr = [(row[x], row[y]) for _, row in df.iterrows()] return MultiPoint(arr).envelope -# %% ../nbs/api/utils.ipynb 26 +# %% ../nbs/api/utils.ipynb 27 def ddmm_to_dd( ddmmmm:float # Coordinates in degrees/minutes decimal format ) -> float: # Coordinates in degrees decimal format @@ -158,7 +158,7 @@ def ddmm_to_dd( mins = mins * 100 return round(int(degs) + (mins / 60), 6) -# %% ../nbs/api/utils.ipynb 29 +# %% ../nbs/api/utils.ipynb 30 def download_files_in_folder(owner:str, repo:str, src_dir:str, @@ -192,7 +192,7 @@ def download_file(owner, repo, src_dir, dest_dir, fname): else: print(f"Error: {response.status_code}") -# %% ../nbs/api/utils.ipynb 31 +# %% ../nbs/api/utils.ipynb 32 def match_worms( name:str # Name of species to look up in WoRMS ): @@ -215,7 +215,7 @@ def match_worms( else: return -1 -# %% ../nbs/api/utils.ipynb 36 +# %% ../nbs/api/utils.ipynb 37 @dataclass class Match: "Match between a data provider name and a MARIS lookup table." 
@@ -224,7 +224,7 @@ class Match: source_name: str match_score: int -# %% ../nbs/api/utils.ipynb 37 +# %% ../nbs/api/utils.ipynb 38 def match_maris_lut( lut_path: str, # Path to MARIS species authoritative species look-up table data_provider_name: str, # Name of data provider nomenclature item to look up @@ -241,7 +241,7 @@ def match_maris_lut( df = df.sort_values(by='score', ascending=True)[:nresults] return df[[maris_id, maris_name, 'score']] -# %% ../nbs/api/utils.ipynb 44 +# %% ../nbs/api/utils.ipynb 45 def get_bbox(df, coord_cols=('lon', 'lat') ): @@ -249,7 +249,7 @@ def get_bbox(df, arr = [(row[x], row[y]) for _, row in df.iterrows()] return MultiPoint(arr).envelope -# %% ../nbs/api/utils.ipynb 51 +# %% ../nbs/api/utils.ipynb 52 def download_files_in_folder(owner:str, repo:str, src_dir:str, @@ -283,7 +283,7 @@ def download_file(owner, repo, src_dir, dest_dir, fname): else: print(f"Error: {response.status_code}") -# %% ../nbs/api/utils.ipynb 53 +# %% ../nbs/api/utils.ipynb 54 def match_worms( name:str # Name of species to look up in WoRMS ): @@ -306,7 +306,7 @@ def match_worms( else: return -1 -# %% ../nbs/api/utils.ipynb 58 +# %% ../nbs/api/utils.ipynb 59 def test_dfs( dfs1:dict, # First dictionary of DataFrames to compare dfs2:dict # Second dictionary of DataFrames to compare diff --git a/nbs/api/configs.ipynb b/nbs/api/configs.ipynb index 619d9f6..b12a53d 100644 --- a/nbs/api/configs.ipynb +++ b/nbs/api/configs.ipynb @@ -420,7 +420,23 @@ " 'standard_name': 'sediment_type_tbd'\n", " },\n", " 'dtype': 'sed_type_t'\n", - " }\n", + " },\n", + " 'top': {\n", + " 'name': 'top',\n", + " 'attrs': {\n", + " 'long_name': 'Top depth of sediment layer',\n", + " 'standard_name': 'top_depth_of_sediment_layer_tbd'\n", + " },\n", + " 'dtype': 'f4'\n", + " },\n", + " 'bottom': {\n", + " 'name': 'bottom',\n", + " 'attrs': {\n", + " 'long_name': 'Bottom depth of sediment layer',\n", + " 'standard_name': 'bottom_depth_of_sediment_layer_tbd'\n", + " },\n", + " 'dtype': 'f4'\n", 
+ " },\n", " },\n", " 'suffixes': {\n", " 'uncertainty': {\n", diff --git a/nbs/api/utils.ipynb b/nbs/api/utils.ipynb index cdbde2b..37763cd 100644 --- a/nbs/api/utils.ipynb +++ b/nbs/api/utils.ipynb @@ -304,6 +304,22 @@ " return df_lut.sort_values(by='match_score', ascending=False)\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "17bab3e5", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide \n", + "# Setting unique universal id\n", + "# TO BE DONE\n", + "# import hashlib\n", + "# combined_str = \"32.123_-180.435_2022-01-01T00:00:00.000\"\n", + "# hash_object = hashlib.sha256(combined_str.encode())\n", + "# unique_id = hash_object.hexdigest(); unique_id" + ] + }, { "attachments": {}, "cell_type": "markdown", diff --git a/nbs/handlers/_geotraces.ipynb b/nbs/handlers/_geotraces.ipynb index f7bcc00..bba4508 100644 --- a/nbs/handlers/_geotraces.ipynb +++ b/nbs/handlers/_geotraces.ipynb @@ -249,6 +249,13 @@ "id": "2ae36303-1f1f-443a-8611-1f1c69cc7254", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "df shape: (105417, 1188)\n" + ] + }, { "data": { "text/html": [ @@ -487,148 +494,14 @@ "source": [ "#| eval: false\n", "df = load_data(fname_in)\n", + "print(f'df shape: {df.shape}')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, - "id": "8cb03efb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['Cruise', 'Station:METAVAR:INDEXED_TEXT', 'Type',\n", - " 'yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]',\n", - " 'Latitude [degrees_north]', 'Bot. 
Depth [m]',\n", - " 'Operator's Cruise Name:METAVAR:INDEXED_TEXT',\n", - " 'Ship Name:METAVAR:INDEXED_TEXT', 'Period:METAVAR:INDEXED_TEXT',\n", - " ...\n", - " 'QV:SEADATANET.581', 'Co_CELL_CONC_BOTTLE [amol/cell]',\n", - " 'QV:SEADATANET.582', 'Ni_CELL_CONC_BOTTLE [amol/cell]',\n", - " 'QV:SEADATANET.583', 'Cu_CELL_CONC_BOTTLE [amol/cell]',\n", - " 'QV:SEADATANET.584', 'Zn_CELL_CONC_BOTTLE [amol/cell]',\n", - " 'QV:SEADATANET.585', 'QV:ODV:SAMPLE'],\n", - " dtype='object', length=1188)" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55d645fa-660d-4298-b72b-fb06dbd2e2d7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ctdsal_d_conc_sensor [pss-78]\n", - "salinity_d_conc_bottle\n", - "salinity_d_conc_pump\n", - "salinity_d_conc_fish\n", - "salinity_d_conc_uway\n", - "salinity_d_conc_boat_pump\n", - "ctdtmp_t_value_sensor [deg c]\n", - "oxygen_d_conc_bottle [umol/kg]\n", - "ctdoxy_d_conc_sensor [umol/kg]\n", - "U_236_238_T_RATIO_BOTTLE [per 10^12]\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "def find_print_col(s, cols, lower=True):\n", - " cols = cols if not lower else [col.lower() for col in cols]\n", - " for col in cols:\n", - " if s in col: print(col)\n", - "\n", - "find_print_col('sal', df.columns)\n", - "find_print_col('tmp', df.columns)\n", - "find_print_col('oxy', df.columns)\n", - "find_print_col('U_236_238', df.columns, lower=False)" - ] - }, - { - "cell_type": "markdown", - "id": "bf1a71c0", - "metadata": {}, - "source": [ - "## Data transformation pipeline" - ] - }, - { - "cell_type": "markdown", - "id": "c4407027-942c-4240-a92d-a40311c05afd", - "metadata": {}, - "source": [ - "### Select columns of interest" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55d50ff7-63dc-4719-a6c1-c2ff7e6ecb7f", - "metadata": {}, - 
"outputs": [], - "source": [ - "#| hide\n", - "# U_236_238\n", - "# Done: Th_232, I_129, Ac_227" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d3a7bba-39d7-4fc3-8f0e-fb83ff52dcc2", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "common_coi = ['yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]',\n", - " 'Latitude [degrees_north]', 'Bot. Depth [m]', 'DEPTH [m]']\n", - "\n", - "nuclides_pattern = ['^TRITI', '^Th_228', '^Th_23[024]', '^Pa_231', \n", - " '^U_236_[DT]', '^Be_', '^Cs_137', '^Pb_210', '^Po_210',\n", - " '^Ra_22[3468]', 'Np_237', '^Pu_239_[D]', '^Pu_240', '^Pu_239_Pu_240',\n", - " '^I_129', '^Ac_227'] \n", - "\n", - "class SelectColsOfInterestCB(Callback):\n", - " \"Select columns of interest.\"\n", - " def __init__(self, common_coi, nuclides_pattern): fc.store_attr()\n", - " def __call__(self, tfm):\n", - " nuc_of_interest = [c for c in tfm.df.columns if \n", - " any(re.match(pattern, c) for pattern in self.nuclides_pattern)]\n", - "\n", - " tfm.df = tfm.df[self.common_coi + nuc_of_interest]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9005e7e4-f0d7-4944-abea-60e5f5522e22", - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "tfm = Transformer(df, cbs=[\n", - " SelectColsOfInterestCB(common_coi, nuclides_pattern)\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52f52056-f58d-4ef5-b5de-dc8adf51eac0", + "id": "26d94fd4", "metadata": {}, "outputs": [ { @@ -655,24 +528,8 @@ " yyyy-mm-ddThh:mm:ss.sss\n", " Longitude [degrees_east]\n", " Latitude [degrees_north]\n", - " Bot. 
Depth [m]\n", " DEPTH [m]\n", - " TRITIUM_D_CONC_BOTTLE [TU]\n", - " Cs_137_D_CONC_BOTTLE [uBq/kg]\n", - " I_129_D_CONC_BOTTLE [atoms/kg]\n", - " Np_237_D_CONC_BOTTLE [uBq/kg]\n", - " Pu_239_D_CONC_BOTTLE [uBq/kg]\n", - " ...\n", - " Th_230_TP_CONC_PUMP [uBq/kg]\n", - " Th_230_SPT_CONC_PUMP [uBq/kg]\n", - " Th_230_LPT_CONC_PUMP [uBq/kg]\n", - " Th_232_TP_CONC_PUMP [pmol/kg]\n", - " Th_232_SPT_CONC_PUMP [pmol/kg]\n", - " Th_232_LPT_CONC_PUMP [pmol/kg]\n", - " Th_234_SPT_CONC_PUMP [mBq/kg]\n", - " Th_234_LPT_CONC_PUMP [mBq/kg]\n", - " Po_210_TP_CONC_UWAY [mBq/kg]\n", - " Pb_210_TP_CONC_UWAY [mBq/kg]\n", + " BODC Bottle Number:INTEGER\n", " \n", " \n", " \n", @@ -681,124 +538,43 @@ " 2014-05-17T22:29:00\n", " 349.29999\n", " 38.4329\n", - " 4854.0\n", " 2957.1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 1214048\n", " \n", " \n", " 1\n", " 2014-05-17T22:29:00\n", " 349.29999\n", " 38.4329\n", - " 4854.0\n", " 2957.2\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 1214039\n", " \n", " \n", " 2\n", " 2014-05-17T22:29:00\n", " 349.29999\n", " 38.4329\n", - " 4854.0\n", " 2957.2\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 1214027\n", " \n", " \n", " 3\n", " 2014-05-17T22:29:00\n", " 349.29999\n", " 38.4329\n", - " 4854.0\n", " 2957.2\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 1214018\n", " \n", " \n", " 4\n", " 2014-05-17T22:29:00\n", " 349.29999\n", " 38.4329\n", 
- " 4854.0\n", " 2957.2\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 1214036\n", " \n", " \n", "\n", - "

5 rows × 85 columns

\n", "" ], "text/plain": [ @@ -809,63 +585,12 @@ "3 2014-05-17T22:29:00 349.29999 38.4329 \n", "4 2014-05-17T22:29:00 349.29999 38.4329 \n", "\n", - " Bot. Depth [m] DEPTH [m] TRITIUM_D_CONC_BOTTLE [TU] \\\n", - "0 4854.0 2957.1 NaN \n", - "1 4854.0 2957.2 NaN \n", - "2 4854.0 2957.2 NaN \n", - "3 4854.0 2957.2 NaN \n", - "4 4854.0 2957.2 NaN \n", - "\n", - " Cs_137_D_CONC_BOTTLE [uBq/kg] I_129_D_CONC_BOTTLE [atoms/kg] \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " Np_237_D_CONC_BOTTLE [uBq/kg] Pu_239_D_CONC_BOTTLE [uBq/kg] ... \\\n", - "0 NaN NaN ... \n", - "1 NaN NaN ... \n", - "2 NaN NaN ... \n", - "3 NaN NaN ... \n", - "4 NaN NaN ... \n", - "\n", - " Th_230_TP_CONC_PUMP [uBq/kg] Th_230_SPT_CONC_PUMP [uBq/kg] \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " Th_230_LPT_CONC_PUMP [uBq/kg] Th_232_TP_CONC_PUMP [pmol/kg] \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " Th_232_SPT_CONC_PUMP [pmol/kg] Th_232_LPT_CONC_PUMP [pmol/kg] \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " Th_234_SPT_CONC_PUMP [mBq/kg] Th_234_LPT_CONC_PUMP [mBq/kg] \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " Po_210_TP_CONC_UWAY [mBq/kg] Pb_210_TP_CONC_UWAY [mBq/kg] \n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - "[5 rows x 85 columns]" + " DEPTH [m] BODC Bottle Number:INTEGER \n", + "0 2957.1 1214048 \n", + "1 2957.2 1214039 \n", + "2 2957.2 1214027 \n", + "3 2957.2 1214018 \n", + "4 2957.2 1214036 " ] }, "execution_count": null, @@ -874,117 +599,24 @@ } ], "source": [ - "#| eval: false\n", - "df_test = tfm()\n", - "df_test.head()" - ] - }, - { - "cell_type": "markdown", - "id": "f6ceb332-536a-4054-a85a-a56960fb28a1", - "metadata": {}, - 
"source": [ - "### Reshape: wide to long\n", + "# 'BODC Bottle Number:INTEGER'\n", "\n", - "So that we can extract information such as sample methodology, filtering status, units included in Geotraces nuclides name." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b060bb07-5565-4928-8b43-4abc5e64eb97", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "class WideToLongCB(Callback):\n", - " \"\"\"\n", - " Get Geotraces nuclide names as values not column names \n", - " to extract contained information (unit, sampling method, ...).\n", - " \"\"\"\n", - " def __init__(self, common_coi, nuclides_pattern, \n", - " var_name='nuclide', value_name='value'): \n", - " fc.store_attr()\n", - " \n", - " def __call__(self, tfm):\n", - " nuc_of_interest = [c for c in tfm.df.columns if \n", - " any(re.match(pattern, c) for pattern in self.nuclides_pattern)]\n", - " tfm.df = pd.melt(tfm.df, id_vars=self.common_coi, value_vars=nuc_of_interest, \n", - " var_name=self.var_name, value_name=self.value_name)\n", - " tfm.df.dropna(subset='value', inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c95cc28d-b236-412d-9378-f06e85f95560", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(26745, 7)" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| eval: false\n", - "tfm = Transformer(df, cbs=[\n", - " SelectColsOfInterestCB(common_coi, nuclides_pattern),\n", - " WideToLongCB(common_coi, nuclides_pattern)\n", - "])\n", - "df_test = tfm()\n", - "df_test.shape" - ] - }, - { - "cell_type": "markdown", - "id": "de71e3a4-3f0e-4392-8338-1f9ef907f5da", - "metadata": {}, - "source": [ - "### Extract" - ] - }, - { - "cell_type": "markdown", - "id": "adee33d8-310b-43db-9eea-58ccaeed2065", - "metadata": {}, - "source": [ - "#### Unit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"3e982489-7c69-4b6d-9930-8f786220ad5b", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "class ExtractUnitCB(Callback):\n", - " \"\"\"\n", - " Extract units from nuclide names.\n", - " \"\"\"\n", - " def __init__(self, var_name='nuclide'): \n", - " fc.store_attr()\n", - " self.unit_col_name = cdl_cfg()['vars']['suffixes']['unit']['name']\n", + "cols_unique = [\n", + " 'yyyy-mm-ddThh:mm:ss.sss', \n", + " 'Longitude [degrees_east]',\n", + " 'Latitude [degrees_north]',\n", + " 'DEPTH [m]',\n", + " 'BODC Bottle Number:INTEGER'\n", + " # 'Rosette Bottle Number:INTEGER',\n", + "]\n", "\n", - " def extract_unit(self, s):\n", - " match = re.search(r'\\[(.*?)\\]', s)\n", - " return match.group(1) if match else None\n", - " \n", - " def __call__(self, tfm):\n", - " tfm.df[self.unit_col_name] = tfm.df[self.var_name].apply(self.extract_unit)" + "df[cols_unique].head()" ] }, { "cell_type": "code", "execution_count": null, - "id": "739decc1-64e8-4ea3-8f09-89a8b04dc1f8", + "id": "df416ab1", "metadata": {}, "outputs": [ { @@ -1008,97 +640,405 @@ " \n", " \n", " \n", + " Cruise\n", + " Station:METAVAR:INDEXED_TEXT\n", + " Type\n", " yyyy-mm-ddThh:mm:ss.sss\n", " Longitude [degrees_east]\n", " Latitude [degrees_north]\n", " Bot. 
Depth [m]\n", - " DEPTH [m]\n", - " nuclide\n", - " value\n", - " _unit\n", + " Operator's Cruise Name:METAVAR:INDEXED_TEXT\n", + " Ship Name:METAVAR:INDEXED_TEXT\n", + " Period:METAVAR:INDEXED_TEXT\n", + " ...\n", + " QV:SEADATANET.581\n", + " Co_CELL_CONC_BOTTLE [amol/cell]\n", + " QV:SEADATANET.582\n", + " Ni_CELL_CONC_BOTTLE [amol/cell]\n", + " QV:SEADATANET.583\n", + " Cu_CELL_CONC_BOTTLE [amol/cell]\n", + " QV:SEADATANET.584\n", + " Zn_CELL_CONC_BOTTLE [amol/cell]\n", + " QV:SEADATANET.585\n", + " QV:ODV:SAMPLE\n", " \n", " \n", " \n", " \n", - " 9223\n", - " 2010-10-17T00:13:29\n", - " 350.33792\n", - " 38.3271\n", - " 2827.0\n", - " 17.8\n", - " TRITIUM_D_CONC_BOTTLE [TU]\n", - " 0.733\n", - " TU\n", + " 9571\n", + " GA03\n", + " Station 12\n", + " B\n", + " 2010-11-02T17:03:15\n", + " 335.5022\n", + " 17.4021\n", + " 3548.0\n", + " KN199\n", + " Knorr\n", + " 15/10/2010 - 04/11/2010\n", + " ...\n", + " 9\n", + " NaN\n", + " 9\n", + " 0.07\n", + " 1\n", + " 1.48\n", + " 1\n", + " 21.700001\n", + " 1\n", + " 1\n", " \n", " \n", - " 9231\n", - " 2010-10-17T00:13:29\n", - " 350.33792\n", - " 38.3271\n", - " 2827.0\n", - " 34.7\n", - " TRITIUM_D_CONC_BOTTLE [TU]\n", - " 0.696\n", - " TU\n", + " 9573\n", + " GA03\n", + " Station 12\n", + " B\n", + " 2010-11-02T17:03:15\n", + " 335.5022\n", + " 17.4021\n", + " 3548.0\n", + " KN199\n", + " Knorr\n", + " 15/10/2010 - 04/11/2010\n", + " ...\n", + " 1\n", + " 0.10\n", + " 1\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " 1.040000\n", + " 1\n", + " 1\n", " \n", " \n", - " 9237\n", - " 2010-10-17T00:13:29\n", - " 350.33792\n", - " 38.3271\n", - " 2827.0\n", - " 67.5\n", - " TRITIUM_D_CONC_BOTTLE [TU]\n", - " 0.718\n", - " TU\n", + " 9574\n", + " GA03\n", + " Station 12\n", + " B\n", + " 2010-11-02T17:03:15\n", + " 335.5022\n", + " 17.4021\n", + " 3548.0\n", + " KN199\n", + " Knorr\n", + " 15/10/2010 - 04/11/2010\n", + " ...\n", + " 1\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " 
22.100000\n", + " 1\n", + " 1\n", " \n", " \n", - " 9244\n", - " 2010-10-17T00:13:29\n", - " 350.33792\n", - " 38.3271\n", - " 2827.0\n", - " 91.9\n", - " TRITIUM_D_CONC_BOTTLE [TU]\n", - " 0.709\n", - " TU\n", + " 9576\n", + " GA03\n", + " Station 12\n", + " B\n", + " 2010-11-02T17:03:15\n", + " 335.5022\n", + " 17.4021\n", + " 3548.0\n", + " KN199\n", + " Knorr\n", + " 15/10/2010 - 04/11/2010\n", + " ...\n", + " 1\n", + " 0.10\n", + " 1\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " 2.500000\n", + " 1\n", + " 1\n", " \n", " \n", - " 9256\n", - " 2010-10-17T00:13:29\n", - " 350.33792\n", - " 38.3271\n", - " 2827.0\n", - " 136.6\n", - " TRITIUM_D_CONC_BOTTLE [TU]\n", - " 0.692\n", - " TU\n", + " 9577\n", + " GA03\n", + " Station 12\n", + " B\n", + " 2010-11-02T17:03:15\n", + " 335.5022\n", + " 17.4021\n", + " 3548.0\n", + " KN199\n", + " Knorr\n", + " 15/10/2010 - 04/11/2010\n", + " ...\n", + " 1\n", + " 0.07\n", + " 1\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " 0.560000\n", + " 1\n", + " 1\n", + " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 92211\n", + " GP16\n", + " 36\n", + " B\n", + " 2013-12-17T04:09:34\n", + " 207.9930\n", + " -10.5019\n", + " 5162.0\n", + " TN303\n", + " Thomas G. Thompson\n", + " 25/10/2013 - 20/12/2013\n", + " ...\n", + " 1\n", + " NaN\n", + " 9\n", + " 3.72\n", + " 1\n", + " NaN\n", + " 9\n", + " 6.950000\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 92212\n", + " GP16\n", + " 36\n", + " B\n", + " 2013-12-17T04:09:34\n", + " 207.9930\n", + " -10.5019\n", + " 5162.0\n", + " TN303\n", + " Thomas G. 
Thompson\n", + " 25/10/2013 - 20/12/2013\n", + " ...\n", + " 1\n", + " 1.06\n", + " 1\n", + " 1.68\n", + " 1\n", + " NaN\n", + " 9\n", + " 11.300000\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 92213\n", + " GP16\n", + " 36\n", + " B\n", + " 2013-12-17T04:09:34\n", + " 207.9930\n", + " -10.5019\n", + " 5162.0\n", + " TN303\n", + " Thomas G. Thompson\n", + " 25/10/2013 - 20/12/2013\n", + " ...\n", + " 1\n", + " NaN\n", + " 9\n", + " 14.70\n", + " 1\n", + " NaN\n", + " 9\n", + " 16.299999\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 92214\n", + " GP16\n", + " 36\n", + " B\n", + " 2013-12-17T04:09:34\n", + " 207.9930\n", + " -10.5019\n", + " 5162.0\n", + " TN303\n", + " Thomas G. Thompson\n", + " 25/10/2013 - 20/12/2013\n", + " ...\n", + " 1\n", + " NaN\n", + " 9\n", + " 2.05\n", + " 1\n", + " NaN\n", + " 9\n", + " 12.800000\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 92215\n", + " GP16\n", + " 36\n", + " B\n", + " 2013-12-17T04:09:34\n", + " 207.9930\n", + " -10.5019\n", + " 5162.0\n", + " TN303\n", + " Thomas G. Thompson\n", + " 25/10/2013 - 20/12/2013\n", + " ...\n", + " 1\n", + " NaN\n", + " 9\n", + " 1.07\n", + " 1\n", + " NaN\n", + " 9\n", + " 22.900000\n", + " 1\n", + " 1\n", " \n", " \n", "\n", + "

423 rows × 1188 columns

\n", "" ], "text/plain": [ - " yyyy-mm-ddThh:mm:ss.sss Longitude [degrees_east] \\\n", - "9223 2010-10-17T00:13:29 350.33792 \n", - "9231 2010-10-17T00:13:29 350.33792 \n", - "9237 2010-10-17T00:13:29 350.33792 \n", - "9244 2010-10-17T00:13:29 350.33792 \n", - "9256 2010-10-17T00:13:29 350.33792 \n", + " Cruise Station:METAVAR:INDEXED_TEXT Type yyyy-mm-ddThh:mm:ss.sss \\\n", + "9571 GA03 Station 12 B 2010-11-02T17:03:15 \n", + "9573 GA03 Station 12 B 2010-11-02T17:03:15 \n", + "9574 GA03 Station 12 B 2010-11-02T17:03:15 \n", + "9576 GA03 Station 12 B 2010-11-02T17:03:15 \n", + "9577 GA03 Station 12 B 2010-11-02T17:03:15 \n", + "... ... ... ... ... \n", + "92211 GP16 36 B 2013-12-17T04:09:34 \n", + "92212 GP16 36 B 2013-12-17T04:09:34 \n", + "92213 GP16 36 B 2013-12-17T04:09:34 \n", + "92214 GP16 36 B 2013-12-17T04:09:34 \n", + "92215 GP16 36 B 2013-12-17T04:09:34 \n", "\n", - " Latitude [degrees_north] Bot. Depth [m] DEPTH [m] \\\n", - "9223 38.3271 2827.0 17.8 \n", - "9231 38.3271 2827.0 34.7 \n", - "9237 38.3271 2827.0 67.5 \n", - "9244 38.3271 2827.0 91.9 \n", - "9256 38.3271 2827.0 136.6 \n", + " Longitude [degrees_east] Latitude [degrees_north] Bot. Depth [m] \\\n", + "9571 335.5022 17.4021 3548.0 \n", + "9573 335.5022 17.4021 3548.0 \n", + "9574 335.5022 17.4021 3548.0 \n", + "9576 335.5022 17.4021 3548.0 \n", + "9577 335.5022 17.4021 3548.0 \n", + "... ... ... ... 
\n", + "92211 207.9930 -10.5019 5162.0 \n", + "92212 207.9930 -10.5019 5162.0 \n", + "92213 207.9930 -10.5019 5162.0 \n", + "92214 207.9930 -10.5019 5162.0 \n", + "92215 207.9930 -10.5019 5162.0 \n", "\n", - " nuclide value _unit \n", - "9223 TRITIUM_D_CONC_BOTTLE [TU] 0.733 TU \n", - "9231 TRITIUM_D_CONC_BOTTLE [TU] 0.696 TU \n", - "9237 TRITIUM_D_CONC_BOTTLE [TU] 0.718 TU \n", - "9244 TRITIUM_D_CONC_BOTTLE [TU] 0.709 TU \n", - "9256 TRITIUM_D_CONC_BOTTLE [TU] 0.692 TU " + " Operator's Cruise Name:METAVAR:INDEXED_TEXT \\\n", + "9571 KN199 \n", + "9573 KN199 \n", + "9574 KN199 \n", + "9576 KN199 \n", + "9577 KN199 \n", + "... ... \n", + "92211 TN303 \n", + "92212 TN303 \n", + "92213 TN303 \n", + "92214 TN303 \n", + "92215 TN303 \n", + "\n", + " Ship Name:METAVAR:INDEXED_TEXT Period:METAVAR:INDEXED_TEXT ... \\\n", + "9571 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9573 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9574 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9576 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9577 Knorr 15/10/2010 - 04/11/2010 ... \n", + "... ... ... ... \n", + "92211 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", + "92212 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", + "92213 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", + "92214 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", + "92215 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", + "\n", + " QV:SEADATANET.581 Co_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.582 \\\n", + "9571 9 NaN 9 \n", + "9573 1 0.10 1 \n", + "9574 1 NaN 9 \n", + "9576 1 0.10 1 \n", + "9577 1 0.07 1 \n", + "... ... ... ... \n", + "92211 1 NaN 9 \n", + "92212 1 1.06 1 \n", + "92213 1 NaN 9 \n", + "92214 1 NaN 9 \n", + "92215 1 NaN 9 \n", + "\n", + " Ni_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.583 \\\n", + "9571 0.07 1 \n", + "9573 NaN 9 \n", + "9574 NaN 9 \n", + "9576 NaN 9 \n", + "9577 NaN 9 \n", + "... ... ... 
\n", + "92211 3.72 1 \n", + "92212 1.68 1 \n", + "92213 14.70 1 \n", + "92214 2.05 1 \n", + "92215 1.07 1 \n", + "\n", + " Cu_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.584 \\\n", + "9571 1.48 1 \n", + "9573 NaN 9 \n", + "9574 NaN 9 \n", + "9576 NaN 9 \n", + "9577 NaN 9 \n", + "... ... ... \n", + "92211 NaN 9 \n", + "92212 NaN 9 \n", + "92213 NaN 9 \n", + "92214 NaN 9 \n", + "92215 NaN 9 \n", + "\n", + " Zn_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.585 QV:ODV:SAMPLE \n", + "9571 21.700001 1 1 \n", + "9573 1.040000 1 1 \n", + "9574 22.100000 1 1 \n", + "9576 2.500000 1 1 \n", + "9577 0.560000 1 1 \n", + "... ... ... ... \n", + "92211 6.950000 1 1 \n", + "92212 11.300000 1 1 \n", + "92213 16.299999 1 1 \n", + "92214 12.800000 1 1 \n", + "92215 22.900000 1 1 \n", + "\n", + "[423 rows x 1188 columns]" ] }, "execution_count": null, @@ -1107,75 +1047,34 @@ } ], "source": [ - "#| eval: false\n", - "tfm = Transformer(df, cbs=[\n", - " SelectColsOfInterestCB(common_coi, nuclides_pattern),\n", - " WideToLongCB(common_coi, nuclides_pattern),\n", - " ExtractUnitCB()\n", - "])\n", - "\n", - "df_test = tfm()\n", - "df_test.head()" - ] - }, - { - "cell_type": "markdown", - "id": "219fc817-6700-4c3a-b353-08cc89c05538", - "metadata": {}, - "source": [ - "#### Filtering status" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "efb9477e-f593-4d15-b8b8-c073bd6bb590", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "phase = {\n", - " 'D': {'filt': 1, 'group': 'seawater'},\n", - " 'T': {'filt': 2, 'group': 'seawater'},\n", - " 'TP': {'filt': 1, 'group': 'suspended-matter'}, \n", - " 'LPT': {'filt': 1, 'group': 'suspended-matter'},\n", - " 'SPT': {'filt': 1, 'group': 'suspended-matter'}}" + "df[df[cols_unique].duplicated()]" ] }, { "cell_type": "code", "execution_count": null, - "id": "3ba72b3a-a013-4b5d-881a-9fd0a7e8b74c", + "id": "2dba1f37", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([nan, 24.0, 
23.0, ..., 3056.0, 3057.0, 0], dtype=object)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#| exports\n", - "class ExtractFilteringStatusCB(Callback):\n", - " \"Extract filtering status from nuclide names.\"\n", - " def __init__(self, phase, var_name='nuclide'): \n", - " fc.store_attr()\n", - " self.filt_col_name = cdl_cfg()['vars']['suffixes']['filtered']['name']\n", - "\n", - " def extract_filt_status(self, s):\n", - " matched_string = self.match(s)\n", - " return self.phase[matched_string.group(1)]['filt'] if matched_string else None\n", - "\n", - " def match(self, s):\n", - " return re.search(r'_(' + '|'.join(self.phase.keys()) + ')_', s)\n", - " \n", - " def extract_group(self, s):\n", - " matched_string = self.match(s)\n", - " return self.phase[matched_string.group(1)]['group'] if matched_string else None\n", - " \n", - " def __call__(self, tfm):\n", - " tfm.df[self.filt_col_name] = tfm.df[self.var_name].apply(self.extract_filt_status)\n", - " tfm.df['group'] = tfm.df[self.var_name].apply(self.extract_group)" + "df['GEOTRACES Sample ID:INDEXED_TEXT'].unique()" ] }, { "cell_type": "code", "execution_count": null, - "id": "f4821951-c14a-4315-8d03-ec82f83a242a", + "id": "2b35da85", "metadata": {}, "outputs": [ { @@ -1199,109 +1098,405 @@ " \n", " \n", " \n", + " Cruise\n", + " Station:METAVAR:INDEXED_TEXT\n", + " Type\n", " yyyy-mm-ddThh:mm:ss.sss\n", " Longitude [degrees_east]\n", " Latitude [degrees_north]\n", " Bot. 
Depth [m]\n", - " DEPTH [m]\n", - " nuclide\n", - " value\n", - " _unit\n", - " _filt\n", - " group\n", + " Operator's Cruise Name:METAVAR:INDEXED_TEXT\n", + " Ship Name:METAVAR:INDEXED_TEXT\n", + " Period:METAVAR:INDEXED_TEXT\n", + " ...\n", + " QV:SEADATANET.581\n", + " Co_CELL_CONC_BOTTLE [amol/cell]\n", + " QV:SEADATANET.582\n", + " Ni_CELL_CONC_BOTTLE [amol/cell]\n", + " QV:SEADATANET.583\n", + " Cu_CELL_CONC_BOTTLE [amol/cell]\n", + " QV:SEADATANET.584\n", + " Zn_CELL_CONC_BOTTLE [amol/cell]\n", + " QV:SEADATANET.585\n", + " QV:ODV:SAMPLE\n", " \n", " \n", " \n", " \n", - " 9223\n", - " 2010-10-17T00:13:29\n", - " 350.33792\n", - " 38.3271\n", - " 2827.0\n", - " 17.8\n", - " TRITIUM_D_CONC_BOTTLE [TU]\n", - " 0.733\n", - " TU\n", + " 5372\n", + " GA02\n", + " 001\n", + " B\n", + " 2011-03-05T20:45:44\n", + " 307.31189\n", + " -49.5472\n", + " 2322.0\n", + " JC057\n", + " RRS James Cook\n", + " 02/03/2011 - 06/04/2011\n", + " ...\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", " 1\n", - " seawater\n", " \n", " \n", - " 9231\n", - " 2010-10-17T00:13:29\n", - " 350.33792\n", - " 38.3271\n", - " 2827.0\n", - " 34.7\n", - " TRITIUM_D_CONC_BOTTLE [TU]\n", - " 0.696\n", - " TU\n", + " 5373\n", + " GA02\n", + " 001\n", + " B\n", + " 2011-03-05T20:45:44\n", + " 307.31189\n", + " -49.5472\n", + " 2322.0\n", + " JC057\n", + " RRS James Cook\n", + " 02/03/2011 - 06/04/2011\n", + " ...\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", " 1\n", - " seawater\n", " \n", " \n", - " 9237\n", - " 2010-10-17T00:13:29\n", - " 350.33792\n", - " 38.3271\n", - " 2827.0\n", - " 67.5\n", - " TRITIUM_D_CONC_BOTTLE [TU]\n", - " 0.718\n", - " TU\n", + " 5443\n", + " GA02\n", + " 001\n", + " B\n", + " 2012-07-31T11:17:29\n", + " 325.02731\n", + " 60.6544\n", + " 2971.0\n", + " PE358\n", + " Pelagia\n", + " 29/07/2012 - 19/08/2012\n", + " ...\n", + " 9\n", + " NaN\n", + " 9\n", + " 
NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", " 1\n", - " seawater\n", " \n", " \n", - " 9244\n", - " 2010-10-17T00:13:29\n", - " 350.33792\n", - " 38.3271\n", - " 2827.0\n", - " 91.9\n", - " TRITIUM_D_CONC_BOTTLE [TU]\n", - " 0.709\n", - " TU\n", + " 5446\n", + " GA02\n", + " 001\n", + " B\n", + " 2010-04-29T14:37:28\n", + " 349.01709\n", + " 60.4053\n", + " 756.0\n", + " PE319\n", + " Pelagia\n", + " 28/04/2010 - 26/05/2010\n", + " ...\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", " 1\n", - " seawater\n", " \n", " \n", - " 9256\n", - " 2010-10-17T00:13:29\n", - " 350.33792\n", - " 38.3271\n", - " 2827.0\n", - " 136.6\n", - " TRITIUM_D_CONC_BOTTLE [TU]\n", - " 0.692\n", - " TU\n", + " 5491\n", + " GA02\n", + " 002\n", + " B\n", + " 2012-08-02T00:54:40\n", + " 320.49039\n", + " 59.2002\n", + " 3031.0\n", + " PE358\n", + " Pelagia\n", + " 29/07/2012 - 19/08/2012\n", + " ...\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " 1\n", + " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 100417\n", + " GPpr11\n", + " (8)\n", + " B\n", + " 2016-04-02T04:53:15\n", + " 147.07700\n", + " -50.3882\n", + " 4416.0\n", + " IN2016_V02\n", + " Investigator\n", + " 15/03/2016 - 13/04/2016\n", + " ...\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " 1\n", + " \n", + " \n", + " 100418\n", + " GPpr11\n", + " (8)\n", + " B\n", + " 2016-04-02T04:53:15\n", + " 147.07700\n", + " -50.3882\n", + " 4416.0\n", + " IN2016_V02\n", + " Investigator\n", + " 15/03/2016 - 13/04/2016\n", + " ...\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", 
+ " NaN\n", + " 9\n", + " 1\n", + " \n", + " \n", + " 100419\n", + " GPpr11\n", + " (8)\n", + " B\n", + " 2016-04-02T04:53:15\n", + " 147.07700\n", + " -50.3882\n", + " 4416.0\n", + " IN2016_V02\n", + " Investigator\n", + " 15/03/2016 - 13/04/2016\n", + " ...\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " 1\n", + " \n", + " \n", + " 100421\n", + " GPpr11\n", + " (8)\n", + " B\n", + " 2016-04-02T04:53:15\n", + " 147.07700\n", + " -50.3882\n", + " 4416.0\n", + " IN2016_V02\n", + " Investigator\n", + " 15/03/2016 - 13/04/2016\n", + " ...\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " 1\n", + " \n", + " \n", + " 100474\n", + " GPpr11\n", + " (8)\n", + " B\n", + " 2016-04-02T04:53:15\n", + " 147.07700\n", + " -50.3882\n", + " 4416.0\n", + " IN2016_V02\n", + " Investigator\n", + " 15/03/2016 - 13/04/2016\n", + " ...\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", + " NaN\n", + " 9\n", " 1\n", - " seawater\n", " \n", " \n", "\n", + "

802 rows × 1188 columns

\n", "" ], "text/plain": [ - " yyyy-mm-ddThh:mm:ss.sss Longitude [degrees_east] \\\n", - "9223 2010-10-17T00:13:29 350.33792 \n", - "9231 2010-10-17T00:13:29 350.33792 \n", - "9237 2010-10-17T00:13:29 350.33792 \n", - "9244 2010-10-17T00:13:29 350.33792 \n", - "9256 2010-10-17T00:13:29 350.33792 \n", + " Cruise Station:METAVAR:INDEXED_TEXT Type yyyy-mm-ddThh:mm:ss.sss \\\n", + "5372 GA02 001 B 2011-03-05T20:45:44 \n", + "5373 GA02 001 B 2011-03-05T20:45:44 \n", + "5443 GA02 001 B 2012-07-31T11:17:29 \n", + "5446 GA02 001 B 2010-04-29T14:37:28 \n", + "5491 GA02 002 B 2012-08-02T00:54:40 \n", + "... ... ... ... ... \n", + "100417 GPpr11 (8) B 2016-04-02T04:53:15 \n", + "100418 GPpr11 (8) B 2016-04-02T04:53:15 \n", + "100419 GPpr11 (8) B 2016-04-02T04:53:15 \n", + "100421 GPpr11 (8) B 2016-04-02T04:53:15 \n", + "100474 GPpr11 (8) B 2016-04-02T04:53:15 \n", "\n", - " Latitude [degrees_north] Bot. Depth [m] DEPTH [m] \\\n", - "9223 38.3271 2827.0 17.8 \n", - "9231 38.3271 2827.0 34.7 \n", - "9237 38.3271 2827.0 67.5 \n", - "9244 38.3271 2827.0 91.9 \n", - "9256 38.3271 2827.0 136.6 \n", + " Longitude [degrees_east] Latitude [degrees_north] Bot. Depth [m] \\\n", + "5372 307.31189 -49.5472 2322.0 \n", + "5373 307.31189 -49.5472 2322.0 \n", + "5443 325.02731 60.6544 2971.0 \n", + "5446 349.01709 60.4053 756.0 \n", + "5491 320.49039 59.2002 3031.0 \n", + "... ... ... ... 
\n", + "100417 147.07700 -50.3882 4416.0 \n", + "100418 147.07700 -50.3882 4416.0 \n", + "100419 147.07700 -50.3882 4416.0 \n", + "100421 147.07700 -50.3882 4416.0 \n", + "100474 147.07700 -50.3882 4416.0 \n", "\n", - " nuclide value _unit _filt group \n", - "9223 TRITIUM_D_CONC_BOTTLE [TU] 0.733 TU 1 seawater \n", - "9231 TRITIUM_D_CONC_BOTTLE [TU] 0.696 TU 1 seawater \n", - "9237 TRITIUM_D_CONC_BOTTLE [TU] 0.718 TU 1 seawater \n", - "9244 TRITIUM_D_CONC_BOTTLE [TU] 0.709 TU 1 seawater \n", - "9256 TRITIUM_D_CONC_BOTTLE [TU] 0.692 TU 1 seawater " + " Operator's Cruise Name:METAVAR:INDEXED_TEXT \\\n", + "5372 JC057 \n", + "5373 JC057 \n", + "5443 PE358 \n", + "5446 PE319 \n", + "5491 PE358 \n", + "... ... \n", + "100417 IN2016_V02 \n", + "100418 IN2016_V02 \n", + "100419 IN2016_V02 \n", + "100421 IN2016_V02 \n", + "100474 IN2016_V02 \n", + "\n", + " Ship Name:METAVAR:INDEXED_TEXT Period:METAVAR:INDEXED_TEXT ... \\\n", + "5372 RRS James Cook 02/03/2011 - 06/04/2011 ... \n", + "5373 RRS James Cook 02/03/2011 - 06/04/2011 ... \n", + "5443 Pelagia 29/07/2012 - 19/08/2012 ... \n", + "5446 Pelagia 28/04/2010 - 26/05/2010 ... \n", + "5491 Pelagia 29/07/2012 - 19/08/2012 ... \n", + "... ... ... ... \n", + "100417 Investigator 15/03/2016 - 13/04/2016 ... \n", + "100418 Investigator 15/03/2016 - 13/04/2016 ... \n", + "100419 Investigator 15/03/2016 - 13/04/2016 ... \n", + "100421 Investigator 15/03/2016 - 13/04/2016 ... \n", + "100474 Investigator 15/03/2016 - 13/04/2016 ... \n", + "\n", + " QV:SEADATANET.581 Co_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.582 \\\n", + "5372 9 NaN 9 \n", + "5373 9 NaN 9 \n", + "5443 9 NaN 9 \n", + "5446 9 NaN 9 \n", + "5491 9 NaN 9 \n", + "... ... ... ... \n", + "100417 9 NaN 9 \n", + "100418 9 NaN 9 \n", + "100419 9 NaN 9 \n", + "100421 9 NaN 9 \n", + "100474 9 NaN 9 \n", + "\n", + " Ni_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.583 \\\n", + "5372 NaN 9 \n", + "5373 NaN 9 \n", + "5443 NaN 9 \n", + "5446 NaN 9 \n", + "5491 NaN 9 \n", + "... 
... ... \n", + "100417 NaN 9 \n", + "100418 NaN 9 \n", + "100419 NaN 9 \n", + "100421 NaN 9 \n", + "100474 NaN 9 \n", + "\n", + " Cu_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.584 \\\n", + "5372 NaN 9 \n", + "5373 NaN 9 \n", + "5443 NaN 9 \n", + "5446 NaN 9 \n", + "5491 NaN 9 \n", + "... ... ... \n", + "100417 NaN 9 \n", + "100418 NaN 9 \n", + "100419 NaN 9 \n", + "100421 NaN 9 \n", + "100474 NaN 9 \n", + "\n", + " Zn_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.585 QV:ODV:SAMPLE \n", + "5372 NaN 9 1 \n", + "5373 NaN 9 1 \n", + "5443 NaN 9 1 \n", + "5446 NaN 9 1 \n", + "5491 NaN 9 1 \n", + "... ... ... ... \n", + "100417 NaN 9 1 \n", + "100418 NaN 9 1 \n", + "100419 NaN 9 1 \n", + "100421 NaN 9 1 \n", + "100474 NaN 9 1 \n", + "\n", + "[802 rows x 1188 columns]" ] }, "execution_count": null, @@ -1310,91 +1505,4608 @@ } ], "source": [ - "#|eval: false\n", - "tfm = Transformer(df, cbs=[\n", - " SelectColsOfInterestCB(common_coi, nuclides_pattern),\n", - " WideToLongCB(common_coi, nuclides_pattern),\n", - " ExtractUnitCB(),\n", - " ExtractFilteringStatusCB(phase)\n", - "])\n", - "\n", - "df_test = tfm()\n", - "df_test.head()" - ] - }, - { - "cell_type": "markdown", - "id": "53acd63b-9fb9-4f51-a525-e020602893fc", - "metadata": {}, - "source": [ - "#### Sampling method" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7c79502-f09e-49c0-851b-cdb2eca82eac", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "# To be validated\n", - "smp_method = {\n", - " 'BOTTLE': 1,\n", - " 'FISH': 18,\n", - " 'PUMP': 14,\n", - " 'UWAY': 24}" + "df[df['GEOTRACES Sample ID:INDEXED_TEXT'] == 24.0]" ] }, { "cell_type": "code", "execution_count": null, - "id": "3b4663f8-6cb1-45c3-8437-97a6ba9c5214", + "id": "530a6a62", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([19., 16., 12., 9., 15., 22., 13., 2., 23., 3., 18., 14., 21.,\n", + " 10., 17., 7., 4., 6., 1., 8., 24., 5., 11., 20., nan, 25.,\n", + " 26., 
36., 35., 34., 32., 33., 31., 28., 27., 30., 29.])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#| exports\n", - "class ExtractSamplingMethodCB(Callback):\n", - " \"Extract sampling method from nuclide names.\"\n", - " def __init__(self, smp_method, var_name='nuclide'): \n", - " fc.store_attr()\n", - " self.smp_method_col_name = cdl_cfg()['vars']['suffixes']['sampling_method']['name']\n", - "\n", - " def extract_smp_method(self, s):\n", - " match = re.search(r'_(' + '|'.join(self.smp_method.keys()) + ') ', s)\n", - " return self.smp_method[match.group(1)] if match else None\n", - " \n", - " def __call__(self, tfm):\n", - " tfm.df[self.smp_method_col_name] = tfm.df[self.var_name].apply(self.extract_smp_method)" + "df['Rosette Bottle Number:INTEGER'].unique()" ] }, { "cell_type": "code", "execution_count": null, - "id": "429020cf-3e5d-4efc-963d-af82bd1a0820", + "id": "c6eeadf2", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", + "name": "stdout", + "output_type": "stream", + "text": [ + "Cruise\n", + "['GA01' 'GA02' 'GA03' 'GA04N' 'GA04S']\n", + "\n", + "Station:METAVAR:INDEXED_TEXT\n", + "[0 1 2 3 4]\n", + "\n", + "Type\n", + "['B']\n", + "\n", + "yyyy-mm-ddThh:mm:ss.sss\n", + "['2014-05-17T22:29:00' '2014-05-19T18:19:18' '2014-05-20T23:04:59'\n", + " '2014-05-21T04:20:56' '2014-05-21T16:32:53']\n", + "\n", + "Longitude [degrees_east]\n", + "[349.29999 349.96399 350.54053 350.35678 350.23337]\n", + "\n", + "Latitude [degrees_north]\n", + "[38.4329 40.3333 40.3331 40.3329 40.333 ]\n", + "\n", + "Bot. Depth [m]\n", + "[4854. 3578. 153. 439. 804.]\n", + "\n", + "Operator's Cruise Name:METAVAR:INDEXED_TEXT\n", + "['GEOVIDE' 'JC057' 'PE319' 'PE321' 'PE358']\n", + "\n", + "Ship Name:METAVAR:INDEXED_TEXT\n", + "['Pourquoi pas?' 'RRS James Cook' 'Pelagia' 'Knorr' 'Angeles Alvarino']\n", + "\n", + "Period:METAVAR:INDEXED_TEXT\n", + "['15/05/2014 - 30/06/2014' '02/03/2011 - 06/04/2011'\n", + " '28/04/2010 - 26/05/2010' '11/06/2010 - 08/07/2010'\n", + " '29/07/2012 - 19/08/2012']\n", + "\n", + "Chief Scientist:METAVAR:INDEXED_TEXT\n", + "['Sarthou Geraldine' 'Rijkenberg Micha' 'Gerringa Loes' 'Jenkins William'\n", + " 'Boyle Edward']\n", + "\n", + "GEOTRACES Scientist:METAVAR:INDEXED_TEXT\n", + "['Sarthou Geraldine' 'de Baar Hein' 'Boyle Edward' 'Jenkins William'\n", + " 'Garcia-Orellana Jordi']\n", + "\n", + "Cruise Aliases:METAVAR:INDEXED_TEXT\n", + "[nan '64PE319' '64PE321' 'THOR' '64PE374']\n", + "\n", + "Cruise Information Link:METAVAR:INDEXED_TEXT\n", + "['https://www.bodc.ac.uk/resources/inventories/cruise_inventory/report/15251/'\n", + " 'https://www.bodc.ac.uk/resources/inventories/cruise_inventory/report/10584/'\n", + " 'https://www.bodc.ac.uk/resources/inventories/cruise_inventory/report/10001/'\n", + " 'https://www.bodc.ac.uk/resources/inventories/cruise_inventory/report/10002/'\n", + " 
'https://www.bodc.ac.uk/resources/inventories/cruise_inventory/report/13372/']\n", + "\n", + "BODC Cruise Number:METAVAR:INTEGER\n", + "[15251 10584 10001 10002 13372]\n", + "\n", + "CTDPRS_T_VALUE_SENSOR [dbar]\n", + "[3.00085e+03 3.00096e+03 3.00106e+03 3.00116e+03 2.80000e+00]\n", + "\n", + "QV:SEADATANET\n", + "[0]\n", + "\n", + "DEPTH [m]\n", + "[2957.1 2957.2 2957.3 2957.4 3. ]\n", + "\n", + "QV:SEADATANET.1\n", + "[0]\n", + "\n", + "Rosette Bottle Number:INTEGER\n", + "[19. 16. 12. 9. 15.]\n", + "\n", + "QV:SEADATANET.2\n", + "[0 9]\n", + "\n", + "GEOTRACES Sample ID:INDEXED_TEXT\n", + "[nan 24.0 23.0 22.0 21.0]\n", + "\n", + "QV:SEADATANET.3\n", + "[9 0]\n", + "\n", + "Bottle Flag:INDEXED_TEXT\n", + "['No problem reported (0)' 'Bottle misfire (3)' 'Questionable depth (8)'\n", + " 'Bottle leak (5)' 'No sample (7)']\n", + "\n", + "QV:SEADATANET.4\n", + "[0]\n", + "\n", + "Cast Identifier:INDEXED_TEXT\n", + "['GEOP_000_01' 'geoh_001_01' 'geoh_001_05' 'geoh_001_11' 'geoh_001_03']\n", + "\n", + "QV:SEADATANET.5\n", + "[0]\n", + "\n", + "Sampling Device:INDEXED_TEXT\n", + "['UCCTD' 'CTD' 'SAP' 'CPUMP' 'GPUMP']\n", + "\n", + "QV:SEADATANET.6\n", + "[0]\n", + "\n", + "BODC Bottle Number:INTEGER\n", + "[1214048 1214039 1214027 1214018 1214036]\n", + "\n", + "QV:SEADATANET.7\n", + "[0]\n", + "\n", + "BODC Event Number:INTEGER\n", + "[1898200 1896667 1896681 1896702 1896674]\n", + "\n", + "QV:SEADATANET.8\n", + "[0]\n", + "\n", + "Single-Cell ID:INDEXED_TEXT\n", + "[nan 'C253' 'C261' 'C266' 'C256']\n", + "\n", + "QV:SEADATANET.9\n", + "[9 0]\n", + "\n", + "NCBI_Metagenome_BioSample_Accession:INDEXED_TEXT\n", + "[nan 'SAMN07136678' 'SAMN07136679' 'SAMN07136680' 'SAMN07136681']\n", + "\n", + "QV:SEADATANET.10\n", + "[9 0]\n", + "\n", + "NCBI_Single-Cell-Genome_BioProject_Accession:INDEXED_TEXT\n", + "[nan 'PRJNA445865;PRJEB33281']\n", + "\n", + "QV:SEADATANET.11\n", + "[9 0]\n", + "\n", + "NCBI_16S-18S-rRNA-gene_BioSample_Accession:INDEXED_TEXT\n", + "[nan 
'SAMN15928680' 'SAMN15928676' 'SAMN15928677' 'SAMN15928678']\n", + "\n", + "QV:SEADATANET.12\n", + "[9 0]\n", + "\n", + "EMBL_EBI_Metagenome_MGNIFY_Analysis_Accession:INDEXED_TEXT\n", + "[nan 'https://www.ebi.ac.uk/metagenomics/analyses/MGYA00463269'\n", + " 'https://www.ebi.ac.uk/metagenomics/analyses/MGYA00466185'\n", + " 'https://www.ebi.ac.uk/metagenomics/analyses/MGYA00466186'\n", + " 'https://www.ebi.ac.uk/metagenomics/analyses/MGYA00452433']\n", + "\n", + "QV:SEADATANET.13\n", + "[9 0]\n", + "\n", + "CTDTMP_T_VALUE_SENSOR [deg C]\n", + "[2.8403 2.8405 2.8404 2.839 2.8406]\n", + "\n", + "QV:SEADATANET.14\n", + "[1 9 2 3]\n", + "\n", + "CTDSAL_D_CONC_SENSOR [pss-78]\n", + "[34.950802 34.950699 34.9506 35.146198 35.130901]\n", + "\n", + "QV:SEADATANET.15\n", + "[1 9 4 2 3]\n", + "\n", + "SALINITY_D_CONC_BOTTLE\n", + "[34.9519 34.952702 34.951302 34.960201 34.954399]\n", + "\n", + "QV:SEADATANET.16\n", + "[1 3 9 2 4]\n", + "\n", + "CFC-11_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 3.6151 3.6904 3.9974 4.3853]\n", + "\n", + "QV:SEADATANET.17\n", + "[9 1 3 6]\n", + "\n", + "CFC-12_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 2.1131 2.136 2.2993 2.5283]\n", + "\n", + "QV:SEADATANET.18\n", + "[9 1 3 6]\n", + "\n", + "CFC113_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.202 0.204 0.176 0.154]\n", + "\n", + "QV:SEADATANET.19\n", + "[9 1 6 3]\n", + "\n", + "SF6_D_CONC_BOTTLE [fmol/kg]\n", + "[ nan 1.313 1.362 1.4 1.467]\n", + "\n", + "QV:SEADATANET.20\n", + "[9 1 6 3]\n", + "\n", + "He_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 1.6641 1.6745 1.6909 1.6908]\n", + "\n", + "QV:SEADATANET.21\n", + "[9 1 3]\n", + "\n", + "Ne_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 6.579 6.777 6.908 6.924]\n", + "\n", + "QV:SEADATANET.22\n", + "[9 1 3]\n", + "\n", + "Ar_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 11.714 11.708 11.781 11.998]\n", + "\n", + "QV:SEADATANET.23\n", + "[9 1]\n", + "\n", + "Kr_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 2.651 2.6589 2.6564 2.6689]\n", + "\n", + "QV:SEADATANET.24\n", + "[9 1]\n", + "\n", + 
"Xe_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.36615 0.36581 0.36629 0.36818]\n", + "\n", + "QV:SEADATANET.25\n", + "[9 1]\n", + "\n", + "SALINITY_D_CONC_PUMP\n", + "[ nan 34.91 34.919998 35.02 34.98 ]\n", + "\n", + "QV:SEADATANET.26\n", + "[9 1 2 3]\n", + "\n", + "SALINITY_D_CONC_FISH\n", + "[ nan 35.419167 34.453396 35.397396 34.531933]\n", + "\n", + "QV:SEADATANET.27\n", + "[9 1 2 3]\n", + "\n", + "SALINITY_D_CONC_UWAY\n", + "[ nan 36.34 36.240002 36.950001 36.509998]\n", + "\n", + "QV:SEADATANET.28\n", + "[9 1 2 3]\n", + "\n", + "CFC-11_D_CONC_UWAY [pmol/kg]\n", + "[ nan 1.918561 1.552457 1.83768 1.76309 ]\n", + "\n", + "QV:SEADATANET.29\n", + "[9 1]\n", + "\n", + "CFC-12_D_CONC_UWAY [pmol/kg]\n", + "[ nan 1.163526 0.950414 1.115449 1.094962]\n", + "\n", + "QV:SEADATANET.30\n", + "[9 1]\n", + "\n", + "CFC113_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.14083 0.140568 0.16651 0.157061]\n", + "\n", + "QV:SEADATANET.31\n", + "[9 1 3]\n", + "\n", + "SF6_D_CONC_UWAY [fmol/kg]\n", + "[ nan 1.188855 1.12 1.19016 1.20843 ]\n", + "\n", + "QV:SEADATANET.32\n", + "[9 1 3]\n", + "\n", + "SALINITY_D_CONC_BOAT_PUMP\n", + "[ nan 31.404301 31.8871 32.318501 32.295399]\n", + "\n", + "QV:SEADATANET.33\n", + "[9 2]\n", + "\n", + "OXYGEN_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 285.600006 247.300003 166.199997 180.199997]\n", + "\n", + "QV:SEADATANET.34\n", + "[9 1 3 6 4]\n", + "\n", + "CTDOXY_D_CONC_SENSOR [umol/kg]\n", + "[226.399994 249.5 250.699997 251.600006 253.100006]\n", + "\n", + "QV:SEADATANET.35\n", + "[1 9 3 2 4]\n", + "\n", + "PHOSPHATE_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 1.012 1.014 1.032 2.367]\n", + "\n", + "QV:SEADATANET.36\n", + "[9 1 3 6 2]\n", + "\n", + "PHOSPHATE_LL_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.003707 0.01561 0.036098 0.066439]\n", + "\n", + "QV:SEADATANET.37\n", + "[9 1 3 6 2]\n", + "\n", + "SILICATE_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.88 0.85 0.54 1.06]\n", + "\n", + "QV:SEADATANET.38\n", + "[9 2 4 3 6]\n", + "\n", + "NITRATE_D_CONC_BOTTLE [umol/kg]\n", + "[ 
nan 0.04 0.03 0.94 4.43]\n", + "\n", + "QV:SEADATANET.39\n", + "[9 2 3 6 1]\n", + "\n", + "NITRATE_LL_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.023512 0.101463 0.284098 1.146146]\n", + "\n", + "QV:SEADATANET.40\n", + "[9 1 3]\n", + "\n", + "NITRITE_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0. 0.02 0.11 0.03]\n", + "\n", + "QV:SEADATANET.41\n", + "[9 6 2 3 1]\n", + "\n", + "NITRITE_LL_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.00322 0.025073 0.068976 0.106537]\n", + "\n", + "QV:SEADATANET.42\n", + "[9 1 3 6]\n", + "\n", + "NO2+NO3_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.025268 0.019512 0.787902 3.837268]\n", + "\n", + "QV:SEADATANET.43\n", + "[9 1 6 3 4]\n", + "\n", + "NO2+NO3_LL_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.019415 0.016878 0.017659 0.048683]\n", + "\n", + "QV:SEADATANET.44\n", + "[9 1]\n", + "\n", + "NH4_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.022439 0.013659 0.021463 0.029268]\n", + "\n", + "QV:SEADATANET.45\n", + "[9 1 6 3 4]\n", + "\n", + "TALK_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 2277.800049 2278. 2277.600098 2276.699951]\n", + "\n", + "QV:SEADATANET.46\n", + "[9 1 3]\n", + "\n", + "DIC_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 2073.100098 2075.100098 2093.399902 2123.300049]\n", + "\n", + "QV:SEADATANET.47\n", + "[9 1 3]\n", + "\n", + "PH_SWS_BOTTLE\n", + "[ nan 8.006 8.005 7.989 7.943]\n", + "\n", + "QV:SEADATANET.48\n", + "[9 1]\n", + "\n", + "PH_TOT_BOTTLE\n", + "[ nan 7.801 7.649 7.629 7.6405]\n", + "\n", + "QV:SEADATANET.49\n", + "[9 3 1]\n", + "\n", + "DOC_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 62.439026 53.658539 67.31707 40. 
]\n", + "\n", + "QV:SEADATANET.50\n", + "[9 3 1]\n", + "\n", + "TDN_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 3.512195 3.414634 2.926829 5.073171]\n", + "\n", + "QV:SEADATANET.51\n", + "[9 1 3]\n", + "\n", + "PHOSPHATE_D_CONC_PUMP [umol/kg]\n", + "[ nan 2.27 2.18 1.68 1.5 ]\n", + "\n", + "QV:SEADATANET.52\n", + "[9 1 2 3 6]\n", + "\n", + "SILICATE_D_CONC_PUMP [umol/kg]\n", + "[ nan 22.450001 23.52 23.030001 25.959999]\n", + "\n", + "QV:SEADATANET.53\n", + "[9 1 2 3 6]\n", + "\n", + "NITRATE_D_CONC_PUMP [umol/kg]\n", + "[ nan 34.41 31.969999 24.17 21.620001]\n", + "\n", + "QV:SEADATANET.54\n", + "[9 1 2 3 6]\n", + "\n", + "NITRITE_D_CONC_PUMP [umol/kg]\n", + "[ nan 0. 0.01 0.16 0.11]\n", + "\n", + "QV:SEADATANET.55\n", + "[9 6 1 3 2]\n", + "\n", + "PHOSPHATE_D_CONC_FISH [umol/kg]\n", + "[ nan 0.007 0.334 0.009 0.217]\n", + "\n", + "QV:SEADATANET.56\n", + "[9 1 6 2 3]\n", + "\n", + "PHOSPHATE_LL_D_CONC_FISH [umol/kg]\n", + "[ nan 0.004683 0.014049 0.038049 0.021951]\n", + "\n", + "QV:SEADATANET.57\n", + "[9 1 6]\n", + "\n", + "SILICATE_D_CONC_FISH [umol/kg]\n", + "[ nan 0.48 0.24 0.42 0.71]\n", + "\n", + "QV:SEADATANET.58\n", + "[9 1 2 3 6]\n", + "\n", + "NITRATE_D_CONC_FISH [umol/kg]\n", + "[ nan 2.422 1.101 13.464 0.027]\n", + "\n", + "QV:SEADATANET.59\n", + "[9 1 2 3 6]\n", + "\n", + "NITRATE_LL_D_CONC_FISH [umol/kg]\n", + "[ nan 0.030439 0.02039 0.025659 0.034634]\n", + "\n", + "QV:SEADATANET.60\n", + "[9 1 3]\n", + "\n", + "NITRITE_D_CONC_FISH [umol/kg]\n", + "[ nan 0.098 0.002 0.015 0.069]\n", + "\n", + "QV:SEADATANET.61\n", + "[9 1 2 6 3]\n", + "\n", + "NITRITE_LL_D_CONC_FISH [umol/kg]\n", + "[ nan 0.001171 0.002439 0.000976 0.000683]\n", + "\n", + "QV:SEADATANET.62\n", + "[9 1]\n", + "\n", + "NO2+NO3_D_CONC_FISH [umol/kg]\n", + "[ nan 0.019512 0.058537 0.195122 1.053659]\n", + "\n", + "QV:SEADATANET.63\n", + "[9 6 1]\n", + "\n", + "NO2+NO3_LL_D_CONC_FISH [umol/kg]\n", + "[ nan 0.003707 0.004079 0.003059 0.004301]\n", + "\n", + "QV:SEADATANET.64\n", + "[9 1]\n", + 
"\n", + "DOC_D_CONC_FISH [umol/kg]\n", + "[ nan 73.268295 67.024391 74.731705 63.900002]\n", + "\n", + "QV:SEADATANET.65\n", + "[9 1]\n", + "\n", + "PHOSPHATE_D_CONC_UWAY [umol/kg]\n", + "[ nan 0.03]\n", + "\n", + "QV:SEADATANET.66\n", + "[9 1]\n", + "\n", + "SILICATE_D_CONC_UWAY [umol/kg]\n", + "[ nan 0.39]\n", + "\n", + "QV:SEADATANET.67\n", + "[9 1]\n", + "\n", + "NITRATE_D_CONC_UWAY [umol/kg]\n", + "[ nan 0.02]\n", + "\n", + "QV:SEADATANET.68\n", + "[9 1]\n", + "\n", + "NITRITE_D_CONC_UWAY [umol/kg]\n", + "[nan 0.]\n", + "\n", + "QV:SEADATANET.69\n", + "[9 6]\n", + "\n", + "DOC_D_CONC_UWAY [umol/kg]\n", + "[ nan 83.609756 76.975609 75.31707 67.902435]\n", + "\n", + "QV:SEADATANET.70\n", + "[9 1]\n", + "\n", + "NITRATE_D_CONC_BOAT_PUMP [umol/kg]\n", + "[ nan 0. 0.1 8.22 1.97]\n", + "\n", + "QV:SEADATANET.71\n", + "[9 2]\n", + "\n", + "NITRITE_D_CONC_BOAT_PUMP [umol/kg]\n", + "[ nan 0. 0.11 0.08]\n", + "\n", + "QV:SEADATANET.72\n", + "[9 2]\n", + "\n", + "DIC_13_12_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 1.15 0.99 0.78 0.68]\n", + "\n", + "QV:SEADATANET.73\n", + "[9 1 3]\n", + "\n", + "DIC_14_12_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 22.360001 20.950001 22.15 23.299999]\n", + "\n", + "QV:SEADATANET.74\n", + "[9 1]\n", + "\n", + "He_3_4_D_DELTA_BOTTLE [%]\n", + "[ nan -1.74 -1.32 -1.27 -1.13]\n", + "\n", + "QV:SEADATANET.75\n", + "[9 1]\n", + "\n", + "TRITIUM_D_CONC_BOTTLE [TU]\n", + "[ nan 0.733 0.696 0.718 0.709]\n", + "\n", + "QV:SEADATANET.76\n", + "[9 1 3 4]\n", + "\n", + "H2O_2_1_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan -1.78 -2.11 -2.16 -2.14]\n", + "\n", + "QV:SEADATANET.77\n", + "[9 1 3]\n", + "\n", + "H2O_18_16_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan -0.26 -0.35 -0.38 -0.44]\n", + "\n", + "QV:SEADATANET.78\n", + "[9 1 3]\n", + "\n", + "NITRATE_15_14_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 13.7 3.38 7.47 4.665]\n", + "\n", + "QV:SEADATANET.79\n", + "[9 1 2]\n", + "\n", + "NITRATE_18_16_D_DELTA_BOTTLE [per 10^3]\n", + "[nan 8.8 5.1 3.4 3.2]\n", + "\n", + 
"QV:SEADATANET.80\n", + "[9 1 2]\n", + "\n", + "SILICATE_30_28_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 1.85 1.9 1.96 1.83]\n", + "\n", + "QV:SEADATANET.81\n", + "[9 1 3 2]\n", + "\n", + "Al_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 24.507317 6.222439 8.497561 10.097561]\n", + "\n", + "QV:SEADATANET.82\n", + "[9 1 3 6 2]\n", + "\n", + "Ba_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 48.783298 44.298702 52.265789 67.36132 ]\n", + "\n", + "QV:SEADATANET.83\n", + "[9 1 3 2 4]\n", + "\n", + "Cd_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.05282 0.06973 0.15567 0.37431]\n", + "\n", + "QV:SEADATANET.84\n", + "[9 1 3 6 2]\n", + "\n", + "Co_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 35.191807 32.344975 40.712002 55.530731]\n", + "\n", + "QV:SEADATANET.85\n", + "[9 1 3 2 6]\n", + "\n", + "Cr_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 3.27958 3.27481 3.26909 3.26816]\n", + "\n", + "QV:SEADATANET.86\n", + "[9 1]\n", + "\n", + "Cu_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 1.102439 1.073171 1.107317 1.160976]\n", + "\n", + "QV:SEADATANET.87\n", + "[9 1 3 2 4]\n", + "\n", + "Fe_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 1.04 0.5 0.72 0.82]\n", + "\n", + "QV:SEADATANET.88\n", + "[9 3 1 2 6]\n", + "\n", + "Fe_II_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.273171 0.126829 0.078049 0.058537]\n", + "\n", + "QV:SEADATANET.89\n", + "[9 1 6 2 3]\n", + "\n", + "Fe_S_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.292 0.198 0.359 0.421]\n", + "\n", + "QV:SEADATANET.90\n", + "[9 1 3]\n", + "\n", + "Ga_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 39.5 36.599998 42.900002 37.400002]\n", + "\n", + "QV:SEADATANET.91\n", + "[9 3 1 2 4]\n", + "\n", + "Hf_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.31 0.13 0.23 0.47]\n", + "\n", + "QV:SEADATANET.92\n", + "[9 1]\n", + "\n", + "Hg_0_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.06 0. 
0.08 0.11]\n", + "\n", + "QV:SEADATANET.93\n", + "[9 2 3 5]\n", + "\n", + "Hg_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 2.868293 1.960976 1.609756 0.985366]\n", + "\n", + "QV:SEADATANET.94\n", + "[9 3 1 2 5]\n", + "\n", + "Hg_DM_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0. 0.005 0.004 0.076]\n", + "\n", + "QV:SEADATANET.95\n", + "[9 3 2 5]\n", + "\n", + "Hg_Me_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.116 0.106 0.12 0.148]\n", + "\n", + "QV:SEADATANET.96\n", + "[9 1 4]\n", + "\n", + "Hg_MM_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.012 0.026 0.045 0.038]\n", + "\n", + "QV:SEADATANET.97\n", + "[9 2 3 4 1]\n", + "\n", + "Hg_T_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.401 0.7435 1.649 1.213 ]\n", + "\n", + "QV:SEADATANET.98\n", + "[9 1 4]\n", + "\n", + "I_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 20.760977 20.497561 21.736586 19.980488]\n", + "\n", + "QV:SEADATANET.99\n", + "[9 1 6]\n", + "\n", + "I_V_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 333.346344 341.453644 327.112183 282.21463 ]\n", + "\n", + "QV:SEADATANET.100\n", + "[9 1]\n", + "\n", + "Mn_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.087805 0.126829 0.165854 0.17561 ]\n", + "\n", + "QV:SEADATANET.101\n", + "[9 1 3 2 4]\n", + "\n", + "Mo_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 118.411842 118.910263 116.68261 117.252235]\n", + "\n", + "QV:SEADATANET.102\n", + "[9 1 3]\n", + "\n", + "Nb_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 2.82 3.3 3.26 2.99]\n", + "\n", + "QV:SEADATANET.103\n", + "[9 1 6]\n", + "\n", + "Ni_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 2.144948 2.450863 2.629792 2.739713]\n", + "\n", + "QV:SEADATANET.104\n", + "[9 1 3 2 4]\n", + "\n", + "Pb_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 45.299999 39. 71.300003 51. 
]\n", + "\n", + "QV:SEADATANET.105\n", + "[9 4 3 1 2]\n", + "\n", + "Pb_TD_CONC_BOTTLE [pmol/kg]\n", + "[ nan 17.550821 17.42984 20.257715 20.011883]\n", + "\n", + "QV:SEADATANET.106\n", + "[9 1]\n", + "\n", + "Ti_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 45.433681 49.948601 46.342361 65.870621]\n", + "\n", + "QV:SEADATANET.107\n", + "[9 1 3 6 4]\n", + "\n", + "U_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 11.61627 11.85819 11.862291 11.694177]\n", + "\n", + "QV:SEADATANET.108\n", + "[9 1 3]\n", + "\n", + "V_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 30.959999 31.75 31.959999 31.950001]\n", + "\n", + "QV:SEADATANET.109\n", + "[9 1 2 3]\n", + "\n", + "Zn_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.074824 0.35851 0.125532 0.195598]\n", + "\n", + "QV:SEADATANET.110\n", + "[9 1 3 6 4]\n", + "\n", + "Hg_Me_T_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.05197 0.03679 0.0472 0.04341]\n", + "\n", + "QV:SEADATANET.111\n", + "[9 1 4]\n", + "\n", + "Al_D_CONC_FISH [nmol/kg]\n", + "[ nan 2.6387 1.289014 2.513986 1.166395]\n", + "\n", + "QV:SEADATANET.112\n", + "[9 1 3 6]\n", + "\n", + "Ba_D_CONC_FISH [nmol/kg]\n", + "[ nan 37.900002 39.400002 39.200001 38.5 ]\n", + "\n", + "QV:SEADATANET.113\n", + "[9 1]\n", + "\n", + "Cd_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.03147 0.00021 0.08086 0.00187]\n", + "\n", + "QV:SEADATANET.114\n", + "[9 1]\n", + "\n", + "Co_D_CONC_FISH [pmol/kg]\n", + "[ nan 36.097561 35.121952 25.365854 16.585365]\n", + "\n", + "QV:SEADATANET.115\n", + "[9 1 6 3]\n", + "\n", + "Cu_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.917073 0.526829 0.57561 0.419512]\n", + "\n", + "QV:SEADATANET.116\n", + "[9 1]\n", + "\n", + "Fe_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.091487 0.010606 0.016417 0.028293]\n", + "\n", + "QV:SEADATANET.117\n", + "[9 1 5 2 3]\n", + "\n", + "Fe_II_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.058537 0.068293 0.243902 0.204878]\n", + "\n", + "QV:SEADATANET.118\n", + "[9 6 1]\n", + "\n", + "Fe_S_CONC_FISH [nmol/kg]\n", + "[ nan 0.171459 0.092 0.063 0.298 ]\n", + "\n", + "QV:SEADATANET.119\n", + "[9 
1 3]\n", + "\n", + "Ga_D_CONC_FISH [pmol/kg]\n", + "[ nan 34.700001 32.099998 35.700001 33.700001]\n", + "\n", + "QV:SEADATANET.120\n", + "[9 1 3]\n", + "\n", + "Hf_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.2 0.04 0.06 0.08]\n", + "\n", + "QV:SEADATANET.121\n", + "[9 1]\n", + "\n", + "Hg_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.084195 0.127122 0.357073 0.116488]\n", + "\n", + "QV:SEADATANET.122\n", + "[9 3 1 6]\n", + "\n", + "Mn_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.849437 0.485266 0.858047 0.506071]\n", + "\n", + "QV:SEADATANET.123\n", + "[9 1 3]\n", + "\n", + "Mo_D_CONC_FISH [nmol/kg]\n", + "[ nan 111.099998 110.199997 108.800003 109.900002]\n", + "\n", + "QV:SEADATANET.124\n", + "[9 1 3]\n", + "\n", + "Ni_D_CONC_FISH [nmol/kg]\n", + "[ nan 2.148 2.33 2.161 4.001539]\n", + "\n", + "QV:SEADATANET.125\n", + "[9 1 3]\n", + "\n", + "Pb_D_CONC_FISH [pmol/kg]\n", + "[ nan 43.299999 17. 20.200001 14. ]\n", + "\n", + "QV:SEADATANET.126\n", + "[9 1]\n", + "\n", + "Pb_TD_CONC_FISH [pmol/kg]\n", + "[ nan 23.029144 22.486393 20.799829 23.19504 ]\n", + "\n", + "QV:SEADATANET.127\n", + "[9 1]\n", + "\n", + "Ti_D_CONC_FISH [pmol/kg]\n", + "[ nan 41.784595 23.785187 39.185116]\n", + "\n", + "QV:SEADATANET.128\n", + "[9 1]\n", + "\n", + "V_D_CONC_FISH [nmol/kg]\n", + "[ nan 33.5 33.099998 33.200001 33.400002]\n", + "\n", + "QV:SEADATANET.129\n", + "[9 1 3]\n", + "\n", + "Zn_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.85 0.02 0.01 0.015]\n", + "\n", + "QV:SEADATANET.130\n", + "[9 1 6]\n", + "\n", + "Hf_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.2 0.38 0.18 0.16]\n", + "\n", + "QV:SEADATANET.131\n", + "[9 1]\n", + "\n", + "Al_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 0.878049 1.473171 1.326829 1.629268]\n", + "\n", + "QV:SEADATANET.132\n", + "[9 2]\n", + "\n", + "Ba_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 44.540001 11.87 43.75 54.419998]\n", + "\n", + "QV:SEADATANET.133\n", + "[9 1]\n", + "\n", + "Cd_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 0.43405 0.12185 0.3503 0.48855]\n", + "\n", + 
"QV:SEADATANET.134\n", + "[9 1]\n", + "\n", + "Co_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ nan 105.599998 382.600006 451.5 394.5 ]\n", + "\n", + "QV:SEADATANET.135\n", + "[9 1]\n", + "\n", + "Cu_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 1.78645 3.42765 2.68315 2.3425 ]\n", + "\n", + "QV:SEADATANET.136\n", + "[9 1]\n", + "\n", + "Fe_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 0.1286 0.3085 3.669 8.3961]\n", + "\n", + "QV:SEADATANET.137\n", + "[9 2]\n", + "\n", + "Fe_II_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 0.00505 0.01465 0. ]\n", + "\n", + "QV:SEADATANET.138\n", + "[9 6]\n", + "\n", + "Ga_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ nan 3.09 5.14 4.55 7.31]\n", + "\n", + "QV:SEADATANET.139\n", + "[9 1 2 3]\n", + "\n", + "Mn_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 1.85565 11.3677 29.95565 19.59445]\n", + "\n", + "QV:SEADATANET.140\n", + "[9 1]\n", + "\n", + "Ni_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 5.03085 4.10155 4.8299 5.45035]\n", + "\n", + "QV:SEADATANET.141\n", + "[9 1]\n", + "\n", + "Pb_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ nan 11.1 2.75 2.7 5.2 ]\n", + "\n", + "QV:SEADATANET.142\n", + "[9 1]\n", + "\n", + "V_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 31.51 26.34 29.67 31.91]\n", + "\n", + "QV:SEADATANET.143\n", + "[9 1]\n", + "\n", + "Zn_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 0.5064 0.3742 0.516 1.1316]\n", + "\n", + "QV:SEADATANET.144\n", + "[9 2]\n", + "\n", + "Al_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 2.009756 1.053659 1.258537 1.960976]\n", + "\n", + "QV:SEADATANET.145\n", + "[9 2]\n", + "\n", + "Ba_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 62.75 63.990002 66.800003 68.099998]\n", + "\n", + "QV:SEADATANET.146\n", + "[9 1]\n", + "\n", + "Cd_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 0.31845 0.31255 0.29615 0.2863 ]\n", + "\n", + "QV:SEADATANET.147\n", + "[9 1 2]\n", + "\n", + "Co_D_CONC_SUBICE_PUMP [pmol/kg]\n", + "[ nan 308.5 268.600006 279.899994 253.100006]\n", + "\n", + "QV:SEADATANET.148\n", + "[9 1]\n", + "\n", + "Cu_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 14.3195 
6.8502 6.7681 7.4731]\n", + "\n", + "QV:SEADATANET.149\n", + "[9 1 2]\n", + "\n", + "Fe_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 5.9468 3.0739 3.4268 3.6126]\n", + "\n", + "QV:SEADATANET.150\n", + "[9 2]\n", + "\n", + "Ga_D_CONC_SUBICE_PUMP [pmol/kg]\n", + "[ nan 6.81 6.95 7.42 7.34]\n", + "\n", + "QV:SEADATANET.151\n", + "[9 1 2]\n", + "\n", + "Mn_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 5.2041 4.5234 3.8534 5.91495]\n", + "\n", + "QV:SEADATANET.152\n", + "[9 1 2]\n", + "\n", + "Ni_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 10.52805 8.257 8.30705 8.384 ]\n", + "\n", + "QV:SEADATANET.153\n", + "[9 1 2]\n", + "\n", + "Pb_D_CONC_SUBICE_PUMP [pmol/kg]\n", + "[ nan 10.1 1.6 1.2 1.4]\n", + "\n", + "QV:SEADATANET.154\n", + "[9 2 6]\n", + "\n", + "V_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 13.09 13.5 13.95 13.29]\n", + "\n", + "QV:SEADATANET.155\n", + "[9 1 2]\n", + "\n", + "Zn_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 19.6989 1.0666 0.9169 3.0386]\n", + "\n", + "QV:SEADATANET.156\n", + "[9 4 2]\n", + "\n", + "Ba_138_134_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 0.57 0.53 0.55 0.58]\n", + "\n", + "QV:SEADATANET.157\n", + "[9 1]\n", + "\n", + "Cd_114_110_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 1.084213 0.96041 0.551941 0.450833]\n", + "\n", + "QV:SEADATANET.158\n", + "[9 1 2]\n", + "\n", + "Cu_65_63_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 0.559117 0.54069 0.514726 0.542622]\n", + "\n", + "QV:SEADATANET.159\n", + "[9 1 2 3]\n", + "\n", + "Cr_53_52_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 1.02023 1.03613 1.04769 1.06464]\n", + "\n", + "QV:SEADATANET.160\n", + "[9 1]\n", + "\n", + "Fe_56_54_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 0.03 0.02 0.07 -0.26]\n", + "\n", + "QV:SEADATANET.161\n", + "[9 1 3 2 6]\n", + "\n", + "Ni_60_58_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 1.63836 1.602155 1.524474 1.447863]\n", + "\n", + "QV:SEADATANET.162\n", + "[9 1]\n", + "\n", + "Zn_66_64_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan -0.175932 -0.195314 0.104437 0.101797]\n", + "\n", + "QV:SEADATANET.163\n", 
+ "[9 1 2 3]\n", + "\n", + "Cd_114_110_D_DELTA_FISH [per 10^3]\n", + "[ nan 0.716152 0.750159 0.476101 0.664141]\n", + "\n", + "QV:SEADATANET.164\n", + "[9 1]\n", + "\n", + "Fe_56_54_D_DELTA_FISH [per 10^3]\n", + "[ nan -0.11 0.52 0.32 0.5 ]\n", + "\n", + "QV:SEADATANET.165\n", + "[9 1 2 6 5]\n", + "\n", + "Ni_60_58_D_DELTA_FISH [per 10^3]\n", + "[ nan 1.486192 1.608341 1.431721 1.54732 ]\n", + "\n", + "QV:SEADATANET.166\n", + "[9 1]\n", + "\n", + "Zn_66_64_D_DELTA_FISH [per 10^3]\n", + "[ nan 0.4 0.61 -0.12 0.18]\n", + "\n", + "QV:SEADATANET.167\n", + "[9 1]\n", + "\n", + "Ba_138_134_D_DELTA_BOAT_PUMP [per 10^3]\n", + "[ nan 0.443654 0.616907 0.491612 0.396128]\n", + "\n", + "QV:SEADATANET.168\n", + "[9 1]\n", + "\n", + "Ba_138_134_D_DELTA_SUBICE_PUMP [per 10^3]\n", + "[ nan 0.353344 0.348303]\n", + "\n", + "QV:SEADATANET.169\n", + "[9 1]\n", + "\n", + "Cs_137_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 1655.10083 1438.428833 1366.985596 936.56958 ]\n", + "\n", + "QV:SEADATANET.170\n", + "[9 1 3]\n", + "\n", + "I_129_D_CONC_BOTTLE [atoms/kg]\n", + "[ nan 4.99553638e+09 5.46595840e+09 4.74542438e+09\n", + " 5.02532045e+09]\n", + "\n", + "QV:SEADATANET.171\n", + "[9 2]\n", + "\n", + "Np_237_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 0.168566 0.169112 0.162868 0.161249]\n", + "\n", + "QV:SEADATANET.172\n", + "[9 1 3]\n", + "\n", + "Pu_239_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 6.238634 6.482439 6.656293 6.81278 ]\n", + "\n", + "QV:SEADATANET.173\n", + "[9 1]\n", + "\n", + "Pu_239_Pu_240_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 4285.794922 3697.453613 5849.404785 5792.272949]\n", + "\n", + "QV:SEADATANET.174\n", + "[9 1 4]\n", + "\n", + "Pu_240_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 4.024683 4.245073 4.100585 4.72439 ]\n", + "\n", + "QV:SEADATANET.175\n", + "[9 1]\n", + "\n", + "U_236_238_T_RATIO_BOTTLE [per 10^12]\n", + "[ nan 1529.460449 1479.776367 1505.207764 1489.274292]\n", + "\n", + "QV:SEADATANET.176\n", + "[9 2]\n", + "\n", + "U_236_D_CONC_BOTTLE [atoms/kg]\n", + "[ nan 5363510.5 
5209238. 4758632. 4707234. ]\n", + "\n", + "QV:SEADATANET.177\n", + "[9 1]\n", + "\n", + "U_236_T_CONC_BOTTLE [atoms/kg]\n", + "[ nan 9355403. 12158179. 15394789. 18341200.]\n", + "\n", + "QV:SEADATANET.178\n", + "[9 1 2]\n", + "\n", + "U_236_D_CONC_FISH [atoms/kg]\n", + "[ nan 6168131. 7304443. 5495804.5 5656699.5]\n", + "\n", + "QV:SEADATANET.179\n", + "[9 1]\n", + "\n", + "Cs_137_D_CONC_UWAY [uBq/kg]\n", + "[ nan 1966.119995 1712.800049 1561.294678 1780.053711]\n", + "\n", + "QV:SEADATANET.180\n", + "[9 1]\n", + "\n", + "Pu_239_Pu_240_D_CONC_UWAY [uBq/kg]\n", + "[ nan 5350.819336 5146.790039 5680.321777 4982.546387]\n", + "\n", + "QV:SEADATANET.181\n", + "[9 1]\n", + "\n", + "Pb_206_204_D_RATIO_BOTTLE\n", + "[ nan 18.51 18.57 18.91 18.389999]\n", + "\n", + "QV:SEADATANET.182\n", + "[9 4 1 3 0]\n", + "\n", + "Pb_206_204_TD_RATIO_BOTTLE\n", + "[ nan 18.475285 18.532042 18.523636 18.514265]\n", + "\n", + "QV:SEADATANET.183\n", + "[9 1]\n", + "\n", + "Pb_206_207_D_RATIO_BOTTLE\n", + "[ nan 1.1794 1.1826 1.2018 1.1702]\n", + "\n", + "QV:SEADATANET.184\n", + "[9 4 1 3 2]\n", + "\n", + "Pb_206_207_TD_RATIO_BOTTLE\n", + "[ nan 1.179389 1.182446 1.181789 1.181492]\n", + "\n", + "QV:SEADATANET.185\n", + "[9 1]\n", + "\n", + "Pb_208_207_D_RATIO_BOTTLE\n", + "[ nan 2.4485 2.4512 2.4571 2.4406]\n", + "\n", + "QV:SEADATANET.186\n", + "[9 4 1 3 0]\n", + "\n", + "Pb_208_207_TD_RATIO_BOTTLE\n", + "[ nan 2.465975 2.473708 2.473035 2.472412]\n", + "\n", + "QV:SEADATANET.187\n", + "[9 1]\n", + "\n", + "Pb_207_204_TD_RATIO_BOTTLE\n", + "[ nan 15.624 15.618 15.614 15.627]\n", + "\n", + "QV:SEADATANET.188\n", + "[9 1]\n", + "\n", + "Pb_208_204_TD_RATIO_BOTTLE\n", + "[ nan 38.016998 37.922001 37.941002 38.014999]\n", + "\n", + "QV:SEADATANET.189\n", + "[9 1]\n", + "\n", + "Pb_208_206_D_RATIO_BOTTLE\n", + "[ nan 2.0788 2.0764 2.0771 2.0738]\n", + "\n", + "QV:SEADATANET.190\n", + "[9 1 2 3]\n", + "\n", + "Pb_208_206_TD_RATIO_BOTTLE\n", + "[ nan 2.099135 2.105348 2.102701 2.432718]\n", + 
"\n", + "QV:SEADATANET.191\n", + "[9 1]\n", + "\n", + "Pb_206_204_D_RATIO_FISH\n", + "[ nan 18.58 18.379999 18.200001 18.6 ]\n", + "\n", + "QV:SEADATANET.192\n", + "[9 1]\n", + "\n", + "Pb_206_204_TD_RATIO_FISH\n", + "[ nan 18.42308 18.403351 18.324005 18.443455]\n", + "\n", + "QV:SEADATANET.193\n", + "[9 1]\n", + "\n", + "Pb_206_207_D_RATIO_FISH\n", + "[ nan 1.1727 1.1778 1.1776 1.1831]\n", + "\n", + "QV:SEADATANET.194\n", + "[9 1]\n", + "\n", + "Pb_206_207_TD_RATIO_FISH\n", + "[ nan 1.177033 1.175863 1.171213 1.178684]\n", + "\n", + "QV:SEADATANET.195\n", + "[9 1]\n", + "\n", + "Pb_208_207_D_RATIO_FISH\n", + "[ nan 2.4476 2.4578 2.4544 2.4549]\n", + "\n", + "QV:SEADATANET.196\n", + "[9 1]\n", + "\n", + "Pb_208_207_TD_RATIO_FISH\n", + "[ nan 2.455154 2.44959 2.450164 2.453791]\n", + "\n", + "QV:SEADATANET.197\n", + "[9 1]\n", + "\n", + "Pb_207_204_TD_RATIO_FISH\n", + "[ nan 15.634182 15.619825 15.654844 15.625303]\n", + "\n", + "QV:SEADATANET.198\n", + "[9 1]\n", + "\n", + "Pb_208_204_TD_RATIO_FISH\n", + "[ nan 38.277477 38.029686 38.144962 38.043201]\n", + "\n", + "QV:SEADATANET.199\n", + "[9 1]\n", + "\n", + "Pb_208_206_TD_RATIO_FISH\n", + "[ nan 2.113995 2.104017 2.101173 2.097486]\n", + "\n", + "QV:SEADATANET.200\n", + "[9 1]\n", + "\n", + "Pb_206_204_D_RATIO_BOAT_PUMP\n", + "[ nan 18.24 19.040001 18.379999 18.360001]\n", + "\n", + "QV:SEADATANET.201\n", + "[9 1]\n", + "\n", + "Pb_206_207_D_RATIO_BOAT_PUMP\n", + "[ nan 1.1724 1.2178 1.1964 1.1754]\n", + "\n", + "QV:SEADATANET.202\n", + "[9 1]\n", + "\n", + "Pb_208_207_D_RATIO_BOAT_PUMP\n", + "[ nan 2.4494 2.4808 2.4642 2.4447]\n", + "\n", + "QV:SEADATANET.203\n", + "[9 1]\n", + "\n", + "Pa_231_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 0.796751 0.780488 1.447151 2.211385]\n", + "\n", + "QV:SEADATANET.204\n", + "[9 1 3 2 '9']\n", + "\n", + "Pb_210_D_CONC_BOTTLE [mBq/kg]\n", + "[ nan 2.328333 1.363333 1.011667 1.741667]\n", + "\n", + "QV:SEADATANET.205\n", + "[9 1 3]\n", + "\n", + "Po_210_D_CONC_BOTTLE [mBq/kg]\n", + 
"[ nan 0.533333 2.033333 2.233333 2.183333]\n", + "\n", + "QV:SEADATANET.206\n", + "[9 1 3]\n", + "\n", + "Ra_224_D_CONC_BOTTLE [mBq/kg]\n", + "[ nan 0.237398 0.177236 0.352846 0.110569]\n", + "\n", + "QV:SEADATANET.207\n", + "[9 1]\n", + "\n", + "Ra_226_D_CONC_BOTTLE [mBq/kg]\n", + "[ nan 1.437218 1.297032 1.39294 1.299583]\n", + "\n", + "QV:SEADATANET.208\n", + "[9 1 3]\n", + "\n", + "Ra_228_T_CONC_BOTTLE [mBq/kg]\n", + "[ nan 0.112195 0.463415 0.443902 0.123577]\n", + "\n", + "QV:SEADATANET.209\n", + "[9 1]\n", + "\n", + "Ra_228_D_CONC_BOTTLE [mBq/kg]\n", + "[ nan 0.463653 0.35308 0.454824 0.100843]\n", + "\n", + "QV:SEADATANET.210\n", + "[9 1]\n", + "\n", + "Th_230_T_CONC_BOTTLE [uBq/kg]\n", + "[ nan 1.16 1.2 1.42 1.27]\n", + "\n", + "QV:SEADATANET.211\n", + "[9 1]\n", + "\n", + "Th_230_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 0.715444 2.764224 3.170732 4.65041 ]\n", + "\n", + "QV:SEADATANET.212\n", + "[9 1 3 2 4]\n", + "\n", + "Th_232_T_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.87393 1.262899 1.12791 0.50296 ]\n", + "\n", + "QV:SEADATANET.213\n", + "[9 1]\n", + "\n", + "Th_232_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.18998 0.932646 0.069088 0.155441]\n", + "\n", + "QV:SEADATANET.214\n", + "[9 1 3 2 5]\n", + "\n", + "Th_234_T_CONC_BOTTLE [mBq/kg]\n", + "[ nan 31.234859 28.776278 35.996681 40.823105]\n", + "\n", + "QV:SEADATANET.215\n", + "[9 1 3 2]\n", + "\n", + "Ac_227_D_CONC_PUMP [uBq/kg]\n", + "[nan 1.4 0. 0.8 0.7]\n", + "\n", + "QV:SEADATANET.216\n", + "[9 3 2]\n", + "\n", + "Be_7_T_CONC_PUMP [uBq/kg]\n", + "[ nan 1024.390259 936.585388 624.390259 253.658539]\n", + "\n", + "QV:SEADATANET.217\n", + "[9 1 6]\n", + "\n", + "Be_7_D_CONC_PUMP [uBq/kg]\n", + "[ nan 758.299988 770. 540.799988 3736.699951]\n", + "\n", + "QV:SEADATANET.218\n", + "[9 1 2 6]\n", + "\n", + "Ra_223_D_CONC_PUMP [mBq/kg]\n", + "[ nan 0.02439 0. 
0.006504 0.011382]\n", + "\n", + "QV:SEADATANET.219\n", + "[9 1 6 3 2]\n", + "\n", + "Ra_224_D_CONC_PUMP [mBq/kg]\n", + "[ nan 0.113821 0.099187 0.100813 0.982114]\n", + "\n", + "QV:SEADATANET.220\n", + "[9 1 3 6 2]\n", + "\n", + "Ra_226_D_CONC_PUMP [mBq/kg]\n", + "[ nan 1.430898 1.24878 1.382117 1.512195]\n", + "\n", + "QV:SEADATANET.221\n", + "[9 1 8 3 2]\n", + "\n", + "Ra_228_T_CONC_PUMP [mBq/kg]\n", + "[ nan 0.435772 0.130081 0.268293 0.201626]\n", + "\n", + "QV:SEADATANET.222\n", + "[9 1]\n", + "\n", + "Ra_228_D_CONC_PUMP [mBq/kg]\n", + "[ nan 0.338211 0.214634 0.172358 0.18374 ]\n", + "\n", + "QV:SEADATANET.223\n", + "[9 1 3 6 2]\n", + "\n", + "Th_228_D_CONC_PUMP [uBq/kg]\n", + "[ nan 78.048798 55.284565 87.804901 69.918709]\n", + "\n", + "QV:SEADATANET.224\n", + "[9 1 6 2 3]\n", + "\n", + "Th_234_T_CONC_PUMP [mBq/kg]\n", + "[ nan 20.9 18.700001 20. 17.4 ]\n", + "\n", + "QV:SEADATANET.225\n", + "[9 2 3 1]\n", + "\n", + "Pa_231_D_CONC_FISH [uBq/kg]\n", + "[ nan 2.831112 0.69904 0.681564 0.454376]\n", + "\n", + "QV:SEADATANET.226\n", + "[9 3 1]\n", + "\n", + "Pb_210_D_CONC_FISH [mBq/kg]\n", + "[ nan 1.170732 2.347967 1.365854 1.707317]\n", + "\n", + "QV:SEADATANET.227\n", + "[9 1]\n", + "\n", + "Po_210_D_CONC_FISH [mBq/kg]\n", + "[ nan 0.686179 0.746341 0.663415 0.414634]\n", + "\n", + "QV:SEADATANET.228\n", + "[9 1]\n", + "\n", + "Ra_226_D_CONC_FISH [mBq/kg]\n", + "[ nan 1.495935 1.349593 1.479675 1.398374]\n", + "\n", + "QV:SEADATANET.229\n", + "[9 1]\n", + "\n", + "Ra_228_T_CONC_FISH [mBq/kg]\n", + "[ nan 0.160976 0.523577 0.478049 0.247154]\n", + "\n", + "QV:SEADATANET.230\n", + "[9 1]\n", + "\n", + "Th_230_D_CONC_FISH [uBq/kg]\n", + "[ nan 3.160352 0.645745 1.002804 1.017998]\n", + "\n", + "QV:SEADATANET.231\n", + "[9 3 1]\n", + "\n", + "Th_232_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.384452 0.823081 0.396779 0.371307]\n", + "\n", + "QV:SEADATANET.232\n", + "[9 1]\n", + "\n", + "Th_234_T_CONC_FISH [mBq/kg]\n", + "[ nan 27.479675]\n", + "\n", + 
"QV:SEADATANET.233\n", + "[9 1]\n", + "\n", + "Pa_231_D_CONC_UWAY [uBq/kg]\n", + "[ nan 0.603631 0.755251 1.121143]\n", + "\n", + "QV:SEADATANET.234\n", + "[9 1]\n", + "\n", + "Po_210_D_CONC_UWAY [mBq/kg]\n", + "[ nan 0.416585 0.446829 0.393333 0.434309]\n", + "\n", + "QV:SEADATANET.235\n", + "[9 1]\n", + "\n", + "Pb_210_D_CONC_UWAY [mBq/kg]\n", + "[ nan 0.833008 0.730569 0.789268 0.931057]\n", + "\n", + "QV:SEADATANET.236\n", + "[9 1]\n", + "\n", + "Ra_224_D_CONC_UWAY [mBq/kg]\n", + "[ nan 0.091057 0.043902 0.20813 0.279675]\n", + "\n", + "QV:SEADATANET.237\n", + "[9 1 2]\n", + "\n", + "Ra_226_D_CONC_UWAY [mBq/kg]\n", + "[ nan 1.927839 1.767957 1.703508 1.596485]\n", + "\n", + "QV:SEADATANET.238\n", + "[9 1 2]\n", + "\n", + "Ra_228_D_CONC_UWAY [mBq/kg]\n", + "[ nan 0.316783 0.445928 0.276271 0.349226]\n", + "\n", + "QV:SEADATANET.239\n", + "[9 1 2]\n", + "\n", + "Th_228_D_CONC_UWAY [uBq/kg]\n", + "[ nan 382.500275 81.612221 207.02478 92.250481]\n", + "\n", + "QV:SEADATANET.240\n", + "[9 2]\n", + "\n", + "Th_230_D_CONC_UWAY [uBq/kg]\n", + "[ nan 1.128682 2.108295 2.05976 ]\n", + "\n", + "QV:SEADATANET.241\n", + "[9 1]\n", + "\n", + "Th_232_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.050588 0.045396 0.057749]\n", + "\n", + "QV:SEADATANET.242\n", + "[9 1]\n", + "\n", + "Th_234_T_CONC_UWAY [mBq/kg]\n", + "[ nan 28.40683 28.622438 29.650732 31.100489]\n", + "\n", + "QV:SEADATANET.243\n", + "[9 1 2]\n", + "\n", + "Pa_231_D_CONC_BOAT_PUMP [uBq/kg]\n", + "[ nan 0.2133]\n", + "\n", + "QV:SEADATANET.244\n", + "[9 1]\n", + "\n", + "Th_230_D_CONC_BOAT_PUMP [uBq/kg]\n", + "[ nan 1.461]\n", + "\n", + "QV:SEADATANET.245\n", + "[9 1]\n", + "\n", + "Th_232_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ nan 0.29776]\n", + "\n", + "QV:SEADATANET.246\n", + "[9 1]\n", + "\n", + "Pa_231_D_CONC_SUBICE_PUMP [uBq/kg]\n", + "[ nan 0.011 0.015 0.088 0.0417]\n", + "\n", + "QV:SEADATANET.247\n", + "[9 1]\n", + "\n", + "Th_230_D_CONC_SUBICE_PUMP [uBq/kg]\n", + "[ nan 1.135 1.192 1.306 1.319]\n", + "\n", + 
"QV:SEADATANET.248\n", + "[9 1]\n", + "\n", + "Th_232_D_CONC_SUBICE_PUMP [pmol/kg]\n", + "[ nan 1.0646 1.1204 1.238 1.368 ]\n", + "\n", + "QV:SEADATANET.249\n", + "[9 1]\n", + "\n", + "Th_234_T_CONC_SUBICE_PUMP [mBq/kg]\n", + "[ nan 30.700001 29.4 34.700001 27.9 ]\n", + "\n", + "QV:SEADATANET.250\n", + "[9 2]\n", + "\n", + "Hf_176_177_D_EPSILON_BOTTLE [per 10^4]\n", + "[ nan 0.41 -0.26 -0.25 -1.94]\n", + "\n", + "QV:SEADATANET.251\n", + "[9 1]\n", + "\n", + "Nd_143_144_D_EPSILON_BOTTLE [per 10^4]\n", + "[ nan -14.15286 -13.879521 -13.883313 -13.361316]\n", + "\n", + "QV:SEADATANET.252\n", + "[9 1 3 2 4]\n", + "\n", + "Hf_176_177_D_EPSILON_FISH [per 10^4]\n", + "[ nan -0.28 -0.27 0.3 0.31]\n", + "\n", + "QV:SEADATANET.253\n", + "[9 1]\n", + "\n", + "Nd_143_144_D_EPSILON_FISH [per 10^4]\n", + "[ nan -14.153927 -13.015296 -14.404537 -12.36 ]\n", + "\n", + "QV:SEADATANET.254\n", + "[9 1]\n", + "\n", + "Nd_143_144_D_EPSILON_UWAY [per 10^4]\n", + "[ nan -7.2 -4. -9. -8.4]\n", + "\n", + "QV:SEADATANET.255\n", + "[9 1]\n", + "\n", + "Y_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 114.879997 114.400002 112.089996 116.050003]\n", + "\n", + "QV:SEADATANET.256\n", + "[9 1 3 5 4]\n", + "\n", + "La_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 11.05 10.66 10.1 11.95]\n", + "\n", + "QV:SEADATANET.257\n", + "[9 1 3 2 4]\n", + "\n", + "Ce_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 14.333636 16.887186 12.796327 11.228178]\n", + "\n", + "QV:SEADATANET.258\n", + "[9 1 3 2]\n", + "\n", + "Pr_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 3.647191 3.897719 3.518812 3.357544]\n", + "\n", + "QV:SEADATANET.259\n", + "[9 1 3 2]\n", + "\n", + "Nd_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 17.587685 17.614305 17.587372 17.616825]\n", + "\n", + "QV:SEADATANET.260\n", + "[9 1 3 2 4]\n", + "\n", + "Sm_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 3.101465 3.444644 3.13025 2.889282]\n", + "\n", + "QV:SEADATANET.261\n", + "[9 1 3 2]\n", + "\n", + "Eu_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.934788 0.983839 0.934371 0.824531]\n", + "\n", + 
"QV:SEADATANET.262\n", + "[9 1 2 3]\n", + "\n", + "Gd_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 4.627342 4.938366 4.587365 4.290338]\n", + "\n", + "QV:SEADATANET.263\n", + "[9 1 3 2]\n", + "\n", + "Tb_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.765122 0.802596 0.772829 0.748298]\n", + "\n", + "QV:SEADATANET.264\n", + "[9 1 3 2]\n", + "\n", + "Dy_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 5.565683 5.875991 5.80047 5.379701]\n", + "\n", + "QV:SEADATANET.265\n", + "[9 1 3 2]\n", + "\n", + "Ho_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.445829 1.472963 1.449741 1.420948]\n", + "\n", + "QV:SEADATANET.266\n", + "[9 1 3 2]\n", + "\n", + "Er_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 4.508715 4.63153 4.620785 4.540745]\n", + "\n", + "QV:SEADATANET.267\n", + "[9 1 3 2]\n", + "\n", + "Tm_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.607171 0.630057 0.631402 0.621412]\n", + "\n", + "QV:SEADATANET.268\n", + "[9 1 2 3]\n", + "\n", + "Yb_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 3.79999 3.855863 3.872933 3.808411]\n", + "\n", + "QV:SEADATANET.269\n", + "[9 1 2 3]\n", + "\n", + "Lu_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.579948 0.613831 0.625943 0.592847]\n", + "\n", + "QV:SEADATANET.270\n", + "[9 1 2 3]\n", + "\n", + "Y_D_CONC_FISH [pmol/kg]\n", + "[ nan 123.199806 125.192375 124.840157 131.079376]\n", + "\n", + "QV:SEADATANET.271\n", + "[9 1]\n", + "\n", + "La_D_CONC_FISH [pmol/kg]\n", + "[ nan 22.059525 19.920168 20.493565 21.766468]\n", + "\n", + "QV:SEADATANET.272\n", + "[9 1 7]\n", + "\n", + "Ce_D_CONC_FISH [pmol/kg]\n", + "[ nan 26.446739 22.531816 25.272701 27.210564]\n", + "\n", + "QV:SEADATANET.273\n", + "[9 1 7]\n", + "\n", + "Pr_D_CONC_FISH [pmol/kg]\n", + "[ nan 5.208297 4.478406 4.820566 5.175287]\n", + "\n", + "QV:SEADATANET.274\n", + "[9 1 7]\n", + "\n", + "Nd_D_CONC_FISH [pmol/kg]\n", + "[ nan 17.279907 16.940922 17.141417 23.242701]\n", + "\n", + "QV:SEADATANET.275\n", + "[9 1 7 4]\n", + "\n", + "Sm_D_CONC_FISH [pmol/kg]\n", + "[ nan 4.621849 3.958291 4.408125 4.655192]\n", + "\n", + 
"QV:SEADATANET.276\n", + "[9 1 7]\n", + "\n", + "Eu_D_CONC_FISH [pmol/kg]\n", + "[ nan 1.179016 1.035409 1.127002 1.176829]\n", + "\n", + "QV:SEADATANET.277\n", + "[9 1 7]\n", + "\n", + "Gd_D_CONC_FISH [pmol/kg]\n", + "[ nan 5.729134 5.32105 5.606716 5.810727]\n", + "\n", + "QV:SEADATANET.278\n", + "[9 1 7]\n", + "\n", + "Tb_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.846657 0.804053 0.852279 0.879247]\n", + "\n", + "QV:SEADATANET.279\n", + "[9 1 7]\n", + "\n", + "Dy_D_CONC_FISH [pmol/kg]\n", + "[ nan 5.795363 5.628952 5.82169 6.007077]\n", + "\n", + "QV:SEADATANET.280\n", + "[9 1 7]\n", + "\n", + "Ho_D_CONC_FISH [pmol/kg]\n", + "[ nan 1.385252 1.404803 1.416585 1.423634]\n", + "\n", + "QV:SEADATANET.281\n", + "[9 1 7]\n", + "\n", + "Er_D_CONC_FISH [pmol/kg]\n", + "[ nan 4.349482 4.470709 4.437563 4.423068]\n", + "\n", + "QV:SEADATANET.282\n", + "[9 1 7]\n", + "\n", + "Tm_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.576506 0.594909 0.590402 0.593117]\n", + "\n", + "QV:SEADATANET.283\n", + "[9 1 7]\n", + "\n", + "Yb_D_CONC_FISH [pmol/kg]\n", + "[ nan 3.441847 3.676881 3.536759 3.556391]\n", + "\n", + "QV:SEADATANET.284\n", + "[9 1 7]\n", + "\n", + "Lu_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.532376 0.575817 0.552478 0.554649]\n", + "\n", + "QV:SEADATANET.285\n", + "[9 1 7]\n", + "\n", + "La_D_CONC_UWAY [pmol/kg]\n", + "[ nan 26.299999 33.599998 32.599998 31. 
]\n", + "\n", + "QV:SEADATANET.286\n", + "[9 1 3]\n", + "\n", + "Ce_D_CONC_UWAY [pmol/kg]\n", + "[ nan 4.7 16.200001 2.92 3.43 ]\n", + "\n", + "QV:SEADATANET.287\n", + "[9 1 3]\n", + "\n", + "Pr_D_CONC_UWAY [pmol/kg]\n", + "[ nan 3.44 5.8 4.36 4.48]\n", + "\n", + "QV:SEADATANET.288\n", + "[9 1]\n", + "\n", + "Nd_D_CONC_UWAY [pmol/kg]\n", + "[ nan 14.3 22.6 18.5 18.299999]\n", + "\n", + "QV:SEADATANET.289\n", + "[9 1]\n", + "\n", + "Sm_D_CONC_UWAY [pmol/kg]\n", + "[ nan 2.74 4.98 3.48 3.73]\n", + "\n", + "QV:SEADATANET.290\n", + "[9 1]\n", + "\n", + "Eu_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.78 1.19 0.87 0.96]\n", + "\n", + "QV:SEADATANET.291\n", + "[9 1]\n", + "\n", + "Gd_D_CONC_UWAY [pmol/kg]\n", + "[ nan 3.9 6.49 5.05 5.65]\n", + "\n", + "QV:SEADATANET.292\n", + "[9 1]\n", + "\n", + "Tb_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.69 0.96 0.8 0.74]\n", + "\n", + "QV:SEADATANET.293\n", + "[9 1]\n", + "\n", + "Dy_D_CONC_UWAY [pmol/kg]\n", + "[ nan 5. 7.24 5.97 6.55]\n", + "\n", + "QV:SEADATANET.294\n", + "[9 1]\n", + "\n", + "Ho_D_CONC_UWAY [pmol/kg]\n", + "[ nan 1.47 1.99 1.77 1.88]\n", + "\n", + "QV:SEADATANET.295\n", + "[9 1]\n", + "\n", + "Er_D_CONC_UWAY [pmol/kg]\n", + "[ nan 5.04 6.6 6.03 6.11]\n", + "\n", + "QV:SEADATANET.296\n", + "[9 1]\n", + "\n", + "Tm_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.74 1.08 0.92 0.83]\n", + "\n", + "QV:SEADATANET.297\n", + "[9 1]\n", + "\n", + "Yb_D_CONC_UWAY [pmol/kg]\n", + "[ nan 4.41 7.22 6.06 5.98]\n", + "\n", + "QV:SEADATANET.298\n", + "[9 1]\n", + "\n", + "Lu_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.82 1.29 1.1 1.05]\n", + "\n", + "QV:SEADATANET.299\n", + "[9 1]\n", + "\n", + "La_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ nan 25.700001]\n", + "\n", + "QV:SEADATANET.300\n", + "[9 1]\n", + "\n", + "Ce_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 8.1]\n", + "\n", + "QV:SEADATANET.301\n", + "[9 1]\n", + "\n", + "Pr_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 4.9]\n", + "\n", + "QV:SEADATANET.302\n", + "[9 1]\n", + "\n", + "Nd_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ 
nan 22.4]\n", + "\n", + "QV:SEADATANET.303\n", + "[9 1]\n", + "\n", + "Sm_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 5.7]\n", + "\n", + "QV:SEADATANET.304\n", + "[9 1]\n", + "\n", + "Eu_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 1.5]\n", + "\n", + "QV:SEADATANET.305\n", + "[9 1]\n", + "\n", + "Gd_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 8.6]\n", + "\n", + "QV:SEADATANET.306\n", + "[9 1]\n", + "\n", + "Tb_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 1.2]\n", + "\n", + "QV:SEADATANET.307\n", + "[9 1]\n", + "\n", + "Dy_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 9.9]\n", + "\n", + "QV:SEADATANET.308\n", + "[9 1]\n", + "\n", + "Ho_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 2.2]\n", + "\n", + "QV:SEADATANET.309\n", + "[9 1]\n", + "\n", + "Er_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 7.8]\n", + "\n", + "QV:SEADATANET.310\n", + "[9 1]\n", + "\n", + "Tm_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 1.]\n", + "\n", + "QV:SEADATANET.311\n", + "[9 1]\n", + "\n", + "Yb_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 6.8]\n", + "\n", + "QV:SEADATANET.312\n", + "[9 1]\n", + "\n", + "Lu_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 1.1]\n", + "\n", + "QV:SEADATANET.313\n", + "[9 1]\n", + "\n", + "Cu_Cu'_D_CONC_BOTTLE [fmol/kg]\n", + "[ nan 1.18 0.08 1.39 1.24]\n", + "\n", + "QV:SEADATANET.314\n", + "[9 1]\n", + "\n", + "L1Cu_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 3.248976 2.680488 3.141659 3.027707]\n", + "\n", + "QV:SEADATANET.315\n", + "[9 1 3 2 0]\n", + "\n", + "L1Cu_D_LogK_BOTTLE\n", + "[ nan 13.204 13.179 13.135 13.142]\n", + "\n", + "QV:SEADATANET.316\n", + "[9 1 3]\n", + "\n", + "LFe_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.452872 0.311019 0.500923 0.807415]\n", + "\n", + "QV:SEADATANET.317\n", + "[9 3 1]\n", + "\n", + "LFe_D_LogK_BOTTLE\n", + "[ nan 24.34 22.700001 22.514463 22.812017]\n", + "\n", + "QV:SEADATANET.318\n", + "[9 3 1]\n", + "\n", + "L1Fe_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.926829 1.356098 1.629268 0.8 ]\n", + "\n", + "QV:SEADATANET.319\n", + "[9 1 6 3]\n", + "\n", + "L1Fe_D_LogK_BOTTLE\n", + "[ nan 13.06 
12.6 12.61 12.29]\n", + "\n", + "QV:SEADATANET.320\n", + "[9 1 6 7 3]\n", + "\n", + "L2Fe_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.409756 0.780488 0. 0.429268]\n", + "\n", + "QV:SEADATANET.321\n", + "[9 1 6 3]\n", + "\n", + "L2Fe_D_LogK_BOTTLE\n", + "[ nan 12. 11.66 0. 11.85]\n", + "\n", + "QV:SEADATANET.322\n", + "[9 1 6 3]\n", + "\n", + "Cu_Cu'_D_CONC_FISH [fmol/kg]\n", + "[ nan 2.45 4.64 28.450001 222.330002]\n", + "\n", + "QV:SEADATANET.323\n", + "[9 1]\n", + "\n", + "L1Cu_D_CONC_FISH [nmol/kg]\n", + "[ nan 1.882927 2.282927 3.02439 3.082927]\n", + "\n", + "QV:SEADATANET.324\n", + "[9 1 3]\n", + "\n", + "L1Cu_D_LogK_FISH\n", + "[ nan 13.85 13.6 13.2 13.45]\n", + "\n", + "QV:SEADATANET.325\n", + "[9 1 3]\n", + "\n", + "L1Fe_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.858537 1.180488 0. 0.887805]\n", + "\n", + "QV:SEADATANET.326\n", + "[9 1 6]\n", + "\n", + "L1Fe_D_LogK_FISH\n", + "[ nan 12.46 12.51 0. 12.23]\n", + "\n", + "QV:SEADATANET.327\n", + "[9 1 6 7]\n", + "\n", + "L2Fe_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.556098 1.170732 0.965854 0.692683]\n", + "\n", + "QV:SEADATANET.328\n", + "[9 1 6]\n", + "\n", + "L2Fe_D_LogK_FISH\n", + "[ nan 11.32 11.19 11.18 11.92]\n", + "\n", + "QV:SEADATANET.329\n", + "[9 1 6]\n", + "\n", + "Al_TP_CONC_BOTTLE [nmol/kg]\n", + "[ nan 7.472453 3.5 4.489644 2.28 ]\n", + "\n", + "QV:SEADATANET.330\n", + "[9 1 6 3]\n", + "\n", + "Al_TPL_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.845571 1.510146 0.347541 0.23839 ]\n", + "\n", + "QV:SEADATANET.331\n", + "[9 1 6 3]\n", + "\n", + "Al_TPR_CONC_BOTTLE [nmol/kg]\n", + "[ nan 19.539667 43.703709 8.205854 4.534156]\n", + "\n", + "QV:SEADATANET.332\n", + "[9 1 6]\n", + "\n", + "Ba_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 135.708298 44.02 201.399643 333.720001]\n", + "\n", + "QV:SEADATANET.333\n", + "[9 1 3 6]\n", + "\n", + "Ba_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 625.636475 67.758514 81.765289 83.128609]\n", + "\n", + "QV:SEADATANET.334\n", + "[9 1 3 6]\n", + "\n", + "Cd_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 
3.62 4.7 3.08 2.15]\n", + "\n", + "QV:SEADATANET.335\n", + "[9 1 6 3]\n", + "\n", + "Cd_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.453659 1.960976 0.965854 2.380488]\n", + "\n", + "QV:SEADATANET.336\n", + "[9 1 3 6]\n", + "\n", + "Cd_TPR_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.058537 0.04878 0.039024 0.068293]\n", + "\n", + "QV:SEADATANET.337\n", + "[9 1 6]\n", + "\n", + "Co_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 5.47 3.33 2.03 2.24]\n", + "\n", + "QV:SEADATANET.338\n", + "[9 1 6 3]\n", + "\n", + "Co_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 2.146341 2.526829 2.009756 1.687805]\n", + "\n", + "QV:SEADATANET.339\n", + "[9 1 3 6]\n", + "\n", + "Co_TPR_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.356098 2.380488 0.8 0.585366]\n", + "\n", + "QV:SEADATANET.340\n", + "[9 1]\n", + "\n", + "Cr_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 3.86 9.74 18.68 22.870001]\n", + "\n", + "QV:SEADATANET.341\n", + "[9 1 6 3]\n", + "\n", + "Cu_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 4.88 12.33 8.93 3.26]\n", + "\n", + "QV:SEADATANET.342\n", + "[9 1 3 6]\n", + "\n", + "Cu_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 16.388987 8.389944 6.808309 5.773022]\n", + "\n", + "QV:SEADATANET.343\n", + "[9 1 3 6]\n", + "\n", + "Fe_TP_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.77 0.37 1.11 2. 
]\n", + "\n", + "QV:SEADATANET.344\n", + "[9 1 6 3]\n", + "\n", + "Fe_TPL_CONC_BOTTLE [nmol/kg]\n", + "[ nan 1.017063 1.771346 0.402605 0.199941]\n", + "\n", + "QV:SEADATANET.345\n", + "[9 1 3 6]\n", + "\n", + "Fe_TPR_CONC_BOTTLE [nmol/kg]\n", + "[ nan 4.455463 9.519083 2.132 1.211259]\n", + "\n", + "QV:SEADATANET.346\n", + "[9 1]\n", + "\n", + "Mn_TP_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.04 0.02 0.06 0.17]\n", + "\n", + "QV:SEADATANET.347\n", + "[9 1 6 3]\n", + "\n", + "Mn_TPL_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.304741 0.51999 0.060361 0.050039]\n", + "\n", + "QV:SEADATANET.348\n", + "[9 1 3 6]\n", + "\n", + "Mn_TPR_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.034166 0.065141 0.012185 0.007015]\n", + "\n", + "QV:SEADATANET.349\n", + "[9 1 6]\n", + "\n", + "Mo_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.91 1.56 0.33 0.22]\n", + "\n", + "QV:SEADATANET.350\n", + "[9 1 3 6]\n", + "\n", + "Mo_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 45.811729 4.875809 3.80094 2.154378]\n", + "\n", + "QV:SEADATANET.351\n", + "[9 3 1 6]\n", + "\n", + "Ni_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0. 
2.511323 33.178623 38.417618]\n", + "\n", + "QV:SEADATANET.352\n", + "[9 6 1]\n", + "\n", + "Ni_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 25.03315 16.92819 13.370416 9.952879]\n", + "\n", + "QV:SEADATANET.353\n", + "[9 1 6]\n", + "\n", + "P_TP_CONC_BOTTLE [nmol/kg]\n", + "[ nan 38.150002 32.130001 19.440001 14.98 ]\n", + "\n", + "QV:SEADATANET.354\n", + "[9 1 6 3]\n", + "\n", + "P_TPL_CONC_BOTTLE [nmol/kg]\n", + "[ nan 17.380283 20.647444 11.461259 10.311629]\n", + "\n", + "QV:SEADATANET.355\n", + "[9 1 3 6]\n", + "\n", + "P_TPR_CONC_BOTTLE [nmol/kg]\n", + "[ nan 3.621961 5.256117 1.741298 2.019054]\n", + "\n", + "QV:SEADATANET.356\n", + "[9 1 6]\n", + "\n", + "Pb_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.12 0.48 0.8 1.83]\n", + "\n", + "QV:SEADATANET.357\n", + "[9 1 6 3]\n", + "\n", + "Pb_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.937513 0.490884 0.405313 0.664039]\n", + "\n", + "QV:SEADATANET.358\n", + "[9 1 6]\n", + "\n", + "Sc_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.414146 0.433854 0.498537 0.57561 ]\n", + "\n", + "QV:SEADATANET.359\n", + "[9 1 3]\n", + "\n", + "Th_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.01 0.05 0.04 0.03]\n", + "\n", + "QV:SEADATANET.360\n", + "[9 6 2 1 3]\n", + "\n", + "Th_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0. 
0.009463 0.012878 0.017561]\n", + "\n", + "QV:SEADATANET.361\n", + "[9 6 1]\n", + "\n", + "Ti_TP_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.06 0.01 0.08 0.15]\n", + "\n", + "QV:SEADATANET.362\n", + "[9 1 6 3]\n", + "\n", + "Ti_TPL_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.001639 0.002976 0.000624 0.000332]\n", + "\n", + "QV:SEADATANET.363\n", + "[9 1 3 6]\n", + "\n", + "Ti_TPR_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.434312 0.86641 0.192166 0.128127]\n", + "\n", + "QV:SEADATANET.364\n", + "[9 1 6]\n", + "\n", + "V_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 7.47 11.41 11.5 13.02]\n", + "\n", + "QV:SEADATANET.365\n", + "[9 1 3 6]\n", + "\n", + "V_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 72.975609 35.707317 20.780487 216.585373]\n", + "\n", + "QV:SEADATANET.366\n", + "[9 1 3 6]\n", + "\n", + "Zn_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 11.699382 14.316231 37.984665 73.345901]\n", + "\n", + "QV:SEADATANET.367\n", + "[9 1 3 6]\n", + "\n", + "Zn_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 30.335958 52.920467 44.784649 43.065662]\n", + "\n", + "QV:SEADATANET.368\n", + "[9 1 3 6]\n", + "\n", + "Ag_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.049659 0.011415 0.01639 0.007346]\n", + "\n", + "QV:SEADATANET.369\n", + "[9 2 1]\n", + "\n", + "Ag_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.985366 0.274146 0.236098 0.17561 ]\n", + "\n", + "QV:SEADATANET.370\n", + "[9 2 1 3 4]\n", + "\n", + "Al_TP_CONC_PUMP [nmol/kg]\n", + "[ nan 4.682927 6.634146 4.292683 4.390244]\n", + "\n", + "QV:SEADATANET.371\n", + "[9 1]\n", + "\n", + "Al_LPT_CONC_PUMP [nmol/kg]\n", + "[ nan 4.92143 2.053793 2.657613 7.085427]\n", + "\n", + "QV:SEADATANET.372\n", + "[9 1 3 6 4]\n", + "\n", + "Al_SPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.737598 2.816617 0.362548 25.65716 ]\n", + "\n", + "QV:SEADATANET.373\n", + "[9 1 2 3 6]\n", + "\n", + "Al_SPL_CONC_PUMP [nmol/kg]\n", + "[ nan 0.358899 0.134947 0.29612 0.251254]\n", + "\n", + "QV:SEADATANET.374\n", + "[9 2]\n", + "\n", + "Ba_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 13.294371 18.433971 20.170162 33.29546 
]\n", + "\n", + "QV:SEADATANET.375\n", + "[9 1 3 4 2]\n", + "\n", + "Ba_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.435481 334.05838 8.401126 321.196442]\n", + "\n", + "QV:SEADATANET.376\n", + "[9 1 2 3 6]\n", + "\n", + "Ba_SPL_CONC_PUMP [pmol/kg]\n", + "[ nan 92.825371 204.898071 290.218628 189.205826]\n", + "\n", + "QV:SEADATANET.377\n", + "[9 2]\n", + "\n", + "Cd_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 13.756098 5.717073 3.404878 3.736585]\n", + "\n", + "QV:SEADATANET.378\n", + "[9 1]\n", + "\n", + "Cd_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.975389 0.039125 0.037386 0.012737]\n", + "\n", + "QV:SEADATANET.379\n", + "[9 1 3 4 6]\n", + "\n", + "Cd_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.271325 3.442991 0.028339 0.63321 ]\n", + "\n", + "QV:SEADATANET.380\n", + "[9 1 2 3 4]\n", + "\n", + "Co_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.993266 0.164983 0.20018 0.488921]\n", + "\n", + "QV:SEADATANET.381\n", + "[9 1 3 4 2]\n", + "\n", + "Co_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.653975 2.075954 0.121729 1.814991]\n", + "\n", + "QV:SEADATANET.382\n", + "[9 1 2 3 4]\n", + "\n", + "Cu_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.34821 2.004086 1.426711 2.59067 ]\n", + "\n", + "QV:SEADATANET.383\n", + "[9 1 3 2 4]\n", + "\n", + "Cu_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 6.878622 26.956652 0.89219 19.770351]\n", + "\n", + "QV:SEADATANET.384\n", + "[9 1 2 3 4]\n", + "\n", + "Cu_SPL_CONC_PUMP [pmol/kg]\n", + "[ nan 8.997337 4.554814 8.533956 6.668941]\n", + "\n", + "QV:SEADATANET.385\n", + "[9 2 3]\n", + "\n", + "Fe_TP_CONC_PUMP [nmol/kg]\n", + "[ nan 11.804878 5.268293 3.317073 1.268293]\n", + "\n", + "QV:SEADATANET.386\n", + "[9 1]\n", + "\n", + "Fe_LPT_CONC_PUMP [nmol/kg]\n", + "[ nan 1.050323 0.344769 0.625939 1.439502]\n", + "\n", + "QV:SEADATANET.387\n", + "[9 1 3 6 4]\n", + "\n", + "Fe_SPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.255904 0.79256 0.160249 5.349295]\n", + "\n", + "QV:SEADATANET.388\n", + "[9 1 2 3 4]\n", + "\n", + "Fe_SPL_CONC_PUMP [nmol/kg]\n", + "[ nan 0.342561 0.114592 0.258232 
0.32794 ]\n", + "\n", + "QV:SEADATANET.389\n", + "[9 2]\n", + "\n", + "Ga_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 0.712051 0.860803 0.944894 0.887285]\n", + "\n", + "QV:SEADATANET.390\n", + "[9 1 6]\n", + "\n", + "Hg_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.033 0.017 0.024 0.042]\n", + "\n", + "QV:SEADATANET.391\n", + "[9 2 3]\n", + "\n", + "Hg_MM_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.00141 0.00088 0.00038 0.00178]\n", + "\n", + "QV:SEADATANET.392\n", + "[9 2 5 3]\n", + "\n", + "Mn_TP_CONC_PUMP [nmol/kg]\n", + "[ nan 0.188293 0.118049 0.078049 0.059512]\n", + "\n", + "QV:SEADATANET.393\n", + "[9 1]\n", + "\n", + "Mn_LPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.014976 0.004817 0.010099 0.02417 ]\n", + "\n", + "QV:SEADATANET.394\n", + "[9 1 3 4 2]\n", + "\n", + "Mn_SPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.006083 0.030332 0.005697 0.165448]\n", + "\n", + "QV:SEADATANET.395\n", + "[9 1 2 3 6]\n", + "\n", + "Mn_SPL_CONC_PUMP [nmol/kg]\n", + "[ nan 0.020698 0.054247 0.130304 0.048691]\n", + "\n", + "QV:SEADATANET.396\n", + "[9 2]\n", + "\n", + "Mo_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 3.259011 11.614256 1.130145 2.436231]\n", + "\n", + "QV:SEADATANET.397\n", + "[9 1 6 3]\n", + "\n", + "Mo_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.179421 0.536872 0.182223 0.113805]\n", + "\n", + "QV:SEADATANET.398\n", + "[9 1 3]\n", + "\n", + "Mo_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.823194 0.253418 1.101504 2.246607]\n", + "\n", + "QV:SEADATANET.399\n", + "[9 1 2 3]\n", + "\n", + "Ni_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 39.466263 136.473419 16.966465 16.998934]\n", + "\n", + "QV:SEADATANET.400\n", + "[9 3 1 6]\n", + "\n", + "Ni_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 1.411194 2.831841 2.533521 3.43655 ]\n", + "\n", + "QV:SEADATANET.401\n", + "[9 1 3 2]\n", + "\n", + "Ni_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.485622 7.531662 11.42548 11.258619]\n", + "\n", + "QV:SEADATANET.402\n", + "[9 1 2 3]\n", + "\n", + "Ni_SPL_CONC_PUMP [pmol/kg]\n", + "[ nan 39.856834 6.581006 11.0156 3.719055]\n", + "\n", + 
"QV:SEADATANET.403\n", + "[9 2]\n", + "\n", + "P_TP_CONC_PUMP [nmol/kg]\n", + "[ nan 35.465439 24.159981 7.821284 4.762243]\n", + "\n", + "QV:SEADATANET.404\n", + "[9 1]\n", + "\n", + "P_LPT_CONC_PUMP [nmol/kg]\n", + "[ nan 4.30732 0.595033 0.434555 0.282651]\n", + "\n", + "QV:SEADATANET.405\n", + "[9 1 3 2 4]\n", + "\n", + "P_SPT_CONC_PUMP [nmol/kg]\n", + "[ nan 2.650167 17.605572 0.099575 2.681759]\n", + "\n", + "QV:SEADATANET.406\n", + "[9 1 2 3 4]\n", + "\n", + "P_SPL_CONC_PUMP [nmol/kg]\n", + "[ nan 7.477912 1.229058 0.857984 0.173157]\n", + "\n", + "QV:SEADATANET.407\n", + "[9 2]\n", + "\n", + "Pb_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 13.876318 2.93466 -0.048195 -1.068311]\n", + "\n", + "QV:SEADATANET.408\n", + "[9 1 6 3]\n", + "\n", + "Pb_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.629705 0.160938 0.235971 0.469684]\n", + "\n", + "QV:SEADATANET.409\n", + "[9 1 3 4 2]\n", + "\n", + "Pb_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.359329 0.805823 0.101191 2.653187]\n", + "\n", + "QV:SEADATANET.410\n", + "[9 1 2 3 6]\n", + "\n", + "Th_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 0.010341 0.009102 0.003893 0.003454]\n", + "\n", + "QV:SEADATANET.411\n", + "[9 6 1 3]\n", + "\n", + "Th_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.301463 0.044098 0.020976 0.021268]\n", + "\n", + "QV:SEADATANET.412\n", + "[9 2 1]\n", + "\n", + "Th_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.587317 0.027122 0.022146 0.059317]\n", + "\n", + "QV:SEADATANET.413\n", + "[9 2 1 3]\n", + "\n", + "Ti_LPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.074035 0.096704 0.045821 0.103233]\n", + "\n", + "QV:SEADATANET.414\n", + "[9 1 3 6 4]\n", + "\n", + "Ti_SPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.071292 0.004644 0.010673 0.36272 ]\n", + "\n", + "QV:SEADATANET.415\n", + "[9 1 2 3 6]\n", + "\n", + "U_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 1.133128 0.911309 0.6291 0.549211]\n", + "\n", + "QV:SEADATANET.416\n", + "[9 1]\n", + "\n", + "V_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 11.621088 5.792983 4.533802 4.30882 ]\n", + "\n", + "QV:SEADATANET.417\n", + 
"[9 1]\n", + "\n", + "V_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.602153 1.478805 2.053688 4.783217]\n", + "\n", + "QV:SEADATANET.418\n", + "[9 1 3 2 6]\n", + "\n", + "V_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 1.331966 9.289524 18.583929 31.730047]\n", + "\n", + "QV:SEADATANET.419\n", + "[9 1 2 3 4]\n", + "\n", + "V_SPL_CONC_PUMP [pmol/kg]\n", + "[ nan 21.144424 5.6307 3.015887 4.041974]\n", + "\n", + "QV:SEADATANET.420\n", + "[9 2]\n", + "\n", + "Zn_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 1650.754272 -19.898796 -28.986933 -29.430882]\n", + "\n", + "QV:SEADATANET.421\n", + "[9 3 6 1]\n", + "\n", + "Al_TP_CONC_FISH [nmol/kg]\n", + "[ nan 687.715454 51.490208 19.100677 10.364789]\n", + "\n", + "QV:SEADATANET.422\n", + "[9 3 1 6]\n", + "\n", + "Al_TPL_CONC_FISH [nmol/kg]\n", + "[ nan 660.996948 41.436481 11.532884 5.344119]\n", + "\n", + "QV:SEADATANET.423\n", + "[9 1 6]\n", + "\n", + "Ba_TP_CONC_FISH [pmol/kg]\n", + "[ nan 111.671021 51.580975 19.200584 14.317268]\n", + "\n", + "QV:SEADATANET.424\n", + "[9 1]\n", + "\n", + "Ba_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 46.484684 28.02878 7.490732 5.551902]\n", + "\n", + "QV:SEADATANET.425\n", + "[9 1]\n", + "\n", + "Cd_TP_CONC_FISH [pmol/kg]\n", + "[ nan 102.007706 136.569656 92.71405 88.721367]\n", + "\n", + "QV:SEADATANET.426\n", + "[9 1]\n", + "\n", + "Cd_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 103.190826 194.30751 87.460876 80.36937 ]\n", + "\n", + "QV:SEADATANET.427\n", + "[9 1]\n", + "\n", + "Co_TP_CONC_FISH [pmol/kg]\n", + "[ nan 15.210341 18.204781 12.33639 11.280293]\n", + "\n", + "QV:SEADATANET.428\n", + "[9 1]\n", + "\n", + "Co_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 15.543317 17.88595 7.469463 6.682634]\n", + "\n", + "QV:SEADATANET.429\n", + "[9 1 6]\n", + "\n", + "Cu_TP_CONC_FISH [pmol/kg]\n", + "[ nan 88.650826 103.526047 66.520195 56.520878]\n", + "\n", + "QV:SEADATANET.430\n", + "[9 1 6]\n", + "\n", + "Cu_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 76.036781 60.740292 43.786732 27.000389]\n", + "\n", + "QV:SEADATANET.431\n", + 
"[9 1 6]\n", + "\n", + "Fe_TP_CONC_FISH [nmol/kg]\n", + "[ nan 1.882218 3.733868 0.954305 1.130148]\n", + "\n", + "QV:SEADATANET.432\n", + "[9 1 6]\n", + "\n", + "Fe_TPL_CONC_FISH [nmol/kg]\n", + "[ nan 1.138439 1.863248 0.386656 0.310107]\n", + "\n", + "QV:SEADATANET.433\n", + "[9 1 6]\n", + "\n", + "Mn_TP_CONC_FISH [nmol/kg]\n", + "[ nan 0.151841 0.201322 0.15209 0.110522]\n", + "\n", + "QV:SEADATANET.434\n", + "[9 1]\n", + "\n", + "Mn_TPL_CONC_FISH [nmol/kg]\n", + "[ nan 0.147664 0.238492 0.131041 0.076773]\n", + "\n", + "QV:SEADATANET.435\n", + "[9 1]\n", + "\n", + "Ni_TP_CONC_FISH [pmol/kg]\n", + "[ nan 83.675415 144.370926 138.708786 138.701263]\n", + "\n", + "QV:SEADATANET.436\n", + "[9 1]\n", + "\n", + "Ni_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 67.48732 141.175323 104.912193 95.014244]\n", + "\n", + "QV:SEADATANET.437\n", + "[9 1]\n", + "\n", + "P_TP_CONC_FISH [nmol/kg]\n", + "[ nan 187.721634 263.543549 191.073456 190.559769]\n", + "\n", + "QV:SEADATANET.438\n", + "[9 1]\n", + "\n", + "P_TPL_CONC_FISH [nmol/kg]\n", + "[ nan 105.829308 282.871338 147.596603 126.760963]\n", + "\n", + "QV:SEADATANET.439\n", + "[9 1]\n", + "\n", + "Pb_TP_CONC_FISH [pmol/kg]\n", + "[ nan 2.362012 2.137587 1.174115 1.322653]\n", + "\n", + "QV:SEADATANET.440\n", + "[9 1]\n", + "\n", + "Pb_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 2.227902 2.211512 1.216 1.249463]\n", + "\n", + "QV:SEADATANET.441\n", + "[9 1]\n", + "\n", + "Th_TP_CONC_FISH [pmol/kg]\n", + "[ nan 0.130695 0.107883 0.036751 0.048958]\n", + "\n", + "QV:SEADATANET.442\n", + "[9 1 6]\n", + "\n", + "Th_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 0.034049 0. ]\n", + "\n", + "QV:SEADATANET.443\n", + "[9 1 6]\n", + "\n", + "Ti_TP_CONC_FISH [nmol/kg]\n", + "[ nan 0.095881 0.138347 0.050543 0.078738]\n", + "\n", + "QV:SEADATANET.444\n", + "[9 1 6]\n", + "\n", + "Ti_TPL_CONC_FISH [nmol/kg]\n", + "[ nan 0. 
0.003446]\n", + "\n", + "QV:SEADATANET.445\n", + "[9 6 1]\n", + "\n", + "V_TP_CONC_FISH [pmol/kg]\n", + "[ nan 68.910828 63.827221 7.692878 7.591122]\n", + "\n", + "QV:SEADATANET.446\n", + "[9 1]\n", + "\n", + "V_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 69.329071 63.438828 6.152683 4.277951]\n", + "\n", + "QV:SEADATANET.447\n", + "[9 1]\n", + "\n", + "Zn_TP_CONC_FISH [pmol/kg]\n", + "[ nan 588.273376 180.921753 107.334045 193.626343]\n", + "\n", + "QV:SEADATANET.448\n", + "[9 1]\n", + "\n", + "Zn_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 513.994629 177.999512 115.275703 124.16478 ]\n", + "\n", + "QV:SEADATANET.449\n", + "[9 1 6]\n", + "\n", + "bSi_30_28_TP_DELTA_PUMP [per 10^3]\n", + "[ nan 1.07 1.64 1.33 1.08]\n", + "\n", + "QV:SEADATANET.450\n", + "[9 1]\n", + "\n", + "PIC_LPT_CONC_PUMP [umol C/kg]\n", + "[ nan 0.030585 0.00865 0.011919 0.008463]\n", + "\n", + "QV:SEADATANET.451\n", + "[9 1 2 6 4]\n", + "\n", + "PIC_SPT_CONC_PUMP [umol C/kg]\n", + "[ nan 0.112049 0.084065 0.088756 0.042667]\n", + "\n", + "QV:SEADATANET.452\n", + "[9 1 4 6 2]\n", + "\n", + "C_LPT_CONC_PUMP [umol C/kg]\n", + "[ nan 0.666634 0.365649 0.107361 0.070052]\n", + "\n", + "QV:SEADATANET.453\n", + "[9 1]\n", + "\n", + "C_SPT_CONC_PUMP [umol C/kg]\n", + "[ nan 2.702056 0.212912 0.580666 0.43848 ]\n", + "\n", + "QV:SEADATANET.454\n", + "[9 1 4]\n", + "\n", + "POC_LPT_CONC_PUMP [umol C/kg]\n", + "[ nan 0.398764 0.109805 0.027512 0.020317]\n", + "\n", + "QV:SEADATANET.455\n", + "[9 1 4 2 3]\n", + "\n", + "POC_SPT_CONC_PUMP [umol C/kg]\n", + "[ nan 2.536488 0.643902 0.42978 0.253106]\n", + "\n", + "QV:SEADATANET.456\n", + "[9 1 2 4 3]\n", + "\n", + "N_LPT_CONC_PUMP [nmol N/kg]\n", + "[ nan 85.375427 37.764053 15.441927 15.14714 ]\n", + "\n", + "QV:SEADATANET.457\n", + "[9 1 4 6 3]\n", + "\n", + "N_SPT_CONC_PUMP [nmol N/kg]\n", + "[ nan 533.570679 256.519867 82.778313 71.194481]\n", + "\n", + "QV:SEADATANET.458\n", + "[9 1 4 6 2]\n", + "\n", + "bSi_TP_CONC_PUMP [nmol Si/kg]\n", + "[ nan 302.439026 
360.975616 273.170746 136.585373]\n", + "\n", + "QV:SEADATANET.459\n", + "[9 1]\n", + "\n", + "bSi_LPT_CONC_PUMP [nmol Si/kg]\n", + "[ nan 16.904764 29.141895 10.76722 5.698265]\n", + "\n", + "QV:SEADATANET.460\n", + "[9 1 3 2 4]\n", + "\n", + "bSi_SPT_CONC_PUMP [nmol Si/kg]\n", + "[ nan 4.904008 49.978989 15.02632 13.257302]\n", + "\n", + "QV:SEADATANET.461\n", + "[9 1 3 2 4]\n", + "\n", + "PARTICLEMASS_LPT_CONC_PUMP [ug/kg]\n", + "[ nan 21.502323 5.0924 2.740238 2.023667]\n", + "\n", + "QV:SEADATANET.462\n", + "[9 1 3 2 4]\n", + "\n", + "PARTICLEMASS_SPT_CONC_PUMP [ug/kg]\n", + "[ nan 82.256165 23.490496 19.031048 11.234714]\n", + "\n", + "QV:SEADATANET.463\n", + "[9 1 3 2 4]\n", + "\n", + "Po_210_TP_CONC_BOTTLE [mBq/kg]\n", + "[ nan 0.090601 0.029121 0.016718 0.040343]\n", + "\n", + "QV:SEADATANET.464\n", + "[9 1]\n", + "\n", + "Pb_210_TP_CONC_BOTTLE [mBq/kg]\n", + "[ nan 0.114567 0.107668 0.041092 0.087087]\n", + "\n", + "QV:SEADATANET.465\n", + "[9 1]\n", + "\n", + "Po_210_SPT_CONC_PUMP [mBq/kg]\n", + "[ nan 0.1792 0.0932 0.1027 0.2343]\n", + "\n", + "QV:SEADATANET.466\n", + "[9 1 6]\n", + "\n", + "Po_210_LPT_CONC_PUMP [mBq/kg]\n", + "[ nan 0.06 0.015 0.033 0.051]\n", + "\n", + "QV:SEADATANET.467\n", + "[9 1 6]\n", + "\n", + "Pb_210_SPT_CONC_PUMP [mBq/kg]\n", + "[ nan 0.1508 0.1742 0.276 0.1953]\n", + "\n", + "QV:SEADATANET.468\n", + "[9 1 6]\n", + "\n", + "Pb_210_LPT_CONC_PUMP [mBq/kg]\n", + "[ nan 0.093 0.075 0.102 0.084]\n", + "\n", + "QV:SEADATANET.469\n", + "[9 1]\n", + "\n", + "Pa_231_TP_CONC_PUMP [uBq/kg]\n", + "[ nan 0.035907 0.017296 0.01316 0.113627]\n", + "\n", + "QV:SEADATANET.470\n", + "[9 1 3]\n", + "\n", + "Pa_231_SPT_CONC_PUMP [uBq/kg]\n", + "[ nan 0.002451 0.007665 0.039516 0.110679]\n", + "\n", + "QV:SEADATANET.471\n", + "[9 1 6 3]\n", + "\n", + "Pa_231_LPT_CONC_PUMP [uBq/kg]\n", + "[ nan 0.007314 0.008882 0.010333 0.000989]\n", + "\n", + "QV:SEADATANET.472\n", + "[9 1 4]\n", + "\n", + "Th_228_SPT_CONC_PUMP [uBq/kg]\n", + "[ nan 6.504068 
8.130088 9.756098 13.008137]\n", + "\n", + "QV:SEADATANET.473\n", + "[9 1 6 2 3]\n", + "\n", + "Th_228_LPT_CONC_PUMP [uBq/kg]\n", + "[ nan 1.23 -0.4 0.23 0.25]\n", + "\n", + "QV:SEADATANET.474\n", + "[9 2 6 3]\n", + "\n", + "Th_230_TP_CONC_PUMP [uBq/kg]\n", + "[ nan 0.391425 0.566893 0.595886 0.506238]\n", + "\n", + "QV:SEADATANET.475\n", + "[9 1]\n", + "\n", + "Th_230_SPT_CONC_PUMP [uBq/kg]\n", + "[ nan 0.10078 0.257379 0.834169 2.136799]\n", + "\n", + "QV:SEADATANET.476\n", + "[9 1 3]\n", + "\n", + "Th_230_LPT_CONC_PUMP [uBq/kg]\n", + "[ nan 0.102821 0.129562 0.132244 0.029499]\n", + "\n", + "QV:SEADATANET.477\n", + "[9 1 2 4]\n", + "\n", + "Th_232_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 0.091275 0.086139 0.054428 0.30588 ]\n", + "\n", + "QV:SEADATANET.478\n", + "[9 1]\n", + "\n", + "Th_232_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.021205 0.082637 0.459946 1.867481]\n", + "\n", + "QV:SEADATANET.479\n", + "[9 1 3 6]\n", + "\n", + "Th_232_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.058755 0.11342 0.107164 0.63596 ]\n", + "\n", + "QV:SEADATANET.480\n", + "[9 1 4]\n", + "\n", + "Th_234_SPT_CONC_PUMP [mBq/kg]\n", + "[ nan 2.781774 1.471779 3.213887 2.923771]\n", + "\n", + "QV:SEADATANET.481\n", + "[9 1 4 6 2]\n", + "\n", + "Th_234_LPT_CONC_PUMP [mBq/kg]\n", + "[ nan 0.363003 0.373994 0.352724 0.647588]\n", + "\n", + "QV:SEADATANET.482\n", + "[9 1 2 4 3]\n", + "\n", + "Po_210_TP_CONC_UWAY [mBq/kg]\n", + "[ nan 0.019187 0.04813 0.137561 0.054797]\n", + "\n", + "QV:SEADATANET.483\n", + "[9 1]\n", + "\n", + "Pb_210_TP_CONC_UWAY [mBq/kg]\n", + "[ nan 0.030081 0.057073 0.143415 0.045528]\n", + "\n", + "QV:SEADATANET.484\n", + "[9 1]\n", + "\n", + "Nd_143_144_TP_EPSILON_PUMP [per 10^4]\n", + "[ nan -6.748074 -7.76467 -5.730428 -7.449456]\n", + "\n", + "QV:SEADATANET.485\n", + "[9 1 3]\n", + "\n", + "Y_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.32 0.74 2.91 0.568]\n", + "\n", + "QV:SEADATANET.486\n", + "[9 1 2 6 3]\n", + "\n", + "La_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.96 1.73 0.6 
1.01]\n", + "\n", + "QV:SEADATANET.487\n", + "[9 1 4 2 6]\n", + "\n", + "Ce_TP_CONC_BOTTLE [pmol/kg]\n", + "[nan 2. 2.9 1.1 1.5]\n", + "\n", + "QV:SEADATANET.488\n", + "[9 1 4 3 2]\n", + "\n", + "Pr_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.22 0.37 0.06 0.16]\n", + "\n", + "QV:SEADATANET.489\n", + "[9 1 4 2 6]\n", + "\n", + "Nd_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.76 1.38 0.22 0.51]\n", + "\n", + "QV:SEADATANET.490\n", + "[9 1 4]\n", + "\n", + "Sm_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.16 0.33 0.05 0.11]\n", + "\n", + "QV:SEADATANET.491\n", + "[9 1]\n", + "\n", + "Eu_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.032 0.06 0.011 0.022]\n", + "\n", + "QV:SEADATANET.492\n", + "[9 1]\n", + "\n", + "Gd_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.124 0.238 0.044 0.095]\n", + "\n", + "QV:SEADATANET.493\n", + "[9 1]\n", + "\n", + "Tb_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.018 0.035 0.007 0.031]\n", + "\n", + "QV:SEADATANET.494\n", + "[9 1 2]\n", + "\n", + "Dy_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.107 0.192 0.041 0.071]\n", + "\n", + "QV:SEADATANET.495\n", + "[9 1 2]\n", + "\n", + "Ho_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.02 0.036 0.008 0.014]\n", + "\n", + "QV:SEADATANET.496\n", + "[9 1 2]\n", + "\n", + "Er_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.051 0.097 0.023 0.039]\n", + "\n", + "QV:SEADATANET.497\n", + "[9 1 2]\n", + "\n", + "Tm_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.0078 0.013 0.0037 0.0065]\n", + "\n", + "QV:SEADATANET.498\n", + "[9 1 2]\n", + "\n", + "Yb_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.0495 0.0822 0.0212 0.036 ]\n", + "\n", + "QV:SEADATANET.499\n", + "[9 1 2]\n", + "\n", + "Lu_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.0051 0.0077 0.0017 0.0037]\n", + "\n", + "QV:SEADATANET.500\n", + "[9 1 2]\n", + "\n", + "Y_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 1.970732 0.357073 0.192195 0.190244]\n", + "\n", + "QV:SEADATANET.501\n", + "[9 2 1 3]\n", + "\n", + "Y_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.229268 0.68878 0.618537 0.657561]\n", + "\n", + "QV:SEADATANET.502\n", + "[9 2 
1 3]\n", + "\n", + "Nd_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 0.600837 0.532094 1.052143 0.952824]\n", + "\n", + "QV:SEADATANET.503\n", + "[9 1 3]\n", + "\n", + "Nd_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 1.336585 0.218537 0.115122 0.130732]\n", + "\n", + "QV:SEADATANET.504\n", + "[9 2 3 1]\n", + "\n", + "Nd_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 2.57561 0.281951 0.303415 0.418537]\n", + "\n", + "QV:SEADATANET.505\n", + "[9 2 3 1]\n", + "\n", + "Y_TP_CONC_FISH [pmol/kg]\n", + "[ nan 12.603902 1.971317 1.403707 1.233561]\n", + "\n", + "QV:SEADATANET.506\n", + "[9 1]\n", + "\n", + "La_TP_CONC_FISH [pmol/kg]\n", + "[ nan 2.045171 1.915512 1.972293 2.076 ]\n", + "\n", + "QV:SEADATANET.507\n", + "[9 1]\n", + "\n", + "Fe_56_54_TP_DELTA_BOTTLE [per 10^3]\n", + "[ nan 0.046448 0.109383 0.016848 -0.015711]\n", + "\n", + "QV:SEADATANET.508\n", + "[9 1]\n", + "\n", + "Ba_138_134_SPL_DELTA_PUMP [per 10^3]\n", + "[ nan 0. -0.05 -0.02 -0.03]\n", + "\n", + "QV:SEADATANET.509\n", + "[9 2]\n", + "\n", + "Cu_65_63_SPT_DELTA_PUMP [per 10^3]\n", + "[ nan 0.37 0.27 0.24 0.11]\n", + "\n", + "QV:SEADATANET.510\n", + "[9 2 3]\n", + "\n", + "Cu_65_63_SPL_DELTA_PUMP [per 10^3]\n", + "[ nan 0.48 0.54 0.39 0.27]\n", + "\n", + "QV:SEADATANET.511\n", + "[9 2 3]\n", + "\n", + "Allo_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 2.880534 0. 
5.768254 6.78763 ]\n", + "\n", + "QV:SEADATANET.512\n", + "[9 1 6]\n", + "\n", + "Anth_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[nan 0.]\n", + "\n", + "QV:SEADATANET.513\n", + "[9 6]\n", + "\n", + "But fuco_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 35.493546 33.028984 42.024036 48.899803]\n", + "\n", + "QV:SEADATANET.514\n", + "[9 1 6]\n", + "\n", + "Alpha Car_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 3.524788 55.936646 3.891268 32.790211]\n", + "\n", + "QV:SEADATANET.515\n", + "[9 1 6]\n", + "\n", + "Beta Car_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 10.269084 8.585586 10.358151 7.232014]\n", + "\n", + "QV:SEADATANET.516\n", + "[9 1 6]\n", + "\n", + "Diadino_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 69.338997 35.938 13.28 10.502 ]\n", + "\n", + "QV:SEADATANET.517\n", + "[9 1 6]\n", + "\n", + "Diato_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 3.443475 3.578867 2.974312 7.085016]\n", + "\n", + "QV:SEADATANET.518\n", + "[9 1 6]\n", + "\n", + "Fuco_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 145.857208 143.758408 212.44043 108.200089]\n", + "\n", + "QV:SEADATANET.519\n", + "[9 1 6]\n", + "\n", + "Hex fuco_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 102.854836 93.955658 94.594887 68.711449]\n", + "\n", + "QV:SEADATANET.520\n", + "[9 1 6]\n", + "\n", + "Lut_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 0. 
12.51382 3.665026 5.146942]\n", + "\n", + "QV:SEADATANET.521\n", + "[9 6 1]\n", + "\n", + "Neo_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 4.245147 36.744522 2.638522 37.6022 ]\n", + "\n", + "QV:SEADATANET.522\n", + "[9 1 6]\n", + "\n", + "Perid_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 11.945248 5.510989 12.257776 3.910178]\n", + "\n", + "QV:SEADATANET.523\n", + "[9 1 6]\n", + "\n", + "Pras_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 3.851663 46.023136 2.398029 39.186272]\n", + "\n", + "QV:SEADATANET.524\n", + "[9 1 6]\n", + "\n", + "Viola_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 8.175071 8.402634 5.047509 4.284502]\n", + "\n", + "QV:SEADATANET.525\n", + "[9 1 6]\n", + "\n", + "Zea_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 0. 14.658606 16.749643 10.810811]\n", + "\n", + "QV:SEADATANET.526\n", + "[9 6 1]\n", + "\n", + "Gyrox_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[nan 0.]\n", + "\n", + "QV:SEADATANET.527\n", + "[9 6]\n", + "\n", + "Allo_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 0. 
1.620942 20.188374 17.506992]\n", + "\n", + "QV:SEADATANET.528\n", + "[9 6 1]\n", + "\n", + "But fuco_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 18.731256 34.212151 48.980167 55.461548]\n", + "\n", + "QV:SEADATANET.529\n", + "[9 1]\n", + "\n", + "Beta Car_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 7.805306 7.048862 8.974472 7.832905]\n", + "\n", + "QV:SEADATANET.530\n", + "[9 1]\n", + "\n", + "Diadino_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 31.242001 21.532 80.603996 89.880745]\n", + "\n", + "QV:SEADATANET.531\n", + "[9 1]\n", + "\n", + "Fuco_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 10.968168 42.386414 53.230206 67.281281]\n", + "\n", + "QV:SEADATANET.532\n", + "[9 1]\n", + "\n", + "Hex fuco_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 88.186478 89.810249 170.852112 254.361237]\n", + "\n", + "QV:SEADATANET.533\n", + "[9 1]\n", + "\n", + "Lut_HPLC_TP_CONC_FISH [ng/liter]\n", + "[nan 0.]\n", + "\n", + "QV:SEADATANET.534\n", + "[9 6]\n", + "\n", + "Perid_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 5.472818 4.42072 23.007784 81.729637]\n", + "\n", + "QV:SEADATANET.535\n", + "[9 1 6]\n", + "\n", + "Viola_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 4.317011 5.415626 0. 24.516607]\n", + "\n", + "QV:SEADATANET.536\n", + "[9 1 6]\n", + "\n", + "Zea_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 32.383663 0. 15.97846 13.495225]\n", + "\n", + "QV:SEADATANET.537\n", + "[9 1 6]\n", + "\n", + "Chl a_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 343.280304 291.095856 368.654602 234.799271]\n", + "\n", + "QV:SEADATANET.538\n", + "[9 1 6]\n", + "\n", + "Chl b_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 0. 85.002571 59.243439 60.365414]\n", + "\n", + "QV:SEADATANET.539\n", + "[9 6 1]\n", + "\n", + "Chl c3_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 0. 
29.862658 17.607647 46.841129]\n", + "\n", + "QV:SEADATANET.540\n", + "[9 6 1]\n", + "\n", + "Chl c1-chl c2_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 318.122711 3348.00293 142.516449 7650.768066]\n", + "\n", + "QV:SEADATANET.541\n", + "[9 1]\n", + "\n", + "Chl c TOT_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 131.300003 122.699997 177.5 76.599998]\n", + "\n", + "QV:SEADATANET.542\n", + "[9 1 6]\n", + "\n", + "Chl a allom_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 5.085326 52.472664 6.207444 80.428101]\n", + "\n", + "QV:SEADATANET.543\n", + "[9 1]\n", + "\n", + "Chl a epimer_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 6.699603 53.29155 10.527572 4.659686]\n", + "\n", + "QV:SEADATANET.544\n", + "[9 1]\n", + "\n", + "Chlide a_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 24.69055 288.909882 4.629637 1406.627686]\n", + "\n", + "QV:SEADATANET.545\n", + "[9 1 6]\n", + "\n", + "DV chl a_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 0. 6.046967 6.508017 11.18438 ]\n", + "\n", + "QV:SEADATANET.546\n", + "[9 6 1]\n", + "\n", + "Chl a-DV chla_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 449. 404.5 500.899994 241. ]\n", + "\n", + "QV:SEADATANET.547\n", + "[9 1 6]\n", + "\n", + "Chl a_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 181.916824 181.609406 245.424957 517.605774]\n", + "\n", + "QV:SEADATANET.548\n", + "[9 1]\n", + "\n", + "Chl b_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 0. 53.777664 34.019558]\n", + "\n", + "QV:SEADATANET.549\n", + "[9 6 1]\n", + "\n", + "Chl c3_HPLC_TP_CONC_FISH [ng/liter]\n", + "[nan 0.]\n", + "\n", + "QV:SEADATANET.550\n", + "[9 6]\n", + "\n", + "DV chl a_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 14.120045 0. ]\n", + "\n", + "QV:SEADATANET.551\n", + "[9 1 6]\n", + "\n", + "CHLA_FLUOR_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 53.660454 60.322136 54.215595 87.284599]\n", + "\n", + "QV:SEADATANET.552\n", + "[9 1 6]\n", + "\n", + "PHAEO_FLUOR_TP_CONC_BOTTLE [ng/liter]\n", + "[nan 31. 32. 28. 
29.]\n", + "\n", + "QV:SEADATANET.553\n", + "[9 1]\n", + "\n", + "PEP_VAAEAVLSMTK_NiSOD_ProSyn_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 441.946869 379.980927 97.860695 40.193665]\n", + "\n", + "QV:SEADATANET.554\n", + "[9 1 6]\n", + "\n", + "PEP_SPYNQSLVANQIVNK_IdiA_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 0. 48.103069 25.950068 203.370743]\n", + "\n", + "QV:SEADATANET.555\n", + "[9 6 1]\n", + "\n", + "PEP_LHNFISSAESPK_Fld_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 1.39776 34.549454 7.477602 0.495968]\n", + "\n", + "QV:SEADATANET.556\n", + "[9 1 6]\n", + "\n", + "PEP_AGADMVGYVDK_Fld_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 15.13088 78.002304 13.546335 1.454976]\n", + "\n", + "QV:SEADATANET.557\n", + "[9 1 6]\n", + "\n", + "PEP_TVGIYYATTTGK_Fld_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 93.825279 279.165466 85.259254 7.3104 ]\n", + "\n", + "QV:SEADATANET.558\n", + "[9 1]\n", + "\n", + "PEP_VNSVIDAIAEAAK_P-II-glnB-glnK_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 17.52832 25.690355 6.244293 0.482912]\n", + "\n", + "QV:SEADATANET.559\n", + "[9 1 6]\n", + "\n", + "PEP_LSHQAIAEAIGSTR_NtcA_Cyano_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 15.792 16.514317 4.540525 1.601824]\n", + "\n", + "QV:SEADATANET.560\n", + "[9 1 6]\n", + "\n", + "PEP_SKLEDDPANPELILTAR_PhoP_Syn_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 0. 0.491568 0.142632 0.579424]\n", + "\n", + "QV:SEADATANET.561\n", + "[9 6 1]\n", + "\n", + "PEP_LIDQDGVPVVFGGWTSASR_UreaTran_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 1791.224976 1486.048218 539.431274 85.778305]\n", + "\n", + "QV:SEADATANET.562\n", + "[9 1 6]\n", + "\n", + "PEP_VVGEDYLPLGNTEVAPIISK_UreaTran_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 1830.47168 2480.089355 728.808899 138.801819]\n", + "\n", + "QV:SEADATANET.563\n", + "[9 1 6]\n", + "\n", + "PEP_IEYIVEDGASDWPTFAEK_UreaTran_ProSyn_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 13561.165039 7878.208984 2308.209961 0. 
]\n", + "\n", + "QV:SEADATANET.564\n", + "[9 1 6]\n", + "\n", + "PEP_IPEDIAFAESR_UreC_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[nan 1.1 0.8 1.4 2.1]\n", + "\n", + "QV:SEADATANET.565\n", + "[9 1 6]\n", + "\n", + "PEP_VGVAGPVGSGK_UreG_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[nan 0.5 0.3 0.4 0.8]\n", + "\n", + "QV:SEADATANET.566\n", + "[9 1 6]\n", + "\n", + "PEP_FDYDGDYGTVLNR_UDP-sulfoquin_m-taxa_TP_CONC_PUMP [fmol/liter]\n", + "[nan 3.5 2.6 4.6 2.1]\n", + "\n", + "QV:SEADATANET.567\n", + "[9 1 6]\n", + "\n", + "PEP_NEAVENDLIVDNK_UDP-sulfoquin_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[nan 1.8 1.1 2.4 2.9]\n", + "\n", + "QV:SEADATANET.568\n", + "[9 1 6]\n", + "\n", + "PEP_EAYPDFASAK_NH4-transporter_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[nan 1.2 1.1 0.1 0. ]\n", + "\n", + "QV:SEADATANET.569\n", + "[9 1 6]\n", + "\n", + "PEP_FDSLINSADNVMTYK_Glut-synt_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 2.00000000e-01 1.00000000e-01 3.00000000e-01\n", + " 1.02400002e+02]\n", + "\n", + "QV:SEADATANET.570\n", + "[9 1 6]\n", + "\n", + "PEP_EGYFPVSPNDTAQDIR_Glut-synt_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 10.8 6.6 9.6 21.299999]\n", + "\n", + "QV:SEADATANET.571\n", + "[9 1 6]\n", + "\n", + "PEP_HAPSFLAFTNPTTNSYK_Glut-synt_ProSyn_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 29.9 15.6 17.700001 52. ]\n", + "\n", + "QV:SEADATANET.572\n", + "[9 1]\n", + "\n", + "PEP_VASLTGADINYLPNPR_UDP-sulfoquin_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[nan 2.1 1.4 2.6 2.4]\n", + "\n", + "QV:SEADATANET.573\n", + "[9 1 6]\n", + "\n", + "CELL_VOLUME_BOTTLE [um^3]\n", + "[ nan 29.4 4.7 6.2 338. ]\n", + "\n", + "QV:SEADATANET.574\n", + "[9 1]\n", + "\n", + "CELL_TYPE_BOTTLE\n", + "[nan 2. 3. 1. 5.]\n", + "\n", + "QV:SEADATANET.575\n", + "[9 1]\n", + "\n", + "Fe_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 0.69 3.14 56.5 1.39]\n", + "\n", + "QV:SEADATANET.576\n", + "[9 1]\n", + "\n", + "C_CELL_CONC_BOTTLE [fmol/cell]\n", + "[ nan 430. 76.599998 100. 4260. 
]\n", + "\n", + "QV:SEADATANET.577\n", + "[9 1]\n", + "\n", + "Si_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 33. 40.400002 27.9 12.9 ]\n", + "\n", + "QV:SEADATANET.578\n", + "[9 1]\n", + "\n", + "P_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 38. 200. 375. 6050.]\n", + "\n", + "QV:SEADATANET.579\n", + "[9 1]\n", + "\n", + "S_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 287. 870. 13100. 462.]\n", + "\n", + "QV:SEADATANET.580\n", + "[9 1]\n", + "\n", + "Mn_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 0.38 7.76 0.06 0.46]\n", + "\n", + "QV:SEADATANET.581\n", + "[9 1]\n", + "\n", + "Co_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 0.07 0.1 0.05 0.01]\n", + "\n", + "QV:SEADATANET.582\n", + "[9 1]\n", + "\n", + "Ni_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 0.07 0.22 3.03 0.37]\n", + "\n", + "QV:SEADATANET.583\n", + "[9 1]\n", + "\n", + "Cu_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 1.48 0.71 4.54 1.7 ]\n", + "\n", + "QV:SEADATANET.584\n", + "[9 1]\n", + "\n", + "Zn_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 1.4 21.700001 1.04 22.1 ]\n", + "\n", + "QV:SEADATANET.585\n", + "[9 1]\n", + "\n", + "QV:ODV:SAMPLE\n", + "[1]\n", + "\n" + ] + } + ], + "source": [ + "for col in df.columns:\n", + " print(col)\n", + " print(df[col].unique()[:5])\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e301352", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Pourquoi pas?', 'RRS James Cook', 'Pelagia', 'Knorr',\n", + " 'Angeles Alvarino', 'RRS Discovery', 'Meteor', 'FS Meteor',\n", + " 'James Cook', 'Atlantic Explorer', 'Pelican', 'Point Sur',\n", + " 'Sagar Kanya', 'Hakuho Maru', 'Tangaroa', 'Aurora Australis',\n", + " 'Marion Dufresne', 'FS Polarstern', 'Jakov Smirnitskiy',\n", + " 'Investigator', 'S. A. Agulhas II', 'Sam Rothberg', 'Healy',\n", + " 'Amundsen', 'Polarstern', \"L'Atalante\", 'RV Southern Surveyor',\n", + " 'Roger Revelle', 'Thomas G. Thompson', 'Sonne', 'Kilo Moana',\n", + " 'John P. Tully', 'Nathaniel B. 
Palmer', 'Akademik Tryoshnikov'],\n", + " dtype=object)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Ship Name:METAVAR:INDEXED_TEXT\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85f010f0", + "metadata": {}, + "outputs": [], + "source": [ + "# GEOTRACES Sample ID" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a01362a", + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'GEOTRACES Sample ID'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/indexes/base.py:3802\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3801\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3802\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3803\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:153\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:182\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in 
\u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'GEOTRACES Sample ID'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn [71], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mGEOTRACES Sample ID\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/frame.py:4090\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4089\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4090\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4091\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4092\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/indexes/base.py:3809\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3805\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, 
abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3807\u001b[0m ):\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3809\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3810\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3812\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'GEOTRACES Sample ID'" + ] + } + ], + "source": [ + "df['GEOTRACES Sample ID']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cb03efb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Cruise', 'Station:METAVAR:INDEXED_TEXT', 'Type',\n", + " 'yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]',\n", + " 'Latitude [degrees_north]', 'Bot. 
Depth [m]',\n", + " 'Operator's Cruise Name:METAVAR:INDEXED_TEXT',\n", + " 'Ship Name:METAVAR:INDEXED_TEXT', 'Period:METAVAR:INDEXED_TEXT',\n", + " ...\n", + " 'QV:SEADATANET.581', 'Co_CELL_CONC_BOTTLE [amol/cell]',\n", + " 'QV:SEADATANET.582', 'Ni_CELL_CONC_BOTTLE [amol/cell]',\n", + " 'QV:SEADATANET.583', 'Cu_CELL_CONC_BOTTLE [amol/cell]',\n", + " 'QV:SEADATANET.584', 'Zn_CELL_CONC_BOTTLE [amol/cell]',\n", + " 'QV:SEADATANET.585', 'QV:ODV:SAMPLE'],\n", + " dtype='object', length=1188)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55d645fa-660d-4298-b72b-fb06dbd2e2d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ctdsal_d_conc_sensor [pss-78]\n", + "salinity_d_conc_bottle\n", + "salinity_d_conc_pump\n", + "salinity_d_conc_fish\n", + "salinity_d_conc_uway\n", + "salinity_d_conc_boat_pump\n", + "ctdtmp_t_value_sensor [deg c]\n", + "oxygen_d_conc_bottle [umol/kg]\n", + "ctdoxy_d_conc_sensor [umol/kg]\n", + "U_236_238_T_RATIO_BOTTLE [per 10^12]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "def find_print_col(s, cols, lower=True):\n", + " cols = cols if not lower else [col.lower() for col in cols]\n", + " for col in cols:\n", + " if s in col: print(col)\n", + "\n", + "find_print_col('sal', df.columns)\n", + "find_print_col('tmp', df.columns)\n", + "find_print_col('oxy', df.columns)\n", + "find_print_col('U_236_238', df.columns, lower=False)" + ] + }, + { + "cell_type": "markdown", + "id": "bf1a71c0", + "metadata": {}, + "source": [ + "## Data transformation pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "c4407027-942c-4240-a92d-a40311c05afd", + "metadata": {}, + "source": [ + "### Select columns of interest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55d50ff7-63dc-4719-a6c1-c2ff7e6ecb7f", + "metadata": {}, + 
"outputs": [], + "source": [ + "#| hide\n", + "# U_236_238\n", + "# Done: Th_232, I_129, Ac_227" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d3a7bba-39d7-4fc3-8f0e-fb83ff52dcc2", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "common_coi = ['yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]',\n", + " 'Latitude [degrees_north]', 'Bot. Depth [m]', 'DEPTH [m]']\n", + "\n", + "nuclides_pattern = ['^TRITI', '^Th_228', '^Th_23[024]', '^Pa_231', \n", + " '^U_236_[DT]', '^Be_', '^Cs_137', '^Pb_210', '^Po_210',\n", + " '^Ra_22[3468]', 'Np_237', '^Pu_239_[D]', '^Pu_240', '^Pu_239_Pu_240',\n", + " '^I_129', '^Ac_227'] \n", + "\n", + "class SelectColsOfInterestCB(Callback):\n", + " \"Select columns of interest.\"\n", + " def __init__(self, common_coi, nuclides_pattern): fc.store_attr()\n", + " def __call__(self, tfm):\n", + " nuc_of_interest = [c for c in tfm.df.columns if \n", + " any(re.match(pattern, c) for pattern in self.nuclides_pattern)]\n", + "\n", + " tfm.df = tfm.df[self.common_coi + nuc_of_interest]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9005e7e4-f0d7-4944-abea-60e5f5522e22", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "tfm = Transformer(df, cbs=[\n", + " SelectColsOfInterestCB(common_coi, nuclides_pattern)\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52f52056-f58d-4ef5-b5de-dc8adf51eac0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yyyy-mm-ddThh:mm:ss.sssLongitude [degrees_east]Latitude [degrees_north]Bot. Depth [m]DEPTH [m]TRITIUM_D_CONC_BOTTLE [TU]Cs_137_D_CONC_BOTTLE [uBq/kg]I_129_D_CONC_BOTTLE [atoms/kg]Np_237_D_CONC_BOTTLE [uBq/kg]Pu_239_D_CONC_BOTTLE [uBq/kg]...Th_230_TP_CONC_PUMP [uBq/kg]Th_230_SPT_CONC_PUMP [uBq/kg]Th_230_LPT_CONC_PUMP [uBq/kg]Th_232_TP_CONC_PUMP [pmol/kg]Th_232_SPT_CONC_PUMP [pmol/kg]Th_232_LPT_CONC_PUMP [pmol/kg]Th_234_SPT_CONC_PUMP [mBq/kg]Th_234_LPT_CONC_PUMP [mBq/kg]Po_210_TP_CONC_UWAY [mBq/kg]Pb_210_TP_CONC_UWAY [mBq/kg]
02014-05-17T22:29:00349.2999938.43294854.02957.1NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
12014-05-17T22:29:00349.2999938.43294854.02957.2NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22014-05-17T22:29:00349.2999938.43294854.02957.2NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
32014-05-17T22:29:00349.2999938.43294854.02957.2NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
42014-05-17T22:29:00349.2999938.43294854.02957.2NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 85 columns

\n", + "
" + ], + "text/plain": [ + " yyyy-mm-ddThh:mm:ss.sss Longitude [degrees_east] Latitude [degrees_north] \\\n", + "0 2014-05-17T22:29:00 349.29999 38.4329 \n", + "1 2014-05-17T22:29:00 349.29999 38.4329 \n", + "2 2014-05-17T22:29:00 349.29999 38.4329 \n", + "3 2014-05-17T22:29:00 349.29999 38.4329 \n", + "4 2014-05-17T22:29:00 349.29999 38.4329 \n", + "\n", + " Bot. Depth [m] DEPTH [m] TRITIUM_D_CONC_BOTTLE [TU] \\\n", + "0 4854.0 2957.1 NaN \n", + "1 4854.0 2957.2 NaN \n", + "2 4854.0 2957.2 NaN \n", + "3 4854.0 2957.2 NaN \n", + "4 4854.0 2957.2 NaN \n", + "\n", + " Cs_137_D_CONC_BOTTLE [uBq/kg] I_129_D_CONC_BOTTLE [atoms/kg] \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " Np_237_D_CONC_BOTTLE [uBq/kg] Pu_239_D_CONC_BOTTLE [uBq/kg] ... \\\n", + "0 NaN NaN ... \n", + "1 NaN NaN ... \n", + "2 NaN NaN ... \n", + "3 NaN NaN ... \n", + "4 NaN NaN ... \n", + "\n", + " Th_230_TP_CONC_PUMP [uBq/kg] Th_230_SPT_CONC_PUMP [uBq/kg] \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " Th_230_LPT_CONC_PUMP [uBq/kg] Th_232_TP_CONC_PUMP [pmol/kg] \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " Th_232_SPT_CONC_PUMP [pmol/kg] Th_232_LPT_CONC_PUMP [pmol/kg] \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " Th_234_SPT_CONC_PUMP [mBq/kg] Th_234_LPT_CONC_PUMP [mBq/kg] \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " Po_210_TP_CONC_UWAY [mBq/kg] Pb_210_TP_CONC_UWAY [mBq/kg] \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + "[5 rows x 85 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "df_test = tfm()\n", + "df_test.head()" + ] + }, + { + 
"cell_type": "markdown", + "id": "f6ceb332-536a-4054-a85a-a56960fb28a1", + "metadata": {}, + "source": [ + "### Reshape: wide to long\n", + "\n", + "So that we can extract information such as sample methodology, filtering status, units included in Geotraces nuclides name." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b060bb07-5565-4928-8b43-4abc5e64eb97", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class WideToLongCB(Callback):\n", + " \"\"\"\n", + " Get Geotraces nuclide names as values not column names \n", + " to extract contained information (unit, sampling method, ...).\n", + " \"\"\"\n", + " def __init__(self, common_coi, nuclides_pattern, \n", + " var_name='nuclide', value_name='value'): \n", + " fc.store_attr()\n", + " \n", + " def __call__(self, tfm):\n", + " nuc_of_interest = [c for c in tfm.df.columns if \n", + " any(re.match(pattern, c) for pattern in self.nuclides_pattern)]\n", + " tfm.df = pd.melt(tfm.df, id_vars=self.common_coi, value_vars=nuc_of_interest, \n", + " var_name=self.var_name, value_name=self.value_name)\n", + " tfm.df.dropna(subset='value', inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c95cc28d-b236-412d-9378-f06e85f95560", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(26745, 7)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "tfm = Transformer(df, cbs=[\n", + " SelectColsOfInterestCB(common_coi, nuclides_pattern),\n", + " WideToLongCB(common_coi, nuclides_pattern)\n", + "])\n", + "df_test = tfm()\n", + "df_test.shape" + ] + }, + { + "cell_type": "markdown", + "id": "de71e3a4-3f0e-4392-8338-1f9ef907f5da", + "metadata": {}, + "source": [ + "### Extract" + ] + }, + { + "cell_type": "markdown", + "id": "adee33d8-310b-43db-9eea-58ccaeed2065", + "metadata": {}, + "source": [ + "#### Unit" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "3e982489-7c69-4b6d-9930-8f786220ad5b", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class ExtractUnitCB(Callback):\n", + " \"\"\"\n", + " Extract units from nuclide names.\n", + " \"\"\"\n", + " def __init__(self, var_name='nuclide'): \n", + " fc.store_attr()\n", + " self.unit_col_name = cdl_cfg()['vars']['suffixes']['unit']['name']\n", + "\n", + " def extract_unit(self, s):\n", + " match = re.search(r'\\[(.*?)\\]', s)\n", + " return match.group(1) if match else None\n", + " \n", + " def __call__(self, tfm):\n", + " tfm.df[self.unit_col_name] = tfm.df[self.var_name].apply(self.extract_unit)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "739decc1-64e8-4ea3-8f09-89a8b04dc1f8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yyyy-mm-ddThh:mm:ss.sssLongitude [degrees_east]Latitude [degrees_north]Bot. Depth [m]DEPTH [m]nuclidevalue_unit
92232010-10-17T00:13:29350.3379238.32712827.017.8TRITIUM_D_CONC_BOTTLE [TU]0.733TU
92312010-10-17T00:13:29350.3379238.32712827.034.7TRITIUM_D_CONC_BOTTLE [TU]0.696TU
92372010-10-17T00:13:29350.3379238.32712827.067.5TRITIUM_D_CONC_BOTTLE [TU]0.718TU
92442010-10-17T00:13:29350.3379238.32712827.091.9TRITIUM_D_CONC_BOTTLE [TU]0.709TU
92562010-10-17T00:13:29350.3379238.32712827.0136.6TRITIUM_D_CONC_BOTTLE [TU]0.692TU
\n", + "
" + ], + "text/plain": [ + " yyyy-mm-ddThh:mm:ss.sss Longitude [degrees_east] \\\n", + "9223 2010-10-17T00:13:29 350.33792 \n", + "9231 2010-10-17T00:13:29 350.33792 \n", + "9237 2010-10-17T00:13:29 350.33792 \n", + "9244 2010-10-17T00:13:29 350.33792 \n", + "9256 2010-10-17T00:13:29 350.33792 \n", + "\n", + " Latitude [degrees_north] Bot. Depth [m] DEPTH [m] \\\n", + "9223 38.3271 2827.0 17.8 \n", + "9231 38.3271 2827.0 34.7 \n", + "9237 38.3271 2827.0 67.5 \n", + "9244 38.3271 2827.0 91.9 \n", + "9256 38.3271 2827.0 136.6 \n", + "\n", + " nuclide value _unit \n", + "9223 TRITIUM_D_CONC_BOTTLE [TU] 0.733 TU \n", + "9231 TRITIUM_D_CONC_BOTTLE [TU] 0.696 TU \n", + "9237 TRITIUM_D_CONC_BOTTLE [TU] 0.718 TU \n", + "9244 TRITIUM_D_CONC_BOTTLE [TU] 0.709 TU \n", + "9256 TRITIUM_D_CONC_BOTTLE [TU] 0.692 TU " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "tfm = Transformer(df, cbs=[\n", + " SelectColsOfInterestCB(common_coi, nuclides_pattern),\n", + " WideToLongCB(common_coi, nuclides_pattern),\n", + " ExtractUnitCB()\n", + "])\n", + "\n", + "df_test = tfm()\n", + "df_test.head()" + ] + }, + { + "cell_type": "markdown", + "id": "219fc817-6700-4c3a-b353-08cc89c05538", + "metadata": {}, + "source": [ + "#### Filtering status" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efb9477e-f593-4d15-b8b8-c073bd6bb590", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "phase = {\n", + " 'D': {'filt': 1, 'group': 'seawater'},\n", + " 'T': {'filt': 2, 'group': 'seawater'},\n", + " 'TP': {'filt': 1, 'group': 'suspended-matter'}, \n", + " 'LPT': {'filt': 1, 'group': 'suspended-matter'},\n", + " 'SPT': {'filt': 1, 'group': 'suspended-matter'}}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ba72b3a-a013-4b5d-881a-9fd0a7e8b74c", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class 
ExtractFilteringStatusCB(Callback):\n", + " \"Extract filtering status from nuclide names.\"\n", + " def __init__(self, phase, var_name='nuclide'): \n", + " fc.store_attr()\n", + " self.filt_col_name = cdl_cfg()['vars']['suffixes']['filtered']['name']\n", + "\n", + " def extract_filt_status(self, s):\n", + " matched_string = self.match(s)\n", + " return self.phase[matched_string.group(1)]['filt'] if matched_string else None\n", + "\n", + " def match(self, s):\n", + " return re.search(r'_(' + '|'.join(self.phase.keys()) + ')_', s)\n", + " \n", + " def extract_group(self, s):\n", + " matched_string = self.match(s)\n", + " return self.phase[matched_string.group(1)]['group'] if matched_string else None\n", + " \n", + " def __call__(self, tfm):\n", + " tfm.df[self.filt_col_name] = tfm.df[self.var_name].apply(self.extract_filt_status)\n", + " tfm.df['group'] = tfm.df[self.var_name].apply(self.extract_group)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4821951-c14a-4315-8d03-ec82f83a242a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yyyy-mm-ddThh:mm:ss.sssLongitude [degrees_east]Latitude [degrees_north]Bot. Depth [m]DEPTH [m]nuclidevalue_unit_filtgroup
92232010-10-17T00:13:29350.3379238.32712827.017.8TRITIUM_D_CONC_BOTTLE [TU]0.733TU1seawater
92312010-10-17T00:13:29350.3379238.32712827.034.7TRITIUM_D_CONC_BOTTLE [TU]0.696TU1seawater
92372010-10-17T00:13:29350.3379238.32712827.067.5TRITIUM_D_CONC_BOTTLE [TU]0.718TU1seawater
92442010-10-17T00:13:29350.3379238.32712827.091.9TRITIUM_D_CONC_BOTTLE [TU]0.709TU1seawater
92562010-10-17T00:13:29350.3379238.32712827.0136.6TRITIUM_D_CONC_BOTTLE [TU]0.692TU1seawater
\n", + "
" + ], + "text/plain": [ + " yyyy-mm-ddThh:mm:ss.sss Longitude [degrees_east] \\\n", + "9223 2010-10-17T00:13:29 350.33792 \n", + "9231 2010-10-17T00:13:29 350.33792 \n", + "9237 2010-10-17T00:13:29 350.33792 \n", + "9244 2010-10-17T00:13:29 350.33792 \n", + "9256 2010-10-17T00:13:29 350.33792 \n", + "\n", + " Latitude [degrees_north] Bot. Depth [m] DEPTH [m] \\\n", + "9223 38.3271 2827.0 17.8 \n", + "9231 38.3271 2827.0 34.7 \n", + "9237 38.3271 2827.0 67.5 \n", + "9244 38.3271 2827.0 91.9 \n", + "9256 38.3271 2827.0 136.6 \n", + "\n", + " nuclide value _unit _filt group \n", + "9223 TRITIUM_D_CONC_BOTTLE [TU] 0.733 TU 1 seawater \n", + "9231 TRITIUM_D_CONC_BOTTLE [TU] 0.696 TU 1 seawater \n", + "9237 TRITIUM_D_CONC_BOTTLE [TU] 0.718 TU 1 seawater \n", + "9244 TRITIUM_D_CONC_BOTTLE [TU] 0.709 TU 1 seawater \n", + "9256 TRITIUM_D_CONC_BOTTLE [TU] 0.692 TU 1 seawater " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#|eval: false\n", + "tfm = Transformer(df, cbs=[\n", + " SelectColsOfInterestCB(common_coi, nuclides_pattern),\n", + " WideToLongCB(common_coi, nuclides_pattern),\n", + " ExtractUnitCB(),\n", + " ExtractFilteringStatusCB(phase)\n", + "])\n", + "\n", + "df_test = tfm()\n", + "df_test.head()" + ] + }, + { + "cell_type": "markdown", + "id": "53acd63b-9fb9-4f51-a525-e020602893fc", + "metadata": {}, + "source": [ + "#### Sampling method" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7c79502-f09e-49c0-851b-cdb2eca82eac", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "# To be validated\n", + "smp_method = {\n", + " 'BOTTLE': 1,\n", + " 'FISH': 18,\n", + " 'PUMP': 14,\n", + " 'UWAY': 24}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b4663f8-6cb1-45c3-8437-97a6ba9c5214", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class ExtractSamplingMethodCB(Callback):\n", + " \"Extract sampling method from 
nuclide names.\"\n", + " def __init__(self, smp_method, var_name='nuclide'): \n", + " fc.store_attr()\n", + " self.smp_method_col_name = cdl_cfg()['vars']['suffixes']['sampling_method']['name']\n", + "\n", + " def extract_smp_method(self, s):\n", + " match = re.search(r'_(' + '|'.join(self.smp_method.keys()) + ') ', s)\n", + " return self.smp_method[match.group(1)] if match else None\n", + " \n", + " def __call__(self, tfm):\n", + " tfm.df[self.smp_method_col_name] = tfm.df[self.var_name].apply(self.extract_smp_method)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "429020cf-3e5d-4efc-963d-af82bd1a0820", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2242,63 +6954,458 @@ { "cell_type": "code", "execution_count": null, - "id": "f4d2ffaa", + "id": "f4d2ffaa", + "metadata": {}, + "outputs": [], + "source": [ + "# TO BE DONE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "347a0371-c6e4-4b7d-b1f8-8ffb00509e42", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "class DispatchToGroupCB(Callback):\n", + " \"Convert to a dictionary of dataframe with sample type (seawater,...) as keys.\"\n", + " def __init__(self, group_name='group'): \n", + " fc.store_attr()\n", + " \n", + " def __call__(self, tfm):\n", + " tfm.dfs = dict(tuple(tfm.df.groupby(self.group_name)))\n", + " for key in tfm.dfs:\n", + " tfm.dfs[key] = tfm.dfs[key].drop(self.group_name, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "befc6bf4-4ecc-4165-892c-1ce5c334dfab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dfs_test keys: dict_keys(['seawater', 'suspended-matter'])\n", + " time lon lat tot_depth smp_depth nuclide \\\n", + "9223 2010-10-17T00:13:29 170.33792 38.3271 2827.0 17.8 h3 \n", + "9231 2010-10-17T00:13:29 170.33792 38.3271 2827.0 34.7 h3 \n", + "9237 2010-10-17T00:13:29 170.33792 38.3271 2827.0 67.5 h3 \n", + "9244 2010-10-17T00:13:29 170.33792 38.3271 2827.0 91.9 h3 \n", + "9256 2010-10-17T00:13:29 170.33792 38.3271 2827.0 136.6 h3 \n", + "\n", + " value _unit _filt _sampmet \n", + "9223 0.733 7.0 1 1 \n", + "9231 0.696 7.0 1 1 \n", + "9237 0.718 7.0 1 1 \n", + "9244 0.709 7.0 1 1 \n", + "9256 0.692 7.0 1 1 \n" + ] + } + ], + "source": [ + "#|eval: false\n", + "df = pd.read_csv(fname_in)\n", + "\n", + "tfm = Transformer(df, cbs=[\n", + " SelectColsOfInterestCB(common_coi, nuclides_pattern),\n", + " WideToLongCB(common_coi, nuclides_pattern),\n", + " ExtractUnitCB(),\n", + " ExtractFilteringStatusCB(phase),\n", + " 
ExtractSamplingMethodCB(smp_method),\n", + " RenameNuclideCB(nuclides_name),\n", + " StandardizeUnitCB(units_lut),\n", + " RenameColumnCB(renaming_rules),\n", + " UnshiftLongitudeCB(),\n", + " DispatchToGroupCB()\n", + "])\n", + "\n", + "dfs_test = tfm()\n", + "print(f'dfs_test keys: {dfs_test.keys()}')\n", + "print(dfs_test['seawater'].head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d095bf1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['time', 'lon', 'lat', 'tot_depth', 'smp_depth', 'nuclide', 'value',\n", + " '_unit', '_filt', '_sampmet'],\n", + " dtype='object')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs_test['seawater'].columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "579daae3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(19139, 10)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs_test['seawater'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a2e6b3d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "10733" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# TO BE DONE" + "dfs_test['seawater'][['lon', 'time', 'tot_depth', 'smp_depth', 'lat']].duplicated().sum()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "3167c7a3-d383-49d6-9bd6-40ad9eb0543d", + "metadata": {}, + "source": [ + "### Reshape: long to wide" ] }, { "cell_type": "code", "execution_count": null, - "id": "347a0371-c6e4-4b7d-b1f8-8ffb00509e42", + "id": "a2c222a4-24b8-4b5a-a6fc-37aeb66bc502", "metadata": {}, "outputs": [], "source": [ "#| export\n", - "class DispatchToGroupCB(Callback):\n", - " \"Convert to a dictionary of dataframe with sample type (seawater,...) 
as keys.\"\n", - " def __init__(self, group_name='group'): \n", - " fc.store_attr()\n", + "# class ReshapeLongToWide(Callback):\n", + "# \"Convert data from long to wide with renamed columns.\"\n", + "# def __init__(self, columns='nuclide', values=['value']):\n", + "# fc.store_attr()\n", + "# # Retrieve all possible derived vars (e.g 'unc', 'dl', ...) from configs\n", + "# self.derived_cols = [value['name'] for value in cdl_cfg()['vars']['suffixes'].values()]\n", + " \n", + "# def renamed_cols(self, cols):\n", + "# \"Flatten columns name\"\n", + "# return [inner if outer == \"value\" else f'{inner}{outer}'\n", + "# if inner else outer\n", + "# for outer, inner in cols]\n", + "\n", + "# def pivot(self, df):\n", + "# # Among all possible 'derived cols' select the ones present in df\n", + "# derived_coi = [col for col in self.derived_cols if col in df.columns]\n", " \n", - " def __call__(self, tfm):\n", - " tfm.dfs = dict(tuple(tfm.df.groupby(self.group_name)))\n", - " for key in tfm.dfs:\n", - " tfm.dfs[key] = tfm.dfs[key].drop(self.group_name, axis=1)" + "# df.reset_index(names='sample', inplace=True)\n", + " \n", + "# idx = list(set(df.columns) - set([self.columns] + derived_coi + self.values))\n", + "# return df.pivot_table(index=idx,\n", + "# columns=self.columns,\n", + "# values=self.values + derived_coi,\n", + "# fill_value=np.nan,\n", + "# aggfunc=lambda x: x\n", + "# ).reset_index()\n", + "\n", + "# def __call__(self, tfm):\n", + "# for k in tfm.dfs.keys():\n", + "# tfm.dfs[k] = self.pivot(tfm.dfs[k])\n", + "# tfm.dfs[k].columns = self.renamed_cols(tfm.dfs[k].columns)" ] }, { "cell_type": "code", "execution_count": null, - "id": "befc6bf4-4ecc-4165-892c-1ce5c334dfab", + "id": "088aaa9d-1daf-4765-986c-4d8207caeeef", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "dfs_test keys: dict_keys(['seawater', 'suspended-matter'])\n", - " time lon lat tot_depth smp_depth nuclide \\\n", - "9223 2010-10-17T00:13:29 170.33792 38.3271 
2827.0 17.8 h3 \n", - "9231 2010-10-17T00:13:29 170.33792 38.3271 2827.0 34.7 h3 \n", - "9237 2010-10-17T00:13:29 170.33792 38.3271 2827.0 67.5 h3 \n", - "9244 2010-10-17T00:13:29 170.33792 38.3271 2827.0 91.9 h3 \n", - "9256 2010-10-17T00:13:29 170.33792 38.3271 2827.0 136.6 h3 \n", - "\n", - " value _unit _filt _sampmet \n", - "9223 0.733 7.0 1 1 \n", - "9231 0.696 7.0 1 1 \n", - "9237 0.718 7.0 1 1 \n", - "9244 0.709 7.0 1 1 \n", - "9256 0.692 7.0 1 1 \n" + "shape: (8406, 89)\n", + "columns: Index(['time', 'lat', 'tot_depth', 'smp_depth', 'lon', 'ac227_filt',\n", + " 'be7_filt', 'cs137_filt', 'h3_filt', 'i129_filt', 'np237_filt',\n", + " 'pa231_filt', 'pb210_filt', 'po210_filt', 'pu239_filt',\n", + " 'pu239_240_tot_filt', 'pu240_filt', 'ra223_filt', 'ra224_filt',\n", + " 'ra226_filt', 'ra228_filt', 'th228_filt', 'th230_filt', 'th232_filt',\n", + " 'th234_filt', 'u236_filt', 'ac227_sampmet', 'be7_sampmet',\n", + " 'cs137_sampmet', 'h3_sampmet', 'i129_sampmet', 'np237_sampmet',\n", + " 'pa231_sampmet', 'pb210_sampmet', 'po210_sampmet', 'pu239_sampmet',\n", + " 'pu239_240_tot_sampmet', 'pu240_sampmet', 'ra223_sampmet',\n", + " 'ra224_sampmet', 'ra226_sampmet', 'ra228_sampmet', 'th228_sampmet',\n", + " 'th230_sampmet', 'th232_sampmet', 'th234_sampmet', 'u236_sampmet',\n", + " 'ac227_unit', 'be7_unit', 'cs137_unit', 'h3_unit', 'i129_unit',\n", + " 'np237_unit', 'pa231_unit', 'pb210_unit', 'po210_unit', 'pu239_unit',\n", + " 'pu239_240_tot_unit', 'pu240_unit', 'ra223_unit', 'ra224_unit',\n", + " 'ra226_unit', 'ra228_unit', 'th228_unit', 'th230_unit', 'th232_unit',\n", + " 'th234_unit', 'u236_unit', 'ac227', 'be7', 'cs137', 'h3', 'i129',\n", + " 'np237', 'pa231', 'pb210', 'po210', 'pu239', 'pu239_240_tot', 'pu240',\n", + " 'ra223', 'ra224', 'ra226', 'ra228', 'th228', 'th230', 'th232', 'th234',\n", + " 'u236'],\n", + " dtype='object')\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
yyyy-mm-ddThh:mm:ss.sssLongitude [degrees_east]Latitude [degrees_north]
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timelattot_depthsmp_depthlonac227_filtbe7_filtcs137_filth3_filti129_filt...pu240ra223ra224ra226ra228th228th230th232th234u236
org_index
02007-07-30T10:37:1975.0005181.07.0-145.9999NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN0.028751NaN
12007-07-30T10:37:1975.0005181.025.7-145.9999NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN0.034914NaN
22007-07-30T10:37:1975.0005181.049.5-145.9999NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN0.037409NaN
32007-07-30T10:37:1975.0005181.074.6-145.9999NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN0.033527NaN
42007-07-30T10:37:1975.0005181.0100.2-145.9999NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN0.034267NaN
\n", + "

5 rows × 89 columns

\n", + "
" + ], + "text/plain": [ + " time lat tot_depth smp_depth lon \\\n", + "org_index \n", + "0 2007-07-30T10:37:19 75.0005 181.0 7.0 -145.9999 \n", + "1 2007-07-30T10:37:19 75.0005 181.0 25.7 -145.9999 \n", + "2 2007-07-30T10:37:19 75.0005 181.0 49.5 -145.9999 \n", + "3 2007-07-30T10:37:19 75.0005 181.0 74.6 -145.9999 \n", + "4 2007-07-30T10:37:19 75.0005 181.0 100.2 -145.9999 \n", + "\n", + " ac227_filt be7_filt cs137_filt h3_filt i129_filt ... pu240 \\\n", + "org_index ... \n", + "0 NaN NaN NaN NaN NaN ... NaN \n", + "1 NaN NaN NaN NaN NaN ... NaN \n", + "2 NaN NaN NaN NaN NaN ... NaN \n", + "3 NaN NaN NaN NaN NaN ... NaN \n", + "4 NaN NaN NaN NaN NaN ... NaN \n", + "\n", + " ra223 ra224 ra226 ra228 th228 th230 th232 th234 u236 \n", + "org_index \n", + "0 NaN NaN NaN NaN NaN NaN NaN 0.028751 NaN \n", + "1 NaN NaN NaN NaN NaN NaN NaN 0.034914 NaN \n", + "2 NaN NaN NaN NaN NaN NaN NaN 0.037409 NaN \n", + "3 NaN NaN NaN NaN NaN NaN NaN 0.033527 NaN \n", + "4 NaN NaN NaN NaN NaN NaN NaN 0.034267 NaN \n", + "\n", + "[5 rows x 89 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "#|eval: false\n", - "df = pd.read_csv(fname_in)\n", - "\n", "tfm = Transformer(df, cbs=[\n", " SelectColsOfInterestCB(common_coi, nuclides_pattern),\n", " WideToLongCB(common_coi, nuclides_pattern),\n", @@ -2309,42 +7416,26 @@ " StandardizeUnitCB(units_lut),\n", " RenameColumnCB(renaming_rules),\n", " UnshiftLongitudeCB(),\n", - " DispatchToGroupCB()\n", + " DispatchToGroupCB(),\n", + " ReshapeLongToWide()\n", "])\n", "\n", "dfs_test = tfm()\n", - "print(f'dfs_test keys: {dfs_test.keys()}')\n", - "print(dfs_test['seawater'].head())" + "print('shape: ', dfs_test['seawater'].shape)\n", + "print('columns: ', dfs_test['seawater'] .columns)\n", + "dfs_test['seawater'].head()" ] }, { "cell_type": "code", "execution_count": null, - "id": "7d095bf1", + "id": "242efadc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - 
"Index(['lon', 'time', 'tot_depth', 'smp_depth', 'lat', 'ac227_filt',\n", - " 'be7_filt', 'cs137_filt', 'h3_filt', 'i129_filt', 'np237_filt',\n", - " 'pa231_filt', 'pb210_filt', 'po210_filt', 'pu239_filt',\n", - " 'pu239_240_tot_filt', 'pu240_filt', 'ra223_filt', 'ra224_filt',\n", - " 'ra226_filt', 'ra228_filt', 'th228_filt', 'th230_filt', 'th232_filt',\n", - " 'th234_filt', 'u236_filt', 'ac227_sampmet', 'be7_sampmet',\n", - " 'cs137_sampmet', 'h3_sampmet', 'i129_sampmet', 'np237_sampmet',\n", - " 'pa231_sampmet', 'pb210_sampmet', 'po210_sampmet', 'pu239_sampmet',\n", - " 'pu239_240_tot_sampmet', 'pu240_sampmet', 'ra223_sampmet',\n", - " 'ra224_sampmet', 'ra226_sampmet', 'ra228_sampmet', 'th228_sampmet',\n", - " 'th230_sampmet', 'th232_sampmet', 'th234_sampmet', 'u236_sampmet',\n", - " 'ac227_unit', 'be7_unit', 'cs137_unit', 'h3_unit', 'i129_unit',\n", - " 'np237_unit', 'pa231_unit', 'pb210_unit', 'po210_unit', 'pu239_unit',\n", - " 'pu239_240_tot_unit', 'pu240_unit', 'ra223_unit', 'ra224_unit',\n", - " 'ra226_unit', 'ra228_unit', 'th228_unit', 'th230_unit', 'th234_unit',\n", - " 'u236_unit', 'ac227', 'be7', 'cs137', 'h3', 'i129', 'np237', 'pa231',\n", - " 'pb210', 'po210', 'pu239', 'pu239_240_tot', 'pu240', 'ra223', 'ra224',\n", - " 'ra226', 'ra228', 'th228', 'th230', 'th234', 'u236'],\n", - " dtype='object')" + "0" ] }, "execution_count": null, @@ -2353,40 +7444,29 @@ } ], "source": [ - "dfs_test['seawater'].columns" + "dfs_test['seawater'][['lon', 'time', 'lat', 'smp_depth', 'tot_depth']].duplicated().sum()" ] }, { "cell_type": "code", "execution_count": null, - "id": "579daae3", + "id": "413d49ad", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(19139, 87)" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "dfs_test['seawater'].shape" + "result =dfs_test['seawater'].drop(['lon', 'time', 'lat', 'smp_depth', 'tot_depth'], axis=1)" ] }, { "cell_type": "code", 
"execution_count": null, - "id": "8a2e6b3d", + "id": "f1a5099d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "10733" + "(8406, 89)" ] }, "execution_count": null, @@ -2395,91 +7475,15 @@ } ], "source": [ - "dfs_test['seawater'][['lon', 'time', 'tot_depth', 'smp_depth', 'lat']].duplicated().sum()\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "3167c7a3-d383-49d6-9bd6-40ad9eb0543d", - "metadata": {}, - "source": [ - "### Rehape: long to wide" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2c222a4-24b8-4b5a-a6fc-37aeb66bc502", - "metadata": {}, - "outputs": [], - "source": [ - "#| export\n", - "# class ReshapeLongToWide(Callback):\n", - "# \"Convert data from long to wide with renamed columns.\"\n", - "# def __init__(self, columns='nuclide', values=['value']):\n", - "# fc.store_attr()\n", - "# # Retrieve all possible derived vars (e.g 'unc', 'dl', ...) from configs\n", - "# self.derived_cols = [value['name'] for value in cdl_cfg()['vars']['suffixes'].values()]\n", - " \n", - "# def renamed_cols(self, cols):\n", - "# \"Flatten columns name\"\n", - "# return [inner if outer == \"value\" else f'{inner}{outer}'\n", - "# if inner else outer\n", - "# for outer, inner in cols]\n", - "\n", - "# def pivot(self, df):\n", - "# # Among all possible 'derived cols' select the ones present in df\n", - "# derived_coi = [col for col in self.derived_cols if col in df.columns]\n", - " \n", - "# df.reset_index(names='sample', inplace=True)\n", - " \n", - "# idx = list(set(df.columns) - set([self.columns] + derived_coi + self.values))\n", - "# return df.pivot_table(index=idx,\n", - "# columns=self.columns,\n", - "# values=self.values + derived_coi,\n", - "# fill_value=np.nan,\n", - "# aggfunc=lambda x: x\n", - "# ).reset_index()\n", - "\n", - "# def __call__(self, tfm):\n", - "# for k in tfm.dfs.keys():\n", - "# tfm.dfs[k] = self.pivot(tfm.dfs[k])\n", - "# tfm.dfs[k].columns = self.renamed_cols(tfm.dfs[k].columns)" + 
"dfs_test['seawater'].shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "088aaa9d-1daf-4765-986c-4d8207caeeef", + "id": "308d4270", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (19139, 87)\n", - "columns: Index(['lon', 'time', 'tot_depth', 'smp_depth', 'lat', 'ac227_filt',\n", - " 'be7_filt', 'cs137_filt', 'h3_filt', 'i129_filt', 'np237_filt',\n", - " 'pa231_filt', 'pb210_filt', 'po210_filt', 'pu239_filt',\n", - " 'pu239_240_tot_filt', 'pu240_filt', 'ra223_filt', 'ra224_filt',\n", - " 'ra226_filt', 'ra228_filt', 'th228_filt', 'th230_filt', 'th232_filt',\n", - " 'th234_filt', 'u236_filt', 'ac227_sampmet', 'be7_sampmet',\n", - " 'cs137_sampmet', 'h3_sampmet', 'i129_sampmet', 'np237_sampmet',\n", - " 'pa231_sampmet', 'pb210_sampmet', 'po210_sampmet', 'pu239_sampmet',\n", - " 'pu239_240_tot_sampmet', 'pu240_sampmet', 'ra223_sampmet',\n", - " 'ra224_sampmet', 'ra226_sampmet', 'ra228_sampmet', 'th228_sampmet',\n", - " 'th230_sampmet', 'th232_sampmet', 'th234_sampmet', 'u236_sampmet',\n", - " 'ac227_unit', 'be7_unit', 'cs137_unit', 'h3_unit', 'i129_unit',\n", - " 'np237_unit', 'pa231_unit', 'pb210_unit', 'po210_unit', 'pu239_unit',\n", - " 'pu239_240_tot_unit', 'pu240_unit', 'ra223_unit', 'ra224_unit',\n", - " 'ra226_unit', 'ra228_unit', 'th228_unit', 'th230_unit', 'th234_unit',\n", - " 'u236_unit', 'ac227', 'be7', 'cs137', 'h3', 'i129', 'np237', 'pa231',\n", - " 'pb210', 'po210', 'pu239', 'pu239_240_tot', 'pu240', 'ra223', 'ra224',\n", - " 'ra226', 'ra228', 'th228', 'th230', 'th234', 'u236'],\n", - " dtype='object')\n" - ] - }, { "data": { "text/html": [ @@ -2501,18 +7505,17 @@ " \n", " \n", " \n", - " lon\n", - " time\n", - " tot_depth\n", - " smp_depth\n", - " lat\n", " ac227_filt\n", " be7_filt\n", " cs137_filt\n", " h3_filt\n", " i129_filt\n", + " np237_filt\n", + " pa231_filt\n", + " pb210_filt\n", + " po210_filt\n", + " pu239_filt\n", " ...\n", - " pu239_240_tot\n", " pu240\n", " ra223\n", " 
ra224\n", @@ -2520,49 +7523,2968 @@ " ra228\n", " th228\n", " th230\n", + " th232\n", " th234\n", " u236\n", " \n", " \n", - " org_index\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " org_index\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 21\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000091\n", + " 0.00112\n", + " 0.000327\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.030734\n", + " NaN\n", + " \n", + " \n", + " 31\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000044\n", + " 0.000888\n", + " 0.000311\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.025989\n", + " NaN\n", + " \n", + " \n", + " 64\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000208\n", + " 0.001169\n", + " 0.000262\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.037707\n", + " NaN\n", + " \n", + " \n", + " 91\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.00028\n", + " 0.001167\n", + " 0.00028\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.038946\n", + " NaN\n", + " \n", + " \n", + " 99\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000166\n", + " 0.001224\n", + " 0.000259\n", + " 
NaN\n", + " NaN\n", + " NaN\n", + " 0.033485\n", + " NaN\n", + " \n", + " \n", + " 106\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000132\n", + " 0.001208\n", + " 0.000312\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.029319\n", + " NaN\n", + " \n", + " \n", + " 116\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000107\n", + " 0.001141\n", + " 0.000262\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.027518\n", + " NaN\n", + " \n", + " \n", + " 123\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.00013\n", + " 0.001148\n", + " 0.000254\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.030561\n", + " NaN\n", + " \n", + " \n", + " 135\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000159\n", + " 0.001218\n", + " 0.000402\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.032424\n", + " NaN\n", + " \n", + " \n", + " 149\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000202\n", + " 0.001083\n", + " 0.000763\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.033362\n", + " NaN\n", + " \n", + " \n", + " 164\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000171\n", + " 0.001096\n", + " 0.000675\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.03475\n", + " NaN\n", + " \n", + " \n", + " 181\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + 
" 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000356\n", + " 0.001098\n", + " 0.000411\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.031671\n", + " NaN\n", + " \n", + " \n", + " 200\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000416\n", + " 0.001101\n", + " 0.000585\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.033616\n", + " NaN\n", + " \n", + " \n", + " 232\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000735\n", + " 0.001301\n", + " 0.001382\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.031999\n", + " NaN\n", + " \n", + " \n", + " 251\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000914\n", + " 0.001424\n", + " 0.001457\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.03428\n", + " NaN\n", + " \n", + " \n", + " 259\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000956\n", + " 0.001498\n", + " 0.001259\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.033478\n", + " NaN\n", + " \n", + " \n", + " 274\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000753\n", + " 0.001491\n", + " 0.000927\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.032871\n", + " NaN\n", + " \n", + " \n", + " 294\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.001002\n", + " 0.00154\n", + " 0.002\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.030905\n", + " NaN\n", + " \n", + " \n", 
+ " 301\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000776\n", + " 0.001385\n", + " 0.001361\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.032596\n", + " NaN\n", + " \n", + " \n", + " 308\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000517\n", + " 0.00115\n", + " 0.000867\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.030861\n", + " NaN\n", + " \n", + " \n", + " 316\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000618\n", + " 0.00105\n", + " 0.000496\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.039277\n", + " NaN\n", + " \n", + " \n", + " 341\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000293\n", + " 0.001099\n", + " 0.000506\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.031319\n", + " NaN\n", + " \n", + " \n", + " 357\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000319\n", + " 0.001054\n", + " 0.00081\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.032725\n", + " NaN\n", + " \n", + " \n", + " 380\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000272\n", + " 0.000787\n", + " 0.000914\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.025919\n", + " NaN\n", + " \n", + " \n", + " 388\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000158\n", + " 
0.000541\n", + " 0.000554\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.022308\n", + " NaN\n", + " \n", + " \n", + " 1158\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " ...\n", + " 0.000008\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 1586\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.0\n", + " 0.000122\n", + " 0.001512\n", + " 0.000197\n", + " 0.000093\n", + " 0.000003\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 1942\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.0\n", + " 0.000062\n", + " 0.001473\n", + " 0.000088\n", + " 0.000042\n", + " 0.000002\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 2050\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.0\n", + " 0.000085\n", + " 0.001403\n", + " 0.000106\n", + " 0.000073\n", + " 0.000002\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 2520\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000003\n", + " 0.000023\n", + " 0.001693\n", + " 0.000098\n", + " 0.000031\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 2526\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 1\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.000004\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 2570\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 
NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000005\n", + " 0.000067\n", + " 0.00159\n", + " 0.001132\n", + " 0.000127\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 2928\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000002\n", + " 0.000163\n", + " 0.00133\n", + " 0.000481\n", + " 0.000154\n", + " 0.000001\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 3085\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000008\n", + " 0.000076\n", + " 0.001403\n", + " 0.000359\n", + " 0.000182\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 3309\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000003\n", + " 0.00005\n", + " 0.001454\n", + " 0.000421\n", + " 0.00012\n", + " 0.000001\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 3375\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000026\n", + " 0.001293\n", + " 0.000382\n", + " 0.000088\n", + " 0.000001\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 3716\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.001785\n", + " 0.000008\n", + " NaN\n", + " 0.000003\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 4050\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.0\n", + " NaN\n", + " NaN\n", + 
" 6168131.0\n", + " \n", + " \n", + " 4254\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.00101\n", + " 0.000049\n", + " NaN\n", + " 0.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 4382\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.0\n", + " NaN\n", + " NaN\n", + " 7304443.0\n", + " \n", + " \n", + " 4394\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.001033\n", + " 0.000035\n", + " NaN\n", + " 0.000001\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 4748\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.0\n", + " NaN\n", + " NaN\n", + " 5495804.5\n", + " \n", + " \n", + " 5152\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.0\n", + " NaN\n", + " NaN\n", + " 5656699.5\n", + " \n", + " \n", + " 5167\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " 1\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 5728\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000003\n", + " 0.000036\n", + " 0.000347\n", + " 0.00028\n", + " 
0.000061\n", + " 0.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 5782\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.0\n", + " 0.000124\n", + " 0.001204\n", + " 0.000991\n", + " 0.00005\n", + " 0.000001\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 5853\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.0\n", + " 0.000321\n", + " 0.001742\n", + " 0.000884\n", + " 0.000358\n", + " NaN\n", + " NaN\n", + " 0.0347\n", + " NaN\n", + " \n", + " \n", + " 5886\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000001\n", + " 0.000143\n", + " 0.001335\n", + " 0.000126\n", + " 0.000179\n", + " NaN\n", + " NaN\n", + " 0.0444\n", + " NaN\n", + " \n", + " \n", + " 5889\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000007\n", + " 0.000072\n", + " 0.001772\n", + " 0.000039\n", + " 0.000079\n", + " NaN\n", + " NaN\n", + " 0.041\n", + " NaN\n", + " \n", + " \n", + " 5895\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.00001\n", + " 0.000103\n", + " 0.002146\n", + " 0.00019\n", + " 0.000089\n", + " NaN\n", + " NaN\n", + " 0.043\n", + " NaN\n", + " \n", + " \n", + " 5960\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000002\n", + " 0.000164\n", + " 0.001283\n", + " 0.000175\n", + " 0.000199\n", + " NaN\n", + " NaN\n", + " 0.0459\n", + " NaN\n", + " \n", + " \n", + " 5963\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 
NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000198\n", + " 0.001422\n", + " 0.000227\n", + " 0.000219\n", + " NaN\n", + " NaN\n", + " 0.0428\n", + " NaN\n", + " \n", + " \n", + " 5966\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000002\n", + " 0.000184\n", + " 0.001377\n", + " 0.000174\n", + " 0.00021\n", + " NaN\n", + " NaN\n", + " 0.042\n", + " NaN\n", + " \n", + " \n", + " 5969\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000002\n", + " 0.000208\n", + " 0.001348\n", + " 0.000227\n", + " 0.000248\n", + " NaN\n", + " NaN\n", + " 0.0394\n", + " NaN\n", + " \n", + " \n", + " 5974\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000005\n", + " 0.000093\n", + " 0.001494\n", + " 0.000081\n", + " 0.000093\n", + " NaN\n", + " NaN\n", + " 0.0413\n", + " NaN\n", + " \n", + " \n", + " 5978\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000008\n", + " 0.00006\n", + " 0.001682\n", + " 0.000068\n", + " 0.000066\n", + " NaN\n", + " NaN\n", + " 0.041\n", + " NaN\n", + " \n", + " \n", + " 6048\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000001\n", + " 0.000156\n", + " 0.001363\n", + " 0.000135\n", + " 0.000227\n", + " NaN\n", + " NaN\n", + " 0.0407\n", + " NaN\n", + " \n", + " \n", + " 6051\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000003\n", + " 0.000099\n", + " 
0.001313\n", + " 0.000077\n", + " 0.0001\n", + " NaN\n", + " NaN\n", + " 0.0427\n", + " NaN\n", + " \n", + " \n", + " 6054\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000007\n", + " 0.000035\n", + " 0.001751\n", + " 0.0\n", + " 0.000047\n", + " NaN\n", + " NaN\n", + " 0.0389\n", + " NaN\n", + " \n", + " \n", + " 6057\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000008\n", + " 0.000037\n", + " 0.001895\n", + " 0.000081\n", + " 0.000038\n", + " NaN\n", + " NaN\n", + " 0.041\n", + " NaN\n", + " \n", + " \n", + " 6059\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000008\n", + " 0.000057\n", + " 0.002024\n", + " 0.000062\n", + " 0.000047\n", + " NaN\n", + " NaN\n", + " 0.0386\n", + " NaN\n", + " \n", + " \n", + " 6065\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.00001\n", + " 0.000043\n", + " 0.002252\n", + " 0.000048\n", + " 0.000048\n", + " NaN\n", + " NaN\n", + " 0.0414\n", + " NaN\n", + " \n", + " \n", + " 6127\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000003\n", + " 0.000182\n", + " 0.001275\n", + " 0.000148\n", + " 0.000188\n", + " NaN\n", + " NaN\n", + " 0.039\n", + " NaN\n", + " \n", + " \n", + " 6132\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000001\n", + " 0.000139\n", + " 0.001186\n", + " 0.000107\n", + " 0.000159\n", + " NaN\n", + " NaN\n", + " 0.0391\n", + " NaN\n", + " \n", + " \n", + " 6135\n", + " NaN\n", + 
" NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000002\n", + " 0.000143\n", + " 0.001344\n", + " 0.000113\n", + " 0.00015\n", + " NaN\n", + " NaN\n", + " 0.0402\n", + " NaN\n", + " \n", + " \n", + " 6139\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000003\n", + " 0.000166\n", + " 0.001335\n", + " 0.0\n", + " 0.000167\n", + " NaN\n", + " NaN\n", + " 0.0411\n", + " NaN\n", + " \n", + " \n", + " 6142\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000136\n", + " 0.001468\n", + " 0.0001\n", + " 0.000118\n", + " NaN\n", + " NaN\n", + " 0.041\n", + " NaN\n", + " \n", + " \n", + " 6145\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000006\n", + " 0.00006\n", + " 0.001393\n", + " 0.0\n", + " 0.000057\n", + " NaN\n", + " NaN\n", + " 0.0386\n", + " NaN\n", + " \n", + " \n", + " 6148\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000008\n", + " 0.000025\n", + " 0.00151\n", + " 0.000025\n", + " 0.000013\n", + " NaN\n", + " NaN\n", + " 0.0358\n", + " NaN\n", + " \n", + " \n", + " 6152\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000009\n", + " 0.000017\n", + " 0.001719\n", + " 0.0\n", + " 0.000009\n", + " NaN\n", + " NaN\n", + " 0.0367\n", + " NaN\n", + " \n", + " \n", + " 6155\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000007\n", + " 
0.000015\n", + " 0.001608\n", + " 0.0\n", + " 0.00001\n", + " NaN\n", + " NaN\n", + " 0.0422\n", + " NaN\n", + " \n", + " \n", + " 6156\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000012\n", + " 0.000016\n", + " 0.002023\n", + " 0.000059\n", + " 0.000013\n", + " NaN\n", + " NaN\n", + " 0.0372\n", + " NaN\n", + " \n", + " \n", + " 6159\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000007\n", + " 0.000026\n", + " 0.002378\n", + " 0.0\n", + " 0.000033\n", + " NaN\n", + " NaN\n", + " 0.0417\n", + " NaN\n", + " \n", + " \n", + " 6162\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000011\n", + " 0.00007\n", + " 0.002719\n", + " 0.000079\n", + " 0.000049\n", + " NaN\n", + " NaN\n", + " 0.0356\n", + " NaN\n", + " \n", + " \n", + " 6165\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000012\n", + " 0.000054\n", + " 0.002514\n", + " 0.000085\n", + " 0.000067\n", + " NaN\n", + " NaN\n", + " 0.0405\n", + " NaN\n", + " \n", + " \n", + " 6337\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000003\n", + " 0.000157\n", + " 0.001343\n", + " 0.000131\n", + " 0.000129\n", + " NaN\n", + " NaN\n", + " 0.0389\n", + " NaN\n", + " \n", + " \n", + " 6339\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000107\n", + " 0.001288\n", + " 0.000124\n", + " 0.000123\n", + " NaN\n", + " NaN\n", + " 0.04\n", + " NaN\n", + " \n", + " \n", + " 6341\n", + 
" NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000061\n", + " 0.001318\n", + " 0.0\n", + " 0.000077\n", + " NaN\n", + " NaN\n", + " 0.0393\n", + " NaN\n", + " \n", + " \n", + " 6344\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000011\n", + " 0.000023\n", + " 0.001486\n", + " 0.0\n", + " 0.000019\n", + " NaN\n", + " NaN\n", + " 0.0345\n", + " NaN\n", + " \n", + " \n", + " 6347\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.00001\n", + " 0.000026\n", + " 0.001975\n", + " 0.0\n", + " 0.000022\n", + " NaN\n", + " NaN\n", + " 0.0394\n", + " NaN\n", + " \n", + " \n", + " 6352\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000014\n", + " 0.000045\n", + " 0.002144\n", + " 0.000036\n", + " 0.000048\n", + " NaN\n", + " NaN\n", + " 0.0385\n", + " NaN\n", + " \n", + " \n", + " 6354\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000012\n", + " 0.000053\n", + " 0.002277\n", + " 0.000084\n", + " 0.000057\n", + " NaN\n", + " NaN\n", + " 0.0364\n", + " NaN\n", + " \n", + " \n", + " 6365\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000019\n", + " 0.001758\n", + " 0.000031\n", + " 0.00001\n", + " 0.000012\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 6404\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 
0.000689\n", + " 0.00134\n", + " 0.000891\n", + " 0.000484\n", + " 0.0\n", + " NaN\n", + " NaN\n", + " 20850986.0\n", + " \n", + " \n", + " 6421\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000077\n", + " 0.002422\n", + " 0.000086\n", + " 0.000044\n", + " 0.000022\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 6445\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000002\n", + " 0.001076\n", + " 0.001784\n", + " 0.003084\n", + " 0.000761\n", + " 0.000002\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 6451\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.0\n", + " 0.000537\n", + " 0.001745\n", + " 0.002753\n", + " 0.000606\n", + " 0.000002\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 6490\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.0\n", + " 0.000114\n", + " 0.001393\n", + " 0.000052\n", + " 0.000079\n", + " NaN\n", + " NaN\n", + " 0.035\n", + " NaN\n", + " \n", + " \n", + " 6494\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000002\n", + " 0.000104\n", + " 0.001377\n", + " 0.000067\n", + " 0.000066\n", + " NaN\n", + " NaN\n", + " 0.0376\n", + " NaN\n", + " \n", + " \n", + " 6496\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000002\n", + " 0.00007\n", + " 0.001401\n", + " 0.000078\n", + " 0.000057\n", + " 0.000009\n", + " NaN\n", + " 0.0415\n", + " NaN\n", + " \n", + " \n", + " 
6500\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000033\n", + " 0.001397\n", + " 0.000051\n", + " 0.000018\n", + " NaN\n", + " NaN\n", + " 0.0409\n", + " NaN\n", + " \n", + " \n", + " 6503\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000063\n", + " 0.001733\n", + " 0.000058\n", + " 0.000047\n", + " NaN\n", + " NaN\n", + " 0.0409\n", + " NaN\n", + " \n", + " \n", + " 6561\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.0\n", + " 0.000432\n", + " 0.001528\n", + " 0.00078\n", + " 0.000193\n", + " NaN\n", + " NaN\n", + " 0.0361\n", + " NaN\n", + " \n", + " \n", + " 6591\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.0\n", + " 0.000116\n", + " 0.001357\n", + " 0.000087\n", + " 0.000139\n", + " NaN\n", + " NaN\n", + " 0.0403\n", + " NaN\n", + " \n", + " \n", + " 6597\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000001\n", + " 0.000056\n", + " 0.001344\n", + " 0.0\n", + " 0.000036\n", + " NaN\n", + " NaN\n", + " 0.0379\n", + " NaN\n", + " \n", + " \n", + " 6600\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000018\n", + " 0.001391\n", + " 0.000036\n", + " 0.000008\n", + " NaN\n", + " NaN\n", + " 0.0398\n", + " NaN\n", + " \n", + " \n", + " 6603\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " 
NaN\n", + " 0.000006\n", + " 0.000024\n", + " 0.001787\n", + " 0.000034\n", + " 0.000013\n", + " NaN\n", + " NaN\n", + " 0.041\n", + " NaN\n", + " \n", + " \n", + " 6605\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000006\n", + " 0.000024\n", + " 0.001825\n", + " 0.000037\n", + " 0.000014\n", + " NaN\n", + " NaN\n", + " 0.0397\n", + " NaN\n", + " \n", + " \n", + " 6607\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000007\n", + " 0.000064\n", + " 0.002153\n", + " 0.000073\n", + " 0.000043\n", + " NaN\n", + " NaN\n", + " 0.0358\n", + " NaN\n", + " \n", + " \n", + " 6609\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000015\n", + " 0.000087\n", + " 0.002212\n", + " 0.000131\n", + " 0.000059\n", + " NaN\n", + " NaN\n", + " 0.0407\n", + " NaN\n", + " \n", + " \n", + " 6638\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " 0.000092\n", + " 0.001672\n", + " 0.00011\n", + " 0.000077\n", + " 0.000006\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 6647\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " 1.0\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000365\n", + " 0.001607\n", + " 0.000965\n", + " 0.000344\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 6679\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.0\n", + " 0.000301\n", + " 0.001302\n", + " 0.00023\n", + " 0.000247\n", + " NaN\n", + " NaN\n", + " 0.0395\n", + " NaN\n", + " 
\n", + " \n", + " 6682\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000002\n", + " 0.000216\n", + " 0.001409\n", + " 0.000114\n", + " 0.00018\n", + " NaN\n", + " NaN\n", + " 0.0388\n", + " NaN\n", + " \n", + " \n", + " 6685\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000002\n", + " 0.000198\n", + " 0.001358\n", + " 0.000082\n", + " 0.000126\n", + " NaN\n", + " NaN\n", + " 0.0388\n", + " NaN\n", + " \n", + " \n", + " 6687\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000162\n", + " 0.001364\n", + " 0.000093\n", + " 0.000122\n", + " NaN\n", + " NaN\n", + " 0.0359\n", + " NaN\n", + " \n", + " \n", + " 6690\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000003\n", + " 0.000136\n", + " 0.001391\n", + " 0.000075\n", + " 0.000092\n", + " NaN\n", + " NaN\n", + " 0.0381\n", + " NaN\n", + " \n", + " \n", + " 6693\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000001\n", + " 0.00008\n", + " 0.00145\n", + " 0.000039\n", + " 0.000044\n", + " NaN\n", + " NaN\n", + " 0.0359\n", + " NaN\n", + " \n", + " \n", + " 6696\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000036\n", + " 0.001476\n", + " 0.0\n", + " 0.000022\n", + " NaN\n", + " NaN\n", + " 0.0391\n", + " NaN\n", + " \n", + " \n", + " 6697\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 
NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000035\n", + " 0.001459\n", + " 0.0\n", + " 0.000016\n", + " NaN\n", + " NaN\n", + " 0.0383\n", + " NaN\n", + " \n", + " \n", + " 6700\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000006\n", + " 0.000021\n", + " 0.001471\n", + " 0.0\n", + " 0.000012\n", + " NaN\n", + " NaN\n", + " 0.0401\n", + " NaN\n", + " \n", + " \n", + " 6702\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000009\n", + " 0.000021\n", + " 0.001737\n", + " 0.0\n", + " 0.000009\n", + " NaN\n", + " NaN\n", + " 0.0378\n", + " NaN\n", + " \n", + " \n", + " 6705\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000013\n", + " 0.000021\n", + " 0.001933\n", + " 0.0\n", + " 0.000008\n", + " NaN\n", + " NaN\n", + " 0.0349\n", + " NaN\n", + " \n", + " \n", + " 6708\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000015\n", + " 0.000054\n", + " 0.001905\n", + " 0.0\n", + " 0.000054\n", + " NaN\n", + " NaN\n", + " 0.0409\n", + " NaN\n", + " \n", + " \n", + " 6714\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000019\n", + " 0.000056\n", + " 0.001907\n", + " 0.000049\n", + " 0.000048\n", + " NaN\n", + " NaN\n", + " 0.0396\n", + " NaN\n", + " \n", + " \n", + " 6716\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000013\n", + " 0.000049\n", + " 0.002008\n", + " 0.000053\n", + " 0.000046\n", + " NaN\n", + " NaN\n", + " 0.0395\n", + 
" NaN\n", + " \n", + " \n", + " 6718\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000009\n", + " 0.000048\n", + " 0.00208\n", + " 0.000065\n", + " 0.000046\n", + " NaN\n", + " NaN\n", + " 0.0388\n", + " NaN\n", + " \n", + " \n", + " 6785\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.0\n", + " 0.00044\n", + " 0.001823\n", + " 0.000926\n", + " 0.000475\n", + " NaN\n", + " NaN\n", + " 0.0379\n", + " NaN\n", + " \n", + " \n", + " 6803\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000001\n", + " 0.000068\n", + " 0.001321\n", + " 0.00005\n", + " 0.00006\n", + " NaN\n", + " NaN\n", + " 0.0422\n", + " NaN\n", + " \n", + " \n", + " 6806\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000004\n", + " 0.000035\n", + " 0.001451\n", + " 0.000046\n", + " 0.000028\n", + " NaN\n", + " NaN\n", + " 0.0386\n", + " NaN\n", + " \n", + " \n", + " 6809\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000005\n", + " 0.000019\n", + " 0.001532\n", + " 0.0\n", + " 0.000014\n", + " NaN\n", + " NaN\n", + " 0.0404\n", + " NaN\n", " \n", - " \n", - " \n", " \n", - " 9223\n", - " 170.33792\n", - " 2010-10-17T00:13:29\n", - " 2827.0\n", - " 17.8\n", - " 38.3271\n", + " 6813\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 1.0\n", " NaN\n", " ...\n", " NaN\n", + " 0.000008\n", + " 0.000026\n", + " 0.002118\n", + " 0.000039\n", + " 0.000018\n", + " NaN\n", + " NaN\n", + " 0.0366\n", + " NaN\n", + " \n", + " \n", + 
" 6815\n", + " NaN\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", @@ -2571,22 +10493,46 @@ " NaN\n", " NaN\n", " NaN\n", + " ...\n", + " NaN\n", + " 0.00001\n", + " 0.000053\n", + " 0.002066\n", + " 0.000051\n", + " 0.000038\n", + " NaN\n", + " NaN\n", + " 0.0402\n", " NaN\n", " \n", " \n", - " 9231\n", - " 170.33792\n", - " 2010-10-17T00:13:29\n", - " 2827.0\n", - " 34.7\n", - " 38.3271\n", + " 6817\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 1.0\n", " NaN\n", " ...\n", " NaN\n", + " 0.000011\n", + " 0.000051\n", + " 0.002092\n", + " 0.000072\n", + " 0.000039\n", + " NaN\n", + " NaN\n", + " 0.0406\n", + " NaN\n", + " \n", + " \n", + " 6819\n", + " NaN\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", @@ -2595,22 +10541,45 @@ " NaN\n", " NaN\n", " NaN\n", + " ...\n", + " NaN\n", + " 0.000005\n", + " 0.000054\n", + " 0.001933\n", + " 0.000065\n", + " 0.000032\n", + " NaN\n", + " NaN\n", + " 0.038\n", " NaN\n", " \n", " \n", - " 9237\n", - " 170.33792\n", - " 2010-10-17T00:13:29\n", - " 2827.0\n", - " 67.5\n", - " 38.3271\n", + " 6929\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 1.0\n", " NaN\n", " ...\n", " NaN\n", + " 0.000002\n", + " 0.00006\n", + " 0.001478\n", + " 0.000027\n", + " 0.000055\n", + " NaN\n", + " NaN\n", + " 0.0404\n", + " NaN\n", + " \n", + " \n", + " 6932\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", @@ -2620,21 +10589,45 @@ " NaN\n", " NaN\n", " NaN\n", + " ...\n", + " NaN\n", + " 0.000006\n", + " 0.000042\n", + " 0.001748\n", + " 0.000033\n", + " 0.000032\n", + " NaN\n", + " NaN\n", + " 0.0377\n", + " NaN\n", " \n", " \n", - " 9244\n", - " 170.33792\n", - " 2010-10-17T00:13:29\n", - " 2827.0\n", - " 91.9\n", - " 38.3271\n", + " 6934\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 1.0\n", " NaN\n", " ...\n", " NaN\n", + " 0.000012\n", + " 0.000034\n", + " 
0.002046\n", + " 0.0\n", + " 0.000014\n", + " NaN\n", + " NaN\n", + " 0.0419\n", + " NaN\n", + " \n", + " \n", + " 6936\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", @@ -2644,21 +10637,118 @@ " NaN\n", " NaN\n", " NaN\n", + " ...\n", + " NaN\n", + " 0.000008\n", + " 0.000054\n", + " 0.002178\n", + " 0.000078\n", + " 0.00003\n", + " NaN\n", + " NaN\n", + " 0.039\n", + " NaN\n", " \n", " \n", - " 9256\n", - " 170.33792\n", - " 2010-10-17T00:13:29\n", - " 2827.0\n", - " 136.6\n", - " 38.3271\n", + " 6938\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000011\n", + " 0.000055\n", + " 0.001974\n", + " 0.000036\n", + " 0.000037\n", + " NaN\n", + " NaN\n", + " 0.0393\n", + " NaN\n", + " \n", + " \n", + " 6940\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.000006\n", + " 0.000058\n", + " 0.00234\n", + " 0.000055\n", + " 0.000038\n", + " NaN\n", + " NaN\n", + " 0.0374\n", + " NaN\n", + " \n", + " \n", + " 6942\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " 0.00001\n", + " 0.000053\n", + " 0.002209\n", + " 0.000075\n", + " 0.000029\n", + " NaN\n", + " NaN\n", + " 0.0367\n", + " NaN\n", + " \n", + " \n", + " 6959\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 1.0\n", " NaN\n", " ...\n", " NaN\n", + " 0.00001\n", + " 0.000238\n", + " 0.002039\n", + " 0.00179\n", + " 0.000376\n", + " NaN\n", + " NaN\n", + " 0.034\n", + " NaN\n", + " \n", + " \n", + " 6982\n", + " NaN\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", @@ -2667,47 +10757,573 @@ " NaN\n", " NaN\n", " NaN\n", + " ...\n", + " NaN\n", + " 0.000007\n", + " 0.000107\n", + " 0.001355\n", + " 0.000786\n", + " 0.000061\n", + " NaN\n", + " NaN\n", + " 
0.0237\n", " NaN\n", " \n", " \n", "\n", - "

5 rows × 87 columns

\n", + "

134 rows × 84 columns

\n", "" ], "text/plain": [ - " lon time tot_depth smp_depth lat \\\n", - "org_index \n", - "9223 170.33792 2010-10-17T00:13:29 2827.0 17.8 38.3271 \n", - "9231 170.33792 2010-10-17T00:13:29 2827.0 34.7 38.3271 \n", - "9237 170.33792 2010-10-17T00:13:29 2827.0 67.5 38.3271 \n", - "9244 170.33792 2010-10-17T00:13:29 2827.0 91.9 38.3271 \n", - "9256 170.33792 2010-10-17T00:13:29 2827.0 136.6 38.3271 \n", - "\n", - " ac227_filt be7_filt cs137_filt h3_filt i129_filt ... \\\n", - "org_index ... \n", - "9223 NaN NaN NaN 1.0 NaN ... \n", - "9231 NaN NaN NaN 1.0 NaN ... \n", - "9237 NaN NaN NaN 1.0 NaN ... \n", - "9244 NaN NaN NaN 1.0 NaN ... \n", - "9256 NaN NaN NaN 1.0 NaN ... \n", - "\n", - " pu239_240_tot pu240 ra223 ra224 ra226 ra228 th228 th230 \\\n", - "org_index \n", - "9223 NaN NaN NaN NaN NaN NaN NaN NaN \n", - "9231 NaN NaN NaN NaN NaN NaN NaN NaN \n", - "9237 NaN NaN NaN NaN NaN NaN NaN NaN \n", - "9244 NaN NaN NaN NaN NaN NaN NaN NaN \n", - "9256 NaN NaN NaN NaN NaN NaN NaN NaN \n", - "\n", - " th234 u236 \n", - "org_index \n", - "9223 NaN NaN \n", - "9231 NaN NaN \n", - "9237 NaN NaN \n", - "9244 NaN NaN \n", - "9256 NaN NaN \n", - "\n", - "[5 rows x 87 columns]" + " ac227_filt be7_filt cs137_filt h3_filt i129_filt np237_filt \\\n", + "org_index \n", + "21 NaN NaN NaN NaN NaN NaN \n", + "31 NaN NaN NaN NaN NaN NaN \n", + "64 NaN NaN NaN NaN NaN NaN \n", + "91 NaN NaN NaN NaN NaN NaN \n", + "99 NaN NaN NaN NaN NaN NaN \n", + "106 NaN NaN NaN NaN NaN NaN \n", + "116 NaN NaN NaN NaN NaN NaN \n", + "123 NaN NaN 1.0 NaN NaN NaN \n", + "135 NaN NaN 1.0 NaN NaN NaN \n", + "149 NaN NaN 1.0 NaN NaN NaN \n", + "164 NaN NaN NaN NaN NaN NaN \n", + "181 NaN NaN NaN NaN NaN NaN \n", + "200 NaN NaN NaN NaN NaN NaN \n", + "232 NaN NaN 1.0 NaN NaN NaN \n", + "251 NaN NaN NaN NaN NaN NaN \n", + "259 NaN NaN NaN NaN NaN NaN \n", + "274 NaN NaN 1.0 NaN NaN NaN \n", + "294 NaN NaN NaN NaN NaN NaN \n", + "301 NaN NaN NaN NaN NaN NaN \n", + "308 NaN NaN NaN NaN NaN NaN \n", + "316 
NaN NaN NaN NaN NaN NaN \n", + "341 NaN NaN NaN NaN NaN NaN \n", + "357 NaN NaN 1.0 NaN NaN NaN \n", + "380 NaN NaN NaN NaN NaN NaN \n", + "388 NaN NaN NaN NaN NaN NaN \n", + "1158 NaN NaN 1.0 NaN NaN 1.0 \n", + "1586 NaN NaN NaN NaN NaN NaN \n", + "1942 NaN NaN NaN 1.0 NaN NaN \n", + "2050 NaN NaN NaN 1.0 NaN NaN \n", + "2520 NaN NaN NaN NaN NaN NaN \n", + "2526 NaN NaN NaN 1.0 NaN NaN \n", + "2570 NaN NaN NaN 1.0 NaN NaN \n", + "2928 NaN NaN NaN 1.0 NaN NaN \n", + "3085 NaN NaN NaN NaN NaN NaN \n", + "3309 NaN NaN NaN 1.0 NaN NaN \n", + "3375 NaN NaN NaN 1.0 NaN NaN \n", + "3716 NaN NaN NaN 1.0 NaN NaN \n", + "4050 NaN NaN NaN NaN NaN NaN \n", + "4254 NaN NaN NaN 1.0 NaN NaN \n", + "4382 NaN NaN NaN NaN NaN NaN \n", + "4394 NaN NaN NaN 1.0 NaN NaN \n", + "4748 NaN NaN NaN NaN NaN NaN \n", + "5152 NaN NaN NaN NaN NaN NaN \n", + "5167 NaN NaN NaN 1.0 NaN NaN \n", + "5728 NaN NaN NaN NaN NaN NaN \n", + "5782 NaN NaN NaN NaN NaN NaN \n", + "5853 NaN NaN NaN NaN NaN NaN \n", + "5886 NaN NaN NaN NaN NaN NaN \n", + "5889 NaN NaN NaN NaN NaN NaN \n", + "5895 NaN NaN NaN NaN NaN NaN \n", + "5960 NaN NaN NaN NaN NaN NaN \n", + "5963 NaN NaN NaN NaN NaN NaN \n", + "5966 NaN NaN NaN NaN NaN NaN \n", + "5969 NaN NaN NaN NaN NaN NaN \n", + "5974 NaN NaN NaN NaN NaN NaN \n", + "5978 NaN NaN NaN NaN NaN NaN \n", + "6048 NaN NaN NaN NaN NaN NaN \n", + "6051 NaN NaN NaN NaN NaN NaN \n", + "6054 NaN NaN NaN NaN NaN NaN \n", + "6057 NaN NaN NaN NaN NaN NaN \n", + "6059 NaN NaN NaN NaN NaN NaN \n", + "6065 NaN NaN NaN NaN NaN NaN \n", + "6127 NaN NaN NaN NaN NaN NaN \n", + "6132 NaN NaN NaN NaN NaN NaN \n", + "6135 NaN NaN NaN NaN NaN NaN \n", + "6139 NaN NaN NaN NaN NaN NaN \n", + "6142 NaN NaN NaN NaN NaN NaN \n", + "6145 NaN NaN NaN NaN NaN NaN \n", + "6148 NaN NaN NaN NaN NaN NaN \n", + "6152 NaN NaN NaN NaN NaN NaN \n", + "6155 NaN NaN NaN NaN NaN NaN \n", + "6156 NaN NaN NaN NaN NaN NaN \n", + "6159 NaN NaN NaN NaN NaN NaN \n", + "6162 NaN NaN NaN NaN NaN NaN \n", + "6165 NaN 
NaN NaN NaN NaN NaN \n", + "6337 NaN NaN NaN NaN NaN NaN \n", + "6339 NaN NaN NaN NaN NaN NaN \n", + "6341 NaN NaN NaN NaN NaN NaN \n", + "6344 NaN NaN NaN NaN NaN NaN \n", + "6347 NaN NaN NaN NaN NaN NaN \n", + "6352 NaN NaN NaN NaN NaN NaN \n", + "6354 NaN NaN NaN NaN NaN NaN \n", + "6365 NaN NaN NaN NaN NaN NaN \n", + "6404 NaN NaN NaN NaN 1 NaN \n", + "6421 NaN NaN NaN NaN NaN NaN \n", + "6445 NaN NaN NaN NaN NaN NaN \n", + "6451 NaN NaN NaN NaN NaN NaN \n", + "6490 NaN NaN NaN NaN NaN NaN \n", + "6494 NaN NaN NaN NaN NaN NaN \n", + "6496 NaN NaN NaN NaN NaN NaN \n", + "6500 NaN NaN NaN NaN NaN NaN \n", + "6503 NaN NaN NaN NaN NaN NaN \n", + "6561 NaN NaN NaN NaN NaN NaN \n", + "6591 NaN NaN NaN NaN NaN NaN \n", + "6597 NaN NaN NaN NaN NaN NaN \n", + "6600 NaN NaN NaN NaN NaN NaN \n", + "6603 NaN NaN NaN NaN NaN NaN \n", + "6605 NaN NaN NaN NaN NaN NaN \n", + "6607 NaN NaN NaN NaN NaN NaN \n", + "6609 NaN NaN NaN NaN NaN NaN \n", + "6638 NaN NaN NaN NaN NaN NaN \n", + "6647 NaN NaN NaN NaN NaN NaN \n", + "6679 NaN NaN NaN NaN NaN NaN \n", + "6682 NaN NaN NaN NaN NaN NaN \n", + "6685 NaN NaN NaN NaN NaN NaN \n", + "6687 NaN NaN NaN NaN NaN NaN \n", + "6690 NaN NaN NaN NaN NaN NaN \n", + "6693 NaN NaN NaN NaN NaN NaN \n", + "6696 NaN NaN NaN NaN NaN NaN \n", + "6697 NaN NaN NaN NaN NaN NaN \n", + "6700 NaN NaN NaN NaN NaN NaN \n", + "6702 NaN NaN NaN NaN NaN NaN \n", + "6705 NaN NaN NaN NaN NaN NaN \n", + "6708 NaN NaN NaN NaN NaN NaN \n", + "6714 NaN NaN NaN NaN NaN NaN \n", + "6716 NaN NaN NaN NaN NaN NaN \n", + "6718 NaN NaN NaN NaN NaN NaN \n", + "6785 NaN NaN NaN NaN NaN NaN \n", + "6803 NaN NaN NaN NaN NaN NaN \n", + "6806 NaN NaN NaN NaN NaN NaN \n", + "6809 NaN NaN NaN NaN NaN NaN \n", + "6813 NaN NaN NaN NaN NaN NaN \n", + "6815 NaN NaN NaN NaN NaN NaN \n", + "6817 NaN NaN NaN NaN NaN NaN \n", + "6819 NaN NaN NaN NaN NaN NaN \n", + "6929 NaN NaN NaN NaN NaN NaN \n", + "6932 NaN NaN NaN NaN NaN NaN \n", + "6934 NaN NaN NaN NaN NaN NaN \n", + "6936 NaN NaN 
NaN NaN NaN NaN \n", + "6938 NaN NaN NaN NaN NaN NaN \n", + "6940 NaN NaN NaN NaN NaN NaN \n", + "6942 NaN NaN NaN NaN NaN NaN \n", + "6959 NaN NaN NaN NaN NaN NaN \n", + "6982 NaN NaN NaN NaN NaN NaN \n", + "\n", + " pa231_filt pb210_filt po210_filt pu239_filt ... pu240 \\\n", + "org_index ... \n", + "21 NaN 1.0 1.0 NaN ... NaN \n", + "31 NaN 1.0 1.0 NaN ... NaN \n", + "64 NaN 1.0 1.0 NaN ... NaN \n", + "91 NaN 1.0 1.0 NaN ... NaN \n", + "99 NaN 1.0 1.0 NaN ... NaN \n", + "106 NaN 1.0 1.0 NaN ... NaN \n", + "116 NaN 1.0 1.0 NaN ... NaN \n", + "123 NaN 1.0 1.0 NaN ... NaN \n", + "135 NaN 1.0 1.0 NaN ... NaN \n", + "149 NaN 1.0 1.0 NaN ... NaN \n", + "164 NaN 1.0 1.0 NaN ... NaN \n", + "181 NaN 1.0 1.0 NaN ... NaN \n", + "200 NaN 1.0 1.0 NaN ... NaN \n", + "232 NaN 1.0 1.0 NaN ... NaN \n", + "251 NaN 1.0 1.0 NaN ... NaN \n", + "259 NaN 1.0 1.0 NaN ... NaN \n", + "274 NaN 1.0 1.0 NaN ... NaN \n", + "294 NaN 1.0 1.0 NaN ... NaN \n", + "301 NaN 1.0 1.0 NaN ... NaN \n", + "308 NaN 1.0 1.0 NaN ... NaN \n", + "316 NaN 1.0 1.0 NaN ... NaN \n", + "341 NaN 1.0 1.0 NaN ... NaN \n", + "357 NaN 1.0 1.0 NaN ... NaN \n", + "380 NaN 1.0 1.0 NaN ... NaN \n", + "388 NaN 1.0 1.0 NaN ... NaN \n", + "1158 1 NaN NaN 1.0 ... 0.000008 \n", + "1586 1 1.0 NaN NaN ... NaN \n", + "1942 1 NaN NaN NaN ... NaN \n", + "2050 1 NaN NaN NaN ... NaN \n", + "2520 NaN 1.0 1.0 NaN ... NaN \n", + "2526 1 1.0 1.0 NaN ... NaN \n", + "2570 NaN NaN NaN NaN ... NaN \n", + "2928 1 NaN NaN NaN ... NaN \n", + "3085 NaN 1.0 1.0 NaN ... NaN \n", + "3309 1 NaN NaN NaN ... NaN \n", + "3375 1 NaN NaN NaN ... NaN \n", + "3716 1 NaN NaN NaN ... NaN \n", + "4050 1 1.0 1.0 NaN ... NaN \n", + "4254 1 NaN NaN NaN ... NaN \n", + "4382 1 1.0 1.0 NaN ... NaN \n", + "4394 1 NaN NaN NaN ... NaN \n", + "4748 1 1.0 1.0 NaN ... NaN \n", + "5152 1 1.0 1.0 NaN ... NaN \n", + "5167 1 1.0 1.0 NaN ... NaN \n", + "5728 1 1.0 1.0 NaN ... NaN \n", + "5782 1 NaN NaN NaN ... NaN \n", + "5853 NaN NaN NaN NaN ... 
NaN \n", + "5886 NaN NaN NaN NaN ... NaN \n", + "5889 NaN NaN NaN NaN ... NaN \n", + "5895 NaN NaN NaN NaN ... NaN \n", + "5960 NaN NaN NaN NaN ... NaN \n", + "5963 NaN NaN NaN NaN ... NaN \n", + "5966 NaN NaN NaN NaN ... NaN \n", + "5969 NaN NaN NaN NaN ... NaN \n", + "5974 NaN NaN NaN NaN ... NaN \n", + "5978 NaN NaN NaN NaN ... NaN \n", + "6048 NaN NaN NaN NaN ... NaN \n", + "6051 NaN NaN NaN NaN ... NaN \n", + "6054 NaN NaN NaN NaN ... NaN \n", + "6057 NaN NaN NaN NaN ... NaN \n", + "6059 NaN NaN NaN NaN ... NaN \n", + "6065 NaN NaN NaN NaN ... NaN \n", + "6127 NaN NaN NaN NaN ... NaN \n", + "6132 NaN NaN NaN NaN ... NaN \n", + "6135 NaN NaN NaN NaN ... NaN \n", + "6139 NaN NaN NaN NaN ... NaN \n", + "6142 NaN NaN NaN NaN ... NaN \n", + "6145 NaN NaN NaN NaN ... NaN \n", + "6148 NaN NaN NaN NaN ... NaN \n", + "6152 NaN NaN NaN NaN ... NaN \n", + "6155 NaN NaN NaN NaN ... NaN \n", + "6156 NaN NaN NaN NaN ... NaN \n", + "6159 NaN NaN NaN NaN ... NaN \n", + "6162 NaN NaN NaN NaN ... NaN \n", + "6165 NaN NaN NaN NaN ... NaN \n", + "6337 NaN NaN NaN NaN ... NaN \n", + "6339 NaN NaN NaN NaN ... NaN \n", + "6341 NaN NaN NaN NaN ... NaN \n", + "6344 NaN NaN NaN NaN ... NaN \n", + "6347 NaN NaN NaN NaN ... NaN \n", + "6352 NaN NaN NaN NaN ... NaN \n", + "6354 NaN NaN NaN NaN ... NaN \n", + "6365 1 NaN NaN NaN ... NaN \n", + "6404 1 NaN NaN NaN ... NaN \n", + "6421 1 NaN NaN NaN ... NaN \n", + "6445 1 NaN NaN NaN ... NaN \n", + "6451 NaN NaN NaN NaN ... NaN \n", + "6490 NaN NaN NaN NaN ... NaN \n", + "6494 NaN NaN NaN NaN ... NaN \n", + "6496 1 NaN NaN NaN ... NaN \n", + "6500 NaN NaN NaN NaN ... NaN \n", + "6503 NaN 1.0 1.0 NaN ... NaN \n", + "6561 NaN NaN NaN NaN ... NaN \n", + "6591 NaN NaN NaN NaN ... NaN \n", + "6597 NaN NaN NaN NaN ... NaN \n", + "6600 NaN NaN NaN NaN ... NaN \n", + "6603 NaN NaN NaN NaN ... NaN \n", + "6605 NaN NaN NaN NaN ... NaN \n", + "6607 NaN NaN NaN NaN ... NaN \n", + "6609 NaN NaN NaN NaN ... NaN \n", + "6638 1 NaN NaN NaN ... 
NaN \n", + "6647 NaN 1.0 1.0 NaN ... NaN \n", + "6679 NaN NaN NaN NaN ... NaN \n", + "6682 NaN NaN NaN NaN ... NaN \n", + "6685 NaN NaN NaN NaN ... NaN \n", + "6687 NaN NaN NaN NaN ... NaN \n", + "6690 NaN NaN NaN NaN ... NaN \n", + "6693 NaN NaN NaN NaN ... NaN \n", + "6696 NaN NaN NaN NaN ... NaN \n", + "6697 NaN NaN NaN NaN ... NaN \n", + "6700 NaN NaN NaN NaN ... NaN \n", + "6702 NaN NaN NaN NaN ... NaN \n", + "6705 NaN NaN NaN NaN ... NaN \n", + "6708 NaN NaN NaN NaN ... NaN \n", + "6714 NaN NaN NaN NaN ... NaN \n", + "6716 NaN NaN NaN NaN ... NaN \n", + "6718 NaN NaN NaN NaN ... NaN \n", + "6785 NaN NaN NaN NaN ... NaN \n", + "6803 NaN NaN NaN NaN ... NaN \n", + "6806 NaN NaN NaN NaN ... NaN \n", + "6809 NaN NaN NaN NaN ... NaN \n", + "6813 NaN NaN NaN NaN ... NaN \n", + "6815 NaN NaN NaN NaN ... NaN \n", + "6817 NaN NaN NaN NaN ... NaN \n", + "6819 NaN NaN NaN NaN ... NaN \n", + "6929 NaN NaN NaN NaN ... NaN \n", + "6932 NaN NaN NaN NaN ... NaN \n", + "6934 NaN NaN NaN NaN ... NaN \n", + "6936 NaN NaN NaN NaN ... NaN \n", + "6938 NaN NaN NaN NaN ... NaN \n", + "6940 NaN NaN NaN NaN ... NaN \n", + "6942 NaN NaN NaN NaN ... NaN \n", + "6959 NaN NaN NaN NaN ... NaN \n", + "6982 NaN NaN NaN NaN ... 
NaN \n", + "\n", + " ra223 ra224 ra226 ra228 th228 th230 th232 \\\n", + "org_index \n", + "21 NaN 0.000091 0.00112 0.000327 NaN NaN NaN \n", + "31 NaN 0.000044 0.000888 0.000311 NaN NaN NaN \n", + "64 NaN 0.000208 0.001169 0.000262 NaN NaN NaN \n", + "91 NaN 0.00028 0.001167 0.00028 NaN NaN NaN \n", + "99 NaN 0.000166 0.001224 0.000259 NaN NaN NaN \n", + "106 NaN 0.000132 0.001208 0.000312 NaN NaN NaN \n", + "116 NaN 0.000107 0.001141 0.000262 NaN NaN NaN \n", + "123 NaN 0.00013 0.001148 0.000254 NaN NaN NaN \n", + "135 NaN 0.000159 0.001218 0.000402 NaN NaN NaN \n", + "149 NaN 0.000202 0.001083 0.000763 NaN NaN NaN \n", + "164 NaN 0.000171 0.001096 0.000675 NaN NaN NaN \n", + "181 NaN 0.000356 0.001098 0.000411 NaN NaN NaN \n", + "200 NaN 0.000416 0.001101 0.000585 NaN NaN NaN \n", + "232 NaN 0.000735 0.001301 0.001382 NaN NaN NaN \n", + "251 NaN 0.000914 0.001424 0.001457 NaN NaN NaN \n", + "259 NaN 0.000956 0.001498 0.001259 NaN NaN NaN \n", + "274 NaN 0.000753 0.001491 0.000927 NaN NaN NaN \n", + "294 NaN 0.001002 0.00154 0.002 NaN NaN NaN \n", + "301 NaN 0.000776 0.001385 0.001361 NaN NaN NaN \n", + "308 NaN 0.000517 0.00115 0.000867 NaN NaN NaN \n", + "316 NaN 0.000618 0.00105 0.000496 NaN NaN NaN \n", + "341 NaN 0.000293 0.001099 0.000506 NaN NaN NaN \n", + "357 NaN 0.000319 0.001054 0.00081 NaN NaN NaN \n", + "380 NaN 0.000272 0.000787 0.000914 NaN NaN NaN \n", + "388 NaN 0.000158 0.000541 0.000554 NaN NaN NaN \n", + "1158 NaN NaN NaN NaN NaN NaN NaN \n", + "1586 0.0 0.000122 0.001512 0.000197 0.000093 0.000003 NaN \n", + "1942 0.0 0.000062 0.001473 0.000088 0.000042 0.000002 NaN \n", + "2050 0.0 0.000085 0.001403 0.000106 0.000073 0.000002 NaN \n", + "2520 0.000003 0.000023 0.001693 0.000098 0.000031 NaN NaN \n", + "2526 NaN NaN NaN NaN NaN 0.000004 NaN \n", + "2570 0.000005 0.000067 0.00159 0.001132 0.000127 NaN NaN \n", + "2928 0.000002 0.000163 0.00133 0.000481 0.000154 0.000001 NaN \n", + "3085 0.000008 0.000076 0.001403 0.000359 0.000182 NaN NaN \n", 
+ "3309 0.000003 0.00005 0.001454 0.000421 0.00012 0.000001 NaN \n", + "3375 NaN 0.000026 0.001293 0.000382 0.000088 0.000001 NaN \n", + "3716 NaN NaN 0.001785 0.000008 NaN 0.000003 NaN \n", + "4050 NaN NaN NaN NaN NaN 0.0 NaN \n", + "4254 NaN NaN 0.00101 0.000049 NaN 0.0 NaN \n", + "4382 NaN NaN NaN NaN NaN 0.0 NaN \n", + "4394 NaN NaN 0.001033 0.000035 NaN 0.000001 NaN \n", + "4748 NaN NaN NaN NaN NaN 0.0 NaN \n", + "5152 NaN NaN NaN NaN NaN 0.0 NaN \n", + "5167 NaN NaN NaN NaN NaN 0.0 NaN \n", + "5728 0.000003 0.000036 0.000347 0.00028 0.000061 0.0 NaN \n", + "5782 0.0 0.000124 0.001204 0.000991 0.00005 0.000001 NaN \n", + "5853 0.0 0.000321 0.001742 0.000884 0.000358 NaN NaN \n", + "5886 0.000001 0.000143 0.001335 0.000126 0.000179 NaN NaN \n", + "5889 0.000007 0.000072 0.001772 0.000039 0.000079 NaN NaN \n", + "5895 0.00001 0.000103 0.002146 0.00019 0.000089 NaN NaN \n", + "5960 0.000002 0.000164 0.001283 0.000175 0.000199 NaN NaN \n", + "5963 0.000004 0.000198 0.001422 0.000227 0.000219 NaN NaN \n", + "5966 0.000002 0.000184 0.001377 0.000174 0.00021 NaN NaN \n", + "5969 0.000002 0.000208 0.001348 0.000227 0.000248 NaN NaN \n", + "5974 0.000005 0.000093 0.001494 0.000081 0.000093 NaN NaN \n", + "5978 0.000008 0.00006 0.001682 0.000068 0.000066 NaN NaN \n", + "6048 0.000001 0.000156 0.001363 0.000135 0.000227 NaN NaN \n", + "6051 0.000003 0.000099 0.001313 0.000077 0.0001 NaN NaN \n", + "6054 0.000007 0.000035 0.001751 0.0 0.000047 NaN NaN \n", + "6057 0.000008 0.000037 0.001895 0.000081 0.000038 NaN NaN \n", + "6059 0.000008 0.000057 0.002024 0.000062 0.000047 NaN NaN \n", + "6065 0.00001 0.000043 0.002252 0.000048 0.000048 NaN NaN \n", + "6127 0.000003 0.000182 0.001275 0.000148 0.000188 NaN NaN \n", + "6132 0.000001 0.000139 0.001186 0.000107 0.000159 NaN NaN \n", + "6135 0.000002 0.000143 0.001344 0.000113 0.00015 NaN NaN \n", + "6139 0.000003 0.000166 0.001335 0.0 0.000167 NaN NaN \n", + "6142 0.000004 0.000136 0.001468 0.0001 0.000118 NaN NaN \n", + 
"6145 0.000006 0.00006 0.001393 0.0 0.000057 NaN NaN \n", + "6148 0.000008 0.000025 0.00151 0.000025 0.000013 NaN NaN \n", + "6152 0.000009 0.000017 0.001719 0.0 0.000009 NaN NaN \n", + "6155 0.000007 0.000015 0.001608 0.0 0.00001 NaN NaN \n", + "6156 0.000012 0.000016 0.002023 0.000059 0.000013 NaN NaN \n", + "6159 0.000007 0.000026 0.002378 0.0 0.000033 NaN NaN \n", + "6162 0.000011 0.00007 0.002719 0.000079 0.000049 NaN NaN \n", + "6165 0.000012 0.000054 0.002514 0.000085 0.000067 NaN NaN \n", + "6337 0.000003 0.000157 0.001343 0.000131 0.000129 NaN NaN \n", + "6339 0.000004 0.000107 0.001288 0.000124 0.000123 NaN NaN \n", + "6341 0.000004 0.000061 0.001318 0.0 0.000077 NaN NaN \n", + "6344 0.000011 0.000023 0.001486 0.0 0.000019 NaN NaN \n", + "6347 0.00001 0.000026 0.001975 0.0 0.000022 NaN NaN \n", + "6352 0.000014 0.000045 0.002144 0.000036 0.000048 NaN NaN \n", + "6354 0.000012 0.000053 0.002277 0.000084 0.000057 NaN NaN \n", + "6365 NaN 0.000019 0.001758 0.000031 0.00001 0.000012 NaN \n", + "6404 NaN 0.000689 0.00134 0.000891 0.000484 0.0 NaN \n", + "6421 NaN 0.000077 0.002422 0.000086 0.000044 0.000022 NaN \n", + "6445 0.000002 0.001076 0.001784 0.003084 0.000761 0.000002 NaN \n", + "6451 0.0 0.000537 0.001745 0.002753 0.000606 0.000002 NaN \n", + "6490 0.0 0.000114 0.001393 0.000052 0.000079 NaN NaN \n", + "6494 0.000002 0.000104 0.001377 0.000067 0.000066 NaN NaN \n", + "6496 0.000002 0.00007 0.001401 0.000078 0.000057 0.000009 NaN \n", + "6500 0.000004 0.000033 0.001397 0.000051 0.000018 NaN NaN \n", + "6503 0.000004 0.000063 0.001733 0.000058 0.000047 NaN NaN \n", + "6561 0.0 0.000432 0.001528 0.00078 0.000193 NaN NaN \n", + "6591 0.0 0.000116 0.001357 0.000087 0.000139 NaN NaN \n", + "6597 0.000001 0.000056 0.001344 0.0 0.000036 NaN NaN \n", + "6600 0.000004 0.000018 0.001391 0.000036 0.000008 NaN NaN \n", + "6603 0.000006 0.000024 0.001787 0.000034 0.000013 NaN NaN \n", + "6605 0.000006 0.000024 0.001825 0.000037 0.000014 NaN NaN \n", + "6607 
0.000007 0.000064 0.002153 0.000073 0.000043 NaN NaN \n", + "6609 0.000015 0.000087 0.002212 0.000131 0.000059 NaN NaN \n", + "6638 NaN 0.000092 0.001672 0.00011 0.000077 0.000006 NaN \n", + "6647 0.000004 0.000365 0.001607 0.000965 0.000344 NaN NaN \n", + "6679 0.0 0.000301 0.001302 0.00023 0.000247 NaN NaN \n", + "6682 0.000002 0.000216 0.001409 0.000114 0.00018 NaN NaN \n", + "6685 0.000002 0.000198 0.001358 0.000082 0.000126 NaN NaN \n", + "6687 0.000004 0.000162 0.001364 0.000093 0.000122 NaN NaN \n", + "6690 0.000003 0.000136 0.001391 0.000075 0.000092 NaN NaN \n", + "6693 0.000001 0.00008 0.00145 0.000039 0.000044 NaN NaN \n", + "6696 0.000004 0.000036 0.001476 0.0 0.000022 NaN NaN \n", + "6697 0.000004 0.000035 0.001459 0.0 0.000016 NaN NaN \n", + "6700 0.000006 0.000021 0.001471 0.0 0.000012 NaN NaN \n", + "6702 0.000009 0.000021 0.001737 0.0 0.000009 NaN NaN \n", + "6705 0.000013 0.000021 0.001933 0.0 0.000008 NaN NaN \n", + "6708 0.000015 0.000054 0.001905 0.0 0.000054 NaN NaN \n", + "6714 0.000019 0.000056 0.001907 0.000049 0.000048 NaN NaN \n", + "6716 0.000013 0.000049 0.002008 0.000053 0.000046 NaN NaN \n", + "6718 0.000009 0.000048 0.00208 0.000065 0.000046 NaN NaN \n", + "6785 0.0 0.00044 0.001823 0.000926 0.000475 NaN NaN \n", + "6803 0.000001 0.000068 0.001321 0.00005 0.00006 NaN NaN \n", + "6806 0.000004 0.000035 0.001451 0.000046 0.000028 NaN NaN \n", + "6809 0.000005 0.000019 0.001532 0.0 0.000014 NaN NaN \n", + "6813 0.000008 0.000026 0.002118 0.000039 0.000018 NaN NaN \n", + "6815 0.00001 0.000053 0.002066 0.000051 0.000038 NaN NaN \n", + "6817 0.000011 0.000051 0.002092 0.000072 0.000039 NaN NaN \n", + "6819 0.000005 0.000054 0.001933 0.000065 0.000032 NaN NaN \n", + "6929 0.000002 0.00006 0.001478 0.000027 0.000055 NaN NaN \n", + "6932 0.000006 0.000042 0.001748 0.000033 0.000032 NaN NaN \n", + "6934 0.000012 0.000034 0.002046 0.0 0.000014 NaN NaN \n", + "6936 0.000008 0.000054 0.002178 0.000078 0.00003 NaN NaN \n", + "6938 0.000011 
0.000055 0.001974 0.000036 0.000037 NaN NaN \n", + "6940 0.000006 0.000058 0.00234 0.000055 0.000038 NaN NaN \n", + "6942 0.00001 0.000053 0.002209 0.000075 0.000029 NaN NaN \n", + "6959 0.00001 0.000238 0.002039 0.00179 0.000376 NaN NaN \n", + "6982 0.000007 0.000107 0.001355 0.000786 0.000061 NaN NaN \n", + "\n", + " th234 u236 \n", + "org_index \n", + "21 0.030734 NaN \n", + "31 0.025989 NaN \n", + "64 0.037707 NaN \n", + "91 0.038946 NaN \n", + "99 0.033485 NaN \n", + "106 0.029319 NaN \n", + "116 0.027518 NaN \n", + "123 0.030561 NaN \n", + "135 0.032424 NaN \n", + "149 0.033362 NaN \n", + "164 0.03475 NaN \n", + "181 0.031671 NaN \n", + "200 0.033616 NaN \n", + "232 0.031999 NaN \n", + "251 0.03428 NaN \n", + "259 0.033478 NaN \n", + "274 0.032871 NaN \n", + "294 0.030905 NaN \n", + "301 0.032596 NaN \n", + "308 0.030861 NaN \n", + "316 0.039277 NaN \n", + "341 0.031319 NaN \n", + "357 0.032725 NaN \n", + "380 0.025919 NaN \n", + "388 0.022308 NaN \n", + "1158 NaN NaN \n", + "1586 NaN NaN \n", + "1942 NaN NaN \n", + "2050 NaN NaN \n", + "2520 NaN NaN \n", + "2526 NaN NaN \n", + "2570 NaN NaN \n", + "2928 NaN NaN \n", + "3085 NaN NaN \n", + "3309 NaN NaN \n", + "3375 NaN NaN \n", + "3716 NaN NaN \n", + "4050 NaN 6168131.0 \n", + "4254 NaN NaN \n", + "4382 NaN 7304443.0 \n", + "4394 NaN NaN \n", + "4748 NaN 5495804.5 \n", + "5152 NaN 5656699.5 \n", + "5167 NaN NaN \n", + "5728 NaN NaN \n", + "5782 NaN NaN \n", + "5853 0.0347 NaN \n", + "5886 0.0444 NaN \n", + "5889 0.041 NaN \n", + "5895 0.043 NaN \n", + "5960 0.0459 NaN \n", + "5963 0.0428 NaN \n", + "5966 0.042 NaN \n", + "5969 0.0394 NaN \n", + "5974 0.0413 NaN \n", + "5978 0.041 NaN \n", + "6048 0.0407 NaN \n", + "6051 0.0427 NaN \n", + "6054 0.0389 NaN \n", + "6057 0.041 NaN \n", + "6059 0.0386 NaN \n", + "6065 0.0414 NaN \n", + "6127 0.039 NaN \n", + "6132 0.0391 NaN \n", + "6135 0.0402 NaN \n", + "6139 0.0411 NaN \n", + "6142 0.041 NaN \n", + "6145 0.0386 NaN \n", + "6148 0.0358 NaN \n", + "6152 0.0367 
NaN \n", + "6155 0.0422 NaN \n", + "6156 0.0372 NaN \n", + "6159 0.0417 NaN \n", + "6162 0.0356 NaN \n", + "6165 0.0405 NaN \n", + "6337 0.0389 NaN \n", + "6339 0.04 NaN \n", + "6341 0.0393 NaN \n", + "6344 0.0345 NaN \n", + "6347 0.0394 NaN \n", + "6352 0.0385 NaN \n", + "6354 0.0364 NaN \n", + "6365 NaN NaN \n", + "6404 NaN 20850986.0 \n", + "6421 NaN NaN \n", + "6445 NaN NaN \n", + "6451 NaN NaN \n", + "6490 0.035 NaN \n", + "6494 0.0376 NaN \n", + "6496 0.0415 NaN \n", + "6500 0.0409 NaN \n", + "6503 0.0409 NaN \n", + "6561 0.0361 NaN \n", + "6591 0.0403 NaN \n", + "6597 0.0379 NaN \n", + "6600 0.0398 NaN \n", + "6603 0.041 NaN \n", + "6605 0.0397 NaN \n", + "6607 0.0358 NaN \n", + "6609 0.0407 NaN \n", + "6638 NaN NaN \n", + "6647 NaN NaN \n", + "6679 0.0395 NaN \n", + "6682 0.0388 NaN \n", + "6685 0.0388 NaN \n", + "6687 0.0359 NaN \n", + "6690 0.0381 NaN \n", + "6693 0.0359 NaN \n", + "6696 0.0391 NaN \n", + "6697 0.0383 NaN \n", + "6700 0.0401 NaN \n", + "6702 0.0378 NaN \n", + "6705 0.0349 NaN \n", + "6708 0.0409 NaN \n", + "6714 0.0396 NaN \n", + "6716 0.0395 NaN \n", + "6718 0.0388 NaN \n", + "6785 0.0379 NaN \n", + "6803 0.0422 NaN \n", + "6806 0.0386 NaN \n", + "6809 0.0404 NaN \n", + "6813 0.0366 NaN \n", + "6815 0.0402 NaN \n", + "6817 0.0406 NaN \n", + "6819 0.038 NaN \n", + "6929 0.0404 NaN \n", + "6932 0.0377 NaN \n", + "6934 0.0419 NaN \n", + "6936 0.039 NaN \n", + "6938 0.0393 NaN \n", + "6940 0.0374 NaN \n", + "6942 0.0367 NaN \n", + "6959 0.034 NaN \n", + "6982 0.0237 NaN \n", + "\n", + "[134 rows x 84 columns]" ] }, "execution_count": null, @@ -2716,73 +11332,38 @@ } ], "source": [ - "#|eval: false\n", - "tfm = Transformer(df, cbs=[\n", - " SelectColsOfInterestCB(common_coi, nuclides_pattern),\n", - " WideToLongCB(common_coi, nuclides_pattern),\n", - " ExtractUnitCB(),\n", - " ExtractFilteringStatusCB(phase),\n", - " ExtractSamplingMethodCB(smp_method),\n", - " RenameNuclideCB(nuclides_name),\n", - " StandardizeUnitCB(units_lut),\n", - " 
RenameColumnCB(renaming_rules),\n", - " UnshiftLongitudeCB(),\n", - " DispatchToGroupCB(),\n", - " ReshapeLongToWide()\n", - "])\n", - "\n", - "dfs_test = tfm()\n", - "print('shape: ', dfs_test['seawater'].shape)\n", - "print('columns: ', dfs_test['seawater'] .columns)\n", - "dfs_test['seawater'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "413d49ad", - "metadata": {}, - "outputs": [], - "source": [ - "result = dfs_test['seawater'].drop(['lon', 'time', 'lat', 'smp_depth', 'tot_depth'], axis=1)" + "result[(~(result.isna())).sum(axis=1) > 20]" ] }, { "cell_type": "code", "execution_count": null, - "id": "c4823aa7", + "id": "1f5c5407", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Index(['ac227_filt', 'be7_filt', 'cs137_filt', 'h3_filt', 'i129_filt',\n", - " 'np237_filt', 'pa231_filt', 'pb210_filt', 'po210_filt', 'pu239_filt',\n", - " 'pu239_240_tot_filt', 'pu240_filt', 'ra223_filt', 'ra224_filt',\n", - " 'ra226_filt', 'ra228_filt', 'th228_filt', 'th230_filt', 'th232_filt',\n", - " 'th234_filt', 'u236_filt', 'ac227_sampmet', 'be7_sampmet',\n", - " 'cs137_sampmet', 'h3_sampmet', 'i129_sampmet', 'np237_sampmet',\n", - " 'pa231_sampmet', 'pb210_sampmet', 'po210_sampmet', 'pu239_sampmet',\n", - " 'pu239_240_tot_sampmet', 'pu240_sampmet', 'ra223_sampmet',\n", - " 'ra224_sampmet', 'ra226_sampmet', 'ra228_sampmet', 'th228_sampmet',\n", - " 'th230_sampmet', 'th232_sampmet', 'th234_sampmet', 'u236_sampmet',\n", - " 'ac227_unit', 'be7_unit', 'cs137_unit', 'h3_unit', 'i129_unit',\n", - " 'np237_unit', 'pa231_unit', 'pb210_unit', 'po210_unit', 'pu239_unit',\n", - " 'pu239_240_tot_unit', 'pu240_unit', 'ra223_unit', 'ra224_unit',\n", - " 'ra226_unit', 'ra228_unit', 'th228_unit', 'th230_unit', 'th234_unit',\n", - " 'u236_unit', 'ac227', 'be7', 'cs137', 'h3', 'i129', 'np237', 'pa231',\n", - " 'pb210', 'po210', 'pu239', 'pu239_240_tot', 'pu240', 'ra223', 'ra224',\n", - " 'ra226', 'ra228', 'th228', 'th230', 'th234', 'u236'],\n", - " 
dtype='object')" + "" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjEAAAGdCAYAAADjWSL8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAwBklEQVR4nO3df0yU557//9cI4ygWpqKFgZXDcU+t0WKbjbQ4brdaFZAs0tamuseE6K6r7rZqCJrusY3puG3VNflqG8i6rmuqFY3NprWnSS2Cn1Zcg/iDlFSNazw51OoeEI8HwV8dpnB//5h40xFRhgMdLub5SCZw3/d7Zq7rPZfnvHrP3IzDsixLAAAAhhkS6QEAAAD0BiEGAAAYiRADAACMRIgBAABGIsQAAAAjEWIAAICRCDEAAMBIhBgAAGCk2EgPoL90dHToD3/4g+Lj4+VwOCI9HAAA0AOWZenGjRtKTU3VkCEPPtcyaEPMH/7wB6WlpUV6GAAAoBcuXbqkMWPGPLBm0IaY+Ph4ScEmJCQkRHg0UiAQUEVFhXJycuR0OiM9nIihD53oRRB9CKIPnehFULT2obW1VWlpafb/jz/IoA0xd99CSkhIGDAhJi4uTgkJCVG1GO9FHzrRiyD6EEQfOtGLoGjvQ08+CsIHewEAgJEIMQAAwEiEGAAAYCRCDAAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkQgxAADASIQYAABgJEIMAAAwEiEGAAAYiRADAACMFBvpAZjql7/5Iqx6V4ylTc9KGb6D8rc//OvF+8N3G/82Is8LAEB/4EwMAAAwEiEGAAAYiRADAACMRIgBAABGIsQAAAAjhRVitm7dqqeeekoJCQlKSEiQ1+vVl19+aR9ftGiRHA5HyG3KlCkhj+H3+7VixQqNHj1aI0aMUEFBgS5fvhxS09zcrMLCQrndbrndbhUWFur69eu9nyUAABh0wgoxY8aM0caNG3Xq1CmdOnVKM2bM0IsvvqizZ8/aNbNnz1ZDQ4N9O3DgQMhjFBUVaf/+/dq3b5+OHj2qmzdvKj8/X+3t7XbNggULVFdXp/LycpWXl6uurk6FhYV/5lQBAMBgEtbfiZkzZ07I9nvvvaetW7eqpqZGTz75pCTJ5XLJ4/Hc9/4tLS3asWOHdu/erVmzZkmSysrKlJaWpkOHDik3N1fnzp1TeXm5ampqlJWVJUnavn27vF6vzp8/r/Hjx4c9SQAAMPj0+jMx7e3t2rdvn27duiWv12vvP3z4sJKSkvTEE09oyZIlampqso/V1tYqEAgoJyfH3peamqqMjAxVV1dLko4dOya3220HGEmaMmWK3G63XQMAABD2X+w9ffq0vF6vfvjhBz3yyCPav3+/Jk6cKEnKy8vTq6++qvT0dNXX12vt2rWaMWOGamtr5XK51NjYqKFDh2rkyJEhj5mcnKzGxkZJUmNjo5KSkro8b1JSkl1zP36/X36/395ubW2VJAUCAQUCgXCn+VCuGCu8+iFWyM9I6I8+9HYMA2EskUYvguhDEH3oRC+CorUP4cw37BAzfvx41dXV6fr16/rkk0+0cOFCVVVVaeLEiZo/f75dl5GRoczMTKWnp+uLL77Q3Llzu31My7LkcHT+Kf6f/t5dzb02bNigdevWddlfUVGhuLi4nk6vxzY927v7vZPZ0bcDCcO9n0+KpMrKykgPYcCgF0H0IYg+dKIXQdHWh9u3b/e4NuwQM3ToUD3++OOSpMzMTJ08eVIffPCBtm3b1qU2JSVF6enpunDhgiTJ4/Gora1Nzc3NIWdjmpqaNHXqVLvmypUrXR7r6tWrSk5O7nZca9asUXFxsb3d2tqqtLQ05eTkKCE
hIdxpPlSG72BY9a4hlt7J7NDaU0Pk74jMdyed8eVG5Hl/KhAIqLKyUtnZ2XI6nZEeTkTRiyD6EEQfOtGLoGjtw913Unriz/4CSMuyQt7G+alr167p0qVLSklJkSRNnjxZTqdTlZWVmjdvniSpoaFBZ86c0aZNmyRJXq9XLS0tOnHihJ59Nni64/jx42ppabGDzv24XC65XK4u+51OZ7+8+L39Ekd/hyNiXwA5kP4R9NfrYiJ6EUQfguhDJ3oRFG19CGeuYYWYN998U3l5eUpLS9ONGze0b98+HT58WOXl5bp586Z8Pp9eeeUVpaSk6LvvvtObb76p0aNH6+WXX5Ykud1uLV68WKtWrdKoUaOUmJio1atXa9KkSfbVShMmTNDs2bO1ZMkS++zO0qVLlZ+fz5VJAADAFlaIuXLligoLC9XQ0CC3262nnnpK5eXlys7O1p07d3T69Gl99NFHun79ulJSUvTCCy/o448/Vnx8vP0YW7ZsUWxsrObNm6c7d+5o5syZ2rlzp2JiYuyaPXv2aOXKlfZVTAUFBSotLe2jKQMAgMEgrBCzY8eObo8NHz5cBw8+/HMiw4YNU0lJiUpKSrqtSUxMVFlZWThDAwAAUYbvTgIAAEYixAAAACMRYgAAgJEIMQAAwEiEGAAAYCRCDAAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkQgxAADASIQYAABgJEIMAAAwEiEGAAAYiRADAACMRIgBAABGIsQAAAAjEWIAAICRCDEAAMBIhBgAAGAkQgwAADASIQYAABiJEAMAAIxEiAEAAEYixAAAACMRYgAAgJEIMQAAwEiEGAAAYCRCDAAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkQgxAADASIQYAABgJEIMAAAwUlghZuvWrXrqqaeUkJCghIQEeb1effnll/Zxy7Lk8/mUmpqq4cOHa/r06Tp79mzIY/j9fq1YsUKjR4/WiBEjVFBQoMuXL4fUNDc3q7CwUG63W263W4WFhbp+/XrvZwkAAAadsELMmDFjtHHjRp06dUqnTp3SjBkz9OKLL9pBZdOmTdq8ebNKS0t18uRJeTweZWdn68aNG/ZjFBUVaf/+/dq3b5+OHj2qmzdvKj8/X+3t7XbNggULVFdXp/LycpWXl6uurk6FhYV9NGUAADAYxIZTPGfOnJDt9957T1u3blVNTY0mTpyo999/X2+99Zbmzp0rSdq1a5eSk5O1d+9eLVu2TC0tLdqxY4d2796tWbNmSZLKysqUlpamQ4cOKTc3V+fOnVN5eblqamqUlZUlSdq+fbu8Xq/Onz+v8ePH98W8AQCA4cIKMT/V3t6u//7v/9atW7fk9XpVX1+vxsZG5eTk2DUul0vTpk1TdXW1li1bptraWgUCgZCa1NRUZWRkqLq6Wrm5uTp27JjcbrcdYCRpypQpcrvdqq6u7jbE+P1++f1+e7u1tVWSFAgEFAgEejvNbrlirPDqh1ghPyOhP/rQ2zEMhLFEGr0Iog9B9KETvQiK1j6EM9+wQ8zp06fl9Xr1ww8/6JFHHtH+/fs1ceJEVVdXS5KSk5ND6pOTk3Xx4kVJUmNjo4YOHaqRI0d2qWlsbLRrkpKSujxvUlKSXXM/GzZs0Lp167rsr6ioUFxcXHiT7IFNz/bufu9kdvTtQMJw4MCBiD33vSorKyM9hAGDXgTRhyD60IleBEVbH27fvt3j2rBDzPjx41VXV6fr16/rk08+0cKFC1VVVWUfdzgcIfWWZXXZd697a+5X/7DHWbNmjYqLi+3t1tZWpaWlKScnRwkJCQ+dV7gyfAfDqncNsfROZofWnhoif8eD+zGYhduHM77cn2FUkREIBFRZWans7Gw5nc5IDydi6EMQfehEL4KitQ9330npibBDzNChQ/X4449LkjIzM3Xy5El98MEH+pd/+RdJwTMpKSkpdn1TU5N9dsbj8aitrU3Nzc0hZ2Oampo0depUu+b
KlStdnvfq1atdzvL8lMvlksvl6rLf6XT2y4vvb+9dEPF3OHp938Gkp32Ihn+4/bVGTUMfguhDJ3oRFG19CGeuf/bfibEsS36/X2PHjpXH4wk57dXW1qaqqio7oEyePFlOpzOkpqGhQWfOnLFrvF6vWlpadOLECbvm+PHjamlpsWsAAADCOhPz5ptvKi8vT2lpabpx44b27dunw4cPq7y8XA6HQ0VFRVq/fr3GjRuncePGaf369YqLi9OCBQskSW63W4sXL9aqVas0atQoJSYmavXq1Zo0aZJ9tdKECRM0e/ZsLVmyRNu2bZMkLV26VPn5+VyZBAAAbGGFmCtXrqiwsFANDQ1yu9166qmnVF5eruzsbEnSG2+8oTt37ui1115Tc3OzsrKyVFFRofj4ePsxtmzZotjYWM2bN0937tzRzJkztXPnTsXExNg1e/bs0cqVK+2rmAoKClRaWtoX8wUAAINEWCFmx44dDzzucDjk8/nk8/m6rRk2bJhKSkpUUlLSbU1iYqLKysrCGRoAAIgyfHcSAAAwEiEGAAAYiRADAACMRIgBAABGIsQAAAAjEWIAAICRCDEAAMBIhBgAAGAkQgwAADASIQYAABiJEAMAAIxEiAEAAEYixAAAACMRYgAAgJEIMQAAwEiEGAAAYCRCDAAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkQgxAADASIQYAABgJEIMAAAwEiEGAAAYiRADAACMRIgBAABGIsQAAAAjEWIAAICRCDEAAMBIhBgAAGAkQgwAADASIQYAABiJEAMAAIxEiAEAAEYixAAAACMRYgAAgJHCCjEbNmzQM888o/j4eCUlJemll17S+fPnQ2oWLVokh8MRcpsyZUpIjd/v14oVKzR69GiNGDFCBQUFunz5ckhNc3OzCgsL5Xa75Xa7VVhYqOvXr/dulgAAYNAJK8RUVVXp9ddfV01NjSorK/Xjjz8qJydHt27dCqmbPXu2Ghoa7NuBAwdCjhcVFWn//v3at2+fjh49qps3byo/P1/t7e12zYIFC1RXV6fy8nKVl5errq5OhYWFf8ZUAQDAYBIbTnF5eXnI9ocffqikpCTV1tbq+eeft/e7XC55PJ77PkZLS4t27Nih3bt3a9asWZKksrIypaWl6dChQ8rNzdW5c+dUXl6umpoaZWVlSZK2b98ur9er8+fPa/z48WFNEgAADD5hhZh7tbS0SJISExND9h8+fFhJSUl69NFHNW3aNL333ntKSkqSJNXW1ioQCCgnJ8euT01NVUZGhqqrq5Wbm6tjx47J7XbbAUaSpkyZIrfbrerq6vuGGL/fL7/fb2+3trZKkgKBgAKBwJ8zzftyxVjh1Q+xQn5Gq3D70B+v3UBxd26DeY49QR+C6EMnehEUrX0IZ769DjGWZam4uFjPPfecMjIy7P15eXl69dVXlZ6ervr6eq1du1YzZsxQbW2tXC6XGhsbNXToUI0cOTLk8ZKTk9XY2ChJamxstEPPTyUlJdk199qwYYPWrVvXZX9FRYXi4uJ6O81ubXq2d/d7J7OjbwdiqJ724d63IgejysrKSA9hQKAPQfShE70IirY+3L59u8e1vQ4xy5cv17fffqujR4+G7J8/f779e0ZGhjIzM5Wenq4vvvhCc+fO7fbxLMuSw+Gwt3/6e3c1P7VmzRoVFxfb262trUpLS1NOTo4SEhJ6PK+eyvAdDKveNcTSO5kdWntqiPwd959DNAi3D2d8uT/DqCIjEAiosrJS2dnZcjqdkR5OxNCHIPrQiV4ERWsf7r6T0hO9CjErVqzQ559/riNHjmjMmDEPrE1JSVF6erouXLggSfJ4PGpra1Nzc3PI2ZimpiZNnTrVrrly5UqXx7p69aqSk5Pv+zwul0sul6vLfqfT2S8vvr+9d0HE3+Ho9X0Hk572IRr+4fbXGjUNfQiiD53oRVC09SGcuYZ1dZJlWVq+fLk+/fRTffXVVxo7duxD73Pt2jVdunR
JKSkpkqTJkyfL6XSGnB5raGjQmTNn7BDj9XrV0tKiEydO2DXHjx9XS0uLXQMAAKJbWGdiXn/9de3du1e//e1vFR8fb38+xe12a/jw4bp586Z8Pp9eeeUVpaSk6LvvvtObb76p0aNH6+WXX7ZrFy9erFWrVmnUqFFKTEzU6tWrNWnSJPtqpQkTJmj27NlasmSJtm3bJklaunSp8vPzuTIJAABICjPEbN26VZI0ffr0kP0ffvihFi1apJiYGJ0+fVofffSRrl+/rpSUFL3wwgv6+OOPFR8fb9dv2bJFsbGxmjdvnu7cuaOZM2dq586diomJsWv27NmjlStX2lcxFRQUqLS0tLfzBAAAg0xYIcayHnxZ7PDhw3Xw4MM/8Dps2DCVlJSopKSk25rExESVlZWFMzwAABBF+O4kAABgJEIMAAAwEiEGAAAYiRADAACMRIgBAABGIsQAAAAjEWIAAICRCDEAAMBIhBgAAGAkQgwAADASIQYAABiJEAMAAIxEiAEAAEYixAAAACMRYgAAgJEIMQAAwEiEGAAAYCRCDAAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkQgxAADASIQYAABgJEIMAAAwEiEGAAAYiRADAACMRIgBAABGIsQAAAAjEWIAAICRCDEAAMBIhBgAAGAkQgwAADASIQYAABiJEAMAAIxEiAEAAEYixAAAACOFFWI2bNigZ555RvHx8UpKStJLL72k8+fPh9RYliWfz6fU1FQNHz5c06dP19mzZ0Nq/H6/VqxYodGjR2vEiBEqKCjQ5cuXQ2qam5tVWFgot9stt9utwsJCXb9+vXezBAAAg05YIaaqqkqvv/66ampqVFlZqR9//FE5OTm6deuWXbNp0yZt3rxZpaWlOnnypDwej7Kzs3Xjxg27pqioSPv379e+fft09OhR3bx5U/n5+Wpvb7drFixYoLq6OpWXl6u8vFx1dXUqLCzsgykDAIDBIDac4vLy8pDtDz/8UElJSaqtrdXzzz8vy7L0/vvv66233tLcuXMlSbt27VJycrL27t2rZcuWqaWlRTt27NDu3bs1a9YsSVJZWZnS0tJ06NAh5ebm6ty5cyovL1dNTY2ysrIkSdu3b5fX69X58+c1fvz4vpg7AAAwWFgh5l4tLS2SpMTERElSfX29GhsblZOTY9e4XC5NmzZN1dXVWrZsmWpraxUIBEJqUlNTlZGRoerqauXm5urYsWNyu912gJGkKVOmyO12q7q6+r4hxu/3y+/329utra2SpEAgoEAg8OdM875cMVZ49UOskJ/RKtw+9MdrN1DcndtgnmNP0Icg+tCJXgRFax/CmW+vQ4xlWSouLtZzzz2njIwMSVJjY6MkKTk5OaQ2OTlZFy9etGuGDh2qkSNHdqm5e//GxkYlJSV1ec6kpCS75l4bNmzQunXruuyvqKhQXFxcmLN7uE3P9u5+72R29O1ADNXTPhw4cKCfRxJ5lZWVkR7CgEAfguhDJ3oRFG19uH37do9rex1ili9frm+//VZHjx7tcszhcIRsW5bVZd+97q25X/2DHmfNmjUqLi62t1tbW5WWlqacnBwlJCQ88Ll7I8N3MKx61xBL72R2aO2pIfJ3PLgXg1m4fTjjy/0ZRhUZgUBAlZWVys7OltPpjPRwIoY+BNGHTvQiKFr7cPedlJ7oVYhZsWKFPv/8cx05ckRjxoyx93s8HknBMykpKSn2/qamJvvsjMfjUVtbm5qbm0POxjQ1NWnq1Kl2zZUrV7o879WrV7uc5bnL5XLJ5XJ12e90Ovvlxfe39y6I+Dscvb7vYNLTPkTDP9z+WqOmoQ9B9KETvQiKtj6EM9ewrk6yLEvLly/Xp59+qq+++kpjx44NOT527Fh5PJ6QU19tbW2qqqqyA8rkyZPldDpDahoaGnTmzBm7xuv1qqWlRSdOnLBrjh8/rpaWFrsGAABEt7DOxLz++uvau3evfvvb3yo+Pt7+fIrb7dbw4cPlcDh
UVFSk9evXa9y4cRo3bpzWr1+vuLg4LViwwK5dvHixVq1apVGjRikxMVGrV6/WpEmT7KuVJkyYoNmzZ2vJkiXatm2bJGnp0qXKz8/nyiQAACApzBCzdetWSdL06dND9n/44YdatGiRJOmNN97QnTt39Nprr6m5uVlZWVmqqKhQfHy8Xb9lyxbFxsZq3rx5unPnjmbOnKmdO3cqJibGrtmzZ49WrlxpX8VUUFCg0tLS3swRAAAMQmGFGMt6+GWxDodDPp9PPp+v25phw4appKREJSUl3dYkJiaqrKwsnOEBAIAowncnAQAAIxFiAACAkQgxAADASIQYAABgJEIMAAAwEiEGAAAYiRADAACMRIgBAABGIsQAAAAjEWIAAICRCDEAAMBIhBgAAGAkQgwAADASIQYAABiJEAMAAIxEiAEAAEYixAAAACMRYgAAgJEIMQAAwEiEGAAAYCRCDAAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkQgxAADASIQYAABgJEIMAAAwEiEGAAAYiRADAACMRIgBAABGIsQAAAAjEWIAAICRCDEAAMBIhBgAAGAkQgwAADASIQYAABgp7BBz5MgRzZkzR6mpqXI4HPrss89Cji9atEgOhyPkNmXKlJAav9+vFStWaPTo0RoxYoQKCgp0+fLlkJrm5mYVFhbK7XbL7XarsLBQ169fD3uCAABgcAo7xNy6dUtPP/20SktLu62ZPXu2Ghoa7NuBAwdCjhcVFWn//v3at2+fjh49qps3byo/P1/t7e12zYIFC1RXV6fy8nKVl5errq5OhYWF4Q4XAAAMUrHh3iEvL095eXkPrHG5XPJ4PPc91tLSoh07dmj37t2aNWuWJKmsrExpaWk6dOiQcnNzde7cOZWXl6umpkZZWVmSpO3bt8vr9er8+fMaP358uMMGAACDTNghpicOHz6spKQkPfroo5o2bZree+89JSUlSZJqa2sVCASUk5Nj16empiojI0PV1dXKzc3VsWPH5Ha77QAjSVOmTJHb7VZ1dfV9Q4zf75ff77e3W1tbJUmBQECBQKDP5+iKscKrH2KF/IxW4fahP167geLu3AbzHHuCPgTRh070Iiha+xDOfPs8xOTl5enVV19Venq66uvrtXbtWs2YMUO1tbVyuVxqbGzU0KFDNXLkyJD7JScnq7GxUZLU2Nhoh56fSkpKsmvutWHDBq1bt67L/oqKCsXFxfXBzEJterZ393sns6NvB2Konvbh3rciB6PKyspID2FAoA9B9KETvQiKtj7cvn27x7V9HmLmz59v/56RkaHMzEylp6friy++0Ny5c7u9n2VZcjgc9vZPf++u5qfWrFmj4uJie7u1tVVpaWnKyclRQkJCb6byQBm+g2HVu4ZYeiezQ2tPDZG/4/5ziAbh9uGML/dnGFVkBAIBVVZWKjs7W06nM9LDiRj6EEQfOtGLoGjtw913UnqiX95O+qmUlBSlp6frwoULkiSPx6O2tjY1NzeHnI1pamrS1KlT7ZorV650eayrV68qOTn5vs/jcrnkcrm67Hc6nf3y4vvbexdE/B2OXt93MOlpH6LhH25/rVHT0Icg+tCJXgRFWx/CmWu//52Ya9eu6dKlS0pJSZEkTZ48WU6nM+T0WENDg86cOWOHGK/Xq5aWFp04ccKuOX78uFpaWuwaAAAQ3cI+E3Pz5k397ne/s7fr6+tVV1enxMREJSYmyufz6ZVXXlFKSoq+++47vfnmmxo9erRefvllSZLb7dbixYu1atUqjRo1SomJiVq9erUmTZpkX600YcIEzZ49W0uWLNG2bdskSUuXLlV+fj5XJgEAAEm9CDGnTp3SCy+8YG/f/RzKwoULtXXrVp0+fVofffSRrl+/rpSUFL3wwgv6+OOPFR8fb99ny5Ytio2N1bx583Tnzh3NnDlTO3fuVExMjF2zZ88erVy50r6KqaCg4IF/mwYAAESXsEPM9OnTZVndXx578ODDP/A6bNg
wlZSUqKSkpNuaxMRElZWVhTs8AAAQJfjuJAAAYCRCDAAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkQgxAADASIQYAABgJEIMAAAwEiEGAAAYiRADAACMRIgBAABGIsQAAAAjEWIAAICRCDEAAMBIhBgAAGAkQgwAADASIQYAABiJEAMAAIxEiAEAAEYixAAAACMRYgAAgJEIMQAAwEiEGAAAYCRCDAAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkQgxAADASIQYAABgJEIMAAAwEiEGAAAYiRADAACMRIgBAABGCjvEHDlyRHPmzFFqaqocDoc+++yzkOOWZcnn8yk1NVXDhw/X9OnTdfbs2ZAav9+vFStWaPTo0RoxYoQKCgp0+fLlkJrm5mYVFhbK7XbL7XarsLBQ169fD3uCAABgcAo7xNy6dUtPP/20SktL73t806ZN2rx5s0pLS3Xy5El5PB5lZ2frxo0bdk1RUZH279+vffv26ejRo7p586by8/PV3t5u1yxYsEB1dXUqLy9XeXm56urqVFhY2IspAgCAwSg23Dvk5eUpLy/vvscsy9L777+vt956S3PnzpUk7dq1S8nJydq7d6+WLVumlpYW7dixQ7t379asWbMkSWVlZUpLS9OhQ4eUm5urc+fOqby8XDU1NcrKypIkbd++XV6vV+fPn9f48eN7O18AADBI9OlnYurr69XY2KicnBx7n8vl0rRp01RdXS1Jqq2tVSAQCKlJTU1VRkaGXXPs2DG53W47wEjSlClT5Ha77RoAABDdwj4T8yCNjY2SpOTk5JD9ycnJunjxol0zdOhQjRw5skvN3fs3NjYqKSmpy+MnJSXZNffy+/3y+/32dmtrqyQpEAgoEAj0ckbdc8VY4dUPsUJ+Rqtw+9Afr91AcXdug3mOPUEfguhDJ3oRFK19CGe+fRpi7nI4HCHblmV12Xeve2vuV/+gx9mwYYPWrVvXZX9FRYXi4uJ6MuywbHq2d/d7J7OjbwdiqJ724cCBA/08ksirrKyM9BAGBPoQRB860YugaOvD7du3e1zbpyHG4/FICp5JSUlJsfc3NTXZZ2c8Ho/a2trU3NwccjamqalJU6dOtWuuXLnS5fGvXr3a5SzPXWvWrFFxcbG93draqrS0NOXk5CghIeHPn9w9MnwHw6p3DbH0TmaH1p4aIn/HgwPdYBZuH874cn+GUUVGIBBQZWWlsrOz5XQ6Iz2ciKEPQfShE70IitY+3H0npSf6NMSMHTtWHo9HlZWV+qu/+itJUltbm6qqqvRv//ZvkqTJkyfL6XSqsrJS8+bNkyQ1NDTozJkz2rRpkyTJ6/WqpaVFJ06c0LPPBk95HD9+XC0tLXbQuZfL5ZLL5eqy3+l09suL72/vXRDxdzh6fd/BpKd9iIZ/uP21Rk1DH4LoQyd6ERRtfQhnrmGHmJs3b+p3v/udvV1fX6+6ujolJibqF7/4hYqKirR+/XqNGzdO48aN0/r16xUXF6cFCxZIktxutxYvXqxVq1Zp1KhRSkxM1OrVqzVp0iT7aqUJEyZo9uzZWrJkibZt2yZJWrp0qfLz87kyCQAASOpFiDl16pReeOEFe/vuWzgLFy7Uzp079cYbb+jOnTt67bXX1NzcrKysLFVUVCg+Pt6+z5YtWxQbG6t58+bpzp07mjlzpnbu3KmYmBi7Zs+ePVq5cqV9FVNBQUG3f5sGAABEn7BDzPTp02VZ3V9Z4nA45PP55PP5uq0ZNmyYSkpKVFJS0m1NYmKiysrKwh0eAACIEnx3EgAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkQgxAADASIQYAABgpH75Fmugr/zyN19Eeghh+27j30Z6CAAQFTgTAwAAjESIAQAARiLEAAAAIxFiAACAkQgxAADASIQYAABgJEIMAAAwEiEGAAAYiRADAACMRIgBAABGIsQAAAAjEWIAAICRCDEAAMBIhBg
AAGAkQgwAADASIQYAABiJEAMAAIxEiAEAAEYixAAAACMRYgAAgJEIMQAAwEiEGAAAYCRCDAAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkWIjPQAAkffL33wRsed2xVja9KyU4Tsof7ujx/f7buPf9uOoAJigz8/E+Hw+ORyOkJvH47GPW5Yln8+n1NRUDR8+XNOnT9fZs2dDHsPv92vFihUaPXq0RowYoYKCAl2+fLmvhwoAAAzWL28nPfnkk2poaLBvp0+fto9t2rRJmzdvVmlpqU6ePCmPx6Ps7GzduHHDrikqKtL+/fu1b98+HT16VDdv3lR+fr7a29v7Y7gAAMBA/fJ2UmxsbMjZl7ssy9L777+vt956S3PnzpUk7dq1S8nJydq7d6+WLVumlpYW7dixQ7t379asWbMkSWVlZUpLS9OhQ4eUm5vbH0MGAACG6ZcQc+HCBaWmpsrlcikrK0vr16/XX/7lX6q+vl6NjY3Kycmxa10ul6ZNm6bq6motW7ZMtbW1CgQCITWpqanKyMhQdXV1tyHG7/fL7/fb262trZKkQCCgQCDQ53N0xVjh1Q+xQn5Gq2joQ0/X2926/lif4Qp3Pffpc/dyTQyEvvWlgbQeIo1eBEVrH8KZr8OyrD79X68vv/xSt2/f1hNPPKErV67o3Xff1f/+7//q7NmzOn/+vP76r/9a//d//6fU1FT7PkuXLtXFixd18OBB7d27V3//938fEkgkKScnR2PHjtW2bdvu+7w+n0/r1q3rsn/v3r2Ki4vryykCAIB+cvv2bS1YsEAtLS1KSEh4YG2fn4nJy8uzf580aZK8Xq9+9atfadeuXZoyZYokyeEIvQLBsqwu++71sJo1a9aouLjY3m5tbVVaWppycnIe2oTeyPAdDKveNcTSO5kdWntqiPwdPb8CY7CJhj6c8fXsLc9AIKDKykplZ2fL6XT286geLNz13Jd6uyZ62mdTDKT1EGn0Iiha+3D3nZSe6PdLrEeMGKFJkybpwoULeumllyRJjY2NSklJsWuampqUnJwsSfJ4PGpra1Nzc7NGjhwZUjN16tRun8flcsnlcnXZ73Q6++XFD+dS0JD7dTh6fd/BZDD3Idz11l9rNBwD4bUId01Eumf9ZSCsh4GCXgRFWx/CmWu//7E7v9+vc+fOKSUlRWPHjpXH41FlZaV9vK2tTVVVVXZAmTx5spxOZ0hNQ0ODzpw588AQAwAAokufn4lZvXq15syZo1/84hdqamrSu+++q9bWVi1cuFAOh0NFRUVav369xo0bp3Hjxmn9+vWKi4vTggULJElut1uLFy/WqlWrNGrUKCUmJmr16tWaNGmSfbUSAABAn4eYy5cv69e//rX++Mc/6rHHHtOUKVNUU1Oj9PR0SdIbb7yhO3fu6LXXXlNzc7OysrJUUVGh+Ph4+zG2bNmi2NhYzZs3T3fu3NHMmTO1c+dOxcTE9PVwAQCAofo8xOzbt++Bxx0Oh3w+n3w+X7c1w4YNU0lJiUpKSvp4dAAAYLDgCyABAICRCDEAAMBIhBgAAGAkQgwAADASIQYAABiJEAMAAIxEiAEAAEYixAAAACMRYgAAgJEIMQAAwEiEGAAAYCRCDAAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkWIjPQBgsPnlb77oUZ0rxtKmZ6UM30H52x39PCoAGHw4EwMAAIxEiAEAAEYixAAAACMRYgAAgJEIMQAAwEiEGAAAYCRCDAAAMBIhBgAAGIkQAwAAjESIAQAARiLEAAAAIxFiAACAkQgxAADASIQYAABgpNhIDwAAeuOXv/ki0kMI23cb/zbSQwAGFc7EAAAAIxFiAACAkQgxAADASIQYAABgJEIMAAAw0oAPMf/+7/+usWPHatiwYZo8ebL+53/+J9JDAgAAA8CADjEff/yxioqK9NZbb+mbb77R3/zN3ygvL0/ff/99pIcGAAAibECHmM2bN2vx4sX6x3/
8R02YMEHvv/++0tLStHXr1kgPDQAARNiA/WN3bW1tqq2t1W9+85uQ/Tk5Oaquru5S7/f75ff77e2WlhZJ0p/+9CcFAoE+H1/sj7fCq++wdPt2h2IDQ9Te4ejz8ZiCPnSiF0HR1Idr1651eywQCOj27du6du2anE7nzziqgefP7UXWhv/XD6Pqf8fXzAzZjtY1cePGDUmSZVkPrR2wIeaPf/yj2tvblZycHLI/OTlZjY2NXeo3bNigdevWddk/duzYfhtjuBZEegADBH3oRC+CoqUPo/+/SI8AAxnrI9SNGzfkdrsfWDNgQ8xdDkfof5lZltVlnyStWbNGxcXF9nZHR4f+9Kc/adSoUfet/7m1trYqLS1Nly5dUkJCQqSHEzH0oRO9CKIPQfShE70IitY+WJalGzduKDU19aG1AzbEjB49WjExMV3OujQ1NXU5OyNJLpdLLpcrZN+jjz7an0PslYSEhKhajN2hD53oRRB9CKIPnehFUDT24WFnYO4asB/sHTp0qCZPnqzKysqQ/ZWVlZo6dWqERgUAAAaKAXsmRpKKi4tVWFiozMxMeb1e/ed//qe+//57/dM//VOkhwYAACJsQIeY+fPn69q1a/rXf/1XNTQ0KCMjQwcOHFB6enqkhxY2l8ult99+u8tbXtGGPnSiF0H0IYg+dKIXQfTh4RxWT65hAgAAGGAG7GdiAAAAHoQQAwAAjESIAQAARiLEAAAAIxFi+pnP55PD4Qi5eTyeSA+r3x05ckRz5sxRamqqHA6HPvvss5DjlmXJ5/MpNTVVw4cP1/Tp03X27NnIDLYfPawPixYt6rI+pkyZEpnB9qMNGzbomWeeUXx8vJKSkvTSSy/p/PnzITXRsCZ60odoWRNbt27VU089Zf8hN6/Xqy+//NI+Hg3rQXp4H6JlPfQWIeZn8OSTT6qhocG+nT59OtJD6ne3bt3S008/rdLS0vse37RpkzZv3qzS0lKdPHlSHo9H2dnZ9hd/DRYP64MkzZ49O2R9HDhw4Gcc4c+jqqpKr7/+umpqalRZWakff/xROTk5unWr84tUo2FN9KQPUnSsiTFjxmjjxo06deqUTp06pRkzZujFF1+0g0o0rAfp4X2QomM99JqFfvX2229bTz/9dKSHEVGSrP3799vbHR0dlsfjsTZu3Gjv++GHHyy32239x3/8RwRG+PO4tw+WZVkLFy60XnzxxYiMJ5KamposSVZVVZVlWdG7Ju7tg2VF75qwLMsaOXKk9V//9V9Rux7uutsHy4ru9dATnIn5GVy4cEGpqakaO3as/u7v/k6///3vIz2kiKqvr1djY6NycnLsfS6XS9OmTVN1dXUERxYZhw8fVlJSkp544gktWbJETU1NkR5Sv2tpaZEkJSYmSoreNXFvH+6KtjXR3t6uffv26datW/J6vVG7Hu7tw13Rth7CMaD/Yu9gkJWVpY8++khPPPGErly5onfffVdTp07V2bNnNWrUqEgPLyLufqnnvV/kmZycrIsXL0ZiSBGTl5enV199Venp6aqvr9fatWs1Y8YM1dbWDtq/0mlZloqLi/Xcc88pIyNDUnSuifv1QYquNXH69Gl5vV798MMPeuSRR7R//35NnDjRDirRsh6664MUXeuhNwgx/SwvL8/+fdKkSfJ6vfrVr36lXbt2qbi4OIIjizyHwxGybVlWl32D3fz58+3fMzIylJmZqfT0dH3xxReaO3duBEfWf5YvX65vv/1WR48e7XIsmtZEd32IpjUxfvx41dXV6fr16/rkk0+0cOFCVVVV2cejZT1014eJEydG1XroDd5O+pmNGDFCkyZN0oULFyI9lIi5e3XW3f/6vqupqanLf3lFm5SUFKWnpw/a9bFixQp9/vnn+vrrrzVmzBh7f7Stie76cD+DeU0MHTpUjz/+uDIzM7VhwwY9/fTT+uCDD6JuPXTXh/sZzOuhNwgxPzO/369z584pJSUl0kOJmLFjx8rj8aiystLe19bWpqqqKk2
dOjWCI4u8a9eu6dKlS4NufViWpeXLl+vTTz/VV199pbFjx4Ycj5Y18bA+3M9gXRP3Y1mW/H5/1KyH7tztw/1E03rokYh9pDhKrFq1yjp8+LD1+9//3qqpqbHy8/Ot+Ph467vvvov00PrVjRs3rG+++cb65ptvLEnW5s2brW+++ca6ePGiZVmWtXHjRsvtdluffvqpdfr0aevXv/61lZKSYrW2tkZ45H3rQX24ceOGtWrVKqu6utqqr6+3vv76a8vr9Vp/8Rd/Mej68M///M+W2+22Dh8+bDU0NNi327dv2zXRsCYe1odoWhNr1qyxjhw5YtXX11vffvut9eabb1pDhgyxKioqLMuKjvVgWQ/uQzSth94ixPSz+fPnWykpKZbT6bRSU1OtuXPnWmfPno30sPrd119/bUnqclu4cKFlWcFLat9++23L4/FYLpfLev75563Tp09HdtD94EF9uH37tpWTk2M99thjltPptH7xi19YCxcutL7//vtID7vP3a8HkqwPP/zQromGNfGwPkTTmviHf/gHKz093Ro6dKj12GOPWTNnzrQDjGVFx3qwrAf3IZrWQ285LMuyfr7zPgAAAH2Dz8QAAAAjEWIAAICRCDEAAMBIhBgAAGAkQgwAADASIQYAABiJEAMAAIxEiAEAAEYixAAAACMRYgAAgJEIMQAAwEiEGAAAYKT/H2MI6ejRA/K+AAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "result.columns" + "(~(result.isna())).sum(axis=1).hist()" ] }, { @@ -2795,18 +11376,18 @@ "data": { "text/plain": [ "org_index\n", - "9223 False\n", - "9231 False\n", - "9237 False\n", - "9244 False\n", - "9256 False\n", - " ... \n", - "6173855 False\n", - "6173858 False\n", - "6174035 False\n", - "6174038 False\n", - "6174041 False\n", - "Length: 19139, dtype: bool" + "0 False\n", + "1 False\n", + "2 False\n", + "3 False\n", + "4 False\n", + " ... \n", + "8401 False\n", + "8402 False\n", + "8403 False\n", + "8404 False\n", + "8405 False\n", + "Length: 8406, dtype: bool" ] }, "execution_count": null, diff --git a/nbs/handlers/_helcom-investigation-uniqueness.ipynb b/nbs/handlers/_helcom-investigation-uniqueness.ipynb new file mode 100644 index 0000000..929a6c0 --- /dev/null +++ b/nbs/handlers/_helcom-investigation-uniqueness.ipynb @@ -0,0 +1,9716 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "bb60862d", + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp handlers.helcom" + ] + }, + { + "cell_type": "markdown", + "id": "416a6a41", + "metadata": {}, + "source": [ + "# HELCOM" + ] + }, + { + "cell_type": "markdown", + "id": "5709cfb6", + "metadata": {}, + "source": [ + "> This data pipeline, known as a \"handler\" in Marisco terminology, is designed to clean, standardize, and encode [HELCOM data](https://helcom.fi/about-us) into `NetCDF` format. 
The handler processes raw HELCOM data, applying various transformations and lookups to align it with `MARIS` data standards.\n", + "\n", + "Key functions of this handler:\n", + "\n", + "- **Cleans** and **normalizes** raw HELCOM data\n", + "- **Applies standardized nomenclature** and units\n", + "- **Encodes the processed data** into `NetCDF` format compatible with MARIS requirements\n", + "\n", + "This handler is a crucial component in the Marisco data processing workflow, ensuring HELCOM data is properly integrated into the MARIS database.\n", + "\n", + "\n", + "\n", + "Note: *Additionally, an optional encoder (pipeline) is provided below to process data into a `.csv` format compatible with the MARIS master database. This feature is maintained for legacy purposes, as data ingestion was previously performed using OpenRefine.*" + ] + }, + { + "cell_type": "markdown", + "id": "0801c877", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "For new MARIS users, please refer to [Understanding MARIS Data Formats (NetCDF and Open Refine)](https://github.com/franckalbinet/marisco/tree/main/install_configure_guide) for detailed information.\n", + "\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "id": "0b121843", + "metadata": {}, + "source": [ + "The present notebook aims to be an instance of [Literate Programming](https://www.wikiwand.com/en/articles/Literate_programming) in the sense that it is a narrative that includes code snippets that are interspersed with explanations. When a function or a class needs to be exported in a dedicated Python module (in our case `marisco/handlers/helcom.py`) the code snippet is added to the module using `#| exports` as provided by the wonderful [nbdev](https://nbdev.readthedocs.io/en/latest/) library."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0db45fee", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "#| hide\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a8d979f", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "import pandas as pd \n", + "import numpy as np\n", + "from functools import partial \n", + "import fastcore.all as fc \n", + "from pathlib import Path \n", + "from dataclasses import asdict\n", + "from typing import List, Dict, Callable, Tuple, Any \n", + "from collections import OrderedDict, defaultdict\n", + "import re\n", + "\n", + "from marisco.utils import (\n", + " has_valid_varname, \n", + " match_worms, \n", + " Remapper, \n", + " ddmm_to_dd,\n", + " match_maris_lut, \n", + " Match, \n", + " get_unique_across_dfs\n", + ")\n", + "\n", + "from marisco.callbacks import (\n", + " Callback, \n", + " Transformer, \n", + " EncodeTimeCB, \n", + " AddSampleTypeIdColumnCB,\n", + " AddNuclideIdColumnCB, \n", + " LowerStripNameCB, \n", + " SanitizeLonLatCB, \n", + " ReshapeLongToWide, \n", + " CompareDfsAndTfmCB, \n", + " RemapCB\n", + ")\n", + "\n", + "from marisco.metadata import (\n", + " GlobAttrsFeeder, \n", + " BboxCB, \n", + " DepthRangeCB, \n", + " TimeRangeCB, \n", + " ZoteroCB, \n", + " KeyValuePairCB\n", + ")\n", + "\n", + "from marisco.configs import (\n", + " nuc_lut_path, \n", + " nc_tpl_path, \n", + " cfg, \n", + " cache_path, \n", + " cdl_cfg, \n", + " Enums, \n", + " lut_path, \n", + " species_lut_path, \n", + " sediments_lut_path, \n", + " bodyparts_lut_path, \n", + " detection_limit_lut_path, \n", + " filtered_lut_path, \n", + " area_lut_path, \n", + " get_lut, \n", + " unit_lut_path\n", + ")\n", + "\n", + "from marisco.serializers import 
(\n", + " NetCDFEncoder, \n", + " OpenRefineCsvEncoder\n", + ")\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5519e669", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "pd.set_option('display.max_rows', 100)" + ] + }, + { + "cell_type": "markdown", + "id": "e045eeae", + "metadata": {}, + "source": [ + "## Configuration & file paths" + ] + }, + { + "cell_type": "markdown", + "id": "8b0b476d", + "metadata": {}, + "source": [ + "- **fname_in**: path to the folder containing the HELCOM data in CSV format. The path can be defined as a relative path. \n", + "\n", + "- **fname_out_nc**: path and filename for the NetCDF output. The path can be defined as a relative path. \n", + "\n", + "- **fname_out_csv**: path and filename for the Open Refine csv output. The path can be defined as a relative path.\n", + "\n", + "- **Zotero key**: used to retrieve attributes related to the dataset from [Zotero](https://www.zotero.org/). The MARIS datasets include a [library](https://maris.iaea.org/datasets) available on [Zotero](https://www.zotero.org/groups/2432820/maris/library). 
\n", + "\n", + "- **ref_id**: refers to the location in Archive of the Zotero library.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "715e849d", + "metadata": {}, + "outputs": [], + "source": [ + "# | exports\n", + "fname_in = '../../_data/accdb/mors/csv'\n", + "fname_out_nc = '../../_data/output/100-HELCOM-MORS-2024.nc'\n", + "fname_out_csv = '../../_data/output/100-HELCOM-MORS-2024.csv'\n", + "zotero_key ='26VMZZ2Q' # HELCOM MORS zotero key\n", + "ref_id = 100 # HELCOM MORS reference id as defined by MARIS" + ] + }, + { + "cell_type": "markdown", + "id": "0f88d99c", + "metadata": {}, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "markdown", + "id": "dcbbc83f", + "metadata": {}, + "source": [ + "[Helcom MORS (Monitoring of Radioactive Substances in the Baltic Sea) data](https://helcom.fi/about-us) is provided as a Microsoft Access database. \n", + "[`Mdbtools`](https://github.com/mdbtools/mdbtools) can be used to convert the tables of the Microsoft Access database to `.csv` files on Unix-like OS.\n", + "\n", + "**Example steps**:\n", + "\n", + "\n", + "1. [Download data](https://metadata.helcom.fi/geonetwork/srv/fin/catalog.search#/metadata/2fdd2d46-0329-40e3-bf96-cb08c7206a24)\n", + "\n", + "2. Install mdbtools via VScode Terminal: \n", + "\n", + " ```\n", + " sudo apt-get -y install mdbtools\n", + " ```\n", + "\n", + "3. Install unzip via VScode Terminal:\n", + "\n", + " ```\n", + " sudo apt-get -y install unzip\n", + " ```\n", + "\n", + "4. In `VS Code` terminal (for instance), navigate to the marisco data folder:\n", + "\n", + " ```\n", + " cd /home/marisco/downloads/marisco/_data/accdb/mors_19840101_20211231\n", + " ```\n", + "\n", + "5. Unzip `MORS_ENVIRONMENT.zip`:\n", + "\n", + " ```\n", + " unzip MORS_ENVIRONMENT.zip \n", + " ```\n", + "\n", + "6. Run `preprocess.sh` to generate the required data files:\n", + "\n", + " ```\n", + " ./preprocess.sh MORS_ENVIRONMENT.zip\n", + " ```\n", + "\n", + "7. 
Content of `preprocess.sh` script:\n", + "\n", + " ```\n", + " #!/bin/bash\n", + "\n", + " # Example of use: ./preprocess.sh MORS_ENVIRONMENT.zip\n", + " unzip $1\n", + " dbname=$(ls *.accdb)\n", + " mkdir csv\n", + " for table in $(mdb-tables -1 \"$dbname\"); do\n", + " echo \"Export table $table\"\n", + " mdb-export \"$dbname\" \"$table\" > \"csv/$table.csv\"\n", + " done\n", + " ```\n", + "\n", + "Once converted to `.csv` files, the data is ready to be loaded into a dictionary of dataframes.\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3f4c788", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "default_smp_types = [('SEA', 'seawater'), ('SED', 'sediment'), ('BIO', 'biota')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93f0655f", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def load_data(src_dir: str|Path, # The directory where the source CSV files are located\n", + " smp_types: list=default_smp_types # A list of tuples, each containing the file prefix and the corresponding sample type name\n", + " ) -> Dict[str, pd.DataFrame]: # A dictionary with sample types as keys and their corresponding dataframes as values\n", + " \"Load HELCOM data and return the data in a dictionary of dataframes with the dictionary key as the sample type.\"\n", + " src_path = Path(src_dir)\n", + " \n", + " def load_and_merge(file_prefix: str) -> pd.DataFrame:\n", + " try:\n", + " df_meas = pd.read_csv(src_path / f'{file_prefix}02.csv')\n", + " df_smp = pd.read_csv(src_path / f'{file_prefix}01.csv')\n", + " return pd.merge(df_meas, df_smp, on='KEY', how='left')\n", + " except FileNotFoundError as e:\n", + " print(f\"Error loading files for {file_prefix}: {e}\")\n", + " return pd.DataFrame() # Return an empty DataFrame if files are not found\n", + " \n", + " return {smp_type: load_and_merge(file_prefix) for file_prefix, smp_type in smp_types}" + ] + }, + { + "cell_type": 
"markdown", + "id": "69e48dc6", + "metadata": {}, + "source": [ + "`dfs` is a dictionary of dataframes created from the Helcom dataset located at the path `fname_in`. The data to be included in each dataframe is sorted by sample type. Each dictionary is defined with a key equal to the sample type. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb4bf289", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "keys/sample types: dict_keys(['seawater', 'sediment', 'biota'])\n", + "seawater columns: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/m³', 'VALUE_Bq/m³', 'ERROR%_m³',\n", + " 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR',\n", + " 'MONTH', 'DAY', 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'TDEPTH', 'SDEPTH', 'SALIN',\n", + " 'TTEMP', 'FILT', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')\n", + "sediment columns: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',\n", + " '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',\n", + " 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',\n", + " 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',\n", + " 'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',\n", + " 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')\n", + "biota columns: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'BASIS',\n", + " 'ERROR%', 'NUMBER', 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY',\n", + " 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY', 'STATION',\n", + " 'LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm',\n", + " 'LONGITUDE dddddd', 'SDEPTH', 'RUBIN', 'BIOTATYPE', 'TISSUE', 'NO',\n", + " 'LENGTH', 'WEIGHT', 'DW%', 'LOI%', 'MORS_SUBBASIN', 
'HELCOM_SUBBASIN',\n", + " 'DATE_OF_ENTRY_y'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "\n", + "#|eval: false\n", + "dfs = load_data(fname_in)\n", + "print('keys/sample types: ', dfs.keys())\n", + "\n", + "for key in dfs.keys():\n", + " print(f'{key} columns: ', dfs[key].columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a8ab3c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',\n", + " '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',\n", + " 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',\n", + " 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',\n", + " 'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',\n", + " 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs['sediment'].columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0af1ec68", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(39817, 35)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs['sediment'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0f74d5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "25991" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs['sediment']['KEY'].duplicated().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98927b4c", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "KEY\n", + "SSSSM2021030 11\n", + "SSAAS1987036 11\n", + "SSSSI2006030 11\n", + "SSSSI2010030 11\n", + "SSSSI2010003 10\n", + " ..\n", + "SSSSI2006020 1\n", + "SSSSI2006014 1\n", + "SSSSI2006013 1\n", + "SCLOR2006078 1\n", + "SLVDC1997011 1\n", + "Name: count, Length: 13826, dtype: int64" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs['sediment']['KEY'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df06bf0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
KEYNUCLIDEMETHOD< VALUE_Bq/kgVALUE_Bq/kgERROR%_kg< VALUE_Bq/m²VALUE_Bq/m²ERROR%_m²DATE_OF_ENTRY_x...LOWSLIAREASEDIOXICDW%LOI%MORS_SUBBASINHELCOM_SUBBASINSUM_LINKDATE_OF_ENTRY_y
0SKRIL2012048RA226NaNNaN35.026.0NaNNaNNaN08/20/14 00:00:00...20.00.006NaNNaNNaNNaN11.011.0NaN08/20/14 00:00:00
1SKRIL2012049RA226NaNNaN36.022.0NaNNaNNaN08/20/14 00:00:00...27.00.006NaNNaNNaNNaN11.011.0NaN08/20/14 00:00:00
186SKRIL2012048CS137NaNNaN3.033.0NaNNaNNaN08/20/14 00:00:00...20.00.006NaNNaNNaNNaN11.011.0NaN08/20/14 00:00:00
187SKRIL2012049CS137NaN<1.0NaNNaNNaNNaN08/20/14 00:00:00...27.00.006NaNNaNNaNNaN11.011.0NaN08/20/14 00:00:00
562SKRIL2012048RA228NaNNaN60.020.0NaNNaNNaN08/20/14 00:00:00...20.00.006NaNNaNNaNNaN11.011.0NaN08/20/14 00:00:00
563SKRIL2012049RA228NaNNaN59.020.0NaNNaNNaN08/20/14 00:00:00...27.00.006NaNNaNNaNNaN11.011.0NaN08/20/14 00:00:00
825SKRIL2012048K40NaNNaN980.020.0NaNNaNNaN08/20/14 00:00:00...20.00.006NaNNaNNaNNaN11.011.0NaN08/20/14 00:00:00
826SKRIL2012049K40NaNNaN950.020.0NaNNaNNaN08/20/14 00:00:00...27.00.006NaNNaNNaNNaN11.011.0NaN08/20/14 00:00:00
\n", + "

8 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + " KEY NUCLIDE METHOD < VALUE_Bq/kg VALUE_Bq/kg ERROR%_kg \\\n", + "0 SKRIL2012048 RA226 NaN NaN 35.0 26.0 \n", + "1 SKRIL2012049 RA226 NaN NaN 36.0 22.0 \n", + "186 SKRIL2012048 CS137 NaN NaN 3.0 33.0 \n", + "187 SKRIL2012049 CS137 NaN < 1.0 NaN \n", + "562 SKRIL2012048 RA228 NaN NaN 60.0 20.0 \n", + "563 SKRIL2012049 RA228 NaN NaN 59.0 20.0 \n", + "825 SKRIL2012048 K40 NaN NaN 980.0 20.0 \n", + "826 SKRIL2012049 K40 NaN NaN 950.0 20.0 \n", + "\n", + " < VALUE_Bq/m² VALUE_Bq/m² ERROR%_m² DATE_OF_ENTRY_x ... LOWSLI \\\n", + "0 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", + "1 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", + "186 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", + "187 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", + "562 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", + "563 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", + "825 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", + "826 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", + "\n", + " AREA SEDI OXIC DW% LOI% MORS_SUBBASIN HELCOM_SUBBASIN SUM_LINK \\\n", + "0 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "1 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "186 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "187 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "562 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "563 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "825 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "826 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "\n", + " DATE_OF_ENTRY_y \n", + "0 08/20/14 00:00:00 \n", + "1 08/20/14 00:00:00 \n", + "186 08/20/14 00:00:00 \n", + "187 08/20/14 00:00:00 \n", + "562 08/20/14 00:00:00 \n", + "563 08/20/14 00:00:00 \n", + "825 08/20/14 00:00:00 \n", + "826 08/20/14 00:00:00 \n", + "\n", + "[8 rows x 35 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "# SKRIL2012048, SSSSM2021030\n", + "df = dfs['sediment'][dfs['sediment']['KEY'].isin(['SKRIL2012048', 'SKRIL2012049'])]; df" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "16f99ba4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',\n", + " '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',\n", + " 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',\n", + " 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',\n", + " 'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',\n", + " 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07c10fa1", + "metadata": {}, + "outputs": [], + "source": [ + "# coi_val = {'seawater' : {'val': 'VALUE_Bq/m³'},\n", + "# 'biota': {'val': 'VALUE_Bq/kg'},\n", + "# 'sediment': {'val': 'VALUE_Bq/kg'}}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3585d711", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',\n", + " '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',\n", + " 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',\n", + " 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',\n", + " 'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',\n", + " 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"04be1fb8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KEYLATITUDE (ddmmmm)LATITUDE (dddddd)DATENUCLIDEVALUE_Bq/kgSEQUENCEUPPSLILOWSLI
0SKRIL201204859.459,666706/17/12 00:00:00RA22635.02012048.015.020.0
1SKRIL201204959.459,666706/17/12 00:00:00RA22636.02012049.020.027.0
186SKRIL201204859.459,666706/17/12 00:00:00CS1373.02012048.015.020.0
187SKRIL201204959.459,666706/17/12 00:00:00CS1371.02012049.020.027.0
562SKRIL201204859.459,666706/17/12 00:00:00RA22860.02012048.015.020.0
563SKRIL201204959.459,666706/17/12 00:00:00RA22859.02012049.020.027.0
825SKRIL201204859.459,666706/17/12 00:00:00K40980.02012048.015.020.0
826SKRIL201204959.459,666706/17/12 00:00:00K40950.02012049.020.027.0
\n", + "
" + ], + "text/plain": [ + " KEY LATITUDE (ddmmmm) LATITUDE (dddddd) DATE \\\n", + "0 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "1 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "186 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "187 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "562 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "563 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "825 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "826 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "\n", + " NUCLIDE VALUE_Bq/kg SEQUENCE UPPSLI LOWSLI \n", + "0 RA226 35.0 2012048.0 15.0 20.0 \n", + "1 RA226 36.0 2012049.0 20.0 27.0 \n", + "186 CS137 3.0 2012048.0 15.0 20.0 \n", + "187 CS137 1.0 2012049.0 20.0 27.0 \n", + "562 RA228 60.0 2012048.0 15.0 20.0 \n", + "563 RA228 59.0 2012049.0 20.0 27.0 \n", + "825 K40 980.0 2012048.0 15.0 20.0 \n", + "826 K40 950.0 2012049.0 20.0 27.0 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "# Same key => several nuclides\n", + "# Same lat, lon, date, nuclide => different values\n", + "# lat, lon, date, nuclide, 'upsli', 'lowsli' should be unique\n", + "df[['KEY', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'NUCLIDE', 'VALUE_Bq/kg', 'SEQUENCE', 'UPPSLI', 'LOWSLI']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbf47fbe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KEYLATITUDE (ddmmmm)LATITUDE (dddddd)DATENUCLIDEVALUE_Bq/kgSEQUENCEUPPSLILOWSLI
0SKRIL201204859.459,666706/17/12 00:00:00RA22635.02012048.015.020.0
1SKRIL201204959.459,666706/17/12 00:00:00RA22636.02012049.020.027.0
186SKRIL201204859.459,666706/17/12 00:00:00CS1373.02012048.015.020.0
187SKRIL201204959.459,666706/17/12 00:00:00CS1371.02012049.020.027.0
562SKRIL201204859.459,666706/17/12 00:00:00RA22860.02012048.015.020.0
563SKRIL201204959.459,666706/17/12 00:00:00RA22859.02012049.020.027.0
825SKRIL201204859.459,666706/17/12 00:00:00K40980.02012048.015.020.0
826SKRIL201204959.459,666706/17/12 00:00:00K40950.02012049.020.027.0
\n", + "
" + ], + "text/plain": [ + " KEY LATITUDE (ddmmmm) LATITUDE (dddddd) DATE \\\n", + "0 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "1 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "186 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "187 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "562 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "563 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "825 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "826 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "\n", + " NUCLIDE VALUE_Bq/kg SEQUENCE UPPSLI LOWSLI \n", + "0 RA226 35.0 2012048.0 15.0 20.0 \n", + "1 RA226 36.0 2012049.0 20.0 27.0 \n", + "186 CS137 3.0 2012048.0 15.0 20.0 \n", + "187 CS137 1.0 2012049.0 20.0 27.0 \n", + "562 RA228 60.0 2012048.0 15.0 20.0 \n", + "563 RA228 59.0 2012049.0 20.0 27.0 \n", + "825 K40 980.0 2012048.0 15.0 20.0 \n", + "826 K40 950.0 2012049.0 20.0 27.0 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "# Same sample, several nuclides\n", + "df_test = df[['KEY', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'NUCLIDE', 'VALUE_Bq/kg', 'SEQUENCE', 'UPPSLI', 'LOWSLI']]; df_test\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23031b91", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "# df_test.pivot_table(index=['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'YEAR', 'MONTH', 'DAY'],\n", + "# columns='NUCLIDE',\n", + "# values='VALUE_Bq/kg',\n", + "# fill_value=np.nan,\n", + "# aggfunc=lambda x: x).reset_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53cdf8c0", + "metadata": {}, + "outputs": [], + "source": [ + "# #| eval: false\n", + "# # Preprocess the data\n", + "# df_test['VALUE_Bq/kg'] = df_test['VALUE_Bq/kg'].fillna(-999)\n", + "\n", + "# # Then pivot\n", + "# pivoted = df_test.pivot_table(index=['LATITUDE (ddmmmm)', 'LATITUDE 
(dddddd)', 'DATE', 'YEAR', 'MONTH', 'DAY'],\n", + "# columns='NUCLIDE',\n", + "# values='VALUE_Bq/kg',\n", + "# aggfunc='first').reset_index()\n", + "\n", + "# # Replace -999 with 'Below Detection Limit' or any other indicator\n", + "# pivoted = pivoted.replace(-999, np.nan)\n", + "# pivoted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc75cf0f", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "# dfs['sediment'][['LATITUDE (ddmmm)', 'LONGITUDE (ddmmm)', '']].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30bd602a", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "# dfs['sediment'].head()" + ] + }, + { + "cell_type": "markdown", + "id": "5687eade", + "metadata": {}, + "source": [ + "## Add sample type column" + ] + }, + { + "cell_type": "markdown", + "id": "a984410e", + "metadata": {}, + "source": [ + "The sample types (`seawater`, `biota`, `sediment`, ...) as defined in `configs.ipynb` are encoded as group names in the NetCDF file produced. Sample type ids are added to the individual dataframes using the `AddSampleTypeIdColumnCB` callback for legacy purposes (i.e. Open Refine output)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf5ba759", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " KEY samptype_id\n", + "0 WKRIL2012003 1\n", + "1 WKRIL2012004 1\n", + "2 WKRIL2012005 1\n", + "3 WKRIL2012006 1\n", + "4 WKRIL2012007 1\n", + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of dropped rows 0 0 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "print(tfm()['seawater'][['KEY', 'samptype_id']].head())\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "142ddab3", + "metadata": {}, + "source": [ + "## Normalize nuclide names" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8a2311cd", + "metadata": {}, + "source": [ + "### Lower & strip nuclide names" + ] + }, + { + "cell_type": "markdown", + "id": "4b7b4ceb", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: Some nuclide names contain one or multiple trailing spaces.\n", + "\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "id": "64d84ed7", + "metadata": {}, + "source": [ + "This is demonstrated below for the `NUCLIDE` column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a2306ba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " index value n_chars stripped_chars\n", + "6 6 TC99 7 4\n", + "16 16 CS137 6 5\n", + "33 33 CS137 9 5\n", + "41 41 CS134 8 5\n", + "43 43 SR90 6 4\n", + "46 46 SR90 5 4\n", + "48 48 K40 8 3\n", + "49 49 PU238 8 5\n", + "64 64 CO60 8 4\n", + "65 
65 AM241 8 5\n", + "66 66 CS137 8 5\n", + "83 83 SR90 8 4\n", + "86 86 SR90 7 4\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "df = get_unique_across_dfs(load_data(fname_in), 'NUCLIDE', as_df=True, include_nchars=True)\n", + "df['stripped_chars'] = df['value'].str.strip().str.replace(' ', '').str.len()\n", + "print(df[df['n_chars'] != df['stripped_chars']])" + ] + }, + { + "cell_type": "markdown", + "id": "518174ba", + "metadata": {}, + "source": [ + "To fix this issue, we use the `LowerStripNameCB` callback. For each dataframe in the dictionary of dataframes, it corrects the nuclide name by converting it to lowercase, stripping any leading or trailing whitespace and ensuring the number comes before letters (e.g. `137cs`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a3fa068", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "seawater nuclides: \n", + "['cs137' 'sr90' 'h3' 'cs134' 'pu238' 'pu239240' 'am241' 'cm242' 'cm244'\n", + " 'tc99' 'k40' 'ru103' 'sr89' 'sb125' 'nb95' 'ru106' 'zr95' 'ag110m'\n", + " 'cm243244' 'ba140' 'ce144' 'u234' 'u238' 'co60' 'pu239' 'pb210' 'po210'\n", + " 'np237' 'pu240' 'mn54']\n", + "sediment nuclides: \n", + "['ra226' 'cs137' 'ra228' 'k40' 'sr90' 'cs134137' 'cs134' 'pu239240'\n", + " 'pu238' 'co60' 'ru103' 'ru106' 'sb125' 'ag110m' 'ce144' 'am241' 'be7'\n", + " 'th228' 'pb210' 'co58' 'mn54' 'zr95' 'ba140' 'po210' 'ra224' 'nb95'\n", + " 'pu238240' 'pu241' 'pu239' 'eu155' 'ir192' 'th232' 'cd109' 'sb124' 'zn65'\n", + " 'th234' 'tl208' 'pb212' 'pb214' 'bi214' 'ac228' 'ra223' 'u235' 'bi212']\n", + "biota nuclides: \n", + "['cs134' 'k40' 'co60' 'cs137' 'sr90' 'ag108m' 'mn54' 'co58' 'ag110m'\n", + " 'zn65' 'sb125' 'pu239240' 'ru106' 'be7' 'ce144' 'pb210' 'po210' 'sb124'\n", + " 'sr89' 'zr95' 'te129m' 'ru103' 'nb95' 'ce141' 'la140' 'i131' 'ba140'\n", + " 'pu238' 'u235' 'bi214' 'pb214' 'pb212' 'tl208' 'ac228' 'ra223' 'eu155'\n", + " 'ra226' 'gd153' 'sn113' 
'fe59' 'tc99' 'co57' 'sn117m' 'eu152' 'sc46'\n", + " 'rb86' 'ra224' 'th232' 'cs134137' 'am241' 'ra228' 'th228' 'k-40' 'cs138'\n", + " 'cs139' 'cs140' 'cs141' 'cs142' 'cs143' 'cs144' 'cs145' 'cs146']\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE')])\n", + "\n", + "for key in tfm().keys():\n", + " print(f'{key} nuclides: ')\n", + " print(tfm()[key]['NUCLIDE'].unique())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "52c9d0fe", + "metadata": {}, + "source": [ + "### Remap nuclide names to MARIS data formats" + ] + }, + { + "cell_type": "markdown", + "id": "a58baf14", + "metadata": {}, + "source": [ + "Below, we map the nuclide names used by HELCOM to the MARIS standard nuclide names. \n", + "\n", + "Remapping data provider nomenclatures into MARIS standards is a recurrent operation and is done in a semi-automated manner according to the following pattern:\n", + "\n", + "1. **Inspect** data provider nomenclature;\n", + "2. **Match** automatically against MARIS nomenclature (using a fuzzy matching algorithm); \n", + "3. **Fix** potential mismatches; \n", + "4. **Apply** the lookup table to the dataframe.\n", + "\n", + "From now on, we will use this pattern to remap the HELCOM data provider nomenclatures into MARIS standards and, for the sake of brevity, call it **IMFA** (**I**nspect, **M**atch, **F**ix, **A**pply)." + ] + }, + { + "cell_type": "markdown", + "id": "ae4b31bc", + "metadata": {}, + "source": [ + "We first inspect the unique values of the data provider nuclide names. `get_unique_across_dfs` is a utility function that retrieves the unique values of a specific column across all dataframes (recall that we have one dataframe per sample type - biota, ...)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e32ee8d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexvalue
00sb125
11ce141
22gd153
33ra226
44ra228
\n", + "
" + ], + "text/plain": [ + " index value\n", + "0 0 sb125\n", + "1 1 ce141\n", + "2 2 gd153\n", + "3 3 ra226\n", + "4 4 ra228" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE')])\n", + "dfs_output = tfm()\n", + "\n", + "get_unique_across_dfs(dfs_output, col_name='NUCLIDE', as_df=True).head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "614c1bdf", + "metadata": {}, + "source": [ + "Let's now create an instance of a fuzzy matching algorithm `Remapper`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcdbc619", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs_output, col_name='NUCLIDE', as_df=True),\n", + " maris_lut_fn=nuc_lut_path,\n", + " maris_col_id='nuclide_id',\n", + " maris_col_name='nc_name',\n", + " provider_col_to_match='value',\n", + " provider_col_key='value',\n", + " fname_cache='nuclides_helcom.pkl')" + ] + }, + { + "cell_type": "markdown", + "id": "f7e0ea0c", + "metadata": {}, + "source": [ + "And try to match HELCOM to MARIS nuclide names as automatically as possible. The `match_score` column allows to assess the results:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb645c29", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 77/77 [00:01<00:00, 49.22it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
matched_maris_namesource_namematch_score
source_key
cm243244cm244cm2432443
cs134137cs137cs1341373
pu238240pu240pu2382403
pu239240pu240pu2392403
cs143cs127cs1432
cs145cs136cs1452
cs142ce144cs1422
cs140ce140cs1401
k-40k40k-401
cs144ce144cs1441
cs141ce141cs1411
cs138cs137cs1381
cs139ce139cs1391
cs146cs136cs1461
\n", + "
" + ], + "text/plain": [ + " matched_maris_name source_name match_score\n", + "source_key \n", + "cm243244 cm244 cm243244 3\n", + "cs134137 cs137 cs134137 3\n", + "pu238240 pu240 pu238240 3\n", + "pu239240 pu240 pu239240 3\n", + "cs143 cs127 cs143 2\n", + "cs145 cs136 cs145 2\n", + "cs142 ce144 cs142 2\n", + "cs140 ce140 cs140 1\n", + "k-40 k40 k-40 1\n", + "cs144 ce144 cs144 1\n", + "cs141 ce141 cs141 1\n", + "cs138 cs137 cs138 1\n", + "cs139 ce139 cs139 1\n", + "cs146 cs136 cs146 1" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "remapper.generate_lookup_table(as_df=True)\n", + "remapper.select_match(match_score_threshold=1)" + ] + }, + { + "cell_type": "markdown", + "id": "4a5cb838", + "metadata": {}, + "source": [ + "We then manually inspect the remaining unmatched names and create a fixes table to map them to the correct MARIS standards:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60cf885b", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "fixes_nuclide_names = {\n", + " 'cs134137': 'cs134_137_tot',\n", + " 'cm243244': 'cm243_244_tot',\n", + " 'pu239240': 'pu239_240_tot',\n", + " 'pu238240': 'pu238_240_tot',\n", + " 'cs143': 'cs137',\n", + " 'cs145': 'cs137',\n", + " 'cs142': 'cs137',\n", + " 'cs141': 'cs137',\n", + " 'cs144': 'cs137',\n", + " 'k-40': 'k40',\n", + " 'cs140': 'cs137',\n", + " 'cs146': 'cs137',\n", + " 'cs139': 'cs137',\n", + " 'cs138': 'cs137'\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "dd575e7e", + "metadata": {}, + "source": [ + "Let's try to match again but this time we use the `fixes_nuclide_names` to map the nuclide names to the MARIS standards:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73410b14", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 77/77 [00:01<00:00, 52.46it/s]\n" 
+ ] + } + ], + "source": [ + "#| eval: false\n", + "remapper.generate_lookup_table(as_df=True, fixes=fixes_nuclide_names)\n", + "fc.test_eq(len(remapper.select_match(match_score_threshold=1)), 0)" + ] + }, + { + "cell_type": "markdown", + "id": "abd1276f", + "metadata": {}, + "source": [ + "Test passes! We can now create a callback `RemapNuclideNameCB` to remap the nuclide names. Note that we pass `overwrite=False` to `generate_lookup_table` so that the cached version is now used.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a189ef9", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "# Create a lookup table for nuclide names\n", + "lut_nuclides = lambda df: Remapper(provider_lut_df=df,\n", + " maris_lut_fn=nuc_lut_path,\n", + " maris_col_id='nuclide_id',\n", + " maris_col_name='nc_name',\n", + " provider_col_to_match='value',\n", + " provider_col_key='value',\n", + " fname_cache='nuclides_helcom.pkl').generate_lookup_table(fixes=fixes_nuclide_names, \n", + " as_df=False, overwrite=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03d47237", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class RemapNuclideNameCB(Callback):\n", + " \"Remap data provider nuclide names to MARIS nuclide names.\"\n", + " def __init__(self, \n", + " fn_lut: Callable # Function that returns the lookup table dictionary\n", + " ):\n", + " fc.store_attr()\n", + "\n", + " def __call__(self, tfm: Transformer):\n", + " df_uniques = get_unique_across_dfs(tfm.dfs, col_name='NUCLIDE', as_df=True)\n", + " lut = {k: v.matched_maris_name for k, v in self.fn_lut(df_uniques).items()} \n", + " for k in tfm.dfs.keys():\n", + " tfm.dfs[k]['NUCLIDE'] = tfm.dfs[k]['NUCLIDE'].replace(lut)" + ] + }, + { + "cell_type": "markdown", + "id": "ce649d7a", + "metadata": {}, + "source": [ + "Let's see it in action, along with the `LowerStripNameCB` callback:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "4c9a9ff7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['cs134', 'k40', 'co60', 'cs137', 'sr90', 'ag108m', 'mn54', 'co58',\n", + " 'ag110m', 'zn65', 'sb125', 'pu239_240_tot', 'ru106', 'be7',\n", + " 'ce144', 'pb210', 'po210', 'sb124', 'sr89', 'zr95', 'te129m',\n", + " 'ru103', 'nb95', 'ce141', 'la140', 'i131', 'ba140', 'pu238',\n", + " 'u235', 'bi214', 'pb214', 'pb212', 'tl208', 'ac228', 'ra223',\n", + " 'eu155', 'ra226', 'gd153', 'sn113', 'fe59', 'tc99', 'co57',\n", + " 'sn117m', 'eu152', 'sc46', 'rb86', 'ra224', 'th232',\n", + " 'cs134_137_tot', 'am241', 'ra228', 'th228'], dtype=object)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides)\n", + " ])\n", + "dfs_out = tfm()\n", + "\n", + "# For instance\n", + "dfs_out['biota'].NUCLIDE.unique()\n" + ] + }, + { + "cell_type": "markdown", + "id": "f91ba2d3", + "metadata": {}, + "source": [ + "### Add Nuclide Id column" + ] + }, + { + "cell_type": "markdown", + "id": "49a6c352", + "metadata": {}, + "source": [ + "The `nuclide_id` column is added to the dataframe for legacy reasons (again Open Refine output)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ec4271e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NUCLIDEnuclide_id
0cs13431
1k404
2co609
3cs13733
4cs13431
.........
15822k404
15823cs13733
15824be72
15825k404
15826cs13733
\n", + "

15827 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " NUCLIDE nuclide_id\n", + "0 cs134 31\n", + "1 k40 4\n", + "2 co60 9\n", + "3 cs137 33\n", + "4 cs134 31\n", + "... ... ...\n", + "15822 k40 4\n", + "15823 cs137 33\n", + "15824 be7 2\n", + "15825 k40 4\n", + "15826 cs137 33\n", + "\n", + "[15827 rows x 2 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE')\n", + " ])\n", + "dfs_out = tfm()\n", + "\n", + "# For instance\n", + "dfs_out['biota'][['NUCLIDE', 'nuclide_id']]" + ] + }, + { + "cell_type": "markdown", + "id": "02e9e1f4", + "metadata": {}, + "source": [ + "## Standardize Time" + ] + }, + { + "cell_type": "markdown", + "id": "24856dc5", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: Time/date is provided in the `DATE`, `YEAR`,\n", + "`MONTH`, `DAY` columns. Note that the `DATE` column contains missing values as indicated below. When missing, we fall back on the `YEAR`, `MONTH`, `DAY` columns. Note also that sometimes `DAY` and `MONTH` contain 0. 
In this case we systematically set them to 1.\n", + "\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "612873e6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "seawater DATE null values: 502\n", + "sediment DATE null values: 741\n", + "biota DATE null values: 72\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "for key in dfs.keys():\n", + " print(f'{key} DATE null values: ', dfs[key]['DATE'].isna().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae547a0c", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class ParseTimeCB(Callback):\n", + " \"Parse and standardize time information in the dataframe.\"\n", + " def __call__(self, tfm: Transformer):\n", + " for df in tfm.dfs.values():\n", + " self._process_dates(df)\n", + " self._define_beg_period(df)\n", + "\n", + " def _process_dates(self, df: pd.DataFrame) -> None:\n", + " \"Process and correct date and time information in the DataFrame.\"\n", + " df['time'] = self._parse_date(df)\n", + " self._handle_missing_dates(df)\n", + " self._fill_missing_time(df)\n", + "\n", + " def _parse_date(self, df: pd.DataFrame) -> pd.Series:\n", + " \"Parse the DATE column if present.\"\n", + " return pd.to_datetime(df['DATE'], format='%m/%d/%y %H:%M:%S', errors='coerce')\n", + "\n", + " def _handle_missing_dates(self, df: pd.DataFrame):\n", + " \"Handle cases where DAY or MONTH is 0 or missing.\"\n", + " df.loc[df[\"DAY\"] == 0, \"DAY\"] = 1\n", + " df.loc[df[\"MONTH\"] == 0, \"MONTH\"] = 1\n", + " \n", + " missing_day_month = (df[\"DAY\"].isna()) & (df[\"MONTH\"].isna()) & (df[\"YEAR\"].notna())\n", + " df.loc[missing_day_month, [\"DAY\", \"MONTH\"]] = 1\n", + "\n", + " def _fill_missing_time(self, df: pd.DataFrame) -> None:\n", + " \"Fill missing time values using YEAR, MONTH, and DAY columns.\"\n", + " missing_time = df['time'].isna()\n", + " 
df.loc[missing_time, 'time'] = pd.to_datetime(\n", + " df.loc[missing_time, ['YEAR', 'MONTH', 'DAY']], \n", + " format='%Y%m%d', \n", + " errors='coerce'\n", + " )\n", + "\n", + " def _define_beg_period(self, df: pd.DataFrame) -> None:\n", + " \"Create a standardized date representation for Open Refine.\"\n", + " df['begperiod'] = df['time']" + ] + }, + { + "cell_type": "markdown", + "id": "48c34819", + "metadata": {}, + "source": [ + "Apply the transformer with the `ParseTimeCB` callback. Then, print the `begperiod` and `time` data for `seawater`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2b90d07", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of dropped rows 0 0 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n", + " begperiod time\n", + "0 2012-05-23 2012-05-23\n", + "1 2012-05-23 2012-05-23\n", + "2 2012-06-17 2012-06-17\n", + "3 2012-05-24 2012-05-24\n", + "4 2012-05-24 2012-05-24\n", + "... ... ...\n", + "21211 2021-10-15 2021-10-15\n", + "21212 2021-11-04 2021-11-04\n", + "21213 2021-10-15 2021-10-15\n", + "21214 2021-05-17 2021-05-17\n", + "21215 2021-05-13 2021-05-13\n", + "\n", + "[21216 rows x 2 columns]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[ParseTimeCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", + "print(tfm.dfs['seawater'][['begperiod','time']])" + ] + }, + { + "cell_type": "markdown", + "id": "28dd488a", + "metadata": {}, + "source": [ + "The NetCDF time format requires the time to be encoded as a number of seconds since a time of origin. 
In our case, the time of origin is `1970-01-01`, as indicated in the `CONFIGS['units']['time']` dictionary of `configs.ipynb`." + ] + }, + { + "cell_type": "markdown", + "id": "486b2966", + "metadata": {}, + "source": [ + "`EncodeTimeCB` converts the HELCOM `time` format to the MARIS NetCDF `time` format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b8edc56", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8 of 21216 entries for `time` are invalid for seawater.\n", + "1 of 39817 entries for `time` are invalid for sediment.\n", + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21208 39816 15827\n", + "Number of dropped rows 8 1 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[ParseTimeCB(),\n", + " EncodeTimeCB(cfg(), verbose=True),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0be521de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KEYNUCLIDEMETHOD< VALUE_Bq/m³VALUE_Bq/m³ERROR%_m³DATE_OF_ENTRY_xCOUNTRYLABORATORYSEQUENCE...TDEPTHSDEPTHSALINTTEMPFILTMORS_SUBBASINHELCOM_SUBBASINDATE_OF_ENTRY_ytimebegperiod
0WKRIL2012003CS137NaNNaN5.332.00000008/20/14 00:00:0090.0KRIL2012003.0...NaN0.0NaNNaNNaN11.011.008/20/14 00:00:0013377312002012-05-23
1WKRIL2012004CS137NaNNaN19.920.00000008/20/14 00:00:0090.0KRIL2012004.0...NaN29.0NaNNaNNaN11.011.008/20/14 00:00:0013377312002012-05-23
2WKRIL2012005CS137NaNNaN25.520.00000008/20/14 00:00:0090.0KRIL2012005.0...NaN0.0NaNNaNNaN11.03.008/20/14 00:00:0013398912002012-06-17
3WKRIL2012006CS137NaNNaN17.029.00000008/20/14 00:00:0090.0KRIL2012006.0...NaN0.0NaNNaNNaN11.011.008/20/14 00:00:0013378176002012-05-24
4WKRIL2012007CS137NaNNaN22.218.00000008/20/14 00:00:0090.0KRIL2012007.0...NaN39.0NaNNaNNaN11.011.008/20/14 00:00:0013378176002012-05-24
..................................................................
21211WSSSM2021005H3SSM45NaN1030.093.20388309/06/22 00:00:0077.0SSSM202105.0...NaN1.0NaNNaNN1.08.009/06/22 00:00:0016342560002021-10-15
21212WSSSM2021006H3SSM45NaN2240.043.30357109/06/22 00:00:0077.0SSSM202106.0...NaN1.0NaNNaNN10.010.009/06/22 00:00:0016359840002021-11-04
21213WSSSM2021007H3SSM45NaN2060.047.08737909/06/22 00:00:0077.0SSSM202107.0...NaN1.0NaNNaNN12.012.009/06/22 00:00:0016342560002021-10-15
21214WSSSM2021008H3SSM45NaN2300.043.47826109/06/22 00:00:0077.0SSSM202108.0...NaN1.0NaNNaNN12.012.009/06/22 00:00:0016212096002021-05-17
21215WSSSM2021004H3SSM45<NaNNaN09/06/22 00:00:0077.0SSSM202104.0...NaN1.0NaNNaNN15.018.009/06/22 00:00:0016208640002021-05-13
\n", + "

21208 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " KEY NUCLIDE METHOD < VALUE_Bq/m³ VALUE_Bq/m³ ERROR%_m³ \\\n", + "0 WKRIL2012003 CS137 NaN NaN 5.3 32.000000 \n", + "1 WKRIL2012004 CS137 NaN NaN 19.9 20.000000 \n", + "2 WKRIL2012005 CS137 NaN NaN 25.5 20.000000 \n", + "3 WKRIL2012006 CS137 NaN NaN 17.0 29.000000 \n", + "4 WKRIL2012007 CS137 NaN NaN 22.2 18.000000 \n", + "... ... ... ... ... ... ... \n", + "21211 WSSSM2021005 H3 SSM45 NaN 1030.0 93.203883 \n", + "21212 WSSSM2021006 H3 SSM45 NaN 2240.0 43.303571 \n", + "21213 WSSSM2021007 H3 SSM45 NaN 2060.0 47.087379 \n", + "21214 WSSSM2021008 H3 SSM45 NaN 2300.0 43.478261 \n", + "21215 WSSSM2021004 H3 SSM45 < NaN NaN \n", + "\n", + " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... TDEPTH SDEPTH \\\n", + "0 08/20/14 00:00:00 90.0 KRIL 2012003.0 ... NaN 0.0 \n", + "1 08/20/14 00:00:00 90.0 KRIL 2012004.0 ... NaN 29.0 \n", + "2 08/20/14 00:00:00 90.0 KRIL 2012005.0 ... NaN 0.0 \n", + "3 08/20/14 00:00:00 90.0 KRIL 2012006.0 ... NaN 0.0 \n", + "4 08/20/14 00:00:00 90.0 KRIL 2012007.0 ... NaN 39.0 \n", + "... ... ... ... ... ... ... ... \n", + "21211 09/06/22 00:00:00 77.0 SSSM 202105.0 ... NaN 1.0 \n", + "21212 09/06/22 00:00:00 77.0 SSSM 202106.0 ... NaN 1.0 \n", + "21213 09/06/22 00:00:00 77.0 SSSM 202107.0 ... NaN 1.0 \n", + "21214 09/06/22 00:00:00 77.0 SSSM 202108.0 ... NaN 1.0 \n", + "21215 09/06/22 00:00:00 77.0 SSSM 202104.0 ... NaN 1.0 \n", + "\n", + " SALIN TTEMP FILT MORS_SUBBASIN HELCOM_SUBBASIN DATE_OF_ENTRY_y \\\n", + "0 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", + "1 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", + "2 NaN NaN NaN 11.0 3.0 08/20/14 00:00:00 \n", + "3 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", + "4 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", + "... ... ... ... ... ... ... 
\n", + "21211 NaN NaN N 1.0 8.0 09/06/22 00:00:00 \n", + "21212 NaN NaN N 10.0 10.0 09/06/22 00:00:00 \n", + "21213 NaN NaN N 12.0 12.0 09/06/22 00:00:00 \n", + "21214 NaN NaN N 12.0 12.0 09/06/22 00:00:00 \n", + "21215 NaN NaN N 15.0 18.0 09/06/22 00:00:00 \n", + "\n", + " time begperiod \n", + "0 1337731200 2012-05-23 \n", + "1 1337731200 2012-05-23 \n", + "2 1339891200 2012-06-17 \n", + "3 1337817600 2012-05-24 \n", + "4 1337817600 2012-05-24 \n", + "... ... ... \n", + "21211 1634256000 2021-10-15 \n", + "21212 1635984000 2021-11-04 \n", + "21213 1634256000 2021-10-15 \n", + "21214 1621209600 2021-05-17 \n", + "21215 1620864000 2021-05-13 \n", + "\n", + "[21208 rows x 29 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfm.dfs['seawater']" + ] + }, + { + "cell_type": "markdown", + "id": "69ef4f4b", + "metadata": {}, + "source": [ + "## Sanitize value" + ] + }, + { + "cell_type": "markdown", + "id": "6de49e39", + "metadata": {}, + "source": [ + "We allocate each column containing measurement values (named differently across sample types as `unit` are mentioned as well in column names) into a single column `value` and remove NA where needed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8580f592", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "coi_val = {'seawater' : {'val': 'VALUE_Bq/m³'},\n", + " 'biota': {'val': 'VALUE_Bq/kg'},\n", + " 'sediment': {'val': 'VALUE_Bq/kg'}}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "def0a599", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class SanitizeValue(Callback):\n", + " \"Sanitize value/measurement by removing blank entries and populating `value` column.\"\n", + " def __init__(self, \n", + " coi: Dict[str, Dict[str, str]] # Columns of interest. 
Format: {group_name: {'val': 'column_name'}}\n", + " ): \n", + " fc.store_attr()\n", + "\n", + " def __call__(self, tfm: Transformer):\n", + " for grp, df in tfm.dfs.items():\n", + " value_col = self.coi[grp]['val']\n", + " df.dropna(subset=[value_col], inplace=True)\n", + " df['value'] = df[value_col]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bccb7a50", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21122 39532 15798\n", + "Number of dropped rows 94 285 29\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[SanitizeValue(coi_val),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "be199c49", + "metadata": {}, + "source": [ + "## Normalize uncertainty" + ] + }, + { + "cell_type": "markdown", + "id": "7515714b", + "metadata": {}, + "source": [ + "Function `unc_rel2stan` converts uncertainty from relative uncertainty to standard uncertainty." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76077d40", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def unc_rel2stan(\n", + " df: pd.DataFrame, # DataFrame containing measurement and uncertainty columns\n", + " meas_col: str, # Name of the column with measurement values\n", + " unc_col: str # Name of the column with relative uncertainty values (percentages)\n", + ") -> pd.Series: # Series with calculated absolute uncertainties\n", + " \"Convert relative uncertainty to absolute uncertainty.\"\n", + " return df.apply(lambda row: row[unc_col] * row[meas_col] / 100, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "2917d107", + "metadata": {}, + "source": [ + "For each sample type in the HELCOM dataset, the uncertainty is given as a relative uncertainty. The column names for both the value and the uncertainty vary by sample type. The `coi_units_unc` list defines, for each sample type, the names of the value and uncertainty columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b231b09b", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "# Columns of interest\n", + "coi_units_unc = [('seawater', 'VALUE_Bq/m³', 'ERROR%_m³'),\n", + " ('biota', 'VALUE_Bq/kg', 'ERROR%'),\n", + " ('sediment', 'VALUE_Bq/kg', 'ERROR%_kg')]" + ] + }, + { + "cell_type": "markdown", + "id": "f20c9a4b", + "metadata": {}, + "source": [ + "The `NormalizeUncCB` callback normalizes the uncertainty by converting from relative uncertainty to standard uncertainty. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cf262ed", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class NormalizeUncCB(Callback):\n", + " \"Convert from relative error % to uncertainty of activity unit.\"\n", + " def __init__(self, \n", + " fn_convert_unc: Callable=unc_rel2stan, # Function converting relative uncertainty to absolute uncertainty\n", + " coi: List[Tuple[str, str, str]]=coi_units_unc # List of columns of interest\n", + " ):\n", + " fc.store_attr()\n", + " \n", + " def __call__(self, tfm: Transformer):\n", + " for grp, val, unc in self.coi:\n", + " if grp in tfm.dfs:\n", + " df = tfm.dfs[grp]\n", + " df['uncertainty'] = self.fn_convert_unc(df, val, unc)" + ] + }, + { + "cell_type": "markdown", + "id": "8545b262", + "metadata": {}, + "source": [ + "Apply the transformer for callback NormalizeUncCB(). Then, print the value (i.e. activity per unit ) and standard uncertainty for each sample type." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd9e14e2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " value uncertainty\n", + "0 5.3 1.696\n", + "1 19.9 3.980\n", + "2 25.5 5.100\n", + "3 17.0 4.930\n", + "4 22.2 3.996\n", + " value uncertainty\n", + "0 0.010140 NaN\n", + "1 135.300000 4.830210\n", + "2 0.013980 NaN\n", + "3 4.338000 0.150962\n", + "4 0.009614 NaN\n", + " value uncertainty\n", + "0 35.0 9.10\n", + "1 36.0 7.92\n", + "2 38.0 9.12\n", + "3 36.0 9.00\n", + "4 30.0 6.90\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[NormalizeUncCB(),\n", + " SanitizeValue(coi_val)])\n", + "\n", + "print(tfm()['seawater'][['value', 'uncertainty']][:5])\n", + "print(tfm()['biota'][['value', 'uncertainty']][:5])\n", + "print(tfm()['sediment'][['value', 'uncertainty']][:5])" + ] + }, + { + "cell_type": "markdown", + "id": "9392b0cb", + "metadata": {}, + "source": [ + 
"## Remap Biota species" + ] + }, + { + "cell_type": "markdown", + "id": "abd63300", + "metadata": {}, + "source": [ + "In the following processing steps, we follow the same approach as for the remapping of nuclide names above." + ] + }, + { + "cell_type": "markdown", + "id": "02e7dbf2", + "metadata": {}, + "source": [ + "Let's inspect the `RUBIN_NAME.csv` file provided by HELCOM describing the biota species nomenclature." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb121e8b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RUBIN_IDRUBINSCIENTIFIC NAMEENGLISH NAME
011ABRA BRAABRAMIS BRAMABREAM
112ANGU ANGANGUILLA ANGUILLAEEL
213ARCT ISLARCTICA ISLANDICAISLAND CYPRINE
314ASTE RUBASTERIAS RUBENSCOMMON STARFISH
415CARD EDUCARDIUM EDULECOCKLE
\n", + "
" + ], + "text/plain": [ + " RUBIN_ID RUBIN SCIENTIFIC NAME ENGLISH NAME\n", + "0 11 ABRA BRA ABRAMIS BRAMA BREAM\n", + "1 12 ANGU ANG ANGUILLA ANGUILLA EEL\n", + "2 13 ARCT ISL ARCTICA ISLANDICA ISLAND CYPRINE\n", + "3 14 ASTE RUB ASTERIAS RUBENS COMMON STARFISH\n", + "4 15 CARD EDU CARDIUM EDULE COCKLE" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv').head()" + ] + }, + { + "cell_type": "markdown", + "id": "3ec2bd53", + "metadata": {}, + "source": [ + "We try to remap the `SCIENTIFIC NAME` column to the `species` column of the MARIS nomenclature, again using a `Remapper` object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da393947", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 46/46 [00:06<00:00, 6.94it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
matched_maris_namesource_namematch_score
source_key
STIZ LUCSander luciopercaSTIZOSTEDION LUCIOPERCA10
LAMI SACLaminaria japonicaLAMINARIA SACCHARINA7
CARD EDUCardiidaeCARDIUM EDULE6
ENCH CIMEchinodermataENCHINODERMATA CIM5
PSET MAXPinctada maximaPSETTA MAXIMA5
MACO BALMacoma balthicaMACOMA BALTICA1
STUC PECStuckenia pectinataSTUCKENIA PECTINATE1
\n", + "
" + ], + "text/plain": [ + " matched_maris_name source_name match_score\n", + "source_key \n", + "STIZ LUC Sander lucioperca STIZOSTEDION LUCIOPERCA 10\n", + "LAMI SAC Laminaria japonica LAMINARIA SACCHARINA 7\n", + "CARD EDU Cardiidae CARDIUM EDULE 6\n", + "ENCH CIM Echinodermata ENCHINODERMATA CIM 5\n", + "PSET MAX Pinctada maxima PSETTA MAXIMA 5\n", + "MACO BAL Macoma balthica MACOMA BALTICA 1\n", + "STUC PEC Stuckenia pectinata STUCKENIA PECTINATE 1" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "remapper = Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv'),\n", + " maris_lut_fn=species_lut_path,\n", + " maris_col_id='species_id',\n", + " maris_col_name='species',\n", + " provider_col_to_match='SCIENTIFIC NAME',\n", + " provider_col_key='RUBIN',\n", + " fname_cache='species_helcom.pkl'\n", + " )\n", + "\n", + "remapper.generate_lookup_table(as_df=True)\n", + "remapper.select_match(match_score_threshold=1)" + ] + }, + { + "cell_type": "markdown", + "id": "e592a7a9", + "metadata": {}, + "source": [ + "We fix below some of the entries that are not properly matched by the `Remapper` object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e31a799", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "fixes_biota_species = {\n", + " 'CARDIUM EDULE': 'Cerastoderma edule',\n", + " 'LAMINARIA SACCHARINA': 'Saccharina latissima',\n", + " 'PSETTA MAXIMA': 'Scophthalmus maximus',\n", + " 'STIZOSTEDION LUCIOPERCA': 'Sander luciopercas'}" + ] + }, + { + "cell_type": "markdown", + "id": "f7d1d994", + "metadata": {}, + "source": [ + "And give it another try:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a70225a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 46/46 [00:07<00:00, 5.79it/s]\n" + ] + }, + { + "data": {
+ "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
matched_maris_namesource_namematch_score
source_key
ENCH CIMEchinodermataENCHINODERMATA CIM5
MACO BALMacoma balthicaMACOMA BALTICA1
STIZ LUCSander luciopercaSTIZOSTEDION LUCIOPERCA1
STUC PECStuckenia pectinataSTUCKENIA PECTINATE1
\n", + "
" + ], + "text/plain": [ + " matched_maris_name source_name match_score\n", + "source_key \n", + "ENCH CIM Echinodermata ENCHINODERMATA CIM 5\n", + "MACO BAL Macoma balthica MACOMA BALTICA 1\n", + "STIZ LUC Sander lucioperca STIZOSTEDION LUCIOPERCA 1\n", + "STUC PEC Stuckenia pectinata STUCKENIA PECTINATE 1" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "remapper.generate_lookup_table(fixes=fixes_biota_species)\n", + "remapper.select_match(match_score_threshold=1)" + ] + }, + { + "cell_type": "markdown", + "id": "e6f49b32", + "metadata": {}, + "source": [ + "On visual inspection, the remaining imperfectly matched entries seem acceptable, so we can proceed. \n", + "\n", + "We can now use the generic `RemapCB` callback to perform the remapping of the `RUBIN` column to the `species` column after having defined the lookup table `lut_biota`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccd6c46e", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "lut_biota = lambda: Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv'),\n", + " maris_lut_fn=species_lut_path,\n", + " maris_col_id='species_id',\n", + " maris_col_name='species',\n", + " provider_col_to_match='SCIENTIFIC NAME',\n", + " provider_col_key='RUBIN',\n", + " fname_cache='species_helcom.pkl'\n", + " ).generate_lookup_table(fixes=fixes_biota_species, as_df=False, overwrite=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b83ffe12", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 99 243 50 139 270 192 191 284 84 269 122 96 287 279\n", + " 278 288 286 244 129 275 271 285 283 247 120 59 280 274\n", + " 273 290 289 272 277 276 21 282 110 281 245 704 1524 703\n", + " 1611 621 60]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs,
cbs=[\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota')\n", + " ])\n", + "\n", + "# For instance:\n", + "print(tfm()['biota']['species'].unique())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2c74e492", + "metadata": {}, + "source": [ + "## Remap Biota tissues\n", + "Let's inspect the `TISSUE.csv` file provided by HELCOM describing the tissue nomenclature. Biota tissue is known as `body part` in the maris data set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a38df50b-46a9-4a2d-9379-e670eb0d0bb6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TISSUETISSUE_DESCRIPTION
01WHOLE FISH
12WHOLE FISH WITHOUT ENTRAILS
23WHOLE FISH WITHOUT HEAD AND ENTRAILS
34FLESH WITH BONES
45FLESH WITHOUT BONES (FILETS)
\n", + "
" + ], + "text/plain": [ + " TISSUE TISSUE_DESCRIPTION\n", + "0 1 WHOLE FISH\n", + "1 2 WHOLE FISH WITHOUT ENTRAILS\n", + "2 3 WHOLE FISH WITHOUT HEAD AND ENTRAILS\n", + "3 4 FLESH WITH BONES\n", + "4 5 FLESH WITHOUT BONES (FILETS)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "pd.read_csv('../../_data/accdb/mors/csv/TISSUE.csv').head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2613f239", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 0%| | 0/29 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
matched_maris_namesource_namematch_score
source_key
3Flesh without bonesWHOLE FISH WITHOUT HEAD AND ENTRAILS20
2Flesh without bonesWHOLE FISH WITHOUT ENTRAILS13
8Soft partsSKIN/EPIDERMIS10
5Flesh without bonesFLESH WITHOUT BONES (FILETS)9
1Whole animalWHOLE FISH5
12BrainENTRAILS5
15Stomach and intestineSTOMACH + INTESTINE3
41Whole animalWHOLE ANIMALS1
\n", + "" + ], + "text/plain": [ + " matched_maris_name source_name \\\n", + "source_key \n", + "3 Flesh without bones WHOLE FISH WITHOUT HEAD AND ENTRAILS \n", + "2 Flesh without bones WHOLE FISH WITHOUT ENTRAILS \n", + "8 Soft parts SKIN/EPIDERMIS \n", + "5 Flesh without bones FLESH WITHOUT BONES (FILETS) \n", + "1 Whole animal WHOLE FISH \n", + "12 Brain ENTRAILS \n", + "15 Stomach and intestine STOMACH + INTESTINE \n", + "41 Whole animal WHOLE ANIMALS \n", + "\n", + " match_score \n", + "source_key \n", + "3 20 \n", + "2 13 \n", + "8 10 \n", + "5 9 \n", + "1 5 \n", + "12 5 \n", + "15 3 \n", + "41 1 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "remapper = Remapper(provider_lut_df=pd.read_csv('../../_data/accdb/mors/csv/TISSUE.csv'),\n", + " maris_lut_fn=bodyparts_lut_path,\n", + " maris_col_id='bodypar_id',\n", + " maris_col_name='bodypar',\n", + " provider_col_to_match='TISSUE_DESCRIPTION',\n", + " provider_col_key='TISSUE',\n", + " fname_cache='tissues_helcom.pkl'\n", + " )\n", + "\n", + "remapper.generate_lookup_table(as_df=True)\n", + "remapper.select_match(match_score_threshold=1)" + ] + }, + { + "cell_type": "markdown", + "id": "0fee1bb9", + "metadata": {}, + "source": [ + "We fix below some of the entries that are not properly matched by the `Remapper` object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6e2b06f-5eb1-4708-8087-75c836f08112", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "fixes_biota_tissues = {\n", + " 'WHOLE FISH WITHOUT HEAD AND ENTRAILS': 'Whole animal eviscerated without head',\n", + " 'ENTRAILS': 'Viscera',\n", + " 'SKIN/EPIDERMIS': 'Skin'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c07fc4b8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 29/29 [00:00<00:00, 137.97it/s]\n" + ] + }, 
+ { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
matched_maris_namesource_namematch_score
source_key
2Flesh without bonesWHOLE FISH WITHOUT ENTRAILS13
5Flesh without bonesFLESH WITHOUT BONES (FILETS)9
1Whole animalWHOLE FISH5
15Stomach and intestineSTOMACH + INTESTINE3
41Whole animalWHOLE ANIMALS1
\n", + "
" + ], + "text/plain": [ + " matched_maris_name source_name match_score\n", + "source_key \n", + "2 Flesh without bones WHOLE FISH WITHOUT ENTRAILS 13\n", + "5 Flesh without bones FLESH WITHOUT BONES (FILETS) 9\n", + "1 Whole animal WHOLE FISH 5\n", + "15 Stomach and intestine STOMACH + INTESTINE 3\n", + "41 Whole animal WHOLE ANIMALS 1" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "remapper.generate_lookup_table(as_df=True, fixes=fixes_biota_tissues)\n", + "remapper.select_match(match_score_threshold=1)" + ] + }, + { + "cell_type": "markdown", + "id": "6ef75cb1", + "metadata": {}, + "source": [ + "On visual inspection, the remaining imperfectly matched entries seem acceptable, so we can proceed. \n", + "\n", + "We can now use the generic `RemapCB` callback to perform the remapping of the `TISSUE` column to the `body_part` column after having defined the lookup table `lut_tissues`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c42eb30", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "lut_tissues = lambda: Remapper(provider_lut_df=pd.read_csv('../../_data/accdb/mors/csv/TISSUE.csv'),\n", + " maris_lut_fn=bodyparts_lut_path,\n", + " maris_col_id='bodypar_id',\n", + " maris_col_name='bodypar',\n", + " provider_col_to_match='TISSUE_DESCRIPTION',\n", + " provider_col_key='TISSUE',\n", + " fname_cache='tissues_helcom.pkl'\n", + " ).generate_lookup_table(fixes=fixes_biota_tissues, as_df=False, overwrite=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d1887c6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " TISSUE body_part\n", + "0 5 52\n", + "1 5 52\n", + "2 5 52\n", + "3 5 52\n", + "4 5 52\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " RemapCB(fn_lut=lut_biota,
col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota')\n", + " ])\n", + "\n", + "print(tfm()['biota'][['TISSUE', 'body_part']][:5])\n" + ] + }, + { + "cell_type": "markdown", + "id": "cc596011", + "metadata": {}, + "source": [ + "## Remap biogroup" + ] + }, + { + "cell_type": "markdown", + "id": "da42ebe6", + "metadata": {}, + "source": [ + "`lut_biogroup` (defined below using `get_lut`) reads the file at `species_lut_path()` and, from the contents of this file, creates a dictionary linking `species_id` to `biogroup_id`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf290302", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "lut_biogroup = lambda: get_lut(species_lut_path().parent, species_lut_path().name, \n", + " key='species_id', value='biogroup_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a37157", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 4 2 14 11 8 3]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota')\n", + " ])\n", + "\n", + "print(tfm()['biota']['bio_group'].unique())\n" + ] + }, + { + "cell_type": "markdown", + "id": "2bea8647", + "metadata": {}, + "source": [ + "## Remap Taxon Information\n", + "Currently, the details (`Taxonname`, `TaxonRepName`, `Taxonrank`) are used for importing into the MARIS master database, but they are not included in the NetCDF encoding. \n", + "\n", + "We first need to retrieve the taxon information from the `dbo_species.xlsx` file."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "324d52dc", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "# TODO: Include Commonname field after next MARIS data reconciling process.\n", + "def get_taxon_info_lut(\n", + " maris_lut:str # Path to the MARIS lookup table (Excel file)\n", + ") -> dict: # A dictionary mapping species_id to biogroup_id\n", + " \"Retrieve a lookup table for Taxonname from a MARIS lookup table.\"\n", + " species = pd.read_excel(maris_lut)\n", + " return species[['species_id', 'Taxonname', 'Taxonrank','TaxonDB','TaxonDBID','TaxonDBURL']].set_index('species_id').to_dict()\n", + "\n", + "lut_taxon = lambda: get_taxon_info_lut(species_lut_path())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b04111c3", + "metadata": {}, + "outputs": [], + "source": [ + "# | exports\n", + "class RemapTaxonInformationCB(Callback):\n", + " \"Update taxon information based on MARIS species LUT.\"\n", + " def __init__(self, fn_lut: Callable):\n", + " self.fn_lut = fn_lut\n", + "\n", + " def __call__(self, tfm: Transformer):\n", + " lut = self.fn_lut()\n", + " df = tfm.dfs['biota']\n", + " \n", + " df['TaxonRepName'] = df.get('RUBIN', 'Unknown')\n", + " \n", + " taxon_columns = ['Taxonname', 'Taxonrank', 'TaxonDB', 'TaxonDBID', 'TaxonDBURL']\n", + " for col in taxon_columns:\n", + " df[col] = df['species'].map(lut[col]).fillna('Unknown')\n", + " \n", + " unmatched = df[df['Taxonname'] == 'Unknown']['species'].unique()\n", + " if len(unmatched) > 0:\n", + " print(f\"Unmatched species IDs: {', '.join(unmatched)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40c7c54e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " TaxonRepName Taxonname Taxonrank TaxonDB TaxonDBID \\\n", + "0 GADU MOR Gadus morhua species Wikidata Q199788 \n", + "40 SPRA SPR Sprattus sprattus species Wikidata Q506823 \n", + "44 CLUP HAR Clupea 
harengus species Wikidata Q2396858 \n", + "77 MERL MNG Merlangius merlangus species Wikidata Q273083 \n", + "78 LIMA LIM Limanda limanda species Wikidata Q1135526 \n", + "\n", + " TaxonDBURL \n", + "0 https://www.wikidata.org/wiki/Q199788 \n", + "40 https://www.wikidata.org/wiki/Q506823 \n", + "44 https://www.wikidata.org/wiki/Q2396858 \n", + "77 https://www.wikidata.org/wiki/Q273083 \n", + "78 https://www.wikidata.org/wiki/Q1135526 \n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[ \n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", + " RemapTaxonInformationCB(lut_taxon)\n", + " ])\n", + "tfm()\n", + "print(tfm.dfs['biota'][['TaxonRepName', 'Taxonname', 'Taxonrank',\n", + " 'TaxonDB','TaxonDBID','TaxonDBURL']].drop_duplicates().head())" + ] + }, + { + "cell_type": "markdown", + "id": "adcf607d", + "metadata": {}, + "source": [ + "## Remap Sediment types\n", + "We use again the same **IMFA** (Inspect, Match, Fix, Apply) pattern to remap the HELCOM sediment types." + ] + }, + { + "cell_type": "markdown", + "id": "0f938d40", + "metadata": {}, + "source": [ + "Let's inspect the `SEDIMENT_TYPE.csv` file provided by HELCOM describing the sediment type nomenclature:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5f6b82a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SEDISEDIMENT TYPERECOMMENDED TO BE USED
0-99NO DATANaN
10GRAVELYES
21SANDYES
32FINE SANDNO
43SILTYES
\n", + "
" + ], + "text/plain": [ + " SEDI SEDIMENT TYPE RECOMMENDED TO BE USED\n", + "0 -99 NO DATA NaN\n", + "1 0 GRAVEL YES\n", + "2 1 SAND YES\n", + "3 2 FINE SAND NO\n", + "4 3 SILT YES" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv').head()" + ] + }, + { + "cell_type": "markdown", + "id": "05762600", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: The `SEDI` values `56` and `73` are not found in the `SEDIMENT_TYPE.csv` lookup table provided. Note also there are many `nan` values in the `SEDIMENT_TYPE.csv` file.\n", + "\n", + "We reassign them to `-99` for now but should be clarified/fixed. This is demonstrated below.\n", + "\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbc6540f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing SEDI values: {56.0, 73.0, nan}\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "df_sed_lut = pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv')\n", + "dfs = load_data(fname_in)\n", + "\n", + "sediment_sedi = set(dfs['sediment'].SEDI.unique())\n", + "lookup_sedi = set(df_sed_lut['SEDI'])\n", + "missing = sediment_sedi - lookup_sedi\n", + "print(f\"Missing SEDI values: {missing if missing else 'None'}\")" + ] + }, + { + "cell_type": "markdown", + "id": "34f305d9", + "metadata": {}, + "source": [ + "Let's try to match as many as possible:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac413a89", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 47/47 [00:00<00:00, 141.72it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
matched_maris_namesource_namematch_score
source_key
-99SoftNO DATA5
50Mud and gravelMUD AND GARVEL2
46Glacial clayCLACIAL CLAY1
\n", + "
" + ], + "text/plain": [ + " matched_maris_name source_name match_score\n", + "source_key \n", + "-99 Soft NO DATA 5\n", + " 50 Mud and gravel MUD AND GARVEL 2\n", + " 46 Glacial clay CLACIAL CLAY 1" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "remapper = Remapper(provider_lut_df=pd.read_csv(Path(fname_in)/'SEDIMENT_TYPE.csv'),\n", + " maris_lut_fn=sediments_lut_path,\n", + " maris_col_id='sedtype_id',\n", + " maris_col_name='sedtype',\n", + " provider_col_to_match='SEDIMENT TYPE',\n", + " provider_col_key='SEDI',\n", + " fname_cache='sediments_helcom.pkl'\n", + " )\n", + "\n", + "remapper.generate_lookup_table(as_df=True)\n", + "remapper.select_match(match_score_threshold=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9bbc268", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "fixes_sediments = {\n", + " 'NO DATA': '(Not available)'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10fd41a0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 47/47 [00:00<00:00, 102.45it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
matched_maris_namesource_namematch_score
source_key
50Mud and gravelMUD AND GARVEL2
46Glacial clayCLACIAL CLAY1
\n", + "
" + ], + "text/plain": [ + " matched_maris_name source_name match_score\n", + "source_key \n", + "50 Mud and gravel MUD AND GARVEL 2\n", + "46 Glacial clay CLACIAL CLAY 1" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "remapper.generate_lookup_table(as_df=True, fixes=fixes_sediments)\n", + "remapper.select_match(match_score_threshold=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cad7ec2-97fd-43a8-83cb-c965ae89efde", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class RemapSedimentCB(Callback):\n", + " \"Update sediment id based on MARIS species LUT (dbo_sedtype.xlsx).\"\n", + " def __init__(self, \n", + " fn_lut: Callable, # Function that returns the lookup table dictionary\n", + " ):\n", + " fc.store_attr()\n", + "\n", + " def _fix_inconsistent_sedi(self, df:pd.DataFrame) -> pd.DataFrame:\n", + " \"Temporary fix for inconsistent SEDI values. Data provider to confirm and clarify.\"\n", + " df['SEDI'] = df['SEDI'].replace({56: -99, 73: -99, np.nan: -99})\n", + " return df\n", + " \n", + " def __call__(self, tfm: Transformer):\n", + " \"Remap sediment types in the DataFrame using the lookup table and handle specific replacements.\"\n", + " lut = self.fn_lut()\n", + " \n", + " # Set SedRepName (TBC: what's used for?)\n", + " tfm.dfs['sediment']['SedRepName'] = tfm.dfs['sediment']['SEDI'] \n", + " \n", + " tfm.dfs['sediment'] = self._fix_inconsistent_sedi(tfm.dfs['sediment'])\n", + " tfm.dfs['sediment']['sed_type'] = tfm.dfs['sediment']['SEDI'].apply(lambda x: self._get_sediment_type(x, lut))\n", + "\n", + " def _get_sediment_type(self, \n", + " sedi_value: int, # The `SEDI` value from the DataFrame\n", + " lut: dict # The lookup table dictionary\n", + " ) -> Match: # The Match object\n", + " \"Get the matched_id from the lookup table and print SEDI if the matched_id is -1.\"\n", + " match = lut.get(sedi_value, Match(-1, None, 
None, None))\n", + " \n", + " if match.matched_id == -1:\n", + " self._print_unmatched_sedi(sedi_value)\n", + " return match.matched_id\n", + "\n", + " def _print_unmatched_sedi(self, \n", + " sedi_value: int # The `SEDI` value from the DataFram\n", + " ) -> None:\n", + " \"Print the SEDI value if the matched_id is -1.\"\n", + " print(f\"Unmatched SEDI: {sedi_value}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25a4ff58", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "lut_sediments = lambda: Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv'),\n", + " maris_lut_fn=sediments_lut_path,\n", + " maris_col_id='sedtype_id',\n", + " maris_col_name='sedtype',\n", + " provider_col_to_match='SEDIMENT TYPE',\n", + " provider_col_key='SEDI',\n", + " fname_cache='sediments_helcom.pkl'\n", + " ).generate_lookup_table(fixes=fixes_sediments, as_df=False, overwrite=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f131e929", + "metadata": {}, + "source": [ + "Apply the transformer with the `RemapSedimentCB(lut_sediments)` callback. Then, print the unique `sed_type` values of the `sediment` dataframe."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16d42cb0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 2, 58, 30, 59, 55, 56, 36, 29, 47, 4, 54, 33, 6, 44, 42, 48,\n", + " 61, 57, 28, 49, 32, 45, 39, 46, 38, 31, 60, 62, 26, 53, 52, 1, 51,\n", + " 37, 34, 50, 7, 10, 41, 43, 35])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[RemapSedimentCB(lut_sediments)])\n", + "\n", + "tfm()['sediment']['sed_type'].unique()" + ] + }, + { + "cell_type": "markdown", + "id": "c3a0add1", + "metadata": {}, + "source": [ + "## Remap units" + ] + }, + { + "cell_type": "markdown", + "id": "7a4064ed", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: The handling of unit types varies between `biota` and `sediment` sample types. For consistency and ease of use, it would be beneficial to have dedicated unit columns for all sample types.\n", + "\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "id": "e6a682ac", + "metadata": {}, + "source": [ + "For `seawater` and `sediment` sample types, the HELCOM dataset refers to units directly in the name of certain columns, such as `VALUE_Bq/m³` or `VALUE_Bq/kg`. As for `biota`, the units are included in the `BASIS` column.
This is shown below: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cab93970", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "biota: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'BASIS',\n", + " 'ERROR%', 'NUMBER', 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY',\n", + " 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY', 'STATION',\n", + " 'LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm',\n", + " 'LONGITUDE dddddd', 'SDEPTH', 'RUBIN', 'BIOTATYPE', 'TISSUE', 'NO',\n", + " 'LENGTH', 'WEIGHT', 'DW%', 'LOI%', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN',\n", + " 'DATE_OF_ENTRY_y'],\n", + " dtype='object')\n", + "sediment: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',\n", + " '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',\n", + " 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',\n", + " 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',\n", + " 'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',\n", + " 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')\n", + "seawater: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/m³', 'VALUE_Bq/m³', 'ERROR%_m³',\n", + " 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR',\n", + " 'MONTH', 'DAY', 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'TDEPTH', 'SDEPTH', 'SALIN',\n", + " 'TTEMP', 'FILT', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')\n" + ] + }, + { + "data": { + "text/plain": [ + "array(['W', nan, 'D', 'F'], dtype=object)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "for grp in ['biota', 'sediment', 'seawater']:\n", + " 
print(f\"{grp}: {dfs[grp].columns}\")\n", + " \n", + "dfs['biota']['BASIS'].unique()" + ] + }, + { + "cell_type": "markdown", + "id": "f7cbefe4", + "metadata": {}, + "source": [ + "Given the inconsistent handling of units across sample types, we need to define custom mapping rules for standardizing the units. Below the MARIS unit types:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12a86baf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unit_idunitunit_sanitized
0-1Not applicableNot applicable
10NOT AVAILABLENOT AVAILABLE
21Bq/m3Bq per m3
32Bq/m2Bq per m2
43Bq/kgBq per kg
54Bq/kgdBq per kgd
65Bq/kgwBq per kgw
76kg/kgkg per kg
87TUTU
98DELTA/millDELTA per mill
109atom/kgatom per kg
1110atom/kgdatom per kgd
1211atom/kgwatom per kgw
1312atom/latom per l
1413Bq/kgCBq per kgC
\n", + "
" + ], + "text/plain": [ + " unit_id unit unit_sanitized\n", + "0 -1 Not applicable Not applicable\n", + "1 0 NOT AVAILABLE NOT AVAILABLE\n", + "2 1 Bq/m3 Bq per m3\n", + "3 2 Bq/m2 Bq per m2\n", + "4 3 Bq/kg Bq per kg\n", + "5 4 Bq/kgd Bq per kgd\n", + "6 5 Bq/kgw Bq per kgw\n", + "7 6 kg/kg kg per kg\n", + "8 7 TU TU\n", + "9 8 DELTA/mill DELTA per mill\n", + "10 9 atom/kg atom per kg\n", + "11 10 atom/kgd atom per kgd\n", + "12 11 atom/kgw atom per kgw\n", + "13 12 atom/l atom per l\n", + "14 13 Bq/kgC Bq per kgC" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "pd.read_excel(unit_lut_path())[['unit_id', 'unit', 'unit_sanitized']]" + ] + }, + { + "cell_type": "markdown", + "id": "9ec28334", + "metadata": {}, + "source": [ + "We define unit names renaming rules from HELCOM in an **ad hoc** way for now:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea7fa747", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "lut_units = {\n", + " 'seawater': 1, # 'Bq/m3'\n", + " 'sediment': 4, # 'Bq/kgd' for sediment\n", + " 'biota': {\n", + " 'D': 4, # 'Bq/kgd'\n", + " 'W': 5, # 'Bq/kgw'\n", + " 'F': 5 # 'Bq/kgw' (assumed to be 'Fresh', so set to wet)\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e404d620", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class RemapUnitCB(Callback):\n", + " \"Set the `unit` id column in the DataFrames based on a lookup table.\"\n", + " def __init__(self, \n", + " lut_units: dict=lut_units # Dictionary containing renaming rules for different unit categories\n", + " ):\n", + " fc.store_attr()\n", + "\n", + " def __call__(self, tfm: Transformer):\n", + " for grp in tfm.dfs.keys():\n", + " if grp in ['seawater', 'sediment']:\n", + " tfm.dfs[grp]['unit'] = self.lut_units[grp]\n", + " else:\n", + " tfm.dfs[grp]['unit'] = 
tfm.dfs[grp]['BASIS'].apply(lambda x: lut_units[grp].get(x, 0))" + ] + }, + { + "cell_type": "markdown", + "id": "3a03fcc9", + "metadata": {}, + "source": [ + "Apply the transformer for callback `RemapUnitCB()`. Then, print the unique `unit` for the `seawater` dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa0f0abf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "biota: [5 0 4]\n", + "sediment: [4]\n", + "seawater: [1]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[RemapUnitCB()])\n", + "\n", + "for grp in ['biota', 'sediment', 'seawater']:\n", + " print(f\"{grp}: {tfm()[grp]['unit'].unique()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "5d978c67", + "metadata": {}, + "source": [ + "## Remap detection limit\n", + "Detection limits are encoded as follows in MARIS:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1b07268", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamename_sanitized
0-1Not applicableNot applicable
10Not AvailableNot available
21=Detected value
32<Detection limit
43NDNot detected
54DEDerived
\n", + "
" + ], + "text/plain": [ + " id name name_sanitized\n", + "0 -1 Not applicable Not applicable\n", + "1 0 Not Available Not available\n", + "2 1 = Detected value\n", + "3 2 < Detection limit\n", + "4 3 ND Not detected\n", + "5 4 DE Derived" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "pd.read_excel(detection_limit_lut_path())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7083b6f", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "lut_dl = lambda: pd.read_excel(detection_limit_lut_path(), usecols=['name','id']).set_index('name').to_dict()['id']" + ] + }, + { + "cell_type": "markdown", + "id": "3023ddb4", + "metadata": {}, + "source": [ + "Based on columns of interest for each sample type:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dc43c01", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "coi_dl = {'seawater' : {'val' : 'VALUE_Bq/m³',\n", + " 'unc' : 'ERROR%_m³',\n", + " 'dl' : '< VALUE_Bq/m³'},\n", + " 'biota': {'val' : 'VALUE_Bq/kg',\n", + " 'unc' : 'ERROR%',\n", + " 'dl' : '< VALUE_Bq/kg'},\n", + " 'sediment': {\n", + " 'val' : 'VALUE_Bq/kg',\n", + " 'unc' : 'ERROR%_kg',\n", + " 'dl' : '< VALUE_Bq/kg'}}" + ] + }, + { + "cell_type": "markdown", + "id": "3d8ac6a6", + "metadata": {}, + "source": [ + "We follow the following business logic to encode the detection limit:" + ] + }, + { + "cell_type": "markdown", + "id": "f6f4784b", + "metadata": {}, + "source": [ + "`RemapDetectionLimitCB` creates a `detection_limit` column with values determined as follows:\n", + "1. Perform a lookup with the appropriate columns value type (or detection limit) columns (`< VALUE_Bq/m³` or `< VALUE_Bq/kg`) against the table returned from the function `get_detectionlimit_lut`.\n", + "2. 
# | exports
# TO BE REFACTORED
class RemapDetectionLimitCB(Callback):
    "Remap value type to MARIS format."
    def __init__(self,
                 coi: dict, # Configuration options for column names
                 fn_lut: Callable # Function that returns a lookup table
                 ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        "Fetch the lookup table once, then encode the detection limit of each DataFrame."
        lut = self.fn_lut()
        for grp, df in tfm.dfs.items():
            self._update_detection_limit(df, grp, lut)

    def _update_detection_limit(self,
                                df: pd.DataFrame, # The DataFrame to modify
                                grp: str, # The group name to get the column configuration
                                lut: dict # The lookup table dictionary
                                ) -> None:
        "Create the `detection_limit` column of `df` in place and encode it through `lut`."
        detection_col, value_col, uncertainty_col = (
            self.coi[grp][key] for key in ('dl', 'val', 'unc')
        )

        # Start from the raw detection limit flag.
        df['detection_limit'] = df[detection_col]

        # Rows with both a value and an uncertainty but no recognised flag become '='.
        measured = df[value_col].notna() & df[uncertainty_col].notna()
        unrecognised = ~df['detection_limit'].isin(lut.keys())
        df.loc[measured & unrecognised, 'detection_limit'] = '='

        # Recomputed on purpose: the rows just set to '=' may now be recognised.
        still_unrecognised = ~df['detection_limit'].isin(lut.keys())
        df.loc[still_unrecognised, 'detection_limit'] = 'Not Available'

        # Replace each flag by its value in the lookup table.
        df['detection_limit'] = df['detection_limit'].map(lut)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexvalue
00NaN
11F
22n
33N
\n", + "
" + ], + "text/plain": [ + " index value\n", + "0 0 NaN\n", + "1 1 F\n", + "2 2 n\n", + "3 3 N" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "get_unique_across_dfs(dfs, col_name='FILT', as_df=True).head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "703ee067", + "metadata": {}, + "source": [ + "While MARIS uses a different encoding for filtered status:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34e737e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idname
0-1Not applicable
10Not available
21Yes
32No
\n", + "
" + ], + "text/plain": [ + " id name\n", + "0 -1 Not applicable\n", + "1 0 Not available\n", + "2 1 Yes\n", + "3 2 No" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "pd.read_excel(filtered_lut_path())" + ] + }, + { + "cell_type": "markdown", + "id": "37fbf457", + "metadata": {}, + "source": [ + "For only four categories to remap, the `Remapper` is an overkill. We can use a simple dictionary to map the values:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d2b4bbc", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "lut_filtered = {\n", + " 'N': 2,\n", + " 'n': 2,\n", + " 'F': 1\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "b43ea425", + "metadata": {}, + "source": [ + "`RemapFiltCB` converts the HELCOM `FILT` format to the MARIS `FILT` format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8f58336", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class RemapFiltCB(Callback):\n", + " \"Lookup FILT value in dataframe using the lookup table.\"\n", + " def __init__(self,\n", + " lut_filtered: dict=lut_filtered, # Dictionary mapping FILT codes to their corresponding names\n", + " ):\n", + " fc.store_attr()\n", + "\n", + " def __call__(self, tfm):\n", + " for df in tfm.dfs.values():\n", + " if 'FILT' in df.columns:\n", + " df['FILT'] = df['FILT'].map(lambda x: self.lut_filtered.get(x, 0))" + ] + }, + { + "cell_type": "markdown", + "id": "719feb2c", + "metadata": {}, + "source": [ + "For instance:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2d13536", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 2 1]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[RemapFiltCB(lut_filtered)])\n", + "\n", + 
"print(tfm()['seawater']['FILT'].unique())" + ] + }, + { + "cell_type": "markdown", + "id": "c2e5ef74", + "metadata": {}, + "source": [ + "## Add Sample Laboratory code" + ] + }, + { + "cell_type": "markdown", + "id": "b3a02de8", + "metadata": {}, + "source": [ + "Sample Laboratory code is currently stored in MARIS master DB but not encoded as NetCDF variable. Decision to include it in the NetCDF output is TBD." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f29d5b8", + "metadata": {}, + "outputs": [], + "source": [ + "# | exports\n", + "class AddSampleLabCodeCB(Callback):\n", + " \"Remap `KEY` column to `samplabcode` in each DataFrame.\"\n", + " def __call__(self, tfm: Transformer):\n", + " for grp in tfm.dfs:\n", + " self._remap_sample_id(tfm.dfs[grp])\n", + " \n", + " def _remap_sample_id(self, df: pd.DataFrame):\n", + " df['samplabcode'] = df['KEY']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a13ddf94", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['WKRIL2012003' 'WKRIL2012004' 'WKRIL2012005' ... 
'WSSSM2021006'\n", + " 'WSSSM2021007' 'WSSSM2021008']\n", + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of dropped rows 0 0 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " AddSampleLabCodeCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "print(tfm()['seawater']['samplabcode'].unique())\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n" + ] + }, + { + "cell_type": "markdown", + "id": "fe0fb210", + "metadata": {}, + "source": [ + "## Add measurement note" + ] + }, + { + "cell_type": "markdown", + "id": "9c05383c", + "metadata": {}, + "source": [ + "The HELCOM dataset includes a look-up table `ANALYSIS_METHOD.csv` capturing the measurement method used as described by HELCOM. For instance:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0985b9e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
METHODDESCRIPTIONCOUNTRY
0BFFG01Gammaspectrometric analysis with Germanium det...6
1BFFG02Sr-90, a) Y-90 extraction method dried ash and...6
2BFFG03Pu238, Pu239241; Ashing and and drying the tra...6
3BFFG04Am-241 (not to in use any more)6
4CLOR01137Cs and 40K activity concentrations are dete...67
\n", + "
" + ], + "text/plain": [ + " METHOD DESCRIPTION COUNTRY\n", + "0 BFFG01 Gammaspectrometric analysis with Germanium det... 6\n", + "1 BFFG02 Sr-90, a) Y-90 extraction method dried ash and... 6\n", + "2 BFFG03 Pu238, Pu239241; Ashing and and drying the tra... 6\n", + "3 BFFG04 Am-241 (not to in use any more) 6\n", + "4 CLOR01 137Cs and 40K activity concentrations are dete... 67" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "pd.read_csv(Path(fname_in) / 'ANALYSIS_METHOD.csv').head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d9976e2", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "lut_method = lambda: pd.read_csv(Path(fname_in) / 'ANALYSIS_METHOD.csv').set_index('METHOD').to_dict()['DESCRIPTION']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "016db0d9", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class AddMeasurementNoteCB(Callback):\n", + " \"Record measurement notes by adding a 'measurenote' column to DataFrames.\"\n", + " def __init__(self, \n", + " fn_lut: Callable # Function that returns the lookup dictionary with `METHOD` as key and `DESCRIPTION` as value\n", + " ):\n", + " fc.store_attr()\n", + " \n", + " def __call__(self, tfm: Transformer):\n", + " lut = self.fn_lut()\n", + " for df in tfm.dfs.values():\n", + " if 'METHOD' in df.columns:\n", + " df['measurementnote'] = df['METHOD'].map(lambda x: lut.get(x, 0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e100431c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0\n", + " 'Radiochemical method Radiocaesium separation from seawater samples.134+137Cs was adsorbed on AMP mat, dissolved with NaOH and after purification precipitated as chloroplatinate (Cs2PtCl6).Counting with low background anticoincidence beta counter.'\n", + " 'Radiochem. 
#| exports
class RemapStationIdCB(Callback):
    "Remap Station ID to MARIS format."
    def __init__(self):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        "Iterate through all DataFrames in the transformer object and copy `STATION` into a `station` column."
        for grp in tfm.dfs.keys():
            tfm.dfs[grp]['station'] = tfm.dfs[grp]['STATION']


#| exports
class RemapSedSliceTopBottomCB(Callback):
    "Remap Sediment slice top and bottom to MARIS format."
    def __call__(self, tfm: Transformer):
        "Copy `UPPSLI` to `top` and `LOWSLI` to `bottom` in the `sediment` DataFrame only."
        # Only the 'sediment' group is touched; `tfm.dfs` must contain that key.
        tfm.dfs['sediment']['top'] = tfm.dfs['sediment']['UPPSLI']
        tfm.dfs['sediment']['bottom'] = tfm.dfs['sediment']['LOWSLI']
#| exports
class LookupDryWetRatio(Callback):
    "Lookup dry-wet ratio and format for MARIS."
    def __call__(self, tfm: Transformer):
        "Iterate through all DataFrames in the transformer object and apply the dry-wet ratio lookup."
        for grp in tfm.dfs.keys():
            # Only DataFrames that carry a `DW%` column are processed.
            if 'DW%' in tfm.dfs[grp].columns:
                self._apply_dry_wet_ratio(tfm.dfs[grp])

    def _apply_dry_wet_ratio(self, df: pd.DataFrame) -> None:
        "Copy `DW%` into a `dry_wet_ratio` column of `df` (in place), turning 0 into NaN."
        df['dry_wet_ratio'] = df['DW%']
        # Convert 'DW%' = 0% to NaN.
        # `np.nan` rather than `np.NaN`: the capitalised alias was removed in NumPy 2.0.
        df.loc[df['dry_wet_ratio'] == 0, 'dry_wet_ratio'] = np.nan
15827\n", + "Number of dropped rows 0 0 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n", + "0 18.453\n", + "1 18.453\n", + "2 18.453\n", + "3 18.453\n", + "4 18.458\n", + "Name: dry_wet_ratio, dtype: float64\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " LookupDryWetRatio(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", + "print(tfm.dfs['biota']['dry_wet_ratio'].head())\n" + ] + }, + { + "cell_type": "markdown", + "id": "963b9aa0", + "metadata": {}, + "source": [ + "## Standardize Coordinates" + ] + }, + { + "cell_type": "markdown", + "id": "d3203cb3", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: Column names for geographical coordinates are inconsistent across sample types (biota, sediment, seawater). Sometimes using parentheses, sometimes not.\n", + "\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03c04fe9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "seawater: ['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)']\n", + "sediment: ['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)']\n", + "biota: ['LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm', 'LONGITUDE dddddd']\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "for grp in dfs.keys():\n", + " print(f'{grp}: {[col for col in dfs[grp].columns if \"LON\" in col or \"LAT\" in col]}')" + ] + }, + { + "cell_type": "markdown", + "id": "7150dcb6", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: \n", + "\n", + "- Geographical coordinates are provided in both decimal degree and degree-minute formats. 
Some coordinates are missing in decimal degree format, which obliged us to fall back on the less precise degree-minute format.\n", + "- Also note that latitude values have `,` as decimal separator while longitude values have `.` as decimal separator (see below)\n", + "\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "484b281b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LATITUDE (ddmmmm)LATITUDE (dddddd)
059.40059,6667
159.40059,6667
259.51659,86
359.51659,86
459.51659,86
\n", + "
" + ], + "text/plain": [ + " LATITUDE (ddmmmm) LATITUDE (dddddd)\n", + "0 59.400 59,6667\n", + "1 59.400 59,6667\n", + "2 59.516 59,86\n", + "3 59.516 59,86\n", + "4 59.516 59,86" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs['sediment'][['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)']].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61afcc23", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class ParseCoordinates(Callback):\n", + " \"\"\"\n", + " Get geographical coordinates from columns expressed in degrees decimal format \n", + " or from columns in degrees/minutes decimal format where degrees decimal format is missing.\n", + " \"\"\"\n", + " def __init__(self, \n", + " fn_convert_cor: Callable # Function that converts coordinates from degree-minute to decimal degree format\n", + " ):\n", + " self.fn_convert_cor = fn_convert_cor\n", + "\n", + " def __call__(self, tfm:Transformer):\n", + " for df in tfm.dfs.values():\n", + " self._format_coordinates(df)\n", + "\n", + " def _format_coordinates(self, df:pd.DataFrame) -> None:\n", + " coord_cols = self._get_coord_columns(df.columns)\n", + " \n", + " for coord in ['lat', 'lon']:\n", + " decimal_col, minute_col = coord_cols[f'{coord}_d'], coord_cols[f'{coord}_m']\n", + " \n", + " condition = df[decimal_col].isna() | (df[decimal_col] == 0)\n", + " df[coord] = np.where(condition,\n", + " df[minute_col].apply(self._safe_convert),\n", + " df[decimal_col])\n", + " \n", + " df.dropna(subset=['lat', 'lon'], inplace=True)\n", + "\n", + " def _get_coord_columns(self, columns) -> dict:\n", + " return {\n", + " 'lon_d': self._find_coord_column(columns, 'LON', 'dddddd'),\n", + " 'lat_d': self._find_coord_column(columns, 'LAT', 'dddddd'),\n", + " 'lon_m': self._find_coord_column(columns, 'LON', 'ddmmmm'),\n", + " 'lat_m': self._find_coord_column(columns, 'LAT', 'ddmmmm')\n", + " }\n", + "\n", 
+ " def _find_coord_column(self, columns, coord_type, coord_format) -> str:\n", + " pattern = re.compile(f'{coord_type}.*{coord_format}', re.IGNORECASE)\n", + " matching_columns = [col for col in columns if pattern.search(col)]\n", + " return matching_columns[0] if matching_columns else None\n", + "\n", + " def _safe_convert(self, value) -> str:\n", + " if pd.isna(value):\n", + " return value\n", + " try:\n", + " return self.fn_convert_cor(value)\n", + " except Exception as e:\n", + " print(f\"Error converting value {value}: {e}\")\n", + " return value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1baf7136", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21208 39816 15827\n", + "Number of dropped rows 8 1 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n", + " lat lon\n", + "0 54.283333 12.316667\n", + "1 54.283333 12.316667\n", + "2 54.283333 12.316667\n", + "3 54.283333 12.316667\n", + "4 54.283333 12.316667\n", + "... ... ...\n", + "15822 60.373333 18.395667\n", + "15823 60.373333 18.395667\n", + "15824 60.503333 18.366667\n", + "15825 60.503333 18.366667\n", + "15826 60.503333 18.366667\n", + "\n", + "[15827 rows x 2 columns]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[ \n", + " ParseCoordinates(ddmm_to_dd),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", + "print(tfm.dfs['biota'][['lat','lon']])" + ] + }, + { + "cell_type": "markdown", + "id": "754289f1", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: Some samples have (lon, lat): (0, 0) or are outside lon/lat possible values. 
\n", + "\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "id": "5a055628", + "metadata": {}, + "source": [ + "Sanitize coordinates drops a row when both longitude & latitude equal 0 or data contains unrealistic longitude & latitude values. Converts longitude & latitude `,` separator to `.` separator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99a85059", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21208 39816 15827\n", + "Number of dropped rows 8 1 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n", + " lat lon\n", + "0 54.283333 12.316667\n", + "1 54.283333 12.316667\n", + "2 54.283333 12.316667\n", + "3 54.283333 12.316667\n", + "4 54.283333 12.316667\n", + "... ... ...\n", + "15822 60.373333 18.395667\n", + "15823 60.373333 18.395667\n", + "15824 60.503333 18.366667\n", + "15825 60.503333 18.366667\n", + "15826 60.503333 18.366667\n", + "\n", + "[15827 rows x 2 columns]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " ParseCoordinates(ddmm_to_dd),\n", + " SanitizeLonLatCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", + "print(tfm.dfs['biota'][['lat','lon']])\n" + ] + }, + { + "cell_type": "markdown", + "id": "47716bff", + "metadata": {}, + "source": [ + "## Review all callbacks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8a07959", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21114 39531 15798\n", + "Number of dropped rows 102 286 29\n", + "Number of rows in tfm.dfs + Number of 
dropped rows 21216 39817 15827 \n", + "\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " AddSampleTypeIdColumnCB(),\n", + " LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE'),\n", + " ParseTimeCB(),\n", + " EncodeTimeCB(cfg()),\n", + " SanitizeValue(coi_val), \n", + " NormalizeUncCB(),\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", + " RemapTaxonInformationCB(lut_taxon),\n", + " RemapSedimentCB(lut_sediments),\n", + " RemapUnitCB(),\n", + " RemapDetectionLimitCB(coi_dl, lut_dl),\n", + " RemapFiltCB(lut_filtered),\n", + " AddSampleLabCodeCB(),\n", + " AddMeasurementNoteCB(lut_method),\n", + " RemapStationIdCB(),\n", + " RemapSedSliceTopBottomCB(),\n", + " LookupDryWetRatio(),\n", + " ParseCoordinates(ddmm_to_dd),\n", + " SanitizeLonLatCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n" + ] + }, + { + "cell_type": "markdown", + "id": "2f13c7a2", + "metadata": {}, + "source": [ + "For instance, to inspect dropped rows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29baf65c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KEYNUCLIDEMETHOD< VALUE_Bq/m³VALUE_Bq/m³ERROR%_m³DATE_OF_ENTRY_xCOUNTRYLABORATORYSEQUENCE...LONGITUDE (ddmmmm)LONGITUDE (dddddd)TDEPTHSDEPTHSALINTTEMPFILTMORS_SUBBASINHELCOM_SUBBASINDATE_OF_ENTRY_y
13439WRISO2001025CS137RISO02NaNNaN10.0NaN26.0RISO2001025.0...10.50010.83333322.020.00.00NaNN5.05.0NaN
14017WLEPA2002001CS134LEPA02<NaNNaNNaN93.0LEPA2002001.0...21.03021.05000016.00.03.7714.40N4.09.0NaN
14020WLEPA2002002CS134LEPA02<NaNNaNNaN93.0LEPA2002004.0...20.57420.95666714.00.06.5711.95N4.09.0NaN
14023WLEPA2002003CS134LEPA02<NaNNaNNaN93.0LEPA2002007.0...19.23619.39333373.00.07.009.19N4.09.0NaN
14026WLEPA2002004CS134LEPA02<NaNNaNNaN93.0LEPA2002010.0...20.20520.34170047.00.07.068.65N4.09.0NaN
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " KEY NUCLIDE METHOD < VALUE_Bq/m³ VALUE_Bq/m³ ERROR%_m³ \\\n", + "13439 WRISO2001025 CS137 RISO02 NaN NaN 10.0 \n", + "14017 WLEPA2002001 CS134 LEPA02 < NaN NaN \n", + "14020 WLEPA2002002 CS134 LEPA02 < NaN NaN \n", + "14023 WLEPA2002003 CS134 LEPA02 < NaN NaN \n", + "14026 WLEPA2002004 CS134 LEPA02 < NaN NaN \n", + "\n", + " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... LONGITUDE (ddmmmm) \\\n", + "13439 NaN 26.0 RISO 2001025.0 ... 10.500 \n", + "14017 NaN 93.0 LEPA 2002001.0 ... 21.030 \n", + "14020 NaN 93.0 LEPA 2002004.0 ... 20.574 \n", + "14023 NaN 93.0 LEPA 2002007.0 ... 19.236 \n", + "14026 NaN 93.0 LEPA 2002010.0 ... 20.205 \n", + "\n", + " LONGITUDE (dddddd) TDEPTH SDEPTH SALIN TTEMP FILT MORS_SUBBASIN \\\n", + "13439 10.833333 22.0 20.0 0.00 NaN N 5.0 \n", + "14017 21.050000 16.0 0.0 3.77 14.40 N 4.0 \n", + "14020 20.956667 14.0 0.0 6.57 11.95 N 4.0 \n", + "14023 19.393333 73.0 0.0 7.00 9.19 N 4.0 \n", + "14026 20.341700 47.0 0.0 7.06 8.65 N 4.0 \n", + "\n", + " HELCOM_SUBBASIN DATE_OF_ENTRY_y \n", + "13439 5.0 NaN \n", + "14017 9.0 NaN \n", + "14020 9.0 NaN \n", + "14023 9.0 NaN \n", + "14026 9.0 NaN \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfm.dfs_dropped['seawater'].head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e17f6685", + "metadata": {}, + "source": [ + "## Rename columns of interest for NetCDF or Open Refine" + ] + }, + { + "cell_type": "markdown", + "id": "af441203", + "metadata": {}, + "source": [ + "> Column names are standardized to MARIS NetCDF format (i.e. PEP8 ). 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66e7bfc7", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def get_common_rules(\n", + " vars: dict, # Configuration dictionary\n", + " encoding_type: str # Encoding type (`netcdf` or `openrefine`)\n", + " ) -> dict: # Common renaming rules for NetCDF and OpenRefine.\n", + " \"Get common renaming rules for NetCDF and OpenRefine.\"\n", + " common = {\n", + " 'KEY': 'key',\n", + " 'lat': 'latitude' if encoding_type == 'openrefine' else vars['defaults']['lat']['name'],\n", + " 'lon': 'longitude' if encoding_type == 'openrefine' else vars['defaults']['lon']['name'],\n", + " 'time': 'begperiod' if encoding_type == 'openrefine' else vars['defaults']['time']['name'],\n", + " 'NUCLIDE': 'nuclide_id' if encoding_type == 'openrefine' else 'nuclide',\n", + " 'detection_limit': 'detection' if encoding_type == 'openrefine' else vars['suffixes']['detection_limit']['name'],\n", + " 'unit': 'unit_id' if encoding_type == 'openrefine' else vars['suffixes']['unit']['name'],\n", + " 'value': 'activity' if encoding_type == 'openrefine' else 'value',\n", + " 'uncertainty': 'uncertaint' if encoding_type == 'openrefine' else vars['suffixes']['uncertainty']['name'],\n", + " 'SDEPTH': 'sampdepth' if encoding_type == 'openrefine' else vars['defaults']['smp_depth']['name'],\n", + " 'TDEPTH': 'totdepth' if encoding_type == 'openrefine' else vars['defaults']['tot_depth']['name'],\n", + " }\n", + " \n", + " if encoding_type == 'openrefine':\n", + " common.update({\n", + " 'samptype_id': 'samptype_id',\n", + " 'station': 'station',\n", + " 'samplabcode': 'samplabcode',\n", + " 'SALIN': 'salinity',\n", + " 'TTEMP': 'temperatur',\n", + " 'FILT': 'filtered',\n", + " 'measurenote': 'measurenote'\n", + " })\n", + " else:\n", + " common.update({\n", + " 'counting_method': vars['suffixes']['counting_method']['name'],\n", + " 'sampling_method': vars['suffixes']['sampling_method']['name'],\n", + " 
'preparation_method': vars['suffixes']['preparation_method']['name'],\n", + " 'SALIN': vars['suffixes']['salinity']['name'],\n", + " 'TTEMP': vars['suffixes']['temperature']['name'],\n", + " })\n", + " \n", + " return common" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bc3002a", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def get_specific_rules(\n", + " vars: dict, # Configuration dictionary\n", + " encoding_type: str # Encoding type (`netcdf` or `openrefine`)\n", + " ) -> dict: # Specific renaming rules for NetCDF and OpenRefine.\n", + " \"Get specific renaming rules for NetCDF and OpenRefine.\"\n", + " if encoding_type == 'netcdf':\n", + " return {\n", + " 'biota': {\n", + " 'species': vars['bio']['species']['name'],\n", + " 'body_part': vars['bio']['body_part']['name'],\n", + " 'bio_group': vars['bio']['bio_group']['name']\n", + " },\n", + " 'sediment': {\n", + " 'sed_type': vars['sed']['sed_type']['name'],\n", + " 'top': vars['sed']['top']['name'],\n", + " 'bottom': vars['sed']['bottom']['name'],\n", + " }\n", + " }\n", + " elif encoding_type == 'openrefine':\n", + " return {\n", + " 'biota': {\n", + " 'species': 'species_id',\n", + " 'Taxonname': 'Taxonname',\n", + " 'TaxonRepName': 'TaxonRepName',\n", + " 'Taxonrank': 'Taxonrank',\n", + " 'TaxonDB': 'TaxonDB',\n", + " 'TaxonDBID': 'TaxonDBID',\n", + " 'TaxonDBURL': 'TaxonDBURL',\n", + " 'body_part': 'bodypar_id',\n", + " 'dry_wet_ratio': 'percentwt',\n", + " },\n", + " 'sediment': {\n", + " 'sed_type': 'sedtype_id',\n", + " 'top': 'sliceup',\n", + " 'bottom': 'slicedown',\n", + " 'SedRepName': 'SedRepName',\n", + " 'dry_wet_ratio': 'percentwt',\n", + " }\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbfc4bf7", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def get_renaming_rules(\n", + " encoding_type: str = 'netcdf' # Encoding type (`netcdf` or `openrefine`)\n", + " ) -> dict: # Renaming rules 
for NetCDF and OpenRefine.\n", + " \"Get renaming rules for NetCDF and OpenRefine.\"\n", + " vars = cdl_cfg()['vars']\n", + " \n", + " if encoding_type not in ['netcdf', 'openrefine']:\n", + " raise ValueError(\"Invalid encoding_type provided. Please use 'netcdf' or 'openrefine'.\")\n", + " \n", + " common_rules = get_common_rules(vars, encoding_type)\n", + " specific_rules = get_specific_rules(vars, encoding_type)\n", + " \n", + " rules = defaultdict(dict)\n", + " for sample_type in ['seawater', 'biota', 'sediment']:\n", + " rules[sample_type] = common_rules.copy()\n", + " rules[sample_type].update(specific_rules.get(sample_type, {}))\n", + " \n", + " return dict(rules)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b7476af", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class SelectAndRenameColumnCB(Callback):\n", + " \"Select and rename columns in a DataFrame based on renaming rules for a specified encoding type.\"\n", + " def __init__(self, \n", + " fn_renaming_rules: Callable, # A function that returns an OrderedDict of renaming rules \n", + " encoding_type: str='netcdf', # The encoding type (`netcdf` or `openrefine`) to determine which renaming rules to use\n", + " verbose: bool=False # Whether to print out renaming rules that were not applied\n", + " ):\n", + " fc.store_attr()\n", + "\n", + " def __call__(self, tfm: Transformer):\n", + " \"Apply column selection and renaming to DataFrames in the transformer, and identify unused rules.\"\n", + " try:\n", + " renaming_rules = self.fn_renaming_rules(self.encoding_type)\n", + " except ValueError as e:\n", + " print(f\"Error fetching renaming rules: {e}\")\n", + " return\n", + "\n", + " for group in tfm.dfs.keys():\n", + " # Get relevant renaming rules for the current group\n", + " group_rules = self._get_group_rules(renaming_rules, group)\n", + "\n", + " if not group_rules:\n", + " continue\n", + "\n", + " # Apply renaming rules and track keys not found in the 
DataFrame\n", + " df = tfm.dfs[group]\n", + " df, not_found_keys = self._apply_renaming(df, group_rules)\n", + " tfm.dfs[group] = df\n", + " \n", + " # Print any renaming rules that were not used\n", + " if not_found_keys and self.verbose:\n", + " print(f\"\\nGroup '{group}' has the following renaming rules not applied:\")\n", + " for old_col in not_found_keys:\n", + " print(f\"Key '{old_col}' from renaming rules was not found in the DataFrame.\")\n", + "\n", + " def _get_group_rules(self, \n", + " renaming_rules: OrderedDict, # Renaming rules\n", + " group: str # Group name to filter rules\n", + " ) -> OrderedDict: # Renaming rules applicable to the specified group\n", + " \"Retrieve and merge renaming rules for the specified group based on the encoding type.\"\n", + " relevant_rules = [rules for key, rules in renaming_rules.items() if group in key]\n", + " merged_rules = OrderedDict()\n", + " for rules in relevant_rules:\n", + " merged_rules.update(rules)\n", + " return merged_rules\n", + "\n", + " def _apply_renaming(self, \n", + " df: pd.DataFrame, # DataFrame to modify\n", + " rename_rules: OrderedDict # Renaming rules\n", + " ) -> tuple: # (Renamed and filtered df, Column names from renaming rules that were not found in the DataFrame)\n", + " \"\"\"\n", + " Select columns based on renaming rules and apply renaming, only for existing columns\n", + " while maintaining the order of the dictionary columns.\"\"\"\n", + " existing_columns = set(df.columns)\n", + " valid_rules = OrderedDict((old_col, new_col) for old_col, new_col in rename_rules.items() if old_col in existing_columns)\n", + "\n", + " # Create a list to maintain the order of columns\n", + " columns_to_keep = [col for col in rename_rules.keys() if col in existing_columns]\n", + " columns_to_keep += [new_col for old_col, new_col in valid_rules.items() if new_col in df.columns]\n", + "\n", + " df = df[list(OrderedDict.fromkeys(columns_to_keep))]\n", + "\n", + " # Apply renaming\n", + " 
df.rename(columns=valid_rules, inplace=True)\n", + "\n", + " # Determine which keys were not found\n", + " not_found_keys = set(rename_rules.keys()) - existing_columns\n", + " return df, not_found_keys\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a4a8682-672f-4188-9091-821b727b4764", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "seawater columns:\n", + "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", + " 'smp_depth', 'tot_depth', '_sal', '_temp'],\n", + " dtype='object')\n", + "sediment columns:\n", + "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", + " 'tot_depth', 'sed_type', 'top', 'bottom'],\n", + " dtype='object')\n", + "biota columns:\n", + "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", + " 'smp_depth', 'species', 'body_part', 'bio_group'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),\n", + " LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE'),\n", + " ParseTimeCB(),\n", + " EncodeTimeCB(cfg()),\n", + " SanitizeValue(coi_val), \n", + " NormalizeUncCB(),\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", + " RemapTaxonInformationCB(lut_taxon),\n", + " RemapSedimentCB(lut_sediments),\n", + " RemapUnitCB(),\n", + " RemapDetectionLimitCB(coi_dl, lut_dl),\n", + " RemapFiltCB(lut_filtered),\n", + " AddSampleLabCodeCB(),\n", + " AddMeasurementNoteCB(lut_method),\n", + " RemapStationIdCB(),\n", + " RemapSedSliceTopBottomCB(),\n", + " LookupDryWetRatio(),\n", + " ParseCoordinates(ddmm_to_dd),\n", + " 
SanitizeLonLatCB(),\n", + " CompareDfsAndTfmCB(dfs),\n", + " SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),\n", + " ])\n", + "\n", + "tfm()\n", + "for grp in tfm.dfs.keys():\n", + " print(f'{grp} columns:')\n", + " print(tfm.dfs[grp].columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a941172", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keylatlontimenuclide_dl_unitvalue_unctot_depthsed_typetopbottom
0SKRIL201204859.666724.00001339891200ra2261435.09.1071.0015.020.0
1SKRIL201204959.666724.00001339891200ra2261436.07.9271.0020.027.0
2SKRIL201205059.860028.84331344556800ra2261438.09.1223.000.02.0
3SKRIL201205159.860028.84331344556800ra2261436.09.0023.002.04.0
4SKRIL201205259.860028.84331344556800ra2261430.06.9023.004.06.0
\n", + "
" + ], + "text/plain": [ + " key lat lon time nuclide _dl _unit value \\\n", + "0 SKRIL2012048 59.6667 24.0000 1339891200 ra226 1 4 35.0 \n", + "1 SKRIL2012049 59.6667 24.0000 1339891200 ra226 1 4 36.0 \n", + "2 SKRIL2012050 59.8600 28.8433 1344556800 ra226 1 4 38.0 \n", + "3 SKRIL2012051 59.8600 28.8433 1344556800 ra226 1 4 36.0 \n", + "4 SKRIL2012052 59.8600 28.8433 1344556800 ra226 1 4 30.0 \n", + "\n", + " _unc tot_depth sed_type top bottom \n", + "0 9.10 71.0 0 15.0 20.0 \n", + "1 7.92 71.0 0 20.0 27.0 \n", + "2 9.12 23.0 0 0.0 2.0 \n", + "3 9.00 23.0 0 2.0 4.0 \n", + "4 6.90 23.0 0 4.0 6.0 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "result = tfm.dfs['sediment']; result.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82af4c5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keylatlontimenuclide_dl_unitvalue_unctot_depthsed_typetopbottom
1042SKRIL201207859.750027.41671338076800sr90144.90.980056.00NaNNaN
1043SKRIL201207959.750027.41671338076800sr90140.50.100056.00NaNNaN
1050SKRIL201210860.083327.83331337817600sr90140.90.198050.00NaNNaN
1051SKRIL201210960.083327.83331337817600sr90141.00.200050.00NaNNaN
1055SKRIL201220058.000020.75001340150400sr90148.91.780097.00NaNNaN
..........................................
39795SSSSM202002362.099818.54851600646400k4014941.097.487688.025.010.0
39796SSSSM202002462.099818.54851600646400k4014885.092.571088.020.05.0
39797SSSSM202002462.099818.54851600646400cs13714299.029.810388.020.05.0
39798SSSSM202002562.099818.54851600646400k4014956.0104.012888.025.010.0
39799SSSSM202002562.099818.54851600646400cs1371438.54.958888.025.010.0
\n", + "

2147 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " key lat lon time nuclide _dl _unit value \\\n", + "1042 SKRIL2012078 59.7500 27.4167 1338076800 sr90 1 4 4.9 \n", + "1043 SKRIL2012079 59.7500 27.4167 1338076800 sr90 1 4 0.5 \n", + "1050 SKRIL2012108 60.0833 27.8333 1337817600 sr90 1 4 0.9 \n", + "1051 SKRIL2012109 60.0833 27.8333 1337817600 sr90 1 4 1.0 \n", + "1055 SKRIL2012200 58.0000 20.7500 1340150400 sr90 1 4 8.9 \n", + "... ... ... ... ... ... ... ... ... \n", + "39795 SSSSM2020023 62.0998 18.5485 1600646400 k40 1 4 941.0 \n", + "39796 SSSSM2020024 62.0998 18.5485 1600646400 k40 1 4 885.0 \n", + "39797 SSSSM2020024 62.0998 18.5485 1600646400 cs137 1 4 299.0 \n", + "39798 SSSSM2020025 62.0998 18.5485 1600646400 k40 1 4 956.0 \n", + "39799 SSSSM2020025 62.0998 18.5485 1600646400 cs137 1 4 38.5 \n", + "\n", + " _unc tot_depth sed_type top bottom \n", + "1042 0.9800 56.0 0 NaN NaN \n", + "1043 0.1000 56.0 0 NaN NaN \n", + "1050 0.1980 50.0 0 NaN NaN \n", + "1051 0.2000 50.0 0 NaN NaN \n", + "1055 1.7800 97.0 0 NaN NaN \n", + "... ... ... ... ... ... 
\n", + "39795 97.4876 88.0 2 5.0 10.0 \n", + "39796 92.5710 88.0 2 0.0 5.0 \n", + "39797 29.8103 88.0 2 0.0 5.0 \n", + "39798 104.0128 88.0 2 5.0 10.0 \n", + "39799 4.9588 88.0 2 5.0 10.0 \n", + "\n", + "[2147 rows x 13 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "columns_to_check = ['lat', 'lon', 'time', 'nuclide','tot_depth', 'top', 'bottom']\n", + "# result[columns_to_check].duplicated().sum()\n", + "result[result.duplicated(subset=columns_to_check, keep=False)]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a65ca33a", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "# Preprocess the data\n", + "result['VALUE_Bq/kg'] = df_test['VALUE_Bq/kg'].fillna(-999)\n", + "\n", + "# Then pivot\n", + "pivoted = df_test.pivot_table(index=['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'YEAR', 'MONTH', 'DAY'],\n", + " columns='NUCLIDE',\n", + " values='VALUE_Bq/kg',\n", + " aggfunc='first').reset_index()\n", + "\n", + "# Replace -999 with 'Below Detection Limit' or any other indicator\n", + "pivoted = pivoted.replace(-999, np.nan)\n", + "pivoted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e254d6f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['seawater', 'sediment', 'biota'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "tfm.dfs.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9919f7bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
latlontimenuclide_dl_unitvalue_unctot_depthsed_type
059.666724.00001339891200ra2261435.09.1071.00
159.666724.00001339891200ra2261436.07.9271.00
259.860028.84331344556800ra2261438.09.1223.00
359.860028.84331344556800ra2261436.09.0023.00
459.860028.84331344556800ra2261430.06.9023.00
\n", + "
" + ], + "text/plain": [ + " lat lon time nuclide _dl _unit value _unc tot_depth \\\n", + "0 59.6667 24.0000 1339891200 ra226 1 4 35.0 9.10 71.0 \n", + "1 59.6667 24.0000 1339891200 ra226 1 4 36.0 7.92 71.0 \n", + "2 59.8600 28.8433 1344556800 ra226 1 4 38.0 9.12 23.0 \n", + "3 59.8600 28.8433 1344556800 ra226 1 4 36.0 9.00 23.0 \n", + "4 59.8600 28.8433 1344556800 ra226 1 4 30.0 6.90 23.0 \n", + "\n", + " sed_type \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "tfm.dfs['sediment'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96992efb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(39531, 10)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "tfm.dfs['sediment'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61bc8ebf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", + " 'smp_depth', 'species', 'body_part', 'bio_group'],\n", + " dtype='object')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "tfm.dfs['biota'].columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94fc2c70", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['lon', 'smp_depth', 'lat', 'time', 'tot_depth', 'ag110m_dl', 'am241_dl',\n", + " 'ba140_dl', 'ce144_dl', 'cm242_dl',\n", + " ...\n", + " 'pu240', 'ru103', 'ru106', 'sb125', 'sr89', 'sr90', 'tc99', 'u234',\n", + " 'u238', 'zr95'],\n", + " dtype='object', length=175)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: 
false\n", + "tfm.dfs['seawater'].columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ab2f150", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "tfm.dfs['seawater'][['lat','lon', 'time', 'smp_depth', 'tot_depth']].duplicated().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9409257", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "seawater duplicated lat/lon/time/smp_depth/tot_depth:\n", + "0\n", + "sediment duplicated lat/lon/time/smp_depth/tot_depth:\n" + ] + }, + { + "ename": "KeyError", + "evalue": "\"['smp_depth'] not in index\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn [101], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m grp \u001b[38;5;129;01min\u001b[39;00m tfm\u001b[38;5;241m.\u001b[39mdfs\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mgrp\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m duplicated lat/lon/time/smp_depth/tot_depth:\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m 
\u001b[38;5;28mprint\u001b[39m(\u001b[43mtfm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdfs\u001b[49m\u001b[43m[\u001b[49m\u001b[43mgrp\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mlat\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mlon\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtime\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msmp_depth\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtot_depth\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mduplicated()\u001b[38;5;241m.\u001b[39msum())\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/frame.py:4096\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4094\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 4095\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 4096\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcolumns\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 4098\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m 4099\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/indexes/base.py:6199\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[0;34m(self, key, axis_name)\u001b[0m\n\u001b[1;32m 6196\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 6197\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 6199\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6201\u001b[0m keyarr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[1;32m 6202\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[1;32m 6203\u001b[0m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/indexes/base.py:6251\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[0;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[1;32m 6248\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m] are in the 
[\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6250\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[38;5;241m.\u001b[39mnonzero()[\u001b[38;5;241m0\u001b[39m]]\u001b[38;5;241m.\u001b[39munique())\n\u001b[0;32m-> 6251\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mKeyError\u001b[0m: \"['smp_depth'] not in index\"" + ] + } + ], + "source": [ + "#| eval: false\n", + "tfm.dfs['sediment'][['lat','lon', 'time', 'smp_depth', 'tot_depth']].duplicated().sum()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8b7efe2d", + "metadata": {}, + "source": [ + "## Reshape: long to wide" + ] + }, + { + "cell_type": "markdown", + "id": "dd59b263", + "metadata": {}, + "source": [ + "Convert data from long to wide and rename columns to comply with NetCDF format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a330905", + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Must produce aggregated value", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn [98], line 30\u001b[0m\n\u001b[1;32m 2\u001b[0m dfs \u001b[38;5;241m=\u001b[39m load_data(fname_in)\n\u001b[1;32m 3\u001b[0m tfm \u001b[38;5;241m=\u001b[39m Transformer(dfs, cbs\u001b[38;5;241m=\u001b[39m[AddSampleTypeIdColumnCB(),\n\u001b[1;32m 4\u001b[0m LowerStripNameCB(col_src\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNUCLIDE\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[1;32m 5\u001b[0m RemapNuclideNameCB(lut_nuclides),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 27\u001b[0m ReshapeLongToWide()\n\u001b[1;32m 28\u001b[0m ])\n\u001b[0;32m---> 30\u001b[0m \u001b[43mtfm\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m grp \u001b[38;5;129;01min\u001b[39;00m tfm\u001b[38;5;241m.\u001b[39mdfs\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mgrp\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m columns:\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", + "File \u001b[0;32m~/pro/IAEA/MARIS/marisco/marisco/callbacks.py:70\u001b[0m, in \u001b[0;36mTransformer.__call__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 69\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTransform the dataframe(s) according to the specified callbacks.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 70\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcbs: \u001b[43mrun_cbs\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcbs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 71\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdfs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdfs\n", + "File \u001b[0;32m~/pro/IAEA/MARIS/marisco/marisco/callbacks.py:38\u001b[0m, in \u001b[0;36mrun_cbs\u001b[0;34m(cbs, obj)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m cb \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28msorted\u001b[39m(cbs, key\u001b[38;5;241m=\u001b[39mattrgetter(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124morder\u001b[39m\u001b[38;5;124m'\u001b[39m)):\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m cb\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__doc__\u001b[39m: obj\u001b[38;5;241m.\u001b[39mlogs\u001b[38;5;241m.\u001b[39mappend(cb\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__doc__\u001b[39m)\n\u001b[0;32m---> 38\u001b[0m \u001b[43mcb\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/pro/IAEA/MARIS/marisco/marisco/callbacks.py:266\u001b[0m, in \u001b[0;36mReshapeLongToWide.__call__\u001b[0;34m(self, tfm)\u001b[0m\n\u001b[1;32m 264\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, tfm):\n\u001b[1;32m 265\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m grp \u001b[38;5;129;01min\u001b[39;00m tfm\u001b[38;5;241m.\u001b[39mdfs\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[0;32m--> 266\u001b[0m 
tfm\u001b[38;5;241m.\u001b[39mdfs[grp] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtfm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdfs\u001b[49m\u001b[43m[\u001b[49m\u001b[43mgrp\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 267\u001b[0m tfm\u001b[38;5;241m.\u001b[39mdfs[grp]\u001b[38;5;241m.\u001b[39mcolumns \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrenamed_cols(tfm\u001b[38;5;241m.\u001b[39mdfs[grp]\u001b[38;5;241m.\u001b[39mcolumns)\n", + "File \u001b[0;32m~/pro/IAEA/MARIS/marisco/marisco/callbacks.py:247\u001b[0m, in \u001b[0;36mReshapeLongToWide.pivot\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 243\u001b[0m idx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(df\u001b[38;5;241m.\u001b[39mcolumns) \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mset\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns \u001b[38;5;241m+\u001b[39m derived_coi \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalues))\n\u001b[1;32m 245\u001b[0m df, num_fill_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fill_nan_values(df, idx)\n\u001b[0;32m--> 247\u001b[0m pivot_df \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43midx\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 248\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 249\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mderived_coi\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 250\u001b[0m \u001b[43m \u001b[49m\u001b[43mfill_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnan\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 251\u001b[0m \u001b[43m \u001b[49m\u001b[43maggfunc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mreset_index()\n\u001b[1;32m 253\u001b[0m pivot_df[idx] \u001b[38;5;241m=\u001b[39m pivot_df[idx]\u001b[38;5;241m.\u001b[39mreplace({\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_fill_value: np\u001b[38;5;241m.\u001b[39mnan, num_fill_value: np\u001b[38;5;241m.\u001b[39mnan})\n\u001b[1;32m 254\u001b[0m pivot_df \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mset_index(pivot_df)\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/frame.py:9482\u001b[0m, in \u001b[0;36mDataFrame.pivot_table\u001b[0;34m(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)\u001b[0m\n\u001b[1;32m 9465\u001b[0m \u001b[38;5;129m@Substitution\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 9466\u001b[0m \u001b[38;5;129m@Appender\u001b[39m(_shared_docs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpivot_table\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 9467\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpivot_table\u001b[39m(\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 9478\u001b[0m sort: \u001b[38;5;28mbool\u001b[39m 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 9479\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame:\n\u001b[1;32m 9480\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreshape\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpivot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pivot_table\n\u001b[0;32m-> 9482\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpivot_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 9483\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9484\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9485\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9486\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9487\u001b[0m \u001b[43m \u001b[49m\u001b[43maggfunc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maggfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9488\u001b[0m \u001b[43m \u001b[49m\u001b[43mfill_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfill_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9489\u001b[0m \u001b[43m \u001b[49m\u001b[43mmargins\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmargins\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9490\u001b[0m \u001b[43m \u001b[49m\u001b[43mdropna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9491\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mmargins_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmargins_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9492\u001b[0m \u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobserved\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9493\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9494\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/reshape/pivot.py:102\u001b[0m, in \u001b[0;36mpivot_table\u001b[0;34m(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)\u001b[0m\n\u001b[1;32m 99\u001b[0m table \u001b[38;5;241m=\u001b[39m concat(pieces, keys\u001b[38;5;241m=\u001b[39mkeys, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m table\u001b[38;5;241m.\u001b[39m__finalize__(data, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpivot_table\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 102\u001b[0m table \u001b[38;5;241m=\u001b[39m \u001b[43m__internal_pivot_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 103\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 104\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 105\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 106\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 107\u001b[0m \u001b[43m \u001b[49m\u001b[43maggfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 108\u001b[0m \u001b[43m \u001b[49m\u001b[43mfill_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 109\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mmargins\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 110\u001b[0m \u001b[43m \u001b[49m\u001b[43mdropna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 111\u001b[0m \u001b[43m \u001b[49m\u001b[43mmargins_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 112\u001b[0m \u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 113\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 114\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m table\u001b[38;5;241m.\u001b[39m__finalize__(data, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpivot_table\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/reshape/pivot.py:183\u001b[0m, in \u001b[0;36m__internal_pivot_table\u001b[0;34m(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m observed \u001b[38;5;129;01mis\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mno_default \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\n\u001b[1;32m 174\u001b[0m ping\u001b[38;5;241m.\u001b[39m_passed_categorical \u001b[38;5;28;01mfor\u001b[39;00m ping \u001b[38;5;129;01min\u001b[39;00m grouped\u001b[38;5;241m.\u001b[39m_grouper\u001b[38;5;241m.\u001b[39mgroupings\n\u001b[1;32m 175\u001b[0m ):\n\u001b[1;32m 176\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 177\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe default value of observed=False is deprecated and will change \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto observed=True in a future version of pandas. 
Specify \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 181\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[1;32m 182\u001b[0m )\n\u001b[0;32m--> 183\u001b[0m agged \u001b[38;5;241m=\u001b[39m \u001b[43mgrouped\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43maggfunc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dropna \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(agged, ABCDataFrame) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(agged\u001b[38;5;241m.\u001b[39mcolumns):\n\u001b[1;32m 186\u001b[0m agged \u001b[38;5;241m=\u001b[39m agged\u001b[38;5;241m.\u001b[39mdropna(how\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mall\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/groupby/generic.py:1466\u001b[0m, in \u001b[0;36mDataFrameGroupBy.aggregate\u001b[0;34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1463\u001b[0m \u001b[38;5;66;03m# grouper specific aggregations\u001b[39;00m\n\u001b[1;32m 1464\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_grouper\u001b[38;5;241m.\u001b[39mnkeys \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 1465\u001b[0m \u001b[38;5;66;03m# test_groupby_as_index_series_scalar gets here with 'not self.as_index'\u001b[39;00m\n\u001b[0;32m-> 1466\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_python_agg_general\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1467\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m args \u001b[38;5;129;01mor\u001b[39;00m kwargs:\n\u001b[1;32m 1468\u001b[0m \u001b[38;5;66;03m# test_pass_args_kwargs gets here (with and without as_index)\u001b[39;00m\n\u001b[1;32m 1469\u001b[0m \u001b[38;5;66;03m# can't return early\u001b[39;00m\n\u001b[1;32m 1470\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_aggregate_frame(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/groupby/generic.py:1532\u001b[0m, in \u001b[0;36mDataFrameGroupBy._python_agg_general\u001b[0;34m(self, func, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m output: \u001b[38;5;28mdict\u001b[39m[\u001b[38;5;28mint\u001b[39m, ArrayLike] \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m idx, (name, ser) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(obj\u001b[38;5;241m.\u001b[39mitems()):\n\u001b[0;32m-> 1532\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_grouper\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg_series\u001b[49m\u001b[43m(\u001b[49m\u001b[43mser\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1533\u001b[0m output[idx] \u001b[38;5;241m=\u001b[39m result\n\u001b[1;32m 1535\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj\u001b[38;5;241m.\u001b[39m_constructor(output)\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/groupby/ops.py:863\u001b[0m, in \u001b[0;36mBaseGrouper.agg_series\u001b[0;34m(self, 
obj, func, preserve_dtype)\u001b[0m\n\u001b[1;32m 856\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39m_values, np\u001b[38;5;241m.\u001b[39mndarray):\n\u001b[1;32m 857\u001b[0m \u001b[38;5;66;03m# we can preserve a little bit more aggressively with EA dtype\u001b[39;00m\n\u001b[1;32m 858\u001b[0m \u001b[38;5;66;03m# because maybe_cast_pointwise_result will do a try/except\u001b[39;00m\n\u001b[1;32m 859\u001b[0m \u001b[38;5;66;03m# with _from_sequence. NB we are assuming here that _from_sequence\u001b[39;00m\n\u001b[1;32m 860\u001b[0m \u001b[38;5;66;03m# is sufficiently strict that it casts appropriately.\u001b[39;00m\n\u001b[1;32m 861\u001b[0m preserve_dtype \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 863\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_aggregate_series_pure_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 865\u001b[0m npvalues \u001b[38;5;241m=\u001b[39m lib\u001b[38;5;241m.\u001b[39mmaybe_convert_objects(result, try_float\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m preserve_dtype:\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/groupby/ops.py:889\u001b[0m, in \u001b[0;36mBaseGrouper._aggregate_series_pure_python\u001b[0;34m(self, obj, func)\u001b[0m\n\u001b[1;32m 885\u001b[0m res \u001b[38;5;241m=\u001b[39m extract_result(res)\n\u001b[1;32m 887\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m initialized:\n\u001b[1;32m 888\u001b[0m \u001b[38;5;66;03m# We only do this validation on the first iteration\u001b[39;00m\n\u001b[0;32m--> 889\u001b[0m 
\u001b[43mcheck_result_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mres\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgroup\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 890\u001b[0m initialized \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 892\u001b[0m result[i] \u001b[38;5;241m=\u001b[39m res\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/groupby/ops.py:88\u001b[0m, in \u001b[0;36mcheck_result_array\u001b[0;34m(obj, dtype)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj, np\u001b[38;5;241m.\u001b[39mndarray):\n\u001b[1;32m 85\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[1;32m 86\u001b[0m \u001b[38;5;66;03m# If it is object dtype, the function can be a reduction/aggregation\u001b[39;00m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;66;03m# and still return an ndarray e.g. 
test_agg_over_numpy_arrays\u001b[39;00m\n\u001b[0;32m---> 88\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMust produce aggregated value\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mValueError\u001b[0m: Must produce aggregated value" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),\n", + " LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE'),\n", + " ParseTimeCB(),\n", + " EncodeTimeCB(cfg()),\n", + " SanitizeValue(coi_val), \n", + " NormalizeUncCB(),\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", + " RemapTaxonInformationCB(lut_taxon),\n", + " RemapSedimentCB(lut_sediments),\n", + " RemapUnitCB(),\n", + " RemapDetectionLimitCB(coi_dl, lut_dl),\n", + " RemapFiltCB(lut_filtered),\n", + " AddSampleLabCodeCB(),\n", + " AddMeasurementNoteCB(lut_method),\n", + " RemapStationIdCB(),\n", + " RemapSedSliceTopBottomCB(),\n", + " LookupDryWetRatio(),\n", + " ParseCoordinates(ddmm_to_dd),\n", + " SanitizeLonLatCB(),\n", + " SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),\n", + " ReshapeLongToWide()\n", + " ])\n", + "\n", + "tfm()\n", + "for grp in tfm.dfs.keys():\n", + " print(f'{grp} columns:')\n", + " print(tfm.dfs[grp].columns)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8ba0e40a", + "metadata": {}, + "source": [ + "## NetCDF encoder" + ] + }, + { + "cell_type": "markdown", + "id": "21af7a47-0760-45bd-97f7-033bb7aa886e", + "metadata": {}, + "source": [ + "### Example change logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75d1968d", + "metadata": {}, + "outputs": 
[ + { + "data": { + "text/plain": [ + "[\"Convert values from 'NUCLIDE' to lowercase, strip spaces, and store in 'None'.\",\n", + " 'Parse and standardize time information in the dataframe.',\n", + " 'Encode time as `int` representing seconds since xxx',\n", + " 'Sanitize value/measurement by removing blank entries and populating `value` column.',\n", + " 'Convert from relative error % to uncertainty of activity unit.',\n", + " \"Remap values from 'RUBIN' to 'species' for groups: b, i, o, t, a.\",\n", + " \"Remap values from 'TISSUE' to 'body_part' for groups: b, i, o, t, a.\",\n", + " \"Remap values from 'species' to 'bio_group' for groups: b, i, o, t, a.\",\n", + " 'Update taxon information based on MARIS species LUT.',\n", + " 'Update sediment id based on MARIS species LUT (dbo_sedtype.xlsx).',\n", + " 'Set the `unit` id column in the DataFrames based on a lookup table.',\n", + " 'Remap value type to MARIS format.',\n", + " 'Lookup FILT value in dataframe using the lookup table.',\n", + " 'Remap `KEY` column to `samplabcode` in each DataFrame.',\n", + " \"Record measurement notes by adding a 'measurenote' column to DataFrames.\",\n", + " 'Remap Station ID to MARIS format.',\n", + " 'Remap Sediment slice top and bottom to MARIS format.',\n", + " 'Lookup dry-wet ratio and format for MARIS.',\n", + " '\\n Get geographical coordinates from columns expressed in degrees decimal format \\n or from columns in degrees/minutes decimal format where degrees decimal format is missing.\\n ',\n", + " 'Drop row when both longitude & latitude equal 0. Drop unrealistic longitude & latitude values. 
Convert longitude & latitude `,` separator to `.` separator.',\n", + " 'Select and rename columns in a DataFrame based on renaming rules for a specified encoding type.']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "\n", + "tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),\n", + " LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE'),\n", + " ParseTimeCB(),\n", + " EncodeTimeCB(cfg()),\n", + " SanitizeValue(coi_val), \n", + " NormalizeUncCB(),\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", + " RemapTaxonInformationCB(lut_taxon),\n", + " RemapSedimentCB(lut_sediments),\n", + " RemapUnitCB(),\n", + " RemapDetectionLimitCB(coi_dl, lut_dl),\n", + " RemapFiltCB(lut_filtered),\n", + " AddSampleLabCodeCB(),\n", + " AddMeasurementNoteCB(lut_method),\n", + " RemapStationIdCB(),\n", + " RemapSedSliceTopBottomCB(),\n", + " LookupDryWetRatio(),\n", + " ParseCoordinates(ddmm_to_dd),\n", + " SanitizeLonLatCB(),\n", + " SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),\n", + " ReshapeLongToWide()\n", + " ])\n", + "\n", + "tfm()\n", + "tfm.logs" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b82526cc", + "metadata": {}, + "source": [ + "### Feed global attributes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac6ba4f8", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',\n", + " 'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',\n", + " 'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science 
> Oceans > Marine Sediments',\n", + " 'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',\n", + " 'Earth Science > Oceans > Water Quality > Ocean Contaminants',\n", + " 'Earth Science > Biological Classification > Animals/Vertebrates > Fish',\n", + " 'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',\n", + " 'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',\n", + " 'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',\n", + " 'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6aa393b", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def get_attrs(\n", + " tfm: Transformer, # Transformer object\n", + " zotero_key: str, # Zotero dataset record key\n", + " kw: list = kw # List of keywords\n", + " ) -> dict: # Global attributes\n", + " \"Retrieve all global attributes.\"\n", + " return GlobAttrsFeeder(tfm.dfs, cbs=[\n", + " BboxCB(),\n", + " DepthRangeCB(),\n", + " TimeRangeCB(cfg()),\n", + " ZoteroCB(zotero_key, cfg=cfg()),\n", + " KeyValuePairCB('keywords', ', '.join(kw)),\n", + " KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))\n", + " ])()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2e8aad3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'geospatial_lat_min': '31.17',\n", + " 'geospatial_lat_max': '65.75',\n", + " 'geospatial_lon_min': '9.6333',\n", + " 'geospatial_lon_max': '53.5',\n", + " 'geospatial_bounds': 'POLYGON ((9.6333 53.5, 31.17 53.5, 31.17 65.75, 9.6333 65.75, 9.6333 53.5))',\n", + " 'time_coverage_start': '1984-01-10T00:00:00',\n", + " 'time_coverage_end': '2021-12-15T00:00:00',\n", + " 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',\n", + " 'summary': 'MORS Environment database has been used to 
collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\\n\\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.\\n\\nThe database is updated and quality assured annually by HELCOM MORS EG.',\n", + " 'creator_name': '[{\"creatorType\": \"author\", \"name\": \"HELCOM MORS\"}]',\n", + " 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments, Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes, Earth Science > Oceans > Water Quality > Ocean Contaminants, Earth Science > Biological Classification > Animals/Vertebrates > Fish, Earth Science > Biosphere > Ecosystems > Marine Ecosystems, Earth Science > Biological Classification > Animals/Invertebrates > Mollusks, Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans, Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)',\n", + " 'publisher_postprocess_logs': \"Convert values from 'NUCLIDE' to lowercase, strip spaces, and store in 'None'., Parse and standardize time information in the dataframe., Encode time as `int` representing seconds since xxx, Sanitize value/measurement by removing blank entries and populating `value` column., Convert from relative error % to uncertainty of activity unit., Remap values from 'RUBIN' to 'species' for groups: b, i, o, t, a., Remap values from 'TISSUE' to 'body_part' for groups: b, i, o, t, a., Remap values from 'species' to 
'bio_group' for groups: b, i, o, t, a., Update taxon information based on MARIS species LUT., Update sediment id based on MARIS species LUT (dbo_sedtype.xlsx)., Set the `unit` id column in the DataFrames based on a lookup table., Remap value type to MARIS format., Lookup FILT value in dataframe using the lookup table., Remap `KEY` column to `samplabcode` in each DataFrame., Record measurement notes by adding a 'measurenote' column to DataFrames., Remap Station ID to MARIS format., Remap Sediment slice top and bottom to MARIS format., Lookup dry-wet ratio and format for MARIS., \\n Get geographical coordinates from columns expressed in degrees decimal format \\n or from columns in degrees/minutes decimal format where degrees decimal format is missing.\\n , Drop row when both longitude & latitude equal 0. Drop unrealistic longitude & latitude values. Convert longitude & latitude `,` separator to `.` separator., Select and rename columns in a DataFrame based on renaming rules for a specified encoding type.\"}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "get_attrs(tfm, zotero_key=zotero_key, kw=kw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "471ebcce-b8c8-4963-8c1c-f32e820f51d7", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def enums_xtra(\n", + " tfm: Transformer, # Transformer object\n", + " vars: list # List of variables to extract from the transformer\n", + " ):\n", + " \"Retrieve a subset of the lengthy enum as `species_t` for instance.\"\n", + " enums = Enums(lut_src_dir=lut_path(), cdl_enums=cdl_cfg()['enums'])\n", + " xtras = {}\n", + " for var in vars:\n", + " unique_vals = tfm.unique(var)\n", + " if unique_vals.any():\n", + " xtras[f'{var}_t'] = enums.filter(f'{var}_t', unique_vals)\n", + " return xtras" + ] + }, + { + "cell_type": "markdown", + "id": "5e109f56", + "metadata": {}, + "source": [ + "### Encoding 
NetCDF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1923236b-db58-4173-93ea-c416f5343eba", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def encode(\n", + " fname_in: str, # Input file name\n", + " fname_out_nc: str, # Output file name\n", + " nc_tpl_path: str, # NetCDF template file name\n", + " **kwargs # Additional arguments\n", + " ) -> None:\n", + " \"Encode data to NetCDF.\"\n", + " dfs = load_data(fname_in)\n", + " tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),\n", + " LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE'),\n", + " ParseTimeCB(),\n", + " EncodeTimeCB(cfg()),\n", + " SanitizeValue(coi_val), \n", + " NormalizeUncCB(),\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", + " RemapTaxonInformationCB(lut_taxon),\n", + " RemapSedimentCB(lut_sediments),\n", + " RemapUnitCB(),\n", + " RemapDetectionLimitCB(coi_dl, lut_dl),\n", + " RemapFiltCB(lut_filtered),\n", + " AddSampleLabCodeCB(),\n", + " AddMeasurementNoteCB(lut_method),\n", + " RemapStationIdCB(),\n", + " RemapSedSliceTopBottomCB(),\n", + " LookupDryWetRatio(),\n", + " ParseCoordinates(ddmm_to_dd),\n", + " SanitizeLonLatCB(),\n", + " SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),\n", + " ReshapeLongToWide()\n", + " ])\n", + " tfm()\n", + " encoder = NetCDFEncoder(tfm.dfs, \n", + " src_fname=nc_tpl_path,\n", + " dest_fname=fname_out_nc, \n", + " global_attrs=get_attrs(tfm, zotero_key=zotero_key, kw=kw),\n", + " verbose=kwargs.get('verbose', False),\n", + " enums_xtra=enums_xtra(tfm, vars=['species', 'body_part'])\n", + " )\n", + " encoder.encode()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fd973e4", + "metadata": {}, + "outputs": [], + 
"source": [ + "#| eval: false\n", + "encode(fname_in, fname_out_nc, nc_tpl_path(), verbose=False)" + ] + }, + { + "cell_type": "markdown", + "id": "05beed7f", + "metadata": {}, + "source": [ + "## Open Refine Pipeline (WIP)" + ] + }, + { + "cell_type": "markdown", + "id": "94f45970", + "metadata": {}, + "source": [ + "### Rename columns for Open Refine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9468d6dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Group 'seawater' has the following renaming rules not applied:\n", + "Key 'measurenote' from renaming rules was not found in the DataFrame.\n", + "\n", + "Group 'sediment' has the following renaming rules not applied:\n", + "Key 'SDEPTH' from renaming rules was not found in the DataFrame.\n", + "Key 'measurenote' from renaming rules was not found in the DataFrame.\n", + "Key 'TTEMP' from renaming rules was not found in the DataFrame.\n", + "Key 'FILT' from renaming rules was not found in the DataFrame.\n", + "Key 'SALIN' from renaming rules was not found in the DataFrame.\n", + "\n", + "Group 'biota' has the following renaming rules not applied:\n", + "Key 'TDEPTH' from renaming rules was not found in the DataFrame.\n", + "Key 'measurenote' from renaming rules was not found in the DataFrame.\n", + "Key 'TTEMP' from renaming rules was not found in the DataFrame.\n", + "Key 'FILT' from renaming rules was not found in the DataFrame.\n", + "Key 'SALIN' from renaming rules was not found in the DataFrame.\n", + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21114 39531 15798\n", + "Number of dropped rows 102 286 29\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " AddSampleTypeIdColumnCB(),\n", + " 
LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE'),\n", + " ParseTimeCB(),\n", + " EncodeTimeCB(cfg()), \n", + " SanitizeValue(coi_val), \n", + " NormalizeUncCB(),\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", + " RemapTaxonInformationCB(lut_taxon),\n", + " RemapSedimentCB(lut_sediments),\n", + " RemapUnitCB(),\n", + " RemapDetectionLimitCB(coi_dl, lut_dl),\n", + " RemapFiltCB(lut_filtered),\n", + " AddSampleLabCodeCB(),\n", + " AddMeasurementNoteCB(lut_method),\n", + " RemapStationIdCB(),\n", + " RemapSedSliceTopBottomCB(),\n", + " LookupDryWetRatio(),\n", + " ParseCoordinates(ddmm_to_dd),\n", + " SanitizeLonLatCB(),\n", + " SelectAndRenameColumnCB(get_renaming_rules, encoding_type='openrefine', verbose=True),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" + ] + }, + { + "cell_type": "markdown", + "id": "b2e68ad8", + "metadata": {}, + "source": [ + "**Example of data included in dfs_dropped.**\n", + "\n", + "Main reasons for data to be dropped from dfs:\n", + "- No activity value reported (e.g. VALUE_Bq/kg)\n", + "- No time value reported. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fe229c0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KEYNUCLIDEMETHOD< VALUE_Bq/kgVALUE_Bq/kgERROR%_kg< VALUE_Bq/m²VALUE_Bq/m²ERROR%_m²DATE_OF_ENTRY_x...LOWSLIAREASEDIOXICDW%LOI%MORS_SUBBASINHELCOM_SUBBASINSUM_LINKDATE_OF_ENTRY_y
11784SLREB1998021SR902NaNNaNNaNNaNNaNNaNNaN...12.00.0210055.0ONaNNaN14.014.0aNaN
11824SLVDC1997023CS1371NaNNaNNaNNaNNaNNaNNaN...14.00.0210055.0ONaNNaN9.09.0aNaN
11832SLVDC1997031CS1371NaNNaNNaNNaNNaNNaNNaN...14.00.0210055.0ONaNNaN9.09.0aNaN
11841SLVDC1997040CS1371NaNNaNNaNNaNNaNNaNNaN...16.00.0210055.0ONaNNaN9.09.0aNaN
11849SLVDC1998011CS1371NaNNaNNaNNaNNaNNaNNaN...16.00.0210055.0ONaNNaN14.014.0aNaN
..................................................................
39769SSSSM2021030CO60SSSM43<NaNNaN<NaNNaN09/06/22 00:00:00...2.00.01608NaNNaN28.20000015.012.012.0NaN09/06/22 00:00:00
39774SSSSM2021030RA226SSSM43<NaNNaN<NaNNaN09/06/22 00:00:00...2.00.01608NaNNaN28.20000015.012.012.0NaN09/06/22 00:00:00
39775SSSSM2021030RA223SSSM43<NaNNaN<NaNNaN09/06/22 00:00:00...2.00.01608NaNNaN28.20000015.012.012.0NaN09/06/22 00:00:00
39777SSSSM2021031CS137SSSM43<NaNNaN<0.0NaN09/06/22 00:00:00...2.00.01608NaNNaN31.993243NaN13.013.0NaN09/06/22 00:00:00
39779SSSSM2021031CO60SSSM43<NaNNaN<NaNNaN09/06/22 00:00:00...2.00.01608NaNNaN31.993243NaN13.013.0NaN09/06/22 00:00:00
\n", + "

286 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + " KEY NUCLIDE METHOD < VALUE_Bq/kg VALUE_Bq/kg ERROR%_kg \\\n", + "11784 SLREB1998021 SR90 2 NaN NaN NaN \n", + "11824 SLVDC1997023 CS137 1 NaN NaN NaN \n", + "11832 SLVDC1997031 CS137 1 NaN NaN NaN \n", + "11841 SLVDC1997040 CS137 1 NaN NaN NaN \n", + "11849 SLVDC1998011 CS137 1 NaN NaN NaN \n", + "... ... ... ... ... ... ... \n", + "39769 SSSSM2021030 CO60 SSSM43 < NaN NaN \n", + "39774 SSSSM2021030 RA226 SSSM43 < NaN NaN \n", + "39775 SSSSM2021030 RA223 SSSM43 < NaN NaN \n", + "39777 SSSSM2021031 CS137 SSSM43 < NaN NaN \n", + "39779 SSSSM2021031 CO60 SSSM43 < NaN NaN \n", + "\n", + " < VALUE_Bq/m² VALUE_Bq/m² ERROR%_m² DATE_OF_ENTRY_x ... LOWSLI \\\n", + "11784 NaN NaN NaN NaN ... 12.0 \n", + "11824 NaN NaN NaN NaN ... 14.0 \n", + "11832 NaN NaN NaN NaN ... 14.0 \n", + "11841 NaN NaN NaN NaN ... 16.0 \n", + "11849 NaN NaN NaN NaN ... 16.0 \n", + "... ... ... ... ... ... ... \n", + "39769 < NaN NaN 09/06/22 00:00:00 ... 2.0 \n", + "39774 < NaN NaN 09/06/22 00:00:00 ... 2.0 \n", + "39775 < NaN NaN 09/06/22 00:00:00 ... 2.0 \n", + "39777 < 0.0 NaN 09/06/22 00:00:00 ... 2.0 \n", + "39779 < NaN NaN 09/06/22 00:00:00 ... 2.0 \n", + "\n", + " AREA SEDI OXIC DW% LOI% MORS_SUBBASIN HELCOM_SUBBASIN \\\n", + "11784 0.02100 55.0 O NaN NaN 14.0 14.0 \n", + "11824 0.02100 55.0 O NaN NaN 9.0 9.0 \n", + "11832 0.02100 55.0 O NaN NaN 9.0 9.0 \n", + "11841 0.02100 55.0 O NaN NaN 9.0 9.0 \n", + "11849 0.02100 55.0 O NaN NaN 14.0 14.0 \n", + "... ... ... ... ... ... ... ... \n", + "39769 0.01608 NaN NaN 28.200000 15.0 12.0 12.0 \n", + "39774 0.01608 NaN NaN 28.200000 15.0 12.0 12.0 \n", + "39775 0.01608 NaN NaN 28.200000 15.0 12.0 12.0 \n", + "39777 0.01608 NaN NaN 31.993243 NaN 13.0 13.0 \n", + "39779 0.01608 NaN NaN 31.993243 NaN 13.0 13.0 \n", + "\n", + " SUM_LINK DATE_OF_ENTRY_y \n", + "11784 a NaN \n", + "11824 a NaN \n", + "11832 a NaN \n", + "11841 a NaN \n", + "11849 a NaN \n", + "... ... ... 
\n", + "39769 NaN 09/06/22 00:00:00 \n", + "39774 NaN 09/06/22 00:00:00 \n", + "39775 NaN 09/06/22 00:00:00 \n", + "39777 NaN 09/06/22 00:00:00 \n", + "39779 NaN 09/06/22 00:00:00 \n", + "\n", + "[286 rows x 35 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "grp='sediment'\n", + "#grp='seawater'\n", + "#grp='biota'\n", + "\n", + "tfm.dfs_dropped[grp]" + ] + }, + { + "cell_type": "markdown", + "id": "97b6241c", + "metadata": {}, + "source": [ + "## Open Refine encoder (WIP)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bd81eaf", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "def encode_or(\n", + " fname_in: str, # Input file name\n", + " fname_out_csv: str, # Output file name\n", + " ref_id: str, # Reference ID as defined in MARIS master DB\n", + " **kwargs # Additional arguments\n", + " ) -> None:\n", + " \"Encode data to Open Refine CSV.\"\n", + " dfs = load_data(fname_in)\n", + " tfm = Transformer(dfs, cbs=[\n", + " AddSampleTypeIdColumnCB(),\n", + " LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE'),\n", + " ParseTimeCB(),\n", + " EncodeTimeCB(cfg()), \n", + " SanitizeValue(coi_val), \n", + " NormalizeUncCB(),\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", + " RemapTaxonInformationCB(lut_taxon),\n", + " RemapSedimentCB(lut_sediments),\n", + " RemapUnitCB(),\n", + " RemapDetectionLimitCB(coi_dl, lut_dl),\n", + " RemapFiltCB(lut_filtered),\n", + " AddSampleLabCodeCB(),\n", + " AddMeasurementNoteCB(lut_method),\n", + " RemapStationIdCB(),\n", + " RemapSedSliceTopBottomCB(),\n", + " LookupDryWetRatio(),\n", + " ParseCoordinates(ddmm_to_dd),\n", + " 
SanitizeLonLatCB(),\n", + " SelectAndRenameColumnCB(get_renaming_rules, encoding_type='openrefine', verbose=True),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + " \n", + " tfm()\n", + "\n", + " encoder = OpenRefineCsvEncoder(tfm.dfs, \n", + " dest_fname=fname_out_csv, \n", + " ref_id = ref_id,\n", + " verbose = True\n", + " )\n", + " encoder.encode()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74b3978a", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "encode_or(fname_in, fname_out_csv, ref_id, verbose=True)" + ] + }, + { + "cell_type": "markdown", + "id": "a0ecc9e8", + "metadata": {}, + "source": [ + "### Open Refine Variables not included in Helcom" + ] + }, + { + "cell_type": "markdown", + "id": "11ca95cc", + "metadata": {}, + "source": [ + "| Field name | Full name | HELCOM |\n", + "|-----------------|--------------------------|------------|\n", + "| sampquality | Sample quality | N |\n", + "| lab_id | Laboratory ID | N |\n", + "| profile_id | Profile ID | N |\n", + "| transect_id | Transect ID | N |\n", + "| endperiod | End period | N |\n", + "| vartype | Variable type | N |\n", + "| freq | Frequency | N |\n", + "| rl_detection | Range low detection | N |\n", + "| rangelow | Range low | N |\n", + "| rangeupp | Range upper | N |\n", + "| Commonname | Common name | N |\n", + "| volume | Volume | N |\n", + "| filtpore | Filter pore | N |\n", + "| acid | Acidified | N |\n", + "| oxygen | Oxygen | N |\n", + "| samparea | Sample area | N |\n", + "| drywt | Dry weight | N |\n", + "| wetwt | Wet weight | N |\n", + "| sampmet_id | Sampling method ID | N |\n", + "| drymet_id | Drying method ID | N |\n", + "| prepmet_id | Preparation method ID | N |\n", + "| counmet_id | Counting method ID | N |\n", + "| refnote | Reference note | N |\n", + "| sampnote | Sample note | N |\n", + "| gfe | Good for export | ? 
|" + ] + }, + { + "cell_type": "markdown", + "id": "7f4d9df4", + "metadata": {}, + "source": [ + "**TODO**:\n", + "\n", + "- Should we use a single encoder for both NetCDF and OpenRefine? If so, should we have a single encode function that accepts a variable 'encoding_type'." + ] + }, + { + "cell_type": "markdown", + "id": "1a206afa", + "metadata": {}, + "source": [ + "TODO: Include FILT for NetCDF" + ] + }, + { + "cell_type": "markdown", + "id": "fc44bf97", + "metadata": {}, + "source": [ + "TODO: Check sediment 'DW%' data that is less than 1%. Is this realistic? Check the 'DW%' data that is 0%. Run below before SelectAndRenameColumnCB. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "002712da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'seawater': KEY NUCLIDE METHOD < VALUE_Bq/m³ VALUE_Bq/m³ ERROR%_m³ \\\n", + " 0 WKRIL2012003 cs137 NaN NaN 5.3 32.000000 \n", + " 1 WKRIL2012004 cs137 NaN NaN 19.9 20.000000 \n", + " 2 WKRIL2012005 cs137 NaN NaN 25.5 20.000000 \n", + " 3 WKRIL2012006 cs137 NaN NaN 17.0 29.000000 \n", + " 4 WKRIL2012007 cs137 NaN NaN 22.2 18.000000 \n", + " ... ... ... ... ... ... ... \n", + " 21211 WSSSM2021005 h3 SSM45 NaN 1030.0 93.203883 \n", + " 21212 WSSSM2021006 h3 SSM45 NaN 2240.0 43.303571 \n", + " 21213 WSSSM2021007 h3 SSM45 NaN 2060.0 47.087379 \n", + " 21214 WSSSM2021008 h3 SSM45 NaN 2300.0 43.478261 \n", + " 21215 WSSSM2021004 h3 SSM45 < NaN NaN \n", + " \n", + " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... \\\n", + " 0 08/20/14 00:00:00 90.0 KRIL 2012003.0 ... \n", + " 1 08/20/14 00:00:00 90.0 KRIL 2012004.0 ... \n", + " 2 08/20/14 00:00:00 90.0 KRIL 2012005.0 ... \n", + " 3 08/20/14 00:00:00 90.0 KRIL 2012006.0 ... \n", + " 4 08/20/14 00:00:00 90.0 KRIL 2012007.0 ... \n", + " ... ... ... ... ... ... \n", + " 21211 09/06/22 00:00:00 77.0 SSSM 202105.0 ... \n", + " 21212 09/06/22 00:00:00 77.0 SSSM 202106.0 ... \n", + " 21213 09/06/22 00:00:00 77.0 SSSM 202107.0 ... 
\n", + " 21214 09/06/22 00:00:00 77.0 SSSM 202108.0 ... \n", + " 21215 09/06/22 00:00:00 77.0 SSSM 202104.0 ... \n", + " \n", + " LONGITUDE (ddmmmm) LONGITUDE (dddddd) TDEPTH SDEPTH SALIN TTEMP \\\n", + " 0 29.2000 29.3333 NaN 0.0 NaN NaN \n", + " 1 29.2000 29.3333 NaN 29.0 NaN NaN \n", + " 2 23.0900 23.1500 NaN 0.0 NaN NaN \n", + " 3 27.5900 27.9833 NaN 0.0 NaN NaN \n", + " 4 27.5900 27.9833 NaN 39.0 NaN NaN \n", + " ... ... ... ... ... ... ... \n", + " 21211 18.2143 18.3572 NaN 1.0 NaN NaN \n", + " 21212 17.0000 17.0000 NaN 1.0 NaN NaN \n", + " 21213 11.5671 11.9452 NaN 1.0 NaN NaN \n", + " 21214 11.5671 11.9452 NaN 1.0 NaN NaN \n", + " 21215 11.1470 11.2450 NaN 1.0 NaN NaN \n", + " \n", + " FILT MORS_SUBBASIN HELCOM_SUBBASIN DATE_OF_ENTRY_y \n", + " 0 NaN 11.0 11.0 08/20/14 00:00:00 \n", + " 1 NaN 11.0 11.0 08/20/14 00:00:00 \n", + " 2 NaN 11.0 3.0 08/20/14 00:00:00 \n", + " 3 NaN 11.0 11.0 08/20/14 00:00:00 \n", + " 4 NaN 11.0 11.0 08/20/14 00:00:00 \n", + " ... ... ... ... ... \n", + " 21211 N 1.0 8.0 09/06/22 00:00:00 \n", + " 21212 N 10.0 10.0 09/06/22 00:00:00 \n", + " 21213 N 12.0 12.0 09/06/22 00:00:00 \n", + " 21214 N 12.0 12.0 09/06/22 00:00:00 \n", + " 21215 N 15.0 18.0 09/06/22 00:00:00 \n", + " \n", + " [21216 rows x 27 columns],\n", + " 'sediment': KEY NUCLIDE METHOD < VALUE_Bq/kg VALUE_Bq/kg ERROR%_kg \\\n", + " 0 SKRIL2012048 ra226 NaN NaN 35.0 26.00 \n", + " 1 SKRIL2012049 ra226 NaN NaN 36.0 22.00 \n", + " 2 SKRIL2012050 ra226 NaN NaN 38.0 24.00 \n", + " 3 SKRIL2012051 ra226 NaN NaN 36.0 25.00 \n", + " 4 SKRIL2012052 ra226 NaN NaN 30.0 23.00 \n", + " ... ... ... ... ... ... ... \n", + " 39812 SSSSM2020029 ac228 SSSM43 NaN 37.5 5.00 \n", + " 39813 SSSSM2020030 k40 SSSM43 NaN 526.0 1.72 \n", + " 39814 SSSSM2020030 cs137 SSSM43 NaN 17.2 2.21 \n", + " 39815 SSSSM2020031 k40 SSSM43 NaN 1000.0 1.80 \n", + " 39816 SSSSM2020031 cs137 SSSM43 NaN 64.0 1.20 \n", + " \n", + " < VALUE_Bq/m² VALUE_Bq/m² ERROR%_m² DATE_OF_ENTRY_x ... 
LOWSLI \\\n", + " 0 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", + " 1 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", + " 2 NaN NaN NaN 08/20/14 00:00:00 ... 2.0 \n", + " 3 NaN NaN NaN 08/20/14 00:00:00 ... 4.0 \n", + " 4 NaN NaN NaN 08/20/14 00:00:00 ... 6.0 \n", + " ... ... ... ... ... ... ... \n", + " 39812 NaN 255.0 28.0 04/22/22 00:00:00 ... 2.0 \n", + " 39813 NaN 5690.0 2.0 04/22/22 00:00:00 ... 2.0 \n", + " 39814 NaN 186.0 2.0 04/22/22 00:00:00 ... 2.0 \n", + " 39815 NaN 16000.0 2.0 04/22/22 00:00:00 ... 2.0 \n", + " 39816 NaN 1020.0 1.0 04/22/22 00:00:00 ... 2.0 \n", + " \n", + " AREA SEDI OXIC DW% LOI% MORS_SUBBASIN HELCOM_SUBBASIN SUM_LINK \\\n", + " 0 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + " 1 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + " 2 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + " 3 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + " 4 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + " ... ... ... ... ... ... ... ... ... \n", + " 39812 0.019 0.0 O 28.73 14.0 13.0 13.0 NaN \n", + " 39813 0.019 0.0 O 32.03 NaN 12.0 12.0 NaN \n", + " 39814 0.019 0.0 O 32.03 NaN 12.0 12.0 NaN \n", + " 39815 0.017 0.0 O 48.77 NaN 1.0 8.0 NaN \n", + " 39816 0.017 0.0 O 48.77 NaN 1.0 8.0 NaN \n", + " \n", + " DATE_OF_ENTRY_y \n", + " 0 08/20/14 00:00:00 \n", + " 1 08/20/14 00:00:00 \n", + " 2 08/20/14 00:00:00 \n", + " 3 08/20/14 00:00:00 \n", + " 4 08/20/14 00:00:00 \n", + " ... ... \n", + " 39812 04/22/22 00:00:00 \n", + " 39813 04/22/22 00:00:00 \n", + " 39814 04/22/22 00:00:00 \n", + " 39815 04/22/22 00:00:00 \n", + " 39816 04/22/22 00:00:00 \n", + " \n", + " [39817 rows x 35 columns],\n", + " 'biota': KEY NUCLIDE METHOD < VALUE_Bq/kg VALUE_Bq/kg BASIS ERROR% \\\n", + " 0 BVTIG2012041 cs134 VTIG01 < 0.010140 W NaN \n", + " 1 BVTIG2012041 k40 VTIG01 135.300000 W 3.57 \n", + " 2 BVTIG2012041 co60 VTIG01 < 0.013980 W NaN \n", + " 3 BVTIG2012041 cs137 VTIG01 4.338000 W 3.48 \n", + " 4 BVTIG2012040 cs134 VTIG01 < 0.009614 W NaN \n", + " ... ... ... ... ... ... ... ... 
\n", + " 15822 BSSSM2020016 k40 SSSM42 NaN 65.000000 D 10.20 \n", + " 15823 BSSSM2020016 cs137 SSSM42 NaN 4.500000 D 6.20 \n", + " 15824 BSSSM2020017 be7 SSSM42 NaN 94.000000 D 3.40 \n", + " 15825 BSSSM2020017 k40 SSSM42 NaN 1100.000000 D 1.60 \n", + " 15826 BSSSM2020017 cs137 SSSM42 NaN 13.000000 D 2.50 \n", + " \n", + " NUMBER DATE_OF_ENTRY_x COUNTRY ... BIOTATYPE TISSUE NO \\\n", + " 0 NaN 02/27/14 00:00:00 6.0 ... F 5 16.0 \n", + " 1 NaN 02/27/14 00:00:00 6.0 ... F 5 16.0 \n", + " 2 NaN 02/27/14 00:00:00 6.0 ... F 5 16.0 \n", + " 3 NaN 02/27/14 00:00:00 6.0 ... F 5 16.0 \n", + " 4 NaN 02/27/14 00:00:00 6.0 ... F 5 17.0 \n", + " ... ... ... ... ... ... ... ... \n", + " 15822 NaN 04/22/22 00:00:00 77.0 ... B 41 319.0 \n", + " 15823 NaN 04/22/22 00:00:00 77.0 ... B 41 319.0 \n", + " 15824 NaN 04/22/22 00:00:00 77.0 ... P 51 NaN \n", + " 15825 NaN 04/22/22 00:00:00 77.0 ... P 51 NaN \n", + " 15826 NaN 04/22/22 00:00:00 77.0 ... P 51 NaN \n", + " \n", + " LENGTH WEIGHT DW% LOI% MORS_SUBBASIN HELCOM_SUBBASIN \\\n", + " 0 45.7 948.0 18.453 92.9 2.0 16 \n", + " 1 45.7 948.0 18.453 92.9 2.0 16 \n", + " 2 45.7 948.0 18.453 92.9 2.0 16 \n", + " 3 45.7 948.0 18.453 92.9 2.0 16 \n", + " 4 45.9 964.0 18.458 92.9 2.0 16 \n", + " ... ... ... ... ... ... ... \n", + " 15822 NaN NaN 41.000 0.0 1.0 8 \n", + " 15823 NaN NaN 41.000 0.0 1.0 8 \n", + " 15824 NaN NaN 21.000 0.0 1.0 8 \n", + " 15825 NaN NaN 21.000 0.0 1.0 8 \n", + " 15826 NaN NaN 21.000 0.0 1.0 8 \n", + " \n", + " DATE_OF_ENTRY_y \n", + " 0 02/27/14 00:00:00 \n", + " 1 02/27/14 00:00:00 \n", + " 2 02/27/14 00:00:00 \n", + " 3 02/27/14 00:00:00 \n", + " 4 02/27/14 00:00:00 \n", + " ... ... 
\n", + " 15822 04/22/22 00:00:00 \n", + " 15823 04/22/22 00:00:00 \n", + " 15824 04/22/22 00:00:00 \n", + " 15825 04/22/22 00:00:00 \n", + " 15826 04/22/22 00:00:00 \n", + " \n", + " [15827 rows x 33 columns]}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(col_src='NUCLIDE'),\n", + " ])\n", + "tfm()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de551778", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KEYNUCLIDEMETHOD< VALUE_Bq/kgVALUE_Bq/kgERROR%_kg< VALUE_Bq/m²VALUE_Bq/m²ERROR%_m²DATE_OF_ENTRY_x...LOWSLIAREASEDIOXICDW%LOI%MORS_SUBBASINHELCOM_SUBBASINSUM_LINKDATE_OF_ENTRY_y
30938SLVEA2010001cs137LVEA01NaN334.251.57NaN131.88641179.0NaN...2.00.01515.0O0.1150.914.014.0NaN11/11/11 00:00:00
30939SLVEA2010002cs137LVEA01NaN343.581.49NaN132.09241179.0NaN...4.00.01515.0A0.1590.814.014.0NaN11/11/11 00:00:00
30940SLVEA2010003cs137LVEA01NaN334.691.56NaN134.39041179.0NaN...6.00.01515.0A0.1890.814.014.0NaN11/11/11 00:00:00
30941SLVEA2010004cs137LVEA01NaN348.501.56NaN136.69941179.0NaN...8.00.01515.0A0.1940.814.014.0NaN11/11/11 00:00:00
30942SLVEA2010005cs137LVEA01NaN258.671.73NaN104.89441179.0NaN...10.00.01515.0A0.1950.814.014.0NaN11/11/11 00:00:00
30943SLVEA2010006cs137LVEA01NaN182.022.05NaN77.52341179.0NaN...12.00.01515.0A0.2210.814.014.0NaN11/11/11 00:00:00
30944SLVEA2010007cs137LVEA01NaN116.342.79NaN46.94641179.0NaN...14.00.01515.0A0.2380.814.014.0NaN11/11/11 00:00:00
30945SLVEA2010008cs137LVEA01NaN94.072.61NaN38.16241179.0NaN...16.00.01515.0A0.2340.814.014.0NaN11/11/11 00:00:00
30946SLVEA2010009cs137LVEA01NaN69.703.12NaN27.44441179.0NaN...18.00.01515.0A0.2420.814.014.0NaN11/11/11 00:00:00
30947SLVEA2010010cs137LVEA01NaN59.633.40NaN24.22041179.0NaN...20.00.01515.0A0.2570.714.014.0NaN11/11/11 00:00:00
30948SLVEA2010011cs137LVEA01<12.243.88<5.03541179.0NaN...22.00.01515.0A0.2640.714.014.0NaN11/11/11 00:00:00
30949SLVEA2010012cs137LVEA01<0.83NaN<0.33041179.0NaN...24.00.01515.0A0.2440.814.014.0NaN11/11/11 00:00:00
30950SLVEA2010013cs137LVEA01NaN331.611.40NaN125.56641179.0NaN...2.00.01515.0O0.1150.914.014.0NaN11/11/11 00:00:00
30951SLVEA2010014cs137LVEA01NaN352.061.33NaN144.51641179.0NaN...4.00.01515.0A0.1640.814.014.0NaN11/11/11 00:00:00
30952SLVEA2010015cs137LVEA01NaN367.111.36NaN139.43441179.0NaN...6.00.01515.0A0.1910.814.014.0NaN11/11/11 00:00:00
30953SLVEA2010016cs137LVEA01NaN328.971.42NaN124.34841179.0NaN...8.00.01515.0A0.1880.814.014.0NaN11/11/11 00:00:00
30954SLVEA2010017cs137LVEA01NaN356.301.37NaN135.44741179.0NaN...10.00.01515.0A0.1790.814.014.0NaN11/11/11 00:00:00
30955SLVEA2010018cs137LVEA01NaN314.751.42NaN118.76541179.0NaN...12.00.01515.0A0.1860.814.014.0NaN11/11/11 00:00:00
30956SLVEA2010019cs137LVEA01NaN261.641.52NaN104.58041179.0NaN...14.00.01515.0A0.1940.814.014.0NaN11/11/11 00:00:00
30957SLVEA2010020cs137LVEA01NaN181.001.76NaN74.05841179.0NaN...16.00.01515.0A0.2090.814.014.0NaN11/11/11 00:00:00
30958SLVEA2010021cs137LVEA01NaN143.652.02NaN57.68041179.0NaN...18.00.01515.0A0.2140.814.014.0NaN11/11/11 00:00:00
30959SLVEA2010022cs137LVEA01NaN109.362.15NaN42.15341179.0NaN...20.00.01515.0A0.2180.814.014.0NaN11/11/11 00:00:00
30960SLVEA2010023cs137LVEA01NaN94.121.39NaN35.87341179.0NaN...22.00.01515.0A0.2120.814.014.0NaN11/11/11 00:00:00
30961SLVEA2010024cs137LVEA01NaN96.631.35NaN38.86441179.0NaN...24.00.01515.0A0.2170.814.014.0NaN11/11/11 00:00:00
\n", + "

24 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + " KEY NUCLIDE METHOD < VALUE_Bq/kg VALUE_Bq/kg ERROR%_kg \\\n", + "30938 SLVEA2010001 cs137 LVEA01 NaN 334.25 1.57 \n", + "30939 SLVEA2010002 cs137 LVEA01 NaN 343.58 1.49 \n", + "30940 SLVEA2010003 cs137 LVEA01 NaN 334.69 1.56 \n", + "30941 SLVEA2010004 cs137 LVEA01 NaN 348.50 1.56 \n", + "30942 SLVEA2010005 cs137 LVEA01 NaN 258.67 1.73 \n", + "30943 SLVEA2010006 cs137 LVEA01 NaN 182.02 2.05 \n", + "30944 SLVEA2010007 cs137 LVEA01 NaN 116.34 2.79 \n", + "30945 SLVEA2010008 cs137 LVEA01 NaN 94.07 2.61 \n", + "30946 SLVEA2010009 cs137 LVEA01 NaN 69.70 3.12 \n", + "30947 SLVEA2010010 cs137 LVEA01 NaN 59.63 3.40 \n", + "30948 SLVEA2010011 cs137 LVEA01 < 12.24 3.88 \n", + "30949 SLVEA2010012 cs137 LVEA01 < 0.83 NaN \n", + "30950 SLVEA2010013 cs137 LVEA01 NaN 331.61 1.40 \n", + "30951 SLVEA2010014 cs137 LVEA01 NaN 352.06 1.33 \n", + "30952 SLVEA2010015 cs137 LVEA01 NaN 367.11 1.36 \n", + "30953 SLVEA2010016 cs137 LVEA01 NaN 328.97 1.42 \n", + "30954 SLVEA2010017 cs137 LVEA01 NaN 356.30 1.37 \n", + "30955 SLVEA2010018 cs137 LVEA01 NaN 314.75 1.42 \n", + "30956 SLVEA2010019 cs137 LVEA01 NaN 261.64 1.52 \n", + "30957 SLVEA2010020 cs137 LVEA01 NaN 181.00 1.76 \n", + "30958 SLVEA2010021 cs137 LVEA01 NaN 143.65 2.02 \n", + "30959 SLVEA2010022 cs137 LVEA01 NaN 109.36 2.15 \n", + "30960 SLVEA2010023 cs137 LVEA01 NaN 94.12 1.39 \n", + "30961 SLVEA2010024 cs137 LVEA01 NaN 96.63 1.35 \n", + "\n", + " < VALUE_Bq/m² VALUE_Bq/m² ERROR%_m² DATE_OF_ENTRY_x ... LOWSLI \\\n", + "30938 NaN 131.886 41179.0 NaN ... 2.0 \n", + "30939 NaN 132.092 41179.0 NaN ... 4.0 \n", + "30940 NaN 134.390 41179.0 NaN ... 6.0 \n", + "30941 NaN 136.699 41179.0 NaN ... 8.0 \n", + "30942 NaN 104.894 41179.0 NaN ... 10.0 \n", + "30943 NaN 77.523 41179.0 NaN ... 12.0 \n", + "30944 NaN 46.946 41179.0 NaN ... 14.0 \n", + "30945 NaN 38.162 41179.0 NaN ... 16.0 \n", + "30946 NaN 27.444 41179.0 NaN ... 18.0 \n", + "30947 NaN 24.220 41179.0 NaN ... 20.0 \n", + "30948 < 5.035 41179.0 NaN ... 
22.0 \n", + "30949 < 0.330 41179.0 NaN ... 24.0 \n", + "30950 NaN 125.566 41179.0 NaN ... 2.0 \n", + "30951 NaN 144.516 41179.0 NaN ... 4.0 \n", + "30952 NaN 139.434 41179.0 NaN ... 6.0 \n", + "30953 NaN 124.348 41179.0 NaN ... 8.0 \n", + "30954 NaN 135.447 41179.0 NaN ... 10.0 \n", + "30955 NaN 118.765 41179.0 NaN ... 12.0 \n", + "30956 NaN 104.580 41179.0 NaN ... 14.0 \n", + "30957 NaN 74.058 41179.0 NaN ... 16.0 \n", + "30958 NaN 57.680 41179.0 NaN ... 18.0 \n", + "30959 NaN 42.153 41179.0 NaN ... 20.0 \n", + "30960 NaN 35.873 41179.0 NaN ... 22.0 \n", + "30961 NaN 38.864 41179.0 NaN ... 24.0 \n", + "\n", + " AREA SEDI OXIC DW% LOI% MORS_SUBBASIN HELCOM_SUBBASIN \\\n", + "30938 0.0151 5.0 O 0.115 0.9 14.0 14.0 \n", + "30939 0.0151 5.0 A 0.159 0.8 14.0 14.0 \n", + "30940 0.0151 5.0 A 0.189 0.8 14.0 14.0 \n", + "30941 0.0151 5.0 A 0.194 0.8 14.0 14.0 \n", + "30942 0.0151 5.0 A 0.195 0.8 14.0 14.0 \n", + "30943 0.0151 5.0 A 0.221 0.8 14.0 14.0 \n", + "30944 0.0151 5.0 A 0.238 0.8 14.0 14.0 \n", + "30945 0.0151 5.0 A 0.234 0.8 14.0 14.0 \n", + "30946 0.0151 5.0 A 0.242 0.8 14.0 14.0 \n", + "30947 0.0151 5.0 A 0.257 0.7 14.0 14.0 \n", + "30948 0.0151 5.0 A 0.264 0.7 14.0 14.0 \n", + "30949 0.0151 5.0 A 0.244 0.8 14.0 14.0 \n", + "30950 0.0151 5.0 O 0.115 0.9 14.0 14.0 \n", + "30951 0.0151 5.0 A 0.164 0.8 14.0 14.0 \n", + "30952 0.0151 5.0 A 0.191 0.8 14.0 14.0 \n", + "30953 0.0151 5.0 A 0.188 0.8 14.0 14.0 \n", + "30954 0.0151 5.0 A 0.179 0.8 14.0 14.0 \n", + "30955 0.0151 5.0 A 0.186 0.8 14.0 14.0 \n", + "30956 0.0151 5.0 A 0.194 0.8 14.0 14.0 \n", + "30957 0.0151 5.0 A 0.209 0.8 14.0 14.0 \n", + "30958 0.0151 5.0 A 0.214 0.8 14.0 14.0 \n", + "30959 0.0151 5.0 A 0.218 0.8 14.0 14.0 \n", + "30960 0.0151 5.0 A 0.212 0.8 14.0 14.0 \n", + "30961 0.0151 5.0 A 0.217 0.8 14.0 14.0 \n", + "\n", + " SUM_LINK DATE_OF_ENTRY_y \n", + "30938 NaN 11/11/11 00:00:00 \n", + "30939 NaN 11/11/11 00:00:00 \n", + "30940 NaN 11/11/11 00:00:00 \n", + "30941 NaN 11/11/11 00:00:00 \n", + 
"30942 NaN 11/11/11 00:00:00 \n", + "30943 NaN 11/11/11 00:00:00 \n", + "30944 NaN 11/11/11 00:00:00 \n", + "30945 NaN 11/11/11 00:00:00 \n", + "30946 NaN 11/11/11 00:00:00 \n", + "30947 NaN 11/11/11 00:00:00 \n", + "30948 NaN 11/11/11 00:00:00 \n", + "30949 NaN 11/11/11 00:00:00 \n", + "30950 NaN 11/11/11 00:00:00 \n", + "30951 NaN 11/11/11 00:00:00 \n", + "30952 NaN 11/11/11 00:00:00 \n", + "30953 NaN 11/11/11 00:00:00 \n", + "30954 NaN 11/11/11 00:00:00 \n", + "30955 NaN 11/11/11 00:00:00 \n", + "30956 NaN 11/11/11 00:00:00 \n", + "30957 NaN 11/11/11 00:00:00 \n", + "30958 NaN 11/11/11 00:00:00 \n", + "30959 NaN 11/11/11 00:00:00 \n", + "30960 NaN 11/11/11 00:00:00 \n", + "30961 NaN 11/11/11 00:00:00 \n", + "\n", + "[24 rows x 35 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "grp='sediment'\n", + "check_data_sediment=tfm.dfs[grp][(tfm.dfs[grp]['DW%'] < 1) & (tfm.dfs[grp]['DW%'] > 0.001) ]\n", + "check_data_sediment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fe533d7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KEYNUCLIDEMETHOD< VALUE_Bq/kgVALUE_Bq/kgERROR%_kg< VALUE_Bq/m²VALUE_Bq/m²ERROR%_m²DATE_OF_ENTRY_x...LOWSLIAREASEDIOXICDW%LOI%MORS_SUBBASINHELCOM_SUBBASINSUM_LINKDATE_OF_ENTRY_y
9824SERPC1997001cs134NaNNaN3.8020.0NaN5.75NaNNaN...2.00.0085.0A0.00.011.011.0aNaN
9825SERPC1997001cs137NaNNaN389.004.0NaN589.00NaNNaN...2.00.0085.0A0.00.011.011.0aNaN
9826SERPC1997002cs134NaNNaN4.7813.0NaN12.00NaNNaN...4.00.0085.0A0.00.011.011.0aNaN
9827SERPC1997002cs137NaNNaN420.004.0NaN1060.00NaNNaN...4.00.0085.0A0.00.011.011.0aNaN
9828SERPC1997003cs134NaNNaN3.1217.0NaN12.00NaNNaN...6.00.0085.0A0.00.011.011.0aNaN
..................................................................
15257SKRIL1999062th2281NaN68.00NaNNaNNaNNaNNaN...15.00.0060.0O0.00.011.011.0aNaN
15258SKRIL1999063k401NaN1210.00NaNNaNNaNNaNNaN...21.50.0060.0O0.00.011.011.0aNaN
15259SKRIL1999063ra226KRIL01NaN56.50NaNNaNNaNNaNNaN...21.50.0060.0O0.00.011.011.0aNaN
15260SKRIL1999063ra228KRIL01NaN72.20NaNNaNNaNNaNNaN...21.50.0060.0O0.00.011.011.0aNaN
15261SKRIL1999063th2281NaN74.20NaNNaNNaNNaNNaN...21.50.0060.0O0.00.011.011.0aNaN
\n", + "

302 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + " KEY NUCLIDE METHOD < VALUE_Bq/kg VALUE_Bq/kg ERROR%_kg \\\n", + "9824 SERPC1997001 cs134 NaN NaN 3.80 20.0 \n", + "9825 SERPC1997001 cs137 NaN NaN 389.00 4.0 \n", + "9826 SERPC1997002 cs134 NaN NaN 4.78 13.0 \n", + "9827 SERPC1997002 cs137 NaN NaN 420.00 4.0 \n", + "9828 SERPC1997003 cs134 NaN NaN 3.12 17.0 \n", + "... ... ... ... ... ... ... \n", + "15257 SKRIL1999062 th228 1 NaN 68.00 NaN \n", + "15258 SKRIL1999063 k40 1 NaN 1210.00 NaN \n", + "15259 SKRIL1999063 ra226 KRIL01 NaN 56.50 NaN \n", + "15260 SKRIL1999063 ra228 KRIL01 NaN 72.20 NaN \n", + "15261 SKRIL1999063 th228 1 NaN 74.20 NaN \n", + "\n", + " < VALUE_Bq/m² VALUE_Bq/m² ERROR%_m² DATE_OF_ENTRY_x ... LOWSLI \\\n", + "9824 NaN 5.75 NaN NaN ... 2.0 \n", + "9825 NaN 589.00 NaN NaN ... 2.0 \n", + "9826 NaN 12.00 NaN NaN ... 4.0 \n", + "9827 NaN 1060.00 NaN NaN ... 4.0 \n", + "9828 NaN 12.00 NaN NaN ... 6.0 \n", + "... ... ... ... ... ... ... \n", + "15257 NaN NaN NaN NaN ... 15.0 \n", + "15258 NaN NaN NaN NaN ... 21.5 \n", + "15259 NaN NaN NaN NaN ... 21.5 \n", + "15260 NaN NaN NaN NaN ... 21.5 \n", + "15261 NaN NaN NaN NaN ... 21.5 \n", + "\n", + " AREA SEDI OXIC DW% LOI% MORS_SUBBASIN HELCOM_SUBBASIN SUM_LINK \\\n", + "9824 0.008 5.0 A 0.0 0.0 11.0 11.0 a \n", + "9825 0.008 5.0 A 0.0 0.0 11.0 11.0 a \n", + "9826 0.008 5.0 A 0.0 0.0 11.0 11.0 a \n", + "9827 0.008 5.0 A 0.0 0.0 11.0 11.0 a \n", + "9828 0.008 5.0 A 0.0 0.0 11.0 11.0 a \n", + "... ... ... ... ... ... ... ... ... \n", + "15257 0.006 0.0 O 0.0 0.0 11.0 11.0 a \n", + "15258 0.006 0.0 O 0.0 0.0 11.0 11.0 a \n", + "15259 0.006 0.0 O 0.0 0.0 11.0 11.0 a \n", + "15260 0.006 0.0 O 0.0 0.0 11.0 11.0 a \n", + "15261 0.006 0.0 O 0.0 0.0 11.0 11.0 a \n", + "\n", + " DATE_OF_ENTRY_y \n", + "9824 NaN \n", + "9825 NaN \n", + "9826 NaN \n", + "9827 NaN \n", + "9828 NaN \n", + "... ... 
\n", + "15257 NaN \n", + "15258 NaN \n", + "15259 NaN \n", + "15260 NaN \n", + "15261 NaN \n", + "\n", + "[302 rows x 35 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "grp='sediment'\n", + "check_data_sediment=tfm.dfs[grp][(tfm.dfs[grp]['DW%'] == 0) ]\n", + "check_data_sediment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "357222d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KEYNUCLIDEMETHOD< VALUE_Bq/kgVALUE_Bq/kgBASISERROR%NUMBERDATE_OF_ENTRY_xCOUNTRY...BIOTATYPETISSUENOLENGTHWEIGHTDW%LOI%MORS_SUBBASINHELCOM_SUBBASINDATE_OF_ENTRY_y
5971BERPC1997002k40NaNNaN116.00W3.0NaNNaN91.0...F50.00.00.00.00.011.011NaN
5972BERPC1997002cs137NaNNaN12.60W4.0NaNNaN91.0...F50.00.00.00.00.011.011NaN
5973BERPC1997002cs134NaNNaN0.14W18.0NaNNaN91.0...F50.00.00.00.00.011.011NaN
5974BERPC1997001k40NaNNaN116.00W4.0NaNNaN91.0...F50.00.00.00.00.011.011NaN
5975BERPC1997001cs137NaNNaN12.00W4.0NaNNaN91.0...F50.00.00.00.00.011.011NaN
5976BERPC1997001cs134NaNNaN0.21W24.0NaNNaN91.0...F50.00.00.00.00.011.011NaN
\n", + "

6 rows × 33 columns

\n", + "
" + ], + "text/plain": [ + " KEY NUCLIDE METHOD < VALUE_Bq/kg VALUE_Bq/kg BASIS ERROR% \\\n", + "5971 BERPC1997002 k40 NaN NaN 116.00 W 3.0 \n", + "5972 BERPC1997002 cs137 NaN NaN 12.60 W 4.0 \n", + "5973 BERPC1997002 cs134 NaN NaN 0.14 W 18.0 \n", + "5974 BERPC1997001 k40 NaN NaN 116.00 W 4.0 \n", + "5975 BERPC1997001 cs137 NaN NaN 12.00 W 4.0 \n", + "5976 BERPC1997001 cs134 NaN NaN 0.21 W 24.0 \n", + "\n", + " NUMBER DATE_OF_ENTRY_x COUNTRY ... BIOTATYPE TISSUE NO LENGTH \\\n", + "5971 NaN NaN 91.0 ... F 5 0.0 0.0 \n", + "5972 NaN NaN 91.0 ... F 5 0.0 0.0 \n", + "5973 NaN NaN 91.0 ... F 5 0.0 0.0 \n", + "5974 NaN NaN 91.0 ... F 5 0.0 0.0 \n", + "5975 NaN NaN 91.0 ... F 5 0.0 0.0 \n", + "5976 NaN NaN 91.0 ... F 5 0.0 0.0 \n", + "\n", + " WEIGHT DW% LOI% MORS_SUBBASIN HELCOM_SUBBASIN DATE_OF_ENTRY_y \n", + "5971 0.0 0.0 0.0 11.0 11 NaN \n", + "5972 0.0 0.0 0.0 11.0 11 NaN \n", + "5973 0.0 0.0 0.0 11.0 11 NaN \n", + "5974 0.0 0.0 0.0 11.0 11 NaN \n", + "5975 0.0 0.0 0.0 11.0 11 NaN \n", + "5976 0.0 0.0 0.0 11.0 11 NaN \n", + "\n", + "[6 rows x 33 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "grp='biota'\n", + "check_data_sediment=tfm.dfs[grp][(tfm.dfs[grp]['DW%'] == 0) ]\n", + "check_data_sediment" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/handlers/helcom.ipynb b/nbs/handlers/helcom.ipynb index 9150d13..7e47bcb 100644 --- a/nbs/handlers/helcom.ipynb +++ b/nbs/handlers/helcom.ipynb @@ -470,19 +470,19 @@ "output_type": "stream", "text": [ " index value n_chars stripped_chars\n", - "14 14 CS137 9 5\n", - "20 20 SR90 6 4\n", - "31 31 PU238 8 5\n", - "34 34 CS137 6 5\n", - "37 37 K40 8 3\n", - "53 53 SR90 7 4\n", - "54 54 SR90 5 4\n", - "59 59 SR90 8 4\n", - "62 62 CO60 8 4\n", - "69 69 CS134 8 5\n", - "73 73 TC99 7 4\n", - "75 
75 AM241 8 5\n", - "91 91 CS137 8 5\n" + "6 6 TC99 7 4\n", + "16 16 CS137 6 5\n", + "33 33 CS137 9 5\n", + "41 41 CS134 8 5\n", + "43 43 SR90 6 4\n", + "46 46 SR90 5 4\n", + "48 48 K40 8 3\n", + "49 49 PU238 8 5\n", + "64 64 CO60 8 4\n", + "65 65 AM241 8 5\n", + "66 66 CS137 8 5\n", + "83 83 SR90 8 4\n", + "86 86 SR90 7 4\n" ] } ], @@ -612,39 +612,39 @@ " \n", " 0\n", " 0\n", - " pu239240\n", + " sb125\n", " \n", " \n", " 1\n", " 1\n", - " cs144\n", + " ce141\n", " \n", " \n", " 2\n", " 2\n", - " cs141\n", + " gd153\n", " \n", " \n", " 3\n", " 3\n", - " cs140\n", + " ra226\n", " \n", " \n", " 4\n", " 4\n", - " sn117m\n", + " ra228\n", " \n", " \n", "\n", "" ], "text/plain": [ - " index value\n", - "0 0 pu239240\n", - "1 1 cs144\n", - "2 2 cs141\n", - "3 3 cs140\n", - "4 4 sn117m" + " index value\n", + "0 0 sb125\n", + "1 1 ce141\n", + "2 2 gd153\n", + "3 3 ra226\n", + "4 4 ra228" ] }, "execution_count": null, @@ -704,7 +704,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 77/77 [00:02<00:00, 32.60it/s]\n" + "Processing: 100%|██████████| 77/77 [00:01<00:00, 39.99it/s]\n" ] }, { @@ -741,18 +741,6 @@ " \n", " \n", " \n", - " pu239240\n", - " pu240\n", - " pu239240\n", - " 3\n", - " \n", - " \n", - " pu238240\n", - " pu240\n", - " pu238240\n", - " 3\n", - " \n", - " \n", " cm243244\n", " cm244\n", " cm243244\n", @@ -765,9 +753,21 @@ " 3\n", " \n", " \n", - " cs142\n", - " ce144\n", - " cs142\n", + " pu238240\n", + " pu240\n", + " pu238240\n", + " 3\n", + " \n", + " \n", + " pu239240\n", + " pu240\n", + " pu239240\n", + " 3\n", + " \n", + " \n", + " cs143\n", + " cs127\n", + " cs143\n", " 2\n", " \n", " \n", @@ -777,12 +777,24 @@ " 2\n", " \n", " \n", - " cs143\n", - " cs127\n", - " cs143\n", + " cs142\n", + " ce144\n", + " cs142\n", " 2\n", " \n", " \n", + " cs140\n", + " ce140\n", + " cs140\n", + " 1\n", + " \n", + " \n", + " k-40\n", + " k40\n", + " k-40\n", + " 1\n", + " \n", + " \n", " cs144\n", " ce144\n", " cs144\n", @@ 
-795,12 +807,6 @@ " 1\n", " \n", " \n", - " cs140\n", - " ce140\n", - " cs140\n", - " 1\n", - " \n", - " \n", " cs138\n", " cs137\n", " cs138\n", @@ -818,12 +824,6 @@ " cs146\n", " 1\n", " \n", - " \n", - " k-40\n", - " k40\n", - " k-40\n", - " 1\n", - " \n", " \n", "\n", "" @@ -831,20 +831,20 @@ "text/plain": [ " matched_maris_name source_name match_score\n", "source_key \n", - "pu239240 pu240 pu239240 3\n", - "pu238240 pu240 pu238240 3\n", "cm243244 cm244 cm243244 3\n", "cs134137 cs137 cs134137 3\n", - "cs142 ce144 cs142 2\n", - "cs145 cs136 cs145 2\n", + "pu238240 pu240 pu238240 3\n", + "pu239240 pu240 pu239240 3\n", "cs143 cs127 cs143 2\n", + "cs145 cs136 cs145 2\n", + "cs142 ce144 cs142 2\n", + "cs140 ce140 cs140 1\n", + "k-40 k40 k-40 1\n", "cs144 ce144 cs144 1\n", "cs141 ce141 cs141 1\n", - "cs140 ce140 cs140 1\n", "cs138 cs137 cs138 1\n", "cs139 ce139 cs139 1\n", - "cs146 cs136 cs146 1\n", - "k-40 k40 k-40 1" + "cs146 cs136 cs146 1" ] }, "execution_count": null, @@ -910,7 +910,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 77/77 [00:01<00:00, 47.36it/s]\n" + "Processing: 100%|██████████| 77/77 [00:01<00:00, 51.81it/s]\n" ] } ], @@ -2085,7 +2085,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 46/46 [00:07<00:00, 6.41it/s]\n" + "Processing: 100%|██████████| 46/46 [00:06<00:00, 6.81it/s]\n" ] }, { @@ -2240,7 +2240,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 46/46 [00:07<00:00, 5.79it/s]\n" + "Processing: 0%| | 0/46 [00:00\n", " 0\n", " 0\n", - " n\n", + " NaN\n", " \n", " \n", " 1\n", " 1\n", - " NaN\n", + " F\n", " \n", " \n", " 2\n", " 2\n", - " N\n", + " n\n", " \n", " \n", " 3\n", " 3\n", - " F\n", + " N\n", " \n", " \n", "\n", @@ -4033,10 +4033,10 @@ ], "text/plain": [ " index value\n", - "0 0 n\n", - "1 1 NaN\n", - "2 2 N\n", - "3 3 F" + "0 0 NaN\n", + "1 1 F\n", + "2 2 n\n", + "3 3 N" ] }, "execution_count": null, @@ 
-4286,7 +4286,7 @@ "id": "fe0fb210", "metadata": {}, "source": [ - "## Add masurement note" + "## Add measurement note" ] }, { @@ -4719,11 +4719,94 @@ "source": [ ":::{.callout-tip}\n", "\n", - "**FEEDBACK TO DATA PROVIDER**: Geographical coordinates are provided in both decimal degree and degree-minute formats. Some coordinates are missing the decimal format and obliged us to use the degree-minute format with less precision.\n", + "**FEEDBACK TO DATA PROVIDER**: \n", + "\n", + "- Geographical coordinates are provided in both decimal degree and degree-minute formats. Some coordinates are missing the decimal format and obliged us to use the degree-minute format with less precision.\n", + "- Also note that latitude values have `,` as decimal separator while longitude values have `.` as decimal separator (see below)\n", "\n", ":::" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "484b281b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LATITUDE (ddmmmm)LATITUDE (dddddd)
059.40059,6667
159.40059,6667
259.51659,86
359.51659,86
459.51659,86
\n", + "
" + ], + "text/plain": [ + " LATITUDE (ddmmmm) LATITUDE (dddddd)\n", + "0 59.400 59,6667\n", + "1 59.400 59,6667\n", + "2 59.516 59,86\n", + "3 59.516 59,86\n", + "4 59.516 59,86" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs['sediment'][['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)']].head()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -5211,6 +5294,7 @@ " ) -> dict: # Common renaming rules for NetCDF and OpenRefine.\n", " \"Get common renaming rules for NetCDF and OpenRefine.\"\n", " common = {\n", + " 'KEY': 'key',\n", " 'lat': 'latitude' if encoding_type == 'openrefine' else vars['defaults']['lat']['name'],\n", " 'lon': 'longitude' if encoding_type == 'openrefine' else vars['defaults']['lon']['name'],\n", " 'time': 'begperiod' if encoding_type == 'openrefine' else vars['defaults']['time']['name'],\n", @@ -5267,6 +5351,8 @@ " },\n", " 'sediment': {\n", " 'sed_type': vars['sed']['sed_type']['name'],\n", + " 'top': vars['sed']['top']['name'],\n", + " 'bottom': vars['sed']['bottom']['name'],\n", " }\n", " }\n", " elif encoding_type == 'openrefine':\n", @@ -5409,15 +5495,15 @@ "output_type": "stream", "text": [ "seawater columns:\n", - "Index(['lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", + "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", " 'smp_depth', 'tot_depth', '_sal', '_temp'],\n", " dtype='object')\n", "sediment columns:\n", - "Index(['lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", - " 'tot_depth', 'sed_type'],\n", + "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", + " 'tot_depth', 'sed_type', 'top', 'bottom'],\n", " dtype='object')\n", "biota columns:\n", - "Index(['lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", + "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", " 'smp_depth', 
'species', 'body_part', 'bio_group'],\n", " dtype='object')\n" ] @@ -5434,9 +5520,9 @@ " EncodeTimeCB(cfg()),\n", " SanitizeValue(coi_val), \n", " NormalizeUncCB(),\n", - " RemapBiotaSpeciesCB(lut_biota),\n", - " RemapBiotaBodyPartCB(lut_tissues),\n", - " RemapBiogroupCB(lut_biogroup),\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", " RemapTaxonInformationCB(lut_taxon),\n", " RemapSedimentCB(lut_sediments),\n", " RemapUnitCB(),\n", @@ -5459,6 +5545,159 @@ " print(tfm.dfs[grp].columns)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a941172", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keylatlontimenuclide_dl_unitvalue_unctot_depthsed_typetopbottom
0SKRIL201204859.666724.00001339891200ra2261435.09.1071.0015.020.0
1SKRIL201204959.666724.00001339891200ra2261436.07.9271.0020.027.0
2SKRIL201205059.860028.84331344556800ra2261438.09.1223.000.02.0
3SKRIL201205159.860028.84331344556800ra2261436.09.0023.002.04.0
4SKRIL201205259.860028.84331344556800ra2261430.06.9023.004.06.0
\n", + "
" + ], + "text/plain": [ + " key lat lon time nuclide _dl _unit value \\\n", + "0 SKRIL2012048 59.6667 24.0000 1339891200 ra226 1 4 35.0 \n", + "1 SKRIL2012049 59.6667 24.0000 1339891200 ra226 1 4 36.0 \n", + "2 SKRIL2012050 59.8600 28.8433 1344556800 ra226 1 4 38.0 \n", + "3 SKRIL2012051 59.8600 28.8433 1344556800 ra226 1 4 36.0 \n", + "4 SKRIL2012052 59.8600 28.8433 1344556800 ra226 1 4 30.0 \n", + "\n", + " _unc tot_depth sed_type top bottom \n", + "0 9.10 71.0 0 15.0 20.0 \n", + "1 7.92 71.0 0 20.0 27.0 \n", + "2 9.12 23.0 0 0.0 2.0 \n", + "3 9.00 23.0 0 2.0 4.0 \n", + "4 6.90 23.0 0 4.0 6.0 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "result = tfm.dfs['sediment']; result.head()" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -5483,30 +5722,26 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "seawater columns:\n", - "Index(['smp_depth', 'tot_depth', 'lon', 'time', 'lat', 'ag110m_dl', 'am241_dl',\n", - " 'ba140_dl', 'ce144_dl', 'cm242_dl',\n", - " ...\n", - " 'pu240', 'ru103', 'ru106', 'sb125', 'sr89', 'sr90', 'tc99', 'u234',\n", - " 'u238', 'zr95'],\n", - " dtype='object', length=175)\n", - "sediment columns:\n", - "Index(['tot_depth', 'lon', 'sed_type', 'time', 'lat', 'ac228_dl', 'ag110m_dl',\n", - " 'am241_dl', 'ba140_dl', 'be7_dl',\n", - " ...\n", - " 'sb124', 'sb125', 'sr90', 'th228', 'th232', 'th234', 'tl208', 'u235',\n", - " 'zn65', 'zr95'],\n", - " dtype='object', length=177)\n", - "biota columns:\n", - "Index(['body_part', 'smp_depth', 'lon', 'time', 'lat', 'bio_group', 'species',\n", - " 'ac228_dl', 'ag108m_dl', 'ag110m_dl',\n", - " ...\n", - " 'sr89', 'sr90', 'tc99', 'te129m', 'th228', 'th232', 'tl208', 'u235',\n", - " 'zn65', 'zr95'],\n", - " dtype='object', length=211)\n" + "ename": "ValueError", + "evalue": "Must produce aggregated value", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn [98], line 30\u001b[0m\n\u001b[1;32m 2\u001b[0m dfs \u001b[38;5;241m=\u001b[39m load_data(fname_in)\n\u001b[1;32m 3\u001b[0m tfm \u001b[38;5;241m=\u001b[39m Transformer(dfs, cbs\u001b[38;5;241m=\u001b[39m[AddSampleTypeIdColumnCB(),\n\u001b[1;32m 4\u001b[0m LowerStripNameCB(col_src\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNUCLIDE\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[1;32m 5\u001b[0m RemapNuclideNameCB(lut_nuclides),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 27\u001b[0m ReshapeLongToWide()\n\u001b[1;32m 28\u001b[0m ])\n\u001b[0;32m---> 30\u001b[0m \u001b[43mtfm\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m grp \u001b[38;5;129;01min\u001b[39;00m tfm\u001b[38;5;241m.\u001b[39mdfs\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mgrp\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m columns:\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", + "File \u001b[0;32m~/pro/IAEA/MARIS/marisco/marisco/callbacks.py:70\u001b[0m, in \u001b[0;36mTransformer.__call__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 69\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTransform the dataframe(s) according to the specified callbacks.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 70\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcbs: 
\u001b[43mrun_cbs\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcbs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 71\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdfs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdfs\n", + "File \u001b[0;32m~/pro/IAEA/MARIS/marisco/marisco/callbacks.py:38\u001b[0m, in \u001b[0;36mrun_cbs\u001b[0;34m(cbs, obj)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m cb \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28msorted\u001b[39m(cbs, key\u001b[38;5;241m=\u001b[39mattrgetter(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124morder\u001b[39m\u001b[38;5;124m'\u001b[39m)):\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m cb\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__doc__\u001b[39m: obj\u001b[38;5;241m.\u001b[39mlogs\u001b[38;5;241m.\u001b[39mappend(cb\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__doc__\u001b[39m)\n\u001b[0;32m---> 38\u001b[0m \u001b[43mcb\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/pro/IAEA/MARIS/marisco/marisco/callbacks.py:266\u001b[0m, in \u001b[0;36mReshapeLongToWide.__call__\u001b[0;34m(self, tfm)\u001b[0m\n\u001b[1;32m 264\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, tfm):\n\u001b[1;32m 265\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m grp \u001b[38;5;129;01min\u001b[39;00m tfm\u001b[38;5;241m.\u001b[39mdfs\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[0;32m--> 266\u001b[0m tfm\u001b[38;5;241m.\u001b[39mdfs[grp] \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtfm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdfs\u001b[49m\u001b[43m[\u001b[49m\u001b[43mgrp\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 267\u001b[0m tfm\u001b[38;5;241m.\u001b[39mdfs[grp]\u001b[38;5;241m.\u001b[39mcolumns \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrenamed_cols(tfm\u001b[38;5;241m.\u001b[39mdfs[grp]\u001b[38;5;241m.\u001b[39mcolumns)\n", + "File \u001b[0;32m~/pro/IAEA/MARIS/marisco/marisco/callbacks.py:247\u001b[0m, in \u001b[0;36mReshapeLongToWide.pivot\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 243\u001b[0m idx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(df\u001b[38;5;241m.\u001b[39mcolumns) \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mset\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns \u001b[38;5;241m+\u001b[39m derived_coi \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalues))\n\u001b[1;32m 245\u001b[0m df, num_fill_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fill_nan_values(df, idx)\n\u001b[0;32m--> 247\u001b[0m pivot_df \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43midx\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 248\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 249\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mderived_coi\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 250\u001b[0m \u001b[43m \u001b[49m\u001b[43mfill_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnan\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 251\u001b[0m \u001b[43m \u001b[49m\u001b[43maggfunc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mreset_index()\n\u001b[1;32m 253\u001b[0m pivot_df[idx] \u001b[38;5;241m=\u001b[39m pivot_df[idx]\u001b[38;5;241m.\u001b[39mreplace({\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_fill_value: np\u001b[38;5;241m.\u001b[39mnan, num_fill_value: np\u001b[38;5;241m.\u001b[39mnan})\n\u001b[1;32m 254\u001b[0m pivot_df \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mset_index(pivot_df)\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/frame.py:9482\u001b[0m, in \u001b[0;36mDataFrame.pivot_table\u001b[0;34m(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)\u001b[0m\n\u001b[1;32m 9465\u001b[0m \u001b[38;5;129m@Substitution\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 9466\u001b[0m \u001b[38;5;129m@Appender\u001b[39m(_shared_docs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpivot_table\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 9467\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpivot_table\u001b[39m(\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 9478\u001b[0m sort: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 9479\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame:\n\u001b[1;32m 
9480\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreshape\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpivot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pivot_table\n\u001b[0;32m-> 9482\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpivot_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 9483\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9484\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9485\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9486\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9487\u001b[0m \u001b[43m \u001b[49m\u001b[43maggfunc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maggfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9488\u001b[0m \u001b[43m \u001b[49m\u001b[43mfill_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfill_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9489\u001b[0m \u001b[43m \u001b[49m\u001b[43mmargins\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmargins\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9490\u001b[0m \u001b[43m \u001b[49m\u001b[43mdropna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9491\u001b[0m \u001b[43m \u001b[49m\u001b[43mmargins_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmargins_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9492\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mobserved\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobserved\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9493\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9494\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/reshape/pivot.py:102\u001b[0m, in \u001b[0;36mpivot_table\u001b[0;34m(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)\u001b[0m\n\u001b[1;32m 99\u001b[0m table \u001b[38;5;241m=\u001b[39m concat(pieces, keys\u001b[38;5;241m=\u001b[39mkeys, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m table\u001b[38;5;241m.\u001b[39m__finalize__(data, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpivot_table\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 102\u001b[0m table \u001b[38;5;241m=\u001b[39m \u001b[43m__internal_pivot_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 103\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 104\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 105\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 106\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 107\u001b[0m \u001b[43m \u001b[49m\u001b[43maggfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 108\u001b[0m \u001b[43m \u001b[49m\u001b[43mfill_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 109\u001b[0m \u001b[43m \u001b[49m\u001b[43mmargins\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 110\u001b[0m \u001b[43m \u001b[49m\u001b[43mdropna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 111\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mmargins_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 112\u001b[0m \u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 113\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 114\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m table\u001b[38;5;241m.\u001b[39m__finalize__(data, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpivot_table\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/reshape/pivot.py:183\u001b[0m, in \u001b[0;36m__internal_pivot_table\u001b[0;34m(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m observed \u001b[38;5;129;01mis\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mno_default \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\n\u001b[1;32m 174\u001b[0m ping\u001b[38;5;241m.\u001b[39m_passed_categorical \u001b[38;5;28;01mfor\u001b[39;00m ping \u001b[38;5;129;01min\u001b[39;00m grouped\u001b[38;5;241m.\u001b[39m_grouper\u001b[38;5;241m.\u001b[39mgroupings\n\u001b[1;32m 175\u001b[0m ):\n\u001b[1;32m 176\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 177\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe default value of observed=False is deprecated and will change \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto observed=True in a future version of pandas. 
Specify \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 181\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[1;32m 182\u001b[0m )\n\u001b[0;32m--> 183\u001b[0m agged \u001b[38;5;241m=\u001b[39m \u001b[43mgrouped\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43maggfunc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dropna \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(agged, ABCDataFrame) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(agged\u001b[38;5;241m.\u001b[39mcolumns):\n\u001b[1;32m 186\u001b[0m agged \u001b[38;5;241m=\u001b[39m agged\u001b[38;5;241m.\u001b[39mdropna(how\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mall\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/groupby/generic.py:1466\u001b[0m, in \u001b[0;36mDataFrameGroupBy.aggregate\u001b[0;34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1463\u001b[0m \u001b[38;5;66;03m# grouper specific aggregations\u001b[39;00m\n\u001b[1;32m 1464\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_grouper\u001b[38;5;241m.\u001b[39mnkeys \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 1465\u001b[0m \u001b[38;5;66;03m# test_groupby_as_index_series_scalar gets here with 'not self.as_index'\u001b[39;00m\n\u001b[0;32m-> 1466\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_python_agg_general\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1467\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m args \u001b[38;5;129;01mor\u001b[39;00m kwargs:\n\u001b[1;32m 1468\u001b[0m \u001b[38;5;66;03m# test_pass_args_kwargs gets here (with and without as_index)\u001b[39;00m\n\u001b[1;32m 1469\u001b[0m \u001b[38;5;66;03m# can't return early\u001b[39;00m\n\u001b[1;32m 1470\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_aggregate_frame(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/groupby/generic.py:1532\u001b[0m, in \u001b[0;36mDataFrameGroupBy._python_agg_general\u001b[0;34m(self, func, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m output: \u001b[38;5;28mdict\u001b[39m[\u001b[38;5;28mint\u001b[39m, ArrayLike] \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m idx, (name, ser) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(obj\u001b[38;5;241m.\u001b[39mitems()):\n\u001b[0;32m-> 1532\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_grouper\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg_series\u001b[49m\u001b[43m(\u001b[49m\u001b[43mser\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1533\u001b[0m output[idx] \u001b[38;5;241m=\u001b[39m result\n\u001b[1;32m 1535\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj\u001b[38;5;241m.\u001b[39m_constructor(output)\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/groupby/ops.py:863\u001b[0m, in \u001b[0;36mBaseGrouper.agg_series\u001b[0;34m(self, 
obj, func, preserve_dtype)\u001b[0m\n\u001b[1;32m 856\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39m_values, np\u001b[38;5;241m.\u001b[39mndarray):\n\u001b[1;32m 857\u001b[0m \u001b[38;5;66;03m# we can preserve a little bit more aggressively with EA dtype\u001b[39;00m\n\u001b[1;32m 858\u001b[0m \u001b[38;5;66;03m# because maybe_cast_pointwise_result will do a try/except\u001b[39;00m\n\u001b[1;32m 859\u001b[0m \u001b[38;5;66;03m# with _from_sequence. NB we are assuming here that _from_sequence\u001b[39;00m\n\u001b[1;32m 860\u001b[0m \u001b[38;5;66;03m# is sufficiently strict that it casts appropriately.\u001b[39;00m\n\u001b[1;32m 861\u001b[0m preserve_dtype \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 863\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_aggregate_series_pure_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 865\u001b[0m npvalues \u001b[38;5;241m=\u001b[39m lib\u001b[38;5;241m.\u001b[39mmaybe_convert_objects(result, try_float\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m preserve_dtype:\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/groupby/ops.py:889\u001b[0m, in \u001b[0;36mBaseGrouper._aggregate_series_pure_python\u001b[0;34m(self, obj, func)\u001b[0m\n\u001b[1;32m 885\u001b[0m res \u001b[38;5;241m=\u001b[39m extract_result(res)\n\u001b[1;32m 887\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m initialized:\n\u001b[1;32m 888\u001b[0m \u001b[38;5;66;03m# We only do this validation on the first iteration\u001b[39;00m\n\u001b[0;32m--> 889\u001b[0m 
\u001b[43mcheck_result_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mres\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgroup\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 890\u001b[0m initialized \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 892\u001b[0m result[i] \u001b[38;5;241m=\u001b[39m res\n", + "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/groupby/ops.py:88\u001b[0m, in \u001b[0;36mcheck_result_array\u001b[0;34m(obj, dtype)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj, np\u001b[38;5;241m.\u001b[39mndarray):\n\u001b[1;32m 85\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[1;32m 86\u001b[0m \u001b[38;5;66;03m# If it is object dtype, the function can be a reduction/aggregation\u001b[39;00m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;66;03m# and still return an ndarray e.g. 
test_agg_over_numpy_arrays\u001b[39;00m\n\u001b[0;32m---> 88\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMust produce aggregated value\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mValueError\u001b[0m: Must produce aggregated value" ] } ], diff --git a/nbs/metadata/.notest b/nbs/metadata/.notest new file mode 100644 index 0000000..e69de29 diff --git a/nbs/metadata/field-definition.ipynb b/nbs/metadata/field-definition.ipynb new file mode 100644 index 0000000..eafdcee --- /dev/null +++ b/nbs/metadata/field-definition.ipynb @@ -0,0 +1,1471 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "416a6a41", + "metadata": {}, + "source": [ + "# Field definitions" + ] + }, + { + "cell_type": "markdown", + "id": "5709cfb6", + "metadata": {}, + "source": [ + "The MARIS data is converted to a standardized CSV format for importing into the MARIS database using Open Refine. The standardized variable names for Open Refine are provided in Table 1, and a detailed description of each variable is given below. Additionally, MARIS data is available in NetCDF4 format. The standardized variable names for NetCDF4 are also provided in Table 1, with descriptions for each variable included below." 
+ ] + }, + { + "cell_type": "markdown", + "id": "777372a6", + "metadata": {}, + "source": [ + "## Table 1 Standardized Variable Names (Open Refine and NetCDF)" + ] + }, + { + "cell_type": "markdown", + "id": "2734b95b", + "metadata": {}, + "source": [ + "| Friendly Name | Open Refine Name | NetCDF Name |\n", + "|------------------------|-------------------|----------------------------------------|\n", + "| Sample quality | sampquality | _ |\n", + "| Sample type ID | samptype_id | *Sample type included as netcdf.group* |\n", + "| Laboratory ID | lab_id | _ |\n", + "| Latitude | latitude | lat |\n", + "| Longitude | longitude | lon |\n", + "| Station | station | _ |\n", + "| Sample lab code | samplabcode | _ |\n", + "| Profile ID | profile_id | _ |\n", + "| Transect ID | transect_id | _ |\n", + "| Sampling depth | sampdepth | smp_depth |\n", + "| Total depth | totdepth | tot_depth |\n", + "| Begin period | begperiod | time |\n", + "| End period | endperiod | _ |\n", + "| Nuclide ID | nuclide_id | nuclide |\n", + "| Detection | detection | detection_limit |\n", + "| Activity | activity | value |\n", + "| Uncertainty | uncertaint | uncertainty |\n", + "| Unit ID | unit_id | unit |\n", + "| Variable type | vartype | _ |\n", + "| Frequency | freq | _ |\n", + "| Range low detection | rl_detection | _ |\n", + "| Range low | rangelow | _ |\n", + "| Range upper | rangeupp | _ |\n", + "| Species ID | species_id | species |\n", + "| Biological group | _ | bio_group |\n", + "| Taxon name | Taxonname | _ |\n", + "| Taxon reported name | TaxonRepName | _ |\n", + "| Common name | Commonname | _ |\n", + "| Taxon rank | Taxonrank | _ |\n", + "| Taxon database | TaxonDB | _ |\n", + "| Taxon database ID | TaxonDBID | _ |\n", + "| Taxon database URL | TaxonDBURL | _ |\n", + "| Body part ID | bodypar_id | body_part |\n", + "| Slice up | sliceup | _ |\n", + "| Slice down | slicedown | _ |\n", + "| Sediment type ID | sedtype_id | sed_type |\n", + "| Sediment reported name | SedRepName | _ 
|\n", + "| Volume | volume | _ |\n", + "| Salinity | salinity | salinity |\n", + "| Temperature | temperatur | temperature |\n", + "| Filtered | filtered | filtered |\n", + "| Filter pore | filtpore | _ |\n", + "| Acidified | acid | _ |\n", + "| Oxygen | oxygen | _ |\n", + "| Sample area | samparea | _ |\n", + "| Dry weight | drywt | _ |\n", + "| Wet weight | wetwt | _ |\n", + "| Percent weight | percentwt | _ |\n", + "| Sampling method ID | sampmet_id | sampling_method |\n", + "| Drying method ID | drymet_id | _ |\n", + "| Preparation method ID | prepmet_id | preparation_method |\n", + "| Counting method ID | counmet_id | counting_method |\n", + "| Reference ID | ref_id | _ |\n", + "| Reference note | refnote | _ |\n", + "| Sample note | sampnote | _ |\n", + "| Measurement note | measurenote | _ |\n", + "| Good for export | gfe | _ |" + ] + }, + { + "cell_type": "markdown", + "id": "0f727972", + "metadata": {}, + "source": [ + "\n", + "| Friendly Name | Open Refine Name | NetCDF Name |\n", + "| ---------------------- | ---------------- | -------------------------------------- |\n", + "| Sample quality | sampquality | |\n", + "| Sample type ID | samptype_id | *Sample type included as netcdf.group* |\n", + "| Laboratory ID | lab_id | |\n", + "| Latitude | latitude | lat |\n", + "| Longitude | longitude | lon |\n", + "| Station | station | |\n", + "| Sample lab code | samplabcode | |\n", + "| Profile ID | profile_id | |\n", + "| Transect ID | transect_id | |\n", + "| Sampling depth | sampdepth | smp_depth |\n", + "| Total depth | totdepth | tot_depth |\n", + "| Begin period | begperiod | time |\n", + "| End period | endperiod | |\n", + "| Nuclide ID | nuclide_id | nuclide |\n", + "| Detection | detection | detection_limit |\n", + "| Activity | activity | value |\n", + "| Uncertainty | uncertaint | uncertainty |\n", + "| Unit ID | unit_id | unit |\n", + "| Variable type | vartype | |\n", + "| Frequency | freq | |\n", + "| Range low detection | rl_detection | |\n", + 
"| Range low | rangelow | |\n", + "| Range upper | rangeupp | |\n", + "| Species ID | species_id | species |\n", + "| Biological group | | bio_group |\n", + "| Taxon name | Taxonname | |\n", + "| Taxon reported name | TaxonRepName | |\n", + "| Common name | Commonname | |\n", + "| Taxon rank | Taxonrank | |\n", + "| Taxon database | TaxonDB | |\n", + "| Taxon database ID | TaxonDBID | |\n", + "| Taxon database URL | TaxonDBURL | |\n", + "| Body part ID | bodypar_id | body_part |\n", + "| Slice up | sliceup | |\n", + "| Slice down | slicedown | |\n", + "| Sediment type ID | sedtype_id | sed_type |\n", + "| Sediment reported name | SedRepName | |\n", + "| Volume | volume | |\n", + "| Salinity | salinity | salinity |\n", + "| Temperature | temperatur | temperature |\n", + "| Filtered | filtered | filtered |\n", + "| Filter pore | filtpore | |\n", + "| Acidified | acid | |\n", + "| Oxygen | oxygen | |\n", + "| Sample area | samparea | |\n", + "| Dry weight | drywt | |\n", + "| Wet weight | wetwt | |\n", + "| Percent weight | percentwt | |\n", + "| Sampling method ID | sampmet_id | sampling_method |\n", + "| Drying method ID | drymet_id | |\n", + "| Preparation method ID | prepmet_id | preparation_method |\n", + "| Counting method ID | counmet_id | counting_method |\n", + "| Reference ID | ref_id | |\n", + "| Reference note | refnote | |\n", + "| Sample note | sampnote | |\n", + "| Measurement note | measurenote | |\n", + "| Good for export | gfe | |\n" + ] + }, + { + "cell_type": "markdown", + "id": "464bac4f", + "metadata": {}, + "source": [ + "## Variable Descriptions\n", + "\n", + "## Sample Quality \n", + "\n", + "### Description: \n", + "Defines the quality of the sample. Examples include: Good (G), Caution (C), Fail (F).\n", + "\n", + "### Lookup Table (LUT) in use: \n", + "No. 
\n", + "\n", + "### Open Refine Variable Name: \n", + "`sampquality`\n", + "\n", + "### Open Refine Data Type: \n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF \n", + "\n", + "### NetCDF Data Type: \n", + "Not included in NetCDF\n", + "\n", + "---\n", + "\n", + "## Sample Type ID\n", + "\n", + "### Description:\n", + "In MARIS, samples are categorized by type into WATER, BIOTA, SEDIMENT, and SUSPENDED types. The NetCDF data format separates MARIS data into ‘NetCDF groups’ by the sample type. Open Refine formats MARIS data into separate CSV files by sample type.\n", + "\n", + "- **SEAWATER** includes seawater and brackish water.\n", + "- **BIOTA** includes various types of biota.\n", + "- **SEDIMENT** includes various types of sediments.\n", + "- **SUSPENDED** includes various types of suspended matter.\n", + "\n", + "### Lookup Table (LUT) in use: \n", + "No.\n", + "\n", + "### Open Refine Variable Name:\n", + "`samptype_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value :\n", + " - 1 : ``SEAWATER``\n", + " - 2 : ``BIOTA``\n", + " - 3 : ``SEDIMENT``\n", + " - 4 : ``SUSPENDED``\n", + "\n", + "### NetCDF Variable Name:\n", + "*Sample type is included as netcdf.group*\n", + "\n", + "### NetCDF Data Type:\n", + "string\n", + "\n", + "---\n", + "\n", + "## Laboratory ID\n", + "\n", + "### Description:\n", + "The Laboratory ID identifies the laboratory that processed the sample.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_lab.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_lab.xlsx)\n", + "\n", + "### Open Refine Variable Name:\n", + "`lab_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Latitude Decimal\n", + "\n", + "### Description:\n", + 
"Latitude in decimal format (DDD.DDDDD°) with ranges from -90° to 90°.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`latitude`\n", + "\n", + "### Open Refine Data Type:\n", + "Float with values between -90° to 90°.\n", + "\n", + "### NetCDF Variable Name:\n", + "`lat`\n", + "\n", + "### NetCDF Data Type:\n", + "Float with values between -90° to 90°.\n", + "\n", + "---\n", + "\n", + "## Longitude Decimal\n", + "\n", + "### Description:\n", + "Longitude in decimal format (DDD.DDDDD°) with ranges from -180° to 180°.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`longitude`\n", + "\n", + "### Open Refine Data Type:\n", + "Float with values between -180° to 180°.\n", + "\n", + "### NetCDF Variable Name:\n", + "`lon`\n", + "\n", + "### NetCDF Data Type:\n", + "Float with values between -180° to 180°.\n", + "\n", + "---\n", + "\n", + "## Station\n", + "\n", + "### Description:\n", + "The name of the station where the sample was collected.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No.\n", + "\n", + "### Open Refine Variable Name:\n", + "`station`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Sample Lab Code\n", + "\n", + "### Description:\n", + "The data provider's sample laboratory code should be stored exactly as provided, without any modifications.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No.\n", + "\n", + "### Open Refine Variable Name:\n", + "`samplabcode`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Profile ID\n", + "\n", + "### Description:\n", + 
"Profile ID is provided as is by the data provider and is an identifier for linking data which are part of a sequence, i.e., a vertical profile.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No.\n", + "\n", + "### Open Refine Variable Name:\n", + "`profile_id`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not defined in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not defined in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Transect ID\n", + "\n", + "### Description:\n", + "Transect ID is provided as is by the data provider and is an identifier for linking data which are part of a sequence, i.e., a horizontal transect.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No.\n", + "\n", + "### Open Refine Variable Name:\n", + "`transect_id`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not defined in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not defined in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Sampling Depth\n", + "\n", + "### Description:\n", + "Depth from the water surface in meters at which the sample was taken. A value of \"0\" indicates that the sample was collected at the surface. 
A value of \"-1\" indicates that sample depth information is not available.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`sampdepth`\n", + "\n", + "### Open Refine Data Type:\n", + "Float\n", + "\n", + "### NetCDF Variable Name:\n", + "`smp_depth`\n", + "\n", + "### NetCDF Data Type:\n", + "Float\n", + "\n", + "---\n", + "\n", + "## Total Depth\n", + "\n", + "### Description:\n", + "Total water column depth in meters (m).\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`totdepth`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "`tot_depth`\n", + "\n", + "### NetCDF Data Type:\n", + "float\n", + "\n", + "---\n", + "\n", + "## Begin Period\n", + "\n", + "### Description:\n", + "'Begin Period' refers to the date when the collection of sample(s) began. If only a year is provided in the dataset, set the date to January 1st of that year (e.g., 2024 becomes 2024-01-01). If both a year and month are provided, set the date to the first day of that month (e.g., May 2024 becomes 2024-05-01). Date format of yyyy-mm-dd.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`begperiod`\n", + "\n", + "### Open Refine Data Type:\n", + "DATETIME string in the format (yyyy-mm-dd hh:mm:ss)\n", + "\n", + "### NetCDF Variable Name:\n", + "`time`\n", + "\n", + "### NetCDF Data Type:\n", + "DATETIME string in the format (yyyy-mm-dd hh:mm:ss)\n", + "\n", + "---\n", + "\n", + "## End Period\n", + "\n", + "### Description:\n", + "'End Period' refers to the date when the collection of sample(s) ended. If only a year is provided in the dataset, set the date to January 1st of that year (e.g., 2024 becomes 2024-01-01). If both a year and month are provided, set the date to the first day of that month (e.g., May 2024 becomes 2024-05-01). 
Date format of yyyy-mm-dd.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`endperiod`\n", + "\n", + "### Open Refine Data Type:\n", + "DATETIME string in the format (yyyy-mm-dd hh:mm:ss)\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Nuclide ID\n", + "\n", + "### Description:\n", + "Identifier for a specific nuclide (isotope) within the MARIS database.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_nuclide.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_nuclide.xlsx)\n", + "\n", + "### Open Refine Variable Name:\n", + "`nuclide_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "`nuclide`\n", + "\n", + "### NetCDF Data Type:\n", + "string\n", + "\n", + "---\n", + "\n", + "## Detection Limit\n", + "\n", + "### Description:\n", + "The detection limit variable describes the Activity variable as follows:\n", + "\n", + "- '<': The reported value for the Activity variable is the Minimum Detectable Activity (MDA) or ISO 11929 detection limit.\n", + "- '=': The Activity variable represents the measured value, and an associated uncertainty should be provided.\n", + "- 'ND': Indicates that neither an activity value nor an MDA (or detection limit) is reported.\n", + "- 'DE': When the reported Activity variable is an aggregation of multiple samples, the detection limit variable is defined as Derived (DE); see Variable Type for more information on the aggregation of reported activity. 
\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_detectlimit.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_detectlimit.xlsx).\n", + "\n", + "### Open Refine Variable Name:\n", + "`detection`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "`detection_limit`\n", + "\n", + "### NetCDF Data Type:\n", + "An integer value (the 'id' defined in the LUT).\n", + "\n", + "---\n", + "\n", + "## Activity\n", + "\n", + "### Description:\n", + "The measured activity value or MDA for the nuclide reported. Several variables are used to describe the Activity variable, including:\n", + "\n", + "**Nuclide ID**: Describes the nuclide for which the activity is reported.\n", + "**Detection Limit**: Indicates whether the reported Activity value is a measured activity value or below the detection limit.\n", + "**Unit ID**: Describes the unit associated with the reported Activity variable.\n", + "**Uncertainty**: The associated uncertainty of the Activity variable.\n", + "**Variable Type**: Describes whether the reported Activity variable is an aggregate, sum, mean, median, etc.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No.\n", + "\n", + "### Open Refine Variable Name:\n", + "`activity`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "`value`\n", + "\n", + "### NetCDF Data Type:\n", + "float\n", + "\n", + "---\n", + "\n", + "## Uncertainty\n", + "\n", + "### Description:\n", + "The uncertainty associated with the measurement of the activity must be reported as a 1 sigma (k=1) measurement uncertainty. 
This uncertainty should be expressed in the same units as the activity variable.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`uncertaint`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "`uncertainty`\n", + "\n", + "### NetCDF Data Type:\n", + "float\n", + "\n", + "---\n", + "\n", + "## Unit ID\n", + "\n", + "### Description:\n", + "Represents the ID value from the Lookup Table (LUT) corresponding to the unit of measurement for both the activity variable and the uncertainty variable (if applicable). For seawater measurements, ensure that the unit is converted from 'Bq L⁻¹' to 'Bq m⁻³', and use the corresponding LUT value for 'Bq m⁻³'.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_unit.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_unit.xlsx).\n", + "\n", + "### Open Refine Variable Name:\n", + "`unit_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "`unit`\n", + "\n", + "### NetCDF Data Type:\n", + "An integer value (the 'id' defined in the LUT).\n", + "\n", + "---\n", + "\n", + "## Variable Type\n", + "\n", + "### Description:\n", + "Describes the type of aggregation applied to the measurements if they are not reported as individual values. 
Possible values include:\n", + "\n", + "- AM: Arithmetic Mean\n", + "- GM: Geometric Mean\n", + "- MED: Median\n", + "- MAX: Maximum\n", + "- MIN: Minimum\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`vartype`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Frequency\n", + "\n", + "### Description:\n", + "Indicates how often the sample is taken or the measurement is recorded. This variable helps to understand the regularity of data collection and can include details such as daily, weekly, monthly, or any other specified time interval.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`freq`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Range Low Detection\n", + "\n", + "### Description:\n", + "If aggregation occurred when evaluating the Activity variable, the 'Range Low Detection' value describes the 'Range Low' variable as follows:\n", + "\n", + "- '<': The reported value for the Activity variable is the Minimum Detectable Activity (MDA) or ISO 11929 detection limit.\n", + "- '=': The Activity variable represents the measured value, with an associated uncertainty provided.\n", + "- 'ND': Indicates that neither an activity value nor an MDA (or detection limit) is reported.\n", + "- 'DE': When the reported Activity variable is an aggregation of multiple samples, the detection limit variable is defined as Derived (DE). 
See the Variable Type section for more details on aggregation of reported activity.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_detectlimit.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_detectlimit.xlsx).\n", + "\n", + "### Open Refine Variable Name:\n", + "`rl_detection`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Range Low\n", + "\n", + "### Description:\n", + "If aggregation is applied to the measurements, the 'Range Low ' variable represents the smallest activity measured within the aggregated dataset. This value should be reported in a format consistent with the unit specified by the Unit ID variable.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`rangelow`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Range Upper\n", + "\n", + "### Description:\n", + "If aggregation is applied to the measurements, the 'Range Upper ' variable represents the largest activity measured within the aggregated dataset. 
This value should be reported in a format consistent with the unit specified by the Unit ID variable.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`rangeupp`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Species ID\n", + "\n", + "### Description:\n", + "Represents the identifier for the species included in the sample. If a specific species is not provided but a biological group (e.g., 'Fish') is specified, use this group information to define the species.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_species.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_species.xlsx).\n", + "\n", + "### Open Refine Variable Name:\n", + "`species_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'species_id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "`species`\n", + "\n", + "### NetCDF Data Type:\n", + "An integer value (the 'species_id' defined in the LUT).\n", + "\n", + "---\n", + "\n", + "## Biological Group\n", + "\n", + "### Description:\n", + "The biological group of the sample, if applicable. Grouping of related species (e.g. 
crustaceans, molluscs, fish etc.).\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_species.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_species.xlsx) includes a \"biogroup_id\" for all species which links to the [dbo_biogroup.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_biogroup.xlsx) look-up.\n", + "\n", + "### Open Refine Variable Name:\n", + "None\n", + "\n", + "### Open Refine Data Type:\n", + "None\n", + "\n", + "### NetCDF Variable Name:\n", + "`bio_group`\n", + "\n", + "### NetCDF Data Type:\n", + "An integer value (the 'biogroup_id' defined in the LUT).\n", + "\n", + "---\n", + "\n", + "## Taxon Name\n", + "\n", + "### Description:\n", + "Scientific name of the taxon.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No, 'Taxon Name' is defined for each species in the [dbo_species.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_species.xlsx) look-up.\n", + "\n", + "### Open Refine Variable Name:\n", + "`Taxonname`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Taxon Reported Name\n", + "\n", + "### Description:\n", + "Taxon name reported by the data provider.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`TaxonRepName`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Common Name\n", + "\n", + "### Description:\n", + "Common name of the species or organism.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`Commonname`\n", + "\n", + "### Open Refine Data Type:\n", +
"string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Taxon Rank\n", + "\n", + "### Description:\n", + "Rank of the taxon in the biological classification system.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No, 'Taxon Rank' is defined for each species in the [dbo_species.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_species.xlsx) look-up.\n", + "\n", + "### Open Refine Variable Name:\n", + "`Taxonrank`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Taxon Database\n", + "\n", + "### Description:\n", + "Database or repository where taxon information is stored.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No, 'Taxon Database' is defined for each species in the [dbo_species.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_species.xlsx) look-up.\n", + "\n", + "### Open Refine Variable Name:\n", + "`TaxonDB`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Taxon Database ID\n", + "\n", + "### Description:\n", + "Identifier for the taxon in the taxon database.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No, 'Taxon Database ID' is defined for each species in the [dbo_species.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_species.xlsx) look-up.\n", + "\n", + "### Open Refine Variable Name:\n", + "`TaxonDBID`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data
Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Taxon Database URL\n", + "\n", + "### Description:\n", + "URL of the taxon database.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No, 'Taxon Database URL' is defined for each species in the [dbo_species.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_species.xlsx) look-up.\n", + "\n", + "### Open Refine Variable Name:\n", + "`TaxonDBURL`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Body Part ID\n", + "\n", + "### Description:\n", + "Represents the identifier for the specific body part of the sample.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_bodypar.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_bodypar.xlsx).\n", + "\n", + "### Open Refine Variable Name:\n", + "`bodypar_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'bodypar_id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "`body_part`\n", + "\n", + "### NetCDF Data Type:\n", + "An integer value (the 'bodypar_id' defined in the LUT).\n", + "\n", + "---\n", + "\n", + "## Slice Up\n", + "\n", + "### Description:\n", + "Top of sediment core interval relative to the water-sediment interface (cm).\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`sliceup`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Slice Down\n", + "\n", + "### Description:\n", + "Bottom of sediment core interval relative to the water-sediment interface (cm).\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", +
"\n", + "### Open Refine Variable Name:\n", + "`slicedown`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Sediment Type ID\n", + "\n", + "### Description:\n", + "Represents the classification of sediment according to the Udden-Wentworth scale.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_sedtype.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_sedtype.xlsx).\n", + "\n", + "### Open Refine Variable Name:\n", + "`sedtype_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'sedtype_id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "`sed_type`\n", + "\n", + "### NetCDF Data Type:\n", + "An integer value (the 'sedtype_id' defined in the LUT).\n", + "\n", + "---\n", + "\n", + "## Sediment Reported Name\n", + "\n", + "### Description:\n", + "Name of the sediment as reported by the data provider. 
The sediment name should be stored exactly as provided, without any modifications.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No.\n", + "\n", + "### Open Refine Variable Name:\n", + "`SedRepName`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Volume\n", + "\n", + "### Description:\n", + "Volume of the sample.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No.\n", + "\n", + "### Open Refine Variable Name:\n", + "`volume`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Salinity\n", + "\n", + "### Description:\n", + "Salinity of the sample, expressed in practical salinity units (PSU). If required, consult TEOS-10 guidelines (www.teos-10.org/) for converting from Absolute Salinity (g/kg) to Practical Salinity.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`salinity`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "`salinity`\n", + "\n", + "### NetCDF Data Type:\n", + "float\n", + "\n", + "---\n", + "\n", + "## Temperature\n", + "\n", + "### Description:\n", + "Temperature of the sample (°C).\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`temperatur`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "`temperature`\n", + "\n", + "### NetCDF Data Type:\n", + "float\n", + "\n", + "---\n", + "\n", + "## Filtered\n", + "\n", + "### Description:\n", + "Indicates whether the sample was filtered:\n", + "- Y : Sample was filtered.\n", + "- N : Sample was not filtered.\n", +
"- NA : Not applicable or information not available.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`filtered`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Filter Pore Mesh size\n", + "\n", + "### Description:\n", + "The pore size of the filter used, if applicable, expressed in micrometers (µm).\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`filtpore`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Acidified\n", + "\n", + "### Description:\n", + "Indicates if the sample was acidified.\n", + "- A: Sample acidified\n", + "- NA: Sample not acidified\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`acid`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Oxygen\n", + "\n", + "### Description:\n", + "Dissolved oxygen concentration.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`oxygen`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Sample Area\n", + "\n", + "### Description:\n", + "Sample surface area of sediment (cm2).\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open 
Refine Variable Name:\n", + "`samparea`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Dry Weight\n", + "\n", + "### Description:\n", + "Dry weight of the sample, expressed in grams (g).\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`drywt`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Wet Weight\n", + "\n", + "### Description:\n", + "Wet weight of the sample, expressed in grams (g).\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`wetwt`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Percent Weight\n", + "\n", + "### Description:\n", + "Expressed as a percentage. This is calculated by dividing the dry weight by the wet weight and then multiplying by 100. The reported value should be greater than 0 and less than 100. 
\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`percentwt`\n", + "\n", + "### Open Refine Data Type:\n", + "float\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Sampling Method ID\n", + "\n", + "### Description:\n", + "Identifier for the method used to collect the sample.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_sampmet.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_sampmet.xlsx).\n", + "\n", + "### Open Refine Variable Name:\n", + "`sampmet_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'sampmet_id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "`sampling_method`\n", + "\n", + "### NetCDF Data Type:\n", + "An integer value (the 'sampmet_id' defined in the LUT).\n", + "\n", + "---\n", + "\n", + "## Drying Method ID\n", + "\n", + "### Description:\n", + "Identifier for the method used to dry the sample.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_sampmet.xlsx]\n", + "\n", + "### Open Refine Variable Name:\n", + "`drymet_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Preparation Method ID\n", + "\n", + "### Description:\n", + "Identifier for the method used to prepare the sample.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_prepmet.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_prepmet.xlsx).\n", + "### Open Refine Variable Name:\n", + "`prepmet_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'prepmet_id' defined in the LUT).\n", + "\n", + "### NetCDF Variable 
Name:\n", + "`preparation_method`\n", + "\n", + "### NetCDF Data Type:\n", + "An integer value (the 'prepmet_id' defined in the LUT).\n", + "\n", + "\n", + "---\n", + "\n", + "## Counting Method ID\n", + "\n", + "### Description:\n", + "Identifier for the method used to count the sample.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_counmet.xlsx](https://github.com/franckalbinet/marisco/blob/main/nbs/files/lut/dbo_counmet.xlsx).\n", + "\n", + "### Open Refine Variable Name:\n", + "`counmet_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'counmet_id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "`counting_method`\n", + "\n", + "### NetCDF Data Type:\n", + "An integer value (the 'counmet_id' defined in the LUT).\n", + "\n", + "---\n", + "\n", + "## Reference ID\n", + "\n", + "### Description:\n", + "Identifier which identifies the source provider of the data.\n", + "\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "Yes, [dbo_ref.xlsx]\n", + "\n", + "### Open Refine Variable Name:\n", + "`ref_id`\n", + "\n", + "### Open Refine Data Type:\n", + "An integer value (the 'id' defined in the LUT).\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Reference Note\n", + "\n", + "### Description:\n", + "Notes or comments related to the reference or source.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`refnote`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Sample Note\n", + "\n", + "### Description:\n", + "Notes or comments related to the sample.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine 
Variable Name:\n", + "`sampnote`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Measurement Note\n", + "\n", + "### Description:\n", + "Notes or comments related to the measurement process.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`measurenote`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "## Good for Export\n", + "\n", + "### Description:\n", + "Indicates if the sample data is deemed good for export.\n", + "\n", + "### Lookup Table (LUT) in use:\n", + "No\n", + "\n", + "### Open Refine Variable Name:\n", + "`gfe`\n", + "\n", + "### Open Refine Data Type:\n", + "string\n", + "\n", + "### NetCDF Variable Name:\n", + "Not included in NetCDF.\n", + "\n", + "### NetCDF Data Type:\n", + "Not included in NetCDF.\n", + "\n", + "---\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/metadata/sample-uniqueness.ipynb b/nbs/metadata/sample-uniqueness.ipynb new file mode 100644 index 0000000..c57e36a --- /dev/null +++ b/nbs/metadata/sample-uniqueness.ipynb @@ -0,0 +1,184 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "416a6a41", + "metadata": {}, + "source": [ + "# Sample uniqueness" + ] + }, + { + "cell_type": "markdown", + "id": "5709cfb6", + "metadata": {}, + "source": [ + "> What constitutes a **single sample** in the context of MARIS database?\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "f322162d", + "metadata": {}, + "source": [ + "## Rule 1" + ] + }, + { + 
"cell_type": "markdown", + "id": "587d9ec4", + "metadata": {}, + "source": [ + "We also use `station` and `samplabcode`, when available. `Station` is a name given to a sampling location, `samplabcode` is the data provider’s unique ID. **TO BE CLARIFIED**" + ] + }, + { + "cell_type": "markdown", + "id": "617643d9", + "metadata": {}, + "source": [ + "**Seawater**" + ] + }, + { + "cell_type": "markdown", + "id": "92d905ec", + "metadata": {}, + "source": [ + "As you can see in most cases for seawater we can use the required information – lat, lon, time, sample depth – to define a unique sample ID to which we can link measurements. If sample depth is not provided we assume surface and indicate this using a value = -1 beforehand.\n", + "\n", + "Questions:\n", + "- cases where sample depth is not provided several times at the same location and time?" + ] + }, + { + "cell_type": "markdown", + "id": "d30653d7", + "metadata": {}, + "source": [ + "**Sediment**" + ] + }, + { + "cell_type": "markdown", + "id": "9c6dbbfc", + "metadata": {}, + "source": [ + "\n", + "For sediment we extend this a bit using `top`, `bottom` (`sliceup` and `slicedown`), `SedRepName` (mud, clay, ...?).\n", + "\n", + "If `top` and `bottom` are missing for a sediment sample we assume that it is a grab sample from the surface. In this case I think we set `top`: -1 to indicate this (I will check for sure).\n", + "\n", + "Questions:\n", + "- so in essence same as above\n" + ] + }, + { + "cell_type": "markdown", + "id": "6a21d508", + "metadata": {}, + "source": [ + "**Biota**\n", + "\n", + "For biota it is extended using the `taxon` and `tissue` IDs (`species_id`, `bodypar_id`). " + ] + }, + { + "cell_type": "markdown", + "id": "1014e890", + "metadata": {}, + "source": [ + "\n", + "For biota it is extended using the `taxon` and `tissue` IDs (`species_id`, `bodypar_id`). 
" + ] + }, + { + "cell_type": "markdown", + "id": "c173218c", + "metadata": {}, + "source": [ + "Item 1 examples: \n", + "\n", + "> in geotraces when we have the same (e.g for seawater) lon, lat, time, smp_depth for several nuclides measurement in a given rosette (at least that's what I understand);\n", + "\n", + "We ignore rosette (cast?) and bottle IDs. We assume that all measurements with the same lon, lat, time, depth are the same sample. Salinity is used to provide further confidence. Geotraces is more or less a big edge case. Normally data is not provided with such detail.\n", + "\n", + "Questions:\n", + "- so it means that we have to include nuclide type to get uniqueness" + ] + }, + { + "cell_type": "markdown", + "id": "5079fcd4", + "metadata": {}, + "source": [ + "> or in OSPAR sediment when we have records where top, bottom is NaN for a given lon, lat, time. In that case our compound index would be (lon, lat, time, top, bottom);\n", + "\n", + "See above, if missing top = -1. There should not be multiple grab sediment samples for the same lat, lon and time. If there are then it probably indicates that a core was taken but slice top and bottom are missing. **In this case the records should be ignored until this information is provided.**\n", + "\n", + "In the HELCOM example where top and bottom are NaN and actual values for the same lat, lon, time then I would assume a grab sample and a core were taken simultaneously so they are all different samples. If there are multiple records with NaN for top and bottom then it could be multiple grab samples at the same location and time but this **would be unusual and should be queried with the data provider.**" + ] + }, + { + "cell_type": "markdown", + "id": "07920dc7", + "metadata": {}, + "source": [ + "> or (not sure it happens sometimes but that's a point Niall mentioned) when we have replicates at the same location, time, depth, ...\n", + "\n", + "Sometimes there are replicates, yes. E.g.
sometimes TEPCO report quick results for certain samples and then reanalyse them for a longer time for more precision – they report both. So in this case replicates are valid. \n", + "\n", + "Question:\n", + "- what do we do in such case?\n" + ] + }, + { + "cell_type": "markdown", + "id": "9bb18195", + "metadata": {}, + "source": [ + "## Rule 2: Location must be inferred." + ] + }, + { + "cell_type": "markdown", + "id": "8c131c34", + "metadata": {}, + "source": [ + "\n", + "Another example is when we do not have detailed information about sampling location or time and are forced to make general assumptions (e.g. the location of a port where multiple samples of the same species are landed and/or the sampling date for such samples is reported simply as a year or a quarter and we are forced to assume the mid-point) then, unless samplabcode is provided, there can be replicates. Currently we can live with this (though if we spot it we can force unique sample IDs by temporarily injecting dummy values for samplabcode which are removed after the sample IDs are generated)." + ] + }, + { + "cell_type": "markdown", + "id": "3406100e", + "metadata": {}, + "source": [ + "## Niall's situation" + ] + }, + { + "cell_type": "markdown", + "id": "ecf17046", + "metadata": {}, + "source": [ + "1) in geotraces when we have the same (e.g for seawater) lon, lat, time, smp_depth for several nuclides measurement in a given rosette (at least that's what I understand);\n", + "2) in OSPAR sediment when we have records where top, bottom is NaN for a given lon, lat, time. In that case our compound index would be (lon, lat, time, top, bottom);\n", + "3) In situations where a nuclide is measured for a sample using more than one method\n", + "4) In situations where rapid analysis and detailed analysis is reported (rapid -while arriving at lab- vs detailed measurement afterwards);\n", + "5) In a situation where a sample is collected and split into two or more sub-samples. 
For this sample the compound index would be the same. Sometimes this type of sample is sent to several laboratories (ring trial/inter-lab comparison);\n", + "6) In situations where a nuclide is measured for a sample using more than one method (e.g. Am241 normally measured by alpha and gamma spectrometry);" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/sidebar.yml b/nbs/sidebar.yml index 0e7c672..484df70 100644 --- a/nbs/sidebar.yml +++ b/nbs/sidebar.yml @@ -14,4 +14,7 @@ website: - api/nc_template.ipynb - api/serializers.ipynb - api/utils.ipynb - + - section: Metadata + contents: + - metadata/field-definition.ipynb + - metadata/sample-uniqueness.ipynb \ No newline at end of file From b3ebef6904cfe57554e90b031d304e17b37399a7 Mon Sep 17 00:00:00 2001 From: niallmurphy93 Date: Tue, 15 Oct 2024 03:15:52 +0100 Subject: [PATCH 5/9] Refactoring OSPAR handler: Implement generic RemapCB for data remapping - Convert existing callbacks to use the generic RemapCB - Enhance code reusability and maintainability - Standardize remapping process across different data types - Improve consistency in handling OSPAR data transformations --- marisco/callbacks.py | 5 +- marisco/utils.py | 1 - nbs/api/callbacks.ipynb | 19 +- nbs/api/utils.ipynb | 52 +- nbs/handlers/_ospar.ipynb | 4991 +++++++++++++++++++++++++++---------- 5 files changed, 3679 insertions(+), 1389 deletions(-) diff --git a/marisco/callbacks.py b/marisco/callbacks.py index 9f4297c..8007569 100644 --- a/marisco/callbacks.py +++ b/marisco/callbacks.py @@ -137,7 +137,8 @@ def __init__(self, col_remap: str, # Name of the column to remap col_src: str, # Name of the column with the source values dest_grps: list[str]|str=grp_names(), # List of destination groups - default_value: Any = -1 # Default value for unmatched entries + default_value: Any = -1, # Default value for unmatched entries + 
verbose: bool = False, # Whether to print unmatched values ): fc.store_attr() self.lut = None @@ -157,7 +158,7 @@ def _remap_value(self, value: str) -> Any: value = value.strip() if isinstance(value, str) else value match = self.lut.get(value, Match(self.default_value, None, None, None)) if isinstance(match, Match): - if match.matched_id == self.default_value: + if match.matched_id == self.default_value and self.verbose: print(f"Unmatched value: {value}") return match.matched_id else: diff --git a/marisco/utils.py b/marisco/utils.py index 9874774..9ce74e4 100644 --- a/marisco/utils.py +++ b/marisco/utils.py @@ -106,7 +106,6 @@ def _format_output(self): df_lut.index.name = 'source_key' return df_lut.sort_values(by='match_score', ascending=False) - # %% ../nbs/api/utils.ipynb 16 def has_valid_varname( var_names:list, # variable names diff --git a/nbs/api/callbacks.ipynb b/nbs/api/callbacks.ipynb index 8d5172e..28305b6 100644 --- a/nbs/api/callbacks.ipynb +++ b/nbs/api/callbacks.ipynb @@ -445,7 +445,19 @@ "execution_count": null, "id": "8c905654", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'Callback' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#| exports\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mRemapCB\u001b[39;00m(\u001b[43mCallback\u001b[49m):\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeneric MARIS remapping callback.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \n\u001b[1;32m 5\u001b[0m fn_lut: Callable, \u001b[38;5;66;03m# Function that returns 
the lookup table dictionary\u001b[39;00m\n\u001b[1;32m 6\u001b[0m col_remap: \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;66;03m# Name of the column to remap\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 10\u001b[0m verbose: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;66;03m# Whether to print unmatched values\u001b[39;00m\n\u001b[1;32m 11\u001b[0m ):\n", + "\u001b[0;31mNameError\u001b[0m: name 'Callback' is not defined" + ] + } + ], "source": [ "#| exports\n", "class RemapCB(Callback):\n", @@ -455,7 +467,8 @@ " col_remap: str, # Name of the column to remap\n", " col_src: str, # Name of the column with the source values\n", " dest_grps: list[str]|str=grp_names(), # List of destination groups\n", - " default_value: Any = -1 # Default value for unmatched entries\n", + " default_value: Any = -1, # Default value for unmatched entries\n", + " verbose: bool = False, # Whether to print unmatched values\n", " ):\n", " fc.store_attr()\n", " self.lut = None\n", @@ -475,7 +488,7 @@ " value = value.strip() if isinstance(value, str) else value\n", " match = self.lut.get(value, Match(self.default_value, None, None, None))\n", " if isinstance(match, Match):\n", - " if match.matched_id == self.default_value:\n", + " if match.matched_id == self.default_value and self.verbose:\n", " print(f\"Unmatched value: {value}\")\n", " return match.matched_id \n", " else:\n", diff --git a/nbs/api/utils.ipynb b/nbs/api/utils.ipynb index cdbde2b..dbf2dbd 100644 --- a/nbs/api/utils.ipynb +++ b/nbs/api/utils.ipynb @@ -199,20 +199,20 @@ " \n", " 0\n", " 0\n", - " cs137\n", + " cs134\n", " 5\n", " \n", " \n", " 1\n", " 1\n", - " cs134_137_tot\n", - " 13\n", + " cs137\n", + " 5\n", " \n", " \n", " 2\n", " 2\n", - " cs134\n", - " 5\n", + " cs134_137_tot\n", + " 13\n", " \n", " \n", "\n", @@ -220,9 +220,9 @@ ], "text/plain": [ " index value n_chars\n", - "0 0 cs137 5\n", - "1 1 cs134_137_tot 13\n", - "2 2 cs134 5" + "0 0 cs134 5\n", 
+ "1 1 cs137 5\n", + "2 2 cs134_137_tot 13" ] }, "execution_count": null, @@ -239,7 +239,20 @@ "execution_count": null, "id": "cf58241b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'pd' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#| exports\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mRemapper\u001b[39;00m():\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRemap a data provider lookup table to a MARIS lookup table using fuzzy matching.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 5\u001b[0m provider_lut_df:pd\u001b[38;5;241m.\u001b[39mDataFrame, \u001b[38;5;66;03m# Data provider lookup table to be remapped\u001b[39;00m\n\u001b[1;32m 6\u001b[0m maris_lut_fn:\u001b[38;5;28mcallable\u001b[39m, \u001b[38;5;66;03m# Function that returns the MARIS lookup table path\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 11\u001b[0m fname_cache \u001b[38;5;66;03m# Cache file name\u001b[39;00m\n\u001b[1;32m 12\u001b[0m ):\n", + "Cell \u001b[0;32mIn[2], line 5\u001b[0m, in \u001b[0;36mRemapper\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mRemapper\u001b[39;00m():\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRemap a data provider lookup table to a MARIS lookup table using fuzzy matching.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m----> 
5\u001b[0m provider_lut_df:\u001b[43mpd\u001b[49m\u001b[38;5;241m.\u001b[39mDataFrame, \u001b[38;5;66;03m# Data provider lookup table to be remapped\u001b[39;00m\n\u001b[1;32m 6\u001b[0m maris_lut_fn:\u001b[38;5;28mcallable\u001b[39m, \u001b[38;5;66;03m# Function that returns the MARIS lookup table path\u001b[39;00m\n\u001b[1;32m 7\u001b[0m maris_col_id:\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;66;03m# MARIS lookup table column name for the id\u001b[39;00m\n\u001b[1;32m 8\u001b[0m maris_col_name:\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;66;03m# MARIS lookup table column name for the name\u001b[39;00m\n\u001b[1;32m 9\u001b[0m provider_col_to_match:\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;66;03m# Data provider lookup table column name for the name to match\u001b[39;00m\n\u001b[1;32m 10\u001b[0m provider_col_key, \u001b[38;5;66;03m# Data provider lookup table column name for the key\u001b[39;00m\n\u001b[1;32m 11\u001b[0m fname_cache \u001b[38;5;66;03m# Cache file name\u001b[39;00m\n\u001b[1;32m 12\u001b[0m ):\n\u001b[1;32m 13\u001b[0m fc\u001b[38;5;241m.\u001b[39mstore_attr()\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcache_file \u001b[38;5;241m=\u001b[39m cache_path() \u001b[38;5;241m/\u001b[39m fname_cache\n", + "\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined" + ] + } + ], "source": [ "#| exports\n", "class Remapper():\n", @@ -301,7 +314,7 @@ " df_lut = pd.DataFrame.from_dict(self.lut, orient='index', \n", " columns=['matched_maris_name', 'source_name', 'match_score'])\n", " df_lut.index.name = 'source_key'\n", - " return df_lut.sort_values(by='match_score', ascending=False)\n" + " return df_lut.sort_values(by='match_score', ascending=False)" ] }, { @@ -432,8 +445,9 @@ "(-10.0, 40.0, 5.0, 50.0)" ] }, + "execution_count": null, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ @@ -453,8 +467,9 @@ "'POLYGON ((-10 40, 5 40, 5 50, -10 50, -10 40))'" ] }, + 
"execution_count": null, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ @@ -486,8 +501,9 @@ "(0.0, 1.0, 0.0, 1.0)" ] }, + "execution_count": null, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ @@ -646,7 +662,7 @@ " 'order': 'Decapoda',\n", " 'family': 'Aristeidae',\n", " 'genus': 'Aristeus',\n", - " 'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-06-10',\n", + " 'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-10-14',\n", " 'lsid': 'urn:lsid:marinespecies.org:taxname:107083',\n", " 'isMarine': 1,\n", " 'isBrackish': 0,\n", @@ -657,8 +673,9 @@ " 'modified': '2022-08-24T09:48:14.813Z'}]]" ] }, + "execution_count": null, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ @@ -990,8 +1007,9 @@ "52 51 Soft clay 7" ] }, + "execution_count": null, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ @@ -1363,7 +1381,7 @@ " 'order': 'Decapoda',\n", " 'family': 'Aristeidae',\n", " 'genus': 'Aristeus',\n", - " 'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-06-10',\n", + " 'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). 
Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-10-14',\n", " 'lsid': 'urn:lsid:marinespecies.org:taxname:107083',\n", " 'isMarine': 1,\n", " 'isBrackish': 0,\n", diff --git a/nbs/handlers/_ospar.ipynb b/nbs/handlers/_ospar.ipynb index dd296f4..52d0512 100644 --- a/nbs/handlers/_ospar.ipynb +++ b/nbs/handlers/_ospar.ipynb @@ -10,6 +10,16 @@ "#| default_exp handlers.ospar" ] }, + { + "cell_type": "markdown", + "id": "e57dc381", + "metadata": {}, + "source": [ + "# TODO\n", + "- [ ] Update link to OSPAR data. \n", + "- [ ] `Unknown` nuclides found for `seawater` sample type. The remapper returns `Unknown` for `nan` entries. The nuclide lut (dbo_nuclide.xlsx) uses `NOT AVAILABLE` for `nan` entries. \n" + ] + }, { "cell_type": "markdown", "id": "712eab9d", @@ -57,16 +67,7 @@ "execution_count": null, "id": "f69f5756", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "#| hide\n", "%load_ext autoreload\n", @@ -596,92 +597,92 @@ " \n", " 0\n", " 0\n", - " 239, 240 Pu\n", + " 210Pb\n", " \n", " \n", " 1\n", " 1\n", - " 137Cs\n", + " NaN\n", " \n", " \n", " 2\n", " 2\n", - " CS-137\n", + " 238Pu\n", " \n", " \n", " 3\n", " 3\n", - " Cs-137\n", + " 239,240Pu\n", " \n", " \n", " 4\n", " 4\n", - " Cs-134\n", + " 239, 240 Pu\n", " \n", " \n", " 5\n", " 5\n", - " 210Po\n", + " 137Cs\n", " \n", " \n", " 6\n", " 6\n", - " 239,240Pu\n", + " 226Ra\n", " \n", " \n", " 7\n", " 7\n", - " 228Ra\n", + " 99Tc\n", " \n", " \n", " 8\n", " 8\n", - " 210Pb\n", + " RA-226\n", " \n", " \n", " 9\n", " 9\n", - " RA-228\n", + " CS-134\n", " \n", " \n", " 10\n", " 10\n", - " 238Pu\n", + " Cs-137\n", " \n", " \n", " 11\n", " 11\n", - " 3H\n", + " Cs-134\n", " \n", " \n", " 12\n", " 12\n", - " NaN\n", + " 228Ra\n", " \n", " \n", " 13\n", " 13\n", - " RA-226\n", + " CS-137\n", " \n", " \n", " 14\n", " 14\n", - " 99Tc\n", + " RA-228\n", " \n", " \n", " 15\n", " 15\n", - " 241Am\n", + " 3H\n", " \n", " \n", " 16\n", " 16\n", - " CS-134\n", + " 210Po\n", " \n", " \n", " 17\n", " 17\n", - " 226Ra\n", + " 241Am\n", " \n", " \n", "\n", @@ -689,24 +690,24 @@ ], "text/plain": [ " index value\n", - "0 0 239, 240 Pu\n", - "1 1 137Cs\n", - "2 2 CS-137\n", - "3 3 Cs-137\n", - "4 4 Cs-134\n", - "5 5 210Po\n", - "6 6 239,240Pu\n", - "7 7 228Ra\n", - "8 8 210Pb\n", - "9 9 RA-228\n", - "10 10 238Pu\n", - "11 11 3H\n", - "12 12 NaN\n", - "13 13 RA-226\n", - "14 14 99Tc\n", - "15 15 241Am\n", - "16 16 CS-134\n", - "17 17 226Ra" + "0 0 210Pb\n", + "1 1 NaN\n", + "2 2 238Pu\n", + "3 3 239,240Pu\n", + "4 4 239, 240 Pu\n", + "5 5 137Cs\n", + "6 6 226Ra\n", + "7 7 99Tc\n", + "8 8 RA-226\n", + "9 9 CS-134\n", + "10 10 Cs-137\n", + "11 11 Cs-134\n", + "12 12 228Ra\n", + "13 13 CS-137\n", + "14 14 RA-228\n", + "15 15 3H\n", + "16 16 210Po\n", + "17 17 241Am" ] }, "execution_count": null, @@ 
-763,7 +764,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 18/18 [00:00<00:00, 49.79it/s]\n" + "Processing: 0%| | 0/18 [00:006\n", " \n", " \n", - " 241Am\n", - " pu241\n", - " 241Am\n", - " 4\n", - " \n", - " \n", - " 137Cs\n", - " h3\n", - " 137Cs\n", + " 210Pb\n", + " ag106m\n", + " 210Pb\n", " 4\n", " \n", " \n", @@ -830,15 +832,15 @@ " 4\n", " \n", " \n", - " 210Pb\n", + " 210Po\n", " ag106m\n", - " 210Pb\n", + " 210Po\n", " 4\n", " \n", " \n", - " 210Po\n", - " ag106m\n", - " 210Po\n", + " 241Am\n", + " pu241\n", + " 241Am\n", " 4\n", " \n", " \n", @@ -848,10 +850,10 @@ " 4\n", " \n", " \n", - " 238Pu\n", - " u238\n", - " 238Pu\n", - " 3\n", + " 137Cs\n", + " h3\n", + " 137Cs\n", + " 4\n", " \n", " \n", " 99Tc\n", @@ -860,15 +862,21 @@ " 3\n", " \n", " \n", + " 238Pu\n", + " u238\n", + " 238Pu\n", + " 3\n", + " \n", + " \n", " 3H\n", " h3\n", " 3H\n", " 2\n", " \n", " \n", - " Cs-134\n", - " cs134\n", - " Cs-134\n", + " RA-226\n", + " ra226\n", + " RA-226\n", " 1\n", " \n", " \n", @@ -878,15 +886,9 @@ " 1\n", " \n", " \n", - " RA-228\n", - " ra228\n", - " RA-228\n", - " 1\n", - " \n", - " \n", - " RA-226\n", - " ra226\n", - " RA-226\n", + " Cs-134\n", + " cs134\n", + " Cs-134\n", " 1\n", " \n", " \n", @@ -896,6 +898,12 @@ " 1\n", " \n", " \n", + " RA-228\n", + " ra228\n", + " RA-228\n", + " 1\n", + " \n", + " \n", " CS-134\n", " cs134\n", " CS-134\n", @@ -910,20 +918,20 @@ "source_key \n", "239, 240 Pu pu240 239, 240 Pu 8\n", "239,240Pu pu240 239,240Pu 6\n", - "241Am pu241 241Am 4\n", - "137Cs h3 137Cs 4\n", - "228Ra u238 228Ra 4\n", "210Pb ag106m 210Pb 4\n", + "228Ra u238 228Ra 4\n", "210Po ag106m 210Po 4\n", + "241Am pu241 241Am 4\n", "226Ra u235 226Ra 4\n", - "238Pu u238 238Pu 3\n", + "137Cs h3 137Cs 4\n", "99Tc tu 99Tc 3\n", + "238Pu u238 238Pu 3\n", "3H h3 3H 2\n", - "Cs-134 cs134 Cs-134 1\n", - "Cs-137 cs137 Cs-137 1\n", - "RA-228 ra228 RA-228 1\n", "RA-226 ra226 RA-226 1\n", + "Cs-137 cs137 Cs-137 1\n", + 
"Cs-134 cs134 Cs-134 1\n", "CS-137 cs137 CS-137 1\n", + "RA-228 ra228 RA-228 1\n", "CS-134 cs134 CS-134 1" ] }, @@ -955,16 +963,23 @@ "source": [ "#| exports\n", "fixes_nuclide_names = {\n", - " '99Tc': 'tc99',\n", - " '238Pu': 'pu238',\n", " '226Ra': 'ra226',\n", - " '210Pb': 'pb210',\n", - " '241Am': 'am241',\n", " '228Ra': 'ra228',\n", + " '239, 240 Pu': 'pu239_240_tot',\n", + " 'CS-134': 'cs134',\n", " '137Cs': 'cs137',\n", + " 'RA-226': 'ra226',\n", + " '3H': 'h3',\n", + " 'RA-228': 'ra228',\n", + " '238Pu': 'pu238',\n", + " '241Am': 'am241',\n", + " 'CS-137': 'cs137',\n", " '210Po': 'po210',\n", - " '239,240Pu': 'pu239_240_tot',\n", - " '239, 240 Pu': 'pu239_240_tot'\n", + " '210Pb': 'pb210',\n", + " 'Cs-137': 'cs137',\n", + " '99Tc': 'tc99',\n", + " 'Cs-134': 'cs134',\n", + " '239,240Pu': 'pu239_240_tot'\n", " }" ] }, @@ -986,7 +1001,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 18/18 [00:00<00:00, 30.89it/s]\n" + "Processing: 0%| | 0/18 [00:00\n", " \n", " \n", - " \n", - " 3H\n", - " h3\n", - " 3H\n", - " 2\n", - " \n", - " \n", - " CS-137\n", - " cs137\n", - " CS-137\n", - " 1\n", - " \n", - " \n", - " Cs-137\n", - " cs137\n", - " Cs-137\n", - " 1\n", - " \n", - " \n", - " Cs-134\n", - " cs134\n", - " Cs-134\n", - " 1\n", - " \n", - " \n", - " RA-228\n", - " ra228\n", - " RA-228\n", - " 1\n", - " \n", - " \n", - " RA-226\n", - " ra226\n", - " RA-226\n", - " 1\n", - " \n", - " \n", - " CS-134\n", - " cs134\n", - " CS-134\n", - " 1\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " matched_maris_name source_name match_score\n", - "source_key \n", - "3H h3 3H 2\n", - "CS-137 cs137 CS-137 1\n", - "Cs-137 cs137 Cs-137 1\n", - "Cs-134 cs134 Cs-134 1\n", - "RA-228 ra228 RA-228 1\n", - "RA-226 ra226 RA-226 1\n", - "CS-134 cs134 CS-134 1" + "Empty DataFrame\n", + "Columns: [matched_maris_name, source_name, match_score]\n", + "Index: []" ] }, "execution_count": null, @@ -1178,24 +1152,75 @@ }, { "cell_type": "markdown", 
- "id": "a54f21ff", + "id": "dd0f230a", "metadata": {}, "source": [ - "### Add Nuclide Id column" + ":::{.callout-tip}\n", + "\n", + "**DISCUSS**: The `Seawater` dataset contains 8 rows where nuclide is `nan` (remapped to `Unkown`), see below.\n", + "\n", + ":::" ] }, { "cell_type": "markdown", - "id": "0deedefa", + "id": "04e68a87", "metadata": {}, "source": [ - "The `nuclide_id` column is added to the dataframe for legacy reasons (again Open Refine output)." + "First lets apply the `RemoveAllNAValuesCB` and `RemapNuclideNameCB` callbacks to the `seawater` sample type.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "635c8f39", + "id": "0fc40f3c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ID Contracting Party RSC Sub-division Station ID Sample ID LatD LatM \\\n", + "0 1 Belgium 8.0 Belgica-W01 WNZ 01 51.0 22.0 \n", + "1 2 Belgium 8.0 Belgica-W02 WNZ 02 51.0 13.0 \n", + "\n", + " LatS LatDir LongD ... Nuclide Value type Activity or MDA Uncertainty \\\n", + "0 31.0 N 3.0 ... 137Cs < 0.20 NaN \n", + "1 25.0 N 2.0 ... 137Cs < 0.27 NaN \n", + "\n", + " Unit Data provider Measurement Comment Sample Comment Reference Comment \\\n", + "0 Bq/l SCK•CEN NaN NaN NaN \n", + "1 Bq/l SCK•CEN NaN NaN NaN \n", + "\n", + " NUCLIDE \n", + "0 cs137 \n", + "1 cs137 \n", + "\n", + "[2 rows x 26 columns]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[RemoveAllNAValuesCB(cols_to_check), \n", + " RemapNuclideNameCB(lut_nuclides)])\n", + "tfm()\n", + "print(tfm.dfs['seawater'].head(2))" + ] + }, + { + "cell_type": "markdown", + "id": "b903fc40", + "metadata": {}, + "source": [ + "Lets return the entries with `Unknown` nuclides." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd34f549", "metadata": {}, "outputs": [ { @@ -1219,23 +1244,605 @@ " \n", " \n", " \n", + " ID\n", + " Contracting Party\n", + " RSC Sub-division\n", + " Station ID\n", + " Sample ID\n", + " LatD\n", + " LatM\n", + " LatS\n", + " LatDir\n", + " LongD\n", + " ...\n", + " Nuclide\n", + " Value type\n", + " Activity or MDA\n", + " Uncertainty\n", + " Unit\n", + " Data provider\n", + " Measurement Comment\n", + " Sample Comment\n", + " Reference Comment\n", " NUCLIDE\n", - " nuclide_id\n", " \n", " \n", " \n", " \n", - " 0\n", - " pu239_240_tot\n", - " 77\n", - " \n", - " \n", - " 1\n", - " tc99\n", - " 15\n", - " \n", - " \n", - " 2\n", + " 18471\n", + " 120363\n", + " Ireland\n", + " 4.0\n", + " N1\n", + " NaN\n", + " 53.0\n", + " 25.0\n", + " 0.0\n", + " N\n", + " 6.0\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 2021 data\n", + " The Irish Navy attempted a few times to collec...\n", + " NaN\n", + " Unknown\n", + " \n", + " \n", + " 18472\n", + " 120364\n", + " Ireland\n", + " 4.0\n", + " N2\n", + " NaN\n", + " 53.0\n", + " 36.0\n", + " 0.0\n", + " N\n", + " 5.0\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 2021 data\n", + " NaN\n", + " NaN\n", + " Unknown\n", + " \n", + " \n", + " 18473\n", + " 120365\n", + " Ireland\n", + " 4.0\n", + " N3\n", + " NaN\n", + " 53.0\n", + " 44.0\n", + " 0.0\n", + " N\n", + " 5.0\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 2021 data\n", + " NaN\n", + " NaN\n", + " Unknown\n", + " \n", + " \n", + " 18474\n", + " 120366\n", + " Ireland\n", + " 4.0\n", + " N8\n", + " NaN\n", + " 53.0\n", + " 39.0\n", + " 0.0\n", + " N\n", + " 5.0\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 2021 data\n", + " NaN\n", + " NaN\n", + " Unknown\n", + " \n", + " \n", + " 18475\n", + " 120367\n", + " 
Ireland\n", + " 4.0\n", + " N9\n", + " NaN\n", + " 53.0\n", + " 53.0\n", + " 0.0\n", + " N\n", + " 5.0\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 2021 data\n", + " NaN\n", + " NaN\n", + " Unknown\n", + " \n", + " \n", + " 18476\n", + " 120368\n", + " Ireland\n", + " 4.0\n", + " N10\n", + " NaN\n", + " 53.0\n", + " 52.0\n", + " 0.0\n", + " N\n", + " 5.0\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 2021 data\n", + " NaN\n", + " NaN\n", + " Unknown\n", + " \n", + " \n", + " 18477\n", + " 120369\n", + " Ireland\n", + " 1.0\n", + " Salthill\n", + " NaN\n", + " 53.0\n", + " 15.0\n", + " 40.0\n", + " N\n", + " 9.0\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 2021 data\n", + " Woodstown (County Waterford) and Salthill (Cou...\n", + " NaN\n", + " Unknown\n", + " \n", + " \n", + " 18478\n", + " 120370\n", + " Ireland\n", + " 1.0\n", + " Woodstown\n", + " NaN\n", + " 52.0\n", + " 11.0\n", + " 55.0\n", + " N\n", + " 6.0\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " Unknown\n", + " \n", + " \n", + "\n", + "

8 rows × 26 columns

\n", + "" + ], + "text/plain": [ + " ID Contracting Party RSC Sub-division Station ID Sample ID LatD \\\n", + "18471 120363 Ireland 4.0 N1 NaN 53.0 \n", + "18472 120364 Ireland 4.0 N2 NaN 53.0 \n", + "18473 120365 Ireland 4.0 N3 NaN 53.0 \n", + "18474 120366 Ireland 4.0 N8 NaN 53.0 \n", + "18475 120367 Ireland 4.0 N9 NaN 53.0 \n", + "18476 120368 Ireland 4.0 N10 NaN 53.0 \n", + "18477 120369 Ireland 1.0 Salthill NaN 53.0 \n", + "18478 120370 Ireland 1.0 Woodstown NaN 52.0 \n", + "\n", + " LatM LatS LatDir LongD ... Nuclide Value type Activity or MDA \\\n", + "18471 25.0 0.0 N 6.0 ... NaN NaN NaN \n", + "18472 36.0 0.0 N 5.0 ... NaN NaN NaN \n", + "18473 44.0 0.0 N 5.0 ... NaN NaN NaN \n", + "18474 39.0 0.0 N 5.0 ... NaN NaN NaN \n", + "18475 53.0 0.0 N 5.0 ... NaN NaN NaN \n", + "18476 52.0 0.0 N 5.0 ... NaN NaN NaN \n", + "18477 15.0 40.0 N 9.0 ... NaN NaN NaN \n", + "18478 11.0 55.0 N 6.0 ... NaN NaN NaN \n", + "\n", + " Uncertainty Unit Data provider Measurement Comment \\\n", + "18471 NaN NaN NaN 2021 data \n", + "18472 NaN NaN NaN 2021 data \n", + "18473 NaN NaN NaN 2021 data \n", + "18474 NaN NaN NaN 2021 data \n", + "18475 NaN NaN NaN 2021 data \n", + "18476 NaN NaN NaN 2021 data \n", + "18477 NaN NaN NaN 2021 data \n", + "18478 NaN NaN NaN NaN \n", + "\n", + " Sample Comment Reference Comment \\\n", + "18471 The Irish Navy attempted a few times to collec... NaN \n", + "18472 NaN NaN \n", + "18473 NaN NaN \n", + "18474 NaN NaN \n", + "18475 NaN NaN \n", + "18476 NaN NaN \n", + "18477 Woodstown (County Waterford) and Salthill (Cou... 
NaN \n", + "18478 NaN NaN \n", + "\n", + " NUCLIDE \n", + "18471 Unknown \n", + "18472 Unknown \n", + "18473 Unknown \n", + "18474 Unknown \n", + "18475 Unknown \n", + "18476 Unknown \n", + "18477 Unknown \n", + "18478 Unknown \n", + "\n", + "[8 rows x 26 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfm.dfs['seawater'][tfm.dfs['seawater']['NUCLIDE'] == 'Unknown']" + ] + }, + { + "cell_type": "markdown", + "id": "f7a2d83a", + "metadata": {}, + "source": [ + "For discussion. Include ``RemoveFilteredRowsCB`` with utils.ipynb?. For now I placed it below. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1f48608", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class RemoveFilteredRowsCB(Callback):\n", + " \"\"\" Remove rows from a dataframe based on a filter condition. \"\"\"\n", + " \n", + " def __init__(self, filters:dict, verbose:bool=False):\n", + " fc.store_attr()\n", + " \n", + " def __call__(self, tfm: 'Transformer'):\n", + " for df_name, filter_condition in self.filters.items():\n", + " self._process_dataframe(tfm, df_name, filter_condition)\n", + "\n", + " def _process_dataframe(self, tfm: 'Transformer', df_name: str, filter_condition: Callable):\n", + " if df_name in tfm.dfs:\n", + " df = tfm.dfs[df_name]\n", + " initial_rows = len(df)\n", + " df = self._apply_filter(df, filter_condition)\n", + " removed_rows = initial_rows - len(df)\n", + " self._log_removal(df_name, removed_rows)\n", + " tfm.dfs[df_name] = df\n", + " else:\n", + " self._log_missing_dataframe(df_name)\n", + "\n", + " def _apply_filter(self, df: pd.DataFrame, filter_condition: Callable) -> pd.DataFrame:\n", + " mask = filter_condition(df)\n", + " return df[~mask] # Keep rows that don't match the filter\n", + "\n", + " def _log_removal(self, df_name: str, removed_rows: int):\n", + " if self.verbose:\n", + " print(f\"RemoveFilteredRowsCB: Removed {removed_rows} rows from 
'{df_name}'.\")\n", + "\n", + " def _log_missing_dataframe(self, df_name: str):\n", + " if self.verbose:\n", + " print(f\"RemoveFilteredRowsCB: Dataframe '{df_name}' not found in tfm.dfs.\")" + ] + }, + { + "cell_type": "markdown", + "id": "3a78c292", + "metadata": {}, + "source": [ + "The callback `RemoveFilteredRowsCB` allows to remove rows based on a custom filter condition. For instance, we can remove rows with `NUCLIDE` labelled as `Unknown` as shown below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef9ecf00", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "nuclide_filters = {\n", + " 'seawater': lambda df: df['NUCLIDE'] == 'Unknown'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09a877fa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RemoveFilteredRowsCB: Removed 8 rows from 'seawater'.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'seawater': ID Contracting Party RSC Sub-division Station ID Sample ID \\\n", + " 0 1 Belgium 8.0 Belgica-W01 WNZ 01 \n", + " 1 2 Belgium 8.0 Belgica-W02 WNZ 02 \n", + " 2 3 Belgium 8.0 Belgica-W03 WNZ 03 \n", + " 3 4 Belgium 8.0 Belgica-W04 WNZ 04 \n", + " 4 5 Belgium 8.0 Belgica-W05 WNZ 05 \n", + " ... ... ... ... ... ... \n", + " 18851 121646 United Kingdom 10.0 Rosyth 2100318 \n", + " 18852 121647 United Kingdom 10.0 Rosyth 2101399 \n", + " 18853 121648 United Kingdom 6.0 Wylfa 21-656 \n", + " 18854 121649 United Kingdom 6.0 Wylfa 21-657 \n", + " 18855 121650 United Kingdom 6.0 Wylfa 21-654 \n", + " \n", + " LatD LatM LatS LatDir LongD ... Nuclide Value type \\\n", + " 0 51.0 22.0 31.0 N 3.0 ... 137Cs < \n", + " 1 51.0 13.0 25.0 N 2.0 ... 137Cs < \n", + " 2 51.0 11.0 4.0 N 2.0 ... 137Cs < \n", + " 3 51.0 25.0 13.0 N 3.0 ... 137Cs < \n", + " 4 51.0 24.0 58.0 N 2.0 ... 137Cs < \n", + " ... ... ... ... ... ... ... ... ... \n", + " 18851 56.0 0.0 40.0 N 3.0 ... 
3H < \n", + " 18852 56.0 0.0 40.0 N 3.0 ... 3H < \n", + " 18853 53.0 24.0 48.0 N 3.0 ... 137Cs = \n", + " 18854 53.0 34.0 11.0 N 3.0 ... 137Cs = \n", + " 18855 53.0 7.0 24.0 N 4.0 ... 137Cs < \n", + " \n", + " Activity or MDA Uncertainty Unit \\\n", + " 0 0.20000 NaN Bq/l \n", + " 1 0.27000 NaN Bq/l \n", + " 2 0.26000 NaN Bq/l \n", + " 3 0.25000 NaN Bq/l \n", + " 4 0.20000 NaN Bq/l \n", + " ... ... ... ... \n", + " 18851 1.00000 NaN Bq/l \n", + " 18852 1.05000 NaN Bq/l \n", + " 18853 0.00431 0.000543 Bq/l \n", + " 18854 0.00946 0.000253 Bq/l \n", + " 18855 0.00170 NaN Bq/l \n", + " \n", + " Data provider Measurement Comment \\\n", + " 0 SCK•CEN NaN \n", + " 1 SCK•CEN NaN \n", + " 2 SCK•CEN NaN \n", + " 3 SCK•CEN NaN \n", + " 4 SCK•CEN NaN \n", + " ... ... ... \n", + " 18851 SEPA-Scottish Environment Protection Agency NaN \n", + " 18852 SEPA-Scottish Environment Protection Agency NaN \n", + " 18853 BEIS NaN \n", + " 18854 BEIS NaN \n", + " 18855 BEIS NaN \n", + " \n", + " Sample Comment Reference Comment NUCLIDE \n", + " 0 NaN NaN cs137 \n", + " 1 NaN NaN cs137 \n", + " 2 NaN NaN cs137 \n", + " 3 NaN NaN cs137 \n", + " 4 NaN NaN cs137 \n", + " ... ... ... ... \n", + " 18851 East of Dockyard NaN h3 \n", + " 18852 East of Dockyard NaN h3 \n", + " 18853 Llandudno NaN cs137 \n", + " 18854 Prestatyn NaN cs137 \n", + " 18855 Holyhead NaN cs137 \n", + " \n", + " [18310 rows x 26 columns],\n", + " 'biota': ID Contracting Party RSC Sub-division Station ID Sample ID \\\n", + " 0 96793 United Kingdom 5 Hunterston 2200086 \n", + " 1 96822 United Kingdom 6 Chapelcross 2200081 \n", + " 2 96823 United Kingdom 7 Dounreay 2200093 \n", + " 3 96824 United Kingdom 7 Dounreay 2200089 \n", + " 4 96857 United Kingdom 10 Torness 2100074 \n", + " ... ... ... ... ... ... 
\n", + " 15309 54203 United Kingdom 6 Sellafield 1995001077 \n", + " 15310 48606 France 2 Granville NaN \n", + " 15311 48634 France 2 Granville NaN \n", + " 15312 48650 France 2 Dielette NaN \n", + " 15313 48610 France 2 Goury NaN \n", + " \n", + " LatD LatM LatS LatDir LongD ... Nuclide Value type \\\n", + " 0 55 43 31.0 N 4 ... 239,240Pu = \n", + " 1 54 58 8.0 N 3 ... 99Tc = \n", + " 2 58 33 57.0 N 3 ... 239,240Pu = \n", + " 3 58 37 7.0 N 3 ... 239,240Pu = \n", + " 4 55 57 53.0 N 2 ... 99Tc = \n", + " ... ... ... ... ... ... ... ... ... \n", + " 15309 54 27 18.0 N 3 ... 99Tc = \n", + " 15310 48 49 58.0 N 1 ... 239,240Pu = \n", + " 15311 48 49 58.0 N 1 ... 137Cs = \n", + " 15312 49 33 6.0 N 1 ... 137Cs = \n", + " 15313 49 42 52.0 N 1 ... 99Tc = \n", + " \n", + " Activity or MDA Uncertainty Unit \\\n", + " 0 0.3510 0.06600 Bq/kg f.w. \n", + " 1 39.0000 15.00000 Bq/kg f.w. \n", + " 2 0.0938 0.01800 Bq/kg f.w. \n", + " 3 1.5400 0.31000 Bq/kg f.w. \n", + " 4 16.0000 6.00000 Bq/kg f.w. \n", + " ... ... ... ... \n", + " 15309 838.0000 33.52000 Bq/kg f.w. \n", + " 15310 0.0180 0.00153 Bq/kg f.w. \n", + " 15311 0.2100 0.03465 Bq/kg f.w. \n", + " 15312 0.5600 0.03500 Bq/kg f.w. \n", + " 15313 17.1560 0.24700 Bq/kg f.w. \n", + " \n", + " Data provider Measurement Comment \\\n", + " 0 SEPA-Scottish Environment Protection Agency NaN \n", + " 1 SEPA-Scottish Environment Protection Agency NaN \n", + " 2 SEPA-Scottish Environment Protection Agency NaN \n", + " 3 SEPA-Scottish Environment Protection Agency NaN \n", + " 4 SEPA-Scottish Environment Protection Agency NaN \n", + " ... ... ... \n", + " 15309 FSA-Food Standards Agency NaN \n", + " 15310 IRSN : OPRI NaN \n", + " 15311 IRSN : OPRI NaN \n", + " 15312 IRSN : LERFA NaN \n", + " 15313 IRSN : LERFA NaN \n", + " \n", + " Sample Comment Reference Comment \\\n", + " 0 PLZ. Annual bulk of 2 samples, representative ... NaN \n", + " 1 PLZ NaN \n", + " 2 Sandside Bay. Annual bulk of 4 samples, repre... NaN \n", + " 3 Brims Ness. 
Annual bulk of 4 samples, represe... NaN \n", + " 4 Thornton Loch. Annual bulk of 2 samples, repre... NaN \n", + " ... ... ... \n", + " 15309 Nethertown NaN \n", + " 15310 NaN NaN \n", + " 15311 NaN NaN \n", + " 15312 NaN NaN \n", + " 15313 NaN NaN \n", + " \n", + " NUCLIDE \n", + " 0 pu239_240_tot \n", + " 1 tc99 \n", + " 2 pu239_240_tot \n", + " 3 pu239_240_tot \n", + " 4 tc99 \n", + " ... ... \n", + " 15309 tc99 \n", + " 15310 pu239_240_tot \n", + " 15311 cs137 \n", + " 15312 cs137 \n", + " 15313 tc99 \n", + " \n", + " [15314 rows x 28 columns]}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfm = Transformer(dfs, cbs=[\n", + " RemoveAllNAValuesCB(cols_to_check),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " RemoveFilteredRowsCB(nuclide_filters, verbose=True)\n", + "])\n", + "tfm()\n" + ] + }, + { + "cell_type": "markdown", + "id": "a54f21ff", + "metadata": {}, + "source": [ + "### Add Nuclide Id column" + ] + }, + { + "cell_type": "markdown", + "id": "0deedefa", + "metadata": {}, + "source": [ + "The `nuclide_id` column is added to the dataframe for legacy reasons (again Open Refine output)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "635c8f39", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RemoveFilteredRowsCB: Removed 8 rows from 'seawater'.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1255,50 +1862,274 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NUCLIDEnuclide_id
0pu239_240_tot77
1tc9915
2pu239_240_tot77
...
15309tc991515309tc9915
15310pu239_240_tot77
15311cs13733
15312cs13733
15313tc9915
\n", + "

15314 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " NUCLIDE nuclide_id\n", + "0 pu239_240_tot 77\n", + "1 tc99 15\n", + "2 pu239_240_tot 77\n", + "3 pu239_240_tot 77\n", + "4 tc99 15\n", + "... ... ...\n", + "15309 tc99 15\n", + "15310 pu239_240_tot 77\n", + "15311 cs137 33\n", + "15312 cs137 33\n", + "15313 tc99 15\n", + "\n", + "[15314 rows x 2 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " RemoveAllNAValuesCB(cols_to_check),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " RemoveFilteredRowsCB(nuclide_filters, verbose=True),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE')\n", + " ])\n", + "dfs_out = tfm()\n", + "\n", + "# For instance\n", + "dfs_out['biota'][['NUCLIDE', 'nuclide_id']]" + ] + }, + { + "cell_type": "markdown", + "id": "2ba5585f", + "metadata": {}, + "source": [ + "## Standardize Time" + ] + }, + { + "cell_type": "markdown", + "id": "d18cd209", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: `Seawater` dataset contains 1O rows with `NaN` values in the `Sampling date` column as shown below.\n", + "\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98ac4866", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "dfs_test = Transformer(dfs, cbs=[RemoveAllNAValuesCB(cols_to_check)])()\n", + "dfs_test['seawater']['Sampling date'].isnull().sum()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c807cd86", + "metadata": {}, + "source": [ + "Create a callback that remaps the time format in the dictionary of dataframes (i.e. 
`%m/%d/%y %H:%M:%S`) and handle missing dates:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcd2893e", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class ParseTimeCB(Callback):\n", + " \"Parse the time format in the dataframe.\"\n", + " def __call__(self, tfm):\n", + " for df in tfm.dfs.values():\n", + " df['time'] = pd.to_datetime(df['Sampling date'], format='%d/%m/%Y', errors='coerce')\n", + " df['begperiod'] = df['time']\n", + " df.dropna(subset=['time'], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "00d44e5d", + "metadata": {}, + "source": [ + "Apply the transformer for callbacks `ParseTimeCB`. Then, print the ``begperiod`` and `time` data for `seawater`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc05ba8e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater biota\n", + "Number of rows in dfs 18856 15314\n", + "Number of rows in tfm.dfs 18308 15314\n", + "Number of dropped rows 548 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 18856 15314 \n", + "\n", + " begperiod time\n", + "0 2010-01-27 2010-01-27\n", + "1 2010-01-27 2010-01-27\n", + "2 2010-01-27 2010-01-27\n", + "3 2010-01-27 2010-01-27\n", + "4 2010-01-26 2010-01-26\n", + "... ... 
...\n", + "18851 2021-04-29 2021-04-29\n", + "18852 2021-12-10 2021-12-10\n", + "18853 2021-04-07 2021-04-07\n", + "18854 2021-04-07 2021-04-07\n", + "18855 2021-04-07 2021-04-07\n", + "\n", + "[18308 rows x 2 columns]\n" + ] + } + ], + "source": [ + "#|eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " RemoveAllNAValuesCB(cols_to_check),\n", + " ParseTimeCB(),\n", + " CompareDfsAndTfmCB(dfs)])\n", + "\n", + "tfm()\n", + "\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", + "print(tfm.dfs['seawater'][['begperiod','time']])" + ] + }, + { + "cell_type": "markdown", + "id": "88c54c09", + "metadata": {}, + "source": [ + "NetCDF time format requires the time to be encoded as the number of seconds since a time of origin. In our case the time of origin is `1970-01-01` as indicated in `configs.ipynb` `CONFIGS['units']['time']` dictionary." + ] + }, + { + "cell_type": "markdown", + "id": "57b38397", + "metadata": {}, + "source": [ + "`EncodeTimeCB` converts the OSPAR `time` format to the MARIS NetCDF `time` format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c30bd204", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
time
01264550400
15310pu239_240_tot7711264550400
15311cs1373321264550400
15312cs1373331264550400
15313tc991541264464000
\n", - "

15314 rows × 2 columns

\n", "
" ], "text/plain": [ - " NUCLIDE nuclide_id\n", - "0 pu239_240_tot 77\n", - "1 tc99 15\n", - "2 pu239_240_tot 77\n", - "3 pu239_240_tot 77\n", - "4 tc99 15\n", - "... ... ...\n", - "15309 tc99 15\n", - "15310 pu239_240_tot 77\n", - "15311 cs137 33\n", - "15312 cs137 33\n", - "15313 tc99 15\n", - "\n", - "[15314 rows x 2 columns]" + " time\n", + "0 1264550400\n", + "1 1264550400\n", + "2 1264550400\n", + "3 1264550400\n", + "4 1264464000" ] }, "execution_count": null, @@ -1307,47 +2138,116 @@ } ], "source": [ - "#| eval: false\n", + "#|eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[RemapNuclideNameCB(lut_nuclides),\n", - " AddNuclideIdColumnCB(col_value='NUCLIDE')\n", - " ])\n", - "dfs_out = tfm()\n", + "tfm = Transformer(dfs, cbs=[\n", + " RemoveAllNAValuesCB(cols_to_check),\n", + " ParseTimeCB(),\n", + " EncodeTimeCB(cfg(), verbose = True)])\n", "\n", - "# For instance\n", - "dfs_out['biota'][['NUCLIDE', 'nuclide_id']]" + "tfm()\n", + "tfm.dfs['seawater'][['time']].head()" ] }, { "cell_type": "markdown", - "id": "2ba5585f", + "id": "17063a80", "metadata": {}, "source": [ - "## Standardize Time" + "## Sanitize value" ] }, { "cell_type": "markdown", - "id": "d18cd209", + "id": "0fe8ab6e", "metadata": {}, "source": [ - ":::{.callout-tip}\n", - "\n", - "**FEEDBACK TO DATA PROVIDER**: `Seawater` dataset contains still 1O rows with `NaN` values in the `Sampling date` column as shown below.\n", + "We allocate each column containing measurement values into a single column `value` and remove `NA` where needed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38538f9f", + "metadata": {}, + "outputs": [], + "source": [ + "# | exports\n", + "class SanitizeValue(Callback):\n", + " \"Sanitize value by removing blank entries and populating `value` column.\"\n", + " def __init__(self, \n", + " value_col: str='Activity or MDA' # Column name to sanitize\n", + " ):\n", + " fc.store_attr()\n", "\n", - ":::" + " def __call__(self, tfm):\n", + " for df in tfm.dfs.values():\n", + " df.dropna(subset=[self.value_col], inplace=True)\n", + " df['value'] = df[self.value_col]" ] }, { "cell_type": "code", "execution_count": null, - "id": "98ac4866", + "id": "a68ecd5b", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value
00.20
10.27
20.26
30.25
40.20
\n", + "
" + ], "text/plain": [ - "10" + " value\n", + "0 0.20\n", + "1 0.27\n", + "2 0.26\n", + "3 0.25\n", + "4 0.20" ] }, "execution_count": null, @@ -1356,112 +2256,147 @@ } ], "source": [ - "#| eval: false\n", + "#|eval: false\n", "dfs = load_data(fname_in)\n", - "dfs_test = Transformer(dfs, cbs=[RemoveAllNAValuesCB(cols_to_check)])()\n", - "dfs_test['seawater']['Sampling date'].isnull().sum()\n" + "tfm = Transformer(dfs, cbs=[SanitizeValue()])\n", + "\n", + "tfm()['seawater'][['value']].head()" ] }, { "cell_type": "markdown", - "id": "c807cd86", + "id": "7c83412b", "metadata": {}, "source": [ - "Create a callback that remaps the time format in the dictionary of dataframes (i.e. `%m/%d/%y %H:%M:%S`) and handle missing dates:" + "## Normalize uncertainty" + ] + }, + { + "cell_type": "markdown", + "id": "13a44f1a", + "metadata": {}, + "source": [ + "For each sample type in the OSPAR dataset, the reported uncertainty is given as an expanded uncertainty with a coverage factor `𝑘=2`. For further details, refer to the [OSPAR reporting guidelines](https://mcc.jrc.ec.europa.eu/documents/OSPAR/Guidelines_forestimationof_a_%20measurefor_uncertainty_in_OSPARmonitoring.pdf).\n", + "\n", + "**Note**: The OSPAR uncertainty values are normalized to standard uncertainty with a coverage factor \n", + "𝑘=1." 
+ ] + }, + { + "cell_type": "markdown", + "id": "97a933ab", + "metadata": {}, + "source": [ + "`NormalizeUncCB` callback normalizes the uncertainty using the following `lambda` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6c84351", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "unc_exp2stan = lambda df, unc_col: df[unc_col] / 2" ] }, { "cell_type": "code", "execution_count": null, - "id": "bcd2893e", + "id": "ecb2866d", "metadata": {}, "outputs": [], "source": [ "#| exports\n", - "class ParseTimeCB(Callback):\n", - " \"Parse the time format in the dataframe.\"\n", + "class NormalizeUncCB(Callback):\n", + " \"\"\"Normalize uncertainty values in DataFrames.\"\"\"\n", + " def __init__(self, \n", + " col_unc: str='Uncertainty', # Column name to normalize\n", + " fn_convert_unc: Callable=unc_exp2stan, # Function correcting coverage factor\n", + " ): \n", + " fc.store_attr()\n", + "\n", " def __call__(self, tfm):\n", " for df in tfm.dfs.values():\n", - " df['time'] = pd.to_datetime(df['Sampling date'], format='%d/%m/%Y', errors='coerce')\n", - " df['begperiod'] = df['time']\n", - " df.dropna(subset=['time'], inplace=True)" - ] - }, - { - "cell_type": "markdown", - "id": "00d44e5d", - "metadata": {}, - "source": [ - "Apply the transformer for callbacks `ParseTimeCB`. Then, print the ``begperiod`` and `time` data for `seawater`." 
+ " df['uncertainty'] = self.fn_convert_unc(df, self.col_unc)" ] }, { "cell_type": "code", "execution_count": null, - "id": "bc05ba8e", + "id": "76a18010", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " seawater biota\n", - "Number of rows in dfs 18856 15314\n", - "Number of rows in tfm.dfs 18308 15314\n", - "Number of dropped rows 548 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 18856 15314 \n", "\n", - " begperiod time\n", - "0 2010-01-27 2010-01-27\n", - "1 2010-01-27 2010-01-27\n", - "2 2010-01-27 2010-01-27\n", - "3 2010-01-27 2010-01-27\n", - "4 2010-01-26 2010-01-26\n", - "... ... ...\n", - "18851 2021-04-29 2021-04-29\n", - "18852 2021-12-10 2021-12-10\n", - "18853 2021-04-07 2021-04-07\n", - "18854 2021-04-07 2021-04-07\n", - "18855 2021-04-07 2021-04-07\n", + "seawater:\n", + " value uncertainty\n", + "0 0.20 NaN\n", + "1 0.27 NaN\n", + "2 0.26 NaN\n", + "3 0.25 NaN\n", + "4 0.20 NaN\n", "\n", - "[18308 rows x 2 columns]\n" + "biota:\n", + " value uncertainty\n", + "0 0.3510 0.033\n", + "1 39.0000 7.500\n", + "2 0.0938 0.009\n", + "3 1.5400 0.155\n", + "4 16.0000 3.000\n" ] } ], "source": [ "#|eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[\n", - " RemoveAllNAValuesCB(cols_to_check),\n", - " ParseTimeCB(),\n", - " CompareDfsAndTfmCB(dfs)])\n", - "\n", + "tfm = Transformer(dfs, cbs=[ \n", + " SanitizeValue(), \n", + " NormalizeUncCB()\n", + " ])\n", "tfm()\n", "\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", - "print(tfm.dfs['seawater'][['begperiod','time']])" + "for grp in ['seawater', 'biota']:\n", + " print(f'\\n{grp}:')\n", + " print(tfm.dfs[grp][['value', 'uncertainty']].head())" ] }, { "cell_type": "markdown", - "id": "88c54c09", + "id": "48276f91", "metadata": {}, "source": [ - "NetCDF time format requires the time to be encoded as number of milliseconds since a time of origin. 
In our case the time of origin is `1970-01-01` as indicated in `configs.ipynb` `CONFIFS['units']['time']` dictionary." + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: `Seawater` dataset contains rows where the uncertainty is much greater than the value. \n", + "\n", + ":::" ] }, { - "cell_type": "markdown", - "id": "57b38397", + "cell_type": "code", + "execution_count": null, + "id": "84000c3b", "metadata": {}, + "outputs": [], "source": [ - "`EncodeTimeCB` converts the HELCOM `time` format to the MARIS NetCDF `time` format." + "grp='seawater'\n", + "tfm.dfs[grp]['relative_uncertainty'] = (\n", + " # Divide 'uncertainty' by 'value'\n", + " (tfm.dfs[grp]['uncertainty'] / tfm.dfs[grp]['value'])\n", + " # Multiply by 100 to convert to percentage\n", + " * 100\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "c30bd204", + "id": "5f5144aa", "metadata": {}, "outputs": [ { @@ -1485,41 +2420,987 @@ " \n", " \n", " \n", - " time\n", + " ID\n", + " Contracting Party\n", + " Nuclide\n", + " Value type\n", + " Activity or MDA\n", + " Uncertainty\n", + " Unit\n", + " relative_uncertainty\n", + " \n", + " \n", + " \n", + " \n", + " 1158\n", + " 11075\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 2.800000e-03\n", + " 3.276000e-01\n", + " Bq/l\n", + " 5.850000e+03\n", + " \n", + " \n", + " 1160\n", + " 11077\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 2.900000e-03\n", + " 3.364000e-01\n", + " Bq/l\n", + " 5.800000e+03\n", + " \n", + " \n", + " 1162\n", + " 11079\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 2.500000e-03\n", + " 3.325000e-01\n", + " Bq/l\n", + " 6.650000e+03\n", + " \n", + " \n", + " 1164\n", + " 11081\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 2.500000e-03\n", + " 3.450000e-01\n", + " Bq/l\n", + " 6.900000e+03\n", + " \n", + " \n", + " 1166\n", + " 11083\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 3.800000e-03\n", + " 3.344000e-01\n", + " Bq/l\n", + " 4.400000e+03\n", + " 
\n", + " \n", + " 1168\n", + " 11085\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 3.500000e-03\n", + " 3.220000e-01\n", + " Bq/l\n", + " 4.600000e+03\n", + " \n", + " \n", + " 1170\n", + " 11087\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 3.500000e-03\n", + " 3.395000e-01\n", + " Bq/l\n", + " 4.850000e+03\n", + " \n", + " \n", + " 1211\n", + " 11128\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 1.600000e-03\n", + " 3.456000e-01\n", + " Bq/l\n", + " 1.080000e+04\n", + " \n", + " \n", + " 1213\n", + " 11130\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 1.600000e-03\n", + " 3.296000e-01\n", + " Bq/l\n", + " 1.030000e+04\n", + " \n", + " \n", + " 1215\n", + " 11132\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 3.000000e-03\n", + " 3.300000e-01\n", + " Bq/l\n", + " 5.500000e+03\n", + " \n", + " \n", + " 1217\n", + " 11134\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 3.900000e-03\n", + " 3.237000e-01\n", + " Bq/l\n", + " 4.150000e+03\n", + " \n", + " \n", + " 1220\n", + " 11137\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 7.300000e-03\n", + " 3.139000e-01\n", + " Bq/l\n", + " 2.150000e+03\n", + " \n", + " \n", + " 1222\n", + " 11139\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 1.500000e-02\n", + " 3.300000e-01\n", + " Bq/l\n", + " 1.100000e+03\n", + " \n", + " \n", + " 1224\n", + " 11141\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 9.600000e-03\n", + " 3.264000e-01\n", + " Bq/l\n", + " 1.700000e+03\n", + " \n", + " \n", + " 1226\n", + " 11143\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 6.600000e-03\n", + " 3.036000e-01\n", + " Bq/l\n", + " 2.300000e+03\n", + " \n", + " \n", + " 1228\n", + " 11145\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 4.000000e-03\n", + " 3.000000e-01\n", + " Bq/l\n", + " 3.750000e+03\n", + " \n", + " \n", + " 1230\n", + " 11147\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 4.000000e-03\n", + " 3.000000e-01\n", + " Bq/l\n", + " 
3.750000e+03\n", + " \n", + " \n", + " 1232\n", + " 11149\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 1.700000e-02\n", + " 3.570000e-01\n", + " Bq/l\n", + " 1.050000e+03\n", + " \n", + " \n", + " 1234\n", + " 11151\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 1.410000e-02\n", + " 3.525000e-01\n", + " Bq/l\n", + " 1.250000e+03\n", + " \n", + " \n", + " 1248\n", + " 11165\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 5.500000e-03\n", + " 3.300000e-01\n", + " Bq/l\n", + " 3.000000e+03\n", + " \n", + " \n", + " 1250\n", + " 11167\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 3.600000e-03\n", + " 3.312000e-01\n", + " Bq/l\n", + " 4.600000e+03\n", + " \n", + " \n", + " 1252\n", + " 11169\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 6.160000e-02\n", + " 4.928000e-01\n", + " Bq/l\n", + " 4.000000e+02\n", + " \n", + " \n", + " 1254\n", + " 11171\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 7.150000e-02\n", + " 5.005000e-01\n", + " Bq/l\n", + " 3.500000e+02\n", + " \n", + " \n", + " 1256\n", + " 11173\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 7.580000e-02\n", + " 5.306000e-01\n", + " Bq/l\n", + " 3.500000e+02\n", + " \n", + " \n", + " 1258\n", + " 11175\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 8.280000e-02\n", + " 4.968000e-01\n", + " Bq/l\n", + " 3.000000e+02\n", + " \n", + " \n", + " 1260\n", + " 11177\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 8.850000e-02\n", + " 5.310000e-01\n", + " Bq/l\n", + " 3.000000e+02\n", + " \n", + " \n", + " 1262\n", + " 11179\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 1.150000e-02\n", + " 3.565000e-01\n", + " Bq/l\n", + " 1.550000e+03\n", + " \n", + " \n", + " 1264\n", + " 11181\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 1.040000e-02\n", + " 3.432000e-01\n", + " Bq/l\n", + " 1.650000e+03\n", + " \n", + " \n", + " 1266\n", + " 11183\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 2.810000e-02\n", + " 3.934000e-01\n", 
+ " Bq/l\n", + " 7.000000e+02\n", + " \n", + " \n", + " 1268\n", + " 11185\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 4.550000e-02\n", + " 4.550000e-01\n", + " Bq/l\n", + " 5.000000e+02\n", + " \n", + " \n", + " 1270\n", + " 11187\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 3.710000e-02\n", + " 4.081000e-01\n", + " Bq/l\n", + " 5.500000e+02\n", + " \n", + " \n", + " 1272\n", + " 11189\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 4.050000e-02\n", + " 4.050000e-01\n", + " Bq/l\n", + " 5.000000e+02\n", + " \n", + " \n", + " 1274\n", + " 11191\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 5.080000e-02\n", + " 4.572000e-01\n", + " Bq/l\n", + " 4.500000e+02\n", + " \n", + " \n", + " 1276\n", + " 11193\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 7.460000e-02\n", + " 5.222000e-01\n", + " Bq/l\n", + " 3.500000e+02\n", + " \n", + " \n", + " 1278\n", + " 11195\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 4.820000e-02\n", + " 4.338000e-01\n", + " Bq/l\n", + " 4.500000e+02\n", + " \n", + " \n", + " 1280\n", + " 11197\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 3.260000e-02\n", + " 3.912000e-01\n", + " Bq/l\n", + " 6.000000e+02\n", + " \n", + " \n", + " 1282\n", + " 11199\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 1.280000e-02\n", + " 3.328000e-01\n", + " Bq/l\n", + " 1.300000e+03\n", + " \n", + " \n", + " 1623\n", + " 15018\n", + " Norway\n", + " 239,240Pu\n", + " =\n", + " 2.400000e-12\n", + " 4.000000e-07\n", + " Bq/l\n", + " 8.333333e+06\n", + " \n", + " \n", + " 4696\n", + " 37532\n", + " Germany\n", + " 99Tc\n", + " =\n", + " 2.230000e-03\n", + " 1.200000e-01\n", + " Bq/l\n", + " 2.690583e+03\n", + " \n", + " \n", + " 4712\n", + " 37548\n", + " Germany\n", + " 99Tc\n", + " =\n", + " 6.300000e-04\n", + " 7.000000e-02\n", + " Bq/l\n", + " 5.555556e+03\n", " \n", - " \n", - " \n", " \n", - " 0\n", - " 1264550400\n", + " 4713\n", + " 37549\n", + " Germany\n", + " 99Tc\n", + " =\n", + " 
9.200000e-04\n", + " 9.000000e-02\n", + " Bq/l\n", + " 4.891304e+03\n", " \n", " \n", - " 1\n", - " 1264550400\n", + " 4714\n", + " 37550\n", + " Germany\n", + " 99Tc\n", + " =\n", + " 5.500000e-04\n", + " 7.000000e-02\n", + " Bq/l\n", + " 6.363636e+03\n", " \n", " \n", - " 2\n", - " 1264550400\n", + " 4715\n", + " 37551\n", + " Germany\n", + " 99Tc\n", + " =\n", + " 5.900000e-04\n", + " 1.200000e-01\n", + " Bq/l\n", + " 1.016949e+04\n", " \n", " \n", - " 3\n", - " 1264550400\n", + " 7482\n", + " 55488\n", + " United Kingdom\n", + " 3H\n", + " =\n", + " 1.110910e+01\n", + " 9.716400e+04\n", + " Bq/l\n", + " 4.373172e+05\n", " \n", " \n", - " 4\n", - " 1264464000\n", + " 8758\n", + " 61136\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 6.402730e-05\n", + " 1.747559e-02\n", + " Bq/l\n", + " 1.364698e+04\n", + " \n", + " \n", + " 8759\n", + " 61137\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 5.121720e-05\n", + " 2.262377e-02\n", + " Bq/l\n", + " 2.208610e+04\n", + " \n", + " \n", + " 8760\n", + " 61138\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 5.266390e-05\n", + " 2.183357e-02\n", + " Bq/l\n", + " 2.072916e+04\n", + " \n", + " \n", + " 8761\n", + " 61139\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 6.730810e-05\n", + " 1.577155e-02\n", + " Bq/l\n", + " 1.171594e+04\n", + " \n", + " \n", + " 8762\n", + " 61140\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 2.416740e-04\n", + " 1.041441e-02\n", + " Bq/l\n", + " 2.154639e+03\n", + " \n", + " \n", + " 11362\n", + " 75828\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 1.400000e-04\n", + " 4.000000e-02\n", + " Bq/l\n", + " 1.428571e+04\n", + " \n", + " \n", + " 11363\n", + " 75829\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 1.000000e-04\n", + " 6.000000e-02\n", + " Bq/l\n", + " 3.000000e+04\n", + " \n", + " \n", + " 11364\n", + " 75830\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 8.000000e-05\n", + " 4.000000e-02\n", + " Bq/l\n", + " 2.500000e+04\n", + " \n", + " \n", + " 11930\n", + " 80299\n", + " Sweden\n", + " 
137Cs\n", + " =\n", + " 1.400000e-02\n", + " 7.000000e-02\n", + " Bq/l\n", + " 2.500000e+02\n", + " \n", + " \n", + " 11932\n", + " 80301\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 8.000000e-03\n", + " 1.000000e-01\n", + " Bq/l\n", + " 6.250000e+02\n", + " \n", + " \n", + " 11934\n", + " 80303\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 1.380000e-02\n", + " 7.000000e-02\n", + " Bq/l\n", + " 2.536232e+02\n", + " \n", + " \n", + " 11936\n", + " 80305\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 1.600000e-02\n", + " 1.300000e-01\n", + " Bq/l\n", + " 4.062500e+02\n", + " \n", + " \n", + " 12794\n", + " 85501\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 1.542590e-04\n", + " 7.184512e-03\n", + " Bq/l\n", + " 2.328717e+03\n", + " \n", + " \n", + " 12795\n", + " 85502\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 1.722010e-04\n", + " 4.580005e-03\n", + " Bq/l\n", + " 1.329843e+03\n", + " \n", + " \n", + " 12796\n", + " 85503\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 1.170680e-04\n", + " 5.970171e-03\n", + " Bq/l\n", + " 2.549873e+03\n", + " \n", + " \n", + " 12797\n", + " 85504\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 8.904780e-05\n", + " 7.069506e-03\n", + " Bq/l\n", + " 3.969501e+03\n", + " \n", + " \n", + " 12798\n", + " 85505\n", + " Norway\n", + " 99Tc\n", + " =\n", + " 4.649850e-05\n", + " 1.179140e-02\n", + " Bq/l\n", + " 1.267934e+04\n", + " \n", + " \n", + " 12811\n", + " 85518\n", + " Sweden\n", + " 3H\n", + " =\n", + " 8.564240e-01\n", + " 2.340000e+00\n", + " Bq/l\n", + " 1.366146e+02\n", + " \n", + " \n", + " 15893\n", + " 94611\n", + " United Kingdom\n", + " 3H\n", + " =\n", + " 3.351500e-01\n", + " 9.643907e-01\n", + " Bq/l\n", + " 1.438745e+02\n", + " \n", + " \n", + " 15929\n", + " 94647\n", + " United Kingdom\n", + " 99Tc\n", + " =\n", + " 2.030000e-03\n", + " 1.240000e-01\n", + " Bq/l\n", + " 3.054187e+03\n", + " \n", + " \n", + " 15930\n", + " 94648\n", + " United Kingdom\n", + " 99Tc\n", + " =\n", + " 2.360000e-03\n", + " 
1.270000e-01\n", + " Bq/l\n", + " 2.690678e+03\n", + " \n", + " \n", + " 15961\n", + " 94679\n", + " United Kingdom\n", + " 3H\n", + " =\n", + " 2.721028e+00\n", + " 3.246159e+01\n", + " Bq/l\n", + " 5.964948e+02\n", + " \n", + " \n", + " 15963\n", + " 94681\n", + " United Kingdom\n", + " 3H\n", + " =\n", + " 2.098754e+00\n", + " 4.180965e+01\n", + " Bq/l\n", + " 9.960588e+02\n", + " \n", + " \n", + " 15967\n", + " 94685\n", + " United Kingdom\n", + " 3H\n", + " =\n", + " 3.882378e+00\n", + " 2.298351e+01\n", + " Bq/l\n", + " 2.959978e+02\n", + " \n", + " \n", + " 15969\n", + " 94687\n", + " United Kingdom\n", + " 3H\n", + " =\n", + " 3.804731e+00\n", + " 2.343423e+01\n", + " Bq/l\n", + " 3.079617e+02\n", + " \n", + " \n", + " 15971\n", + " 94689\n", + " United Kingdom\n", + " 3H\n", + " =\n", + " 2.332310e+00\n", + " 3.772006e+01\n", + " Bq/l\n", + " 8.086417e+02\n", + " \n", + " \n", + " 15973\n", + " 94691\n", + " United Kingdom\n", + " 3H\n", + " =\n", + " 7.773164e-01\n", + " 1.113200e+02\n", + " Bq/l\n", + " 7.160536e+03\n", + " \n", + " \n", + " 15977\n", + " 94695\n", + " United Kingdom\n", + " 3H\n", + " =\n", + " 1.554393e-01\n", + " 5.528771e+02\n", + " Bq/l\n", + " 1.778435e+05\n", + " \n", + " \n", + " 18788\n", + " 121583\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 2.460000e-03\n", + " 4.009800e-01\n", + " Bq/l\n", + " 8.150000e+03\n", + " \n", + " \n", + " 18789\n", + " 121584\n", + " United Kingdom\n", + " 137Cs\n", + " =\n", + " 1.690000e-03\n", + " 5.120700e-01\n", + " Bq/l\n", + " 1.515000e+04\n", " \n", " \n", "\n", "" ], "text/plain": [ - " time\n", - "0 1264550400\n", - "1 1264550400\n", - "2 1264550400\n", - "3 1264550400\n", - "4 1264464000" + " ID Contracting Party Nuclide Value type Activity or MDA \\\n", + "1158 11075 United Kingdom 137Cs = 2.800000e-03 \n", + "1160 11077 United Kingdom 137Cs = 2.900000e-03 \n", + "1162 11079 United Kingdom 137Cs = 2.500000e-03 \n", + "1164 11081 United Kingdom 137Cs = 2.500000e-03 \n", + "1166 
11083 United Kingdom 137Cs = 3.800000e-03 \n", + "1168 11085 United Kingdom 137Cs = 3.500000e-03 \n", + "1170 11087 United Kingdom 137Cs = 3.500000e-03 \n", + "1211 11128 United Kingdom 137Cs = 1.600000e-03 \n", + "1213 11130 United Kingdom 137Cs = 1.600000e-03 \n", + "1215 11132 United Kingdom 137Cs = 3.000000e-03 \n", + "1217 11134 United Kingdom 137Cs = 3.900000e-03 \n", + "1220 11137 United Kingdom 137Cs = 7.300000e-03 \n", + "1222 11139 United Kingdom 137Cs = 1.500000e-02 \n", + "1224 11141 United Kingdom 137Cs = 9.600000e-03 \n", + "1226 11143 United Kingdom 137Cs = 6.600000e-03 \n", + "1228 11145 United Kingdom 137Cs = 4.000000e-03 \n", + "1230 11147 United Kingdom 137Cs = 4.000000e-03 \n", + "1232 11149 United Kingdom 137Cs = 1.700000e-02 \n", + "1234 11151 United Kingdom 137Cs = 1.410000e-02 \n", + "1248 11165 United Kingdom 137Cs = 5.500000e-03 \n", + "1250 11167 United Kingdom 137Cs = 3.600000e-03 \n", + "1252 11169 United Kingdom 137Cs = 6.160000e-02 \n", + "1254 11171 United Kingdom 137Cs = 7.150000e-02 \n", + "1256 11173 United Kingdom 137Cs = 7.580000e-02 \n", + "1258 11175 United Kingdom 137Cs = 8.280000e-02 \n", + "1260 11177 United Kingdom 137Cs = 8.850000e-02 \n", + "1262 11179 United Kingdom 137Cs = 1.150000e-02 \n", + "1264 11181 United Kingdom 137Cs = 1.040000e-02 \n", + "1266 11183 United Kingdom 137Cs = 2.810000e-02 \n", + "1268 11185 United Kingdom 137Cs = 4.550000e-02 \n", + "1270 11187 United Kingdom 137Cs = 3.710000e-02 \n", + "1272 11189 United Kingdom 137Cs = 4.050000e-02 \n", + "1274 11191 United Kingdom 137Cs = 5.080000e-02 \n", + "1276 11193 United Kingdom 137Cs = 7.460000e-02 \n", + "1278 11195 United Kingdom 137Cs = 4.820000e-02 \n", + "1280 11197 United Kingdom 137Cs = 3.260000e-02 \n", + "1282 11199 United Kingdom 137Cs = 1.280000e-02 \n", + "1623 15018 Norway 239,240Pu = 2.400000e-12 \n", + "4696 37532 Germany 99Tc = 2.230000e-03 \n", + "4712 37548 Germany 99Tc = 6.300000e-04 \n", + "4713 37549 Germany 99Tc = 9.200000e-04 \n", 
+ "4714 37550 Germany 99Tc = 5.500000e-04 \n", + "4715 37551 Germany 99Tc = 5.900000e-04 \n", + "7482 55488 United Kingdom 3H = 1.110910e+01 \n", + "8758 61136 Norway 99Tc = 6.402730e-05 \n", + "8759 61137 Norway 99Tc = 5.121720e-05 \n", + "8760 61138 Norway 99Tc = 5.266390e-05 \n", + "8761 61139 Norway 99Tc = 6.730810e-05 \n", + "8762 61140 Norway 99Tc = 2.416740e-04 \n", + "11362 75828 Norway 99Tc = 1.400000e-04 \n", + "11363 75829 Norway 99Tc = 1.000000e-04 \n", + "11364 75830 Norway 99Tc = 8.000000e-05 \n", + "11930 80299 Sweden 137Cs = 1.400000e-02 \n", + "11932 80301 Sweden 137Cs = 8.000000e-03 \n", + "11934 80303 Sweden 137Cs = 1.380000e-02 \n", + "11936 80305 Sweden 137Cs = 1.600000e-02 \n", + "12794 85501 Norway 99Tc = 1.542590e-04 \n", + "12795 85502 Norway 99Tc = 1.722010e-04 \n", + "12796 85503 Norway 99Tc = 1.170680e-04 \n", + "12797 85504 Norway 99Tc = 8.904780e-05 \n", + "12798 85505 Norway 99Tc = 4.649850e-05 \n", + "12811 85518 Sweden 3H = 8.564240e-01 \n", + "15893 94611 United Kingdom 3H = 3.351500e-01 \n", + "15929 94647 United Kingdom 99Tc = 2.030000e-03 \n", + "15930 94648 United Kingdom 99Tc = 2.360000e-03 \n", + "15961 94679 United Kingdom 3H = 2.721028e+00 \n", + "15963 94681 United Kingdom 3H = 2.098754e+00 \n", + "15967 94685 United Kingdom 3H = 3.882378e+00 \n", + "15969 94687 United Kingdom 3H = 3.804731e+00 \n", + "15971 94689 United Kingdom 3H = 2.332310e+00 \n", + "15973 94691 United Kingdom 3H = 7.773164e-01 \n", + "15977 94695 United Kingdom 3H = 1.554393e-01 \n", + "18788 121583 United Kingdom 137Cs = 2.460000e-03 \n", + "18789 121584 United Kingdom 137Cs = 1.690000e-03 \n", + "\n", + " Uncertainty Unit relative_uncertainty \n", + "1158 3.276000e-01 Bq/l 5.850000e+03 \n", + "1160 3.364000e-01 Bq/l 5.800000e+03 \n", + "1162 3.325000e-01 Bq/l 6.650000e+03 \n", + "1164 3.450000e-01 Bq/l 6.900000e+03 \n", + "1166 3.344000e-01 Bq/l 4.400000e+03 \n", + "1168 3.220000e-01 Bq/l 4.600000e+03 \n", + "1170 3.395000e-01 Bq/l 4.850000e+03 \n", 
+ "1211 3.456000e-01 Bq/l 1.080000e+04 \n", + "1213 3.296000e-01 Bq/l 1.030000e+04 \n", + "1215 3.300000e-01 Bq/l 5.500000e+03 \n", + "1217 3.237000e-01 Bq/l 4.150000e+03 \n", + "1220 3.139000e-01 Bq/l 2.150000e+03 \n", + "1222 3.300000e-01 Bq/l 1.100000e+03 \n", + "1224 3.264000e-01 Bq/l 1.700000e+03 \n", + "1226 3.036000e-01 Bq/l 2.300000e+03 \n", + "1228 3.000000e-01 Bq/l 3.750000e+03 \n", + "1230 3.000000e-01 Bq/l 3.750000e+03 \n", + "1232 3.570000e-01 Bq/l 1.050000e+03 \n", + "1234 3.525000e-01 Bq/l 1.250000e+03 \n", + "1248 3.300000e-01 Bq/l 3.000000e+03 \n", + "1250 3.312000e-01 Bq/l 4.600000e+03 \n", + "1252 4.928000e-01 Bq/l 4.000000e+02 \n", + "1254 5.005000e-01 Bq/l 3.500000e+02 \n", + "1256 5.306000e-01 Bq/l 3.500000e+02 \n", + "1258 4.968000e-01 Bq/l 3.000000e+02 \n", + "1260 5.310000e-01 Bq/l 3.000000e+02 \n", + "1262 3.565000e-01 Bq/l 1.550000e+03 \n", + "1264 3.432000e-01 Bq/l 1.650000e+03 \n", + "1266 3.934000e-01 Bq/l 7.000000e+02 \n", + "1268 4.550000e-01 Bq/l 5.000000e+02 \n", + "1270 4.081000e-01 Bq/l 5.500000e+02 \n", + "1272 4.050000e-01 Bq/l 5.000000e+02 \n", + "1274 4.572000e-01 Bq/l 4.500000e+02 \n", + "1276 5.222000e-01 Bq/l 3.500000e+02 \n", + "1278 4.338000e-01 Bq/l 4.500000e+02 \n", + "1280 3.912000e-01 Bq/l 6.000000e+02 \n", + "1282 3.328000e-01 Bq/l 1.300000e+03 \n", + "1623 4.000000e-07 Bq/l 8.333333e+06 \n", + "4696 1.200000e-01 Bq/l 2.690583e+03 \n", + "4712 7.000000e-02 Bq/l 5.555556e+03 \n", + "4713 9.000000e-02 Bq/l 4.891304e+03 \n", + "4714 7.000000e-02 Bq/l 6.363636e+03 \n", + "4715 1.200000e-01 Bq/l 1.016949e+04 \n", + "7482 9.716400e+04 Bq/l 4.373172e+05 \n", + "8758 1.747559e-02 Bq/l 1.364698e+04 \n", + "8759 2.262377e-02 Bq/l 2.208610e+04 \n", + "8760 2.183357e-02 Bq/l 2.072916e+04 \n", + "8761 1.577155e-02 Bq/l 1.171594e+04 \n", + "8762 1.041441e-02 Bq/l 2.154639e+03 \n", + "11362 4.000000e-02 Bq/l 1.428571e+04 \n", + "11363 6.000000e-02 Bq/l 3.000000e+04 \n", + "11364 4.000000e-02 Bq/l 2.500000e+04 \n", + "11930 
7.000000e-02 Bq/l 2.500000e+02 \n", + "11932 1.000000e-01 Bq/l 6.250000e+02 \n", + "11934 7.000000e-02 Bq/l 2.536232e+02 \n", + "11936 1.300000e-01 Bq/l 4.062500e+02 \n", + "12794 7.184512e-03 Bq/l 2.328717e+03 \n", + "12795 4.580005e-03 Bq/l 1.329843e+03 \n", + "12796 5.970171e-03 Bq/l 2.549873e+03 \n", + "12797 7.069506e-03 Bq/l 3.969501e+03 \n", + "12798 1.179140e-02 Bq/l 1.267934e+04 \n", + "12811 2.340000e+00 Bq/l 1.366146e+02 \n", + "15893 9.643907e-01 Bq/l 1.438745e+02 \n", + "15929 1.240000e-01 Bq/l 3.054187e+03 \n", + "15930 1.270000e-01 Bq/l 2.690678e+03 \n", + "15961 3.246159e+01 Bq/l 5.964948e+02 \n", + "15963 4.180965e+01 Bq/l 9.960588e+02 \n", + "15967 2.298351e+01 Bq/l 2.959978e+02 \n", + "15969 2.343423e+01 Bq/l 3.079617e+02 \n", + "15971 3.772006e+01 Bq/l 8.086417e+02 \n", + "15973 1.113200e+02 Bq/l 7.160536e+03 \n", + "15977 5.528771e+02 Bq/l 1.778435e+05 \n", + "18788 4.009800e-01 Bq/l 8.150000e+03 \n", + "18789 5.120700e-01 Bq/l 1.515000e+04 " ] }, "execution_count": null, @@ -1528,58 +3409,43 @@ } ], "source": [ - "#|eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[\n", - " RemoveAllNAValuesCB(cols_to_check),\n", - " ParseTimeCB(),\n", - " EncodeTimeCB(cfg(), verbose = True)])\n", - "\n", - "tfm()\n", - "tfm.dfs['seawater'][['time']].head()" - ] - }, - { - "cell_type": "markdown", - "id": "17063a80", - "metadata": {}, - "source": [ - "## Sanitize value" + "threshold = 100\n", + "cols_to_show=['ID','Contracting Party','Nuclide', 'Value type','Activity or MDA', 'Uncertainty', 'Unit', 'relative_uncertainty' ]\n", + "tfm.dfs[grp][cols_to_show][tfm.dfs[grp]['relative_uncertainty'] > threshold]\n" ] }, { "cell_type": "markdown", - "id": "0fe8ab6e", + "id": "1bf1eea9", "metadata": {}, "source": [ - "We allocate each column containing measurement values into a single column `value` and remove `NA` where needed." 
+ ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: `biota` dataset contains rows where the uncertainty is much greater than the value. \n", + "\n", + ":::" ] }, { "cell_type": "code", "execution_count": null, - "id": "38538f9f", + "id": "010c88eb", "metadata": {}, "outputs": [], "source": [ - "# | exports\n", - "class SanitizeValue(Callback):\n", - " \"Sanitize value by removing blank entries and populating `value` column.\"\n", - " def __init__(self, \n", - " value_col: str='Activity or MDA' # Column name to sanitize\n", - " ):\n", - " fc.store_attr()\n", - "\n", - " def __call__(self, tfm):\n", - " for df in tfm.dfs.values():\n", - " df.dropna(subset=[self.value_col], inplace=True)\n", - " df['value'] = df[self.value_col]" + "grp='biota'\n", + "tfm.dfs[grp]['relative_uncertainty'] = (\n", + " # Divide 'uncertainty' by 'value'\n", + " (tfm.dfs[grp]['uncertainty'] / tfm.dfs[grp]['value'])\n", + " # Multiply by 100 to convert to percentage\n", + " * 100\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "a68ecd5b", + "id": "5d211075", "metadata": {}, "outputs": [ { @@ -1603,41 +3469,506 @@ " \n", " \n", " \n", - " value\n", + " ID\n", + " Contracting Party\n", + " Nuclide\n", + " Value type\n", + " Activity or MDA\n", + " Uncertainty\n", + " Unit\n", + " relative_uncertainty\n", + " \n", + " \n", + " \n", + " \n", + " 1491\n", + " 88591\n", + " Denmark\n", + " 137Cs\n", + " =\n", + " 0.024000\n", + " 0.1248\n", + " Bq/kg f.w.\n", + " 260.000000\n", + " \n", + " \n", + " 3279\n", + " 82675\n", + " United Kingdom\n", + " 239,240Pu\n", + " =\n", + " 0.056000\n", + " 0.1300\n", + " Bq/kg f.w.\n", + " 116.071429\n", + " \n", + " \n", + " 3430\n", + " 82600\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.380000\n", + " 3.3800\n", + " Bq/kg f.w.\n", + " 444.736842\n", + " \n", + " \n", + " 5934\n", + " 49310\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.168608\n", + " 0.7040\n", + " Bq/kg f.w.\n", + " 208.768267\n", + " \n", + " \n", 
+ " 6202\n", + " 49307\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.157033\n", + " 0.7460\n", + " Bq/kg f.w.\n", + " 237.529691\n", + " \n", + " \n", + " 6605\n", + " 49305\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.118002\n", + " 0.5540\n", + " Bq/kg f.w.\n", + " 234.741784\n", + " \n", + " \n", + " 6891\n", + " 49300\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.153924\n", + " 0.7620\n", + " Bq/kg f.w.\n", + " 247.524752\n", + " \n", + " \n", + " 7238\n", + " 49297\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.192765\n", + " 0.7100\n", + " Bq/kg f.w.\n", + " 184.162063\n", + " \n", + " \n", + " 7435\n", + " 62016\n", + " France\n", + " 137Cs\n", + " =\n", + " 0.039809\n", + " 0.1200\n", + " Bq/kg f.w.\n", + " 150.719717\n", + " \n", + " \n", + " 7454\n", + " 49296\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.174048\n", + " 0.6720\n", + " Bq/kg f.w.\n", + " 193.050193\n", + " \n", + " \n", + " 7718\n", + " 49292\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.155430\n", + " 0.6600\n", + " Bq/kg f.w.\n", + " 212.314225\n", + " \n", + " \n", + " 8014\n", + " 49290\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.131120\n", + " 0.8800\n", + " Bq/kg f.w.\n", + " 335.570470\n", + " \n", + " \n", + " 8315\n", + " 49287\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.166440\n", + " 1.1400\n", + " Bq/kg f.w.\n", + " 342.465753\n", + " \n", + " \n", + " 8622\n", + " 49284\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.147918\n", + " 1.0680\n", + " Bq/kg f.w.\n", + " 361.010830\n", + " \n", + " \n", + " 8929\n", + " 49280\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.331110\n", + " 1.1320\n", + " Bq/kg f.w.\n", + " 170.940171\n", + " \n", + " \n", + " 9153\n", + " 49278\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.223232\n", + " 1.0240\n", + " Bq/kg f.w.\n", + " 229.357798\n", " \n", - " \n", - " \n", " \n", - " 0\n", - " 0.20\n", + " 9402\n", + " 49275\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.288048\n", + " 1.6320\n", + " Bq/kg 
f.w.\n", + " 283.286119\n", " \n", " \n", - " 1\n", - " 0.27\n", + " 9679\n", + " 49271\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.197788\n", + " 1.0040\n", + " Bq/kg f.w.\n", + " 253.807107\n", " \n", " \n", - " 2\n", - " 0.26\n", + " 9945\n", + " 49269\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.236670\n", + " 1.3720\n", + " Bq/kg f.w.\n", + " 289.855072\n", " \n", " \n", - " 3\n", - " 0.25\n", + " 10294\n", + " 49264\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.263056\n", + " 1.6040\n", + " Bq/kg f.w.\n", + " 304.878049\n", " \n", " \n", - " 4\n", - " 0.20\n", + " 10558\n", + " 49263\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.265506\n", + " 1.2920\n", + " Bq/kg f.w.\n", + " 243.309002\n", + " \n", + " \n", + " 10844\n", + " 49260\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.281385\n", + " 1.6900\n", + " Bq/kg f.w.\n", + " 300.300300\n", + " \n", + " \n", + " 10960\n", + " 23895\n", + " Belgium\n", + " 226Ra\n", + " =\n", + " 1.400000\n", + " 118.0000\n", + " Bq/kg f.w.\n", + " 4214.285714\n", + " \n", + " \n", + " 11141\n", + " 49257\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.255765\n", + " 1.7340\n", + " Bq/kg f.w.\n", + " 338.983051\n", + " \n", + " \n", + " 11443\n", + " 49252\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.280896\n", + " 1.5960\n", + " Bq/kg f.w.\n", + " 284.090909\n", + " \n", + " \n", + " 11723\n", + " 49251\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.316105\n", + " 1.9100\n", + " Bq/kg f.w.\n", + " 302.114804\n", + " \n", + " \n", + " 11973\n", + " 49248\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.350390\n", + " 1.8940\n", + " Bq/kg f.w.\n", + " 270.270270\n", + " \n", + " \n", + " 12196\n", + " 49245\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.202557\n", + " 1.5060\n", + " Bq/kg f.w.\n", + " 371.747212\n", + " \n", + " \n", + " 12225\n", + " 29984\n", + " Belgium\n", + " 137Cs\n", + " =\n", + " 0.169000\n", + " 27.0000\n", + " Bq/kg f.w.\n", + " 7988.165680\n", + " \n", + " \n", + " 
12451\n", + " 49240\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.267120\n", + " 1.6800\n", + " Bq/kg f.w.\n", + " 314.465409\n", + " \n", + " \n", + " 12712\n", + " 49239\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.200202\n", + " 1.0940\n", + " Bq/kg f.w.\n", + " 273.224044\n", + " \n", + " \n", + " 13032\n", + " 49236\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.262000\n", + " 2.1200\n", + " Bq/kg f.w.\n", + " 404.580153\n", + " \n", + " \n", + " 13301\n", + " 49232\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.309000\n", + " 2.1600\n", + " Bq/kg f.w.\n", + " 349.514563\n", + " \n", + " \n", + " 13579\n", + " 49230\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.275000\n", + " 1.9820\n", + " Bq/kg f.w.\n", + " 360.363636\n", + " \n", + " \n", + " 14104\n", + " 49226\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.327000\n", + " 1.4680\n", + " Bq/kg f.w.\n", + " 224.464832\n", + " \n", + " \n", + " 14112\n", + " 35011\n", + " Belgium\n", + " 137Cs\n", + " =\n", + " 0.161900\n", + " 66.0000\n", + " Bq/kg f.w.\n", + " 20382.952440\n", + " \n", + " \n", + " 14674\n", + " 49221\n", + " Sweden\n", + " 137Cs\n", + " =\n", + " 0.295000\n", + " 2.7400\n", + " Bq/kg f.w.\n", + " 464.406780\n", " \n", " \n", "\n", "" ], "text/plain": [ - " value\n", - "0 0.20\n", - "1 0.27\n", - "2 0.26\n", - "3 0.25\n", - "4 0.20" + " ID Contracting Party Nuclide Value type Activity or MDA \\\n", + "1491 88591 Denmark 137Cs = 0.024000 \n", + "3279 82675 United Kingdom 239,240Pu = 0.056000 \n", + "3430 82600 Sweden 137Cs = 0.380000 \n", + "5934 49310 Sweden 137Cs = 0.168608 \n", + "6202 49307 Sweden 137Cs = 0.157033 \n", + "6605 49305 Sweden 137Cs = 0.118002 \n", + "6891 49300 Sweden 137Cs = 0.153924 \n", + "7238 49297 Sweden 137Cs = 0.192765 \n", + "7435 62016 France 137Cs = 0.039809 \n", + "7454 49296 Sweden 137Cs = 0.174048 \n", + "7718 49292 Sweden 137Cs = 0.155430 \n", + "8014 49290 Sweden 137Cs = 0.131120 \n", + "8315 49287 Sweden 137Cs = 0.166440 \n", + "8622 49284 
Sweden 137Cs = 0.147918 \n", + "8929 49280 Sweden 137Cs = 0.331110 \n", + "9153 49278 Sweden 137Cs = 0.223232 \n", + "9402 49275 Sweden 137Cs = 0.288048 \n", + "9679 49271 Sweden 137Cs = 0.197788 \n", + "9945 49269 Sweden 137Cs = 0.236670 \n", + "10294 49264 Sweden 137Cs = 0.263056 \n", + "10558 49263 Sweden 137Cs = 0.265506 \n", + "10844 49260 Sweden 137Cs = 0.281385 \n", + "10960 23895 Belgium 226Ra = 1.400000 \n", + "11141 49257 Sweden 137Cs = 0.255765 \n", + "11443 49252 Sweden 137Cs = 0.280896 \n", + "11723 49251 Sweden 137Cs = 0.316105 \n", + "11973 49248 Sweden 137Cs = 0.350390 \n", + "12196 49245 Sweden 137Cs = 0.202557 \n", + "12225 29984 Belgium 137Cs = 0.169000 \n", + "12451 49240 Sweden 137Cs = 0.267120 \n", + "12712 49239 Sweden 137Cs = 0.200202 \n", + "13032 49236 Sweden 137Cs = 0.262000 \n", + "13301 49232 Sweden 137Cs = 0.309000 \n", + "13579 49230 Sweden 137Cs = 0.275000 \n", + "14104 49226 Sweden 137Cs = 0.327000 \n", + "14112 35011 Belgium 137Cs = 0.161900 \n", + "14674 49221 Sweden 137Cs = 0.295000 \n", + "\n", + " Uncertainty Unit relative_uncertainty \n", + "1491 0.1248 Bq/kg f.w. 260.000000 \n", + "3279 0.1300 Bq/kg f.w. 116.071429 \n", + "3430 3.3800 Bq/kg f.w. 444.736842 \n", + "5934 0.7040 Bq/kg f.w. 208.768267 \n", + "6202 0.7460 Bq/kg f.w. 237.529691 \n", + "6605 0.5540 Bq/kg f.w. 234.741784 \n", + "6891 0.7620 Bq/kg f.w. 247.524752 \n", + "7238 0.7100 Bq/kg f.w. 184.162063 \n", + "7435 0.1200 Bq/kg f.w. 150.719717 \n", + "7454 0.6720 Bq/kg f.w. 193.050193 \n", + "7718 0.6600 Bq/kg f.w. 212.314225 \n", + "8014 0.8800 Bq/kg f.w. 335.570470 \n", + "8315 1.1400 Bq/kg f.w. 342.465753 \n", + "8622 1.0680 Bq/kg f.w. 361.010830 \n", + "8929 1.1320 Bq/kg f.w. 170.940171 \n", + "9153 1.0240 Bq/kg f.w. 229.357798 \n", + "9402 1.6320 Bq/kg f.w. 283.286119 \n", + "9679 1.0040 Bq/kg f.w. 253.807107 \n", + "9945 1.3720 Bq/kg f.w. 289.855072 \n", + "10294 1.6040 Bq/kg f.w. 304.878049 \n", + "10558 1.2920 Bq/kg f.w. 
243.309002 \n", + "10844 1.6900 Bq/kg f.w. 300.300300 \n", + "10960 118.0000 Bq/kg f.w. 4214.285714 \n", + "11141 1.7340 Bq/kg f.w. 338.983051 \n", + "11443 1.5960 Bq/kg f.w. 284.090909 \n", + "11723 1.9100 Bq/kg f.w. 302.114804 \n", + "11973 1.8940 Bq/kg f.w. 270.270270 \n", + "12196 1.5060 Bq/kg f.w. 371.747212 \n", + "12225 27.0000 Bq/kg f.w. 7988.165680 \n", + "12451 1.6800 Bq/kg f.w. 314.465409 \n", + "12712 1.0940 Bq/kg f.w. 273.224044 \n", + "13032 2.1200 Bq/kg f.w. 404.580153 \n", + "13301 2.1600 Bq/kg f.w. 349.514563 \n", + "13579 1.9820 Bq/kg f.w. 360.363636 \n", + "14104 1.4680 Bq/kg f.w. 224.464832 \n", + "14112 66.0000 Bq/kg f.w. 20382.952440 \n", + "14674 2.7400 Bq/kg f.w. 464.406780 " ] }, "execution_count": null, @@ -1646,113 +3977,9 @@ } ], "source": [ - "#|eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[SanitizeValue()])\n", - "\n", - "tfm()['seawater'][['value']].head()" - ] - }, - { - "cell_type": "markdown", - "id": "7c83412b", - "metadata": {}, - "source": [ - "## Normalize uncertainty" - ] - }, - { - "cell_type": "markdown", - "id": "13a44f1a", - "metadata": {}, - "source": [ - "For each sample type in the OSPAR dataset, the reported uncertainty is given as an expanded uncertainty with a coverage factor `𝑘=2`. For further details, refer to the [OSPAR reporting guidelines](https://mcc.jrc.ec.europa.eu/documents/OSPAR/Guidelines_forestimationof_a_%20measurefor_uncertainty_in_OSPARmonitoring.pdf).\n", - "\n", - "**Note**: The OSPAR uncertainty values are normalized to standard uncertainty with a coverage factor \n", - "𝑘=1." 
- ] - }, - { - "cell_type": "markdown", - "id": "97a933ab", - "metadata": {}, - "source": [ - "`NormalizeUncCB` callback normalizes the uncertainty using the following `lambda` function:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6c84351", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "unc_exp2stan = lambda df, unc_col: df[unc_col] / 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ecb2866d", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "class NormalizeUncCB(Callback):\n", - " \"\"\"Normalize uncertainty values in DataFrames.\"\"\"\n", - " def __init__(self, \n", - " col_unc: str='Uncertainty', # Column name to normalize\n", - " fn_convert_unc: Callable=unc_exp2stan, # Function correcting coverage factor\n", - " ): \n", - " fc.store_attr()\n", - "\n", - " def __call__(self, tfm):\n", - " for df in tfm.dfs.values():\n", - " df['uncertainty'] = self.fn_convert_unc(df, self.col_unc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "76a18010", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "seawater:\n", - " value uncertainty\n", - "0 0.20 NaN\n", - "1 0.27 NaN\n", - "2 0.26 NaN\n", - "3 0.25 NaN\n", - "4 0.20 NaN\n", - "\n", - "biota:\n", - " value uncertainty\n", - "0 0.3510 0.033\n", - "1 39.0000 7.500\n", - "2 0.0938 0.009\n", - "3 1.5400 0.155\n", - "4 16.0000 3.000\n" - ] - } - ], - "source": [ - "#|eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[ \n", - " SanitizeValue(), \n", - " NormalizeUncCB()\n", - " ])\n", - "tfm()\n", - "\n", - "for grp in ['seawater', 'biota']:\n", - " print(f'\\n{grp}:')\n", - " print(tfm.dfs[grp][['value', 'uncertainty']].head())" + "threshold = 100\n", + "cols_to_show=['ID','Contracting Party','Nuclide', 'Value type','Activity or MDA', 'Uncertainty', 'Unit', 'relative_uncertainty' ]\n", + 
"tfm.dfs[grp][cols_to_show][tfm.dfs[grp]['relative_uncertainty'] > threshold]\n" ] }, { @@ -1768,7 +3995,7 @@ "id": "6386763d", "metadata": {}, "source": [ - "Biota `species` information is contained in OSPAR biota `Species` column. We follow in the next following processing steps the same approach as for remapping of nuclide names above." + "The OSPAR dataset contains biota species information in the `Species` column of the biota dataframe. To ensure consistency with MARIS standards, we need to remap these species names. We'll use the same approach as the one we employed for standardizing nuclide names:\n" ] }, { @@ -1814,27 +4041,27 @@ " \n", " \n", " 0\n", " 0\n", - " Unknown\n", + " Solea solea (S.vulgaris)\n", " \n", " \n", " 1\n", " 1\n", - " Homarus gammarus\n", + " BROSME BROSME\n", " \n", " \n", " 2\n", " 2\n", - " SPRATTUS SPRATTUS\n", + " NaN\n", " \n", " \n", " 3\n", " 3\n", - " Anarhichas denticulatus\n", + " Argentina silus\n", " \n", " \n", " 4\n", " 4\n", - " MOLVA MOLVA\n", + " Lumpenus lampretaeformis\n", " \n", " \n", " ...\n", @@ -1844,27 +4071,27 @@ " \n", " \n", " 151\n", " 151\n", - " MELANOGRAMMUS AEGLEFINUS\n", + " SEBASTES MARINUS\n", " \n", " \n", " 152\n", " 152\n", - " MERLUCCIUS MERLUCCIUS\n", + " Thunnus thynnus\n", " \n", " \n", " 153\n", " 153\n", - " PECTEN MAXIMUS\n", + " Pleuronectes platessa\n", " \n", " \n", " 154\n", " 154\n", - " LITTORINA LITTOREA\n", + " Hippoglossoides platessoides\n", " \n", " \n", " 155\n", " 155\n", - " Pleuronectes platessa\n", + " Gaidropsarus argenteus\n", " \n", " \n", "\n", @@ -1872,18 +4099,18 @@ "" ], "text/plain": [ - " index value\n", - "0 0 Unknown\n", - "1 1 Homarus gammarus\n", - "2 2 SPRATTUS SPRATTUS\n", - "3 3 Anarhichas denticulatus\n", - "4 4 MOLVA MOLVA\n", - ".. ...
...\n", - "151 151 MELANOGRAMMUS AEGLEFINUS\n", - "152 152 MERLUCCIUS MERLUCCIUS\n", - "153 153 PECTEN MAXIMUS\n", - "154 154 LITTORINA LITTOREA\n", - "155 155 Pleuronectes platessa\n", + " index value\n", + "0 0 Solea solea (S.vulgaris)\n", + "1 1 BROSME BROSME\n", + "2 2 NaN\n", + "3 3 Argentina silus\n", + "4 4 Lumpenus lampretaeformis\n", + ".. ... ...\n", + "151 151 SEBASTES MARINUS\n", + "152 152 Thunnus thynnus\n", + "153 153 Pleuronectes platessa\n", + "154 154 Hippoglossoides platessoides\n", + "155 155 Gaidropsarus argenteus\n", "\n", "[156 rows x 2 columns]" ] @@ -1933,7 +4160,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 156/156 [00:22<00:00, 6.84it/s]\n" + "Processing: 0%| | 0/156 [00:0012\n", " \n", " \n", - " CERASTODERMA (CARDIUM) EDULE\n", + " Cerastoderma (Cardium) Edule\n", " Cerastoderma edule\n", - " CERASTODERMA (CARDIUM) EDULE\n", + " Cerastoderma (Cardium) Edule\n", " 10\n", " \n", " \n", - " Cerastoderma (Cardium) Edule\n", + " CERASTODERMA (CARDIUM) EDULE\n", " Cerastoderma edule\n", - " Cerastoderma (Cardium) Edule\n", + " CERASTODERMA (CARDIUM) EDULE\n", " 10\n", " \n", " \n", @@ -2012,22 +4246,16 @@ " 9\n", " \n", " \n", - " DICENTRARCHUS (MORONE) LABRAX\n", - " Dicentrarchus labrax\n", - " DICENTRARCHUS (MORONE) LABRAX\n", - " 9\n", - " \n", - " \n", " MONODONTA LINEATA\n", " Ophiothrix lineata\n", " MONODONTA LINEATA\n", " 9\n", " \n", " \n", - " Pleuronectiformes [order]\n", - " Pleuronectiformes\n", - " Pleuronectiformes [order]\n", - " 8\n", + " DICENTRARCHUS (MORONE) LABRAX\n", + " Dicentrarchus labrax\n", + " DICENTRARCHUS (MORONE) LABRAX\n", + " 9\n", " \n", " \n", " RAJIDAE/BATOIDEA\n", @@ -2036,27 +4264,27 @@ " 8\n", " \n", " \n", + " Pleuronectiformes [order]\n", + " Pleuronectiformes\n", + " Pleuronectiformes [order]\n", + " 8\n", + " \n", + " \n", " PALMARIA PALMATA\n", " Alaria marginata\n", " PALMARIA PALMATA\n", " 7\n", " \n", " \n", - " Rhodymenia spp.\n", - " Rhodymenia\n", 
- " Rhodymenia spp.\n", - " 5\n", - " \n", - " \n", - " Sepia spp.\n", - " Sepia\n", - " Sepia spp.\n", + " unknown\n", + " Plankton\n", + " unknown\n", " 5\n", " \n", " \n", - " unknown\n", + " Unknown\n", " Plankton\n", - " unknown\n", + " Unknown\n", " 5\n", " \n", " \n", @@ -2072,9 +4300,9 @@ " 5\n", " \n", " \n", - " Unknown\n", - " Plankton\n", - " Unknown\n", + " Rhodymenia spp.\n", + " Rhodymenia\n", + " Rhodymenia spp.\n", " 5\n", " \n", " \n", @@ -2084,21 +4312,21 @@ " 5\n", " \n", " \n", - " Tapes sp.\n", - " Tapes\n", - " Tapes sp.\n", - " 4\n", + " Sepia spp.\n", + " Sepia\n", + " Sepia spp.\n", + " 5\n", " \n", " \n", - " Fucus sp.\n", - " Fucus\n", - " Fucus sp.\n", + " RHODYMENIA spp\n", + " Rhodymenia\n", + " RHODYMENIA spp\n", " 4\n", " \n", " \n", - " Patella sp.\n", - " Patella aspera\n", - " Patella sp.\n", + " Tapes sp.\n", + " Tapes\n", + " Tapes sp.\n", " 4\n", " \n", " \n", @@ -2108,15 +4336,15 @@ " 4\n", " \n", " \n", - " Gadus sp.\n", - " Gadus\n", - " Gadus sp.\n", + " Patella sp.\n", + " Patella aspera\n", + " Patella sp.\n", " 4\n", " \n", " \n", - " RHODYMENIA spp\n", - " Rhodymenia\n", - " RHODYMENIA spp\n", + " Fucus sp.\n", + " Fucus\n", + " Fucus sp.\n", " 4\n", " \n", " \n", @@ -2126,28 +4354,28 @@ " 4\n", " \n", " \n", + " Gadus sp.\n", + " Gadus\n", + " Gadus sp.\n", + " 4\n", + " \n", + " \n", " PECTINIDAE\n", " Buccinidae\n", " PECTINIDAE\n", " 3\n", " \n", " \n", - " Gaidropsarus argenteus\n", - " Gaidropsarus argentatus\n", - " Gaidropsarus argenteus\n", - " 2\n", - " \n", - " \n", " PLUERONECTES PLATESSA\n", " Pleuronectes platessa\n", " PLUERONECTES PLATESSA\n", " 2\n", " \n", " \n", - " ASCOPHYLLUN NODOSUM\n", - " Ascophyllum nodosum\n", - " ASCOPHYLLUN NODOSUM\n", - " 1\n", + " Gaidropsarus argenteus\n", + " Gaidropsarus argentatus\n", + " Gaidropsarus argenteus\n", + " 2\n", " \n", " \n", " Sebastes vivipares\n", @@ -2155,6 +4383,12 @@ " Sebastes vivipares\n", " 1\n", " \n", + " \n", + " ASCOPHYLLUN NODOSUM\n", + " 
Ascophyllum nodosum\n", + " ASCOPHYLLUN NODOSUM\n", + " 1\n", + " \n", " \n", "\n", "" @@ -2166,33 +4400,33 @@ "Mixture of green, red and brown algae Mercenaria mercenaria \n", "Solea solea (S.vulgaris) Loligo vulgaris \n", "SOLEA SOLEA (S.VULGARIS) Loligo vulgaris \n", - "CERASTODERMA (CARDIUM) EDULE Cerastoderma edule \n", "Cerastoderma (Cardium) Edule Cerastoderma edule \n", + "CERASTODERMA (CARDIUM) EDULE Cerastoderma edule \n", "NUCELLA LAPILLUS Mugil cephalus \n", - "DICENTRARCHUS (MORONE) LABRAX Dicentrarchus labrax \n", "MONODONTA LINEATA Ophiothrix lineata \n", - "Pleuronectiformes [order] Pleuronectiformes \n", + "DICENTRARCHUS (MORONE) LABRAX Dicentrarchus labrax \n", "RAJIDAE/BATOIDEA Batoidea \n", + "Pleuronectiformes [order] Pleuronectiformes \n", "PALMARIA PALMATA Alaria marginata \n", - "Rhodymenia spp. Rhodymenia \n", - "Sepia spp. Sepia \n", "unknown Plankton \n", + "Unknown Plankton \n", "RAJA DIPTURUS BATIS Dipturus batis \n", "Flatfish Lambia \n", - "Unknown Plankton \n", + "Rhodymenia spp. Rhodymenia \n", "FUCUS SPP. Fucus \n", + "Sepia spp. Sepia \n", + "RHODYMENIA spp Rhodymenia \n", "Tapes sp. Tapes \n", - "Fucus sp. Fucus \n", - "Patella sp. Patella aspera \n", "FUCUS spp Fucus \n", - "Gadus sp. Gadus \n", - "RHODYMENIA spp Rhodymenia \n", + "Patella sp. Patella aspera \n", + "Fucus sp. Fucus \n", "Thunnus sp. Thunnus \n", + "Gadus sp. 
Gadus \n", "PECTINIDAE Buccinidae \n", - "Gaidropsarus argenteus Gaidropsarus argentatus \n", "PLUERONECTES PLATESSA Pleuronectes platessa \n", - "ASCOPHYLLUN NODOSUM Ascophyllum nodosum \n", + "Gaidropsarus argenteus Gaidropsarus argentatus \n", "Sebastes vivipares Sebastes viviparus \n", + "ASCOPHYLLUN NODOSUM Ascophyllum nodosum \n", "\n", " source_name \\\n", "source_key \n", @@ -2200,33 +4434,33 @@ "Mixture of green, red and brown algae Mixture of green, red and brown algae \n", "Solea solea (S.vulgaris) Solea solea (S.vulgaris) \n", "SOLEA SOLEA (S.VULGARIS) SOLEA SOLEA (S.VULGARIS) \n", - "CERASTODERMA (CARDIUM) EDULE CERASTODERMA (CARDIUM) EDULE \n", "Cerastoderma (Cardium) Edule Cerastoderma (Cardium) Edule \n", + "CERASTODERMA (CARDIUM) EDULE CERASTODERMA (CARDIUM) EDULE \n", "NUCELLA LAPILLUS NUCELLA LAPILLUS \n", - "DICENTRARCHUS (MORONE) LABRAX DICENTRARCHUS (MORONE) LABRAX \n", "MONODONTA LINEATA MONODONTA LINEATA \n", - "Pleuronectiformes [order] Pleuronectiformes [order] \n", + "DICENTRARCHUS (MORONE) LABRAX DICENTRARCHUS (MORONE) LABRAX \n", "RAJIDAE/BATOIDEA RAJIDAE/BATOIDEA \n", + "Pleuronectiformes [order] Pleuronectiformes [order] \n", "PALMARIA PALMATA PALMARIA PALMATA \n", - "Rhodymenia spp. Rhodymenia spp. \n", - "Sepia spp. Sepia spp. \n", "unknown unknown \n", + "Unknown Unknown \n", "RAJA DIPTURUS BATIS RAJA DIPTURUS BATIS \n", "Flatfish Flatfish \n", - "Unknown Unknown \n", + "Rhodymenia spp. Rhodymenia spp. \n", "FUCUS SPP. FUCUS SPP. \n", + "Sepia spp. Sepia spp. \n", + "RHODYMENIA spp RHODYMENIA spp \n", "Tapes sp. Tapes sp. \n", - "Fucus sp. Fucus sp. \n", - "Patella sp. Patella sp. \n", "FUCUS spp FUCUS spp \n", - "Gadus sp. Gadus sp. \n", - "RHODYMENIA spp RHODYMENIA spp \n", + "Patella sp. Patella sp. \n", + "Fucus sp. Fucus sp. \n", "Thunnus sp. Thunnus sp. \n", + "Gadus sp. Gadus sp. 
\n", "PECTINIDAE PECTINIDAE \n", - "Gaidropsarus argenteus Gaidropsarus argenteus \n", "PLUERONECTES PLATESSA PLUERONECTES PLATESSA \n", - "ASCOPHYLLUN NODOSUM ASCOPHYLLUN NODOSUM \n", + "Gaidropsarus argenteus Gaidropsarus argenteus \n", "Sebastes vivipares Sebastes vivipares \n", + "ASCOPHYLLUN NODOSUM ASCOPHYLLUN NODOSUM \n", "\n", " match_score \n", "source_key \n", @@ -2234,33 +4468,33 @@ "Mixture of green, red and brown algae 26 \n", "Solea solea (S.vulgaris) 12 \n", "SOLEA SOLEA (S.VULGARIS) 12 \n", - "CERASTODERMA (CARDIUM) EDULE 10 \n", "Cerastoderma (Cardium) Edule 10 \n", + "CERASTODERMA (CARDIUM) EDULE 10 \n", "NUCELLA LAPILLUS 9 \n", - "DICENTRARCHUS (MORONE) LABRAX 9 \n", "MONODONTA LINEATA 9 \n", - "Pleuronectiformes [order] 8 \n", + "DICENTRARCHUS (MORONE) LABRAX 9 \n", "RAJIDAE/BATOIDEA 8 \n", + "Pleuronectiformes [order] 8 \n", "PALMARIA PALMATA 7 \n", - "Rhodymenia spp. 5 \n", - "Sepia spp. 5 \n", "unknown 5 \n", + "Unknown 5 \n", "RAJA DIPTURUS BATIS 5 \n", "Flatfish 5 \n", - "Unknown 5 \n", + "Rhodymenia spp. 5 \n", "FUCUS SPP. 5 \n", + "Sepia spp. 5 \n", + "RHODYMENIA spp 4 \n", "Tapes sp. 4 \n", - "Fucus sp. 4 \n", - "Patella sp. 4 \n", "FUCUS spp 4 \n", - "Gadus sp. 4 \n", - "RHODYMENIA spp 4 \n", + "Patella sp. 4 \n", + "Fucus sp. 4 \n", "Thunnus sp. 4 \n", + "Gadus sp. 
4 \n", "PECTINIDAE 3 \n", - "Gaidropsarus argenteus 2 \n", "PLUERONECTES PLATESSA 2 \n", - "ASCOPHYLLUN NODOSUM 1 \n", - "Sebastes vivipares 1 " + "Gaidropsarus argenteus 2 \n", + "Sebastes vivipares 1 \n", + "ASCOPHYLLUN NODOSUM 1 " ] }, "execution_count": null, @@ -2278,7 +4512,7 @@ "id": "b9a388bf", "metadata": {}, "source": [ - "We fix below some of the entries that are not properly matched by the `Remapper` object:" + "Below, we fix some of the entries that are not properly matched by the `Remapper` object:" ] }, { @@ -2290,17 +4524,18 @@ "source": [ "#|exports\n", "fixes_biota_species = {\n", + " 'SOLEA SOLEA (S.VULGARIS)': 'Solea solea',\n", + " 'MONODONTA LINEATA': 'Phorcus lineatus',\n", + " 'NUCELLA LAPILLUS': NA, # Dropped. In Worms 'Nucella lapillus (Linnaeus, 1758)'.\n", + " 'unknown': NA,\n", " 'PECTINIDAE': NA, # Dropped. In Worms as PECTINIDAE is a family.\n", + " 'RAJIDAE/BATOIDEA': NA,\n", + " 'Flatfish': NA,\n", " 'Unknown': NA,\n", - " 'unknown': NA,\n", " 'PALMARIA PALMATA': NA, # Dropped. In Worms 'Palmaria palmata (Linnaeus) F.Weber & D.Mohr, 1805',\n", - " 'RAJIDAE/BATOIDEA': NA, # Mix \n", - " 'MONODONTA LINEATA': 'Phorcus lineatus',\n", - " 'NUCELLA LAPILLUS': NA, # Dropped. 
In Worms 'Nucella lapillus (Linnaeus, 1758)', \n", - " 'SOLEA SOLEA (S.VULGARIS)': 'Solea solea',\n", - " 'Solea solea (S.vulgaris)': 'Solea solea',\n", - " 'Mixture of green, red and brown algae': NA, # Mix \n", - " 'RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA': NA, # Mix\n", + " 'Mixture of green, red and brown algae': NA,\n", + " 'RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA': NA,\n", + " 'Solea solea (S.vulgaris)': 'Solea solea'\n", " }" ] }, @@ -2322,7 +4557,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 156/156 [00:24<00:00, 6.30it/s]\n" + "Processing: 1%| | 1/156 [00:00<00:29, 5.25it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 156/156 [00:28<00:00, 5.39it/s]\n" ] }, { @@ -2359,15 +4601,15 @@ " \n", " \n", " \n", - " CERASTODERMA (CARDIUM) EDULE\n", + " Cerastoderma (Cardium) Edule\n", " Cerastoderma edule\n", - " CERASTODERMA (CARDIUM) EDULE\n", + " Cerastoderma (Cardium) Edule\n", " 10\n", " \n", " \n", - " Cerastoderma (Cardium) Edule\n", + " CERASTODERMA (CARDIUM) EDULE\n", " Cerastoderma edule\n", - " Cerastoderma (Cardium) Edule\n", + " CERASTODERMA (CARDIUM) EDULE\n", " 10\n", " \n", " \n", @@ -2383,27 +4625,15 @@ " 8\n", " \n", " \n", - " FUCUS SPP.\n", - " Fucus\n", - " FUCUS SPP.\n", - " 5\n", - " \n", - " \n", - " Flatfish\n", - " Lambia\n", - " Flatfish\n", - " 5\n", - " \n", - " \n", " Sepia spp.\n", " Sepia\n", " Sepia spp.\n", " 5\n", " \n", " \n", - " Rhodymenia spp.\n", - " Rhodymenia\n", - " Rhodymenia spp.\n", + " FUCUS SPP.\n", + " Fucus\n", + " FUCUS SPP.\n", " 5\n", " \n", " \n", @@ -2413,10 +4643,10 @@ " 5\n", " \n", " \n", - " RHODYMENIA spp\n", + " Rhodymenia spp.\n", " Rhodymenia\n", - " RHODYMENIA spp\n", - " 4\n", + " Rhodymenia spp.\n", + " 5\n", " \n", " \n", " Thunnus sp.\n", @@ -2425,15 +4655,21 @@ " 4\n", " \n", " \n", + " Patella sp.\n", + " Patella aspera\n", + " Patella sp.\n", + " 4\n", + " \n", + " \n", " FUCUS 
spp\n", " Fucus\n", " FUCUS spp\n", " 4\n", " \n", " \n", - " Gadus sp.\n", - " Gadus\n", - " Gadus sp.\n", + " Tapes sp.\n", + " Tapes\n", + " Tapes sp.\n", " 4\n", " \n", " \n", @@ -2443,34 +4679,28 @@ " 4\n", " \n", " \n", - " Tapes sp.\n", - " Tapes\n", - " Tapes sp.\n", + " RHODYMENIA spp\n", + " Rhodymenia\n", + " RHODYMENIA spp\n", " 4\n", " \n", " \n", - " Patella sp.\n", - " Patella aspera\n", - " Patella sp.\n", + " Gadus sp.\n", + " Gadus\n", + " Gadus sp.\n", " 4\n", " \n", " \n", - " Gaidropsarus argenteus\n", - " Gaidropsarus argentatus\n", - " Gaidropsarus argenteus\n", - " 2\n", - " \n", - " \n", " PLUERONECTES PLATESSA\n", " Pleuronectes platessa\n", " PLUERONECTES PLATESSA\n", " 2\n", " \n", " \n", - " ASCOPHYLLUN NODOSUM\n", - " Ascophyllum nodosum\n", - " ASCOPHYLLUN NODOSUM\n", - " 1\n", + " Gaidropsarus argenteus\n", + " Gaidropsarus argentatus\n", + " Gaidropsarus argenteus\n", + " 2\n", " \n", " \n", " Sebastes vivipares\n", @@ -2478,6 +4708,12 @@ " Sebastes vivipares\n", " 1\n", " \n", + " \n", + " ASCOPHYLLUN NODOSUM\n", + " Ascophyllum nodosum\n", + " ASCOPHYLLUN NODOSUM\n", + " 1\n", + " \n", " \n", "\n", "" @@ -2485,49 +4721,47 @@ "text/plain": [ " matched_maris_name \\\n", "source_key \n", - "CERASTODERMA (CARDIUM) EDULE Cerastoderma edule \n", "Cerastoderma (Cardium) Edule Cerastoderma edule \n", + "CERASTODERMA (CARDIUM) EDULE Cerastoderma edule \n", "DICENTRARCHUS (MORONE) LABRAX Dicentrarchus labrax \n", "Pleuronectiformes [order] Pleuronectiformes \n", - "FUCUS SPP. Fucus \n", - "Flatfish Lambia \n", "Sepia spp. Sepia \n", - "Rhodymenia spp. Rhodymenia \n", + "FUCUS SPP. Fucus \n", "RAJA DIPTURUS BATIS Dipturus batis \n", - "RHODYMENIA spp Rhodymenia \n", + "Rhodymenia spp. Rhodymenia \n", "Thunnus sp. Thunnus \n", + "Patella sp. Patella aspera \n", "FUCUS spp Fucus \n", - "Gadus sp. Gadus \n", - "Fucus sp. Fucus \n", "Tapes sp. Tapes \n", - "Patella sp. 
Patella aspera \n", - "Gaidropsarus argenteus Gaidropsarus argentatus \n", + "Fucus sp. Fucus \n", + "RHODYMENIA spp Rhodymenia \n", + "Gadus sp. Gadus \n", "PLUERONECTES PLATESSA Pleuronectes platessa \n", - "ASCOPHYLLUN NODOSUM Ascophyllum nodosum \n", + "Gaidropsarus argenteus Gaidropsarus argentatus \n", "Sebastes vivipares Sebastes viviparus \n", + "ASCOPHYLLUN NODOSUM Ascophyllum nodosum \n", "\n", " source_name match_score \n", "source_key \n", - "CERASTODERMA (CARDIUM) EDULE CERASTODERMA (CARDIUM) EDULE 10 \n", "Cerastoderma (Cardium) Edule Cerastoderma (Cardium) Edule 10 \n", + "CERASTODERMA (CARDIUM) EDULE CERASTODERMA (CARDIUM) EDULE 10 \n", "DICENTRARCHUS (MORONE) LABRAX DICENTRARCHUS (MORONE) LABRAX 9 \n", "Pleuronectiformes [order] Pleuronectiformes [order] 8 \n", - "FUCUS SPP. FUCUS SPP. 5 \n", - "Flatfish Flatfish 5 \n", "Sepia spp. Sepia spp. 5 \n", - "Rhodymenia spp. Rhodymenia spp. 5 \n", + "FUCUS SPP. FUCUS SPP. 5 \n", "RAJA DIPTURUS BATIS RAJA DIPTURUS BATIS 5 \n", - "RHODYMENIA spp RHODYMENIA spp 4 \n", + "Rhodymenia spp. Rhodymenia spp. 5 \n", "Thunnus sp. Thunnus sp. 4 \n", + "Patella sp. Patella sp. 4 \n", "FUCUS spp FUCUS spp 4 \n", - "Gadus sp. Gadus sp. 4 \n", - "Fucus sp. Fucus sp. 4 \n", "Tapes sp. Tapes sp. 4 \n", - "Patella sp. Patella sp. 4 \n", - "Gaidropsarus argenteus Gaidropsarus argenteus 2 \n", + "Fucus sp. Fucus sp. 4 \n", + "RHODYMENIA spp RHODYMENIA spp 4 \n", + "Gadus sp. Gadus sp. 4 \n", "PLUERONECTES PLATESSA PLUERONECTES PLATESSA 2 \n", - "ASCOPHYLLUN NODOSUM ASCOPHYLLUN NODOSUM 1 \n", - "Sebastes vivipares Sebastes vivipares 1 " + "Gaidropsarus argenteus Gaidropsarus argenteus 2 \n", + "Sebastes vivipares Sebastes vivipares 1 \n", + "ASCOPHYLLUN NODOSUM ASCOPHYLLUN NODOSUM 1 " ] }, "execution_count": null, @@ -2546,47 +4780,9 @@ "id": "24b4e864", "metadata": {}, "source": [ - "Visual inspection of the remaining unperfectly matched entries seem acceptable to proceed. 
\n", - "\n", - "We now define a callback to apply the lookup table to the `biota` dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "986b63d9", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "class RemapBiotaSpeciesCB(Callback):\n", - " \"Biota species standardized to MARIS format.\"\n", - " def __init__(self, \n", - " fn_lut: Callable, # Function that returns the lookup table dictionary\n", - " verbose: bool=False # Print unmatched values\n", - " ):\n", - " fc.store_attr()\n", + "Visual inspection of the remaining imperfectly matched entries appears acceptable to proceed. \n", "\n", - " def __call__(self, tfm):\n", - " lut = self.fn_lut()\n", - " tfm.dfs['biota']['species'] = tfm.dfs['biota']['Species'].apply(lambda x: self._get_species(x, lut))\n", - " \n", - " def _get_species(self, \n", - " value_to_match:str, # The value to match\n", - " lut:dict # The lookup table dictionary\n", - " ):\n", - " match = lut.get(value_to_match, Match(-1, None, None, None))\n", - " if self.verbose and match.matched_id == -1:\n", - " print(f'Unmatched species: {value_to_match}')\n", - " return match.matched_id" - ] - }, - { - "cell_type": "markdown", - "id": "5db1f46e", - "metadata": {}, - "source": [ - "Let's see it in action, along with the `RemapBiotaSpeciesCB` callback:" + "We can now use the generic `RemapCB` callback to perform the remapping." 
] }, { @@ -2623,7 +4819,7 @@ " 434, 444, 443, 389, 440, 441, 439, 427, 438, 1605, 436,\n", " 426, 433, 390, 420, 417, 397, 421, 294, 1221, 422, 423,\n", " 428, 424, 415, 1607, 387, 380, 406, 398, 416, 408, 409,\n", - " 418, 430, 429, 411, 410, 148])" + " 418, 430, 429, 411, 410])" ] }, "execution_count": null, @@ -2636,7 +4832,7 @@ "dfs = load_data(fname_in)\n", "tfm = Transformer(dfs, cbs=[\n", " RemoveAllNAValuesCB(cols_to_check),\n", - " RemapBiotaSpeciesCB(lut_biota)\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='Species', dest_grps='biota') \n", " ])\n", "\n", "tfm()['biota']['species'].unique()" @@ -2655,9 +4851,20 @@ "id": "ec6d4172", "metadata": {}, "source": [ - "The OSPAR data includes entries with the variable `Body Part` labelled as `whole`. The Maris data requires that the body `body_part` distinguishes between `Whole animal` and `Whole plant`. The OSPAR data defines the `Biological group` which allows for the Body Part labelled as whole to be defined as `Whole animal` and `Whole plant`. \n", + "The OSPAR dataset includes entries where the `Body Part` is labeled as `whole`. However, the MARIS data standard requires a more specific distinction in the `body_part` field, differentiating between `Whole animal` and `Whole plant`. Fortunately, the OSPAR data provides a `Biological group` field that allows us to make this distinction.\n", "\n", - "To account for this, we create a temporary column `body_part_temp` that combines `Body Part` and `Biological group` and use it to perform the lookup using the `Remapper` object." + "To address this discrepancy and ensure compatibility with MARIS standards, we will:\n", + "\n", + "1. Create a temporary column `body_part_temp` that combines information from both `Body Part` and `Biological group`.\n", + "2. Use this temporary column to perform the lookup using our `Remapper` object." 
+ ] + }, + { + "cell_type": "markdown", + "id": "d825bf87", + "metadata": {}, + "source": [ + "Let's create the temporary column, `body_part_temp`, that combines `Body Part` and `Biological group`." ] }, { @@ -2686,18 +4893,24 @@ { "data": { "text/plain": [ - "0 SOFT PARTS Molluscs\n", - "1 GROWING TIPS Seaweed\n", - "2 SOFT PARTS Molluscs\n", - "3 SOFT PARTS Molluscs\n", - "4 GROWING TIPS Seaweed\n", - " ... \n", - "15309 SOFT PARTS Molluscs\n", - "15310 SOFT PARTS Molluscs\n", - "15311 WHOLE PLANT Seaweed\n", - "15312 WHOLE PLANT Seaweed\n", - "15313 WHOLE PLANT Seaweed\n", - "Name: body_part_temp, Length: 15314, dtype: object" + "array(['SOFT PARTS Molluscs', 'GROWING TIPS Seaweed',\n", + " 'Whole plant Seaweed', 'WHOLE Fish', 'WHOLE ANIMAL Fish',\n", + " 'FLESH WITHOUT BONES Fish', 'WHOLE ANIMAL Molluscs',\n", + " 'WHOLE PLANT Seaweed', 'Soft Parts Molluscs',\n", + " 'FLESH WITHOUT BONES Molluscs', 'WHOLE Seaweed',\n", + " 'Whole without head FISH', 'Cod medallion FISH', 'Muscle FISH',\n", + " 'Whole animal Fish', 'Whole fisk FISH', 'Whole FISH',\n", + " 'Mix of muscle and whole fish without liver FISH', 'Flesh Fish',\n", + " 'WHOLE FISH Fish', 'Whole animal Molluscs', 'Muscle Fish',\n", + " 'Whole fish Fish', 'FLESH WITHOUT BONE Fish', 'UNKNOWN Fish',\n", + " 'WHOLE PLANT seaweed', 'WHOLE PLANT SEAWEED',\n", + " 'SOFT PARTS molluscs', 'FLESH WITHOUT BONES FISH',\n", + " 'WHOLE ANIMAL FISH', 'FLESH WITHOUT BONES fish', 'FLESH Fish',\n", + " 'FLESH WITHOUT BONES SEAWEED', 'FLESH WITH SCALES Fish',\n", + " 'FLESH WITHOUT BONE FISH', 'HEAD FISH', 'WHOLE FISH FISH',\n", + " 'Flesh without bones Fish', 'UNKNOWN FISH', 'Soft parts Molluscs',\n", + " 'Soft parts Fish', 'HEAD Fish', 'SOFT PARTS MOLLUSCS',\n", + " 'whole plant Seaweed', 'LIVER Fish', 'MUSCLE Fish'], dtype=object)" ] }, "execution_count": null, @@ -2713,7 +4926,7 @@ " AddBodypartTempCB(),\n", " ])\n", "dfs_test = tfm()\n", - "dfs_test['biota']['body_part_temp']" + 
"dfs_test['biota']['body_part_temp'].unique()" ] }, { @@ -2751,142 +4964,142 @@ " \n", " 0\n", " 0\n", - " WHOLE PLANT SEAWEED\n", + " Whole plant Seaweed\n", " \n", " \n", " 1\n", " 1\n", - " Soft parts Fish\n", + " WHOLE ANIMAL FISH\n", " \n", " \n", " 2\n", " 2\n", - " WHOLE ANIMAL Fish\n", + " Whole animal Molluscs\n", " \n", " \n", " 3\n", " 3\n", - " SOFT PARTS Molluscs\n", + " FLESH WITHOUT BONES fish\n", " \n", " \n", " 4\n", " 4\n", - " WHOLE PLANT seaweed\n", + " Cod medallion FISH\n", " \n", " \n", " 5\n", " 5\n", - " UNKNOWN FISH\n", + " SOFT PARTS MOLLUSCS\n", " \n", " \n", " 6\n", " 6\n", - " FLESH WITHOUT BONE Fish\n", + " Whole fish Fish\n", " \n", " \n", " 7\n", " 7\n", - " WHOLE ANIMAL Molluscs\n", + " Muscle FISH\n", " \n", " \n", " 8\n", " 8\n", - " FLESH WITHOUT BONES Molluscs\n", + " FLESH WITHOUT BONES FISH\n", " \n", " \n", " 9\n", " 9\n", - " Muscle FISH\n", + " FLESH Fish\n", " \n", " \n", " 10\n", " 10\n", - " GROWING TIPS Seaweed\n", + " WHOLE FISH Fish\n", " \n", " \n", " 11\n", " 11\n", - " SOFT PARTS MOLLUSCS\n", + " Whole animal Fish\n", " \n", " \n", " 12\n", " 12\n", - " WHOLE FISH Fish\n", + " Flesh Fish\n", " \n", " \n", " 13\n", " 13\n", - " SOFT PARTS molluscs\n", + " GROWING TIPS Seaweed\n", " \n", " \n", " 14\n", " 14\n", - " FLESH Fish\n", + " Whole FISH\n", " \n", " \n", " 15\n", " 15\n", - " WHOLE Seaweed\n", + " WHOLE PLANT SEAWEED\n", " \n", " \n", " 16\n", " 16\n", - " FLESH WITHOUT BONES SEAWEED\n", + " FLESH WITHOUT BONE FISH\n", " \n", " \n", " 17\n", " 17\n", - " Whole animal Fish\n", + " WHOLE ANIMAL Molluscs\n", " \n", " \n", " 18\n", " 18\n", - " Flesh without bones Fish\n", + " FLESH WITHOUT BONES Fish\n", " \n", " \n", " 19\n", " 19\n", - " whole plant Seaweed\n", + " HEAD FISH\n", " \n", " \n", " 20\n", " 20\n", - " Cod medallion FISH\n", + " UNKNOWN FISH\n", " \n", " \n", " 21\n", " 21\n", - " HEAD FISH\n", + " SOFT PARTS molluscs\n", " \n", " \n", " 22\n", " 22\n", - " LIVER Fish\n", + " UNKNOWN Fish\n", " 
\n", " \n", " 23\n", " 23\n", - " Flesh Fish\n", + " WHOLE FISH FISH\n", " \n", " \n", " 24\n", " 24\n", - " Whole animal Molluscs\n", + " Whole fisk FISH\n", " \n", " \n", " 25\n", " 25\n", - " WHOLE PLANT Seaweed\n", + " Whole without head FISH\n", " \n", " \n", " 26\n", " 26\n", - " Whole plant Seaweed\n", + " Soft parts Fish\n", " \n", " \n", " 27\n", " 27\n", - " Whole fisk FISH\n", + " FLESH WITHOUT BONES SEAWEED\n", " \n", " \n", " 28\n", @@ -2896,87 +5109,87 @@ " \n", " 29\n", " 29\n", - " Soft Parts Molluscs\n", + " LIVER Fish\n", " \n", " \n", " 30\n", " 30\n", - " MUSCLE Fish\n", + " Soft Parts Molluscs\n", " \n", " \n", " 31\n", " 31\n", - " Whole FISH\n", + " FLESH WITHOUT BONE Fish\n", " \n", " \n", " 32\n", " 32\n", - " WHOLE ANIMAL FISH\n", + " SOFT PARTS Molluscs\n", " \n", " \n", " 33\n", " 33\n", - " WHOLE FISH FISH\n", + " WHOLE PLANT Seaweed\n", " \n", " \n", " 34\n", " 34\n", - " WHOLE Fish\n", + " HEAD Fish\n", " \n", " \n", " 35\n", " 35\n", - " FLESH WITHOUT BONE FISH\n", + " WHOLE Fish\n", " \n", " \n", " 36\n", " 36\n", - " FLESH WITHOUT BONES Fish\n", + " WHOLE Seaweed\n", " \n", " \n", " 37\n", " 37\n", - " Whole without head FISH\n", + " whole plant Seaweed\n", " \n", " \n", " 38\n", " 38\n", - " UNKNOWN Fish\n", + " WHOLE ANIMAL Fish\n", " \n", " \n", " 39\n", " 39\n", - " FLESH WITHOUT BONES fish\n", + " FLESH WITHOUT BONES Molluscs\n", " \n", " \n", " 40\n", " 40\n", - " HEAD Fish\n", + " FLESH WITH SCALES Fish\n", " \n", " \n", " 41\n", " 41\n", - " Whole fish Fish\n", + " Flesh without bones Fish\n", " \n", " \n", " 42\n", " 42\n", - " FLESH WITH SCALES Fish\n", + " MUSCLE Fish\n", " \n", " \n", " 43\n", " 43\n", - " Muscle Fish\n", + " Soft parts Molluscs\n", " \n", " \n", " 44\n", " 44\n", - " FLESH WITHOUT BONES FISH\n", + " WHOLE PLANT seaweed\n", " \n", " \n", " 45\n", " 45\n", - " Soft parts Molluscs\n", + " Muscle Fish\n", " \n", " \n", "\n", @@ -2984,52 +5197,52 @@ ], "text/plain": [ " index value\n", - "0 0 WHOLE PLANT 
SEAWEED\n", - "1 1 Soft parts Fish\n", - "2 2 WHOLE ANIMAL Fish\n", - "3 3 SOFT PARTS Molluscs\n", - "4 4 WHOLE PLANT seaweed\n", - "5 5 UNKNOWN FISH\n", - "6 6 FLESH WITHOUT BONE Fish\n", - "7 7 WHOLE ANIMAL Molluscs\n", - "8 8 FLESH WITHOUT BONES Molluscs\n", - "9 9 Muscle FISH\n", - "10 10 GROWING TIPS Seaweed\n", - "11 11 SOFT PARTS MOLLUSCS\n", - "12 12 WHOLE FISH Fish\n", - "13 13 SOFT PARTS molluscs\n", - "14 14 FLESH Fish\n", - "15 15 WHOLE Seaweed\n", - "16 16 FLESH WITHOUT BONES SEAWEED\n", - "17 17 Whole animal Fish\n", - "18 18 Flesh without bones Fish\n", - "19 19 whole plant Seaweed\n", - "20 20 Cod medallion FISH\n", - "21 21 HEAD FISH\n", - "22 22 LIVER Fish\n", - "23 23 Flesh Fish\n", - "24 24 Whole animal Molluscs\n", - "25 25 WHOLE PLANT Seaweed\n", - "26 26 Whole plant Seaweed\n", - "27 27 Whole fisk FISH\n", + "0 0 Whole plant Seaweed\n", + "1 1 WHOLE ANIMAL FISH\n", + "2 2 Whole animal Molluscs\n", + "3 3 FLESH WITHOUT BONES fish\n", + "4 4 Cod medallion FISH\n", + "5 5 SOFT PARTS MOLLUSCS\n", + "6 6 Whole fish Fish\n", + "7 7 Muscle FISH\n", + "8 8 FLESH WITHOUT BONES FISH\n", + "9 9 FLESH Fish\n", + "10 10 WHOLE FISH Fish\n", + "11 11 Whole animal Fish\n", + "12 12 Flesh Fish\n", + "13 13 GROWING TIPS Seaweed\n", + "14 14 Whole FISH\n", + "15 15 WHOLE PLANT SEAWEED\n", + "16 16 FLESH WITHOUT BONE FISH\n", + "17 17 WHOLE ANIMAL Molluscs\n", + "18 18 FLESH WITHOUT BONES Fish\n", + "19 19 HEAD FISH\n", + "20 20 UNKNOWN FISH\n", + "21 21 SOFT PARTS molluscs\n", + "22 22 UNKNOWN Fish\n", + "23 23 WHOLE FISH FISH\n", + "24 24 Whole fisk FISH\n", + "25 25 Whole without head FISH\n", + "26 26 Soft parts Fish\n", + "27 27 FLESH WITHOUT BONES SEAWEED\n", "28 28 Mix of muscle and whole fish without liver FISH\n", - "29 29 Soft Parts Molluscs\n", - "30 30 MUSCLE Fish\n", - "31 31 Whole FISH\n", - "32 32 WHOLE ANIMAL FISH\n", - "33 33 WHOLE FISH FISH\n", - "34 34 WHOLE Fish\n", - "35 35 FLESH WITHOUT BONE FISH\n", - "36 36 FLESH WITHOUT BONES Fish\n", - 
"37 37 Whole without head FISH\n", - "38 38 UNKNOWN Fish\n", - "39 39 FLESH WITHOUT BONES fish\n", - "40 40 HEAD Fish\n", - "41 41 Whole fish Fish\n", - "42 42 FLESH WITH SCALES Fish\n", - "43 43 Muscle Fish\n", - "44 44 FLESH WITHOUT BONES FISH\n", - "45 45 Soft parts Molluscs" + "29 29 LIVER Fish\n", + "30 30 Soft Parts Molluscs\n", + "31 31 FLESH WITHOUT BONE Fish\n", + "32 32 SOFT PARTS Molluscs\n", + "33 33 WHOLE PLANT Seaweed\n", + "34 34 HEAD Fish\n", + "35 35 WHOLE Fish\n", + "36 36 WHOLE Seaweed\n", + "37 37 whole plant Seaweed\n", + "38 38 WHOLE ANIMAL Fish\n", + "39 39 FLESH WITHOUT BONES Molluscs\n", + "40 40 FLESH WITH SCALES Fish\n", + "41 41 Flesh without bones Fish\n", + "42 42 MUSCLE Fish\n", + "43 43 Soft parts Molluscs\n", + "44 44 WHOLE PLANT seaweed\n", + "45 45 Muscle Fish" ] }, "execution_count": null, @@ -3041,6 +5254,14 @@ "get_unique_across_dfs(dfs_test, col_name='body_part_temp', as_df=True)" ] }, + { + "cell_type": "markdown", + "id": "7b6feee1", + "metadata": {}, + "source": [ + "We try to remap the `body_part_temp` column to the `bodypar` column of the MARIS nomenclature, again using a `Remapper` object:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -3051,7 +5272,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 46/46 [00:00<00:00, 123.03it/s]\n" + "Processing: 0%| | 0/46 [00:0031\n", " \n", " \n", + " Whole without head FISH\n", + " Flesh without bones\n", + " Whole without head FISH\n", + " 13\n", + " \n", + " \n", " Cod medallion FISH\n", " Old leaf\n", " Cod medallion FISH\n", " 13\n", " \n", " \n", - " Whole without head FISH\n", - " Flesh without bones\n", - " Whole without head FISH\n", - " 13\n", + " SOFT PARTS Molluscs\n", + " Soft parts\n", + " SOFT PARTS Molluscs\n", + " 9\n", " \n", " \n", - " Soft parts Molluscs\n", - " Soft parts\n", - " Soft parts Molluscs\n", + " WHOLE ANIMAL Molluscs\n", + " Whole animal\n", + " WHOLE ANIMAL Molluscs\n", " 9\n", " \n", " \n", @@ 
-3118,9 +5352,9 @@ " 9\n", " \n", " \n", - " SOFT PARTS molluscs\n", - " Soft parts\n", - " SOFT PARTS molluscs\n", + " UNKNOWN Fish\n", + " Growing tips\n", + " UNKNOWN Fish\n", " 9\n", " \n", " \n", @@ -3130,51 +5364,33 @@ " 9\n", " \n", " \n", - " SOFT PARTS MOLLUSCS\n", - " Soft parts\n", - " SOFT PARTS MOLLUSCS\n", - " 9\n", - " \n", - " \n", " Soft Parts Molluscs\n", " Soft parts\n", " Soft Parts Molluscs\n", " 9\n", " \n", " \n", - " WHOLE FISH FISH\n", - " Whole animal\n", - " WHOLE FISH FISH\n", - " 9\n", - " \n", - " \n", - " FLESH WITHOUT BONES Molluscs\n", - " Flesh without bones\n", - " FLESH WITHOUT BONES Molluscs\n", - " 9\n", - " \n", - " \n", - " WHOLE ANIMAL Molluscs\n", - " Whole animal\n", - " WHOLE ANIMAL Molluscs\n", - " 9\n", - " \n", - " \n", " UNKNOWN FISH\n", " Growing tips\n", " UNKNOWN FISH\n", " 9\n", " \n", " \n", - " UNKNOWN Fish\n", - " Growing tips\n", - " UNKNOWN Fish\n", + " SOFT PARTS molluscs\n", + " Soft parts\n", + " SOFT PARTS molluscs\n", " 9\n", " \n", " \n", - " SOFT PARTS Molluscs\n", + " SOFT PARTS MOLLUSCS\n", " Soft parts\n", - " SOFT PARTS Molluscs\n", + " SOFT PARTS MOLLUSCS\n", + " 9\n", + " \n", + " \n", + " FLESH WITHOUT BONES Molluscs\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES Molluscs\n", " 9\n", " \n", " \n", @@ -3190,10 +5406,16 @@ " 9\n", " \n", " \n", - " Whole plant Seaweed\n", - " Whole plant\n", - " Whole plant Seaweed\n", - " 8\n", + " Soft parts Molluscs\n", + " Soft parts\n", + " Soft parts Molluscs\n", + " 9\n", + " \n", + " \n", + " WHOLE FISH FISH\n", + " Whole animal\n", + " WHOLE FISH FISH\n", + " 9\n", " \n", " \n", " WHOLE PLANT Seaweed\n", @@ -3208,39 +5430,45 @@ " 8\n", " \n", " \n", - " whole plant Seaweed\n", + " GROWING TIPS Seaweed\n", + " Growing tips\n", + " GROWING TIPS Seaweed\n", + " 8\n", + " \n", + " \n", + " WHOLE PLANT seaweed\n", " Whole plant\n", - " whole plant Seaweed\n", + " WHOLE PLANT seaweed\n", " 8\n", " \n", " \n", - " FLESH WITHOUT BONES SEAWEED\n", - " 
Flesh without bones\n", - " FLESH WITHOUT BONES SEAWEED\n", + " whole plant Seaweed\n", + " Whole plant\n", + " whole plant Seaweed\n", " 8\n", " \n", " \n", - " WHOLE PLANT seaweed\n", + " Whole plant Seaweed\n", " Whole plant\n", - " WHOLE PLANT seaweed\n", + " Whole plant Seaweed\n", " 8\n", " \n", " \n", - " GROWING TIPS Seaweed\n", - " Growing tips\n", - " GROWING TIPS Seaweed\n", + " FLESH WITHOUT BONES SEAWEED\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES SEAWEED\n", " 8\n", " \n", " \n", - " FLESH Fish\n", + " Flesh Fish\n", " Shells\n", - " FLESH Fish\n", + " Flesh Fish\n", " 7\n", " \n", " \n", - " Flesh Fish\n", + " FLESH Fish\n", " Shells\n", - " Flesh Fish\n", + " FLESH Fish\n", " 7\n", " \n", " \n", @@ -3250,39 +5478,51 @@ " 6\n", " \n", " \n", + " FLESH WITH SCALES Fish\n", + " Flesh with scales\n", + " FLESH WITH SCALES Fish\n", + " 5\n", + " \n", + " \n", + " Soft parts Fish\n", + " Soft parts\n", + " Soft parts Fish\n", + " 5\n", + " \n", + " \n", " Whole animal Fish\n", " Whole animal\n", " Whole animal Fish\n", " 5\n", " \n", " \n", - " FLESH WITHOUT BONES FISH\n", + " Flesh without bones Fish\n", " Flesh without bones\n", - " FLESH WITHOUT BONES FISH\n", + " Flesh without bones Fish\n", " 5\n", " \n", " \n", - " Muscle Fish\n", - " Muscle\n", - " Muscle Fish\n", + " LIVER Fish\n", + " Liver\n", + " LIVER Fish\n", " 5\n", " \n", " \n", - " FLESH WITH SCALES Fish\n", - " Flesh with scales\n", - " FLESH WITH SCALES Fish\n", + " FLESH WITHOUT BONES FISH\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES FISH\n", " 5\n", " \n", " \n", - " WHOLE ANIMAL Fish\n", - " Whole animal\n", - " WHOLE ANIMAL Fish\n", + " HEAD FISH\n", + " Head\n", + " HEAD FISH\n", " 5\n", " \n", " \n", - " HEAD Fish\n", - " Head\n", - " HEAD Fish\n", + " WHOLE ANIMAL Fish\n", + " Whole animal\n", + " WHOLE ANIMAL Fish\n", " 5\n", " \n", " \n", @@ -3292,39 +5532,33 @@ " 5\n", " \n", " \n", - " LIVER Fish\n", - " Liver\n", - " LIVER Fish\n", + " HEAD Fish\n", + " 
Head\n", + " HEAD Fish\n", " 5\n", " \n", " \n", - " FLESH WITHOUT BONES Fish\n", - " Flesh without bones\n", - " FLESH WITHOUT BONES Fish\n", + " Muscle FISH\n", + " Muscle\n", + " Muscle FISH\n", " 5\n", " \n", " \n", - " WHOLE Fish\n", + " Whole FISH\n", " Whole animal\n", - " WHOLE Fish\n", - " 5\n", - " \n", - " \n", - " HEAD FISH\n", - " Head\n", - " HEAD FISH\n", + " Whole FISH\n", " 5\n", " \n", " \n", - " WHOLE ANIMAL FISH\n", + " WHOLE Fish\n", " Whole animal\n", - " WHOLE ANIMAL FISH\n", + " WHOLE Fish\n", " 5\n", " \n", " \n", - " Whole FISH\n", - " Whole animal\n", - " Whole FISH\n", + " Muscle Fish\n", + " Muscle\n", + " Muscle Fish\n", " 5\n", " \n", " \n", @@ -3334,33 +5568,27 @@ " 5\n", " \n", " \n", - " Soft parts Fish\n", - " Soft parts\n", - " Soft parts Fish\n", + " WHOLE ANIMAL FISH\n", + " Whole animal\n", + " WHOLE ANIMAL FISH\n", " 5\n", " \n", " \n", - " Flesh without bones Fish\n", + " FLESH WITHOUT BONES Fish\n", " Flesh without bones\n", - " Flesh without bones Fish\n", - " 5\n", - " \n", - " \n", - " Muscle FISH\n", - " Muscle\n", - " Muscle FISH\n", + " FLESH WITHOUT BONES Fish\n", " 5\n", " \n", " \n", - " FLESH WITHOUT BONE FISH\n", + " FLESH WITHOUT BONE Fish\n", " Flesh without bones\n", - " FLESH WITHOUT BONE FISH\n", + " FLESH WITHOUT BONE Fish\n", " 4\n", " \n", " \n", - " FLESH WITHOUT BONE Fish\n", + " FLESH WITHOUT BONE FISH\n", " Flesh without bones\n", - " FLESH WITHOUT BONE Fish\n", + " FLESH WITHOUT BONE FISH\n", " 4\n", " \n", " \n", @@ -3371,149 +5599,149 @@ " matched_maris_name \\\n", "source_key \n", "Mix of muscle and whole fish without liver FISH Flesh without bones \n", - "Cod medallion FISH Old leaf \n", "Whole without head FISH Flesh without bones \n", - "Soft parts Molluscs Soft parts \n", + "Cod medallion FISH Old leaf \n", + "SOFT PARTS Molluscs Soft parts \n", + "WHOLE ANIMAL Molluscs Whole animal \n", "Whole fisk FISH Whole animal \n", - "SOFT PARTS molluscs Soft parts \n", + "UNKNOWN Fish Growing tips \n", 
"WHOLE FISH Fish Whole animal \n", - "SOFT PARTS MOLLUSCS Soft parts \n", "Soft Parts Molluscs Soft parts \n", - "WHOLE FISH FISH Whole animal \n", - "FLESH WITHOUT BONES Molluscs Flesh without bones \n", - "WHOLE ANIMAL Molluscs Whole animal \n", "UNKNOWN FISH Growing tips \n", - "UNKNOWN Fish Growing tips \n", - "SOFT PARTS Molluscs Soft parts \n", + "SOFT PARTS molluscs Soft parts \n", + "SOFT PARTS MOLLUSCS Soft parts \n", + "FLESH WITHOUT BONES Molluscs Flesh without bones \n", "Whole fish Fish Whole animal \n", "Whole animal Molluscs Whole animal \n", - "Whole plant Seaweed Whole plant \n", + "Soft parts Molluscs Soft parts \n", + "WHOLE FISH FISH Whole animal \n", "WHOLE PLANT Seaweed Whole plant \n", "WHOLE PLANT SEAWEED Whole plant \n", + "GROWING TIPS Seaweed Growing tips \n", + "WHOLE PLANT seaweed Whole plant \n", "whole plant Seaweed Whole plant \n", + "Whole plant Seaweed Whole plant \n", "FLESH WITHOUT BONES SEAWEED Flesh without bones \n", - "WHOLE PLANT seaweed Whole plant \n", - "GROWING TIPS Seaweed Growing tips \n", - "FLESH Fish Shells \n", "Flesh Fish Shells \n", + "FLESH Fish Shells \n", "WHOLE Seaweed Whole plant \n", + "FLESH WITH SCALES Fish Flesh with scales \n", + "Soft parts Fish Soft parts \n", "Whole animal Fish Whole animal \n", + "Flesh without bones Fish Flesh without bones \n", + "LIVER Fish Liver \n", "FLESH WITHOUT BONES FISH Flesh without bones \n", - "Muscle Fish Muscle \n", - "FLESH WITH SCALES Fish Flesh with scales \n", + "HEAD FISH Head \n", "WHOLE ANIMAL Fish Whole animal \n", - "HEAD Fish Head \n", "FLESH WITHOUT BONES fish Flesh without bones \n", - "LIVER Fish Liver \n", - "FLESH WITHOUT BONES Fish Flesh without bones \n", - "WHOLE Fish Whole animal \n", - "HEAD FISH Head \n", - "WHOLE ANIMAL FISH Whole animal \n", + "HEAD Fish Head \n", + "Muscle FISH Muscle \n", "Whole FISH Whole animal \n", + "WHOLE Fish Whole animal \n", + "Muscle Fish Muscle \n", "MUSCLE Fish Muscle \n", - "Soft parts Fish Soft parts \n", - "Flesh 
without bones Fish Flesh without bones \n", - "Muscle FISH Muscle \n", - "FLESH WITHOUT BONE FISH Flesh without bones \n", + "WHOLE ANIMAL FISH Whole animal \n", + "FLESH WITHOUT BONES Fish Flesh without bones \n", "FLESH WITHOUT BONE Fish Flesh without bones \n", + "FLESH WITHOUT BONE FISH Flesh without bones \n", "\n", " source_name \\\n", "source_key \n", "Mix of muscle and whole fish without liver FISH Mix of muscle and whole fish without liver FISH \n", - "Cod medallion FISH Cod medallion FISH \n", "Whole without head FISH Whole without head FISH \n", - "Soft parts Molluscs Soft parts Molluscs \n", + "Cod medallion FISH Cod medallion FISH \n", + "SOFT PARTS Molluscs SOFT PARTS Molluscs \n", + "WHOLE ANIMAL Molluscs WHOLE ANIMAL Molluscs \n", "Whole fisk FISH Whole fisk FISH \n", - "SOFT PARTS molluscs SOFT PARTS molluscs \n", + "UNKNOWN Fish UNKNOWN Fish \n", "WHOLE FISH Fish WHOLE FISH Fish \n", - "SOFT PARTS MOLLUSCS SOFT PARTS MOLLUSCS \n", "Soft Parts Molluscs Soft Parts Molluscs \n", - "WHOLE FISH FISH WHOLE FISH FISH \n", - "FLESH WITHOUT BONES Molluscs FLESH WITHOUT BONES Molluscs \n", - "WHOLE ANIMAL Molluscs WHOLE ANIMAL Molluscs \n", "UNKNOWN FISH UNKNOWN FISH \n", - "UNKNOWN Fish UNKNOWN Fish \n", - "SOFT PARTS Molluscs SOFT PARTS Molluscs \n", + "SOFT PARTS molluscs SOFT PARTS molluscs \n", + "SOFT PARTS MOLLUSCS SOFT PARTS MOLLUSCS \n", + "FLESH WITHOUT BONES Molluscs FLESH WITHOUT BONES Molluscs \n", "Whole fish Fish Whole fish Fish \n", "Whole animal Molluscs Whole animal Molluscs \n", - "Whole plant Seaweed Whole plant Seaweed \n", + "Soft parts Molluscs Soft parts Molluscs \n", + "WHOLE FISH FISH WHOLE FISH FISH \n", "WHOLE PLANT Seaweed WHOLE PLANT Seaweed \n", "WHOLE PLANT SEAWEED WHOLE PLANT SEAWEED \n", + "GROWING TIPS Seaweed GROWING TIPS Seaweed \n", + "WHOLE PLANT seaweed WHOLE PLANT seaweed \n", "whole plant Seaweed whole plant Seaweed \n", + "Whole plant Seaweed Whole plant Seaweed \n", "FLESH WITHOUT BONES SEAWEED FLESH WITHOUT BONES 
SEAWEED \n", - "WHOLE PLANT seaweed WHOLE PLANT seaweed \n", - "GROWING TIPS Seaweed GROWING TIPS Seaweed \n", - "FLESH Fish FLESH Fish \n", "Flesh Fish Flesh Fish \n", + "FLESH Fish FLESH Fish \n", "WHOLE Seaweed WHOLE Seaweed \n", + "FLESH WITH SCALES Fish FLESH WITH SCALES Fish \n", + "Soft parts Fish Soft parts Fish \n", "Whole animal Fish Whole animal Fish \n", + "Flesh without bones Fish Flesh without bones Fish \n", + "LIVER Fish LIVER Fish \n", "FLESH WITHOUT BONES FISH FLESH WITHOUT BONES FISH \n", - "Muscle Fish Muscle Fish \n", - "FLESH WITH SCALES Fish FLESH WITH SCALES Fish \n", + "HEAD FISH HEAD FISH \n", "WHOLE ANIMAL Fish WHOLE ANIMAL Fish \n", - "HEAD Fish HEAD Fish \n", "FLESH WITHOUT BONES fish FLESH WITHOUT BONES fish \n", - "LIVER Fish LIVER Fish \n", - "FLESH WITHOUT BONES Fish FLESH WITHOUT BONES Fish \n", - "WHOLE Fish WHOLE Fish \n", - "HEAD FISH HEAD FISH \n", - "WHOLE ANIMAL FISH WHOLE ANIMAL FISH \n", + "HEAD Fish HEAD Fish \n", + "Muscle FISH Muscle FISH \n", "Whole FISH Whole FISH \n", + "WHOLE Fish WHOLE Fish \n", + "Muscle Fish Muscle Fish \n", "MUSCLE Fish MUSCLE Fish \n", - "Soft parts Fish Soft parts Fish \n", - "Flesh without bones Fish Flesh without bones Fish \n", - "Muscle FISH Muscle FISH \n", - "FLESH WITHOUT BONE FISH FLESH WITHOUT BONE FISH \n", + "WHOLE ANIMAL FISH WHOLE ANIMAL FISH \n", + "FLESH WITHOUT BONES Fish FLESH WITHOUT BONES Fish \n", "FLESH WITHOUT BONE Fish FLESH WITHOUT BONE Fish \n", + "FLESH WITHOUT BONE FISH FLESH WITHOUT BONE FISH \n", "\n", " match_score \n", "source_key \n", "Mix of muscle and whole fish without liver FISH 31 \n", - "Cod medallion FISH 13 \n", "Whole without head FISH 13 \n", - "Soft parts Molluscs 9 \n", + "Cod medallion FISH 13 \n", + "SOFT PARTS Molluscs 9 \n", + "WHOLE ANIMAL Molluscs 9 \n", "Whole fisk FISH 9 \n", - "SOFT PARTS molluscs 9 \n", + "UNKNOWN Fish 9 \n", "WHOLE FISH Fish 9 \n", - "SOFT PARTS MOLLUSCS 9 \n", "Soft Parts Molluscs 9 \n", - "WHOLE FISH FISH 9 \n", - "FLESH 
WITHOUT BONES Molluscs 9 \n", - "WHOLE ANIMAL Molluscs 9 \n", "UNKNOWN FISH 9 \n", - "UNKNOWN Fish 9 \n", - "SOFT PARTS Molluscs 9 \n", + "SOFT PARTS molluscs 9 \n", + "SOFT PARTS MOLLUSCS 9 \n", + "FLESH WITHOUT BONES Molluscs 9 \n", "Whole fish Fish 9 \n", "Whole animal Molluscs 9 \n", - "Whole plant Seaweed 8 \n", + "Soft parts Molluscs 9 \n", + "WHOLE FISH FISH 9 \n", "WHOLE PLANT Seaweed 8 \n", "WHOLE PLANT SEAWEED 8 \n", + "GROWING TIPS Seaweed 8 \n", + "WHOLE PLANT seaweed 8 \n", "whole plant Seaweed 8 \n", + "Whole plant Seaweed 8 \n", "FLESH WITHOUT BONES SEAWEED 8 \n", - "WHOLE PLANT seaweed 8 \n", - "GROWING TIPS Seaweed 8 \n", - "FLESH Fish 7 \n", "Flesh Fish 7 \n", + "FLESH Fish 7 \n", "WHOLE Seaweed 6 \n", - "Whole animal Fish 5 \n", - "FLESH WITHOUT BONES FISH 5 \n", - "Muscle Fish 5 \n", "FLESH WITH SCALES Fish 5 \n", - "WHOLE ANIMAL Fish 5 \n", - "HEAD Fish 5 \n", - "FLESH WITHOUT BONES fish 5 \n", - "LIVER Fish 5 \n", - "FLESH WITHOUT BONES Fish 5 \n", - "WHOLE Fish 5 \n", - "HEAD FISH 5 \n", - "WHOLE ANIMAL FISH 5 \n", - "Whole FISH 5 \n", - "MUSCLE Fish 5 \n", "Soft parts Fish 5 \n", + "Whole animal Fish 5 \n", "Flesh without bones Fish 5 \n", + "LIVER Fish 5 \n", + "FLESH WITHOUT BONES FISH 5 \n", + "HEAD FISH 5 \n", + "WHOLE ANIMAL Fish 5 \n", + "FLESH WITHOUT BONES fish 5 \n", + "HEAD Fish 5 \n", "Muscle FISH 5 \n", - "FLESH WITHOUT BONE FISH 4 \n", - "FLESH WITHOUT BONE Fish 4 " + "Whole FISH 5 \n", + "WHOLE Fish 5 \n", + "Muscle Fish 5 \n", + "MUSCLE Fish 5 \n", + "WHOLE ANIMAL FISH 5 \n", + "FLESH WITHOUT BONES Fish 5 \n", + "FLESH WITHOUT BONE Fish 4 \n", + "FLESH WITHOUT BONE FISH 4 " ] }, "execution_count": null, @@ -3536,6 +5764,14 @@ "remapper.select_match(match_score_threshold=0)" ] }, + { + "cell_type": "markdown", + "id": "f8a9ee36", + "metadata": {}, + "source": [ + "Many of the lookup entries are sufficient for our needs. 
However, for values that don't find a match, we can use the `fixes_biota_bodyparts` dictionary to apply manual corrections." + ] + }, { "cell_type": "code", "execution_count": null, @@ -3548,10 +5784,12 @@ " 'WHOLE Seaweed' : 'Whole plant',\n", " 'Flesh Fish': 'Flesh with bones', # We assume it as the category 'Flesh with bones' also exists\n", " 'FLESH Fish' : 'Flesh with bones',\n", - " 'UNKNOWN Fish' : 'Not available',\n", - " 'UNKNOWN FISH': 'Not available',\n", - " 'Cod medallion FISH' : 'Not available', # TO BE DETERMINED\n", - " 'Mix of muscle and whole fish without liver FISH' : 'Not available', # TO BE DETERMINED\n", + " 'UNKNOWN Fish' : NA,\n", + " 'UNKNOWN FISH': NA,\n", + " 'Cod medallion FISH' : NA, # TO BE DETERMINED\n", + " 'Mix of muscle and whole fish without liver FISH' : NA, # TO BE DETERMINED\n", + " 'Whole without head FISH' : NA, # TO BE DETERMINED\n", + " 'FLESH WITHOUT BONES SEAWEED' : NA\n", "}" ] }, @@ -3565,7 +5803,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 46/46 [00:00<00:00, 126.36it/s]\n" + "Processing: 0%| | 0/46 [00:00\n", " \n", " \n", - " Whole without head FISH\n", + " FLESH WITHOUT BONES Molluscs\n", " Flesh without bones\n", - " Whole without head FISH\n", - " 13\n", - " \n", - " \n", - " Whole animal Molluscs\n", - " Whole animal\n", - " Whole animal Molluscs\n", + " FLESH WITHOUT BONES Molluscs\n", " 9\n", " \n", " \n", - " SOFT PARTS MOLLUSCS\n", + " Soft parts Molluscs\n", " Soft parts\n", - " SOFT PARTS MOLLUSCS\n", - " 9\n", - " \n", - " \n", - " Whole fish Fish\n", - " Whole animal\n", - " Whole fish Fish\n", + " Soft parts Molluscs\n", " 9\n", " \n", " \n", - " WHOLE FISH FISH\n", + " WHOLE FISH Fish\n", " Whole animal\n", - " WHOLE FISH FISH\n", + " WHOLE FISH Fish\n", " 9\n", " \n", " \n", @@ -3650,21 +5883,15 @@ " 9\n", " \n", " \n", - " WHOLE FISH Fish\n", - " Whole animal\n", - " WHOLE FISH Fish\n", - " 9\n", - " \n", - " \n", - " Soft parts Molluscs\n", + " SOFT PARTS 
MOLLUSCS\n", " Soft parts\n", - " Soft parts Molluscs\n", + " SOFT PARTS MOLLUSCS\n", " 9\n", " \n", " \n", - " FLESH WITHOUT BONES Molluscs\n", - " Flesh without bones\n", - " FLESH WITHOUT BONES Molluscs\n", + " Whole animal Molluscs\n", + " Whole animal\n", + " Whole animal Molluscs\n", " 9\n", " \n", " \n", @@ -3680,15 +5907,21 @@ " 9\n", " \n", " \n", - " GROWING TIPS Seaweed\n", - " Growing tips\n", - " GROWING TIPS Seaweed\n", - " 8\n", + " Whole fish Fish\n", + " Whole animal\n", + " Whole fish Fish\n", + " 9\n", " \n", " \n", - " WHOLE PLANT Seaweed\n", + " WHOLE FISH FISH\n", + " Whole animal\n", + " WHOLE FISH FISH\n", + " 9\n", + " \n", + " \n", + " Whole plant Seaweed\n", " Whole plant\n", - " WHOLE PLANT Seaweed\n", + " Whole plant Seaweed\n", " 8\n", " \n", " \n", @@ -3698,9 +5931,9 @@ " 8\n", " \n", " \n", - " Whole plant Seaweed\n", - " Whole plant\n", - " Whole plant Seaweed\n", + " GROWING TIPS Seaweed\n", + " Growing tips\n", + " GROWING TIPS Seaweed\n", " 8\n", " \n", " \n", @@ -3716,39 +5949,45 @@ " 8\n", " \n", " \n", - " FLESH WITHOUT BONES SEAWEED\n", - " Flesh without bones\n", - " FLESH WITHOUT BONES SEAWEED\n", + " WHOLE PLANT Seaweed\n", + " Whole plant\n", + " WHOLE PLANT Seaweed\n", " 8\n", " \n", " \n", - " Whole animal Fish\n", - " Whole animal\n", - " Whole animal Fish\n", + " LIVER Fish\n", + " Liver\n", + " LIVER Fish\n", " 5\n", " \n", " \n", - " WHOLE ANIMAL FISH\n", - " Whole animal\n", - " WHOLE ANIMAL FISH\n", + " FLESH WITH SCALES Fish\n", + " Flesh with scales\n", + " FLESH WITH SCALES Fish\n", " 5\n", " \n", " \n", - " HEAD Fish\n", - " Head\n", - " HEAD Fish\n", + " Soft parts Fish\n", + " Soft parts\n", + " Soft parts Fish\n", " 5\n", " \n", " \n", - " FLESH WITHOUT BONES fish\n", + " FLESH WITHOUT BONES FISH\n", " Flesh without bones\n", - " FLESH WITHOUT BONES fish\n", + " FLESH WITHOUT BONES FISH\n", " 5\n", " \n", " \n", - " Muscle Fish\n", - " Muscle\n", - " Muscle Fish\n", + " Whole animal Fish\n", + " Whole 
animal\n", + " Whole animal Fish\n", + " 5\n", + " \n", + " \n", + " WHOLE ANIMAL Fish\n", + " Whole animal\n", + " WHOLE ANIMAL Fish\n", " 5\n", " \n", " \n", @@ -3758,15 +5997,15 @@ " 5\n", " \n", " \n", - " WHOLE Fish\n", - " Whole animal\n", - " WHOLE Fish\n", + " Muscle FISH\n", + " Muscle\n", + " Muscle FISH\n", " 5\n", " \n", " \n", - " HEAD FISH\n", - " Head\n", - " HEAD FISH\n", + " Muscle Fish\n", + " Muscle\n", + " Muscle Fish\n", " 5\n", " \n", " \n", @@ -3776,52 +6015,52 @@ " 5\n", " \n", " \n", - " FLESH WITH SCALES Fish\n", - " Flesh with scales\n", - " FLESH WITH SCALES Fish\n", + " WHOLE Fish\n", + " Whole animal\n", + " WHOLE Fish\n", " 5\n", " \n", " \n", - " MUSCLE Fish\n", - " Muscle\n", - " MUSCLE Fish\n", + " WHOLE ANIMAL FISH\n", + " Whole animal\n", + " WHOLE ANIMAL FISH\n", " 5\n", " \n", " \n", - " Muscle FISH\n", - " Muscle\n", - " Muscle FISH\n", + " HEAD FISH\n", + " Head\n", + " HEAD FISH\n", " 5\n", " \n", " \n", - " FLESH WITHOUT BONES FISH\n", + " Flesh without bones Fish\n", " Flesh without bones\n", - " FLESH WITHOUT BONES FISH\n", + " Flesh without bones Fish\n", " 5\n", " \n", " \n", - " Flesh without bones Fish\n", + " FLESH WITHOUT BONES fish\n", " Flesh without bones\n", - " Flesh without bones Fish\n", + " FLESH WITHOUT BONES fish\n", " 5\n", " \n", " \n", - " Soft parts Fish\n", - " Soft parts\n", - " Soft parts Fish\n", + " HEAD Fish\n", + " Head\n", + " HEAD Fish\n", " 5\n", " \n", " \n", - " LIVER Fish\n", - " Liver\n", - " LIVER Fish\n", + " MUSCLE Fish\n", + " Muscle\n", + " MUSCLE Fish\n", " 5\n", " \n", " \n", - " WHOLE ANIMAL Fish\n", - " Whole animal\n", - " WHOLE ANIMAL Fish\n", - " 5\n", + " FLESH WITHOUT BONE Fish\n", + " Flesh without bones\n", + " FLESH WITHOUT BONE Fish\n", + " 4\n", " \n", " \n", " FLESH WITHOUT BONE FISH\n", @@ -3830,10 +6069,10 @@ " 4\n", " \n", " \n", - " FLESH WITHOUT BONE Fish\n", - " Flesh without bones\n", - " FLESH WITHOUT BONE Fish\n", - " 4\n", + " FLESH WITHOUT BONES SEAWEED\n", 
+ " (Not available)\n", + " FLESH WITHOUT BONES SEAWEED\n", + " 2\n", " \n", " \n", " Cod medallion FISH\n", @@ -3842,15 +6081,15 @@ " 2\n", " \n", " \n", - " UNKNOWN Fish\n", + " Mix of muscle and whole fish without liver FISH\n", " (Not available)\n", - " UNKNOWN Fish\n", + " Mix of muscle and whole fish without liver FISH\n", " 2\n", " \n", " \n", - " Mix of muscle and whole fish without liver FISH\n", + " UNKNOWN Fish\n", " (Not available)\n", - " Mix of muscle and whole fish without liver FISH\n", + " UNKNOWN Fish\n", " 2\n", " \n", " \n", @@ -3859,6 +6098,12 @@ " UNKNOWN FISH\n", " 2\n", " \n", + " \n", + " Whole without head FISH\n", + " (Not available)\n", + " Whole without head FISH\n", + " 2\n", + " \n", " \n", "\n", "" @@ -3866,141 +6111,141 @@ "text/plain": [ " matched_maris_name \\\n", "source_key \n", - "Whole without head FISH Flesh without bones \n", - "Whole animal Molluscs Whole animal \n", - "SOFT PARTS MOLLUSCS Soft parts \n", - "Whole fish Fish Whole animal \n", - "WHOLE FISH FISH Whole animal \n", + "FLESH WITHOUT BONES Molluscs Flesh without bones \n", + "Soft parts Molluscs Soft parts \n", + "WHOLE FISH Fish Whole animal \n", "Soft Parts Molluscs Soft parts \n", "Whole fisk FISH Whole animal \n", "SOFT PARTS molluscs Soft parts \n", - "WHOLE FISH Fish Whole animal \n", - "Soft parts Molluscs Soft parts \n", - "FLESH WITHOUT BONES Molluscs Flesh without bones \n", + "SOFT PARTS MOLLUSCS Soft parts \n", + "Whole animal Molluscs Whole animal \n", "WHOLE ANIMAL Molluscs Whole animal \n", "SOFT PARTS Molluscs Soft parts \n", - "GROWING TIPS Seaweed Growing tips \n", - "WHOLE PLANT Seaweed Whole plant \n", - "WHOLE PLANT seaweed Whole plant \n", + "Whole fish Fish Whole animal \n", + "WHOLE FISH FISH Whole animal \n", "Whole plant Seaweed Whole plant \n", + "WHOLE PLANT seaweed Whole plant \n", + "GROWING TIPS Seaweed Growing tips \n", "WHOLE PLANT SEAWEED Whole plant \n", "whole plant Seaweed Whole plant \n", - "FLESH WITHOUT BONES SEAWEED Flesh 
without bones \n", + "WHOLE PLANT Seaweed Whole plant \n", + "LIVER Fish Liver \n", + "FLESH WITH SCALES Fish Flesh with scales \n", + "Soft parts Fish Soft parts \n", + "FLESH WITHOUT BONES FISH Flesh without bones \n", "Whole animal Fish Whole animal \n", - "WHOLE ANIMAL FISH Whole animal \n", - "HEAD Fish Head \n", - "FLESH WITHOUT BONES fish Flesh without bones \n", - "Muscle Fish Muscle \n", + "WHOLE ANIMAL Fish Whole animal \n", "FLESH WITHOUT BONES Fish Flesh without bones \n", + "Muscle FISH Muscle \n", + "Muscle Fish Muscle \n", + "Whole FISH Whole animal \n", "WHOLE Fish Whole animal \n", + "WHOLE ANIMAL FISH Whole animal \n", "HEAD FISH Head \n", - "Whole FISH Whole animal \n", - "FLESH WITH SCALES Fish Flesh with scales \n", - "MUSCLE Fish Muscle \n", - "Muscle FISH Muscle \n", - "FLESH WITHOUT BONES FISH Flesh without bones \n", "Flesh without bones Fish Flesh without bones \n", - "Soft parts Fish Soft parts \n", - "LIVER Fish Liver \n", - "WHOLE ANIMAL Fish Whole animal \n", - "FLESH WITHOUT BONE FISH Flesh without bones \n", + "FLESH WITHOUT BONES fish Flesh without bones \n", + "HEAD Fish Head \n", + "MUSCLE Fish Muscle \n", "FLESH WITHOUT BONE Fish Flesh without bones \n", + "FLESH WITHOUT BONE FISH Flesh without bones \n", + "FLESH WITHOUT BONES SEAWEED (Not available) \n", "Cod medallion FISH (Not available) \n", - "UNKNOWN Fish (Not available) \n", "Mix of muscle and whole fish without liver FISH (Not available) \n", + "UNKNOWN Fish (Not available) \n", "UNKNOWN FISH (Not available) \n", + "Whole without head FISH (Not available) \n", "\n", " source_name \\\n", "source_key \n", - "Whole without head FISH Whole without head FISH \n", - "Whole animal Molluscs Whole animal Molluscs \n", - "SOFT PARTS MOLLUSCS SOFT PARTS MOLLUSCS \n", - "Whole fish Fish Whole fish Fish \n", - "WHOLE FISH FISH WHOLE FISH FISH \n", + "FLESH WITHOUT BONES Molluscs FLESH WITHOUT BONES Molluscs \n", + "Soft parts Molluscs Soft parts Molluscs \n", + "WHOLE FISH Fish WHOLE 
FISH Fish \n", "Soft Parts Molluscs Soft Parts Molluscs \n", "Whole fisk FISH Whole fisk FISH \n", "SOFT PARTS molluscs SOFT PARTS molluscs \n", - "WHOLE FISH Fish WHOLE FISH Fish \n", - "Soft parts Molluscs Soft parts Molluscs \n", - "FLESH WITHOUT BONES Molluscs FLESH WITHOUT BONES Molluscs \n", + "SOFT PARTS MOLLUSCS SOFT PARTS MOLLUSCS \n", + "Whole animal Molluscs Whole animal Molluscs \n", "WHOLE ANIMAL Molluscs WHOLE ANIMAL Molluscs \n", "SOFT PARTS Molluscs SOFT PARTS Molluscs \n", - "GROWING TIPS Seaweed GROWING TIPS Seaweed \n", - "WHOLE PLANT Seaweed WHOLE PLANT Seaweed \n", - "WHOLE PLANT seaweed WHOLE PLANT seaweed \n", + "Whole fish Fish Whole fish Fish \n", + "WHOLE FISH FISH WHOLE FISH FISH \n", "Whole plant Seaweed Whole plant Seaweed \n", + "WHOLE PLANT seaweed WHOLE PLANT seaweed \n", + "GROWING TIPS Seaweed GROWING TIPS Seaweed \n", "WHOLE PLANT SEAWEED WHOLE PLANT SEAWEED \n", "whole plant Seaweed whole plant Seaweed \n", - "FLESH WITHOUT BONES SEAWEED FLESH WITHOUT BONES SEAWEED \n", + "WHOLE PLANT Seaweed WHOLE PLANT Seaweed \n", + "LIVER Fish LIVER Fish \n", + "FLESH WITH SCALES Fish FLESH WITH SCALES Fish \n", + "Soft parts Fish Soft parts Fish \n", + "FLESH WITHOUT BONES FISH FLESH WITHOUT BONES FISH \n", "Whole animal Fish Whole animal Fish \n", - "WHOLE ANIMAL FISH WHOLE ANIMAL FISH \n", - "HEAD Fish HEAD Fish \n", - "FLESH WITHOUT BONES fish FLESH WITHOUT BONES fish \n", - "Muscle Fish Muscle Fish \n", + "WHOLE ANIMAL Fish WHOLE ANIMAL Fish \n", "FLESH WITHOUT BONES Fish FLESH WITHOUT BONES Fish \n", + "Muscle FISH Muscle FISH \n", + "Muscle Fish Muscle Fish \n", + "Whole FISH Whole FISH \n", "WHOLE Fish WHOLE Fish \n", + "WHOLE ANIMAL FISH WHOLE ANIMAL FISH \n", "HEAD FISH HEAD FISH \n", - "Whole FISH Whole FISH \n", - "FLESH WITH SCALES Fish FLESH WITH SCALES Fish \n", - "MUSCLE Fish MUSCLE Fish \n", - "Muscle FISH Muscle FISH \n", - "FLESH WITHOUT BONES FISH FLESH WITHOUT BONES FISH \n", "Flesh without bones Fish Flesh without bones 
Fish \n", - "Soft parts Fish Soft parts Fish \n", - "LIVER Fish LIVER Fish \n", - "WHOLE ANIMAL Fish WHOLE ANIMAL Fish \n", - "FLESH WITHOUT BONE FISH FLESH WITHOUT BONE FISH \n", + "FLESH WITHOUT BONES fish FLESH WITHOUT BONES fish \n", + "HEAD Fish HEAD Fish \n", + "MUSCLE Fish MUSCLE Fish \n", "FLESH WITHOUT BONE Fish FLESH WITHOUT BONE Fish \n", + "FLESH WITHOUT BONE FISH FLESH WITHOUT BONE FISH \n", + "FLESH WITHOUT BONES SEAWEED FLESH WITHOUT BONES SEAWEED \n", "Cod medallion FISH Cod medallion FISH \n", - "UNKNOWN Fish UNKNOWN Fish \n", "Mix of muscle and whole fish without liver FISH Mix of muscle and whole fish without liver FISH \n", + "UNKNOWN Fish UNKNOWN Fish \n", "UNKNOWN FISH UNKNOWN FISH \n", + "Whole without head FISH Whole without head FISH \n", "\n", " match_score \n", "source_key \n", - "Whole without head FISH 13 \n", - "Whole animal Molluscs 9 \n", - "SOFT PARTS MOLLUSCS 9 \n", - "Whole fish Fish 9 \n", - "WHOLE FISH FISH 9 \n", + "FLESH WITHOUT BONES Molluscs 9 \n", + "Soft parts Molluscs 9 \n", + "WHOLE FISH Fish 9 \n", "Soft Parts Molluscs 9 \n", "Whole fisk FISH 9 \n", "SOFT PARTS molluscs 9 \n", - "WHOLE FISH Fish 9 \n", - "Soft parts Molluscs 9 \n", - "FLESH WITHOUT BONES Molluscs 9 \n", + "SOFT PARTS MOLLUSCS 9 \n", + "Whole animal Molluscs 9 \n", "WHOLE ANIMAL Molluscs 9 \n", "SOFT PARTS Molluscs 9 \n", - "GROWING TIPS Seaweed 8 \n", - "WHOLE PLANT Seaweed 8 \n", - "WHOLE PLANT seaweed 8 \n", + "Whole fish Fish 9 \n", + "WHOLE FISH FISH 9 \n", "Whole plant Seaweed 8 \n", + "WHOLE PLANT seaweed 8 \n", + "GROWING TIPS Seaweed 8 \n", "WHOLE PLANT SEAWEED 8 \n", "whole plant Seaweed 8 \n", - "FLESH WITHOUT BONES SEAWEED 8 \n", + "WHOLE PLANT Seaweed 8 \n", + "LIVER Fish 5 \n", + "FLESH WITH SCALES Fish 5 \n", + "Soft parts Fish 5 \n", + "FLESH WITHOUT BONES FISH 5 \n", "Whole animal Fish 5 \n", - "WHOLE ANIMAL FISH 5 \n", - "HEAD Fish 5 \n", - "FLESH WITHOUT BONES fish 5 \n", - "Muscle Fish 5 \n", + "WHOLE ANIMAL Fish 5 \n", "FLESH WITHOUT 
BONES Fish 5 \n", + "Muscle FISH 5 \n", + "Muscle Fish 5 \n", + "Whole FISH 5 \n", "WHOLE Fish 5 \n", + "WHOLE ANIMAL FISH 5 \n", "HEAD FISH 5 \n", - "Whole FISH 5 \n", - "FLESH WITH SCALES Fish 5 \n", - "MUSCLE Fish 5 \n", - "Muscle FISH 5 \n", - "FLESH WITHOUT BONES FISH 5 \n", "Flesh without bones Fish 5 \n", - "Soft parts Fish 5 \n", - "LIVER Fish 5 \n", - "WHOLE ANIMAL Fish 5 \n", - "FLESH WITHOUT BONE FISH 4 \n", + "FLESH WITHOUT BONES fish 5 \n", + "HEAD Fish 5 \n", + "MUSCLE Fish 5 \n", "FLESH WITHOUT BONE Fish 4 \n", + "FLESH WITHOUT BONE FISH 4 \n", + "FLESH WITHOUT BONES SEAWEED 2 \n", "Cod medallion FISH 2 \n", - "UNKNOWN Fish 2 \n", "Mix of muscle and whole fish without liver FISH 2 \n", - "UNKNOWN FISH 2 " + "UNKNOWN Fish 2 \n", + "UNKNOWN FISH 2 \n", + "Whole without head FISH 2 " ] }, "execution_count": null, @@ -4014,6 +6259,94 @@ "remapper.select_match(match_score_threshold=1)" ] }, + { + "cell_type": "markdown", + "id": "e7ed9551", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: `biota` dataset includes 1 entry where the `Body Part` is `FLESH WITHOUT BONES` for the `Biological group` of `SEAWEED`, see below. \n", + "\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93c19547", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDContracting PartySample IDBiological groupBody PartMeasurement CommentSample Comment
266087356IcelandTHFAG17CSEAWEEDFLESH WITHOUT BONESNaNNaN
\n", + "
" + ], + "text/plain": [ + " ID Contracting Party Sample ID Biological group Body Part \\\n", + "2660 87356 Iceland THFAG17C SEAWEED FLESH WITHOUT BONES \n", + "\n", + " Measurement Comment Sample Comment \n", + "2660 NaN NaN " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs['biota'][['ID','Contracting Party','Sample ID','Biological group','Body Part', 'Measurement Comment', 'Sample Comment']][(tfm.dfs['biota']['Body Part'] == 'FLESH WITHOUT BONES') & (tfm.dfs['biota']['Biological group'] == 'SEAWEED')]" + ] + }, + { + "cell_type": "markdown", + "id": "d563fe9e", + "metadata": {}, + "source": [ + "HERE HERE, need to create the lambda: Remapper then use the genric RemapCB callback." + ] + }, { "cell_type": "code", "execution_count": null, @@ -4052,79 +6385,6 @@ "##### Correct OSPAR `Body Part` labelled as `Whole`" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "ec9c0e96", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['ID', 'Contracting Party', 'RSC Sub-division', 'Station ID',\n", - " 'Sample ID', 'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM',\n", - " 'LongS', 'LongDir', 'Sample type', 'Biological group', 'Species',\n", - " 'Body Part', 'Sampling date', 'Nuclide', 'Value type',\n", - " 'Activity or MDA', 'Uncertainty', 'Unit', 'Data provider',\n", - " 'Measurement Comment', 'Sample Comment', 'Reference Comment'],\n", - " dtype='object')" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dfs['biota'].columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0fcb710", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d2d0470c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['SOFT PARTS Molluscs', 'GROWING TIPS Seaweed',\n", - " 'Whole plant Seaweed', 'WHOLE 
Fish', 'WHOLE ANIMAL Fish',\n", - " 'FLESH WITHOUT BONES Fish', 'WHOLE ANIMAL Molluscs',\n", - " 'WHOLE PLANT Seaweed', 'Soft Parts Molluscs',\n", - " 'FLESH WITHOUT BONES Molluscs', 'WHOLE Seaweed',\n", - " 'Whole without head FISH', 'Cod medallion FISH', 'Muscle FISH',\n", - " 'Whole animal Fish', 'Whole fisk FISH', 'Whole FISH',\n", - " 'Mix of muscle and whole fish without liver FISH', 'Flesh Fish',\n", - " 'WHOLE FISH Fish', 'Whole animal Molluscs', 'Muscle Fish',\n", - " 'Whole fish Fish', 'FLESH WITHOUT BONE Fish', 'UNKNOWN Fish',\n", - " 'WHOLE PLANT seaweed', 'WHOLE PLANT SEAWEED',\n", - " 'SOFT PARTS molluscs', 'FLESH WITHOUT BONES FISH',\n", - " 'WHOLE ANIMAL FISH', 'FLESH WITHOUT BONES fish', 'FLESH Fish',\n", - " 'FLESH WITHOUT BONES SEAWEED', 'FLESH WITH SCALES Fish',\n", - " 'FLESH WITHOUT BONE FISH', 'HEAD FISH', 'WHOLE FISH FISH',\n", - " 'Flesh without bones Fish', 'UNKNOWN FISH', 'Soft parts Molluscs',\n", - " 'Soft parts Fish', 'HEAD Fish', 'SOFT PARTS MOLLUSCS',\n", - " 'whole plant Seaweed', 'LIVER Fish', 'MUSCLE Fish'], dtype=object)" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "get_unique_bodyparts_group(dfs)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -4160,142 +6420,142 @@ " \n", " 0\n", " 0\n", - " Flesh without bones\n", + " Flesh\n", " \n", " \n", " 1\n", " 1\n", - " WHOLE\n", + " FLESH WITHOUT BONE\n", " \n", " \n", " 2\n", " 2\n", - " Whole\n", + " whole plant\n", " \n", " \n", " 3\n", " 3\n", - " GROWING TIPS\n", + " Muscle\n", " \n", " \n", " 4\n", " 4\n", - " Soft Parts\n", + " Soft parts\n", " \n", " \n", " 5\n", " 5\n", - " Mix of muscle and whole fish without liver\n", + " FLESH WITH SCALES\n", " \n", " \n", " 6\n", " 6\n", - " Whole fisk\n", + " HEAD\n", " \n", " \n", " 7\n", " 7\n", - " MUSCLE\n", + " WHOLE ANIMAL\n", " \n", " \n", " 8\n", " 8\n", - " Muscle\n", + " GROWING TIPS\n", " \n", " \n", " 9\n", " 9\n", - " SOFT PARTS\n", + " Whole 
fisk\n", " \n", " \n", " 10\n", " 10\n", - " HEAD\n", + " Whole fish\n", " \n", " \n", " 11\n", " 11\n", - " Cod medallion\n", + " WHOLE FISH\n", " \n", " \n", " 12\n", " 12\n", - " LIVER\n", + " UNKNOWN\n", " \n", " \n", " 13\n", " 13\n", - " Whole fish\n", + " Whole animal\n", " \n", " \n", " 14\n", " 14\n", - " FLESH WITHOUT BONE\n", + " Flesh without bones\n", " \n", " \n", " 15\n", " 15\n", - " Whole animal\n", + " LIVER\n", " \n", " \n", " 16\n", " 16\n", - " Whole without head\n", + " SOFT PARTS\n", " \n", " \n", " 17\n", " 17\n", - " whole plant\n", + " WHOLE PLANT\n", " \n", " \n", " 18\n", " 18\n", - " FLESH\n", + " Whole\n", " \n", " \n", " 19\n", " 19\n", - " UNKNOWN\n", + " MUSCLE\n", " \n", " \n", " 20\n", " 20\n", - " WHOLE FISH\n", + " FLESH WITHOUT BONES\n", " \n", " \n", " 21\n", " 21\n", - " FLESH WITHOUT BONES\n", + " Cod medallion\n", " \n", " \n", " 22\n", " 22\n", - " FLESH WITH SCALES\n", + " FLESH\n", " \n", " \n", " 23\n", " 23\n", - " Soft parts\n", + " Whole without head\n", " \n", " \n", " 24\n", " 24\n", - " Whole plant\n", + " WHOLE\n", " \n", " \n", " 25\n", " 25\n", - " WHOLE ANIMAL\n", + " Mix of muscle and whole fish without liver\n", " \n", " \n", " 26\n", " 26\n", - " WHOLE PLANT\n", + " Whole plant\n", " \n", " \n", " 27\n", " 27\n", - " Flesh\n", + " Soft Parts\n", " \n", " \n", "\n", @@ -4303,34 +6563,34 @@ ], "text/plain": [ " index value\n", - "0 0 Flesh without bones\n", - "1 1 WHOLE\n", - "2 2 Whole\n", - "3 3 GROWING TIPS\n", - "4 4 Soft Parts\n", - "5 5 Mix of muscle and whole fish without liver\n", - "6 6 Whole fisk\n", - "7 7 MUSCLE\n", - "8 8 Muscle\n", - "9 9 SOFT PARTS\n", - "10 10 HEAD\n", - "11 11 Cod medallion\n", - "12 12 LIVER\n", - "13 13 Whole fish\n", - "14 14 FLESH WITHOUT BONE\n", - "15 15 Whole animal\n", - "16 16 Whole without head\n", - "17 17 whole plant\n", - "18 18 FLESH\n", - "19 19 UNKNOWN\n", - "20 20 WHOLE FISH\n", - "21 21 FLESH WITHOUT BONES\n", - "22 22 FLESH WITH SCALES\n", - "23 23 Soft 
parts\n", - "24 24 Whole plant\n", - "25 25 WHOLE ANIMAL\n", - "26 26 WHOLE PLANT\n", - "27 27 Flesh" + "0 0 Flesh\n", + "1 1 FLESH WITHOUT BONE\n", + "2 2 whole plant\n", + "3 3 Muscle\n", + "4 4 Soft parts\n", + "5 5 FLESH WITH SCALES\n", + "6 6 HEAD\n", + "7 7 WHOLE ANIMAL\n", + "8 8 GROWING TIPS\n", + "9 9 Whole fisk\n", + "10 10 Whole fish\n", + "11 11 WHOLE FISH\n", + "12 12 UNKNOWN\n", + "13 13 Whole animal\n", + "14 14 Flesh without bones\n", + "15 15 LIVER\n", + "16 16 SOFT PARTS\n", + "17 17 WHOLE PLANT\n", + "18 18 Whole\n", + "19 19 MUSCLE\n", + "20 20 FLESH WITHOUT BONES\n", + "21 21 Cod medallion\n", + "22 22 FLESH\n", + "23 23 Whole without head\n", + "24 24 WHOLE\n", + "25 25 Mix of muscle and whole fish without liver\n", + "26 26 Whole plant\n", + "27 27 Soft Parts" ] }, "execution_count": null, @@ -4353,7 +6613,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 28/28 [00:00<00:00, 131.90it/s]\n" + "Processing: 0%| | 0/28 [00:009\n", " \n", " \n", - " WHOLE FISH\n", + " Whole fisk\n", " Whole animal\n", - " WHOLE FISH\n", + " Whole fisk\n", " 5\n", " \n", " \n", - " UNKNOWN\n", - " Skin\n", - " UNKNOWN\n", + " Whole fish\n", + " Whole animal\n", + " Whole fish\n", " 5\n", " \n", " \n", - " Whole fisk\n", + " WHOLE FISH\n", " Whole animal\n", - " Whole fisk\n", + " WHOLE FISH\n", " 5\n", " \n", " \n", - " Whole fish\n", - " Whole animal\n", - " Whole fish\n", + " UNKNOWN\n", + " Skin\n", + " UNKNOWN\n", " 5\n", " \n", " \n", @@ -4438,9 +6705,9 @@ " 3\n", " \n", " \n", - " Whole\n", + " WHOLE\n", " Molt\n", - " Whole\n", + " WHOLE\n", " 3\n", " \n", " \n", @@ -4450,9 +6717,9 @@ " 3\n", " \n", " \n", - " WHOLE\n", + " Whole\n", " Molt\n", - " WHOLE\n", + " Whole\n", " 3\n", " \n", " \n", @@ -4462,51 +6729,51 @@ " 1\n", " \n", " \n", - " LIVER\n", - " Liver\n", - " LIVER\n", + " WHOLE PLANT\n", + " Whole plant\n", + " WHOLE PLANT\n", " 0\n", " \n", " \n", - " HEAD\n", - " Head\n", - " HEAD\n", + " Whole plant\n", 
+ " Whole plant\n", + " Whole plant\n", " 0\n", " \n", " \n", - " SOFT PARTS\n", - " Soft parts\n", - " SOFT PARTS\n", + " FLESH WITHOUT BONES\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES\n", " 0\n", " \n", " \n", - " Whole animal\n", - " Whole animal\n", - " Whole animal\n", + " MUSCLE\n", + " Muscle\n", + " MUSCLE\n", " 0\n", " \n", " \n", - " Muscle\n", - " Muscle\n", - " Muscle\n", + " Flesh without bones\n", + " Flesh without bones\n", + " Flesh without bones\n", " 0\n", " \n", " \n", - " whole plant\n", - " Whole plant\n", - " whole plant\n", + " SOFT PARTS\n", + " Soft parts\n", + " SOFT PARTS\n", " 0\n", " \n", " \n", - " MUSCLE\n", - " Muscle\n", - " MUSCLE\n", + " LIVER\n", + " Liver\n", + " LIVER\n", " 0\n", " \n", " \n", - " Soft Parts\n", - " Soft parts\n", - " Soft Parts\n", + " Whole animal\n", + " Whole animal\n", + " Whole animal\n", " 0\n", " \n", " \n", @@ -4516,9 +6783,15 @@ " 0\n", " \n", " \n", - " FLESH WITHOUT BONES\n", - " Flesh without bones\n", - " FLESH WITHOUT BONES\n", + " WHOLE ANIMAL\n", + " Whole animal\n", + " WHOLE ANIMAL\n", + " 0\n", + " \n", + " \n", + " HEAD\n", + " Head\n", + " HEAD\n", " 0\n", " \n", " \n", @@ -4534,27 +6807,21 @@ " 0\n", " \n", " \n", - " Whole plant\n", - " Whole plant\n", - " Whole plant\n", - " 0\n", - " \n", - " \n", - " WHOLE ANIMAL\n", - " Whole animal\n", - " WHOLE ANIMAL\n", + " Muscle\n", + " Muscle\n", + " Muscle\n", " 0\n", " \n", " \n", - " WHOLE PLANT\n", + " whole plant\n", " Whole plant\n", - " WHOLE PLANT\n", + " whole plant\n", " 0\n", " \n", " \n", - " Flesh without bones\n", - " Flesh without bones\n", - " Flesh without bones\n", + " Soft Parts\n", + " Soft parts\n", + " Soft Parts\n", " 0\n", " \n", " \n", @@ -4567,93 +6834,93 @@ "Mix of muscle and whole fish without liver Flesh without bones \n", "Whole without head Flesh without bones \n", "Cod medallion Exoskeleton \n", - "WHOLE FISH Whole animal \n", - "UNKNOWN Skin \n", "Whole fisk Whole animal \n", "Whole fish Whole 
animal \n", + "WHOLE FISH Whole animal \n", + "UNKNOWN Skin \n", "Flesh Leaf \n", - "Whole Molt \n", - "FLESH Leaf \n", "WHOLE Molt \n", + "FLESH Leaf \n", + "Whole Molt \n", "FLESH WITHOUT BONE Flesh without bones \n", - "LIVER Liver \n", - "HEAD Head \n", + "WHOLE PLANT Whole plant \n", + "Whole plant Whole plant \n", + "FLESH WITHOUT BONES Flesh without bones \n", + "MUSCLE Muscle \n", + "Flesh without bones Flesh without bones \n", "SOFT PARTS Soft parts \n", + "LIVER Liver \n", "Whole animal Whole animal \n", - "Muscle Muscle \n", - "whole plant Whole plant \n", - "MUSCLE Muscle \n", - "Soft Parts Soft parts \n", "GROWING TIPS Growing tips \n", - "FLESH WITHOUT BONES Flesh without bones \n", + "WHOLE ANIMAL Whole animal \n", + "HEAD Head \n", "FLESH WITH SCALES Flesh with scales \n", "Soft parts Soft parts \n", - "Whole plant Whole plant \n", - "WHOLE ANIMAL Whole animal \n", - "WHOLE PLANT Whole plant \n", - "Flesh without bones Flesh without bones \n", + "Muscle Muscle \n", + "whole plant Whole plant \n", + "Soft Parts Soft parts \n", "\n", " source_name \\\n", "source_key \n", "Mix of muscle and whole fish without liver Mix of muscle and whole fish without liver \n", "Whole without head Whole without head \n", "Cod medallion Cod medallion \n", - "WHOLE FISH WHOLE FISH \n", - "UNKNOWN UNKNOWN \n", "Whole fisk Whole fisk \n", "Whole fish Whole fish \n", + "WHOLE FISH WHOLE FISH \n", + "UNKNOWN UNKNOWN \n", "Flesh Flesh \n", - "Whole Whole \n", - "FLESH FLESH \n", "WHOLE WHOLE \n", + "FLESH FLESH \n", + "Whole Whole \n", "FLESH WITHOUT BONE FLESH WITHOUT BONE \n", - "LIVER LIVER \n", - "HEAD HEAD \n", + "WHOLE PLANT WHOLE PLANT \n", + "Whole plant Whole plant \n", + "FLESH WITHOUT BONES FLESH WITHOUT BONES \n", + "MUSCLE MUSCLE \n", + "Flesh without bones Flesh without bones \n", "SOFT PARTS SOFT PARTS \n", + "LIVER LIVER \n", "Whole animal Whole animal \n", - "Muscle Muscle \n", - "whole plant whole plant \n", - "MUSCLE MUSCLE \n", - "Soft Parts Soft Parts 
\n", "GROWING TIPS GROWING TIPS \n", - "FLESH WITHOUT BONES FLESH WITHOUT BONES \n", + "WHOLE ANIMAL WHOLE ANIMAL \n", + "HEAD HEAD \n", "FLESH WITH SCALES FLESH WITH SCALES \n", "Soft parts Soft parts \n", - "Whole plant Whole plant \n", - "WHOLE ANIMAL WHOLE ANIMAL \n", - "WHOLE PLANT WHOLE PLANT \n", - "Flesh without bones Flesh without bones \n", + "Muscle Muscle \n", + "whole plant whole plant \n", + "Soft Parts Soft Parts \n", "\n", " match_score \n", "source_key \n", "Mix of muscle and whole fish without liver 27 \n", "Whole without head 10 \n", "Cod medallion 9 \n", - "WHOLE FISH 5 \n", - "UNKNOWN 5 \n", "Whole fisk 5 \n", "Whole fish 5 \n", + "WHOLE FISH 5 \n", + "UNKNOWN 5 \n", "Flesh 3 \n", - "Whole 3 \n", - "FLESH 3 \n", "WHOLE 3 \n", + "FLESH 3 \n", + "Whole 3 \n", "FLESH WITHOUT BONE 1 \n", - "LIVER 0 \n", - "HEAD 0 \n", + "WHOLE PLANT 0 \n", + "Whole plant 0 \n", + "FLESH WITHOUT BONES 0 \n", + "MUSCLE 0 \n", + "Flesh without bones 0 \n", "SOFT PARTS 0 \n", + "LIVER 0 \n", "Whole animal 0 \n", - "Muscle 0 \n", - "whole plant 0 \n", - "MUSCLE 0 \n", - "Soft Parts 0 \n", "GROWING TIPS 0 \n", - "FLESH WITHOUT BONES 0 \n", + "WHOLE ANIMAL 0 \n", + "HEAD 0 \n", "FLESH WITH SCALES 0 \n", "Soft parts 0 \n", - "Whole plant 0 \n", - "WHOLE ANIMAL 0 \n", - "WHOLE PLANT 0 \n", - "Flesh without bones 0 " + "Muscle 0 \n", + "whole plant 0 \n", + "Soft Parts 0 " ] }, "execution_count": null, @@ -5693,14 +7960,6 @@ "print(tfm.dfs_dropped['biota']['Body Part'].unique())" ] }, - { - "cell_type": "markdown", - "id": "590ed5aa", - "metadata": {}, - "source": [ - "***" - ] - }, { "cell_type": "markdown", "id": "4d1241ed", From f243d9dec417666fc793ff081fabc748137209dc Mon Sep 17 00:00:00 2001 From: Franck Albinet Date: Tue, 15 Oct 2024 11:13:54 +0200 Subject: [PATCH 6/9] add the way unique sample id is defined in MARIS Open Refine --- nbs/metadata/sample-uniqueness.ipynb | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git 
a/nbs/metadata/sample-uniqueness.ipynb b/nbs/metadata/sample-uniqueness.ipynb index c57e36a..eef6e9b 100644 --- a/nbs/metadata/sample-uniqueness.ipynb +++ b/nbs/metadata/sample-uniqueness.ipynb @@ -13,8 +13,29 @@ "id": "5709cfb6", "metadata": {}, "source": [ - "> What constitutes a **single sample** in the context of MARIS database?\n", - "\n" + "> What constitutes a **single sample** in the context of MARIS database?" + ] + }, + { + "cell_type": "markdown", + "id": "285c3a1e", + "metadata": {}, + "source": [ + "As defined in OpenRefine MARIS currently, unique sample IDs are defined as the concatenation of the following columns:\n", + "- `ref_id`\n", + "- `latitude`\n", + "- `longitude`\n", + "- `begperiod`\n", + "- `samptype_id`\n", + "- `salinity` (if available)\n", + "- `sliceup` (if available)\n", + "- `slicedown` (if available)\n", + "- `sampdepth` (if available)\n", + "- `samplabcode` (if available)\n", + "- `species_id` (if available)\n", + "- `bodypar_id` (if available)\n", + "- `station` (if available)\n", + "- `SedRepName` (if available)" ] }, { From c2f765fdfbc54b1b9bb3f3f4f79279d55a329a58 Mon Sep 17 00:00:00 2001 From: Franck Albinet Date: Tue, 15 Oct 2024 11:19:03 +0200 Subject: [PATCH 7/9] fix markdown bullet points formatting --- nbs/metadata/sample-uniqueness.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/nbs/metadata/sample-uniqueness.ipynb b/nbs/metadata/sample-uniqueness.ipynb index eef6e9b..f553b50 100644 --- a/nbs/metadata/sample-uniqueness.ipynb +++ b/nbs/metadata/sample-uniqueness.ipynb @@ -22,6 +22,7 @@ "metadata": {}, "source": [ "As defined in OpenRefine MARIS currently, unique sample IDs are defined as the concatenation of the following columns:\n", + "\n", "- `ref_id`\n", "- `latitude`\n", "- `longitude`\n", From 5bd5c7ca3adab4ea1f52c2f3dab8ceec9e0a6faf Mon Sep 17 00:00:00 2001 From: niallmurphy93 Date: Wed, 16 Oct 2024 10:14:50 +0100 Subject: [PATCH 8/9] WIP: Update OSPAR handler - Improve data processing pipeline for OSPAR 
data --- marisco/handlers/helcom.py | 60 +- nbs/api/configs.ipynb | 4 + nbs/handlers/_ospar.ipynb | 3216 ++++++++++++++++-------------------- nbs/handlers/helcom.ipynb | 1279 +++++++++----- 4 files changed, 2306 insertions(+), 2253 deletions(-) diff --git a/marisco/handlers/helcom.py b/marisco/handlers/helcom.py index bb8aabe..b883536 100644 --- a/marisco/handlers/helcom.py +++ b/marisco/handlers/helcom.py @@ -256,13 +256,13 @@ def __call__(self, tfm: Transformer): fname_cache='species_helcom.pkl' ).generate_lookup_table(fixes=fixes_biota_species, as_df=False, overwrite=False) -# %% ../../nbs/handlers/helcom.ipynb 88 +# %% ../../nbs/handlers/helcom.ipynb 90 fixes_biota_tissues = { 'WHOLE FISH WITHOUT HEAD AND ENTRAILS': 'Whole animal eviscerated without head', 'ENTRAILS': 'Viscera', 'SKIN/EPIDERMIS': 'Skin'} -# %% ../../nbs/handlers/helcom.ipynb 91 +# %% ../../nbs/handlers/helcom.ipynb 93 lut_tissues = lambda: Remapper(provider_lut_df=pd.read_csv('../../_data/accdb/mors/csv/TISSUE.csv'), maris_lut_fn=bodyparts_lut_path, maris_col_id='bodypar_id', @@ -272,11 +272,11 @@ def __call__(self, tfm: Transformer): fname_cache='tissues_helcom.pkl' ).generate_lookup_table(fixes=fixes_biota_tissues, as_df=False, overwrite=False) -# %% ../../nbs/handlers/helcom.ipynb 95 +# %% ../../nbs/handlers/helcom.ipynb 97 lut_biogroup = lambda: get_lut(species_lut_path().parent, species_lut_path().name, key='species_id', value='biogroup_id') -# %% ../../nbs/handlers/helcom.ipynb 98 +# %% ../../nbs/handlers/helcom.ipynb 100 # TODO: Include Commonname field after next MARIS data reconciling process. def get_taxon_info_lut( maris_lut:str # Path to the MARIS lookup table (Excel file) @@ -287,7 +287,7 @@ def get_taxon_info_lut( lut_taxon = lambda: get_taxon_info_lut(species_lut_path()) -# %% ../../nbs/handlers/helcom.ipynb 99 +# %% ../../nbs/handlers/helcom.ipynb 101 class RemapTaxonInformationCB(Callback): "Update taxon information based on MARIS species LUT." 
def __init__(self, fn_lut: Callable): @@ -307,12 +307,12 @@ def __call__(self, tfm: Transformer): if len(unmatched) > 0: print(f"Unmatched species IDs: {', '.join(unmatched)}") -# %% ../../nbs/handlers/helcom.ipynb 108 +# %% ../../nbs/handlers/helcom.ipynb 110 fixes_sediments = { 'NO DATA': '(Not available)' } -# %% ../../nbs/handlers/helcom.ipynb 110 +# %% ../../nbs/handlers/helcom.ipynb 112 class RemapSedimentCB(Callback): "Update sediment id based on MARIS species LUT (dbo_sedtype.xlsx)." def __init__(self, @@ -352,7 +352,7 @@ def _print_unmatched_sedi(self, "Print the SEDI value if the matched_id is -1." print(f"Unmatched SEDI: {sedi_value}") -# %% ../../nbs/handlers/helcom.ipynb 111 +# %% ../../nbs/handlers/helcom.ipynb 113 lut_sediments = lambda: Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv'), maris_lut_fn=sediments_lut_path, maris_col_id='sedtype_id', @@ -362,7 +362,7 @@ def _print_unmatched_sedi(self, fname_cache='sediments_helcom.pkl' ).generate_lookup_table(fixes=fixes_sediments, as_df=False, overwrite=False) -# %% ../../nbs/handlers/helcom.ipynb 121 +# %% ../../nbs/handlers/helcom.ipynb 123 lut_units = { 'seawater': 1, # 'Bq/m3' 'sediment': 4, # 'Bq/kgd' for sediment @@ -373,7 +373,7 @@ def _print_unmatched_sedi(self, } } -# %% ../../nbs/handlers/helcom.ipynb 122 +# %% ../../nbs/handlers/helcom.ipynb 124 class RemapUnitCB(Callback): "Set the `unit` id column in the DataFrames based on a lookup table." 
def __init__(self, @@ -388,10 +388,10 @@ def __call__(self, tfm: Transformer): else: tfm.dfs[grp]['unit'] = tfm.dfs[grp]['BASIS'].apply(lambda x: lut_units[grp].get(x, 0)) -# %% ../../nbs/handlers/helcom.ipynb 127 +# %% ../../nbs/handlers/helcom.ipynb 129 lut_dl = lambda: pd.read_excel(detection_limit_lut_path(), usecols=['name','id']).set_index('name').to_dict()['id'] -# %% ../../nbs/handlers/helcom.ipynb 129 +# %% ../../nbs/handlers/helcom.ipynb 131 coi_dl = {'seawater' : {'val' : 'VALUE_Bq/m³', 'unc' : 'ERROR%_m³', 'dl' : '< VALUE_Bq/m³'}, @@ -403,7 +403,7 @@ def __call__(self, tfm: Transformer): 'unc' : 'ERROR%_kg', 'dl' : '< VALUE_Bq/kg'}} -# %% ../../nbs/handlers/helcom.ipynb 132 +# %% ../../nbs/handlers/helcom.ipynb 134 # TO BE REFACTORED class RemapDetectionLimitCB(Callback): "Remap value type to MARIS format." @@ -443,14 +443,14 @@ def _update_detection_limit(self, # Perform lookup df['detection_limit'] = df['detection_limit'].map(lut) -# %% ../../nbs/handlers/helcom.ipynb 140 +# %% ../../nbs/handlers/helcom.ipynb 142 lut_filtered = { 'N': 2, 'n': 2, 'F': 1 } -# %% ../../nbs/handlers/helcom.ipynb 142 +# %% ../../nbs/handlers/helcom.ipynb 144 class RemapFiltCB(Callback): "Lookup FILT value in dataframe using the lookup table." def __init__(self, @@ -463,7 +463,7 @@ def __call__(self, tfm): if 'FILT' in df.columns: df['FILT'] = df['FILT'].map(lambda x: self.lut_filtered.get(x, 0)) -# %% ../../nbs/handlers/helcom.ipynb 147 +# %% ../../nbs/handlers/helcom.ipynb 149 class AddSampleLabCodeCB(Callback): "Remap `KEY` column to `samplabcode` in each DataFrame." 
def __call__(self, tfm: Transformer): @@ -473,10 +473,10 @@ def __call__(self, tfm: Transformer): def _remap_sample_id(self, df: pd.DataFrame): df['samplabcode'] = df['KEY'] -# %% ../../nbs/handlers/helcom.ipynb 152 +# %% ../../nbs/handlers/helcom.ipynb 154 lut_method = lambda: pd.read_csv(Path(fname_in) / 'ANALYSIS_METHOD.csv').set_index('METHOD').to_dict()['DESCRIPTION'] -# %% ../../nbs/handlers/helcom.ipynb 153 +# %% ../../nbs/handlers/helcom.ipynb 155 class AddMeasurementNoteCB(Callback): "Record measurement notes by adding a 'measurenote' column to DataFrames." def __init__(self, @@ -490,7 +490,7 @@ def __call__(self, tfm: Transformer): if 'METHOD' in df.columns: df['measurementnote'] = df['METHOD'].map(lambda x: lut.get(x, 0)) -# %% ../../nbs/handlers/helcom.ipynb 157 +# %% ../../nbs/handlers/helcom.ipynb 159 class RemapStationIdCB(Callback): "Remap Station ID to MARIS format." def __init__(self): @@ -501,7 +501,7 @@ def __call__(self, tfm: Transformer): for grp in tfm.dfs.keys(): tfm.dfs[grp]['station'] = tfm.dfs[grp]['STATION'] -# %% ../../nbs/handlers/helcom.ipynb 161 +# %% ../../nbs/handlers/helcom.ipynb 163 class RemapSedSliceTopBottomCB(Callback): "Remap Sediment slice top and bottom to MARIS format." def __call__(self, tfm: Transformer): @@ -509,7 +509,7 @@ def __call__(self, tfm: Transformer): tfm.dfs['sediment']['top'] = tfm.dfs['sediment']['UPPSLI'] tfm.dfs['sediment']['bottom'] = tfm.dfs['sediment']['LOWSLI'] -# %% ../../nbs/handlers/helcom.ipynb 166 +# %% ../../nbs/handlers/helcom.ipynb 168 class LookupDryWetRatio(Callback): "Lookup dry-wet ratio and format for MARIS." 
def __call__(self, tfm: Transformer): @@ -525,7 +525,7 @@ def _apply_dry_wet_ratio(self, df: pd.DataFrame) -> None: df.loc[df['dry_wet_ratio'] == 0, 'dry_wet_ratio'] = np.NaN -# %% ../../nbs/handlers/helcom.ipynb 172 +# %% ../../nbs/handlers/helcom.ipynb 174 class ParseCoordinates(Callback): """ Get geographical coordinates from columns expressed in degrees decimal format @@ -575,7 +575,7 @@ def _safe_convert(self, value) -> str: print(f"Error converting value {value}: {e}") return value -# %% ../../nbs/handlers/helcom.ipynb 183 +# %% ../../nbs/handlers/helcom.ipynb 185 def get_common_rules( vars: dict, # Configuration dictionary encoding_type: str # Encoding type (`netcdf` or `openrefine`) @@ -615,7 +615,7 @@ def get_common_rules( return common -# %% ../../nbs/handlers/helcom.ipynb 184 +# %% ../../nbs/handlers/helcom.ipynb 186 def get_specific_rules( vars: dict, # Configuration dictionary encoding_type: str # Encoding type (`netcdf` or `openrefine`) @@ -654,7 +654,7 @@ def get_specific_rules( } } -# %% ../../nbs/handlers/helcom.ipynb 185 +# %% ../../nbs/handlers/helcom.ipynb 187 def get_renaming_rules( encoding_type: str = 'netcdf' # Encoding type (`netcdf` or `openrefine`) ) -> dict: # Renaming rules for NetCDF and OpenRefine. @@ -674,7 +674,7 @@ def get_renaming_rules( return dict(rules) -# %% ../../nbs/handlers/helcom.ipynb 186 +# %% ../../nbs/handlers/helcom.ipynb 188 class SelectAndRenameColumnCB(Callback): "Select and rename columns in a DataFrame based on renaming rules for a specified encoding type." 
def __init__(self, @@ -745,7 +745,7 @@ def _apply_renaming(self, return df, not_found_keys -# %% ../../nbs/handlers/helcom.ipynb 195 +# %% ../../nbs/handlers/helcom.ipynb 197 kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides', 'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure', 'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments', @@ -757,7 +757,7 @@ def _apply_renaming(self, 'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans', 'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)'] -# %% ../../nbs/handlers/helcom.ipynb 196 +# %% ../../nbs/handlers/helcom.ipynb 198 def get_attrs( tfm: Transformer, # Transformer object zotero_key: str, # Zotero dataset record key @@ -773,7 +773,7 @@ def get_attrs( KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)) ])() -# %% ../../nbs/handlers/helcom.ipynb 198 +# %% ../../nbs/handlers/helcom.ipynb 200 def enums_xtra( tfm: Transformer, # Transformer object vars: list # List of variables to extract from the transformer @@ -787,7 +787,7 @@ def enums_xtra( xtras[f'{var}_t'] = enums.filter(f'{var}_t', unique_vals) return xtras -# %% ../../nbs/handlers/helcom.ipynb 200 +# %% ../../nbs/handlers/helcom.ipynb 202 def encode( fname_in: str, # Input file name fname_out_nc: str, # Output file name diff --git a/nbs/api/configs.ipynb b/nbs/api/configs.ipynb index 619d9f6..dc56583 100644 --- a/nbs/api/configs.ipynb +++ b/nbs/api/configs.ipynb @@ -1272,6 +1272,10 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.15" } }, "nbformat": 4, diff --git a/nbs/handlers/_ospar.ipynb b/nbs/handlers/_ospar.ipynb index 52d0512..aa9c3e4 100644 --- a/nbs/handlers/_ospar.ipynb +++ b/nbs/handlers/_ospar.ipynb @@ -67,7 +67,16 @@ "execution_count": null, "id": "f69f5756", "metadata": {}, - 
"outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "#| hide\n", "%load_ext autoreload\n", @@ -597,92 +606,92 @@ " \n", " 0\n", " 0\n", - " 210Pb\n", + " NaN\n", " \n", " \n", " 1\n", " 1\n", - " NaN\n", + " 238Pu\n", " \n", " \n", " 2\n", " 2\n", - " 238Pu\n", + " Cs-134\n", " \n", " \n", " 3\n", " 3\n", - " 239,240Pu\n", + " 3H\n", " \n", " \n", " 4\n", " 4\n", - " 239, 240 Pu\n", + " 228Ra\n", " \n", " \n", " 5\n", " 5\n", - " 137Cs\n", + " 210Po\n", " \n", " \n", " 6\n", " 6\n", - " 226Ra\n", + " 210Pb\n", " \n", " \n", " 7\n", " 7\n", - " 99Tc\n", + " Cs-137\n", " \n", " \n", " 8\n", " 8\n", - " RA-226\n", + " 226Ra\n", " \n", " \n", " 9\n", " 9\n", - " CS-134\n", + " RA-228\n", " \n", " \n", " 10\n", " 10\n", - " Cs-137\n", + " 99Tc\n", " \n", " \n", " 11\n", " 11\n", - " Cs-134\n", + " 137Cs\n", " \n", " \n", " 12\n", " 12\n", - " 228Ra\n", + " 239,240Pu\n", " \n", " \n", " 13\n", " 13\n", - " CS-137\n", + " 239, 240 Pu\n", " \n", " \n", " 14\n", " 14\n", - " RA-228\n", + " 241Am\n", " \n", " \n", " 15\n", " 15\n", - " 3H\n", + " CS-137\n", " \n", " \n", " 16\n", " 16\n", - " 210Po\n", + " RA-226\n", " \n", " \n", " 17\n", " 17\n", - " 241Am\n", + " CS-134\n", " \n", " \n", "\n", @@ -690,24 +699,24 @@ ], "text/plain": [ " index value\n", - "0 0 210Pb\n", - "1 1 NaN\n", - "2 2 238Pu\n", - "3 3 239,240Pu\n", - "4 4 239, 240 Pu\n", - "5 5 137Cs\n", - "6 6 226Ra\n", - "7 7 99Tc\n", - "8 8 RA-226\n", - "9 9 CS-134\n", - "10 10 Cs-137\n", - "11 11 Cs-134\n", - "12 12 228Ra\n", - "13 13 CS-137\n", - "14 14 RA-228\n", - "15 15 3H\n", - "16 16 210Po\n", - "17 17 241Am" + "0 0 NaN\n", + "1 1 238Pu\n", + "2 2 Cs-134\n", + "3 3 3H\n", + "4 4 228Ra\n", + "5 5 210Po\n", + "6 6 210Pb\n", + "7 7 Cs-137\n", + "8 8 226Ra\n", + "9 9 RA-228\n", + "10 10 99Tc\n", + "11 11 137Cs\n", + "12 12 239,240Pu\n", + "13 13 
239, 240 Pu\n", + "14 14 241Am\n", + "15 15 CS-137\n", + "16 16 RA-226\n", + "17 17 CS-134" ] }, "execution_count": null, @@ -771,7 +780,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 18/18 [00:00<00:00, 29.25it/s]\n" + "Processing: 100%|██████████| 18/18 [00:00<00:00, 37.85it/s]\n" ] }, { @@ -820,9 +829,9 @@ " 6\n", " \n", " \n", - " 210Pb\n", - " ag106m\n", - " 210Pb\n", + " 241Am\n", + " pu241\n", + " 241Am\n", " 4\n", " \n", " \n", @@ -838,9 +847,9 @@ " 4\n", " \n", " \n", - " 241Am\n", - " pu241\n", - " 241Am\n", + " 210Pb\n", + " ag106m\n", + " 210Pb\n", " 4\n", " \n", " \n", @@ -856,18 +865,18 @@ " 4\n", " \n", " \n", - " 99Tc\n", - " tu\n", - " 99Tc\n", - " 3\n", - " \n", - " \n", " 238Pu\n", " u238\n", " 238Pu\n", " 3\n", " \n", " \n", + " 99Tc\n", + " tu\n", + " 99Tc\n", + " 3\n", + " \n", + " \n", " 3H\n", " h3\n", " 3H\n", @@ -880,9 +889,15 @@ " 1\n", " \n", " \n", - " Cs-137\n", + " CS-137\n", " cs137\n", - " Cs-137\n", + " CS-137\n", + " 1\n", + " \n", + " \n", + " RA-228\n", + " ra228\n", + " RA-228\n", " 1\n", " \n", " \n", @@ -892,15 +907,9 @@ " 1\n", " \n", " \n", - " CS-137\n", + " Cs-137\n", " cs137\n", - " CS-137\n", - " 1\n", - " \n", - " \n", - " RA-228\n", - " ra228\n", - " RA-228\n", + " Cs-137\n", " 1\n", " \n", " \n", @@ -918,20 +927,20 @@ "source_key \n", "239, 240 Pu pu240 239, 240 Pu 8\n", "239,240Pu pu240 239,240Pu 6\n", - "210Pb ag106m 210Pb 4\n", + "241Am pu241 241Am 4\n", "228Ra u238 228Ra 4\n", "210Po ag106m 210Po 4\n", - "241Am pu241 241Am 4\n", + "210Pb ag106m 210Pb 4\n", "226Ra u235 226Ra 4\n", "137Cs h3 137Cs 4\n", - "99Tc tu 99Tc 3\n", "238Pu u238 238Pu 3\n", + "99Tc tu 99Tc 3\n", "3H h3 3H 2\n", "RA-226 ra226 RA-226 1\n", - "Cs-137 cs137 Cs-137 1\n", - "Cs-134 cs134 Cs-134 1\n", "CS-137 cs137 CS-137 1\n", "RA-228 ra228 RA-228 1\n", + "Cs-134 cs134 Cs-134 1\n", + "Cs-137 cs137 Cs-137 1\n", "CS-134 cs134 CS-134 1" ] }, @@ -1001,14 +1010,7 @@ "name": "stderr", "output_type": "stream", 
"text": [ - "Processing: 0%| | 0/18 [00:00\n", " 0\n", " 0\n", - " Solea solea (S.vulgaris)\n", + " Cerastoderma edule\n", " \n", " \n", " 1\n", " 1\n", - " BROSME BROSME\n", + " NaN\n", " \n", " \n", " 2\n", " 2\n", - " NaN\n", + " Gadus morhua\n", " \n", " \n", " 3\n", " 3\n", - " Argentina silus\n", + " PLEURONECTES PLATESSA\n", " \n", " \n", " 4\n", " 4\n", - " Lumpenus lampretaeformis\n", + " Coryphaenoides rupestris\n", " \n", " \n", " ...\n", @@ -4071,27 +4073,27 @@ " \n", " 151\n", " 151\n", - " SEBASTES MARINUS\n", + " Argentina silus\n", " \n", " \n", " 152\n", " 152\n", - " Thunnus thynnus\n", + " unknown\n", " \n", " \n", " 153\n", " 153\n", - " Pleuronectes platessa\n", + " GLYPTOCEPHALUS CYNOGLOSSUS\n", " \n", " \n", " 154\n", " 154\n", - " Hippoglossoides platessoides\n", + " SEBASTES MARINUS\n", " \n", " \n", " 155\n", " 155\n", - " Gaidropsarus argenteus\n", + " CERASTODERMA (CARDIUM) EDULE\n", " \n", " \n", "\n", @@ -4100,17 +4102,17 @@ ], "text/plain": [ " index value\n", - "0 0 Solea solea (S.vulgaris)\n", - "1 1 BROSME BROSME\n", - "2 2 NaN\n", - "3 3 Argentina silus\n", - "4 4 Lumpenus lampretaeformis\n", + "0 0 Cerastoderma edule\n", + "1 1 NaN\n", + "2 2 Gadus morhua\n", + "3 3 PLEURONECTES PLATESSA\n", + "4 4 Coryphaenoides rupestris\n", ".. ... 
...\n", - "151 151 SEBASTES MARINUS\n", - "152 152 Thunnus thynnus\n", - "153 153 Pleuronectes platessa\n", - "154 154 Hippoglossoides platessoides\n", - "155 155 Gaidropsarus argenteus\n", + "151 151 Argentina silus\n", + "152 152 unknown\n", + "153 153 GLYPTOCEPHALUS CYNOGLOSSUS\n", + "154 154 SEBASTES MARINUS\n", + "155 155 CERASTODERMA (CARDIUM) EDULE\n", "\n", "[156 rows x 2 columns]" ] @@ -4160,14 +4162,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 0%| | 0/156 [00:0012\n", " \n", " \n", - " Cerastoderma (Cardium) Edule\n", + " CERASTODERMA (CARDIUM) EDULE\n", " Cerastoderma edule\n", - " Cerastoderma (Cardium) Edule\n", + " CERASTODERMA (CARDIUM) EDULE\n", " 10\n", " \n", " \n", - " CERASTODERMA (CARDIUM) EDULE\n", + " Cerastoderma (Cardium) Edule\n", " Cerastoderma edule\n", - " CERASTODERMA (CARDIUM) EDULE\n", + " Cerastoderma (Cardium) Edule\n", " 10\n", " \n", " \n", + " DICENTRARCHUS (MORONE) LABRAX\n", + " Dicentrarchus labrax\n", + " DICENTRARCHUS (MORONE) LABRAX\n", + " 9\n", + " \n", + " \n", " NUCELLA LAPILLUS\n", " Mugil cephalus\n", " NUCELLA LAPILLUS\n", @@ -4252,10 +4253,10 @@ " 9\n", " \n", " \n", - " DICENTRARCHUS (MORONE) LABRAX\n", - " Dicentrarchus labrax\n", - " DICENTRARCHUS (MORONE) LABRAX\n", - " 9\n", + " Pleuronectiformes [order]\n", + " Pleuronectiformes\n", + " Pleuronectiformes [order]\n", + " 8\n", " \n", " \n", " RAJIDAE/BATOIDEA\n", @@ -4264,18 +4265,24 @@ " 8\n", " \n", " \n", - " Pleuronectiformes [order]\n", - " Pleuronectiformes\n", - " Pleuronectiformes [order]\n", - " 8\n", - " \n", - " \n", " PALMARIA PALMATA\n", " Alaria marginata\n", " PALMARIA PALMATA\n", " 7\n", " \n", " \n", + " Flatfish\n", + " Lambia\n", + " Flatfish\n", + " 5\n", + " \n", + " \n", + " FUCUS SPP.\n", + " Fucus\n", + " FUCUS SPP.\n", + " 5\n", + " \n", + " \n", " unknown\n", " Plankton\n", " unknown\n", @@ -4294,33 +4301,21 @@ " 5\n", " \n", " \n", - " Flatfish\n", - " Lambia\n", - " Flatfish\n", - " 5\n", - " \n", - " 
\n", " Rhodymenia spp.\n", " Rhodymenia\n", " Rhodymenia spp.\n", " 5\n", " \n", " \n", - " FUCUS SPP.\n", - " Fucus\n", - " FUCUS SPP.\n", - " 5\n", - " \n", - " \n", " Sepia spp.\n", " Sepia\n", " Sepia spp.\n", " 5\n", " \n", " \n", - " RHODYMENIA spp\n", - " Rhodymenia\n", - " RHODYMENIA spp\n", + " Gadus sp.\n", + " Gadus\n", + " Gadus sp.\n", " 4\n", " \n", " \n", @@ -4330,9 +4325,9 @@ " 4\n", " \n", " \n", - " FUCUS spp\n", - " Fucus\n", - " FUCUS spp\n", + " Thunnus sp.\n", + " Thunnus\n", + " Thunnus sp.\n", " 4\n", " \n", " \n", @@ -4342,21 +4337,21 @@ " 4\n", " \n", " \n", - " Fucus sp.\n", + " FUCUS spp\n", " Fucus\n", - " Fucus sp.\n", + " FUCUS spp\n", " 4\n", " \n", " \n", - " Thunnus sp.\n", - " Thunnus\n", - " Thunnus sp.\n", + " RHODYMENIA spp\n", + " Rhodymenia\n", + " RHODYMENIA spp\n", " 4\n", " \n", " \n", - " Gadus sp.\n", - " Gadus\n", - " Gadus sp.\n", + " Fucus sp.\n", + " Fucus\n", + " Fucus sp.\n", " 4\n", " \n", " \n", @@ -4366,18 +4361,18 @@ " 3\n", " \n", " \n", - " PLUERONECTES PLATESSA\n", - " Pleuronectes platessa\n", - " PLUERONECTES PLATESSA\n", - " 2\n", - " \n", - " \n", " Gaidropsarus argenteus\n", " Gaidropsarus argentatus\n", " Gaidropsarus argenteus\n", " 2\n", " \n", " \n", + " PLUERONECTES PLATESSA\n", + " Pleuronectes platessa\n", + " PLUERONECTES PLATESSA\n", + " 2\n", + " \n", + " \n", " Sebastes vivipares\n", " Sebastes viviparus\n", " Sebastes vivipares\n", @@ -4400,31 +4395,31 @@ "Mixture of green, red and brown algae Mercenaria mercenaria \n", "Solea solea (S.vulgaris) Loligo vulgaris \n", "SOLEA SOLEA (S.VULGARIS) Loligo vulgaris \n", - "Cerastoderma (Cardium) Edule Cerastoderma edule \n", "CERASTODERMA (CARDIUM) EDULE Cerastoderma edule \n", + "Cerastoderma (Cardium) Edule Cerastoderma edule \n", + "DICENTRARCHUS (MORONE) LABRAX Dicentrarchus labrax \n", "NUCELLA LAPILLUS Mugil cephalus \n", "MONODONTA LINEATA Ophiothrix lineata \n", - "DICENTRARCHUS (MORONE) LABRAX Dicentrarchus labrax \n", - "RAJIDAE/BATOIDEA 
Batoidea \n", "Pleuronectiformes [order] Pleuronectiformes \n", + "RAJIDAE/BATOIDEA Batoidea \n", "PALMARIA PALMATA Alaria marginata \n", + "Flatfish Lambia \n", + "FUCUS SPP. Fucus \n", "unknown Plankton \n", "Unknown Plankton \n", "RAJA DIPTURUS BATIS Dipturus batis \n", - "Flatfish Lambia \n", "Rhodymenia spp. Rhodymenia \n", - "FUCUS SPP. Fucus \n", "Sepia spp. Sepia \n", - "RHODYMENIA spp Rhodymenia \n", + "Gadus sp. Gadus \n", "Tapes sp. Tapes \n", - "FUCUS spp Fucus \n", + "Thunnus sp. Thunnus \n", "Patella sp. Patella aspera \n", + "FUCUS spp Fucus \n", + "RHODYMENIA spp Rhodymenia \n", "Fucus sp. Fucus \n", - "Thunnus sp. Thunnus \n", - "Gadus sp. Gadus \n", "PECTINIDAE Buccinidae \n", - "PLUERONECTES PLATESSA Pleuronectes platessa \n", "Gaidropsarus argenteus Gaidropsarus argentatus \n", + "PLUERONECTES PLATESSA Pleuronectes platessa \n", "Sebastes vivipares Sebastes viviparus \n", "ASCOPHYLLUN NODOSUM Ascophyllum nodosum \n", "\n", @@ -4434,31 +4429,31 @@ "Mixture of green, red and brown algae Mixture of green, red and brown algae \n", "Solea solea (S.vulgaris) Solea solea (S.vulgaris) \n", "SOLEA SOLEA (S.VULGARIS) SOLEA SOLEA (S.VULGARIS) \n", - "Cerastoderma (Cardium) Edule Cerastoderma (Cardium) Edule \n", "CERASTODERMA (CARDIUM) EDULE CERASTODERMA (CARDIUM) EDULE \n", + "Cerastoderma (Cardium) Edule Cerastoderma (Cardium) Edule \n", + "DICENTRARCHUS (MORONE) LABRAX DICENTRARCHUS (MORONE) LABRAX \n", "NUCELLA LAPILLUS NUCELLA LAPILLUS \n", "MONODONTA LINEATA MONODONTA LINEATA \n", - "DICENTRARCHUS (MORONE) LABRAX DICENTRARCHUS (MORONE) LABRAX \n", - "RAJIDAE/BATOIDEA RAJIDAE/BATOIDEA \n", "Pleuronectiformes [order] Pleuronectiformes [order] \n", + "RAJIDAE/BATOIDEA RAJIDAE/BATOIDEA \n", "PALMARIA PALMATA PALMARIA PALMATA \n", + "Flatfish Flatfish \n", + "FUCUS SPP. FUCUS SPP. \n", "unknown unknown \n", "Unknown Unknown \n", "RAJA DIPTURUS BATIS RAJA DIPTURUS BATIS \n", - "Flatfish Flatfish \n", "Rhodymenia spp. Rhodymenia spp. \n", - "FUCUS SPP. 
FUCUS SPP. \n", "Sepia spp. Sepia spp. \n", - "RHODYMENIA spp RHODYMENIA spp \n", + "Gadus sp. Gadus sp. \n", "Tapes sp. Tapes sp. \n", - "FUCUS spp FUCUS spp \n", + "Thunnus sp. Thunnus sp. \n", "Patella sp. Patella sp. \n", + "FUCUS spp FUCUS spp \n", + "RHODYMENIA spp RHODYMENIA spp \n", "Fucus sp. Fucus sp. \n", - "Thunnus sp. Thunnus sp. \n", - "Gadus sp. Gadus sp. \n", "PECTINIDAE PECTINIDAE \n", - "PLUERONECTES PLATESSA PLUERONECTES PLATESSA \n", "Gaidropsarus argenteus Gaidropsarus argenteus \n", + "PLUERONECTES PLATESSA PLUERONECTES PLATESSA \n", "Sebastes vivipares Sebastes vivipares \n", "ASCOPHYLLUN NODOSUM ASCOPHYLLUN NODOSUM \n", "\n", @@ -4468,31 +4463,31 @@ "Mixture of green, red and brown algae 26 \n", "Solea solea (S.vulgaris) 12 \n", "SOLEA SOLEA (S.VULGARIS) 12 \n", - "Cerastoderma (Cardium) Edule 10 \n", "CERASTODERMA (CARDIUM) EDULE 10 \n", + "Cerastoderma (Cardium) Edule 10 \n", + "DICENTRARCHUS (MORONE) LABRAX 9 \n", "NUCELLA LAPILLUS 9 \n", "MONODONTA LINEATA 9 \n", - "DICENTRARCHUS (MORONE) LABRAX 9 \n", - "RAJIDAE/BATOIDEA 8 \n", "Pleuronectiformes [order] 8 \n", + "RAJIDAE/BATOIDEA 8 \n", "PALMARIA PALMATA 7 \n", + "Flatfish 5 \n", + "FUCUS SPP. 5 \n", "unknown 5 \n", "Unknown 5 \n", "RAJA DIPTURUS BATIS 5 \n", - "Flatfish 5 \n", "Rhodymenia spp. 5 \n", - "FUCUS SPP. 5 \n", "Sepia spp. 5 \n", - "RHODYMENIA spp 4 \n", + "Gadus sp. 4 \n", "Tapes sp. 4 \n", - "FUCUS spp 4 \n", + "Thunnus sp. 4 \n", "Patella sp. 4 \n", + "FUCUS spp 4 \n", + "RHODYMENIA spp 4 \n", "Fucus sp. 4 \n", - "Thunnus sp. 4 \n", - "Gadus sp. 
4 \n", "PECTINIDAE 3 \n", - "PLUERONECTES PLATESSA 2 \n", "Gaidropsarus argenteus 2 \n", + "PLUERONECTES PLATESSA 2 \n", "Sebastes vivipares 1 \n", "ASCOPHYLLUN NODOSUM 1 " ] @@ -4512,7 +4507,7 @@ "id": "b9a388bf", "metadata": {}, "source": [ - "Below, we fix some of the entries that are not properly matched by the `Remapper` object:" + "Below, we fixthe entries that are not properly matched by the `Remapper` object:" ] }, { @@ -4544,7 +4539,7 @@ "id": "aacedba9", "metadata": {}, "source": [ - "And give it an another try:" + "We now attempt remapping again, incorporating the `fixes_biota_species` dictionary:" ] }, { @@ -4557,14 +4552,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 1%| | 1/156 [00:00<00:29, 5.25it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing: 100%|██████████| 156/156 [00:28<00:00, 5.39it/s]\n" + "Processing: 100%|██████████| 156/156 [00:28<00:00, 5.45it/s]\n" ] }, { @@ -4601,15 +4589,15 @@ " \n", " \n", " \n", - " Cerastoderma (Cardium) Edule\n", + " CERASTODERMA (CARDIUM) EDULE\n", " Cerastoderma edule\n", - " Cerastoderma (Cardium) Edule\n", + " CERASTODERMA (CARDIUM) EDULE\n", " 10\n", " \n", " \n", - " CERASTODERMA (CARDIUM) EDULE\n", + " Cerastoderma (Cardium) Edule\n", " Cerastoderma edule\n", - " CERASTODERMA (CARDIUM) EDULE\n", + " Cerastoderma (Cardium) Edule\n", " 10\n", " \n", " \n", @@ -4631,9 +4619,9 @@ " 5\n", " \n", " \n", - " FUCUS SPP.\n", - " Fucus\n", - " FUCUS SPP.\n", + " Rhodymenia spp.\n", + " Rhodymenia\n", + " Rhodymenia spp.\n", " 5\n", " \n", " \n", @@ -4643,15 +4631,15 @@ " 5\n", " \n", " \n", - " Rhodymenia spp.\n", - " Rhodymenia\n", - " Rhodymenia spp.\n", + " FUCUS SPP.\n", + " Fucus\n", + " FUCUS SPP.\n", " 5\n", " \n", " \n", - " Thunnus sp.\n", - " Thunnus\n", - " Thunnus sp.\n", + " Tapes sp.\n", + " Tapes\n", + " Tapes sp.\n", " 4\n", " \n", " \n", @@ -4661,15 +4649,15 @@ " 4\n", " \n", " \n", - " FUCUS spp\n", - " Fucus\n", - " FUCUS spp\n", + 
" RHODYMENIA spp\n", + " Rhodymenia\n", + " RHODYMENIA spp\n", " 4\n", " \n", " \n", - " Tapes sp.\n", - " Tapes\n", - " Tapes sp.\n", + " Gadus sp.\n", + " Gadus\n", + " Gadus sp.\n", " 4\n", " \n", " \n", @@ -4679,30 +4667,30 @@ " 4\n", " \n", " \n", - " RHODYMENIA spp\n", - " Rhodymenia\n", - " RHODYMENIA spp\n", + " Thunnus sp.\n", + " Thunnus\n", + " Thunnus sp.\n", " 4\n", " \n", " \n", - " Gadus sp.\n", - " Gadus\n", - " Gadus sp.\n", + " FUCUS spp\n", + " Fucus\n", + " FUCUS spp\n", " 4\n", " \n", " \n", - " PLUERONECTES PLATESSA\n", - " Pleuronectes platessa\n", - " PLUERONECTES PLATESSA\n", - " 2\n", - " \n", - " \n", " Gaidropsarus argenteus\n", " Gaidropsarus argentatus\n", " Gaidropsarus argenteus\n", " 2\n", " \n", " \n", + " PLUERONECTES PLATESSA\n", + " Pleuronectes platessa\n", + " PLUERONECTES PLATESSA\n", + " 2\n", + " \n", + " \n", " Sebastes vivipares\n", " Sebastes viviparus\n", " Sebastes vivipares\n", @@ -4721,45 +4709,45 @@ "text/plain": [ " matched_maris_name \\\n", "source_key \n", - "Cerastoderma (Cardium) Edule Cerastoderma edule \n", "CERASTODERMA (CARDIUM) EDULE Cerastoderma edule \n", + "Cerastoderma (Cardium) Edule Cerastoderma edule \n", "DICENTRARCHUS (MORONE) LABRAX Dicentrarchus labrax \n", "Pleuronectiformes [order] Pleuronectiformes \n", "Sepia spp. Sepia \n", - "FUCUS SPP. Fucus \n", - "RAJA DIPTURUS BATIS Dipturus batis \n", "Rhodymenia spp. Rhodymenia \n", - "Thunnus sp. Thunnus \n", - "Patella sp. Patella aspera \n", - "FUCUS spp Fucus \n", + "RAJA DIPTURUS BATIS Dipturus batis \n", + "FUCUS SPP. Fucus \n", "Tapes sp. Tapes \n", - "Fucus sp. Fucus \n", + "Patella sp. Patella aspera \n", "RHODYMENIA spp Rhodymenia \n", "Gadus sp. Gadus \n", - "PLUERONECTES PLATESSA Pleuronectes platessa \n", + "Fucus sp. Fucus \n", + "Thunnus sp. 
Thunnus \n", + "FUCUS spp Fucus \n", "Gaidropsarus argenteus Gaidropsarus argentatus \n", + "PLUERONECTES PLATESSA Pleuronectes platessa \n", "Sebastes vivipares Sebastes viviparus \n", "ASCOPHYLLUN NODOSUM Ascophyllum nodosum \n", "\n", " source_name match_score \n", "source_key \n", - "Cerastoderma (Cardium) Edule Cerastoderma (Cardium) Edule 10 \n", "CERASTODERMA (CARDIUM) EDULE CERASTODERMA (CARDIUM) EDULE 10 \n", + "Cerastoderma (Cardium) Edule Cerastoderma (Cardium) Edule 10 \n", "DICENTRARCHUS (MORONE) LABRAX DICENTRARCHUS (MORONE) LABRAX 9 \n", "Pleuronectiformes [order] Pleuronectiformes [order] 8 \n", "Sepia spp. Sepia spp. 5 \n", - "FUCUS SPP. FUCUS SPP. 5 \n", - "RAJA DIPTURUS BATIS RAJA DIPTURUS BATIS 5 \n", "Rhodymenia spp. Rhodymenia spp. 5 \n", - "Thunnus sp. Thunnus sp. 4 \n", - "Patella sp. Patella sp. 4 \n", - "FUCUS spp FUCUS spp 4 \n", + "RAJA DIPTURUS BATIS RAJA DIPTURUS BATIS 5 \n", + "FUCUS SPP. FUCUS SPP. 5 \n", "Tapes sp. Tapes sp. 4 \n", - "Fucus sp. Fucus sp. 4 \n", + "Patella sp. Patella sp. 4 \n", "RHODYMENIA spp RHODYMENIA spp 4 \n", "Gadus sp. Gadus sp. 4 \n", - "PLUERONECTES PLATESSA PLUERONECTES PLATESSA 2 \n", + "Fucus sp. Fucus sp. 4 \n", + "Thunnus sp. Thunnus sp. 4 \n", + "FUCUS spp FUCUS spp 4 \n", "Gaidropsarus argenteus Gaidropsarus argenteus 2 \n", + "PLUERONECTES PLATESSA PLUERONECTES PLATESSA 2 \n", "Sebastes vivipares Sebastes vivipares 1 \n", "ASCOPHYLLUN NODOSUM ASCOPHYLLUN NODOSUM 1 " ] @@ -4780,9 +4768,135 @@ "id": "24b4e864", "metadata": {}, "source": [ - "Visual inspection of the remaining imperfectly matched entries appears acceptable to proceed. \n", + "Visual inspection of the remaining imperfectly matched entries appears acceptable. We can now proceed with the final remapping process:\n", "\n", - "We can now use the generic `RemapCB` callback to perform the remapping." + "1. 
Create Remapper Lambda Function:\n", + "\n", + " We'll define a lambda function that instantiates a Remapper object and returns its corrected lookup table.\n", + "\n", + "2. Apply RemapCB: \n", + "\n", + " Using the generic `RemapCB` callback, we'll perform the actual remapping of the `species` column using our lambda function.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c04dee2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexvalue
00Cerastoderma edule
11NaN
22Gadus morhua
33PLEURONECTES PLATESSA
44Coryphaenoides rupestris
.........
151151Argentina silus
152152unknown
153153GLYPTOCEPHALUS CYNOGLOSSUS
154154SEBASTES MARINUS
155155CERASTODERMA (CARDIUM) EDULE
\n", + "

156 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " index value\n", + "0 0 Cerastoderma edule\n", + "1 1 NaN\n", + "2 2 Gadus morhua\n", + "3 3 PLEURONECTES PLATESSA\n", + "4 4 Coryphaenoides rupestris\n", + ".. ... ...\n", + "151 151 Argentina silus\n", + "152 152 unknown\n", + "153 153 GLYPTOCEPHALUS CYNOGLOSSUS\n", + "154 154 SEBASTES MARINUS\n", + "155 155 CERASTODERMA (CARDIUM) EDULE\n", + "\n", + "[156 rows x 2 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "provider_lut_df=get_unique_across_dfs(dfs, col_name='Species', as_df=True)\n", + "provider_lut_df" ] }, { @@ -4802,6 +4916,15 @@ " fname_cache='species_ospar.pkl').generate_lookup_table(fixes=fixes_biota_species, as_df=False, overwrite=False)" ] }, + { + "cell_type": "markdown", + "id": "c9ccf10e", + "metadata": {}, + "source": [ + "Putting it all together, we now apply the `RemapCB` to our data. This process results in the addition of a `species` column to our `biota` dataframe, containing standardized species IDs.\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -4964,227 +5087,227 @@ " \n", " 0\n", " 0\n", - " Whole plant Seaweed\n", + " Whole without head FISH\n", " \n", " \n", " 1\n", " 1\n", - " WHOLE ANIMAL FISH\n", + " LIVER Fish\n", " \n", " \n", " 2\n", " 2\n", - " Whole animal Molluscs\n", + " Flesh Fish\n", " \n", " \n", " 3\n", " 3\n", - " FLESH WITHOUT BONES fish\n", + " FLESH WITHOUT BONES Fish\n", " \n", " \n", " 4\n", " 4\n", - " Cod medallion FISH\n", + " HEAD FISH\n", " \n", " \n", " 5\n", " 5\n", - " SOFT PARTS MOLLUSCS\n", + " Whole FISH\n", " \n", " \n", " 6\n", " 6\n", - " Whole fish Fish\n", + " Whole animal Molluscs\n", " \n", " \n", " 7\n", " 7\n", - " Muscle FISH\n", + " SOFT PARTS MOLLUSCS\n", " \n", " \n", " 8\n", " 8\n", - " FLESH WITHOUT BONES FISH\n", + " SOFT PARTS Molluscs\n", " \n", " \n", " 9\n", " 9\n", - " FLESH Fish\n", + " UNKNOWN Fish\n", " \n", " \n", " 10\n", " 10\n", - " WHOLE FISH 
Fish\n", + " GROWING TIPS Seaweed\n", " \n", " \n", " 11\n", " 11\n", - " Whole animal Fish\n", + " Whole plant Seaweed\n", " \n", " \n", " 12\n", " 12\n", - " Flesh Fish\n", + " Soft parts Molluscs\n", " \n", " \n", " 13\n", " 13\n", - " GROWING TIPS Seaweed\n", + " MUSCLE Fish\n", " \n", " \n", " 14\n", " 14\n", - " Whole FISH\n", + " Flesh without bones Fish\n", " \n", " \n", " 15\n", " 15\n", - " WHOLE PLANT SEAWEED\n", + " WHOLE ANIMAL Fish\n", " \n", " \n", " 16\n", " 16\n", - " FLESH WITHOUT BONE FISH\n", + " UNKNOWN FISH\n", " \n", " \n", " 17\n", " 17\n", - " WHOLE ANIMAL Molluscs\n", + " FLESH WITHOUT BONES Molluscs\n", " \n", " \n", " 18\n", " 18\n", - " FLESH WITHOUT BONES Fish\n", + " Soft parts Fish\n", " \n", " \n", " 19\n", " 19\n", - " HEAD FISH\n", + " WHOLE Seaweed\n", " \n", " \n", " 20\n", " 20\n", - " UNKNOWN FISH\n", + " WHOLE PLANT SEAWEED\n", " \n", " \n", " 21\n", " 21\n", - " SOFT PARTS molluscs\n", + " whole plant Seaweed\n", " \n", " \n", " 22\n", " 22\n", - " UNKNOWN Fish\n", + " WHOLE Fish\n", " \n", " \n", " 23\n", " 23\n", - " WHOLE FISH FISH\n", + " WHOLE ANIMAL FISH\n", " \n", " \n", " 24\n", " 24\n", - " Whole fisk FISH\n", + " FLESH WITHOUT BONE FISH\n", " \n", " \n", " 25\n", " 25\n", - " Whole without head FISH\n", + " WHOLE ANIMAL Molluscs\n", " \n", " \n", " 26\n", " 26\n", - " Soft parts Fish\n", + " FLESH WITHOUT BONES SEAWEED\n", " \n", " \n", " 27\n", " 27\n", - " FLESH WITHOUT BONES SEAWEED\n", + " WHOLE FISH FISH\n", " \n", " \n", " 28\n", " 28\n", - " Mix of muscle and whole fish without liver FISH\n", + " WHOLE FISH Fish\n", " \n", " \n", " 29\n", " 29\n", - " LIVER Fish\n", + " Soft Parts Molluscs\n", " \n", " \n", " 30\n", " 30\n", - " Soft Parts Molluscs\n", + " Cod medallion FISH\n", " \n", " \n", " 31\n", " 31\n", - " FLESH WITHOUT BONE Fish\n", + " WHOLE PLANT seaweed\n", " \n", " \n", " 32\n", " 32\n", - " SOFT PARTS Molluscs\n", + " FLESH WITHOUT BONES FISH\n", " \n", " \n", " 33\n", " 33\n", - " WHOLE PLANT 
Seaweed\n", + " FLESH WITHOUT BONE Fish\n", " \n", " \n", " 34\n", " 34\n", - " HEAD Fish\n", + " Whole fish Fish\n", " \n", " \n", " 35\n", " 35\n", - " WHOLE Fish\n", + " Mix of muscle and whole fish without liver FISH\n", " \n", " \n", " 36\n", " 36\n", - " WHOLE Seaweed\n", + " Muscle FISH\n", " \n", " \n", " 37\n", " 37\n", - " whole plant Seaweed\n", + " HEAD Fish\n", " \n", " \n", " 38\n", " 38\n", - " WHOLE ANIMAL Fish\n", + " Whole fisk FISH\n", " \n", " \n", " 39\n", " 39\n", - " FLESH WITHOUT BONES Molluscs\n", + " WHOLE PLANT Seaweed\n", " \n", " \n", " 40\n", " 40\n", - " FLESH WITH SCALES Fish\n", + " FLESH Fish\n", " \n", " \n", " 41\n", " 41\n", - " Flesh without bones Fish\n", + " Whole animal Fish\n", " \n", " \n", " 42\n", " 42\n", - " MUSCLE Fish\n", + " FLESH WITH SCALES Fish\n", " \n", " \n", " 43\n", " 43\n", - " Soft parts Molluscs\n", + " SOFT PARTS molluscs\n", " \n", " \n", " 44\n", " 44\n", - " WHOLE PLANT seaweed\n", + " FLESH WITHOUT BONES fish\n", " \n", " \n", " 45\n", @@ -5197,51 +5320,51 @@ ], "text/plain": [ " index value\n", - "0 0 Whole plant Seaweed\n", - "1 1 WHOLE ANIMAL FISH\n", - "2 2 Whole animal Molluscs\n", - "3 3 FLESH WITHOUT BONES fish\n", - "4 4 Cod medallion FISH\n", - "5 5 SOFT PARTS MOLLUSCS\n", - "6 6 Whole fish Fish\n", - "7 7 Muscle FISH\n", - "8 8 FLESH WITHOUT BONES FISH\n", - "9 9 FLESH Fish\n", - "10 10 WHOLE FISH Fish\n", - "11 11 Whole animal Fish\n", - "12 12 Flesh Fish\n", - "13 13 GROWING TIPS Seaweed\n", - "14 14 Whole FISH\n", - "15 15 WHOLE PLANT SEAWEED\n", - "16 16 FLESH WITHOUT BONE FISH\n", - "17 17 WHOLE ANIMAL Molluscs\n", - "18 18 FLESH WITHOUT BONES Fish\n", - "19 19 HEAD FISH\n", - "20 20 UNKNOWN FISH\n", - "21 21 SOFT PARTS molluscs\n", - "22 22 UNKNOWN Fish\n", - "23 23 WHOLE FISH FISH\n", - "24 24 Whole fisk FISH\n", - "25 25 Whole without head FISH\n", - "26 26 Soft parts Fish\n", - "27 27 FLESH WITHOUT BONES SEAWEED\n", - "28 28 Mix of muscle and whole fish without liver FISH\n", - "29 
29 LIVER Fish\n", - "30 30 Soft Parts Molluscs\n", - "31 31 FLESH WITHOUT BONE Fish\n", - "32 32 SOFT PARTS Molluscs\n", - "33 33 WHOLE PLANT Seaweed\n", - "34 34 HEAD Fish\n", - "35 35 WHOLE Fish\n", - "36 36 WHOLE Seaweed\n", - "37 37 whole plant Seaweed\n", - "38 38 WHOLE ANIMAL Fish\n", - "39 39 FLESH WITHOUT BONES Molluscs\n", - "40 40 FLESH WITH SCALES Fish\n", - "41 41 Flesh without bones Fish\n", - "42 42 MUSCLE Fish\n", - "43 43 Soft parts Molluscs\n", - "44 44 WHOLE PLANT seaweed\n", + "0 0 Whole without head FISH\n", + "1 1 LIVER Fish\n", + "2 2 Flesh Fish\n", + "3 3 FLESH WITHOUT BONES Fish\n", + "4 4 HEAD FISH\n", + "5 5 Whole FISH\n", + "6 6 Whole animal Molluscs\n", + "7 7 SOFT PARTS MOLLUSCS\n", + "8 8 SOFT PARTS Molluscs\n", + "9 9 UNKNOWN Fish\n", + "10 10 GROWING TIPS Seaweed\n", + "11 11 Whole plant Seaweed\n", + "12 12 Soft parts Molluscs\n", + "13 13 MUSCLE Fish\n", + "14 14 Flesh without bones Fish\n", + "15 15 WHOLE ANIMAL Fish\n", + "16 16 UNKNOWN FISH\n", + "17 17 FLESH WITHOUT BONES Molluscs\n", + "18 18 Soft parts Fish\n", + "19 19 WHOLE Seaweed\n", + "20 20 WHOLE PLANT SEAWEED\n", + "21 21 whole plant Seaweed\n", + "22 22 WHOLE Fish\n", + "23 23 WHOLE ANIMAL FISH\n", + "24 24 FLESH WITHOUT BONE FISH\n", + "25 25 WHOLE ANIMAL Molluscs\n", + "26 26 FLESH WITHOUT BONES SEAWEED\n", + "27 27 WHOLE FISH FISH\n", + "28 28 WHOLE FISH Fish\n", + "29 29 Soft Parts Molluscs\n", + "30 30 Cod medallion FISH\n", + "31 31 WHOLE PLANT seaweed\n", + "32 32 FLESH WITHOUT BONES FISH\n", + "33 33 FLESH WITHOUT BONE Fish\n", + "34 34 Whole fish Fish\n", + "35 35 Mix of muscle and whole fish without liver FISH\n", + "36 36 Muscle FISH\n", + "37 37 HEAD Fish\n", + "38 38 Whole fisk FISH\n", + "39 39 WHOLE PLANT Seaweed\n", + "40 40 FLESH Fish\n", + "41 41 Whole animal Fish\n", + "42 42 FLESH WITH SCALES Fish\n", + "43 43 SOFT PARTS molluscs\n", + "44 44 FLESH WITHOUT BONES fish\n", "45 45 Muscle Fish" ] }, @@ -5279,7 +5402,7 @@ "name": "stderr", 
"output_type": "stream", "text": [ - "Processing: 100%|██████████| 46/46 [00:00<00:00, 102.38it/s]\n" + "Processing: 100%|██████████| 46/46 [00:00<00:00, 95.93it/s]\n" ] }, { @@ -5334,9 +5457,27 @@ " 13\n", " \n", " \n", - " SOFT PARTS Molluscs\n", + " Soft parts Molluscs\n", " Soft parts\n", - " SOFT PARTS Molluscs\n", + " Soft parts Molluscs\n", + " 9\n", + " \n", + " \n", + " Soft Parts Molluscs\n", + " Soft parts\n", + " Soft Parts Molluscs\n", + " 9\n", + " \n", + " \n", + " WHOLE FISH Fish\n", + " Whole animal\n", + " WHOLE FISH Fish\n", + " 9\n", + " \n", + " \n", + " WHOLE FISH FISH\n", + " Whole animal\n", + " WHOLE FISH FISH\n", " 9\n", " \n", " \n", @@ -5352,21 +5493,15 @@ " 9\n", " \n", " \n", - " UNKNOWN Fish\n", - " Growing tips\n", - " UNKNOWN Fish\n", + " FLESH WITHOUT BONES Molluscs\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES Molluscs\n", " 9\n", " \n", " \n", - " WHOLE FISH Fish\n", + " Whole fish Fish\n", " Whole animal\n", - " WHOLE FISH Fish\n", - " 9\n", - " \n", - " \n", - " Soft Parts Molluscs\n", - " Soft parts\n", - " Soft Parts Molluscs\n", + " Whole fish Fish\n", " 9\n", " \n", " \n", @@ -5382,57 +5517,45 @@ " 9\n", " \n", " \n", - " SOFT PARTS MOLLUSCS\n", - " Soft parts\n", - " SOFT PARTS MOLLUSCS\n", - " 9\n", - " \n", - " \n", - " FLESH WITHOUT BONES Molluscs\n", - " Flesh without bones\n", - " FLESH WITHOUT BONES Molluscs\n", - " 9\n", - " \n", - " \n", - " Whole fish Fish\n", + " Whole animal Molluscs\n", " Whole animal\n", - " Whole fish Fish\n", + " Whole animal Molluscs\n", " 9\n", " \n", " \n", - " Whole animal Molluscs\n", - " Whole animal\n", - " Whole animal Molluscs\n", + " UNKNOWN Fish\n", + " Growing tips\n", + " UNKNOWN Fish\n", " 9\n", " \n", " \n", - " Soft parts Molluscs\n", + " SOFT PARTS Molluscs\n", " Soft parts\n", - " Soft parts Molluscs\n", + " SOFT PARTS Molluscs\n", " 9\n", " \n", " \n", - " WHOLE FISH FISH\n", - " Whole animal\n", - " WHOLE FISH FISH\n", + " SOFT PARTS MOLLUSCS\n", + " Soft 
parts\n", + " SOFT PARTS MOLLUSCS\n", " 9\n", " \n", " \n", - " WHOLE PLANT Seaweed\n", + " Whole plant Seaweed\n", " Whole plant\n", - " WHOLE PLANT Seaweed\n", + " Whole plant Seaweed\n", " 8\n", " \n", " \n", - " WHOLE PLANT SEAWEED\n", + " whole plant Seaweed\n", " Whole plant\n", - " WHOLE PLANT SEAWEED\n", + " whole plant Seaweed\n", " 8\n", " \n", " \n", - " GROWING TIPS Seaweed\n", - " Growing tips\n", - " GROWING TIPS Seaweed\n", + " FLESH WITHOUT BONES SEAWEED\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES SEAWEED\n", " 8\n", " \n", " \n", @@ -5442,33 +5565,33 @@ " 8\n", " \n", " \n", - " whole plant Seaweed\n", + " WHOLE PLANT SEAWEED\n", " Whole plant\n", - " whole plant Seaweed\n", + " WHOLE PLANT SEAWEED\n", " 8\n", " \n", " \n", - " Whole plant Seaweed\n", - " Whole plant\n", - " Whole plant Seaweed\n", + " GROWING TIPS Seaweed\n", + " Growing tips\n", + " GROWING TIPS Seaweed\n", " 8\n", " \n", " \n", - " FLESH WITHOUT BONES SEAWEED\n", - " Flesh without bones\n", - " FLESH WITHOUT BONES SEAWEED\n", + " WHOLE PLANT Seaweed\n", + " Whole plant\n", + " WHOLE PLANT Seaweed\n", " 8\n", " \n", " \n", - " Flesh Fish\n", + " FLESH Fish\n", " Shells\n", - " Flesh Fish\n", + " FLESH Fish\n", " 7\n", " \n", " \n", - " FLESH Fish\n", + " Flesh Fish\n", " Shells\n", - " FLESH Fish\n", + " Flesh Fish\n", " 7\n", " \n", " \n", @@ -5478,27 +5601,45 @@ " 6\n", " \n", " \n", + " Whole animal Fish\n", + " Whole animal\n", + " Whole animal Fish\n", + " 5\n", + " \n", + " \n", + " Muscle FISH\n", + " Muscle\n", + " Muscle FISH\n", + " 5\n", + " \n", + " \n", " FLESH WITH SCALES Fish\n", " Flesh with scales\n", " FLESH WITH SCALES Fish\n", " 5\n", " \n", " \n", - " Soft parts Fish\n", - " Soft parts\n", - " Soft parts Fish\n", + " FLESH WITHOUT BONES fish\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES fish\n", " 5\n", " \n", " \n", - " Whole animal Fish\n", + " HEAD Fish\n", + " Head\n", + " HEAD Fish\n", + " 5\n", + " \n", + " \n", + " WHOLE ANIMAL 
FISH\n", " Whole animal\n", - " Whole animal Fish\n", + " WHOLE ANIMAL FISH\n", " 5\n", " \n", " \n", - " Flesh without bones Fish\n", + " FLESH WITHOUT BONES FISH\n", " Flesh without bones\n", - " Flesh without bones Fish\n", + " FLESH WITHOUT BONES FISH\n", " 5\n", " \n", " \n", @@ -5508,15 +5649,15 @@ " 5\n", " \n", " \n", - " FLESH WITHOUT BONES FISH\n", - " Flesh without bones\n", - " FLESH WITHOUT BONES FISH\n", + " WHOLE Fish\n", + " Whole animal\n", + " WHOLE Fish\n", " 5\n", " \n", " \n", - " HEAD FISH\n", - " Head\n", - " HEAD FISH\n", + " Soft parts Fish\n", + " Soft parts\n", + " Soft parts Fish\n", " 5\n", " \n", " \n", @@ -5526,21 +5667,15 @@ " 5\n", " \n", " \n", - " FLESH WITHOUT BONES fish\n", + " Flesh without bones Fish\n", " Flesh without bones\n", - " FLESH WITHOUT BONES fish\n", - " 5\n", - " \n", - " \n", - " HEAD Fish\n", - " Head\n", - " HEAD Fish\n", + " Flesh without bones Fish\n", " 5\n", " \n", " \n", - " Muscle FISH\n", + " MUSCLE Fish\n", " Muscle\n", - " Muscle FISH\n", + " MUSCLE Fish\n", " 5\n", " \n", " \n", @@ -5550,40 +5685,28 @@ " 5\n", " \n", " \n", - " WHOLE Fish\n", - " Whole animal\n", - " WHOLE Fish\n", + " HEAD FISH\n", + " Head\n", + " HEAD FISH\n", " 5\n", " \n", " \n", - " Muscle Fish\n", - " Muscle\n", - " Muscle Fish\n", + " FLESH WITHOUT BONES Fish\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES Fish\n", " 5\n", " \n", " \n", - " MUSCLE Fish\n", + " Muscle Fish\n", " Muscle\n", - " MUSCLE Fish\n", + " Muscle Fish\n", " 5\n", " \n", " \n", - " WHOLE ANIMAL FISH\n", - " Whole animal\n", - " WHOLE ANIMAL FISH\n", - " 5\n", - " \n", - " \n", - " FLESH WITHOUT BONES Fish\n", - " Flesh without bones\n", - " FLESH WITHOUT BONES Fish\n", - " 5\n", - " \n", - " \n", - " FLESH WITHOUT BONE Fish\n", - " Flesh without bones\n", - " FLESH WITHOUT BONE Fish\n", - " 4\n", + " FLESH WITHOUT BONE Fish\n", + " Flesh without bones\n", + " FLESH WITHOUT BONE Fish\n", + " 4\n", " \n", " \n", " FLESH WITHOUT BONE FISH\n", @@ 
-5601,47 +5724,47 @@ "Mix of muscle and whole fish without liver FISH Flesh without bones \n", "Whole without head FISH Flesh without bones \n", "Cod medallion FISH Old leaf \n", - "SOFT PARTS Molluscs Soft parts \n", + "Soft parts Molluscs Soft parts \n", + "Soft Parts Molluscs Soft parts \n", + "WHOLE FISH Fish Whole animal \n", + "WHOLE FISH FISH Whole animal \n", "WHOLE ANIMAL Molluscs Whole animal \n", "Whole fisk FISH Whole animal \n", - "UNKNOWN Fish Growing tips \n", - "WHOLE FISH Fish Whole animal \n", - "Soft Parts Molluscs Soft parts \n", - "UNKNOWN FISH Growing tips \n", - "SOFT PARTS molluscs Soft parts \n", - "SOFT PARTS MOLLUSCS Soft parts \n", "FLESH WITHOUT BONES Molluscs Flesh without bones \n", "Whole fish Fish Whole animal \n", + "UNKNOWN FISH Growing tips \n", + "SOFT PARTS molluscs Soft parts \n", "Whole animal Molluscs Whole animal \n", - "Soft parts Molluscs Soft parts \n", - "WHOLE FISH FISH Whole animal \n", - "WHOLE PLANT Seaweed Whole plant \n", - "WHOLE PLANT SEAWEED Whole plant \n", - "GROWING TIPS Seaweed Growing tips \n", - "WHOLE PLANT seaweed Whole plant \n", - "whole plant Seaweed Whole plant \n", + "UNKNOWN Fish Growing tips \n", + "SOFT PARTS Molluscs Soft parts \n", + "SOFT PARTS MOLLUSCS Soft parts \n", "Whole plant Seaweed Whole plant \n", + "whole plant Seaweed Whole plant \n", "FLESH WITHOUT BONES SEAWEED Flesh without bones \n", - "Flesh Fish Shells \n", + "WHOLE PLANT seaweed Whole plant \n", + "WHOLE PLANT SEAWEED Whole plant \n", + "GROWING TIPS Seaweed Growing tips \n", + "WHOLE PLANT Seaweed Whole plant \n", "FLESH Fish Shells \n", + "Flesh Fish Shells \n", "WHOLE Seaweed Whole plant \n", - "FLESH WITH SCALES Fish Flesh with scales \n", - "Soft parts Fish Soft parts \n", "Whole animal Fish Whole animal \n", - "Flesh without bones Fish Flesh without bones \n", - "LIVER Fish Liver \n", - "FLESH WITHOUT BONES FISH Flesh without bones \n", - "HEAD FISH Head \n", - "WHOLE ANIMAL Fish Whole animal \n", + "Muscle FISH Muscle 
\n", + "FLESH WITH SCALES Fish Flesh with scales \n", "FLESH WITHOUT BONES fish Flesh without bones \n", "HEAD Fish Head \n", - "Muscle FISH Muscle \n", - "Whole FISH Whole animal \n", + "WHOLE ANIMAL FISH Whole animal \n", + "FLESH WITHOUT BONES FISH Flesh without bones \n", + "LIVER Fish Liver \n", "WHOLE Fish Whole animal \n", - "Muscle Fish Muscle \n", + "Soft parts Fish Soft parts \n", + "WHOLE ANIMAL Fish Whole animal \n", + "Flesh without bones Fish Flesh without bones \n", "MUSCLE Fish Muscle \n", - "WHOLE ANIMAL FISH Whole animal \n", + "Whole FISH Whole animal \n", + "HEAD FISH Head \n", "FLESH WITHOUT BONES Fish Flesh without bones \n", + "Muscle Fish Muscle \n", "FLESH WITHOUT BONE Fish Flesh without bones \n", "FLESH WITHOUT BONE FISH Flesh without bones \n", "\n", @@ -5650,47 +5773,47 @@ "Mix of muscle and whole fish without liver FISH Mix of muscle and whole fish without liver FISH \n", "Whole without head FISH Whole without head FISH \n", "Cod medallion FISH Cod medallion FISH \n", - "SOFT PARTS Molluscs SOFT PARTS Molluscs \n", + "Soft parts Molluscs Soft parts Molluscs \n", + "Soft Parts Molluscs Soft Parts Molluscs \n", + "WHOLE FISH Fish WHOLE FISH Fish \n", + "WHOLE FISH FISH WHOLE FISH FISH \n", "WHOLE ANIMAL Molluscs WHOLE ANIMAL Molluscs \n", "Whole fisk FISH Whole fisk FISH \n", - "UNKNOWN Fish UNKNOWN Fish \n", - "WHOLE FISH Fish WHOLE FISH Fish \n", - "Soft Parts Molluscs Soft Parts Molluscs \n", - "UNKNOWN FISH UNKNOWN FISH \n", - "SOFT PARTS molluscs SOFT PARTS molluscs \n", - "SOFT PARTS MOLLUSCS SOFT PARTS MOLLUSCS \n", "FLESH WITHOUT BONES Molluscs FLESH WITHOUT BONES Molluscs \n", "Whole fish Fish Whole fish Fish \n", + "UNKNOWN FISH UNKNOWN FISH \n", + "SOFT PARTS molluscs SOFT PARTS molluscs \n", "Whole animal Molluscs Whole animal Molluscs \n", - "Soft parts Molluscs Soft parts Molluscs \n", - "WHOLE FISH FISH WHOLE FISH FISH \n", - "WHOLE PLANT Seaweed WHOLE PLANT Seaweed \n", - "WHOLE PLANT SEAWEED WHOLE PLANT SEAWEED \n", - 
"GROWING TIPS Seaweed GROWING TIPS Seaweed \n", - "WHOLE PLANT seaweed WHOLE PLANT seaweed \n", - "whole plant Seaweed whole plant Seaweed \n", + "UNKNOWN Fish UNKNOWN Fish \n", + "SOFT PARTS Molluscs SOFT PARTS Molluscs \n", + "SOFT PARTS MOLLUSCS SOFT PARTS MOLLUSCS \n", "Whole plant Seaweed Whole plant Seaweed \n", + "whole plant Seaweed whole plant Seaweed \n", "FLESH WITHOUT BONES SEAWEED FLESH WITHOUT BONES SEAWEED \n", - "Flesh Fish Flesh Fish \n", + "WHOLE PLANT seaweed WHOLE PLANT seaweed \n", + "WHOLE PLANT SEAWEED WHOLE PLANT SEAWEED \n", + "GROWING TIPS Seaweed GROWING TIPS Seaweed \n", + "WHOLE PLANT Seaweed WHOLE PLANT Seaweed \n", "FLESH Fish FLESH Fish \n", + "Flesh Fish Flesh Fish \n", "WHOLE Seaweed WHOLE Seaweed \n", - "FLESH WITH SCALES Fish FLESH WITH SCALES Fish \n", - "Soft parts Fish Soft parts Fish \n", "Whole animal Fish Whole animal Fish \n", - "Flesh without bones Fish Flesh without bones Fish \n", - "LIVER Fish LIVER Fish \n", - "FLESH WITHOUT BONES FISH FLESH WITHOUT BONES FISH \n", - "HEAD FISH HEAD FISH \n", - "WHOLE ANIMAL Fish WHOLE ANIMAL Fish \n", + "Muscle FISH Muscle FISH \n", + "FLESH WITH SCALES Fish FLESH WITH SCALES Fish \n", "FLESH WITHOUT BONES fish FLESH WITHOUT BONES fish \n", "HEAD Fish HEAD Fish \n", - "Muscle FISH Muscle FISH \n", - "Whole FISH Whole FISH \n", + "WHOLE ANIMAL FISH WHOLE ANIMAL FISH \n", + "FLESH WITHOUT BONES FISH FLESH WITHOUT BONES FISH \n", + "LIVER Fish LIVER Fish \n", "WHOLE Fish WHOLE Fish \n", - "Muscle Fish Muscle Fish \n", + "Soft parts Fish Soft parts Fish \n", + "WHOLE ANIMAL Fish WHOLE ANIMAL Fish \n", + "Flesh without bones Fish Flesh without bones Fish \n", "MUSCLE Fish MUSCLE Fish \n", - "WHOLE ANIMAL FISH WHOLE ANIMAL FISH \n", + "Whole FISH Whole FISH \n", + "HEAD FISH HEAD FISH \n", "FLESH WITHOUT BONES Fish FLESH WITHOUT BONES Fish \n", + "Muscle Fish Muscle Fish \n", "FLESH WITHOUT BONE Fish FLESH WITHOUT BONE Fish \n", "FLESH WITHOUT BONE FISH FLESH WITHOUT BONE FISH \n", "\n", 
@@ -5699,47 +5822,47 @@ "Mix of muscle and whole fish without liver FISH 31 \n", "Whole without head FISH 13 \n", "Cod medallion FISH 13 \n", - "SOFT PARTS Molluscs 9 \n", + "Soft parts Molluscs 9 \n", + "Soft Parts Molluscs 9 \n", + "WHOLE FISH Fish 9 \n", + "WHOLE FISH FISH 9 \n", "WHOLE ANIMAL Molluscs 9 \n", "Whole fisk FISH 9 \n", - "UNKNOWN Fish 9 \n", - "WHOLE FISH Fish 9 \n", - "Soft Parts Molluscs 9 \n", - "UNKNOWN FISH 9 \n", - "SOFT PARTS molluscs 9 \n", - "SOFT PARTS MOLLUSCS 9 \n", "FLESH WITHOUT BONES Molluscs 9 \n", "Whole fish Fish 9 \n", + "UNKNOWN FISH 9 \n", + "SOFT PARTS molluscs 9 \n", "Whole animal Molluscs 9 \n", - "Soft parts Molluscs 9 \n", - "WHOLE FISH FISH 9 \n", - "WHOLE PLANT Seaweed 8 \n", - "WHOLE PLANT SEAWEED 8 \n", - "GROWING TIPS Seaweed 8 \n", - "WHOLE PLANT seaweed 8 \n", - "whole plant Seaweed 8 \n", + "UNKNOWN Fish 9 \n", + "SOFT PARTS Molluscs 9 \n", + "SOFT PARTS MOLLUSCS 9 \n", "Whole plant Seaweed 8 \n", + "whole plant Seaweed 8 \n", "FLESH WITHOUT BONES SEAWEED 8 \n", - "Flesh Fish 7 \n", + "WHOLE PLANT seaweed 8 \n", + "WHOLE PLANT SEAWEED 8 \n", + "GROWING TIPS Seaweed 8 \n", + "WHOLE PLANT Seaweed 8 \n", "FLESH Fish 7 \n", + "Flesh Fish 7 \n", "WHOLE Seaweed 6 \n", - "FLESH WITH SCALES Fish 5 \n", - "Soft parts Fish 5 \n", "Whole animal Fish 5 \n", - "Flesh without bones Fish 5 \n", - "LIVER Fish 5 \n", - "FLESH WITHOUT BONES FISH 5 \n", - "HEAD FISH 5 \n", - "WHOLE ANIMAL Fish 5 \n", + "Muscle FISH 5 \n", + "FLESH WITH SCALES Fish 5 \n", "FLESH WITHOUT BONES fish 5 \n", "HEAD Fish 5 \n", - "Muscle FISH 5 \n", - "Whole FISH 5 \n", + "WHOLE ANIMAL FISH 5 \n", + "FLESH WITHOUT BONES FISH 5 \n", + "LIVER Fish 5 \n", "WHOLE Fish 5 \n", - "Muscle Fish 5 \n", + "Soft parts Fish 5 \n", + "WHOLE ANIMAL Fish 5 \n", + "Flesh without bones Fish 5 \n", "MUSCLE Fish 5 \n", - "WHOLE ANIMAL FISH 5 \n", + "Whole FISH 5 \n", + "HEAD FISH 5 \n", "FLESH WITHOUT BONES Fish 5 \n", + "Muscle Fish 5 \n", "FLESH WITHOUT BONE Fish 4 \n", 
"FLESH WITHOUT BONE FISH 4 " ] @@ -5810,7 +5933,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 46/46 [00:00<00:00, 99.93it/s] \n" + "Processing: 100%|██████████| 46/46 [00:00<00:00, 96.58it/s]\n" ] }, { @@ -5847,33 +5970,21 @@ " \n", " \n", " \n", - " FLESH WITHOUT BONES Molluscs\n", - " Flesh without bones\n", - " FLESH WITHOUT BONES Molluscs\n", - " 9\n", - " \n", - " \n", - " Soft parts Molluscs\n", - " Soft parts\n", - " Soft parts Molluscs\n", - " 9\n", - " \n", - " \n", " WHOLE FISH Fish\n", " Whole animal\n", " WHOLE FISH Fish\n", " 9\n", " \n", " \n", - " Soft Parts Molluscs\n", - " Soft parts\n", - " Soft Parts Molluscs\n", + " FLESH WITHOUT BONES Molluscs\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES Molluscs\n", " 9\n", " \n", " \n", - " Whole fisk FISH\n", + " WHOLE ANIMAL Molluscs\n", " Whole animal\n", - " Whole fisk FISH\n", + " WHOLE ANIMAL Molluscs\n", " 9\n", " \n", " \n", @@ -5883,9 +5994,9 @@ " 9\n", " \n", " \n", - " SOFT PARTS MOLLUSCS\n", + " Soft Parts Molluscs\n", " Soft parts\n", - " SOFT PARTS MOLLUSCS\n", + " Soft Parts Molluscs\n", " 9\n", " \n", " \n", @@ -5895,9 +6006,9 @@ " 9\n", " \n", " \n", - " WHOLE ANIMAL Molluscs\n", - " Whole animal\n", - " WHOLE ANIMAL Molluscs\n", + " SOFT PARTS MOLLUSCS\n", + " Soft parts\n", + " SOFT PARTS MOLLUSCS\n", " 9\n", " \n", " \n", @@ -5907,9 +6018,9 @@ " 9\n", " \n", " \n", - " Whole fish Fish\n", - " Whole animal\n", - " Whole fish Fish\n", + " Soft parts Molluscs\n", + " Soft parts\n", + " Soft parts Molluscs\n", " 9\n", " \n", " \n", @@ -5919,10 +6030,16 @@ " 9\n", " \n", " \n", - " Whole plant Seaweed\n", - " Whole plant\n", - " Whole plant Seaweed\n", - " 8\n", + " Whole fisk FISH\n", + " Whole animal\n", + " Whole fisk FISH\n", + " 9\n", + " \n", + " \n", + " Whole fish Fish\n", + " Whole animal\n", + " Whole fish Fish\n", + " 9\n", " \n", " \n", " WHOLE PLANT seaweed\n", @@ -5931,9 +6048,9 @@ " 8\n", " \n", " \n", - " GROWING TIPS 
Seaweed\n", - " Growing tips\n", - " GROWING TIPS Seaweed\n", + " whole plant Seaweed\n", + " Whole plant\n", + " whole plant Seaweed\n", " 8\n", " \n", " \n", @@ -5943,33 +6060,33 @@ " 8\n", " \n", " \n", - " whole plant Seaweed\n", + " WHOLE PLANT Seaweed\n", " Whole plant\n", - " whole plant Seaweed\n", + " WHOLE PLANT Seaweed\n", " 8\n", " \n", " \n", - " WHOLE PLANT Seaweed\n", + " Whole plant Seaweed\n", " Whole plant\n", - " WHOLE PLANT Seaweed\n", + " Whole plant Seaweed\n", " 8\n", " \n", " \n", - " LIVER Fish\n", - " Liver\n", - " LIVER Fish\n", - " 5\n", + " GROWING TIPS Seaweed\n", + " Growing tips\n", + " GROWING TIPS Seaweed\n", + " 8\n", " \n", " \n", - " FLESH WITH SCALES Fish\n", - " Flesh with scales\n", - " FLESH WITH SCALES Fish\n", + " Muscle FISH\n", + " Muscle\n", + " Muscle FISH\n", " 5\n", " \n", " \n", - " Soft parts Fish\n", - " Soft parts\n", - " Soft parts Fish\n", + " HEAD Fish\n", + " Head\n", + " HEAD Fish\n", " 5\n", " \n", " \n", @@ -5985,21 +6102,21 @@ " 5\n", " \n", " \n", - " WHOLE ANIMAL Fish\n", - " Whole animal\n", - " WHOLE ANIMAL Fish\n", + " FLESH WITH SCALES Fish\n", + " Flesh with scales\n", + " FLESH WITH SCALES Fish\n", " 5\n", " \n", " \n", - " FLESH WITHOUT BONES Fish\n", + " FLESH WITHOUT BONES fish\n", " Flesh without bones\n", - " FLESH WITHOUT BONES Fish\n", + " FLESH WITHOUT BONES fish\n", " 5\n", " \n", " \n", - " Muscle FISH\n", - " Muscle\n", - " Muscle FISH\n", + " WHOLE ANIMAL FISH\n", + " Whole animal\n", + " WHOLE ANIMAL FISH\n", " 5\n", " \n", " \n", @@ -6015,15 +6132,27 @@ " 5\n", " \n", " \n", + " FLESH WITHOUT BONES Fish\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES Fish\n", + " 5\n", + " \n", + " \n", + " LIVER Fish\n", + " Liver\n", + " LIVER Fish\n", + " 5\n", + " \n", + " \n", " WHOLE Fish\n", " Whole animal\n", " WHOLE Fish\n", " 5\n", " \n", " \n", - " WHOLE ANIMAL FISH\n", - " Whole animal\n", - " WHOLE ANIMAL FISH\n", + " Soft parts Fish\n", + " Soft parts\n", + " Soft parts Fish\n", 
" 5\n", " \n", " \n", @@ -6033,21 +6162,15 @@ " 5\n", " \n", " \n", - " Flesh without bones Fish\n", - " Flesh without bones\n", - " Flesh without bones Fish\n", + " WHOLE ANIMAL Fish\n", + " Whole animal\n", + " WHOLE ANIMAL Fish\n", " 5\n", " \n", " \n", - " FLESH WITHOUT BONES fish\n", + " Flesh without bones Fish\n", " Flesh without bones\n", - " FLESH WITHOUT BONES fish\n", - " 5\n", - " \n", - " \n", - " HEAD Fish\n", - " Head\n", - " HEAD Fish\n", + " Flesh without bones Fish\n", " 5\n", " \n", " \n", @@ -6075,27 +6198,27 @@ " 2\n", " \n", " \n", - " Cod medallion FISH\n", + " UNKNOWN FISH\n", " (Not available)\n", - " Cod medallion FISH\n", + " UNKNOWN FISH\n", " 2\n", " \n", " \n", - " Mix of muscle and whole fish without liver FISH\n", + " UNKNOWN Fish\n", " (Not available)\n", - " Mix of muscle and whole fish without liver FISH\n", + " UNKNOWN Fish\n", " 2\n", " \n", " \n", - " UNKNOWN Fish\n", + " Mix of muscle and whole fish without liver FISH\n", " (Not available)\n", - " UNKNOWN Fish\n", + " Mix of muscle and whole fish without liver FISH\n", " 2\n", " \n", " \n", - " UNKNOWN FISH\n", + " Cod medallion FISH\n", " (Not available)\n", - " UNKNOWN FISH\n", + " Cod medallion FISH\n", " 2\n", " \n", " \n", @@ -6111,140 +6234,140 @@ "text/plain": [ " matched_maris_name \\\n", "source_key \n", - "FLESH WITHOUT BONES Molluscs Flesh without bones \n", - "Soft parts Molluscs Soft parts \n", "WHOLE FISH Fish Whole animal \n", - "Soft Parts Molluscs Soft parts \n", - "Whole fisk FISH Whole animal \n", + "FLESH WITHOUT BONES Molluscs Flesh without bones \n", + "WHOLE ANIMAL Molluscs Whole animal \n", "SOFT PARTS molluscs Soft parts \n", - "SOFT PARTS MOLLUSCS Soft parts \n", + "Soft Parts Molluscs Soft parts \n", "Whole animal Molluscs Whole animal \n", - "WHOLE ANIMAL Molluscs Whole animal \n", + "SOFT PARTS MOLLUSCS Soft parts \n", "SOFT PARTS Molluscs Soft parts \n", - "Whole fish Fish Whole animal \n", + "Soft parts Molluscs Soft parts \n", "WHOLE FISH FISH 
Whole animal \n", - "Whole plant Seaweed Whole plant \n", + "Whole fisk FISH Whole animal \n", + "Whole fish Fish Whole animal \n", "WHOLE PLANT seaweed Whole plant \n", - "GROWING TIPS Seaweed Growing tips \n", - "WHOLE PLANT SEAWEED Whole plant \n", "whole plant Seaweed Whole plant \n", + "WHOLE PLANT SEAWEED Whole plant \n", "WHOLE PLANT Seaweed Whole plant \n", - "LIVER Fish Liver \n", - "FLESH WITH SCALES Fish Flesh with scales \n", - "Soft parts Fish Soft parts \n", + "Whole plant Seaweed Whole plant \n", + "GROWING TIPS Seaweed Growing tips \n", + "Muscle FISH Muscle \n", + "HEAD Fish Head \n", "FLESH WITHOUT BONES FISH Flesh without bones \n", "Whole animal Fish Whole animal \n", - "WHOLE ANIMAL Fish Whole animal \n", - "FLESH WITHOUT BONES Fish Flesh without bones \n", - "Muscle FISH Muscle \n", + "FLESH WITH SCALES Fish Flesh with scales \n", + "FLESH WITHOUT BONES fish Flesh without bones \n", + "WHOLE ANIMAL FISH Whole animal \n", "Muscle Fish Muscle \n", "Whole FISH Whole animal \n", + "FLESH WITHOUT BONES Fish Flesh without bones \n", + "LIVER Fish Liver \n", "WHOLE Fish Whole animal \n", - "WHOLE ANIMAL FISH Whole animal \n", + "Soft parts Fish Soft parts \n", "HEAD FISH Head \n", + "WHOLE ANIMAL Fish Whole animal \n", "Flesh without bones Fish Flesh without bones \n", - "FLESH WITHOUT BONES fish Flesh without bones \n", - "HEAD Fish Head \n", "MUSCLE Fish Muscle \n", "FLESH WITHOUT BONE Fish Flesh without bones \n", "FLESH WITHOUT BONE FISH Flesh without bones \n", "FLESH WITHOUT BONES SEAWEED (Not available) \n", - "Cod medallion FISH (Not available) \n", - "Mix of muscle and whole fish without liver FISH (Not available) \n", - "UNKNOWN Fish (Not available) \n", "UNKNOWN FISH (Not available) \n", + "UNKNOWN Fish (Not available) \n", + "Mix of muscle and whole fish without liver FISH (Not available) \n", + "Cod medallion FISH (Not available) \n", "Whole without head FISH (Not available) \n", "\n", " source_name \\\n", "source_key \n", - "FLESH 
WITHOUT BONES Molluscs FLESH WITHOUT BONES Molluscs \n", - "Soft parts Molluscs Soft parts Molluscs \n", "WHOLE FISH Fish WHOLE FISH Fish \n", - "Soft Parts Molluscs Soft Parts Molluscs \n", - "Whole fisk FISH Whole fisk FISH \n", + "FLESH WITHOUT BONES Molluscs FLESH WITHOUT BONES Molluscs \n", + "WHOLE ANIMAL Molluscs WHOLE ANIMAL Molluscs \n", "SOFT PARTS molluscs SOFT PARTS molluscs \n", - "SOFT PARTS MOLLUSCS SOFT PARTS MOLLUSCS \n", + "Soft Parts Molluscs Soft Parts Molluscs \n", "Whole animal Molluscs Whole animal Molluscs \n", - "WHOLE ANIMAL Molluscs WHOLE ANIMAL Molluscs \n", + "SOFT PARTS MOLLUSCS SOFT PARTS MOLLUSCS \n", "SOFT PARTS Molluscs SOFT PARTS Molluscs \n", - "Whole fish Fish Whole fish Fish \n", + "Soft parts Molluscs Soft parts Molluscs \n", "WHOLE FISH FISH WHOLE FISH FISH \n", - "Whole plant Seaweed Whole plant Seaweed \n", + "Whole fisk FISH Whole fisk FISH \n", + "Whole fish Fish Whole fish Fish \n", "WHOLE PLANT seaweed WHOLE PLANT seaweed \n", - "GROWING TIPS Seaweed GROWING TIPS Seaweed \n", - "WHOLE PLANT SEAWEED WHOLE PLANT SEAWEED \n", "whole plant Seaweed whole plant Seaweed \n", + "WHOLE PLANT SEAWEED WHOLE PLANT SEAWEED \n", "WHOLE PLANT Seaweed WHOLE PLANT Seaweed \n", - "LIVER Fish LIVER Fish \n", - "FLESH WITH SCALES Fish FLESH WITH SCALES Fish \n", - "Soft parts Fish Soft parts Fish \n", + "Whole plant Seaweed Whole plant Seaweed \n", + "GROWING TIPS Seaweed GROWING TIPS Seaweed \n", + "Muscle FISH Muscle FISH \n", + "HEAD Fish HEAD Fish \n", "FLESH WITHOUT BONES FISH FLESH WITHOUT BONES FISH \n", "Whole animal Fish Whole animal Fish \n", - "WHOLE ANIMAL Fish WHOLE ANIMAL Fish \n", - "FLESH WITHOUT BONES Fish FLESH WITHOUT BONES Fish \n", - "Muscle FISH Muscle FISH \n", + "FLESH WITH SCALES Fish FLESH WITH SCALES Fish \n", + "FLESH WITHOUT BONES fish FLESH WITHOUT BONES fish \n", + "WHOLE ANIMAL FISH WHOLE ANIMAL FISH \n", "Muscle Fish Muscle Fish \n", "Whole FISH Whole FISH \n", + "FLESH WITHOUT BONES Fish FLESH WITHOUT 
BONES Fish \n", + "LIVER Fish LIVER Fish \n", "WHOLE Fish WHOLE Fish \n", - "WHOLE ANIMAL FISH WHOLE ANIMAL FISH \n", + "Soft parts Fish Soft parts Fish \n", "HEAD FISH HEAD FISH \n", + "WHOLE ANIMAL Fish WHOLE ANIMAL Fish \n", "Flesh without bones Fish Flesh without bones Fish \n", - "FLESH WITHOUT BONES fish FLESH WITHOUT BONES fish \n", - "HEAD Fish HEAD Fish \n", "MUSCLE Fish MUSCLE Fish \n", "FLESH WITHOUT BONE Fish FLESH WITHOUT BONE Fish \n", "FLESH WITHOUT BONE FISH FLESH WITHOUT BONE FISH \n", "FLESH WITHOUT BONES SEAWEED FLESH WITHOUT BONES SEAWEED \n", - "Cod medallion FISH Cod medallion FISH \n", - "Mix of muscle and whole fish without liver FISH Mix of muscle and whole fish without liver FISH \n", - "UNKNOWN Fish UNKNOWN Fish \n", "UNKNOWN FISH UNKNOWN FISH \n", + "UNKNOWN Fish UNKNOWN Fish \n", + "Mix of muscle and whole fish without liver FISH Mix of muscle and whole fish without liver FISH \n", + "Cod medallion FISH Cod medallion FISH \n", "Whole without head FISH Whole without head FISH \n", "\n", " match_score \n", "source_key \n", - "FLESH WITHOUT BONES Molluscs 9 \n", - "Soft parts Molluscs 9 \n", "WHOLE FISH Fish 9 \n", - "Soft Parts Molluscs 9 \n", - "Whole fisk FISH 9 \n", + "FLESH WITHOUT BONES Molluscs 9 \n", + "WHOLE ANIMAL Molluscs 9 \n", "SOFT PARTS molluscs 9 \n", - "SOFT PARTS MOLLUSCS 9 \n", + "Soft Parts Molluscs 9 \n", "Whole animal Molluscs 9 \n", - "WHOLE ANIMAL Molluscs 9 \n", + "SOFT PARTS MOLLUSCS 9 \n", "SOFT PARTS Molluscs 9 \n", - "Whole fish Fish 9 \n", + "Soft parts Molluscs 9 \n", "WHOLE FISH FISH 9 \n", - "Whole plant Seaweed 8 \n", + "Whole fisk FISH 9 \n", + "Whole fish Fish 9 \n", "WHOLE PLANT seaweed 8 \n", - "GROWING TIPS Seaweed 8 \n", - "WHOLE PLANT SEAWEED 8 \n", "whole plant Seaweed 8 \n", + "WHOLE PLANT SEAWEED 8 \n", "WHOLE PLANT Seaweed 8 \n", - "LIVER Fish 5 \n", - "FLESH WITH SCALES Fish 5 \n", - "Soft parts Fish 5 \n", + "Whole plant Seaweed 8 \n", + "GROWING TIPS Seaweed 8 \n", + "Muscle FISH 5 \n", + 
"HEAD Fish 5 \n", "FLESH WITHOUT BONES FISH 5 \n", "Whole animal Fish 5 \n", - "WHOLE ANIMAL Fish 5 \n", - "FLESH WITHOUT BONES Fish 5 \n", - "Muscle FISH 5 \n", + "FLESH WITH SCALES Fish 5 \n", + "FLESH WITHOUT BONES fish 5 \n", + "WHOLE ANIMAL FISH 5 \n", "Muscle Fish 5 \n", "Whole FISH 5 \n", + "FLESH WITHOUT BONES Fish 5 \n", + "LIVER Fish 5 \n", "WHOLE Fish 5 \n", - "WHOLE ANIMAL FISH 5 \n", + "Soft parts Fish 5 \n", "HEAD FISH 5 \n", + "WHOLE ANIMAL Fish 5 \n", "Flesh without bones Fish 5 \n", - "FLESH WITHOUT BONES fish 5 \n", - "HEAD Fish 5 \n", "MUSCLE Fish 5 \n", "FLESH WITHOUT BONE Fish 4 \n", "FLESH WITHOUT BONE FISH 4 \n", "FLESH WITHOUT BONES SEAWEED 2 \n", - "Cod medallion FISH 2 \n", - "Mix of muscle and whole fish without liver FISH 2 \n", - "UNKNOWN Fish 2 \n", "UNKNOWN FISH 2 \n", + "UNKNOWN Fish 2 \n", + "Mix of muscle and whole fish without liver FISH 2 \n", + "Cod medallion FISH 2 \n", "Whole without head FISH 2 " ] }, @@ -6259,24 +6382,26 @@ "remapper.select_match(match_score_threshold=1)" ] }, - { - "cell_type": "markdown", - "id": "e7ed9551", - "metadata": {}, - "source": [ - ":::{.callout-tip}\n", - "\n", - "**FEEDBACK TO DATA PROVIDER**: `biota` dataset includes 1 entry where the `Body Part` is `FLESH WITHOUT BONES` for the `Biological group` of `SEAWEED`, see below. 
\n", - "\n", - ":::" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "93c19547", + "id": "99e40cca", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 0%| | 0/46 [00:00\n", " \n", " \n", - " ID\n", - " Contracting Party\n", - " Sample ID\n", - " Biological group\n", - " Body Part\n", - " Measurement Comment\n", - " Sample Comment\n", + " matched_maris_name\n", + " source_name\n", + " match_score\n", + " \n", + " \n", + " source_key\n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 2660\n", - " 87356\n", - " Iceland\n", - " THFAG17C\n", - " SEAWEED\n", - " FLESH WITHOUT BONES\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " ID Contracting Party Sample ID Biological group Body Part \\\n", - "2660 87356 Iceland THFAG17C SEAWEED FLESH WITHOUT BONES \n", - "\n", - " Measurement Comment Sample Comment \n", - "2660 NaN NaN " - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dfs['biota'][['ID','Contracting Party','Sample ID','Biological group','Body Part', 'Measurement Comment', 'Sample Comment']][(tfm.dfs['biota']['Body Part'] == 'FLESH WITHOUT BONES') & (tfm.dfs['biota']['Biological group'] == 'SEAWEED')]" - ] - }, - { - "cell_type": "markdown", - "id": "d563fe9e", - "metadata": {}, - "source": [ - "HERE HERE, need to create the lambda: Remapper then use the genric RemapCB callback." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c256c803", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "class RemapBiotaBodyPartCB(Callback):\n", - " \"Biota body part standardized to MARIS format.\"\n", - " def __init__(self, \n", - " fn_lut: Callable, # Function that returns the lookup table dictionary\n", - " verbose: bool=False # Print unmatched values\n", - " ):\n", - " fc.store_attr()\n", - "\n", - " def __call__(self, tfm):\n", - " lut = self.fn_lut()\n", - " tfm.dfs['biota']['species'] = tfm.dfs['biota']['Species'].apply(lambda x: self._get_species(x, lut))\n", - " \n", - " def _get_species(self, \n", - " value_to_match:str, # The value to match\n", - " lut:dict # The lookup table dictionary\n", - " ):\n", - " match = lut.get(value_to_match, Match(-1, None, None, None))\n", - " if self.verbose and match.matched_id == -1:\n", - " print(f'Unmatched species: {value_to_match}')\n", - " return match.matched_id" - ] - }, - { - "cell_type": "markdown", - "id": "929b4b20", - "metadata": {}, - "source": [ - "##### Correct OSPAR `Body Part` labelled as `Whole`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "523b4b39", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " 
\n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexvalue
00FleshMix of muscle and whole fish without liver FISHFlesh without bonesMix of muscle and whole fish without liver FISH31
11FLESH WITHOUT BONEWhole without head FISHFlesh without bonesWhole without head FISH13
22whole plantCod medallion FISHOld leafCod medallion FISH13
33MuscleSoft parts MolluscsSoft partsSoft parts Molluscs9
44Soft Parts MolluscsSoft partsSoft Parts Molluscs9
55FLESH WITH SCALESWHOLE FISH FishWhole animalWHOLE FISH Fish9
66HEADWHOLE FISH FISHWhole animalWHOLE FISH FISH9
77WHOLE ANIMALWHOLE ANIMAL MolluscsWhole animalWHOLE ANIMAL Molluscs9
88GROWING TIPSWhole fisk FISHWhole animalWhole fisk FISH9
9FLESH WITHOUT BONES MolluscsFlesh without bonesFLESH WITHOUT BONES Molluscs9Whole fisk
1010Whole fishWhole fish FishWhole animalWhole fish Fish9
1111WHOLE FISHUNKNOWN FISHGrowing tipsUNKNOWN FISH9
1212UNKNOWNSOFT PARTS molluscsSoft partsSOFT PARTS molluscs9
1313Whole animal MolluscsWhole animalWhole animal Molluscs9
1414Flesh without bonesUNKNOWN FishGrowing tipsUNKNOWN Fish9
1515LIVERSOFT PARTS MolluscsSoft partsSOFT PARTS Molluscs9
1616SOFT PARTSSOFT PARTS MOLLUSCSSoft partsSOFT PARTS MOLLUSCS9
1717WHOLE PLANTWhole plant SeaweedWhole plantWhole plant Seaweed8
1818Wholewhole plant SeaweedWhole plantwhole plant Seaweed8
1919MUSCLEFLESH WITHOUT BONES SEAWEEDFlesh without bonesFLESH WITHOUT BONES SEAWEED8
2020FLESH WITHOUT BONESWHOLE PLANT seaweedWhole plantWHOLE PLANT seaweed8
2121Cod medallionWHOLE PLANT SEAWEEDWhole plantWHOLE PLANT SEAWEED8
2222FLESHGROWING TIPS SeaweedGrowing tipsGROWING TIPS Seaweed8
2323Whole without headWHOLE PLANT SeaweedWhole plantWHOLE PLANT Seaweed8
2424WHOLEFLESH FishShellsFLESH Fish7
2525Mix of muscle and whole fish without liverFlesh FishShellsFlesh Fish7
2626WHOLE SeaweedWhole plantWHOLE Seaweed6
2727Soft Parts
\n", - "
" - ], - "text/plain": [ - " index value\n", - "0 0 Flesh\n", - "1 1 FLESH WITHOUT BONE\n", - "2 2 whole plant\n", - "3 3 Muscle\n", - "4 4 Soft parts\n", - "5 5 FLESH WITH SCALES\n", - "6 6 HEAD\n", - "7 7 WHOLE ANIMAL\n", - "8 8 GROWING TIPS\n", - "9 9 Whole fisk\n", - "10 10 Whole fish\n", - "11 11 WHOLE FISH\n", - "12 12 UNKNOWN\n", - "13 13 Whole animal\n", - "14 14 Flesh without bones\n", - "15 15 LIVER\n", - "16 16 SOFT PARTS\n", - "17 17 WHOLE PLANT\n", - "18 18 Whole\n", - "19 19 MUSCLE\n", - "20 20 FLESH WITHOUT BONES\n", - "21 21 Cod medallion\n", - "22 22 FLESH\n", - "23 23 Whole without head\n", - "24 24 WHOLE\n", - "25 25 Mix of muscle and whole fish without liver\n", - "26 26 Whole plant\n", - "27 27 Soft Parts" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dfs = load_data(fname_in)\n", - "get_unique_across_dfs(dfs, col_name='Body Part', as_df=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14f0ae61", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing: 0%| | 0/28 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
matched_maris_namesource_namematch_scoreWhole animal FishWhole animalWhole animal Fish5
source_keyMuscle FISHMuscleMuscle FISH5
Mix of muscle and whole fish without liverFlesh without bonesMix of muscle and whole fish without liver27FLESH WITH SCALES FishFlesh with scalesFLESH WITH SCALES Fish5
Whole without headFLESH WITHOUT BONES fishFlesh without bonesWhole without head10
Cod medallionExoskeletonCod medallion9
Whole fiskWhole animalWhole fiskFLESH WITHOUT BONES fish5
Whole fishWhole animalWhole fishHEAD FishHeadHEAD Fish5
WHOLE FISHWHOLE ANIMAL FISHWhole animalWHOLE FISHWHOLE ANIMAL FISH5
UNKNOWNSkinUNKNOWNFLESH WITHOUT BONES FISHFlesh without bonesFLESH WITHOUT BONES FISH5
FleshLeafFlesh3
WHOLEMoltWHOLE3
FLESHLeafFLESH3
WholeMoltWhole3LIVER FishLiverLIVER Fish5
FLESH WITHOUT BONEFlesh without bonesFLESH WITHOUT BONE1WHOLE FishWhole animalWHOLE Fish5
WHOLE PLANTWhole plantWHOLE PLANT0Soft parts FishSoft partsSoft parts Fish5
Whole plantWhole plantWhole plant0WHOLE ANIMAL FishWhole animalWHOLE ANIMAL Fish5
FLESH WITHOUT BONESFlesh without bones FishFlesh without bonesFLESH WITHOUT BONES0Flesh without bones Fish5
MUSCLEMUSCLE FishMuscleMUSCLE0
Flesh without bonesFlesh without bonesFlesh without bones0
SOFT PARTSSoft partsSOFT PARTS0
LIVERLiverLIVER0
Whole animalWhole animalWhole animal0
GROWING TIPSGrowing tipsGROWING TIPS0MUSCLE Fish5
WHOLE ANIMALWhole FISHWhole animalWHOLE ANIMAL0Whole FISH5
HEADHEAD FISHHeadHEAD0
FLESH WITH SCALESFlesh with scalesFLESH WITH SCALES0HEAD FISH5
Soft partsSoft partsSoft parts0FLESH WITHOUT BONES FishFlesh without bonesFLESH WITHOUT BONES Fish5
MuscleMuscleMuscle FishMuscle0Muscle Fish5
whole plantWhole plantwhole plant0FLESH WITHOUT BONE FishFlesh without bonesFLESH WITHOUT BONE Fish4
Soft PartsSoft partsSoft Parts0FLESH WITHOUT BONE FISHFlesh without bonesFLESH WITHOUT BONE FISH4
\n", "" ], "text/plain": [ - " matched_maris_name \\\n", - "source_key \n", - "Mix of muscle and whole fish without liver Flesh without bones \n", - "Whole without head Flesh without bones \n", - "Cod medallion Exoskeleton \n", - "Whole fisk Whole animal \n", - "Whole fish Whole animal \n", - "WHOLE FISH Whole animal \n", - "UNKNOWN Skin \n", - "Flesh Leaf \n", - "WHOLE Molt \n", - "FLESH Leaf \n", - "Whole Molt \n", - "FLESH WITHOUT BONE Flesh without bones \n", - "WHOLE PLANT Whole plant \n", - "Whole plant Whole plant \n", - "FLESH WITHOUT BONES Flesh without bones \n", - "MUSCLE Muscle \n", - "Flesh without bones Flesh without bones \n", - "SOFT PARTS Soft parts \n", - "LIVER Liver \n", - "Whole animal Whole animal \n", - "GROWING TIPS Growing tips \n", - "WHOLE ANIMAL Whole animal \n", - "HEAD Head \n", - "FLESH WITH SCALES Flesh with scales \n", - "Soft parts Soft parts \n", - "Muscle Muscle \n", - "whole plant Whole plant \n", - "Soft Parts Soft parts \n", + " matched_maris_name \\\n", + "source_key \n", + "Mix of muscle and whole fish without liver FISH Flesh without bones \n", + "Whole without head FISH Flesh without bones \n", + "Cod medallion FISH Old leaf \n", + "Soft parts Molluscs Soft parts \n", + "Soft Parts Molluscs Soft parts \n", + "WHOLE FISH Fish Whole animal \n", + "WHOLE FISH FISH Whole animal \n", + "WHOLE ANIMAL Molluscs Whole animal \n", + "Whole fisk FISH Whole animal \n", + "FLESH WITHOUT BONES Molluscs Flesh without bones \n", + "Whole fish Fish Whole animal \n", + "UNKNOWN FISH Growing tips \n", + "SOFT PARTS molluscs Soft parts \n", + "Whole animal Molluscs Whole animal \n", + "UNKNOWN Fish Growing tips \n", + "SOFT PARTS Molluscs Soft parts \n", + "SOFT PARTS MOLLUSCS Soft parts \n", + "Whole plant Seaweed Whole plant \n", + "whole plant Seaweed Whole plant \n", + "FLESH WITHOUT BONES SEAWEED Flesh without bones \n", + "WHOLE PLANT seaweed Whole plant \n", + "WHOLE PLANT SEAWEED Whole plant \n", + "GROWING TIPS Seaweed Growing tips 
\n", + "WHOLE PLANT Seaweed Whole plant \n", + "FLESH Fish Shells \n", + "Flesh Fish Shells \n", + "WHOLE Seaweed Whole plant \n", + "Whole animal Fish Whole animal \n", + "Muscle FISH Muscle \n", + "FLESH WITH SCALES Fish Flesh with scales \n", + "FLESH WITHOUT BONES fish Flesh without bones \n", + "HEAD Fish Head \n", + "WHOLE ANIMAL FISH Whole animal \n", + "FLESH WITHOUT BONES FISH Flesh without bones \n", + "LIVER Fish Liver \n", + "WHOLE Fish Whole animal \n", + "Soft parts Fish Soft parts \n", + "WHOLE ANIMAL Fish Whole animal \n", + "Flesh without bones Fish Flesh without bones \n", + "MUSCLE Fish Muscle \n", + "Whole FISH Whole animal \n", + "HEAD FISH Head \n", + "FLESH WITHOUT BONES Fish Flesh without bones \n", + "Muscle Fish Muscle \n", + "FLESH WITHOUT BONE Fish Flesh without bones \n", + "FLESH WITHOUT BONE FISH Flesh without bones \n", "\n", - " source_name \\\n", - "source_key \n", - "Mix of muscle and whole fish without liver Mix of muscle and whole fish without liver \n", - "Whole without head Whole without head \n", - "Cod medallion Cod medallion \n", - "Whole fisk Whole fisk \n", - "Whole fish Whole fish \n", - "WHOLE FISH WHOLE FISH \n", - "UNKNOWN UNKNOWN \n", - "Flesh Flesh \n", - "WHOLE WHOLE \n", - "FLESH FLESH \n", - "Whole Whole \n", - "FLESH WITHOUT BONE FLESH WITHOUT BONE \n", - "WHOLE PLANT WHOLE PLANT \n", - "Whole plant Whole plant \n", - "FLESH WITHOUT BONES FLESH WITHOUT BONES \n", - "MUSCLE MUSCLE \n", - "Flesh without bones Flesh without bones \n", - "SOFT PARTS SOFT PARTS \n", - "LIVER LIVER \n", - "Whole animal Whole animal \n", - "GROWING TIPS GROWING TIPS \n", - "WHOLE ANIMAL WHOLE ANIMAL \n", - "HEAD HEAD \n", - "FLESH WITH SCALES FLESH WITH SCALES \n", - "Soft parts Soft parts \n", - "Muscle Muscle \n", - "whole plant whole plant \n", - "Soft Parts Soft Parts \n", + " source_name \\\n", + "source_key \n", + "Mix of muscle and whole fish without liver FISH Mix of muscle and whole fish without liver FISH \n", + "Whole 
without head FISH Whole without head FISH \n", + "Cod medallion FISH Cod medallion FISH \n", + "Soft parts Molluscs Soft parts Molluscs \n", + "Soft Parts Molluscs Soft Parts Molluscs \n", + "WHOLE FISH Fish WHOLE FISH Fish \n", + "WHOLE FISH FISH WHOLE FISH FISH \n", + "WHOLE ANIMAL Molluscs WHOLE ANIMAL Molluscs \n", + "Whole fisk FISH Whole fisk FISH \n", + "FLESH WITHOUT BONES Molluscs FLESH WITHOUT BONES Molluscs \n", + "Whole fish Fish Whole fish Fish \n", + "UNKNOWN FISH UNKNOWN FISH \n", + "SOFT PARTS molluscs SOFT PARTS molluscs \n", + "Whole animal Molluscs Whole animal Molluscs \n", + "UNKNOWN Fish UNKNOWN Fish \n", + "SOFT PARTS Molluscs SOFT PARTS Molluscs \n", + "SOFT PARTS MOLLUSCS SOFT PARTS MOLLUSCS \n", + "Whole plant Seaweed Whole plant Seaweed \n", + "whole plant Seaweed whole plant Seaweed \n", + "FLESH WITHOUT BONES SEAWEED FLESH WITHOUT BONES SEAWEED \n", + "WHOLE PLANT seaweed WHOLE PLANT seaweed \n", + "WHOLE PLANT SEAWEED WHOLE PLANT SEAWEED \n", + "GROWING TIPS Seaweed GROWING TIPS Seaweed \n", + "WHOLE PLANT Seaweed WHOLE PLANT Seaweed \n", + "FLESH Fish FLESH Fish \n", + "Flesh Fish Flesh Fish \n", + "WHOLE Seaweed WHOLE Seaweed \n", + "Whole animal Fish Whole animal Fish \n", + "Muscle FISH Muscle FISH \n", + "FLESH WITH SCALES Fish FLESH WITH SCALES Fish \n", + "FLESH WITHOUT BONES fish FLESH WITHOUT BONES fish \n", + "HEAD Fish HEAD Fish \n", + "WHOLE ANIMAL FISH WHOLE ANIMAL FISH \n", + "FLESH WITHOUT BONES FISH FLESH WITHOUT BONES FISH \n", + "LIVER Fish LIVER Fish \n", + "WHOLE Fish WHOLE Fish \n", + "Soft parts Fish Soft parts Fish \n", + "WHOLE ANIMAL Fish WHOLE ANIMAL Fish \n", + "Flesh without bones Fish Flesh without bones Fish \n", + "MUSCLE Fish MUSCLE Fish \n", + "Whole FISH Whole FISH \n", + "HEAD FISH HEAD FISH \n", + "FLESH WITHOUT BONES Fish FLESH WITHOUT BONES Fish \n", + "Muscle Fish Muscle Fish \n", + "FLESH WITHOUT BONE Fish FLESH WITHOUT BONE Fish \n", + "FLESH WITHOUT BONE FISH FLESH WITHOUT BONE FISH \n", "\n", 
- " match_score \n", - "source_key \n", - "Mix of muscle and whole fish without liver 27 \n", - "Whole without head 10 \n", - "Cod medallion 9 \n", - "Whole fisk 5 \n", - "Whole fish 5 \n", - "WHOLE FISH 5 \n", - "UNKNOWN 5 \n", - "Flesh 3 \n", - "WHOLE 3 \n", - "FLESH 3 \n", - "Whole 3 \n", - "FLESH WITHOUT BONE 1 \n", - "WHOLE PLANT 0 \n", - "Whole plant 0 \n", - "FLESH WITHOUT BONES 0 \n", - "MUSCLE 0 \n", - "Flesh without bones 0 \n", - "SOFT PARTS 0 \n", - "LIVER 0 \n", - "Whole animal 0 \n", - "GROWING TIPS 0 \n", - "WHOLE ANIMAL 0 \n", - "HEAD 0 \n", - "FLESH WITH SCALES 0 \n", - "Soft parts 0 \n", - "Muscle 0 \n", - "whole plant 0 \n", - "Soft Parts 0 " - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| eval: false\n", - "remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='Body Part', as_df=True),\n", - " maris_lut_fn=bodyparts_lut_path,\n", - " maris_col_id='bodypar_id',\n", - " maris_col_name='bodypar',\n", - " provider_col_to_match='value',\n", - " provider_col_key='value',\n", - " fname_cache='bodyparts_ospar.pkl'\n", - " )\n", - "\n", - "remapper.generate_lookup_table(as_df=True)\n", - "remapper.select_match(match_score_threshold=0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d51c4e3", - "metadata": {}, - "outputs": [], - "source": [ - "#|exports\n", - "# fixes_biota_bodyparts = {}" - ] - }, - { - "cell_type": "markdown", - "id": "a45ea7e6", - "metadata": {}, - "source": [ - "The OSPAR data includes entries with the variable Body Part labelled as `whole`. The Maris data requires that the body `body_part` distinguishes between `Whole animal` and `Whole plant`. The OSPAR data defines the `Biological group` which allows for the Body Part labelled as whole to be defined as `Whole animal` and `Whole plant`. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fba75bc3", - "metadata": {}, - "outputs": [], + " match_score \n", + "source_key \n", + "Mix of muscle and whole fish without liver FISH 31 \n", + "Whole without head FISH 13 \n", + "Cod medallion FISH 13 \n", + "Soft parts Molluscs 9 \n", + "Soft Parts Molluscs 9 \n", + "WHOLE FISH Fish 9 \n", + "WHOLE FISH FISH 9 \n", + "WHOLE ANIMAL Molluscs 9 \n", + "Whole fisk FISH 9 \n", + "FLESH WITHOUT BONES Molluscs 9 \n", + "Whole fish Fish 9 \n", + "UNKNOWN FISH 9 \n", + "SOFT PARTS molluscs 9 \n", + "Whole animal Molluscs 9 \n", + "UNKNOWN Fish 9 \n", + "SOFT PARTS Molluscs 9 \n", + "SOFT PARTS MOLLUSCS 9 \n", + "Whole plant Seaweed 8 \n", + "whole plant Seaweed 8 \n", + "FLESH WITHOUT BONES SEAWEED 8 \n", + "WHOLE PLANT seaweed 8 \n", + "WHOLE PLANT SEAWEED 8 \n", + "GROWING TIPS Seaweed 8 \n", + "WHOLE PLANT Seaweed 8 \n", + "FLESH Fish 7 \n", + "Flesh Fish 7 \n", + "WHOLE Seaweed 6 \n", + "Whole animal Fish 5 \n", + "Muscle FISH 5 \n", + "FLESH WITH SCALES Fish 5 \n", + "FLESH WITHOUT BONES fish 5 \n", + "HEAD Fish 5 \n", + "WHOLE ANIMAL FISH 5 \n", + "FLESH WITHOUT BONES FISH 5 \n", + "LIVER Fish 5 \n", + "WHOLE Fish 5 \n", + "Soft parts Fish 5 \n", + "WHOLE ANIMAL Fish 5 \n", + "Flesh without bones Fish 5 \n", + "MUSCLE Fish 5 \n", + "Whole FISH 5 \n", + "HEAD FISH 5 \n", + "FLESH WITHOUT BONES Fish 5 \n", + "Muscle Fish 5 \n", + "FLESH WITHOUT BONE Fish 4 \n", + "FLESH WITHOUT BONE FISH 4 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#| export\n", - "whole_animal_plant = {'whole' : ['Whole','WHOLE', 'WHOLE FISH', 'Whole fisk', 'Whole fish'],\n", - " 'Whole animal' : ['Molluscs','Fish','FISH','molluscs','fish','MOLLUSCS'],\n", - " 'Whole plant' : ['Seaweed','seaweed','SEAWEED'] }" + "#| eval: false\n", + "remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs_test, col_name='body_part_temp', as_df=True),\n", + " 
maris_lut_fn=bodyparts_lut_path,\n", + " maris_col_id='bodypar_id',\n", + " maris_col_name='bodypar',\n", + " provider_col_to_match='value',\n", + " provider_col_key='value',\n", + " fname_cache='bodyparts_ospar.pkl'\n", + " )\n", + "\n", + "remapper.generate_lookup_table(as_df=True)\n", + "remapper.select_match(match_score_threshold=0)" ] }, { "cell_type": "code", "execution_count": null, - "id": "bfe388f0", + "id": "38e2a17f", "metadata": {}, "outputs": [], "source": [ - "#| export\n", - "class CorrectWholeBodyPartCB(Callback):\n", - " \"\"\"Update body parts labeled as 'whole' to either 'Whole animal' or 'Whole plant'.\"\"\"\n", - " \n", - " def __init__(self, wap: Dict[str, List[str]] = whole_animal_plant):\n", - " fc.store_attr()\n", - "\n", - " def __call__(self, tfm: 'Transformer'):\n", - " self.correct_whole_body_part(tfm.dfs['biota'])\n", - "\n", - " def correct_whole_body_part(self, df: pd.DataFrame):\n", - " df['body_part'] = df['Body Part'] \n", - " self.update_body_part(df, self.wap['whole'], self.wap['Whole animal'], 'Whole animal')\n", - " self.update_body_part(df, self.wap['whole'], self.wap['Whole plant'], 'Whole plant')\n", - "\n", - " def update_body_part(self, df: pd.DataFrame, whole_list: List[str], group_list: List[str], new_value: str):\n", - " mask = (df['body_part'].isin(whole_list)) & (df['Biological group'].isin(group_list))\n", - " df.loc[mask, 'body_part'] = new_value\n" + "#| exports\n", + "lut_bodyparts = lambda: Remapper(provider_lut_df=get_unique_across_dfs(tfm.dfs, col_name='body_part_temp', as_df=True),\n", + " maris_lut_fn=bodyparts_lut_path,\n", + " maris_col_id='bodypar_id',\n", + " maris_col_name='bodypar',\n", + " provider_col_to_match='value',\n", + " provider_col_key='value',\n", + " fname_cache='bodyparts_ospar.pkl'\n", + " ).generate_lookup_table(fixes=fixes_biota_bodyparts, as_df=False, overwrite=False)" ] }, { "cell_type": "code", "execution_count": null, - "id": "2f19f422", + "id": "ac6ee355", "metadata": {}, 
"outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " seawater biota\n", - "Number of rows in dfs 18856 15314\n", - "Number of rows in tfm.dfs 18856 15314\n", - "Number of dropped rows 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 18856 15314 \n", - "\n", - "['SOFT PARTS' 'GROWING TIPS' 'Whole plant' 'Whole animal' 'WHOLE ANIMAL'\n", - " 'FLESH WITHOUT BONES' 'WHOLE PLANT' 'Soft Parts' 'Whole without head'\n", - " 'Cod medallion' 'Muscle' 'Mix of muscle and whole fish without liver'\n", - " 'Flesh' 'FLESH WITHOUT BONE' 'UNKNOWN' 'FLESH' 'FLESH WITH SCALES' 'HEAD'\n", - " 'Flesh without bones' 'Soft parts' 'whole plant' 'LIVER' 'MUSCLE']\n" - ] + "data": { + "text/plain": [ + "array([19, 56, 40, 1, 52, 43, 34, 10, 60, 13, 25])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "#|eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),\n", - " CorrectWholeBodyPartCB(),\n", - " CompareDfsAndTfmCB(dfs)\n", + "tfm = Transformer(dfs, cbs=[ \n", + " RemoveAllNAValuesCB(cols_to_check), \n", + " AddBodypartTempCB(),\n", + " RemapCB(lut_bodyparts, 'body_part', 'body_part_temp' , 'biota')\n", " ])\n", "tfm()\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", - "print(tfm.dfs['biota']['body_part'].unique())" + "tfm.dfs['biota']['body_part'].unique()" ] }, { "cell_type": "markdown", - "id": "05061d66", - "metadata": {}, - "source": [ - "Get a dataframe of matched OSPAR biota tissues with Maris Bodyparts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22eee5c2", + "id": "e7ed9551", "metadata": {}, - "outputs": [], "source": [ - "#|export\n", - "unmatched_fixes_biota_tissues = {}" + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: `biota` dataset includes 1 entry where the `Body Part` is `FLESH WITHOUT BONES` for the `Biological group` 
of `SEAWEED`, see below. \n", + "\n", + ":::" ] }, { "cell_type": "code", "execution_count": null, - "id": "30d2f63b", + "id": "93c19547", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generating lookup table: 0%| | 0/23 [00:00\n", " \n", " \n", - " matched_id\n", - " matched_maris_name\n", - " source_name\n", - " match_score\n", - " \n", - " \n", - " source_id\n", - " \n", - " \n", - " \n", - " \n", + " ID\n", + " Contracting Party\n", + " Sample ID\n", + " Biological group\n", + " Body Part\n", + " Measurement Comment\n", + " Sample Comment\n", " \n", " \n", " \n", " \n", - " Mix of muscle and whole fish without liver\n", - " 52\n", - " Flesh without bones\n", - " Mix of muscle and whole fish without liver\n", - " 27\n", - " \n", - " \n", - " Whole without head\n", - " 52\n", - " Flesh without bones\n", - " Whole without head\n", - " 10\n", - " \n", - " \n", - " Cod medallion\n", - " 8\n", - " Exoskeleton\n", - " Cod medallion\n", - " 9\n", - " \n", - " \n", - " UNKNOWN\n", - " 12\n", - " Skin\n", - " UNKNOWN\n", - " 5\n", - " \n", - " \n", - " FLESH\n", - " 42\n", - " Leaf\n", - " FLESH\n", - " 3\n", - " \n", - " \n", - " Flesh\n", - " 42\n", - " Leaf\n", - " Flesh\n", - " 3\n", - " \n", - " \n", - " FLESH WITHOUT BONE\n", - " 52\n", - " Flesh without bones\n", - " FLESH WITHOUT BONE\n", - " 1\n", - " \n", - " \n", - " LIVER\n", - " 25\n", - " Liver\n", - " LIVER\n", - " 0\n", - " \n", - " \n", - " whole plant\n", - " 40\n", - " Whole plant\n", - " whole plant\n", - " 0\n", - " \n", - " \n", - " Soft parts\n", - " 19\n", - " Soft parts\n", - " Soft parts\n", - " 0\n", - " \n", - " \n", - " Flesh without bones\n", - " 52\n", - " Flesh without bones\n", - " Flesh without bones\n", - " 0\n", - " \n", - " \n", - " HEAD\n", - " 13\n", - " Head\n", - " HEAD\n", - " 0\n", - " \n", - " \n", - " FLESH WITH SCALES\n", - " 60\n", - " Flesh with scales\n", - " FLESH WITH SCALES\n", - " 0\n", - " \n", - " \n", - " SOFT 
PARTS\n", - " 19\n", - " Soft parts\n", - " SOFT PARTS\n", - " 0\n", - " \n", - " \n", - " GROWING TIPS\n", - " 56\n", - " Growing tips\n", - " GROWING TIPS\n", - " 0\n", - " \n", - " \n", - " Muscle\n", - " 34\n", - " Muscle\n", - " Muscle\n", - " 0\n", - " \n", - " \n", - " Soft Parts\n", - " 19\n", - " Soft parts\n", - " Soft Parts\n", - " 0\n", - " \n", - " \n", - " WHOLE PLANT\n", - " 40\n", - " Whole plant\n", - " WHOLE PLANT\n", - " 0\n", - " \n", - " \n", - " FLESH WITHOUT BONES\n", - " 52\n", - " Flesh without bones\n", + " 2660\n", + " 87356\n", + " Iceland\n", + " THFAG17C\n", + " SEAWEED\n", " FLESH WITHOUT BONES\n", - " 0\n", - " \n", - " \n", - " WHOLE ANIMAL\n", - " 1\n", - " Whole animal\n", - " WHOLE ANIMAL\n", - " 0\n", - " \n", - " \n", - " Whole animal\n", - " 1\n", - " Whole animal\n", - " Whole animal\n", - " 0\n", - " \n", - " \n", - " Whole plant\n", - " 40\n", - " Whole plant\n", - " Whole plant\n", - " 0\n", - " \n", - " \n", - " MUSCLE\n", - " 34\n", - " Muscle\n", - " MUSCLE\n", - " 0\n", + " NaN\n", + " NaN\n", " \n", " \n", "\n", "" ], "text/plain": [ - " matched_id matched_maris_name \\\n", - "source_id \n", - "Mix of muscle and whole fish without liver 52 Flesh without bones \n", - "Whole without head 52 Flesh without bones \n", - "Cod medallion 8 Exoskeleton \n", - "UNKNOWN 12 Skin \n", - "FLESH 42 Leaf \n", - "Flesh 42 Leaf \n", - "FLESH WITHOUT BONE 52 Flesh without bones \n", - "LIVER 25 Liver \n", - "whole plant 40 Whole plant \n", - "Soft parts 19 Soft parts \n", - "Flesh without bones 52 Flesh without bones \n", - "HEAD 13 Head \n", - "FLESH WITH SCALES 60 Flesh with scales \n", - "SOFT PARTS 19 Soft parts \n", - "GROWING TIPS 56 Growing tips \n", - "Muscle 34 Muscle \n", - "Soft Parts 19 Soft parts \n", - "WHOLE PLANT 40 Whole plant \n", - "FLESH WITHOUT BONES 52 Flesh without bones \n", - "WHOLE ANIMAL 1 Whole animal \n", - "Whole animal 1 Whole animal \n", - "Whole plant 40 Whole plant \n", - "MUSCLE 34 Muscle \n", - "\n", 
- " source_name \\\n", - "source_id \n", - "Mix of muscle and whole fish without liver Mix of muscle and whole fish without liver \n", - "Whole without head Whole without head \n", - "Cod medallion Cod medallion \n", - "UNKNOWN UNKNOWN \n", - "FLESH FLESH \n", - "Flesh Flesh \n", - "FLESH WITHOUT BONE FLESH WITHOUT BONE \n", - "LIVER LIVER \n", - "whole plant whole plant \n", - "Soft parts Soft parts \n", - "Flesh without bones Flesh without bones \n", - "HEAD HEAD \n", - "FLESH WITH SCALES FLESH WITH SCALES \n", - "SOFT PARTS SOFT PARTS \n", - "GROWING TIPS GROWING TIPS \n", - "Muscle Muscle \n", - "Soft Parts Soft Parts \n", - "WHOLE PLANT WHOLE PLANT \n", - "FLESH WITHOUT BONES FLESH WITHOUT BONES \n", - "WHOLE ANIMAL WHOLE ANIMAL \n", - "Whole animal Whole animal \n", - "Whole plant Whole plant \n", - "MUSCLE MUSCLE \n", + " ID Contracting Party Sample ID Biological group Body Part \\\n", + "2660 87356 Iceland THFAG17C SEAWEED FLESH WITHOUT BONES \n", "\n", - " match_score \n", - "source_id \n", - "Mix of muscle and whole fish without liver 27 \n", - "Whole without head 10 \n", - "Cod medallion 9 \n", - "UNKNOWN 5 \n", - "FLESH 3 \n", - "Flesh 3 \n", - "FLESH WITHOUT BONE 1 \n", - "LIVER 0 \n", - "whole plant 0 \n", - "Soft parts 0 \n", - "Flesh without bones 0 \n", - "HEAD 0 \n", - "FLESH WITH SCALES 0 \n", - "SOFT PARTS 0 \n", - "GROWING TIPS 0 \n", - "Muscle 0 \n", - "Soft Parts 0 \n", - "WHOLE PLANT 0 \n", - "FLESH WITHOUT BONES 0 \n", - "WHOLE ANIMAL 0 \n", - "Whole animal 0 \n", - "Whole plant 0 \n", - "MUSCLE 0 " + " Measurement Comment Sample Comment \n", + "2660 NaN NaN " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs['biota'][['ID','Contracting Party','Sample ID','Biological group','Body Part', 'Measurement Comment', 'Sample Comment']][(tfm.dfs['biota']['Body Part'] == 'FLESH WITHOUT BONES') & (tfm.dfs['biota']['Biological group'] == 'SEAWEED')]" + ] + }, + { + "cell_type": 
"markdown", + "id": "f0b1d387", + "metadata": {}, + "source": [ + "Now we will remove the data entry where the `Body Part` is `FLESH WITHOUT BONES` for the `Biological group` of `SEAWEED`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00e82f60", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "body_part_filters = {\n", + " 'biota': lambda df: df['body_part_temp'] == 'FLESH WITHOUT BONES SEAWEED'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7df73861", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RemoveFilteredRowsCB: Removed 1 rows from 'biota'.\n" + ] + }, + { + "data": { + "text/plain": [ + "array([19, 56, 40, 1, 52, 43, 34, 10, 60, 13, 25])" ] }, "execution_count": null, @@ -7364,42 +7058,72 @@ ], "source": [ "#|eval: false\n", - "tissues_lut_df = get_maris_lut(df_biota=tfm.dfs['biota'], \n", - " fname_cache='tissues_ospar.pkl', \n", - " data_provider_name_col='body_part',\n", - " maris_lut=bodyparts_lut_path,\n", - " maris_id='bodypar_id',\n", - " maris_name='bodypar',\n", - " unmatched_fixes=unmatched_fixes_biota_tissues,\n", - " as_dataframe=True,\n", - " overwrite=True)\n", - "tissues_lut_df" + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[ \n", + " RemoveAllNAValuesCB(cols_to_check), \n", + " AddBodypartTempCB(),\n", + " RemapCB(lut_bodyparts, 'body_part', 'body_part_temp' , 'biota'),\n", + " RemoveFilteredRowsCB(body_part_filters, verbose=True)\n", + " ])\n", + "tfm()\n", + "tfm.dfs['biota']['body_part'].unique()" + ] + }, + { + "cell_type": "markdown", + "id": "892cad61", + "metadata": {}, + "source": [ + "## Remap biogroup" + ] + }, + { + "cell_type": "markdown", + "id": "8f0c00f4", + "metadata": {}, + "source": [ + "#### Lookup : Biogroup" + ] + }, + { + "cell_type": "markdown", + "id": "85e63967", + "metadata": {}, + "source": [ + "The OSAR dataset includes a `Biological group` column. 
\n", + "The maris dataset contains a `Species` column, which links to the `biogroup_id` column of the MARIS nomenclature." + ] + }, + { + "cell_type": "markdown", + "id": "9d308318", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**DISCUSS**: Since the `species_id` column links to the `biogroup_id` column of the MARIS species look up table, do we need to include `biogroup_id` in the netcdf output?\n", + "\n", + ":::" ] }, { "cell_type": "markdown", - "id": "131fe2e1", + "id": "acde215b", "metadata": {}, "source": [ - "List unmatched OSPAR tissues:" + "Use the `Biological group` column to fill `NA` in the `Species` column" ] }, { "cell_type": "code", "execution_count": null, - "id": "6fb05e06", + "id": "9072bd51", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Mix of muscle and whole fish without liver',\n", - " 'Whole without head',\n", - " 'Cod medallion',\n", - " 'UNKNOWN',\n", - " 'FLESH',\n", - " 'Flesh',\n", - " 'FLESH WITHOUT BONE']" + "15313" ] }, "execution_count": null, @@ -7408,22 +7132,97 @@ } ], "source": [ - "#|eval: false\n", - "tissues_lut_df[tissues_lut_df['match_score'] >= 1]['source_name'].tolist()" + "tfm.dfs['biota']['Activity or MDA'].count()" ] }, { "cell_type": "markdown", - "id": "29d91d0d", + "id": "72f662a5", + "metadata": {}, + "source": [ + "HERE HERE, going to use the `Biological group` column to fill `NA` in the `Species` column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a4ba110", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2078" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfm.dfs['biota']['Species'].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f857547e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['ID', 'Contracting Party', 'RSC Sub-division', 'Station ID',\n", + " 'Sample ID', 'LatD', 'LatM', 'LatS', 
'LatDir', 'LongD', 'LongM',\n", + " 'LongS', 'LongDir', 'Sample type', 'Biological group', 'Species',\n", + " 'Body Part', 'Sampling date', 'Nuclide', 'Value type',\n", + " 'Activity or MDA', 'Uncertainty', 'Unit', 'Data provider',\n", + " 'Measurement Comment', 'Sample Comment', 'Reference Comment'],\n", + " dtype='object')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs['biota'].columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f3c7410", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['ID', 'Contracting Party', 'RSC Sub-division', 'Station ID',\n", + " 'Sample ID', 'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM',\n", + " 'LongS', 'LongDir', 'Sample type', 'Biological group', 'Species',\n", + " 'Body Part', 'Sampling date', 'Nuclide', 'Value type',\n", + " 'Activity or MDA', 'Uncertainty', 'Unit', 'Data provider',\n", + " 'Measurement Comment', 'Sample Comment', 'Reference Comment',\n", + " 'body_part_temp', 'body_part'],\n", + " dtype='object')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Read Maris tissue lut to correct unmatched tissues:" + "tfm.dfs['biota'].columns" ] }, { "cell_type": "code", "execution_count": null, - "id": "0f64f480", + "id": "c72d754a", "metadata": {}, "outputs": [ { @@ -7447,200 +7246,74 @@ " \n", " \n", " \n", - " bodypar_id\n", - " bodypar\n", - " bodycode\n", - " groupcode\n", + " Species\n", " \n", " \n", " \n", " \n", " 0\n", - " -1\n", - " Not applicable\n", - " NaN\n", - " NaN\n", + " LITTORINA LITTOREA\n", " \n", " \n", " 1\n", - " 0\n", - " (Not available)\n", - " 0\n", - " 0\n", + " FUCUS VESICULOSUS\n", " \n", " \n", " 2\n", - " 1\n", - " Whole animal\n", - " WHOA\n", - " WHO\n", + " LITTORINA LITTOREA\n", " \n", " \n", " 3\n", - " 2\n", - " Whole animal eviscerated\n", - " WHOEV\n", - " WHO\n", + " LITTORINA LITTOREA\n", " \n", " 
\n", " 4\n", - " 3\n", - " Whole animal eviscerated without head\n", - " WHOHE\n", - " WHO\n", + " FUCUS VESICULOSUS\n", " \n", " \n", " ...\n", " ...\n", - " ...\n", - " ...\n", - " ...\n", " \n", " \n", - " 57\n", - " 56\n", - " Growing tips\n", - " GTIP\n", - " PHAN\n", + " 15309\n", + " LITTORINA LITTOREA\n", " \n", " \n", - " 58\n", - " 57\n", - " Upper parts of plants\n", - " UPPL\n", - " PHAN\n", + " 15310\n", + " Patella sp.\n", " \n", " \n", - " 59\n", - " 58\n", - " Lower parts of plants\n", - " LWPL\n", - " PHAN\n", + " 15311\n", + " Fucus serratus\n", " \n", " \n", - " 60\n", - " 59\n", - " Shells/carapace\n", - " SHCA\n", - " SKEL\n", + " 15312\n", + " Fucus serratus\n", " \n", " \n", - " 61\n", - " 60\n", - " Flesh with scales\n", - " FLES\n", - " FLES\n", + " 15313\n", + " Fucus serratus\n", " \n", " \n", "\n", - "

62 rows × 4 columns

\n", + "

15313 rows × 1 columns

\n", "" ], "text/plain": [ - " bodypar_id bodypar bodycode groupcode\n", - "0 -1 Not applicable NaN NaN\n", - "1 0 (Not available) 0 0\n", - "2 1 Whole animal WHOA WHO\n", - "3 2 Whole animal eviscerated WHOEV WHO\n", - "4 3 Whole animal eviscerated without head WHOHE WHO\n", - ".. ... ... ... ...\n", - "57 56 Growing tips GTIP PHAN\n", - "58 57 Upper parts of plants UPPL PHAN\n", - "59 58 Lower parts of plants LWPL PHAN\n", - "60 59 Shells/carapace SHCA SKEL\n", - "61 60 Flesh with scales FLES FLES\n", + " Species\n", + "0 LITTORINA LITTOREA\n", + "1 FUCUS VESICULOSUS\n", + "2 LITTORINA LITTOREA\n", + "3 LITTORINA LITTOREA\n", + "4 FUCUS VESICULOSUS\n", + "... ...\n", + "15309 LITTORINA LITTOREA\n", + "15310 Patella sp.\n", + "15311 Fucus serratus\n", + "15312 Fucus serratus\n", + "15313 Fucus serratus\n", "\n", - "[62 rows x 4 columns]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#|eval: false\n", - "marisco_lut_df = pd.read_excel(bodyparts_lut_path())\n", - "marisco_lut_df" - ] - }, - { - "cell_type": "markdown", - "id": "6b8c7d03", - "metadata": {}, - "source": [ - "Create a dictionary of unmatched tissues to allow for correctection" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f2c1d3ec", - "metadata": {}, - "outputs": [], - "source": [ - "#|export\n", - "unmatched_fixes_biota_tissues = {\n", - "'Mix of muscle and whole fish without liver' : 'Not available', # Drop\n", - " 'Whole without head' : 'Whole animal eviscerated without head', # Drop? eviscerated? ,\n", - " 'Cod medallion' : 'Whole animal eviscerated without head',\n", - " 'FLESH' : 'Flesh without bones', # Drop? with or without bones?\n", - " 'Flesh' : 'Flesh without bones', # Drop? 
with or without bones?\n", - " 'UNKNOWN' : 'Not available',\n", - " 'FLESH WITHOUT BONE' : 'Flesh without bones'\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3b6076a9", - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'get_maris_lut' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [174], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#|eval: false\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m tissues_lut_df \u001b[38;5;241m=\u001b[39m \u001b[43mget_maris_lut\u001b[49m(df_biota\u001b[38;5;241m=\u001b[39mtfm\u001b[38;5;241m.\u001b[39mdfs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbiota\u001b[39m\u001b[38;5;124m'\u001b[39m], \n\u001b[1;32m 3\u001b[0m fname_cache\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtissues_ospar.pkl\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m 4\u001b[0m data_provider_name_col\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbody_part\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 5\u001b[0m maris_lut\u001b[38;5;241m=\u001b[39mbodyparts_lut_path,\n\u001b[1;32m 6\u001b[0m maris_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbodypar_id\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 7\u001b[0m maris_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbodypar\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 8\u001b[0m unmatched_fixes\u001b[38;5;241m=\u001b[39munmatched_fixes_biota_tissues,\n\u001b[1;32m 9\u001b[0m as_dataframe\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 10\u001b[0m overwrite\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 11\u001b[0m tissues_lut_df\n", - 
"\u001b[0;31mNameError\u001b[0m: name 'get_maris_lut' is not defined" - ] - } - ], - "source": [ - "#|eval: false\n", - "# tissues_lut_df = get_maris_lut(df_biota=tfm.dfs['biota'], \n", - "# fname_cache='tissues_ospar.pkl', \n", - "# data_provider_name_col='body_part',\n", - "# maris_lut=bodyparts_lut_path,\n", - "# maris_id='bodypar_id',\n", - "# maris_name='bodypar',\n", - "# unmatched_fixes=unmatched_fixes_biota_tissues,\n", - "# as_dataframe=True,\n", - "# overwrite=True)\n", - "# tissues_lut_df" - ] - }, - { - "cell_type": "markdown", - "id": "b8b66cf5", - "metadata": {}, - "source": [ - "List unmatched OSPAR tissues:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50eb242f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Mix of muscle and whole fish without liver', 'UNKNOWN']" + "[15313 rows x 1 columns]" ] }, "execution_count": null, @@ -7649,125 +7322,16 @@ } ], "source": [ - "#|eval: false\n", - "tissues_lut_df[tissues_lut_df['match_score'] >= 1]['source_name'].tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "866927f8", - "metadata": {}, - "outputs": [], - "source": [ - "#| export\n", - "class LookupBiotaBodyPartCB(Callback):\n", - " \"\"\"Update body part id based on MARIS dbo_bodypar.xlsx\"\"\"\n", - "\n", - " def __init__(self, fn_lut: Callable, unmatched_fixes_biota_tissues: Dict[str, str]):\n", - " fc.store_attr()\n", - "\n", - " def __call__(self, tfm: 'Transformer'):\n", - " lut = self.fn_lut(df_biota=tfm.dfs['biota'])\n", - " self.drop_nan_species(tfm.dfs['biota'])\n", - " self.drop_unmatched(tfm.dfs['biota'])\n", - " self.perform_lookup(tfm.dfs['biota'], lut)\n", - "\n", - " def drop_nan_species(self, df: pd.DataFrame):\n", - " \"\"\"\n", - " Drop rows where 'body_part' is NaN.\n", - "\n", - " Args:\n", - " df (pd.DataFrame): The DataFrame to process.\n", - " \"\"\"\n", - " df.dropna(subset=['body_part'], inplace=True)\n", - "\n", - " def drop_unmatched(self, df: 
pd.DataFrame):\n", - " \"\"\"\n", - " Drop rows where the 'body_part' is in the unmatched_fixes_biota_tissues list with value 'Not available'.\n", - "\n", - " Args:\n", - " df (pd.DataFrame): The DataFrame to process.\n", - " \"\"\"\n", - " na_list = ['Not available']\n", - " na_biota_tissues = [k for k, v in self.unmatched_fixes_biota_tissues.items() if v in na_list]\n", - " df.drop(df[df['body_part'].isin(na_biota_tissues)].index, inplace=True)\n", - "\n", - " def perform_lookup(self, df: pd.DataFrame, lut: Dict[str, 'Match']):\n", - " \"\"\"\n", - " Perform lookup to update 'body_part' with matched IDs.\n", - "\n", - " Args:\n", - " df (pd.DataFrame): The DataFrame to process.\n", - " lut (Dict[str, Match]): The lookup table.\n", - " \"\"\"\n", - " df['body_part'] = df['body_part'].apply(lambda x: lut[x].matched_id if x in lut else x)\n" + "tfm.dfs['biota'][['Species']]" ] }, { "cell_type": "code", "execution_count": null, - "id": "1162c4c1", + "id": "b57ed98d", "metadata": {}, "outputs": [], - "source": [ - "#|eval: false\n", - "get_maris_bodypart=partial(get_maris_lut, \n", - " fname_cache='tissues_ospar.pkl', \n", - " data_provider_name_col='body_part',\n", - " maris_lut=bodyparts_lut_path,\n", - " maris_id='bodypar_id',\n", - " maris_name='bodypar',\n", - " unmatched_fixes=unmatched_fixes_biota_tissues,\n", - " as_dataframe=False,\n", - " overwrite=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "89baf396", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " seawater biota\n", - "Number of rows in dfs 18856 15314\n", - "Number of rows in tfm.dfs 18856 15308\n", - "Number of dropped rows 0 6\n", - "Number of rows in tfm.dfs + Number of dropped rows 18856 15314 \n", - "\n", - " Body Part body_part\n", - "0 SOFT PARTS 19\n", - "1 GROWING TIPS 56\n", - "2 SOFT PARTS 19\n", - "3 SOFT PARTS 19\n", - "4 GROWING TIPS 56\n" - ] - } - ], - "source": [ - "#|eval: false\n", - "dfs = 
load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),\n", - " CorrectWholeBodyPartCB(),\n", - " LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", - "tfm()\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", - "print(tfm.dfs['biota'][['Body Part', 'body_part']][:5])" - ] - }, - { - "cell_type": "markdown", - "id": "8f0c00f4", - "metadata": {}, - "source": [ - "#### Lookup : Biogroup" - ] + "source": [] }, { "cell_type": "code", diff --git a/nbs/handlers/helcom.ipynb b/nbs/handlers/helcom.ipynb index 9150d13..ba61cd3 100644 --- a/nbs/handlers/helcom.ipynb +++ b/nbs/handlers/helcom.ipynb @@ -402,10 +402,10 @@ "3 WKRIL2012006 1\n", "4 WKRIL2012007 1\n", " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of rows in dfs 20318 37347 14893\n", + "Number of rows in tfm.dfs 20318 37347 14893\n", "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "Number of rows in tfm.dfs + Number of dropped rows 20318 37347 14893 \n", "\n" ] } @@ -470,19 +470,19 @@ "output_type": "stream", "text": [ " index value n_chars stripped_chars\n", - "14 14 CS137 9 5\n", - "20 20 SR90 6 4\n", - "31 31 PU238 8 5\n", - "34 34 CS137 6 5\n", - "37 37 K40 8 3\n", - "53 53 SR90 7 4\n", - "54 54 SR90 5 4\n", - "59 59 SR90 8 4\n", - "62 62 CO60 8 4\n", - "69 69 CS134 8 5\n", - "73 73 TC99 7 4\n", - "75 75 AM241 8 5\n", - "91 91 CS137 8 5\n" + "3 3 CS137 6 5\n", + "8 8 CO60 8 4\n", + "20 20 PU238 8 5\n", + "22 22 TC99 7 4\n", + "24 24 K40 8 3\n", + "29 29 SR90 7 4\n", + "32 32 SR90 5 4\n", + "37 37 SR90 8 4\n", + "66 66 SR90 6 4\n", + "70 70 CS134 8 5\n", + "80 80 CS137 8 5\n", + "81 81 AM241 8 5\n", + "93 93 CS137 9 5\n" ] } ], @@ -612,39 +612,39 @@ " \n", " 0\n", " 0\n", - " pu239240\n", + " zr95\n", " 
\n", " \n", " 1\n", " 1\n", - " cs144\n", + " be7\n", " \n", " \n", " 2\n", " 2\n", - " cs141\n", + " pu241\n", " \n", " \n", " 3\n", " 3\n", - " cs140\n", + " co60\n", " \n", " \n", " 4\n", " 4\n", - " sn117m\n", + " ir192\n", " \n", " \n", "\n", "" ], "text/plain": [ - " index value\n", - "0 0 pu239240\n", - "1 1 cs144\n", - "2 2 cs141\n", - "3 3 cs140\n", - "4 4 sn117m" + " index value\n", + "0 0 zr95\n", + "1 1 be7\n", + "2 2 pu241\n", + "3 3 co60\n", + "4 4 ir192" ] }, "execution_count": null, @@ -704,7 +704,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 77/77 [00:02<00:00, 32.60it/s]\n" + "Processing: 0%| | 0/77 [00:00\n", " \n", " \n", - " pu239240\n", - " pu240\n", - " pu239240\n", - " 3\n", - " \n", - " \n", " pu238240\n", " pu240\n", " pu238240\n", @@ -765,10 +766,10 @@ " 3\n", " \n", " \n", - " cs142\n", - " ce144\n", - " cs142\n", - " 2\n", + " pu239240\n", + " pu240\n", + " pu239240\n", + " 3\n", " \n", " \n", " cs145\n", @@ -783,10 +784,10 @@ " 2\n", " \n", " \n", - " cs144\n", + " cs142\n", " ce144\n", - " cs144\n", - " 1\n", + " cs142\n", + " 2\n", " \n", " \n", " cs141\n", @@ -795,33 +796,39 @@ " 1\n", " \n", " \n", - " cs140\n", - " ce140\n", - " cs140\n", - " 1\n", - " \n", - " \n", " cs138\n", " cs137\n", " cs138\n", " 1\n", " \n", " \n", + " cs140\n", + " ce140\n", + " cs140\n", + " 1\n", + " \n", + " \n", " cs139\n", " ce139\n", " cs139\n", " 1\n", " \n", " \n", + " k-40\n", + " k40\n", + " k-40\n", + " 1\n", + " \n", + " \n", " cs146\n", " cs136\n", " cs146\n", " 1\n", " \n", " \n", - " k-40\n", - " k40\n", - " k-40\n", + " cs144\n", + " ce144\n", + " cs144\n", " 1\n", " \n", " \n", @@ -831,20 +838,20 @@ "text/plain": [ " matched_maris_name source_name match_score\n", "source_key \n", - "pu239240 pu240 pu239240 3\n", "pu238240 pu240 pu238240 3\n", "cm243244 cm244 cm243244 3\n", "cs134137 cs137 cs134137 3\n", - "cs142 ce144 cs142 2\n", + "pu239240 pu240 pu239240 3\n", "cs145 cs136 cs145 2\n", "cs143 
cs127 cs143 2\n", - "cs144 ce144 cs144 1\n", + "cs142 ce144 cs142 2\n", "cs141 ce141 cs141 1\n", - "cs140 ce140 cs140 1\n", "cs138 cs137 cs138 1\n", + "cs140 ce140 cs140 1\n", "cs139 ce139 cs139 1\n", + "k-40 k40 k-40 1\n", "cs146 cs136 cs146 1\n", - "k-40 k40 k-40 1" + "cs144 ce144 cs144 1" ] }, "execution_count": null, @@ -910,7 +917,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 77/77 [00:01<00:00, 47.36it/s]\n" + "Processing: 0%| | 0/77 [00:00...\n", " \n", " \n", - " 15822\n", + " 14888\n", + " ra226\n", + " 53\n", + " \n", + " \n", + " 14889\n", " k40\n", " 4\n", " \n", " \n", - " 15823\n", + " 14890\n", " cs137\n", " 33\n", " \n", " \n", - " 15824\n", - " be7\n", - " 2\n", - " \n", - " \n", - " 15825\n", - " k40\n", - " 4\n", + " 14891\n", + " ra226\n", + " 53\n", " \n", " \n", - " 15826\n", + " 14892\n", " cs137\n", " 33\n", " \n", " \n", "\n", - "

15827 rows × 2 columns

\n", + "

14893 rows × 2 columns

\n", "" ], "text/plain": [ @@ -1129,13 +1143,13 @@ "3 cs137 33\n", "4 cs134 31\n", "... ... ...\n", - "15822 k40 4\n", - "15823 cs137 33\n", - "15824 be7 2\n", - "15825 k40 4\n", - "15826 cs137 33\n", + "14888 ra226 53\n", + "14889 k40 4\n", + "14890 cs137 33\n", + "14891 ra226 53\n", + "14892 cs137 33\n", "\n", - "[15827 rows x 2 columns]" + "[14893 rows x 2 columns]" ] }, "execution_count": null, @@ -1187,9 +1201,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "seawater DATE null values: 502\n", + "seawater DATE null values: 494\n", "sediment DATE null values: 741\n", - "biota DATE null values: 72\n" + "biota DATE null values: 84\n" ] } ], @@ -1266,10 +1280,10 @@ "output_type": "stream", "text": [ " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of rows in dfs 20318 37347 14893\n", + "Number of rows in tfm.dfs 20318 37347 14893\n", "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "Number of rows in tfm.dfs + Number of dropped rows 20318 37347 14893 \n", "\n", " begperiod time\n", "0 2012-05-23 2012-05-23\n", @@ -1278,13 +1292,13 @@ "3 2012-05-24 2012-05-24\n", "4 2012-05-24 2012-05-24\n", "... ... 
...\n", - "21211 2021-10-15 2021-10-15\n", - "21212 2021-11-04 2021-11-04\n", - "21213 2021-10-15 2021-10-15\n", - "21214 2021-05-17 2021-05-17\n", - "21215 2021-05-13 2021-05-13\n", + "20313 2015-06-22 2015-06-22\n", + "20314 2015-06-23 2015-06-23\n", + "20315 2015-06-23 2015-06-23\n", + "20316 2015-06-24 2015-06-24\n", + "20317 2015-06-24 2015-06-24\n", "\n", - "[21216 rows x 2 columns]\n" + "[20318 rows x 2 columns]\n" ] } ], @@ -1325,13 +1339,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "8 of 21216 entries for `time` are invalid for seawater.\n", - "1 of 39817 entries for `time` are invalid for sediment.\n", + "1 of 37347 entries for `time` are invalid for sediment.\n", " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21208 39816 15827\n", - "Number of dropped rows 8 1 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "Number of rows in dfs 20318 37347 14893\n", + "Number of rows in tfm.dfs 20318 37346 14893\n", + "Number of dropped rows 0 1 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 20318 37347 14893 \n", "\n" ] } @@ -1406,19 +1419,19 @@ " NaN\n", " NaN\n", " 5.3\n", - " 32.000000\n", + " 32.0\n", " 08/20/14 00:00:00\n", - " 90.0\n", + " 90\n", " KRIL\n", - " 2012003.0\n", + " 2012003\n", " ...\n", " NaN\n", " 0.0\n", " NaN\n", " NaN\n", " NaN\n", - " 11.0\n", - " 11.0\n", + " 11\n", + " 11\n", " 08/20/14 00:00:00\n", " 1337731200\n", " 2012-05-23\n", @@ -1430,19 +1443,19 @@ " NaN\n", " NaN\n", " 19.9\n", - " 20.000000\n", + " 20.0\n", " 08/20/14 00:00:00\n", - " 90.0\n", + " 90\n", " KRIL\n", - " 2012004.0\n", + " 2012004\n", " ...\n", " NaN\n", " 29.0\n", " NaN\n", " NaN\n", " NaN\n", - " 11.0\n", - " 11.0\n", + " 11\n", + " 11\n", " 08/20/14 00:00:00\n", " 1337731200\n", " 2012-05-23\n", @@ -1454,19 +1467,19 @@ " NaN\n", " NaN\n", " 25.5\n", - " 20.000000\n", + " 20.0\n", " 08/20/14 00:00:00\n", - " 90.0\n", + " 90\n", " KRIL\n", - " 
2012005.0\n", + " 2012005\n", " ...\n", " NaN\n", " 0.0\n", " NaN\n", " NaN\n", " NaN\n", - " 11.0\n", - " 3.0\n", + " 11\n", + " 3\n", " 08/20/14 00:00:00\n", " 1339891200\n", " 2012-06-17\n", @@ -1478,19 +1491,19 @@ " NaN\n", " NaN\n", " 17.0\n", - " 29.000000\n", + " 29.0\n", " 08/20/14 00:00:00\n", - " 90.0\n", + " 90\n", " KRIL\n", - " 2012006.0\n", + " 2012006\n", " ...\n", " NaN\n", " 0.0\n", " NaN\n", " NaN\n", " NaN\n", - " 11.0\n", - " 11.0\n", + " 11\n", + " 11\n", " 08/20/14 00:00:00\n", " 1337817600\n", " 2012-05-24\n", @@ -1502,19 +1515,19 @@ " NaN\n", " NaN\n", " 22.2\n", - " 18.000000\n", + " 18.0\n", " 08/20/14 00:00:00\n", - " 90.0\n", + " 90\n", " KRIL\n", - " 2012007.0\n", + " 2012007\n", " ...\n", " NaN\n", " 39.0\n", " NaN\n", " NaN\n", " NaN\n", - " 11.0\n", - " 11.0\n", + " 11\n", + " 11\n", " 08/20/14 00:00:00\n", " 1337817600\n", " 2012-05-24\n", @@ -1544,169 +1557,169 @@ " ...\n", " \n", " \n", - " 21211\n", - " WSSSM2021005\n", - " H3\n", - " SSM45\n", + " 20313\n", + " WDHIG2015227\n", + " SR90\n", + " DHIG02\n", " NaN\n", - " 1030.0\n", - " 93.203883\n", - " 09/06/22 00:00:00\n", - " 77.0\n", - " SSSM\n", - " 202105.0\n", + " 6.6\n", + " 7.5\n", + " 11/22/16 00:00:00\n", + " 6\n", + " DHIG\n", + " 2015227\n", " ...\n", + " 12.0\n", + " 4.0\n", + " 7.5\n", " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " N\n", - " 1.0\n", - " 8.0\n", - " 09/06/22 00:00:00\n", - " 1634256000\n", - " 2021-10-15\n", + " F\n", + " 2\n", + " 6\n", + " 11/22/16 00:00:00\n", + " 1434931200\n", + " 2015-06-22\n", " \n", " \n", - " 21212\n", - " WSSSM2021006\n", - " H3\n", - " SSM45\n", + " 20314\n", + " WDHIG2015237\n", + " SR90\n", + " DHIG02\n", " NaN\n", - " 2240.0\n", - " 43.303571\n", - " 09/06/22 00:00:00\n", - " 77.0\n", - " SSSM\n", - " 202106.0\n", + " 6.9\n", + " 7.5\n", + " 11/22/16 00:00:00\n", + " 6\n", + " DHIG\n", + " 2015237\n", " ...\n", + " 20.0\n", + " 4.0\n", + " 7.7\n", " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " N\n", - " 10.0\n", - 
" 10.0\n", - " 09/06/22 00:00:00\n", - " 1635984000\n", - " 2021-11-04\n", + " F\n", + " 2\n", + " 6\n", + " 11/22/16 00:00:00\n", + " 1435017600\n", + " 2015-06-23\n", " \n", " \n", - " 21213\n", - " WSSSM2021007\n", - " H3\n", - " SSM45\n", + " 20315\n", + " WDHIG2015239\n", + " SR90\n", + " DHIG02\n", " NaN\n", - " 2060.0\n", - " 47.087379\n", - " 09/06/22 00:00:00\n", - " 77.0\n", - " SSSM\n", - " 202107.0\n", + " 6.8\n", + " 7.5\n", + " 11/22/16 00:00:00\n", + " 6\n", + " DHIG\n", + " 2015239\n", " ...\n", + " 17.0\n", + " 4.0\n", + " 7.8\n", " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " N\n", - " 12.0\n", - " 12.0\n", - " 09/06/22 00:00:00\n", - " 1634256000\n", - " 2021-10-15\n", + " F\n", + " 2\n", + " 2\n", + " 11/22/16 00:00:00\n", + " 1435017600\n", + " 2015-06-23\n", " \n", " \n", - " 21214\n", - " WSSSM2021008\n", - " H3\n", - " SSM45\n", + " 20316\n", + " WDHIG2015255\n", + " SR90\n", + " DHIG02\n", " NaN\n", - " 2300.0\n", - " 43.478261\n", - " 09/06/22 00:00:00\n", - " 77.0\n", - " SSSM\n", - " 202108.0\n", + " 7.3\n", + " 7.5\n", + " 11/22/16 00:00:00\n", + " 6\n", + " DHIG\n", + " 2015255\n", " ...\n", + " 47.0\n", + " 4.0\n", + " 8.4\n", " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " N\n", - " 12.0\n", - " 12.0\n", - " 09/06/22 00:00:00\n", - " 1621209600\n", - " 2021-05-17\n", + " F\n", + " 2\n", + " 2\n", + " 11/22/16 00:00:00\n", + " 1435104000\n", + " 2015-06-24\n", " \n", " \n", - " 21215\n", - " WSSSM2021004\n", - " H3\n", - " SSM45\n", - " <\n", - " NaN\n", + " 20317\n", + " WDHIG2015256\n", + " SR90\n", + " DHIG02\n", " NaN\n", - " 09/06/22 00:00:00\n", - " 77.0\n", - " SSSM\n", - " 202104.0\n", + " 5.5\n", + " 7.6\n", + " 11/22/16 00:00:00\n", + " 6\n", + " DHIG\n", + " 2015256\n", " ...\n", + " 47.0\n", + " 45.0\n", + " 15.9\n", " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " N\n", - " 15.0\n", - " 18.0\n", - " 09/06/22 00:00:00\n", - " 1620864000\n", - " 2021-05-13\n", + " F\n", + " 2\n", + " 2\n", + " 11/22/16 00:00:00\n", + 
" 1435104000\n", + " 2015-06-24\n", " \n", " \n", "\n", - "

21208 rows × 29 columns

\n", + "

20318 rows × 29 columns

\n", "" ], "text/plain": [ - " KEY NUCLIDE METHOD < VALUE_Bq/m³ VALUE_Bq/m³ ERROR%_m³ \\\n", - "0 WKRIL2012003 CS137 NaN NaN 5.3 32.000000 \n", - "1 WKRIL2012004 CS137 NaN NaN 19.9 20.000000 \n", - "2 WKRIL2012005 CS137 NaN NaN 25.5 20.000000 \n", - "3 WKRIL2012006 CS137 NaN NaN 17.0 29.000000 \n", - "4 WKRIL2012007 CS137 NaN NaN 22.2 18.000000 \n", - "... ... ... ... ... ... ... \n", - "21211 WSSSM2021005 H3 SSM45 NaN 1030.0 93.203883 \n", - "21212 WSSSM2021006 H3 SSM45 NaN 2240.0 43.303571 \n", - "21213 WSSSM2021007 H3 SSM45 NaN 2060.0 47.087379 \n", - "21214 WSSSM2021008 H3 SSM45 NaN 2300.0 43.478261 \n", - "21215 WSSSM2021004 H3 SSM45 < NaN NaN \n", + " KEY NUCLIDE METHOD < VALUE_Bq/m³ VALUE_Bq/m³ ERROR%_m³ \\\n", + "0 WKRIL2012003 CS137 NaN NaN 5.3 32.0 \n", + "1 WKRIL2012004 CS137 NaN NaN 19.9 20.0 \n", + "2 WKRIL2012005 CS137 NaN NaN 25.5 20.0 \n", + "3 WKRIL2012006 CS137 NaN NaN 17.0 29.0 \n", + "4 WKRIL2012007 CS137 NaN NaN 22.2 18.0 \n", + "... ... ... ... ... ... ... \n", + "20313 WDHIG2015227 SR90 DHIG02 NaN 6.6 7.5 \n", + "20314 WDHIG2015237 SR90 DHIG02 NaN 6.9 7.5 \n", + "20315 WDHIG2015239 SR90 DHIG02 NaN 6.8 7.5 \n", + "20316 WDHIG2015255 SR90 DHIG02 NaN 7.3 7.5 \n", + "20317 WDHIG2015256 SR90 DHIG02 NaN 5.5 7.6 \n", "\n", - " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... TDEPTH SDEPTH \\\n", - "0 08/20/14 00:00:00 90.0 KRIL 2012003.0 ... NaN 0.0 \n", - "1 08/20/14 00:00:00 90.0 KRIL 2012004.0 ... NaN 29.0 \n", - "2 08/20/14 00:00:00 90.0 KRIL 2012005.0 ... NaN 0.0 \n", - "3 08/20/14 00:00:00 90.0 KRIL 2012006.0 ... NaN 0.0 \n", - "4 08/20/14 00:00:00 90.0 KRIL 2012007.0 ... NaN 39.0 \n", - "... ... ... ... ... ... ... ... \n", - "21211 09/06/22 00:00:00 77.0 SSSM 202105.0 ... NaN 1.0 \n", - "21212 09/06/22 00:00:00 77.0 SSSM 202106.0 ... NaN 1.0 \n", - "21213 09/06/22 00:00:00 77.0 SSSM 202107.0 ... NaN 1.0 \n", - "21214 09/06/22 00:00:00 77.0 SSSM 202108.0 ... NaN 1.0 \n", - "21215 09/06/22 00:00:00 77.0 SSSM 202104.0 ... 
NaN 1.0 \n", + " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... TDEPTH SDEPTH \\\n", + "0 08/20/14 00:00:00 90 KRIL 2012003 ... NaN 0.0 \n", + "1 08/20/14 00:00:00 90 KRIL 2012004 ... NaN 29.0 \n", + "2 08/20/14 00:00:00 90 KRIL 2012005 ... NaN 0.0 \n", + "3 08/20/14 00:00:00 90 KRIL 2012006 ... NaN 0.0 \n", + "4 08/20/14 00:00:00 90 KRIL 2012007 ... NaN 39.0 \n", + "... ... ... ... ... ... ... ... \n", + "20313 11/22/16 00:00:00 6 DHIG 2015227 ... 12.0 4.0 \n", + "20314 11/22/16 00:00:00 6 DHIG 2015237 ... 20.0 4.0 \n", + "20315 11/22/16 00:00:00 6 DHIG 2015239 ... 17.0 4.0 \n", + "20316 11/22/16 00:00:00 6 DHIG 2015255 ... 47.0 4.0 \n", + "20317 11/22/16 00:00:00 6 DHIG 2015256 ... 47.0 45.0 \n", "\n", " SALIN TTEMP FILT MORS_SUBBASIN HELCOM_SUBBASIN DATE_OF_ENTRY_y \\\n", - "0 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", - "1 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", - "2 NaN NaN NaN 11.0 3.0 08/20/14 00:00:00 \n", - "3 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", - "4 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", + "0 NaN NaN NaN 11 11 08/20/14 00:00:00 \n", + "1 NaN NaN NaN 11 11 08/20/14 00:00:00 \n", + "2 NaN NaN NaN 11 3 08/20/14 00:00:00 \n", + "3 NaN NaN NaN 11 11 08/20/14 00:00:00 \n", + "4 NaN NaN NaN 11 11 08/20/14 00:00:00 \n", "... ... ... ... ... ... ... \n", - "21211 NaN NaN N 1.0 8.0 09/06/22 00:00:00 \n", - "21212 NaN NaN N 10.0 10.0 09/06/22 00:00:00 \n", - "21213 NaN NaN N 12.0 12.0 09/06/22 00:00:00 \n", - "21214 NaN NaN N 12.0 12.0 09/06/22 00:00:00 \n", - "21215 NaN NaN N 15.0 18.0 09/06/22 00:00:00 \n", + "20313 7.5 NaN F 2 6 11/22/16 00:00:00 \n", + "20314 7.7 NaN F 2 6 11/22/16 00:00:00 \n", + "20315 7.8 NaN F 2 2 11/22/16 00:00:00 \n", + "20316 8.4 NaN F 2 2 11/22/16 00:00:00 \n", + "20317 15.9 NaN F 2 2 11/22/16 00:00:00 \n", "\n", " time begperiod \n", "0 1337731200 2012-05-23 \n", @@ -1715,13 +1728,13 @@ "3 1337817600 2012-05-24 \n", "4 1337817600 2012-05-24 \n", "... ... ... 
\n", - "21211 1634256000 2021-10-15 \n", - "21212 1635984000 2021-11-04 \n", - "21213 1634256000 2021-10-15 \n", - "21214 1621209600 2021-05-17 \n", - "21215 1620864000 2021-05-13 \n", + "20313 1434931200 2015-06-22 \n", + "20314 1435017600 2015-06-23 \n", + "20315 1435017600 2015-06-23 \n", + "20316 1435104000 2015-06-24 \n", + "20317 1435104000 2015-06-24 \n", "\n", - "[21208 rows x 29 columns]" + "[20318 rows x 29 columns]" ] }, "execution_count": null, @@ -1795,10 +1808,10 @@ "output_type": "stream", "text": [ " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21122 39532 15798\n", - "Number of dropped rows 94 285 29\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "Number of rows in dfs 20318 37347 14893\n", + "Number of rows in tfm.dfs 20242 37090 14873\n", + "Number of dropped rows 76 257 20\n", + "Number of rows in tfm.dfs + Number of dropped rows 20318 37347 14893 \n", "\n" ] } @@ -2085,7 +2098,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 46/46 [00:07<00:00, 6.41it/s]\n" + "Processing: 0%| | 0/43 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RUBIN_IDRUBINSCIENTIFIC NAMEENGLISH NAME
011ABRA BRAABRAMIS BRAMABREAM
112ANGU ANGANGUILLA ANGUILLAEEL
213ARCT ISLARCTICA ISLANDICAISLAND CYPRINE
314ASTE RUBASTERIAS RUBENSCOMMON STARFISH
415CARD EDUCARDIUM EDULECOCKLE
540CH HI;BACHARA BALTICABALTIC STONEWORT
62CLAD GLOCLADOPHORA GLOMERATAGREEN ALGAE
75CLUP HARCLUPEA HARENGUSHERRING
816CRAN CRACRANGON CRANGONBROWN SHRIMP
917CYPR CARCYPRINUS CARPIOCARP
1018ENCH CIMENCHINODERMATA CIMB
1137ENGR ENCENGRAULIS ENCRASICOLUSEUROPEAN ANCHOVY
1219ESOX LUCESOX LUCIUSPIKE
1320FISHLARVAEFISH LARVAELARVAE
141FUCU VESFUCUS VESICULOSUSBLADDERWRACK
1539FURC LUMFURCELLARIA LUMBRICALISRED SEAWEED
166GADU MORGADUS MORHUACOD
1734GAST ACUGASTEROSTEUS ACULEATUS3-SPINNED STICKLEBACK
1836GYMN CERGYMNOCEPHALUS CERNUARUFFE
1921LAMI SACLAMINARIA SACCHARINASUGAR KELP
209LIMA LIMLIMANDA LIMANDADAB
213MACO BALMACOMA BALTICAMACOMA BALTICA
2222MERL MNGMERLANGIUS MERLANGUSWHITING
2323MYA AREMYA ARENARIASOFT-SHELLED CLAM
2424MYOX SCOMYOXOCEPHALUS SCORPIUSSHORT-HORN SCULPIN
2525MYTI EDUMYTILUS EDULISBLUE MUSSEL
2626OSME EPEOSMERUS EPERLANUSSMELT
2727PERC FLUPERCA FLUVIATILISPERCH
2828PLANKTONPLANKTONPLANKTON
297PLAT FLEPLATICHTHYS FLESUSFLOUNDER
308PLEU PLAPLEURONECTES PLATESSAPLAICE
3141POLY FUCPOLYSIPHONIA FUCOIDESBLACK SIPHON WEED
3210PSET MAXPSETTA MAXIMATURBOT
3329RHODOPHYRHODOPHYTARED ALGAE
3430RUTI RUTRUTILUS RUTILUSROUCH
354SADU ENTSADURIA ENTOMONSADURIA
3633SCOM SCOSCOMBER SCOMBRUSATLANTIC MACKEREL
3738SOLE SOLSOLEA SOLEASEA TONGUE
3831SPRA SPRSPRATTUS SPRATTUSSPRAT
3932STIZ LUCSTIZOSTEDION LUCIOPERCAPIKEPERCH
4042STUC PECSTUCKENIA PECTINATESAGO PONDWEED
4135ZOAR VIVZOARCES VIVIPARUSEELPOUT
4243ZANN PALUZANNICHELLIA PALUSTRISHORNED PONDWEED
\n", + "" + ], + "text/plain": [ + " RUBIN_ID RUBIN SCIENTIFIC NAME ENGLISH NAME\n", + "0 11 ABRA BRA ABRAMIS BRAMA BREAM\n", + "1 12 ANGU ANG ANGUILLA ANGUILLA EEL\n", + "2 13 ARCT ISL ARCTICA ISLANDICA ISLAND CYPRINE\n", + "3 14 ASTE RUB ASTERIAS RUBENS COMMON STARFISH\n", + "4 15 CARD EDU CARDIUM EDULE COCKLE\n", + "5 40 CH HI;BA CHARA BALTICA BALTIC STONEWORT\n", + "6 2 CLAD GLO CLADOPHORA GLOMERATA GREEN ALGAE\n", + "7 5 CLUP HAR CLUPEA HARENGUS HERRING\n", + "8 16 CRAN CRA CRANGON CRANGON BROWN SHRIMP\n", + "9 17 CYPR CAR CYPRINUS CARPIO CARP\n", + "10 18 ENCH CIM ENCHINODERMATA CIM B\n", + "11 37 ENGR ENC ENGRAULIS ENCRASICOLUS EUROPEAN ANCHOVY\n", + "12 19 ESOX LUC ESOX LUCIUS PIKE\n", + "13 20 FISHLARVAE FISH LARVAE LARVAE\n", + "14 1 FUCU VES FUCUS VESICULOSUS BLADDERWRACK\n", + "15 39 FURC LUM FURCELLARIA LUMBRICALIS RED SEAWEED\n", + "16 6 GADU MOR GADUS MORHUA COD\n", + "17 34 GAST ACU GASTEROSTEUS ACULEATUS 3-SPINNED STICKLEBACK\n", + "18 36 GYMN CER GYMNOCEPHALUS CERNUA RUFFE\n", + "19 21 LAMI SAC LAMINARIA SACCHARINA SUGAR KELP\n", + "20 9 LIMA LIM LIMANDA LIMANDA DAB\n", + "21 3 MACO BAL MACOMA BALTICA MACOMA BALTICA\n", + "22 22 MERL MNG MERLANGIUS MERLANGUS WHITING\n", + "23 23 MYA ARE MYA ARENARIA SOFT-SHELLED CLAM\n", + "24 24 MYOX SCO MYOXOCEPHALUS SCORPIUS SHORT-HORN SCULPIN\n", + "25 25 MYTI EDU MYTILUS EDULIS BLUE MUSSEL\n", + "26 26 OSME EPE OSMERUS EPERLANUS SMELT\n", + "27 27 PERC FLU PERCA FLUVIATILIS PERCH\n", + "28 28 PLANKTON PLANKTON PLANKTON\n", + "29 7 PLAT FLE PLATICHTHYS FLESUS FLOUNDER\n", + "30 8 PLEU PLA PLEURONECTES PLATESSA PLAICE\n", + "31 41 POLY FUC POLYSIPHONIA FUCOIDES BLACK SIPHON WEED\n", + "32 10 PSET MAX PSETTA MAXIMA TURBOT\n", + "33 29 RHODOPHY RHODOPHYTA RED ALGAE\n", + "34 30 RUTI RUT RUTILUS RUTILUS ROUCH\n", + "35 4 SADU ENT SADURIA ENTOMON SADURIA\n", + "36 33 SCOM SCO SCOMBER SCOMBRUS ATLANTIC MACKEREL\n", + "37 38 SOLE SOL SOLEA SOLEA SEA TONGUE\n", + "38 31 SPRA SPR SPRATTUS SPRATTUS SPRAT\n", + "39 32 
STIZ LUC STIZOSTEDION LUCIOPERCA PIKEPERCH\n", + "40 42 STUC PEC STUCKENIA PECTINATE SAGO PONDWEED\n", + "41 35 ZOAR VIV ZOARCES VIVIPARUS EELPOUT\n", + "42 43 ZANN PALU ZANNICHELLIA PALUSTRIS HORNED PONDWEED" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "provider_lut_df=pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')\n", + "provider_lut_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0d34899", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'CARD EDU': Match(matched_id=274, matched_maris_name='Cerastoderma edule', source_name='CARDIUM EDULE', match_score=0),\n", + " 'ENCH CIM': Match(matched_id=276, matched_maris_name='Echinodermata', source_name='ENCHINODERMATA CIM', match_score=5),\n", + " 'LAMI SAC': Match(matched_id=110, matched_maris_name='Saccharina latissima', source_name='LAMINARIA SACCHARINA', match_score=0),\n", + " 'MACO BAL': Match(matched_id=122, matched_maris_name='Macoma balthica', source_name='MACOMA BALTICA', match_score=1),\n", + " 'PSET MAX': Match(matched_id=281, matched_maris_name='Scophthalmus maximus', source_name='PSETTA MAXIMA', match_score=0),\n", + " 'STIZ LUC': Match(matched_id=285, matched_maris_name='Sander lucioperca', source_name='STIZOSTEDION LUCIOPERCA', match_score=1),\n", + " 'STUC PEC': Match(matched_id=704, matched_maris_name='Stuckenia pectinata', source_name='STUCKENIA PECTINATE', match_score=1),\n", + " 'ABRA BRA': Match(matched_id=271, matched_maris_name='Abramis brama', source_name='ABRAMIS BRAMA', match_score=0),\n", + " 'ANGU ANG': Match(matched_id=272, matched_maris_name='Anguilla anguilla', source_name='ANGUILLA ANGUILLA', match_score=0),\n", + " 'ARCT ISL': Match(matched_id=273, matched_maris_name='Arctica islandica', source_name='ARCTICA ISLANDICA', match_score=0),\n", + " 'ASTE RUB': Match(matched_id=21, matched_maris_name='Asterias rubens', source_name='ASTERIAS RUBENS', match_score=0),\n", + " 
'CH HI;BA': Match(matched_id=1611, matched_maris_name='Chara baltica', source_name='CHARA BALTICA', match_score=0),\n", + " 'CLAD GLO': Match(matched_id=290, matched_maris_name='Cladophora glomerata', source_name='CLADOPHORA GLOMERATA', match_score=0),\n", + " 'CLUP HAR': Match(matched_id=50, matched_maris_name='Clupea harengus', source_name='CLUPEA HARENGUS', match_score=0),\n", + " 'CRAN CRA': Match(matched_id=59, matched_maris_name='Crangon crangon', source_name='CRANGON CRANGON', match_score=0),\n", + " 'CYPR CAR': Match(matched_id=275, matched_maris_name='Cyprinus carpio', source_name='CYPRINUS CARPIO', match_score=0),\n", + " 'ENGR ENC': Match(matched_id=84, matched_maris_name='Engraulis encrasicolus', source_name='ENGRAULIS ENCRASICOLUS', match_score=0),\n", + " 'ESOX LUC': Match(matched_id=269, matched_maris_name='Esox lucius', source_name='ESOX LUCIUS', match_score=0),\n", + " 'FISHLARVAE': Match(matched_id=277, matched_maris_name='Fish larvae', source_name='FISH LARVAE', match_score=0),\n", + " 'FUCU VES': Match(matched_id=96, matched_maris_name='Fucus vesiculosus', source_name='FUCUS VESICULOSUS', match_score=0),\n", + " 'FURC LUM': Match(matched_id=289, matched_maris_name='Furcellaria lumbricalis', source_name='FURCELLARIA LUMBRICALIS', match_score=0),\n", + " 'GADU MOR': Match(matched_id=99, matched_maris_name='Gadus morhua', source_name='GADUS MORHUA', match_score=0),\n", + " 'GAST ACU': Match(matched_id=286, matched_maris_name='Gasterosteus aculeatus', source_name='GASTEROSTEUS ACULEATUS', match_score=0),\n", + " 'GYMN CER': Match(matched_id=288, matched_maris_name='Gymnocephalus cernua', source_name='GYMNOCEPHALUS CERNUA', match_score=0),\n", + " 'LIMA LIM': Match(matched_id=270, matched_maris_name='Limanda limanda', source_name='LIMANDA LIMANDA', match_score=0),\n", + " 'MERL MNG': Match(matched_id=139, matched_maris_name='Merlangius merlangus', source_name='MERLANGIUS MERLANGUS', match_score=0),\n", + " 'MYA ARE': Match(matched_id=120, 
matched_maris_name='Mya arenaria', source_name='MYA ARENARIA', match_score=0),\n", + " 'MYOX SCO': Match(matched_id=278, matched_maris_name='Myoxocephalus scorpius', source_name='MYOXOCEPHALUS SCORPIUS', match_score=0),\n", + " 'MYTI EDU': Match(matched_id=129, matched_maris_name='Mytilus edulis', source_name='MYTILUS EDULIS', match_score=0),\n", + " 'OSME EPE': Match(matched_id=279, matched_maris_name='Osmerus eperlanus', source_name='OSMERUS EPERLANUS', match_score=0),\n", + " 'PERC FLU': Match(matched_id=247, matched_maris_name='Perca fluviatilis', source_name='PERCA FLUVIATILIS', match_score=0),\n", + " 'PLANKTON': Match(matched_id=280, matched_maris_name='Plankton', source_name='PLANKTON', match_score=0),\n", + " 'PLAT FLE': Match(matched_id=191, matched_maris_name='Platichthys flesus', source_name='PLATICHTHYS FLESUS', match_score=0),\n", + " 'PLEU PLA': Match(matched_id=192, matched_maris_name='Pleuronectes platessa', source_name='PLEURONECTES PLATESSA', match_score=0),\n", + " 'POLY FUC': Match(matched_id=245, matched_maris_name='Polysiphonia fucoides', source_name='POLYSIPHONIA FUCOIDES', match_score=0),\n", + " 'RHODOPHY': Match(matched_id=282, matched_maris_name='Rhodophyta', source_name='RHODOPHYTA', match_score=0),\n", + " 'RUTI RUT': Match(matched_id=283, matched_maris_name='Rutilus rutilus', source_name='RUTILUS RUTILUS', match_score=0),\n", + " 'SADU ENT': Match(matched_id=284, matched_maris_name='Saduria entomon', source_name='SADURIA ENTOMON', match_score=0),\n", + " 'SCOM SCO': Match(matched_id=244, matched_maris_name='Scomber scombrus', source_name='SCOMBER SCOMBRUS', match_score=0),\n", + " 'SOLE SOL': Match(matched_id=397, matched_maris_name='Solea solea', source_name='SOLEA SOLEA', match_score=0),\n", + " 'SPRA SPR': Match(matched_id=243, matched_maris_name='Sprattus sprattus', source_name='SPRATTUS SPRATTUS', match_score=0),\n", + " 'ZOAR VIV': Match(matched_id=287, matched_maris_name='Zoarces viviparus', source_name='ZOARCES VIVIPARUS', 
match_score=0),\n", + " 'ZANN PALU': Match(matched_id=1524, matched_maris_name='Zannichellia palustris', source_name='ZANNICHELLIA PALUSTRIS', match_score=0)}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lut_biota()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -2364,8 +2850,7 @@ "text": [ "[ 99 243 50 139 270 192 191 284 84 269 122 96 287 279\n", " 278 288 286 244 129 275 271 285 283 247 120 59 280 274\n", - " 273 290 289 272 277 276 21 282 110 281 245 704 1524 703\n", - " 1611 621 60]\n" + " 273 290 289 272 277 276 21 282 110 281 245 704 1524]\n" ] } ], @@ -2480,7 +2965,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 29/29 [00:00<00:00, 139.43it/s]\n" + "Processing: 0%| | 0/29 [00:00\n", " \n", " 1\n", + " 30\n", + " SILT AND GRAVEL\n", + " YES\n", + " \n", + " \n", + " 2\n", " 0\n", " GRAVEL\n", " YES\n", " \n", " \n", - " 2\n", + " 3\n", " 1\n", " SAND\n", " YES\n", " \n", " \n", - " 3\n", + " 4\n", " 2\n", " FINE SAND\n", " NO\n", " \n", - " \n", - " 4\n", - " 3\n", - " SILT\n", - " YES\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " SEDI SEDIMENT TYPE RECOMMENDED TO BE USED\n", - "0 -99 NO DATA NaN\n", - "1 0 GRAVEL YES\n", - "2 1 SAND YES\n", - "3 2 FINE SAND NO\n", - "4 3 SILT YES" + " SEDI SEDIMENT TYPE RECOMMENDED TO BE USED\n", + "0 -99 NO DATA NaN\n", + "1 30 SILT AND GRAVEL YES\n", + "2 0 GRAVEL YES\n", + "3 1 SAND YES\n", + "4 2 FINE SAND NO" ] }, "execution_count": null, @@ -3105,7 +3604,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 47/47 [00:00<00:00, 132.73it/s]\n" + "Processing: 100%|██████████| 47/47 [00:00<00:00, 99.39it/s] \n" ] }, { @@ -3214,14 +3713,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing: 0%| | 0/47 [00:00\n", " 0\n", " 0\n", - " n\n", + " N\n", " \n", " \n", " 1\n", @@ -4020,12 +4512,12 @@ " \n", " 2\n", " 2\n", - " N\n", + " F\n", " 
\n", " \n", " 3\n", " 3\n", - " F\n", + " n\n", " \n", " \n", "\n", @@ -4033,10 +4525,10 @@ ], "text/plain": [ " index value\n", - "0 0 n\n", + "0 0 N\n", "1 1 NaN\n", - "2 2 N\n", - "3 3 F" + "2 2 F\n", + "3 3 n" ] }, "execution_count": null, @@ -4258,13 +4750,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "['WKRIL2012003' 'WKRIL2012004' 'WKRIL2012005' ... 'WSSSM2021006'\n", - " 'WSSSM2021007' 'WSSSM2021008']\n", + "['WKRIL2012003' 'WKRIL2012004' 'WKRIL2012005' ... 'WSSSM2018006'\n", + " 'WSSSM2018007' 'WSSSM2018008']\n", " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of rows in dfs 20318 37347 14893\n", + "Number of rows in tfm.dfs 20318 37347 14893\n", "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "Number of rows in tfm.dfs + Number of dropped rows 20318 37347 14893 \n", "\n" ] } @@ -4325,52 +4817,52 @@ " \n", " \n", " METHOD\n", - " DESCRIPTION\n", " COUNTRY\n", + " DESCRIPTION\n", " \n", " \n", " \n", " \n", " 0\n", " BFFG01\n", - " Gammaspectrometric analysis with Germanium det...\n", " 6\n", + " Gammaspectrometric analysis with Germanium det...\n", " \n", " \n", " 1\n", " BFFG02\n", - " Sr-90, a) Y-90 extraction method dried ash and...\n", " 6\n", + " Sr-90, a) Y-90 extraction method dried ash and...\n", " \n", " \n", " 2\n", - " BFFG03\n", - " Pu238, Pu239241; Ashing and and drying the tra...\n", - " 6\n", + " CLOR02\n", + " 67\n", + " Radiochemical method Radiocaesium separation f...\n", " \n", " \n", " 3\n", - " BFFG04\n", - " Am-241 (not to in use any more)\n", - " 6\n", + " CLOR03\n", + " 67\n", + " Radiochem. meth.-134+137Cs was measured after ...\n", " \n", " \n", " 4\n", - " CLOR01\n", - " 137Cs and 40K activity concentrations are dete...\n", + " CLOR04\n", " 67\n", + " Radiochem. meth of Sr90. 
Precipation with oxal...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " METHOD DESCRIPTION COUNTRY\n", - "0 BFFG01 Gammaspectrometric analysis with Germanium det... 6\n", - "1 BFFG02 Sr-90, a) Y-90 extraction method dried ash and... 6\n", - "2 BFFG03 Pu238, Pu239241; Ashing and and drying the tra... 6\n", - "3 BFFG04 Am-241 (not to in use any more) 6\n", - "4 CLOR01 137Cs and 40K activity concentrations are dete... 67" + " METHOD COUNTRY DESCRIPTION\n", + "0 BFFG01 6 Gammaspectrometric analysis with Germanium det...\n", + "1 BFFG02 6 Sr-90, a) Y-90 extraction method dried ash and...\n", + "2 CLOR02 67 Radiochemical method Radiocaesium separation f...\n", + "3 CLOR03 67 Radiochem. meth.-134+137Cs was measured after ...\n", + "4 CLOR04 67 Radiochem. meth of Sr90. Precipation with oxal..." ] }, "execution_count": null, @@ -4432,10 +4924,10 @@ " 'For tritium liquid scintialtion counting, combined with electrolytic enrichment of analysed water samples, double distilled, before and after electrolysis in cells. 
Liquid Scintillation spectrometer LKB Wallac model 1410'\n", " 'Pretreatment drying (sediment, biota samples) and ashing (biota samples)or vaporization to 1000 ml (sea water samples), measured by gamma-spectrometry using HPGe detectors sediment, biota, sea water /Cs-137, Cs-134, K-40']\n", " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of rows in dfs 20318 37347 14893\n", + "Number of rows in tfm.dfs 20318 37347 14893\n", "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "Number of rows in tfm.dfs + Number of dropped rows 20318 37347 14893 \n", "\n" ] } @@ -4498,10 +4990,10 @@ "output_type": "stream", "text": [ " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of rows in dfs 20318 37347 14893\n", + "Number of rows in tfm.dfs 20318 37347 14893\n", "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "Number of rows in tfm.dfs + Number of dropped rows 20318 37347 14893 \n", "\n" ] } @@ -4642,10 +5134,10 @@ "output_type": "stream", "text": [ " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of rows in dfs 20318 37347 14893\n", + "Number of rows in tfm.dfs 20318 37347 14893\n", "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "Number of rows in tfm.dfs + Number of dropped rows 20318 37347 14893 \n", "\n", "0 18.453\n", "1 18.453\n", @@ -4793,10 +5285,10 @@ "output_type": "stream", "text": [ " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21208 39816 15827\n", - "Number of dropped rows 8 1 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 
\n", + "Number of rows in dfs 20318 37347 14893\n", + "Number of rows in tfm.dfs 20318 37346 14893\n", + "Number of dropped rows 0 1 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 20318 37347 14893 \n", "\n", " lat lon\n", "0 54.283333 12.316667\n", @@ -4805,13 +5297,13 @@ "3 54.283333 12.316667\n", "4 54.283333 12.316667\n", "... ... ...\n", - "15822 60.373333 18.395667\n", - "15823 60.373333 18.395667\n", - "15824 60.503333 18.366667\n", - "15825 60.503333 18.366667\n", - "15826 60.503333 18.366667\n", + "14888 54.583300 19.000000\n", + "14889 54.333300 15.500000\n", + "14890 54.333300 15.500000\n", + "14891 54.333300 15.500000\n", + "14892 54.363900 19.433300\n", "\n", - "[15827 rows x 2 columns]\n" + "[14893 rows x 2 columns]\n" ] } ], @@ -4858,10 +5350,10 @@ "output_type": "stream", "text": [ " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21208 39816 15827\n", - "Number of dropped rows 8 1 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "Number of rows in dfs 20318 37347 14893\n", + "Number of rows in tfm.dfs 20318 37346 14893\n", + "Number of dropped rows 0 1 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 20318 37347 14893 \n", "\n", " lat lon\n", "0 54.283333 12.316667\n", @@ -4870,13 +5362,13 @@ "3 54.283333 12.316667\n", "4 54.283333 12.316667\n", "... ... 
...\n", - "15822 60.373333 18.395667\n", - "15823 60.373333 18.395667\n", - "15824 60.503333 18.366667\n", - "15825 60.503333 18.366667\n", - "15826 60.503333 18.366667\n", + "14888 54.583300 19.000000\n", + "14889 54.333300 15.500000\n", + "14890 54.333300 15.500000\n", + "14891 54.333300 15.500000\n", + "14892 54.363900 19.433300\n", "\n", - "[15827 rows x 2 columns]\n" + "[14893 rows x 2 columns]\n" ] } ], @@ -4913,10 +5405,10 @@ "output_type": "stream", "text": [ " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21114 39531 15798\n", - "Number of dropped rows 102 286 29\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "Number of rows in dfs 20318 37347 14893\n", + "Number of rows in tfm.dfs 20242 37089 14873\n", + "Number of dropped rows 76 258 20\n", + "Number of rows in tfm.dfs + Number of dropped rows 20318 37347 14893 \n", "\n" ] } @@ -5023,9 +5515,9 @@ " NaN\n", " 10.0\n", " NaN\n", - " 26.0\n", + " 26\n", " RISO\n", - " 2001025.0\n", + " 2001025\n", " ...\n", " 10.500\n", " 10.833333\n", @@ -5034,8 +5526,8 @@ " 0.00\n", " NaN\n", " N\n", - " 5.0\n", - " 5.0\n", + " 5\n", + " 5\n", " NaN\n", " \n", " \n", @@ -5047,9 +5539,9 @@ " NaN\n", " NaN\n", " NaN\n", - " 93.0\n", + " 93\n", " LEPA\n", - " 2002001.0\n", + " 2002001\n", " ...\n", " 21.030\n", " 21.050000\n", @@ -5058,8 +5550,8 @@ " 3.77\n", " 14.40\n", " N\n", - " 4.0\n", - " 9.0\n", + " 4\n", + " 9\n", " NaN\n", " \n", " \n", @@ -5071,9 +5563,9 @@ " NaN\n", " NaN\n", " NaN\n", - " 93.0\n", + " 93\n", " LEPA\n", - " 2002004.0\n", + " 2002004\n", " ...\n", " 20.574\n", " 20.956667\n", @@ -5082,8 +5574,8 @@ " 6.57\n", " 11.95\n", " N\n", - " 4.0\n", - " 9.0\n", + " 4\n", + " 9\n", " NaN\n", " \n", " \n", @@ -5095,9 +5587,9 @@ " NaN\n", " NaN\n", " NaN\n", - " 93.0\n", + " 93\n", " LEPA\n", - " 2002007.0\n", + " 2002007\n", " ...\n", " 19.236\n", " 19.393333\n", @@ -5106,8 +5598,8 @@ " 7.00\n", " 9.19\n", " N\n", - " 
4.0\n", - " 9.0\n", + " 4\n", + " 9\n", " NaN\n", " \n", " \n", @@ -5119,9 +5611,9 @@ " NaN\n", " NaN\n", " NaN\n", - " 93.0\n", + " 93\n", " LEPA\n", - " 2002010.0\n", + " 2002010\n", " ...\n", " 20.205\n", " 20.341700\n", @@ -5130,8 +5622,8 @@ " 7.06\n", " 8.65\n", " N\n", - " 4.0\n", - " 9.0\n", + " 4\n", + " 9\n", " NaN\n", " \n", " \n", @@ -5147,26 +5639,26 @@ "14023 WLEPA2002003 CS134 LEPA02 < NaN NaN \n", "14026 WLEPA2002004 CS134 LEPA02 < NaN NaN \n", "\n", - " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... LONGITUDE (ddmmmm) \\\n", - "13439 NaN 26.0 RISO 2001025.0 ... 10.500 \n", - "14017 NaN 93.0 LEPA 2002001.0 ... 21.030 \n", - "14020 NaN 93.0 LEPA 2002004.0 ... 20.574 \n", - "14023 NaN 93.0 LEPA 2002007.0 ... 19.236 \n", - "14026 NaN 93.0 LEPA 2002010.0 ... 20.205 \n", + " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... LONGITUDE (ddmmmm) \\\n", + "13439 NaN 26 RISO 2001025 ... 10.500 \n", + "14017 NaN 93 LEPA 2002001 ... 21.030 \n", + "14020 NaN 93 LEPA 2002004 ... 20.574 \n", + "14023 NaN 93 LEPA 2002007 ... 19.236 \n", + "14026 NaN 93 LEPA 2002010 ... 
20.205 \n", "\n", " LONGITUDE (dddddd) TDEPTH SDEPTH SALIN TTEMP FILT MORS_SUBBASIN \\\n", - "13439 10.833333 22.0 20.0 0.00 NaN N 5.0 \n", - "14017 21.050000 16.0 0.0 3.77 14.40 N 4.0 \n", - "14020 20.956667 14.0 0.0 6.57 11.95 N 4.0 \n", - "14023 19.393333 73.0 0.0 7.00 9.19 N 4.0 \n", - "14026 20.341700 47.0 0.0 7.06 8.65 N 4.0 \n", + "13439 10.833333 22.0 20.0 0.00 NaN N 5 \n", + "14017 21.050000 16.0 0.0 3.77 14.40 N 4 \n", + "14020 20.956667 14.0 0.0 6.57 11.95 N 4 \n", + "14023 19.393333 73.0 0.0 7.00 9.19 N 4 \n", + "14026 20.341700 47.0 0.0 7.06 8.65 N 4 \n", "\n", " HELCOM_SUBBASIN DATE_OF_ENTRY_y \n", - "13439 5.0 NaN \n", - "14017 9.0 NaN \n", - "14020 9.0 NaN \n", - "14023 9.0 NaN \n", - "14026 9.0 NaN \n", + "13439 5 NaN \n", + "14017 9 NaN \n", + "14020 9 NaN \n", + "14023 9 NaN \n", + "14026 9 NaN \n", "\n", "[5 rows x 27 columns]" ] @@ -5405,21 +5897,14 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "seawater columns:\n", - "Index(['lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", - " 'smp_depth', 'tot_depth', '_sal', '_temp'],\n", - " dtype='object')\n", - "sediment columns:\n", - "Index(['lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", - " 'tot_depth', 'sed_type'],\n", - " dtype='object')\n", - "biota columns:\n", - "Index(['lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", - " 'smp_depth', 'species', 'body_part', 'bio_group'],\n", - " dtype='object')\n" + "ename": "NameError", + "evalue": "name 'RemapBiotaSpeciesCB' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[146], line 11\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#| eval: false\u001b[39;00m\n\u001b[1;32m 2\u001b[0m dfs \u001b[38;5;241m=\u001b[39m 
load_data(fname_in)\n\u001b[1;32m 3\u001b[0m tfm \u001b[38;5;241m=\u001b[39m Transformer(dfs, cbs\u001b[38;5;241m=\u001b[39m[AddSampleTypeIdColumnCB(),\n\u001b[1;32m 4\u001b[0m LowerStripNameCB(col_src\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNUCLIDE\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[1;32m 5\u001b[0m RemapNuclideNameCB(lut_nuclides),\n\u001b[1;32m 6\u001b[0m AddNuclideIdColumnCB(col_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNUCLIDE\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[1;32m 7\u001b[0m ParseTimeCB(),\n\u001b[1;32m 8\u001b[0m EncodeTimeCB(cfg()),\n\u001b[1;32m 9\u001b[0m SanitizeValue(coi_val), \n\u001b[1;32m 10\u001b[0m NormalizeUncCB(),\n\u001b[0;32m---> 11\u001b[0m \u001b[43mRemapBiotaSpeciesCB\u001b[49m(lut_biota),\n\u001b[1;32m 12\u001b[0m RemapBiotaBodyPartCB(lut_tissues),\n\u001b[1;32m 13\u001b[0m RemapBiogroupCB(lut_biogroup),\n\u001b[1;32m 14\u001b[0m RemapTaxonInformationCB(lut_taxon),\n\u001b[1;32m 15\u001b[0m RemapSedimentCB(lut_sediments),\n\u001b[1;32m 16\u001b[0m RemapUnitCB(),\n\u001b[1;32m 17\u001b[0m RemapDetectionLimitCB(coi_dl, lut_dl),\n\u001b[1;32m 18\u001b[0m RemapFiltCB(lut_filtered),\n\u001b[1;32m 19\u001b[0m AddSampleLabCodeCB(),\n\u001b[1;32m 20\u001b[0m AddMeasurementNoteCB(lut_method),\n\u001b[1;32m 21\u001b[0m RemapStationIdCB(),\n\u001b[1;32m 22\u001b[0m RemapSedSliceTopBottomCB(),\n\u001b[1;32m 23\u001b[0m LookupDryWetRatio(),\n\u001b[1;32m 24\u001b[0m ParseCoordinates(ddmm_to_dd),\n\u001b[1;32m 25\u001b[0m SanitizeLonLatCB(),\n\u001b[1;32m 26\u001b[0m CompareDfsAndTfmCB(dfs),\n\u001b[1;32m 27\u001b[0m SelectAndRenameColumnCB(get_renaming_rules, encoding_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnetcdf\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[1;32m 28\u001b[0m ])\n\u001b[1;32m 30\u001b[0m tfm()\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m grp \u001b[38;5;129;01min\u001b[39;00m 
tfm\u001b[38;5;241m.\u001b[39mdfs\u001b[38;5;241m.\u001b[39mkeys():\n", + "\u001b[0;31mNameError\u001b[0m: name 'RemapBiotaSpeciesCB' is not defined" ] } ], From b8b3214c00f50d60f4a5628a4f809a107c0193e2 Mon Sep 17 00:00:00 2001 From: Franck Albinet Date: Wed, 16 Oct 2024 16:27:39 +0200 Subject: [PATCH 9/9] Investigate unique id in HELCOM and Geotraces ds --- nbs/handlers/_geotraces.ipynb | 5725 +++++++++- .../_helcom-investigation-uniqueness.ipynb | 9795 ++++++++++------- 2 files changed, 11017 insertions(+), 4503 deletions(-) diff --git a/nbs/handlers/_geotraces.ipynb b/nbs/handlers/_geotraces.ipynb index bba4508..2f509e4 100644 --- a/nbs/handlers/_geotraces.ipynb +++ b/nbs/handlers/_geotraces.ipynb @@ -104,14 +104,6 @@ "- How to handle this unit? U_236_238_T_RATIO_BOTTLE [per 10^12]" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "78df7bee", - "metadata": {}, - "outputs": [], - "source": [] - }, { "attachments": {}, "cell_type": "markdown", @@ -126,7 +118,16 @@ "execution_count": null, "id": "0db45fee", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -501,96 +502,25 @@ { "cell_type": "code", "execution_count": null, - "id": "26d94fd4", + "id": "30755012", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
yyyy-mm-ddThh:mm:ss.sssLongitude [degrees_east]Latitude [degrees_north]DEPTH [m]BODC Bottle Number:INTEGER
02014-05-17T22:29:00349.2999938.43292957.11214048
12014-05-17T22:29:00349.2999938.43292957.21214039
22014-05-17T22:29:00349.2999938.43292957.21214027
32014-05-17T22:29:00349.2999938.43292957.21214018
42014-05-17T22:29:00349.2999938.43292957.21214036
\n", - "
" - ], "text/plain": [ - " yyyy-mm-ddThh:mm:ss.sss Longitude [degrees_east] Latitude [degrees_north] \\\n", - "0 2014-05-17T22:29:00 349.29999 38.4329 \n", - "1 2014-05-17T22:29:00 349.29999 38.4329 \n", - "2 2014-05-17T22:29:00 349.29999 38.4329 \n", - "3 2014-05-17T22:29:00 349.29999 38.4329 \n", - "4 2014-05-17T22:29:00 349.29999 38.4329 \n", - "\n", - " DEPTH [m] BODC Bottle Number:INTEGER \n", - "0 2957.1 1214048 \n", - "1 2957.2 1214039 \n", - "2 2957.2 1214027 \n", - "3 2957.2 1214018 \n", - "4 2957.2 1214036 " + "Index(['Cruise', 'Station:METAVAR:INDEXED_TEXT', 'Type',\n", + " 'yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]',\n", + " 'Latitude [degrees_north]', 'Bot. Depth [m]',\n", + " 'Operator's Cruise Name:METAVAR:INDEXED_TEXT',\n", + " 'Ship Name:METAVAR:INDEXED_TEXT', 'Period:METAVAR:INDEXED_TEXT',\n", + " 'Chief Scientist:METAVAR:INDEXED_TEXT',\n", + " 'GEOTRACES Scientist:METAVAR:INDEXED_TEXT',\n", + " 'Cruise Aliases:METAVAR:INDEXED_TEXT',\n", + " 'Cruise Information Link:METAVAR:INDEXED_TEXT',\n", + " 'BODC Cruise Number:METAVAR:INTEGER', 'CTDPRS_T_VALUE_SENSOR [dbar]',\n", + " 'QV:SEADATANET', 'DEPTH [m]', 'QV:SEADATANET.1',\n", + " 'Rosette Bottle Number:INTEGER'],\n", + " dtype='object')" ] }, "execution_count": null, @@ -599,24 +529,39 @@ } ], "source": [ - "# 'BODC Bottle Number:INTEGER'\n", - "\n", - "cols_unique = [\n", + "df.columns[:20]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc7549a0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "df duplicated keys: 423\n" + ] + } + ], + "source": [ + "unique_key = [\n", " 'yyyy-mm-ddThh:mm:ss.sss', \n", - " 'Longitude [degrees_east]',\n", - " 'Latitude [degrees_north]',\n", - " 'DEPTH [m]',\n", + " 'Longitude [degrees_east]', \n", + " 'Latitude [degrees_north]', \n", + " 'DEPTH [m]', \n", " 'BODC Bottle Number:INTEGER'\n", - " # 'Rosette Bottle Number:INTEGER',\n", "]\n", "\n", - "df[cols_unique].head()" + 
"print(f'df duplicated keys: ', df[unique_key].duplicated().sum()) " ] }, { "cell_type": "code", "execution_count": null, - "id": "df416ab1", + "id": "339e85e4", "metadata": {}, "outputs": [ { @@ -665,14 +610,14 @@ " \n", " \n", " \n", - " 9571\n", + " 9835\n", " GA03\n", - " Station 12\n", + " Station 5\n", " B\n", - " 2010-11-02T17:03:15\n", - " 335.5022\n", - " 17.4021\n", - " 3548.0\n", + " 2010-10-21T23:39:56\n", + " 338.09882\n", + " 31.0028\n", + " 5011.0\n", " KN199\n", " Knorr\n", " 15/10/2010 - 04/11/2010\n", @@ -680,107 +625,107 @@ " 9\n", " NaN\n", " 9\n", - " 0.07\n", + " 3.43\n", " 1\n", - " 1.48\n", + " 16.00\n", " 1\n", - " 21.700001\n", + " 2.710000\n", " 1\n", " 1\n", " \n", " \n", - " 9573\n", + " 9836\n", " GA03\n", - " Station 12\n", + " Station 5\n", " B\n", - " 2010-11-02T17:03:15\n", - " 335.5022\n", - " 17.4021\n", - " 3548.0\n", + " 2010-10-21T23:39:56\n", + " 338.09882\n", + " 31.0028\n", + " 5011.0\n", " KN199\n", " Knorr\n", " 15/10/2010 - 04/11/2010\n", " ...\n", - " 1\n", - " 0.10\n", - " 1\n", - " NaN\n", " 9\n", " NaN\n", " 9\n", - " 1.040000\n", + " 0.14\n", + " 1\n", + " 0.23\n", + " 1\n", + " 0.280000\n", " 1\n", " 1\n", " \n", " \n", - " 9574\n", + " 9837\n", " GA03\n", - " Station 12\n", + " Station 5\n", " B\n", - " 2010-11-02T17:03:15\n", - " 335.5022\n", - " 17.4021\n", - " 3548.0\n", + " 2010-10-21T23:39:56\n", + " 338.09882\n", + " 31.0028\n", + " 5011.0\n", " KN199\n", " Knorr\n", " 15/10/2010 - 04/11/2010\n", " ...\n", " 1\n", + " 0.23\n", + " 1\n", " NaN\n", " 9\n", " NaN\n", " 9\n", - " NaN\n", - " 9\n", - " 22.100000\n", + " 1.460000\n", " 1\n", " 1\n", " \n", " \n", - " 9576\n", + " 9838\n", " GA03\n", - " Station 12\n", + " Station 5\n", " B\n", - " 2010-11-02T17:03:15\n", - " 335.5022\n", - " 17.4021\n", - " 3548.0\n", + " 2010-10-21T23:39:56\n", + " 338.09882\n", + " 31.0028\n", + " 5011.0\n", " KN199\n", " Knorr\n", " 15/10/2010 - 04/11/2010\n", " ...\n", - " 1\n", - " 0.10\n", - " 1\n", - " NaN\n", " 9\n", " 
NaN\n", " 9\n", - " 2.500000\n", + " 32.50\n", + " 1\n", + " 209.00\n", + " 1\n", + " 12.000000\n", " 1\n", " 1\n", " \n", " \n", - " 9577\n", + " 9839\n", " GA03\n", - " Station 12\n", + " Station 5\n", " B\n", - " 2010-11-02T17:03:15\n", - " 335.5022\n", - " 17.4021\n", - " 3548.0\n", + " 2010-10-21T23:39:56\n", + " 338.09882\n", + " 31.0028\n", + " 5011.0\n", " KN199\n", " Knorr\n", " 15/10/2010 - 04/11/2010\n", " ...\n", " 1\n", - " 0.07\n", + " 0.24\n", " 1\n", " NaN\n", " 9\n", " NaN\n", " 9\n", - " 0.560000\n", + " 1.900000\n", " 1\n", " 1\n", " \n", @@ -814,7 +759,7 @@ " 36\n", " B\n", " 2013-12-17T04:09:34\n", - " 207.9930\n", + " 207.99300\n", " -10.5019\n", " 5162.0\n", " TN303\n", @@ -838,7 +783,7 @@ " 36\n", " B\n", " 2013-12-17T04:09:34\n", - " 207.9930\n", + " 207.99300\n", " -10.5019\n", " 5162.0\n", " TN303\n", @@ -862,7 +807,7 @@ " 36\n", " B\n", " 2013-12-17T04:09:34\n", - " 207.9930\n", + " 207.99300\n", " -10.5019\n", " 5162.0\n", " TN303\n", @@ -886,7 +831,7 @@ " 36\n", " B\n", " 2013-12-17T04:09:34\n", - " 207.9930\n", + " 207.99300\n", " -10.5019\n", " 5162.0\n", " TN303\n", @@ -910,7 +855,7 @@ " 36\n", " B\n", " 2013-12-17T04:09:34\n", - " 207.9930\n", + " 207.99300\n", " -10.5019\n", " 5162.0\n", " TN303\n", @@ -930,16 +875,16 @@ " \n", " \n", "\n", - "

423 rows × 1188 columns

\n", + "

446 rows × 1188 columns

\n", "" ], "text/plain": [ " Cruise Station:METAVAR:INDEXED_TEXT Type yyyy-mm-ddThh:mm:ss.sss \\\n", - "9571 GA03 Station 12 B 2010-11-02T17:03:15 \n", - "9573 GA03 Station 12 B 2010-11-02T17:03:15 \n", - "9574 GA03 Station 12 B 2010-11-02T17:03:15 \n", - "9576 GA03 Station 12 B 2010-11-02T17:03:15 \n", - "9577 GA03 Station 12 B 2010-11-02T17:03:15 \n", + "9835 GA03 Station 5 B 2010-10-21T23:39:56 \n", + "9836 GA03 Station 5 B 2010-10-21T23:39:56 \n", + "9837 GA03 Station 5 B 2010-10-21T23:39:56 \n", + "9838 GA03 Station 5 B 2010-10-21T23:39:56 \n", + "9839 GA03 Station 5 B 2010-10-21T23:39:56 \n", "... ... ... ... ... \n", "92211 GP16 36 B 2013-12-17T04:09:34 \n", "92212 GP16 36 B 2013-12-17T04:09:34 \n", @@ -948,24 +893,24 @@ "92215 GP16 36 B 2013-12-17T04:09:34 \n", "\n", " Longitude [degrees_east] Latitude [degrees_north] Bot. Depth [m] \\\n", - "9571 335.5022 17.4021 3548.0 \n", - "9573 335.5022 17.4021 3548.0 \n", - "9574 335.5022 17.4021 3548.0 \n", - "9576 335.5022 17.4021 3548.0 \n", - "9577 335.5022 17.4021 3548.0 \n", + "9835 338.09882 31.0028 5011.0 \n", + "9836 338.09882 31.0028 5011.0 \n", + "9837 338.09882 31.0028 5011.0 \n", + "9838 338.09882 31.0028 5011.0 \n", + "9839 338.09882 31.0028 5011.0 \n", "... ... ... ... \n", - "92211 207.9930 -10.5019 5162.0 \n", - "92212 207.9930 -10.5019 5162.0 \n", - "92213 207.9930 -10.5019 5162.0 \n", - "92214 207.9930 -10.5019 5162.0 \n", - "92215 207.9930 -10.5019 5162.0 \n", + "92211 207.99300 -10.5019 5162.0 \n", + "92212 207.99300 -10.5019 5162.0 \n", + "92213 207.99300 -10.5019 5162.0 \n", + "92214 207.99300 -10.5019 5162.0 \n", + "92215 207.99300 -10.5019 5162.0 \n", "\n", " Operator's Cruise Name:METAVAR:INDEXED_TEXT \\\n", - "9571 KN199 \n", - "9573 KN199 \n", - "9574 KN199 \n", - "9576 KN199 \n", - "9577 KN199 \n", + "9835 KN199 \n", + "9836 KN199 \n", + "9837 KN199 \n", + "9838 KN199 \n", + "9839 KN199 \n", "... ... 
\n", "92211 TN303 \n", "92212 TN303 \n", @@ -974,11 +919,11 @@ "92215 TN303 \n", "\n", " Ship Name:METAVAR:INDEXED_TEXT Period:METAVAR:INDEXED_TEXT ... \\\n", - "9571 Knorr 15/10/2010 - 04/11/2010 ... \n", - "9573 Knorr 15/10/2010 - 04/11/2010 ... \n", - "9574 Knorr 15/10/2010 - 04/11/2010 ... \n", - "9576 Knorr 15/10/2010 - 04/11/2010 ... \n", - "9577 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9835 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9836 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9837 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9838 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9839 Knorr 15/10/2010 - 04/11/2010 ... \n", "... ... ... ... \n", "92211 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", "92212 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", @@ -987,11 +932,11 @@ "92215 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", "\n", " QV:SEADATANET.581 Co_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.582 \\\n", - "9571 9 NaN 9 \n", - "9573 1 0.10 1 \n", - "9574 1 NaN 9 \n", - "9576 1 0.10 1 \n", - "9577 1 0.07 1 \n", + "9835 9 NaN 9 \n", + "9836 9 NaN 9 \n", + "9837 1 0.23 1 \n", + "9838 9 NaN 9 \n", + "9839 1 0.24 1 \n", "... ... ... ... \n", "92211 1 NaN 9 \n", "92212 1 1.06 1 \n", @@ -1000,11 +945,11 @@ "92215 1 NaN 9 \n", "\n", " Ni_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.583 \\\n", - "9571 0.07 1 \n", - "9573 NaN 9 \n", - "9574 NaN 9 \n", - "9576 NaN 9 \n", - "9577 NaN 9 \n", + "9835 3.43 1 \n", + "9836 0.14 1 \n", + "9837 NaN 9 \n", + "9838 32.50 1 \n", + "9839 NaN 9 \n", "... ... ... \n", "92211 3.72 1 \n", "92212 1.68 1 \n", @@ -1013,11 +958,11 @@ "92215 1.07 1 \n", "\n", " Cu_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.584 \\\n", - "9571 1.48 1 \n", - "9573 NaN 9 \n", - "9574 NaN 9 \n", - "9576 NaN 9 \n", - "9577 NaN 9 \n", + "9835 16.00 1 \n", + "9836 0.23 1 \n", + "9837 NaN 9 \n", + "9838 209.00 1 \n", + "9839 NaN 9 \n", "... ... ... 
\n", "92211 NaN 9 \n", "92212 NaN 9 \n", @@ -1026,11 +971,11 @@ "92215 NaN 9 \n", "\n", " Zn_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.585 QV:ODV:SAMPLE \n", - "9571 21.700001 1 1 \n", - "9573 1.040000 1 1 \n", - "9574 22.100000 1 1 \n", - "9576 2.500000 1 1 \n", - "9577 0.560000 1 1 \n", + "9835 2.710000 1 1 \n", + "9836 0.280000 1 1 \n", + "9837 1.460000 1 1 \n", + "9838 12.000000 1 1 \n", + "9839 1.900000 1 1 \n", "... ... ... ... \n", "92211 6.950000 1 1 \n", "92212 11.300000 1 1 \n", @@ -1038,7 +983,7 @@ "92214 12.800000 1 1 \n", "92215 22.900000 1 1 \n", "\n", - "[423 rows x 1188 columns]" + "[446 rows x 1188 columns]" ] }, "execution_count": null, @@ -1047,35 +992,4186 @@ } ], "source": [ - "df[df[cols_unique].duplicated()]" + "df[df[unique_key].duplicated(keep=False)].sort_values(by=unique_key)" ] }, { "cell_type": "code", "execution_count": null, - "id": "2dba1f37", + "id": "979dd34e", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "array([nan, 24.0, 23.0, ..., 3056.0, 3057.0, 0], dtype=object)" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['GEOTRACES Sample ID:INDEXED_TEXT'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2b35da85", - "metadata": {}, + "name": "stdout", + "output_type": "stream", + "text": [ + "Cruise\n", + "['GA01' 'GA02' 'GA03' 'GA04N' 'GA04S']\n", + "\n", + "Station:METAVAR:INDEXED_TEXT\n", + "[0 1 2 3 4]\n", + "\n", + "Type\n", + "['B']\n", + "\n", + "yyyy-mm-ddThh:mm:ss.sss\n", + "['2014-05-17T22:29:00' '2014-05-19T18:19:18' '2014-05-20T23:04:59'\n", + " '2014-05-21T04:20:56' '2014-05-21T16:32:53']\n", + "\n", + "Longitude [degrees_east]\n", + "[349.29999 349.96399 350.54053 350.35678 350.23337]\n", + "\n", + "Latitude [degrees_north]\n", + "[38.4329 40.3333 40.3331 40.3329 40.333 ]\n", + "\n", + "Bot. Depth [m]\n", + "[4854. 3578. 153. 439. 
804.]\n", + "\n", + "Operator's Cruise Name:METAVAR:INDEXED_TEXT\n", + "['GEOVIDE' 'JC057' 'PE319' 'PE321' 'PE358']\n", + "\n", + "Ship Name:METAVAR:INDEXED_TEXT\n", + "['Pourquoi pas?' 'RRS James Cook' 'Pelagia' 'Knorr' 'Angeles Alvarino']\n", + "\n", + "Period:METAVAR:INDEXED_TEXT\n", + "['15/05/2014 - 30/06/2014' '02/03/2011 - 06/04/2011'\n", + " '28/04/2010 - 26/05/2010' '11/06/2010 - 08/07/2010'\n", + " '29/07/2012 - 19/08/2012']\n", + "\n", + "Chief Scientist:METAVAR:INDEXED_TEXT\n", + "['Sarthou Geraldine' 'Rijkenberg Micha' 'Gerringa Loes' 'Jenkins William'\n", + " 'Boyle Edward']\n", + "\n", + "GEOTRACES Scientist:METAVAR:INDEXED_TEXT\n", + "['Sarthou Geraldine' 'de Baar Hein' 'Boyle Edward' 'Jenkins William'\n", + " 'Garcia-Orellana Jordi']\n", + "\n", + "Cruise Aliases:METAVAR:INDEXED_TEXT\n", + "[nan '64PE319' '64PE321' 'THOR' '64PE374']\n", + "\n", + "Cruise Information Link:METAVAR:INDEXED_TEXT\n", + "['https://www.bodc.ac.uk/resources/inventories/cruise_inventory/report/15251/'\n", + " 'https://www.bodc.ac.uk/resources/inventories/cruise_inventory/report/10584/'\n", + " 'https://www.bodc.ac.uk/resources/inventories/cruise_inventory/report/10001/'\n", + " 'https://www.bodc.ac.uk/resources/inventories/cruise_inventory/report/10002/'\n", + " 'https://www.bodc.ac.uk/resources/inventories/cruise_inventory/report/13372/']\n", + "\n", + "BODC Cruise Number:METAVAR:INTEGER\n", + "[15251 10584 10001 10002 13372]\n", + "\n", + "CTDPRS_T_VALUE_SENSOR [dbar]\n", + "[3.00085e+03 3.00096e+03 3.00106e+03 3.00116e+03 2.80000e+00]\n", + "\n", + "QV:SEADATANET\n", + "[0]\n", + "\n", + "DEPTH [m]\n", + "[2957.1 2957.2 2957.3 2957.4 3. ]\n", + "\n", + "QV:SEADATANET.1\n", + "[0]\n", + "\n", + "Rosette Bottle Number:INTEGER\n", + "[19. 16. 12. 9. 
15.]\n", + "\n", + "QV:SEADATANET.2\n", + "[0 9]\n", + "\n", + "GEOTRACES Sample ID:INDEXED_TEXT\n", + "[nan 24.0 23.0 22.0 21.0]\n", + "\n", + "QV:SEADATANET.3\n", + "[9 0]\n", + "\n", + "Bottle Flag:INDEXED_TEXT\n", + "['No problem reported (0)' 'Bottle misfire (3)' 'Questionable depth (8)'\n", + " 'Bottle leak (5)' 'No sample (7)']\n", + "\n", + "QV:SEADATANET.4\n", + "[0]\n", + "\n", + "Cast Identifier:INDEXED_TEXT\n", + "['GEOP_000_01' 'geoh_001_01' 'geoh_001_05' 'geoh_001_11' 'geoh_001_03']\n", + "\n", + "QV:SEADATANET.5\n", + "[0]\n", + "\n", + "Sampling Device:INDEXED_TEXT\n", + "['UCCTD' 'CTD' 'SAP' 'CPUMP' 'GPUMP']\n", + "\n", + "QV:SEADATANET.6\n", + "[0]\n", + "\n", + "BODC Bottle Number:INTEGER\n", + "[1214048 1214039 1214027 1214018 1214036]\n", + "\n", + "QV:SEADATANET.7\n", + "[0]\n", + "\n", + "BODC Event Number:INTEGER\n", + "[1898200 1896667 1896681 1896702 1896674]\n", + "\n", + "QV:SEADATANET.8\n", + "[0]\n", + "\n", + "Single-Cell ID:INDEXED_TEXT\n", + "[nan 'C253' 'C261' 'C266' 'C256']\n", + "\n", + "QV:SEADATANET.9\n", + "[9 0]\n", + "\n", + "NCBI_Metagenome_BioSample_Accession:INDEXED_TEXT\n", + "[nan 'SAMN07136678' 'SAMN07136679' 'SAMN07136680' 'SAMN07136681']\n", + "\n", + "QV:SEADATANET.10\n", + "[9 0]\n", + "\n", + "NCBI_Single-Cell-Genome_BioProject_Accession:INDEXED_TEXT\n", + "[nan 'PRJNA445865;PRJEB33281']\n", + "\n", + "QV:SEADATANET.11\n", + "[9 0]\n", + "\n", + "NCBI_16S-18S-rRNA-gene_BioSample_Accession:INDEXED_TEXT\n", + "[nan 'SAMN15928680' 'SAMN15928676' 'SAMN15928677' 'SAMN15928678']\n", + "\n", + "QV:SEADATANET.12\n", + "[9 0]\n", + "\n", + "EMBL_EBI_Metagenome_MGNIFY_Analysis_Accession:INDEXED_TEXT\n", + "[nan 'https://www.ebi.ac.uk/metagenomics/analyses/MGYA00463269'\n", + " 'https://www.ebi.ac.uk/metagenomics/analyses/MGYA00466185'\n", + " 'https://www.ebi.ac.uk/metagenomics/analyses/MGYA00466186'\n", + " 'https://www.ebi.ac.uk/metagenomics/analyses/MGYA00452433']\n", + "\n", + "QV:SEADATANET.13\n", + "[9 0]\n", + "\n", 
+ "CTDTMP_T_VALUE_SENSOR [deg C]\n", + "[2.8403 2.8405 2.8404 2.839 2.8406]\n", + "\n", + "QV:SEADATANET.14\n", + "[1 9 2 3]\n", + "\n", + "CTDSAL_D_CONC_SENSOR [pss-78]\n", + "[34.950802 34.950699 34.9506 35.146198 35.130901]\n", + "\n", + "QV:SEADATANET.15\n", + "[1 9 4 2 3]\n", + "\n", + "SALINITY_D_CONC_BOTTLE\n", + "[34.9519 34.952702 34.951302 34.960201 34.954399]\n", + "\n", + "QV:SEADATANET.16\n", + "[1 3 9 2 4]\n", + "\n", + "CFC-11_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 3.6151 3.6904 3.9974 4.3853]\n", + "\n", + "QV:SEADATANET.17\n", + "[9 1 3 6]\n", + "\n", + "CFC-12_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 2.1131 2.136 2.2993 2.5283]\n", + "\n", + "QV:SEADATANET.18\n", + "[9 1 3 6]\n", + "\n", + "CFC113_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.202 0.204 0.176 0.154]\n", + "\n", + "QV:SEADATANET.19\n", + "[9 1 6 3]\n", + "\n", + "SF6_D_CONC_BOTTLE [fmol/kg]\n", + "[ nan 1.313 1.362 1.4 1.467]\n", + "\n", + "QV:SEADATANET.20\n", + "[9 1 6 3]\n", + "\n", + "He_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 1.6641 1.6745 1.6909 1.6908]\n", + "\n", + "QV:SEADATANET.21\n", + "[9 1 3]\n", + "\n", + "Ne_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 6.579 6.777 6.908 6.924]\n", + "\n", + "QV:SEADATANET.22\n", + "[9 1 3]\n", + "\n", + "Ar_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 11.714 11.708 11.781 11.998]\n", + "\n", + "QV:SEADATANET.23\n", + "[9 1]\n", + "\n", + "Kr_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 2.651 2.6589 2.6564 2.6689]\n", + "\n", + "QV:SEADATANET.24\n", + "[9 1]\n", + "\n", + "Xe_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.36615 0.36581 0.36629 0.36818]\n", + "\n", + "QV:SEADATANET.25\n", + "[9 1]\n", + "\n", + "SALINITY_D_CONC_PUMP\n", + "[ nan 34.91 34.919998 35.02 34.98 ]\n", + "\n", + "QV:SEADATANET.26\n", + "[9 1 2 3]\n", + "\n", + "SALINITY_D_CONC_FISH\n", + "[ nan 35.419167 34.453396 35.397396 34.531933]\n", + "\n", + "QV:SEADATANET.27\n", + "[9 1 2 3]\n", + "\n", + "SALINITY_D_CONC_UWAY\n", + "[ nan 36.34 36.240002 36.950001 36.509998]\n", + "\n", + "QV:SEADATANET.28\n", + 
"[9 1 2 3]\n", + "\n", + "CFC-11_D_CONC_UWAY [pmol/kg]\n", + "[ nan 1.918561 1.552457 1.83768 1.76309 ]\n", + "\n", + "QV:SEADATANET.29\n", + "[9 1]\n", + "\n", + "CFC-12_D_CONC_UWAY [pmol/kg]\n", + "[ nan 1.163526 0.950414 1.115449 1.094962]\n", + "\n", + "QV:SEADATANET.30\n", + "[9 1]\n", + "\n", + "CFC113_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.14083 0.140568 0.16651 0.157061]\n", + "\n", + "QV:SEADATANET.31\n", + "[9 1 3]\n", + "\n", + "SF6_D_CONC_UWAY [fmol/kg]\n", + "[ nan 1.188855 1.12 1.19016 1.20843 ]\n", + "\n", + "QV:SEADATANET.32\n", + "[9 1 3]\n", + "\n", + "SALINITY_D_CONC_BOAT_PUMP\n", + "[ nan 31.404301 31.8871 32.318501 32.295399]\n", + "\n", + "QV:SEADATANET.33\n", + "[9 2]\n", + "\n", + "OXYGEN_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 285.600006 247.300003 166.199997 180.199997]\n", + "\n", + "QV:SEADATANET.34\n", + "[9 1 3 6 4]\n", + "\n", + "CTDOXY_D_CONC_SENSOR [umol/kg]\n", + "[226.399994 249.5 250.699997 251.600006 253.100006]\n", + "\n", + "QV:SEADATANET.35\n", + "[1 9 3 2 4]\n", + "\n", + "PHOSPHATE_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 1.012 1.014 1.032 2.367]\n", + "\n", + "QV:SEADATANET.36\n", + "[9 1 3 6 2]\n", + "\n", + "PHOSPHATE_LL_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.003707 0.01561 0.036098 0.066439]\n", + "\n", + "QV:SEADATANET.37\n", + "[9 1 3 6 2]\n", + "\n", + "SILICATE_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.88 0.85 0.54 1.06]\n", + "\n", + "QV:SEADATANET.38\n", + "[9 2 4 3 6]\n", + "\n", + "NITRATE_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.04 0.03 0.94 4.43]\n", + "\n", + "QV:SEADATANET.39\n", + "[9 2 3 6 1]\n", + "\n", + "NITRATE_LL_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.023512 0.101463 0.284098 1.146146]\n", + "\n", + "QV:SEADATANET.40\n", + "[9 1 3]\n", + "\n", + "NITRITE_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0. 
0.02 0.11 0.03]\n", + "\n", + "QV:SEADATANET.41\n", + "[9 6 2 3 1]\n", + "\n", + "NITRITE_LL_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.00322 0.025073 0.068976 0.106537]\n", + "\n", + "QV:SEADATANET.42\n", + "[9 1 3 6]\n", + "\n", + "NO2+NO3_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.025268 0.019512 0.787902 3.837268]\n", + "\n", + "QV:SEADATANET.43\n", + "[9 1 6 3 4]\n", + "\n", + "NO2+NO3_LL_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.019415 0.016878 0.017659 0.048683]\n", + "\n", + "QV:SEADATANET.44\n", + "[9 1]\n", + "\n", + "NH4_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 0.022439 0.013659 0.021463 0.029268]\n", + "\n", + "QV:SEADATANET.45\n", + "[9 1 6 3 4]\n", + "\n", + "TALK_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 2277.800049 2278. 2277.600098 2276.699951]\n", + "\n", + "QV:SEADATANET.46\n", + "[9 1 3]\n", + "\n", + "DIC_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 2073.100098 2075.100098 2093.399902 2123.300049]\n", + "\n", + "QV:SEADATANET.47\n", + "[9 1 3]\n", + "\n", + "PH_SWS_BOTTLE\n", + "[ nan 8.006 8.005 7.989 7.943]\n", + "\n", + "QV:SEADATANET.48\n", + "[9 1]\n", + "\n", + "PH_TOT_BOTTLE\n", + "[ nan 7.801 7.649 7.629 7.6405]\n", + "\n", + "QV:SEADATANET.49\n", + "[9 3 1]\n", + "\n", + "DOC_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 62.439026 53.658539 67.31707 40. ]\n", + "\n", + "QV:SEADATANET.50\n", + "[9 3 1]\n", + "\n", + "TDN_D_CONC_BOTTLE [umol/kg]\n", + "[ nan 3.512195 3.414634 2.926829 5.073171]\n", + "\n", + "QV:SEADATANET.51\n", + "[9 1 3]\n", + "\n", + "PHOSPHATE_D_CONC_PUMP [umol/kg]\n", + "[ nan 2.27 2.18 1.68 1.5 ]\n", + "\n", + "QV:SEADATANET.52\n", + "[9 1 2 3 6]\n", + "\n", + "SILICATE_D_CONC_PUMP [umol/kg]\n", + "[ nan 22.450001 23.52 23.030001 25.959999]\n", + "\n", + "QV:SEADATANET.53\n", + "[9 1 2 3 6]\n", + "\n", + "NITRATE_D_CONC_PUMP [umol/kg]\n", + "[ nan 34.41 31.969999 24.17 21.620001]\n", + "\n", + "QV:SEADATANET.54\n", + "[9 1 2 3 6]\n", + "\n", + "NITRITE_D_CONC_PUMP [umol/kg]\n", + "[ nan 0. 
0.01 0.16 0.11]\n", + "\n", + "QV:SEADATANET.55\n", + "[9 6 1 3 2]\n", + "\n", + "PHOSPHATE_D_CONC_FISH [umol/kg]\n", + "[ nan 0.007 0.334 0.009 0.217]\n", + "\n", + "QV:SEADATANET.56\n", + "[9 1 6 2 3]\n", + "\n", + "PHOSPHATE_LL_D_CONC_FISH [umol/kg]\n", + "[ nan 0.004683 0.014049 0.038049 0.021951]\n", + "\n", + "QV:SEADATANET.57\n", + "[9 1 6]\n", + "\n", + "SILICATE_D_CONC_FISH [umol/kg]\n", + "[ nan 0.48 0.24 0.42 0.71]\n", + "\n", + "QV:SEADATANET.58\n", + "[9 1 2 3 6]\n", + "\n", + "NITRATE_D_CONC_FISH [umol/kg]\n", + "[ nan 2.422 1.101 13.464 0.027]\n", + "\n", + "QV:SEADATANET.59\n", + "[9 1 2 3 6]\n", + "\n", + "NITRATE_LL_D_CONC_FISH [umol/kg]\n", + "[ nan 0.030439 0.02039 0.025659 0.034634]\n", + "\n", + "QV:SEADATANET.60\n", + "[9 1 3]\n", + "\n", + "NITRITE_D_CONC_FISH [umol/kg]\n", + "[ nan 0.098 0.002 0.015 0.069]\n", + "\n", + "QV:SEADATANET.61\n", + "[9 1 2 6 3]\n", + "\n", + "NITRITE_LL_D_CONC_FISH [umol/kg]\n", + "[ nan 0.001171 0.002439 0.000976 0.000683]\n", + "\n", + "QV:SEADATANET.62\n", + "[9 1]\n", + "\n", + "NO2+NO3_D_CONC_FISH [umol/kg]\n", + "[ nan 0.019512 0.058537 0.195122 1.053659]\n", + "\n", + "QV:SEADATANET.63\n", + "[9 6 1]\n", + "\n", + "NO2+NO3_LL_D_CONC_FISH [umol/kg]\n", + "[ nan 0.003707 0.004079 0.003059 0.004301]\n", + "\n", + "QV:SEADATANET.64\n", + "[9 1]\n", + "\n", + "DOC_D_CONC_FISH [umol/kg]\n", + "[ nan 73.268295 67.024391 74.731705 63.900002]\n", + "\n", + "QV:SEADATANET.65\n", + "[9 1]\n", + "\n", + "PHOSPHATE_D_CONC_UWAY [umol/kg]\n", + "[ nan 0.03]\n", + "\n", + "QV:SEADATANET.66\n", + "[9 1]\n", + "\n", + "SILICATE_D_CONC_UWAY [umol/kg]\n", + "[ nan 0.39]\n", + "\n", + "QV:SEADATANET.67\n", + "[9 1]\n", + "\n", + "NITRATE_D_CONC_UWAY [umol/kg]\n", + "[ nan 0.02]\n", + "\n", + "QV:SEADATANET.68\n", + "[9 1]\n", + "\n", + "NITRITE_D_CONC_UWAY [umol/kg]\n", + "[nan 0.]\n", + "\n", + "QV:SEADATANET.69\n", + "[9 6]\n", + "\n", + "DOC_D_CONC_UWAY [umol/kg]\n", + "[ nan 83.609756 76.975609 75.31707 67.902435]\n", + 
"\n", + "QV:SEADATANET.70\n", + "[9 1]\n", + "\n", + "NITRATE_D_CONC_BOAT_PUMP [umol/kg]\n", + "[ nan 0. 0.1 8.22 1.97]\n", + "\n", + "QV:SEADATANET.71\n", + "[9 2]\n", + "\n", + "NITRITE_D_CONC_BOAT_PUMP [umol/kg]\n", + "[ nan 0. 0.11 0.08]\n", + "\n", + "QV:SEADATANET.72\n", + "[9 2]\n", + "\n", + "DIC_13_12_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 1.15 0.99 0.78 0.68]\n", + "\n", + "QV:SEADATANET.73\n", + "[9 1 3]\n", + "\n", + "DIC_14_12_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 22.360001 20.950001 22.15 23.299999]\n", + "\n", + "QV:SEADATANET.74\n", + "[9 1]\n", + "\n", + "He_3_4_D_DELTA_BOTTLE [%]\n", + "[ nan -1.74 -1.32 -1.27 -1.13]\n", + "\n", + "QV:SEADATANET.75\n", + "[9 1]\n", + "\n", + "TRITIUM_D_CONC_BOTTLE [TU]\n", + "[ nan 0.733 0.696 0.718 0.709]\n", + "\n", + "QV:SEADATANET.76\n", + "[9 1 3 4]\n", + "\n", + "H2O_2_1_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan -1.78 -2.11 -2.16 -2.14]\n", + "\n", + "QV:SEADATANET.77\n", + "[9 1 3]\n", + "\n", + "H2O_18_16_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan -0.26 -0.35 -0.38 -0.44]\n", + "\n", + "QV:SEADATANET.78\n", + "[9 1 3]\n", + "\n", + "NITRATE_15_14_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 13.7 3.38 7.47 4.665]\n", + "\n", + "QV:SEADATANET.79\n", + "[9 1 2]\n", + "\n", + "NITRATE_18_16_D_DELTA_BOTTLE [per 10^3]\n", + "[nan 8.8 5.1 3.4 3.2]\n", + "\n", + "QV:SEADATANET.80\n", + "[9 1 2]\n", + "\n", + "SILICATE_30_28_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 1.85 1.9 1.96 1.83]\n", + "\n", + "QV:SEADATANET.81\n", + "[9 1 3 2]\n", + "\n", + "Al_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 24.507317 6.222439 8.497561 10.097561]\n", + "\n", + "QV:SEADATANET.82\n", + "[9 1 3 6 2]\n", + "\n", + "Ba_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 48.783298 44.298702 52.265789 67.36132 ]\n", + "\n", + "QV:SEADATANET.83\n", + "[9 1 3 2 4]\n", + "\n", + "Cd_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.05282 0.06973 0.15567 0.37431]\n", + "\n", + "QV:SEADATANET.84\n", + "[9 1 3 6 2]\n", + "\n", + "Co_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 35.191807 32.344975 
40.712002 55.530731]\n", + "\n", + "QV:SEADATANET.85\n", + "[9 1 3 2 6]\n", + "\n", + "Cr_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 3.27958 3.27481 3.26909 3.26816]\n", + "\n", + "QV:SEADATANET.86\n", + "[9 1]\n", + "\n", + "Cu_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 1.102439 1.073171 1.107317 1.160976]\n", + "\n", + "QV:SEADATANET.87\n", + "[9 1 3 2 4]\n", + "\n", + "Fe_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 1.04 0.5 0.72 0.82]\n", + "\n", + "QV:SEADATANET.88\n", + "[9 3 1 2 6]\n", + "\n", + "Fe_II_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.273171 0.126829 0.078049 0.058537]\n", + "\n", + "QV:SEADATANET.89\n", + "[9 1 6 2 3]\n", + "\n", + "Fe_S_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.292 0.198 0.359 0.421]\n", + "\n", + "QV:SEADATANET.90\n", + "[9 1 3]\n", + "\n", + "Ga_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 39.5 36.599998 42.900002 37.400002]\n", + "\n", + "QV:SEADATANET.91\n", + "[9 3 1 2 4]\n", + "\n", + "Hf_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.31 0.13 0.23 0.47]\n", + "\n", + "QV:SEADATANET.92\n", + "[9 1]\n", + "\n", + "Hg_0_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.06 0. 0.08 0.11]\n", + "\n", + "QV:SEADATANET.93\n", + "[9 2 3 5]\n", + "\n", + "Hg_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 2.868293 1.960976 1.609756 0.985366]\n", + "\n", + "QV:SEADATANET.94\n", + "[9 3 1 2 5]\n", + "\n", + "Hg_DM_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0. 
0.005 0.004 0.076]\n", + "\n", + "QV:SEADATANET.95\n", + "[9 3 2 5]\n", + "\n", + "Hg_Me_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.116 0.106 0.12 0.148]\n", + "\n", + "QV:SEADATANET.96\n", + "[9 1 4]\n", + "\n", + "Hg_MM_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.012 0.026 0.045 0.038]\n", + "\n", + "QV:SEADATANET.97\n", + "[9 2 3 4 1]\n", + "\n", + "Hg_T_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.401 0.7435 1.649 1.213 ]\n", + "\n", + "QV:SEADATANET.98\n", + "[9 1 4]\n", + "\n", + "I_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 20.760977 20.497561 21.736586 19.980488]\n", + "\n", + "QV:SEADATANET.99\n", + "[9 1 6]\n", + "\n", + "I_V_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 333.346344 341.453644 327.112183 282.21463 ]\n", + "\n", + "QV:SEADATANET.100\n", + "[9 1]\n", + "\n", + "Mn_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.087805 0.126829 0.165854 0.17561 ]\n", + "\n", + "QV:SEADATANET.101\n", + "[9 1 3 2 4]\n", + "\n", + "Mo_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 118.411842 118.910263 116.68261 117.252235]\n", + "\n", + "QV:SEADATANET.102\n", + "[9 1 3]\n", + "\n", + "Nb_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 2.82 3.3 3.26 2.99]\n", + "\n", + "QV:SEADATANET.103\n", + "[9 1 6]\n", + "\n", + "Ni_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 2.144948 2.450863 2.629792 2.739713]\n", + "\n", + "QV:SEADATANET.104\n", + "[9 1 3 2 4]\n", + "\n", + "Pb_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 45.299999 39. 71.300003 51. 
]\n", + "\n", + "QV:SEADATANET.105\n", + "[9 4 3 1 2]\n", + "\n", + "Pb_TD_CONC_BOTTLE [pmol/kg]\n", + "[ nan 17.550821 17.42984 20.257715 20.011883]\n", + "\n", + "QV:SEADATANET.106\n", + "[9 1]\n", + "\n", + "Ti_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 45.433681 49.948601 46.342361 65.870621]\n", + "\n", + "QV:SEADATANET.107\n", + "[9 1 3 6 4]\n", + "\n", + "U_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 11.61627 11.85819 11.862291 11.694177]\n", + "\n", + "QV:SEADATANET.108\n", + "[9 1 3]\n", + "\n", + "V_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 30.959999 31.75 31.959999 31.950001]\n", + "\n", + "QV:SEADATANET.109\n", + "[9 1 2 3]\n", + "\n", + "Zn_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.074824 0.35851 0.125532 0.195598]\n", + "\n", + "QV:SEADATANET.110\n", + "[9 1 3 6 4]\n", + "\n", + "Hg_Me_T_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.05197 0.03679 0.0472 0.04341]\n", + "\n", + "QV:SEADATANET.111\n", + "[9 1 4]\n", + "\n", + "Al_D_CONC_FISH [nmol/kg]\n", + "[ nan 2.6387 1.289014 2.513986 1.166395]\n", + "\n", + "QV:SEADATANET.112\n", + "[9 1 3 6]\n", + "\n", + "Ba_D_CONC_FISH [nmol/kg]\n", + "[ nan 37.900002 39.400002 39.200001 38.5 ]\n", + "\n", + "QV:SEADATANET.113\n", + "[9 1]\n", + "\n", + "Cd_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.03147 0.00021 0.08086 0.00187]\n", + "\n", + "QV:SEADATANET.114\n", + "[9 1]\n", + "\n", + "Co_D_CONC_FISH [pmol/kg]\n", + "[ nan 36.097561 35.121952 25.365854 16.585365]\n", + "\n", + "QV:SEADATANET.115\n", + "[9 1 6 3]\n", + "\n", + "Cu_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.917073 0.526829 0.57561 0.419512]\n", + "\n", + "QV:SEADATANET.116\n", + "[9 1]\n", + "\n", + "Fe_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.091487 0.010606 0.016417 0.028293]\n", + "\n", + "QV:SEADATANET.117\n", + "[9 1 5 2 3]\n", + "\n", + "Fe_II_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.058537 0.068293 0.243902 0.204878]\n", + "\n", + "QV:SEADATANET.118\n", + "[9 6 1]\n", + "\n", + "Fe_S_CONC_FISH [nmol/kg]\n", + "[ nan 0.171459 0.092 0.063 0.298 ]\n", + "\n", + "QV:SEADATANET.119\n", + "[9 
1 3]\n", + "\n", + "Ga_D_CONC_FISH [pmol/kg]\n", + "[ nan 34.700001 32.099998 35.700001 33.700001]\n", + "\n", + "QV:SEADATANET.120\n", + "[9 1 3]\n", + "\n", + "Hf_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.2 0.04 0.06 0.08]\n", + "\n", + "QV:SEADATANET.121\n", + "[9 1]\n", + "\n", + "Hg_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.084195 0.127122 0.357073 0.116488]\n", + "\n", + "QV:SEADATANET.122\n", + "[9 3 1 6]\n", + "\n", + "Mn_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.849437 0.485266 0.858047 0.506071]\n", + "\n", + "QV:SEADATANET.123\n", + "[9 1 3]\n", + "\n", + "Mo_D_CONC_FISH [nmol/kg]\n", + "[ nan 111.099998 110.199997 108.800003 109.900002]\n", + "\n", + "QV:SEADATANET.124\n", + "[9 1 3]\n", + "\n", + "Ni_D_CONC_FISH [nmol/kg]\n", + "[ nan 2.148 2.33 2.161 4.001539]\n", + "\n", + "QV:SEADATANET.125\n", + "[9 1 3]\n", + "\n", + "Pb_D_CONC_FISH [pmol/kg]\n", + "[ nan 43.299999 17. 20.200001 14. ]\n", + "\n", + "QV:SEADATANET.126\n", + "[9 1]\n", + "\n", + "Pb_TD_CONC_FISH [pmol/kg]\n", + "[ nan 23.029144 22.486393 20.799829 23.19504 ]\n", + "\n", + "QV:SEADATANET.127\n", + "[9 1]\n", + "\n", + "Ti_D_CONC_FISH [pmol/kg]\n", + "[ nan 41.784595 23.785187 39.185116]\n", + "\n", + "QV:SEADATANET.128\n", + "[9 1]\n", + "\n", + "V_D_CONC_FISH [nmol/kg]\n", + "[ nan 33.5 33.099998 33.200001 33.400002]\n", + "\n", + "QV:SEADATANET.129\n", + "[9 1 3]\n", + "\n", + "Zn_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.85 0.02 0.01 0.015]\n", + "\n", + "QV:SEADATANET.130\n", + "[9 1 6]\n", + "\n", + "Hf_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.2 0.38 0.18 0.16]\n", + "\n", + "QV:SEADATANET.131\n", + "[9 1]\n", + "\n", + "Al_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 0.878049 1.473171 1.326829 1.629268]\n", + "\n", + "QV:SEADATANET.132\n", + "[9 2]\n", + "\n", + "Ba_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 44.540001 11.87 43.75 54.419998]\n", + "\n", + "QV:SEADATANET.133\n", + "[9 1]\n", + "\n", + "Cd_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 0.43405 0.12185 0.3503 0.48855]\n", + "\n", + 
"QV:SEADATANET.134\n", + "[9 1]\n", + "\n", + "Co_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ nan 105.599998 382.600006 451.5 394.5 ]\n", + "\n", + "QV:SEADATANET.135\n", + "[9 1]\n", + "\n", + "Cu_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 1.78645 3.42765 2.68315 2.3425 ]\n", + "\n", + "QV:SEADATANET.136\n", + "[9 1]\n", + "\n", + "Fe_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 0.1286 0.3085 3.669 8.3961]\n", + "\n", + "QV:SEADATANET.137\n", + "[9 2]\n", + "\n", + "Fe_II_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 0.00505 0.01465 0. ]\n", + "\n", + "QV:SEADATANET.138\n", + "[9 6]\n", + "\n", + "Ga_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ nan 3.09 5.14 4.55 7.31]\n", + "\n", + "QV:SEADATANET.139\n", + "[9 1 2 3]\n", + "\n", + "Mn_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 1.85565 11.3677 29.95565 19.59445]\n", + "\n", + "QV:SEADATANET.140\n", + "[9 1]\n", + "\n", + "Ni_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 5.03085 4.10155 4.8299 5.45035]\n", + "\n", + "QV:SEADATANET.141\n", + "[9 1]\n", + "\n", + "Pb_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ nan 11.1 2.75 2.7 5.2 ]\n", + "\n", + "QV:SEADATANET.142\n", + "[9 1]\n", + "\n", + "V_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 31.51 26.34 29.67 31.91]\n", + "\n", + "QV:SEADATANET.143\n", + "[9 1]\n", + "\n", + "Zn_D_CONC_BOAT_PUMP [nmol/kg]\n", + "[ nan 0.5064 0.3742 0.516 1.1316]\n", + "\n", + "QV:SEADATANET.144\n", + "[9 2]\n", + "\n", + "Al_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 2.009756 1.053659 1.258537 1.960976]\n", + "\n", + "QV:SEADATANET.145\n", + "[9 2]\n", + "\n", + "Ba_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 62.75 63.990002 66.800003 68.099998]\n", + "\n", + "QV:SEADATANET.146\n", + "[9 1]\n", + "\n", + "Cd_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 0.31845 0.31255 0.29615 0.2863 ]\n", + "\n", + "QV:SEADATANET.147\n", + "[9 1 2]\n", + "\n", + "Co_D_CONC_SUBICE_PUMP [pmol/kg]\n", + "[ nan 308.5 268.600006 279.899994 253.100006]\n", + "\n", + "QV:SEADATANET.148\n", + "[9 1]\n", + "\n", + "Cu_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 14.3195 
6.8502 6.7681 7.4731]\n", + "\n", + "QV:SEADATANET.149\n", + "[9 1 2]\n", + "\n", + "Fe_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 5.9468 3.0739 3.4268 3.6126]\n", + "\n", + "QV:SEADATANET.150\n", + "[9 2]\n", + "\n", + "Ga_D_CONC_SUBICE_PUMP [pmol/kg]\n", + "[ nan 6.81 6.95 7.42 7.34]\n", + "\n", + "QV:SEADATANET.151\n", + "[9 1 2]\n", + "\n", + "Mn_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 5.2041 4.5234 3.8534 5.91495]\n", + "\n", + "QV:SEADATANET.152\n", + "[9 1 2]\n", + "\n", + "Ni_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 10.52805 8.257 8.30705 8.384 ]\n", + "\n", + "QV:SEADATANET.153\n", + "[9 1 2]\n", + "\n", + "Pb_D_CONC_SUBICE_PUMP [pmol/kg]\n", + "[ nan 10.1 1.6 1.2 1.4]\n", + "\n", + "QV:SEADATANET.154\n", + "[9 2 6]\n", + "\n", + "V_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 13.09 13.5 13.95 13.29]\n", + "\n", + "QV:SEADATANET.155\n", + "[9 1 2]\n", + "\n", + "Zn_D_CONC_SUBICE_PUMP [nmol/kg]\n", + "[ nan 19.6989 1.0666 0.9169 3.0386]\n", + "\n", + "QV:SEADATANET.156\n", + "[9 4 2]\n", + "\n", + "Ba_138_134_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 0.57 0.53 0.55 0.58]\n", + "\n", + "QV:SEADATANET.157\n", + "[9 1]\n", + "\n", + "Cd_114_110_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 1.084213 0.96041 0.551941 0.450833]\n", + "\n", + "QV:SEADATANET.158\n", + "[9 1 2]\n", + "\n", + "Cu_65_63_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 0.559117 0.54069 0.514726 0.542622]\n", + "\n", + "QV:SEADATANET.159\n", + "[9 1 2 3]\n", + "\n", + "Cr_53_52_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 1.02023 1.03613 1.04769 1.06464]\n", + "\n", + "QV:SEADATANET.160\n", + "[9 1]\n", + "\n", + "Fe_56_54_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 0.03 0.02 0.07 -0.26]\n", + "\n", + "QV:SEADATANET.161\n", + "[9 1 3 2 6]\n", + "\n", + "Ni_60_58_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan 1.63836 1.602155 1.524474 1.447863]\n", + "\n", + "QV:SEADATANET.162\n", + "[9 1]\n", + "\n", + "Zn_66_64_D_DELTA_BOTTLE [per 10^3]\n", + "[ nan -0.175932 -0.195314 0.104437 0.101797]\n", + "\n", + "QV:SEADATANET.163\n", 
+ "[9 1 2 3]\n", + "\n", + "Cd_114_110_D_DELTA_FISH [per 10^3]\n", + "[ nan 0.716152 0.750159 0.476101 0.664141]\n", + "\n", + "QV:SEADATANET.164\n", + "[9 1]\n", + "\n", + "Fe_56_54_D_DELTA_FISH [per 10^3]\n", + "[ nan -0.11 0.52 0.32 0.5 ]\n", + "\n", + "QV:SEADATANET.165\n", + "[9 1 2 6 5]\n", + "\n", + "Ni_60_58_D_DELTA_FISH [per 10^3]\n", + "[ nan 1.486192 1.608341 1.431721 1.54732 ]\n", + "\n", + "QV:SEADATANET.166\n", + "[9 1]\n", + "\n", + "Zn_66_64_D_DELTA_FISH [per 10^3]\n", + "[ nan 0.4 0.61 -0.12 0.18]\n", + "\n", + "QV:SEADATANET.167\n", + "[9 1]\n", + "\n", + "Ba_138_134_D_DELTA_BOAT_PUMP [per 10^3]\n", + "[ nan 0.443654 0.616907 0.491612 0.396128]\n", + "\n", + "QV:SEADATANET.168\n", + "[9 1]\n", + "\n", + "Ba_138_134_D_DELTA_SUBICE_PUMP [per 10^3]\n", + "[ nan 0.353344 0.348303]\n", + "\n", + "QV:SEADATANET.169\n", + "[9 1]\n", + "\n", + "Cs_137_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 1655.10083 1438.428833 1366.985596 936.56958 ]\n", + "\n", + "QV:SEADATANET.170\n", + "[9 1 3]\n", + "\n", + "I_129_D_CONC_BOTTLE [atoms/kg]\n", + "[ nan 4.99553638e+09 5.46595840e+09 4.74542438e+09\n", + " 5.02532045e+09]\n", + "\n", + "QV:SEADATANET.171\n", + "[9 2]\n", + "\n", + "Np_237_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 0.168566 0.169112 0.162868 0.161249]\n", + "\n", + "QV:SEADATANET.172\n", + "[9 1 3]\n", + "\n", + "Pu_239_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 6.238634 6.482439 6.656293 6.81278 ]\n", + "\n", + "QV:SEADATANET.173\n", + "[9 1]\n", + "\n", + "Pu_239_Pu_240_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 4285.794922 3697.453613 5849.404785 5792.272949]\n", + "\n", + "QV:SEADATANET.174\n", + "[9 1 4]\n", + "\n", + "Pu_240_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 4.024683 4.245073 4.100585 4.72439 ]\n", + "\n", + "QV:SEADATANET.175\n", + "[9 1]\n", + "\n", + "U_236_238_T_RATIO_BOTTLE [per 10^12]\n", + "[ nan 1529.460449 1479.776367 1505.207764 1489.274292]\n", + "\n", + "QV:SEADATANET.176\n", + "[9 2]\n", + "\n", + "U_236_D_CONC_BOTTLE [atoms/kg]\n", + "[ nan 5363510.5 
5209238. 4758632. 4707234. ]\n", + "\n", + "QV:SEADATANET.177\n", + "[9 1]\n", + "\n", + "U_236_T_CONC_BOTTLE [atoms/kg]\n", + "[ nan 9355403. 12158179. 15394789. 18341200.]\n", + "\n", + "QV:SEADATANET.178\n", + "[9 1 2]\n", + "\n", + "U_236_D_CONC_FISH [atoms/kg]\n", + "[ nan 6168131. 7304443. 5495804.5 5656699.5]\n", + "\n", + "QV:SEADATANET.179\n", + "[9 1]\n", + "\n", + "Cs_137_D_CONC_UWAY [uBq/kg]\n", + "[ nan 1966.119995 1712.800049 1561.294678 1780.053711]\n", + "\n", + "QV:SEADATANET.180\n", + "[9 1]\n", + "\n", + "Pu_239_Pu_240_D_CONC_UWAY [uBq/kg]\n", + "[ nan 5350.819336 5146.790039 5680.321777 4982.546387]\n", + "\n", + "QV:SEADATANET.181\n", + "[9 1]\n", + "\n", + "Pb_206_204_D_RATIO_BOTTLE\n", + "[ nan 18.51 18.57 18.91 18.389999]\n", + "\n", + "QV:SEADATANET.182\n", + "[9 4 1 3 0]\n", + "\n", + "Pb_206_204_TD_RATIO_BOTTLE\n", + "[ nan 18.475285 18.532042 18.523636 18.514265]\n", + "\n", + "QV:SEADATANET.183\n", + "[9 1]\n", + "\n", + "Pb_206_207_D_RATIO_BOTTLE\n", + "[ nan 1.1794 1.1826 1.2018 1.1702]\n", + "\n", + "QV:SEADATANET.184\n", + "[9 4 1 3 2]\n", + "\n", + "Pb_206_207_TD_RATIO_BOTTLE\n", + "[ nan 1.179389 1.182446 1.181789 1.181492]\n", + "\n", + "QV:SEADATANET.185\n", + "[9 1]\n", + "\n", + "Pb_208_207_D_RATIO_BOTTLE\n", + "[ nan 2.4485 2.4512 2.4571 2.4406]\n", + "\n", + "QV:SEADATANET.186\n", + "[9 4 1 3 0]\n", + "\n", + "Pb_208_207_TD_RATIO_BOTTLE\n", + "[ nan 2.465975 2.473708 2.473035 2.472412]\n", + "\n", + "QV:SEADATANET.187\n", + "[9 1]\n", + "\n", + "Pb_207_204_TD_RATIO_BOTTLE\n", + "[ nan 15.624 15.618 15.614 15.627]\n", + "\n", + "QV:SEADATANET.188\n", + "[9 1]\n", + "\n", + "Pb_208_204_TD_RATIO_BOTTLE\n", + "[ nan 38.016998 37.922001 37.941002 38.014999]\n", + "\n", + "QV:SEADATANET.189\n", + "[9 1]\n", + "\n", + "Pb_208_206_D_RATIO_BOTTLE\n", + "[ nan 2.0788 2.0764 2.0771 2.0738]\n", + "\n", + "QV:SEADATANET.190\n", + "[9 1 2 3]\n", + "\n", + "Pb_208_206_TD_RATIO_BOTTLE\n", + "[ nan 2.099135 2.105348 2.102701 2.432718]\n", + 
"\n", + "QV:SEADATANET.191\n", + "[9 1]\n", + "\n", + "Pb_206_204_D_RATIO_FISH\n", + "[ nan 18.58 18.379999 18.200001 18.6 ]\n", + "\n", + "QV:SEADATANET.192\n", + "[9 1]\n", + "\n", + "Pb_206_204_TD_RATIO_FISH\n", + "[ nan 18.42308 18.403351 18.324005 18.443455]\n", + "\n", + "QV:SEADATANET.193\n", + "[9 1]\n", + "\n", + "Pb_206_207_D_RATIO_FISH\n", + "[ nan 1.1727 1.1778 1.1776 1.1831]\n", + "\n", + "QV:SEADATANET.194\n", + "[9 1]\n", + "\n", + "Pb_206_207_TD_RATIO_FISH\n", + "[ nan 1.177033 1.175863 1.171213 1.178684]\n", + "\n", + "QV:SEADATANET.195\n", + "[9 1]\n", + "\n", + "Pb_208_207_D_RATIO_FISH\n", + "[ nan 2.4476 2.4578 2.4544 2.4549]\n", + "\n", + "QV:SEADATANET.196\n", + "[9 1]\n", + "\n", + "Pb_208_207_TD_RATIO_FISH\n", + "[ nan 2.455154 2.44959 2.450164 2.453791]\n", + "\n", + "QV:SEADATANET.197\n", + "[9 1]\n", + "\n", + "Pb_207_204_TD_RATIO_FISH\n", + "[ nan 15.634182 15.619825 15.654844 15.625303]\n", + "\n", + "QV:SEADATANET.198\n", + "[9 1]\n", + "\n", + "Pb_208_204_TD_RATIO_FISH\n", + "[ nan 38.277477 38.029686 38.144962 38.043201]\n", + "\n", + "QV:SEADATANET.199\n", + "[9 1]\n", + "\n", + "Pb_208_206_TD_RATIO_FISH\n", + "[ nan 2.113995 2.104017 2.101173 2.097486]\n", + "\n", + "QV:SEADATANET.200\n", + "[9 1]\n", + "\n", + "Pb_206_204_D_RATIO_BOAT_PUMP\n", + "[ nan 18.24 19.040001 18.379999 18.360001]\n", + "\n", + "QV:SEADATANET.201\n", + "[9 1]\n", + "\n", + "Pb_206_207_D_RATIO_BOAT_PUMP\n", + "[ nan 1.1724 1.2178 1.1964 1.1754]\n", + "\n", + "QV:SEADATANET.202\n", + "[9 1]\n", + "\n", + "Pb_208_207_D_RATIO_BOAT_PUMP\n", + "[ nan 2.4494 2.4808 2.4642 2.4447]\n", + "\n", + "QV:SEADATANET.203\n", + "[9 1]\n", + "\n", + "Pa_231_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 0.796751 0.780488 1.447151 2.211385]\n", + "\n", + "QV:SEADATANET.204\n", + "[9 1 3 2 '9']\n", + "\n", + "Pb_210_D_CONC_BOTTLE [mBq/kg]\n", + "[ nan 2.328333 1.363333 1.011667 1.741667]\n", + "\n", + "QV:SEADATANET.205\n", + "[9 1 3]\n", + "\n", + "Po_210_D_CONC_BOTTLE [mBq/kg]\n", + 
"[ nan 0.533333 2.033333 2.233333 2.183333]\n", + "\n", + "QV:SEADATANET.206\n", + "[9 1 3]\n", + "\n", + "Ra_224_D_CONC_BOTTLE [mBq/kg]\n", + "[ nan 0.237398 0.177236 0.352846 0.110569]\n", + "\n", + "QV:SEADATANET.207\n", + "[9 1]\n", + "\n", + "Ra_226_D_CONC_BOTTLE [mBq/kg]\n", + "[ nan 1.437218 1.297032 1.39294 1.299583]\n", + "\n", + "QV:SEADATANET.208\n", + "[9 1 3]\n", + "\n", + "Ra_228_T_CONC_BOTTLE [mBq/kg]\n", + "[ nan 0.112195 0.463415 0.443902 0.123577]\n", + "\n", + "QV:SEADATANET.209\n", + "[9 1]\n", + "\n", + "Ra_228_D_CONC_BOTTLE [mBq/kg]\n", + "[ nan 0.463653 0.35308 0.454824 0.100843]\n", + "\n", + "QV:SEADATANET.210\n", + "[9 1]\n", + "\n", + "Th_230_T_CONC_BOTTLE [uBq/kg]\n", + "[ nan 1.16 1.2 1.42 1.27]\n", + "\n", + "QV:SEADATANET.211\n", + "[9 1]\n", + "\n", + "Th_230_D_CONC_BOTTLE [uBq/kg]\n", + "[ nan 0.715444 2.764224 3.170732 4.65041 ]\n", + "\n", + "QV:SEADATANET.212\n", + "[9 1 3 2 4]\n", + "\n", + "Th_232_T_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.87393 1.262899 1.12791 0.50296 ]\n", + "\n", + "QV:SEADATANET.213\n", + "[9 1]\n", + "\n", + "Th_232_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.18998 0.932646 0.069088 0.155441]\n", + "\n", + "QV:SEADATANET.214\n", + "[9 1 3 2 5]\n", + "\n", + "Th_234_T_CONC_BOTTLE [mBq/kg]\n", + "[ nan 31.234859 28.776278 35.996681 40.823105]\n", + "\n", + "QV:SEADATANET.215\n", + "[9 1 3 2]\n", + "\n", + "Ac_227_D_CONC_PUMP [uBq/kg]\n", + "[nan 1.4 0. 0.8 0.7]\n", + "\n", + "QV:SEADATANET.216\n", + "[9 3 2]\n", + "\n", + "Be_7_T_CONC_PUMP [uBq/kg]\n", + "[ nan 1024.390259 936.585388 624.390259 253.658539]\n", + "\n", + "QV:SEADATANET.217\n", + "[9 1 6]\n", + "\n", + "Be_7_D_CONC_PUMP [uBq/kg]\n", + "[ nan 758.299988 770. 540.799988 3736.699951]\n", + "\n", + "QV:SEADATANET.218\n", + "[9 1 2 6]\n", + "\n", + "Ra_223_D_CONC_PUMP [mBq/kg]\n", + "[ nan 0.02439 0. 
0.006504 0.011382]\n", + "\n", + "QV:SEADATANET.219\n", + "[9 1 6 3 2]\n", + "\n", + "Ra_224_D_CONC_PUMP [mBq/kg]\n", + "[ nan 0.113821 0.099187 0.100813 0.982114]\n", + "\n", + "QV:SEADATANET.220\n", + "[9 1 3 6 2]\n", + "\n", + "Ra_226_D_CONC_PUMP [mBq/kg]\n", + "[ nan 1.430898 1.24878 1.382117 1.512195]\n", + "\n", + "QV:SEADATANET.221\n", + "[9 1 8 3 2]\n", + "\n", + "Ra_228_T_CONC_PUMP [mBq/kg]\n", + "[ nan 0.435772 0.130081 0.268293 0.201626]\n", + "\n", + "QV:SEADATANET.222\n", + "[9 1]\n", + "\n", + "Ra_228_D_CONC_PUMP [mBq/kg]\n", + "[ nan 0.338211 0.214634 0.172358 0.18374 ]\n", + "\n", + "QV:SEADATANET.223\n", + "[9 1 3 6 2]\n", + "\n", + "Th_228_D_CONC_PUMP [uBq/kg]\n", + "[ nan 78.048798 55.284565 87.804901 69.918709]\n", + "\n", + "QV:SEADATANET.224\n", + "[9 1 6 2 3]\n", + "\n", + "Th_234_T_CONC_PUMP [mBq/kg]\n", + "[ nan 20.9 18.700001 20. 17.4 ]\n", + "\n", + "QV:SEADATANET.225\n", + "[9 2 3 1]\n", + "\n", + "Pa_231_D_CONC_FISH [uBq/kg]\n", + "[ nan 2.831112 0.69904 0.681564 0.454376]\n", + "\n", + "QV:SEADATANET.226\n", + "[9 3 1]\n", + "\n", + "Pb_210_D_CONC_FISH [mBq/kg]\n", + "[ nan 1.170732 2.347967 1.365854 1.707317]\n", + "\n", + "QV:SEADATANET.227\n", + "[9 1]\n", + "\n", + "Po_210_D_CONC_FISH [mBq/kg]\n", + "[ nan 0.686179 0.746341 0.663415 0.414634]\n", + "\n", + "QV:SEADATANET.228\n", + "[9 1]\n", + "\n", + "Ra_226_D_CONC_FISH [mBq/kg]\n", + "[ nan 1.495935 1.349593 1.479675 1.398374]\n", + "\n", + "QV:SEADATANET.229\n", + "[9 1]\n", + "\n", + "Ra_228_T_CONC_FISH [mBq/kg]\n", + "[ nan 0.160976 0.523577 0.478049 0.247154]\n", + "\n", + "QV:SEADATANET.230\n", + "[9 1]\n", + "\n", + "Th_230_D_CONC_FISH [uBq/kg]\n", + "[ nan 3.160352 0.645745 1.002804 1.017998]\n", + "\n", + "QV:SEADATANET.231\n", + "[9 3 1]\n", + "\n", + "Th_232_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.384452 0.823081 0.396779 0.371307]\n", + "\n", + "QV:SEADATANET.232\n", + "[9 1]\n", + "\n", + "Th_234_T_CONC_FISH [mBq/kg]\n", + "[ nan 27.479675]\n", + "\n", + 
"QV:SEADATANET.233\n", + "[9 1]\n", + "\n", + "Pa_231_D_CONC_UWAY [uBq/kg]\n", + "[ nan 0.603631 0.755251 1.121143]\n", + "\n", + "QV:SEADATANET.234\n", + "[9 1]\n", + "\n", + "Po_210_D_CONC_UWAY [mBq/kg]\n", + "[ nan 0.416585 0.446829 0.393333 0.434309]\n", + "\n", + "QV:SEADATANET.235\n", + "[9 1]\n", + "\n", + "Pb_210_D_CONC_UWAY [mBq/kg]\n", + "[ nan 0.833008 0.730569 0.789268 0.931057]\n", + "\n", + "QV:SEADATANET.236\n", + "[9 1]\n", + "\n", + "Ra_224_D_CONC_UWAY [mBq/kg]\n", + "[ nan 0.091057 0.043902 0.20813 0.279675]\n", + "\n", + "QV:SEADATANET.237\n", + "[9 1 2]\n", + "\n", + "Ra_226_D_CONC_UWAY [mBq/kg]\n", + "[ nan 1.927839 1.767957 1.703508 1.596485]\n", + "\n", + "QV:SEADATANET.238\n", + "[9 1 2]\n", + "\n", + "Ra_228_D_CONC_UWAY [mBq/kg]\n", + "[ nan 0.316783 0.445928 0.276271 0.349226]\n", + "\n", + "QV:SEADATANET.239\n", + "[9 1 2]\n", + "\n", + "Th_228_D_CONC_UWAY [uBq/kg]\n", + "[ nan 382.500275 81.612221 207.02478 92.250481]\n", + "\n", + "QV:SEADATANET.240\n", + "[9 2]\n", + "\n", + "Th_230_D_CONC_UWAY [uBq/kg]\n", + "[ nan 1.128682 2.108295 2.05976 ]\n", + "\n", + "QV:SEADATANET.241\n", + "[9 1]\n", + "\n", + "Th_232_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.050588 0.045396 0.057749]\n", + "\n", + "QV:SEADATANET.242\n", + "[9 1]\n", + "\n", + "Th_234_T_CONC_UWAY [mBq/kg]\n", + "[ nan 28.40683 28.622438 29.650732 31.100489]\n", + "\n", + "QV:SEADATANET.243\n", + "[9 1 2]\n", + "\n", + "Pa_231_D_CONC_BOAT_PUMP [uBq/kg]\n", + "[ nan 0.2133]\n", + "\n", + "QV:SEADATANET.244\n", + "[9 1]\n", + "\n", + "Th_230_D_CONC_BOAT_PUMP [uBq/kg]\n", + "[ nan 1.461]\n", + "\n", + "QV:SEADATANET.245\n", + "[9 1]\n", + "\n", + "Th_232_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ nan 0.29776]\n", + "\n", + "QV:SEADATANET.246\n", + "[9 1]\n", + "\n", + "Pa_231_D_CONC_SUBICE_PUMP [uBq/kg]\n", + "[ nan 0.011 0.015 0.088 0.0417]\n", + "\n", + "QV:SEADATANET.247\n", + "[9 1]\n", + "\n", + "Th_230_D_CONC_SUBICE_PUMP [uBq/kg]\n", + "[ nan 1.135 1.192 1.306 1.319]\n", + "\n", + 
"QV:SEADATANET.248\n", + "[9 1]\n", + "\n", + "Th_232_D_CONC_SUBICE_PUMP [pmol/kg]\n", + "[ nan 1.0646 1.1204 1.238 1.368 ]\n", + "\n", + "QV:SEADATANET.249\n", + "[9 1]\n", + "\n", + "Th_234_T_CONC_SUBICE_PUMP [mBq/kg]\n", + "[ nan 30.700001 29.4 34.700001 27.9 ]\n", + "\n", + "QV:SEADATANET.250\n", + "[9 2]\n", + "\n", + "Hf_176_177_D_EPSILON_BOTTLE [per 10^4]\n", + "[ nan 0.41 -0.26 -0.25 -1.94]\n", + "\n", + "QV:SEADATANET.251\n", + "[9 1]\n", + "\n", + "Nd_143_144_D_EPSILON_BOTTLE [per 10^4]\n", + "[ nan -14.15286 -13.879521 -13.883313 -13.361316]\n", + "\n", + "QV:SEADATANET.252\n", + "[9 1 3 2 4]\n", + "\n", + "Hf_176_177_D_EPSILON_FISH [per 10^4]\n", + "[ nan -0.28 -0.27 0.3 0.31]\n", + "\n", + "QV:SEADATANET.253\n", + "[9 1]\n", + "\n", + "Nd_143_144_D_EPSILON_FISH [per 10^4]\n", + "[ nan -14.153927 -13.015296 -14.404537 -12.36 ]\n", + "\n", + "QV:SEADATANET.254\n", + "[9 1]\n", + "\n", + "Nd_143_144_D_EPSILON_UWAY [per 10^4]\n", + "[ nan -7.2 -4. -9. -8.4]\n", + "\n", + "QV:SEADATANET.255\n", + "[9 1]\n", + "\n", + "Y_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 114.879997 114.400002 112.089996 116.050003]\n", + "\n", + "QV:SEADATANET.256\n", + "[9 1 3 5 4]\n", + "\n", + "La_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 11.05 10.66 10.1 11.95]\n", + "\n", + "QV:SEADATANET.257\n", + "[9 1 3 2 4]\n", + "\n", + "Ce_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 14.333636 16.887186 12.796327 11.228178]\n", + "\n", + "QV:SEADATANET.258\n", + "[9 1 3 2]\n", + "\n", + "Pr_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 3.647191 3.897719 3.518812 3.357544]\n", + "\n", + "QV:SEADATANET.259\n", + "[9 1 3 2]\n", + "\n", + "Nd_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 17.587685 17.614305 17.587372 17.616825]\n", + "\n", + "QV:SEADATANET.260\n", + "[9 1 3 2 4]\n", + "\n", + "Sm_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 3.101465 3.444644 3.13025 2.889282]\n", + "\n", + "QV:SEADATANET.261\n", + "[9 1 3 2]\n", + "\n", + "Eu_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.934788 0.983839 0.934371 0.824531]\n", + "\n", + 
"QV:SEADATANET.262\n", + "[9 1 2 3]\n", + "\n", + "Gd_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 4.627342 4.938366 4.587365 4.290338]\n", + "\n", + "QV:SEADATANET.263\n", + "[9 1 3 2]\n", + "\n", + "Tb_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.765122 0.802596 0.772829 0.748298]\n", + "\n", + "QV:SEADATANET.264\n", + "[9 1 3 2]\n", + "\n", + "Dy_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 5.565683 5.875991 5.80047 5.379701]\n", + "\n", + "QV:SEADATANET.265\n", + "[9 1 3 2]\n", + "\n", + "Ho_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.445829 1.472963 1.449741 1.420948]\n", + "\n", + "QV:SEADATANET.266\n", + "[9 1 3 2]\n", + "\n", + "Er_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 4.508715 4.63153 4.620785 4.540745]\n", + "\n", + "QV:SEADATANET.267\n", + "[9 1 3 2]\n", + "\n", + "Tm_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.607171 0.630057 0.631402 0.621412]\n", + "\n", + "QV:SEADATANET.268\n", + "[9 1 2 3]\n", + "\n", + "Yb_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 3.79999 3.855863 3.872933 3.808411]\n", + "\n", + "QV:SEADATANET.269\n", + "[9 1 2 3]\n", + "\n", + "Lu_D_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.579948 0.613831 0.625943 0.592847]\n", + "\n", + "QV:SEADATANET.270\n", + "[9 1 2 3]\n", + "\n", + "Y_D_CONC_FISH [pmol/kg]\n", + "[ nan 123.199806 125.192375 124.840157 131.079376]\n", + "\n", + "QV:SEADATANET.271\n", + "[9 1]\n", + "\n", + "La_D_CONC_FISH [pmol/kg]\n", + "[ nan 22.059525 19.920168 20.493565 21.766468]\n", + "\n", + "QV:SEADATANET.272\n", + "[9 1 7]\n", + "\n", + "Ce_D_CONC_FISH [pmol/kg]\n", + "[ nan 26.446739 22.531816 25.272701 27.210564]\n", + "\n", + "QV:SEADATANET.273\n", + "[9 1 7]\n", + "\n", + "Pr_D_CONC_FISH [pmol/kg]\n", + "[ nan 5.208297 4.478406 4.820566 5.175287]\n", + "\n", + "QV:SEADATANET.274\n", + "[9 1 7]\n", + "\n", + "Nd_D_CONC_FISH [pmol/kg]\n", + "[ nan 17.279907 16.940922 17.141417 23.242701]\n", + "\n", + "QV:SEADATANET.275\n", + "[9 1 7 4]\n", + "\n", + "Sm_D_CONC_FISH [pmol/kg]\n", + "[ nan 4.621849 3.958291 4.408125 4.655192]\n", + "\n", + 
"QV:SEADATANET.276\n", + "[9 1 7]\n", + "\n", + "Eu_D_CONC_FISH [pmol/kg]\n", + "[ nan 1.179016 1.035409 1.127002 1.176829]\n", + "\n", + "QV:SEADATANET.277\n", + "[9 1 7]\n", + "\n", + "Gd_D_CONC_FISH [pmol/kg]\n", + "[ nan 5.729134 5.32105 5.606716 5.810727]\n", + "\n", + "QV:SEADATANET.278\n", + "[9 1 7]\n", + "\n", + "Tb_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.846657 0.804053 0.852279 0.879247]\n", + "\n", + "QV:SEADATANET.279\n", + "[9 1 7]\n", + "\n", + "Dy_D_CONC_FISH [pmol/kg]\n", + "[ nan 5.795363 5.628952 5.82169 6.007077]\n", + "\n", + "QV:SEADATANET.280\n", + "[9 1 7]\n", + "\n", + "Ho_D_CONC_FISH [pmol/kg]\n", + "[ nan 1.385252 1.404803 1.416585 1.423634]\n", + "\n", + "QV:SEADATANET.281\n", + "[9 1 7]\n", + "\n", + "Er_D_CONC_FISH [pmol/kg]\n", + "[ nan 4.349482 4.470709 4.437563 4.423068]\n", + "\n", + "QV:SEADATANET.282\n", + "[9 1 7]\n", + "\n", + "Tm_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.576506 0.594909 0.590402 0.593117]\n", + "\n", + "QV:SEADATANET.283\n", + "[9 1 7]\n", + "\n", + "Yb_D_CONC_FISH [pmol/kg]\n", + "[ nan 3.441847 3.676881 3.536759 3.556391]\n", + "\n", + "QV:SEADATANET.284\n", + "[9 1 7]\n", + "\n", + "Lu_D_CONC_FISH [pmol/kg]\n", + "[ nan 0.532376 0.575817 0.552478 0.554649]\n", + "\n", + "QV:SEADATANET.285\n", + "[9 1 7]\n", + "\n", + "La_D_CONC_UWAY [pmol/kg]\n", + "[ nan 26.299999 33.599998 32.599998 31. 
]\n", + "\n", + "QV:SEADATANET.286\n", + "[9 1 3]\n", + "\n", + "Ce_D_CONC_UWAY [pmol/kg]\n", + "[ nan 4.7 16.200001 2.92 3.43 ]\n", + "\n", + "QV:SEADATANET.287\n", + "[9 1 3]\n", + "\n", + "Pr_D_CONC_UWAY [pmol/kg]\n", + "[ nan 3.44 5.8 4.36 4.48]\n", + "\n", + "QV:SEADATANET.288\n", + "[9 1]\n", + "\n", + "Nd_D_CONC_UWAY [pmol/kg]\n", + "[ nan 14.3 22.6 18.5 18.299999]\n", + "\n", + "QV:SEADATANET.289\n", + "[9 1]\n", + "\n", + "Sm_D_CONC_UWAY [pmol/kg]\n", + "[ nan 2.74 4.98 3.48 3.73]\n", + "\n", + "QV:SEADATANET.290\n", + "[9 1]\n", + "\n", + "Eu_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.78 1.19 0.87 0.96]\n", + "\n", + "QV:SEADATANET.291\n", + "[9 1]\n", + "\n", + "Gd_D_CONC_UWAY [pmol/kg]\n", + "[ nan 3.9 6.49 5.05 5.65]\n", + "\n", + "QV:SEADATANET.292\n", + "[9 1]\n", + "\n", + "Tb_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.69 0.96 0.8 0.74]\n", + "\n", + "QV:SEADATANET.293\n", + "[9 1]\n", + "\n", + "Dy_D_CONC_UWAY [pmol/kg]\n", + "[ nan 5. 7.24 5.97 6.55]\n", + "\n", + "QV:SEADATANET.294\n", + "[9 1]\n", + "\n", + "Ho_D_CONC_UWAY [pmol/kg]\n", + "[ nan 1.47 1.99 1.77 1.88]\n", + "\n", + "QV:SEADATANET.295\n", + "[9 1]\n", + "\n", + "Er_D_CONC_UWAY [pmol/kg]\n", + "[ nan 5.04 6.6 6.03 6.11]\n", + "\n", + "QV:SEADATANET.296\n", + "[9 1]\n", + "\n", + "Tm_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.74 1.08 0.92 0.83]\n", + "\n", + "QV:SEADATANET.297\n", + "[9 1]\n", + "\n", + "Yb_D_CONC_UWAY [pmol/kg]\n", + "[ nan 4.41 7.22 6.06 5.98]\n", + "\n", + "QV:SEADATANET.298\n", + "[9 1]\n", + "\n", + "Lu_D_CONC_UWAY [pmol/kg]\n", + "[ nan 0.82 1.29 1.1 1.05]\n", + "\n", + "QV:SEADATANET.299\n", + "[9 1]\n", + "\n", + "La_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ nan 25.700001]\n", + "\n", + "QV:SEADATANET.300\n", + "[9 1]\n", + "\n", + "Ce_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 8.1]\n", + "\n", + "QV:SEADATANET.301\n", + "[9 1]\n", + "\n", + "Pr_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 4.9]\n", + "\n", + "QV:SEADATANET.302\n", + "[9 1]\n", + "\n", + "Nd_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[ 
nan 22.4]\n", + "\n", + "QV:SEADATANET.303\n", + "[9 1]\n", + "\n", + "Sm_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 5.7]\n", + "\n", + "QV:SEADATANET.304\n", + "[9 1]\n", + "\n", + "Eu_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 1.5]\n", + "\n", + "QV:SEADATANET.305\n", + "[9 1]\n", + "\n", + "Gd_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 8.6]\n", + "\n", + "QV:SEADATANET.306\n", + "[9 1]\n", + "\n", + "Tb_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 1.2]\n", + "\n", + "QV:SEADATANET.307\n", + "[9 1]\n", + "\n", + "Dy_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 9.9]\n", + "\n", + "QV:SEADATANET.308\n", + "[9 1]\n", + "\n", + "Ho_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 2.2]\n", + "\n", + "QV:SEADATANET.309\n", + "[9 1]\n", + "\n", + "Er_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 7.8]\n", + "\n", + "QV:SEADATANET.310\n", + "[9 1]\n", + "\n", + "Tm_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 1.]\n", + "\n", + "QV:SEADATANET.311\n", + "[9 1]\n", + "\n", + "Yb_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 6.8]\n", + "\n", + "QV:SEADATANET.312\n", + "[9 1]\n", + "\n", + "Lu_D_CONC_BOAT_PUMP [pmol/kg]\n", + "[nan 1.1]\n", + "\n", + "QV:SEADATANET.313\n", + "[9 1]\n", + "\n", + "Cu_Cu'_D_CONC_BOTTLE [fmol/kg]\n", + "[ nan 1.18 0.08 1.39 1.24]\n", + "\n", + "QV:SEADATANET.314\n", + "[9 1]\n", + "\n", + "L1Cu_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 3.248976 2.680488 3.141659 3.027707]\n", + "\n", + "QV:SEADATANET.315\n", + "[9 1 3 2 0]\n", + "\n", + "L1Cu_D_LogK_BOTTLE\n", + "[ nan 13.204 13.179 13.135 13.142]\n", + "\n", + "QV:SEADATANET.316\n", + "[9 1 3]\n", + "\n", + "LFe_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.452872 0.311019 0.500923 0.807415]\n", + "\n", + "QV:SEADATANET.317\n", + "[9 3 1]\n", + "\n", + "LFe_D_LogK_BOTTLE\n", + "[ nan 24.34 22.700001 22.514463 22.812017]\n", + "\n", + "QV:SEADATANET.318\n", + "[9 3 1]\n", + "\n", + "L1Fe_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.926829 1.356098 1.629268 0.8 ]\n", + "\n", + "QV:SEADATANET.319\n", + "[9 1 6 3]\n", + "\n", + "L1Fe_D_LogK_BOTTLE\n", + "[ nan 13.06 
12.6 12.61 12.29]\n", + "\n", + "QV:SEADATANET.320\n", + "[9 1 6 7 3]\n", + "\n", + "L2Fe_D_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.409756 0.780488 0. 0.429268]\n", + "\n", + "QV:SEADATANET.321\n", + "[9 1 6 3]\n", + "\n", + "L2Fe_D_LogK_BOTTLE\n", + "[ nan 12. 11.66 0. 11.85]\n", + "\n", + "QV:SEADATANET.322\n", + "[9 1 6 3]\n", + "\n", + "Cu_Cu'_D_CONC_FISH [fmol/kg]\n", + "[ nan 2.45 4.64 28.450001 222.330002]\n", + "\n", + "QV:SEADATANET.323\n", + "[9 1]\n", + "\n", + "L1Cu_D_CONC_FISH [nmol/kg]\n", + "[ nan 1.882927 2.282927 3.02439 3.082927]\n", + "\n", + "QV:SEADATANET.324\n", + "[9 1 3]\n", + "\n", + "L1Cu_D_LogK_FISH\n", + "[ nan 13.85 13.6 13.2 13.45]\n", + "\n", + "QV:SEADATANET.325\n", + "[9 1 3]\n", + "\n", + "L1Fe_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.858537 1.180488 0. 0.887805]\n", + "\n", + "QV:SEADATANET.326\n", + "[9 1 6]\n", + "\n", + "L1Fe_D_LogK_FISH\n", + "[ nan 12.46 12.51 0. 12.23]\n", + "\n", + "QV:SEADATANET.327\n", + "[9 1 6 7]\n", + "\n", + "L2Fe_D_CONC_FISH [nmol/kg]\n", + "[ nan 0.556098 1.170732 0.965854 0.692683]\n", + "\n", + "QV:SEADATANET.328\n", + "[9 1 6]\n", + "\n", + "L2Fe_D_LogK_FISH\n", + "[ nan 11.32 11.19 11.18 11.92]\n", + "\n", + "QV:SEADATANET.329\n", + "[9 1 6]\n", + "\n", + "Al_TP_CONC_BOTTLE [nmol/kg]\n", + "[ nan 7.472453 3.5 4.489644 2.28 ]\n", + "\n", + "QV:SEADATANET.330\n", + "[9 1 6 3]\n", + "\n", + "Al_TPL_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.845571 1.510146 0.347541 0.23839 ]\n", + "\n", + "QV:SEADATANET.331\n", + "[9 1 6 3]\n", + "\n", + "Al_TPR_CONC_BOTTLE [nmol/kg]\n", + "[ nan 19.539667 43.703709 8.205854 4.534156]\n", + "\n", + "QV:SEADATANET.332\n", + "[9 1 6]\n", + "\n", + "Ba_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 135.708298 44.02 201.399643 333.720001]\n", + "\n", + "QV:SEADATANET.333\n", + "[9 1 3 6]\n", + "\n", + "Ba_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 625.636475 67.758514 81.765289 83.128609]\n", + "\n", + "QV:SEADATANET.334\n", + "[9 1 3 6]\n", + "\n", + "Cd_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 
3.62 4.7 3.08 2.15]\n", + "\n", + "QV:SEADATANET.335\n", + "[9 1 6 3]\n", + "\n", + "Cd_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.453659 1.960976 0.965854 2.380488]\n", + "\n", + "QV:SEADATANET.336\n", + "[9 1 3 6]\n", + "\n", + "Cd_TPR_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.058537 0.04878 0.039024 0.068293]\n", + "\n", + "QV:SEADATANET.337\n", + "[9 1 6]\n", + "\n", + "Co_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 5.47 3.33 2.03 2.24]\n", + "\n", + "QV:SEADATANET.338\n", + "[9 1 6 3]\n", + "\n", + "Co_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 2.146341 2.526829 2.009756 1.687805]\n", + "\n", + "QV:SEADATANET.339\n", + "[9 1 3 6]\n", + "\n", + "Co_TPR_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.356098 2.380488 0.8 0.585366]\n", + "\n", + "QV:SEADATANET.340\n", + "[9 1]\n", + "\n", + "Cr_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 3.86 9.74 18.68 22.870001]\n", + "\n", + "QV:SEADATANET.341\n", + "[9 1 6 3]\n", + "\n", + "Cu_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 4.88 12.33 8.93 3.26]\n", + "\n", + "QV:SEADATANET.342\n", + "[9 1 3 6]\n", + "\n", + "Cu_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 16.388987 8.389944 6.808309 5.773022]\n", + "\n", + "QV:SEADATANET.343\n", + "[9 1 3 6]\n", + "\n", + "Fe_TP_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.77 0.37 1.11 2. 
]\n", + "\n", + "QV:SEADATANET.344\n", + "[9 1 6 3]\n", + "\n", + "Fe_TPL_CONC_BOTTLE [nmol/kg]\n", + "[ nan 1.017063 1.771346 0.402605 0.199941]\n", + "\n", + "QV:SEADATANET.345\n", + "[9 1 3 6]\n", + "\n", + "Fe_TPR_CONC_BOTTLE [nmol/kg]\n", + "[ nan 4.455463 9.519083 2.132 1.211259]\n", + "\n", + "QV:SEADATANET.346\n", + "[9 1]\n", + "\n", + "Mn_TP_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.04 0.02 0.06 0.17]\n", + "\n", + "QV:SEADATANET.347\n", + "[9 1 6 3]\n", + "\n", + "Mn_TPL_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.304741 0.51999 0.060361 0.050039]\n", + "\n", + "QV:SEADATANET.348\n", + "[9 1 3 6]\n", + "\n", + "Mn_TPR_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.034166 0.065141 0.012185 0.007015]\n", + "\n", + "QV:SEADATANET.349\n", + "[9 1 6]\n", + "\n", + "Mo_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.91 1.56 0.33 0.22]\n", + "\n", + "QV:SEADATANET.350\n", + "[9 1 3 6]\n", + "\n", + "Mo_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 45.811729 4.875809 3.80094 2.154378]\n", + "\n", + "QV:SEADATANET.351\n", + "[9 3 1 6]\n", + "\n", + "Ni_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0. 
2.511323 33.178623 38.417618]\n", + "\n", + "QV:SEADATANET.352\n", + "[9 6 1]\n", + "\n", + "Ni_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 25.03315 16.92819 13.370416 9.952879]\n", + "\n", + "QV:SEADATANET.353\n", + "[9 1 6]\n", + "\n", + "P_TP_CONC_BOTTLE [nmol/kg]\n", + "[ nan 38.150002 32.130001 19.440001 14.98 ]\n", + "\n", + "QV:SEADATANET.354\n", + "[9 1 6 3]\n", + "\n", + "P_TPL_CONC_BOTTLE [nmol/kg]\n", + "[ nan 17.380283 20.647444 11.461259 10.311629]\n", + "\n", + "QV:SEADATANET.355\n", + "[9 1 3 6]\n", + "\n", + "P_TPR_CONC_BOTTLE [nmol/kg]\n", + "[ nan 3.621961 5.256117 1.741298 2.019054]\n", + "\n", + "QV:SEADATANET.356\n", + "[9 1 6]\n", + "\n", + "Pb_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.12 0.48 0.8 1.83]\n", + "\n", + "QV:SEADATANET.357\n", + "[9 1 6 3]\n", + "\n", + "Pb_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.937513 0.490884 0.405313 0.664039]\n", + "\n", + "QV:SEADATANET.358\n", + "[9 1 6]\n", + "\n", + "Sc_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.414146 0.433854 0.498537 0.57561 ]\n", + "\n", + "QV:SEADATANET.359\n", + "[9 1 3]\n", + "\n", + "Th_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.01 0.05 0.04 0.03]\n", + "\n", + "QV:SEADATANET.360\n", + "[9 6 2 1 3]\n", + "\n", + "Th_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0. 
0.009463 0.012878 0.017561]\n", + "\n", + "QV:SEADATANET.361\n", + "[9 6 1]\n", + "\n", + "Ti_TP_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.06 0.01 0.08 0.15]\n", + "\n", + "QV:SEADATANET.362\n", + "[9 1 6 3]\n", + "\n", + "Ti_TPL_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.001639 0.002976 0.000624 0.000332]\n", + "\n", + "QV:SEADATANET.363\n", + "[9 1 3 6]\n", + "\n", + "Ti_TPR_CONC_BOTTLE [nmol/kg]\n", + "[ nan 0.434312 0.86641 0.192166 0.128127]\n", + "\n", + "QV:SEADATANET.364\n", + "[9 1 6]\n", + "\n", + "V_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 7.47 11.41 11.5 13.02]\n", + "\n", + "QV:SEADATANET.365\n", + "[9 1 3 6]\n", + "\n", + "V_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 72.975609 35.707317 20.780487 216.585373]\n", + "\n", + "QV:SEADATANET.366\n", + "[9 1 3 6]\n", + "\n", + "Zn_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 11.699382 14.316231 37.984665 73.345901]\n", + "\n", + "QV:SEADATANET.367\n", + "[9 1 3 6]\n", + "\n", + "Zn_TPL_CONC_BOTTLE [pmol/kg]\n", + "[ nan 30.335958 52.920467 44.784649 43.065662]\n", + "\n", + "QV:SEADATANET.368\n", + "[9 1 3 6]\n", + "\n", + "Ag_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.049659 0.011415 0.01639 0.007346]\n", + "\n", + "QV:SEADATANET.369\n", + "[9 2 1]\n", + "\n", + "Ag_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.985366 0.274146 0.236098 0.17561 ]\n", + "\n", + "QV:SEADATANET.370\n", + "[9 2 1 3 4]\n", + "\n", + "Al_TP_CONC_PUMP [nmol/kg]\n", + "[ nan 4.682927 6.634146 4.292683 4.390244]\n", + "\n", + "QV:SEADATANET.371\n", + "[9 1]\n", + "\n", + "Al_LPT_CONC_PUMP [nmol/kg]\n", + "[ nan 4.92143 2.053793 2.657613 7.085427]\n", + "\n", + "QV:SEADATANET.372\n", + "[9 1 3 6 4]\n", + "\n", + "Al_SPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.737598 2.816617 0.362548 25.65716 ]\n", + "\n", + "QV:SEADATANET.373\n", + "[9 1 2 3 6]\n", + "\n", + "Al_SPL_CONC_PUMP [nmol/kg]\n", + "[ nan 0.358899 0.134947 0.29612 0.251254]\n", + "\n", + "QV:SEADATANET.374\n", + "[9 2]\n", + "\n", + "Ba_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 13.294371 18.433971 20.170162 33.29546 
]\n", + "\n", + "QV:SEADATANET.375\n", + "[9 1 3 4 2]\n", + "\n", + "Ba_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.435481 334.05838 8.401126 321.196442]\n", + "\n", + "QV:SEADATANET.376\n", + "[9 1 2 3 6]\n", + "\n", + "Ba_SPL_CONC_PUMP [pmol/kg]\n", + "[ nan 92.825371 204.898071 290.218628 189.205826]\n", + "\n", + "QV:SEADATANET.377\n", + "[9 2]\n", + "\n", + "Cd_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 13.756098 5.717073 3.404878 3.736585]\n", + "\n", + "QV:SEADATANET.378\n", + "[9 1]\n", + "\n", + "Cd_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.975389 0.039125 0.037386 0.012737]\n", + "\n", + "QV:SEADATANET.379\n", + "[9 1 3 4 6]\n", + "\n", + "Cd_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.271325 3.442991 0.028339 0.63321 ]\n", + "\n", + "QV:SEADATANET.380\n", + "[9 1 2 3 4]\n", + "\n", + "Co_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.993266 0.164983 0.20018 0.488921]\n", + "\n", + "QV:SEADATANET.381\n", + "[9 1 3 4 2]\n", + "\n", + "Co_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.653975 2.075954 0.121729 1.814991]\n", + "\n", + "QV:SEADATANET.382\n", + "[9 1 2 3 4]\n", + "\n", + "Cu_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.34821 2.004086 1.426711 2.59067 ]\n", + "\n", + "QV:SEADATANET.383\n", + "[9 1 3 2 4]\n", + "\n", + "Cu_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 6.878622 26.956652 0.89219 19.770351]\n", + "\n", + "QV:SEADATANET.384\n", + "[9 1 2 3 4]\n", + "\n", + "Cu_SPL_CONC_PUMP [pmol/kg]\n", + "[ nan 8.997337 4.554814 8.533956 6.668941]\n", + "\n", + "QV:SEADATANET.385\n", + "[9 2 3]\n", + "\n", + "Fe_TP_CONC_PUMP [nmol/kg]\n", + "[ nan 11.804878 5.268293 3.317073 1.268293]\n", + "\n", + "QV:SEADATANET.386\n", + "[9 1]\n", + "\n", + "Fe_LPT_CONC_PUMP [nmol/kg]\n", + "[ nan 1.050323 0.344769 0.625939 1.439502]\n", + "\n", + "QV:SEADATANET.387\n", + "[9 1 3 6 4]\n", + "\n", + "Fe_SPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.255904 0.79256 0.160249 5.349295]\n", + "\n", + "QV:SEADATANET.388\n", + "[9 1 2 3 4]\n", + "\n", + "Fe_SPL_CONC_PUMP [nmol/kg]\n", + "[ nan 0.342561 0.114592 0.258232 
0.32794 ]\n", + "\n", + "QV:SEADATANET.389\n", + "[9 2]\n", + "\n", + "Ga_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 0.712051 0.860803 0.944894 0.887285]\n", + "\n", + "QV:SEADATANET.390\n", + "[9 1 6]\n", + "\n", + "Hg_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.033 0.017 0.024 0.042]\n", + "\n", + "QV:SEADATANET.391\n", + "[9 2 3]\n", + "\n", + "Hg_MM_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.00141 0.00088 0.00038 0.00178]\n", + "\n", + "QV:SEADATANET.392\n", + "[9 2 5 3]\n", + "\n", + "Mn_TP_CONC_PUMP [nmol/kg]\n", + "[ nan 0.188293 0.118049 0.078049 0.059512]\n", + "\n", + "QV:SEADATANET.393\n", + "[9 1]\n", + "\n", + "Mn_LPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.014976 0.004817 0.010099 0.02417 ]\n", + "\n", + "QV:SEADATANET.394\n", + "[9 1 3 4 2]\n", + "\n", + "Mn_SPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.006083 0.030332 0.005697 0.165448]\n", + "\n", + "QV:SEADATANET.395\n", + "[9 1 2 3 6]\n", + "\n", + "Mn_SPL_CONC_PUMP [nmol/kg]\n", + "[ nan 0.020698 0.054247 0.130304 0.048691]\n", + "\n", + "QV:SEADATANET.396\n", + "[9 2]\n", + "\n", + "Mo_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 3.259011 11.614256 1.130145 2.436231]\n", + "\n", + "QV:SEADATANET.397\n", + "[9 1 6 3]\n", + "\n", + "Mo_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.179421 0.536872 0.182223 0.113805]\n", + "\n", + "QV:SEADATANET.398\n", + "[9 1 3]\n", + "\n", + "Mo_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.823194 0.253418 1.101504 2.246607]\n", + "\n", + "QV:SEADATANET.399\n", + "[9 1 2 3]\n", + "\n", + "Ni_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 39.466263 136.473419 16.966465 16.998934]\n", + "\n", + "QV:SEADATANET.400\n", + "[9 3 1 6]\n", + "\n", + "Ni_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 1.411194 2.831841 2.533521 3.43655 ]\n", + "\n", + "QV:SEADATANET.401\n", + "[9 1 3 2]\n", + "\n", + "Ni_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.485622 7.531662 11.42548 11.258619]\n", + "\n", + "QV:SEADATANET.402\n", + "[9 1 2 3]\n", + "\n", + "Ni_SPL_CONC_PUMP [pmol/kg]\n", + "[ nan 39.856834 6.581006 11.0156 3.719055]\n", + "\n", + 
"QV:SEADATANET.403\n", + "[9 2]\n", + "\n", + "P_TP_CONC_PUMP [nmol/kg]\n", + "[ nan 35.465439 24.159981 7.821284 4.762243]\n", + "\n", + "QV:SEADATANET.404\n", + "[9 1]\n", + "\n", + "P_LPT_CONC_PUMP [nmol/kg]\n", + "[ nan 4.30732 0.595033 0.434555 0.282651]\n", + "\n", + "QV:SEADATANET.405\n", + "[9 1 3 2 4]\n", + "\n", + "P_SPT_CONC_PUMP [nmol/kg]\n", + "[ nan 2.650167 17.605572 0.099575 2.681759]\n", + "\n", + "QV:SEADATANET.406\n", + "[9 1 2 3 4]\n", + "\n", + "P_SPL_CONC_PUMP [nmol/kg]\n", + "[ nan 7.477912 1.229058 0.857984 0.173157]\n", + "\n", + "QV:SEADATANET.407\n", + "[9 2]\n", + "\n", + "Pb_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 13.876318 2.93466 -0.048195 -1.068311]\n", + "\n", + "QV:SEADATANET.408\n", + "[9 1 6 3]\n", + "\n", + "Pb_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.629705 0.160938 0.235971 0.469684]\n", + "\n", + "QV:SEADATANET.409\n", + "[9 1 3 4 2]\n", + "\n", + "Pb_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.359329 0.805823 0.101191 2.653187]\n", + "\n", + "QV:SEADATANET.410\n", + "[9 1 2 3 6]\n", + "\n", + "Th_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 0.010341 0.009102 0.003893 0.003454]\n", + "\n", + "QV:SEADATANET.411\n", + "[9 6 1 3]\n", + "\n", + "Th_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.301463 0.044098 0.020976 0.021268]\n", + "\n", + "QV:SEADATANET.412\n", + "[9 2 1]\n", + "\n", + "Th_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.587317 0.027122 0.022146 0.059317]\n", + "\n", + "QV:SEADATANET.413\n", + "[9 2 1 3]\n", + "\n", + "Ti_LPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.074035 0.096704 0.045821 0.103233]\n", + "\n", + "QV:SEADATANET.414\n", + "[9 1 3 6 4]\n", + "\n", + "Ti_SPT_CONC_PUMP [nmol/kg]\n", + "[ nan 0.071292 0.004644 0.010673 0.36272 ]\n", + "\n", + "QV:SEADATANET.415\n", + "[9 1 2 3 6]\n", + "\n", + "U_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 1.133128 0.911309 0.6291 0.549211]\n", + "\n", + "QV:SEADATANET.416\n", + "[9 1]\n", + "\n", + "V_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 11.621088 5.792983 4.533802 4.30882 ]\n", + "\n", + "QV:SEADATANET.417\n", + 
"[9 1]\n", + "\n", + "V_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.602153 1.478805 2.053688 4.783217]\n", + "\n", + "QV:SEADATANET.418\n", + "[9 1 3 2 6]\n", + "\n", + "V_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 1.331966 9.289524 18.583929 31.730047]\n", + "\n", + "QV:SEADATANET.419\n", + "[9 1 2 3 4]\n", + "\n", + "V_SPL_CONC_PUMP [pmol/kg]\n", + "[ nan 21.144424 5.6307 3.015887 4.041974]\n", + "\n", + "QV:SEADATANET.420\n", + "[9 2]\n", + "\n", + "Zn_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 1650.754272 -19.898796 -28.986933 -29.430882]\n", + "\n", + "QV:SEADATANET.421\n", + "[9 3 6 1]\n", + "\n", + "Al_TP_CONC_FISH [nmol/kg]\n", + "[ nan 687.715454 51.490208 19.100677 10.364789]\n", + "\n", + "QV:SEADATANET.422\n", + "[9 3 1 6]\n", + "\n", + "Al_TPL_CONC_FISH [nmol/kg]\n", + "[ nan 660.996948 41.436481 11.532884 5.344119]\n", + "\n", + "QV:SEADATANET.423\n", + "[9 1 6]\n", + "\n", + "Ba_TP_CONC_FISH [pmol/kg]\n", + "[ nan 111.671021 51.580975 19.200584 14.317268]\n", + "\n", + "QV:SEADATANET.424\n", + "[9 1]\n", + "\n", + "Ba_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 46.484684 28.02878 7.490732 5.551902]\n", + "\n", + "QV:SEADATANET.425\n", + "[9 1]\n", + "\n", + "Cd_TP_CONC_FISH [pmol/kg]\n", + "[ nan 102.007706 136.569656 92.71405 88.721367]\n", + "\n", + "QV:SEADATANET.426\n", + "[9 1]\n", + "\n", + "Cd_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 103.190826 194.30751 87.460876 80.36937 ]\n", + "\n", + "QV:SEADATANET.427\n", + "[9 1]\n", + "\n", + "Co_TP_CONC_FISH [pmol/kg]\n", + "[ nan 15.210341 18.204781 12.33639 11.280293]\n", + "\n", + "QV:SEADATANET.428\n", + "[9 1]\n", + "\n", + "Co_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 15.543317 17.88595 7.469463 6.682634]\n", + "\n", + "QV:SEADATANET.429\n", + "[9 1 6]\n", + "\n", + "Cu_TP_CONC_FISH [pmol/kg]\n", + "[ nan 88.650826 103.526047 66.520195 56.520878]\n", + "\n", + "QV:SEADATANET.430\n", + "[9 1 6]\n", + "\n", + "Cu_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 76.036781 60.740292 43.786732 27.000389]\n", + "\n", + "QV:SEADATANET.431\n", + 
"[9 1 6]\n", + "\n", + "Fe_TP_CONC_FISH [nmol/kg]\n", + "[ nan 1.882218 3.733868 0.954305 1.130148]\n", + "\n", + "QV:SEADATANET.432\n", + "[9 1 6]\n", + "\n", + "Fe_TPL_CONC_FISH [nmol/kg]\n", + "[ nan 1.138439 1.863248 0.386656 0.310107]\n", + "\n", + "QV:SEADATANET.433\n", + "[9 1 6]\n", + "\n", + "Mn_TP_CONC_FISH [nmol/kg]\n", + "[ nan 0.151841 0.201322 0.15209 0.110522]\n", + "\n", + "QV:SEADATANET.434\n", + "[9 1]\n", + "\n", + "Mn_TPL_CONC_FISH [nmol/kg]\n", + "[ nan 0.147664 0.238492 0.131041 0.076773]\n", + "\n", + "QV:SEADATANET.435\n", + "[9 1]\n", + "\n", + "Ni_TP_CONC_FISH [pmol/kg]\n", + "[ nan 83.675415 144.370926 138.708786 138.701263]\n", + "\n", + "QV:SEADATANET.436\n", + "[9 1]\n", + "\n", + "Ni_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 67.48732 141.175323 104.912193 95.014244]\n", + "\n", + "QV:SEADATANET.437\n", + "[9 1]\n", + "\n", + "P_TP_CONC_FISH [nmol/kg]\n", + "[ nan 187.721634 263.543549 191.073456 190.559769]\n", + "\n", + "QV:SEADATANET.438\n", + "[9 1]\n", + "\n", + "P_TPL_CONC_FISH [nmol/kg]\n", + "[ nan 105.829308 282.871338 147.596603 126.760963]\n", + "\n", + "QV:SEADATANET.439\n", + "[9 1]\n", + "\n", + "Pb_TP_CONC_FISH [pmol/kg]\n", + "[ nan 2.362012 2.137587 1.174115 1.322653]\n", + "\n", + "QV:SEADATANET.440\n", + "[9 1]\n", + "\n", + "Pb_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 2.227902 2.211512 1.216 1.249463]\n", + "\n", + "QV:SEADATANET.441\n", + "[9 1]\n", + "\n", + "Th_TP_CONC_FISH [pmol/kg]\n", + "[ nan 0.130695 0.107883 0.036751 0.048958]\n", + "\n", + "QV:SEADATANET.442\n", + "[9 1 6]\n", + "\n", + "Th_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 0.034049 0. ]\n", + "\n", + "QV:SEADATANET.443\n", + "[9 1 6]\n", + "\n", + "Ti_TP_CONC_FISH [nmol/kg]\n", + "[ nan 0.095881 0.138347 0.050543 0.078738]\n", + "\n", + "QV:SEADATANET.444\n", + "[9 1 6]\n", + "\n", + "Ti_TPL_CONC_FISH [nmol/kg]\n", + "[ nan 0. 
0.003446]\n", + "\n", + "QV:SEADATANET.445\n", + "[9 6 1]\n", + "\n", + "V_TP_CONC_FISH [pmol/kg]\n", + "[ nan 68.910828 63.827221 7.692878 7.591122]\n", + "\n", + "QV:SEADATANET.446\n", + "[9 1]\n", + "\n", + "V_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 69.329071 63.438828 6.152683 4.277951]\n", + "\n", + "QV:SEADATANET.447\n", + "[9 1]\n", + "\n", + "Zn_TP_CONC_FISH [pmol/kg]\n", + "[ nan 588.273376 180.921753 107.334045 193.626343]\n", + "\n", + "QV:SEADATANET.448\n", + "[9 1]\n", + "\n", + "Zn_TPL_CONC_FISH [pmol/kg]\n", + "[ nan 513.994629 177.999512 115.275703 124.16478 ]\n", + "\n", + "QV:SEADATANET.449\n", + "[9 1 6]\n", + "\n", + "bSi_30_28_TP_DELTA_PUMP [per 10^3]\n", + "[ nan 1.07 1.64 1.33 1.08]\n", + "\n", + "QV:SEADATANET.450\n", + "[9 1]\n", + "\n", + "PIC_LPT_CONC_PUMP [umol C/kg]\n", + "[ nan 0.030585 0.00865 0.011919 0.008463]\n", + "\n", + "QV:SEADATANET.451\n", + "[9 1 2 6 4]\n", + "\n", + "PIC_SPT_CONC_PUMP [umol C/kg]\n", + "[ nan 0.112049 0.084065 0.088756 0.042667]\n", + "\n", + "QV:SEADATANET.452\n", + "[9 1 4 6 2]\n", + "\n", + "C_LPT_CONC_PUMP [umol C/kg]\n", + "[ nan 0.666634 0.365649 0.107361 0.070052]\n", + "\n", + "QV:SEADATANET.453\n", + "[9 1]\n", + "\n", + "C_SPT_CONC_PUMP [umol C/kg]\n", + "[ nan 2.702056 0.212912 0.580666 0.43848 ]\n", + "\n", + "QV:SEADATANET.454\n", + "[9 1 4]\n", + "\n", + "POC_LPT_CONC_PUMP [umol C/kg]\n", + "[ nan 0.398764 0.109805 0.027512 0.020317]\n", + "\n", + "QV:SEADATANET.455\n", + "[9 1 4 2 3]\n", + "\n", + "POC_SPT_CONC_PUMP [umol C/kg]\n", + "[ nan 2.536488 0.643902 0.42978 0.253106]\n", + "\n", + "QV:SEADATANET.456\n", + "[9 1 2 4 3]\n", + "\n", + "N_LPT_CONC_PUMP [nmol N/kg]\n", + "[ nan 85.375427 37.764053 15.441927 15.14714 ]\n", + "\n", + "QV:SEADATANET.457\n", + "[9 1 4 6 3]\n", + "\n", + "N_SPT_CONC_PUMP [nmol N/kg]\n", + "[ nan 533.570679 256.519867 82.778313 71.194481]\n", + "\n", + "QV:SEADATANET.458\n", + "[9 1 4 6 2]\n", + "\n", + "bSi_TP_CONC_PUMP [nmol Si/kg]\n", + "[ nan 302.439026 
360.975616 273.170746 136.585373]\n", + "\n", + "QV:SEADATANET.459\n", + "[9 1]\n", + "\n", + "bSi_LPT_CONC_PUMP [nmol Si/kg]\n", + "[ nan 16.904764 29.141895 10.76722 5.698265]\n", + "\n", + "QV:SEADATANET.460\n", + "[9 1 3 2 4]\n", + "\n", + "bSi_SPT_CONC_PUMP [nmol Si/kg]\n", + "[ nan 4.904008 49.978989 15.02632 13.257302]\n", + "\n", + "QV:SEADATANET.461\n", + "[9 1 3 2 4]\n", + "\n", + "PARTICLEMASS_LPT_CONC_PUMP [ug/kg]\n", + "[ nan 21.502323 5.0924 2.740238 2.023667]\n", + "\n", + "QV:SEADATANET.462\n", + "[9 1 3 2 4]\n", + "\n", + "PARTICLEMASS_SPT_CONC_PUMP [ug/kg]\n", + "[ nan 82.256165 23.490496 19.031048 11.234714]\n", + "\n", + "QV:SEADATANET.463\n", + "[9 1 3 2 4]\n", + "\n", + "Po_210_TP_CONC_BOTTLE [mBq/kg]\n", + "[ nan 0.090601 0.029121 0.016718 0.040343]\n", + "\n", + "QV:SEADATANET.464\n", + "[9 1]\n", + "\n", + "Pb_210_TP_CONC_BOTTLE [mBq/kg]\n", + "[ nan 0.114567 0.107668 0.041092 0.087087]\n", + "\n", + "QV:SEADATANET.465\n", + "[9 1]\n", + "\n", + "Po_210_SPT_CONC_PUMP [mBq/kg]\n", + "[ nan 0.1792 0.0932 0.1027 0.2343]\n", + "\n", + "QV:SEADATANET.466\n", + "[9 1 6]\n", + "\n", + "Po_210_LPT_CONC_PUMP [mBq/kg]\n", + "[ nan 0.06 0.015 0.033 0.051]\n", + "\n", + "QV:SEADATANET.467\n", + "[9 1 6]\n", + "\n", + "Pb_210_SPT_CONC_PUMP [mBq/kg]\n", + "[ nan 0.1508 0.1742 0.276 0.1953]\n", + "\n", + "QV:SEADATANET.468\n", + "[9 1 6]\n", + "\n", + "Pb_210_LPT_CONC_PUMP [mBq/kg]\n", + "[ nan 0.093 0.075 0.102 0.084]\n", + "\n", + "QV:SEADATANET.469\n", + "[9 1]\n", + "\n", + "Pa_231_TP_CONC_PUMP [uBq/kg]\n", + "[ nan 0.035907 0.017296 0.01316 0.113627]\n", + "\n", + "QV:SEADATANET.470\n", + "[9 1 3]\n", + "\n", + "Pa_231_SPT_CONC_PUMP [uBq/kg]\n", + "[ nan 0.002451 0.007665 0.039516 0.110679]\n", + "\n", + "QV:SEADATANET.471\n", + "[9 1 6 3]\n", + "\n", + "Pa_231_LPT_CONC_PUMP [uBq/kg]\n", + "[ nan 0.007314 0.008882 0.010333 0.000989]\n", + "\n", + "QV:SEADATANET.472\n", + "[9 1 4]\n", + "\n", + "Th_228_SPT_CONC_PUMP [uBq/kg]\n", + "[ nan 6.504068 
8.130088 9.756098 13.008137]\n", + "\n", + "QV:SEADATANET.473\n", + "[9 1 6 2 3]\n", + "\n", + "Th_228_LPT_CONC_PUMP [uBq/kg]\n", + "[ nan 1.23 -0.4 0.23 0.25]\n", + "\n", + "QV:SEADATANET.474\n", + "[9 2 6 3]\n", + "\n", + "Th_230_TP_CONC_PUMP [uBq/kg]\n", + "[ nan 0.391425 0.566893 0.595886 0.506238]\n", + "\n", + "QV:SEADATANET.475\n", + "[9 1]\n", + "\n", + "Th_230_SPT_CONC_PUMP [uBq/kg]\n", + "[ nan 0.10078 0.257379 0.834169 2.136799]\n", + "\n", + "QV:SEADATANET.476\n", + "[9 1 3]\n", + "\n", + "Th_230_LPT_CONC_PUMP [uBq/kg]\n", + "[ nan 0.102821 0.129562 0.132244 0.029499]\n", + "\n", + "QV:SEADATANET.477\n", + "[9 1 2 4]\n", + "\n", + "Th_232_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 0.091275 0.086139 0.054428 0.30588 ]\n", + "\n", + "QV:SEADATANET.478\n", + "[9 1]\n", + "\n", + "Th_232_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.021205 0.082637 0.459946 1.867481]\n", + "\n", + "QV:SEADATANET.479\n", + "[9 1 3 6]\n", + "\n", + "Th_232_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 0.058755 0.11342 0.107164 0.63596 ]\n", + "\n", + "QV:SEADATANET.480\n", + "[9 1 4]\n", + "\n", + "Th_234_SPT_CONC_PUMP [mBq/kg]\n", + "[ nan 2.781774 1.471779 3.213887 2.923771]\n", + "\n", + "QV:SEADATANET.481\n", + "[9 1 4 6 2]\n", + "\n", + "Th_234_LPT_CONC_PUMP [mBq/kg]\n", + "[ nan 0.363003 0.373994 0.352724 0.647588]\n", + "\n", + "QV:SEADATANET.482\n", + "[9 1 2 4 3]\n", + "\n", + "Po_210_TP_CONC_UWAY [mBq/kg]\n", + "[ nan 0.019187 0.04813 0.137561 0.054797]\n", + "\n", + "QV:SEADATANET.483\n", + "[9 1]\n", + "\n", + "Pb_210_TP_CONC_UWAY [mBq/kg]\n", + "[ nan 0.030081 0.057073 0.143415 0.045528]\n", + "\n", + "QV:SEADATANET.484\n", + "[9 1]\n", + "\n", + "Nd_143_144_TP_EPSILON_PUMP [per 10^4]\n", + "[ nan -6.748074 -7.76467 -5.730428 -7.449456]\n", + "\n", + "QV:SEADATANET.485\n", + "[9 1 3]\n", + "\n", + "Y_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 1.32 0.74 2.91 0.568]\n", + "\n", + "QV:SEADATANET.486\n", + "[9 1 2 6 3]\n", + "\n", + "La_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.96 1.73 0.6 
1.01]\n", + "\n", + "QV:SEADATANET.487\n", + "[9 1 4 2 6]\n", + "\n", + "Ce_TP_CONC_BOTTLE [pmol/kg]\n", + "[nan 2. 2.9 1.1 1.5]\n", + "\n", + "QV:SEADATANET.488\n", + "[9 1 4 3 2]\n", + "\n", + "Pr_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.22 0.37 0.06 0.16]\n", + "\n", + "QV:SEADATANET.489\n", + "[9 1 4 2 6]\n", + "\n", + "Nd_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.76 1.38 0.22 0.51]\n", + "\n", + "QV:SEADATANET.490\n", + "[9 1 4]\n", + "\n", + "Sm_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.16 0.33 0.05 0.11]\n", + "\n", + "QV:SEADATANET.491\n", + "[9 1]\n", + "\n", + "Eu_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.032 0.06 0.011 0.022]\n", + "\n", + "QV:SEADATANET.492\n", + "[9 1]\n", + "\n", + "Gd_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.124 0.238 0.044 0.095]\n", + "\n", + "QV:SEADATANET.493\n", + "[9 1]\n", + "\n", + "Tb_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.018 0.035 0.007 0.031]\n", + "\n", + "QV:SEADATANET.494\n", + "[9 1 2]\n", + "\n", + "Dy_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.107 0.192 0.041 0.071]\n", + "\n", + "QV:SEADATANET.495\n", + "[9 1 2]\n", + "\n", + "Ho_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.02 0.036 0.008 0.014]\n", + "\n", + "QV:SEADATANET.496\n", + "[9 1 2]\n", + "\n", + "Er_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.051 0.097 0.023 0.039]\n", + "\n", + "QV:SEADATANET.497\n", + "[9 1 2]\n", + "\n", + "Tm_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.0078 0.013 0.0037 0.0065]\n", + "\n", + "QV:SEADATANET.498\n", + "[9 1 2]\n", + "\n", + "Yb_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.0495 0.0822 0.0212 0.036 ]\n", + "\n", + "QV:SEADATANET.499\n", + "[9 1 2]\n", + "\n", + "Lu_TP_CONC_BOTTLE [pmol/kg]\n", + "[ nan 0.0051 0.0077 0.0017 0.0037]\n", + "\n", + "QV:SEADATANET.500\n", + "[9 1 2]\n", + "\n", + "Y_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 1.970732 0.357073 0.192195 0.190244]\n", + "\n", + "QV:SEADATANET.501\n", + "[9 2 1 3]\n", + "\n", + "Y_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 3.229268 0.68878 0.618537 0.657561]\n", + "\n", + "QV:SEADATANET.502\n", + "[9 2 
1 3]\n", + "\n", + "Nd_TP_CONC_PUMP [pmol/kg]\n", + "[ nan 0.600837 0.532094 1.052143 0.952824]\n", + "\n", + "QV:SEADATANET.503\n", + "[9 1 3]\n", + "\n", + "Nd_LPT_CONC_PUMP [pmol/kg]\n", + "[ nan 1.336585 0.218537 0.115122 0.130732]\n", + "\n", + "QV:SEADATANET.504\n", + "[9 2 3 1]\n", + "\n", + "Nd_SPT_CONC_PUMP [pmol/kg]\n", + "[ nan 2.57561 0.281951 0.303415 0.418537]\n", + "\n", + "QV:SEADATANET.505\n", + "[9 2 3 1]\n", + "\n", + "Y_TP_CONC_FISH [pmol/kg]\n", + "[ nan 12.603902 1.971317 1.403707 1.233561]\n", + "\n", + "QV:SEADATANET.506\n", + "[9 1]\n", + "\n", + "La_TP_CONC_FISH [pmol/kg]\n", + "[ nan 2.045171 1.915512 1.972293 2.076 ]\n", + "\n", + "QV:SEADATANET.507\n", + "[9 1]\n", + "\n", + "Fe_56_54_TP_DELTA_BOTTLE [per 10^3]\n", + "[ nan 0.046448 0.109383 0.016848 -0.015711]\n", + "\n", + "QV:SEADATANET.508\n", + "[9 1]\n", + "\n", + "Ba_138_134_SPL_DELTA_PUMP [per 10^3]\n", + "[ nan 0. -0.05 -0.02 -0.03]\n", + "\n", + "QV:SEADATANET.509\n", + "[9 2]\n", + "\n", + "Cu_65_63_SPT_DELTA_PUMP [per 10^3]\n", + "[ nan 0.37 0.27 0.24 0.11]\n", + "\n", + "QV:SEADATANET.510\n", + "[9 2 3]\n", + "\n", + "Cu_65_63_SPL_DELTA_PUMP [per 10^3]\n", + "[ nan 0.48 0.54 0.39 0.27]\n", + "\n", + "QV:SEADATANET.511\n", + "[9 2 3]\n", + "\n", + "Allo_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 2.880534 0. 
5.768254 6.78763 ]\n", + "\n", + "QV:SEADATANET.512\n", + "[9 1 6]\n", + "\n", + "Anth_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[nan 0.]\n", + "\n", + "QV:SEADATANET.513\n", + "[9 6]\n", + "\n", + "But fuco_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 35.493546 33.028984 42.024036 48.899803]\n", + "\n", + "QV:SEADATANET.514\n", + "[9 1 6]\n", + "\n", + "Alpha Car_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 3.524788 55.936646 3.891268 32.790211]\n", + "\n", + "QV:SEADATANET.515\n", + "[9 1 6]\n", + "\n", + "Beta Car_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 10.269084 8.585586 10.358151 7.232014]\n", + "\n", + "QV:SEADATANET.516\n", + "[9 1 6]\n", + "\n", + "Diadino_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 69.338997 35.938 13.28 10.502 ]\n", + "\n", + "QV:SEADATANET.517\n", + "[9 1 6]\n", + "\n", + "Diato_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 3.443475 3.578867 2.974312 7.085016]\n", + "\n", + "QV:SEADATANET.518\n", + "[9 1 6]\n", + "\n", + "Fuco_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 145.857208 143.758408 212.44043 108.200089]\n", + "\n", + "QV:SEADATANET.519\n", + "[9 1 6]\n", + "\n", + "Hex fuco_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 102.854836 93.955658 94.594887 68.711449]\n", + "\n", + "QV:SEADATANET.520\n", + "[9 1 6]\n", + "\n", + "Lut_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 0. 
12.51382 3.665026 5.146942]\n", + "\n", + "QV:SEADATANET.521\n", + "[9 6 1]\n", + "\n", + "Neo_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 4.245147 36.744522 2.638522 37.6022 ]\n", + "\n", + "QV:SEADATANET.522\n", + "[9 1 6]\n", + "\n", + "Perid_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 11.945248 5.510989 12.257776 3.910178]\n", + "\n", + "QV:SEADATANET.523\n", + "[9 1 6]\n", + "\n", + "Pras_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 3.851663 46.023136 2.398029 39.186272]\n", + "\n", + "QV:SEADATANET.524\n", + "[9 1 6]\n", + "\n", + "Viola_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 8.175071 8.402634 5.047509 4.284502]\n", + "\n", + "QV:SEADATANET.525\n", + "[9 1 6]\n", + "\n", + "Zea_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 0. 14.658606 16.749643 10.810811]\n", + "\n", + "QV:SEADATANET.526\n", + "[9 6 1]\n", + "\n", + "Gyrox_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[nan 0.]\n", + "\n", + "QV:SEADATANET.527\n", + "[9 6]\n", + "\n", + "Allo_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 0. 
1.620942 20.188374 17.506992]\n", + "\n", + "QV:SEADATANET.528\n", + "[9 6 1]\n", + "\n", + "But fuco_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 18.731256 34.212151 48.980167 55.461548]\n", + "\n", + "QV:SEADATANET.529\n", + "[9 1]\n", + "\n", + "Beta Car_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 7.805306 7.048862 8.974472 7.832905]\n", + "\n", + "QV:SEADATANET.530\n", + "[9 1]\n", + "\n", + "Diadino_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 31.242001 21.532 80.603996 89.880745]\n", + "\n", + "QV:SEADATANET.531\n", + "[9 1]\n", + "\n", + "Fuco_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 10.968168 42.386414 53.230206 67.281281]\n", + "\n", + "QV:SEADATANET.532\n", + "[9 1]\n", + "\n", + "Hex fuco_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 88.186478 89.810249 170.852112 254.361237]\n", + "\n", + "QV:SEADATANET.533\n", + "[9 1]\n", + "\n", + "Lut_HPLC_TP_CONC_FISH [ng/liter]\n", + "[nan 0.]\n", + "\n", + "QV:SEADATANET.534\n", + "[9 6]\n", + "\n", + "Perid_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 5.472818 4.42072 23.007784 81.729637]\n", + "\n", + "QV:SEADATANET.535\n", + "[9 1 6]\n", + "\n", + "Viola_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 4.317011 5.415626 0. 24.516607]\n", + "\n", + "QV:SEADATANET.536\n", + "[9 1 6]\n", + "\n", + "Zea_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 32.383663 0. 15.97846 13.495225]\n", + "\n", + "QV:SEADATANET.537\n", + "[9 1 6]\n", + "\n", + "Chl a_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 343.280304 291.095856 368.654602 234.799271]\n", + "\n", + "QV:SEADATANET.538\n", + "[9 1 6]\n", + "\n", + "Chl b_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 0. 85.002571 59.243439 60.365414]\n", + "\n", + "QV:SEADATANET.539\n", + "[9 6 1]\n", + "\n", + "Chl c3_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 0. 
29.862658 17.607647 46.841129]\n", + "\n", + "QV:SEADATANET.540\n", + "[9 6 1]\n", + "\n", + "Chl c1-chl c2_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 318.122711 3348.00293 142.516449 7650.768066]\n", + "\n", + "QV:SEADATANET.541\n", + "[9 1]\n", + "\n", + "Chl c TOT_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 131.300003 122.699997 177.5 76.599998]\n", + "\n", + "QV:SEADATANET.542\n", + "[9 1 6]\n", + "\n", + "Chl a allom_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 5.085326 52.472664 6.207444 80.428101]\n", + "\n", + "QV:SEADATANET.543\n", + "[9 1]\n", + "\n", + "Chl a epimer_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 6.699603 53.29155 10.527572 4.659686]\n", + "\n", + "QV:SEADATANET.544\n", + "[9 1]\n", + "\n", + "Chlide a_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 24.69055 288.909882 4.629637 1406.627686]\n", + "\n", + "QV:SEADATANET.545\n", + "[9 1 6]\n", + "\n", + "DV chl a_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 0. 6.046967 6.508017 11.18438 ]\n", + "\n", + "QV:SEADATANET.546\n", + "[9 6 1]\n", + "\n", + "Chl a-DV chla_HPLC_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 449. 404.5 500.899994 241. ]\n", + "\n", + "QV:SEADATANET.547\n", + "[9 1 6]\n", + "\n", + "Chl a_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 181.916824 181.609406 245.424957 517.605774]\n", + "\n", + "QV:SEADATANET.548\n", + "[9 1]\n", + "\n", + "Chl b_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 0. 53.777664 34.019558]\n", + "\n", + "QV:SEADATANET.549\n", + "[9 6 1]\n", + "\n", + "Chl c3_HPLC_TP_CONC_FISH [ng/liter]\n", + "[nan 0.]\n", + "\n", + "QV:SEADATANET.550\n", + "[9 6]\n", + "\n", + "DV chl a_HPLC_TP_CONC_FISH [ng/liter]\n", + "[ nan 14.120045 0. ]\n", + "\n", + "QV:SEADATANET.551\n", + "[9 1 6]\n", + "\n", + "CHLA_FLUOR_TP_CONC_BOTTLE [ng/liter]\n", + "[ nan 53.660454 60.322136 54.215595 87.284599]\n", + "\n", + "QV:SEADATANET.552\n", + "[9 1 6]\n", + "\n", + "PHAEO_FLUOR_TP_CONC_BOTTLE [ng/liter]\n", + "[nan 31. 32. 28. 
29.]\n", + "\n", + "QV:SEADATANET.553\n", + "[9 1]\n", + "\n", + "PEP_VAAEAVLSMTK_NiSOD_ProSyn_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 441.946869 379.980927 97.860695 40.193665]\n", + "\n", + "QV:SEADATANET.554\n", + "[9 1 6]\n", + "\n", + "PEP_SPYNQSLVANQIVNK_IdiA_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 0. 48.103069 25.950068 203.370743]\n", + "\n", + "QV:SEADATANET.555\n", + "[9 6 1]\n", + "\n", + "PEP_LHNFISSAESPK_Fld_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 1.39776 34.549454 7.477602 0.495968]\n", + "\n", + "QV:SEADATANET.556\n", + "[9 1 6]\n", + "\n", + "PEP_AGADMVGYVDK_Fld_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 15.13088 78.002304 13.546335 1.454976]\n", + "\n", + "QV:SEADATANET.557\n", + "[9 1 6]\n", + "\n", + "PEP_TVGIYYATTTGK_Fld_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 93.825279 279.165466 85.259254 7.3104 ]\n", + "\n", + "QV:SEADATANET.558\n", + "[9 1]\n", + "\n", + "PEP_VNSVIDAIAEAAK_P-II-glnB-glnK_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 17.52832 25.690355 6.244293 0.482912]\n", + "\n", + "QV:SEADATANET.559\n", + "[9 1 6]\n", + "\n", + "PEP_LSHQAIAEAIGSTR_NtcA_Cyano_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 15.792 16.514317 4.540525 1.601824]\n", + "\n", + "QV:SEADATANET.560\n", + "[9 1 6]\n", + "\n", + "PEP_SKLEDDPANPELILTAR_PhoP_Syn_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 0. 0.491568 0.142632 0.579424]\n", + "\n", + "QV:SEADATANET.561\n", + "[9 6 1]\n", + "\n", + "PEP_LIDQDGVPVVFGGWTSASR_UreaTran_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 1791.224976 1486.048218 539.431274 85.778305]\n", + "\n", + "QV:SEADATANET.562\n", + "[9 1 6]\n", + "\n", + "PEP_VVGEDYLPLGNTEVAPIISK_UreaTran_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 1830.47168 2480.089355 728.808899 138.801819]\n", + "\n", + "QV:SEADATANET.563\n", + "[9 1 6]\n", + "\n", + "PEP_IEYIVEDGASDWPTFAEK_UreaTran_ProSyn_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 13561.165039 7878.208984 2308.209961 0. 
]\n", + "\n", + "QV:SEADATANET.564\n", + "[9 1 6]\n", + "\n", + "PEP_IPEDIAFAESR_UreC_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[nan 1.1 0.8 1.4 2.1]\n", + "\n", + "QV:SEADATANET.565\n", + "[9 1 6]\n", + "\n", + "PEP_VGVAGPVGSGK_UreG_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[nan 0.5 0.3 0.4 0.8]\n", + "\n", + "QV:SEADATANET.566\n", + "[9 1 6]\n", + "\n", + "PEP_FDYDGDYGTVLNR_UDP-sulfoquin_m-taxa_TP_CONC_PUMP [fmol/liter]\n", + "[nan 3.5 2.6 4.6 2.1]\n", + "\n", + "QV:SEADATANET.567\n", + "[9 1 6]\n", + "\n", + "PEP_NEAVENDLIVDNK_UDP-sulfoquin_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[nan 1.8 1.1 2.4 2.9]\n", + "\n", + "QV:SEADATANET.568\n", + "[9 1 6]\n", + "\n", + "PEP_EAYPDFASAK_NH4-transporter_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[nan 1.2 1.1 0.1 0. ]\n", + "\n", + "QV:SEADATANET.569\n", + "[9 1 6]\n", + "\n", + "PEP_FDSLINSADNVMTYK_Glut-synt_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 2.00000000e-01 1.00000000e-01 3.00000000e-01\n", + " 1.02400002e+02]\n", + "\n", + "QV:SEADATANET.570\n", + "[9 1 6]\n", + "\n", + "PEP_EGYFPVSPNDTAQDIR_Glut-synt_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 10.8 6.6 9.6 21.299999]\n", + "\n", + "QV:SEADATANET.571\n", + "[9 1 6]\n", + "\n", + "PEP_HAPSFLAFTNPTTNSYK_Glut-synt_ProSyn_TP_CONC_PUMP [fmol/liter]\n", + "[ nan 29.9 15.6 17.700001 52. ]\n", + "\n", + "QV:SEADATANET.572\n", + "[9 1]\n", + "\n", + "PEP_VASLTGADINYLPNPR_UDP-sulfoquin_Pro_TP_CONC_PUMP [fmol/liter]\n", + "[nan 2.1 1.4 2.6 2.4]\n", + "\n", + "QV:SEADATANET.573\n", + "[9 1 6]\n", + "\n", + "CELL_VOLUME_BOTTLE [um^3]\n", + "[ nan 29.4 4.7 6.2 338. ]\n", + "\n", + "QV:SEADATANET.574\n", + "[9 1]\n", + "\n", + "CELL_TYPE_BOTTLE\n", + "[nan 2. 3. 1. 5.]\n", + "\n", + "QV:SEADATANET.575\n", + "[9 1]\n", + "\n", + "Fe_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 0.69 3.14 56.5 1.39]\n", + "\n", + "QV:SEADATANET.576\n", + "[9 1]\n", + "\n", + "C_CELL_CONC_BOTTLE [fmol/cell]\n", + "[ nan 430. 76.599998 100. 4260. 
]\n", + "\n", + "QV:SEADATANET.577\n", + "[9 1]\n", + "\n", + "Si_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 33. 40.400002 27.9 12.9 ]\n", + "\n", + "QV:SEADATANET.578\n", + "[9 1]\n", + "\n", + "P_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 38. 200. 375. 6050.]\n", + "\n", + "QV:SEADATANET.579\n", + "[9 1]\n", + "\n", + "S_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 287. 870. 13100. 462.]\n", + "\n", + "QV:SEADATANET.580\n", + "[9 1]\n", + "\n", + "Mn_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 0.38 7.76 0.06 0.46]\n", + "\n", + "QV:SEADATANET.581\n", + "[9 1]\n", + "\n", + "Co_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 0.07 0.1 0.05 0.01]\n", + "\n", + "QV:SEADATANET.582\n", + "[9 1]\n", + "\n", + "Ni_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 0.07 0.22 3.03 0.37]\n", + "\n", + "QV:SEADATANET.583\n", + "[9 1]\n", + "\n", + "Cu_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 1.48 0.71 4.54 1.7 ]\n", + "\n", + "QV:SEADATANET.584\n", + "[9 1]\n", + "\n", + "Zn_CELL_CONC_BOTTLE [amol/cell]\n", + "[ nan 1.4 21.700001 1.04 22.1 ]\n", + "\n", + "QV:SEADATANET.585\n", + "[9 1]\n", + "\n", + "QV:ODV:SAMPLE\n", + "[1]\n", + "\n" + ] + } + ], + "source": [ + "for col in df.columns:\n", + " print(col)\n", + " print(df[col].unique()[:5])\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26d94fd4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yyyy-mm-ddThh:mm:ss.sssLongitude [degrees_east]Latitude [degrees_north]DEPTH [m]BODC Bottle Number:INTEGER
02014-05-17T22:29:00349.2999938.43292957.11214048
12014-05-17T22:29:00349.2999938.43292957.21214039
22014-05-17T22:29:00349.2999938.43292957.21214027
32014-05-17T22:29:00349.2999938.43292957.21214018
42014-05-17T22:29:00349.2999938.43292957.21214036
\n", + "
" + ], + "text/plain": [ + " yyyy-mm-ddThh:mm:ss.sss Longitude [degrees_east] Latitude [degrees_north] \\\n", + "0 2014-05-17T22:29:00 349.29999 38.4329 \n", + "1 2014-05-17T22:29:00 349.29999 38.4329 \n", + "2 2014-05-17T22:29:00 349.29999 38.4329 \n", + "3 2014-05-17T22:29:00 349.29999 38.4329 \n", + "4 2014-05-17T22:29:00 349.29999 38.4329 \n", + "\n", + " DEPTH [m] BODC Bottle Number:INTEGER \n", + "0 2957.1 1214048 \n", + "1 2957.2 1214039 \n", + "2 2957.2 1214027 \n", + "3 2957.2 1214018 \n", + "4 2957.2 1214036 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 'BODC Bottle Number:INTEGER'\n", + "\n", + "cols_unique = [\n", + " 'yyyy-mm-ddThh:mm:ss.sss', \n", + " 'Longitude [degrees_east]',\n", + " 'Latitude [degrees_north]',\n", + " 'DEPTH [m]',\n", + " 'BODC Bottle Number:INTEGER'\n", + " # 'Rosette Bottle Number:INTEGER',\n", + "]\n", + "\n", + "df[cols_unique].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df416ab1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CruiseStation:METAVAR:INDEXED_TEXTTypeyyyy-mm-ddThh:mm:ss.sssLongitude [degrees_east]Latitude [degrees_north]Bot. Depth [m]Operator's Cruise Name:METAVAR:INDEXED_TEXTShip Name:METAVAR:INDEXED_TEXTPeriod:METAVAR:INDEXED_TEXT...QV:SEADATANET.581Co_CELL_CONC_BOTTLE [amol/cell]QV:SEADATANET.582Ni_CELL_CONC_BOTTLE [amol/cell]QV:SEADATANET.583Cu_CELL_CONC_BOTTLE [amol/cell]QV:SEADATANET.584Zn_CELL_CONC_BOTTLE [amol/cell]QV:SEADATANET.585QV:ODV:SAMPLE
9571GA03Station 12B2010-11-02T17:03:15335.502217.40213548.0KN199Knorr15/10/2010 - 04/11/2010...9NaN90.0711.48121.70000111
9573GA03Station 12B2010-11-02T17:03:15335.502217.40213548.0KN199Knorr15/10/2010 - 04/11/2010...10.101NaN9NaN91.04000011
9574GA03Station 12B2010-11-02T17:03:15335.502217.40213548.0KN199Knorr15/10/2010 - 04/11/2010...1NaN9NaN9NaN922.10000011
9576GA03Station 12B2010-11-02T17:03:15335.502217.40213548.0KN199Knorr15/10/2010 - 04/11/2010...10.101NaN9NaN92.50000011
9577GA03Station 12B2010-11-02T17:03:15335.502217.40213548.0KN199Knorr15/10/2010 - 04/11/2010...10.071NaN9NaN90.56000011
..................................................................
92211GP1636B2013-12-17T04:09:34207.9930-10.50195162.0TN303Thomas G. Thompson25/10/2013 - 20/12/2013...1NaN93.721NaN96.95000011
92212GP1636B2013-12-17T04:09:34207.9930-10.50195162.0TN303Thomas G. Thompson25/10/2013 - 20/12/2013...11.0611.681NaN911.30000011
92213GP1636B2013-12-17T04:09:34207.9930-10.50195162.0TN303Thomas G. Thompson25/10/2013 - 20/12/2013...1NaN914.701NaN916.29999911
92214GP1636B2013-12-17T04:09:34207.9930-10.50195162.0TN303Thomas G. Thompson25/10/2013 - 20/12/2013...1NaN92.051NaN912.80000011
92215GP1636B2013-12-17T04:09:34207.9930-10.50195162.0TN303Thomas G. Thompson25/10/2013 - 20/12/2013...1NaN91.071NaN922.90000011
\n", + "

423 rows × 1188 columns

\n", + "
" + ], + "text/plain": [ + " Cruise Station:METAVAR:INDEXED_TEXT Type yyyy-mm-ddThh:mm:ss.sss \\\n", + "9571 GA03 Station 12 B 2010-11-02T17:03:15 \n", + "9573 GA03 Station 12 B 2010-11-02T17:03:15 \n", + "9574 GA03 Station 12 B 2010-11-02T17:03:15 \n", + "9576 GA03 Station 12 B 2010-11-02T17:03:15 \n", + "9577 GA03 Station 12 B 2010-11-02T17:03:15 \n", + "... ... ... ... ... \n", + "92211 GP16 36 B 2013-12-17T04:09:34 \n", + "92212 GP16 36 B 2013-12-17T04:09:34 \n", + "92213 GP16 36 B 2013-12-17T04:09:34 \n", + "92214 GP16 36 B 2013-12-17T04:09:34 \n", + "92215 GP16 36 B 2013-12-17T04:09:34 \n", + "\n", + " Longitude [degrees_east] Latitude [degrees_north] Bot. Depth [m] \\\n", + "9571 335.5022 17.4021 3548.0 \n", + "9573 335.5022 17.4021 3548.0 \n", + "9574 335.5022 17.4021 3548.0 \n", + "9576 335.5022 17.4021 3548.0 \n", + "9577 335.5022 17.4021 3548.0 \n", + "... ... ... ... \n", + "92211 207.9930 -10.5019 5162.0 \n", + "92212 207.9930 -10.5019 5162.0 \n", + "92213 207.9930 -10.5019 5162.0 \n", + "92214 207.9930 -10.5019 5162.0 \n", + "92215 207.9930 -10.5019 5162.0 \n", + "\n", + " Operator's Cruise Name:METAVAR:INDEXED_TEXT \\\n", + "9571 KN199 \n", + "9573 KN199 \n", + "9574 KN199 \n", + "9576 KN199 \n", + "9577 KN199 \n", + "... ... \n", + "92211 TN303 \n", + "92212 TN303 \n", + "92213 TN303 \n", + "92214 TN303 \n", + "92215 TN303 \n", + "\n", + " Ship Name:METAVAR:INDEXED_TEXT Period:METAVAR:INDEXED_TEXT ... \\\n", + "9571 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9573 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9574 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9576 Knorr 15/10/2010 - 04/11/2010 ... \n", + "9577 Knorr 15/10/2010 - 04/11/2010 ... \n", + "... ... ... ... \n", + "92211 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", + "92212 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", + "92213 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", + "92214 Thomas G. Thompson 25/10/2013 - 20/12/2013 ... \n", + "92215 Thomas G. 
Thompson 25/10/2013 - 20/12/2013 ... \n", + "\n", + " QV:SEADATANET.581 Co_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.582 \\\n", + "9571 9 NaN 9 \n", + "9573 1 0.10 1 \n", + "9574 1 NaN 9 \n", + "9576 1 0.10 1 \n", + "9577 1 0.07 1 \n", + "... ... ... ... \n", + "92211 1 NaN 9 \n", + "92212 1 1.06 1 \n", + "92213 1 NaN 9 \n", + "92214 1 NaN 9 \n", + "92215 1 NaN 9 \n", + "\n", + " Ni_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.583 \\\n", + "9571 0.07 1 \n", + "9573 NaN 9 \n", + "9574 NaN 9 \n", + "9576 NaN 9 \n", + "9577 NaN 9 \n", + "... ... ... \n", + "92211 3.72 1 \n", + "92212 1.68 1 \n", + "92213 14.70 1 \n", + "92214 2.05 1 \n", + "92215 1.07 1 \n", + "\n", + " Cu_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.584 \\\n", + "9571 1.48 1 \n", + "9573 NaN 9 \n", + "9574 NaN 9 \n", + "9576 NaN 9 \n", + "9577 NaN 9 \n", + "... ... ... \n", + "92211 NaN 9 \n", + "92212 NaN 9 \n", + "92213 NaN 9 \n", + "92214 NaN 9 \n", + "92215 NaN 9 \n", + "\n", + " Zn_CELL_CONC_BOTTLE [amol/cell] QV:SEADATANET.585 QV:ODV:SAMPLE \n", + "9571 21.700001 1 1 \n", + "9573 1.040000 1 1 \n", + "9574 22.100000 1 1 \n", + "9576 2.500000 1 1 \n", + "9577 0.560000 1 1 \n", + "... ... ... ... 
\n", + "92211 6.950000 1 1 \n", + "92212 11.300000 1 1 \n", + "92213 16.299999 1 1 \n", + "92214 12.800000 1 1 \n", + "92215 22.900000 1 1 \n", + "\n", + "[423 rows x 1188 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[cols_unique].duplicated()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dba1f37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([nan, 24.0, 23.0, ..., 3056.0, 3057.0, 0], dtype=object)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['GEOTRACES Sample ID:INDEXED_TEXT'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b35da85", + "metadata": {}, "outputs": [ { "data": { @@ -5124,31 +9220,393 @@ } ], "source": [ - "for col in df.columns:\n", - " print(col)\n", - " print(df[col].unique()[:5])\n", - " print()" + "for col in df.columns:\n", + " print(col)\n", + " print(df[col].unique()[:5])\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e301352", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Pourquoi pas?', 'RRS James Cook', 'Pelagia', 'Knorr',\n", + " 'Angeles Alvarino', 'RRS Discovery', 'Meteor', 'FS Meteor',\n", + " 'James Cook', 'Atlantic Explorer', 'Pelican', 'Point Sur',\n", + " 'Sagar Kanya', 'Hakuho Maru', 'Tangaroa', 'Aurora Australis',\n", + " 'Marion Dufresne', 'FS Polarstern', 'Jakov Smirnitskiy',\n", + " 'Investigator', 'S. A. Agulhas II', 'Sam Rothberg', 'Healy',\n", + " 'Amundsen', 'Polarstern', \"L'Atalante\", 'RV Southern Surveyor',\n", + " 'Roger Revelle', 'Thomas G. Thompson', 'Sonne', 'Kilo Moana',\n", + " 'John P. Tully', 'Nathaniel B. 
Palmer', 'Akademik Tryoshnikov'],\n", + " dtype=object)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Ship Name:METAVAR:INDEXED_TEXT\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55d645fa-660d-4298-b72b-fb06dbd2e2d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ctdsal_d_conc_sensor [pss-78]\n", + "salinity_d_conc_bottle\n", + "salinity_d_conc_pump\n", + "salinity_d_conc_fish\n", + "salinity_d_conc_uway\n", + "salinity_d_conc_boat_pump\n", + "ctdtmp_t_value_sensor [deg c]\n", + "oxygen_d_conc_bottle [umol/kg]\n", + "ctdoxy_d_conc_sensor [umol/kg]\n", + "U_236_238_T_RATIO_BOTTLE [per 10^12]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "def find_print_col(s, cols, lower=True):\n", + " cols = cols if not lower else [col.lower() for col in cols]\n", + " for col in cols:\n", + " if s in col: print(col)\n", + "\n", + "find_print_col('sal', df.columns)\n", + "find_print_col('tmp', df.columns)\n", + "find_print_col('oxy', df.columns)\n", + "find_print_col('U_236_238', df.columns, lower=False)" + ] + }, + { + "cell_type": "markdown", + "id": "bf1a71c0", + "metadata": {}, + "source": [ + "## Data transformation pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "c4407027-942c-4240-a92d-a40311c05afd", + "metadata": {}, + "source": [ + "### Select columns of interest" ] }, { "cell_type": "code", "execution_count": null, - "id": "5e301352", + "id": "55d50ff7-63dc-4719-a6c1-c2ff7e6ecb7f", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "# U_236_238\n", + "# Done: Th_232, I_129, Ac_227" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d3a7bba-39d7-4fc3-8f0e-fb83ff52dcc2", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "common_coi = ['yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]',\n", + " 'Latitude [degrees_north]', 'Bot. 
Depth [m]', 'DEPTH [m]', 'BODC Bottle Number:INTEGER']\n", + "\n", + "nuclides_pattern = ['^TRITI', '^Th_228', '^Th_23[024]', '^Pa_231', \n", + " '^U_236_[DT]', '^Be_', '^Cs_137', '^Pb_210', '^Po_210',\n", + " '^Ra_22[3468]', 'Np_237', '^Pu_239_[D]', '^Pu_240', '^Pu_239_Pu_240',\n", + " '^I_129', '^Ac_227'] \n", + "\n", + "class SelectColsOfInterestCB(Callback):\n", + " \"Select columns of interest.\"\n", + " def __init__(self, common_coi, nuclides_pattern): fc.store_attr()\n", + " def __call__(self, tfm):\n", + " nuc_of_interest = [c for c in tfm.df.columns if \n", + " any(re.match(pattern, c) for pattern in self.nuclides_pattern)]\n", + "\n", + " tfm.df = tfm.df[self.common_coi + nuc_of_interest]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9005e7e4-f0d7-4944-abea-60e5f5522e22", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "tfm = Transformer(df, cbs=[\n", + " SelectColsOfInterestCB(common_coi, nuclides_pattern)\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52f52056-f58d-4ef5-b5de-dc8adf51eac0", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "df_test shape: (105417, 86)\n" + ] + }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yyyy-mm-ddThh:mm:ss.sssLongitude [degrees_east]Latitude [degrees_north]Bot. Depth [m]DEPTH [m]BODC Bottle Number:INTEGERTRITIUM_D_CONC_BOTTLE [TU]Cs_137_D_CONC_BOTTLE [uBq/kg]I_129_D_CONC_BOTTLE [atoms/kg]Np_237_D_CONC_BOTTLE [uBq/kg]...Th_230_TP_CONC_PUMP [uBq/kg]Th_230_SPT_CONC_PUMP [uBq/kg]Th_230_LPT_CONC_PUMP [uBq/kg]Th_232_TP_CONC_PUMP [pmol/kg]Th_232_SPT_CONC_PUMP [pmol/kg]Th_232_LPT_CONC_PUMP [pmol/kg]Th_234_SPT_CONC_PUMP [mBq/kg]Th_234_LPT_CONC_PUMP [mBq/kg]Po_210_TP_CONC_UWAY [mBq/kg]Pb_210_TP_CONC_UWAY [mBq/kg]
02014-05-17T22:29:00349.2999938.43294854.02957.11214048NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
12014-05-17T22:29:00349.2999938.43294854.02957.21214039NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22014-05-17T22:29:00349.2999938.43294854.02957.21214027NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
32014-05-17T22:29:00349.2999938.43294854.02957.21214018NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
42014-05-17T22:29:00349.2999938.43294854.02957.21214036NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 86 columns

\n", + "
" + ], "text/plain": [ - "array(['Pourquoi pas?', 'RRS James Cook', 'Pelagia', 'Knorr',\n", - " 'Angeles Alvarino', 'RRS Discovery', 'Meteor', 'FS Meteor',\n", - " 'James Cook', 'Atlantic Explorer', 'Pelican', 'Point Sur',\n", - " 'Sagar Kanya', 'Hakuho Maru', 'Tangaroa', 'Aurora Australis',\n", - " 'Marion Dufresne', 'FS Polarstern', 'Jakov Smirnitskiy',\n", - " 'Investigator', 'S. A. Agulhas II', 'Sam Rothberg', 'Healy',\n", - " 'Amundsen', 'Polarstern', \"L'Atalante\", 'RV Southern Surveyor',\n", - " 'Roger Revelle', 'Thomas G. Thompson', 'Sonne', 'Kilo Moana',\n", - " 'John P. Tully', 'Nathaniel B. Palmer', 'Akademik Tryoshnikov'],\n", - " dtype=object)" + " yyyy-mm-ddThh:mm:ss.sss Longitude [degrees_east] Latitude [degrees_north] \\\n", + "0 2014-05-17T22:29:00 349.29999 38.4329 \n", + "1 2014-05-17T22:29:00 349.29999 38.4329 \n", + "2 2014-05-17T22:29:00 349.29999 38.4329 \n", + "3 2014-05-17T22:29:00 349.29999 38.4329 \n", + "4 2014-05-17T22:29:00 349.29999 38.4329 \n", + "\n", + " Bot. Depth [m] DEPTH [m] BODC Bottle Number:INTEGER \\\n", + "0 4854.0 2957.1 1214048 \n", + "1 4854.0 2957.2 1214039 \n", + "2 4854.0 2957.2 1214027 \n", + "3 4854.0 2957.2 1214018 \n", + "4 4854.0 2957.2 1214036 \n", + "\n", + " TRITIUM_D_CONC_BOTTLE [TU] Cs_137_D_CONC_BOTTLE [uBq/kg] \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " I_129_D_CONC_BOTTLE [atoms/kg] Np_237_D_CONC_BOTTLE [uBq/kg] ... \\\n", + "0 NaN NaN ... \n", + "1 NaN NaN ... \n", + "2 NaN NaN ... \n", + "3 NaN NaN ... \n", + "4 NaN NaN ... 
\n", + "\n", + " Th_230_TP_CONC_PUMP [uBq/kg] Th_230_SPT_CONC_PUMP [uBq/kg] \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " Th_230_LPT_CONC_PUMP [uBq/kg] Th_232_TP_CONC_PUMP [pmol/kg] \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " Th_232_SPT_CONC_PUMP [pmol/kg] Th_232_LPT_CONC_PUMP [pmol/kg] \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " Th_234_SPT_CONC_PUMP [mBq/kg] Th_234_LPT_CONC_PUMP [mBq/kg] \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + " Po_210_TP_CONC_UWAY [mBq/kg] Pb_210_TP_CONC_UWAY [mBq/kg] \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + "[5 rows x 86 columns]" ] }, "execution_count": null, @@ -5157,192 +9615,36 @@ } ], "source": [ - "df[\"Ship Name:METAVAR:INDEXED_TEXT\"].unique()" + "#| eval: false\n", + "df_test = tfm()\n", + "print(f'df_test shape: {df_test.shape}')\n", + "df_test.head()" ] }, { "cell_type": "code", "execution_count": null, - "id": "85f010f0", + "id": "c21028a6", "metadata": {}, "outputs": [], "source": [ - "# GEOTRACES Sample ID" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a01362a", - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'GEOTRACES Sample ID'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/indexes/base.py:3802\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3801\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3802\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3803\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", - "File \u001b[0;32mindex.pyx:153\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mindex.pyx:182\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'GEOTRACES Sample ID'", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [71], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mGEOTRACES Sample ID\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/frame.py:4090\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4089\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 
4090\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4091\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4092\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", - "File \u001b[0;32m~/mambaforge/envs/marisco/lib/python3.10/site-packages/pandas/core/indexes/base.py:3809\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3805\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3807\u001b[0m ):\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3809\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3810\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3812\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", - "\u001b[0;31mKeyError\u001b[0m: 'GEOTRACES Sample ID'" - ] - } - ], - "source": [ - "df['GEOTRACES Sample ID']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8cb03efb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['Cruise', 'Station:METAVAR:INDEXED_TEXT', 'Type',\n", - " 'yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]',\n", - " 'Latitude [degrees_north]', 'Bot. Depth [m]',\n", - " 'Operator's Cruise Name:METAVAR:INDEXED_TEXT',\n", - " 'Ship Name:METAVAR:INDEXED_TEXT', 'Period:METAVAR:INDEXED_TEXT',\n", - " ...\n", - " 'QV:SEADATANET.581', 'Co_CELL_CONC_BOTTLE [amol/cell]',\n", - " 'QV:SEADATANET.582', 'Ni_CELL_CONC_BOTTLE [amol/cell]',\n", - " 'QV:SEADATANET.583', 'Cu_CELL_CONC_BOTTLE [amol/cell]',\n", - " 'QV:SEADATANET.584', 'Zn_CELL_CONC_BOTTLE [amol/cell]',\n", - " 'QV:SEADATANET.585', 'QV:ODV:SAMPLE'],\n", - " dtype='object', length=1188)" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.columns" + "cols_measurements = [col for col in df_test.columns if col not in common_coi]" ] }, { "cell_type": "code", "execution_count": null, - "id": "55d645fa-660d-4298-b72b-fb06dbd2e2d7", + "id": "1b4e8eda", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ctdsal_d_conc_sensor [pss-78]\n", - "salinity_d_conc_bottle\n", - "salinity_d_conc_pump\n", - "salinity_d_conc_fish\n", - "salinity_d_conc_uway\n", - "salinity_d_conc_boat_pump\n", - "ctdtmp_t_value_sensor [deg c]\n", - "oxygen_d_conc_bottle [umol/kg]\n", - "ctdoxy_d_conc_sensor [umol/kg]\n", - "U_236_238_T_RATIO_BOTTLE [per 10^12]\n" + "df_test shape after dropping rows with no measurements: (9389, 86)\n", + 
"df_test duplicated keys: 534\n" ] - } - ], - "source": [ - "#| eval: false\n", - "def find_print_col(s, cols, lower=True):\n", - " cols = cols if not lower else [col.lower() for col in cols]\n", - " for col in cols:\n", - " if s in col: print(col)\n", - "\n", - "find_print_col('sal', df.columns)\n", - "find_print_col('tmp', df.columns)\n", - "find_print_col('oxy', df.columns)\n", - "find_print_col('U_236_238', df.columns, lower=False)" - ] - }, - { - "cell_type": "markdown", - "id": "bf1a71c0", - "metadata": {}, - "source": [ - "## Data transformation pipeline" - ] - }, - { - "cell_type": "markdown", - "id": "c4407027-942c-4240-a92d-a40311c05afd", - "metadata": {}, - "source": [ - "### Select columns of interest" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55d50ff7-63dc-4719-a6c1-c2ff7e6ecb7f", - "metadata": {}, - "outputs": [], - "source": [ - "#| hide\n", - "# U_236_238\n", - "# Done: Th_232, I_129, Ac_227" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d3a7bba-39d7-4fc3-8f0e-fb83ff52dcc2", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "common_coi = ['yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]',\n", - " 'Latitude [degrees_north]', 'Bot. 
Depth [m]', 'DEPTH [m]']\n", - "\n", - "nuclides_pattern = ['^TRITI', '^Th_228', '^Th_23[024]', '^Pa_231', \n", - " '^U_236_[DT]', '^Be_', '^Cs_137', '^Pb_210', '^Po_210',\n", - " '^Ra_22[3468]', 'Np_237', '^Pu_239_[D]', '^Pu_240', '^Pu_239_Pu_240',\n", - " '^I_129', '^Ac_227'] \n", - "\n", - "class SelectColsOfInterestCB(Callback):\n", - " \"Select columns of interest.\"\n", - " def __init__(self, common_coi, nuclides_pattern): fc.store_attr()\n", - " def __call__(self, tfm):\n", - " nuc_of_interest = [c for c in tfm.df.columns if \n", - " any(re.match(pattern, c) for pattern in self.nuclides_pattern)]\n", - "\n", - " tfm.df = tfm.df[self.common_coi + nuc_of_interest]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9005e7e4-f0d7-4944-abea-60e5f5522e22", - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "tfm = Transformer(df, cbs=[\n", - " SelectColsOfInterestCB(common_coi, nuclides_pattern)\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52f52056-f58d-4ef5-b5de-dc8adf51eac0", - "metadata": {}, - "outputs": [ + }, { "data": { "text/html": [ @@ -5369,11 +9671,11 @@ " Latitude [degrees_north]\n", " Bot. 
Depth [m]\n", " DEPTH [m]\n", + " BODC Bottle Number:INTEGER\n", " TRITIUM_D_CONC_BOTTLE [TU]\n", " Cs_137_D_CONC_BOTTLE [uBq/kg]\n", " I_129_D_CONC_BOTTLE [atoms/kg]\n", " Np_237_D_CONC_BOTTLE [uBq/kg]\n", - " Pu_239_D_CONC_BOTTLE [uBq/kg]\n", " ...\n", " Th_230_TP_CONC_PUMP [uBq/kg]\n", " Th_230_SPT_CONC_PUMP [uBq/kg]\n", @@ -5386,16 +9688,88 @@ " Po_210_TP_CONC_UWAY [mBq/kg]\n", " Pb_210_TP_CONC_UWAY [mBq/kg]\n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " 47271\n", + " 2007-07-31T08:55:44\n", + " 33.9862\n", + " 77.5033\n", + " 197.0\n", + " 7.0\n", + " 820343\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 47272\n", + " 2007-07-31T08:55:44\n", + " 33.9862\n", + " 77.5033\n", + " 197.0\n", + " 7.0\n", + " 837200\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.019187\n", + " 0.030081\n", + " \n", + " \n", + " 47315\n", + " 2007-07-31T08:55:44\n", + " 33.9862\n", + " 77.5033\n", + " 197.0\n", + " 74.9\n", + " 194761\n", + " NaN\n", + " 1892.405884\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", " \n", - " 0\n", - " 2014-05-17T22:29:00\n", - " 349.29999\n", - " 38.4329\n", - " 4854.0\n", - " 2957.1\n", - " NaN\n", + " 47316\n", + " 2007-07-31T08:55:44\n", + " 33.9862\n", + " 77.5033\n", + " 197.0\n", + " 74.9\n", + " 196037\n", " NaN\n", " NaN\n", " NaN\n", @@ -5413,39 +9787,111 @@ " NaN\n", " \n", " \n", - " 1\n", - " 2014-05-17T22:29:00\n", - " 349.29999\n", - " 38.4329\n", - " 4854.0\n", - " 2957.2\n", + " 47358\n", + " 2007-07-31T19:54:17\n", + " 33.9585\n", + " 78.9978\n", + " 275.0\n", + " 7.0\n", + " 837203\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 
NaN\n", + " ...\n", + " NaN\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " NaN\n", + " 0.048130\n", + " 0.057073\n", + " \n", + " \n", + " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 88603\n", + " 2018-11-22T07:33:10\n", + " 208.0000\n", + " -19.9857\n", + " 4441.0\n", + " 4059.6\n", + " 1844432\n", + " NaN\n", + " NaN\n", " NaN\n", " NaN\n", + " ...\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " 0.592941\n", + " 0.076590\n", " NaN\n", " NaN\n", " \n", " \n", - " 2\n", - " 2014-05-17T22:29:00\n", - " 349.29999\n", - " 38.4329\n", - " 4854.0\n", - " 2957.2\n", + " 88609\n", + " 2018-11-22T07:33:10\n", + " 208.0000\n", + " -19.9857\n", + " 4441.0\n", + " 4139.6\n", + " 1844426\n", + " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0.659813\n", + " 0.074881\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 88610\n", + " 2018-11-22T07:33:10\n", + " 208.0000\n", + " -19.9857\n", + " 4441.0\n", + " 4139.6\n", + " 1844429\n", + " NaN\n", + " NaN\n", " NaN\n", " NaN\n", " ...\n", @@ -5461,13 +9907,13 @@ " NaN\n", " \n", " \n", - " 3\n", - " 2014-05-17T22:29:00\n", - " 349.29999\n", - " 38.4329\n", - " 4854.0\n", - " 2957.2\n", - " NaN\n", + " 88613\n", + " 2018-11-22T07:33:10\n", + " 208.0000\n", + " -19.9857\n", + " 4441.0\n", + " 4179.6\n", + " 1844423\n", " NaN\n", " NaN\n", " NaN\n", @@ -5485,13 +9931,13 @@ " NaN\n", " \n", " \n", - " 4\n", - " 2014-05-17T22:29:00\n", - " 349.29999\n", - " 38.4329\n", - " 4854.0\n", - " 2957.2\n", - " NaN\n", + " 88614\n", + " 2018-11-22T07:33:10\n", + " 208.0000\n", + " -19.9857\n", + " 4441.0\n", + " 4179.6\n", + " 1844420\n", " NaN\n", " NaN\n", " NaN\n", @@ -5503,81 +9949,148 @@ 
" NaN\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", + " 0.648955\n", + " 0.056465\n", " NaN\n", " NaN\n", " \n", " \n", "\n", - "

5 rows × 85 columns

\n", + "

1063 rows × 86 columns

\n", "" ], "text/plain": [ - " yyyy-mm-ddThh:mm:ss.sss Longitude [degrees_east] Latitude [degrees_north] \\\n", - "0 2014-05-17T22:29:00 349.29999 38.4329 \n", - "1 2014-05-17T22:29:00 349.29999 38.4329 \n", - "2 2014-05-17T22:29:00 349.29999 38.4329 \n", - "3 2014-05-17T22:29:00 349.29999 38.4329 \n", - "4 2014-05-17T22:29:00 349.29999 38.4329 \n", + " yyyy-mm-ddThh:mm:ss.sss Longitude [degrees_east] \\\n", + "47271 2007-07-31T08:55:44 33.9862 \n", + "47272 2007-07-31T08:55:44 33.9862 \n", + "47315 2007-07-31T08:55:44 33.9862 \n", + "47316 2007-07-31T08:55:44 33.9862 \n", + "47358 2007-07-31T19:54:17 33.9585 \n", + "... ... ... \n", + "88603 2018-11-22T07:33:10 208.0000 \n", + "88609 2018-11-22T07:33:10 208.0000 \n", + "88610 2018-11-22T07:33:10 208.0000 \n", + "88613 2018-11-22T07:33:10 208.0000 \n", + "88614 2018-11-22T07:33:10 208.0000 \n", "\n", - " Bot. Depth [m] DEPTH [m] TRITIUM_D_CONC_BOTTLE [TU] \\\n", - "0 4854.0 2957.1 NaN \n", - "1 4854.0 2957.2 NaN \n", - "2 4854.0 2957.2 NaN \n", - "3 4854.0 2957.2 NaN \n", - "4 4854.0 2957.2 NaN \n", + " Latitude [degrees_north] Bot. Depth [m] DEPTH [m] \\\n", + "47271 77.5033 197.0 7.0 \n", + "47272 77.5033 197.0 7.0 \n", + "47315 77.5033 197.0 74.9 \n", + "47316 77.5033 197.0 74.9 \n", + "47358 78.9978 275.0 7.0 \n", + "... ... ... ... \n", + "88603 -19.9857 4441.0 4059.6 \n", + "88609 -19.9857 4441.0 4139.6 \n", + "88610 -19.9857 4441.0 4139.6 \n", + "88613 -19.9857 4441.0 4179.6 \n", + "88614 -19.9857 4441.0 4179.6 \n", "\n", - " Cs_137_D_CONC_BOTTLE [uBq/kg] I_129_D_CONC_BOTTLE [atoms/kg] \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", + " BODC Bottle Number:INTEGER TRITIUM_D_CONC_BOTTLE [TU] \\\n", + "47271 820343 NaN \n", + "47272 837200 NaN \n", + "47315 194761 NaN \n", + "47316 196037 NaN \n", + "47358 837203 NaN \n", + "... ... ... 
\n", + "88603 1844432 NaN \n", + "88609 1844426 NaN \n", + "88610 1844429 NaN \n", + "88613 1844423 NaN \n", + "88614 1844420 NaN \n", "\n", - " Np_237_D_CONC_BOTTLE [uBq/kg] Pu_239_D_CONC_BOTTLE [uBq/kg] ... \\\n", - "0 NaN NaN ... \n", - "1 NaN NaN ... \n", - "2 NaN NaN ... \n", - "3 NaN NaN ... \n", - "4 NaN NaN ... \n", + " Cs_137_D_CONC_BOTTLE [uBq/kg] I_129_D_CONC_BOTTLE [atoms/kg] \\\n", + "47271 NaN NaN \n", + "47272 NaN NaN \n", + "47315 1892.405884 NaN \n", + "47316 NaN NaN \n", + "47358 NaN NaN \n", + "... ... ... \n", + "88603 NaN NaN \n", + "88609 NaN NaN \n", + "88610 NaN NaN \n", + "88613 NaN NaN \n", + "88614 NaN NaN \n", "\n", - " Th_230_TP_CONC_PUMP [uBq/kg] Th_230_SPT_CONC_PUMP [uBq/kg] \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", + " Np_237_D_CONC_BOTTLE [uBq/kg] ... Th_230_TP_CONC_PUMP [uBq/kg] \\\n", + "47271 NaN ... NaN \n", + "47272 NaN ... NaN \n", + "47315 NaN ... NaN \n", + "47316 NaN ... NaN \n", + "47358 NaN ... NaN \n", + "... ... ... ... \n", + "88603 NaN ... NaN \n", + "88609 NaN ... NaN \n", + "88610 NaN ... NaN \n", + "88613 NaN ... NaN \n", + "88614 NaN ... NaN \n", "\n", - " Th_230_LPT_CONC_PUMP [uBq/kg] Th_232_TP_CONC_PUMP [pmol/kg] \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", + " Th_230_SPT_CONC_PUMP [uBq/kg] Th_230_LPT_CONC_PUMP [uBq/kg] \\\n", + "47271 NaN NaN \n", + "47272 NaN NaN \n", + "47315 NaN NaN \n", + "47316 NaN NaN \n", + "47358 NaN NaN \n", + "... ... ... 
\n", + "88603 NaN NaN \n", + "88609 NaN NaN \n", + "88610 NaN NaN \n", + "88613 NaN NaN \n", + "88614 NaN NaN \n", "\n", - " Th_232_SPT_CONC_PUMP [pmol/kg] Th_232_LPT_CONC_PUMP [pmol/kg] \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", + " Th_232_TP_CONC_PUMP [pmol/kg] Th_232_SPT_CONC_PUMP [pmol/kg] \\\n", + "47271 NaN NaN \n", + "47272 NaN NaN \n", + "47315 NaN NaN \n", + "47316 NaN NaN \n", + "47358 NaN NaN \n", + "... ... ... \n", + "88603 NaN NaN \n", + "88609 NaN NaN \n", + "88610 NaN NaN \n", + "88613 NaN NaN \n", + "88614 NaN NaN \n", "\n", - " Th_234_SPT_CONC_PUMP [mBq/kg] Th_234_LPT_CONC_PUMP [mBq/kg] \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", + " Th_232_LPT_CONC_PUMP [pmol/kg] Th_234_SPT_CONC_PUMP [mBq/kg] \\\n", + "47271 NaN NaN \n", + "47272 NaN NaN \n", + "47315 NaN NaN \n", + "47316 NaN NaN \n", + "47358 NaN NaN \n", + "... ... ... \n", + "88603 NaN 0.592941 \n", + "88609 NaN 0.659813 \n", + "88610 NaN NaN \n", + "88613 NaN NaN \n", + "88614 NaN 0.648955 \n", "\n", - " Po_210_TP_CONC_UWAY [mBq/kg] Pb_210_TP_CONC_UWAY [mBq/kg] \n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", + " Th_234_LPT_CONC_PUMP [mBq/kg] Po_210_TP_CONC_UWAY [mBq/kg] \\\n", + "47271 NaN NaN \n", + "47272 NaN 0.019187 \n", + "47315 NaN NaN \n", + "47316 NaN NaN \n", + "47358 NaN 0.048130 \n", + "... ... ... \n", + "88603 0.076590 NaN \n", + "88609 0.074881 NaN \n", + "88610 NaN NaN \n", + "88613 NaN NaN \n", + "88614 0.056465 NaN \n", + "\n", + " Pb_210_TP_CONC_UWAY [mBq/kg] \n", + "47271 NaN \n", + "47272 0.030081 \n", + "47315 NaN \n", + "47316 NaN \n", + "47358 0.057073 \n", + "... ... 
\n", + "88603 NaN \n", + "88609 NaN \n", + "88610 NaN \n", + "88613 NaN \n", + "88614 NaN \n", "\n", - "[5 rows x 85 columns]" + "[1063 rows x 86 columns]" ] }, "execution_count": null, @@ -5586,9 +10099,217 @@ } ], "source": [ - "#| eval: false\n", - "df_test = tfm()\n", - "df_test.head()" + "unique_key = [\n", + " 'yyyy-mm-ddThh:mm:ss.sss', \n", + " 'Longitude [degrees_east]', \n", + " 'Latitude [degrees_north]', \n", + " 'DEPTH [m]', \n", + " # 'BODC Bottle Number:INTEGER'\n", + " ]\n", + "\n", + "df_test.dropna(subset=cols_measurements, how='all', inplace=True);\n", + "print(f'df_test shape after dropping rows with no measurements: {df_test.shape}')\n", + "print(f'df_test duplicated keys: {df_test[unique_key].duplicated().sum()}')\n", + "\n", + "df_test[df_test[unique_key].duplicated(keep=False)].sort_values(by=unique_key)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "663f41aa", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAq0AAAESCAYAAADXKOyWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAwvklEQVR4nO3de1RV953//9cpN5HAHhEPh1OR0MYQDWobzHCJE2+IMEE0ZqopLdWJNXHiJUTNRdOuOJ1WjBm16TCx6litlyxcXY0mrQYlYyRxGRRNaNAaYyaaYAUxFg5i6cHg/v2Rn/ubAyYG5cAWn4+19lqcvd/s997o2b788OGDwzRNUwAAAICNfaOrLwAAAAC4GkIrAAAAbI/QCgAAANsjtAIAAMD2CK0AAACwPUIrAAAAbI/QCgAAANsL7OoL8JdLly7p9OnTCg8Pl8Ph6OrLAQAAQCumaer8+fNyu936xje+eiy124bW06dPKzY2tqsvAwAAAFdRVVWlvn37fmVNtw2t4eHhkj7/IkRERHTx1QAAAKC1hoYGxcbGWrntq3Tb0Hp5SkBERAShFQAAwMa+zlROfhALAAAAtkdoBQAAgO0RWgEAAGB7hFYAAADYHqEVAAAAtkdoBQAAgO21K7SuXLlSgwcPtpaRSk1N1WuvvWYdnzp1qhwOh8+WkpLicw6v16vZs2crKipKYWFhysnJ0alTp3xq6urqlJeXJ8MwZBiG8vLyVF9ff+13CQAAgBtau0Jr3759tWTJEh08eFAHDx7UqFGjNH78eB05csSqyczMVHV1tbXt2LHD5xz5+fnaunWrioqKtHfvXjU2Nio7O1stLS1WTW5urioqKlRcXKzi4mJVVFQoLy/vOm8VAAAANyqHaZrm9ZwgMjJSzz//vKZNm6apU6eqvr5e27Ztu2Ktx+NRnz59tHHjRk2ePFnS//t1qzt27NDYsWN19OhRDRw4UGVlZUpOTpYklZWVKTU1Ve+//74SEhK+1nU1NDTIMAx5PJ
5u/csFbn16e6f2O7nkvk7tBwAAuq/25LVrntPa0tKioqIiXbhwQampqdb+PXv2yOl06vbbb9f06dNVW1trHTt06JAuXryojIwMa5/b7VZiYqL27dsnSXr77bdlGIYVWCUpJSVFhmFYNVfi9XrV0NDgswEAAKB7aHdorays1C233KKQkBDNmDFDW7du1cCBAyVJWVlZ2rx5s3bv3q1ly5apvLxco0aNktfrlSTV1NQoODhYvXr18jlndHS0ampqrBqn09mmr9PptGqupKCgwJoDaxiGYmNj23trAAAAsKnA9n5CQkKCKioqVF9fr9///veaMmWKSktLNXDgQOtb/pKUmJiooUOHKi4uTtu3b9fEiRO/9Jymafr8ztkr/f7Z1jWtLViwQHPnzrVeNzQ0EFwBAAC6iXaH1uDgYN12222SpKFDh6q8vFwvvPCCVq1a1aY2JiZGcXFxOn78uCTJ5XKpublZdXV1PqOttbW1SktLs2rOnDnT5lxnz55VdHT0l15XSEiIQkJC2ns7AAAAuAFc9zqtpmla3/5v7dy5c6qqqlJMTIwkKSkpSUFBQSopKbFqqqurdfjwYSu0pqamyuPx6MCBA1bN/v375fF4rBoAAADcXNo10rpw4UJlZWUpNjZW58+fV1FRkfbs2aPi4mI1NjZq0aJFeuCBBxQTE6OTJ09q4cKFioqK0v333y9JMgxD06ZN07x589S7d29FRkZq/vz5GjRokNLT0yVJAwYMUGZmpqZPn26N3j788MPKzs7+2isHAAAAoHtpV2g9c+aM8vLyVF1dLcMwNHjwYBUXF2vMmDFqampSZWWlNmzYoPr6esXExGjkyJHasmWLwsPDrXOsWLFCgYGBmjRpkpqamjR69GitX79eAQEBVs3mzZs1Z84ca5WBnJwcFRYWdtAtAwAA4EZz3eu02hXrtPoH67QCAICO0inrtAIAAACdhdAKAAAA2yO0AgAAwPYIrQAAALA9QisAAABsj9AKAAAA2yO0AgAAwPYIrQAAALA9QisAAABsj9AKAAAA2yO0AgAAwPYIrQAAALA9QisAAABsj9AKAAAA2yO0AgAAwPYIrQAAALA9QisAAABsj9AKAAAA2yO0AgAAwPYIrQAAALC9doXWlStXavDgwYqIiFBERIRSU1P12muvWcdN09SiRYvkdrsVGhqqESNG6MiRIz7n8Hq9mj17tqKiohQWFqacnBydOnXKp6aurk55eXkyDEOGYSgvL0/19fXXfpcAAAC4obUrtPbt21dLlizRwYMHdfDgQY0aNUrjx4+3gunSpUu1fPlyFRYWqry8XC6XS2PGjNH58+etc+Tn52vr1q0qKirS3r171djYqOzsbLW0tFg1ubm5qqioUHFxsYqLi1VRUaG8vLwOumUAAADcaBymaZrXc4LIyEg9//zzeuihh+R2u5Wfn6+nnnpK0uejqtHR0Xruuef0yCOPyOPxqE+fPtq4caMmT54sSTp9+rRiY2O1Y8cOjR07VkePHtXAgQNVVlam5ORkSVJZWZlSU1P1/vvvKyEh4WtdV0NDgwzDkMfjUURExPXcoq3d+vT2Tu13csl9ndoPAAB0X+3Ja9c8p7WlpUVFRUW6cOGCUlNTdeLECdXU1CgjI8OqCQkJ0fDhw7Vv3z5J0qFDh3Tx4kWfGrfbrcTERKvm7bfflmEYVmCVpJSUFBmGYdVcidfrVUNDg88GAACA7qHdobWyslK33HKLQkJCNGPGDG3dulUDBw5UTU2NJCk6OtqnPjo62jpWU1Oj4OBg9erV6ytrnE5nm75Op9OquZKCggJrDqxhGIqNjW3vrQEAAMCm2h1aExISVFFRobKyMv3bv/2bpkyZoj//+c/WcYfD4VNvmmabfa21rrlS/dXOs2DBAnk8Hmurqqr6urcEAAAAm2t3aA0ODtZtt92moUOHqqCgQEOGDNELL7wgl8slSW1GQ2tra63RV5fLpebmZtXV1X1lzZkzZ9r0PXv2bJtR3C8KCQ
mxVjW4vAEAAKB7uO51Wk3TlNfrVXx8vFwul0pKSqxjzc3NKi0tVVpamiQpKSlJQUFBPjXV1dU6fPiwVZOamiqPx6MDBw5YNfv375fH47FqAAAAcHMJbE/xwoULlZWVpdjYWJ0/f15FRUXas2ePiouL5XA4lJ+fr8WLF6t///7q37+/Fi9erJ49eyo3N1eSZBiGpk2bpnnz5ql3796KjIzU/PnzNWjQIKWnp0uSBgwYoMzMTE2fPl2rVq2SJD388MPKzs7+2isHAAAAoHtpV2g9c+aM8vLyVF1dLcMwNHjwYBUXF2vMmDGSpCeffFJNTU169NFHVVdXp+TkZO3atUvh4eHWOVasWKHAwEBNmjRJTU1NGj16tNavX6+AgACrZvPmzZozZ461ykBOTo4KCws74n4BAABwA7rudVrtinVa/YN1WgEAQEfplHVaAQAAgM5CaAUAAIDtEVoBAABge4RWAAAA2B6hFQAAALZHaAUAAIDtEVoBAABge4RWAAAA2B6hFQAAALZHaAUAAIDtEVoBAABge4RWAAAA2B6hFQAAALZHaAUAAIDtEVoBAABge4RWAAAA2B6hFQAAALZHaAUAAIDtEVoBAABge4RWAAAA2F67QmtBQYHuvvtuhYeHy+l0asKECTp27JhPzdSpU+VwOHy2lJQUnxqv16vZs2crKipKYWFhysnJ0alTp3xq6urqlJeXJ8MwZBiG8vLyVF9ff213CQAAgBtau0JraWmpZs6cqbKyMpWUlOizzz5TRkaGLly44FOXmZmp6upqa9uxY4fP8fz8fG3dulVFRUXau3evGhsblZ2drZaWFqsmNzdXFRUVKi4uVnFxsSoqKpSXl3cdtwoAAIAbVWB7iouLi31er1u3Tk6nU4cOHdK9995r7Q8JCZHL5briOTwej9auXauNGzcqPT1dkrRp0ybFxsbq9ddf19ixY3X06FEVFxerrKxMycnJkqQ1a9YoNTVVx44dU0JCQrtuEgAAADe265rT6vF4JEmRkZE++/fs2SOn06nbb79d06dPV21trXXs0KFDunjxojIyMqx9brdbiYmJ2rdvnyTp7bfflmEYVmCVpJSUFBmGYdW05vV61dDQ4LMBAACge7jm0GqapubOnathw4YpMTHR2p+VlaXNmzdr9+7dWrZsmcrLyzVq1Ch5vV5JUk1NjYKDg9WrVy+f80VHR6umpsaqcTqdbXo6nU6rprWCggJr/qthGIqNjb3WWwMAAIDNtGt6wBfNmjVL7733nvbu3euzf/LkydbHiYmJGjp0qOLi4rR9+3ZNnDjxS89nmqYcDof1+osff1nNFy1YsEBz5861Xjc0NBBcAQAAuolrGmmdPXu2Xn31Vb3xxhvq27fvV9bGxMQoLi5Ox48flyS5XC41Nzerrq7Op662tlbR0dFWzZkzZ9qc6+zZs1ZNayEhIYqIiPDZAAAA0D20K7SapqlZs2bp5Zdf1u7duxUfH3/Vzzl37pyqqqoUExMjSUpKSlJQUJBKSkqsmurqah0+fFhpaWmSpNTUVHk8Hh04cMCq2b9/vzwej1UDAACAm0e7pgfMnDlTL730kl555RWFh4db80sNw1BoaKgaGxu1aNEiPfDAA4qJidHJkye1cOFCRUVF6f7777dqp02bpnnz5ql3796KjIzU/PnzNWjQIGs1gQEDBigzM1PTp0/XqlWrJEkPP/ywsrOzWTkAAADgJtSu0Lpy5UpJ0ogRI3z2r1u3TlOnTlVAQIAqKyu1YcMG1dfXKyYmRiNHjtSWLVsUHh5u1a9YsUKBgYGaNGmSmpqaNHr0aK1fv14BAQFWzebNmzVnzhxrlYGcnBwVFhZe630CAADgBuYwTdPs6ovwh4aGBhmGIY/H063nt9769PZO7XdyyX2d2g8AAHRf7clr17VOKwAAANAZCK0AAACwPUIrAAAAbI/QCgAAANsjtAIAAMD2CK0AAACwPUIrAAAAbI/QCgAAANsjtAIAAMD2CK0AAACwPU
IrAAAAbI/QCgAAANsjtAIAAMD2CK0AAACwPUIrAAAAbI/QCgAAANsjtAIAAMD2CK0AAACwPUIrAAAAbC+wqy8AN5Zbn97eqf1OLrmvU/sBAAB7YqQVAAAAtteu0FpQUKC7775b4eHhcjqdmjBhgo4dO+ZTY5qmFi1aJLfbrdDQUI0YMUJHjhzxqfF6vZo9e7aioqIUFhamnJwcnTp1yqemrq5OeXl5MgxDhmEoLy9P9fX113aXAAAAuKG1K7SWlpZq5syZKisrU0lJiT777DNlZGTowoULVs3SpUu1fPlyFRYWqry8XC6XS2PGjNH58+etmvz8fG3dulVFRUXau3evGhsblZ2drZaWFqsmNzdXFRUVKi4uVnFxsSoqKpSXl9cBtwwAAIAbjcM0TfNaP/ns2bNyOp0qLS3VvffeK9M05Xa7lZ+fr6eeekrS56Oq0dHReu655/TII4/I4/GoT58+2rhxoyZPnixJOn36tGJjY7Vjxw6NHTtWR48e1cCBA1VWVqbk5GRJUllZmVJTU/X+++8rISGhzbV4vV55vV7rdUNDg2JjY+XxeBQREXGtt2h7nT3HtLMxpxUAgO6roaFBhmF8rbx2XXNaPR6PJCkyMlKSdOLECdXU1CgjI8OqCQkJ0fDhw7Vv3z5J0qFDh3Tx4kWfGrfbrcTERKvm7bfflmEYVmCVpJSUFBmGYdW0VlBQYE0lMAxDsbGx13NrAAAAsJFrDq2maWru3LkaNmyYEhMTJUk1NTWSpOjoaJ/a6Oho61hNTY2Cg4PVq1evr6xxOp1tejqdTqumtQULFsjj8VhbVVXVtd4aAAAAbOaal7yaNWuW3nvvPe3du7fNMYfD4fPaNM02+1prXXOl+q86T0hIiEJCQr7OpQMAAOAGc00jrbNnz9arr76qN954Q3379rX2u1wuSWozGlpbW2uNvrpcLjU3N6uuru4ra86cOdOm79mzZ9uM4gIAAKD7a1doNU1Ts2bN0ssvv6zdu3crPj7e53h8fLxcLpdKSkqsfc3NzSotLVVaWpokKSkpSUFBQT411dXVOnz4sFWTmpoqj8ejAwcOWDX79++Xx+OxagAAAHDzaNf0gJkzZ+qll17SK6+8ovDwcGtE1TAMhYaGyuFwKD8/X4sXL1b//v3Vv39/LV68WD179lRubq5VO23aNM2bN0+9e/dWZGSk5s+fr0GDBik9PV2SNGDAAGVmZmr69OlatWqVJOnhhx9Wdnb2FVcOAAAAQPfWrtC6cuVKSdKIESN89q9bt05Tp06VJD355JNqamrSo48+qrq6OiUnJ2vXrl0KDw+36lesWKHAwEBNmjRJTU1NGj16tNavX6+AgACrZvPmzZozZ461ykBOTo4KCwuv5R4BAABwg7uudVrtrD3rft3IWKcVAADcqDptnVYAAACgMxBaAQAAYHuEVgAAANgeoRUAAAC2R2gFAACA7RFaAQAAYHuEVgAAANgeoRUAAAC2R2gFAACA7RFaAQAAYHuEVgAAANgeoRUAAAC2R2gFAACA7RFaAQAAYHuEVgAAANgeoRUAAAC2F9jVFwB8lVuf3t6p/U4uua9T+wEAgK+HkVYAAADYHqEVAAAAtkdoBQAAgO21O7S++eabGjdunNxutxwOh7Zt2+ZzfOrUqXI4HD5bSkqKT43X69Xs2bMVFRWlsLAw5eTk6NSpUz41dXV1ysvLk2EYMgxDeXl5qq+vb/cNAgAA4MbX7tB64cIFDRkyRIWFhV9ak5mZqerqamvbsWOHz/H8/Hxt3bpVRUVF2rt3rxobG5Wdna2WlharJjc3VxUVFSouLlZxcbEqKiqUl5fX3ssFAABAN9Du1QOysrKUlZX1lTUhISFyuVxXPObxeLR27Vpt3LhR6enpkqRNmzYpNjZWr7/+usaOHaujR4+quLhYZWVlSk5OliStWbNGqampOnbsmBISEtp72QAAALiB+WVO6549e+R0OnX77bdr+vTpqq2ttY4dOnRIFy9eVEZGhr
XP7XYrMTFR+/btkyS9/fbbMgzDCqySlJKSIsMwrJrWvF6vGhoafDYAAAB0Dx0eWrOysrR582bt3r1by5YtU3l5uUaNGiWv1ytJqqmpUXBwsHr16uXzedHR0aqpqbFqnE5nm3M7nU6rprWCggJr/qthGIqNje3gOwMAAEBX6fBfLjB58mTr48TERA0dOlRxcXHavn27Jk6c+KWfZ5qmHA6H9fqLH39ZzRctWLBAc+fOtV43NDQQXAEAALoJvy95FRMTo7i4OB0/flyS5HK51NzcrLq6Op+62tpaRUdHWzVnzpxpc66zZ89aNa2FhIQoIiLCZwMAAED34PfQeu7cOVVVVSkmJkaSlJSUpKCgIJWUlFg11dXVOnz4sNLS0iRJqamp8ng8OnDggFWzf/9+eTweqwYAAAA3j3ZPD2hsbNSHH35ovT5x4oQqKioUGRmpyMhILVq0SA888IBiYmJ08uRJLVy4UFFRUbr//vslSYZhaNq0aZo3b5569+6tyMhIzZ8/X4MGDbJWExgwYIAyMzM1ffp0rVq1SpL08MMPKzs7m5UDAAAAbkLtDq0HDx7UyJEjrdeX55FOmTJFK1euVGVlpTZs2KD6+nrFxMRo5MiR2rJli8LDw63PWbFihQIDAzVp0iQ1NTVp9OjRWr9+vQICAqyazZs3a86cOdYqAzk5OV+5NiwAAAC6L4dpmmZXX4Q/NDQ0yDAMeTyebj2/9dant3f1JXQrJ5fc19WXAADATaM9ec3vc1oBAACA60VoBQAAgO0RWgEAAGB7hFYAAADYHqEVAAAAtkdoBQAAgO0RWgEAAGB7hFYAAADYHqEVAAAAtkdoBQAAgO0RWgEAAGB7hFYAAADYHqEVAAAAtkdoBQAAgO0RWgEAAGB7hFYAAADYHqEVAAAAtkdoBQAAgO0RWgEAAGB7hFYAAADYXrtD65tvvqlx48bJ7XbL4XBo27ZtPsdN09SiRYvkdrsVGhqqESNG6MiRIz41Xq9Xs2fPVlRUlMLCwpSTk6NTp0751NTV1SkvL0+GYcgwDOXl5am+vr7dNwgAAIAbX7tD64ULFzRkyBAVFhZe8fjSpUu1fPlyFRYWqry8XC6XS2PGjNH58+etmvz8fG3dulVFRUXau3evGhsblZ2drZaWFqsmNzdXFRUVKi4uVnFxsSoqKpSXl3cNtwgAAIAbncM0TfOaP9nh0NatWzVhwgRJn4+yut1u5efn66mnnpL0+ahqdHS0nnvuOT3yyCPyeDzq06ePNm7cqMmTJ0uSTp8+rdjYWO3YsUNjx47V0aNHNXDgQJWVlSk5OVmSVFZWptTUVL3//vtKSEi46rU1NDTIMAx5PB5FRERc6y3a3q1Pb+/qS+hWTi65r6svAQCAm0Z78lqHzmk9ceKEampqlJGRYe0LCQnR8OHDtW/fPknSoUOHdPHiRZ8at9utxMREq+btt9+WYRhWYJWklJQUGYZh1bTm9XrV0NDgswEAAKB7COzIk9XU1EiSoqOjffZHR0fr448/tmqCg4PVq1evNjWXP7+mpkZOp7PN+Z1Op1XTWkFBgf793//9uu8BN7fOHrlmZBcAgK/HL6sHOBwOn9emabbZ11rrmivVf9V5FixYII/HY21VVVXXcOUAAACwow4NrS6XS5LajIbW1tZao68ul0vNzc2qq6v7ypozZ860Of/Zs2fbjOJeFhISooiICJ8NAAAA3UOHhtb4+Hi5XC6VlJRY+5qbm1VaWqq0tDRJUlJSkoKCgnxqqqurdfjwYasmNTVVHo9HBw4csGr2798vj8dj1QAAAODm0e45rY2Njfrwww+t1ydOnFBFRYUiIyPVr18/5efna/Hixerfv7/69++vxYsXq2fPnsrNzZUkGYahadOmad68eerdu7ciIyM1f/58DRo0SOnp6ZKkAQMGKDMzU9OnT9eqVaskSQ8//LCys7O/1soBAAAA6F7aHVoPHjyokSNHWq/nzp0rSZoyZYrWr1+vJ598Uk1NTXr00U
dVV1en5ORk7dq1S+Hh4dbnrFixQoGBgZo0aZKampo0evRorV+/XgEBAVbN5s2bNWfOHGuVgZycnC9dGxYAAADd23Wt02pnrNOKGwGrBwAAbmZdtk4rAAAA4A+EVgAAANgeoRUAAAC2R2gFAACA7RFaAQAAYHuEVgAAANgeoRUAAAC2R2gFAACA7RFaAQAAYHuEVgAAANgeoRUAAAC2R2gFAACA7RFaAQAAYHuEVgAAANgeoRUAAAC2R2gFAACA7RFaAQAAYHuEVgAAANgeoRUAAAC2R2gFAACA7XV4aF20aJEcDofP5nK5rOOmaWrRokVyu90KDQ3ViBEjdOTIEZ9zeL1ezZ49W1FRUQoLC1NOTo5OnTrV0ZcKAACAG4RfRlrvvPNOVVdXW1tlZaV1bOnSpVq+fLkKCwtVXl4ul8ulMWPG6Pz581ZNfn6+tm7dqqKiIu3du1eNjY3Kzs5WS0uLPy4XAAAANhfol5MGBvqMrl5mmqZ++ctf6plnntHEiRMlSb/97W8VHR2tl156SY888og8Ho/Wrl2rjRs3Kj09XZK0adMmxcbG6vXXX9fYsWP9cckAAACwMb+MtB4/flxut1vx8fF68MEH9dFHH0mSTpw4oZqaGmVkZFi1ISEhGj58uPbt2ydJOnTokC5evOhT43a7lZiYaNVcidfrVUNDg88GAACA7qHDQ2tycrI2bNignTt3as2aNaqpqVFaWprOnTunmpoaSVJ0dLTP50RHR1vHampqFBwcrF69en1pzZUUFBTIMAxri42N7eA7AwAAQFfp8NCalZWlBx54QIMGDVJ6erq2b98u6fNpAJc5HA6fzzFNs82+1q5Ws2DBAnk8Hmurqqq6jrsAAACAnfh9yauwsDANGjRIx48ft+a5th4xra2ttUZfXS6XmpubVVdX96U1VxISEqKIiAifDQAAAN2D30Or1+vV0aNHFRMTo/j4eLlcLpWUlFjHm5ubVVpaqrS0NElSUlKSgoKCfGqqq6t1+PBhqwYAAAA3lw5fPWD+/PkaN26c+vXrp9raWv385z9XQ0ODpkyZIofDofz8fC1evFj9+/dX//79tXjxYvXs2VO5ubmSJMMwNG3aNM2bN0+9e/dWZGSk5s+fb003AAAAwM2nw0PrqVOn9P3vf1+ffvqp+vTpo5SUFJWVlSkuLk6S9OSTT6qpqUmPPvqo6urqlJycrF27dik8PNw6x4oVKxQYGKhJkyapqalJo0eP1vr16xUQENDRlwsAAIAbgMM0TbOrL8IfGhoaZBiGPB5Pt57feuvT27v6EnAdTi65r6svAQCALtOevOb3Oa0AAADA9SK0AgAAwPYIrQAAALA9QisAAABsj9AKAAAA2yO0AgAAwPYIrQAAALA9QisAAABsj9AKAAAA2yO0AgAAwPYIrQAAALA9QisAAABsL7CrLwBA93br09s7td/JJfd1aj8AQOcgtAJdiEAHu+vsv6MSf08BXBnTAwAAAGB7jLQCN5GuGDXrbIxeA0D3RGgFgBvIzfAfDwC4EqYHAAAAwPYIrQAAALA9QisAAABsjzmtAAB0Y/xwIroL24+0vvjii4qPj1ePHj2UlJSkt956q6svCQAAAJ3M1iOtW7ZsUX5+vl588UXdc889WrVqlbKysvTnP/9Z/fr16+rLAwD4ASODAK7EYZqm2dUX8WWSk5N11113aeXKlda+AQMGaMKECSooKPCp9Xq98nq91muPx6N+/fqpqqpKERERnXbNnS3x2Z1dfQnATe3wv4/t1H685298/J3pWJ399ewKnf1n2Jlf04aGBsXGxqq+vl6GYXx1sWlTXq/XDAgIMF9++WWf/XPmzDHvvffeNvXPPvusKYmNjY2NjY2Nje0G26qqqq6aDW07PeDTTz9VS0uLoqOjffZHR0erpqamTf2CBQs0d+5c6/WlS5f017/+Vb1795bD4fD79V6ry//D6MwR4c7uSb8bu19X9KQf/ezek343dr+u6Nnd+10r0zR1/v
x5ud3uq9baNrRe1jpwmqZ5xRAaEhKikJAQn33/8A//4M9L61ARERGd/peqs3vS78bu1xU96Uc/u/ek343dryt6dvd+1+Kq0wL+f7ZdPSAqKkoBAQFtRlVra2vbjL4CAACge7NtaA0ODlZSUpJKSkp89peUlCgtLa2LrgoAAABdwdbTA+bOnau8vDwNHTpUqampWr16tT755BPNmDGjqy+tw4SEhOjZZ59tM7WhO/Wk343dryt60o9+du9Jvxu7X1f07O79OoOtl7ySPv/lAkuXLlV1dbUSExO1YsUK3XvvvV19WQAAAOhEtg+tAAAAgG3ntAIAAACXEVoBAABge4RWAAAA2B6hFQAAALZHaO1iL774ouLj49WjRw8lJSXprbfe8luvN998U+PGjZPb7ZbD4dC2bdv81qugoEB33323wsPD5XQ6NWHCBB07dsxv/SRp5cqVGjx4sPXbP1JTU/Xaa6/5tedlBQUFcjgcys/P91uPRYsWyeFw+Gwul8tv/STpL3/5i374wx+qd+/e6tmzp77zne/o0KFDful16623trk/h8OhmTNn+qWfJH322Wf6yU9+ovj4eIWGhupb3/qWfvazn+nSpUt+63n+/Hnl5+crLi5OoaGhSktLU3l5eYec+2rvcdM0tWjRIrndboWGhmrEiBE6cuSI3/q9/PLLGjt2rKKiouRwOFRRUXHNva7W7+LFi3rqqac0aNAghYWFye1260c/+pFOnz7tt57S5+/LO+64Q2FhYerVq5fS09O1f/9+v/X7okceeUQOh0O//OUv/dZv6tSpbd6TKSkpfusnSUePHlVOTo4Mw1B4eLhSUlL0ySef+KXflZ45DodDzz//vF/6NTY2atasWerbt69CQ0M1YMAArVy58pp6fd2eZ86c0dSpU+V2u9WzZ09lZmbq+PHj19WzqxBau9CWLVuUn5+vZ555Ru+++67+6Z/+SVlZWdf85ryaCxcuaMiQISosLPTL+b+otLRUM2fOVFlZmUpKSvTZZ58pIyNDFy5c8FvPvn37asmSJTp48KAOHjyoUaNGafz48df1j/LXUV5ertWrV2vw4MF+7SNJd955p6qrq62tsrLSb73q6up0zz33KCgoSK+99pr+/Oc/a9myZX779cjl5eU+93b5F4t873vf80s/SXruuef061//WoWFhTp69KiWLl2q559/Xv/1X//lt54//vGPVVJSoo0bN6qyslIZGRlKT0/XX/7yl+s+99Xe40uXLtXy5ctVWFio8vJyuVwujRkzRufPn/dLvwsXLuiee+7RkiVLrun87en3t7/9Te+8845++tOf6p133tHLL7+sDz74QDk5OX7rKUm33367CgsLVVlZqb179+rWW29VRkaGzp4965d+l23btk379+//Wr+v/Xr7ZWZm+rw3d+zY4bd+//d//6dhw4bpjjvu0J49e/SnP/1JP/3pT9WjRw+/9PvifVVXV+s3v/mNHA6HHnjgAb/0e/zxx1VcXKxNmzbp6NGjevzxxzV79my98sor19Tvaj1N09SECRP00Ucf6ZVXXtG7776ruLg4paen+/XfY78x0WX+8R//0ZwxY4bPvjvuuMN8+umn/d5bkrl161a/97mstrbWlGSWlpZ2Wk/TNM1evXqZ//M//+O3858/f97s37+/WVJSYg4fPtx87LHH/Nbr2WefNYcMGeK387f21FNPmcOGDeu0fq099thj5re//W3z0qVLfutx3333mQ899JDPvokTJ5o//OEP/dLvb3/7mxkQEGD+8Y9/9Nk/ZMgQ85lnnunQXq3f45cuXTJdLpe5ZMkSa9/f//530zAM89e//nWH9/uiEydOmJLMd99997r7fJ1+lx04cMCUZH788ced1tPj8ZiSzNdff91v/U6dOmV+85vfNA8fPmzGxcWZK1asuO5eX9ZvypQp5vjx4zvk/F+n3+TJk/32/vs6f37jx483R40a5bd+d955p/mzn/3MZ99dd91l/uQnP/FLz2
PHjpmSzMOHD1v7PvvsMzMyMtJcs2ZNh/TsTIy0dpHm5mYdOnRIGRkZPvszMjK0b9++Lroq//F4PJKkyMjITunX0tKioqIiXbhwQampqX7rM3PmTN13331KT0/3W48vOn78uNxut+Lj4/Xggw/qo48+8luvV199VUOHDtX3vvc9OZ1Offe739WaNWv81u+LmpubtWnTJj300ENyOBx+6zNs2DD97//+rz744ANJ0p/+9Cft3btX//zP/+yXfp999plaWlrajBqFhoZq7969ful52YkTJ1RTU+PzzAkJCdHw4cO75TNH+vy543A4/Pbdgdaam5u1evVqGYahIUOG+KXHpUuXlJeXpyeeeEJ33nmnX3q0tmfPHjmdTt1+++2aPn26amtr/dLn0qVL2r59u26//XaNHTtWTqdTycnJfp3K9kVnzpzR9u3bNW3aNL/1GDZsmF599VX95S9/kWmaeuONN/TBBx9o7Nixfunn9XolyeeZExAQoODgYL8/c/yB0NpFPv30U7W0tCg6Otpnf3R0tGpqarroqvzDNE3NnTtXw4YNU2Jiol97VVZW6pZbblFISIhmzJihrVu3auDAgX7pVVRUpHfeeUcFBQV+OX9rycnJ2rBhg3bu3Kk1a9aopqZGaWlpOnfunF/6ffTRR1q5cqX69++vnTt3asaMGZozZ442bNjgl35ftG3bNtXX12vq1Kl+7fPUU0/p+9//vu644w4FBQXpu9/9rvLz8/X973/fL/3Cw8OVmpqq//iP/9Dp06fV0tKiTZs2af/+/aqurvZLz8suP1duhmeOJP3973/X008/rdzcXEVERPi11x//+Efdcsst6tGjh1asWKGSkhJFRUX5pddzzz2nwMBAzZkzxy/nby0rK0ubN2/W7t27tWzZMpWXl2vUqFFWGOpItbW1amxs1JIlS5SZmaldu3bp/vvv18SJE1VaWtrh/Vr77W9/q/DwcE2cONFvPX71q19p4MCB6tu3r4KDg5WZmakXX3xRw4YN80u/O+64Q3FxcVqwYIHq6urU3NysJUuWqKamxu/PHH8I7OoLuNm1HkUyTdOvI0tdYdasWXrvvfc65X91CQkJqqioUH19vX7/+99rypQpKi0t7fDgWlVVpccee0y7du265rlW7ZWVlWV9PGjQIKWmpurb3/62fvvb32ru3Lkd3u/SpUsaOnSoFi9eLEn67ne/qyNHjmjlypX60Y9+1OH9vmjt2rXKysq67vl6V7NlyxZt2rRJL730ku68805VVFQoPz9fbrdbU6ZM8UvPjRs36qGHHtI3v/lNBQQE6K677lJubq7eeecdv/Rr7WZ45ly8eFEPPvigLl26pBdffNHv/UaOHKmKigp9+umnWrNmjSZNmqT9+/fL6XR2aJ9Dhw7phRde0DvvvNNpf2aTJ0+2Pk5MTNTQoUMVFxen7du3d3i4u/wDkOPHj9fjjz8uSfrOd76jffv26de//rWGDx/eof1a+81vfqMf/OAHfn2m/+pXv1JZWZleffVVxcXF6c0339Sjjz6qmJgYv3zHLigoSL///e81bdo0RUZGKiAgQOnp6T7/ntxIGGntIlFRUQoICGgzwlFbW9tmJORGNnv2bL366qt644031LdvX7/3Cw4O1m233aahQ4eqoKBAQ4YM0QsvvNDhfQ4dOqTa2lolJSUpMDBQgYGBKi0t1a9+9SsFBgaqpaWlw3u2FhYWpkGDBvntp0BjYmLahP0BAwb47QcFL/v444/1+uuv68c//rFf+0jSE088oaeffloPPvigBg0apLy8PD3++ON+HT3/9re/rdLSUjU2NqqqqkoHDhzQxYsXFR8f77eekqyVJrr7M+fixYuaNGmSTpw4oZKSEr+Pskqfvxdvu+02paSkaO3atQoMDNTatWs7vM9bb72l2tpa9evXz3rufPzxx5o3b55uvfXWDu93JTExMYqLi/PLcycqKkqBgYFd8tx56623dOzYMb8+d5qamrRw4UItX75c48
aN0+DBgzVr1ixNnjxZ//mf/+m3vklJSdZgTnV1tYqLi3Xu3Dm/P3P8gdDaRYKDg5WUlGT9hPRlJSUlSktL66Kr6jimaWrWrFl6+eWXtXv37i57c5im6ZdvY40ePVqVlZWqqKiwtqFDh+oHP/iBKioqFBAQ0OE9W/N6vTp69KhiYmL8cv577rmnzTJlH3zwgeLi4vzS77J169bJ6XTqvvvu82sf6fOfOP/GN3wfgwEBAX5d8uqysLAwxcTEqK6uTjt37tT48eP92i8+Pl4ul8vnmdPc3KzS0tJu8cyR/l9gPX78uF5//XX17t27S67DX8+dvLw8vffeez7PHbfbrSeeeEI7d+7s8H5Xcu7cOVVVVfnluRMcHKy77767S547a9euVVJSkt/mIkuf//28ePFilz1zDMNQnz59dPz4cR08eNDvzxx/YHpAF5o7d67y8vI0dOhQpaamavXq1frkk080Y8YMv/RrbGzUhx9+aL0+ceKEKioqFBkZqX79+nVor5kzZ+qll17SK6+8ovDwcGt0xzAMhYaGdmivyxYuXKisrCzFxsbq/PnzKioq0p49e1RcXNzhvcLDw9vMzw0LC1Pv3r39Nm93/vz5GjdunPr166fa2lr9/Oc/V0NDg9++jf34448rLS1Nixcv1qRJk3TgwAGtXr1aq1ev9ks/6fNvD65bt05TpkxRYKD/H0/jxo3TL37xC/Xr10933nmn3n33XS1fvlwPPfSQ33ru3LlTpmkqISFBH374oZ544gklJCToX//1X6/73Fd7j+fn52vx4sXq37+/+vfvr8WLF6tnz57Kzc31S7+//vWv+uSTT6y1Ui+HEZfLdU1rDH9VP7fbrX/5l3/RO++8oz/+8Y9qaWmxnjuRkZEKDg7u8Hvs3bu3fvGLXygnJ0cxMTE6d+6cXnzxRZ06deqal2q72te0dRAPCgqSy+VSQkJCh/eLjIzUokWL9MADDygmJkYnT57UwoULFRUVpfvvv98v9/fEE09o8uTJuvfeezVy5EgVFxfrD3/4g/bs2eOXfpLU0NCg3/3ud1q2bNk19WhPv+HDh+uJJ55QaGio4uLiVFpaqg0bNmj58uV+6/m73/1Offr0Ub9+/VRZWanHHntMEyZMaPOD4DeErlu4AKZpmv/93/9txsXFmcHBweZdd93l1yWh3njjDVNSm23KlCkd3utKfSSZ69at6/Belz300EPW17JPnz7m6NGjzV27dvmtX2v+XvJq8uTJZkxMjBkUFGS63W5z4sSJ5pEjR/zWzzRN8w9/+IOZmJhohoSEmHfccYe5evVqv/bbuXOnKck8duyYX/tc1tDQYD722GNmv379zB49epjf+ta3zGeeecb0er1+67llyxbzW9/6lhkcHGy6XC5z5syZZn19fYec+2rv8UuXLpnPPvus6XK5zJCQEPPee+81Kysr/dZv3bp1Vzz+7LPPdni/y8tqXWl74403/HKPTU1N5v3332+63W4zODjYjImJMXNycswDBw74pd+VXO+SV1/V729/+5uZkZFh9unTxwwKCjL79etnTpkyxfzkk0/80u+ytWvXmrfddpvZo0cPc8iQIea2bdv82m/VqlVmaGhoh7wPr9avurranDp1qul2u80ePXqYCQkJ5rJly65rab+r9XzhhRfMvn37Wn+GP/nJT/z6jPMnh2ma5jUnXgAAAKATMKcVAAAAtkdoBQAAgO0RWgEAAGB7hFYAAADYHqEVAAAAtkdoBQAAgO0RWgEAAGB7hFYAAADYHqEVAAAAtkdoBQAAgO0RWgEAAGB7/x9asb+3Q7EsuAAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "unique_key = [\n", + " 'yyyy-mm-ddThh:mm:ss.sss', \n", + " 'Longitude [degrees_east]', \n", + " 'Latitude [degrees_north]', \n", + " 'DEPTH [m]', \n", + " 'BODC Bottle Number:INTEGER'\n", + " ]\n", + "\n", + "def count_non_nan(group):\n", + " return group[cols_measurements].notna().sum(axis=1).iloc[0]\n", + "\n", + "\n", + "non_nan_counts = df_test.groupby(unique_key).apply(count_non_nan)\n", + "\n", + "from matplotlib import pyplot as plt\n", + "# Calculate the range for bins\n", + "max_count = int(non_nan_counts.max())\n", + "bins = np.arange(0, max_count + 2) - 0.5 # +2 to include the max value, -0.5 for bin edges\n", + "\n", + "# Create the histogram\n", + "plt.figure(figsize=(8, 3))\n", + "plt.hist(non_nan_counts.values, bins=bins)\n", + "\n", + "plt.xticks(range(0, max_count + 1));" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11402c17", + "metadata": {}, + "outputs": [], + "source": [ + "# If we use the unique key as is, how many different measurements do we get per unique sample?\n", + "\n", + "df_test.groupby(unique_key).size().sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0529c4a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]',\n", + " 'Latitude [degrees_north]', 'Bot. 
Depth [m]', 'DEPTH [m]',\n", + " 'BODC Bottle Number:INTEGER', 'TRITIUM_D_CONC_BOTTLE [TU]',\n", + " 'Cs_137_D_CONC_BOTTLE [uBq/kg]', 'I_129_D_CONC_BOTTLE [atoms/kg]',\n", + " 'Np_237_D_CONC_BOTTLE [uBq/kg]', 'Pu_239_D_CONC_BOTTLE [uBq/kg]',\n", + " 'Pu_239_Pu_240_D_CONC_BOTTLE [uBq/kg]', 'Pu_240_D_CONC_BOTTLE [uBq/kg]',\n", + " 'U_236_D_CONC_BOTTLE [atoms/kg]', 'U_236_T_CONC_BOTTLE [atoms/kg]',\n", + " 'U_236_D_CONC_FISH [atoms/kg]', 'Cs_137_D_CONC_UWAY [uBq/kg]',\n", + " 'Pu_239_Pu_240_D_CONC_UWAY [uBq/kg]', 'Pa_231_D_CONC_BOTTLE [uBq/kg]',\n", + " 'Pb_210_D_CONC_BOTTLE [mBq/kg]', 'Po_210_D_CONC_BOTTLE [mBq/kg]',\n", + " 'Ra_224_D_CONC_BOTTLE [mBq/kg]', 'Ra_226_D_CONC_BOTTLE [mBq/kg]',\n", + " 'Ra_228_T_CONC_BOTTLE [mBq/kg]', 'Ra_228_D_CONC_BOTTLE [mBq/kg]',\n", + " 'Th_230_T_CONC_BOTTLE [uBq/kg]', 'Th_230_D_CONC_BOTTLE [uBq/kg]',\n", + " 'Th_232_T_CONC_BOTTLE [pmol/kg]', 'Th_232_D_CONC_BOTTLE [pmol/kg]',\n", + " 'Th_234_T_CONC_BOTTLE [mBq/kg]', 'Ac_227_D_CONC_PUMP [uBq/kg]',\n", + " 'Be_7_T_CONC_PUMP [uBq/kg]', 'Be_7_D_CONC_PUMP [uBq/kg]',\n", + " 'Ra_223_D_CONC_PUMP [mBq/kg]', 'Ra_224_D_CONC_PUMP [mBq/kg]',\n", + " 'Ra_226_D_CONC_PUMP [mBq/kg]', 'Ra_228_T_CONC_PUMP [mBq/kg]',\n", + " 'Ra_228_D_CONC_PUMP [mBq/kg]', 'Th_228_D_CONC_PUMP [uBq/kg]',\n", + " 'Th_234_T_CONC_PUMP [mBq/kg]', 'Pa_231_D_CONC_FISH [uBq/kg]',\n", + " 'Pb_210_D_CONC_FISH [mBq/kg]', 'Po_210_D_CONC_FISH [mBq/kg]',\n", + " 'Ra_226_D_CONC_FISH [mBq/kg]', 'Ra_228_T_CONC_FISH [mBq/kg]',\n", + " 'Th_230_D_CONC_FISH [uBq/kg]', 'Th_232_D_CONC_FISH [pmol/kg]',\n", + " 'Th_234_T_CONC_FISH [mBq/kg]', 'Pa_231_D_CONC_UWAY [uBq/kg]',\n", + " 'Po_210_D_CONC_UWAY [mBq/kg]', 'Pb_210_D_CONC_UWAY [mBq/kg]',\n", + " 'Ra_224_D_CONC_UWAY [mBq/kg]', 'Ra_226_D_CONC_UWAY [mBq/kg]',\n", + " 'Ra_228_D_CONC_UWAY [mBq/kg]', 'Th_228_D_CONC_UWAY [uBq/kg]',\n", + " 'Th_230_D_CONC_UWAY [uBq/kg]', 'Th_232_D_CONC_UWAY [pmol/kg]',\n", + " 'Th_234_T_CONC_UWAY [mBq/kg]', 'Pa_231_D_CONC_BOAT_PUMP [uBq/kg]',\n", + " 
'Th_230_D_CONC_BOAT_PUMP [uBq/kg]', 'Th_232_D_CONC_BOAT_PUMP [pmol/kg]',\n", + " 'Pa_231_D_CONC_SUBICE_PUMP [uBq/kg]',\n", + " 'Th_230_D_CONC_SUBICE_PUMP [uBq/kg]',\n", + " 'Th_232_D_CONC_SUBICE_PUMP [pmol/kg]',\n", + " 'Th_234_T_CONC_SUBICE_PUMP [mBq/kg]', 'Po_210_TP_CONC_BOTTLE [mBq/kg]',\n", + " 'Pb_210_TP_CONC_BOTTLE [mBq/kg]', 'Po_210_SPT_CONC_PUMP [mBq/kg]',\n", + " 'Po_210_LPT_CONC_PUMP [mBq/kg]', 'Pb_210_SPT_CONC_PUMP [mBq/kg]',\n", + " 'Pb_210_LPT_CONC_PUMP [mBq/kg]', 'Pa_231_TP_CONC_PUMP [uBq/kg]',\n", + " 'Pa_231_SPT_CONC_PUMP [uBq/kg]', 'Pa_231_LPT_CONC_PUMP [uBq/kg]',\n", + " 'Th_228_SPT_CONC_PUMP [uBq/kg]', 'Th_228_LPT_CONC_PUMP [uBq/kg]',\n", + " 'Th_230_TP_CONC_PUMP [uBq/kg]', 'Th_230_SPT_CONC_PUMP [uBq/kg]',\n", + " 'Th_230_LPT_CONC_PUMP [uBq/kg]', 'Th_232_TP_CONC_PUMP [pmol/kg]',\n", + " 'Th_232_SPT_CONC_PUMP [pmol/kg]', 'Th_232_LPT_CONC_PUMP [pmol/kg]',\n", + " 'Th_234_SPT_CONC_PUMP [mBq/kg]', 'Th_234_LPT_CONC_PUMP [mBq/kg]',\n", + " 'Po_210_TP_CONC_UWAY [mBq/kg]', 'Pb_210_TP_CONC_UWAY [mBq/kg]'],\n", + " dtype='object')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_test.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8deb1ea", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yyyy-mm-ddThh:mm:ss.sssLongitude [degrees_east]Latitude [degrees_north]Bot. Depth [m]DEPTH [m]BODC Bottle Number:INTEGERTRITIUM_D_CONC_BOTTLE [TU]Cs_137_D_CONC_BOTTLE [uBq/kg]I_129_D_CONC_BOTTLE [atoms/kg]Np_237_D_CONC_BOTTLE [uBq/kg]...Th_230_TP_CONC_PUMP [uBq/kg]Th_230_SPT_CONC_PUMP [uBq/kg]Th_230_LPT_CONC_PUMP [uBq/kg]Th_232_TP_CONC_PUMP [pmol/kg]Th_232_SPT_CONC_PUMP [pmol/kg]Th_232_LPT_CONC_PUMP [pmol/kg]Th_234_SPT_CONC_PUMP [mBq/kg]Th_234_LPT_CONC_PUMP [mBq/kg]Po_210_TP_CONC_UWAY [mBq/kg]Pb_210_TP_CONC_UWAY [mBq/kg]
\n", + "

0 rows × 86 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [yyyy-mm-ddThh:mm:ss.sss, Longitude [degrees_east], Latitude [degrees_north], Bot. Depth [m], DEPTH [m], BODC Bottle Number:INTEGER, TRITIUM_D_CONC_BOTTLE [TU], Cs_137_D_CONC_BOTTLE [uBq/kg], I_129_D_CONC_BOTTLE [atoms/kg], Np_237_D_CONC_BOTTLE [uBq/kg], Pu_239_D_CONC_BOTTLE [uBq/kg], Pu_239_Pu_240_D_CONC_BOTTLE [uBq/kg], Pu_240_D_CONC_BOTTLE [uBq/kg], U_236_D_CONC_BOTTLE [atoms/kg], U_236_T_CONC_BOTTLE [atoms/kg], U_236_D_CONC_FISH [atoms/kg], Cs_137_D_CONC_UWAY [uBq/kg], Pu_239_Pu_240_D_CONC_UWAY [uBq/kg], Pa_231_D_CONC_BOTTLE [uBq/kg], Pb_210_D_CONC_BOTTLE [mBq/kg], Po_210_D_CONC_BOTTLE [mBq/kg], Ra_224_D_CONC_BOTTLE [mBq/kg], Ra_226_D_CONC_BOTTLE [mBq/kg], Ra_228_T_CONC_BOTTLE [mBq/kg], Ra_228_D_CONC_BOTTLE [mBq/kg], Th_230_T_CONC_BOTTLE [uBq/kg], Th_230_D_CONC_BOTTLE [uBq/kg], Th_232_T_CONC_BOTTLE [pmol/kg], Th_232_D_CONC_BOTTLE [pmol/kg], Th_234_T_CONC_BOTTLE [mBq/kg], Ac_227_D_CONC_PUMP [uBq/kg], Be_7_T_CONC_PUMP [uBq/kg], Be_7_D_CONC_PUMP [uBq/kg], Ra_223_D_CONC_PUMP [mBq/kg], Ra_224_D_CONC_PUMP [mBq/kg], Ra_226_D_CONC_PUMP [mBq/kg], Ra_228_T_CONC_PUMP [mBq/kg], Ra_228_D_CONC_PUMP [mBq/kg], Th_228_D_CONC_PUMP [uBq/kg], Th_234_T_CONC_PUMP [mBq/kg], Pa_231_D_CONC_FISH [uBq/kg], Pb_210_D_CONC_FISH [mBq/kg], Po_210_D_CONC_FISH [mBq/kg], Ra_226_D_CONC_FISH [mBq/kg], Ra_228_T_CONC_FISH [mBq/kg], Th_230_D_CONC_FISH [uBq/kg], Th_232_D_CONC_FISH [pmol/kg], Th_234_T_CONC_FISH [mBq/kg], Pa_231_D_CONC_UWAY [uBq/kg], Po_210_D_CONC_UWAY [mBq/kg], Pb_210_D_CONC_UWAY [mBq/kg], Ra_224_D_CONC_UWAY [mBq/kg], Ra_226_D_CONC_UWAY [mBq/kg], Ra_228_D_CONC_UWAY [mBq/kg], Th_228_D_CONC_UWAY [uBq/kg], Th_230_D_CONC_UWAY [uBq/kg], Th_232_D_CONC_UWAY [pmol/kg], Th_234_T_CONC_UWAY [mBq/kg], Pa_231_D_CONC_BOAT_PUMP [uBq/kg], Th_230_D_CONC_BOAT_PUMP [uBq/kg], Th_232_D_CONC_BOAT_PUMP [pmol/kg], Pa_231_D_CONC_SUBICE_PUMP [uBq/kg], Th_230_D_CONC_SUBICE_PUMP [uBq/kg], Th_232_D_CONC_SUBICE_PUMP [pmol/kg], Th_234_T_CONC_SUBICE_PUMP 
[mBq/kg], Po_210_TP_CONC_BOTTLE [mBq/kg], Pb_210_TP_CONC_BOTTLE [mBq/kg], Po_210_SPT_CONC_PUMP [mBq/kg], Po_210_LPT_CONC_PUMP [mBq/kg], Pb_210_SPT_CONC_PUMP [mBq/kg], Pb_210_LPT_CONC_PUMP [mBq/kg], Pa_231_TP_CONC_PUMP [uBq/kg], Pa_231_SPT_CONC_PUMP [uBq/kg], Pa_231_LPT_CONC_PUMP [uBq/kg], Th_228_SPT_CONC_PUMP [uBq/kg], Th_228_LPT_CONC_PUMP [uBq/kg], Th_230_TP_CONC_PUMP [uBq/kg], Th_230_SPT_CONC_PUMP [uBq/kg], Th_230_LPT_CONC_PUMP [uBq/kg], Th_232_TP_CONC_PUMP [pmol/kg], Th_232_SPT_CONC_PUMP [pmol/kg], Th_232_LPT_CONC_PUMP [pmol/kg], Th_234_SPT_CONC_PUMP [mBq/kg], Th_234_LPT_CONC_PUMP [mBq/kg], Po_210_TP_CONC_UWAY [mBq/kg], Pb_210_TP_CONC_UWAY [mBq/kg]]\n", + "Index: []\n", + "\n", + "[0 rows x 86 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_key = [\n", + " 'yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]', \n", + " 'Latitude [degrees_north]', 'DEPTH [m]', 'BODC Bottle Number:INTEGER']\n", + "\n", + "df_test[df_test[unique_key].duplicated(keep=False)].sort_values(by=unique_key)" ] }, { diff --git a/nbs/handlers/_helcom-investigation-uniqueness.ipynb b/nbs/handlers/_helcom-investigation-uniqueness.ipynb index 929a6c0..c5769de 100644 --- a/nbs/handlers/_helcom-investigation-uniqueness.ipynb +++ b/nbs/handlers/_helcom-investigation-uniqueness.ipynb @@ -18,6 +18,16 @@ "# HELCOM" ] }, + { + "cell_type": "markdown", + "id": "8d9478e9", + "metadata": {}, + "source": [ + "Questions (uniqueness): \n", + " \n", + " - how many duplicates when lon, lat, date, depth is used as a key in seawater" + ] + }, { "cell_type": "markdown", "id": "5709cfb6", @@ -63,16 +73,7 @@ "execution_count": null, "id": "0db45fee", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "#| hide\n", "%load_ext autoreload\n", @@ -401,69 +402,18 @@ { "cell_type": "code", "execution_count": null, - "id": "0af1ec68", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(39817, 35)" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| eval: false\n", - "dfs['sediment'].shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0f74d5f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "25991" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| eval: false\n", - "dfs['sediment']['KEY'].duplicated().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98927b4c", + "id": "2511a24a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "KEY\n", - "SSSSM2021030 11\n", - "SSAAS1987036 11\n", - "SSSSI2006030 11\n", - "SSSSI2010030 11\n", - "SSSSI2010003 10\n", - " ..\n", - "SSSSI2006020 1\n", - "SSSSI2006014 1\n", - "SSSSI2006013 1\n", - "SCLOR2006078 1\n", - "SLVDC1997011 1\n", - "Name: count, Length: 13826, dtype: int64" + "Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/m³', 'VALUE_Bq/m³', 'ERROR%_m³',\n", + " 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR',\n", + " 'MONTH', 'DAY', 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'TDEPTH', 'SDEPTH', 'SALIN',\n", + " 'TTEMP', 'FILT', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')" ] }, "execution_count": null, @@ -472,14 +422,13 @@ } ], "source": [ - "#| eval: false\n", - "dfs['sediment']['KEY'].value_counts()" + "dfs['seawater'].columns" ] }, { "cell_type": "code", "execution_count": null, - "id": "df06bf0c", + "id": "e5b8647f", "metadata": {}, "outputs": [ { @@ -506,296 +455,182 @@ " 
KEY\n", " NUCLIDE\n", " METHOD\n", - " < VALUE_Bq/kg\n", - " VALUE_Bq/kg\n", - " ERROR%_kg\n", - " < VALUE_Bq/m²\n", - " VALUE_Bq/m²\n", - " ERROR%_m²\n", + " < VALUE_Bq/m³\n", + " VALUE_Bq/m³\n", + " ERROR%_m³\n", " DATE_OF_ENTRY_x\n", + " COUNTRY\n", + " LABORATORY\n", + " SEQUENCE\n", " ...\n", - " LOWSLI\n", - " AREA\n", - " SEDI\n", - " OXIC\n", - " DW%\n", - " LOI%\n", + " LONGITUDE (ddmmmm)\n", + " LONGITUDE (dddddd)\n", + " TDEPTH\n", + " SDEPTH\n", + " SALIN\n", + " TTEMP\n", + " FILT\n", " MORS_SUBBASIN\n", " HELCOM_SUBBASIN\n", - " SUM_LINK\n", " DATE_OF_ENTRY_y\n", " \n", " \n", " \n", " \n", " 0\n", - " SKRIL2012048\n", - " RA226\n", - " NaN\n", - " NaN\n", - " 35.0\n", - " 26.0\n", - " NaN\n", + " WKRIL2012003\n", + " CS137\n", " NaN\n", " NaN\n", + " 5.3\n", + " 32.0\n", " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012003.0\n", " ...\n", - " 20.0\n", - " 0.006\n", + " 29.20\n", + " 29.3333\n", " NaN\n", + " 0.0\n", " NaN\n", " NaN\n", " NaN\n", " 11.0\n", " 11.0\n", - " NaN\n", " 08/20/14 00:00:00\n", " \n", " \n", " 1\n", - " SKRIL2012049\n", - " RA226\n", - " NaN\n", - " NaN\n", - " 36.0\n", - " 22.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 08/20/14 00:00:00\n", - " ...\n", - " 27.0\n", - " 0.006\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 11.0\n", - " 11.0\n", - " NaN\n", - " 08/20/14 00:00:00\n", - " \n", - " \n", - " 186\n", - " SKRIL2012048\n", + " WKRIL2012004\n", " CS137\n", " NaN\n", " NaN\n", - " 3.0\n", - " 33.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 08/20/14 00:00:00\n", - " ...\n", + " 19.9\n", " 20.0\n", - " 0.006\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 11.0\n", - " 11.0\n", - " NaN\n", - " 08/20/14 00:00:00\n", - " \n", - " \n", - " 187\n", - " SKRIL2012049\n", - " CS137\n", - " NaN\n", - " <\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012004.0\n", " ...\n", - " 27.0\n", - " 0.006\n", + " 29.20\n", + " 29.3333\n", " 
NaN\n", + " 29.0\n", " NaN\n", " NaN\n", " NaN\n", " 11.0\n", " 11.0\n", - " NaN\n", " 08/20/14 00:00:00\n", " \n", " \n", - " 562\n", - " SKRIL2012048\n", - " RA228\n", + " 2\n", + " WKRIL2012005\n", + " CS137\n", " NaN\n", " NaN\n", - " 60.0\n", + " 25.5\n", " 20.0\n", - " NaN\n", - " NaN\n", - " NaN\n", " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012005.0\n", " ...\n", - " 20.0\n", - " 0.006\n", + " 23.09\n", + " 23.1500\n", " NaN\n", + " 0.0\n", " NaN\n", " NaN\n", " NaN\n", " 11.0\n", - " 11.0\n", - " NaN\n", + " 3.0\n", " 08/20/14 00:00:00\n", " \n", " \n", - " 563\n", - " SKRIL2012049\n", - " RA228\n", - " NaN\n", - " NaN\n", - " 59.0\n", - " 20.0\n", - " NaN\n", + " 3\n", + " WKRIL2012006\n", + " CS137\n", " NaN\n", " NaN\n", + " 17.0\n", + " 29.0\n", " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012006.0\n", " ...\n", - " 27.0\n", - " 0.006\n", + " 27.59\n", + " 27.9833\n", " NaN\n", + " 0.0\n", " NaN\n", " NaN\n", " NaN\n", " 11.0\n", " 11.0\n", - " NaN\n", " 08/20/14 00:00:00\n", " \n", " \n", - " 825\n", - " SKRIL2012048\n", - " K40\n", - " NaN\n", - " NaN\n", - " 980.0\n", - " 20.0\n", - " NaN\n", + " 4\n", + " WKRIL2012007\n", + " CS137\n", " NaN\n", " NaN\n", + " 22.2\n", + " 18.0\n", " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012007.0\n", " ...\n", - " 20.0\n", - " 0.006\n", + " 27.59\n", + " 27.9833\n", " NaN\n", + " 39.0\n", " NaN\n", " NaN\n", " NaN\n", " 11.0\n", " 11.0\n", - " NaN\n", - " 08/20/14 00:00:00\n", - " \n", - " \n", - " 826\n", - " SKRIL2012049\n", - " K40\n", - " NaN\n", - " NaN\n", - " 950.0\n", - " 20.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 08/20/14 00:00:00\n", - " ...\n", - " 27.0\n", - " 0.006\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 11.0\n", - " 11.0\n", - " NaN\n", " 08/20/14 00:00:00\n", " \n", " \n", "\n", - "

8 rows × 35 columns

\n", + "

5 rows × 27 columns

\n", "" ], "text/plain": [ - " KEY NUCLIDE METHOD < VALUE_Bq/kg VALUE_Bq/kg ERROR%_kg \\\n", - "0 SKRIL2012048 RA226 NaN NaN 35.0 26.0 \n", - "1 SKRIL2012049 RA226 NaN NaN 36.0 22.0 \n", - "186 SKRIL2012048 CS137 NaN NaN 3.0 33.0 \n", - "187 SKRIL2012049 CS137 NaN < 1.0 NaN \n", - "562 SKRIL2012048 RA228 NaN NaN 60.0 20.0 \n", - "563 SKRIL2012049 RA228 NaN NaN 59.0 20.0 \n", - "825 SKRIL2012048 K40 NaN NaN 980.0 20.0 \n", - "826 SKRIL2012049 K40 NaN NaN 950.0 20.0 \n", + " KEY NUCLIDE METHOD < VALUE_Bq/m³ VALUE_Bq/m³ ERROR%_m³ \\\n", + "0 WKRIL2012003 CS137 NaN NaN 5.3 32.0 \n", + "1 WKRIL2012004 CS137 NaN NaN 19.9 20.0 \n", + "2 WKRIL2012005 CS137 NaN NaN 25.5 20.0 \n", + "3 WKRIL2012006 CS137 NaN NaN 17.0 29.0 \n", + "4 WKRIL2012007 CS137 NaN NaN 22.2 18.0 \n", "\n", - " < VALUE_Bq/m² VALUE_Bq/m² ERROR%_m² DATE_OF_ENTRY_x ... LOWSLI \\\n", - "0 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", - "1 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", - "186 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", - "187 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", - "562 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", - "563 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", - "825 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", - "826 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", + " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... LONGITUDE (ddmmmm) \\\n", + "0 08/20/14 00:00:00 90.0 KRIL 2012003.0 ... 29.20 \n", + "1 08/20/14 00:00:00 90.0 KRIL 2012004.0 ... 29.20 \n", + "2 08/20/14 00:00:00 90.0 KRIL 2012005.0 ... 23.09 \n", + "3 08/20/14 00:00:00 90.0 KRIL 2012006.0 ... 27.59 \n", + "4 08/20/14 00:00:00 90.0 KRIL 2012007.0 ... 
27.59 \n", "\n", - " AREA SEDI OXIC DW% LOI% MORS_SUBBASIN HELCOM_SUBBASIN SUM_LINK \\\n", - "0 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", - "1 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", - "186 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", - "187 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", - "562 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", - "563 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", - "825 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", - "826 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + " LONGITUDE (dddddd) TDEPTH SDEPTH SALIN TTEMP FILT MORS_SUBBASIN \\\n", + "0 29.3333 NaN 0.0 NaN NaN NaN 11.0 \n", + "1 29.3333 NaN 29.0 NaN NaN NaN 11.0 \n", + "2 23.1500 NaN 0.0 NaN NaN NaN 11.0 \n", + "3 27.9833 NaN 0.0 NaN NaN NaN 11.0 \n", + "4 27.9833 NaN 39.0 NaN NaN NaN 11.0 \n", "\n", - " DATE_OF_ENTRY_y \n", - "0 08/20/14 00:00:00 \n", - "1 08/20/14 00:00:00 \n", - "186 08/20/14 00:00:00 \n", - "187 08/20/14 00:00:00 \n", - "562 08/20/14 00:00:00 \n", - "563 08/20/14 00:00:00 \n", - "825 08/20/14 00:00:00 \n", - "826 08/20/14 00:00:00 \n", + " HELCOM_SUBBASIN DATE_OF_ENTRY_y \n", + "0 11.0 08/20/14 00:00:00 \n", + "1 11.0 08/20/14 00:00:00 \n", + "2 3.0 08/20/14 00:00:00 \n", + "3 11.0 08/20/14 00:00:00 \n", + "4 11.0 08/20/14 00:00:00 \n", "\n", - "[8 rows x 35 columns]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| eval: false\n", - "# SKRIL2012048, SSSSM2021030\n", - "df = dfs['sediment'][dfs['sediment']['KEY'].isin(['SKRIL2012048', 'SKRIL2012049'])]; df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16f99ba4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',\n", - " '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',\n", - " 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',\n", - " 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", - " 'LONGITUDE 
(ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',\n", - " 'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',\n", - " 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],\n", - " dtype='object')" + "[5 rows x 27 columns]" ] }, "execution_count": null, @@ -804,55 +639,59 @@ } ], "source": [ - "#| eval: false\n", - "df.columns" + "dfs['seawater'].head()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "07c10fa1", + "cell_type": "markdown", + "id": "16072d60", "metadata": {}, - "outputs": [], "source": [ - "# coi_val = {'seawater' : {'val': 'VALUE_Bq/m³'},\n", - "# 'biota': {'val': 'VALUE_Bq/kg'},\n", - "# 'sediment': {'val': 'VALUE_Bq/kg'}}" + "Issues:\n", + "\n", + "- **seawater**: 43 duplicates with each value replicated with a factor of 1000\n", + "- **sediment**: 8 duplicates. Sometimes similar values, or NaN value or different (ex. K40) but difficult to find the reason\n", + "- **biota**: 94 duplicates. D and W versions in `BASIS` column ??\n", + "\n", + "\n", + "In summary, sounds sensible to inject in the pipeline/Transform, most likely after some cleaning, parsing, ... a step where we create explicitely the unique key. ['KEY', 'NUCLIDE'] for instance in the case of Helcom. While sending feedback to data provider, we could suggest to homogenize the nomenclature across the data providers. 
" ] }, { "cell_type": "code", "execution_count": null, - "id": "3585d711", + "id": "253ba994", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',\n", - " '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',\n", - " 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',\n", - " 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", - " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',\n", - " 'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',\n", - " 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],\n", - " dtype='object')" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "biota shape: (15827, 33)\n", + "biota columns: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'BASIS',\n", + " 'ERROR%', 'NUMBER', 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY',\n", + " 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY', 'STATION',\n", + " 'LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm',\n", + " 'LONGITUDE dddddd', 'SDEPTH', 'RUBIN', 'BIOTATYPE', 'TISSUE', 'NO',\n", + " 'LENGTH', 'WEIGHT', 'DW%', 'LOI%', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN',\n", + " 'DATE_OF_ENTRY_y'],\n", + " dtype='object')\n", + "biota duplicated keys: 94\n" + ] } ], "source": [ - "#| eval: false\n", - "df.columns" + "smp_type, unique_key = 'biota', ['KEY', 'NUCLIDE']\n", + "\n", + "print(f'{smp_type} shape: ', dfs[smp_type].shape) \n", + "print(f'{smp_type} columns: ', dfs[smp_type].columns)\n", + "print(f'{smp_type} duplicated keys: ', dfs[smp_type][unique_key].duplicated().sum()) " ] }, { "cell_type": "code", "execution_count": null, - "id": "04be1fb8", + "id": "cadb722d", "metadata": {}, "outputs": [ { @@ -877,137 +716,182 @@ " \n", " \n", " KEY\n", - " LATITUDE (ddmmmm)\n", - " LATITUDE (dddddd)\n", - " DATE\n", " NUCLIDE\n", " 
VALUE_Bq/kg\n", - " SEQUENCE\n", - " UPPSLI\n", - " LOWSLI\n", + " LATITUDE dddddd\n", + " LONGITUDE dddddd\n", + " DATE\n", + " SDEPTH\n", + " BASIS\n", + " TISSUE\n", " \n", " \n", " \n", " \n", - " 0\n", - " SKRIL2012048\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", - " RA226\n", - " 35.0\n", - " 2012048.0\n", - " 15.0\n", - " 20.0\n", + " 15091\n", + " BIMGW2019001\n", + " CS137\n", + " 3.510000\n", + " 54.3639\n", + " 19.4333\n", + " 09/20/19 00:00:00\n", + " NaN\n", + " D\n", + " 5\n", " \n", " \n", - " 1\n", - " SKRIL2012049\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", - " RA226\n", - " 36.0\n", - " 2012049.0\n", - " 20.0\n", - " 27.0\n", + " 15104\n", + " BIMGW2019001\n", + " CS137\n", + " 1.180000\n", + " 54.3639\n", + " 19.4333\n", + " 09/20/19 00:00:00\n", + " NaN\n", + " W\n", + " 5\n", " \n", " \n", - " 186\n", - " SKRIL2012048\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", + " 15092\n", + " BIMGW2019002\n", " CS137\n", - " 3.0\n", - " 2012048.0\n", - " 15.0\n", - " 20.0\n", + " 4.770000\n", + " 54.3639\n", + " 19.4333\n", + " 09/20/19 00:00:00\n", + " NaN\n", + " D\n", + " 5\n", " \n", " \n", - " 187\n", - " SKRIL2012049\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", + " 15105\n", + " BIMGW2019002\n", " CS137\n", - " 1.0\n", - " 2012049.0\n", - " 20.0\n", - " 27.0\n", + " 1.210000\n", + " 54.3639\n", + " 19.4333\n", + " 09/20/19 00:00:00\n", + " NaN\n", + " W\n", + " 5\n", " \n", " \n", - " 562\n", - " SKRIL2012048\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", - " RA228\n", - " 60.0\n", - " 2012048.0\n", - " 15.0\n", - " 20.0\n", + " 15093\n", + " BIMGW2019003\n", + " CS137\n", + " 1.830000\n", + " 54.3639\n", + " 19.4333\n", + " 09/20/19 00:00:00\n", + " NaN\n", + " D\n", + " 5\n", " \n", " \n", - " 563\n", - " SKRIL2012049\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", - " RA228\n", - " 59.0\n", - " 2012049.0\n", - " 20.0\n", - " 27.0\n", + " ...\n", + " ...\n", + " ...\n", 
+ " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 825\n", - " SKRIL2012048\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", + " 13907\n", + " BVTIG2015006\n", + " CS134\n", + " 0.008333\n", + " 54.7985\n", + " 13.9752\n", + " 09/05/15 00:00:00\n", + " 37.0\n", + " W\n", + " 5\n", + " \n", + " \n", + " 13685\n", + " BVTIG2015006\n", + " CS137\n", + " 5.240000\n", + " 54.7985\n", + " 13.9752\n", + " 09/05/15 00:00:00\n", + " 37.0\n", + " W\n", + " 5\n", + " \n", + " \n", + " 13906\n", + " BVTIG2015006\n", + " CS137\n", + " 5.240000\n", + " 54.7985\n", + " 13.9752\n", + " 09/05/15 00:00:00\n", + " 37.0\n", + " W\n", + " 5\n", + " \n", + " \n", + " 13684\n", + " BVTIG2015006\n", " K40\n", - " 980.0\n", - " 2012048.0\n", - " 15.0\n", - " 20.0\n", + " 122.000000\n", + " 54.7985\n", + " 13.9752\n", + " 09/05/15 00:00:00\n", + " 37.0\n", + " W\n", + " 5\n", " \n", " \n", - " 826\n", - " SKRIL2012049\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", + " 13905\n", + " BVTIG2015006\n", " K40\n", - " 950.0\n", - " 2012049.0\n", - " 20.0\n", - " 27.0\n", + " 122.000000\n", + " 54.7985\n", + " 13.9752\n", + " 09/05/15 00:00:00\n", + " 37.0\n", + " W\n", + " 5\n", " \n", " \n", "\n", + "

170 rows × 9 columns

\n", "" ], "text/plain": [ - " KEY LATITUDE (ddmmmm) LATITUDE (dddddd) DATE \\\n", - "0 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", - "1 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", - "186 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", - "187 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", - "562 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", - "563 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", - "825 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", - "826 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + " KEY NUCLIDE VALUE_Bq/kg LATITUDE dddddd LONGITUDE dddddd \\\n", + "15091 BIMGW2019001 CS137 3.510000 54.3639 19.4333 \n", + "15104 BIMGW2019001 CS137 1.180000 54.3639 19.4333 \n", + "15092 BIMGW2019002 CS137 4.770000 54.3639 19.4333 \n", + "15105 BIMGW2019002 CS137 1.210000 54.3639 19.4333 \n", + "15093 BIMGW2019003 CS137 1.830000 54.3639 19.4333 \n", + "... ... ... ... ... ... \n", + "13907 BVTIG2015006 CS134 0.008333 54.7985 13.9752 \n", + "13685 BVTIG2015006 CS137 5.240000 54.7985 13.9752 \n", + "13906 BVTIG2015006 CS137 5.240000 54.7985 13.9752 \n", + "13684 BVTIG2015006 K40 122.000000 54.7985 13.9752 \n", + "13905 BVTIG2015006 K40 122.000000 54.7985 13.9752 \n", "\n", - " NUCLIDE VALUE_Bq/kg SEQUENCE UPPSLI LOWSLI \n", - "0 RA226 35.0 2012048.0 15.0 20.0 \n", - "1 RA226 36.0 2012049.0 20.0 27.0 \n", - "186 CS137 3.0 2012048.0 15.0 20.0 \n", - "187 CS137 1.0 2012049.0 20.0 27.0 \n", - "562 RA228 60.0 2012048.0 15.0 20.0 \n", - "563 RA228 59.0 2012049.0 20.0 27.0 \n", - "825 K40 980.0 2012048.0 15.0 20.0 \n", - "826 K40 950.0 2012049.0 20.0 27.0 " + " DATE SDEPTH BASIS TISSUE \n", + "15091 09/20/19 00:00:00 NaN D 5 \n", + "15104 09/20/19 00:00:00 NaN W 5 \n", + "15092 09/20/19 00:00:00 NaN D 5 \n", + "15105 09/20/19 00:00:00 NaN W 5 \n", + "15093 09/20/19 00:00:00 NaN D 5 \n", + "... ... ... ... ... 
\n", + "13907 09/05/15 00:00:00 37.0 W 5 \n", + "13685 09/05/15 00:00:00 37.0 W 5 \n", + "13906 09/05/15 00:00:00 37.0 W 5 \n", + "13684 09/05/15 00:00:00 37.0 W 5 \n", + "13905 09/05/15 00:00:00 37.0 W 5 \n", + "\n", + "[170 rows x 9 columns]" ] }, "execution_count": null, @@ -1016,17 +900,16 @@ } ], "source": [ - "#| eval: false\n", - "# Same key => several nuclides\n", - "# Same lat, lon, date, nuclide => different values\n", - "# lat, lon, date, nuclide, 'upsli', 'lowsli' should be unique\n", - "df[['KEY', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'NUCLIDE', 'VALUE_Bq/kg', 'SEQUENCE', 'UPPSLI', 'LOWSLI']]" + "# coi = unique_key + ['VALUE_Bq/m³', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)', 'DATE'] # seawater\n", + "coi = unique_key + ['VALUE_Bq/kg', 'LATITUDE dddddd', 'LONGITUDE dddddd', 'DATE', 'SDEPTH', 'BASIS', 'TISSUE'] # biota\n", + "# coi = unique_key + ['VALUE_Bq/m²', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)', 'DATE', 'UPPSLI', 'LOWSLI'] # sediment\n", + "dfs[smp_type][dfs[smp_type][unique_key].duplicated(keep=False)].sort_values(by=unique_key)[coi]" ] }, { "cell_type": "code", "execution_count": null, - "id": "dbf47fbe", + "id": "2a351aa1", "metadata": {}, "outputs": [ { @@ -1051,137 +934,128 @@ " \n", " \n", " KEY\n", - " LATITUDE (ddmmmm)\n", - " LATITUDE (dddddd)\n", - " DATE\n", " NUCLIDE\n", - " VALUE_Bq/kg\n", + " METHOD\n", + " < VALUE_Bq/m³\n", + " VALUE_Bq/m³\n", + " ERROR%_m³\n", + " DATE_OF_ENTRY_x\n", + " COUNTRY\n", + " LABORATORY\n", " SEQUENCE\n", - " UPPSLI\n", - " LOWSLI\n", + " ...\n", + " LONGITUDE (ddmmmm)\n", + " LONGITUDE (dddddd)\n", + " TDEPTH\n", + " SDEPTH\n", + " SALIN\n", + " TTEMP\n", + " FILT\n", + " MORS_SUBBASIN\n", + " HELCOM_SUBBASIN\n", + " DATE_OF_ENTRY_y\n", " \n", " \n", " \n", " \n", - " 0\n", - " SKRIL2012048\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", - " RA226\n", - " 35.0\n", - " 2012048.0\n", - " 15.0\n", - " 20.0\n", - " \n", - " \n", " 1\n", - " SKRIL2012049\n", - " 59.4\n", - " 
59,6667\n", - " 06/17/12 00:00:00\n", - " RA226\n", - " 36.0\n", - " 2012049.0\n", - " 20.0\n", - " 27.0\n", - " \n", - " \n", - " 186\n", - " SKRIL2012048\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", - " CS137\n", - " 3.0\n", - " 2012048.0\n", - " 15.0\n", - " 20.0\n", - " \n", - " \n", - " 187\n", - " SKRIL2012049\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", + " WKRIL2012004\n", " CS137\n", - " 1.0\n", - " 2012049.0\n", - " 20.0\n", - " 27.0\n", - " \n", - " \n", - " 562\n", - " SKRIL2012048\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", - " RA228\n", - " 60.0\n", - " 2012048.0\n", - " 15.0\n", - " 20.0\n", - " \n", - " \n", - " 563\n", - " SKRIL2012049\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", - " RA228\n", - " 59.0\n", - " 2012049.0\n", + " NaN\n", + " NaN\n", + " 19.9\n", " 20.0\n", - " 27.0\n", + " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012004.0\n", + " ...\n", + " 29.2\n", + " 29.3333\n", + " NaN\n", + " 29.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " 08/20/14 00:00:00\n", " \n", " \n", - " 825\n", - " SKRIL2012048\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", - " K40\n", - " 980.0\n", - " 2012048.0\n", - " 15.0\n", - " 20.0\n", + " 35\n", + " WKRIL2012004\n", + " SR90\n", + " NaN\n", + " NaN\n", + " 7.7\n", + " 19.0\n", + " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012004.0\n", + " ...\n", + " 29.2\n", + " 29.3333\n", + " NaN\n", + " 29.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " 08/20/14 00:00:00\n", " \n", " \n", - " 826\n", - " SKRIL2012049\n", - " 59.4\n", - " 59,6667\n", - " 06/17/12 00:00:00\n", - " K40\n", - " 950.0\n", - " 2012049.0\n", - " 20.0\n", - " 27.0\n", + " 69\n", + " WKRIL2012004\n", + " H3\n", + " NaN\n", + " NaN\n", + " 4200.0\n", + " 33.0\n", + " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012004.0\n", + " ...\n", + " 29.2\n", + " 29.3333\n", + " NaN\n", + " 29.0\n", + " NaN\n", + 
" NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " 08/20/14 00:00:00\n", " \n", " \n", "\n", + "

3 rows × 27 columns

\n", "" ], "text/plain": [ - " KEY LATITUDE (ddmmmm) LATITUDE (dddddd) DATE \\\n", - "0 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", - "1 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", - "186 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", - "187 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", - "562 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", - "563 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", - "825 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", - "826 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + " KEY NUCLIDE METHOD < VALUE_Bq/m³ VALUE_Bq/m³ ERROR%_m³ \\\n", + "1 WKRIL2012004 CS137 NaN NaN 19.9 20.0 \n", + "35 WKRIL2012004 SR90 NaN NaN 7.7 19.0 \n", + "69 WKRIL2012004 H3 NaN NaN 4200.0 33.0 \n", "\n", - " NUCLIDE VALUE_Bq/kg SEQUENCE UPPSLI LOWSLI \n", - "0 RA226 35.0 2012048.0 15.0 20.0 \n", - "1 RA226 36.0 2012049.0 20.0 27.0 \n", - "186 CS137 3.0 2012048.0 15.0 20.0 \n", - "187 CS137 1.0 2012049.0 20.0 27.0 \n", - "562 RA228 60.0 2012048.0 15.0 20.0 \n", - "563 RA228 59.0 2012049.0 20.0 27.0 \n", - "825 K40 980.0 2012048.0 15.0 20.0 \n", - "826 K40 950.0 2012049.0 20.0 27.0 " + " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... LONGITUDE (ddmmmm) \\\n", + "1 08/20/14 00:00:00 90.0 KRIL 2012004.0 ... 29.2 \n", + "35 08/20/14 00:00:00 90.0 KRIL 2012004.0 ... 29.2 \n", + "69 08/20/14 00:00:00 90.0 KRIL 2012004.0 ... 
29.2 \n", + "\n", + " LONGITUDE (dddddd) TDEPTH SDEPTH SALIN TTEMP FILT MORS_SUBBASIN \\\n", + "1 29.3333 NaN 29.0 NaN NaN NaN 11.0 \n", + "35 29.3333 NaN 29.0 NaN NaN NaN 11.0 \n", + "69 29.3333 NaN 29.0 NaN NaN NaN 11.0 \n", + "\n", + " HELCOM_SUBBASIN DATE_OF_ENTRY_y \n", + "1 11.0 08/20/14 00:00:00 \n", + "35 11.0 08/20/14 00:00:00 \n", + "69 11.0 08/20/14 00:00:00 \n", + "\n", + "[3 rows x 27 columns]" ] }, "execution_count": null, @@ -1190,282 +1064,46 @@ } ], "source": [ - "#| eval: false\n", - "# Same sample, several nuclides\n", - "df_test = df[['KEY', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'NUCLIDE', 'VALUE_Bq/kg', 'SEQUENCE', 'UPPSLI', 'LOWSLI']]; df_test\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23031b91", - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "# df_test.pivot_table(index=['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'YEAR', 'MONTH', 'DAY'],\n", - "# columns='NUCLIDE',\n", - "# values='VALUE_Bq/kg',\n", - "# fill_value=np.nan,\n", - "# aggfunc=lambda x: x).reset_index()\n" + "dfs['seawater'][dfs['seawater']['KEY'] == 'WKRIL2012004']" ] }, { "cell_type": "code", "execution_count": null, - "id": "53cdf8c0", + "id": "abef3443", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "KEY\n", + "WCLOR1984001 3\n", + "WCLOR1984002 2\n", + "WCLOR1984003 3\n", + "WCLOR1984004 2\n", + "WCLOR1984005 2\n", + " ..\n", + "WSTUK2021011 3\n", + "WSTUK2021012 3\n", + "WSTUK2021013 4\n", + "WSTUK2021014 4\n", + "WSTUK2021015 3\n", + "Name: NUCLIDE, Length: 9552, dtype: int64" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# #| eval: false\n", - "# # Preprocess the data\n", - "# df_test['VALUE_Bq/kg'] = df_test['VALUE_Bq/kg'].fillna(-999)\n", - "\n", - "# # Then pivot\n", - "# pivoted = df_test.pivot_table(index=['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'YEAR', 'MONTH', 'DAY'],\n", - "# 
columns='NUCLIDE',\n", - "# values='VALUE_Bq/kg',\n", - "# aggfunc='first').reset_index()\n", - "\n", - "# # Replace -999 with 'Below Detection Limit' or any other indicator\n", - "# pivoted = pivoted.replace(-999, np.nan)\n", - "# pivoted" + "dfs['seawater'].groupby('KEY')['NUCLIDE'].nunique()" ] }, { "cell_type": "code", "execution_count": null, - "id": "fc75cf0f", - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "# dfs['sediment'][['LATITUDE (ddmmm)', 'LONGITUDE (ddmmm)', '']].head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30bd602a", - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "# dfs['sediment'].head()" - ] - }, - { - "cell_type": "markdown", - "id": "5687eade", - "metadata": {}, - "source": [ - "## Add sample type column" - ] - }, - { - "cell_type": "markdown", - "id": "a984410e", - "metadata": {}, - "source": [ - "The sample type (`seawater`, `biota`, `sediment`, ...) as defined in the `configs.ipynb` are encoded group names in NetCDF produced. Addition of sample type ids into individual dataframes is done using the `AddSampleTypeIdColumnCB` callback for legacy purposes (i.e. Open Refine output)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cf5ba759", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " KEY samptype_id\n", - "0 WKRIL2012003 1\n", - "1 WKRIL2012004 1\n", - "2 WKRIL2012005 1\n", - "3 WKRIL2012006 1\n", - "4 WKRIL2012007 1\n", - " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", - "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", - "\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", - "\n", - "print(tfm()['seawater'][['KEY', 'samptype_id']].head())\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "142ddab3", - "metadata": {}, - "source": [ - "## Normalize nuclide names" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "8a2311cd", - "metadata": {}, - "source": [ - "### Lower & strip nuclide names" - ] - }, - { - "cell_type": "markdown", - "id": "4b7b4ceb", - "metadata": {}, - "source": [ - ":::{.callout-tip}\n", - "\n", - "**FEEDBACK TO DATA PROVIDER**: Some nuclide names contain one or multiple trailing spaces.\n", - "\n", - ":::" - ] - }, - { - "cell_type": "markdown", - "id": "64d84ed7", - "metadata": {}, - "source": [ - "This is demonstrated below for the `NUCLIDE` column:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a2306ba", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " index value n_chars stripped_chars\n", - "6 6 TC99 7 4\n", - "16 16 CS137 6 5\n", - "33 33 CS137 9 5\n", - "41 41 CS134 8 5\n", - "43 43 SR90 6 4\n", - "46 46 SR90 5 4\n", - "48 48 K40 8 3\n", - "49 49 PU238 8 5\n", - "64 64 CO60 8 4\n", - "65 
65 AM241 8 5\n", - "66 66 CS137 8 5\n", - "83 83 SR90 8 4\n", - "86 86 SR90 7 4\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "df = get_unique_across_dfs(load_data(fname_in), 'NUCLIDE', as_df=True, include_nchars=True)\n", - "df['stripped_chars'] = df['value'].str.strip().str.replace(' ', '').str.len()\n", - "print(df[df['n_chars'] != df['stripped_chars']])" - ] - }, - { - "cell_type": "markdown", - "id": "518174ba", - "metadata": {}, - "source": [ - "To fix this issue, we use the `LowerStripNameCB` callback. For each dataframe in the dictionary of dataframes, it corrects the nuclide name by converting it lowercase, striping any leading or trailing whitespace(s) and ensuring the number comes before letters (e.g. `137cs`)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a3fa068", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "seawater nuclides: \n", - "['cs137' 'sr90' 'h3' 'cs134' 'pu238' 'pu239240' 'am241' 'cm242' 'cm244'\n", - " 'tc99' 'k40' 'ru103' 'sr89' 'sb125' 'nb95' 'ru106' 'zr95' 'ag110m'\n", - " 'cm243244' 'ba140' 'ce144' 'u234' 'u238' 'co60' 'pu239' 'pb210' 'po210'\n", - " 'np237' 'pu240' 'mn54']\n", - "sediment nuclides: \n", - "['ra226' 'cs137' 'ra228' 'k40' 'sr90' 'cs134137' 'cs134' 'pu239240'\n", - " 'pu238' 'co60' 'ru103' 'ru106' 'sb125' 'ag110m' 'ce144' 'am241' 'be7'\n", - " 'th228' 'pb210' 'co58' 'mn54' 'zr95' 'ba140' 'po210' 'ra224' 'nb95'\n", - " 'pu238240' 'pu241' 'pu239' 'eu155' 'ir192' 'th232' 'cd109' 'sb124' 'zn65'\n", - " 'th234' 'tl208' 'pb212' 'pb214' 'bi214' 'ac228' 'ra223' 'u235' 'bi212']\n", - "biota nuclides: \n", - "['cs134' 'k40' 'co60' 'cs137' 'sr90' 'ag108m' 'mn54' 'co58' 'ag110m'\n", - " 'zn65' 'sb125' 'pu239240' 'ru106' 'be7' 'ce144' 'pb210' 'po210' 'sb124'\n", - " 'sr89' 'zr95' 'te129m' 'ru103' 'nb95' 'ce141' 'la140' 'i131' 'ba140'\n", - " 'pu238' 'u235' 'bi214' 'pb214' 'pb212' 'tl208' 'ac228' 'ra223' 'eu155'\n", - " 'ra226' 'gd153' 'sn113' 
'fe59' 'tc99' 'co57' 'sn117m' 'eu152' 'sc46'\n", - " 'rb86' 'ra224' 'th232' 'cs134137' 'am241' 'ra228' 'th228' 'k-40' 'cs138'\n", - " 'cs139' 'cs140' 'cs141' 'cs142' 'cs143' 'cs144' 'cs145' 'cs146']\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE')])\n", - "\n", - "for key in tfm().keys():\n", - " print(f'{key} nuclides: ')\n", - " print(tfm()[key]['NUCLIDE'].unique())" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "52c9d0fe", - "metadata": {}, - "source": [ - "### Remap nuclide names to MARIS data formats" - ] - }, - { - "cell_type": "markdown", - "id": "a58baf14", - "metadata": {}, - "source": [ - "We below map nuclide names used by HELCOM to the MARIS standard nuclide names. \n", - "\n", - "Remapping data provider nomenclatures into MARIS standards is one recurrent operation and is done in a semi-automated manner according to the following pattern:\n", - "\n", - "1. **Inspect** data provider nomenclature:\n", - "2. **Match** automatically against MARIS nomenclature (using a fuzzy matching algorithm); \n", - "3. **Fix** potential mismatches; \n", - "4. **Apply** the lookup table to the dataframe.\n", - "\n", - "As now on, we will use this pattern to remap the HELCOM data provider nomenclatures into MARIS standards and name it for the sake of brevity **IMFA** (**I**nspect, **M**atch, **F**ix, **A**pply)." - ] - }, - { - "cell_type": "markdown", - "id": "ae4b31bc", - "metadata": {}, - "source": [ - "The unique values of the data provider nuclide names. The `get_unique_across_dfs` is a utility function allowing to retrieve unique values of a specific column across all dataframes (please remind that we have one dataframe per sample type - biota, ...)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e32ee8d0", + "id": "db1c37b0", "metadata": {}, "outputs": [ { @@ -1489,47 +1127,64 @@ " \n", " \n", " \n", - " index\n", - " value\n", + " KEY\n", + " LATITUDE (ddmmmm)\n", + " LONGITUDE (ddmmmm)\n", + " DATE\n", + " NUCLIDE\n", + " VALUE_Bq/m³\n", + " SEQUENCE\n", + " FILT\n", " \n", " \n", " \n", " \n", - " 0\n", - " 0\n", - " sb125\n", - " \n", - " \n", - " 1\n", - " 1\n", - " ce141\n", - " \n", - " \n", - " 2\n", - " 2\n", - " gd153\n", + " 797\n", + " WCLOR1989028\n", + " 54.5\n", + " 17.3206\n", + " 06/19/89 00:00:00\n", + " CS134\n", + " 17.0\n", + " 1989028.0\n", + " N\n", " \n", " \n", - " 3\n", - " 3\n", - " ra226\n", + " 798\n", + " WCLOR1989028\n", + " 54.5\n", + " 17.3206\n", + " 06/19/89 00:00:00\n", + " CS137\n", + " 109.0\n", + " 1989028.0\n", + " N\n", " \n", " \n", - " 4\n", - " 4\n", - " ra228\n", + " 799\n", + " WCLOR1989028\n", + " 54.5\n", + " 17.3206\n", + " 06/19/89 00:00:00\n", + " SR90\n", + " 21.4\n", + " 1989028.0\n", + " N\n", " \n", " \n", "\n", "" ], "text/plain": [ - " index value\n", - "0 0 sb125\n", - "1 1 ce141\n", - "2 2 gd153\n", - "3 3 ra226\n", - "4 4 ra228" + " KEY LATITUDE (ddmmmm) LONGITUDE (ddmmmm) DATE \\\n", + "797 WCLOR1989028 54.5 17.3206 06/19/89 00:00:00 \n", + "798 WCLOR1989028 54.5 17.3206 06/19/89 00:00:00 \n", + "799 WCLOR1989028 54.5 17.3206 06/19/89 00:00:00 \n", + "\n", + " NUCLIDE VALUE_Bq/m³ SEQUENCE FILT \n", + "797 CS134 17.0 1989028.0 N \n", + "798 CS137 109.0 1989028.0 N \n", + "799 SR90 21.4 1989028.0 N " ] }, "execution_count": null, @@ -1538,60 +1193,17 @@ } ], "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE')])\n", - "dfs_output = tfm()\n", - "\n", - "get_unique_across_dfs(dfs_output, col_name='NUCLIDE', as_df=True).head(5)" - ] - }, - { - "cell_type": "markdown", - "id": "614c1bdf", - "metadata": {}, - "source": [ - "Let's now create an instance 
of a fuzzy matching algorithm `Remapper`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bcdbc619", - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs_output, col_name='NUCLIDE', as_df=True),\n", - " maris_lut_fn=nuc_lut_path,\n", - " maris_col_id='nuclide_id',\n", - " maris_col_name='nc_name',\n", - " provider_col_to_match='value',\n", - " provider_col_key='value',\n", - " fname_cache='nuclides_helcom.pkl')" - ] - }, - { - "cell_type": "markdown", - "id": "f7e0ea0c", - "metadata": {}, - "source": [ - "And try to match HELCOM to MARIS nuclide names as automatically as possible. The `match_score` column allows to assess the results:" + "# same lab, method, station, salin, ttemp, filt \n", + "coi = ['KEY', 'LATITUDE (ddmmmm)', 'LONGITUDE (ddmmmm)', 'DATE', 'NUCLIDE', 'VALUE_Bq/m³', 'SEQUENCE', 'FILT']\n", + "dfs['seawater'][dfs['seawater'].KEY == 'WCLOR1989028'][coi]" ] }, { "cell_type": "code", "execution_count": null, - "id": "cb645c29", + "id": "f407a5d6", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing: 100%|██████████| 77/77 [00:01<00:00, 49.22it/s]\n" - ] - }, { "data": { "text/html": [ @@ -1613,123 +1225,64 @@ " \n", " \n", " \n", - " matched_maris_name\n", - " source_name\n", - " match_score\n", - " \n", - " \n", - " source_key\n", - " \n", - " \n", - " \n", + " KEY\n", + " LATITUDE (ddmmmm)\n", + " LONGITUDE (ddmmmm)\n", + " DATE\n", + " NUCLIDE\n", + " VALUE_Bq/m³\n", + " SEQUENCE\n", + " FILT\n", " \n", " \n", " \n", " \n", - " cm243244\n", - " cm244\n", - " cm243244\n", - " 3\n", - " \n", - " \n", - " cs134137\n", - " cs137\n", - " cs134137\n", - " 3\n", - " \n", - " \n", - " pu238240\n", - " pu240\n", - " pu238240\n", - " 3\n", - " \n", - " \n", - " pu239240\n", - " pu240\n", - " pu239240\n", - " 3\n", - " \n", - " \n", - " cs143\n", - " cs127\n", - " cs143\n", - " 2\n", - " \n", - " \n", 
- " cs145\n", - " cs136\n", - " cs145\n", - " 2\n", - " \n", - " \n", - " cs142\n", - " ce144\n", - " cs142\n", - " 2\n", - " \n", - " \n", - " cs140\n", - " ce140\n", - " cs140\n", - " 1\n", - " \n", - " \n", - " k-40\n", - " k40\n", - " k-40\n", - " 1\n", - " \n", - " \n", - " cs144\n", - " ce144\n", - " cs144\n", - " 1\n", - " \n", - " \n", - " cs141\n", - " ce141\n", - " cs141\n", - " 1\n", - " \n", - " \n", - " cs138\n", - " cs137\n", - " cs138\n", - " 1\n", + " 803\n", + " WCLOR1989030\n", + " 54.5\n", + " 17.3206\n", + " 06/19/89 00:00:00\n", + " CS134\n", + " 16.0\n", + " 1989030.0\n", + " N\n", " \n", " \n", - " cs139\n", - " ce139\n", - " cs139\n", - " 1\n", + " 804\n", + " WCLOR1989030\n", + " 54.5\n", + " 17.3206\n", + " 06/19/89 00:00:00\n", + " CS137\n", + " 90.0\n", + " 1989030.0\n", + " N\n", " \n", " \n", - " cs146\n", - " cs136\n", - " cs146\n", - " 1\n", + " 805\n", + " WCLOR1989030\n", + " 54.5\n", + " 17.3206\n", + " 06/19/89 00:00:00\n", + " SR90\n", + " 20.7\n", + " 1989030.0\n", + " N\n", " \n", " \n", "\n", "" ], "text/plain": [ - " matched_maris_name source_name match_score\n", - "source_key \n", - "cm243244 cm244 cm243244 3\n", - "cs134137 cs137 cs134137 3\n", - "pu238240 pu240 pu238240 3\n", - "pu239240 pu240 pu239240 3\n", - "cs143 cs127 cs143 2\n", - "cs145 cs136 cs145 2\n", - "cs142 ce144 cs142 2\n", - "cs140 ce140 cs140 1\n", - "k-40 k40 k-40 1\n", - "cs144 ce144 cs144 1\n", - "cs141 ce141 cs141 1\n", - "cs138 cs137 cs138 1\n", - "cs139 ce139 cs139 1\n", - "cs146 cs136 cs146 1" + " KEY LATITUDE (ddmmmm) LONGITUDE (ddmmmm) DATE \\\n", + "803 WCLOR1989030 54.5 17.3206 06/19/89 00:00:00 \n", + "804 WCLOR1989030 54.5 17.3206 06/19/89 00:00:00 \n", + "805 WCLOR1989030 54.5 17.3206 06/19/89 00:00:00 \n", + "\n", + " NUCLIDE VALUE_Bq/m³ SEQUENCE FILT \n", + "803 CS134 16.0 1989030.0 N \n", + "804 CS137 90.0 1989030.0 N \n", + "805 SR90 20.7 1989030.0 N " ] }, "execution_count": null, @@ -1738,147 +1291,75 @@ } ], "source": [ - "#| eval: 
false\n", - "remapper.generate_lookup_table(as_df=True)\n", - "remapper.select_match(match_score_threshold=1)" - ] - }, - { - "cell_type": "markdown", - "id": "4a5cb838", - "metadata": {}, - "source": [ - "We then manually inspect the remaining unmatched names and create a fixes table to map them to the correct MARIS standards:" + "dfs['seawater'][dfs['seawater'].KEY == 'WCLOR1989030'][coi]" ] }, { "cell_type": "code", "execution_count": null, - "id": "60cf885b", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "fixes_nuclide_names = {\n", - " 'cs134137': 'cs134_137_tot',\n", - " 'cm243244': 'cm243_244_tot',\n", - " 'pu239240': 'pu239_240_tot',\n", - " 'pu238240': 'pu238_240_tot',\n", - " 'cs143': 'cs137',\n", - " 'cs145': 'cs137',\n", - " 'cs142': 'cs137',\n", - " 'cs141': 'cs137',\n", - " 'cs144': 'cs137',\n", - " 'k-40': 'k40',\n", - " 'cs140': 'cs137',\n", - " 'cs146': 'cs137',\n", - " 'cs139': 'cs137',\n", - " 'cs138': 'cs137'\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "dd575e7e", + "id": "0af1ec68", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(39817, 35)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Let's try to match again but this time we use the `fixes_nuclide_names` to map the nuclide names to the MARIS standards:\n" + "#| eval: false\n", + "dfs['sediment'].shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "73410b14", + "id": "b0f74d5f", "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing: 100%|██████████| 77/77 [00:01<00:00, 52.46it/s]\n" - ] + "data": { + "text/plain": [ + "25991" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "#| eval: false\n", - "remapper.generate_lookup_table(as_df=True, fixes=fixes_nuclide_names)\n", - "fc.test_eq(len(remapper.select_match(match_score_threshold=1)), 0)" - ] - }, - 
{ - "cell_type": "markdown", - "id": "abd1276f", - "metadata": {}, - "source": [ - "Test passes! We can now create a callback `RemapNuclideNameCB` to remap the nuclide names. Note that we pass `overwrite=False` to the `Remapper` constructor to now use the cached version.\n" + "dfs['sediment']['KEY'].duplicated().sum()" ] }, { "cell_type": "code", "execution_count": null, - "id": "9a189ef9", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "# Create a lookup table for nuclide names\n", - "lut_nuclides = lambda df: Remapper(provider_lut_df=df,\n", - " maris_lut_fn=nuc_lut_path,\n", - " maris_col_id='nuclide_id',\n", - " maris_col_name='nc_name',\n", - " provider_col_to_match='value',\n", - " provider_col_key='value',\n", - " fname_cache='nuclides_helcom.pkl').generate_lookup_table(fixes=fixes_nuclide_names, \n", - " as_df=False, overwrite=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03d47237", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "class RemapNuclideNameCB(Callback):\n", - " \"Remap data provider nuclide names to MARIS nuclide names.\"\n", - " def __init__(self, \n", - " fn_lut: Callable # Function that returns the lookup table dictionary\n", - " ):\n", - " fc.store_attr()\n", - "\n", - " def __call__(self, tfm: Transformer):\n", - " df_uniques = get_unique_across_dfs(tfm.dfs, col_name='NUCLIDE', as_df=True)\n", - " lut = {k: v.matched_maris_name for k, v in self.fn_lut(df_uniques).items()} \n", - " for k in tfm.dfs.keys():\n", - " tfm.dfs[k]['NUCLIDE'] = tfm.dfs[k]['NUCLIDE'].replace(lut)" - ] - }, - { - "cell_type": "markdown", - "id": "ce649d7a", - "metadata": {}, - "source": [ - "Let's see it in action, along with the `RemapRdnNameCB` callback:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c9a9ff7", + "id": "98927b4c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array(['cs134', 'k40', 'co60', 'cs137', 'sr90', 'ag108m', 'mn54', 
'co58',\n", - " 'ag110m', 'zn65', 'sb125', 'pu239_240_tot', 'ru106', 'be7',\n", - " 'ce144', 'pb210', 'po210', 'sb124', 'sr89', 'zr95', 'te129m',\n", - " 'ru103', 'nb95', 'ce141', 'la140', 'i131', 'ba140', 'pu238',\n", - " 'u235', 'bi214', 'pb214', 'pb212', 'tl208', 'ac228', 'ra223',\n", - " 'eu155', 'ra226', 'gd153', 'sn113', 'fe59', 'tc99', 'co57',\n", - " 'sn117m', 'eu152', 'sc46', 'rb86', 'ra224', 'th232',\n", - " 'cs134_137_tot', 'am241', 'ra228', 'th228'], dtype=object)" + "KEY\n", + "SSSSM2021030 11\n", + "SSAAS1987036 11\n", + "SSSSI2006030 11\n", + "SSSSI2010030 11\n", + "SSSSI2010003 10\n", + " ..\n", + "SSSSI2006020 1\n", + "SSSSI2006014 1\n", + "SSSSI2006013 1\n", + "SCLOR2006078 1\n", + "SLVDC1997011 1\n", + "Name: count, Length: 13826, dtype: int64" ] }, "execution_count": null, @@ -1888,36 +1369,13 @@ ], "source": [ "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE'),\n", - " RemapNuclideNameCB(lut_nuclides)\n", - " ])\n", - "dfs_out = tfm()\n", - "\n", - "# For instance\n", - "dfs_out['biota'].NUCLIDE.unique()\n" - ] - }, - { - "cell_type": "markdown", - "id": "f91ba2d3", - "metadata": {}, - "source": [ - "### Add Nuclide Id column" - ] - }, - { - "cell_type": "markdown", - "id": "49a6c352", - "metadata": {}, - "source": [ - "The `nuclide_id` column is added to the dataframe for legacy reasons (again Open Refine output)." 
+ "dfs['sediment']['KEY'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, - "id": "4ec4271e", + "id": "df06bf0c", "metadata": {}, "outputs": [ { @@ -1941,86 +1399,269 @@ " \n", " \n", " \n", + " KEY\n", " NUCLIDE\n", - " nuclide_id\n", + " METHOD\n", + " < VALUE_Bq/kg\n", + " VALUE_Bq/kg\n", + " ERROR%_kg\n", + " < VALUE_Bq/m²\n", + " VALUE_Bq/m²\n", + " ERROR%_m²\n", + " DATE_OF_ENTRY_x\n", + " ...\n", + " LOWSLI\n", + " AREA\n", + " SEDI\n", + " OXIC\n", + " DW%\n", + " LOI%\n", + " MORS_SUBBASIN\n", + " HELCOM_SUBBASIN\n", + " SUM_LINK\n", + " DATE_OF_ENTRY_y\n", " \n", " \n", " \n", " \n", " 0\n", - " cs134\n", - " 31\n", + " SKRIL2012048\n", + " RA226\n", + " NaN\n", + " NaN\n", + " 35.0\n", + " 26.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 08/20/14 00:00:00\n", + " ...\n", + " 20.0\n", + " 0.006\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " NaN\n", + " 08/20/14 00:00:00\n", " \n", " \n", " 1\n", - " k40\n", - " 4\n", - " \n", - " \n", - " 2\n", - " co60\n", - " 9\n", + " SKRIL2012049\n", + " RA226\n", + " NaN\n", + " NaN\n", + " 36.0\n", + " 22.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 08/20/14 00:00:00\n", + " ...\n", + " 27.0\n", + " 0.006\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " NaN\n", + " 08/20/14 00:00:00\n", " \n", " \n", - " 3\n", - " cs137\n", - " 33\n", + " 186\n", + " SKRIL2012048\n", + " CS137\n", + " NaN\n", + " NaN\n", + " 3.0\n", + " 33.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 08/20/14 00:00:00\n", + " ...\n", + " 20.0\n", + " 0.006\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " NaN\n", + " 08/20/14 00:00:00\n", " \n", " \n", - " 4\n", - " cs134\n", - " 31\n", + " 187\n", + " SKRIL2012049\n", + " CS137\n", + " NaN\n", + " <\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 08/20/14 00:00:00\n", + " ...\n", + " 27.0\n", + " 0.006\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 
11.0\n", + " 11.0\n", + " NaN\n", + " 08/20/14 00:00:00\n", " \n", " \n", - " ...\n", - " ...\n", + " 562\n", + " SKRIL2012048\n", + " RA228\n", + " NaN\n", + " NaN\n", + " 60.0\n", + " 20.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 08/20/14 00:00:00\n", " ...\n", + " 20.0\n", + " 0.006\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " NaN\n", + " 08/20/14 00:00:00\n", " \n", " \n", - " 15822\n", - " k40\n", - " 4\n", + " 563\n", + " SKRIL2012049\n", + " RA228\n", + " NaN\n", + " NaN\n", + " 59.0\n", + " 20.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 08/20/14 00:00:00\n", + " ...\n", + " 27.0\n", + " 0.006\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " NaN\n", + " 08/20/14 00:00:00\n", " \n", " \n", - " 15823\n", - " cs137\n", - " 33\n", - " \n", - " \n", - " 15824\n", - " be7\n", - " 2\n", - " \n", - " \n", - " 15825\n", - " k40\n", - " 4\n", + " 825\n", + " SKRIL2012048\n", + " K40\n", + " NaN\n", + " NaN\n", + " 980.0\n", + " 20.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 08/20/14 00:00:00\n", + " ...\n", + " 20.0\n", + " 0.006\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " NaN\n", + " 08/20/14 00:00:00\n", " \n", " \n", - " 15826\n", - " cs137\n", - " 33\n", + " 826\n", + " SKRIL2012049\n", + " K40\n", + " NaN\n", + " NaN\n", + " 950.0\n", + " 20.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 08/20/14 00:00:00\n", + " ...\n", + " 27.0\n", + " 0.006\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " NaN\n", + " 08/20/14 00:00:00\n", " \n", " \n", "\n", - "

15827 rows × 2 columns

\n", + "

8 rows × 35 columns

\n", "" ], "text/plain": [ - " NUCLIDE nuclide_id\n", - "0 cs134 31\n", - "1 k40 4\n", - "2 co60 9\n", - "3 cs137 33\n", - "4 cs134 31\n", - "... ... ...\n", - "15822 k40 4\n", - "15823 cs137 33\n", - "15824 be7 2\n", - "15825 k40 4\n", - "15826 cs137 33\n", + " KEY NUCLIDE METHOD < VALUE_Bq/kg VALUE_Bq/kg ERROR%_kg \\\n", + "0 SKRIL2012048 RA226 NaN NaN 35.0 26.0 \n", + "1 SKRIL2012049 RA226 NaN NaN 36.0 22.0 \n", + "186 SKRIL2012048 CS137 NaN NaN 3.0 33.0 \n", + "187 SKRIL2012049 CS137 NaN < 1.0 NaN \n", + "562 SKRIL2012048 RA228 NaN NaN 60.0 20.0 \n", + "563 SKRIL2012049 RA228 NaN NaN 59.0 20.0 \n", + "825 SKRIL2012048 K40 NaN NaN 980.0 20.0 \n", + "826 SKRIL2012049 K40 NaN NaN 950.0 20.0 \n", "\n", - "[15827 rows x 2 columns]" + " < VALUE_Bq/m² VALUE_Bq/m² ERROR%_m² DATE_OF_ENTRY_x ... LOWSLI \\\n", + "0 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", + "1 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", + "186 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", + "187 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", + "562 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", + "563 NaN NaN NaN 08/20/14 00:00:00 ... 27.0 \n", + "825 NaN NaN NaN 08/20/14 00:00:00 ... 20.0 \n", + "826 NaN NaN NaN 08/20/14 00:00:00 ... 
27.0 \n", + "\n", + " AREA SEDI OXIC DW% LOI% MORS_SUBBASIN HELCOM_SUBBASIN SUM_LINK \\\n", + "0 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "1 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "186 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "187 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "562 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "563 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "825 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "826 0.006 NaN NaN NaN NaN 11.0 11.0 NaN \n", + "\n", + " DATE_OF_ENTRY_y \n", + "0 08/20/14 00:00:00 \n", + "1 08/20/14 00:00:00 \n", + "186 08/20/14 00:00:00 \n", + "187 08/20/14 00:00:00 \n", + "562 08/20/14 00:00:00 \n", + "563 08/20/14 00:00:00 \n", + "825 08/20/14 00:00:00 \n", + "826 08/20/14 00:00:00 \n", + "\n", + "[8 rows x 35 columns]" ] }, "execution_count": null, @@ -2030,213 +1671,84 @@ ], "source": [ "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE'),\n", - " RemapNuclideNameCB(lut_nuclides),\n", - " AddNuclideIdColumnCB(col_value='NUCLIDE')\n", - " ])\n", - "dfs_out = tfm()\n", - "\n", - "# For instance\n", - "dfs_out['biota'][['NUCLIDE', 'nuclide_id']]" - ] - }, - { - "cell_type": "markdown", - "id": "02e9e1f4", - "metadata": {}, - "source": [ - "## Standardize Time" - ] - }, - { - "cell_type": "markdown", - "id": "24856dc5", - "metadata": {}, - "source": [ - ":::{.callout-tip}\n", - "\n", - "**FEEDBACK TO DATA PROVIDER**: Time/date is provide in the `DATE`, `YEAR`\n", - ", `MONTH`, `DAY` columns. Note that the `DATE` contains missing values as indicated below. When missing, we fallback on the `YEAR`, `MONTH`, `DAY` columns. Note also that sometimes `DAY` and `MONTH` contain 0. 
In this case we systematically set them to 1.\n", - "\n", - ":::" + "# SKRIL2012048, SSSSM2021030\n", + "df = dfs['sediment'][dfs['sediment']['KEY'].isin(['SKRIL2012048', 'SKRIL2012049'])]; df" ] }, { "cell_type": "code", "execution_count": null, - "id": "612873e6", + "id": "16f99ba4", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "seawater DATE null values: 502\n", - "sediment DATE null values: 741\n", - "biota DATE null values: 72\n" - ] + "data": { + "text/plain": [ + "Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',\n", + " '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',\n", + " 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',\n", + " 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',\n", + " 'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',\n", + " 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "for key in dfs.keys():\n", - " print(f'{key} DATE null values: ', dfs[key]['DATE'].isna().sum())" + "df.columns" ] }, { "cell_type": "code", "execution_count": null, - "id": "ae547a0c", + "id": "07c10fa1", "metadata": {}, "outputs": [], "source": [ - "#| exports\n", - "class ParseTimeCB(Callback):\n", - " \"Parse and standardize time information in the dataframe.\"\n", - " def __call__(self, tfm: Transformer):\n", - " for df in tfm.dfs.values():\n", - " self._process_dates(df)\n", - " self._define_beg_period(df)\n", - "\n", - " def _process_dates(self, df: pd.DataFrame) -> None:\n", - " \"Process and correct date and time information in the DataFrame.\"\n", - " df['time'] = self._parse_date(df)\n", - " self._handle_missing_dates(df)\n", - " self._fill_missing_time(df)\n", - 
"\n", - " def _parse_date(self, df: pd.DataFrame) -> pd.Series:\n", - " \"Parse the DATE column if present.\"\n", - " return pd.to_datetime(df['DATE'], format='%m/%d/%y %H:%M:%S', errors='coerce')\n", - "\n", - " def _handle_missing_dates(self, df: pd.DataFrame):\n", - " \"Handle cases where DAY or MONTH is 0 or missing.\"\n", - " df.loc[df[\"DAY\"] == 0, \"DAY\"] = 1\n", - " df.loc[df[\"MONTH\"] == 0, \"MONTH\"] = 1\n", - " \n", - " missing_day_month = (df[\"DAY\"].isna()) & (df[\"MONTH\"].isna()) & (df[\"YEAR\"].notna())\n", - " df.loc[missing_day_month, [\"DAY\", \"MONTH\"]] = 1\n", - "\n", - " def _fill_missing_time(self, df: pd.DataFrame) -> None:\n", - " \"Fill missing time values using YEAR, MONTH, and DAY columns.\"\n", - " missing_time = df['time'].isna()\n", - " df.loc[missing_time, 'time'] = pd.to_datetime(\n", - " df.loc[missing_time, ['YEAR', 'MONTH', 'DAY']], \n", - " format='%Y%m%d', \n", - " errors='coerce'\n", - " )\n", - "\n", - " def _define_beg_period(self, df: pd.DataFrame) -> None:\n", - " \"Create a standardized date representation for Open Refine.\"\n", - " df['begperiod'] = df['time']" + "# coi_val = {'seawater' : {'val': 'VALUE_Bq/m³'},\n", + "# 'biota': {'val': 'VALUE_Bq/kg'},\n", + "# 'sediment': {'val': 'VALUE_Bq/kg'}}" ] }, { - "cell_type": "markdown", - "id": "48c34819", + "cell_type": "code", + "execution_count": null, + "id": "3585d711", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',\n", + " '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',\n", + " 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',\n", + " 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',\n", + " 'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',\n", + " 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')" + ] + }, 
+ "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Apply the transformer for callbacks `ParseTimeCB`. Then, print the ``begperiod`` and `time` data for `seawater`." + "#| eval: false\n", + "df.columns" ] }, { "cell_type": "code", "execution_count": null, - "id": "f2b90d07", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", - "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", - "\n", - " begperiod time\n", - "0 2012-05-23 2012-05-23\n", - "1 2012-05-23 2012-05-23\n", - "2 2012-06-17 2012-06-17\n", - "3 2012-05-24 2012-05-24\n", - "4 2012-05-24 2012-05-24\n", - "... ... ...\n", - "21211 2021-10-15 2021-10-15\n", - "21212 2021-11-04 2021-11-04\n", - "21213 2021-10-15 2021-10-15\n", - "21214 2021-05-17 2021-05-17\n", - "21215 2021-05-13 2021-05-13\n", - "\n", - "[21216 rows x 2 columns]\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[ParseTimeCB(),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", - "tfm()\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", - "print(tfm.dfs['seawater'][['begperiod','time']])" - ] - }, - { - "cell_type": "markdown", - "id": "28dd488a", - "metadata": {}, - "source": [ - "NetCDF time format requires the time to be encoded as number of milliseconds since a time of origin. In our case the time of origin is `1970-01-01` as indicated in `configs.ipynb` `CONFIFS['units']['time']` dictionary." - ] - }, - { - "cell_type": "markdown", - "id": "486b2966", - "metadata": {}, - "source": [ - "`EncodeTimeCB` converts the HELCOM `time` format to the MARIS NetCDF `time` format." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b8edc56", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8 of 21216 entries for `time` are invalid for seawater.\n", - "1 of 39817 entries for `time` are invalid for sediment.\n", - " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21208 39816 15827\n", - "Number of dropped rows 8 1 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", - "\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[ParseTimeCB(),\n", - " EncodeTimeCB(cfg(), verbose=True),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", - "tfm()\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0be521de", + "id": "04be1fb8", "metadata": {}, "outputs": [ { @@ -2261,352 +1773,137 @@ " \n", " \n", " KEY\n", + " LATITUDE (ddmmmm)\n", + " LATITUDE (dddddd)\n", + " DATE\n", " NUCLIDE\n", - " METHOD\n", - " < VALUE_Bq/m³\n", - " VALUE_Bq/m³\n", - " ERROR%_m³\n", - " DATE_OF_ENTRY_x\n", - " COUNTRY\n", - " LABORATORY\n", + " VALUE_Bq/kg\n", " SEQUENCE\n", - " ...\n", - " TDEPTH\n", - " SDEPTH\n", - " SALIN\n", - " TTEMP\n", - " FILT\n", - " MORS_SUBBASIN\n", - " HELCOM_SUBBASIN\n", - " DATE_OF_ENTRY_y\n", - " time\n", - " begperiod\n", + " UPPSLI\n", + " LOWSLI\n", " \n", " \n", " \n", " \n", " 0\n", - " WKRIL2012003\n", - " CS137\n", - " NaN\n", - " NaN\n", - " 5.3\n", - " 32.000000\n", - " 08/20/14 00:00:00\n", - " 90.0\n", - " KRIL\n", - " 2012003.0\n", - " ...\n", - " NaN\n", - " 0.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 11.0\n", - " 11.0\n", - " 08/20/14 00:00:00\n", - " 1337731200\n", - " 2012-05-23\n", + " SKRIL2012048\n", + " 59.4\n", + " 59,6667\n", + " 06/17/12 00:00:00\n", + " RA226\n", + " 35.0\n", + " 2012048.0\n", + " 15.0\n", + " 
20.0\n", " \n", " \n", " 1\n", - " WKRIL2012004\n", - " CS137\n", - " NaN\n", - " NaN\n", - " 19.9\n", - " 20.000000\n", - " 08/20/14 00:00:00\n", - " 90.0\n", - " KRIL\n", - " 2012004.0\n", - " ...\n", - " NaN\n", - " 29.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 11.0\n", - " 11.0\n", - " 08/20/14 00:00:00\n", - " 1337731200\n", - " 2012-05-23\n", + " SKRIL2012049\n", + " 59.4\n", + " 59,6667\n", + " 06/17/12 00:00:00\n", + " RA226\n", + " 36.0\n", + " 2012049.0\n", + " 20.0\n", + " 27.0\n", " \n", " \n", - " 2\n", - " WKRIL2012005\n", + " 186\n", + " SKRIL2012048\n", + " 59.4\n", + " 59,6667\n", + " 06/17/12 00:00:00\n", " CS137\n", - " NaN\n", - " NaN\n", - " 25.5\n", - " 20.000000\n", - " 08/20/14 00:00:00\n", - " 90.0\n", - " KRIL\n", - " 2012005.0\n", - " ...\n", - " NaN\n", - " 0.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 11.0\n", " 3.0\n", - " 08/20/14 00:00:00\n", - " 1339891200\n", - " 2012-06-17\n", - " \n", - " \n", - " 3\n", - " WKRIL2012006\n", - " CS137\n", - " NaN\n", - " NaN\n", - " 17.0\n", - " 29.000000\n", - " 08/20/14 00:00:00\n", - " 90.0\n", - " KRIL\n", - " 2012006.0\n", - " ...\n", - " NaN\n", - " 0.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 11.0\n", - " 11.0\n", - " 08/20/14 00:00:00\n", - " 1337817600\n", - " 2012-05-24\n", + " 2012048.0\n", + " 15.0\n", + " 20.0\n", " \n", " \n", - " 4\n", - " WKRIL2012007\n", + " 187\n", + " SKRIL2012049\n", + " 59.4\n", + " 59,6667\n", + " 06/17/12 00:00:00\n", " CS137\n", - " NaN\n", - " NaN\n", - " 22.2\n", - " 18.000000\n", - " 08/20/14 00:00:00\n", - " 90.0\n", - " KRIL\n", - " 2012007.0\n", - " ...\n", - " NaN\n", - " 39.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 11.0\n", - " 11.0\n", - " 08/20/14 00:00:00\n", - " 1337817600\n", - " 2012-05-24\n", + " 1.0\n", + " 2012049.0\n", + " 20.0\n", + " 27.0\n", " \n", " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " 
...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 21211\n", - " WSSSM2021005\n", - " H3\n", - " SSM45\n", - " NaN\n", - " 1030.0\n", - " 93.203883\n", - " 09/06/22 00:00:00\n", - " 77.0\n", - " SSSM\n", - " 202105.0\n", - " ...\n", - " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " N\n", - " 1.0\n", - " 8.0\n", - " 09/06/22 00:00:00\n", - " 1634256000\n", - " 2021-10-15\n", - " \n", - " \n", - " 21212\n", - " WSSSM2021006\n", - " H3\n", - " SSM45\n", - " NaN\n", - " 2240.0\n", - " 43.303571\n", - " 09/06/22 00:00:00\n", - " 77.0\n", - " SSSM\n", - " 202106.0\n", - " ...\n", - " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " N\n", - " 10.0\n", - " 10.0\n", - " 09/06/22 00:00:00\n", - " 1635984000\n", - " 2021-11-04\n", + " 562\n", + " SKRIL2012048\n", + " 59.4\n", + " 59,6667\n", + " 06/17/12 00:00:00\n", + " RA228\n", + " 60.0\n", + " 2012048.0\n", + " 15.0\n", + " 20.0\n", " \n", " \n", - " 21213\n", - " WSSSM2021007\n", - " H3\n", - " SSM45\n", - " NaN\n", - " 2060.0\n", - " 47.087379\n", - " 09/06/22 00:00:00\n", - " 77.0\n", - " SSSM\n", - " 202107.0\n", - " ...\n", - " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " N\n", - " 12.0\n", - " 12.0\n", - " 09/06/22 00:00:00\n", - " 1634256000\n", - " 2021-10-15\n", + " 563\n", + " SKRIL2012049\n", + " 59.4\n", + " 59,6667\n", + " 06/17/12 00:00:00\n", + " RA228\n", + " 59.0\n", + " 2012049.0\n", + " 20.0\n", + " 27.0\n", " \n", " \n", - " 21214\n", - " WSSSM2021008\n", - " H3\n", - " SSM45\n", - " NaN\n", - " 2300.0\n", - " 43.478261\n", - " 09/06/22 00:00:00\n", - " 77.0\n", - " SSSM\n", - " 202108.0\n", - " ...\n", - " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " N\n", - " 12.0\n", - " 12.0\n", - " 09/06/22 00:00:00\n", - " 1621209600\n", - " 2021-05-17\n", + " 825\n", + " SKRIL2012048\n", + " 59.4\n", + " 59,6667\n", + " 06/17/12 00:00:00\n", + " K40\n", + " 980.0\n", + " 2012048.0\n", + " 15.0\n", + " 20.0\n", " \n", " \n", - " 21215\n", - " 
WSSSM2021004\n", - " H3\n", - " SSM45\n", - " <\n", - " NaN\n", - " NaN\n", - " 09/06/22 00:00:00\n", - " 77.0\n", - " SSSM\n", - " 202104.0\n", - " ...\n", - " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " N\n", - " 15.0\n", - " 18.0\n", - " 09/06/22 00:00:00\n", - " 1620864000\n", - " 2021-05-13\n", + " 826\n", + " SKRIL2012049\n", + " 59.4\n", + " 59,6667\n", + " 06/17/12 00:00:00\n", + " K40\n", + " 950.0\n", + " 2012049.0\n", + " 20.0\n", + " 27.0\n", " \n", " \n", "\n", - "

21208 rows × 29 columns

\n", "" ], "text/plain": [ - " KEY NUCLIDE METHOD < VALUE_Bq/m³ VALUE_Bq/m³ ERROR%_m³ \\\n", - "0 WKRIL2012003 CS137 NaN NaN 5.3 32.000000 \n", - "1 WKRIL2012004 CS137 NaN NaN 19.9 20.000000 \n", - "2 WKRIL2012005 CS137 NaN NaN 25.5 20.000000 \n", - "3 WKRIL2012006 CS137 NaN NaN 17.0 29.000000 \n", - "4 WKRIL2012007 CS137 NaN NaN 22.2 18.000000 \n", - "... ... ... ... ... ... ... \n", - "21211 WSSSM2021005 H3 SSM45 NaN 1030.0 93.203883 \n", - "21212 WSSSM2021006 H3 SSM45 NaN 2240.0 43.303571 \n", - "21213 WSSSM2021007 H3 SSM45 NaN 2060.0 47.087379 \n", - "21214 WSSSM2021008 H3 SSM45 NaN 2300.0 43.478261 \n", - "21215 WSSSM2021004 H3 SSM45 < NaN NaN \n", - "\n", - " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... TDEPTH SDEPTH \\\n", - "0 08/20/14 00:00:00 90.0 KRIL 2012003.0 ... NaN 0.0 \n", - "1 08/20/14 00:00:00 90.0 KRIL 2012004.0 ... NaN 29.0 \n", - "2 08/20/14 00:00:00 90.0 KRIL 2012005.0 ... NaN 0.0 \n", - "3 08/20/14 00:00:00 90.0 KRIL 2012006.0 ... NaN 0.0 \n", - "4 08/20/14 00:00:00 90.0 KRIL 2012007.0 ... NaN 39.0 \n", - "... ... ... ... ... ... ... ... \n", - "21211 09/06/22 00:00:00 77.0 SSSM 202105.0 ... NaN 1.0 \n", - "21212 09/06/22 00:00:00 77.0 SSSM 202106.0 ... NaN 1.0 \n", - "21213 09/06/22 00:00:00 77.0 SSSM 202107.0 ... NaN 1.0 \n", - "21214 09/06/22 00:00:00 77.0 SSSM 202108.0 ... NaN 1.0 \n", - "21215 09/06/22 00:00:00 77.0 SSSM 202104.0 ... NaN 1.0 \n", - "\n", - " SALIN TTEMP FILT MORS_SUBBASIN HELCOM_SUBBASIN DATE_OF_ENTRY_y \\\n", - "0 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", - "1 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", - "2 NaN NaN NaN 11.0 3.0 08/20/14 00:00:00 \n", - "3 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", - "4 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", - "... ... ... ... ... ... ... 
\n", - "21211 NaN NaN N 1.0 8.0 09/06/22 00:00:00 \n", - "21212 NaN NaN N 10.0 10.0 09/06/22 00:00:00 \n", - "21213 NaN NaN N 12.0 12.0 09/06/22 00:00:00 \n", - "21214 NaN NaN N 12.0 12.0 09/06/22 00:00:00 \n", - "21215 NaN NaN N 15.0 18.0 09/06/22 00:00:00 \n", - "\n", - " time begperiod \n", - "0 1337731200 2012-05-23 \n", - "1 1337731200 2012-05-23 \n", - "2 1339891200 2012-06-17 \n", - "3 1337817600 2012-05-24 \n", - "4 1337817600 2012-05-24 \n", - "... ... ... \n", - "21211 1634256000 2021-10-15 \n", - "21212 1635984000 2021-11-04 \n", - "21213 1634256000 2021-10-15 \n", - "21214 1621209600 2021-05-17 \n", - "21215 1620864000 2021-05-13 \n", + " KEY LATITUDE (ddmmmm) LATITUDE (dddddd) DATE \\\n", + "0 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "1 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "186 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "187 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "562 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "563 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "825 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "826 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", "\n", - "[21208 rows x 29 columns]" + " NUCLIDE VALUE_Bq/kg SEQUENCE UPPSLI LOWSLI \n", + "0 RA226 35.0 2012048.0 15.0 20.0 \n", + "1 RA226 36.0 2012049.0 20.0 27.0 \n", + "186 CS137 3.0 2012048.0 15.0 20.0 \n", + "187 CS137 1.0 2012049.0 20.0 27.0 \n", + "562 RA228 60.0 2012048.0 15.0 20.0 \n", + "563 RA228 59.0 2012049.0 20.0 27.0 \n", + "825 K40 980.0 2012048.0 15.0 20.0 \n", + "826 K40 950.0 2012049.0 20.0 27.0 " ] }, "execution_count": null, @@ -2615,255 +1912,456 @@ } ], "source": [ - "tfm.dfs['seawater']" + "#| eval: false\n", + "# Same key => several nuclides\n", + "# Same lat, lon, date, nuclide => different values\n", + "# lat, lon, date, nuclide, 'upsli', 'lowsli' should be unique\n", + "df[['KEY', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'NUCLIDE', 'VALUE_Bq/kg', 'SEQUENCE', 'UPPSLI', 'LOWSLI']]" ] }, { - 
"cell_type": "markdown", - "id": "69ef4f4b", - "metadata": {}, - "source": [ - "## Sanitize value" - ] - }, - { - "cell_type": "markdown", - "id": "6de49e39", + "cell_type": "code", + "execution_count": null, + "id": "dbf47fbe", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KEYLATITUDE (ddmmmm)LATITUDE (dddddd)DATENUCLIDEVALUE_Bq/kgSEQUENCEUPPSLILOWSLI
0SKRIL201204859.459,666706/17/12 00:00:00RA22635.02012048.015.020.0
1SKRIL201204959.459,666706/17/12 00:00:00RA22636.02012049.020.027.0
186SKRIL201204859.459,666706/17/12 00:00:00CS1373.02012048.015.020.0
187SKRIL201204959.459,666706/17/12 00:00:00CS1371.02012049.020.027.0
562SKRIL201204859.459,666706/17/12 00:00:00RA22860.02012048.015.020.0
563SKRIL201204959.459,666706/17/12 00:00:00RA22859.02012049.020.027.0
825SKRIL201204859.459,666706/17/12 00:00:00K40980.02012048.015.020.0
826SKRIL201204959.459,666706/17/12 00:00:00K40950.02012049.020.027.0
\n", + "
" + ], + "text/plain": [ + " KEY LATITUDE (ddmmmm) LATITUDE (dddddd) DATE \\\n", + "0 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "1 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "186 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "187 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "562 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "563 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "825 SKRIL2012048 59.4 59,6667 06/17/12 00:00:00 \n", + "826 SKRIL2012049 59.4 59,6667 06/17/12 00:00:00 \n", + "\n", + " NUCLIDE VALUE_Bq/kg SEQUENCE UPPSLI LOWSLI \n", + "0 RA226 35.0 2012048.0 15.0 20.0 \n", + "1 RA226 36.0 2012049.0 20.0 27.0 \n", + "186 CS137 3.0 2012048.0 15.0 20.0 \n", + "187 CS137 1.0 2012049.0 20.0 27.0 \n", + "562 RA228 60.0 2012048.0 15.0 20.0 \n", + "563 RA228 59.0 2012049.0 20.0 27.0 \n", + "825 K40 980.0 2012048.0 15.0 20.0 \n", + "826 K40 950.0 2012049.0 20.0 27.0 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "We allocate each column containing measurement values (named differently across sample types as `unit` are mentioned as well in column names) into a single column `value` and remove NA where needed." 
+ "#| eval: false\n", + "# Same sample, several nuclides\n", + "df_test = df[['KEY', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'NUCLIDE', 'VALUE_Bq/kg', 'SEQUENCE', 'UPPSLI', 'LOWSLI']]; df_test\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "8580f592", + "id": "23031b91", "metadata": {}, "outputs": [], "source": [ - "#| exports\n", - "coi_val = {'seawater' : {'val': 'VALUE_Bq/m³'},\n", - " 'biota': {'val': 'VALUE_Bq/kg'},\n", - " 'sediment': {'val': 'VALUE_Bq/kg'}}" + "#| eval: false\n", + "# df_test.pivot_table(index=['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'YEAR', 'MONTH', 'DAY'],\n", + "# columns='NUCLIDE',\n", + "# values='VALUE_Bq/kg',\n", + "# fill_value=np.nan,\n", + "# aggfunc=lambda x: x).reset_index()\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "def0a599", + "id": "53cdf8c0", "metadata": {}, "outputs": [], "source": [ - "#| exports\n", - "class SanitizeValue(Callback):\n", - " \"Sanitize value/measurement by removing blank entries and populating `value` column.\"\n", - " def __init__(self, \n", - " coi: Dict[str, Dict[str, str]] # Columns of interest. 
Format: {group_name: {'val': 'column_name'}}\n", - " ): \n", - " fc.store_attr()\n", + "# #| eval: false\n", + "# # Preprocess the data\n", + "# df_test['VALUE_Bq/kg'] = df_test['VALUE_Bq/kg'].fillna(-999)\n", "\n", - " def __call__(self, tfm: Transformer):\n", - " for grp, df in tfm.dfs.items():\n", - " value_col = self.coi[grp]['val']\n", - " df.dropna(subset=[value_col], inplace=True)\n", - " df['value'] = df[value_col]" + "# # Then pivot\n", + "# pivoted = df_test.pivot_table(index=['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'DATE', 'YEAR', 'MONTH', 'DAY'],\n", + "# columns='NUCLIDE',\n", + "# values='VALUE_Bq/kg',\n", + "# aggfunc='first').reset_index()\n", + "\n", + "# # Replace -999 with 'Below Detection Limit' or any other indicator\n", + "# pivoted = pivoted.replace(-999, np.nan)\n", + "# pivoted" ] }, { "cell_type": "code", "execution_count": null, - "id": "bccb7a50", + "id": "fc75cf0f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21122 39532 15798\n", - "Number of dropped rows 94 285 29\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", - "\n" - ] - } - ], + "outputs": [], "source": [ "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[SanitizeValue(coi_val),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", - "\n", - "tfm()\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" + "# dfs['sediment'][['LATITUDE (ddmmm)', 'LONGITUDE (ddmmm)', '']].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30bd602a", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "# dfs['sediment'].head()" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "be199c49", + "id": "5687eade", "metadata": {}, "source": [ - "## Normalize uncertainty" + "## Add sample type column" ] }, { "cell_type": 
"markdown", - "id": "7515714b", + "id": "a984410e", "metadata": {}, "source": [ - "Function `unc_rel2stan` converts uncertainty from relative uncertainty to standard uncertainty." + "The sample type (`seawater`, `biota`, `sediment`, ...) as defined in the `configs.ipynb` are encoded group names in NetCDF produced. Addition of sample type ids into individual dataframes is done using the `AddSampleTypeIdColumnCB` callback for legacy purposes (i.e. Open Refine output)." ] }, { "cell_type": "code", "execution_count": null, - "id": "76077d40", + "id": "cf5ba759", "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "def unc_rel2stan(\n", - " df: pd.DataFrame, # DataFrame containing measurement and uncertainty columns\n", - " meas_col: str, # Name of the column with measurement values\n", - " unc_col: str # Name of the column with relative uncertainty values (percentages)\n", - ") -> pd.Series: # Series with calculated absolute uncertainties\n", - " \"Convert relative uncertainty to absolute uncertainty.\"\n", - " return df.apply(lambda row: row[unc_col] * row[meas_col] / 100, axis=1)" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " KEY samptype_id\n", + "0 WKRIL2012003 1\n", + "1 WKRIL2012004 1\n", + "2 WKRIL2012005 1\n", + "3 WKRIL2012006 1\n", + "4 WKRIL2012007 1\n", + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of dropped rows 0 0 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "print(tfm()['seawater'][['KEY', 'samptype_id']].head())\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "2917d107", + "id": "142ddab3", 
"metadata": {}, "source": [ - "For each sample type in the Helcom dataset, the uncertainty is given as a relative uncertainty. The column names for both the value and the uncertainty vary by sample type. The coi_units_unc dictionary defines the column names for the Value and Uncertainty for each sample type." + "## Normalize nuclide names" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "b231b09b", + "attachments": {}, + "cell_type": "markdown", + "id": "8a2311cd", "metadata": {}, - "outputs": [], "source": [ - "#| exports\n", - "# Columns of interest\n", - "coi_units_unc = [('seawater', 'VALUE_Bq/m³', 'ERROR%_m³'),\n", - " ('biota', 'VALUE_Bq/kg', 'ERROR%'),\n", - " ('sediment', 'VALUE_Bq/kg', 'ERROR%_kg')]" + "### Lower & strip nuclide names" ] }, { "cell_type": "markdown", - "id": "f20c9a4b", + "id": "4b7b4ceb", "metadata": {}, "source": [ - "NormalizeUncCB callback normalizes the uncertainty by converting from relative uncertainty to standard uncertainty. " + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: Some nuclide names contain one or multiple trailing spaces.\n", + "\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "id": "64d84ed7", + "metadata": {}, + "source": [ + "This is demonstrated below for the `NUCLIDE` column:" ] }, { "cell_type": "code", "execution_count": null, - "id": "5cf262ed", + "id": "9a2306ba", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " index value n_chars stripped_chars\n", + "12 12 SR90 5 4\n", + "18 18 CO60 8 4\n", + "20 20 SR90 6 4\n", + "24 24 CS137 9 5\n", + "27 27 CS134 8 5\n", + "43 43 K40 8 3\n", + "44 44 AM241 8 5\n", + "50 50 SR90 8 4\n", + "52 52 PU238 8 5\n", + "54 54 SR90 7 4\n", + "55 55 CS137 6 5\n", + "78 78 TC99 7 4\n", + "81 81 CS137 8 5\n" + ] + } + ], "source": [ - "#| exports\n", - "class NormalizeUncCB(Callback):\n", - " \"Convert from relative error % to uncertainty of activity unit.\"\n", - " def 
__init__(self, \n", - " fn_convert_unc: Callable=unc_rel2stan, # Function converting relative uncertainty to absolute uncertainty\n", - " coi: List[Tuple[str, str, str]]=coi_units_unc # List of columns of interest\n", - " ):\n", - " fc.store_attr()\n", - " \n", - " def __call__(self, tfm: Transformer):\n", - " for grp, val, unc in self.coi:\n", - " if grp in tfm.dfs:\n", - " df = tfm.dfs[grp]\n", - " df['uncertainty'] = self.fn_convert_unc(df, val, unc)" + "#| eval: false\n", + "df = get_unique_across_dfs(load_data(fname_in), 'NUCLIDE', as_df=True, include_nchars=True)\n", + "df['stripped_chars'] = df['value'].str.strip().str.replace(' ', '').str.len()\n", + "print(df[df['n_chars'] != df['stripped_chars']])" ] }, { "cell_type": "markdown", - "id": "8545b262", + "id": "518174ba", "metadata": {}, "source": [ - "Apply the transformer for callback NormalizeUncCB(). Then, print the value (i.e. activity per unit ) and standard uncertainty for each sample type." + "To fix this issue, we use the `LowerStripNameCB` callback. For each dataframe in the dictionary of dataframes, it corrects the nuclide name by converting it lowercase, striping any leading or trailing whitespace(s) and ensuring the number comes before letters (e.g. `137cs`)." 
] }, { "cell_type": "code", "execution_count": null, - "id": "fd9e14e2", + "id": "8a3fa068", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " value uncertainty\n", - "0 5.3 1.696\n", - "1 19.9 3.980\n", - "2 25.5 5.100\n", - "3 17.0 4.930\n", - "4 22.2 3.996\n", - " value uncertainty\n", - "0 0.010140 NaN\n", - "1 135.300000 4.830210\n", - "2 0.013980 NaN\n", - "3 4.338000 0.150962\n", - "4 0.009614 NaN\n", - " value uncertainty\n", - "0 35.0 9.10\n", - "1 36.0 7.92\n", - "2 38.0 9.12\n", - "3 36.0 9.00\n", - "4 30.0 6.90\n" + "seawater nuclides: \n", + "['cs137' 'sr90' 'h3' 'cs134' 'pu238' 'pu239240' 'am241' 'cm242' 'cm244'\n", + " 'tc99' 'k40' 'ru103' 'sr89' 'sb125' 'nb95' 'ru106' 'zr95' 'ag110m'\n", + " 'cm243244' 'ba140' 'ce144' 'u234' 'u238' 'co60' 'pu239' 'pb210' 'po210'\n", + " 'np237' 'pu240' 'mn54']\n", + "sediment nuclides: \n", + "['ra226' 'cs137' 'ra228' 'k40' 'sr90' 'cs134137' 'cs134' 'pu239240'\n", + " 'pu238' 'co60' 'ru103' 'ru106' 'sb125' 'ag110m' 'ce144' 'am241' 'be7'\n", + " 'th228' 'pb210' 'co58' 'mn54' 'zr95' 'ba140' 'po210' 'ra224' 'nb95'\n", + " 'pu238240' 'pu241' 'pu239' 'eu155' 'ir192' 'th232' 'cd109' 'sb124' 'zn65'\n", + " 'th234' 'tl208' 'pb212' 'pb214' 'bi214' 'ac228' 'ra223' 'u235' 'bi212']\n", + "biota nuclides: \n", + "['cs134' 'k40' 'co60' 'cs137' 'sr90' 'ag108m' 'mn54' 'co58' 'ag110m'\n", + " 'zn65' 'sb125' 'pu239240' 'ru106' 'be7' 'ce144' 'pb210' 'po210' 'sb124'\n", + " 'sr89' 'zr95' 'te129m' 'ru103' 'nb95' 'ce141' 'la140' 'i131' 'ba140'\n", + " 'pu238' 'u235' 'bi214' 'pb214' 'pb212' 'tl208' 'ac228' 'ra223' 'eu155'\n", + " 'ra226' 'gd153' 'sn113' 'fe59' 'tc99' 'co57' 'sn117m' 'eu152' 'sc46'\n", + " 'rb86' 'ra224' 'th232' 'cs134137' 'am241' 'ra228' 'th228' 'k-40' 'cs138'\n", + " 'cs139' 'cs140' 'cs141' 'cs142' 'cs143' 'cs144' 'cs145' 'cs146']\n" ] } ], "source": [ "#| eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[NormalizeUncCB(),\n", - " 
SanitizeValue(coi_val)])\n", + "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE')])\n", "\n", - "print(tfm()['seawater'][['value', 'uncertainty']][:5])\n", - "print(tfm()['biota'][['value', 'uncertainty']][:5])\n", - "print(tfm()['sediment'][['value', 'uncertainty']][:5])" + "for key in tfm().keys():\n", + " print(f'{key} nuclides: ')\n", + " print(tfm()[key]['NUCLIDE'].unique())" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "9392b0cb", + "id": "52c9d0fe", "metadata": {}, "source": [ - "## Remap Biota species" + "### Remap nuclide names to MARIS data formats" ] }, { "cell_type": "markdown", - "id": "abd63300", + "id": "a58baf14", "metadata": {}, "source": [ - "We follow in the next following processing steps the same approach as for remapping of nuclide names above." + "We below map nuclide names used by HELCOM to the MARIS standard nuclide names. \n", + "\n", + "Remapping data provider nomenclatures into MARIS standards is one recurrent operation and is done in a semi-automated manner according to the following pattern:\n", + "\n", + "1. **Inspect** data provider nomenclature:\n", + "2. **Match** automatically against MARIS nomenclature (using a fuzzy matching algorithm); \n", + "3. **Fix** potential mismatches; \n", + "4. **Apply** the lookup table to the dataframe.\n", + "\n", + "As now on, we will use this pattern to remap the HELCOM data provider nomenclatures into MARIS standards and name it for the sake of brevity **IMFA** (**I**nspect, **M**atch, **F**ix, **A**pply)." ] }, { "cell_type": "markdown", - "id": "02e7dbf2", + "id": "ae4b31bc", "metadata": {}, "source": [ - "Let's inspect the `RUBIN_NAME.csv` file provided by HELCOM describing the biota species nomenclature." + "The unique values of the data provider nuclide names. The `get_unique_across_dfs` is a utility function allowing to retrieve unique values of a specific column across all dataframes (please remind that we have one dataframe per sample type - biota, ...)." 
] }, { "cell_type": "code", "execution_count": null, - "id": "eb121e8b", + "id": "e32ee8d0", "metadata": {}, "outputs": [ { @@ -2887,59 +2385,47 @@ " \n", " \n", " \n", - " RUBIN_ID\n", - " RUBIN\n", - " SCIENTIFIC NAME\n", - " ENGLISH NAME\n", + " index\n", + " value\n", " \n", " \n", " \n", " \n", " 0\n", - " 11\n", - " ABRA BRA\n", - " ABRAMIS BRAMA\n", - " BREAM\n", + " 0\n", + " cd109\n", " \n", " \n", " 1\n", - " 12\n", - " ANGU ANG\n", - " ANGUILLA ANGUILLA\n", - " EEL\n", + " 1\n", + " co57\n", " \n", " \n", " 2\n", - " 13\n", - " ARCT ISL\n", - " ARCTICA ISLANDICA\n", - " ISLAND CYPRINE\n", + " 2\n", + " ra223\n", " \n", " \n", " 3\n", - " 14\n", - " ASTE RUB\n", - " ASTERIAS RUBENS\n", - " COMMON STARFISH\n", + " 3\n", + " rb86\n", " \n", " \n", " 4\n", - " 15\n", - " CARD EDU\n", - " CARDIUM EDULE\n", - " COCKLE\n", + " 4\n", + " gd153\n", " \n", " \n", "\n", "" ], "text/plain": [ - " RUBIN_ID RUBIN SCIENTIFIC NAME ENGLISH NAME\n", - "0 11 ABRA BRA ABRAMIS BRAMA BREAM\n", - "1 12 ANGU ANG ANGUILLA ANGUILLA EEL\n", - "2 13 ARCT ISL ARCTICA ISLANDICA ISLAND CYPRINE\n", - "3 14 ASTE RUB ASTERIAS RUBENS COMMON STARFISH\n", - "4 15 CARD EDU CARDIUM EDULE COCKLE" + " index value\n", + "0 0 cd109\n", + "1 1 co57\n", + "2 2 ra223\n", + "3 3 rb86\n", + "4 4 gd153" ] }, "execution_count": null, @@ -2949,28 +2435,57 @@ ], "source": [ "#| eval: false\n", - "pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv').head()" + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE')])\n", + "dfs_output = tfm()\n", + "\n", + "get_unique_across_dfs(dfs_output, col_name='NUCLIDE', as_df=True).head(5)" ] }, { "cell_type": "markdown", - "id": "3ec2bd53", + "id": "614c1bdf", "metadata": {}, "source": [ - "We try to remap the `SCIENTIFIC NAME` column to the `species` column of the MARIS nomenclature, again using a `Remapper` object:" + "Let's now create an instance of a fuzzy matching algorithm `Remapper`:" ] }, { "cell_type": "code", 
"execution_count": null, - "id": "da393947", + "id": "bcdbc619", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs_output, col_name='NUCLIDE', as_df=True),\n", + " maris_lut_fn=nuc_lut_path,\n", + " maris_col_id='nuclide_id',\n", + " maris_col_name='nc_name',\n", + " provider_col_to_match='value',\n", + " provider_col_key='value',\n", + " fname_cache='nuclides_helcom.pkl')" + ] + }, + { + "cell_type": "markdown", + "id": "f7e0ea0c", + "metadata": {}, + "source": [ + "And try to match HELCOM to MARIS nuclide names as automatically as possible. The `match_score` column allows to assess the results:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb645c29", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 46/46 [00:06<00:00, 6.94it/s]\n" + "Processing: 100%|██████████| 77/77 [00:01<00:00, 45.17it/s]\n" ] }, { @@ -3007,45 +2522,87 @@ " \n", " \n", " \n", - " STIZ LUC\n", - " Sander lucioperca\n", - " STIZOSTEDION LUCIOPERCA\n", - " 10\n", + " cs134137\n", + " cs137\n", + " cs134137\n", + " 3\n", " \n", " \n", - " LAMI SAC\n", - " Laminaria japonica\n", - " LAMINARIA SACCHARINA\n", - " 7\n", + " pu238240\n", + " pu240\n", + " pu238240\n", + " 3\n", " \n", " \n", - " CARD EDU\n", - " Cardiidae\n", - " CARDIUM EDULE\n", - " 6\n", + " pu239240\n", + " pu240\n", + " pu239240\n", + " 3\n", " \n", " \n", - " ENCH CIM\n", - " Echinodermata\n", - " ENCHINODERMATA CIM\n", - " 5\n", + " cm243244\n", + " cm244\n", + " cm243244\n", + " 3\n", " \n", " \n", - " PSET MAX\n", - " Pinctada maxima\n", - " PSETTA MAXIMA\n", - " 5\n", + " cs143\n", + " cs127\n", + " cs143\n", + " 2\n", " \n", " \n", - " MACO BAL\n", - " Macoma balthica\n", - " MACOMA BALTICA\n", + " cs142\n", + " ce144\n", + " cs142\n", + " 2\n", + " \n", + " \n", + " cs145\n", + " cs136\n", + " cs145\n", + " 2\n", + " \n", + " \n", + " cs138\n", + 
" cs137\n", + " cs138\n", " 1\n", " \n", " \n", - " STUC PEC\n", - " Stuckenia pectinata\n", - " STUCKENIA PECTINATE\n", + " k-40\n", + " k40\n", + " k-40\n", + " 1\n", + " \n", + " \n", + " cs139\n", + " ce139\n", + " cs139\n", + " 1\n", + " \n", + " \n", + " cs144\n", + " ce144\n", + " cs144\n", + " 1\n", + " \n", + " \n", + " cs140\n", + " ce140\n", + " cs140\n", + " 1\n", + " \n", + " \n", + " cs141\n", + " ce141\n", + " cs141\n", + " 1\n", + " \n", + " \n", + " cs146\n", + " cs136\n", + " cs146\n", " 1\n", " \n", " \n", @@ -3053,15 +2610,22 @@ "" ], "text/plain": [ - " matched_maris_name source_name match_score\n", - "source_key \n", - "STIZ LUC Sander lucioperca STIZOSTEDION LUCIOPERCA 10\n", - "LAMI SAC Laminaria japonica LAMINARIA SACCHARINA 7\n", - "CARD EDU Cardiidae CARDIUM EDULE 6\n", - "ENCH CIM Echinodermata ENCHINODERMATA CIM 5\n", - "PSET MAX Pinctada maxima PSETTA MAXIMA 5\n", - "MACO BAL Macoma balthica MACOMA BALTICA 1\n", - "STUC PEC Stuckenia pectinata STUCKENIA PECTINATE 1" + " matched_maris_name source_name match_score\n", + "source_key \n", + "cs134137 cs137 cs134137 3\n", + "pu238240 pu240 pu238240 3\n", + "pu239240 pu240 pu239240 3\n", + "cm243244 cm244 cm243244 3\n", + "cs143 cs127 cs143 2\n", + "cs142 ce144 cs142 2\n", + "cs145 cs136 cs145 2\n", + "cs138 cs137 cs138 1\n", + "k-40 k40 k-40 1\n", + "cs139 ce139 cs139 1\n", + "cs144 ce144 cs144 1\n", + "cs140 ce140 cs140 1\n", + "cs141 ce141 cs141 1\n", + "cs146 cs136 cs146 1" ] }, "execution_count": null, @@ -3071,214 +2635,185 @@ ], "source": [ "#| eval: false\n", - "remapper = Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv'),\n", - " maris_lut_fn=species_lut_path,\n", - " maris_col_id='species_id',\n", - " maris_col_name='species',\n", - " provider_col_to_match='SCIENTIFIC NAME',\n", - " provider_col_key='RUBIN',\n", - " fname_cache='species_helcom.pkl'\n", - " )\n", - "\n", "remapper.generate_lookup_table(as_df=True)\n", 
"remapper.select_match(match_score_threshold=1)" ] }, { "cell_type": "markdown", - "id": "e592a7a9", + "id": "4a5cb838", "metadata": {}, "source": [ - "We fix below some of the entries that are not properly matched by the `Remapper` object:" + "We then manually inspect the remaining unmatched names and create a fixes table to map them to the correct MARIS standards:" ] }, { "cell_type": "code", "execution_count": null, - "id": "3e31a799", + "id": "60cf885b", "metadata": {}, "outputs": [], "source": [ "#| exports\n", - "fixes_biota_species = {\n", - " 'CARDIUM EDULE': 'Cerastoderma edule',\n", - " 'LAMINARIA SACCHARINA': 'Saccharina latissima',\n", - " 'PSETTA MAXIMA': 'Scophthalmus maximus',\n", - " 'STIZOSTEDION LUCIOPERCA': 'Sander luciopercas'}" + "fixes_nuclide_names = {\n", + " 'cs134137': 'cs134_137_tot',\n", + " 'cm243244': 'cm243_244_tot',\n", + " 'pu239240': 'pu239_240_tot',\n", + " 'pu238240': 'pu238_240_tot',\n", + " 'cs143': 'cs137',\n", + " 'cs145': 'cs137',\n", + " 'cs142': 'cs137',\n", + " 'cs141': 'cs137',\n", + " 'cs144': 'cs137',\n", + " 'k-40': 'k40',\n", + " 'cs140': 'cs137',\n", + " 'cs146': 'cs137',\n", + " 'cs139': 'cs137',\n", + " 'cs138': 'cs137'\n", + " }" ] }, { "cell_type": "markdown", - "id": "f7d1d994", + "id": "dd575e7e", "metadata": {}, "source": [ - "And give it an another try:" + "Let's try to match again but this time we use the `fixes_nuclide_names` to map the nuclide names to the MARIS standards:\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "5a70225a", + "id": "73410b14", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 46/46 [00:07<00:00, 5.79it/s]\n" + "Processing: 100%|██████████| 77/77 [00:01<00:00, 49.83it/s]\n" ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
matched_maris_namesource_namematch_score
source_key
ENCH CIMEchinodermataENCHINODERMATA CIM5
MACO BALMacoma balthicaMACOMA BALTICA1
STIZ LUCSander luciopercaSTIZOSTEDION LUCIOPERCA1
STUC PECStuckenia pectinataSTUCKENIA PECTINATE1
\n", - "
" - ], - "text/plain": [ - " matched_maris_name source_name match_score\n", - "source_key \n", - "ENCH CIM Echinodermata ENCHINODERMATA CIM 5\n", - "MACO BAL Macoma balthica MACOMA BALTICA 1\n", - "STIZ LUC Sander lucioperca STIZOSTEDION LUCIOPERCA 1\n", - "STUC PEC Stuckenia pectinata STUCKENIA PECTINATE 1" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "#| eval: false\n", - "remapper.generate_lookup_table(fixes=fixes_biota_species)\n", - "remapper.select_match(match_score_threshold=1)" + "remapper.generate_lookup_table(as_df=True, fixes=fixes_nuclide_names)\n", + "fc.test_eq(len(remapper.select_match(match_score_threshold=1)), 0)" ] }, { "cell_type": "markdown", - "id": "e6f49b32", + "id": "abd1276f", "metadata": {}, "source": [ - "Visual inspection of the remaining unperfectly matched entries seem acceptable to proceed. \n", - "\n", - "We can now use the generic `RemapCB` callback to perform the remapping of the `RUBIN` column to the `species` column after having defined the lookup table `lut_biota`." + "Test passes! We can now create a callback `RemapNuclideNameCB` to remap the nuclide names. 
Note that we pass `overwrite=False` to the `generate_lookup_table` method so that the cached version is now used.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "ccd6c46e", + "id": "9a189ef9", "metadata": {}, "outputs": [], "source": [ "#| exports\n", - "lut_biota = lambda: Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv'),\n", - " maris_lut_fn=species_lut_path,\n", - " maris_col_id='species_id',\n", - " maris_col_name='species',\n", - " provider_col_to_match='SCIENTIFIC NAME',\n", - " provider_col_key='RUBIN',\n", - " fname_cache='species_helcom.pkl'\n", - " ).generate_lookup_table(fixes=fixes_biota_species, as_df=False, overwrite=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b83ffe12", + "# Create a lookup table for nuclide names\n", + "lut_nuclides = lambda df: Remapper(provider_lut_df=df,\n", + " maris_lut_fn=nuc_lut_path,\n", + " maris_col_id='nuclide_id',\n", + " maris_col_name='nc_name',\n", + " provider_col_to_match='value',\n", + " provider_col_key='value',\n", + " fname_cache='nuclides_helcom.pkl').generate_lookup_table(fixes=fixes_nuclide_names, \n", + " as_df=False, overwrite=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03d47237", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class RemapNuclideNameCB(Callback):\n", + " \"Remap data provider nuclide names to MARIS nuclide names.\"\n", + " def __init__(self, \n", + " fn_lut: Callable # Function that returns the lookup table dictionary\n", + " ):\n", + " fc.store_attr()\n", + "\n", + " def __call__(self, tfm: Transformer):\n", + " df_uniques = get_unique_across_dfs(tfm.dfs, col_name='NUCLIDE', as_df=True)\n", + " lut = {k: v.matched_maris_name for k, v in self.fn_lut(df_uniques).items()} \n", + " for k in tfm.dfs.keys():\n", + " tfm.dfs[k]['NUCLIDE'] = tfm.dfs[k]['NUCLIDE'].replace(lut)" + ] + }, + { + "cell_type": "markdown", + "id": "ce649d7a", + "metadata": {}, + "source": [ + "Let's see it
in action, along with the `RemapRdnNameCB` callback:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c9a9ff7", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 99 243 50 139 270 192 191 284 84 269 122 96 287 279\n", - " 278 288 286 244 129 275 271 285 283 247 120 59 280 274\n", - " 273 290 289 272 277 276 21 282 110 281 245 704 1524 703\n", - " 1611 621 60]\n" - ] + "data": { + "text/plain": [ + "array(['cs134', 'k40', 'co60', 'cs137', 'sr90', 'ag108m', 'mn54', 'co58',\n", + " 'ag110m', 'zn65', 'sb125', 'pu239_240_tot', 'ru106', 'be7',\n", + " 'ce144', 'pb210', 'po210', 'sb124', 'sr89', 'zr95', 'te129m',\n", + " 'ru103', 'nb95', 'ce141', 'la140', 'i131', 'ba140', 'pu238',\n", + " 'u235', 'bi214', 'pb214', 'pb212', 'tl208', 'ac228', 'ra223',\n", + " 'eu155', 'ra226', 'gd153', 'sn113', 'fe59', 'tc99', 'co57',\n", + " 'sn117m', 'eu152', 'sc46', 'rb86', 'ra224', 'th232',\n", + " 'cs134_137_tot', 'am241', 'ra228', 'th228'], dtype=object)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "#| eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[\n", - " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota')\n", - " ])\n", + "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides)\n", + " ])\n", + "dfs_out = tfm()\n", "\n", - "# For instance:\n", - "print(tfm()['biota']['species'].unique())" + "# For instance\n", + "dfs_out['biota'].NUCLIDE.unique()\n" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "2c74e492", + "id": "f91ba2d3", "metadata": {}, "source": [ - "## Remap Biota tissues\n", - "Let's inspect the `TISSUE.csv` file provided by HELCOM describing the tissue nomenclature. Biota tissue is known as `body part` in the maris data set." 
+ "### Add Nuclide Id column" + ] + }, + { + "cell_type": "markdown", + "id": "49a6c352", + "metadata": {}, + "source": [ + "The `nuclide_id` column is added to the dataframe for legacy reasons (again Open Refine output)." ] }, { "cell_type": "code", "execution_count": null, - "id": "a38df50b-46a9-4a2d-9379-e670eb0d0bb6", + "id": "4ec4271e", "metadata": {}, "outputs": [ { @@ -3302,47 +2837,86 @@ " \n", " \n", " \n", - " TISSUE\n", - " TISSUE_DESCRIPTION\n", + " NUCLIDE\n", + " nuclide_id\n", " \n", " \n", " \n", " \n", " 0\n", - " 1\n", - " WHOLE FISH\n", + " cs134\n", + " 31\n", " \n", " \n", " 1\n", - " 2\n", - " WHOLE FISH WITHOUT ENTRAILS\n", + " k40\n", + " 4\n", " \n", " \n", " 2\n", - " 3\n", - " WHOLE FISH WITHOUT HEAD AND ENTRAILS\n", + " co60\n", + " 9\n", " \n", " \n", " 3\n", - " 4\n", - " FLESH WITH BONES\n", + " cs137\n", + " 33\n", " \n", " \n", " 4\n", - " 5\n", - " FLESH WITHOUT BONES (FILETS)\n", + " cs134\n", + " 31\n", + " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 15822\n", + " k40\n", + " 4\n", + " \n", + " \n", + " 15823\n", + " cs137\n", + " 33\n", + " \n", + " \n", + " 15824\n", + " be7\n", + " 2\n", + " \n", + " \n", + " 15825\n", + " k40\n", + " 4\n", + " \n", + " \n", + " 15826\n", + " cs137\n", + " 33\n", " \n", " \n", "\n", + "

15827 rows × 2 columns

\n", "" ], "text/plain": [ - " TISSUE TISSUE_DESCRIPTION\n", - "0 1 WHOLE FISH\n", - "1 2 WHOLE FISH WITHOUT ENTRAILS\n", - "2 3 WHOLE FISH WITHOUT HEAD AND ENTRAILS\n", - "3 4 FLESH WITH BONES\n", - "4 5 FLESH WITHOUT BONES (FILETS)" + " NUCLIDE nuclide_id\n", + "0 cs134 31\n", + "1 k40 4\n", + "2 co60 9\n", + "3 cs137 33\n", + "4 cs134 31\n", + "... ... ...\n", + "15822 k40 4\n", + "15823 cs137 33\n", + "15824 be7 2\n", + "15825 k40 4\n", + "15826 cs137 33\n", + "\n", + "[15827 rows x 2 columns]" ] }, "execution_count": null, @@ -3352,193 +2926,215 @@ ], "source": [ "#| eval: false\n", - "pd.read_csv('../../_data/accdb/mors/csv/TISSUE.csv').head()" + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE')\n", + " ])\n", + "dfs_out = tfm()\n", + "\n", + "# For instance\n", + "dfs_out['biota'][['NUCLIDE', 'nuclide_id']]" + ] + }, + { + "cell_type": "markdown", + "id": "02e9e1f4", + "metadata": {}, + "source": [ + "## Standardize Time" + ] + }, + { + "cell_type": "markdown", + "id": "24856dc5", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: Time/date is provide in the `DATE`, `YEAR`\n", + ", `MONTH`, `DAY` columns. Note that the `DATE` contains missing values as indicated below. When missing, we fallback on the `YEAR`, `MONTH`, `DAY` columns. Note also that sometimes `DAY` and `MONTH` contain 0. 
In this case we systematically set them to 1.\n", + "\n", + ":::" ] }, { "cell_type": "code", "execution_count": null, - "id": "2613f239", + "id": "612873e6", "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing: 0%| | 0/29 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
matched_maris_namesource_namematch_score
source_key
3Flesh without bonesWHOLE FISH WITHOUT HEAD AND ENTRAILS20
2Flesh without bonesWHOLE FISH WITHOUT ENTRAILS13
8Soft partsSKIN/EPIDERMIS10
5Flesh without bonesFLESH WITHOUT BONES (FILETS)9
1Whole animalWHOLE FISH5
12BrainENTRAILS5
15Stomach and intestineSTOMACH + INTESTINE3
41Whole animalWHOLE ANIMALS1
\n", - "" - ], - "text/plain": [ - " matched_maris_name source_name \\\n", - "source_key \n", - "3 Flesh without bones WHOLE FISH WITHOUT HEAD AND ENTRAILS \n", - "2 Flesh without bones WHOLE FISH WITHOUT ENTRAILS \n", - "8 Soft parts SKIN/EPIDERMIS \n", - "5 Flesh without bones FLESH WITHOUT BONES (FILETS) \n", - "1 Whole animal WHOLE FISH \n", - "12 Brain ENTRAILS \n", - "15 Stomach and intestine STOMACH + INTESTINE \n", - "41 Whole animal WHOLE ANIMALS \n", - "\n", - " match_score \n", - "source_key \n", - "3 20 \n", - "2 13 \n", - "8 10 \n", - "5 9 \n", - "1 5 \n", - "12 5 \n", - "15 3 \n", - "41 1 " - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "#| eval: false\n", - "remapper = Remapper(provider_lut_df=pd.read_csv('../../_data/accdb/mors/csv/TISSUE.csv'),\n", - " maris_lut_fn=bodyparts_lut_path,\n", - " maris_col_id='bodypar_id',\n", - " maris_col_name='bodypar',\n", - " provider_col_to_match='TISSUE_DESCRIPTION',\n", - " provider_col_key='TISSUE',\n", - " fname_cache='tissues_helcom.pkl'\n", - " )\n", + "dfs = load_data(fname_in)\n", + "for key in dfs.keys():\n", + " print(f'{key} DATE null values: ', dfs[key]['DATE'].isna().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae547a0c", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class ParseTimeCB(Callback):\n", + " \"Parse and standardize time information in the dataframe.\"\n", + " def __call__(self, tfm: Transformer):\n", + " for df in tfm.dfs.values():\n", + " self._process_dates(df)\n", + " self._define_beg_period(df)\n", "\n", - "remapper.generate_lookup_table(as_df=True)\n", - "remapper.select_match(match_score_threshold=1)" + " def _process_dates(self, df: pd.DataFrame) -> None:\n", + " \"Process and correct date and time information in the DataFrame.\"\n", + " df['time'] = self._parse_date(df)\n", + " self._handle_missing_dates(df)\n", + " self._fill_missing_time(df)\n", + "\n", + " def 
_parse_date(self, df: pd.DataFrame) -> pd.Series:\n", + " \"Parse the DATE column if present.\"\n", + " return pd.to_datetime(df['DATE'], format='%m/%d/%y %H:%M:%S', errors='coerce')\n", + "\n", + " def _handle_missing_dates(self, df: pd.DataFrame):\n", + " \"Handle cases where DAY or MONTH is 0 or missing.\"\n", + " df.loc[df[\"DAY\"] == 0, \"DAY\"] = 1\n", + " df.loc[df[\"MONTH\"] == 0, \"MONTH\"] = 1\n", + " \n", + " missing_day_month = (df[\"DAY\"].isna()) & (df[\"MONTH\"].isna()) & (df[\"YEAR\"].notna())\n", + " df.loc[missing_day_month, [\"DAY\", \"MONTH\"]] = 1\n", + "\n", + " def _fill_missing_time(self, df: pd.DataFrame) -> None:\n", + " \"Fill missing time values using YEAR, MONTH, and DAY columns.\"\n", + " missing_time = df['time'].isna()\n", + " df.loc[missing_time, 'time'] = pd.to_datetime(\n", + " df.loc[missing_time, ['YEAR', 'MONTH', 'DAY']], \n", + " format='%Y%m%d', \n", + " errors='coerce'\n", + " )\n", + "\n", + " def _define_beg_period(self, df: pd.DataFrame) -> None:\n", + " \"Create a standardized date representation for Open Refine.\"\n", + " df['begperiod'] = df['time']" ] }, { "cell_type": "markdown", - "id": "0fee1bb9", + "id": "48c34819", "metadata": {}, "source": [ - "We fix below some of the entries that are not properly matched by the `Remapper` object:" + "Apply the transformer for callbacks `ParseTimeCB`. Then, print the ``begperiod`` and `time` data for `seawater`." 
] }, { "cell_type": "code", "execution_count": null, - "id": "c6e2b06f-5eb1-4708-8087-75c836f08112", + "id": "f2b90d07", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of dropped rows 0 0 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n", + " begperiod time\n", + "0 2012-05-23 2012-05-23\n", + "1 2012-05-23 2012-05-23\n", + "2 2012-06-17 2012-06-17\n", + "3 2012-05-24 2012-05-24\n", + "4 2012-05-24 2012-05-24\n", + "... ... ...\n", + "21211 2021-10-15 2021-10-15\n", + "21212 2021-11-04 2021-11-04\n", + "21213 2021-10-15 2021-10-15\n", + "21214 2021-05-17 2021-05-17\n", + "21215 2021-05-13 2021-05-13\n", + "\n", + "[21216 rows x 2 columns]\n" + ] + } + ], "source": [ - "#| exports\n", - "fixes_biota_tissues = {\n", - " 'WHOLE FISH WITHOUT HEAD AND ENTRAILS': 'Whole animal eviscerated without head',\n", - " 'ENTRAILS': 'Viscera',\n", - " 'SKIN/EPIDERMIS': 'Skin'}" + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[ParseTimeCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", + "print(tfm.dfs['seawater'][['begperiod','time']])" + ] + }, + { + "cell_type": "markdown", + "id": "28dd488a", + "metadata": {}, + "source": [ + "NetCDF time format requires the time to be encoded as the number of seconds since a time of origin. In our case the time of origin is `1970-01-01`, as indicated in the `CONFIGS['units']['time']` entry of `configs.ipynb`." + ] + }, + { + "cell_type": "markdown", + "id": "486b2966", + "metadata": {}, + "source": [ + "`EncodeTimeCB` converts the HELCOM `time` format to the MARIS NetCDF `time` format."
] }, { "cell_type": "code", "execution_count": null, - "id": "c07fc4b8", + "id": "4b8edc56", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 29/29 [00:00<00:00, 137.97it/s]\n" + "8 of 21216 entries for `time` are invalid for seawater.\n", + "1 of 39817 entries for `time` are invalid for sediment.\n", + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21208 39816 15827\n", + "Number of dropped rows 8 1 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n" ] - }, + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[ParseTimeCB(),\n", + " EncodeTimeCB(cfg(), verbose=True),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0be521de", + "metadata": {}, + "outputs": [ { "data": { "text/html": [ @@ -3560,304 +3156,610 @@ " \n", " \n", " \n", - " matched_maris_name\n", - " source_name\n", - " match_score\n", + " KEY\n", + " NUCLIDE\n", + " METHOD\n", + " < VALUE_Bq/m³\n", + " VALUE_Bq/m³\n", + " ERROR%_m³\n", + " DATE_OF_ENTRY_x\n", + " COUNTRY\n", + " LABORATORY\n", + " SEQUENCE\n", + " ...\n", + " TDEPTH\n", + " SDEPTH\n", + " SALIN\n", + " TTEMP\n", + " FILT\n", + " MORS_SUBBASIN\n", + " HELCOM_SUBBASIN\n", + " DATE_OF_ENTRY_y\n", + " time\n", + " begperiod\n", " \n", + " \n", + " \n", " \n", - " source_key\n", - " \n", - " \n", - " \n", + " 0\n", + " WKRIL2012003\n", + " CS137\n", + " NaN\n", + " NaN\n", + " 5.3\n", + " 32.000000\n", + " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012003.0\n", + " ...\n", + " NaN\n", + " 0.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " 08/20/14 00:00:00\n", + " 1337731200\n", + " 2012-05-23\n", + " \n", + " \n", + " 
1\n", + " WKRIL2012004\n", + " CS137\n", + " NaN\n", + " NaN\n", + " 19.9\n", + " 20.000000\n", + " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012004.0\n", + " ...\n", + " NaN\n", + " 29.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " 08/20/14 00:00:00\n", + " 1337731200\n", + " 2012-05-23\n", " \n", - " \n", - " \n", " \n", " 2\n", - " Flesh without bones\n", - " WHOLE FISH WITHOUT ENTRAILS\n", - " 13\n", + " WKRIL2012005\n", + " CS137\n", + " NaN\n", + " NaN\n", + " 25.5\n", + " 20.000000\n", + " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012005.0\n", + " ...\n", + " NaN\n", + " 0.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 3.0\n", + " 08/20/14 00:00:00\n", + " 1339891200\n", + " 2012-06-17\n", " \n", " \n", - " 5\n", - " Flesh without bones\n", - " FLESH WITHOUT BONES (FILETS)\n", - " 9\n", + " 3\n", + " WKRIL2012006\n", + " CS137\n", + " NaN\n", + " NaN\n", + " 17.0\n", + " 29.000000\n", + " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012006.0\n", + " ...\n", + " NaN\n", + " 0.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " 08/20/14 00:00:00\n", + " 1337817600\n", + " 2012-05-24\n", " \n", " \n", - " 1\n", - " Whole animal\n", - " WHOLE FISH\n", - " 5\n", + " 4\n", + " WKRIL2012007\n", + " CS137\n", + " NaN\n", + " NaN\n", + " 22.2\n", + " 18.000000\n", + " 08/20/14 00:00:00\n", + " 90.0\n", + " KRIL\n", + " 2012007.0\n", + " ...\n", + " NaN\n", + " 39.0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 11.0\n", + " 11.0\n", + " 08/20/14 00:00:00\n", + " 1337817600\n", + " 2012-05-24\n", " \n", " \n", - " 15\n", - " Stomach and intestine\n", - " STOMACH + INTESTINE\n", - " 3\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 41\n", - " Whole 
animal\n", - " WHOLE ANIMALS\n", - " 1\n", + " 21211\n", + " WSSSM2021005\n", + " H3\n", + " SSM45\n", + " NaN\n", + " 1030.0\n", + " 93.203883\n", + " 09/06/22 00:00:00\n", + " 77.0\n", + " SSSM\n", + " 202105.0\n", + " ...\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " N\n", + " 1.0\n", + " 8.0\n", + " 09/06/22 00:00:00\n", + " 1634256000\n", + " 2021-10-15\n", " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " matched_maris_name source_name match_score\n", - "source_key \n", - "2 Flesh without bones WHOLE FISH WITHOUT ENTRAILS 13\n", - "5 Flesh without bones FLESH WITHOUT BONES (FILETS) 9\n", - "1 Whole animal WHOLE FISH 5\n", - "15 Stomach and intestine STOMACH + INTESTINE 3\n", - "41 Whole animal WHOLE ANIMALS 1" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| eval: false\n", - "remapper.generate_lookup_table(as_df=True, fixes=fixes_biota_tissues)\n", - "remapper.select_match(match_score_threshold=1)" - ] - }, - { - "cell_type": "markdown", - "id": "6ef75cb1", - "metadata": {}, - "source": [ - "Visual inspection of the remaining unperfectly matched entries seem acceptable to proceed. \n", - "\n", - "We can now use the generic `RemapCB` callback to perform the remapping of the `TISSUE` column to the `body_part` column after having defined the lookup table `lut_tissues`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c42eb30", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "lut_tissues = lambda: Remapper(provider_lut_df=pd.read_csv('../../_data/accdb/mors/csv/TISSUE.csv'),\n", - " maris_lut_fn=bodyparts_lut_path,\n", - " maris_col_id='bodypar_id',\n", - " maris_col_name='bodypar',\n", - " provider_col_to_match='TISSUE_DESCRIPTION',\n", - " provider_col_key='TISSUE',\n", - " fname_cache='tissues_helcom.pkl'\n", - " ).generate_lookup_table(fixes=fixes_biota_tissues, as_df=False, overwrite=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d1887c6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " TISSUE body_part\n", - "0 5 52\n", - "1 5 52\n", - "2 5 52\n", - "3 5 52\n", - "4 5 52\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[\n", - " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", - " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota')\n", - " ])\n", - "\n", - "print(tfm()['biota'][['TISSUE', 'body_part']][:5])\n" + " \n", + " 21212\n", + " WSSSM2021006\n", + " H3\n", + " SSM45\n", + " NaN\n", + " 2240.0\n", + " 43.303571\n", + " 09/06/22 00:00:00\n", + " 77.0\n", + " SSSM\n", + " 202106.0\n", + " ...\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " N\n", + " 10.0\n", + " 10.0\n", + " 09/06/22 00:00:00\n", + " 1635984000\n", + " 2021-11-04\n", + " \n", + " \n", + " 21213\n", + " WSSSM2021007\n", + " H3\n", + " SSM45\n", + " NaN\n", + " 2060.0\n", + " 47.087379\n", + " 09/06/22 00:00:00\n", + " 77.0\n", + " SSSM\n", + " 202107.0\n", + " ...\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " N\n", + " 12.0\n", + " 12.0\n", + " 09/06/22 00:00:00\n", + " 1634256000\n", + " 2021-10-15\n", + " \n", + " \n", + " 21214\n", + " WSSSM2021008\n", + " H3\n", + " SSM45\n", + " NaN\n", + " 
2300.0\n", + " 43.478261\n", + " 09/06/22 00:00:00\n", + " 77.0\n", + " SSSM\n", + " 202108.0\n", + " ...\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " N\n", + " 12.0\n", + " 12.0\n", + " 09/06/22 00:00:00\n", + " 1621209600\n", + " 2021-05-17\n", + " \n", + " \n", + " 21215\n", + " WSSSM2021004\n", + " H3\n", + " SSM45\n", + " <\n", + " NaN\n", + " NaN\n", + " 09/06/22 00:00:00\n", + " 77.0\n", + " SSSM\n", + " 202104.0\n", + " ...\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " N\n", + " 15.0\n", + " 18.0\n", + " 09/06/22 00:00:00\n", + " 1620864000\n", + " 2021-05-13\n", + " \n", + " \n", + "\n", + "

21208 rows × 29 columns

\n", + "" + ], + "text/plain": [ + " KEY NUCLIDE METHOD < VALUE_Bq/m³ VALUE_Bq/m³ ERROR%_m³ \\\n", + "0 WKRIL2012003 CS137 NaN NaN 5.3 32.000000 \n", + "1 WKRIL2012004 CS137 NaN NaN 19.9 20.000000 \n", + "2 WKRIL2012005 CS137 NaN NaN 25.5 20.000000 \n", + "3 WKRIL2012006 CS137 NaN NaN 17.0 29.000000 \n", + "4 WKRIL2012007 CS137 NaN NaN 22.2 18.000000 \n", + "... ... ... ... ... ... ... \n", + "21211 WSSSM2021005 H3 SSM45 NaN 1030.0 93.203883 \n", + "21212 WSSSM2021006 H3 SSM45 NaN 2240.0 43.303571 \n", + "21213 WSSSM2021007 H3 SSM45 NaN 2060.0 47.087379 \n", + "21214 WSSSM2021008 H3 SSM45 NaN 2300.0 43.478261 \n", + "21215 WSSSM2021004 H3 SSM45 < NaN NaN \n", + "\n", + " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... TDEPTH SDEPTH \\\n", + "0 08/20/14 00:00:00 90.0 KRIL 2012003.0 ... NaN 0.0 \n", + "1 08/20/14 00:00:00 90.0 KRIL 2012004.0 ... NaN 29.0 \n", + "2 08/20/14 00:00:00 90.0 KRIL 2012005.0 ... NaN 0.0 \n", + "3 08/20/14 00:00:00 90.0 KRIL 2012006.0 ... NaN 0.0 \n", + "4 08/20/14 00:00:00 90.0 KRIL 2012007.0 ... NaN 39.0 \n", + "... ... ... ... ... ... ... ... \n", + "21211 09/06/22 00:00:00 77.0 SSSM 202105.0 ... NaN 1.0 \n", + "21212 09/06/22 00:00:00 77.0 SSSM 202106.0 ... NaN 1.0 \n", + "21213 09/06/22 00:00:00 77.0 SSSM 202107.0 ... NaN 1.0 \n", + "21214 09/06/22 00:00:00 77.0 SSSM 202108.0 ... NaN 1.0 \n", + "21215 09/06/22 00:00:00 77.0 SSSM 202104.0 ... NaN 1.0 \n", + "\n", + " SALIN TTEMP FILT MORS_SUBBASIN HELCOM_SUBBASIN DATE_OF_ENTRY_y \\\n", + "0 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", + "1 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", + "2 NaN NaN NaN 11.0 3.0 08/20/14 00:00:00 \n", + "3 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", + "4 NaN NaN NaN 11.0 11.0 08/20/14 00:00:00 \n", + "... ... ... ... ... ... ... 
\n", + "21211 NaN NaN N 1.0 8.0 09/06/22 00:00:00 \n", + "21212 NaN NaN N 10.0 10.0 09/06/22 00:00:00 \n", + "21213 NaN NaN N 12.0 12.0 09/06/22 00:00:00 \n", + "21214 NaN NaN N 12.0 12.0 09/06/22 00:00:00 \n", + "21215 NaN NaN N 15.0 18.0 09/06/22 00:00:00 \n", + "\n", + " time begperiod \n", + "0 1337731200 2012-05-23 \n", + "1 1337731200 2012-05-23 \n", + "2 1339891200 2012-06-17 \n", + "3 1337817600 2012-05-24 \n", + "4 1337817600 2012-05-24 \n", + "... ... ... \n", + "21211 1634256000 2021-10-15 \n", + "21212 1635984000 2021-11-04 \n", + "21213 1634256000 2021-10-15 \n", + "21214 1621209600 2021-05-17 \n", + "21215 1620864000 2021-05-13 \n", + "\n", + "[21208 rows x 29 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfm.dfs['seawater']" ] }, { "cell_type": "markdown", - "id": "cc596011", + "id": "69ef4f4b", "metadata": {}, "source": [ - "## Remap biogroup" + "## Sanitize value" ] }, { "cell_type": "markdown", - "id": "da42ebe6", + "id": "6de49e39", "metadata": {}, "source": [ - "`get_biogroup_lut` reads the file at `species_lut_path()` and from the contents of this file creates a dictionary linking `species_id` to `biogroup_id`." + "We allocate each column containing measurement values (named differently across sample types as `unit` are mentioned as well in column names) into a single column `value` and remove NA where needed." 
] }, { "cell_type": "code", "execution_count": null, - "id": "cf290302", + "id": "8580f592", "metadata": {}, "outputs": [], "source": [ "#| exports\n", - "lut_biogroup = lambda: get_lut(species_lut_path().parent, species_lut_path().name, \n", - " key='species_id', value='biogroup_id')" + "coi_val = {'seawater' : {'val': 'VALUE_Bq/m³'},\n", + " 'biota': {'val': 'VALUE_Bq/kg'},\n", + " 'sediment': {'val': 'VALUE_Bq/kg'}}" ] }, { "cell_type": "code", "execution_count": null, - "id": "c2a37157", + "id": "def0a599", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class SanitizeValue(Callback):\n", + " \"Sanitize value/measurement by removing blank entries and populating `value` column.\"\n", + " def __init__(self, \n", + " coi: Dict[str, Dict[str, str]] # Columns of interest. Format: {group_name: {'val': 'column_name'}}\n", + " ): \n", + " fc.store_attr()\n", + "\n", + " def __call__(self, tfm: Transformer):\n", + " for grp, df in tfm.dfs.items():\n", + " value_col = self.coi[grp]['val']\n", + " df.dropna(subset=[value_col], inplace=True)\n", + " df['value'] = df[value_col]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bccb7a50", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[ 4 2 14 11 8 3]\n" + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21122 39532 15798\n", + "Number of dropped rows 94 285 29\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n" ] } ], "source": [ "#| eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[\n", - " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", - " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", - " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota')\n", - " ])\n", + "tfm = Transformer(dfs, cbs=[SanitizeValue(coi_val),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", "\n", - 
"print(tfm()['biota']['bio_group'].unique())\n" + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "2bea8647", + "id": "be199c49", "metadata": {}, "source": [ - "## Remap Taxon Information\n", - "Currently, the details (`Taxonname`, `TaxonRepName`, `Taxonrank`) are used for importing into the MARIS master database, but they are not included in the NetCDF encoding. \n", - "\n", - "We first need to retrieve the taxon information from the `dbo_species.xlsx` file." + "## Normalize uncertainty" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "324d52dc", + "cell_type": "markdown", + "id": "7515714b", "metadata": {}, - "outputs": [], "source": [ - "#| exports\n", - "# TODO: Include Commonname field after next MARIS data reconciling process.\n", - "def get_taxon_info_lut(\n", - " maris_lut:str # Path to the MARIS lookup table (Excel file)\n", - ") -> dict: # A dictionary mapping species_id to biogroup_id\n", - " \"Retrieve a lookup table for Taxonname from a MARIS lookup table.\"\n", - " species = pd.read_excel(maris_lut)\n", - " return species[['species_id', 'Taxonname', 'Taxonrank','TaxonDB','TaxonDBID','TaxonDBURL']].set_index('species_id').to_dict()\n", - "\n", - "lut_taxon = lambda: get_taxon_info_lut(species_lut_path())" + "Function `unc_rel2stan` converts uncertainty from relative uncertainty to standard uncertainty." 
] }, { "cell_type": "code", "execution_count": null, - "id": "b04111c3", + "id": "76077d40", "metadata": {}, "outputs": [], "source": [ - "# | exports\n", - "class RemapTaxonInformationCB(Callback):\n", - " \"Update taxon information based on MARIS species LUT.\"\n", - " def __init__(self, fn_lut: Callable):\n", - " self.fn_lut = fn_lut\n", - "\n", + "#| exports\n", + "def unc_rel2stan(\n", + " df: pd.DataFrame, # DataFrame containing measurement and uncertainty columns\n", + " meas_col: str, # Name of the column with measurement values\n", + " unc_col: str # Name of the column with relative uncertainty values (percentages)\n", + ") -> pd.Series: # Series with calculated absolute uncertainties\n", + " \"Convert relative uncertainty to absolute uncertainty.\"\n", + " return df.apply(lambda row: row[unc_col] * row[meas_col] / 100, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "2917d107", + "metadata": {}, + "source": [ + "For each sample type in the Helcom dataset, the uncertainty is given as a relative uncertainty. The column names for both the value and the uncertainty vary by sample type. The coi_units_unc dictionary defines the column names for the Value and Uncertainty for each sample type." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b231b09b", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "# Columns of interest\n", + "coi_units_unc = [('seawater', 'VALUE_Bq/m³', 'ERROR%_m³'),\n", + " ('biota', 'VALUE_Bq/kg', 'ERROR%'),\n", + " ('sediment', 'VALUE_Bq/kg', 'ERROR%_kg')]" + ] + }, + { + "cell_type": "markdown", + "id": "f20c9a4b", + "metadata": {}, + "source": [ + "NormalizeUncCB callback normalizes the uncertainty by converting from relative uncertainty to standard uncertainty. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cf262ed", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class NormalizeUncCB(Callback):\n", + " \"Convert from relative error % to uncertainty of activity unit.\"\n", + " def __init__(self, \n", + " fn_convert_unc: Callable=unc_rel2stan, # Function converting relative uncertainty to absolute uncertainty\n", + " coi: List[Tuple[str, str, str]]=coi_units_unc # List of columns of interest\n", + " ):\n", + " fc.store_attr()\n", + " \n", " def __call__(self, tfm: Transformer):\n", - " lut = self.fn_lut()\n", - " df = tfm.dfs['biota']\n", - " \n", - " df['TaxonRepName'] = df.get('RUBIN', 'Unknown')\n", - " \n", - " taxon_columns = ['Taxonname', 'Taxonrank', 'TaxonDB', 'TaxonDBID', 'TaxonDBURL']\n", - " for col in taxon_columns:\n", - " df[col] = df['species'].map(lut[col]).fillna('Unknown')\n", - " \n", - " unmatched = df[df['Taxonname'] == 'Unknown']['species'].unique()\n", - " if len(unmatched) > 0:\n", - " print(f\"Unmatched species IDs: {', '.join(unmatched)}\")" + " for grp, val, unc in self.coi:\n", + " if grp in tfm.dfs:\n", + " df = tfm.dfs[grp]\n", + " df['uncertainty'] = self.fn_convert_unc(df, val, unc)" + ] + }, + { + "cell_type": "markdown", + "id": "8545b262", + "metadata": {}, + "source": [ + "Apply the transformer for callback NormalizeUncCB(). Then, print the value (i.e. activity per unit ) and standard uncertainty for each sample type." 
] }, { "cell_type": "code", "execution_count": null, - "id": "40c7c54e", + "id": "fd9e14e2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " TaxonRepName Taxonname Taxonrank TaxonDB TaxonDBID \\\n", - "0 GADU MOR Gadus morhua species Wikidata Q199788 \n", - "40 SPRA SPR Sprattus sprattus species Wikidata Q506823 \n", - "44 CLUP HAR Clupea harengus species Wikidata Q2396858 \n", - "77 MERL MNG Merlangius merlangus species Wikidata Q273083 \n", - "78 LIMA LIM Limanda limanda species Wikidata Q1135526 \n", - "\n", - " TaxonDBURL \n", - "0 https://www.wikidata.org/wiki/Q199788 \n", - "40 https://www.wikidata.org/wiki/Q506823 \n", - "44 https://www.wikidata.org/wiki/Q2396858 \n", - "77 https://www.wikidata.org/wiki/Q273083 \n", - "78 https://www.wikidata.org/wiki/Q1135526 \n" + " value uncertainty\n", + "0 5.3 1.696\n", + "1 19.9 3.980\n", + "2 25.5 5.100\n", + "3 17.0 4.930\n", + "4 22.2 3.996\n", + " value uncertainty\n", + "0 0.010140 NaN\n", + "1 135.300000 4.830210\n", + "2 0.013980 NaN\n", + "3 4.338000 0.150962\n", + "4 0.009614 NaN\n", + " value uncertainty\n", + "0 35.0 9.10\n", + "1 36.0 7.92\n", + "2 38.0 9.12\n", + "3 36.0 9.00\n", + "4 30.0 6.90\n" ] } ], "source": [ "#| eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[ \n", - " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", - " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", - " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", - " RemapTaxonInformationCB(lut_taxon)\n", - " ])\n", - "tfm()\n", - "print(tfm.dfs['biota'][['TaxonRepName', 'Taxonname', 'Taxonrank',\n", - " 'TaxonDB','TaxonDBID','TaxonDBURL']].drop_duplicates().head())" + "tfm = Transformer(dfs, cbs=[NormalizeUncCB(),\n", + " SanitizeValue(coi_val)])\n", + "\n", + "print(tfm()['seawater'][['value', 'uncertainty']][:5])\n", + "print(tfm()['biota'][['value', 'uncertainty']][:5])\n", + "print(tfm()['sediment'][['value', 
'uncertainty']][:5])" ] }, { "cell_type": "markdown", - "id": "adcf607d", + "id": "9392b0cb", "metadata": {}, "source": [ - "## Remap Sediment types\n", - "We use again the same **IMFA** (Inspect, Match, Fix, Apply) pattern to remap the HELCOM sediment types." + "## Remap Biota species" ] }, { "cell_type": "markdown", - "id": "0f938d40", + "id": "abd63300", "metadata": {}, "source": [ - "Let's inspect the `SEDIMENT_TYPE.csv` file provided by HELCOM describing the sediment type nomenclature:" + "We follow in the next following processing steps the same approach as for remapping of nuclide names above." + ] + }, + { + "cell_type": "markdown", + "id": "02e7dbf2", + "metadata": {}, + "source": [ + "Let's inspect the `RUBIN_NAME.csv` file provided by HELCOM describing the biota species nomenclature." ] }, { "cell_type": "code", "execution_count": null, - "id": "d5f6b82a", + "id": "eb121e8b", "metadata": {}, "outputs": [ { @@ -3881,53 +3783,59 @@ " \n", " \n", " \n", - " SEDI\n", - " SEDIMENT TYPE\n", - " RECOMMENDED TO BE USED\n", + " RUBIN_ID\n", + " RUBIN\n", + " SCIENTIFIC NAME\n", + " ENGLISH NAME\n", " \n", " \n", " \n", " \n", " 0\n", - " -99\n", - " NO DATA\n", - " NaN\n", + " 11\n", + " ABRA BRA\n", + " ABRAMIS BRAMA\n", + " BREAM\n", " \n", " \n", " 1\n", - " 0\n", - " GRAVEL\n", - " YES\n", + " 12\n", + " ANGU ANG\n", + " ANGUILLA ANGUILLA\n", + " EEL\n", " \n", " \n", " 2\n", - " 1\n", - " SAND\n", - " YES\n", + " 13\n", + " ARCT ISL\n", + " ARCTICA ISLANDICA\n", + " ISLAND CYPRINE\n", " \n", " \n", " 3\n", - " 2\n", - " FINE SAND\n", - " NO\n", + " 14\n", + " ASTE RUB\n", + " ASTERIAS RUBENS\n", + " COMMON STARFISH\n", " \n", " \n", " 4\n", - " 3\n", - " SILT\n", - " YES\n", + " 15\n", + " CARD EDU\n", + " CARDIUM EDULE\n", + " COCKLE\n", " \n", " \n", "\n", "" ], "text/plain": [ - " SEDI SEDIMENT TYPE RECOMMENDED TO BE USED\n", - "0 -99 NO DATA NaN\n", - "1 0 GRAVEL YES\n", - "2 1 SAND YES\n", - "3 2 FINE SAND NO\n", - "4 3 SILT YES" + " RUBIN_ID RUBIN 
SCIENTIFIC NAME ENGLISH NAME\n", + "0 11 ABRA BRA ABRAMIS BRAMA BREAM\n", + "1 12 ANGU ANG ANGUILLA ANGUILLA EEL\n", + "2 13 ARCT ISL ARCTICA ISLANDICA ISLAND CYPRINE\n", + "3 14 ASTE RUB ASTERIAS RUBENS COMMON STARFISH\n", + "4 15 CARD EDU CARDIUM EDULE COCKLE" ] }, "execution_count": null, @@ -3937,67 +3845,28 @@ ], "source": [ "#| eval: false\n", - "pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv').head()" - ] - }, - { - "cell_type": "markdown", - "id": "05762600", - "metadata": {}, - "source": [ - ":::{.callout-tip}\n", - "\n", - "**FEEDBACK TO DATA PROVIDER**: The `SEDI` values `56` and `73` are not found in the `SEDIMENT_TYPE.csv` lookup table provided. Note also there are many `nan` values in the `SEDIMENT_TYPE.csv` file.\n", - "\n", - "We reassign them to `-99` for now but should be clarified/fixed. This is demonstrated below.\n", - "\n", - ":::" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fbc6540f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Missing SEDI values: {56.0, 73.0, nan}\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "df_sed_lut = pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv')\n", - "dfs = load_data(fname_in)\n", - "\n", - "sediment_sedi = set(dfs['sediment'].SEDI.unique())\n", - "lookup_sedi = set(df_sed_lut['SEDI'])\n", - "missing = sediment_sedi - lookup_sedi\n", - "print(f\"Missing SEDI values: {missing if missing else 'None'}\")" + "pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv').head()" ] }, { "cell_type": "markdown", - "id": "34f305d9", + "id": "3ec2bd53", "metadata": {}, "source": [ - "Let's try to match as many as possible:" + "We try to remap the `SCIENTIFIC NAME` column to the `species` column of the MARIS nomenclature, again using a `Remapper` object:" ] }, { "cell_type": "code", "execution_count": null, - "id": "ac413a89", + "id": "da393947", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Processing: 
100%|██████████| 47/47 [00:00<00:00, 141.72it/s]\n" + "Processing: 100%|██████████| 46/46 [00:06<00:00, 6.80it/s]\n" ] }, { @@ -4034,21 +3903,45 @@ " \n", " \n", " \n", - " -99\n", - " Soft\n", - " NO DATA\n", - " 5\n", + " STIZ LUC\n", + " Sander lucioperca\n", + " STIZOSTEDION LUCIOPERCA\n", + " 10\n", " \n", " \n", - " 50\n", - " Mud and gravel\n", - " MUD AND GARVEL\n", - " 2\n", + " LAMI SAC\n", + " Laminaria japonica\n", + " LAMINARIA SACCHARINA\n", + " 7\n", " \n", " \n", - " 46\n", - " Glacial clay\n", - " CLACIAL CLAY\n", + " CARD EDU\n", + " Cardiidae\n", + " CARDIUM EDULE\n", + " 6\n", + " \n", + " \n", + " ENCH CIM\n", + " Echinodermata\n", + " ENCHINODERMATA CIM\n", + " 5\n", + " \n", + " \n", + " PSET MAX\n", + " Pinctada maxima\n", + " PSETTA MAXIMA\n", + " 5\n", + " \n", + " \n", + " MACO BAL\n", + " Macoma balthica\n", + " MACOMA BALTICA\n", + " 1\n", + " \n", + " \n", + " STUC PEC\n", + " Stuckenia pectinata\n", + " STUCKENIA PECTINATE\n", " 1\n", " \n", " \n", @@ -4056,11 +3949,15 @@ "" ], "text/plain": [ - " matched_maris_name source_name match_score\n", - "source_key \n", - "-99 Soft NO DATA 5\n", - " 50 Mud and gravel MUD AND GARVEL 2\n", - " 46 Glacial clay CLACIAL CLAY 1" + " matched_maris_name source_name match_score\n", + "source_key \n", + "STIZ LUC Sander lucioperca STIZOSTEDION LUCIOPERCA 10\n", + "LAMI SAC Laminaria japonica LAMINARIA SACCHARINA 7\n", + "CARD EDU Cardiidae CARDIUM EDULE 6\n", + "ENCH CIM Echinodermata ENCHINODERMATA CIM 5\n", + "PSET MAX Pinctada maxima PSETTA MAXIMA 5\n", + "MACO BAL Macoma balthica MACOMA BALTICA 1\n", + "STUC PEC Stuckenia pectinata STUCKENIA PECTINATE 1" ] }, "execution_count": null, @@ -4070,43 +3967,61 @@ ], "source": [ "#| eval: false\n", - "remapper = Remapper(provider_lut_df=pd.read_csv(Path(fname_in)/'SEDIMENT_TYPE.csv'),\n", - " maris_lut_fn=sediments_lut_path,\n", - " maris_col_id='sedtype_id',\n", - " maris_col_name='sedtype',\n", - " provider_col_to_match='SEDIMENT TYPE',\n", - " 
provider_col_key='SEDI',\n", - " fname_cache='sediments_helcom.pkl'\n", + "remapper = Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv'),\n", + " maris_lut_fn=species_lut_path,\n", + " maris_col_id='species_id',\n", + " maris_col_name='species',\n", + " provider_col_to_match='SCIENTIFIC NAME',\n", + " provider_col_key='RUBIN',\n", + " fname_cache='species_helcom.pkl'\n", " )\n", "\n", "remapper.generate_lookup_table(as_df=True)\n", "remapper.select_match(match_score_threshold=1)" ] }, + { + "cell_type": "markdown", + "id": "e592a7a9", + "metadata": {}, + "source": [ + "We fix below some of the entries that are not properly matched by the `Remapper` object:" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "a9bbc268", + "id": "3e31a799", "metadata": {}, "outputs": [], "source": [ "#| exports\n", - "fixes_sediments = {\n", - " 'NO DATA': '(Not available)'\n", - "}" + "fixes_biota_species = {\n", + " 'CARDIUM EDULE': 'Cerastoderma edule',\n", + " 'LAMINARIA SACCHARINA': 'Saccharina latissima',\n", + " 'PSETTA MAXIMA': 'Scophthalmus maximus',\n", + " 'STIZOSTEDION LUCIOPERCA': 'Sander luciopercas'}" + ] + }, + { + "cell_type": "markdown", + "id": "f7d1d994", + "metadata": {}, + "source": [ + "And give it another try:" ] }, { "cell_type": "code", "execution_count": null, - "id": "10fd41a0", + "id": "5a70225a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Processing: 100%|██████████| 47/47 [00:00<00:00, 102.45it/s]\n" + "Processing: 100%|██████████| 46/46 [00:06<00:00, 6.68it/s]\n" ] }, { @@ -4143,15 +4058,27 @@ " \n", " \n", " \n", - " 50\n", - " Mud and gravel\n", - " MUD AND GARVEL\n", - " 2\n", + " ENCH CIM\n", + " Echinodermata\n", + " ENCHINODERMATA CIM\n", + " 5\n", " \n", " \n", - " 46\n", - " Glacial clay\n", - " CLACIAL CLAY\n", + " MACO BAL\n", + " Macoma balthica\n", + " MACOMA BALTICA\n", + " 1\n", + " \n", + " \n", + " STIZ LUC\n", + " Sander lucioperca\n", + " STIZOSTEDION 
LUCIOPERCA\n", + " 1\n", + " \n", + " \n", + " STUC PEC\n", + " Stuckenia pectinata\n", + " STUCKENIA PECTINATE\n", " 1\n", " \n", " \n", @@ -4159,10 +4086,12 @@ "" ], "text/plain": [ - " matched_maris_name source_name match_score\n", - "source_key \n", - "50 Mud and gravel MUD AND GARVEL 2\n", - "46 Glacial clay CLACIAL CLAY 1" + " matched_maris_name source_name match_score\n", + "source_key \n", + "ENCH CIM Echinodermata ENCHINODERMATA CIM 5\n", + "MACO BAL Macoma balthica MACOMA BALTICA 1\n", + "STIZ LUC Sander lucioperca STIZOSTEDION LUCIOPERCA 1\n", + "STUC PEC Stuckenia pectinata STUCKENIA PECTINATE 1" ] }, "execution_count": null, @@ -4172,177 +4101,144 @@ ], "source": [ "#| eval: false\n", - "remapper.generate_lookup_table(as_df=True, fixes=fixes_sediments)\n", + "remapper.generate_lookup_table(fixes=fixes_biota_species)\n", "remapper.select_match(match_score_threshold=1)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "4cad7ec2-97fd-43a8-83cb-c965ae89efde", + "cell_type": "markdown", + "id": "e6f49b32", "metadata": {}, - "outputs": [], "source": [ - "#| exports\n", - "class RemapSedimentCB(Callback):\n", - " \"Update sediment id based on MARIS species LUT (dbo_sedtype.xlsx).\"\n", - " def __init__(self, \n", - " fn_lut: Callable, # Function that returns the lookup table dictionary\n", - " ):\n", - " fc.store_attr()\n", - "\n", - " def _fix_inconsistent_sedi(self, df:pd.DataFrame) -> pd.DataFrame:\n", - " \"Temporary fix for inconsistent SEDI values. 
Data provider to confirm and clarify.\"\n", - " df['SEDI'] = df['SEDI'].replace({56: -99, 73: -99, np.nan: -99})\n", - " return df\n", - " \n", - " def __call__(self, tfm: Transformer):\n", - " \"Remap sediment types in the DataFrame using the lookup table and handle specific replacements.\"\n", - " lut = self.fn_lut()\n", - " \n", - " # Set SedRepName (TBC: what's used for?)\n", - " tfm.dfs['sediment']['SedRepName'] = tfm.dfs['sediment']['SEDI'] \n", - " \n", - " tfm.dfs['sediment'] = self._fix_inconsistent_sedi(tfm.dfs['sediment'])\n", - " tfm.dfs['sediment']['sed_type'] = tfm.dfs['sediment']['SEDI'].apply(lambda x: self._get_sediment_type(x, lut))\n", - "\n", - " def _get_sediment_type(self, \n", - " sedi_value: int, # The `SEDI` value from the DataFrame\n", - " lut: dict # The lookup table dictionary\n", - " ) -> Match: # The Match object\n", - " \"Get the matched_id from the lookup table and print SEDI if the matched_id is -1.\"\n", - " match = lut.get(sedi_value, Match(-1, None, None, None))\n", - " \n", - " if match.matched_id == -1:\n", - " self._print_unmatched_sedi(sedi_value)\n", - " return match.matched_id\n", + "Visual inspection of the remaining imperfectly matched entries suggests it is acceptable to proceed. \n", "\n", - " def _print_unmatched_sedi(self, \n", - " sedi_value: int # The `SEDI` value from the DataFram\n", - " ) -> None:\n", - " \"Print the SEDI value if the matched_id is -1.\"\n", - " print(f\"Unmatched SEDI: {sedi_value}\")" + "We can now use the generic `RemapCB` callback to perform the remapping of the `RUBIN` column to the `species` column after having defined the lookup table `lut_biota`." 
] }, { "cell_type": "code", "execution_count": null, - "id": "25a4ff58", + "id": "ccd6c46e", "metadata": {}, "outputs": [], "source": [ "#| exports\n", - "lut_sediments = lambda: Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv'),\n", - " maris_lut_fn=sediments_lut_path,\n", - " maris_col_id='sedtype_id',\n", - " maris_col_name='sedtype',\n", - " provider_col_to_match='SEDIMENT TYPE',\n", - " provider_col_key='SEDI',\n", - " fname_cache='sediments_helcom.pkl'\n", - " ).generate_lookup_table(fixes=fixes_sediments, as_df=False, overwrite=False)" - ] - }, - { - "cell_type": "markdown", - "id": "f131e929", - "metadata": {}, - "source": [ - "Apply the transformer for callbacks `RemapSedimentCB(get_maris_sediments)`. Then, print the `SEDI` and `sed_type` for the `biota` dataframe." + "lut_biota = lambda: Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv'),\n", + " maris_lut_fn=species_lut_path,\n", + " maris_col_id='species_id',\n", + " maris_col_name='species',\n", + " provider_col_to_match='SCIENTIFIC NAME',\n", + " provider_col_key='RUBIN',\n", + " fname_cache='species_helcom.pkl'\n", + " ).generate_lookup_table(fixes=fixes_biota_species, as_df=False, overwrite=False)" ] }, { "cell_type": "code", "execution_count": null, - "id": "16d42cb0", + "id": "b83ffe12", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "array([ 0, 2, 58, 30, 59, 55, 56, 36, 29, 47, 4, 54, 33, 6, 44, 42, 48,\n", - " 61, 57, 28, 49, 32, 45, 39, 46, 38, 31, 60, 62, 26, 53, 52, 1, 51,\n", - " 37, 34, 50, 7, 10, 41, 43, 35])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 99 243 50 139 270 192 191 284 84 269 122 96 287 279\n", + " 278 288 286 244 129 275 271 285 283 247 120 59 280 274\n", + " 273 290 289 272 277 276 21 282 110 281 245 704 1524 703\n", + " 1611 621 60]\n" + ] } ], "source": [ "#| eval: false\n", "dfs = 
load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[RemapSedimentCB(lut_sediments)])\n", + "tfm = Transformer(dfs, cbs=[\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota')\n", + " ])\n", "\n", - "tfm()['sediment']['sed_type'].unique()" - ] - }, - { - "cell_type": "markdown", - "id": "c3a0add1", - "metadata": {}, - "source": [ - "## Remap units" + "# For instance:\n", + "print(tfm()['biota']['species'].unique())" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "7a4064ed", + "id": "2c74e492", "metadata": {}, "source": [ - ":::{.callout-tip}\n", - "\n", - "**FEEDBACK TO DATA PROVIDER**: The handling of unit types varies between `biota` and `sediment` sample types. For consistency and ease of use, it would be beneficial to have dedicated unit columns for all sample types.\n", - "\n", - ":::" + "## Remap Biota tissues\n", + "Let's inspect the `TISSUE.csv` file provided by HELCOM describing the tissue nomenclature. Biota tissue is known as `body part` in the maris data set." ] }, { - "cell_type": "markdown", - "id": "e6a682ac", - "metadata": {}, - "source": [ - "For `seawater` and `sediment` sample types, the HELCOM dataset refers to units direcly in the name of certain columns, such as `VALUE_Bq/m³` or `VALUE_Bq/kg`. As for `biota`, the units are included in the `BASIS` column. 
This is shown below: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cab93970", + "cell_type": "code", + "execution_count": null, + "id": "a38df50b-46a9-4a2d-9379-e670eb0d0bb6", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "biota: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'BASIS',\n", - " 'ERROR%', 'NUMBER', 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY',\n", - " 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY', 'STATION',\n", - " 'LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm',\n", - " 'LONGITUDE dddddd', 'SDEPTH', 'RUBIN', 'BIOTATYPE', 'TISSUE', 'NO',\n", - " 'LENGTH', 'WEIGHT', 'DW%', 'LOI%', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN',\n", - " 'DATE_OF_ENTRY_y'],\n", - " dtype='object')\n", - "sediment: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',\n", - " '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',\n", - " 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',\n", - " 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", - " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',\n", - " 'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',\n", - " 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],\n", - " dtype='object')\n", - "seawater: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/m³', 'VALUE_Bq/m³', 'ERROR%_m³',\n", - " 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR',\n", - " 'MONTH', 'DAY', 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", - " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'TDEPTH', 'SDEPTH', 'SALIN',\n", - " 'TTEMP', 'FILT', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'DATE_OF_ENTRY_y'],\n", - " dtype='object')\n" - ] - }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TISSUETISSUE_DESCRIPTION
01WHOLE FISH
12WHOLE FISH WITHOUT ENTRAILS
23WHOLE FISH WITHOUT HEAD AND ENTRAILS
34FLESH WITH BONES
45FLESH WITHOUT BONES (FILETS)
\n", + "
" + ], "text/plain": [ - "array(['W', nan, 'D', 'F'], dtype=object)" + " TISSUE TISSUE_DESCRIPTION\n", + "0 1 WHOLE FISH\n", + "1 2 WHOLE FISH WITHOUT ENTRAILS\n", + "2 3 WHOLE FISH WITHOUT HEAD AND ENTRAILS\n", + "3 4 FLESH WITH BONES\n", + "4 5 FLESH WITHOUT BONES (FILETS)" ] }, "execution_count": null, @@ -4352,27 +4248,22 @@ ], "source": [ "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "for grp in ['biota', 'sediment', 'seawater']:\n", - " print(f\"{grp}: {dfs[grp].columns}\")\n", - " \n", - "dfs['biota']['BASIS'].unique()" - ] - }, - { - "cell_type": "markdown", - "id": "f7cbefe4", - "metadata": {}, - "source": [ - "Given the inconsistent handling of units across sample types, we need to define custom mapping rules for standardizing the units. Below the MARIS unit types:" + "pd.read_csv('../../_data/accdb/mors/csv/TISSUE.csv').head()" ] }, { "cell_type": "code", "execution_count": null, - "id": "12a86baf", + "id": "2613f239", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 29/29 [00:00<00:00, 108.70it/s]\n" + ] + }, { "data": { "text/html": [ @@ -4394,233 +4285,149 @@ " \n", " \n", " \n", - " unit_id\n", - " unit\n", - " unit_sanitized\n", - " \n", - " \n", - " \n", - " \n", - " 0\n", - " -1\n", - " Not applicable\n", - " Not applicable\n", - " \n", - " \n", - " 1\n", - " 0\n", - " NOT AVAILABLE\n", - " NOT AVAILABLE\n", + " matched_maris_name\n", + " source_name\n", + " match_score\n", " \n", " \n", - " 2\n", - " 1\n", - " Bq/m3\n", - " Bq per m3\n", + " source_key\n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " 3\n", - " 2\n", - " Bq/m2\n", - " Bq per m2\n", - " \n", - " \n", - " 4\n", - " 3\n", - " Bq/kg\n", - " Bq per kg\n", - " \n", - " \n", - " 5\n", - " 4\n", - " Bq/kgd\n", - " Bq per kgd\n", - " \n", - " \n", - " 6\n", - " 5\n", - " Bq/kgw\n", - " Bq per kgw\n", + " Flesh without bones\n", + " WHOLE FISH WITHOUT HEAD AND ENTRAILS\n", + " 20\n", " \n", " 
\n", - " 7\n", - " 6\n", - " kg/kg\n", - " kg per kg\n", + " 2\n", + " Flesh without bones\n", + " WHOLE FISH WITHOUT ENTRAILS\n", + " 13\n", " \n", " \n", " 8\n", - " 7\n", - " TU\n", - " TU\n", - " \n", - " \n", - " 9\n", - " 8\n", - " DELTA/mill\n", - " DELTA per mill\n", + " Soft parts\n", + " SKIN/EPIDERMIS\n", + " 10\n", " \n", " \n", - " 10\n", + " 5\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES (FILETS)\n", " 9\n", - " atom/kg\n", - " atom per kg\n", " \n", " \n", - " 11\n", - " 10\n", - " atom/kgd\n", - " atom per kgd\n", + " 1\n", + " Whole animal\n", + " WHOLE FISH\n", + " 5\n", " \n", " \n", " 12\n", - " 11\n", - " atom/kgw\n", - " atom per kgw\n", + " Brain\n", + " ENTRAILS\n", + " 5\n", " \n", " \n", - " 13\n", - " 12\n", - " atom/l\n", - " atom per l\n", + " 15\n", + " Stomach and intestine\n", + " STOMACH + INTESTINE\n", + " 3\n", " \n", " \n", - " 14\n", - " 13\n", - " Bq/kgC\n", - " Bq per kgC\n", + " 41\n", + " Whole animal\n", + " WHOLE ANIMALS\n", + " 1\n", " \n", " \n", "\n", "" ], "text/plain": [ - " unit_id unit unit_sanitized\n", - "0 -1 Not applicable Not applicable\n", - "1 0 NOT AVAILABLE NOT AVAILABLE\n", - "2 1 Bq/m3 Bq per m3\n", - "3 2 Bq/m2 Bq per m2\n", - "4 3 Bq/kg Bq per kg\n", - "5 4 Bq/kgd Bq per kgd\n", - "6 5 Bq/kgw Bq per kgw\n", - "7 6 kg/kg kg per kg\n", - "8 7 TU TU\n", - "9 8 DELTA/mill DELTA per mill\n", - "10 9 atom/kg atom per kg\n", - "11 10 atom/kgd atom per kgd\n", - "12 11 atom/kgw atom per kgw\n", - "13 12 atom/l atom per l\n", - "14 13 Bq/kgC Bq per kgC" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| eval: false\n", - "pd.read_excel(unit_lut_path())[['unit_id', 'unit', 'unit_sanitized']]" - ] - }, - { - "cell_type": "markdown", - "id": "9ec28334", - "metadata": {}, + " matched_maris_name source_name \\\n", + "source_key \n", + "3 Flesh without bones WHOLE FISH WITHOUT HEAD AND ENTRAILS \n", + "2 Flesh without bones WHOLE FISH WITHOUT 
ENTRAILS \n", + "8 Soft parts SKIN/EPIDERMIS \n", + "5 Flesh without bones FLESH WITHOUT BONES (FILETS) \n", + "1 Whole animal WHOLE FISH \n", + "12 Brain ENTRAILS \n", + "15 Stomach and intestine STOMACH + INTESTINE \n", + "41 Whole animal WHOLE ANIMALS \n", + "\n", + " match_score \n", + "source_key \n", + "3 20 \n", + "2 13 \n", + "8 10 \n", + "5 9 \n", + "1 5 \n", + "12 5 \n", + "15 3 \n", + "41 1 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "We define unit names renaming rules from HELCOM in an **ad hoc** way for now:" + "#| eval: false\n", + "remapper = Remapper(provider_lut_df=pd.read_csv('../../_data/accdb/mors/csv/TISSUE.csv'),\n", + " maris_lut_fn=bodyparts_lut_path,\n", + " maris_col_id='bodypar_id',\n", + " maris_col_name='bodypar',\n", + " provider_col_to_match='TISSUE_DESCRIPTION',\n", + " provider_col_key='TISSUE',\n", + " fname_cache='tissues_helcom.pkl'\n", + " )\n", + "\n", + "remapper.generate_lookup_table(as_df=True)\n", + "remapper.select_match(match_score_threshold=1)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "ea7fa747", + "cell_type": "markdown", + "id": "0fee1bb9", "metadata": {}, - "outputs": [], "source": [ - "#| exports\n", - "lut_units = {\n", - " 'seawater': 1, # 'Bq/m3'\n", - " 'sediment': 4, # 'Bq/kgd' for sediment\n", - " 'biota': {\n", - " 'D': 4, # 'Bq/kgd'\n", - " 'W': 5, # 'Bq/kgw'\n", - " 'F': 5 # 'Bq/kgw' (assumed to be 'Fresh', so set to wet)\n", - " }\n", - "}" + "We fix below some of the entries that are not properly matched by the `Remapper` object:" ] }, { "cell_type": "code", "execution_count": null, - "id": "e404d620", + "id": "c6e2b06f-5eb1-4708-8087-75c836f08112", "metadata": {}, "outputs": [], "source": [ "#| exports\n", - "class RemapUnitCB(Callback):\n", - " \"Set the `unit` id column in the DataFrames based on a lookup table.\"\n", - " def __init__(self, \n", - " lut_units: dict=lut_units # Dictionary containing renaming 
rules for different unit categories\n", - " ):\n", - " fc.store_attr()\n", - "\n", - " def __call__(self, tfm: Transformer):\n", - " for grp in tfm.dfs.keys():\n", - " if grp in ['seawater', 'sediment']:\n", - " tfm.dfs[grp]['unit'] = self.lut_units[grp]\n", - " else:\n", - " tfm.dfs[grp]['unit'] = tfm.dfs[grp]['BASIS'].apply(lambda x: lut_units[grp].get(x, 0))" - ] - }, - { - "cell_type": "markdown", - "id": "3a03fcc9", - "metadata": {}, - "source": [ - "Apply the transformer for callback `RemapUnitCB()`. Then, print the unique `unit` for the `seawater` dataframe." + "fixes_biota_tissues = {\n", + " 'WHOLE FISH WITHOUT HEAD AND ENTRAILS': 'Whole animal eviscerated without head',\n", + " 'ENTRAILS': 'Viscera',\n", + " 'SKIN/EPIDERMIS': 'Skin'}" ] }, { "cell_type": "code", "execution_count": null, - "id": "aa0f0abf", + "id": "c07fc4b8", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "biota: [5 0 4]\n", - "sediment: [4]\n", - "seawater: [1]\n" + "Processing: 100%|██████████| 29/29 [00:00<00:00, 139.05it/s]\n" ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[RemapUnitCB()])\n", - "\n", - "for grp in ['biota', 'sediment', 'seawater']:\n", - " print(f\"{grp}: {tfm()[grp]['unit'].unique()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "5d978c67", - "metadata": {}, - "source": [ - "## Remap detection limit\n", - "Detection limits are encoded as follows in MARIS:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1b07268", - "metadata": {}, - "outputs": [ + }, { "data": { "text/html": [ @@ -4642,60 +4449,60 @@ " \n", " \n", " \n", - " id\n", - " name\n", - " name_sanitized\n", + " matched_maris_name\n", + " source_name\n", + " match_score\n", " \n", - " \n", - " \n", " \n", - " 0\n", - " -1\n", - " Not applicable\n", - " Not applicable\n", + " source_key\n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " 
1\n", - " 0\n", - " Not Available\n", - " Not available\n", + " 2\n", + " Flesh without bones\n", + " WHOLE FISH WITHOUT ENTRAILS\n", + " 13\n", " \n", " \n", - " 2\n", - " 1\n", - " =\n", - " Detected value\n", + " 5\n", + " Flesh without bones\n", + " FLESH WITHOUT BONES (FILETS)\n", + " 9\n", " \n", " \n", - " 3\n", - " 2\n", - " <\n", - " Detection limit\n", + " 1\n", + " Whole animal\n", + " WHOLE FISH\n", + " 5\n", " \n", " \n", - " 4\n", + " 15\n", + " Stomach and intestine\n", + " STOMACH + INTESTINE\n", " 3\n", - " ND\n", - " Not detected\n", " \n", " \n", - " 5\n", - " 4\n", - " DE\n", - " Derived\n", + " 41\n", + " Whole animal\n", + " WHOLE ANIMALS\n", + " 1\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id name name_sanitized\n", - "0 -1 Not applicable Not applicable\n", - "1 0 Not Available Not available\n", - "2 1 = Detected value\n", - "3 2 < Detection limit\n", - "4 3 ND Not detected\n", - "5 4 DE Derived" + " matched_maris_name source_name match_score\n", + "source_key \n", + "2 Flesh without bones WHOLE FISH WITHOUT ENTRAILS 13\n", + "5 Flesh without bones FLESH WITHOUT BONES (FILETS) 9\n", + "1 Whole animal WHOLE FISH 5\n", + "15 Stomach and intestine STOMACH + INTESTINE 3\n", + "41 Whole animal WHOLE ANIMALS 1" ] }, "execution_count": null, @@ -4705,165 +4512,241 @@ ], "source": [ "#| eval: false\n", - "pd.read_excel(detection_limit_lut_path())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7083b6f", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "lut_dl = lambda: pd.read_excel(detection_limit_lut_path(), usecols=['name','id']).set_index('name').to_dict()['id']" + "remapper.generate_lookup_table(as_df=True, fixes=fixes_biota_tissues)\n", + "remapper.select_match(match_score_threshold=1)" ] }, { "cell_type": "markdown", - "id": "3023ddb4", + "id": "6ef75cb1", "metadata": {}, "source": [ - "Based on columns of interest for each sample type:" + "Visual inspection of the remaining unperfectly 
matched entries seem acceptable to proceed. \n", + "\n", + "We can now use the generic `RemapCB` callback to perform the remapping of the `TISSUE` column to the `body_part` column after having defined the lookup table `lut_tissues`." ] }, { "cell_type": "code", "execution_count": null, - "id": "2dc43c01", + "id": "4c42eb30", "metadata": {}, "outputs": [], "source": [ "#| exports\n", - "coi_dl = {'seawater' : {'val' : 'VALUE_Bq/m³',\n", - " 'unc' : 'ERROR%_m³',\n", - " 'dl' : '< VALUE_Bq/m³'},\n", - " 'biota': {'val' : 'VALUE_Bq/kg',\n", - " 'unc' : 'ERROR%',\n", - " 'dl' : '< VALUE_Bq/kg'},\n", - " 'sediment': {\n", - " 'val' : 'VALUE_Bq/kg',\n", - " 'unc' : 'ERROR%_kg',\n", - " 'dl' : '< VALUE_Bq/kg'}}" - ] - }, - { - "cell_type": "markdown", - "id": "3d8ac6a6", - "metadata": {}, - "source": [ - "We follow the following business logic to encode the detection limit:" - ] - }, - { - "cell_type": "markdown", - "id": "f6f4784b", - "metadata": {}, - "source": [ - "`RemapDetectionLimitCB` creates a `detection_limit` column with values determined as follows:\n", - "1. Perform a lookup with the appropriate columns value type (or detection limit) columns (`< VALUE_Bq/m³` or `< VALUE_Bq/kg`) against the table returned from the function `get_detectionlimit_lut`.\n", - "2. If `< VALUE_Bq/m³` or `< VALUE_Bq/kg` is NaN but both activity values (`VALUE_Bq/m³` or `VALUE_Bq/kg`) and standard uncertainty (`ERROR%_m³`, `ERROR%`, or `ERROR%_kg`) are provided, then assign the ID of `1` (i.e. \"Detected value\").\n", - "3. For other NaN values in the `detection_limit` column, set them to `0` (i.e. `Not Available`)." 
+ "lut_tissues = lambda: Remapper(provider_lut_df=pd.read_csv('../../_data/accdb/mors/csv/TISSUE.csv'),\n", + " maris_lut_fn=bodyparts_lut_path,\n", + " maris_col_id='bodypar_id',\n", + " maris_col_name='bodypar',\n", + " provider_col_to_match='TISSUE_DESCRIPTION',\n", + " provider_col_key='TISSUE',\n", + " fname_cache='tissues_helcom.pkl'\n", + " ).generate_lookup_table(fixes=fixes_biota_tissues, as_df=False, overwrite=False)" ] }, { "cell_type": "code", "execution_count": null, - "id": "0a72f956", + "id": "7d1887c6", "metadata": {}, - "outputs": [], - "source": [ - "# | exports\n", - "# TO BE REFACTORED\n", - "class RemapDetectionLimitCB(Callback):\n", - " \"Remap value type to MARIS format.\"\n", - " def __init__(self, \n", - " coi: dict, # Configuration options for column names\n", - " fn_lut: Callable # Function that returns a lookup table\n", - " ):\n", - " fc.store_attr()\n", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " TISSUE body_part\n", + "0 5 52\n", + "1 5 52\n", + "2 5 52\n", + "3 5 52\n", + "4 5 52\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota')\n", + " ])\n", + "\n", + "print(tfm()['biota'][['TISSUE', 'body_part']][:5])\n" + ] + }, + { + "cell_type": "markdown", + "id": "cc596011", + "metadata": {}, + "source": [ + "## Remap biogroup" + ] + }, + { + "cell_type": "markdown", + "id": "da42ebe6", + "metadata": {}, + "source": [ + "`lut_biogroup` reads the file at `species_lut_path()` and, from its contents, creates a dictionary mapping `species_id` to `biogroup_id`."
def lut_biogroup():
    "Return the `species_id` -> `biogroup_id` lookup obtained via `get_lut` from the MARIS species table."
    lut_dir = species_lut_path().parent
    lut_fname = species_lut_path().name
    return get_lut(lut_dir, lut_fname, key='species_id', value='biogroup_id')
# TODO: Include Commonname field after next MARIS data reconciling process.
def get_taxon_info_lut(
    maris_lut:str # Path to the MARIS lookup table (Excel file)
) -> dict: # Nested dictionary `{column name: {species_id: value}}`, one entry per taxon column
    "Retrieve a lookup table of taxon information from a MARIS lookup table."
    species = pd.read_excel(maris_lut)
    # `to_dict()` on a frame indexed by `species_id` yields one `{species_id: value}` dict per column.
    return species[['species_id', 'Taxonname', 'Taxonrank','TaxonDB','TaxonDBID','TaxonDBURL']].set_index('species_id').to_dict()

lut_taxon = lambda: get_taxon_info_lut(species_lut_path())


class RemapTaxonInformationCB(Callback):
    "Update taxon information of the `biota` dataframe based on MARIS species LUT."
    def __init__(self, 
                 fn_lut: Callable # Function that returns the `{column name: {species_id: value}}` lookup table
                 ):
        self.fn_lut = fn_lut

    def __call__(self, tfm: Transformer):
        "Add `TaxonRepName` and the taxon columns to `tfm.dfs['biota']`, reporting species without a match."
        lut = self.fn_lut()
        df = tfm.dfs['biota']
        
        # Keep the name as reported by the provider; 'Unknown' when there is no `RUBIN` column.
        df['TaxonRepName'] = df.get('RUBIN', 'Unknown')
        
        taxon_columns = ['Taxonname', 'Taxonrank', 'TaxonDB', 'TaxonDBID', 'TaxonDBURL']
        for col in taxon_columns:
            df[col] = df['species'].map(lut[col]).fillna('Unknown')
        
        unmatched = df[df['Taxonname'] == 'Unknown']['species'].unique()
        if len(unmatched) > 0:
            # Species ids are not guaranteed to be strings: `str.join` raises TypeError on non-string items.
            print(f"Unmatched species IDs: {', '.join(map(str, unmatched))}")
{tfm()[grp]['detection_limit'].unique()}\")" + "tfm = Transformer(dfs, cbs=[ \n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", + " RemapTaxonInformationCB(lut_taxon)\n", + " ])\n", + "tfm()\n", + "print(tfm.dfs['biota'][['TaxonRepName', 'Taxonname', 'Taxonrank',\n", + " 'TaxonDB','TaxonDBID','TaxonDBURL']].drop_duplicates().head())" ] }, { "cell_type": "markdown", - "id": "0026620e", + "id": "adcf607d", "metadata": {}, "source": [ - "## Remap filtering status" + "## Remap Sediment types\n", + "We use again the same **IMFA** (Inspect, Match, Fix, Apply) pattern to remap the HELCOM sediment types." ] }, { "cell_type": "markdown", - "id": "33ea63f3", + "id": "0f938d40", "metadata": {}, "source": [ - "HELCOM filtered status is encoded as follows in the `FILT` column:" + "Let's inspect the `SEDIMENT_TYPE.csv` file provided by HELCOM describing the sediment type nomenclature:" ] }, { "cell_type": "code", "execution_count": null, - "id": "5eacd28c", + "id": "d5f6b82a", "metadata": {}, "outputs": [ { @@ -4887,41 +4770,53 @@ " \n", " \n", " \n", - " index\n", - " value\n", + " SEDI\n", + " SEDIMENT TYPE\n", + " RECOMMENDED TO BE USED\n", " \n", " \n", " \n", " \n", " 0\n", - " 0\n", + " -99\n", + " NO DATA\n", " NaN\n", " \n", " \n", " 1\n", - " 1\n", - " F\n", + " 0\n", + " GRAVEL\n", + " YES\n", " \n", " \n", " 2\n", - " 2\n", - " n\n", + " 1\n", + " SAND\n", + " YES\n", " \n", " \n", " 3\n", + " 2\n", + " FINE SAND\n", + " NO\n", + " \n", + " \n", + " 4\n", " 3\n", - " N\n", + " SILT\n", + " YES\n", " \n", " \n", "\n", "" ], "text/plain": [ - " index value\n", - "0 0 NaN\n", - "1 1 F\n", - "2 2 n\n", - "3 3 N" + " SEDI SEDIMENT TYPE RECOMMENDED TO BE USED\n", + "0 -99 NO DATA NaN\n", + "1 0 GRAVEL YES\n", + "2 1 SAND YES\n", + "3 2 FINE SAND NO\n", + "4 3 SILT YES" ] }, "execution_count": null, @@ 
-4931,24 +4826,69 @@ ], "source": [ "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "get_unique_across_dfs(dfs, col_name='FILT', as_df=True).head(5)" + "pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv').head()" ] }, { "cell_type": "markdown", - "id": "703ee067", + "id": "05762600", "metadata": {}, "source": [ - "While MARIS uses a different encoding for filtered status:" + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: The `SEDI` values `56` and `73` are not found in the `SEDIMENT_TYPE.csv` lookup table provided. Note also there are many `nan` values in the `SEDIMENT_TYPE.csv` file.\n", + "\n", + "We reassign them to `-99` for now but should be clarified/fixed. This is demonstrated below.\n", + "\n", + ":::" ] }, { "cell_type": "code", "execution_count": null, - "id": "34e737e8", + "id": "fbc6540f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing SEDI values: {56.0, nan, 73.0}\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "df_sed_lut = pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv')\n", + "dfs = load_data(fname_in)\n", + "\n", + "sediment_sedi = set(dfs['sediment'].SEDI.unique())\n", + "lookup_sedi = set(df_sed_lut['SEDI'])\n", + "missing = sediment_sedi - lookup_sedi\n", + "print(f\"Missing SEDI values: {missing if missing else 'None'}\")" + ] + }, + { + "cell_type": "markdown", + "id": "34f305d9", + "metadata": {}, + "source": [ + "Let's try to match as many as possible:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac413a89", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing: 100%|██████████| 47/47 [00:00<00:00, 141.16it/s]\n" + ] + }, { "data": { "text/html": [ @@ -4970,41 +4910,46 @@ " \n", " \n", " \n", - " id\n", - " name\n", + " matched_maris_name\n", + " source_name\n", + " match_score\n", + " \n", + " \n", + " source_key\n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - 
" 0\n", - " -1\n", - " Not applicable\n", + " -99\n", + " Soft\n", + " NO DATA\n", + " 5\n", " \n", " \n", - " 1\n", - " 0\n", - " Not available\n", + " 50\n", + " Mud and gravel\n", + " MUD AND GARVEL\n", + " 2\n", " \n", " \n", - " 2\n", + " 46\n", + " Glacial clay\n", + " CLACIAL CLAY\n", " 1\n", - " Yes\n", - " \n", - " \n", - " 3\n", - " 2\n", - " No\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id name\n", - "0 -1 Not applicable\n", - "1 0 Not available\n", - "2 1 Yes\n", - "3 2 No" + " matched_maris_name source_name match_score\n", + "source_key \n", + "-99 Soft NO DATA 5\n", + " 50 Mud and gravel MUD AND GARVEL 2\n", + " 46 Glacial clay CLACIAL CLAY 1" ] }, "execution_count": null, @@ -5014,180 +4959,45 @@ ], "source": [ "#| eval: false\n", - "pd.read_excel(filtered_lut_path())" - ] - }, - { - "cell_type": "markdown", - "id": "37fbf457", - "metadata": {}, - "source": [ - "For only four categories to remap, the `Remapper` is an overkill. We can use a simple dictionary to map the values:" + "remapper = Remapper(provider_lut_df=pd.read_csv(Path(fname_in)/'SEDIMENT_TYPE.csv'),\n", + " maris_lut_fn=sediments_lut_path,\n", + " maris_col_id='sedtype_id',\n", + " maris_col_name='sedtype',\n", + " provider_col_to_match='SEDIMENT TYPE',\n", + " provider_col_key='SEDI',\n", + " fname_cache='sediments_helcom.pkl'\n", + " )\n", + "\n", + "remapper.generate_lookup_table(as_df=True)\n", + "remapper.select_match(match_score_threshold=1)" ] }, { "cell_type": "code", "execution_count": null, - "id": "3d2b4bbc", + "id": "a9bbc268", "metadata": {}, "outputs": [], "source": [ "#| exports\n", - "lut_filtered = {\n", - " 'N': 2,\n", - " 'n': 2,\n", - " 'F': 1\n", + "fixes_sediments = {\n", + " 'NO DATA': '(Not available)'\n", "}" ] }, - { - "cell_type": "markdown", - "id": "b43ea425", - "metadata": {}, - "source": [ - "`RemapFiltCB` converts the HELCOM `FILT` format to the MARIS `FILT` format." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e8f58336", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "class RemapFiltCB(Callback):\n", - " \"Lookup FILT value in dataframe using the lookup table.\"\n", - " def __init__(self,\n", - " lut_filtered: dict=lut_filtered, # Dictionary mapping FILT codes to their corresponding names\n", - " ):\n", - " fc.store_attr()\n", - "\n", - " def __call__(self, tfm):\n", - " for df in tfm.dfs.values():\n", - " if 'FILT' in df.columns:\n", - " df['FILT'] = df['FILT'].map(lambda x: self.lut_filtered.get(x, 0))" - ] - }, - { - "cell_type": "markdown", - "id": "719feb2c", - "metadata": {}, - "source": [ - "For instance:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2d13536", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0 2 1]\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[RemapFiltCB(lut_filtered)])\n", - "\n", - "print(tfm()['seawater']['FILT'].unique())" - ] - }, - { - "cell_type": "markdown", - "id": "c2e5ef74", - "metadata": {}, - "source": [ - "## Add Sample Laboratory code" - ] - }, - { - "cell_type": "markdown", - "id": "b3a02de8", - "metadata": {}, - "source": [ - "Sample Laboratory code is currently stored in MARIS master DB but not encoded as NetCDF variable. Decision to include it in the NetCDF output is TBD." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f29d5b8", - "metadata": {}, - "outputs": [], - "source": [ - "# | exports\n", - "class AddSampleLabCodeCB(Callback):\n", - " \"Remap `KEY` column to `samplabcode` in each DataFrame.\"\n", - " def __call__(self, tfm: Transformer):\n", - " for grp in tfm.dfs:\n", - " self._remap_sample_id(tfm.dfs[grp])\n", - " \n", - " def _remap_sample_id(self, df: pd.DataFrame):\n", - " df['samplabcode'] = df['KEY']" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "a13ddf94", + "id": "10fd41a0", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "['WKRIL2012003' 'WKRIL2012004' 'WKRIL2012005' ... 'WSSSM2021006'\n", - " 'WSSSM2021007' 'WSSSM2021008']\n", - " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", - "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", - "\n" + "Processing: 100%|██████████| 47/47 [00:00<00:00, 138.95it/s]\n" ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[\n", - " AddSampleLabCodeCB(),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", - "\n", - "print(tfm()['seawater']['samplabcode'].unique())\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n" - ] - }, - { - "cell_type": "markdown", - "id": "fe0fb210", - "metadata": {}, - "source": [ - "## Add measurement note" - ] - }, - { - "cell_type": "markdown", - "id": "9c05383c", - "metadata": {}, - "source": [ - "The HELCOM dataset includes a look-up table `ANALYSIS_METHOD.csv` capturing the measurement method used as described by HELCOM. 
For instance:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0985b9e8", - "metadata": {}, - "outputs": [ + }, { "data": { "text/html": [ @@ -5209,53 +5019,39 @@ " \n", " \n", " \n", - " METHOD\n", - " DESCRIPTION\n", - " COUNTRY\n", - " \n", - " \n", - " \n", - " \n", - " 0\n", - " BFFG01\n", - " Gammaspectrometric analysis with Germanium det...\n", - " 6\n", - " \n", - " \n", - " 1\n", - " BFFG02\n", - " Sr-90, a) Y-90 extraction method dried ash and...\n", - " 6\n", + " matched_maris_name\n", + " source_name\n", + " match_score\n", " \n", " \n", - " 2\n", - " BFFG03\n", - " Pu238, Pu239241; Ashing and and drying the tra...\n", - " 6\n", + " source_key\n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " 3\n", - " BFFG04\n", - " Am-241 (not to in use any more)\n", - " 6\n", + " 50\n", + " Mud and gravel\n", + " MUD AND GARVEL\n", + " 2\n", " \n", " \n", - " 4\n", - " CLOR01\n", - " 137Cs and 40K activity concentrations are dete...\n", - " 67\n", + " 46\n", + " Glacial clay\n", + " CLACIAL CLAY\n", + " 1\n", " \n", " \n", "\n", "" ], "text/plain": [ - " METHOD DESCRIPTION COUNTRY\n", - "0 BFFG01 Gammaspectrometric analysis with Germanium det... 6\n", - "1 BFFG02 Sr-90, a) Y-90 extraction method dried ash and... 6\n", - "2 BFFG03 Pu238, Pu239241; Ashing and and drying the tra... 6\n", - "3 BFFG04 Am-241 (not to in use any more) 6\n", - "4 CLOR01 137Cs and 40K activity concentrations are dete... 
class RemapSedimentCB(Callback):
    "Update sediment type id based on the MARIS sediment type LUT (dbo_sedtype.xlsx)."
    def __init__(self, 
                 fn_lut: Callable, # Function that returns the lookup table dictionary
                 ):
        fc.store_attr()

    def _fix_inconsistent_sedi(self, df:pd.DataFrame) -> pd.DataFrame:
        "Temporary fix for inconsistent SEDI values. Data provider to confirm and clarify."
        # Codes 56 and 73 and missing values are all reassigned to -99; the frame is modified in place.
        df['SEDI'] = df['SEDI'].replace({56: -99, 73: -99, np.nan: -99})
        return df
    
    def __call__(self, tfm: Transformer):
        "Remap sediment types in the DataFrame using the lookup table and handle specific replacements."
        lut = self.fn_lut()
        df = tfm.dfs['sediment']
        
        # Set SedRepName (TBC: what's used for?). Copied before the fix, so it keeps the SEDI values as provided.
        df['SedRepName'] = df['SEDI']
        
        df = self._fix_inconsistent_sedi(df)
        # Resolve each distinct SEDI code once, so that an unmatched code is reported once
        # instead of once per row.
        sed_types = {sedi: self._get_sediment_type(sedi, lut) for sedi in df['SEDI'].unique()}
        df['sed_type'] = df['SEDI'].map(sed_types)
        tfm.dfs['sediment'] = df

    def _get_sediment_type(self, 
                           sedi_value: int, # The `SEDI` value from the DataFrame
                           lut: dict # The lookup table dictionary
                          ) -> int: # The `matched_id` of the match (-1 when `sedi_value` is not in `lut`)
        "Get the matched_id from the lookup table and print SEDI if the matched_id is -1."
        match = lut.get(sedi_value, Match(-1, None, None, None))
        
        if match.matched_id == -1:
            self._print_unmatched_sedi(sedi_value)
        return match.matched_id

    def _print_unmatched_sedi(self, 
                              sedi_value: int # The `SEDI` value from the DataFrame
                             ) -> None:
        "Print the SEDI value if the matched_id is -1."
        print(f"Unmatched SEDI: {sedi_value}")
df['measurementnote'] = df['METHOD'].map(lambda x: lut.get(x, 0))" + "lut_sediments = lambda: Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv'),\n", + " maris_lut_fn=sediments_lut_path,\n", + " maris_col_id='sedtype_id',\n", + " maris_col_name='sedtype',\n", + " provider_col_to_match='SEDIMENT TYPE',\n", + " provider_col_key='SEDI',\n", + " fname_cache='sediments_helcom.pkl'\n", + " ).generate_lookup_table(fixes=fixes_sediments, as_df=False, overwrite=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f131e929", + "metadata": {}, + "source": [ + "Apply the transformer with the callback `RemapSedimentCB(lut_sediments)`. Then, print the unique `sed_type` values of the `sediment` dataframe." ] }, { "cell_type": "code", "execution_count": null, - "id": "e100431c", + "id": "16d42cb0", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0\n", - " 'Radiochemical method Radiocaesium separation from seawater samples.134+137Cs was adsorbed on AMP mat, dissolved with NaOH and after purification precipitated as chloroplatinate (Cs2PtCl6).Counting with low background anticoincidence beta counter.'\n", - " 'Radiochem. meth of Sr90. Precipation with oxalate and separation of calcium, barium, radium and ytrium couting with low background anticoincidence beta counter. 1982-1994'\n", - " 'For tritium liquid scintialtion counting, combined with electrolytic enrichment of analysed water samples, double distilled, before and after electrolysis in cells.
Liquid Scintillation spectrometer LKB Wallac model 1410'\n", - " 'Pretreatment drying (sediment, biota samples) and ashing (biota samples)or vaporization to 1000 ml (sea water samples), measured by gamma-spectrometry using HPGe detectors sediment, biota, sea water /Cs-137, Cs-134, K-40']\n", - " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", - "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", - "\n" - ] + "data": { + "text/plain": [ + "array([ 0, 2, 58, 30, 59, 55, 56, 36, 29, 47, 4, 54, 33, 6, 44, 42, 48,\n", + " 61, 57, 28, 49, 32, 45, 39, 46, 38, 31, 60, 62, 26, 53, 52, 1, 51,\n", + " 37, 34, 50, 7, 10, 41, 43, 35])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "#| eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[\n", - " AddMeasurementNoteCB(lut_method),\n", - " CompareDfsAndTfmCB(dfs)])\n", + "tfm = Transformer(dfs, cbs=[RemapSedimentCB(lut_sediments)])\n", "\n", - "tfm()\n", - "print(tfm.dfs['seawater']['measurementnote'].unique()[:5])\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" + "tfm()['sediment']['sed_type'].unique()" ] }, { "cell_type": "markdown", - "id": "b90fa59a", + "id": "c3a0add1", "metadata": {}, "source": [ - "## Add station" + "## Remap units" ] }, { "cell_type": "markdown", - "id": "0dfa0216", + "id": "7a4064ed", "metadata": {}, "source": [ - "*For MARIS master DB import only (not included in the NetCDF output).*" + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: The handling of unit types varies between `biota` and `sediment` sample types. 
For consistency and ease of use, it would be beneficial to have dedicated unit columns for all sample types.\n", + "\n", + ":::" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "768db093", + "cell_type": "markdown", + "id": "e6a682ac", "metadata": {}, - "outputs": [], "source": [ - "#| exports\n", - "class RemapStationIdCB(Callback):\n", - " \"Remap Station ID to MARIS format.\"\n", - " def __init__(self):\n", - " fc.store_attr()\n", - "\n", - " def __call__(self, tfm: Transformer):\n", - " \"Iterate through all DataFrames in the transformer object and remap `STATION` to `station_id`.\"\n", - " for grp in tfm.dfs.keys(): \n", - " tfm.dfs[grp]['station'] = tfm.dfs[grp]['STATION']" + "For `seawater` and `sediment` sample types, the HELCOM dataset refers to units directly in the names of certain columns, such as `VALUE_Bq/m³` or `VALUE_Bq/kg`. As for `biota`, the units are included in the `BASIS` column. This is shown below: " ] }, { "cell_type": "code", "execution_count": null, - "id": "0ccb2604", + "id": "cab93970", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", - "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", - "\n" + "biota: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'BASIS',\n", + " 'ERROR%', 'NUMBER', 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY',\n", + " 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY', 'STATION',\n", + " 'LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm',\n", + " 'LONGITUDE dddddd', 'SDEPTH', 'RUBIN', 'BIOTATYPE', 'TISSUE', 'NO',\n", + " 'LENGTH', 'WEIGHT', 'DW%', 'LOI%', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN',\n", + " 'DATE_OF_ENTRY_y'],\n", + " dtype='object')\n", + "sediment: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',\n", + " '< VALUE_Bq/m²',
'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',\n", + " 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',\n", + " 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',\n", + " 'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',\n", + " 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')\n", + "seawater: Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/m³', 'VALUE_Bq/m³', 'ERROR%_m³',\n", + " 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR',\n", + " 'MONTH', 'DAY', 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',\n", + " 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'TDEPTH', 'SDEPTH', 'SALIN',\n", + " 'TTEMP', 'FILT', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'DATE_OF_ENTRY_y'],\n", + " dtype='object')\n" ] + }, + { + "data": { + "text/plain": [ + "array(['W', nan, 'D', 'F'], dtype=object)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "#| eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[\n", - " RemapStationIdCB(),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", - "tfm()\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" - ] - }, - { - "cell_type": "markdown", - "id": "ff696fec", - "metadata": {}, - "source": [ - "## Add slice position (top and bottom)" + "for grp in ['biota', 'sediment', 'seawater']:\n", + " print(f\"{grp}: {dfs[grp].columns}\")\n", + " \n", + "dfs['biota']['BASIS'].unique()" ] }, { "cell_type": "markdown", - "id": "f615911d", + "id": "f7cbefe4", "metadata": {}, "source": [ - "*For MARIS master DB import only (not included in the NetCDF output).*" + "Given the inconsistent handling of units across sample types, we need to define custom mapping rules for standardizing the units. 
Below the MARIS unit types:" ] }, { "cell_type": "code", "execution_count": null, - "id": "cf398df9", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "class RemapSedSliceTopBottomCB(Callback):\n", - " \"Remap Sediment slice top and bottom to MARIS format.\"\n", - " def __call__(self, tfm: Transformer):\n", - " \"Iterate through all DataFrames in the transformer object and remap sediment slice top and bottom.\"\n", - " tfm.dfs['sediment']['top'] = tfm.dfs['sediment']['UPPSLI']\n", - " tfm.dfs['sediment']['bottom'] = tfm.dfs['sediment']['LOWSLI']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6479e6f3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " top bottom\n", - "0 15.0 20.0\n", - "1 20.0 27.0\n", - "2 0.0 2.0\n", - "3 2.0 4.0\n", - "4 4.0 6.0\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[RemapSedSliceTopBottomCB()])\n", - "tfm()\n", - "print(tfm.dfs['sediment'][['top','bottom']].head())" - ] - }, - { - "cell_type": "markdown", - "id": "5e4bbf53", - "metadata": {}, - "source": [ - "## Add dry to wet ratio" - ] - }, - { - "cell_type": "markdown", - "id": "bb091cc0", - "metadata": {}, - "source": [ - "*`DW%` is not included in the NetCDF output currently.*" - ] - }, - { - "cell_type": "markdown", - "id": "4735dd22", - "metadata": {}, - "source": [ - "HELCOM Description:\n", - "\n", - "**Sediment:**\n", - "1. DW%: DRY WEIGHT AS PERCENTAGE (%) OF FRESH WEIGHT.\n", - "2. VALUE_Bq/kg: Measured radioactivity concentration in Bq/kg dry wt. in scientific format(e.g. 123 = 1.23E+02, 0.076 = 7.6E-02)\n", - "\n", - "**Biota:**\n", - "1. WEIGHT: Average weight (in g) of specimen in the sample\n", - "2. 
DW%: DRY WEIGHT AS PERCENTAGE (%) OF FRESH WEIGHT" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ef385c79", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "class LookupDryWetRatio(Callback):\n", - " \"Lookup dry-wet ratio and format for MARIS.\"\n", - " def __call__(self, tfm: Transformer):\n", - " \"Iterate through all DataFrames in the transformer object and apply the dry-wet ratio lookup.\"\n", - " for grp in tfm.dfs.keys():\n", - " if 'DW%' in tfm.dfs[grp].columns:\n", - " self._apply_dry_wet_ratio(tfm.dfs[grp])\n", - "\n", - " def _apply_dry_wet_ratio(self, df: pd.DataFrame) -> None:\n", - " \"Apply dry-wet ratio conversion and formatting to the given DataFrame.\"\n", - " df['dry_wet_ratio'] = df['DW%']\n", - " # Convert 'DW%' = 0% to NaN.\n", - " df.loc[df['dry_wet_ratio'] == 0, 'dry_wet_ratio'] = np.NaN\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9d714bc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21216 39817 15827\n", - "Number of dropped rows 0 0 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", - "\n", - "0 18.453\n", - "1 18.453\n", - "2 18.453\n", - "3 18.453\n", - "4 18.458\n", - "Name: dry_wet_ratio, dtype: float64\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[\n", - " LookupDryWetRatio(),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", - "\n", - "tfm()\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", - "print(tfm.dfs['biota']['dry_wet_ratio'].head())\n" - ] - }, - { - "cell_type": "markdown", - "id": "963b9aa0", - "metadata": {}, - "source": [ - "## Standardize Coordinates" - ] - }, - { - "cell_type": "markdown", - "id": "d3203cb3", - "metadata": {}, - "source": [ - ":::{.callout-tip}\n", - 
"\n", - "**FEEDBACK TO DATA PROVIDER**: Column names for geographical coordinates are inconsistent across sample types (biota, sediment, seawater). Sometimes using parentheses, sometimes not.\n", - "\n", - ":::" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03c04fe9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "seawater: ['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)']\n", - "sediment: ['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)']\n", - "biota: ['LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm', 'LONGITUDE dddddd']\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "for grp in dfs.keys():\n", - " print(f'{grp}: {[col for col in dfs[grp].columns if \"LON\" in col or \"LAT\" in col]}')" - ] - }, - { - "cell_type": "markdown", - "id": "7150dcb6", - "metadata": {}, - "source": [ - ":::{.callout-tip}\n", - "\n", - "**FEEDBACK TO DATA PROVIDER**: \n", - "\n", - "- Geographical coordinates are provided in both decimal degree and degree-minute formats. 
Some coordinates are missing the decimal format and obliged us to use the degree-minute format with less precision.\n", - "- Also note that latitude values have `,` as decimal separator while longitude values have `.` as decimal separator (see below)\n", - "\n", - ":::" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "484b281b", + "id": "12a86baf", "metadata": {}, "outputs": [ { @@ -5639,302 +5283,231 @@ " \n", " \n", " \n", - " LATITUDE (ddmmmm)\n", - " LATITUDE (dddddd)\n", + " unit_id\n", + " unit\n", + " unit_sanitized\n", " \n", " \n", " \n", " \n", " 0\n", - " 59.400\n", - " 59,6667\n", + " -1\n", + " Not applicable\n", + " Not applicable\n", " \n", " \n", " 1\n", - " 59.400\n", - " 59,6667\n", + " 0\n", + " NOT AVAILABLE\n", + " NOT AVAILABLE\n", " \n", " \n", " 2\n", - " 59.516\n", - " 59,86\n", + " 1\n", + " Bq/m3\n", + " Bq per m3\n", " \n", " \n", " 3\n", - " 59.516\n", - " 59,86\n", + " 2\n", + " Bq/m2\n", + " Bq per m2\n", " \n", " \n", " 4\n", - " 59.516\n", - " 59,86\n", + " 3\n", + " Bq/kg\n", + " Bq per kg\n", " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " LATITUDE (ddmmmm) LATITUDE (dddddd)\n", - "0 59.400 59,6667\n", - "1 59.400 59,6667\n", - "2 59.516 59,86\n", - "3 59.516 59,86\n", - "4 59.516 59,86" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| eval: false\n", - "dfs['sediment'][['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)']].head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61afcc23", - "metadata": {}, - "outputs": [], - "source": [ - "#| exports\n", - "class ParseCoordinates(Callback):\n", - " \"\"\"\n", - " Get geographical coordinates from columns expressed in degrees decimal format \n", - " or from columns in degrees/minutes decimal format where degrees decimal format is missing.\n", - " \"\"\"\n", - " def __init__(self, \n", - " fn_convert_cor: Callable # Function that converts coordinates from degree-minute to 
decimal degree format\n", - " ):\n", - " self.fn_convert_cor = fn_convert_cor\n", - "\n", - " def __call__(self, tfm:Transformer):\n", - " for df in tfm.dfs.values():\n", - " self._format_coordinates(df)\n", - "\n", - " def _format_coordinates(self, df:pd.DataFrame) -> None:\n", - " coord_cols = self._get_coord_columns(df.columns)\n", - " \n", - " for coord in ['lat', 'lon']:\n", - " decimal_col, minute_col = coord_cols[f'{coord}_d'], coord_cols[f'{coord}_m']\n", - " \n", - " condition = df[decimal_col].isna() | (df[decimal_col] == 0)\n", - " df[coord] = np.where(condition,\n", - " df[minute_col].apply(self._safe_convert),\n", - " df[decimal_col])\n", - " \n", - " df.dropna(subset=['lat', 'lon'], inplace=True)\n", - "\n", - " def _get_coord_columns(self, columns) -> dict:\n", - " return {\n", - " 'lon_d': self._find_coord_column(columns, 'LON', 'dddddd'),\n", - " 'lat_d': self._find_coord_column(columns, 'LAT', 'dddddd'),\n", - " 'lon_m': self._find_coord_column(columns, 'LON', 'ddmmmm'),\n", - " 'lat_m': self._find_coord_column(columns, 'LAT', 'ddmmmm')\n", - " }\n", - "\n", - " def _find_coord_column(self, columns, coord_type, coord_format) -> str:\n", - " pattern = re.compile(f'{coord_type}.*{coord_format}', re.IGNORECASE)\n", - " matching_columns = [col for col in columns if pattern.search(col)]\n", - " return matching_columns[0] if matching_columns else None\n", - "\n", - " def _safe_convert(self, value) -> str:\n", - " if pd.isna(value):\n", - " return value\n", - " try:\n", - " return self.fn_convert_cor(value)\n", - " except Exception as e:\n", - " print(f\"Error converting value {value}: {e}\")\n", - " return value" + " \n", + " 5\n", + " 4\n", + " Bq/kgd\n", + " Bq per kgd\n", + " \n", + " \n", + " 6\n", + " 5\n", + " Bq/kgw\n", + " Bq per kgw\n", + " \n", + " \n", + " 7\n", + " 6\n", + " kg/kg\n", + " kg per kg\n", + " \n", + " \n", + " 8\n", + " 7\n", + " TU\n", + " TU\n", + " \n", + " \n", + " 9\n", + " 8\n", + " DELTA/mill\n", + " DELTA per mill\n", + 
" \n", + " \n", + " 10\n", + " 9\n", + " atom/kg\n", + " atom per kg\n", + " \n", + " \n", + " 11\n", + " 10\n", + " atom/kgd\n", + " atom per kgd\n", + " \n", + " \n", + " 12\n", + " 11\n", + " atom/kgw\n", + " atom per kgw\n", + " \n", + " \n", + " 13\n", + " 12\n", + " atom/l\n", + " atom per l\n", + " \n", + " \n", + " 14\n", + " 13\n", + " Bq/kgC\n", + " Bq per kgC\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " unit_id unit unit_sanitized\n", + "0 -1 Not applicable Not applicable\n", + "1 0 NOT AVAILABLE NOT AVAILABLE\n", + "2 1 Bq/m3 Bq per m3\n", + "3 2 Bq/m2 Bq per m2\n", + "4 3 Bq/kg Bq per kg\n", + "5 4 Bq/kgd Bq per kgd\n", + "6 5 Bq/kgw Bq per kgw\n", + "7 6 kg/kg kg per kg\n", + "8 7 TU TU\n", + "9 8 DELTA/mill DELTA per mill\n", + "10 9 atom/kg atom per kg\n", + "11 10 atom/kgd atom per kgd\n", + "12 11 atom/kgw atom per kgw\n", + "13 12 atom/l atom per l\n", + "14 13 Bq/kgC Bq per kgC" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "pd.read_excel(unit_lut_path())[['unit_id', 'unit', 'unit_sanitized']]" + ] + }, + { + "cell_type": "markdown", + "id": "9ec28334", + "metadata": {}, + "source": [ + "We define unit names renaming rules from HELCOM in an **ad hoc** way for now:" ] }, { "cell_type": "code", "execution_count": null, - "id": "1baf7136", + "id": "ea7fa747", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21208 39816 15827\n", - "Number of dropped rows 8 1 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", - "\n", - " lat lon\n", - "0 54.283333 12.316667\n", - "1 54.283333 12.316667\n", - "2 54.283333 12.316667\n", - "3 54.283333 12.316667\n", - "4 54.283333 12.316667\n", - "... ... 
#| exports
# Ad hoc renaming rules from HELCOM to MARIS unit ids.
# 'seawater' and 'sediment' map to a single unit id; 'biota' maps each
# `BASIS` code to a unit id.
lut_units = {
    'seawater': 1,  # 'Bq/m3'
    'sediment': 4,  # 'Bq/kgd' for sediment
    'biota': {
        'D': 4,  # 'Bq/kgd'
        'W': 5,  # 'Bq/kgw'
        'F': 5   # 'Bq/kgw' (assumed to be 'Fresh', so set to wet)
    }
}


#| exports
class RemapUnitCB(Callback):
    "Set the `unit` id column in the DataFrames based on a lookup table."
    def __init__(self,
                 lut_units: dict=lut_units # Dictionary containing renaming rules for different unit categories
                 ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        "Assign a constant unit id to seawater/sediment and a `BASIS`-dependent unit id to any other group."
        for grp in tfm.dfs.keys():
            if grp in ['seawater', 'sediment']:
                tfm.dfs[grp]['unit'] = self.lut_units[grp]
            else:
                # Use the lookup stored on the instance, not the module-level
                # default, so that a custom `lut_units` passed to the
                # constructor is honoured for this branch as well.
                basis_to_unit = self.lut_units[grp]
                # Unknown or missing `BASIS` codes fall back to 0 ('NOT AVAILABLE').
                tfm.dfs[grp]['unit'] = tfm.dfs[grp]['BASIS'].apply(lambda x: basis_to_unit.get(x, 0))
Converts longitude & latitude `,` separator to `.` separator.\"" + "Apply the transformer for callback `RemapUnitCB()`. Then, print the unique `unit` for the `seawater` dataframe." ] }, { "cell_type": "code", "execution_count": null, - "id": "99a85059", + "id": "aa0f0abf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21208 39816 15827\n", - "Number of dropped rows 8 1 0\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", - "\n", - " lat lon\n", - "0 54.283333 12.316667\n", - "1 54.283333 12.316667\n", - "2 54.283333 12.316667\n", - "3 54.283333 12.316667\n", - "4 54.283333 12.316667\n", - "... ... ...\n", - "15822 60.373333 18.395667\n", - "15823 60.373333 18.395667\n", - "15824 60.503333 18.366667\n", - "15825 60.503333 18.366667\n", - "15826 60.503333 18.366667\n", - "\n", - "[15827 rows x 2 columns]\n" + "biota: [5 0 4]\n", + "sediment: [4]\n", + "seawater: [1]\n" ] } ], "source": [ "#| eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[\n", - " ParseCoordinates(ddmm_to_dd),\n", - " SanitizeLonLatCB(),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", + "tfm = Transformer(dfs, cbs=[RemapUnitCB()])\n", "\n", - "tfm()\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", - "print(tfm.dfs['biota'][['lat','lon']])\n" - ] - }, - { - "cell_type": "markdown", - "id": "47716bff", - "metadata": {}, - "source": [ - "## Review all callbacks" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b8a07959", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " seawater sediment biota\n", - "Number of rows in dfs 21216 39817 15827\n", - "Number of rows in tfm.dfs 21114 39531 15798\n", - "Number of dropped rows 102 286 29\n", - "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", - 
"\n" - ] - } - ], - "source": [ - "#| eval: false\n", - "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[\n", - " AddSampleTypeIdColumnCB(),\n", - " LowerStripNameCB(col_src='NUCLIDE'),\n", - " RemapNuclideNameCB(lut_nuclides),\n", - " AddNuclideIdColumnCB(col_value='NUCLIDE'),\n", - " ParseTimeCB(),\n", - " EncodeTimeCB(cfg()),\n", - " SanitizeValue(coi_val), \n", - " NormalizeUncCB(),\n", - " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", - " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", - " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", - " RemapTaxonInformationCB(lut_taxon),\n", - " RemapSedimentCB(lut_sediments),\n", - " RemapUnitCB(),\n", - " RemapDetectionLimitCB(coi_dl, lut_dl),\n", - " RemapFiltCB(lut_filtered),\n", - " AddSampleLabCodeCB(),\n", - " AddMeasurementNoteCB(lut_method),\n", - " RemapStationIdCB(),\n", - " RemapSedSliceTopBottomCB(),\n", - " LookupDryWetRatio(),\n", - " ParseCoordinates(ddmm_to_dd),\n", - " SanitizeLonLatCB(),\n", - " CompareDfsAndTfmCB(dfs)\n", - " ])\n", - "\n", - "tfm()\n", - "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n" + "for grp in ['biota', 'sediment', 'seawater']:\n", + " print(f\"{grp}: {tfm()[grp]['unit'].unique()}\")" ] }, { "cell_type": "markdown", - "id": "2f13c7a2", + "id": "5d978c67", "metadata": {}, "source": [ - "For instance, to inspect dropped rows:" + "## Remap detection limit\n", + "Detection limits are encoded as follows in MARIS:" ] }, { "cell_type": "code", "execution_count": null, - "id": "29baf65c", + "id": "f1b07268", "metadata": {}, "outputs": [ { @@ -5958,185 +5531,60 @@ " \n", " \n", " \n", - " KEY\n", - " NUCLIDE\n", - " METHOD\n", - " < VALUE_Bq/m³\n", - " VALUE_Bq/m³\n", - " ERROR%_m³\n", - " DATE_OF_ENTRY_x\n", - " COUNTRY\n", - " LABORATORY\n", - " SEQUENCE\n", - " ...\n", - " LONGITUDE (ddmmmm)\n", - " LONGITUDE (dddddd)\n", - " TDEPTH\n", - " SDEPTH\n", - " SALIN\n", - " TTEMP\n", - " FILT\n", - " 
MORS_SUBBASIN\n", - " HELCOM_SUBBASIN\n", - " DATE_OF_ENTRY_y\n", + " id\n", + " name\n", + " name_sanitized\n", " \n", " \n", " \n", " \n", - " 13439\n", - " WRISO2001025\n", - " CS137\n", - " RISO02\n", - " NaN\n", - " NaN\n", - " 10.0\n", - " NaN\n", - " 26.0\n", - " RISO\n", - " 2001025.0\n", - " ...\n", - " 10.500\n", - " 10.833333\n", - " 22.0\n", - " 20.0\n", - " 0.00\n", - " NaN\n", - " N\n", - " 5.0\n", - " 5.0\n", - " NaN\n", + " 0\n", + " -1\n", + " Not applicable\n", + " Not applicable\n", " \n", " \n", - " 14017\n", - " WLEPA2002001\n", - " CS134\n", - " LEPA02\n", - " <\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 93.0\n", - " LEPA\n", - " 2002001.0\n", - " ...\n", - " 21.030\n", - " 21.050000\n", - " 16.0\n", - " 0.0\n", - " 3.77\n", - " 14.40\n", - " N\n", - " 4.0\n", - " 9.0\n", - " NaN\n", + " 1\n", + " 0\n", + " Not Available\n", + " Not available\n", " \n", " \n", - " 14020\n", - " WLEPA2002002\n", - " CS134\n", - " LEPA02\n", - " <\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 93.0\n", - " LEPA\n", - " 2002004.0\n", - " ...\n", - " 20.574\n", - " 20.956667\n", - " 14.0\n", - " 0.0\n", - " 6.57\n", - " 11.95\n", - " N\n", - " 4.0\n", - " 9.0\n", - " NaN\n", + " 2\n", + " 1\n", + " =\n", + " Detected value\n", " \n", " \n", - " 14023\n", - " WLEPA2002003\n", - " CS134\n", - " LEPA02\n", + " 3\n", + " 2\n", " <\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 93.0\n", - " LEPA\n", - " 2002007.0\n", - " ...\n", - " 19.236\n", - " 19.393333\n", - " 73.0\n", - " 0.0\n", - " 7.00\n", - " 9.19\n", - " N\n", - " 4.0\n", - " 9.0\n", - " NaN\n", + " Detection limit\n", " \n", " \n", - " 14026\n", - " WLEPA2002004\n", - " CS134\n", - " LEPA02\n", - " <\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 93.0\n", - " LEPA\n", - " 2002010.0\n", - " ...\n", - " 20.205\n", - " 20.341700\n", - " 47.0\n", - " 0.0\n", - " 7.06\n", - " 8.65\n", - " N\n", - " 4.0\n", - " 9.0\n", - " NaN\n", + " 4\n", + " 3\n", + " ND\n", + " Not detected\n", + " \n", + " \n", + " 5\n", + " 4\n", + 
" DE\n", + " Derived\n", " \n", " \n", "\n", - "

5 rows × 27 columns

#| exports
def lut_dl() -> dict:
    "Return the detection limit lookup table mapping each `name` to its `id`."
    # Only the two columns needed for the mapping are read from the Excel file.
    lut_df = pd.read_excel(detection_limit_lut_path(), usecols=['name','id'])
    return lut_df.set_index('name').to_dict()['id']
"cell_type": "markdown", - "id": "af441203", + "id": "3023ddb4", "metadata": {}, "source": [ - "> Column names are standardized to MARIS NetCDF format (i.e. PEP8 ). " + "Based on columns of interest for each sample type:" ] }, { "cell_type": "code", "execution_count": null, - "id": "66e7bfc7", + "id": "2dc43c01", "metadata": {}, "outputs": [], "source": [ "#| exports\n", - "def get_common_rules(\n", - " vars: dict, # Configuration dictionary\n", - " encoding_type: str # Encoding type (`netcdf` or `openrefine`)\n", - " ) -> dict: # Common renaming rules for NetCDF and OpenRefine.\n", - " \"Get common renaming rules for NetCDF and OpenRefine.\"\n", - " common = {\n", - " 'KEY': 'key',\n", - " 'lat': 'latitude' if encoding_type == 'openrefine' else vars['defaults']['lat']['name'],\n", - " 'lon': 'longitude' if encoding_type == 'openrefine' else vars['defaults']['lon']['name'],\n", - " 'time': 'begperiod' if encoding_type == 'openrefine' else vars['defaults']['time']['name'],\n", - " 'NUCLIDE': 'nuclide_id' if encoding_type == 'openrefine' else 'nuclide',\n", - " 'detection_limit': 'detection' if encoding_type == 'openrefine' else vars['suffixes']['detection_limit']['name'],\n", - " 'unit': 'unit_id' if encoding_type == 'openrefine' else vars['suffixes']['unit']['name'],\n", - " 'value': 'activity' if encoding_type == 'openrefine' else 'value',\n", - " 'uncertainty': 'uncertaint' if encoding_type == 'openrefine' else vars['suffixes']['uncertainty']['name'],\n", - " 'SDEPTH': 'sampdepth' if encoding_type == 'openrefine' else vars['defaults']['smp_depth']['name'],\n", - " 'TDEPTH': 'totdepth' if encoding_type == 'openrefine' else vars['defaults']['tot_depth']['name'],\n", - " }\n", - " \n", - " if encoding_type == 'openrefine':\n", - " common.update({\n", - " 'samptype_id': 'samptype_id',\n", - " 'station': 'station',\n", - " 'samplabcode': 'samplabcode',\n", - " 'SALIN': 'salinity',\n", - " 'TTEMP': 'temperatur',\n", - " 'FILT': 'filtered',\n", - " 'measurenote': 
#| exports
# Columns of interest per sample type:
#   'val': activity value column
#   'unc': uncertainty column
#   'dl' : detection limit ("<") column
coi_dl = {
    'seawater': {
        'val': 'VALUE_Bq/m³',
        'unc': 'ERROR%_m³',
        'dl': '< VALUE_Bq/m³',
    },
    'biota': {
        'val': 'VALUE_Bq/kg',
        'unc': 'ERROR%',
        'dl': '< VALUE_Bq/kg',
    },
    'sediment': {
        'val': 'VALUE_Bq/kg',
        'unc': 'ERROR%_kg',
        'dl': '< VALUE_Bq/kg',
    },
}
'body_part': 'bodypar_id',\n", - " 'dry_wet_ratio': 'percentwt',\n", - " },\n", - " 'sediment': {\n", - " 'sed_type': 'sedtype_id',\n", - " 'top': 'sliceup',\n", - " 'bottom': 'slicedown',\n", - " 'SedRepName': 'SedRepName',\n", - " 'dry_wet_ratio': 'percentwt',\n", - " }\n", - " }" + "We follow the following business logic to encode the detection limit:" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "cbfc4bf7", + "cell_type": "markdown", + "id": "f6f4784b", "metadata": {}, - "outputs": [], "source": [ - "#| exports\n", - "def get_renaming_rules(\n", - " encoding_type: str = 'netcdf' # Encoding type (`netcdf` or `openrefine`)\n", - " ) -> dict: # Renaming rules for NetCDF and OpenRefine.\n", - " \"Get renaming rules for NetCDF and OpenRefine.\"\n", - " vars = cdl_cfg()['vars']\n", - " \n", - " if encoding_type not in ['netcdf', 'openrefine']:\n", - " raise ValueError(\"Invalid encoding_type provided. Please use 'netcdf' or 'openrefine'.\")\n", - " \n", - " common_rules = get_common_rules(vars, encoding_type)\n", - " specific_rules = get_specific_rules(vars, encoding_type)\n", - " \n", - " rules = defaultdict(dict)\n", - " for sample_type in ['seawater', 'biota', 'sediment']:\n", - " rules[sample_type] = common_rules.copy()\n", - " rules[sample_type].update(specific_rules.get(sample_type, {}))\n", - " \n", - " return dict(rules)" + "`RemapDetectionLimitCB` creates a `detection_limit` column with values determined as follows:\n", + "1. Perform a lookup with the appropriate columns value type (or detection limit) columns (`< VALUE_Bq/m³` or `< VALUE_Bq/kg`) against the table returned from the function `get_detectionlimit_lut`.\n", + "2. If `< VALUE_Bq/m³` or `< VALUE_Bq/kg` is NaN but both activity values (`VALUE_Bq/m³` or `VALUE_Bq/kg`) and standard uncertainty (`ERROR%_m³`, `ERROR%`, or `ERROR%_kg`) are provided, then assign the ID of `1` (i.e. \"Detected value\").\n", + "3. 
# | exports
# TO BE REFACTORED
class RemapDetectionLimitCB(Callback):
    "Remap value type to MARIS format."
    def __init__(self,
                 coi: dict, # Configuration options for column names
                 fn_lut: Callable # Function that returns a lookup table
                 ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        "Build the lookup table once and encode the detection limit of every DataFrame."
        lut = self.fn_lut()
        for grp, df in tfm.dfs.items():
            self._update_detection_limit(df, grp, lut)

    def _update_detection_limit(self,
                                df: pd.DataFrame, # The DataFrame to modify
                                grp: str, # The group name to get the column configuration
                                lut: dict # The lookup table dictionary
                                ) -> None:
        "Create the `detection_limit` column in place from the group's columns of interest and `lut`."
        cols = self.coi[grp]
        dl_col, val_col, unc_col = cols['dl'], cols['val'], cols['unc']

        # Start from the raw detection limit flags.
        df['detection_limit'] = df[dl_col]

        # Rows whose flag is not a known lookup key but that carry both a
        # value and an uncertainty are flagged '='.
        has_value_and_unc = df[val_col].notna() & df[unc_col].notna()
        is_unknown = ~df['detection_limit'].isin(lut.keys())
        df.loc[has_value_and_unc & is_unknown, 'detection_limit'] = '='

        # Whatever is still not a known lookup key becomes 'Not Available'.
        is_unknown = ~df['detection_limit'].isin(lut.keys())
        df.loc[is_unknown, 'detection_limit'] = 'Not Available'

        # Translate the flags through the lookup table.
        df['detection_limit'] = df['detection_limit'].map(lut)
) -> OrderedDict: # Renaming rules applicable to the specified group\n", - " \"Retrieve and merge renaming rules for the specified group based on the encoding type.\"\n", - " relevant_rules = [rules for key, rules in renaming_rules.items() if group in key]\n", - " merged_rules = OrderedDict()\n", - " for rules in relevant_rules:\n", - " merged_rules.update(rules)\n", - " return merged_rules\n", - "\n", - " def _apply_renaming(self, \n", - " df: pd.DataFrame, # DataFrame to modify\n", - " rename_rules: OrderedDict # Renaming rules\n", - " ) -> tuple: # (Renamed and filtered df, Column names from renaming rules that were not found in the DataFrame)\n", - " \"\"\"\n", - " Select columns based on renaming rules and apply renaming, only for existing columns\n", - " while maintaining the order of the dictionary columns.\"\"\"\n", - " existing_columns = set(df.columns)\n", - " valid_rules = OrderedDict((old_col, new_col) for old_col, new_col in rename_rules.items() if old_col in existing_columns)\n", - "\n", - " # Create a list to maintain the order of columns\n", - " columns_to_keep = [col for col in rename_rules.keys() if col in existing_columns]\n", - " columns_to_keep += [new_col for old_col, new_col in valid_rules.items() if new_col in df.columns]\n", - "\n", - " df = df[list(OrderedDict.fromkeys(columns_to_keep))]\n", - "\n", - " # Apply renaming\n", - " df.rename(columns=valid_rules, inplace=True)\n", - "\n", - " # Determine which keys were not found\n", - " not_found_keys = set(rename_rules.keys()) - existing_columns\n", - " return df, not_found_keys\n" + "for grp in ['biota', 'sediment', 'seawater']:\n", + " print(f\"{grp}: {tfm()[grp]['detection_limit'].unique()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "0026620e", + "metadata": {}, + "source": [ + "## Remap filtering status" + ] + }, + { + "cell_type": "markdown", + "id": "33ea63f3", + "metadata": {}, + "source": [ + "HELCOM filtered status is encoded as follows in the `FILT` column:" ] }, { 
"cell_type": "code", "execution_count": null, - "id": "9a4a8682-672f-4188-9091-821b727b4764", + "id": "5eacd28c", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "seawater columns:\n", - "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", - " 'smp_depth', 'tot_depth', '_sal', '_temp'],\n", - " dtype='object')\n", - "sediment columns:\n", - "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", - " 'tot_depth', 'sed_type', 'top', 'bottom'],\n", - " dtype='object')\n", - "biota columns:\n", - "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", - " 'smp_depth', 'species', 'body_part', 'bio_group'],\n", - " dtype='object')\n" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexvalue
00F
11N
22n
33NaN
\n", + "
" + ], + "text/plain": [ + " index value\n", + "0 0 F\n", + "1 1 N\n", + "2 2 n\n", + "3 3 NaN" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "#| eval: false\n", "dfs = load_data(fname_in)\n", - "tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),\n", - " LowerStripNameCB(col_src='NUCLIDE'),\n", - " RemapNuclideNameCB(lut_nuclides),\n", - " AddNuclideIdColumnCB(col_value='NUCLIDE'),\n", - " ParseTimeCB(),\n", - " EncodeTimeCB(cfg()),\n", - " SanitizeValue(coi_val), \n", - " NormalizeUncCB(),\n", - " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", - " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", - " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", - " RemapTaxonInformationCB(lut_taxon),\n", - " RemapSedimentCB(lut_sediments),\n", - " RemapUnitCB(),\n", - " RemapDetectionLimitCB(coi_dl, lut_dl),\n", - " RemapFiltCB(lut_filtered),\n", - " AddSampleLabCodeCB(),\n", - " AddMeasurementNoteCB(lut_method),\n", - " RemapStationIdCB(),\n", - " RemapSedSliceTopBottomCB(),\n", - " LookupDryWetRatio(),\n", - " ParseCoordinates(ddmm_to_dd),\n", - " SanitizeLonLatCB(),\n", - " CompareDfsAndTfmCB(dfs),\n", - " SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),\n", - " ])\n", - "\n", - "tfm()\n", - "for grp in tfm.dfs.keys():\n", - " print(f'{grp} columns:')\n", - " print(tfm.dfs[grp].columns)" + "get_unique_across_dfs(dfs, col_name='FILT', as_df=True).head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "703ee067", + "metadata": {}, + "source": [ + "While MARIS uses a different encoding for filtered status:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34e737e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idname
0-1Not applicable
10Not available
21Yes
32No
\n", + "
" + ], + "text/plain": [ + " id name\n", + "0 -1 Not applicable\n", + "1 0 Not available\n", + "2 1 Yes\n", + "3 2 No" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "pd.read_excel(filtered_lut_path())" + ] + }, + { + "cell_type": "markdown", + "id": "37fbf457", + "metadata": {}, + "source": [ + "For only four categories to remap, the `Remapper` is an overkill. We can use a simple dictionary to map the values:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d2b4bbc", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "lut_filtered = {\n", + " 'N': 2,\n", + " 'n': 2,\n", + " 'F': 1\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "b43ea425", + "metadata": {}, + "source": [ + "`RemapFiltCB` converts the HELCOM `FILT` format to the MARIS `FILT` format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8f58336", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class RemapFiltCB(Callback):\n", + " \"Lookup FILT value in dataframe using the lookup table.\"\n", + " def __init__(self,\n", + " lut_filtered: dict=lut_filtered, # Dictionary mapping FILT codes to their corresponding names\n", + " ):\n", + " fc.store_attr()\n", + "\n", + " def __call__(self, tfm):\n", + " for df in tfm.dfs.values():\n", + " if 'FILT' in df.columns:\n", + " df['FILT'] = df['FILT'].map(lambda x: self.lut_filtered.get(x, 0))" + ] + }, + { + "cell_type": "markdown", + "id": "719feb2c", + "metadata": {}, + "source": [ + "For instance:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2d13536", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 2 1]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[RemapFiltCB(lut_filtered)])\n", + "\n", + 
"print(tfm()['seawater']['FILT'].unique())" + ] + }, + { + "cell_type": "markdown", + "id": "c2e5ef74", + "metadata": {}, + "source": [ + "## Add Sample Laboratory code" + ] + }, + { + "cell_type": "markdown", + "id": "b3a02de8", + "metadata": {}, + "source": [ + "Sample Laboratory code is currently stored in MARIS master DB but not encoded as NetCDF variable. Decision to include it in the NetCDF output is TBD." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f29d5b8", + "metadata": {}, + "outputs": [], + "source": [ + "# | exports\n", + "class AddSampleLabCodeCB(Callback):\n", + " \"Remap `KEY` column to `samplabcode` in each DataFrame.\"\n", + " def __call__(self, tfm: Transformer):\n", + " for grp in tfm.dfs:\n", + " self._remap_sample_id(tfm.dfs[grp])\n", + " \n", + " def _remap_sample_id(self, df: pd.DataFrame):\n", + " df['samplabcode'] = df['KEY']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a13ddf94", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['WKRIL2012003' 'WKRIL2012004' 'WKRIL2012005' ... 
'WSSSM2021006'\n", + " 'WSSSM2021007' 'WSSSM2021008']\n", + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of dropped rows 0 0 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " AddSampleLabCodeCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "print(tfm()['seawater']['samplabcode'].unique())\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n" + ] + }, + { + "cell_type": "markdown", + "id": "fe0fb210", + "metadata": {}, + "source": [ + "## Add measurement note" + ] + }, + { + "cell_type": "markdown", + "id": "9c05383c", + "metadata": {}, + "source": [ + "The HELCOM dataset includes a look-up table `ANALYSIS_METHOD.csv` capturing the measurement method used as described by HELCOM. For instance:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0985b9e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
METHODDESCRIPTIONCOUNTRY
0BFFG01Gammaspectrometric analysis with Germanium det...6
1BFFG02Sr-90, a) Y-90 extraction method dried ash and...6
2BFFG03Pu238, Pu239241; Ashing and and drying the tra...6
3BFFG04Am-241 (not to in use any more)6
4CLOR01137Cs and 40K activity concentrations are dete...67
\n", + "
" + ], + "text/plain": [ + " METHOD DESCRIPTION COUNTRY\n", + "0 BFFG01 Gammaspectrometric analysis with Germanium det... 6\n", + "1 BFFG02 Sr-90, a) Y-90 extraction method dried ash and... 6\n", + "2 BFFG03 Pu238, Pu239241; Ashing and and drying the tra... 6\n", + "3 BFFG04 Am-241 (not to in use any more) 6\n", + "4 CLOR01 137Cs and 40K activity concentrations are dete... 67" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "pd.read_csv(Path(fname_in) / 'ANALYSIS_METHOD.csv').head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d9976e2", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "lut_method = lambda: pd.read_csv(Path(fname_in) / 'ANALYSIS_METHOD.csv').set_index('METHOD').to_dict()['DESCRIPTION']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "016db0d9", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class AddMeasurementNoteCB(Callback):\n", + " \"Record measurement notes by adding a 'measurenote' column to DataFrames.\"\n", + " def __init__(self, \n", + " fn_lut: Callable # Function that returns the lookup dictionary with `METHOD` as key and `DESCRIPTION` as value\n", + " ):\n", + " fc.store_attr()\n", + " \n", + " def __call__(self, tfm: Transformer):\n", + " lut = self.fn_lut()\n", + " for df in tfm.dfs.values():\n", + " if 'METHOD' in df.columns:\n", + " df['measurementnote'] = df['METHOD'].map(lambda x: lut.get(x, 0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e100431c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0\n", + " 'Radiochemical method Radiocaesium separation from seawater samples.134+137Cs was adsorbed on AMP mat, dissolved with NaOH and after purification precipitated as chloroplatinate (Cs2PtCl6).Counting with low background anticoincidence beta counter.'\n", + " 'Radiochem. 
meth of Sr90. Precipation with oxalate and separation of calcium, barium, radium and ytrium couting with low background anticoincidence beta counter. 1982-1994'\n", + " 'For tritium liquid scintialtion counting, combined with electrolytic enrichment of analysed water samples, double distilled, before and after electrolysis in cells. Liquid Scintillation spectrometer LKB Wallac model 1410'\n", + " 'Pretreatment drying (sediment, biota samples) and ashing (biota samples)or vaporization to 1000 ml (sea water samples), measured by gamma-spectrometry using HPGe detectors sediment, biota, sea water /Cs-137, Cs-134, K-40']\n", + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of dropped rows 0 0 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " AddMeasurementNoteCB(lut_method),\n", + " CompareDfsAndTfmCB(dfs)])\n", + "\n", + "tfm()\n", + "print(tfm.dfs['seawater']['measurementnote'].unique()[:5])\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" + ] + }, + { + "cell_type": "markdown", + "id": "b90fa59a", + "metadata": {}, + "source": [ + "## Add station" + ] + }, + { + "cell_type": "markdown", + "id": "0dfa0216", + "metadata": {}, + "source": [ + "*For MARIS master DB import only (not included in the NetCDF output).*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "768db093", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class RemapStationIdCB(Callback):\n", + " \"Remap Station ID to MARIS format.\"\n", + " def __init__(self):\n", + " fc.store_attr()\n", + "\n", + " def __call__(self, tfm: Transformer):\n", + " \"Iterate through all DataFrames in the transformer object and remap `STATION` to `station_id`.\"\n", + " for grp in tfm.dfs.keys(): \n", + " 
tfm.dfs[grp]['station'] = tfm.dfs[grp]['STATION']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ccb2604", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21216 39817 15827\n", + "Number of dropped rows 0 0 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " RemapStationIdCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')" + ] + }, + { + "cell_type": "markdown", + "id": "ff696fec", + "metadata": {}, + "source": [ + "## Add slice position (top and bottom)" + ] + }, + { + "cell_type": "markdown", + "id": "f615911d", + "metadata": {}, + "source": [ + "*For MARIS master DB import only (not included in the NetCDF output).*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf398df9", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class RemapSedSliceTopBottomCB(Callback):\n", + " \"Remap Sediment slice top and bottom to MARIS format.\"\n", + " def __call__(self, tfm: Transformer):\n", + " \"Iterate through all DataFrames in the transformer object and remap sediment slice top and bottom.\"\n", + " tfm.dfs['sediment']['top'] = tfm.dfs['sediment']['UPPSLI']\n", + " tfm.dfs['sediment']['bottom'] = tfm.dfs['sediment']['LOWSLI']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6479e6f3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " top bottom\n", + "0 15.0 20.0\n", + "1 20.0 27.0\n", + "2 0.0 2.0\n", + "3 2.0 4.0\n", + "4 4.0 6.0\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, 
cbs=[RemapSedSliceTopBottomCB()])\n", + "tfm()\n", + "print(tfm.dfs['sediment'][['top','bottom']].head())" + ] + }, + { + "cell_type": "markdown", + "id": "5e4bbf53", + "metadata": {}, + "source": [ + "## Add dry to wet ratio" + ] + }, + { + "cell_type": "markdown", + "id": "bb091cc0", + "metadata": {}, + "source": [ + "*`DW%` is not included in the NetCDF output currently.*" + ] + }, + { + "cell_type": "markdown", + "id": "4735dd22", + "metadata": {}, + "source": [ + "HELCOM Description:\n", + "\n", + "**Sediment:**\n", + "1. DW%: DRY WEIGHT AS PERCENTAGE (%) OF FRESH WEIGHT.\n", + "2. VALUE_Bq/kg: Measured radioactivity concentration in Bq/kg dry wt. in scientific format(e.g. 123 = 1.23E+02, 0.076 = 7.6E-02)\n", + "\n", + "**Biota:**\n", + "1. WEIGHT: Average weight (in g) of specimen in the sample\n", + "2. DW%: DRY WEIGHT AS PERCENTAGE (%) OF FRESH WEIGHT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef385c79", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class LookupDryWetRatio(Callback):\n", + " \"Lookup dry-wet ratio and format for MARIS.\"\n", + " def __call__(self, tfm: Transformer):\n", + " \"Iterate through all DataFrames in the transformer object and apply the dry-wet ratio lookup.\"\n", + " for grp in tfm.dfs.keys():\n", + " if 'DW%' in tfm.dfs[grp].columns:\n", + " self._apply_dry_wet_ratio(tfm.dfs[grp])\n", + "\n", + " def _apply_dry_wet_ratio(self, df: pd.DataFrame) -> None:\n", + " \"Apply dry-wet ratio conversion and formatting to the given DataFrame.\"\n", + " df['dry_wet_ratio'] = df['DW%']\n", + " # Convert 'DW%' = 0% to NaN.\n", + " df.loc[df['dry_wet_ratio'] == 0, 'dry_wet_ratio'] = np.NaN\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9d714bc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21216 39817 
15827\n", + "Number of dropped rows 0 0 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n", + "0 18.453\n", + "1 18.453\n", + "2 18.453\n", + "3 18.453\n", + "4 18.458\n", + "Name: dry_wet_ratio, dtype: float64\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " LookupDryWetRatio(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", + "print(tfm.dfs['biota']['dry_wet_ratio'].head())\n" + ] + }, + { + "cell_type": "markdown", + "id": "963b9aa0", + "metadata": {}, + "source": [ + "## Standardize Coordinates" + ] + }, + { + "cell_type": "markdown", + "id": "d3203cb3", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: Column names for geographical coordinates are inconsistent across sample types (biota, sediment, seawater). Sometimes using parentheses, sometimes not.\n", + "\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03c04fe9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "seawater: ['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)']\n", + "sediment: ['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)']\n", + "biota: ['LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm', 'LONGITUDE dddddd']\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "for grp in dfs.keys():\n", + " print(f'{grp}: {[col for col in dfs[grp].columns if \"LON\" in col or \"LAT\" in col]}')" + ] + }, + { + "cell_type": "markdown", + "id": "7150dcb6", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: \n", + "\n", + "- Geographical coordinates are provided in both decimal degree and degree-minute formats. 
Some records lack the decimal-degree value, which obliges us to fall back on the less precise degree-minute format.\n", + "- Also note that latitude values use `,` as the decimal separator while longitude values use `.` (see below).\n", + "\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "484b281b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LATITUDE (ddmmmm)LATITUDE (dddddd)
059.40059,6667
159.40059,6667
259.51659,86
359.51659,86
459.51659,86
\n", + "
" + ], + "text/plain": [ + " LATITUDE (ddmmmm) LATITUDE (dddddd)\n", + "0 59.400 59,6667\n", + "1 59.400 59,6667\n", + "2 59.516 59,86\n", + "3 59.516 59,86\n", + "4 59.516 59,86" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "dfs['sediment'][['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)']].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61afcc23", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class ParseCoordinates(Callback):\n", + " \"\"\"\n", + " Get geographical coordinates from columns expressed in degrees decimal format \n", + " or from columns in degrees/minutes decimal format where degrees decimal format is missing.\n", + " \"\"\"\n", + " def __init__(self, \n", + " fn_convert_cor: Callable # Function that converts coordinates from degree-minute to decimal degree format\n", + " ):\n", + " self.fn_convert_cor = fn_convert_cor\n", + "\n", + " def __call__(self, tfm:Transformer):\n", + " for df in tfm.dfs.values():\n", + " self._format_coordinates(df)\n", + "\n", + " def _format_coordinates(self, df:pd.DataFrame) -> None:\n", + " coord_cols = self._get_coord_columns(df.columns)\n", + " \n", + " for coord in ['lat', 'lon']:\n", + " decimal_col, minute_col = coord_cols[f'{coord}_d'], coord_cols[f'{coord}_m']\n", + " \n", + " condition = df[decimal_col].isna() | (df[decimal_col] == 0)\n", + " df[coord] = np.where(condition,\n", + " df[minute_col].apply(self._safe_convert),\n", + " df[decimal_col])\n", + " \n", + " df.dropna(subset=['lat', 'lon'], inplace=True)\n", + "\n", + " def _get_coord_columns(self, columns) -> dict:\n", + " return {\n", + " 'lon_d': self._find_coord_column(columns, 'LON', 'dddddd'),\n", + " 'lat_d': self._find_coord_column(columns, 'LAT', 'dddddd'),\n", + " 'lon_m': self._find_coord_column(columns, 'LON', 'ddmmmm'),\n", + " 'lat_m': self._find_coord_column(columns, 'LAT', 'ddmmmm')\n", + " }\n", + "\n", 
+ " def _find_coord_column(self, columns, coord_type, coord_format) -> str:\n", + " pattern = re.compile(f'{coord_type}.*{coord_format}', re.IGNORECASE)\n", + " matching_columns = [col for col in columns if pattern.search(col)]\n", + " return matching_columns[0] if matching_columns else None\n", + "\n", + " def _safe_convert(self, value) -> str:\n", + " if pd.isna(value):\n", + " return value\n", + " try:\n", + " return self.fn_convert_cor(value)\n", + " except Exception as e:\n", + " print(f\"Error converting value {value}: {e}\")\n", + " return value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1baf7136", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21208 39816 15827\n", + "Number of dropped rows 8 1 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n", + " lat lon\n", + "0 54.283333 12.316667\n", + "1 54.283333 12.316667\n", + "2 54.283333 12.316667\n", + "3 54.283333 12.316667\n", + "4 54.283333 12.316667\n", + "... ... ...\n", + "15822 60.373333 18.395667\n", + "15823 60.373333 18.395667\n", + "15824 60.503333 18.366667\n", + "15825 60.503333 18.366667\n", + "15826 60.503333 18.366667\n", + "\n", + "[15827 rows x 2 columns]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[ \n", + " ParseCoordinates(ddmm_to_dd),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", + "print(tfm.dfs['biota'][['lat','lon']])" + ] + }, + { + "cell_type": "markdown", + "id": "754289f1", + "metadata": {}, + "source": [ + ":::{.callout-tip}\n", + "\n", + "**FEEDBACK TO DATA PROVIDER**: Some samples have (lon, lat): (0, 0) or are outside lon/lat possible values. 
\n", + "\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "id": "5a055628", + "metadata": {}, + "source": [ + "Sanitizing coordinates drops a row when both longitude & latitude equal 0 or when the row contains unrealistic longitude & latitude values. It also converts the longitude & latitude `,` decimal separator to `.`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99a85059", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21208 39816 15827\n", + "Number of dropped rows 8 1 0\n", + "Number of rows in tfm.dfs + Number of dropped rows 21216 39817 15827 \n", + "\n", + " lat lon\n", + "0 54.283333 12.316667\n", + "1 54.283333 12.316667\n", + "2 54.283333 12.316667\n", + "3 54.283333 12.316667\n", + "4 54.283333 12.316667\n", + "... ... ...\n", + "15822 60.373333 18.395667\n", + "15823 60.373333 18.395667\n", + "15824 60.503333 18.366667\n", + "15825 60.503333 18.366667\n", + "15826 60.503333 18.366667\n", + "\n", + "[15827 rows x 2 columns]\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " ParseCoordinates(ddmm_to_dd),\n", + " SanitizeLonLatCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n", + "print(tfm.dfs['biota'][['lat','lon']])\n" + ] + }, + { + "cell_type": "markdown", + "id": "47716bff", + "metadata": {}, + "source": [ + "## Review all callbacks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8a07959", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " seawater sediment biota\n", + "Number of rows in dfs 21216 39817 15827\n", + "Number of rows in tfm.dfs 21114 39531 15798\n", + "Number of dropped rows 102 286 29\n", + "Number of rows in tfm.dfs + Number of 
dropped rows 21216 39817 15827 \n", + "\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[\n", + " AddSampleTypeIdColumnCB(),\n", + " LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE'),\n", + " ParseTimeCB(),\n", + " EncodeTimeCB(cfg()),\n", + " SanitizeValue(coi_val), \n", + " NormalizeUncCB(),\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", + " RemapTaxonInformationCB(lut_taxon),\n", + " RemapSedimentCB(lut_sediments),\n", + " RemapUnitCB(),\n", + " RemapDetectionLimitCB(coi_dl, lut_dl),\n", + " RemapFiltCB(lut_filtered),\n", + " AddSampleLabCodeCB(),\n", + " AddMeasurementNoteCB(lut_method),\n", + " RemapStationIdCB(),\n", + " RemapSedSliceTopBottomCB(),\n", + " LookupDryWetRatio(),\n", + " ParseCoordinates(ddmm_to_dd),\n", + " SanitizeLonLatCB(),\n", + " CompareDfsAndTfmCB(dfs)\n", + " ])\n", + "\n", + "tfm()\n", + "print(pd.DataFrame.from_dict(tfm.compare_stats) , '\\n')\n" + ] + }, + { + "cell_type": "markdown", + "id": "2f13c7a2", + "metadata": {}, + "source": [ + "For instance, to inspect dropped rows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29baf65c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
KEYNUCLIDEMETHOD< VALUE_Bq/m³VALUE_Bq/m³ERROR%_m³DATE_OF_ENTRY_xCOUNTRYLABORATORYSEQUENCE...LONGITUDE (ddmmmm)LONGITUDE (dddddd)TDEPTHSDEPTHSALINTTEMPFILTMORS_SUBBASINHELCOM_SUBBASINDATE_OF_ENTRY_y
13439WRISO2001025CS137RISO02NaNNaN10.0NaN26.0RISO2001025.0...10.50010.83333322.020.00.00NaNN5.05.0NaN
14017WLEPA2002001CS134LEPA02<NaNNaNNaN93.0LEPA2002001.0...21.03021.05000016.00.03.7714.40N4.09.0NaN
14020WLEPA2002002CS134LEPA02<NaNNaNNaN93.0LEPA2002004.0...20.57420.95666714.00.06.5711.95N4.09.0NaN
14023WLEPA2002003CS134LEPA02<NaNNaNNaN93.0LEPA2002007.0...19.23619.39333373.00.07.009.19N4.09.0NaN
14026WLEPA2002004CS134LEPA02<NaNNaNNaN93.0LEPA2002010.0...20.20520.34170047.00.07.068.65N4.09.0NaN
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " KEY NUCLIDE METHOD < VALUE_Bq/m³ VALUE_Bq/m³ ERROR%_m³ \\\n", + "13439 WRISO2001025 CS137 RISO02 NaN NaN 10.0 \n", + "14017 WLEPA2002001 CS134 LEPA02 < NaN NaN \n", + "14020 WLEPA2002002 CS134 LEPA02 < NaN NaN \n", + "14023 WLEPA2002003 CS134 LEPA02 < NaN NaN \n", + "14026 WLEPA2002004 CS134 LEPA02 < NaN NaN \n", + "\n", + " DATE_OF_ENTRY_x COUNTRY LABORATORY SEQUENCE ... LONGITUDE (ddmmmm) \\\n", + "13439 NaN 26.0 RISO 2001025.0 ... 10.500 \n", + "14017 NaN 93.0 LEPA 2002001.0 ... 21.030 \n", + "14020 NaN 93.0 LEPA 2002004.0 ... 20.574 \n", + "14023 NaN 93.0 LEPA 2002007.0 ... 19.236 \n", + "14026 NaN 93.0 LEPA 2002010.0 ... 20.205 \n", + "\n", + " LONGITUDE (dddddd) TDEPTH SDEPTH SALIN TTEMP FILT MORS_SUBBASIN \\\n", + "13439 10.833333 22.0 20.0 0.00 NaN N 5.0 \n", + "14017 21.050000 16.0 0.0 3.77 14.40 N 4.0 \n", + "14020 20.956667 14.0 0.0 6.57 11.95 N 4.0 \n", + "14023 19.393333 73.0 0.0 7.00 9.19 N 4.0 \n", + "14026 20.341700 47.0 0.0 7.06 8.65 N 4.0 \n", + "\n", + " HELCOM_SUBBASIN DATE_OF_ENTRY_y \n", + "13439 5.0 NaN \n", + "14017 9.0 NaN \n", + "14020 9.0 NaN \n", + "14023 9.0 NaN \n", + "14026 9.0 NaN \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfm.dfs_dropped['seawater'].head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e17f6685", + "metadata": {}, + "source": [ + "## Rename columns of interest for NetCDF or Open Refine" + ] + }, + { + "cell_type": "markdown", + "id": "af441203", + "metadata": {}, + "source": [ + "> Column names are standardized to MARIS NetCDF format (i.e. PEP8 ). 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66e7bfc7", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def get_common_rules(\n", + " vars: dict, # Configuration dictionary\n", + " encoding_type: str # Encoding type (`netcdf` or `openrefine`)\n", + " ) -> dict: # Common renaming rules for NetCDF and OpenRefine.\n", + " \"Get common renaming rules for NetCDF and OpenRefine.\"\n", + " common = {\n", + " 'KEY': 'key',\n", + " 'lat': 'latitude' if encoding_type == 'openrefine' else vars['defaults']['lat']['name'],\n", + " 'lon': 'longitude' if encoding_type == 'openrefine' else vars['defaults']['lon']['name'],\n", + " 'time': 'begperiod' if encoding_type == 'openrefine' else vars['defaults']['time']['name'],\n", + " 'NUCLIDE': 'nuclide_id' if encoding_type == 'openrefine' else 'nuclide',\n", + " 'detection_limit': 'detection' if encoding_type == 'openrefine' else vars['suffixes']['detection_limit']['name'],\n", + " 'unit': 'unit_id' if encoding_type == 'openrefine' else vars['suffixes']['unit']['name'],\n", + " 'value': 'activity' if encoding_type == 'openrefine' else 'value',\n", + " 'uncertainty': 'uncertaint' if encoding_type == 'openrefine' else vars['suffixes']['uncertainty']['name'],\n", + " 'SDEPTH': 'sampdepth' if encoding_type == 'openrefine' else vars['defaults']['smp_depth']['name'],\n", + " 'TDEPTH': 'totdepth' if encoding_type == 'openrefine' else vars['defaults']['tot_depth']['name'],\n", + " }\n", + " \n", + " if encoding_type == 'openrefine':\n", + " common.update({\n", + " 'samptype_id': 'samptype_id',\n", + " 'station': 'station',\n", + " 'samplabcode': 'samplabcode',\n", + " 'SALIN': 'salinity',\n", + " 'TTEMP': 'temperatur',\n", + " 'FILT': 'filtered',\n", + " 'measurenote': 'measurenote'\n", + " })\n", + " else:\n", + " common.update({\n", + " 'counting_method': vars['suffixes']['counting_method']['name'],\n", + " 'sampling_method': vars['suffixes']['sampling_method']['name'],\n", + " 
'preparation_method': vars['suffixes']['preparation_method']['name'],\n", + " 'SALIN': vars['suffixes']['salinity']['name'],\n", + " 'TTEMP': vars['suffixes']['temperature']['name'],\n", + " })\n", + " \n", + " return common" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bc3002a", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def get_specific_rules(\n", + " vars: dict, # Configuration dictionary\n", + " encoding_type: str # Encoding type (`netcdf` or `openrefine`)\n", + " ) -> dict: # Specific renaming rules for NetCDF and OpenRefine.\n", + " \"Get specific renaming rules for NetCDF and OpenRefine.\"\n", + " if encoding_type == 'netcdf':\n", + " return {\n", + " 'biota': {\n", + " 'species': vars['bio']['species']['name'],\n", + " 'body_part': vars['bio']['body_part']['name'],\n", + " 'bio_group': vars['bio']['bio_group']['name']\n", + " },\n", + " 'sediment': {\n", + " 'sed_type': vars['sed']['sed_type']['name'],\n", + " 'top': vars['sed']['top']['name'],\n", + " 'bottom': vars['sed']['bottom']['name'],\n", + " }\n", + " }\n", + " elif encoding_type == 'openrefine':\n", + " return {\n", + " 'biota': {\n", + " 'species': 'species_id',\n", + " 'Taxonname': 'Taxonname',\n", + " 'TaxonRepName': 'TaxonRepName',\n", + " 'Taxonrank': 'Taxonrank',\n", + " 'TaxonDB': 'TaxonDB',\n", + " 'TaxonDBID': 'TaxonDBID',\n", + " 'TaxonDBURL': 'TaxonDBURL',\n", + " 'body_part': 'bodypar_id',\n", + " 'dry_wet_ratio': 'percentwt',\n", + " },\n", + " 'sediment': {\n", + " 'sed_type': 'sedtype_id',\n", + " 'top': 'sliceup',\n", + " 'bottom': 'slicedown',\n", + " 'SedRepName': 'SedRepName',\n", + " 'dry_wet_ratio': 'percentwt',\n", + " }\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbfc4bf7", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def get_renaming_rules(\n", + " encoding_type: str = 'netcdf' # Encoding type (`netcdf` or `openrefine`)\n", + " ) -> dict: # Renaming rules 
for NetCDF and OpenRefine.\n", + " \"Get renaming rules for NetCDF and OpenRefine.\"\n", + " vars = cdl_cfg()['vars']\n", + " \n", + " if encoding_type not in ['netcdf', 'openrefine']:\n", + " raise ValueError(\"Invalid encoding_type provided. Please use 'netcdf' or 'openrefine'.\")\n", + " \n", + " common_rules = get_common_rules(vars, encoding_type)\n", + " specific_rules = get_specific_rules(vars, encoding_type)\n", + " \n", + " rules = defaultdict(dict)\n", + " for sample_type in ['seawater', 'biota', 'sediment']:\n", + " rules[sample_type] = common_rules.copy()\n", + " rules[sample_type].update(specific_rules.get(sample_type, {}))\n", + " \n", + " return dict(rules)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b7476af", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "class SelectAndRenameColumnCB(Callback):\n", + " \"Select and rename columns in a DataFrame based on renaming rules for a specified encoding type.\"\n", + " def __init__(self, \n", + " fn_renaming_rules: Callable, # A function that returns an OrderedDict of renaming rules \n", + " encoding_type: str='netcdf', # The encoding type (`netcdf` or `openrefine`) to determine which renaming rules to use\n", + " verbose: bool=False # Whether to print out renaming rules that were not applied\n", + " ):\n", + " fc.store_attr()\n", + "\n", + " def __call__(self, tfm: Transformer):\n", + " \"Apply column selection and renaming to DataFrames in the transformer, and identify unused rules.\"\n", + " try:\n", + " renaming_rules = self.fn_renaming_rules(self.encoding_type)\n", + " except ValueError as e:\n", + " print(f\"Error fetching renaming rules: {e}\")\n", + " return\n", + "\n", + " for group in tfm.dfs.keys():\n", + " # Get relevant renaming rules for the current group\n", + " group_rules = self._get_group_rules(renaming_rules, group)\n", + "\n", + " if not group_rules:\n", + " continue\n", + "\n", + " # Apply renaming rules and track keys not found in the 
DataFrame\n", + " df = tfm.dfs[group]\n", + " df, not_found_keys = self._apply_renaming(df, group_rules)\n", + " tfm.dfs[group] = df\n", + " \n", + " # Print any renaming rules that were not used\n", + " if not_found_keys and self.verbose:\n", + " print(f\"\\nGroup '{group}' has the following renaming rules not applied:\")\n", + " for old_col in not_found_keys:\n", + " print(f\"Key '{old_col}' from renaming rules was not found in the DataFrame.\")\n", + "\n", + " def _get_group_rules(self, \n", + " renaming_rules: OrderedDict, # Renaming rules\n", + " group: str # Group name to filter rules\n", + " ) -> OrderedDict: # Renaming rules applicable to the specified group\n", + " \"Retrieve and merge renaming rules for the specified group based on the encoding type.\"\n", + " relevant_rules = [rules for key, rules in renaming_rules.items() if group in key]\n", + " merged_rules = OrderedDict()\n", + " for rules in relevant_rules:\n", + " merged_rules.update(rules)\n", + " return merged_rules\n", + "\n", + " def _apply_renaming(self, \n", + " df: pd.DataFrame, # DataFrame to modify\n", + " rename_rules: OrderedDict # Renaming rules\n", + " ) -> tuple: # (Renamed and filtered df, Column names from renaming rules that were not found in the DataFrame)\n", + " \"\"\"\n", + " Select columns based on renaming rules and apply renaming, only for existing columns\n", + " while maintaining the order of the dictionary columns.\"\"\"\n", + " existing_columns = set(df.columns)\n", + " valid_rules = OrderedDict((old_col, new_col) for old_col, new_col in rename_rules.items() if old_col in existing_columns)\n", + "\n", + " # Create a list to maintain the order of columns\n", + " columns_to_keep = [col for col in rename_rules.keys() if col in existing_columns]\n", + " columns_to_keep += [new_col for old_col, new_col in valid_rules.items() if new_col in df.columns]\n", + "\n", + " df = df[list(OrderedDict.fromkeys(columns_to_keep))]\n", + "\n", + " # Apply renaming\n", + " 
df.rename(columns=valid_rules, inplace=True)\n", + "\n", + " # Determine which keys were not found\n", + " not_found_keys = set(rename_rules.keys()) - existing_columns\n", + " return df, not_found_keys\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a4a8682-672f-4188-9091-821b727b4764", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "seawater columns:\n", + "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", + " 'smp_depth', 'tot_depth', '_sal', '_temp'],\n", + " dtype='object')\n", + "sediment columns:\n", + "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", + " 'tot_depth', 'sed_type', 'top', 'bottom'],\n", + " dtype='object')\n", + "biota columns:\n", + "Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',\n", + " 'smp_depth', 'species', 'body_part', 'bio_group'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "dfs = load_data(fname_in)\n", + "tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),\n", + " LowerStripNameCB(col_src='NUCLIDE'),\n", + " RemapNuclideNameCB(lut_nuclides),\n", + " AddNuclideIdColumnCB(col_value='NUCLIDE'),\n", + " ParseTimeCB(),\n", + " EncodeTimeCB(cfg()),\n", + " SanitizeValue(coi_val), \n", + " NormalizeUncCB(),\n", + " RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),\n", + " RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),\n", + " RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),\n", + " RemapTaxonInformationCB(lut_taxon),\n", + " RemapSedimentCB(lut_sediments),\n", + " RemapUnitCB(),\n", + " RemapDetectionLimitCB(coi_dl, lut_dl),\n", + " RemapFiltCB(lut_filtered),\n", + " AddSampleLabCodeCB(),\n", + " AddMeasurementNoteCB(lut_method),\n", + " RemapStationIdCB(),\n", + " RemapSedSliceTopBottomCB(),\n", + " LookupDryWetRatio(),\n", + " ParseCoordinates(ddmm_to_dd),\n", + " 
SanitizeLonLatCB(),\n", + " CompareDfsAndTfmCB(dfs),\n", + " SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),\n", + " ])\n", + "\n", + "tfm()\n", + "for grp in tfm.dfs.keys():\n", + " print(f'{grp} columns:')\n", + " print(tfm.dfs[grp].columns)" + ] + }, + { + "cell_type": "markdown", + "id": "685ff3d0", + "metadata": {}, + "source": [ + "### Uniqueness on seawater" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b495b6ab", + "metadata": {}, + "outputs": [], + "source": [ + "result = tfm.dfs['seawater']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c368be62", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keylatlontimenuclide_dl_unitvalue_uncsmp_depthtot_depth_sal_temp
0WKRIL201200360.083329.33331337731200cs137115.31.6960.0NaNNaNNaN
1WKRIL201200460.083329.33331337731200cs1371119.93.98029.0NaNNaNNaN
2WKRIL201200559.433323.15001339891200cs1371125.55.1000.0NaNNaNNaN
3WKRIL201200660.250027.98331337817600cs1371117.04.9300.0NaNNaNNaN
4WKRIL201200760.250027.98331337817600cs1371122.23.99639.0NaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " key lat lon time nuclide _dl _unit value \\\n", + "0 WKRIL2012003 60.0833 29.3333 1337731200 cs137 1 1 5.3 \n", + "1 WKRIL2012004 60.0833 29.3333 1337731200 cs137 1 1 19.9 \n", + "2 WKRIL2012005 59.4333 23.1500 1339891200 cs137 1 1 25.5 \n", + "3 WKRIL2012006 60.2500 27.9833 1337817600 cs137 1 1 17.0 \n", + "4 WKRIL2012007 60.2500 27.9833 1337817600 cs137 1 1 22.2 \n", + "\n", + " _unc smp_depth tot_depth _sal _temp \n", + "0 1.696 0.0 NaN NaN NaN \n", + "1 3.980 29.0 NaN NaN NaN \n", + "2 5.100 0.0 NaN NaN NaN \n", + "3 4.930 0.0 NaN NaN NaN \n", + "4 3.996 39.0 NaN NaN NaN " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f32e3ec5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# duplicates: 11689 out of 21114\n" + ] + } + ], + "source": [ + "cols_idx = ['lat', 'lon', 'time', 'smp_depth']\n", + "print(f'# duplicates: {result[cols_idx].duplicated().sum()} out of {result.shape[0]}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31e66259", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# duplicates: 153 out of 21114\n" + ] + } + ], + "source": [ + "cols_idx = ['lat', 'lon', 'time', 'smp_depth', 'nuclide']\n", + "print(f'# duplicates: {result[cols_idx].duplicated().sum()} out of {result.shape[0]}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c6a9533", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
latlontimesmp_depthnuclidevaluekey
79754.833317.534361421760016.0cs13417.0000WCLOR1989028
79854.833317.534361421760016.0cs137109.0000WCLOR1989028
79954.833317.534361421760016.0sr9021.4000WCLOR1989028
80354.833317.534361421760016.0cs13416.0000WCLOR1989030
80454.833317.534361421760016.0cs13790.0000WCLOR1989030
80554.833317.534361421760016.0sr9020.7000WCLOR1989030
208865.300024.00005304960000.0cs13447.0000WDHIG1986357
208965.300024.00005304960000.0cs137108.0000WDHIG1986357
212963.500021.00005305824000.0cs13475.0000WDHIG1986367
213063.500021.00005305824000.0cs137151.0000WDHIG1986367
217261.066719.70005306688000.0cs134260.0000WDHIG1986377
217361.066719.70005306688000.0cs137499.0000WDHIG1986377
218861.066719.7000530668800130.0cs13462.0000WDHIG1986381
218961.066719.7000530668800130.0cs137126.0000WDHIG1986381
249354.83339.898355503360025.0cs13413.9000WDHIG1987257
249454.83339.898355503360025.0cs13763.6000WDHIG1987257
250354.750010.883355503360013.0cs13416.8000WDHIG1987270
250454.750010.883355503360013.0cs13767.9000WDHIG1987270
250555.416710.95175550336000.0am2410.0010WDHIG1987271
250655.416710.95175550336000.0cm2420.0105WDHIG1987271
\n", + "
" + ], + "text/plain": [ + " lat lon time smp_depth nuclide value key\n", + "797 54.8333 17.5343 614217600 16.0 cs134 17.0000 WCLOR1989028\n", + "798 54.8333 17.5343 614217600 16.0 cs137 109.0000 WCLOR1989028\n", + "799 54.8333 17.5343 614217600 16.0 sr90 21.4000 WCLOR1989028\n", + "803 54.8333 17.5343 614217600 16.0 cs134 16.0000 WCLOR1989030\n", + "804 54.8333 17.5343 614217600 16.0 cs137 90.0000 WCLOR1989030\n", + "805 54.8333 17.5343 614217600 16.0 sr90 20.7000 WCLOR1989030\n", + "2088 65.3000 24.0000 530496000 0.0 cs134 47.0000 WDHIG1986357\n", + "2089 65.3000 24.0000 530496000 0.0 cs137 108.0000 WDHIG1986357\n", + "2129 63.5000 21.0000 530582400 0.0 cs134 75.0000 WDHIG1986367\n", + "2130 63.5000 21.0000 530582400 0.0 cs137 151.0000 WDHIG1986367\n", + "2172 61.0667 19.7000 530668800 0.0 cs134 260.0000 WDHIG1986377\n", + "2173 61.0667 19.7000 530668800 0.0 cs137 499.0000 WDHIG1986377\n", + "2188 61.0667 19.7000 530668800 130.0 cs134 62.0000 WDHIG1986381\n", + "2189 61.0667 19.7000 530668800 130.0 cs137 126.0000 WDHIG1986381\n", + "2493 54.8333 9.8983 555033600 25.0 cs134 13.9000 WDHIG1987257\n", + "2494 54.8333 9.8983 555033600 25.0 cs137 63.6000 WDHIG1987257\n", + "2503 54.7500 10.8833 555033600 13.0 cs134 16.8000 WDHIG1987270\n", + "2504 54.7500 10.8833 555033600 13.0 cs137 67.9000 WDHIG1987270\n", + "2505 55.4167 10.9517 555033600 0.0 am241 0.0010 WDHIG1987271\n", + "2506 55.4167 10.9517 555033600 0.0 cm242 0.0105 WDHIG1987271" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result[cols_idx + ['value', 'key']][result[cols_idx].duplicated(keep=False)][:20]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3ddf0bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keylatlontimenuclide_dl_unitvalue_uncsmp_depthtot_depth_sal_temp
797WCLOR198902854.833317.5343614217600cs1341117.00004.5900016.019.07.62NaN
798WCLOR198902854.833317.5343614217600cs13711109.000014.1700016.019.07.62NaN
799WCLOR198902854.833317.5343614217600sr901121.40003.6380016.019.07.62NaN
803WCLOR198903054.833317.5343614217600cs1341116.00004.3200016.019.07.62NaN
804WCLOR198903054.833317.5343614217600cs1371190.000011.7000016.019.07.62NaN
805WCLOR198903054.833317.5343614217600sr901120.70003.5190016.019.07.62NaN
2088WDHIG198635765.300024.0000530496000cs1341147.000041.830000.025.03.674.2
2089WDHIG198635765.300024.0000530496000cs13711108.000050.760000.025.03.674.2
2129WDHIG198636763.500021.0000530582400cs1341175.00008.250000.021.04.866.2
2130WDHIG198636763.500021.0000530582400cs13711151.00007.550000.021.04.866.2
2172WDHIG198637761.066719.7000530668800cs13411260.0000150.800000.0137.05.965.5
2173WDHIG198637761.066719.7000530668800cs13711499.0000124.750000.0137.05.965.5
2188WDHIG198638161.066719.7000530668800cs1341162.000050.22000130.0137.06.053.4
2189WDHIG198638161.066719.7000530668800cs13711126.000049.14000130.0137.06.053.4
2493WDHIG198725754.83339.8983555033600cs1341113.90002.0850025.027.023.097.9
2494WDHIG198725754.83339.8983555033600cs1371163.600022.2600025.027.023.097.9
2503WDHIG198727054.750010.8833555033600cs1341116.800015.6240013.015.021.3413.0
2504WDHIG198727054.750010.8833555033600cs1371167.900014.2590013.015.021.3413.0
2505WDHIG198727155.416710.9517555033600am241010.0010NaN0.026.019.3014.2
2506WDHIG198727155.416710.9517555033600cm242110.01050.004410.026.019.3014.2
\n", + "
" + ], + "text/plain": [ + " key lat lon time nuclide _dl _unit value \\\n", + "797 WCLOR1989028 54.8333 17.5343 614217600 cs134 1 1 17.0000 \n", + "798 WCLOR1989028 54.8333 17.5343 614217600 cs137 1 1 109.0000 \n", + "799 WCLOR1989028 54.8333 17.5343 614217600 sr90 1 1 21.4000 \n", + "803 WCLOR1989030 54.8333 17.5343 614217600 cs134 1 1 16.0000 \n", + "804 WCLOR1989030 54.8333 17.5343 614217600 cs137 1 1 90.0000 \n", + "805 WCLOR1989030 54.8333 17.5343 614217600 sr90 1 1 20.7000 \n", + "2088 WDHIG1986357 65.3000 24.0000 530496000 cs134 1 1 47.0000 \n", + "2089 WDHIG1986357 65.3000 24.0000 530496000 cs137 1 1 108.0000 \n", + "2129 WDHIG1986367 63.5000 21.0000 530582400 cs134 1 1 75.0000 \n", + "2130 WDHIG1986367 63.5000 21.0000 530582400 cs137 1 1 151.0000 \n", + "2172 WDHIG1986377 61.0667 19.7000 530668800 cs134 1 1 260.0000 \n", + "2173 WDHIG1986377 61.0667 19.7000 530668800 cs137 1 1 499.0000 \n", + "2188 WDHIG1986381 61.0667 19.7000 530668800 cs134 1 1 62.0000 \n", + "2189 WDHIG1986381 61.0667 19.7000 530668800 cs137 1 1 126.0000 \n", + "2493 WDHIG1987257 54.8333 9.8983 555033600 cs134 1 1 13.9000 \n", + "2494 WDHIG1987257 54.8333 9.8983 555033600 cs137 1 1 63.6000 \n", + "2503 WDHIG1987270 54.7500 10.8833 555033600 cs134 1 1 16.8000 \n", + "2504 WDHIG1987270 54.7500 10.8833 555033600 cs137 1 1 67.9000 \n", + "2505 WDHIG1987271 55.4167 10.9517 555033600 am241 0 1 0.0010 \n", + "2506 WDHIG1987271 55.4167 10.9517 555033600 cm242 1 1 0.0105 \n", + "\n", + " _unc smp_depth tot_depth _sal _temp \n", + "797 4.59000 16.0 19.0 7.62 NaN \n", + "798 14.17000 16.0 19.0 7.62 NaN \n", + "799 3.63800 16.0 19.0 7.62 NaN \n", + "803 4.32000 16.0 19.0 7.62 NaN \n", + "804 11.70000 16.0 19.0 7.62 NaN \n", + "805 3.51900 16.0 19.0 7.62 NaN \n", + "2088 41.83000 0.0 25.0 3.67 4.2 \n", + "2089 50.76000 0.0 25.0 3.67 4.2 \n", + "2129 8.25000 0.0 21.0 4.86 6.2 \n", + "2130 7.55000 0.0 21.0 4.86 6.2 \n", + "2172 150.80000 0.0 137.0 5.96 5.5 \n", + "2173 124.75000 0.0 137.0 5.96 5.5 
\n", + "2188 50.22000 130.0 137.0 6.05 3.4 \n", + "2189 49.14000 130.0 137.0 6.05 3.4 \n", + "2493 2.08500 25.0 27.0 23.09 7.9 \n", + "2494 22.26000 25.0 27.0 23.09 7.9 \n", + "2503 15.62400 13.0 15.0 21.34 13.0 \n", + "2504 14.25900 13.0 15.0 21.34 13.0 \n", + "2505 NaN 0.0 26.0 19.30 14.2 \n", + "2506 0.00441 0.0 26.0 19.30 14.2 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result[result[cols_idx].duplicated(keep=False)][:20]" ] }, {