Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/franckalbinet/marisco into …
Browse files Browse the repository at this point in the history
…compound_idx
  • Loading branch information
niallmurphy93 committed Oct 18, 2024
2 parents 9463b34 + b1d4bf4 commit 2badae3
Show file tree
Hide file tree
Showing 20 changed files with 32,373 additions and 3,652 deletions.
41 changes: 41 additions & 0 deletions .cursorrules
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{
"rules": [
{
"name": "Python Expert Guidelines",
"pattern": ".*",
"message": "You are an expert in Python, Marisco, and nbdev conventions. Ensure your code follows these guidelines:\n\n- Write concise, technical responses with accurate Python examples.\n- Use descriptive variable names that clearly indicate their purpose.\n- Prioritize readability and maintainability; follow PEP 8 style guidelines.\n- Utilize comments and docstrings for clarity, ensuring they follow nbdev conventions.\n- Structure code logically with a clear separation of concerns (modules, classes, functions).\n- Implement effective error handling and input validation.\n- Use type hints for all function signatures to enhance code clarity and usability.\n- Leverage fastcore and other utility libraries to enhance functionality and efficiency.",
"severity": "info"
},
{
"name": "IMFA Pattern Reminder",
"pattern": "(?i)remap|nomenclature",
"message": "Remember the IMFA Pattern for remapping nomenclatures:\n1. Inspect data provider nomenclature.\n2. Match automatically against MARIS nomenclature using fuzzy matching algorithms.\n3. Fix potential mismatches.\n4. Apply the lookup table to the dataframe.\nUse the `Remapper` object to facilitate this process.",
"severity": "info"
},
{
"name": "Error Handling and Validation",
"pattern": "(?i)error|exception|validate",
"message": "Ensure robust error handling and input validation:\n- Implement graceful error handling for unexpected conditions.\n- Use informative exceptions and log error messages for easier debugging.\n- Validate input data rigorously before processing to avoid runtime errors.",
"severity": "warning"
},
{
"name": "Marisco Guidelines",
"pattern": "marisco",
"message": "Follow Marisco's architectural patterns:\n- Use structured data formats (like NetCDF and CSV) and ensure data integrity.\n- Implement callbacks for modular data transformations and serializers for consistent data output.\n- Utilize configuration files for managing constants and settings.\n- Use appropriate functions from Marisco utility and config modules.\n- Maintain a consistent approach to data manipulation and ensure reproducibility.",
"severity": "info"
},
{
"name": "Performance Optimization",
"pattern": "(?i)performance|optimize",
"message": "Consider performance optimization:\n- Use vectorized operations and efficient algorithms for data processing.\n- Implement caching and other optimization techniques for large datasets.\n- Ensure functions are modular and reusable to promote code efficiency.",
"severity": "info"
}
],
"formatters": {
"python": {
"quote_type": "single",
"indent_size": 4,
"max_line_length": 100
}
}
}
4 changes: 4 additions & 0 deletions install_configure_guide/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,10 @@ An integer value (the 'bodypar_id' defined in the LUT).
### Description:
Top of sediment core interval relative to the water-sediment interface (cm).

**_NOTE:_**
For sediment samples, if the top slice depth is missing (i.e., sliceup = -1), there should not be multiple grab samples for the same latitude, longitude, and time. Multiple samples at the same location and time likely indicate a core was taken, but slice top and bottom information is missing. In such cases, these records should be flagged for review and excluded from analysis until complete depth information is provided.


### Lookup Table (LUT) in use:
No

Expand Down
3 changes: 2 additions & 1 deletion marisco/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,8 @@
'marisco.handlers.ospar.load_data': ('handlers/ospar.html#load_data', 'marisco/handlers/ospar.py'),
'marisco.handlers.ospar.unc_exp2stan': ( 'handlers/ospar.html#unc_exp2stan',
'marisco/handlers/ospar.py')},
'marisco.inout': { 'marisco.inout.read_toml': ('api/inout.html#read_toml', 'marisco/inout.py'),
'marisco.inout': { 'marisco.inout.flatten_dict': ('api/inout.html#flatten_dict', 'marisco/inout.py'),
'marisco.inout.read_toml': ('api/inout.html#read_toml', 'marisco/inout.py'),
'marisco.inout.write_toml': ('api/inout.html#write_toml', 'marisco/inout.py')},
'marisco.metadata': { 'marisco.metadata.BboxCB': ('api/metadata.html#bboxcb', 'marisco/metadata.py'),
'marisco.metadata.BboxCB.__call__': ('api/metadata.html#bboxcb.__call__', 'marisco/metadata.py'),
Expand Down
5 changes: 3 additions & 2 deletions marisco/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ def __init__(self,
col_remap: str, # Name of the column to remap
col_src: str, # Name of the column with the source values
dest_grps: list[str]|str=grp_names(), # List of destination groups
default_value: Any = -1 # Default value for unmatched entries
default_value: Any = -1, # Default value for unmatched entries
verbose: bool = False, # Whether to print unmatched values
):
fc.store_attr()
self.lut = None
Expand All @@ -157,7 +158,7 @@ def _remap_value(self, value: str) -> Any:
value = value.strip() if isinstance(value, str) else value
match = self.lut.get(value, Match(self.default_value, None, None, None))
if isinstance(match, Match):
if match.matched_id == self.default_value:
if match.matched_id == self.default_value and self.verbose:
print(f"Unmatched value: {value}")
return match.matched_id
else:
Expand Down
18 changes: 17 additions & 1 deletion marisco/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,23 @@ def cache_path():
'standard_name': 'sediment_type_tbd'
},
'dtype': 'sed_type_t'
}
},
'top': {
'name': 'top',
'attrs': {
'long_name': 'Top depth of sediment layer',
'standard_name': 'top_depth_of_sediment_layer_tbd'
},
'dtype': 'f4'
},
'bottom': {
'name': 'bottom',
'attrs': {
'long_name': 'Bottom depth of sediment layer',
'standard_name': 'bottom_depth_of_sediment_layer_tbd'
},
'dtype': 'f4'
},
},
'suffixes': {
'uncertainty': {
Expand Down
21 changes: 12 additions & 9 deletions marisco/handlers/helcom.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ def _apply_dry_wet_ratio(self, df: pd.DataFrame) -> None:
df.loc[df['dry_wet_ratio'] == 0, 'dry_wet_ratio'] = np.NaN


# %% ../../nbs/handlers/helcom.ipynb 172
# %% ../../nbs/handlers/helcom.ipynb 173
class ParseCoordinates(Callback):
"""
Get geographical coordinates from columns expressed in degrees decimal format
Expand Down Expand Up @@ -575,13 +575,14 @@ def _safe_convert(self, value) -> str:
print(f"Error converting value {value}: {e}")
return value

# %% ../../nbs/handlers/helcom.ipynb 183
# %% ../../nbs/handlers/helcom.ipynb 184
def get_common_rules(
vars: dict, # Configuration dictionary
encoding_type: str # Encoding type (`netcdf` or `openrefine`)
) -> dict: # Common renaming rules for NetCDF and OpenRefine.
"Get common renaming rules for NetCDF and OpenRefine."
common = {
'KEY': 'key',
'lat': 'latitude' if encoding_type == 'openrefine' else vars['defaults']['lat']['name'],
'lon': 'longitude' if encoding_type == 'openrefine' else vars['defaults']['lon']['name'],
'time': 'begperiod' if encoding_type == 'openrefine' else vars['defaults']['time']['name'],
Expand Down Expand Up @@ -615,7 +616,7 @@ def get_common_rules(

return common

# %% ../../nbs/handlers/helcom.ipynb 184
# %% ../../nbs/handlers/helcom.ipynb 185
def get_specific_rules(
vars: dict, # Configuration dictionary
encoding_type: str # Encoding type (`netcdf` or `openrefine`)
Expand All @@ -630,6 +631,8 @@ def get_specific_rules(
},
'sediment': {
'sed_type': vars['sed']['sed_type']['name'],
'top': vars['sed']['top']['name'],
'bottom': vars['sed']['bottom']['name'],
}
}
elif encoding_type == 'openrefine':
Expand All @@ -654,7 +657,7 @@ def get_specific_rules(
}
}

# %% ../../nbs/handlers/helcom.ipynb 185
# %% ../../nbs/handlers/helcom.ipynb 186
def get_renaming_rules(
encoding_type: str = 'netcdf' # Encoding type (`netcdf` or `openrefine`)
) -> dict: # Renaming rules for NetCDF and OpenRefine.
Expand All @@ -674,7 +677,7 @@ def get_renaming_rules(

return dict(rules)

# %% ../../nbs/handlers/helcom.ipynb 186
# %% ../../nbs/handlers/helcom.ipynb 187
class SelectAndRenameColumnCB(Callback):
"Select and rename columns in a DataFrame based on renaming rules for a specified encoding type."
def __init__(self,
Expand Down Expand Up @@ -745,7 +748,7 @@ def _apply_renaming(self,
return df, not_found_keys


# %% ../../nbs/handlers/helcom.ipynb 195
# %% ../../nbs/handlers/helcom.ipynb 197
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
Expand All @@ -757,7 +760,7 @@ def _apply_renaming(self,
'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

# %% ../../nbs/handlers/helcom.ipynb 196
# %% ../../nbs/handlers/helcom.ipynb 198
def get_attrs(
tfm: Transformer, # Transformer object
zotero_key: str, # Zotero dataset record key
Expand All @@ -773,7 +776,7 @@ def get_attrs(
KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))
])()

# %% ../../nbs/handlers/helcom.ipynb 198
# %% ../../nbs/handlers/helcom.ipynb 200
def enums_xtra(
tfm: Transformer, # Transformer object
vars: list # List of variables to extract from the transformer
Expand All @@ -787,7 +790,7 @@ def enums_xtra(
xtras[f'{var}_t'] = enums.filter(f'{var}_t', unique_vals)
return xtras

# %% ../../nbs/handlers/helcom.ipynb 200
# %% ../../nbs/handlers/helcom.ipynb 202
def encode(
fname_in: str, # Input file name
fname_out_nc: str, # Output file name
Expand Down
22 changes: 20 additions & 2 deletions marisco/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,38 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/api/inout.ipynb.

# %% auto 0
__all__ = ['write_toml', 'read_toml']
__all__ = ['write_toml', 'flatten_dict', 'read_toml']

# %% ../nbs/api/inout.ipynb 2
import tomli_w
import tomli
from typing import Dict, Any


# %% ../nbs/api/inout.ipynb 3
def write_toml(fname, cfg):
def write_toml(fname: str, cfg: Dict[str, Any]):
    "Serialize the `cfg` dictionary to a TOML file at `fname`."
    # TOML has no null type, so surface any None-valued keys to the user
    # before handing the dict to the serializer.
    missing = [key for key, val in flatten_dict(cfg).items() if val is None]
    if missing:
        print(f"Warning: The following config keys have None values: {', '.join(missing)}")

    print(f'Creating {fname}')
    # tomli_w requires a binary-mode file handle.
    with open(fname, "wb") as fh:
        tomli_w.dump(cfg, fh)

# %% ../nbs/api/inout.ipynb 4
def flatten_dict(d: Dict[str, Any], parent_key: str = '', sep: str = '.') -> Dict[str, Any]:
    """Flatten a nested dictionary into a single level, joining keys with `sep`."""
    flat: Dict[str, Any] = {}
    for key, value in d.items():
        # Prefix with the accumulated parent path, e.g. 'a' + 'b' -> 'a.b'.
        compound_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            # Recurse into nested mappings; later duplicates overwrite earlier
            # ones, matching plain dict construction semantics.
            flat.update(flatten_dict(value, compound_key, sep=sep))
        else:
            flat[compound_key] = value
    return flat

# %% ../nbs/api/inout.ipynb 5
def read_toml(fname):
"Read a TOML file into a dictionary."
with open(fname, "rb") as f:
Expand Down
23 changes: 11 additions & 12 deletions marisco/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,7 @@ def _format_output(self):
df_lut.index.name = 'source_key'
return df_lut.sort_values(by='match_score', ascending=False)


# %% ../nbs/api/utils.ipynb 16
# %% ../nbs/api/utils.ipynb 17
def has_valid_varname(
var_names:list, # variable names
cdl_path:str, # Path to MARIS CDL file (point of truth)
Expand Down Expand Up @@ -140,7 +139,7 @@ def has_valid_varname(
print(f'"{name}" variable name not found in MARIS CDL')
return has_valid

# %% ../nbs/api/utils.ipynb 20
# %% ../nbs/api/utils.ipynb 21
def get_bbox(df,
coord_cols=('lon', 'lat')
):
Expand All @@ -149,7 +148,7 @@ def get_bbox(df,
arr = [(row[x], row[y]) for _, row in df.iterrows()]
return MultiPoint(arr).envelope

# %% ../nbs/api/utils.ipynb 26
# %% ../nbs/api/utils.ipynb 27
def ddmm_to_dd(
ddmmmm:float # Coordinates in degrees/minutes decimal format
) -> float: # Coordinates in degrees decimal format
Expand All @@ -158,7 +157,7 @@ def ddmm_to_dd(
mins = mins * 100
return round(int(degs) + (mins / 60), 6)

# %% ../nbs/api/utils.ipynb 29
# %% ../nbs/api/utils.ipynb 30
def download_files_in_folder(owner:str,
repo:str,
src_dir:str,
Expand Down Expand Up @@ -192,7 +191,7 @@ def download_file(owner, repo, src_dir, dest_dir, fname):
else:
print(f"Error: {response.status_code}")

# %% ../nbs/api/utils.ipynb 31
# %% ../nbs/api/utils.ipynb 32
def match_worms(
name:str # Name of species to look up in WoRMS
):
Expand All @@ -215,7 +214,7 @@ def match_worms(
else:
return -1

# %% ../nbs/api/utils.ipynb 36
# %% ../nbs/api/utils.ipynb 37
@dataclass
class Match:
"Match between a data provider name and a MARIS lookup table."
Expand All @@ -224,7 +223,7 @@ class Match:
source_name: str
match_score: int

# %% ../nbs/api/utils.ipynb 37
# %% ../nbs/api/utils.ipynb 38
def match_maris_lut(
lut_path: str, # Path to MARIS species authoritative species look-up table
data_provider_name: str, # Name of data provider nomenclature item to look up
Expand All @@ -241,15 +240,15 @@ def match_maris_lut(
df = df.sort_values(by='score', ascending=True)[:nresults]
return df[[maris_id, maris_name, 'score']]

# %% ../nbs/api/utils.ipynb 44
# %% ../nbs/api/utils.ipynb 45
def get_bbox(df,
    coord_cols=('lon', 'lat')
):
    "Return the bounding-box envelope of the points in `df` given by `coord_cols`."
    lon_col, lat_col = coord_cols
    # Collect (x, y) pairs row by row; iterrows is slow on large frames but
    # preserves the original behavior exactly.
    points = [(row[lon_col], row[lat_col]) for _, row in df.iterrows()]
    # NOTE(review): `MultiPoint` comes from shapely elsewhere in this module;
    # `.envelope` yields the axis-aligned bounding geometry.
    return MultiPoint(points).envelope

# %% ../nbs/api/utils.ipynb 51
# %% ../nbs/api/utils.ipynb 52
def download_files_in_folder(owner:str,
repo:str,
src_dir:str,
Expand Down Expand Up @@ -283,7 +282,7 @@ def download_file(owner, repo, src_dir, dest_dir, fname):
else:
print(f"Error: {response.status_code}")

# %% ../nbs/api/utils.ipynb 53
# %% ../nbs/api/utils.ipynb 54
def match_worms(
name:str # Name of species to look up in WoRMS
):
Expand All @@ -306,7 +305,7 @@ def match_worms(
else:
return -1

# %% ../nbs/api/utils.ipynb 58
# %% ../nbs/api/utils.ipynb 59
def test_dfs(
dfs1:dict, # First dictionary of DataFrames to compare
dfs2:dict # Second dictionary of DataFrames to compare
Expand Down
19 changes: 16 additions & 3 deletions nbs/api/callbacks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,19 @@
"execution_count": null,
"id": "8c905654",
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'Callback' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#| exports\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mRemapCB\u001b[39;00m(\u001b[43mCallback\u001b[49m):\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeneric MARIS remapping callback.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \n\u001b[1;32m 5\u001b[0m fn_lut: Callable, \u001b[38;5;66;03m# Function that returns the lookup table dictionary\u001b[39;00m\n\u001b[1;32m 6\u001b[0m col_remap: \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;66;03m# Name of the column to remap\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 10\u001b[0m verbose: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;66;03m# Whether to print unmatched values\u001b[39;00m\n\u001b[1;32m 11\u001b[0m ):\n",
"\u001b[0;31mNameError\u001b[0m: name 'Callback' is not defined"
]
}
],
"source": [
"#| exports\n",
"class RemapCB(Callback):\n",
Expand All @@ -455,7 +467,8 @@
" col_remap: str, # Name of the column to remap\n",
" col_src: str, # Name of the column with the source values\n",
" dest_grps: list[str]|str=grp_names(), # List of destination groups\n",
" default_value: Any = -1 # Default value for unmatched entries\n",
" default_value: Any = -1, # Default value for unmatched entries\n",
" verbose: bool = False, # Whether to print unmatched values\n",
" ):\n",
" fc.store_attr()\n",
" self.lut = None\n",
Expand All @@ -475,7 +488,7 @@
" value = value.strip() if isinstance(value, str) else value\n",
" match = self.lut.get(value, Match(self.default_value, None, None, None))\n",
" if isinstance(match, Match):\n",
" if match.matched_id == self.default_value:\n",
" if match.matched_id == self.default_value and self.verbose:\n",
" print(f\"Unmatched value: {value}\")\n",
" return match.matched_id \n",
" else:\n",
Expand Down
Loading

0 comments on commit 2badae3

Please sign in to comment.