Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/franckalbinet/marisco into …
Browse files Browse the repository at this point in the history
…compound_idx
  • Loading branch information
niallmurphy93 committed Oct 18, 2024
2 parents 9463b34 + b1d4bf4 commit 2badae3
Show file tree
Hide file tree
Showing 20 changed files with 32,373 additions and 3,652 deletions.
41 changes: 41 additions & 0 deletions .cursorrules
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{
"rules": [
{
"name": "Python Expert Guidelines",
"pattern": ".*",
"message": "You are an expert in Python, Marisco, and nbdev conventions. Ensure your code follows these guidelines:\n\n- Write concise, technical responses with accurate Python examples.\n- Use descriptive variable names that clearly indicate their purpose.\n- Prioritize readability and maintainability; follow PEP 8 style guidelines.\n- Utilize comments and docstrings for clarity, ensuring they follow nbdev conventions.\n- Structure code logically with a clear separation of concerns (modules, classes, functions).\n- Implement effective error handling and input validation.\n- Use type hints for all function signatures to enhance code clarity and usability.\n- Leverage fastcore and other utility libraries to enhance functionality and efficiency.",
"severity": "info"
},
{
"name": "IMFA Pattern Reminder",
"pattern": "(?i)remap|nomenclature",
"message": "Remember the IMFA Pattern for remapping nomenclatures:\n1. Inspect data provider nomenclature.\n2. Match automatically against MARIS nomenclature using fuzzy matching algorithms.\n3. Fix potential mismatches.\n4. Apply the lookup table to the dataframe.\nUse the `Remapper` object to facilitate this process.",
"severity": "info"
},
{
"name": "Error Handling and Validation",
"pattern": "(?i)error|exception|validate",
"message": "Ensure robust error handling and input validation:\n- Implement graceful error handling for unexpected conditions.\n- Use informative exceptions and log error messages for easier debugging.\n- Validate input data rigorously before processing to avoid runtime errors.",
"severity": "warning"
},
{
"name": "Marisco Guidelines",
"pattern": "marisco",
"message": "Follow Marisco's architectural patterns:\n- Use structured data formats (like NetCDF and CSV) and ensure data integrity.\n- Implement callbacks for modular data transformations and serializers for consistent data output.\n- Utilize configuration files for managing constants and settings.\n- Use appropriate functions from Marisco utility and config modules.\n- Maintain a consistent approach to data manipulation and ensure reproducibility.",
"severity": "info"
},
{
"name": "Performance Optimization",
"pattern": "(?i)performance|optimize",
"message": "Consider performance optimization:\n- Use vectorized operations and efficient algorithms for data processing.\n- Implement caching and other optimization techniques for large datasets.\n- Ensure functions are modular and reusable to promote code efficiency.",
"severity": "info"
}
],
"formatters": {
"python": {
"quote_type": "single",
"indent_size": 4,
"max_line_length": 100
}
}
}
4 changes: 4 additions & 0 deletions install_configure_guide/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,10 @@ An integer value (the 'bodypar_id' defined in the LUT).
### Description:
Top of sediment core interval relative to the water-sediment interface (cm).

**_NOTE:_**
For sediment samples, if the top slice depth is missing (i.e., sliceup = -1), there should not be multiple grab samples for the same latitude, longitude, and time. Multiple samples at the same location and time likely indicate a core was taken, but slice top and bottom information is missing. In such cases, these records should be flagged for review and excluded from analysis until complete depth information is provided.


### Lookup Table (LUT) in use:
No

Expand Down
3 changes: 2 additions & 1 deletion marisco/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,8 @@
'marisco.handlers.ospar.load_data': ('handlers/ospar.html#load_data', 'marisco/handlers/ospar.py'),
'marisco.handlers.ospar.unc_exp2stan': ( 'handlers/ospar.html#unc_exp2stan',
'marisco/handlers/ospar.py')},
'marisco.inout': { 'marisco.inout.read_toml': ('api/inout.html#read_toml', 'marisco/inout.py'),
'marisco.inout': { 'marisco.inout.flatten_dict': ('api/inout.html#flatten_dict', 'marisco/inout.py'),
'marisco.inout.read_toml': ('api/inout.html#read_toml', 'marisco/inout.py'),
'marisco.inout.write_toml': ('api/inout.html#write_toml', 'marisco/inout.py')},
'marisco.metadata': { 'marisco.metadata.BboxCB': ('api/metadata.html#bboxcb', 'marisco/metadata.py'),
'marisco.metadata.BboxCB.__call__': ('api/metadata.html#bboxcb.__call__', 'marisco/metadata.py'),
Expand Down
5 changes: 3 additions & 2 deletions marisco/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ def __init__(self,
col_remap: str, # Name of the column to remap
col_src: str, # Name of the column with the source values
dest_grps: list[str]|str=grp_names(), # List of destination groups
default_value: Any = -1 # Default value for unmatched entries
default_value: Any = -1, # Default value for unmatched entries
verbose: bool = False, # Whether to print unmatched values
):
fc.store_attr()
self.lut = None
Expand All @@ -157,7 +158,7 @@ def _remap_value(self, value: str) -> Any:
value = value.strip() if isinstance(value, str) else value
match = self.lut.get(value, Match(self.default_value, None, None, None))
if isinstance(match, Match):
if match.matched_id == self.default_value:
if match.matched_id == self.default_value and self.verbose:
print(f"Unmatched value: {value}")
return match.matched_id
else:
Expand Down
18 changes: 17 additions & 1 deletion marisco/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,23 @@ def cache_path():
'standard_name': 'sediment_type_tbd'
},
'dtype': 'sed_type_t'
}
},
'top': {
'name': 'top',
'attrs': {
'long_name': 'Top depth of sediment layer',
'standard_name': 'top_depth_of_sediment_layer_tbd'
},
'dtype': 'f4'
},
'bottom': {
'name': 'bottom',
'attrs': {
'long_name': 'Bottom depth of sediment layer',
'standard_name': 'bottom_depth_of_sediment_layer_tbd'
},
'dtype': 'f4'
},
},
'suffixes': {
'uncertainty': {
Expand Down
21 changes: 12 additions & 9 deletions marisco/handlers/helcom.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ def _apply_dry_wet_ratio(self, df: pd.DataFrame) -> None:
df.loc[df['dry_wet_ratio'] == 0, 'dry_wet_ratio'] = np.NaN


# %% ../../nbs/handlers/helcom.ipynb 172
# %% ../../nbs/handlers/helcom.ipynb 173
class ParseCoordinates(Callback):
"""
Get geographical coordinates from columns expressed in degrees decimal format
Expand Down Expand Up @@ -575,13 +575,14 @@ def _safe_convert(self, value) -> str:
print(f"Error converting value {value}: {e}")
return value

# %% ../../nbs/handlers/helcom.ipynb 183
# %% ../../nbs/handlers/helcom.ipynb 184
def get_common_rules(
vars: dict, # Configuration dictionary
encoding_type: str # Encoding type (`netcdf` or `openrefine`)
) -> dict: # Common renaming rules for NetCDF and OpenRefine.
"Get common renaming rules for NetCDF and OpenRefine."
common = {
'KEY': 'key',
'lat': 'latitude' if encoding_type == 'openrefine' else vars['defaults']['lat']['name'],
'lon': 'longitude' if encoding_type == 'openrefine' else vars['defaults']['lon']['name'],
'time': 'begperiod' if encoding_type == 'openrefine' else vars['defaults']['time']['name'],
Expand Down Expand Up @@ -615,7 +616,7 @@ def get_common_rules(

return common

# %% ../../nbs/handlers/helcom.ipynb 184
# %% ../../nbs/handlers/helcom.ipynb 185
def get_specific_rules(
vars: dict, # Configuration dictionary
encoding_type: str # Encoding type (`netcdf` or `openrefine`)
Expand All @@ -630,6 +631,8 @@ def get_specific_rules(
},
'sediment': {
'sed_type': vars['sed']['sed_type']['name'],
'top': vars['sed']['top']['name'],
'bottom': vars['sed']['bottom']['name'],
}
}
elif encoding_type == 'openrefine':
Expand All @@ -654,7 +657,7 @@ def get_specific_rules(
}
}

# %% ../../nbs/handlers/helcom.ipynb 185
# %% ../../nbs/handlers/helcom.ipynb 186
def get_renaming_rules(
encoding_type: str = 'netcdf' # Encoding type (`netcdf` or `openrefine`)
) -> dict: # Renaming rules for NetCDF and OpenRefine.
Expand All @@ -674,7 +677,7 @@ def get_renaming_rules(

return dict(rules)

# %% ../../nbs/handlers/helcom.ipynb 186
# %% ../../nbs/handlers/helcom.ipynb 187
class SelectAndRenameColumnCB(Callback):
"Select and rename columns in a DataFrame based on renaming rules for a specified encoding type."
def __init__(self,
Expand Down Expand Up @@ -745,7 +748,7 @@ def _apply_renaming(self,
return df, not_found_keys


# %% ../../nbs/handlers/helcom.ipynb 195
# %% ../../nbs/handlers/helcom.ipynb 197
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
Expand All @@ -757,7 +760,7 @@ def _apply_renaming(self,
'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

# %% ../../nbs/handlers/helcom.ipynb 196
# %% ../../nbs/handlers/helcom.ipynb 198
def get_attrs(
tfm: Transformer, # Transformer object
zotero_key: str, # Zotero dataset record key
Expand All @@ -773,7 +776,7 @@ def get_attrs(
KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))
])()

# %% ../../nbs/handlers/helcom.ipynb 198
# %% ../../nbs/handlers/helcom.ipynb 200
def enums_xtra(
tfm: Transformer, # Transformer object
vars: list # List of variables to extract from the transformer
Expand All @@ -787,7 +790,7 @@ def enums_xtra(
xtras[f'{var}_t'] = enums.filter(f'{var}_t', unique_vals)
return xtras

# %% ../../nbs/handlers/helcom.ipynb 200
# %% ../../nbs/handlers/helcom.ipynb 202
def encode(
fname_in: str, # Input file name
fname_out_nc: str, # Output file name
Expand Down
22 changes: 20 additions & 2 deletions marisco/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,38 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/api/inout.ipynb.

# %% auto 0
__all__ = ['write_toml', 'read_toml']
__all__ = ['write_toml', 'flatten_dict', 'read_toml']

# %% ../nbs/api/inout.ipynb 2
import tomli_w
import tomli
from typing import Dict, Any


# %% ../nbs/api/inout.ipynb 3
def write_toml(fname, cfg):
def write_toml(fname: str, cfg: Dict[str, Any]):
    "Serialize the `cfg` dictionary to a TOML file at `fname`."
    # TOML has no null type, so surface any None-valued keys to the user
    # before handing the dict to the serializer.
    missing = [key for key, val in flatten_dict(cfg).items() if val is None]
    if missing:
        print(f"Warning: The following config keys have None values: {', '.join(missing)}")

    print(f'Creating {fname}')
    # tomli_w requires a binary-mode file handle.
    with open(fname, "wb") as fh:
        tomli_w.dump(cfg, fh)

# %% ../nbs/api/inout.ipynb 4
def flatten_dict(d: Dict[str, Any], parent_key: str = '', sep: str = '.') -> Dict[str, Any]:
    """Flatten a nested dictionary into a single level, joining keys with `sep`."""
    flat: Dict[str, Any] = {}
    for key, value in d.items():
        # Prefix with the accumulated parent path, e.g. 'a' + 'b' -> 'a.b'.
        compound_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            # Recurse into nested mappings; later duplicates overwrite earlier
            # ones, matching plain dict construction semantics.
            flat.update(flatten_dict(value, compound_key, sep=sep))
        else:
            flat[compound_key] = value
    return flat

# %% ../nbs/api/inout.ipynb 5
def read_toml(fname):
"Read a TOML file into a dictionary."
with open(fname, "rb") as f:
Expand Down
23 changes: 11 additions & 12 deletions marisco/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,7 @@ def _format_output(self):
df_lut.index.name = 'source_key'
return df_lut.sort_values(by='match_score', ascending=False)


# %% ../nbs/api/utils.ipynb 16
# %% ../nbs/api/utils.ipynb 17
def has_valid_varname(
var_names:list, # variable names
cdl_path:str, # Path to MARIS CDL file (point of truth)
Expand Down Expand Up @@ -140,7 +139,7 @@ def has_valid_varname(
print(f'"{name}" variable name not found in MARIS CDL')
return has_valid

# %% ../nbs/api/utils.ipynb 20
# %% ../nbs/api/utils.ipynb 21
def get_bbox(df,
coord_cols=('lon', 'lat')
):
Expand All @@ -149,7 +148,7 @@ def get_bbox(df,
arr = [(row[x], row[y]) for _, row in df.iterrows()]
return MultiPoint(arr).envelope

# %% ../nbs/api/utils.ipynb 26
# %% ../nbs/api/utils.ipynb 27
def ddmm_to_dd(
ddmmmm:float # Coordinates in degrees/minutes decimal format
) -> float: # Coordinates in degrees decimal format
Expand All @@ -158,7 +157,7 @@ def ddmm_to_dd(
mins = mins * 100
return round(int(degs) + (mins / 60), 6)

# %% ../nbs/api/utils.ipynb 29
# %% ../nbs/api/utils.ipynb 30
def download_files_in_folder(owner:str,
repo:str,
src_dir:str,
Expand Down Expand Up @@ -192,7 +191,7 @@ def download_file(owner, repo, src_dir, dest_dir, fname):
else:
print(f"Error: {response.status_code}")

# %% ../nbs/api/utils.ipynb 31
# %% ../nbs/api/utils.ipynb 32
def match_worms(
name:str # Name of species to look up in WoRMS
):
Expand All @@ -215,7 +214,7 @@ def match_worms(
else:
return -1

# %% ../nbs/api/utils.ipynb 36
# %% ../nbs/api/utils.ipynb 37
@dataclass
class Match:
"Match between a data provider name and a MARIS lookup table."
Expand All @@ -224,7 +223,7 @@ class Match:
source_name: str
match_score: int

# %% ../nbs/api/utils.ipynb 37
# %% ../nbs/api/utils.ipynb 38
def match_maris_lut(
lut_path: str, # Path to MARIS species authoritative species look-up table
data_provider_name: str, # Name of data provider nomenclature item to look up
Expand All @@ -241,15 +240,15 @@ def match_maris_lut(
df = df.sort_values(by='score', ascending=True)[:nresults]
return df[[maris_id, maris_name, 'score']]

# %% ../nbs/api/utils.ipynb 44
# %% ../nbs/api/utils.ipynb 45
def get_bbox(df,
    coord_cols=('lon', 'lat')
):
    "Return the bounding-box envelope of the points in `df` given by `coord_cols`."
    lon_col, lat_col = coord_cols
    # Collect (x, y) pairs row by row; iterrows is slow on large frames but
    # preserves the original behavior exactly.
    points = [(row[lon_col], row[lat_col]) for _, row in df.iterrows()]
    # NOTE(review): `MultiPoint` comes from shapely elsewhere in this module;
    # `.envelope` yields the axis-aligned bounding geometry.
    return MultiPoint(points).envelope

# %% ../nbs/api/utils.ipynb 51
# %% ../nbs/api/utils.ipynb 52
def download_files_in_folder(owner:str,
repo:str,
src_dir:str,
Expand Down Expand Up @@ -283,7 +282,7 @@ def download_file(owner, repo, src_dir, dest_dir, fname):
else:
print(f"Error: {response.status_code}")

# %% ../nbs/api/utils.ipynb 53
# %% ../nbs/api/utils.ipynb 54
def match_worms(
name:str # Name of species to look up in WoRMS
):
Expand All @@ -306,7 +305,7 @@ def match_worms(
else:
return -1

# %% ../nbs/api/utils.ipynb 58
# %% ../nbs/api/utils.ipynb 59
def test_dfs(
dfs1:dict, # First dictionary of DataFrames to compare
dfs2:dict # Second dictionary of DataFrames to compare
Expand Down
19 changes: 16 additions & 3 deletions nbs/api/callbacks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,19 @@
"execution_count": null,
"id": "8c905654",
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'Callback' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#| exports\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mRemapCB\u001b[39;00m(\u001b[43mCallback\u001b[49m):\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeneric MARIS remapping callback.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \n\u001b[1;32m 5\u001b[0m fn_lut: Callable, \u001b[38;5;66;03m# Function that returns the lookup table dictionary\u001b[39;00m\n\u001b[1;32m 6\u001b[0m col_remap: \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;66;03m# Name of the column to remap\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 10\u001b[0m verbose: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;66;03m# Whether to print unmatched values\u001b[39;00m\n\u001b[1;32m 11\u001b[0m ):\n",
"\u001b[0;31mNameError\u001b[0m: name 'Callback' is not defined"
]
}
],
"source": [
"#| exports\n",
"class RemapCB(Callback):\n",
Expand All @@ -455,7 +467,8 @@
" col_remap: str, # Name of the column to remap\n",
" col_src: str, # Name of the column with the source values\n",
" dest_grps: list[str]|str=grp_names(), # List of destination groups\n",
" default_value: Any = -1 # Default value for unmatched entries\n",
" default_value: Any = -1, # Default value for unmatched entries\n",
" verbose: bool = False, # Whether to print unmatched values\n",
" ):\n",
" fc.store_attr()\n",
" self.lut = None\n",
Expand All @@ -475,7 +488,7 @@
" value = value.strip() if isinstance(value, str) else value\n",
" match = self.lut.get(value, Match(self.default_value, None, None, None))\n",
" if isinstance(match, Match):\n",
" if match.matched_id == self.default_value:\n",
" if match.matched_id == self.default_value and self.verbose:\n",
" print(f\"Unmatched value: {value}\")\n",
" return match.matched_id \n",
" else:\n",
Expand Down
Loading

0 comments on commit 2badae3

Please sign in to comment.