Skip to content

Commit

Permalink
Refactoring OSPAR handler: Implement generic RemapCB for data remapping
Browse files (browse the repository at this point in the history)
- Convert existing callbacks to use the generic RemapCB
- Enhance code reusability and maintainability
- Standardize remapping process across different data types
- Improve consistency in handling OSPAR data transformations
  • Loading branch information
niallmurphy93 committed Oct 15, 2024
1 parent 2846ace commit b3ebef6
Show file tree
Hide file tree
Showing 5 changed files with 3,679 additions and 1,389 deletions.
5 changes: 3 additions & 2 deletions marisco/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ def __init__(self,
col_remap: str, # Name of the column to remap
col_src: str, # Name of the column with the source values
dest_grps: list[str]|str=grp_names(), # List of destination groups
default_value: Any = -1 # Default value for unmatched entries
default_value: Any = -1, # Default value for unmatched entries
verbose: bool = False, # Whether to print unmatched values
):
fc.store_attr()
self.lut = None
Expand All @@ -157,7 +158,7 @@ def _remap_value(self, value: str) -> Any:
value = value.strip() if isinstance(value, str) else value
match = self.lut.get(value, Match(self.default_value, None, None, None))
if isinstance(match, Match):
if match.matched_id == self.default_value:
if match.matched_id == self.default_value and self.verbose:
print(f"Unmatched value: {value}")
return match.matched_id
else:
Expand Down
1 change: 0 additions & 1 deletion marisco/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ def _format_output(self):
df_lut.index.name = 'source_key'
return df_lut.sort_values(by='match_score', ascending=False)


# %% ../nbs/api/utils.ipynb 16
def has_valid_varname(
var_names:list, # variable names
Expand Down
19 changes: 16 additions & 3 deletions nbs/api/callbacks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,19 @@
"execution_count": null,
"id": "8c905654",
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'Callback' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#| exports\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mRemapCB\u001b[39;00m(\u001b[43mCallback\u001b[49m):\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeneric MARIS remapping callback.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \n\u001b[1;32m 5\u001b[0m fn_lut: Callable, \u001b[38;5;66;03m# Function that returns the lookup table dictionary\u001b[39;00m\n\u001b[1;32m 6\u001b[0m col_remap: \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;66;03m# Name of the column to remap\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 10\u001b[0m verbose: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;66;03m# Whether to print unmatched values\u001b[39;00m\n\u001b[1;32m 11\u001b[0m ):\n",
"\u001b[0;31mNameError\u001b[0m: name 'Callback' is not defined"
]
}
],
"source": [
"#| exports\n",
"class RemapCB(Callback):\n",
Expand All @@ -455,7 +467,8 @@
" col_remap: str, # Name of the column to remap\n",
" col_src: str, # Name of the column with the source values\n",
" dest_grps: list[str]|str=grp_names(), # List of destination groups\n",
" default_value: Any = -1 # Default value for unmatched entries\n",
" default_value: Any = -1, # Default value for unmatched entries\n",
" verbose: bool = False, # Whether to print unmatched values\n",
" ):\n",
" fc.store_attr()\n",
" self.lut = None\n",
Expand All @@ -475,7 +488,7 @@
" value = value.strip() if isinstance(value, str) else value\n",
" match = self.lut.get(value, Match(self.default_value, None, None, None))\n",
" if isinstance(match, Match):\n",
" if match.matched_id == self.default_value:\n",
" if match.matched_id == self.default_value and self.verbose:\n",
" print(f\"Unmatched value: {value}\")\n",
" return match.matched_id \n",
" else:\n",
Expand Down
52 changes: 35 additions & 17 deletions nbs/api/utils.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -199,30 +199,30 @@
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>cs137</td>\n",
" <td>cs134</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>cs134_137_tot</td>\n",
" <td>13</td>\n",
" <td>cs137</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>cs134</td>\n",
" <td>5</td>\n",
" <td>cs134_137_tot</td>\n",
" <td>13</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" index value n_chars\n",
"0 0 cs137 5\n",
"1 1 cs134_137_tot 13\n",
"2 2 cs134 5"
"0 0 cs134 5\n",
"1 1 cs137 5\n",
"2 2 cs134_137_tot 13"
]
},
"execution_count": null,
Expand All @@ -239,7 +239,20 @@
"execution_count": null,
"id": "cf58241b",
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'pd' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#| exports\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mRemapper\u001b[39;00m():\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRemap a data provider lookup table to a MARIS lookup table using fuzzy matching.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 5\u001b[0m provider_lut_df:pd\u001b[38;5;241m.\u001b[39mDataFrame, \u001b[38;5;66;03m# Data provider lookup table to be remapped\u001b[39;00m\n\u001b[1;32m 6\u001b[0m maris_lut_fn:\u001b[38;5;28mcallable\u001b[39m, \u001b[38;5;66;03m# Function that returns the MARIS lookup table path\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 11\u001b[0m fname_cache \u001b[38;5;66;03m# Cache file name\u001b[39;00m\n\u001b[1;32m 12\u001b[0m ):\n",
"Cell \u001b[0;32mIn[2], line 5\u001b[0m, in \u001b[0;36mRemapper\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mRemapper\u001b[39;00m():\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRemap a data provider lookup table to a MARIS lookup table using fuzzy matching.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m----> 5\u001b[0m provider_lut_df:\u001b[43mpd\u001b[49m\u001b[38;5;241m.\u001b[39mDataFrame, \u001b[38;5;66;03m# Data provider lookup table to be remapped\u001b[39;00m\n\u001b[1;32m 6\u001b[0m maris_lut_fn:\u001b[38;5;28mcallable\u001b[39m, \u001b[38;5;66;03m# Function that returns the MARIS lookup table path\u001b[39;00m\n\u001b[1;32m 7\u001b[0m maris_col_id:\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;66;03m# MARIS lookup table column name for the id\u001b[39;00m\n\u001b[1;32m 8\u001b[0m maris_col_name:\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;66;03m# MARIS lookup table column name for the name\u001b[39;00m\n\u001b[1;32m 9\u001b[0m provider_col_to_match:\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;66;03m# Data provider lookup table column name for the name to match\u001b[39;00m\n\u001b[1;32m 10\u001b[0m provider_col_key, \u001b[38;5;66;03m# Data provider lookup table column name for the key\u001b[39;00m\n\u001b[1;32m 11\u001b[0m fname_cache \u001b[38;5;66;03m# Cache file name\u001b[39;00m\n\u001b[1;32m 12\u001b[0m ):\n\u001b[1;32m 13\u001b[0m fc\u001b[38;5;241m.\u001b[39mstore_attr()\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcache_file \u001b[38;5;241m=\u001b[39m cache_path() \u001b[38;5;241m/\u001b[39m fname_cache\n",
"\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
]
}
],
"source": [
"#| exports\n",
"class Remapper():\n",
Expand Down Expand Up @@ -301,7 +314,7 @@
" df_lut = pd.DataFrame.from_dict(self.lut, orient='index', \n",
" columns=['matched_maris_name', 'source_name', 'match_score'])\n",
" df_lut.index.name = 'source_key'\n",
" return df_lut.sort_values(by='match_score', ascending=False)\n"
" return df_lut.sort_values(by='match_score', ascending=False)"
]
},
{
Expand Down Expand Up @@ -432,8 +445,9 @@
"(-10.0, 40.0, 5.0, 50.0)"
]
},
"execution_count": null,
"metadata": {},
"output_type": "display_data"
"output_type": "execute_result"
}
],
"source": [
Expand All @@ -453,8 +467,9 @@
"'POLYGON ((-10 40, 5 40, 5 50, -10 50, -10 40))'"
]
},
"execution_count": null,
"metadata": {},
"output_type": "display_data"
"output_type": "execute_result"
}
],
"source": [
Expand Down Expand Up @@ -486,8 +501,9 @@
"(0.0, 1.0, 0.0, 1.0)"
]
},
"execution_count": null,
"metadata": {},
"output_type": "display_data"
"output_type": "execute_result"
}
],
"source": [
Expand Down Expand Up @@ -646,7 +662,7 @@
" 'order': 'Decapoda',\n",
" 'family': 'Aristeidae',\n",
" 'genus': 'Aristeus',\n",
" 'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-06-10',\n",
" 'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-10-14',\n",
" 'lsid': 'urn:lsid:marinespecies.org:taxname:107083',\n",
" 'isMarine': 1,\n",
" 'isBrackish': 0,\n",
Expand All @@ -657,8 +673,9 @@
" 'modified': '2022-08-24T09:48:14.813Z'}]]"
]
},
"execution_count": null,
"metadata": {},
"output_type": "display_data"
"output_type": "execute_result"
}
],
"source": [
Expand Down Expand Up @@ -990,8 +1007,9 @@
"52 51 Soft clay 7"
]
},
"execution_count": null,
"metadata": {},
"output_type": "display_data"
"output_type": "execute_result"
}
],
"source": [
Expand Down Expand Up @@ -1363,7 +1381,7 @@
" 'order': 'Decapoda',\n",
" 'family': 'Aristeidae',\n",
" 'genus': 'Aristeus',\n",
" 'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-06-10',\n",
" 'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-10-14',\n",
" 'lsid': 'urn:lsid:marinespecies.org:taxname:107083',\n",
" 'isMarine': 1,\n",
" 'isBrackish': 0,\n",
Expand Down
Loading

0 comments on commit b3ebef6

Please sign in to comment.