Skip to content

Commit

Permalink
review helcom handler up to species look-up
Browse files Browse the repository at this point in the history
  • Loading branch information
franckalbinet committed Jun 5, 2024
1 parent 76a46dd commit 14d6cbb
Show file tree
Hide file tree
Showing 5 changed files with 339 additions and 796 deletions.
59 changes: 28 additions & 31 deletions marisco/handlers/helcom.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
'LookupBiotaSpeciesCB', 'get_bodypart', 'LookupBiotaBodyPartCB', 'get_sediment', 'LookupSedimentCB',
'LookupUnitCB', 'RenameColumnCB', 'ReshapeLongToWide', 'get_attrs', 'encode']

# %% ../../nbs/handlers/helcom.ipynb 7
# %% ../../nbs/handlers/helcom.ipynb 6
import pandas as pd # Python package that provides fast, flexible, and expressive data structures.
import numpy as np
from tqdm import tqdm # Python Progress Bar Library
Expand All @@ -28,7 +28,7 @@
from ..configs import base_path, nc_tpl_path, cfg, cache_path
from ..serializers import NetCDFEncoder

# %% ../../nbs/handlers/helcom.ipynb 13
# %% ../../nbs/handlers/helcom.ipynb 11
def load_data(src_dir,
smp_types=['SEA', 'SED', 'BIO']):
"Load HELCOM data and return the data in a dictionary of dataframes with the dictionary key as the sample type"
Expand All @@ -43,7 +43,7 @@ def load_data(src_dir,
dfs[lut_smp_type[smp_type]] = df
return dfs

# %% ../../nbs/handlers/helcom.ipynb 14
# %% ../../nbs/handlers/helcom.ipynb 12
def rename_cols(cols):
"Flatten multiindex columns"
new_cols = []
Expand All @@ -59,23 +59,23 @@ def rename_cols(cols):
new_cols.append(inner)
return new_cols

# %% ../../nbs/handlers/helcom.ipynb 31
# %% ../../nbs/handlers/helcom.ipynb 27
class LowerStripRdnNameCB(Callback):
    "Convert nuclide names to lowercase & strip leading/trailing whitespace."
    def __call__(self, tfm):
        # Vectorized string ops: missing values stay missing instead of
        # raising AttributeError as a per-row `x.lower()` call would.
        for df in tfm.dfs.values():
            df['NUCLIDE'] = df['NUCLIDE'].str.lower().str.strip()

# %% ../../nbs/handlers/helcom.ipynb 35
# %% ../../nbs/handlers/helcom.ipynb 31
def get_unique_nuclides(dfs):
    """Get list of unique radionuclide types measured across samples.

    `dfs` maps a sample type to a dataframe holding a `NUCLIDE` column.
    Each nuclide is returned once, in first-seen order, even when it is
    present in several sample types.
    """
    # A dict keeps insertion order and drops repeats across dataframes.
    seen = {}
    for df in dfs.values():
        seen.update(dict.fromkeys(df['NUCLIDE'].unique().tolist()))
    return list(seen)

# %% ../../nbs/handlers/helcom.ipynb 39
# %% ../../nbs/handlers/helcom.ipynb 35
varnames_lut_updates = {
'k-40': 'k40',
'cm243244': 'cm243_244_tot',
Expand All @@ -92,14 +92,13 @@ def get_unique_nuclides(dfs):
'cs145': 'cs137',
'cs146': 'cs137'}


# %% ../../nbs/handlers/helcom.ipynb 41
# %% ../../nbs/handlers/helcom.ipynb 37
def get_varnames_lut(dfs, lut=varnames_lut_updates):
    """Build the nuclide name look-up table for `dfs`.

    Every nuclide found in `dfs` first maps to itself; the entries of `lut`
    (by default `varnames_lut_updates`) are then applied on top.
    """
    # Use a separate local so the `lut` argument is honoured rather than
    # overwritten and replaced by the module-level default.
    varnames = {n: n for n in set(get_unique_nuclides(dfs))}
    varnames.update(lut)
    return varnames

# %% ../../nbs/handlers/helcom.ipynb 43
# %% ../../nbs/handlers/helcom.ipynb 39
class RemapRdnNameCB(Callback):
"Remap to MARIS radionuclide names."
def __init__(self,
Expand All @@ -111,28 +110,27 @@ def __call__(self, tfm):
for k in tfm.dfs.keys():
tfm.dfs[k]['NUCLIDE'].replace(lut, inplace=True)

# %% ../../nbs/handlers/helcom.ipynb 50
# %% ../../nbs/handlers/helcom.ipynb 46
class ParseTimeCB(Callback):
    "Parse the `DATE` column of each dataframe into a datetime `time` column."
    def __call__(self, tfm):
        for grp in tfm.dfs:
            df = tfm.dfs[grp]
            parsed = pd.to_datetime(df.DATE, format='%m/%d/%y %H:%M:%S')
            df['time'] = parsed

# %% ../../nbs/handlers/helcom.ipynb 54
# %% ../../nbs/handlers/helcom.ipynb 49
# Make measurement and uncertainty units consistent
def fix_units(df, meas_col, unc_col):
    """Return `unc_col * meas_col / 100` for every row of `df` as a Series.

    The name of the columns of interest suggests `unc_col` holds a relative
    error in percent and `meas_col` the measured value -- confirm against
    the caller.
    """
    # Column-wise arithmetic replaces a per-row Python lambda: same values,
    # no row-by-row overhead, and an empty frame still yields a Series.
    return df[unc_col] * df[meas_col] / 100

# %% ../../nbs/handlers/helcom.ipynb 56
# %% ../../nbs/handlers/helcom.ipynb 51
# Columns of interest: one tuple per sample type, used as the default `coi`
# of `NormalizeUncUnitCB`.
# NOTE(review): tuples look like (sample group, measured value column,
# relative error % column) -- inferred from the column names, confirm.
coi_units_unc = [('seawater', 'VALUE_Bq/m³', 'ERROR%_m³'),
('biota', 'VALUE_Bq/kg', 'ERROR%'),
('sediment', 'VALUE_Bq/kg', 'ERROR%_kg')]

# %% ../../nbs/handlers/helcom.ipynb 58
# %% ../../nbs/handlers/helcom.ipynb 53
class NormalizeUncUnitCB(Callback):
"Convert from relative error % to uncertainty of activity unit"

def __init__(self, coi=coi_units_unc): fc.store_attr()

def __call__(self, tfm):
Expand All @@ -142,9 +140,8 @@ def __call__(self, tfm):
def fix_units(self, df, meas_col, unc_col):
return df.apply(lambda row: row[unc_col] * row[meas_col]/100, axis=1)

# %% ../../nbs/handlers/helcom.ipynb 65
# %% ../../nbs/handlers/helcom.ipynb 59
def get_maris_species(fname_in, fname_cache, overwrite=False, verbose=False):

fname_cache = cache_path() / fname_cache
lut = {}
df = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')
Expand All @@ -168,7 +165,7 @@ def get_maris_species(fname_in, fname_cache, overwrite=False, verbose=False):

return lut

# %% ../../nbs/handlers/helcom.ipynb 71
# %% ../../nbs/handlers/helcom.ipynb 65
def get_worms_species(fname_in, fname_cache, load_lut=False, overwrite=False):
fname_cache = cache_path() / fname_cache
lut = {}
Expand Down Expand Up @@ -209,7 +206,7 @@ def get_worms_species(fname_in, fname_cache, load_lut=False, overwrite=False):

return lut

# %% ../../nbs/handlers/helcom.ipynb 77
# %% ../../nbs/handlers/helcom.ipynb 71
class LookupBiotaSpeciesCB(Callback):
'Match species with MARIS database.'
def __init__(self, fn_lut): fc.store_attr()
Expand All @@ -220,7 +217,7 @@ def __call__(self, tfm):
# Remove data with a species_id of -1.
tfm.dfs['biota']=tfm.dfs['biota'].drop(tfm.dfs['biota'][tfm.dfs['biota']['species_id'] == -1 ].index)

# %% ../../nbs/handlers/helcom.ipynb 81
# %% ../../nbs/handlers/helcom.ipynb 75
def get_bodypart(verbose=False):
"Naive lut - TO BE REFACTORED"
lut={
Expand Down Expand Up @@ -249,15 +246,15 @@ def get_bodypart(verbose=False):
print (str(helcom_tissue[helcom_tissue.TISSUE==int(k)].TISSUE_DESCRIPTION.values[0]) + ' : ' + str(marris_dbo_bodypar[marris_dbo_bodypar.bodypar_id==v].bodypar.values[0]))
return lut

# %% ../../nbs/handlers/helcom.ipynb 82
# %% ../../nbs/handlers/helcom.ipynb 76
class LookupBiotaBodyPartCB(Callback):
    "Fill the biota `body_part` column by looking up each `TISSUE` value in the table returned by `fn_lut`."
    def __init__(self, fn_lut): fc.store_attr()

    def __call__(self, tfm):
        bodypart_by_tissue = self.fn_lut()

        def to_bodypart(tissue):
            # Plain indexing on purpose: an unknown tissue code raises KeyError.
            return bodypart_by_tissue[tissue]

        biota = tfm.dfs['biota']
        biota['body_part'] = biota['TISSUE'].apply(to_bodypart)

# %% ../../nbs/handlers/helcom.ipynb 88
# %% ../../nbs/handlers/helcom.ipynb 82
def get_sediment(verbose=False):
lut = {}
if verbose: print('Source:Destination')
Expand All @@ -269,7 +266,7 @@ def get_sediment(verbose=False):
if verbose: print(f'({row["SEDI"]}) {row["SEDIMENT TYPE"]}: ({match.iloc[0,0]}) {match.iloc[0,1]}')
return lut

# %% ../../nbs/handlers/helcom.ipynb 93
# %% ../../nbs/handlers/helcom.ipynb 87
class LookupSedimentCB(Callback):
'Update sediment id based on MARIS dbo_sedtype.xlsx'
def __init__(self, fn_lut): fc.store_attr()
Expand All @@ -281,14 +278,14 @@ def __call__(self, tfm):
tfm.dfs['sediment']['SEDI'].replace(73, -99, inplace=True)
tfm.dfs['sediment']['sed_type'] = tfm.dfs['sediment']['SEDI'].apply(lambda x: lut[x])

# %% ../../nbs/handlers/helcom.ipynb 97
# %% ../../nbs/handlers/helcom.ipynb 91
# Unit look-up rules: maps a measurement value column name to the integer
# code written to the `unit` column by `LookupUnitCB` (its default argument).
# The trailing comments give the unit label each code stands for.
# NOTE(review): codes presumably are MARIS unit ids -- confirm.
renaming_unit_rules = { 'VALUE_Bq/m³': 1, #'Bq/m3'
'VALUE_Bq/kg': 3 #'Bq/kg'
}


# %% ../../nbs/handlers/helcom.ipynb 98
# %% ../../nbs/handlers/helcom.ipynb 92
class LookupUnitCB(Callback):
def __init__(self,
renaming_unit_rules=renaming_unit_rules):
Expand All @@ -300,7 +297,7 @@ def __call__(self, tfm):
tfm.dfs[grp]['unit'] = np.where(tfm.dfs[grp].loc[:,k].notna(), np.int64(v), np.int64(0))


# %% ../../nbs/handlers/helcom.ipynb 103
# %% ../../nbs/handlers/helcom.ipynb 97
# Define columns of interest by sample type
coi_grp = {'seawater': ['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time',
'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)','unit'],
Expand All @@ -312,7 +309,7 @@ def __call__(self, tfm):
'species_id', 'body_part','unit']}


# %% ../../nbs/handlers/helcom.ipynb 104
# %% ../../nbs/handlers/helcom.ipynb 98
# Define column names renaming rules
renaming_rules = {
'NUCLIDE': 'nuclide',
Expand All @@ -330,7 +327,7 @@ def __call__(self, tfm):
}


# %% ../../nbs/handlers/helcom.ipynb 105
# %% ../../nbs/handlers/helcom.ipynb 99
class RenameColumnCB(Callback):
def __init__(self,
coi=coi_grp,
Expand All @@ -345,7 +342,7 @@ def __call__(self, tfm):
# Rename cols
tfm.dfs[k].rename(columns=self.renaming_rules, inplace=True)

# %% ../../nbs/handlers/helcom.ipynb 110
# %% ../../nbs/handlers/helcom.ipynb 104
class ReshapeLongToWide(Callback):
def __init__(self): fc.store_attr()

Expand Down Expand Up @@ -374,7 +371,7 @@ def __call__(self, tfm):
# Set index
tfm.dfs[k].index.name = 'sample'

# %% ../../nbs/handlers/helcom.ipynb 123
# %% ../../nbs/handlers/helcom.ipynb 117
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
Expand All @@ -387,7 +384,7 @@ def __call__(self, tfm):
'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']


# %% ../../nbs/handlers/helcom.ipynb 124
# %% ../../nbs/handlers/helcom.ipynb 118
def get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw):
return GlobAttrsFeeder(tfm.dfs, cbs=[
BboxCB(),
Expand All @@ -398,7 +395,7 @@ def get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw):
KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))
])()

# %% ../../nbs/handlers/helcom.ipynb 130
# %% ../../nbs/handlers/helcom.ipynb 124
def encode(fname_in, fname_out, nc_tpl_path, **kwargs):
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
Expand Down
2 changes: 1 addition & 1 deletion marisco/nc_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def create_variable(self:NCTemplater,
self.dim['name'])
nc_var.setncatts(attrs)

# %% ../nbs/api/nc_template.ipynb 29
# %% ../nbs/api/nc_template.ipynb 27
@patch
def generate(self:NCTemplater):
"Generate CDL"
Expand Down
9 changes: 4 additions & 5 deletions marisco/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,16 +125,15 @@ def match_worms(

# %% ../nbs/api/utils.ipynb 27
def match_maris_species(
lut_path:str, # Path to MARIS species authoritative species look-up table
name:str, # Name of species to look up
col_lookup:str='species', # Name of the column where the character strings match
dist_fn:Callable=jf.levenshtein_distance, # Jellyfish distance to use
coi:list=['species_id', 'species', 'Taxonname', 'TaxonDBID',
'AphiaID', 'scientificname', 'status', 'rank',
'match_type'], # Columns of interest to display
nresults:int=10 # Maxiumn number of results to return
coi:list=['species_id', 'species', 'Taxonname', 'TaxonDBID'], # Columns of interest to display
nresults:int=10 # Maximum number of results to return
):
"Lookup `name` in MARIS master species lookup table"
df = pd.read_excel(species_lut_path())
df = pd.read_excel(lut_path)
df = df.dropna(subset=col_lookup)
df = df.astype({'species_id':'int'})
results = []
Expand Down
Loading

0 comments on commit 14d6cbb

Please sign in to comment.