Skip to content

Commit

Permalink
updating maris db species
Browse files Browse the repository at this point in the history
  • Loading branch information
franckalbinet committed Jun 6, 2024
1 parent 14d6cbb commit e63a98a
Show file tree
Hide file tree
Showing 3 changed files with 930 additions and 729 deletions.
2 changes: 0 additions & 2 deletions marisco/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,6 @@
'marisco/handlers/helcom.py'),
'marisco.handlers.helcom.get_varnames_lut': ( 'handlers/helcom.html#get_varnames_lut',
'marisco/handlers/helcom.py'),
'marisco.handlers.helcom.get_worms_species': ( 'handlers/helcom.html#get_worms_species',
'marisco/handlers/helcom.py'),
'marisco.handlers.helcom.load_data': ( 'handlers/helcom.html#load_data',
'marisco/handlers/helcom.py'),
'marisco.handlers.helcom.rename_cols': ( 'handlers/helcom.html#rename_cols',
Expand Down
138 changes: 50 additions & 88 deletions marisco/handlers/helcom.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
# %% auto 0
__all__ = ['varnames_lut_updates', 'coi_units_unc', 'renaming_unit_rules', 'coi_grp', 'renaming_rules', 'kw', 'load_data',
'rename_cols', 'LowerStripRdnNameCB', 'get_unique_nuclides', 'get_varnames_lut', 'RemapRdnNameCB',
'ParseTimeCB', 'fix_units', 'NormalizeUncUnitCB', 'get_maris_species', 'get_worms_species',
'LookupBiotaSpeciesCB', 'get_bodypart', 'LookupBiotaBodyPartCB', 'get_sediment', 'LookupSedimentCB',
'LookupUnitCB', 'RenameColumnCB', 'ReshapeLongToWide', 'get_attrs', 'encode']
'ParseTimeCB', 'fix_units', 'NormalizeUncUnitCB', 'get_maris_species', 'LookupBiotaSpeciesCB',
'get_bodypart', 'LookupBiotaBodyPartCB', 'get_sediment', 'LookupSedimentCB', 'LookupUnitCB',
'RenameColumnCB', 'ReshapeLongToWide', 'get_attrs', 'encode']

# %% ../../nbs/handlers/helcom.ipynb 6
# %% ../../nbs/handlers/helcom.ipynb 5
import pandas as pd # Python package that provides fast, flexible, and expressive data structures.
import numpy as np
from tqdm import tqdm # Python Progress Bar Library
Expand All @@ -25,10 +25,11 @@
DepthRangeCB, TimeRangeCB,
ZoteroCB, KeyValuePairCB)

from ..configs import base_path, nc_tpl_path, cfg, cache_path
from ..configs import base_path, nc_tpl_path, cfg, cache_path, species_lut_path, sediments_lut_path

from ..serializers import NetCDFEncoder

# %% ../../nbs/handlers/helcom.ipynb 11
# %% ../../nbs/handlers/helcom.ipynb 10
def load_data(src_dir,
smp_types=['SEA', 'SED', 'BIO']):
"Load HELCOM data and return the data in a dictionary of dataframes with the dictionary key as the sample type"
Expand All @@ -43,7 +44,7 @@ def load_data(src_dir,
dfs[lut_smp_type[smp_type]] = df
return dfs

# %% ../../nbs/handlers/helcom.ipynb 12
# %% ../../nbs/handlers/helcom.ipynb 11
def rename_cols(cols):
"Flatten multiindex columns"
new_cols = []
Expand All @@ -59,23 +60,23 @@ def rename_cols(cols):
new_cols.append(inner)
return new_cols

# %% ../../nbs/handlers/helcom.ipynb 27
# %% ../../nbs/handlers/helcom.ipynb 26
class LowerStripRdnNameCB(Callback):
"Convert nuclide names to lowercase & strip any trailing space(s)"
def __call__(self, tfm):
for k in tfm.dfs.keys():
tfm.dfs[k]['NUCLIDE'] = tfm.dfs[k]['NUCLIDE'].apply(
lambda x: x.lower().strip())

# %% ../../nbs/handlers/helcom.ipynb 31
# %% ../../nbs/handlers/helcom.ipynb 30
def get_unique_nuclides(dfs):
"Get list of unique radionuclide types measured across samples."
nuclides = []
for k in dfs.keys():
nuclides += dfs[k]['NUCLIDE'].unique().tolist()
return nuclides

# %% ../../nbs/handlers/helcom.ipynb 35
# %% ../../nbs/handlers/helcom.ipynb 34
varnames_lut_updates = {
'k-40': 'k40',
'cm243244': 'cm243_244_tot',
Expand All @@ -92,13 +93,13 @@ def get_unique_nuclides(dfs):
'cs145': 'cs137',
'cs146': 'cs137'}

# %% ../../nbs/handlers/helcom.ipynb 37
# %% ../../nbs/handlers/helcom.ipynb 36
def get_varnames_lut(dfs, lut=varnames_lut_updates):
lut = {n: n for n in set(get_unique_nuclides(dfs))}
lut.update(varnames_lut_updates)
return lut

# %% ../../nbs/handlers/helcom.ipynb 39
# %% ../../nbs/handlers/helcom.ipynb 38
class RemapRdnNameCB(Callback):
"Remap to MARIS radionuclide names."
def __init__(self,
Expand All @@ -110,25 +111,25 @@ def __call__(self, tfm):
for k in tfm.dfs.keys():
tfm.dfs[k]['NUCLIDE'].replace(lut, inplace=True)

# %% ../../nbs/handlers/helcom.ipynb 46
# %% ../../nbs/handlers/helcom.ipynb 45
class ParseTimeCB(Callback):
def __call__(self, tfm):
for k in tfm.dfs.keys():
tfm.dfs[k]['time'] = pd.to_datetime(tfm.dfs[k].DATE,
format='%m/%d/%y %H:%M:%S')

# %% ../../nbs/handlers/helcom.ipynb 49
# %% ../../nbs/handlers/helcom.ipynb 48
# Make measurement and uncertainty units consistent
def fix_units(df, meas_col, unc_col):
return df.apply(lambda row: row[unc_col] * row[meas_col]/100, axis=1)

# %% ../../nbs/handlers/helcom.ipynb 51
# %% ../../nbs/handlers/helcom.ipynb 50
# Columns of interest
coi_units_unc = [('seawater', 'VALUE_Bq/m³', 'ERROR%_m³'),
('biota', 'VALUE_Bq/kg', 'ERROR%'),
('sediment', 'VALUE_Bq/kg', 'ERROR%_kg')]

# %% ../../nbs/handlers/helcom.ipynb 53
# %% ../../nbs/handlers/helcom.ipynb 52
class NormalizeUncUnitCB(Callback):
"Convert from relative error % to uncertainty of activity unit"
def __init__(self, coi=coi_units_unc): fc.store_attr()
Expand All @@ -141,23 +142,27 @@ def fix_units(self, df, meas_col, unc_col):
return df.apply(lambda row: row[unc_col] * row[meas_col]/100, axis=1)

# %% ../../nbs/handlers/helcom.ipynb 59
def get_maris_species(fname_in, fname_cache, overwrite=False, verbose=False):
def get_maris_species(fname_in,
fname_cache,
overwrite=False,
verbose=False):
fname_cache = cache_path() / fname_cache
lut = {}
df = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')

if overwrite or (not fname_cache.exists()):
if verbose:
print('Source:Destination:Score')
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
match = match_maris_species(row['SCIENTIFIC NAME'])
'''
Include the source, destination and score in lut.
'''
lut[row['RUBIN']] = {'id': match.iloc[0]['species_id'],'name': match.iloc[0]['species'],'source': row["SCIENTIFIC NAME"], 'status':'marisco_cdl', 'match_type': match.iloc[0]['score']}
for _, row in tqdm(df.iterrows(), total=df.shape[0], leave=True):
match = match_maris_species(species_lut_path(), row['SCIENTIFIC NAME'])
lut[row['RUBIN']] = {
'matched_id': match.iloc[0]['species_id'],
'matched_maris_name': match.iloc[0]['species'],
'source_name': row["SCIENTIFIC NAME"],
'match_score': match.iloc[0]['score']
}
if verbose:
print(f'{row["SCIENTIFIC NAME"]}: {match.iloc[0]["species"]}: {match.iloc[0]["score"]}')
# Return a verbose lut
print(f'Source name: {row["SCIENTIFIC NAME"]} | ' +
f'MARIS matched name: {match.iloc[0]["species"]} | ' +
f'Matched score: {match.iloc[0]["score"]}', flush=True)

fc.save_pickle(fname_cache, lut)
else:
Expand All @@ -166,58 +171,17 @@ def get_maris_species(fname_in, fname_cache, overwrite=False, verbose=False):
return lut

# %% ../../nbs/handlers/helcom.ipynb 65
def get_worms_species(fname_in, fname_cache, load_lut=False, overwrite=False):
fname_cache = cache_path() / fname_cache
lut = {}

if overwrite or (not fname_cache.exists()):
df = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')

if load_lut:
'''
open and read the LUT file
'''
lut = fc.load_pickle(fname_cache)

for _, row in tqdm(df[['RUBIN', 'SCIENTIFIC NAME']].iterrows(), total=df.shape[0]):
if load_lut:
'''
If row['RUBIN'] in LUT and match_type equals 0 then dont complete the lookup with WORMS.
'''
if row['RUBIN'] in lut:
if lut[row['RUBIN']]['match_type'] == 0:
continue
res = match_worms(row['SCIENTIFIC NAME'])
if (res == -1):
print(f"No match found for {row['RUBIN']} ({row['SCIENTIFIC NAME']})")
id = -1
lut[row['RUBIN']] = {'id': id, 'name': '', 'source': row["SCIENTIFIC NAME"] ,'status': 'No match', 'match_type': 'No match', 'unacceptreason':'No match'}
else:
if len(res[0]) > 1:
print(f"Several matches for {row['RUBIN']} ({row['SCIENTIFIC NAME']})")

id, name, status, match_type,unacceptreason = [res[0][0].get(key)
for key in ['AphiaID', 'scientificname', 'status', 'match_type','unacceptreason']]

lut[row['RUBIN']] = {'id': id, 'name': name, 'source': row["SCIENTIFIC NAME"] ,'status': status, 'match_type': match_type, 'unacceptreason':unacceptreason}
fc.save_pickle(fname_cache, lut)
else:
lut = fc.load_pickle(fname_cache)

return lut

# %% ../../nbs/handlers/helcom.ipynb 71
class LookupBiotaSpeciesCB(Callback):
'Match species with MARIS database.'
def __init__(self, fn_lut): fc.store_attr()
def __call__(self, tfm):
lut = self.fn_lut()
tfm.dfs['biota']['species_id'] = tfm.dfs['biota']['RUBIN'].apply(
lambda x: lut[x.strip()]['id'])
tfm.dfs['biota']['species_id'] = tfm.dfs['biota']['RUBIN'].apply(lambda x: lut[x.strip()]['id'])

# Remove data with a species_id of -1.
tfm.dfs['biota']=tfm.dfs['biota'].drop(tfm.dfs['biota'][tfm.dfs['biota']['species_id'] == -1 ].index)
# tfm.dfs['biota'] = tfm.dfs['biota'].drop(tfm.dfs['biota'][tfm.dfs['biota']['species_id'] == -1 ].index)

# %% ../../nbs/handlers/helcom.ipynb 75
# %% ../../nbs/handlers/helcom.ipynb 69
def get_bodypart(verbose=False):
"Naive lut - TO BE REFACTORED"
lut={
Expand Down Expand Up @@ -246,15 +210,15 @@ def get_bodypart(verbose=False):
print (str(helcom_tissue[helcom_tissue.TISSUE==int(k)].TISSUE_DESCRIPTION.values[0]) + ' : ' + str(marris_dbo_bodypar[marris_dbo_bodypar.bodypar_id==v].bodypar.values[0]))
return lut

# %% ../../nbs/handlers/helcom.ipynb 76
# %% ../../nbs/handlers/helcom.ipynb 70
class LookupBiotaBodyPartCB(Callback):
'Update bodypart id based on MARIS dbo_bodypar.xlsx'
def __init__(self, fn_lut): fc.store_attr()
def __call__(self, tfm):
lut = self.fn_lut()
tfm.dfs['biota']['body_part'] = tfm.dfs['biota']['TISSUE'].apply(lambda x: lut[x])

# %% ../../nbs/handlers/helcom.ipynb 82
# %% ../../nbs/handlers/helcom.ipynb 75
def get_sediment(verbose=False):
lut = {}
if verbose: print('Source:Destination')
Expand All @@ -266,7 +230,7 @@ def get_sediment(verbose=False):
if verbose: print(f'({row["SEDI"]}) {row["SEDIMENT TYPE"]}: ({match.iloc[0,0]}) {match.iloc[0,1]}')
return lut

# %% ../../nbs/handlers/helcom.ipynb 87
# %% ../../nbs/handlers/helcom.ipynb 80
class LookupSedimentCB(Callback):
'Update sediment id based on MARIS dbo_sedtype.xlsx'
def __init__(self, fn_lut): fc.store_attr()
Expand All @@ -278,14 +242,12 @@ def __call__(self, tfm):
tfm.dfs['sediment']['SEDI'].replace(73, -99, inplace=True)
tfm.dfs['sediment']['sed_type'] = tfm.dfs['sediment']['SEDI'].apply(lambda x: lut[x])

# %% ../../nbs/handlers/helcom.ipynb 91
# %% ../../nbs/handlers/helcom.ipynb 84
# Define unit names renaming rules
renaming_unit_rules = { 'VALUE_Bq/m³': 1, #'Bq/m3'
'VALUE_Bq/kg': 3 #'Bq/kg'
}

renaming_unit_rules = {'VALUE_Bq/m³': 1, #'Bq/m3'
'VALUE_Bq/kg': 3 } #'Bq/kg'

# %% ../../nbs/handlers/helcom.ipynb 92
# %% ../../nbs/handlers/helcom.ipynb 85
class LookupUnitCB(Callback):
def __init__(self,
renaming_unit_rules=renaming_unit_rules):
Expand All @@ -297,7 +259,7 @@ def __call__(self, tfm):
tfm.dfs[grp]['unit'] = np.where(tfm.dfs[grp].loc[:,k].notna(), np.int64(v), np.int64(0))


# %% ../../nbs/handlers/helcom.ipynb 97
# %% ../../nbs/handlers/helcom.ipynb 90
# Define columns of interest by sample type
coi_grp = {'seawater': ['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time',
'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)','unit'],
Expand All @@ -309,7 +271,7 @@ def __call__(self, tfm):
'species_id', 'body_part','unit']}


# %% ../../nbs/handlers/helcom.ipynb 98
# %% ../../nbs/handlers/helcom.ipynb 91
# Define column names renaming rules
renaming_rules = {
'NUCLIDE': 'nuclide',
Expand All @@ -327,7 +289,7 @@ def __call__(self, tfm):
}


# %% ../../nbs/handlers/helcom.ipynb 99
# %% ../../nbs/handlers/helcom.ipynb 92
class RenameColumnCB(Callback):
def __init__(self,
coi=coi_grp,
Expand All @@ -342,7 +304,7 @@ def __call__(self, tfm):
# Rename cols
tfm.dfs[k].rename(columns=self.renaming_rules, inplace=True)

# %% ../../nbs/handlers/helcom.ipynb 104
# %% ../../nbs/handlers/helcom.ipynb 97
class ReshapeLongToWide(Callback):
def __init__(self): fc.store_attr()

Expand Down Expand Up @@ -371,7 +333,7 @@ def __call__(self, tfm):
# Set index
tfm.dfs[k].index.name = 'sample'

# %% ../../nbs/handlers/helcom.ipynb 117
# %% ../../nbs/handlers/helcom.ipynb 110
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
Expand All @@ -384,7 +346,7 @@ def __call__(self, tfm):
'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']


# %% ../../nbs/handlers/helcom.ipynb 118
# %% ../../nbs/handlers/helcom.ipynb 111
def get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw):
return GlobAttrsFeeder(tfm.dfs, cbs=[
BboxCB(),
Expand All @@ -395,7 +357,7 @@ def get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw):
KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))
])()

# %% ../../nbs/handlers/helcom.ipynb 124
# %% ../../nbs/handlers/helcom.ipynb 117
def encode(fname_in, fname_out, nc_tpl_path, **kwargs):
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
Expand Down
Loading

0 comments on commit e63a98a

Please sign in to comment.