Skip to content

Commit

Permalink
accept proposed change to not hardcode the '_' separator in the nc te…
Browse files Browse the repository at this point in the history
…mplater but rather define it in configs
  • Loading branch information
franckalbinet committed Jun 3, 2024
1 parent 803df95 commit 76a46dd
Show file tree
Hide file tree
Showing 10 changed files with 273 additions and 109 deletions.
37 changes: 37 additions & 0 deletions marisco/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,43 @@
'marisco/nc_template.py'),
'marisco.nc_template.NCTemplater.nuclide_vars': ( 'api/nc_template.html#nctemplater.nuclide_vars',
'marisco/nc_template.py')},
'marisco.netcdf_to_csv': { 'marisco.netcdf_to_csv.LookupNuclideIdCB': ( 'handlers/netcdf_to_csv.html#lookupnuclideidcb',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.LookupNuclideIdCB.__call__': ( 'handlers/netcdf_to_csv.html#__call__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.LookupNuclideIdCB.__init__': ( 'handlers/netcdf_to_csv.html#__init__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.LookupSampleType': ( 'handlers/netcdf_to_csv.html#lookupsampletype',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.LookupSampleType.__call__': ( 'handlers/netcdf_to_csv.html#__call__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.LookupSampleType.__init__': ( 'handlers/netcdf_to_csv.html#__init__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.ParseTimeCB': ( 'handlers/netcdf_to_csv.html#parsetimecb',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.ParseTimeCB.__call__': ( 'handlers/netcdf_to_csv.html#__call__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.ParseTimeCB.__init__': ( 'handlers/netcdf_to_csv.html#__init__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.ParseTimeCB.format_time': ( 'handlers/netcdf_to_csv.html#format_time',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.RenameColumnCB': ( 'handlers/netcdf_to_csv.html#renamecolumncb',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.RenameColumnCB.__call__': ( 'handlers/netcdf_to_csv.html#__call__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.RenameColumnCB.__init__': ( 'handlers/netcdf_to_csv.html#__init__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.TransposeNuclideColumns': ( 'handlers/netcdf_to_csv.html#transposenuclidecolumns',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.TransposeNuclideColumns.__call__': ( 'handlers/netcdf_to_csv.html#__call__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.TransposeNuclideColumns.__init__': ( 'handlers/netcdf_to_csv.html#__init__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.TransposeNuclideColumns.transpose_nuclides': ( 'handlers/netcdf_to_csv.html#transpose_nuclides',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.encode': ('handlers/netcdf_to_csv.html#encode', 'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.get_nucnames_lut': ( 'handlers/netcdf_to_csv.html#get_nucnames_lut',
'marisco/netcdf_to_csv.py')},
'marisco.serializers': { 'marisco.serializers.NetCDFEncoder': ('api/serializers.html#netcdfencoder', 'marisco/serializers.py'),
'marisco.serializers.NetCDFEncoder.__init__': ( 'api/serializers.html#__init__',
'marisco/serializers.py'),
Expand Down
20 changes: 10 additions & 10 deletions marisco/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,79 +240,79 @@ def cache_path(): return Path(cfg()['dirs']['cache'])
},
'suffixes': {
'uncertainty': {
'name': 'unc',
'name': '_unc',
'attrs': {
'long_name': ' uncertainty',
'standard_name': '_uncertainty'
},
'dtype': 'f4'
},
'detection_limit': {
'name': 'dl',
'name': '_dl',
'attrs': {
'long_name': ' detection limit',
'standard_name': '_detection_limit'
},
'dtype': 'dl_t'
},
'volume': {
'name': 'vol',
'name': '_vol',
'attrs': {
'long_name': ' volume',
'standard_name': '_volume'
},
'dtype': 'f4'
},
'salinity': {
'name': 'sal',
'name': '_sal',
'attrs': {
'long_name': ' salinity',
'standard_name': '_sal'
},
'dtype': 'f4'
},
'temperature': {
'name': 'temp',
'name': '_temp',
'attrs': {
'long_name': ' temperature',
'standard_name': '_temp'
},
'dtype': 'f4'
},
'filtered': {
'name': 'filt',
'name': '_filt',
'attrs': {
'long_name': ' filtered',
'standard_name': '_filtered'
},
'dtype': 'filt_t'
},
'counting_method': {
'name': 'counmet',
'name': '_counmet',
'attrs': {
'long_name': ' counting method',
'standard_name': '_counting_method'
},
'dtype': 'counmet_t'
},
'sampling_method': {
'name': 'sampmet',
'name': '_sampmet',
'attrs': {
'long_name': ' sampling method',
'standard_name': '_sampling_method'
},
'dtype': 'sampmet_t'
},
'preparation_method': {
'name': 'prepmet',
'name': '_prepmet',
'attrs': {
'long_name': ' preparation method',
'standard_name': '_preparation_method'
},
'dtype': 'prepmet_t'
},
'unit': {
'name': 'unit',
'name': '_unit',
'attrs': {
'long_name': ' unit',
'standard_name': '_unit'
Expand Down
3 changes: 2 additions & 1 deletion marisco/nc_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ def derive(
) -> dict: # Derived variable name and associated attributes
"Derive NetCDF nuclide-dependent variable names & attributes as defined in CDL."
return {
'name': nuclide['name'] + '_' + suffix['name'],
# 'name': nuclide['name'] + '_' + suffix['name'],
'name': nuclide['name'] + suffix['name'],
'dtype': suffix['dtype'], # Using dtype from suffix
'attrs': {key: nuclide['attrs'][key] + suffix['attrs'][key] for key in nuclide['attrs']}
}
Expand Down
152 changes: 152 additions & 0 deletions marisco/netcdf_to_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/handlers/netcdf_to_csv.ipynb.

# %% auto 0
__all__ = ['sample_cols_grp', 'sample_type_lut', 'coi_grp', 'renaming_rules', 'TransposeNuclideColumns', 'ParseTimeCB',
           'LookupSampleType', 'get_nucnames_lut', 'LookupNuclideIdCB', 'RenameColumnCB', 'encode']

# %% ../nbs/handlers/netcdf_to_csv.ipynb 15
# Define cols that are not nuclides
# Sample-metadata columns per sample-type group; every remaining column in a
# dataframe is treated as nuclide-related (activity, uncertainty, unit, detection limit).
sample_cols_grp = {'seawater': ['sample','lon', 'lat', 'depth', 'time'],
                   'sediment': ['sample', 'lon', 'lat', 'depth', 'time', 'sed_type'],
                   'biota': ['sample', 'lon', 'lat', 'depth', 'time', 'species_id', 'body_part']}

# %% ../nbs/handlers/netcdf_to_csv.ipynb 16
class TransposeNuclideColumns(Callback):
    "Transpose NetCDF nuclide data from wide (one column per nuclide/suffix) to long format."
    def __init__(self,
                 cols_grp=sample_cols_grp  # Non-nuclide (sample metadata) columns per sample-type group
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            tfm.dfs[grp] = self.transpose_nuclides(tfm.dfs[grp], grp)

    def transpose_nuclides(self, df, group):
        "Melt nuclide, uncertainty, unit and detection-limit columns into rows and merge them."
        sample_cols = self.cols_grp[group]
        measure_cols = list(set(df.columns) - set(sample_cols))
        # Classify columns by their *trailing* suffix. Using `endswith` instead of
        # substring membership (`'_unc' in x`) so a nuclide column whose name merely
        # contains '_unc'/'_unit'/'_dl' mid-name cannot be misclassified.
        unc_cols = [c for c in measure_cols if c.endswith('_unc')]
        unit_cols = [c for c in measure_cols if c.endswith('_unit')]
        dl_cols = [c for c in measure_cols if c.endswith('_dl')]
        nuclide_cols = list(set(measure_cols) - set(unc_cols + unit_cols + dl_cols))

        def _melt(cols, value_name, suffix=None):
            # Melt `cols` to long form; strip `suffix` so rows join on the bare nuclide name.
            out = pd.melt(frame=df, id_vars=sample_cols, value_vars=cols,
                          var_name='nuclide', value_name=value_name)
            if suffix:
                # Anchored at end of string: only the trailing suffix is removed,
                # never an accidental interior occurrence.
                out['nuclide'] = out['nuclide'].str.replace(f'{suffix}$', '', regex=True)
            return out

        # Combine activity, uncertainty, unit and detection-limit on sample keys + nuclide.
        combine_on = sample_cols + ['nuclide']
        result = _melt(nuclide_cols, 'activity')
        result = pd.merge(result, _melt(unc_cols, 'uncertainty', '_unc'), how='outer', on=combine_on)
        result = pd.merge(result, _melt(unit_cols, 'unit_id', '_unit'), how='outer', on=combine_on)
        result = pd.merge(result, _melt(dl_cols, 'dl', '_dl'), how='outer', on=combine_on)

        # Keep rows carrying at least one piece of information:
        # non-NaN activity/uncertainty/dl, or a non-zero unit id.
        keep = (result['activity'].notna() | result['uncertainty'].notna()
                | result['dl'].notna() | (result['unit_id'] != 0))
        return result[keep]

# %% ../nbs/handlers/netcdf_to_csv.ipynb 20
class ParseTimeCB(Callback):
    "Decode numeric NetCDF time values into Python datetimes using configured units."
    def __init__(self, cfg): fc.store_attr()

    def __call__(self, tfm):
        for grp, df in tfm.dfs.items():
            df['time'] = df['time'].apply(self.format_time)

    def format_time(self, x):
        "Convert one numeric time value `x` according to `cfg['units']['time']`."
        return num2pydate(x, units=self.cfg['units']['time'])

# %% ../nbs/handlers/netcdf_to_csv.ipynb 24
# %% ../nbs/handlers/netcdf_to_csv.ipynb 24
# Define sample types groups
# MARIS `samptype_id` codes keyed by sample-type group name.
sample_type_lut = {'seawater': 1,
                   'sediment': 2,
                   'biota': 3,
                   'suspended matter': 4}

# %% ../nbs/handlers/netcdf_to_csv.ipynb 25
class LookupSampleType(Callback):
    "Attach the MARIS sample-type id (`samptype_id`) to every dataframe."
    def __init__(self, lut=sample_type_lut): fc.store_attr()

    def __call__(self, tfm):
        for grp, df in tfm.dfs.items():
            df['samptype_id'] = self.lut[grp]


# %% ../nbs/handlers/netcdf_to_csv.ipynb 29
def get_nucnames_lut():
    "Return a {nc_name: nuclide_id} lookup read from the `dbo_nuclide.xlsx` spreadsheet."
    df_nuclide = pd.read_excel(lut_path() / 'dbo_nuclide.xlsx',
                               usecols=['nuclide_id', 'nc_name'])
    return df_nuclide.set_index('nc_name')['nuclide_id'].to_dict()

# %% ../nbs/handlers/netcdf_to_csv.ipynb 30
class LookupNuclideIdCB(Callback):
    "Lookup MARIS nuclide_id."
    def __init__(self,
                 fn_lut=get_nucnames_lut  # Factory returning a {nc_name: nuclide_id} dict
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for grp, df in tfm.dfs.items():
            # `replace` maps known nuclide names to their numeric id;
            # the cast to int64 normalizes the resulting column dtype.
            df['nuclide_id'] = df['nuclide'].replace(lut).astype('int64')

# %% ../nbs/handlers/netcdf_to_csv.ipynb 39
# %% ../nbs/handlers/netcdf_to_csv.ipynb 39
# Define columns of interest by sample type
# Columns retained (in order) for each sample-type group before renaming.
coi_grp = {'seawater': ['sample', 'lon', 'lat', 'depth', 'time', 'nuclide', 'activity',
                        'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id'],
           'sediment' : ['sample', 'lon', 'lat', 'depth', 'time', 'sed_type', 'nuclide',
                         'activity', 'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id'],
           'biota' : ['sample', 'lon', 'lat', 'depth', 'time', 'species_id', 'body_part',
                      'nuclide', 'activity', 'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id']}

# %% ../nbs/handlers/netcdf_to_csv.ipynb 40
# Define column names renaming rules
# Maps NetCDF-side column names to MARIS/OpenRefine export names.
# NOTE(review): 'nuclide' is renamed to 'nuclide_id' while coi_grp already keeps a
# 'nuclide_id' column, so the renamed frame ends up with two 'nuclide_id' columns —
# confirm this duplication is intended by the OpenRefine CSV format.
renaming_rules = {
    'lat': 'latitude',
    'lon': 'longitude',
    'time': 'begperiod',
    'depth': 'sampdepth',
    'nuclide': 'nuclide_id',
    'uncertainty': 'uncertaint',
    'dl': 'detection',
    'sed_type': 'sedtype_id (0)',
    'species_id': 'species_id (0)',
    'body_part': 'bodypar_id',
    }

# %% ../nbs/handlers/netcdf_to_csv.ipynb 41
class RenameColumnCB(Callback):
    "Keep only the columns of interest per group and rename them to export names."
    def __init__(self,
                 coi=coi_grp,
                 renaming_rules=renaming_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for grp in tfm.dfs:
            # Select cols of interest, then apply the renaming rules.
            subset = tfm.dfs[grp].loc[:, self.coi[grp]]
            tfm.dfs[grp] = subset.rename(columns=self.renaming_rules)

# %% ../nbs/handlers/netcdf_to_csv.ipynb 45
def encode(fname_in, fname_out, **kwargs):
    "Convert a MARIS NetCDF file (`fname_in`) into an OpenRefine-ready CSV (`fname_out`)."
    # Pipeline order: transpose to long form, decode times, then attach the
    # sample-type and nuclide ids before trimming/renaming columns.
    callbacks = [TransposeNuclideColumns(),
                 ParseTimeCB(cfg()),
                 LookupSampleType(),
                 LookupNuclideIdCB(),
                 RenameColumnCB()]
    tfm = Transformer(netcdf4_to_df(fname_in), cbs=callbacks)
    encoder = OpenRefineCsvEncoder(tfm(), dest_fname=fname_out, **kwargs)
    encoder.encode()
    return encoder
Loading

0 comments on commit 76a46dd

Please sign in to comment.