-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Accept proposed change to not hardcode the '_' separator in the nc templater, but rather define it in configs.
- Loading branch information
1 parent
803df95
commit 76a46dd
Showing
10 changed files
with
273 additions
and
109 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/handlers/netcdf_to_csv.ipynb. | ||
|
||
# %% auto 0 | ||
__all__ = ['sample_cols_grp', 'sample_type_lut', 'coi_grp', 'renaming_rules', 'TransposeNuclideColumns', 'ParseTimeCB', | ||
'LookupSampleType', 'get_nucnames_lut', 'LookupNuclideIdCB', 'RenameColumnCB', 'encode'] | ||
|
||
# %% ../nbs/handlers/netcdf_to_csv.ipynb 15 | ||
# Define cols that are not nuclides | ||
# Columns that identify a sample (i.e. are NOT nuclide measurements), per group.
# All groups share the same positional prefix; sediment and biota add extra ids.
_sample_base_cols = ['sample', 'lon', 'lat', 'depth', 'time']
sample_cols_grp = {
    'seawater': list(_sample_base_cols),
    'sediment': _sample_base_cols + ['sed_type'],
    'biota': _sample_base_cols + ['species_id', 'body_part'],
}
|
||
# %% ../nbs/handlers/netcdf_to_csv.ipynb 16 | ||
class TransposeNuclideColumns(Callback):
    "Transpose NetCDF nuclide data from wide (one column per nuclide/metric) to long format."
    def __init__(self, cols_grp=sample_cols_grp): fc.store_attr()

    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            tfm.dfs[grp] = self.transpose_nuclides(tfm.dfs[grp], grp)

    def _melt(self, df, id_vars, value_vars, value_name, suffix=''):
        "Melt `value_vars` into (`nuclide`, `value_name`) pairs, stripping `suffix` from names."
        melted = pd.melt(frame=df, id_vars=id_vars, value_vars=value_vars,
                         var_name='nuclide', value_name=value_name)
        if suffix:
            # Anchor at end-of-string so only the trailing suffix is removed,
            # never an identical substring inside the nuclide name.
            melted['nuclide'] = melted['nuclide'].str.replace(f'{suffix}$', '', regex=True)
        return melted

    def transpose_nuclides(self, df, group):
        "Return `df` reshaped to one row per (sample, nuclide), dropping all-empty rows."
        sample_cols = self.cols_grp[group]
        measure_cols = list(set(df.columns) - set(sample_cols))
        # Classify columns by suffix. `endswith` is used instead of substring
        # containment, which would misclassify a nuclide whose name merely
        # contains '_unc'/'_unit'/'_dl'.
        unc_cols = [c for c in measure_cols if c.endswith('_unc')]
        unit_cols = [c for c in measure_cols if c.endswith('_unit')]
        dl_cols = [c for c in measure_cols if c.endswith('_dl')]
        nuclide_cols = list(set(measure_cols) - set(unc_cols + unit_cols + dl_cols))

        # One long table per metric, all keyed by (sample cols, nuclide).
        nuclide_df = self._melt(df, sample_cols, nuclide_cols, 'activity')
        unc_df = self._melt(df, sample_cols, unc_cols, 'uncertainty', '_unc')
        unit_df = self._melt(df, sample_cols, unit_cols, 'unit_id', '_unit')
        dl_df = self._melt(df, sample_cols, dl_cols, 'dl', '_dl')

        # Combine activity, uncertainty, unit and detection-limit tables.
        combine_on = sample_cols + ['nuclide']
        df = nuclide_df
        for other in (unc_df, unit_df, dl_df):
            df = pd.merge(df, other, how='outer', on=combine_on)

        # Keep rows where at least one of activity/uncertainty/dl is present
        # or the unit id is non-zero (i.e. the row carries some information).
        mask = (df['activity'].notna() | df['uncertainty'].notna()
                | df['dl'].notna() | (df['unit_id'] != 0))
        return df[mask]
|
||
# %% ../nbs/handlers/netcdf_to_csv.ipynb 20 | ||
class ParseTimeCB(Callback):
    "Decode the numeric `time` column into Python datetime objects."
    def __init__(self, cfg): fc.store_attr()

    def __call__(self, tfm):
        for key in tfm.dfs:
            tfm.dfs[key]['time'] = tfm.dfs[key]['time'].apply(self.format_time)

    def format_time(self, x):
        "Convert one numeric timestamp using the time units declared in the config."
        return num2pydate(x, units=self.cfg['units']['time'])
|
||
# %% ../nbs/handlers/netcdf_to_csv.ipynb 24 | ||
# Define sample types groups | ||
# MARIS sample-type ids, keyed by group name (ids are 1-based and ordered).
sample_type_lut = {
    name: type_id
    for type_id, name in enumerate(
        ['seawater', 'sediment', 'biota', 'suspended matter'], start=1)
}
|
||
# %% ../nbs/handlers/netcdf_to_csv.ipynb 25 | ||
class LookupSampleType(Callback):
    "Attach the MARIS `samptype_id` matching each group's sample type."
    def __init__(self, lut=sample_type_lut): fc.store_attr()

    def __call__(self, tfm):
        for group, df in tfm.dfs.items():
            # Group name (e.g. 'seawater') keys directly into the lut.
            df['samptype_id'] = self.lut[group]
|
||
|
||
# %% ../nbs/handlers/netcdf_to_csv.ipynb 29 | ||
def get_nucnames_lut():
    "Load the NetCDF-name -> MARIS `nuclide_id` lookup from the dbo_nuclide worksheet."
    fname = lut_path() / 'dbo_nuclide.xlsx'
    lut = pd.read_excel(fname, usecols=['nuclide_id', 'nc_name'])
    # Equivalent to set_index('nc_name').to_dict()['nuclide_id']:
    # on duplicate names the last row wins in both forms.
    return dict(zip(lut['nc_name'], lut['nuclide_id']))
|
||
# %% ../nbs/handlers/netcdf_to_csv.ipynb 30 | ||
class LookupNuclideIdCB(Callback):
    "Lookup MARIS nuclide_id."
    def __init__(self,
                 fn_lut=get_nucnames_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for key, df in tfm.dfs.items():
            # `replace` leaves names absent from the lut untouched; such
            # stragglers will then fail the int64 cast loudly rather than
            # being silently dropped.
            df['nuclide_id'] = df['nuclide'].replace(lut).astype('int64')
|
||
# %% ../nbs/handlers/netcdf_to_csv.ipynb 39 | ||
# Define columns of interest by sample type | ||
# Output columns of interest per sample type; order defines the export order.
# Every group shares the same head (sample identity) and tail (measurement),
# with group-specific id columns spliced in between.
_coi_head = ['sample', 'lon', 'lat', 'depth', 'time']
_coi_tail = ['nuclide', 'activity', 'uncertainty', 'unit_id', 'dl',
             'samptype_id', 'nuclide_id']
coi_grp = {
    'seawater': _coi_head + _coi_tail,
    'sediment': _coi_head + ['sed_type'] + _coi_tail,
    'biota': _coi_head + ['species_id', 'body_part'] + _coi_tail,
}
|
||
# %% ../nbs/handlers/netcdf_to_csv.ipynb 40 | ||
# Define column names renaming rules | ||
# NetCDF -> MARIS/OpenRefine column-name mapping applied just before export.
# Values with a trailing "(0)" are OpenRefine column labels as-is.
renaming_rules = dict(
    lat='latitude',
    lon='longitude',
    time='begperiod',
    depth='sampdepth',
    nuclide='nuclide_id',
    uncertainty='uncertaint',
    dl='detection',
    sed_type='sedtype_id (0)',
    species_id='species_id (0)',
    body_part='bodypar_id',
)
|
||
# %% ../nbs/handlers/netcdf_to_csv.ipynb 41 | ||
class RenameColumnCB(Callback):
    "Restrict each DataFrame to its columns of interest and rename them for export."
    def __init__(self,
                 coi=coi_grp,
                 renaming_rules=renaming_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for key in tfm.dfs:
            # Select the columns of interest for this sample type,
            # then apply the renaming rules on the resulting copy.
            subset = tfm.dfs[key].loc[:, self.coi[key]]
            tfm.dfs[key] = subset.rename(columns=self.renaming_rules)
|
||
# %% ../nbs/handlers/netcdf_to_csv.ipynb 45 | ||
def encode(fname_in, fname_out, **kwargs):
    "Convert the MARIS NetCDF file `fname_in` into an OpenRefine-ready CSV at `fname_out`."
    dfs = netcdf4_to_df(fname_in)
    # Pipeline order matters: transpose first, then enrich, rename last.
    callbacks = [
        TransposeNuclideColumns(),
        ParseTimeCB(cfg()),
        LookupSampleType(),
        LookupNuclideIdCB(),
        RenameColumnCB(),
    ]
    tfm = Transformer(dfs, cbs=callbacks)
    encoder = OpenRefineCsvEncoder(tfm(), dest_fname=fname_out, **kwargs)
    encoder.encode()
    return encoder
Oops, something went wrong.