Skip to content

Commit

Permalink
accept proposed change to not hardcode the '_' separator in the nc te…
Browse files Browse the repository at this point in the history
…mplater but rather define it in configs
  • Loading branch information
franckalbinet committed Jun 3, 2024
1 parent 803df95 commit 76a46dd
Show file tree
Hide file tree
Showing 10 changed files with 273 additions and 109 deletions.
37 changes: 37 additions & 0 deletions marisco/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,43 @@
'marisco/nc_template.py'),
'marisco.nc_template.NCTemplater.nuclide_vars': ( 'api/nc_template.html#nctemplater.nuclide_vars',
'marisco/nc_template.py')},
'marisco.netcdf_to_csv': { 'marisco.netcdf_to_csv.LookupNuclideIdCB': ( 'handlers/netcdf_to_csv.html#lookupnuclideidcb',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.LookupNuclideIdCB.__call__': ( 'handlers/netcdf_to_csv.html#__call__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.LookupNuclideIdCB.__init__': ( 'handlers/netcdf_to_csv.html#__init__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.LookupSampleType': ( 'handlers/netcdf_to_csv.html#lookupsampletype',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.LookupSampleType.__call__': ( 'handlers/netcdf_to_csv.html#__call__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.LookupSampleType.__init__': ( 'handlers/netcdf_to_csv.html#__init__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.ParseTimeCB': ( 'handlers/netcdf_to_csv.html#parsetimecb',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.ParseTimeCB.__call__': ( 'handlers/netcdf_to_csv.html#__call__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.ParseTimeCB.__init__': ( 'handlers/netcdf_to_csv.html#__init__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.ParseTimeCB.format_time': ( 'handlers/netcdf_to_csv.html#format_time',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.RenameColumnCB': ( 'handlers/netcdf_to_csv.html#renamecolumncb',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.RenameColumnCB.__call__': ( 'handlers/netcdf_to_csv.html#__call__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.RenameColumnCB.__init__': ( 'handlers/netcdf_to_csv.html#__init__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.TransposeNuclideColumns': ( 'handlers/netcdf_to_csv.html#transposenuclidecolumns',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.TransposeNuclideColumns.__call__': ( 'handlers/netcdf_to_csv.html#__call__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.TransposeNuclideColumns.__init__': ( 'handlers/netcdf_to_csv.html#__init__',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.TransposeNuclideColumns.transpose_nuclides': ( 'handlers/netcdf_to_csv.html#transpose_nuclides',
'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.encode': ('handlers/netcdf_to_csv.html#encode', 'marisco/netcdf_to_csv.py'),
'marisco.netcdf_to_csv.get_nucnames_lut': ( 'handlers/netcdf_to_csv.html#get_nucnames_lut',
'marisco/netcdf_to_csv.py')},
'marisco.serializers': { 'marisco.serializers.NetCDFEncoder': ('api/serializers.html#netcdfencoder', 'marisco/serializers.py'),
'marisco.serializers.NetCDFEncoder.__init__': ( 'api/serializers.html#__init__',
'marisco/serializers.py'),
Expand Down
20 changes: 10 additions & 10 deletions marisco/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,79 +240,79 @@ def cache_path(): return Path(cfg()['dirs']['cache'])
},
'suffixes': {
'uncertainty': {
'name': 'unc',
'name': '_unc',
'attrs': {
'long_name': ' uncertainty',
'standard_name': '_uncertainty'
},
'dtype': 'f4'
},
'detection_limit': {
'name': 'dl',
'name': '_dl',
'attrs': {
'long_name': ' detection limit',
'standard_name': '_detection_limit'
},
'dtype': 'dl_t'
},
'volume': {
'name': 'vol',
'name': '_vol',
'attrs': {
'long_name': ' volume',
'standard_name': '_volume'
},
'dtype': 'f4'
},
'salinity': {
'name': 'sal',
'name': '_sal',
'attrs': {
'long_name': ' salinity',
'standard_name': '_sal'
},
'dtype': 'f4'
},
'temperature': {
'name': 'temp',
'name': '_temp',
'attrs': {
'long_name': ' temperature',
'standard_name': '_temp'
},
'dtype': 'f4'
},
'filtered': {
'name': 'filt',
'name': '_filt',
'attrs': {
'long_name': ' filtered',
'standard_name': '_filtered'
},
'dtype': 'filt_t'
},
'counting_method': {
'name': 'counmet',
'name': '_counmet',
'attrs': {
'long_name': ' counting method',
'standard_name': '_counting_method'
},
'dtype': 'counmet_t'
},
'sampling_method': {
'name': 'sampmet',
'name': '_sampmet',
'attrs': {
'long_name': ' sampling method',
'standard_name': '_sampling_method'
},
'dtype': 'sampmet_t'
},
'preparation_method': {
'name': 'prepmet',
'name': '_prepmet',
'attrs': {
'long_name': ' preparation method',
'standard_name': '_preparation_method'
},
'dtype': 'prepmet_t'
},
'unit': {
'name': 'unit',
'name': '_unit',
'attrs': {
'long_name': ' unit',
'standard_name': '_unit'
Expand Down
3 changes: 2 additions & 1 deletion marisco/nc_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ def derive(
) -> dict: # Derived variable name and associated attributes
"Derive NetCDF nuclide-dependent variable names & attributes as defined in CDL."
return {
'name': nuclide['name'] + '_' + suffix['name'],
# 'name': nuclide['name'] + '_' + suffix['name'],
'name': nuclide['name'] + suffix['name'],
'dtype': suffix['dtype'], # Using dtype from suffix
'attrs': {key: nuclide['attrs'][key] + suffix['attrs'][key] for key in nuclide['attrs']}
}
Expand Down
152 changes: 152 additions & 0 deletions marisco/netcdf_to_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/handlers/netcdf_to_csv.ipynb.

# %% auto 0
__all__ = ['sample_cols_grp', 'sample_type_lut', 'coi_grp', 'renaming_rules', 'TransposeNuclideColumns', 'ParseTimeCB',
           'LookupSampleType', 'get_nucnames_lut', 'LookupNuclideIdCB', 'RenameColumnCB', 'encode']

# %% ../nbs/handlers/netcdf_to_csv.ipynb 15
# Define cols that are not nuclides
# Sample-metadata columns per sample-type group; every remaining column in a
# dataframe is treated as nuclide-related (activity, uncertainty, unit, detection limit).
sample_cols_grp = {'seawater': ['sample','lon', 'lat', 'depth', 'time'],
                   'sediment': ['sample', 'lon', 'lat', 'depth', 'time', 'sed_type'],
                   'biota': ['sample', 'lon', 'lat', 'depth', 'time', 'species_id', 'body_part']}

# %% ../nbs/handlers/netcdf_to_csv.ipynb 16
class TransposeNuclideColumns(Callback):
    "Transpose NetCDF nuclide data from wide (one column per nuclide/suffix) to long format."
    def __init__(self,
                 cols_grp=sample_cols_grp  # Non-nuclide (sample metadata) columns per sample-type group
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            tfm.dfs[grp] = self.transpose_nuclides(tfm.dfs[grp], grp)

    def transpose_nuclides(self, df, group):
        "Melt nuclide, uncertainty, unit and detection-limit columns into rows and merge them."
        sample_cols = self.cols_grp[group]
        measure_cols = list(set(df.columns) - set(sample_cols))
        # Classify columns by their *trailing* suffix. Using `endswith` instead of
        # substring membership (`'_unc' in x`) so a nuclide column whose name merely
        # contains '_unc'/'_unit'/'_dl' mid-name cannot be misclassified.
        unc_cols = [c for c in measure_cols if c.endswith('_unc')]
        unit_cols = [c for c in measure_cols if c.endswith('_unit')]
        dl_cols = [c for c in measure_cols if c.endswith('_dl')]
        nuclide_cols = list(set(measure_cols) - set(unc_cols + unit_cols + dl_cols))

        def _melt(cols, value_name, suffix=None):
            # Melt `cols` to long form; strip `suffix` so rows join on the bare nuclide name.
            out = pd.melt(frame=df, id_vars=sample_cols, value_vars=cols,
                          var_name='nuclide', value_name=value_name)
            if suffix:
                # Anchored at end of string: only the trailing suffix is removed,
                # never an accidental interior occurrence.
                out['nuclide'] = out['nuclide'].str.replace(f'{suffix}$', '', regex=True)
            return out

        # Combine activity, uncertainty, unit and detection-limit on sample keys + nuclide.
        combine_on = sample_cols + ['nuclide']
        result = _melt(nuclide_cols, 'activity')
        result = pd.merge(result, _melt(unc_cols, 'uncertainty', '_unc'), how='outer', on=combine_on)
        result = pd.merge(result, _melt(unit_cols, 'unit_id', '_unit'), how='outer', on=combine_on)
        result = pd.merge(result, _melt(dl_cols, 'dl', '_dl'), how='outer', on=combine_on)

        # Keep rows carrying at least one piece of information:
        # non-NaN activity/uncertainty/dl, or a non-zero unit id.
        keep = (result['activity'].notna() | result['uncertainty'].notna()
                | result['dl'].notna() | (result['unit_id'] != 0))
        return result[keep]

# %% ../nbs/handlers/netcdf_to_csv.ipynb 20
class ParseTimeCB(Callback):
    "Decode numeric NetCDF time values into Python datetimes using configured units."
    def __init__(self, cfg): fc.store_attr()

    def __call__(self, tfm):
        for grp, df in tfm.dfs.items():
            df['time'] = df['time'].apply(self.format_time)

    def format_time(self, x):
        "Convert one numeric time value `x` according to `cfg['units']['time']`."
        return num2pydate(x, units=self.cfg['units']['time'])

# %% ../nbs/handlers/netcdf_to_csv.ipynb 24
# %% ../nbs/handlers/netcdf_to_csv.ipynb 24
# Define sample types groups
# MARIS `samptype_id` codes keyed by sample-type group name.
sample_type_lut = {'seawater': 1,
                   'sediment': 2,
                   'biota': 3,
                   'suspended matter': 4}

# %% ../nbs/handlers/netcdf_to_csv.ipynb 25
class LookupSampleType(Callback):
    "Attach the MARIS sample-type id (`samptype_id`) to every dataframe."
    def __init__(self, lut=sample_type_lut): fc.store_attr()

    def __call__(self, tfm):
        for grp, df in tfm.dfs.items():
            df['samptype_id'] = self.lut[grp]


# %% ../nbs/handlers/netcdf_to_csv.ipynb 29
def get_nucnames_lut():
    "Return a {nc_name: nuclide_id} lookup read from the `dbo_nuclide.xlsx` spreadsheet."
    df_nuclide = pd.read_excel(lut_path() / 'dbo_nuclide.xlsx',
                               usecols=['nuclide_id', 'nc_name'])
    return df_nuclide.set_index('nc_name')['nuclide_id'].to_dict()

# %% ../nbs/handlers/netcdf_to_csv.ipynb 30
class LookupNuclideIdCB(Callback):
    "Lookup MARIS nuclide_id."
    def __init__(self,
                 fn_lut=get_nucnames_lut  # Factory returning a {nc_name: nuclide_id} dict
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for grp, df in tfm.dfs.items():
            # `replace` maps known nuclide names to their numeric id;
            # the cast to int64 normalizes the resulting column dtype.
            df['nuclide_id'] = df['nuclide'].replace(lut).astype('int64')

# %% ../nbs/handlers/netcdf_to_csv.ipynb 39
# %% ../nbs/handlers/netcdf_to_csv.ipynb 39
# Define columns of interest by sample type
# Columns retained (in order) for each sample-type group before renaming.
coi_grp = {'seawater': ['sample', 'lon', 'lat', 'depth', 'time', 'nuclide', 'activity',
                        'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id'],
           'sediment' : ['sample', 'lon', 'lat', 'depth', 'time', 'sed_type', 'nuclide',
                         'activity', 'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id'],
           'biota' : ['sample', 'lon', 'lat', 'depth', 'time', 'species_id', 'body_part',
                      'nuclide', 'activity', 'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id']}

# %% ../nbs/handlers/netcdf_to_csv.ipynb 40
# Define column names renaming rules
# Maps NetCDF-side column names to MARIS/OpenRefine export names.
# NOTE(review): 'nuclide' is renamed to 'nuclide_id' while coi_grp already keeps a
# 'nuclide_id' column, so the renamed frame ends up with two 'nuclide_id' columns —
# confirm this duplication is intended by the OpenRefine CSV format.
renaming_rules = {
    'lat': 'latitude',
    'lon': 'longitude',
    'time': 'begperiod',
    'depth': 'sampdepth',
    'nuclide': 'nuclide_id',
    'uncertainty': 'uncertaint',
    'dl': 'detection',
    'sed_type': 'sedtype_id (0)',
    'species_id': 'species_id (0)',
    'body_part': 'bodypar_id',
    }

# %% ../nbs/handlers/netcdf_to_csv.ipynb 41
class RenameColumnCB(Callback):
    "Keep only the columns of interest per group and rename them to export names."
    def __init__(self,
                 coi=coi_grp,
                 renaming_rules=renaming_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for grp in tfm.dfs:
            # Select cols of interest, then apply the renaming rules.
            subset = tfm.dfs[grp].loc[:, self.coi[grp]]
            tfm.dfs[grp] = subset.rename(columns=self.renaming_rules)

# %% ../nbs/handlers/netcdf_to_csv.ipynb 45
def encode(fname_in, fname_out, **kwargs):
    "Convert a MARIS NetCDF file (`fname_in`) into an OpenRefine-ready CSV (`fname_out`)."
    # Pipeline order: transpose to long form, decode times, then attach the
    # sample-type and nuclide ids before trimming/renaming columns.
    callbacks = [TransposeNuclideColumns(),
                 ParseTimeCB(cfg()),
                 LookupSampleType(),
                 LookupNuclideIdCB(),
                 RenameColumnCB()]
    tfm = Transformer(netcdf4_to_df(fname_in), cbs=callbacks)
    encoder = OpenRefineCsvEncoder(tfm(), dest_fname=fname_out, **kwargs)
    encoder.encode()
    return encoder
Loading

0 comments on commit 76a46dd

Please sign in to comment.