Skip to content

Commit

Permalink
add callback to remove nan values in data provider submitted dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
franckalbinet committed Sep 24, 2024
1 parent 44e482a commit 5119a32
Show file tree
Hide file tree
Showing 4 changed files with 655 additions and 631 deletions.
6 changes: 6 additions & 0 deletions marisco/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@
'marisco/callbacks.py'),
'marisco.callbacks.LowerStripNameCB._safe_transform': ( 'api/callbacks.html#lowerstripnamecb._safe_transform',
'marisco/callbacks.py'),
'marisco.callbacks.RemoveAllNAValuesCB': ( 'api/callbacks.html#removeallnavaluescb',
'marisco/callbacks.py'),
'marisco.callbacks.RemoveAllNAValuesCB.__call__': ( 'api/callbacks.html#removeallnavaluescb.__call__',
'marisco/callbacks.py'),
'marisco.callbacks.RemoveAllNAValuesCB.__init__': ( 'api/callbacks.html#removeallnavaluescb.__init__',
'marisco/callbacks.py'),
'marisco.callbacks.ReshapeLongToWide': ('api/callbacks.html#reshapelongtowide', 'marisco/callbacks.py'),
'marisco.callbacks.ReshapeLongToWide.__call__': ( 'api/callbacks.html#reshapelongtowide.__call__',
'marisco/callbacks.py'),
Expand Down
22 changes: 18 additions & 4 deletions marisco/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# %% auto 0
__all__ = ['Callback', 'run_cbs', 'Transformer', 'SanitizeLonLatCB', 'AddSampleTypeIdColumnCB', 'AddNuclideIdColumnCB',
'LowerStripNameCB', 'ReshapeLongToWide', 'CompareDfsAndTfmCB', 'EncodeTimeCB']
'LowerStripNameCB', 'RemoveAllNAValuesCB', 'ReshapeLongToWide', 'CompareDfsAndTfmCB', 'EncodeTimeCB']

# %% ../nbs/api/callbacks.ipynb 2
import copy
Expand Down Expand Up @@ -128,7 +128,21 @@ def __call__(self, tfm):
for key in tfm.dfs.keys():
tfm.dfs[key][self.col_dst] = tfm.dfs[key][self.col_src].apply(self._safe_transform)

# %% ../nbs/api/callbacks.ipynb 32
# %% ../nbs/api/callbacks.ipynb 31
class RemoveAllNAValuesCB(Callback):
    "Remove, from every dataframe in `tfm.dfs`, the rows whose checked columns are all NA."
    def __init__(self,
                 cols_to_check:dict # Maps each key of `tfm.dfs` to the column name (str) or list of column names to check
                ):
        fc.store_attr()

    def __call__(self, tfm):
        "Replace each `tfm.dfs[grp]` by a copy without the rows that are null in all checked columns."
        for grp in tfm.dfs.keys():
            # Plain lookup on purpose: with a `dict`, a group that has no entry
            # in `cols_to_check` raises `KeyError` rather than being skipped.
            cols = self.cols_to_check[grp]
            # A bare column name would select a Series, on which `.all(axis=1)`
            # is invalid; wrap it so the selection is always a DataFrame.
            if isinstance(cols, str):
                cols = [cols]
            df = tfm.dfs[grp]
            # True for the rows where every one of the checked columns is null.
            all_na = df[cols].isnull().all(axis=1)
            tfm.dfs[grp] = df[~all_na]

# %% ../nbs/api/callbacks.ipynb 33
class ReshapeLongToWide(Callback):
def __init__(self, columns=['nuclide'], values=['value'],
num_fill_value=-999, str_fill_value='STR FILL VALUE'):
Expand Down Expand Up @@ -182,7 +196,7 @@ def __call__(self, tfm):
tfm.dfs[grp] = self.pivot(tfm.dfs[grp])
tfm.dfs[grp].columns = self.renamed_cols(tfm.dfs[grp].columns)

# %% ../nbs/api/callbacks.ipynb 34
# %% ../nbs/api/callbacks.ipynb 35
class CompareDfsAndTfmCB(Callback):
def __init__(self, dfs: Dict[str, pd.DataFrame]):
"Create a dataframe of dropped data. Data included in the `dfs` not in the `tfm`."
Expand Down Expand Up @@ -219,7 +233,7 @@ def _compute_stats(self,
'Number of rows in tfm.dfs + Number of dropped rows': len(tfm.dfs[grp].index) + len(tfm.dfs_dropped[grp].index)
}

# %% ../nbs/api/callbacks.ipynb 39
# %% ../nbs/api/callbacks.ipynb 40
class EncodeTimeCB(Callback):
"Encode time as `int` representing seconds since xxx"
def __init__(self, cfg , verbose=False): fc.store_attr()
Expand Down
22 changes: 22 additions & 0 deletions nbs/api/callbacks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,28 @@
"## Change structure"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "701be72f",
"metadata": {},
"outputs": [],
"source": [
"#| exports\n",
"class RemoveAllNAValuesCB(Callback):\n",
" \"Remove rows with all NA values.\"\n",
" def __init__(self, \n",
" cols_to_check:dict # A dictionary with the sample type as key and the column name to check as value\n",
" ):\n",
" fc.store_attr()\n",
"\n",
" def __call__(self, tfm):\n",
" for k in tfm.dfs.keys():\n",
" col_to_check = self.cols_to_check[k]\n",
" mask = tfm.dfs[k][col_to_check].isnull().all(axis=1)\n",
" tfm.dfs[k] = tfm.dfs[k][~mask]"
]
},
{
"cell_type": "markdown",
"id": "3a32c3bc",
Expand Down
Loading

0 comments on commit 5119a32

Please sign in to comment.