-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Biomarkers transform for ModelAD #148
base: dev
Are you sure you want to change the base?
Changes from all commits
f348d68
2d43820
b19a529
84355f7
1297757
537605c
597bfca
b796474
d6a7d19
46feee2
4ac1f23
bb8cb5d
1a09560
2e4c792
200f068
dd8d422
3fae0ae
f75638b
8016df4
5e3dfeb
8ee844d
8ca5cc9
44cbdb2
653bede
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
destination: &dest syn51498092 | ||
staging_path: ./staging | ||
gx_folder: none | ||
gx_table: none | ||
datasets: | ||
- biomarkers: | ||
files: | ||
- name: biomarkers | ||
id: syn61250724.1 | ||
format: csv | ||
final_format: json | ||
provenance: | ||
- syn61250724.1 | ||
destination: *dest | ||
custom_transformations: 1 | ||
column_rename: | ||
agedeath: ageDeath |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
""" | ||
This module contains the transformation logic for the biomarkers dataset. | ||
This is for the Model AD project. | ||
""" | ||
|
||
import pandas as pd | ||
from typing import Dict, List, Any | ||
|
||
|
||
def transform_biomarkers(datasets: Dict[str, pd.DataFrame]) -> List[Dict[str, Any]]:
    """
    Takes dictionary of dataset DataFrames, extracts the biomarkers
    DataFrame, and transforms it into a list of dictionaries grouped by
    'model', 'type', 'ageDeath', 'tissue', and 'units'.

    Each returned dictionary holds the five grouping values plus a "points"
    list with one {"genotype", "measurement", "sex"} dictionary per row of
    that group, in the row order of the input DataFrame.

    Args:
        datasets (Dict[str, pd.DataFrame]): dictionary of dataset names mapped to their DataFrame

    Returns:
        List[Dict[str, Any]]: a list of dictionaries containing biomarker data modeled after intended final JSON structure

    Raises:
        KeyError: if `datasets` has no "biomarkers" entry.
        TypeError: if the "biomarkers" entry is not a pd.DataFrame.
        ValueError: if any of the required columns is missing.
    """
    group_columns = ["model", "type", "ageDeath", "tissue", "units"]
    point_columns = ["genotype", "measurement", "sex"]

    biomarkers_dataset = datasets["biomarkers"]

    # Check that the dataset looks like what we expect
    if not isinstance(biomarkers_dataset, pd.DataFrame):
        raise TypeError(
            f"Expected pd.DataFrame for Biomarker dataset but received {type(biomarkers_dataset)}."
        )

    # list.sort() sorts in place and returns None, so comparing the results of
    # two .sort() calls is always "None == None" and can never detect a bad
    # schema. Compare column *sets* instead, and only require that the needed
    # columns are present: extra columns are harmless and simply ignored.
    missing_columns = set(group_columns + point_columns).difference(
        biomarkers_dataset.columns
    )
    if missing_columns:
        raise ValueError(
            f"Biomarker dataset does not contain expected columns. Columns found: {list(biomarkers_dataset.columns)}"
        )

    data_as_list = []
    grouped = biomarkers_dataset.groupby(group_columns)

    for (model, type_, ageDeath, tissue, units), group in grouped:
        # One output entry per unique combination of the grouping columns;
        # every row of the group contributes one point.
        entry = {
            "model": model,
            "type": type_,
            "ageDeath": ageDeath,
            "tissue": tissue,
            "units": units,
            "points": [
                {column: row[column] for column in point_columns}
                for _, row in group.iterrows()
            ],
        }
        data_as_list.append(entry)

    return data_as_list
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,6 +59,8 @@ def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: | |
if dataset_name in ["proteomics", "proteomics_tmt", "proteomics_srm"]: | ||
df = datasets[dataset_name] | ||
return transform.transform_proteomics(df=df) | ||
if dataset_name == "biomarkers": | ||
return transform.transform_biomarkers(datasets=datasets) | ||
else: | ||
return None | ||
|
||
|
@@ -123,12 +125,22 @@ def process_dataset( | |
staging_path=staging_path, | ||
filename=dataset_name + "." + dataset_obj[dataset_name]["final_format"], | ||
) | ||
else: | ||
elif isinstance(df, list): | ||
json_path = load.list_to_json( | ||
df=df, | ||
staging_path=staging_path, | ||
filename=dataset_name + "." + dataset_obj[dataset_name]["final_format"], | ||
) | ||
elif isinstance(df, DataFrame): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @BWMac here are the changes mentioned in the PR description. What do you think about them? |
||
json_path = load.df_to_json( | ||
df=df, | ||
staging_path=staging_path, | ||
filename=dataset_name + "." + dataset_obj[dataset_name]["final_format"], | ||
) | ||
else: | ||
raise ADTDataProcessingError( | ||
f"Data processing failed for {dataset_name}. Data is of type {type(df)}. Supported data types are: dict, list, pd.DataFrame." | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do we think about this error handling? Is it necessary? Since we control all of the outputs, maybe we don't need it and can keep using the `else` branch. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll let Brad or others talk about the necessity of this, my comment is on the message itself. If I saw this message "in a vacuum" I would ask myself "What WAS the Dataframe type when the exception was raised?" Could that information be added to the exception? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Really great point! I updated the error message, let me know if you think I should make any more changes here. |
||
|
||
gx_enabled = dataset_obj[dataset_name].get("gx_enabled", False) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
model,type,measurement,units,ageDeath,tissue,sex,genotype | ||
ModelA,TypeA,1,A,1,TissueA,male,genotype1 | ||
ModelA,TypeA,1,A,1,TissueA,male,genotype1 | ||
ModelA,TypeA,1,A,1,TissueA,male,genotype2 | ||
ModelA,TypeA,1,A,1,TissueA,male,genotype2 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
genotype,measurement,sex,model,type,ageDeath,tissue,units | ||
3xTG-AD,1.887403509,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,2.507680422,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,3.095873882,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,3.639017544,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,0.74988673,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,11.98119457,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,6.393679577,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,3.098566667,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,3.452296527,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,2.644899123,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,0.110659787,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,1.899942325,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,2.397684799,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,6.048395918,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,5.886637838,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,3.368940156,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,0.234726268,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,6.193847515,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,7.122192076,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg | ||
3xTG-AD,4.853281065,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,37.12325,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,15.62036898,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,37.12734722,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,75.55092784,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,63.56345,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,60.22938,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,23.23818056,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,93.87042718,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,107.5141702,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,130.5238413,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,13.75984343,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,34.07885294,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,80.33369231,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,84.89822857,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,92.64340206,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,133.3285882,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,130.4151077,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg | ||
3xTG-AD,255.0758333,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
model,type,measurement,units,ageDeath,tissue,sex,genotype | ||
ModelA,TypeA,1,A,1,TissueA,male,genotype1 | ||
ModelA,TypeA,2,A,1,TissueA,male,genotype1 | ||
ModelA,TypeA,3,A,2,TissueA,male,genotype2 | ||
ModelA,TypeB,4,A,2,TissueA,male,genotype1 | ||
ModelA,TypeB,5,A,3,TissueA,male,genotype1 | ||
ModelA,TypeB,6,A,3,TissueA,male,genotype2 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
[ | ||
{ | ||
"model": "ModelA", | ||
"type": "TypeA", | ||
"ageDeath": 1, | ||
"tissue": "TissueA", | ||
"units": "A", | ||
"points": [ | ||
{ | ||
"genotype": "genotype1", | ||
"measurement": 1, | ||
"sex": "male" | ||
}, | ||
{ | ||
"genotype": "genotype1", | ||
"measurement": 1, | ||
"sex": "male" | ||
}, | ||
{ | ||
"genotype": "genotype2", | ||
"measurement": 1, | ||
"sex": "male" | ||
}, | ||
{ | ||
"genotype": "genotype2", | ||
"measurement": 1, | ||
"sex": "male" | ||
} | ||
] | ||
} | ||
] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See my comment on your test function, this error check probably isn't necessary.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I was thinking about this earlier. The type hints should catch this :)
I'll remove it. Thank you for the validation!