Biomarkers transform for ModelAD #148

Draft: wants to merge 24 commits into base: dev

Commits (24)
f348d68
Added biomarker files and functions to necessary locations, none are …
Sep 18, 2024
2d43820
Added biomarker transform for the Model-AD project. The transform out…
Sep 18, 2024
b19a529
Biomarkers input and output test files
Sep 19, 2024
84355f7
Added tests for biomarkers
Sep 19, 2024
1297757
Ran black formatter
Sep 20, 2024
537605c
Biomarkers test passes when it should
Sep 20, 2024
597bfca
Biomarkers transform working, need to remove custom_transform from yaml
Sep 23, 2024
b796474
Correct use of the custom_transformations parameter in yaml config file
Sep 23, 2024
d6a7d19
Added fake test data made by hand for testing biomarkers transform
Sep 24, 2024
46feee2
Added testing for duplicate data
Sep 25, 2024
4ac1f23
Formatting with black
Sep 25, 2024
bb8cb5d
Addressing PR comment about process_dataset() error message.
Sep 26, 2024
1a09560
Reformatting process.py
Sep 26, 2024
2e4c792
Addressing PR comment about TypeError for biomarkers dataset.
Sep 26, 2024
200f068
Addressing PR comment: Improved docstring and typing for the transfor…
Sep 26, 2024
dd8d422
PR comment: Reverting back to using standard typing hints to prevent …
Sep 26, 2024
3fae0ae
Removed unused import that caused pre-commit to fail.
Sep 26, 2024
f75638b
Removed unnecessary formatting from ADTDataProcessingError message.
Sep 26, 2024
8016df4
Using typing library to add more specific type hints to the transform…
Sep 26, 2024
5e3dfeb
PR comment - using preferred context managed open for converting a li…
Sep 26, 2024
8ee844d
Reverting change to see if it fixes CI: pre-commit fail
Sep 27, 2024
8ca5cc9
Maybe now the CI pre-commit will pass?
Sep 27, 2024
44cbdb2
What about now? Will the CI pre-commit pass now?
Sep 27, 2024
653bede
I think the problem was just formatting
Sep 27, 2024
17 changes: 17 additions & 0 deletions modelad_test_config.yaml
@@ -0,0 +1,17 @@
destination: &dest syn51498092
staging_path: ./staging
gx_folder: none
gx_table: none
datasets:
  - biomarkers:
      files:
        - name: biomarkers
          id: syn61250724.1
          format: csv
      final_format: json
      provenance:
        - syn61250724.1
      destination: *dest
      custom_transformations: 1
      column_rename:
        agedeath: ageDeath
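For reference, &dest / *dest is a standard YAML anchor/alias pair, so the biomarkers entry's destination resolves to syn51498092. A quick sketch of checking this with PyYAML (assuming the nesting shown above):

import yaml

with open("modelad_test_config.yaml") as f:
    config = yaml.safe_load(f)

# The *dest alias resolves to the value of the top-level destination anchor
assert config["datasets"][0]["biomarkers"]["destination"] == "syn51498092"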
19 changes: 19 additions & 0 deletions src/agoradatatools/etl/load.py
@@ -5,6 +5,8 @@
import pandas as pd
from synapseclient import Activity, File, Synapse

from typing import Dict, List, Any


class NumpyEncoder(json.JSONEncoder):
"""Special json encoder for numpy types"""
@@ -160,3 +162,20 @@ def dict_to_json(df: dict, staging_path: str, filename: str) -> str:
    json.dump(df_as_dict, temp_json, cls=NumpyEncoder, indent=2)
    temp_json.close()
    return temp_json.name


def list_to_json(df: List[Dict[str, Any]], staging_path: str, filename: str) -> str:
"""Converts a list into a JSON file.

Args:
df (list): List to be converted to a JSON file
staging_path (str): Path to staging directory
filename (str): name of JSON file to be created

Returns:
str: Returns a string containing the name of the new JSON file
"""

with open(os.path.join(staging_path, filename), "w+") as temp_json:
json.dump(df, temp_json, cls=NumpyEncoder, indent=2)
return temp_json.name
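
A minimal usage sketch for the new loader (hypothetical values; assumes the staging directory already exists):

from agoradatatools.etl.load import list_to_json

# Write a list of grouped biomarker records to staging/biomarkers.json
records = [{"model": "ModelA", "type": "TypeA", "points": []}]
json_path = list_to_json(df=records, staging_path="./staging", filename="biomarkers.json")
# json_path == "./staging/biomarkers.json"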
2 changes: 2 additions & 0 deletions src/agoradatatools/etl/transform/__init__.py
@@ -16,6 +16,7 @@
)
from agoradatatools.etl.transform.team_info import transform_team_info
from agoradatatools.etl.transform.proteomics import transform_proteomics
from agoradatatools.etl.transform.biomarkers import transform_biomarkers

__all__ = [
"transform_distribution_data",
Expand All @@ -28,4 +29,5 @@
"transform_rnaseq_differential_expression",
"transform_team_info",
"transform_proteomics",
"transform_biomarkers",
]
74 changes: 74 additions & 0 deletions src/agoradatatools/etl/transform/biomarkers.py
@@ -0,0 +1,74 @@
"""
This module contains the transformation logic for the biomarkers dataset.
This is for the Model AD project.
"""

import pandas as pd
from typing import Dict, List, Any


def transform_biomarkers(datasets: Dict[str, pd.DataFrame]) -> List[Dict[str, Any]]:
"""
Takes dictionary of dataset DataFrames, extracts the biomarkers
DataFrame, and transforms it into a list of dictionaries grouped by
'model', 'type', 'ageDeath', 'tissue', and 'units'.

Args:
datasets (Dict[str, pd.DataFrame]): dictionary of dataset names mapped to their DataFrame

Returns:
List[Dict[str, Any]]: a list of dictionaries containing biomarker data modeled after intended final JSON structure
"""
biomarkers_dataset = datasets["biomarkers"]

# Check that the dataset looks like what we expect
if not isinstance(biomarkers_dataset, pd.DataFrame):
        raise TypeError(
            f"Expected pd.DataFrame for Biomarker dataset but received {type(biomarkers_dataset)}."
        )

Contributor: See my comment on your test function, this error check probably isn't necessary.

@beatrizsaldana (Member, Author), Sep 26, 2024: Yes, I was thinking about this earlier. The type hints should catch this :) I'll remove it. Thank you for the validation!
    if sorted(biomarkers_dataset.columns) != sorted(
        [
            "model",
            "type",
            "ageDeath",
            "tissue",
            "units",
            "genotype",
            "measurement",
            "sex",
        ]
    ):
        raise ValueError(
            f"Biomarker dataset does not contain expected columns. Columns found: {list(biomarkers_dataset.columns)}"
        )

Contributor: It might be worth changing this check from == to checking that biomarkers contains those columns, so that the data set has to have those columns in it but can have extra columns we don't care about.

@beatrizsaldana (Member, Author): So true! I was trying to be strict with the error handling, but if there is a possibility for extra columns that we could just ignore, then I'll use isin or something like that instead of the ==. Thank you!
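
One way the suggested containment check could look (a sketch only, not part of this diff; it uses a set difference rather than isin):

    # Require the expected columns but let extra columns pass through unflagged
    expected_columns = {
        "model", "type", "ageDeath", "tissue", "units",
        "genotype", "measurement", "sex",
    }
    missing_columns = expected_columns - set(biomarkers_dataset.columns)
    if missing_columns:
        raise ValueError(
            f"Biomarker dataset is missing expected columns: {sorted(missing_columns)}. "
            f"Columns found: {list(biomarkers_dataset.columns)}"
        )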

    data_as_list = []
    grouped = biomarkers_dataset.groupby(
        ["model", "type", "ageDeath", "tissue", "units"]
    )

    for (model, type_, ageDeath, tissue, units), group in grouped:
        # Create the base structure for each group
        entry = {
            "model": model,
            "type": type_,
            "ageDeath": ageDeath,
            "tissue": tissue,
            "units": units,
            "points": [],
        }

        # Append the measurement, genotype, and sex for each row
        for _, row in group.iterrows():
            point = {
                "genotype": row["genotype"],
                "measurement": row["measurement"],
                "sex": row["sex"],
            }
            entry["points"].append(point)

        # Add the entry to the list
        data_as_list.append(entry)

    return data_as_list
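
A minimal usage sketch for the transform (hypothetical rows mirroring the hand-made test data):

import pandas as pd
from agoradatatools.etl.transform import transform_biomarkers

# Rows sharing the same model/type/ageDeath/tissue/units collapse into one entry with multiple points
df = pd.DataFrame(
    {
        "model": ["ModelA", "ModelA"],
        "type": ["TypeA", "TypeA"],
        "ageDeath": [1, 1],
        "tissue": ["TissueA", "TissueA"],
        "units": ["A", "A"],
        "genotype": ["genotype1", "genotype2"],
        "measurement": [1, 2],
        "sex": ["male", "male"],
    }
)
result = transform_biomarkers(datasets={"biomarkers": df})
# result[0]["points"] -> two dicts, one per input row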
14 changes: 13 additions & 1 deletion src/agoradatatools/process.py
@@ -59,6 +59,8 @@ def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj:
if dataset_name in ["proteomics", "proteomics_tmt", "proteomics_srm"]:
df = datasets[dataset_name]
return transform.transform_proteomics(df=df)
if dataset_name == "biomarkers":
return transform.transform_biomarkers(datasets=datasets)
else:
return None

@@ -123,12 +125,22 @@ def process_dataset(
            staging_path=staging_path,
            filename=dataset_name + "." + dataset_obj[dataset_name]["final_format"],
        )
-   else:
+   elif isinstance(df, list):
        json_path = load.list_to_json(
            df=df,
            staging_path=staging_path,
            filename=dataset_name + "." + dataset_obj[dataset_name]["final_format"],
        )
    elif isinstance(df, DataFrame):
        json_path = load.df_to_json(
            df=df,
            staging_path=staging_path,
            filename=dataset_name + "." + dataset_obj[dataset_name]["final_format"],
        )
    else:
        raise ADTDataProcessingError(
            f"Data processing failed for {dataset_name}. Data is of type {type(df)}. Supported data types are: dict, list, pd.DataFrame."
        )

@beatrizsaldana (Member, Author): @BWMac here are the changes mentioned in the PR description. What do you think about them?
@beatrizsaldana (Member, Author): What do we think about this error handling? Is it necessary? Since we control all of the outputs, maybe we don't need it and can keep using the else: for the case where df is a pd.DataFrame.

Contributor: I'll let Brad or others talk about the necessity of this; my comment is on the message itself. If I saw this message "in a vacuum" I would ask myself "What WAS the DataFrame type when the exception was raised?" Could that information be added to the exception?

@beatrizsaldana (Member, Author): Really great point! I updated the error message, let me know if you think I should make any more changes here.

    gx_enabled = dataset_obj[dataset_name].get("gx_enabled", False)

@@ -0,0 +1,5 @@
model,type,measurement,units,ageDeath,tissue,sex,genotype
ModelA,TypeA,1,A,1,TissueA,male,genotype1
ModelA,TypeA,1,A,1,TissueA,male,genotype1
ModelA,TypeA,1,A,1,TissueA,male,genotype2
ModelA,TypeA,1,A,1,TissueA,male,genotype2
39 changes: 39 additions & 0 deletions tests/test_assets/biomarkers/input/biomarkers_good_input.csv
@@ -0,0 +1,39 @@
genotype,measurement,sex,model,type,ageDeath,tissue,units
3xTG-AD,1.887403509,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,2.507680422,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,3.095873882,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,3.639017544,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,0.74988673,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,11.98119457,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,6.393679577,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,3.098566667,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,3.452296527,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,2.644899123,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,0.110659787,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,1.899942325,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,2.397684799,male,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,6.048395918,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,5.886637838,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,3.368940156,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,0.234726268,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,6.193847515,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,7.122192076,female,3xTG-AD,Insoluble Abeta40,4,cerebral cortex,pg/mg
3xTG-AD,4.853281065,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,37.12325,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,15.62036898,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,37.12734722,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,75.55092784,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,63.56345,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,60.22938,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,23.23818056,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,93.87042718,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,107.5141702,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,130.5238413,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,13.75984343,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,34.07885294,male,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,80.33369231,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,84.89822857,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,92.64340206,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,133.3285882,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,130.4151077,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
3xTG-AD,255.0758333,female,3xTG-AD,Insoluble Abeta40,4,hippocampus,pg/mg
@@ -0,0 +1,7 @@
model,type,measurement,units,ageDeath,tissue,sex,genotype
ModelA,TypeA,1,A,1,TissueA,male,genotype1
ModelA,TypeA,2,A,1,TissueA,male,genotype1
ModelA,TypeA,3,A,2,TissueA,male,genotype2
ModelA,TypeB,4,A,2,TissueA,male,genotype1
ModelA,TypeB,5,A,3,TissueA,male,genotype1
ModelA,TypeB,6,A,3,TissueA,male,genotype2
@@ -0,0 +1,31 @@
[
  {
    "model": "ModelA",
    "type": "TypeA",
    "ageDeath": 1,
    "tissue": "TissueA",
    "units": "A",
    "points": [
      {
        "genotype": "genotype1",
        "measurement": 1,
        "sex": "male"
      },
      {
        "genotype": "genotype1",
        "measurement": 1,
        "sex": "male"
      },
      {
        "genotype": "genotype2",
        "measurement": 1,
        "sex": "male"
      },
      {
        "genotype": "genotype2",
        "measurement": 1,
        "sex": "male"
      }
    ]
  }
]