From bde1f78e2e03da8cf8cde512e28e39eb7a4827eb Mon Sep 17 00:00:00 2001
From: hasan7n <78664424+hasan7n@users.noreply.github.com>
Date: Fri, 25 Aug 2023 04:09:19 +0200
Subject: [PATCH] Data preparation MLCube for BraTS2023 (#479)

* add data preparation MLCube for BraTS2023

* finalize data preps

* add dict for mapping missing modalities in synthesis task

* add parameter
---
 .../mlcube/workspace/labels/labels.csv        |  13 --
 .../mlcube/workspace/names/names.txt          |   3 -
 .../BraTS2023/data_prep/mlcube/mlcube.yaml    |  45 ++++++
 .../mlcube/workspace/parameters.yaml          |   2 +
 .../workspace/parameters_inpainting.yaml      |   1 +
 .../workspace/parameters_synthesis.yaml       |   5 +
 .../BraTS2023/data_prep/project/Dockerfile    |  11 ++
 .../BraTS2023/data_prep/project/mlcube.py     |  51 ++++++
 .../BraTS2023/data_prep/project/prepare.py    | 116 +++++++++++++
 .../data_prep/project/requirements.txt        |   4 +
 .../data_prep/project/sanity_check.py         | 152 ++++++++++++++++++
 examples/BraTS2023/data_prep/project/stats.py |  11 ++
 12 files changed, 398 insertions(+), 16 deletions(-)
 delete mode 100644 examples/BraTS/data_prep/mlcube/workspace/labels/labels.csv
 delete mode 100644 examples/BraTS/data_prep/mlcube/workspace/names/names.txt
 create mode 100644 examples/BraTS2023/data_prep/mlcube/mlcube.yaml
 create mode 100644 examples/BraTS2023/data_prep/mlcube/workspace/parameters.yaml
 create mode 100644 examples/BraTS2023/data_prep/mlcube/workspace/parameters_inpainting.yaml
 create mode 100644 examples/BraTS2023/data_prep/mlcube/workspace/parameters_synthesis.yaml
 create mode 100644 examples/BraTS2023/data_prep/project/Dockerfile
 create mode 100644 examples/BraTS2023/data_prep/project/mlcube.py
 create mode 100644 examples/BraTS2023/data_prep/project/prepare.py
 create mode 100644 examples/BraTS2023/data_prep/project/requirements.txt
 create mode 100644 examples/BraTS2023/data_prep/project/sanity_check.py
 create mode 100644 examples/BraTS2023/data_prep/project/stats.py

diff --git a/examples/BraTS/data_prep/mlcube/workspace/labels/labels.csv b/examples/BraTS/data_prep/mlcube/workspace/labels/labels.csv
deleted file mode 100644
index 2774c39ed..000000000
--- a/examples/BraTS/data_prep/mlcube/workspace/labels/labels.csv
+++ /dev/null
@@ -1,13 +0,0 @@
-id,greeting
-0,"Hello, Adam Smith"
-1,"Hello, John Smith"
-2,"Hello, Michael Stevens"
-3,"Howdy, Adam Smith"
-4,"Howdy, John Smith"
-5,"Howdy, Michael Stevens"
-6,"Greetings, Adam Smith"
-7,"Greetings, John Smith"
-8,"Greetings, Michael Stevens"
-9,"Bonjour, Adam Smith"
-10,"Bonjour, John Smith"
-11,"Bonjour, Michael Stevens"
diff --git a/examples/BraTS/data_prep/mlcube/workspace/names/names.txt b/examples/BraTS/data_prep/mlcube/workspace/names/names.txt
deleted file mode 100644
index 491910d3a..000000000
--- a/examples/BraTS/data_prep/mlcube/workspace/names/names.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-Adam Smith Miller
-John Smith Jones
-Michael M. Stevens Taylor
\ No newline at end of file
diff --git a/examples/BraTS2023/data_prep/mlcube/mlcube.yaml b/examples/BraTS2023/data_prep/mlcube/mlcube.yaml
new file mode 100644
index 000000000..d86822c8e
--- /dev/null
+++ b/examples/BraTS2023/data_prep/mlcube/mlcube.yaml
@@ -0,0 +1,45 @@
+name: BraTS2023 Data Preparator Cube
+description: BraTS2023 Data Preparator Cube
+authors:
+  - { name: "MLCommons Medical Working Group" }
+
+platform:
+  accelerator_count: 0
+
+docker:
+  # Image name.
+  image: mlcommons/brats2023-prep:0.0.1
+  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
+  build_context: "../project"
+  # Docker file name within docker build context, default is `Dockerfile`.
+  build_file: "Dockerfile"
+
+tasks:
+  prepare:
+    parameters:
+      inputs:
+        {
+          data_path: input_data/,
+          labels_path: input_labels/,
+          parameters_file: parameters.yaml,
+        }
+      outputs: { output_path: data/, output_labels_path: labels/ }
+  sanity_check:
+    parameters:
+      inputs:
+        {
+          data_path: data/,
+          labels_path: labels/,
+
+          parameters_file: parameters.yaml,
+        }
+  statistics:
+    parameters:
+      inputs:
+        {
+          data_path: data/,
+          labels_path: labels/,
+
+          parameters_file: parameters.yaml,
+        }
+      outputs: { output_path: { type: file, default: statistics.yaml } }
diff --git a/examples/BraTS2023/data_prep/mlcube/workspace/parameters.yaml b/examples/BraTS2023/data_prep/mlcube/workspace/parameters.yaml
new file mode 100644
index 000000000..89fc9d6e6
--- /dev/null
+++ b/examples/BraTS2023/data_prep/mlcube/workspace/parameters.yaml
@@ -0,0 +1,2 @@
+task: segmentation
+segmentation_modalities: ["t1c", "t1n", "t2f", "t2w"]
diff --git a/examples/BraTS2023/data_prep/mlcube/workspace/parameters_inpainting.yaml b/examples/BraTS2023/data_prep/mlcube/workspace/parameters_inpainting.yaml
new file mode 100644
index 000000000..2ffff864c
--- /dev/null
+++ b/examples/BraTS2023/data_prep/mlcube/workspace/parameters_inpainting.yaml
@@ -0,0 +1 @@
+task: inpainting
diff --git a/examples/BraTS2023/data_prep/mlcube/workspace/parameters_synthesis.yaml b/examples/BraTS2023/data_prep/mlcube/workspace/parameters_synthesis.yaml
new file mode 100644
index 000000000..00df28579
--- /dev/null
+++ b/examples/BraTS2023/data_prep/mlcube/workspace/parameters_synthesis.yaml
@@ -0,0 +1,5 @@
+task: synthesis
+segmentation_modalities: ["t1c", "t1n", "t2f", "t2w"]
+original_data_in_labels: original_data
+segmentation_labels: segmentation_labels
+missing_modality_json: "missing.json"
diff --git a/examples/BraTS2023/data_prep/project/Dockerfile b/examples/BraTS2023/data_prep/project/Dockerfile
new file mode 100644
index 000000000..91c477415
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.9.16-slim
+
+COPY ./requirements.txt /mlcube_project/requirements.txt 
+
+RUN pip3 install --no-cache-dir -r /mlcube_project/requirements.txt
+
+ENV LANG C.UTF-8
+
+COPY . /mlcube_project
+
+ENTRYPOINT ["python3", "/mlcube_project/mlcube.py"]
\ No newline at end of file
diff --git a/examples/BraTS2023/data_prep/project/mlcube.py b/examples/BraTS2023/data_prep/project/mlcube.py
new file mode 100644
index 000000000..057b7ca1c
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/mlcube.py
@@ -0,0 +1,51 @@
+"""MLCube handler file"""
+import typer
+import yaml
+from prepare import prepare_dataset
+from sanity_check import perform_sanity_checks
+from stats import generate_statistics
+
+app = typer.Typer()
+
+
+@app.command("prepare")
+def prepare(
+    data_path: str = typer.Option(..., "--data_path"),
+    labels_path: str = typer.Option(..., "--labels_path"),
+    parameters_file: str = typer.Option(..., "--parameters_file"),
+    output_path: str = typer.Option(..., "--output_path"),
+    output_labels_path: str = typer.Option(..., "--output_labels_path"),
+):
+    with open(parameters_file) as f:
+        parameters = yaml.safe_load(f)
+
+    prepare_dataset(data_path, labels_path, parameters, output_path, output_labels_path)
+
+
+@app.command("sanity_check")
+def sanity_check(
+    data_path: str = typer.Option(..., "--data_path"),
+    labels_path: str = typer.Option(..., "--labels_path"),
+    parameters_file: str = typer.Option(..., "--parameters_file"),
+):
+    with open(parameters_file) as f:
+        parameters = yaml.safe_load(f)
+
+    perform_sanity_checks(data_path, labels_path, parameters)
+
+
+@app.command("statistics")
+def statistics(
+    data_path: str = typer.Option(..., "--data_path"),
+    labels_path: str = typer.Option(..., "--labels_path"),
+    parameters_file: str = typer.Option(..., "--parameters_file"),
+    out_path: str = typer.Option(..., "--output_path"),
+):
+    with open(parameters_file) as f:
+        parameters = yaml.safe_load(f)
+
+    generate_statistics(data_path, labels_path, parameters, out_path)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/examples/BraTS2023/data_prep/project/prepare.py b/examples/BraTS2023/data_prep/project/prepare.py
new file mode 100644
index 000000000..44e770a4c
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/prepare.py
@@ -0,0 +1,116 @@
+import os
+import shutil
+from glob import iglob
+import random
+import json
+
+random.seed(7)
+
+
+def __copy_modalities(input_folder, modalities, output_folder):
+    for file in iglob(os.path.join(input_folder, "*.nii.gz")):
+        for modality in modalities:
+            if file.endswith(f"{modality}.nii.gz"):
+                new_file = os.path.join(output_folder, os.path.basename(file))
+                shutil.copyfile(file, new_file)
+                break
+
+
+def copy_segmentation_data(
+    data_path, labels_path, parameters, output_path, output_labels_path
+):
+    """Copy the configured modalities per subject and every `seg` label file.
+
+    Labels may be one folder per subject or flat files directly in labels_path.
+    """
+    modalities = parameters["segmentation_modalities"]
+    for folder in iglob(os.path.join(data_path, "*/")):
+        subject = os.path.basename(os.path.normpath(folder))
+        outfolder = os.path.join(output_path, subject)
+        os.makedirs(outfolder, exist_ok=True)
+        __copy_modalities(folder, modalities, outfolder)
+
+    for label in iglob(os.path.join(labels_path, "*")):
+        if os.path.isdir(label):
+            __copy_modalities(label, ["seg"], output_labels_path)
+        elif label.endswith("seg.nii.gz"):
+            # Keep iterating: a `break` here would stop after the first flat
+            # label file and leave every other subject without its label.
+            new_file = os.path.join(output_labels_path, os.path.basename(label))
+            shutil.copyfile(label, new_file)
+
+
+def post_process_for_synthesis(parameters, output_path, output_labels_path):
+    """Hide one random modality per subject, keeping full data with labels."""
+    modalities = parameters["segmentation_modalities"]
+    original_data_in_labels = parameters["original_data_in_labels"]
+    segmentation_labels = parameters["segmentation_labels"]
+    missing_modality_json = parameters["missing_modality_json"]
+
+    # move labels to a sub directory
+    labels_subdir = os.path.join(output_labels_path, segmentation_labels)
+    os.makedirs(labels_subdir, exist_ok=True)
+    for obj in iglob(os.path.join(output_labels_path, "*")):
+        if os.path.normpath(obj) != os.path.normpath(labels_subdir):
+            shutil.move(obj, labels_subdir)
+
+    # copy data to labels for metrics calculation
+    data_subdir = os.path.join(output_labels_path, original_data_in_labels)
+    shutil.copytree(output_path, data_subdir)
+
+    # drop modalities in sorted subject order so the seeded picks are reproducible
+    missing_modality_dict = {}
+    for folder in sorted(iglob(os.path.join(output_path, "*/"))):
+        missing_modality = random.choice(modalities)
+        for file in iglob(os.path.join(folder, "*.nii.gz")):
+            if file.endswith(f"{missing_modality}.nii.gz"):
+                os.remove(file)
+                break
+        foldername = os.path.basename(os.path.normpath(folder))
+        missing_modality_dict[foldername] = missing_modality
+
+    with open(os.path.join(output_labels_path, missing_modality_json), "w") as f:
+        json.dump(missing_modality_dict, f)
+
+
+def copy_inpainting_data(
+    data_path, labels_path, parameters, output_path, output_labels_path
+):
+    # copy data
+    modalities = ["mask", "t1n-voided"]
+    for folder in iglob(os.path.join(data_path, "*/")):
+        outfolder = os.path.join(
+            output_path, os.path.basename(os.path.normpath(folder))
+        )
+        os.makedirs(outfolder, exist_ok=True)
+        __copy_modalities(folder, modalities, outfolder)
+
+    # copy labels
+    modalities = ["mask-healthy", "t1n"]
+    for folder in iglob(os.path.join(labels_path, "*/")):
+        outfolder = os.path.join(
+            output_labels_path, os.path.basename(os.path.normpath(folder))
+        )
+        os.makedirs(outfolder, exist_ok=True)
+        __copy_modalities(folder, modalities, outfolder)
+
+
+def prepare_dataset(
+    data_path, labels_path, parameters, output_path, output_labels_path
+):
+    task = parameters["task"]
+    assert task in ["segmentation", "inpainting", "synthesis"], "Invalid task"
+    os.makedirs(output_path, exist_ok=True)
+    os.makedirs(output_labels_path, exist_ok=True)
+
+    if task in ["segmentation", "synthesis"]:
+        copy_segmentation_data(
+            data_path, labels_path, parameters, output_path, output_labels_path
+        )
+        if task == "synthesis":
+            post_process_for_synthesis(parameters, output_path, output_labels_path)
+
+    else:
+        copy_inpainting_data(
+            data_path, labels_path, parameters, output_path, output_labels_path
+        )
diff --git a/examples/BraTS2023/data_prep/project/requirements.txt b/examples/BraTS2023/data_prep/project/requirements.txt
new file mode 100644
index 000000000..fd9cc83a4
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/requirements.txt
@@ -0,0 +1,4 @@
+pyYAML
+typer
+numpy
+SimpleITK>=2.1.0
diff --git a/examples/BraTS2023/data_prep/project/sanity_check.py b/examples/BraTS2023/data_prep/project/sanity_check.py
new file mode 100644
index 000000000..c153e4df6
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/sanity_check.py
@@ -0,0 +1,152 @@
+import os
+import numpy as np
+import SimpleITK as sitk
+import json
+
+
+def check_image_dims(path):
+    base_size = np.array([240, 240, 155])
+    base_spacing = np.array([1.0, 1.0, 1.0])
+    image = sitk.ReadImage(path)
+    size_array = np.array(image.GetSize())
+    spacing_array = np.array(image.GetSpacing())
+
+    assert (base_size == size_array).all(), (
+        "Image size is not [240,240,155] for " + path
+    )
+    assert np.isclose(base_spacing, spacing_array).all(), (
+        "Image resolution is not [1,1,1] for " + path
+    )
+
+
+def check_subject_validity_for_segmentation(labels_path, subject_dir, parameters):
+    modalities = parameters["segmentation_modalities"]
+
+    strings_to_check = [f"-{modality}.nii.gz" for modality in modalities]
+
+    for string in strings_to_check:
+        if not os.path.isfile(
+            os.path.join(subject_dir, os.path.basename(subject_dir) + string)
+        ):
+            raise ValueError(
+                f"{os.path.basename(subject_dir)} does not contain all modalities"
+            )
+    assert len(os.listdir(subject_dir)) == len(
+        modalities
+    ), "invalid number of modalities"
+
+    # labels
+    if not os.path.isfile(
+        os.path.join(labels_path, os.path.basename(subject_dir) + "-seg.nii.gz")
+    ):
+        raise ValueError(
+            f"{os.path.basename(subject_dir)} does not contain segmentation labels"
+        )
+
+
+def check_subject_validity_for_synthesis(labels_path, subject_dir, parameters):
+    """Validate one prepared subject of the synthesis task.
+
+    The subject folder must lack exactly the modality recorded for it in the
+    missing-modality JSON, the copy under the labels folder must hold every
+    modality, and the segmentation label file must exist.
+
+    Raises ValueError or AssertionError when a check fails, and KeyError when
+    a modality is absent but the subject has no entry in the JSON mapping.
+    """
+    modalities = parameters["segmentation_modalities"]
+    original_data_in_labels = parameters["original_data_in_labels"]
+    segmentation_labels = parameters["segmentation_labels"]
+    missing_modality_json = parameters["missing_modality_json"]
+    missing_modality_json = os.path.join(labels_path, missing_modality_json)
+    # Context manager closes the mapping file; this runs once per subject,
+    # so a bare open() would leave one dangling handle for each of them.
+    with open(missing_modality_json) as f:
+        missing_modality_dict = json.load(f)
+
+    subject = os.path.basename(subject_dir)
+    strings_to_check = [f"-{modality}.nii.gz" for modality in modalities]
+
+    for folder in [
+        subject_dir,
+        os.path.join(labels_path, original_data_in_labels, subject),
+    ]:  # checking both data input folder and data folder copied to labels
+        missing_modalities = 0
+        for string in strings_to_check:
+            if not os.path.isfile(os.path.join(folder, subject + string)):
+                missing_modalities += 1
+                missing_modality = missing_modality_dict[subject]
+                assert (
+                    string == f"-{missing_modality}.nii.gz"
+                ), "Missing modality doesn't appear in the missing modality mapping dict"
+        if folder == subject_dir:
+            if missing_modalities != 1:
+                raise ValueError(f"{subject} does not have one missing modality")
+            assert (
+                len(os.listdir(folder)) == len(modalities) - 1
+            ), "invalid number of modalities"
+        else:
+            if missing_modalities != 0:
+                raise ValueError(f"{subject} does not have all data in labels")
+            assert len(os.listdir(folder)) == len(
+                modalities
+            ), "invalid number of modalities"
+
+    # labels
+    label_file = os.path.join(
+        labels_path, segmentation_labels, subject + "-seg.nii.gz"
+    )
+    if not os.path.isfile(label_file):
+        raise ValueError(f"{subject} does not contain segmentation labels")
+
+
+def check_subject_validity_for_inpainting(labels_path, subject_dir, parameters):
+    strings_to_check = ["-mask.nii.gz", "-t1n-voided.nii.gz"]
+    for string in strings_to_check:
+        if not os.path.isfile(
+            os.path.join(subject_dir, os.path.basename(subject_dir) + string)
+        ):
+            raise ValueError(
+                f"{os.path.basename(subject_dir)} does not contain {string}"
+            )
+    assert len(os.listdir(subject_dir)) == len(
+        strings_to_check
+    ), "invalid number of modalities"
+
+    # labels
+    strings_to_check = ["-mask-healthy.nii.gz", "-t1n.nii.gz"]
+    for string in strings_to_check:
+        if not os.path.isfile(
+            os.path.join(
+                labels_path,
+                os.path.basename(subject_dir),
+                os.path.basename(subject_dir) + string,
+            )
+        ):
+            raise ValueError(
+                f"{os.path.basename(subject_dir)} does not contain {string}"
+            )
+    assert len(
+        os.listdir(os.path.join(labels_path, os.path.basename(subject_dir)))
+    ) == len(strings_to_check), "invalid number of modalities"
+
+
+def perform_sanity_checks(data_path, labels_path, parameters):
+    """Run the task-specific validity check on every entry of data_path.
+
+    Raises ValueError for an unknown task and AssertionError when data_path
+    contains something that is not a subject folder.
+    """
+    checkers = {
+        "segmentation": check_subject_validity_for_segmentation,
+        "synthesis": check_subject_validity_for_synthesis,
+        "inpainting": check_subject_validity_for_inpainting,
+    }
+    task = parameters["task"]
+    if task not in checkers:
+        # an unknown task must not silently run the inpainting checks
+        raise ValueError(f"Invalid task: {task}")
+    for folder in os.listdir(data_path):
+        current_subject = os.path.join(data_path, folder)
+        assert os.path.isdir(current_subject), "Unexpected file found"
+        checkers[task](labels_path, current_subject, parameters)
diff --git a/examples/BraTS2023/data_prep/project/stats.py b/examples/BraTS2023/data_prep/project/stats.py
new file mode 100644
index 000000000..4b7a79e6f
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/stats.py
@@ -0,0 +1,11 @@
+import os
+import yaml
+
+
+def generate_statistics(data_path, labels_path, parameters, out_path):
+    stats = {
+        "Number of Subjects": len(os.listdir(data_path)),
+    }
+
+    with open(out_path, "w") as f:
+        yaml.dump(stats, f)