From bde1f78e2e03da8cf8cde512e28e39eb7a4827eb Mon Sep 17 00:00:00 2001
From: hasan7n <78664424+hasan7n@users.noreply.github.com>
Date: Fri, 25 Aug 2023 04:09:19 +0200
Subject: [PATCH] Data preparation MLCube for BraTS2023 (#479)

* add data preparation MLCube for BraTS2023

* finalize data preps

* add dict for mapping missing modalities in synthesis task

* add parameter
---
Note: prepare.py and sanity_check.py below carry review fixes, so the blob
ids on their `index` lines no longer match the new file contents.

 .../mlcube/workspace/labels/labels.csv        |  13 --
 .../mlcube/workspace/names/names.txt          |   3 -
 .../BraTS2023/data_prep/mlcube/mlcube.yaml    |  45 +++++
 .../mlcube/workspace/parameters.yaml          |   2 +
 .../workspace/parameters_inpainting.yaml      |   1 +
 .../workspace/parameters_synthesis.yaml       |   5 +
 .../BraTS2023/data_prep/project/Dockerfile    |  11 ++
 .../BraTS2023/data_prep/project/mlcube.py     |  51 +++++
 .../BraTS2023/data_prep/project/prepare.py    | 144 ++++++++++++++
 .../data_prep/project/requirements.txt        |   4 +
 .../data_prep/project/sanity_check.py         | 176 ++++++++++++++++++
 examples/BraTS2023/data_prep/project/stats.py |  11 ++
 12 files changed, 450 insertions(+), 16 deletions(-)
 delete mode 100644 examples/BraTS/data_prep/mlcube/workspace/labels/labels.csv
 delete mode 100644 examples/BraTS/data_prep/mlcube/workspace/names/names.txt
 create mode 100644 examples/BraTS2023/data_prep/mlcube/mlcube.yaml
 create mode 100644 examples/BraTS2023/data_prep/mlcube/workspace/parameters.yaml
 create mode 100644 examples/BraTS2023/data_prep/mlcube/workspace/parameters_inpainting.yaml
 create mode 100644 examples/BraTS2023/data_prep/mlcube/workspace/parameters_synthesis.yaml
 create mode 100644 examples/BraTS2023/data_prep/project/Dockerfile
 create mode 100644 examples/BraTS2023/data_prep/project/mlcube.py
 create mode 100644 examples/BraTS2023/data_prep/project/prepare.py
 create mode 100644 examples/BraTS2023/data_prep/project/requirements.txt
 create mode 100644 examples/BraTS2023/data_prep/project/sanity_check.py
 create mode 100644 examples/BraTS2023/data_prep/project/stats.py

diff --git a/examples/BraTS/data_prep/mlcube/workspace/labels/labels.csv b/examples/BraTS/data_prep/mlcube/workspace/labels/labels.csv
deleted file mode 100644
index 2774c39ed..000000000
--- a/examples/BraTS/data_prep/mlcube/workspace/labels/labels.csv
+++ /dev/null
@@ -1,13 +0,0 @@
-id,greeting
-0,"Hello, Adam Smith"
-1,"Hello, John Smith"
-2,"Hello, Michael Stevens"
-3,"Howdy, Adam Smith"
-4,"Howdy, John Smith"
-5,"Howdy, Michael Stevens"
-6,"Greetings, Adam Smith"
-7,"Greetings, John Smith"
-8,"Greetings, Michael Stevens"
-9,"Bonjour, Adam Smith"
-10,"Bonjour, John Smith"
-11,"Bonjour, Michael Stevens"
diff --git a/examples/BraTS/data_prep/mlcube/workspace/names/names.txt b/examples/BraTS/data_prep/mlcube/workspace/names/names.txt
deleted file mode 100644
index 491910d3a..000000000
--- a/examples/BraTS/data_prep/mlcube/workspace/names/names.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-Adam Smith Miller
-John Smith Jones
-Michael M. Stevens Taylor
\ No newline at end of file
diff --git a/examples/BraTS2023/data_prep/mlcube/mlcube.yaml b/examples/BraTS2023/data_prep/mlcube/mlcube.yaml
new file mode 100644
index 000000000..d86822c8e
--- /dev/null
+++ b/examples/BraTS2023/data_prep/mlcube/mlcube.yaml
@@ -0,0 +1,45 @@
+name: BraTS2023 Data Preparator Cube
+description: BraTS2023 Data Preparator Cube
+authors:
+  - { name: "MLCommons Medical Working Group" }
+
+platform:
+  accelerator_count: 0
+
+docker:
+  # Image name.
+  image: mlcommons/brats2023-prep:0.0.1
+  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
+  build_context: "../project"
+  # Docker file name within docker build context, default is `Dockerfile`.
+  build_file: "Dockerfile"
+
+tasks:
+  prepare:
+    parameters:
+      inputs:
+        {
+          data_path: input_data/,
+          labels_path: input_labels/,
+          parameters_file: parameters.yaml,
+        }
+      outputs: { output_path: data/, output_labels_path: labels/ }
+  sanity_check:
+    parameters:
+      inputs:
+        {
+          data_path: data/,
+          labels_path: labels/,
+
+          parameters_file: parameters.yaml,
+        }
+  statistics:
+    parameters:
+      inputs:
+        {
+          data_path: data/,
+          labels_path: labels/,
+
+          parameters_file: parameters.yaml,
+        }
+      outputs: { output_path: { type: file, default: statistics.yaml } }
diff --git a/examples/BraTS2023/data_prep/mlcube/workspace/parameters.yaml b/examples/BraTS2023/data_prep/mlcube/workspace/parameters.yaml
new file mode 100644
index 000000000..89fc9d6e6
--- /dev/null
+++ b/examples/BraTS2023/data_prep/mlcube/workspace/parameters.yaml
@@ -0,0 +1,2 @@
+task: segmentation
+segmentation_modalities: ["t1c", "t1n", "t2f", "t2w"]
diff --git a/examples/BraTS2023/data_prep/mlcube/workspace/parameters_inpainting.yaml b/examples/BraTS2023/data_prep/mlcube/workspace/parameters_inpainting.yaml
new file mode 100644
index 000000000..2ffff864c
--- /dev/null
+++ b/examples/BraTS2023/data_prep/mlcube/workspace/parameters_inpainting.yaml
@@ -0,0 +1 @@
+task: inpainting
diff --git a/examples/BraTS2023/data_prep/mlcube/workspace/parameters_synthesis.yaml b/examples/BraTS2023/data_prep/mlcube/workspace/parameters_synthesis.yaml
new file mode 100644
index 000000000..00df28579
--- /dev/null
+++ b/examples/BraTS2023/data_prep/mlcube/workspace/parameters_synthesis.yaml
@@ -0,0 +1,5 @@
+task: synthesis
+segmentation_modalities: ["t1c", "t1n", "t2f", "t2w"]
+original_data_in_labels: original_data
+segmentation_labels: segmentation_labels
+missing_modality_json: "missing.json"
diff --git a/examples/BraTS2023/data_prep/project/Dockerfile b/examples/BraTS2023/data_prep/project/Dockerfile
new file mode 100644
index 000000000..91c477415
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.9.16-slim
+
+COPY ./requirements.txt /mlcube_project/requirements.txt
+
+RUN pip3 install --no-cache-dir -r /mlcube_project/requirements.txt
+
+ENV LANG=C.UTF-8
+
+COPY . /mlcube_project
+
+ENTRYPOINT ["python3", "/mlcube_project/mlcube.py"]
\ No newline at end of file
diff --git a/examples/BraTS2023/data_prep/project/mlcube.py b/examples/BraTS2023/data_prep/project/mlcube.py
new file mode 100644
index 000000000..057b7ca1c
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/mlcube.py
@@ -0,0 +1,51 @@
+"""MLCube handler file"""
+import typer
+import yaml
+from prepare import prepare_dataset
+from sanity_check import perform_sanity_checks
+from stats import generate_statistics
+
+app = typer.Typer()
+
+
+@app.command("prepare")
+def prepare(
+    data_path: str = typer.Option(..., "--data_path"),
+    labels_path: str = typer.Option(..., "--labels_path"),
+    parameters_file: str = typer.Option(..., "--parameters_file"),
+    output_path: str = typer.Option(..., "--output_path"),
+    output_labels_path: str = typer.Option(..., "--output_labels_path"),
+):
+    with open(parameters_file) as f:
+        parameters = yaml.safe_load(f)
+
+    prepare_dataset(data_path, labels_path, parameters, output_path, output_labels_path)
+
+
+@app.command("sanity_check")
+def sanity_check(
+    data_path: str = typer.Option(..., "--data_path"),
+    labels_path: str = typer.Option(..., "--labels_path"),
+    parameters_file: str = typer.Option(..., "--parameters_file"),
+):
+    with open(parameters_file) as f:
+        parameters = yaml.safe_load(f)
+
+    perform_sanity_checks(data_path, labels_path, parameters)
+
+
+@app.command("statistics")
+def statistics(
+    data_path: str = typer.Option(..., "--data_path"),
+    labels_path: str = typer.Option(..., "--labels_path"),
+    parameters_file: str = typer.Option(..., "--parameters_file"),
+    out_path: str = typer.Option(..., "--output_path"),
+):
+    with open(parameters_file) as f:
+        parameters = yaml.safe_load(f)
+
+    generate_statistics(data_path, labels_path, parameters, out_path)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/examples/BraTS2023/data_prep/project/prepare.py b/examples/BraTS2023/data_prep/project/prepare.py
new file mode 100644
index 000000000..44e770a4c
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/prepare.py
@@ -0,0 +1,144 @@
+"""Data preparation for the BraTS2023 tasks."""
+import os
+import shutil
+from glob import iglob
+import random
+import json
+
+# fixed seed: keeps the modality dropped per subject (synthesis) reproducible
+random.seed(7)
+
+
+def __copy_modalities(input_folder, modalities, output_folder):
+    """Copy the `.nii.gz` files of the given modalities to `output_folder`.
+
+    A file is copied when its name ends with `<modality>.nii.gz` for one of
+    `modalities`; every other file is skipped.
+    """
+    for file in iglob(os.path.join(input_folder, "*.nii.gz")):
+        for modality in modalities:
+            if file.endswith(f"{modality}.nii.gz"):
+                new_file = os.path.join(output_folder, os.path.basename(file))
+                shutil.copyfile(file, new_file)
+                break
+
+
+def copy_segmentation_data(
+    data_path, labels_path, parameters, output_path, output_labels_path
+):
+    """Copy the configured modalities and the segmentation labels.
+
+    Label files are collected both from `labels_path` itself and from its
+    direct sub folders; all of them are written flat to `output_labels_path`.
+    """
+    # copy data
+    modalities = parameters["segmentation_modalities"]
+    for folder in iglob(os.path.join(data_path, "*/")):
+        outfolder = os.path.join(
+            output_path, os.path.basename(os.path.normpath(folder))
+        )
+        os.makedirs(outfolder, exist_ok=True)
+        __copy_modalities(folder, modalities, outfolder)
+
+    # copy labels
+    for folder_or_file in iglob(os.path.join(labels_path, "*")):
+        if os.path.isdir(folder_or_file):
+            __copy_modalities(folder_or_file, ["seg"], output_labels_path)
+        elif folder_or_file.endswith("seg.nii.gz"):
+            # no `break` here: a flat labels folder holds one file per subject
+            new_file = os.path.join(
+                output_labels_path, os.path.basename(folder_or_file)
+            )
+            shutil.copyfile(folder_or_file, new_file)
+
+
+def post_process_for_synthesis(parameters, output_path, output_labels_path):
+    """Turn prepared segmentation data into a synthesis dataset.
+
+    The labels are moved to a sub folder of `output_labels_path`, the complete
+    data is copied next to them for metrics calculation, and one randomly
+    chosen modality is deleted from every subject folder in `output_path`.
+    The modality chosen per subject is written as JSON to `output_labels_path`.
+    """
+    modalities = parameters["segmentation_modalities"]
+    original_data_in_labels = parameters["original_data_in_labels"]
+    segmentation_labels = parameters["segmentation_labels"]
+    missing_modality_json = parameters["missing_modality_json"]
+
+    # move labels to a sub directory
+    labels_subdir = os.path.join(output_labels_path, segmentation_labels)
+    os.makedirs(labels_subdir, exist_ok=True)
+    for obj in iglob(os.path.join(output_labels_path, "*")):
+        if os.path.normpath(obj) != os.path.normpath(labels_subdir):
+            shutil.move(obj, labels_subdir)
+
+    # copy data to labels for metrics calculation
+    data_subdir = os.path.join(output_labels_path, original_data_in_labels)
+    shutil.copytree(output_path, data_subdir)
+
+    # drop modalities
+    missing_modality_dict = {}
+    # sorted: glob order depends on the file system and would defeat the seed
+    for folder in sorted(iglob(os.path.join(output_path, "*/"))):
+        missing_modality = random.choice(modalities)
+        for file in iglob(os.path.join(folder, "*.nii.gz")):
+            if file.endswith(f"{missing_modality}.nii.gz"):
+                os.remove(file)
+                break
+        foldername = os.path.basename(os.path.normpath(folder))
+        missing_modality_dict[foldername] = missing_modality
+
+    out_json = os.path.join(output_labels_path, missing_modality_json)
+    with open(out_json, "w") as f:
+        json.dump(missing_modality_dict, f)
+
+
+def copy_inpainting_data(
+    data_path, labels_path, parameters, output_path, output_labels_path
+):
+    """Copy the inpainting inputs and labels, one folder per subject."""
+    # copy data
+    modalities = ["mask", "t1n-voided"]
+    for folder in iglob(os.path.join(data_path, "*/")):
+        outfolder = os.path.join(
+            output_path, os.path.basename(os.path.normpath(folder))
+        )
+        os.makedirs(outfolder, exist_ok=True)
+        __copy_modalities(folder, modalities, outfolder)
+
+    # copy labels
+    modalities = ["mask-healthy", "t1n"]
+    for folder in iglob(os.path.join(labels_path, "*/")):
+        outfolder = os.path.join(
+            output_labels_path, os.path.basename(os.path.normpath(folder))
+        )
+        os.makedirs(outfolder, exist_ok=True)
+        __copy_modalities(folder, modalities, outfolder)
+
+
+def prepare_dataset(
+    data_path, labels_path, parameters, output_path, output_labels_path
+):
+    """Prepare the data and labels for the task named in `parameters`.
+
+    Raises:
+        ValueError: if the task is not segmentation, inpainting or synthesis.
+    """
+    task = parameters["task"]
+    if task not in ["segmentation", "inpainting", "synthesis"]:
+        # explicit raise: an `assert` is stripped when Python runs with `-O`
+        raise ValueError("Invalid task")
+    os.makedirs(output_path, exist_ok=True)
+    os.makedirs(output_labels_path, exist_ok=True)
+
+    if task in ["segmentation", "synthesis"]:
+        copy_segmentation_data(
+            data_path, labels_path, parameters, output_path, output_labels_path
+        )
+        if task == "synthesis":
+            post_process_for_synthesis(parameters, output_path, output_labels_path)
+
+    else:
+        copy_inpainting_data(
+            data_path, labels_path, parameters, output_path, output_labels_path
+        )
diff --git a/examples/BraTS2023/data_prep/project/requirements.txt b/examples/BraTS2023/data_prep/project/requirements.txt
new file mode 100644
index 000000000..fd9cc83a4
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/requirements.txt
@@ -0,0 +1,4 @@
+pyYAML
+typer
+numpy
+SimpleITK>=2.1.0
diff --git a/examples/BraTS2023/data_prep/project/sanity_check.py b/examples/BraTS2023/data_prep/project/sanity_check.py
new file mode 100644
index 000000000..c153e4df6
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/sanity_check.py
@@ -0,0 +1,176 @@
+"""Sanity checks for the prepared BraTS2023 data."""
+import os
+import numpy as np
+import SimpleITK as sitk
+import json
+
+
+def check_image_dims(path):
+    """Check that the image at `path` is 240x240x155 with 1x1x1 spacing.
+
+    Raises:
+        ValueError: if the size or the spacing differs.
+    """
+    base_size = np.array([240, 240, 155])
+    base_spacing = np.array([1.0, 1.0, 1.0])
+    image = sitk.ReadImage(path)
+    size_array = np.array(image.GetSize())
+    spacing_array = np.array(image.GetSpacing())
+
+    if not (base_size == size_array).all():
+        raise ValueError("Image size is not [240,240,155] for " + path)
+    if not np.isclose(base_spacing, spacing_array).all():
+        raise ValueError("Image resolution is not [1,1,1] for " + path)
+
+
+def check_subject_validity_for_segmentation(labels_path, subject_dir, parameters):
+    """Check one subject folder of the segmentation task.
+
+    The folder must hold exactly one file per configured modality and
+    `labels_path` must hold the segmentation file of the subject.
+
+    Raises:
+        ValueError: if one of these expectations is not met.
+    """
+    modalities = parameters["segmentation_modalities"]
+
+    strings_to_check = [f"-{modality}.nii.gz" for modality in modalities]
+
+    for string in strings_to_check:
+        if not os.path.isfile(
+            os.path.join(subject_dir, os.path.basename(subject_dir) + string)
+        ):
+            raise ValueError(
+                f"{os.path.basename(subject_dir)} does not contain all modalities"
+            )
+    if len(os.listdir(subject_dir)) != len(modalities):
+        raise ValueError("invalid number of modalities")
+
+    # labels
+    if not os.path.isfile(
+        os.path.join(labels_path, os.path.basename(subject_dir) + "-seg.nii.gz")
+    ):
+        raise ValueError(
+            f"{os.path.basename(subject_dir)} does not contain segmentation labels"
+        )
+
+
+def check_subject_validity_for_synthesis(labels_path, subject_dir, parameters):
+    """Check one subject folder of the synthesis task.
+
+    The subject folder must miss exactly the modality recorded for it in the
+    missing-modality JSON file, the copy of the data kept under `labels_path`
+    must be complete, and the segmentation file of the subject must exist.
+
+    Raises:
+        ValueError: if one of these expectations is not met.
+    """
+    modalities = parameters["segmentation_modalities"]
+    original_data_in_labels = parameters["original_data_in_labels"]
+    segmentation_labels = parameters["segmentation_labels"]
+    missing_modality_json = parameters["missing_modality_json"]
+    missing_modality_json = os.path.join(labels_path, missing_modality_json)
+    # context manager: closes the file instead of leaking one handle per subject
+    with open(missing_modality_json) as f:
+        missing_modality_dict = json.load(f)
+
+    subject = os.path.basename(subject_dir)
+    strings_to_check = [f"-{modality}.nii.gz" for modality in modalities]
+
+    for folder in [
+        subject_dir,
+        os.path.join(labels_path, original_data_in_labels, subject),
+    ]:  # checking both data input folder and data folder copied to labels
+        missing_modalities = 0
+        for string in strings_to_check:
+            if not os.path.isfile(os.path.join(folder, subject + string)):
+                missing_modalities += 1
+                # .get: a subject absent from the mapping fails the check below
+                missing_modality = missing_modality_dict.get(subject)
+                if string != f"-{missing_modality}.nii.gz":
+                    raise ValueError(
+                        "Missing modality doesn't appear in the missing "
+                        "modality mapping dict"
+                    )
+        if folder == subject_dir:
+            if missing_modalities != 1:
+                raise ValueError(f"{subject} does not have one missing modality")
+            if len(os.listdir(folder)) != len(modalities) - 1:
+                raise ValueError("invalid number of modalities")
+        else:
+            if missing_modalities != 0:
+                raise ValueError(f"{subject} does not have all data in labels")
+            if len(os.listdir(folder)) != len(modalities):
+                raise ValueError("invalid number of modalities")
+
+    # labels
+    if not os.path.isfile(
+        os.path.join(labels_path, segmentation_labels, subject + "-seg.nii.gz")
+    ):
+        raise ValueError(f"{subject} does not contain segmentation labels")
+
+
+def check_subject_validity_for_inpainting(labels_path, subject_dir, parameters):
+    """Check one subject folder of the inpainting task.
+
+    The subject folder must hold exactly the `-mask` and `-t1n-voided` files,
+    and its folder under `labels_path` exactly the `-mask-healthy` and `-t1n`
+    files.
+
+    Raises:
+        ValueError: if one of these expectations is not met.
+    """
+    strings_to_check = ["-mask.nii.gz", "-t1n-voided.nii.gz"]
+    for string in strings_to_check:
+        if not os.path.isfile(
+            os.path.join(subject_dir, os.path.basename(subject_dir) + string)
+        ):
+            raise ValueError(
+                f"{os.path.basename(subject_dir)} does not contain {string}"
+            )
+    if len(os.listdir(subject_dir)) != len(strings_to_check):
+        raise ValueError("invalid number of modalities")
+
+    # labels
+    strings_to_check = ["-mask-healthy.nii.gz", "-t1n.nii.gz"]
+    for string in strings_to_check:
+        if not os.path.isfile(
+            os.path.join(
+                labels_path,
+                os.path.basename(subject_dir),
+                os.path.basename(subject_dir) + string,
+            )
+        ):
+            raise ValueError(
+                f"{os.path.basename(subject_dir)} does not contain {string}"
+            )
+    labels_dir = os.path.join(labels_path, os.path.basename(subject_dir))
+    if len(os.listdir(labels_dir)) != len(strings_to_check):
+        raise ValueError("invalid number of modalities")
+
+
+def perform_sanity_checks(data_path, labels_path, parameters):
+    """Run the task-specific check on every entry of `data_path`.
+
+    Raises:
+        ValueError: if an entry is not a folder or fails its check.
+    """
+    task = parameters["task"]
+    data_folders = os.listdir(data_path)
+
+    for folder in data_folders:
+        current_subject = os.path.join(data_path, folder)
+        if not os.path.isdir(current_subject):
+            raise ValueError("Unexpected file found")
+        if task == "segmentation":
+            check_subject_validity_for_segmentation(
+                labels_path, current_subject, parameters
+            )
+        elif task == "synthesis":
+            check_subject_validity_for_synthesis(
+                labels_path, current_subject, parameters
+            )
+        else:
+            check_subject_validity_for_inpainting(
+                labels_path, current_subject, parameters
+            )
diff --git a/examples/BraTS2023/data_prep/project/stats.py b/examples/BraTS2023/data_prep/project/stats.py
new file mode 100644
index 000000000..4b7a79e6f
--- /dev/null
+++ b/examples/BraTS2023/data_prep/project/stats.py
@@ -0,0 +1,11 @@
+import os
+import yaml
+
+
+def generate_statistics(data_path, labels_path, parameters, out_path):
+    stats = {
+        "Number of Subjects": len(os.listdir(data_path)),
+    }
+
+    with open(out_path, "w") as f:
+        yaml.dump(stats, f)