Merge branch 'fl-poc' of https://github.com/hasan7n/medperf into be_e…

…nable_partial_epochs
hasan7n · Oct 16, 2024 · f0e0170 · f0e0170
2 parents 2dcd792 + 26b4337
commit f0e0170
Show file tree

Hide file tree

Showing 39 changed files with 630 additions and 288 deletions.
diff --git a/cli/cli_tests_training.sh b/cli/cli_tests_training.sh
@@ -20,6 +20,8 @@ print_eval medperf profile create -n testdata1
 checkFailed "testdata1 profile creation failed"
 print_eval medperf profile create -n testdata2
 checkFailed "testdata2 profile creation failed"
+print_eval medperf profile create -n fladmin
+checkFailed "fladmin profile creation failed"
 ##########################################################
 
 echo "\n"
@@ -71,6 +73,13 @@ checkFailed "testdata2 profile activation failed"
 
 print_eval medperf auth login -e $DATAOWNER2
 checkFailed "testdata2 login failed"
+
+print_eval medperf profile activate fladmin
+checkFailed "fladmin profile activation failed"
+
+print_eval medperf auth login -e $FLADMIN
+checkFailed "fladmin login failed"
+
 ##########################################################
 
 echo "\n"
@@ -97,6 +106,11 @@ PREP_UID=$(medperf mlcube ls | grep trainprep | head -n 1 | tr -s ' ' | cut -d '
 print_eval medperf mlcube submit --name traincube -m $TRAIN_MLCUBE -a $TRAIN_WEIGHTS --operational
 checkFailed "traincube submission failed"
 TRAINCUBE_UID=$(medperf mlcube ls | grep traincube | head -n 1 | tr -s ' ' | cut -d ' ' -f 2)
+
+print_eval medperf mlcube submit --name fladmincube -m $FLADMIN_MLCUBE --operational
+checkFailed "fladmincube submission failed"
+FLADMINCUBE_UID=$(medperf mlcube ls | grep fladmincube | head -n 1 | tr -s ' ' | cut -d ' ' -f 2)
+
 ##########################################################
 
 echo "\n"
@@ -105,7 +119,7 @@ echo "\n"
 echo "====================================="
 echo "Submit Training Experiment"
 echo "====================================="
-print_eval medperf training submit -n trainexp -d trainexp -p $PREP_UID -m $TRAINCUBE_UID
+print_eval medperf training submit -n trainexp -d trainexp -p $PREP_UID -m $TRAINCUBE_UID -a $FLADMINCUBE_UID
 checkFailed "Training exp submission failed"
 TRAINING_UID=$(medperf training ls | grep trainexp | tail -n 1 | tr -s ' ' | cut -d ' ' -f 2)
 
@@ -403,6 +417,51 @@ fi
 
 echo "\n"
 
+##########################################################
+echo "====================================="
+echo "Activate fladmin profile"
+echo "====================================="
+print_eval medperf profile activate fladmin
+checkFailed "fladmin profile activation failed"
+##########################################################
+
+echo "\n"
+
+##########################################################
+echo "====================================="
+echo "Get fladmin certificate"
+echo "====================================="
+print_eval medperf certificate get_client_certificate -t $TRAINING_UID
+checkFailed "Get fladmin cert failed"
+##########################################################
+
+echo "\n"
+
+##########################################################
+echo "====================================="
+echo "Check experiment status"
+echo "====================================="
+print_eval medperf training get_experiment_status -t $TRAINING_UID
+checkFailed "Get experiment status failed"
+
+sleep 3 # sleep some time then get status again
+
+print_eval medperf training get_experiment_status -t $TRAINING_UID
+checkFailed "Get experiment status failed"
+##########################################################
+
+echo "\n"
+
+##########################################################
+echo "====================================="
+echo "Update plan parameter"
+echo "====================================="
+print_eval medperf training update_plan -t $TRAINING_UID -f "straggler_handling_policy.settings.straggler_cutoff_time" -v 1200
+checkFailed "Update plan failed"
+##########################################################
+
+echo "\n"
+
 ##########################################################
 echo "====================================="
 echo "Waiting for other prcocesses to exit successfully"

diff --git a/cli/medperf/commands/training/get_experiment_status.py b/cli/medperf/commands/training/get_experiment_status.py
@@ -0,0 +1,88 @@
+from medperf import config
+from medperf.account_management.account_management import get_medperf_user_data
+from medperf.entities.ca import CA
+from medperf.entities.training_exp import TrainingExp
+from medperf.entities.cube import Cube
+from medperf.utils import (
+    get_pki_assets_path,
+    generate_tmp_path,
+    dict_pretty_print,
+    remove_path,
+)
+from medperf.certificates import trust
+import yaml
+import os
+
+
+class GetExperimentStatus:
+    """Queries the status of a training experiment through its FL admin MLCube."""
+
+    @classmethod
+    def run(cls, training_exp_id: int):
+        """Retrieves, prints, and stores the status of a training experiment
+
+        Args:
+            training_exp_id (int): Training experiment UID.
+        """
+        execution = cls(training_exp_id)
+        execution.prepare()
+        execution.prepare_plan()
+        execution.prepare_pki_assets()
+        # Cube retrieval and the cube task run inside the interactive UI context
+        with config.ui.interactive():
+            execution.prepare_admin_cube()
+            execution.get_experiment_status()
+        execution.print_experiment_status()
+        execution.store_status()
+
+    def __init__(self, training_exp_id: int) -> None:
+        """Stores the experiment UID and a handle to the configured UI."""
+        self.training_exp_id = training_exp_id
+        self.ui = config.ui
+
+    def prepare(self):
+        """Loads the training experiment, the current user's email, and temp paths."""
+        self.training_exp = TrainingExp.get(self.training_exp_id)
+        self.ui.print(f"Training Experiment: {self.training_exp.name}")
+        self.user_email: str = get_medperf_user_data()["email"]
+        # status_output is later read as YAML by print_experiment_status and
+        # moved to its final location by store_status
+        self.status_output = generate_tmp_path()
+        self.temp_dir = generate_tmp_path()
+
+    def prepare_plan(self):
+        """Delegates plan preparation to the training experiment entity."""
+        self.training_exp.prepare_plan()
+
+    def prepare_pki_assets(self):
+        """Resolves the experiment's CA and the current user's PKI assets path."""
+        ca = CA.from_experiment(self.training_exp_id)
+        # NOTE(review): presumably establishes trust in the CA (e.g. fetches its
+        # root certificate) -- confirm against medperf.certificates.trust
+        trust(ca)
+        self.admin_pki_assets = get_pki_assets_path(self.user_email, ca.name)
+        self.ca = ca
+
+    def prepare_admin_cube(self):
+        """Fetches the MLCube referenced by the experiment's `fl_admin_mlcube`."""
+        self.cube = self.__get_cube(self.training_exp.fl_admin_mlcube, "FL Admin")
+
+    def __get_cube(self, uid: int, name: str) -> Cube:
+        """Gets a cube by UID and downloads its run files.
+
+        Args:
+            uid (int): UID of the cube to retrieve.
+            name (str): Label used in the completion message printed to the UI.
+
+        Returns:
+            Cube: The retrieved cube.
+        """
+        self.ui.text = (
+            "Retrieving and setting up training MLCube. This may take some time."
+        )
+        cube = Cube.get(uid)
+        cube.download_run_files()
+        self.ui.print(f"> {name} cube download complete")
+        return cube
+
+    def get_experiment_status(self):
+        """Runs the cube's `get_experiment_status` task with the prepared assets."""
+        # The current user's email is passed to the cube as the admin participant CN
+        env_dict = {"MEDPERF_ADMIN_PARTICIPANT_CN": self.user_email}
+        params = {
+            "node_cert_folder": self.admin_pki_assets,
+            "ca_cert_folder": self.ca.pki_assets,
+            "plan_path": self.training_exp.plan_path,
+            # the task is expected to write the status to this path; it is
+            # read back by print_experiment_status
+            "output_status_file": self.status_output,
+            "temp_dir": self.temp_dir,
+        }
+
+        self.ui.text = "Getting training experiment status"
+        self.cube.run(task="get_experiment_status", env_dict=env_dict, **params)
+
+    def print_experiment_status(self):
+        """Loads the status file as YAML and pretty-prints its contents."""
+        with open(self.status_output) as f:
+            contents = yaml.safe_load(f)
+        dict_pretty_print(contents)
+
+    def store_status(self):
+        """Moves the status file to the experiment's `status_path`."""
+        new_status_path = self.training_exp.status_path
+        # presumably clears any previously stored status so the rename below
+        # cannot collide with it -- confirm remove_path tolerates a missing path
+        remove_path(new_status_path)
+        os.rename(self.status_output, new_status_path)
diff --git a/cli/medperf/commands/training/training.py b/cli/medperf/commands/training/training.py
@@ -11,6 +11,8 @@
 from medperf.commands.training.close_event import CloseEvent
 from medperf.commands.list import EntityList
 from medperf.commands.view import EntityView
+from medperf.commands.training.get_experiment_status import GetExperimentStatus
+from medperf.commands.training.update_plan import UpdatePlan
 
 app = typer.Typer()
 
@@ -28,7 +30,7 @@ def submit(
         ..., "--fl-mlcube", "-m", help="Reference Model MLCube UID"
     ),
     fl_admin_mlcube: int = typer.Option(
-        None, "--fl-mlcube", "-a", help="FL admin interface MLCube"
+        None, "--fl-admin-mlcube", "-a", help="FL admin interface MLCube"
     ),
     operational: bool = typer.Option(
         False,
@@ -86,6 +88,36 @@ def start_event(
     config.ui.print("✅ Done!")
 
 
+@app.command("get_experiment_status")
+@clean_except
+def get_experiment_status(
+    training_exp_id: int = typer.Option(
+        ..., "--training_exp_id", "-t", help="UID of the desired training experiment"
+    )
+):
+    """Retrieves and displays the current status of a training experiment"""
+    GetExperimentStatus.run(training_exp_id)
+    config.ui.print("✅ Done!")
+
+
+@app.command("update_plan")
+@clean_except
+def update_plan(
+    training_exp_id: int = typer.Option(
+        ..., "--training_exp_id", "-t", help="UID of the desired training experiment"
+    ),
+    field_name: str = typer.Option(
+        ..., "--field_name", "-f", help="Dotted name of the plan field to update"
+    ),
+    value: str = typer.Option(
+        ..., "--value", "-v", help="New value for the plan field"
+    ),
+):
+    """Runtime-update of a scalar field of the training plan"""
+    UpdatePlan.run(training_exp_id, field_name, value)
+    config.ui.print("✅ Done!")
+
+
 @app.command("close_event")
 @clean_except
 def close_event(

diff --git a/cli/medperf/commands/training/update_plan.py b/cli/medperf/commands/training/update_plan.py
@@ -0,0 +1,74 @@
+from medperf import config
+from medperf.account_management.account_management import get_medperf_user_data
+from medperf.entities.ca import CA
+from medperf.entities.training_exp import TrainingExp
+from medperf.entities.cube import Cube
+from medperf.utils import get_pki_assets_path, generate_tmp_path
+from medperf.certificates import trust
+
+
+class UpdatePlan:
+    """Updates a field of a training experiment's plan through its FL admin MLCube."""
+
+    @classmethod
+    def run(cls, training_exp_id: int, field_name: str, field_value: str):
+        """Updates a single field of a training experiment's plan
+
+        Args:
+            training_exp_id (int): Training experiment UID.
+            field_name (str): Name of the plan field to update.
+            field_value (str): New value for the field, passed on as a string.
+        """
+        execution = cls(training_exp_id, field_name, field_value)
+        execution.prepare()
+        execution.prepare_plan()
+        execution.prepare_pki_assets()
+        # Cube retrieval and the cube task run inside the interactive UI context
+        with config.ui.interactive():
+            execution.prepare_admin_cube()
+            execution.update_plan()
+
+    def __init__(self, training_exp_id: int, field_name: str, field_value: str) -> None:
+        """Stores the experiment UID, the requested update, and the UI handle."""
+        self.training_exp_id = training_exp_id
+        self.field_name = field_name
+        self.field_value = field_value
+        self.ui = config.ui
+
+    def prepare(self):
+        """Loads the training experiment, the current user's email, and a temp path."""
+        self.training_exp = TrainingExp.get(self.training_exp_id)
+        self.ui.print(f"Training Experiment: {self.training_exp.name}")
+        self.user_email: str = get_medperf_user_data()["email"]
+        self.temp_dir = generate_tmp_path()
+
+    def prepare_plan(self):
+        """Delegates plan preparation to the training experiment entity."""
+        self.training_exp.prepare_plan()
+
+    def prepare_pki_assets(self):
+        """Resolves the experiment's CA and the current user's PKI assets path."""
+        ca = CA.from_experiment(self.training_exp_id)
+        # NOTE(review): presumably establishes trust in the CA (e.g. fetches its
+        # root certificate) -- confirm against medperf.certificates.trust
+        trust(ca)
+        self.admin_pki_assets = get_pki_assets_path(self.user_email, ca.name)
+        self.ca = ca
+
+    def prepare_admin_cube(self):
+        """Fetches the MLCube referenced by the experiment's `fl_admin_mlcube`."""
+        self.cube = self.__get_cube(self.training_exp.fl_admin_mlcube, "FL Admin")
+
+    def __get_cube(self, uid: int, name: str) -> Cube:
+        """Gets a cube by UID and downloads its run files.
+
+        Args:
+            uid (int): UID of the cube to retrieve.
+            name (str): Label used in the completion message printed to the UI.
+
+        Returns:
+            Cube: The retrieved cube.
+        """
+        self.ui.text = (
+            "Retrieving and setting up training MLCube. This may take some time."
+        )
+        cube = Cube.get(uid)
+        cube.download_run_files()
+        self.ui.print(f"> {name} cube download complete")
+        return cube
+
+    def update_plan(self):
+        """Runs the cube's `update_plan` task with the requested field and value."""
+        # The field name and new value are passed to the cube via environment
+        # variables, alongside the current user's email as the admin participant CN
+        env_dict = {
+            "MEDPERF_ADMIN_PARTICIPANT_CN": self.user_email,
+            "MEDPERF_UPDATE_FIELD_NAME": self.field_name,
+            "MEDPERF_UPDATE_FIELD_VALUE": self.field_value,
+        }
+
+        params = {
+            "node_cert_folder": self.admin_pki_assets,
+            "ca_cert_folder": self.ca.pki_assets,
+            "plan_path": self.training_exp.plan_path,
+            "temp_dir": self.temp_dir,
+        }
+
+        self.ui.text = "Updating plan"
+        self.cube.run(task="update_plan", env_dict=env_dict, **params)
diff --git a/cli/medperf/config.py b/cli/medperf/config.py
@@ -170,6 +170,7 @@
 training_exps_filename = "training-info.yaml"
 participants_list_filename = "cols.yaml"
 training_exp_plan_filename = "plan.yaml"
+training_exp_status_filename = "status.yaml"
 training_report_file = "report.yaml"
 training_report_folder = "report"
 training_out_agg_logs = "agg_logs"

diff --git a/cli/medperf/entities/cube.py b/cli/medperf/entities/cube.py
@@ -261,7 +261,7 @@ def run(
             "get_experiment_status",
             "add_collaborator",
             "remove_collaborator",
-            "set_straggler_cuttoff_time",
+            "update_plan",
         ]:
             cmd += " --network=none"
         if config.gpus is not None:

diff --git a/cli/medperf/entities/training_exp.py b/cli/medperf/entities/training_exp.py
@@ -63,6 +63,7 @@ def __init__(self, *args, **kwargs):
 
         self.generated_uid = self.name
         self.plan_path = os.path.join(self.path, config.training_exp_plan_filename)
+        self.status_path = os.path.join(self.path, config.training_exp_status_filename)
 
     @classmethod
     def _Entity__remote_prefilter(cls, filters: dict) -> callable:

diff --git a/cli/tests_setup.sh b/cli/tests_setup.sh
@@ -121,6 +121,7 @@ METRIC_PARAMS="$ASSETS_URL/metrics/mlcube/workspace/parameters.yaml"
 # FL cubes
 TRAIN_MLCUBE="https://raw.githubusercontent.com/hasan7n/medperf/19c80d88deaad27b353d1cb9bc180757534027aa/examples/fl/fl/mlcube/mlcube.yaml"
 TRAIN_WEIGHTS="https://storage.googleapis.com/medperf-storage/testfl/init_weights_miccai.tar.gz"
+FLADMIN_MLCUBE="https://raw.githubusercontent.com/hasan7n/medperf/bc431ffe6c3b761b28674816e6f26511e8b27042/examples/fl/fl_admin/mlcube/mlcube.yaml"
 
 # test users credentials
 MODELOWNER="[email protected]"
@@ -129,6 +130,7 @@ BENCHMARKOWNER="[email protected]"
 ADMIN="[email protected]"
 DATAOWNER2="[email protected]"
 AGGOWNER="[email protected]"
+FLADMIN="[email protected]"
 
 # local MLCubes for local compatibility tests
 PREP_LOCAL="$(dirname $(dirname $(realpath "$0")))/examples/chestxray_tutorial/data_preparator/mlcube"

diff --git a/examples/fl/cert/project/Dockerfile b/examples/fl/cert/project/Dockerfile
@@ -1,7 +1,6 @@
 FROM python:3.11.9-alpine
 
-# update openssl to fix https://avd.aquasec.com/nvd/cve-2024-2511 
-RUN apk update && apk add openssl=3.1.4-r6 jq
+RUN apk update && apk add jq
 
 ARG VERSION=0.26.1
 RUN wget https://dl.smallstep.com/gh-release/cli/gh-release-header/v${VERSION}/step_linux_${VERSION}_amd64.tar.gz \

diff --git a/examples/fl/cert/project/get_cert.sh b/examples/fl/cert/project/get_cert.sh
@@ -83,5 +83,13 @@ step ca certificate --ca-url $CA_ADDRESS:$CA_PORT \
     $PROVISIONER_ARGS \
     $MEDPERF_INPUT_CN $cert_path $key_path
 
+EXITSTATUS="$?"
+if [ $EXITSTATUS -ne "0" ]; then
+    echo "Failed to get the certificate"
+    # cleanup
+    rm -rf $STEPPATH
+    exit 1
+fi
+
 # cleanup
 rm -rf $STEPPATH
diff --git a/examples/fl/cert/project/trust.sh b/examples/fl/cert/project/trust.sh
@@ -47,6 +47,13 @@ if [ -n "$CA_FINGERPRINT" ]; then
 else
     wget -O $pki_assets/root_ca.crt $CA_ADDRESS:$CA_PORT/roots.pem
 fi
+EXITSTATUS="$?"
+if [ $EXITSTATUS -ne "0" ]; then
+    echo "Failed to retrieve the root certificate"
+    # cleanup
+    rm -rf $STEPPATH
+    exit 1
+fi
 
 # cleanup
 rm -rf $STEPPATH