Skip to content

Commit

Permalink
Merge branch 'fl-poc' of https://github.com/hasan7n/medperf into be_e…
Browse files Browse the repository at this point in the history
…nable_partial_epochs
  • Loading branch information
brandon-edwards committed Oct 16, 2024
2 parents 2dcd792 + 26b4337 commit f0e0170
Show file tree
Hide file tree
Showing 39 changed files with 630 additions and 288 deletions.
61 changes: 60 additions & 1 deletion cli/cli_tests_training.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ print_eval medperf profile create -n testdata1
checkFailed "testdata1 profile creation failed"
print_eval medperf profile create -n testdata2
checkFailed "testdata2 profile creation failed"
print_eval medperf profile create -n fladmin
checkFailed "fladmin profile creation failed"
##########################################################

echo "\n"
Expand Down Expand Up @@ -71,6 +73,13 @@ checkFailed "testdata2 profile activation failed"

print_eval medperf auth login -e $DATAOWNER2
checkFailed "testdata2 login failed"

print_eval medperf profile activate fladmin
checkFailed "fladmin profile activation failed"

print_eval medperf auth login -e $FLADMIN
checkFailed "fladmin login failed"

##########################################################

echo "\n"
Expand All @@ -97,6 +106,11 @@ PREP_UID=$(medperf mlcube ls | grep trainprep | head -n 1 | tr -s ' ' | cut -d '
print_eval medperf mlcube submit --name traincube -m $TRAIN_MLCUBE -a $TRAIN_WEIGHTS --operational
checkFailed "traincube submission failed"
TRAINCUBE_UID=$(medperf mlcube ls | grep traincube | head -n 1 | tr -s ' ' | cut -d ' ' -f 2)

print_eval medperf mlcube submit --name fladmincube -m $FLADMIN_MLCUBE --operational
checkFailed "fladmincube submission failed"
FLADMINCUBE_UID=$(medperf mlcube ls | grep fladmincube | head -n 1 | tr -s ' ' | cut -d ' ' -f 2)

##########################################################

echo "\n"
Expand All @@ -105,7 +119,7 @@ echo "\n"
echo "====================================="
echo "Submit Training Experiment"
echo "====================================="
print_eval medperf training submit -n trainexp -d trainexp -p $PREP_UID -m $TRAINCUBE_UID
print_eval medperf training submit -n trainexp -d trainexp -p $PREP_UID -m $TRAINCUBE_UID -a $FLADMINCUBE_UID
checkFailed "Training exp submission failed"
TRAINING_UID=$(medperf training ls | grep trainexp | tail -n 1 | tr -s ' ' | cut -d ' ' -f 2)

Expand Down Expand Up @@ -403,6 +417,51 @@ fi

echo "\n"

##########################################################
# FL admin steps of the training CLI test.
# NOTE(review): print_eval and checkFailed are helpers defined outside this
# excerpt; presumably they echo-and-run a command and abort the test on a
# non-zero exit status, respectively — confirm against their definitions.
echo "====================================="
echo "Activate fladmin profile"
echo "====================================="
# Make fladmin the active MedPerf profile for the commands that follow.
print_eval medperf profile activate fladmin
checkFailed "fladmin profile activation failed"
##########################################################

echo "\n"

##########################################################
echo "====================================="
echo "Get fladmin certificate"
echo "====================================="
# Request a client certificate for the training experiment ($TRAINING_UID).
print_eval medperf certificate get_client_certificate -t $TRAINING_UID
checkFailed "Get fladmin cert failed"
##########################################################

echo "\n"

##########################################################
echo "====================================="
echo "Check experiment status"
echo "====================================="
# Query the experiment status twice, a few seconds apart, so the command is
# exercised both on its first run and on a repeated run.
print_eval medperf training get_experiment_status -t $TRAINING_UID
checkFailed "Get experiment status failed"

sleep 3 # sleep some time then get status again

print_eval medperf training get_experiment_status -t $TRAINING_UID
checkFailed "Get experiment status failed"
##########################################################

echo "\n"

##########################################################
echo "====================================="
echo "Update plan parameter"
echo "====================================="
# Update a single plan field, addressed by its dotted path (-f), to a new
# value (-v).
print_eval medperf training update_plan -t $TRAINING_UID -f "straggler_handling_policy.settings.straggler_cutoff_time" -v 1200
checkFailed "Update plan failed"
##########################################################

echo "\n"

##########################################################
echo "====================================="
echo "Waiting for other prcocesses to exit successfully"
Expand Down
88 changes: 88 additions & 0 deletions cli/medperf/commands/training/get_experiment_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from medperf import config
from medperf.account_management.account_management import get_medperf_user_data
from medperf.entities.ca import CA
from medperf.entities.training_exp import TrainingExp
from medperf.entities.cube import Cube
from medperf.utils import (
get_pki_assets_path,
generate_tmp_path,
dict_pretty_print,
remove_path,
)
from medperf.certificates import trust
import yaml
import os


class GetExperimentStatus:
    """Retrieve, display and store the status of a training experiment.

    The status is obtained by running the ``get_experiment_status`` task of
    the experiment's FL Admin MLCube. The task is handed an output path for a
    status file, which is then loaded as YAML, printed, and moved to the
    experiment's ``status_path``.
    """

    @classmethod
    def run(cls, training_exp_id: int):
        """Get, print and store the current status of a training experiment.

        Args:
            training_exp_id (int): Training experiment UID.
        """
        execution = cls(training_exp_id)
        execution.prepare()
        execution.prepare_plan()
        execution.prepare_pki_assets()
        # NOTE(review): both steps below set `ui.text`, so both are assumed to
        # belong inside the interactive context — confirm against the
        # original indentation.
        with config.ui.interactive():
            execution.prepare_admin_cube()
            execution.get_experiment_status()
        execution.print_experiment_status()
        execution.store_status()

    def __init__(self, training_exp_id: int) -> None:
        self.training_exp_id = training_exp_id
        self.ui = config.ui

    def prepare(self):
        """Load the experiment and set up the user email and temporary paths."""
        self.training_exp = TrainingExp.get(self.training_exp_id)
        self.ui.print(f"Training Experiment: {self.training_exp.name}")
        self.user_email: str = get_medperf_user_data()["email"]
        # Temporary location the MLCube task is told to write the status to.
        self.status_output = generate_tmp_path()
        self.temp_dir = generate_tmp_path()

    def prepare_plan(self):
        """Delegate plan preparation to the training experiment entity."""
        self.training_exp.prepare_plan()

    def prepare_pki_assets(self):
        """Trust the experiment's CA and locate the user's PKI assets for it."""
        ca = CA.from_experiment(self.training_exp_id)
        trust(ca)
        self.admin_pki_assets = get_pki_assets_path(self.user_email, ca.name)
        self.ca = ca

    def prepare_admin_cube(self):
        """Fetch the FL Admin MLCube referenced by the training experiment."""
        self.cube = self.__get_cube(self.training_exp.fl_admin_mlcube, "FL Admin")

    def __get_cube(self, uid: int, name: str) -> Cube:
        """Retrieve the MLCube with the given UID and download its run files.

        Args:
            uid (int): MLCube UID.
            name (str): Human-readable cube name used in UI messages.

        Returns:
            Cube: The retrieved MLCube.
        """
        # Use the cube's display name: this helper fetches the FL Admin cube,
        # not the training cube.
        self.ui.text = (
            f"Retrieving and setting up {name} MLCube. This may take some time."
        )
        cube = Cube.get(uid)
        cube.download_run_files()
        self.ui.print(f"> {name} cube download complete")
        return cube

    def get_experiment_status(self):
        """Run the ``get_experiment_status`` task of the FL Admin MLCube."""
        env_dict = {"MEDPERF_ADMIN_PARTICIPANT_CN": self.user_email}
        params = {
            "node_cert_folder": self.admin_pki_assets,
            "ca_cert_folder": self.ca.pki_assets,
            "plan_path": self.training_exp.plan_path,
            "output_status_file": self.status_output,
            "temp_dir": self.temp_dir,
        }

        self.ui.text = "Getting training experiment status"
        self.cube.run(task="get_experiment_status", env_dict=env_dict, **params)

    def print_experiment_status(self):
        """Load the status file written by the task and pretty-print it."""
        # Explicit encoding so parsing does not depend on the platform default.
        with open(self.status_output, encoding="utf-8") as f:
            contents = yaml.safe_load(f)
        dict_pretty_print(contents)

    def store_status(self):
        """Replace the experiment's stored status file with the new one."""
        import shutil

        new_status_path = self.training_exp.status_path
        remove_path(new_status_path)
        # shutil.move falls back to copy-and-delete when the temporary path
        # and the experiment folder are on different filesystems, where
        # os.rename would raise OSError.
        shutil.move(self.status_output, new_status_path)
34 changes: 33 additions & 1 deletion cli/medperf/commands/training/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from medperf.commands.training.close_event import CloseEvent
from medperf.commands.list import EntityList
from medperf.commands.view import EntityView
from medperf.commands.training.get_experiment_status import GetExperimentStatus
from medperf.commands.training.update_plan import UpdatePlan

app = typer.Typer()

Expand All @@ -28,7 +30,7 @@ def submit(
..., "--fl-mlcube", "-m", help="Reference Model MLCube UID"
),
fl_admin_mlcube: int = typer.Option(
None, "--fl-mlcube", "-a", help="FL admin interface MLCube"
None, "--fl-admin-mlcube", "-a", help="FL admin interface MLCube"
),
operational: bool = typer.Option(
False,
Expand Down Expand Up @@ -86,6 +88,36 @@ def start_event(
config.ui.print("✅ Done!")


@app.command("get_experiment_status")
@clean_except
def get_experiment_status(
    training_exp_id: int = typer.Option(
        ..., "--training_exp_id", "-t", help="UID of the desired training experiment"
    )
):
    """Retrieves, prints and stores the current status of a training experiment"""
    # The docstring above and the option help describe this command to the
    # user; they previously described benchmark execution (copy-paste).
    GetExperimentStatus.run(training_exp_id)
    config.ui.print("✅ Done!")


@app.command("update_plan")
@clean_except
def update_plan(
    training_exp_id: int = typer.Option(
        ..., "--training_exp_id", "-t", help="UID of the desired training experiment"
    ),
    field_name: str = typer.Option(
        ...,
        "--field_name",
        "-f",
        help=(
            "Dotted name of the plan field to update, e.g. "
            "straggler_handling_policy.settings.straggler_cutoff_time"
        ),
    ),
    value: str = typer.Option(
        ..., "--value", "-v", help="New value to assign to the field"
    ),
):
    """Runtime-update of a scalar field of the training plan"""
    # Each option now carries its own help text; previously all three shared
    # the copy-pasted "UID of the desired benchmark".
    UpdatePlan.run(training_exp_id, field_name, value)
    config.ui.print("✅ Done!")


@app.command("close_event")
@clean_except
def close_event(
Expand Down
74 changes: 74 additions & 0 deletions cli/medperf/commands/training/update_plan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from medperf import config
from medperf.account_management.account_management import get_medperf_user_data
from medperf.entities.ca import CA
from medperf.entities.training_exp import TrainingExp
from medperf.entities.cube import Cube
from medperf.utils import get_pki_assets_path, generate_tmp_path
from medperf.certificates import trust


class UpdatePlan:
    """Update a single field of a training experiment's plan at runtime.

    The update is carried out by running the ``update_plan`` task of the
    experiment's FL Admin MLCube; the field name and the new value are handed
    to the task through environment variables.
    """

    @classmethod
    def run(cls, training_exp_id: int, field_name: str, field_value: str):
        """Update one field of the plan of a training experiment.

        Args:
            training_exp_id (int): Training experiment UID.
            field_name (str): Name of the plan field to update.
            field_value (str): New value for the field.
        """
        execution = cls(training_exp_id, field_name, field_value)
        execution.prepare()
        execution.prepare_plan()
        execution.prepare_pki_assets()
        # NOTE(review): both steps below set `ui.text`, so both are assumed to
        # belong inside the interactive context — confirm against the
        # original indentation.
        with config.ui.interactive():
            execution.prepare_admin_cube()
            execution.update_plan()

    def __init__(self, training_exp_id: int, field_name: str, field_value: str) -> None:
        self.training_exp_id = training_exp_id
        self.field_name = field_name
        self.field_value = field_value
        self.ui = config.ui

    def prepare(self):
        """Load the experiment and set up the user email and a temporary path."""
        self.training_exp = TrainingExp.get(self.training_exp_id)
        self.ui.print(f"Training Experiment: {self.training_exp.name}")
        self.user_email: str = get_medperf_user_data()["email"]
        self.temp_dir = generate_tmp_path()

    def prepare_plan(self):
        """Delegate plan preparation to the training experiment entity."""
        self.training_exp.prepare_plan()

    def prepare_pki_assets(self):
        """Trust the experiment's CA and locate the user's PKI assets for it."""
        ca = CA.from_experiment(self.training_exp_id)
        trust(ca)
        self.admin_pki_assets = get_pki_assets_path(self.user_email, ca.name)
        self.ca = ca

    def prepare_admin_cube(self):
        """Fetch the FL Admin MLCube referenced by the training experiment."""
        self.cube = self.__get_cube(self.training_exp.fl_admin_mlcube, "FL Admin")

    def __get_cube(self, uid: int, name: str) -> Cube:
        """Retrieve the MLCube with the given UID and download its run files.

        Args:
            uid (int): MLCube UID.
            name (str): Human-readable cube name used in UI messages.

        Returns:
            Cube: The retrieved MLCube.
        """
        # Use the cube's display name: this helper fetches the FL Admin cube,
        # not the training cube.
        self.ui.text = (
            f"Retrieving and setting up {name} MLCube. This may take some time."
        )
        cube = Cube.get(uid)
        cube.download_run_files()
        self.ui.print(f"> {name} cube download complete")
        return cube

    def update_plan(self):
        """Run the ``update_plan`` task of the FL Admin MLCube."""
        # The field to change and its new value travel as environment
        # variables rather than as task parameters.
        env_dict = {
            "MEDPERF_ADMIN_PARTICIPANT_CN": self.user_email,
            "MEDPERF_UPDATE_FIELD_NAME": self.field_name,
            "MEDPERF_UPDATE_FIELD_VALUE": self.field_value,
        }

        params = {
            "node_cert_folder": self.admin_pki_assets,
            "ca_cert_folder": self.ca.pki_assets,
            "plan_path": self.training_exp.plan_path,
            "temp_dir": self.temp_dir,
        }

        self.ui.text = "Updating plan"
        self.cube.run(task="update_plan", env_dict=env_dict, **params)
1 change: 1 addition & 0 deletions cli/medperf/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@
training_exps_filename = "training-info.yaml"
participants_list_filename = "cols.yaml"
training_exp_plan_filename = "plan.yaml"
training_exp_status_filename = "status.yaml"
training_report_file = "report.yaml"
training_report_folder = "report"
training_out_agg_logs = "agg_logs"
Expand Down
2 changes: 1 addition & 1 deletion cli/medperf/entities/cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def run(
"get_experiment_status",
"add_collaborator",
"remove_collaborator",
"set_straggler_cuttoff_time",
"update_plan",
]:
cmd += " --network=none"
if config.gpus is not None:
Expand Down
1 change: 1 addition & 0 deletions cli/medperf/entities/training_exp.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __init__(self, *args, **kwargs):

self.generated_uid = self.name
self.plan_path = os.path.join(self.path, config.training_exp_plan_filename)
self.status_path = os.path.join(self.path, config.training_exp_status_filename)

@classmethod
def _Entity__remote_prefilter(cls, filters: dict) -> callable:
Expand Down
2 changes: 2 additions & 0 deletions cli/tests_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ METRIC_PARAMS="$ASSETS_URL/metrics/mlcube/workspace/parameters.yaml"
# FL cubes
TRAIN_MLCUBE="https://raw.githubusercontent.com/hasan7n/medperf/19c80d88deaad27b353d1cb9bc180757534027aa/examples/fl/fl/mlcube/mlcube.yaml"
TRAIN_WEIGHTS="https://storage.googleapis.com/medperf-storage/testfl/init_weights_miccai.tar.gz"
FLADMIN_MLCUBE="https://raw.githubusercontent.com/hasan7n/medperf/bc431ffe6c3b761b28674816e6f26511e8b27042/examples/fl/fl_admin/mlcube/mlcube.yaml"

# test users credentials
MODELOWNER="[email protected]"
Expand All @@ -129,6 +130,7 @@ BENCHMARKOWNER="[email protected]"
ADMIN="[email protected]"
DATAOWNER2="[email protected]"
AGGOWNER="[email protected]"
FLADMIN="[email protected]"

# local MLCubes for local compatibility tests
PREP_LOCAL="$(dirname $(dirname $(realpath "$0")))/examples/chestxray_tutorial/data_preparator/mlcube"
Expand Down
3 changes: 1 addition & 2 deletions examples/fl/cert/project/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
FROM python:3.11.9-alpine

# update openssl to fix https://avd.aquasec.com/nvd/cve-2024-2511
RUN apk update && apk add openssl=3.1.4-r6 jq
RUN apk update && apk add jq

ARG VERSION=0.26.1
RUN wget https://dl.smallstep.com/gh-release/cli/gh-release-header/v${VERSION}/step_linux_${VERSION}_amd64.tar.gz \
Expand Down
8 changes: 8 additions & 0 deletions examples/fl/cert/project/get_cert.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,5 +83,13 @@ step ca certificate --ca-url $CA_ADDRESS:$CA_PORT \
$PROVISIONER_ARGS \
$MEDPERF_INPUT_CN $cert_path $key_path

# Capture the exit status of the command that ran immediately before this
# line, so a failure stops the script instead of being silently ignored.
EXITSTATUS="$?"
if [ "$EXITSTATUS" -ne 0 ]; then
    echo "Failed to get the certificate"
    # cleanup (quoted so the path is never word-split or glob-expanded)
    rm -rf "$STEPPATH"
    exit 1
fi

# cleanup
rm -rf "$STEPPATH"
7 changes: 7 additions & 0 deletions examples/fl/cert/project/trust.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ if [ -n "$CA_FINGERPRINT" ]; then
else
wget -O $pki_assets/root_ca.crt $CA_ADDRESS:$CA_PORT/roots.pem
fi
# Capture the exit status of the if/else directly above, i.e. of whichever
# root-certificate retrieval command ran in the branch that was taken.
EXITSTATUS="$?"
if [ "$EXITSTATUS" -ne 0 ]; then
    echo "Failed to retrieve the root certificate"
    # cleanup (quoted so the path is never word-split or glob-expanded)
    rm -rf "$STEPPATH"
    exit 1
fi

# cleanup
rm -rf "$STEPPATH"
Loading

0 comments on commit f0e0170

Please sign in to comment.