
Commit

Merge remote-tracking branch 'origin/main' into debug-logging

VukW committed Feb 1, 2024
2 parents e82a1bc + 71582b7 commit 29e7955
Showing 19 changed files with 174 additions and 115 deletions.
1 change: 1 addition & 0 deletions .github/workflows/docker-ci.yml
@@ -18,6 +18,7 @@ jobs:
python-version: '3.9'

- name: Install Singularity
working-directory: ..
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev uuid-dev libgpgme11-dev \
3 changes: 3 additions & 0 deletions .github/workflows/local-ci.yml
@@ -16,6 +16,9 @@ jobs:
python-version: '3.9'

- name: Install Singularity
# Singularity installation shouldn't happen inside another git repo; otherwise
# the singularity installer will set that repo's tag/commit as the singularity version.
working-directory: ..
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev uuid-dev libgpgme11-dev \
16 changes: 9 additions & 7 deletions cli/cli_tests.sh
@@ -107,6 +107,7 @@ medperf mlcube submit --name model2 -m $MODEL_MLCUBE -p $MODEL2_PARAMS -a $MODEL
checkFailed "Model2 submission failed"
MODEL2_UID=$(medperf mlcube ls | tail -n 1 | tr -s ' ' | cut -d ' ' -f 2)

# MLCube with singularity section
medperf mlcube submit --name model3 -m $MODEL_WITH_SINGULARITY -p $MODEL3_PARAMS -a $MODEL_ADD -i $MODEL_SING_IMAGE
checkFailed "Model3 submission failed"
MODEL3_UID=$(medperf mlcube ls | tail -n 1 | tr -s ' ' | cut -d ' ' -f 2)
@@ -236,11 +237,12 @@ echo "\n"

##########################################################
echo "====================================="
echo "Running model3 association"
echo "Running model3 association (with singularity)"
echo "====================================="
# medperf --platform singularity mlcube associate -m $MODEL3_UID -b $BMK_UID -y
# TMP: revert to singularity when MLCube issue is fixed
medperf mlcube associate -m $MODEL3_UID -b $BMK_UID -y
# this will run two types of singularity mlcubes:
# 1) an already built singularity image (model 3)
# 2) a docker image to be converted (metrics)
medperf --platform singularity mlcube associate -m $MODEL3_UID -b $BMK_UID -y
checkFailed "Model3 association failed"
##########################################################

@@ -312,10 +314,10 @@ echo "\n"

##########################################################
echo "====================================="
echo "Running model2"
echo "Running model3 (with singularity)"
echo "====================================="
medperf run -b $BMK_UID -d $DSET_A_UID -m $MODEL2_UID -y
checkFailed "Model2 run failed"
medperf --platform=singularity run -b $BMK_UID -d $DSET_A_UID -m $MODEL3_UID -y
checkFailed "Model3 run failed"
##########################################################

echo "\n"
1 change: 1 addition & 0 deletions cli/medperf/commands/compatibility_test/utils.py
@@ -95,5 +95,6 @@ def prepare_cube(cube_uid: str):
def get_cube(uid: int, name: str, local_only: bool = False) -> Cube:
config.ui.text = f"Retrieving {name} cube"
cube = Cube.get(uid, local_only=local_only)
cube.download_run_files()
config.ui.print(f"> {name} cube download complete")
return cube
1 change: 1 addition & 0 deletions cli/medperf/commands/dataset/create.py
@@ -96,6 +96,7 @@ def get_prep_cube(self):
self.ui.print(f"Benchmark Data Preparation: {benchmark.name}")
self.ui.text = "Retrieving data preparation cube"
self.cube = Cube.get(cube_uid)
self.cube.download_run_files()
self.ui.print("> Preparation cube download complete")

def run_cube_tasks(self):
3 changes: 2 additions & 1 deletion cli/medperf/commands/mlcube/submit.py
@@ -31,7 +31,8 @@ def __init__(self, submit_info: dict):
config.tmp_paths.append(self.cube.path)

def download(self):
self.cube.download()
self.cube.download_config_files()
self.cube.download_run_files()

def upload(self):
updated_body = self.cube.upload()
1 change: 1 addition & 0 deletions cli/medperf/commands/result/create.py
@@ -153,6 +153,7 @@ def load_cached_results(self):
def __get_cube(self, uid: int, name: str) -> Cube:
self.ui.text = f"Retrieving {name} cube"
cube = Cube.get(uid)
cube.download_run_files()
self.ui.print(f"> {name} cube download complete")
return cube

10 changes: 7 additions & 3 deletions cli/medperf/decorators.py
@@ -208,9 +208,13 @@ def wrapper(
"--gpus",
help="""
What GPUs to expose to MLCube.
Accepted Values are comma separated GPU IDs (e.g "1,2"), or \"all\".
MLCubes that aren't configured to use GPUs won't be affected by this.
Defaults to all available GPUs""",
Accepted Values are:\n
- "" or 0: to expose no GPUs (e.g.: --gpus="")\n
- "all": to expose all GPUs. (e.g.: --gpus=all)\n
- an integer: to expose a certain number of GPUs. ONLY AVAILABLE FOR DOCKER
(e.g., --gpus=2 to expose 2 GPUs)\n
- Form "device=<id1>,<id2>": to expose specific GPUs.
(e.g., --gpus="device=0,2")\n""",
),
cleanup: bool = typer.Option(
config.cleanup,
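The accepted --gpus values above amount to a small value grammar. A minimal validator for that grammar is sketched below, purely as an illustration of the documented forms; the function name and error handling are assumptions, not MedPerf client code.

import re

# Illustration only: validates the --gpus forms documented in the help text above.
def validate_gpus(value: str) -> str:
    if value in ("", "0"):                            # "" or 0: expose no GPUs
        return value
    if value == "all":                                # all: expose every available GPU
        return value
    if value.isdigit():                               # integer: expose N GPUs (Docker only)
        return value
    if re.fullmatch(r"device=\d+(,\d+)*", value):     # device=<id1>,<id2>: expose specific GPUs
        return value
    raise ValueError(f"Unsupported --gpus value: {value!r}")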
126 changes: 86 additions & 40 deletions cli/medperf/entities/cube.py
@@ -167,7 +167,7 @@ def get(cls, cube_uid: Union[str, int], local_only: bool = False) -> "Cube":

if not cube.is_valid:
raise InvalidEntityError("The requested MLCube is marked as INVALID.")
cube.download()
cube.download_config_files()
return cube

@classmethod
@@ -211,45 +211,66 @@ def download_additional(self):
def download_image(self):
url = self.image_tarball_url
tarball_hash = self.image_tarball_hash
img_hash = self.image_hash

if url:
_, local_hash = resources.get_cube_image(url, self.path, tarball_hash)
self.image_tarball_hash = local_hash
else:
# Retrieve image from image registry
logging.debug(f"Retrieving {self.id} image")
cmd = f"mlcube configure --mlcube={self.cube_path}"
with pexpect.spawn(cmd, timeout=config.mlcube_configure_timeout) as proc:
proc_out = proc.read()
if proc.exitstatus != 0:
raise ExecutionError(
"There was an error while retrieving the MLCube image"
)
logging.debug(proc_out)

# Retrieve image hash from MLCube
logging.debug(f"Retrieving {self.id} image hash")
tmp_out_yaml = generate_tmp_path()
cmd = f"mlcube inspect --mlcube={self.cube_path} --format=yaml"
cmd += f" --output-file {tmp_out_yaml}"
with pexpect.spawn(cmd, timeout=config.mlcube_inspect_timeout) as proc:
proc_stdout = proc.read()
logging.debug(proc_stdout)
if proc.exitstatus != 0:
raise ExecutionError(
"There was an error while inspecting the image hash"
)
with open(tmp_out_yaml) as f:
mlcube_details = yaml.safe_load(f)
remove_path(tmp_out_yaml)
local_hash = mlcube_details["hash"]
verify_hash(local_hash, img_hash)
self.image_hash = local_hash

def download(self):
"""Downloads the required elements for an mlcube to run locally."""
if config.platform == "docker":
# For docker, image should be pulled before calculating its hash
self._get_image_from_registry()
self._set_image_hash_from_registry()
elif config.platform == "singularity":
# For singularity, we need the hash first before trying to convert
self._set_image_hash_from_registry()

image_folder = os.path.join(config.cubes_folder, config.image_path)
if os.path.exists(image_folder):
for file in os.listdir(image_folder):
if file == self._converted_singularity_image_name:
return
remove_path(os.path.join(image_folder, file))

self._get_image_from_registry()
else:
# TODO: such a check should happen on commands entrypoints, not here
raise InvalidArgumentError("Unsupported platform")

@property
def _converted_singularity_image_name(self):
return f"{self.image_hash}.sif"

def _set_image_hash_from_registry(self):
# Retrieve image hash from MLCube
logging.debug(f"Retrieving {self.id} image hash")
tmp_out_yaml = generate_tmp_path()
cmd = f"mlcube inspect --mlcube={self.cube_path} --format=yaml"
cmd += f" --platform={config.platform} --output-file {tmp_out_yaml}"
with pexpect.spawn(cmd, timeout=config.mlcube_inspect_timeout) as proc:
proc_stdout = proc.read()
logging.debug(proc_stdout)
if proc.exitstatus != 0:
raise ExecutionError("There was an error while inspecting the image hash")
with open(tmp_out_yaml) as f:
mlcube_details = yaml.safe_load(f)
remove_path(tmp_out_yaml)
local_hash = mlcube_details["hash"]
verify_hash(local_hash, self.image_hash)
self.image_hash = local_hash

def _get_image_from_registry(self):
# Retrieve image from image registry
logging.debug(f"Retrieving {self.id} image")
cmd = f"mlcube configure --mlcube={self.cube_path} --platform={config.platform}"
if config.platform == "singularity":
cmd += f" -Psingularity.image={self._converted_singularity_image_name}"
with pexpect.spawn(cmd, timeout=config.mlcube_configure_timeout) as proc:
proc_out = proc.read()
if proc.exitstatus != 0:
raise ExecutionError("There was an error while retrieving the MLCube image")
logging.debug(proc_out)

def download_config_files(self):
try:
self.download_mlcube()
except InvalidEntityError as e:
@@ -260,6 +281,7 @@ def download(self):
except InvalidEntityError as e:
raise InvalidEntityError(f"MLCube {self.name} parameters file: {e}")

def download_run_files(self):
try:
self.download_additional()
except InvalidEntityError as e:
@@ -300,12 +322,36 @@ def run(
cmd_arg = f'{k}="{v}"'
cmd = " ".join([cmd, cmd_arg])

cpu_args = self.get_config("docker.cpu_args") or ""
gpu_args = self.get_config("docker.gpu_args") or ""
cpu_args = " ".join([cpu_args, "-u $(id -u):$(id -g)"]).strip()
gpu_args = " ".join([gpu_args, "-u $(id -u):$(id -g)"]).strip()
cmd += f' -Pdocker.cpu_args="{cpu_args}"'
cmd += f' -Pdocker.gpu_args="{gpu_args}"'
# TODO: we should override run args instead of what we are doing below
# we shouldn't allow arbitrary run args unless our client allows it
if config.platform == "docker":
# use current user
cpu_args = self.get_config("docker.cpu_args") or ""
gpu_args = self.get_config("docker.gpu_args") or ""
cpu_args = " ".join([cpu_args, "-u $(id -u):$(id -g)"]).strip()
gpu_args = " ".join([gpu_args, "-u $(id -u):$(id -g)"]).strip()
cmd += f' -Pdocker.cpu_args="{cpu_args}"'
cmd += f' -Pdocker.gpu_args="{gpu_args}"'
elif config.platform == "singularity":
# use -e to discard host env vars, -C to isolate the container (see singularity run --help)
run_args = self.get_config("singularity.run_args") or ""
run_args = " ".join([run_args, "-eC"]).strip()
cmd += f' -Psingularity.run_args="{run_args}"'

# set image name in case of running docker image with singularity
# Assuming we only accept mlcube.yamls with either singularity or docker sections
# TODO: make checks on submitted mlcubes
singularity_config = self.get_config("singularity")
if singularity_config is None:
cmd += (
f' -Psingularity.image="{self._converted_singularity_image_name}"'
)
else:
raise InvalidArgumentError("Unsupported platform")

# set accelerator count to zero to avoid unexpected behaviours and
# force mlcube to only use --gpus to figure out GPU config
cmd += " -Pplatform.accelerator_count=0"

container_loglevel = config.container_loglevel
if container_loglevel:
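Together with the command-level changes above (compatibility_test utils, dataset create, mlcube submit, result create), this refactor splits cube downloads into a lightweight config phase and a heavier run phase. A minimal caller-side sketch of the intended flow follows; it is a sketch only — the helper name is hypothetical, the import path is inferred from the file location, and whether download_run_files() also fetches the container image is inferred from the surrounding changes.

from medperf.entities.cube import Cube  # import path inferred from cli/medperf/entities/cube.py

def prepare_cube_for_execution(cube_uid: int) -> Cube:
    # Phase 1: Cube.get() now calls download_config_files(), fetching and
    # validating only the mlcube.yaml and parameters file.
    cube = Cube.get(cube_uid)

    # Phase 2: callers that actually execute the cube fetch the remaining
    # assets explicitly, mirroring dataset/create.py and result/create.py above.
    cube.download_run_files()
    return cube

SubmitCube.download() is the one caller that still needs both phases up front, which is why it now calls download_config_files() and download_run_files() back to back.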
17 changes: 9 additions & 8 deletions cli/medperf/tests/commands/dataset/test_create.py
@@ -4,7 +4,7 @@
import pytest
from unittest.mock import call

from medperf.tests.mocks import MockCube
from medperf.tests.mocks import TestCube
from medperf.tests.mocks.benchmark import TestBenchmark
from medperf.tests.mocks.dataset import TestDataset
from medperf.commands.dataset.create import DataPreparation
@@ -39,7 +39,8 @@ def preparation(mocker, comms, ui):
DESCRIPTION,
LOCATION,
)
mocker.patch(PATCH_DATAPREP.format("Cube.get"), return_value=MockCube(True))
mocker.patch(PATCH_DATAPREP.format("Cube.get"), return_value=TestCube())
mocker.patch(PATCH_DATAPREP.format("Cube.download_run_files"))
preparation.get_prep_cube()
preparation.data_path = DATA_PATH
preparation.labels_path = LABELS_PATH
@@ -75,16 +76,16 @@ def test_get_prep_cube_gets_prep_cube_if_provided(
self, mocker, cube_uid, comms, ui
):
# Arrange
spy = mocker.patch(
PATCH_DATAPREP.format("Cube.get"), return_value=MockCube(True)
)
spy = mocker.patch(PATCH_DATAPREP.format("Cube.get"), return_value=TestCube())
down_spy = mocker.patch(PATCH_DATAPREP.format("Cube.download_run_files"))

# Act
preparation = DataPreparation(None, cube_uid, *[""] * 5)
preparation.get_prep_cube()

# Assert
spy.assert_called_once_with(cube_uid)
down_spy.assert_called_once()

@pytest.mark.parametrize("cube_uid", [998, 68, 109])
def test_get_prep_cube_gets_benchmark_cube_if_provided(
@@ -93,16 +94,16 @@ def test_get_prep_cube_gets_benchmark_cube_if_provided(
# Arrange
benchmark = TestBenchmark(data_preparation_mlcube=cube_uid)
mocker.patch(PATCH_DATAPREP.format("Benchmark.get"), return_value=benchmark)
spy = mocker.patch(
PATCH_DATAPREP.format("Cube.get"), return_value=MockCube(True)
)
spy = mocker.patch(PATCH_DATAPREP.format("Cube.get"), return_value=TestCube())
down_spy = mocker.patch(PATCH_DATAPREP.format("Cube.download_run_files"))

# Act
preparation = DataPreparation(cube_uid, None, *[""] * 5)
preparation.get_prep_cube()

# Assert
spy.assert_called_once_with(cube_uid)
down_spy.assert_called_once()

def test_run_cube_tasks_runs_required_tasks(self, mocker, preparation):
# Arrange
10 changes: 6 additions & 4 deletions cli/medperf/tests/commands/mlcube/test_submit.py
@@ -10,7 +10,8 @@

@pytest.fixture
def cube(mocker):
mocker.patch(PATCH_MLCUBE.format("Cube.download"))
mocker.patch(PATCH_MLCUBE.format("Cube.download_config_files"))
mocker.patch(PATCH_MLCUBE.format("Cube.download_run_files"))
mocker.patch(PATCH_MLCUBE.format("Cube.upload"))
mocker.patch(PATCH_MLCUBE.format("Cube.write"))
return TestCube()
@@ -90,10 +91,11 @@ def test_upload_uploads_using_entity(mocker, comms, ui, cube):

def test_download_executes_expected_commands(mocker, comms, ui, cube):
submission = SubmitCube(cube.todict())
down_spy = mocker.patch(PATCH_MLCUBE.format("Cube.download"))

config_down_spy = mocker.patch(PATCH_MLCUBE.format("Cube.download_config_files"))
run_down_spy = mocker.patch(PATCH_MLCUBE.format("Cube.download_run_files"))
# Act
submission.download()

# Assert
down_spy.assert_called_once_with()
config_down_spy.assert_called_once_with()
run_down_spy.assert_called_once_with()
1 change: 1 addition & 0 deletions cli/medperf/tests/commands/result/test_create.py
@@ -74,6 +74,7 @@ def __get_side_effect(id):
return cube

mocker.patch(PATCH_EXECUTION.format("Cube.get"), side_effect=__get_side_effect)
mocker.patch(PATCH_EXECUTION.format("Cube.download_run_files"))


def mock_execution(mocker, state_variables):