Fix dataset hashes not taking labels into account (#493)

* Add Data Preparator cookiecutter template * Rename cookiecutter folder * Temporarily remove possibly offending files * Remove cookicutter conditionals * Inclube back missing pieces of template * remove cookiecutter typo * Use project_name attribute * Change cookiecutter fields order * Create empty directories on hook * Fix empty folders paths * Create evaluator mlcube cookiecutter template * Fix JSON Syntax Error * Update template default values * Remove reference to undefined template variable * Implement model mlcube cookiecutter template * Update cookiecutter variable default values * Create medperf CLI command for creating MLCubes * Provide additional options for mlcube create * Start working on tests * Add tests for cube create * Ignore invalid syntax on cookiecutter conditionals * Ignore more flake8 errors * Remove unused import * Empty commit for cloudbuild * Fix inconsistency with labels paths * Update mlcube.yaml so it can be commented on docs * Don't render noqa comments on template * Remove flake8 specific ignores * Exclude templates from lint checks * Remove specific flake8 ignores * Fix labels_paht being passed in he wrong situation * Add requirements to cookiecutters * Set separate labels as true by default * Remove duplicate templates * Abstract field-error dict formatting * Reformat errors dictionary for printing * Allow passing multiple folders for hashing * Generate dataset hashes with data and labels * Fix linter issue * Fix utils using outdated folder hash function * Fix tests * Add note about hash ordering --------- Co-authored-by: hasan7n <[email protected]>
mlcommons · Dec 1, 2023 · 5a6a1e3 · 5a6a1e3
1 parent 8c26674
commit 5a6a1e3
Show file tree

Hide file tree

Showing 5 changed files with 42 additions and 24 deletions.
diff --git a/cli/medperf/commands/compatibility_test/utils.py b/cli/medperf/commands/compatibility_test/utils.py
@@ -1,4 +1,4 @@
-from medperf.utils import storage_path, get_folder_hash
+from medperf.utils import storage_path, get_folders_hash
 from medperf.exceptions import InvalidArgumentError, InvalidEntityError
 
 from medperf.comms.entity_resources import resources
@@ -34,7 +34,7 @@ def download_demo_data(dset_url, dset_hash):
 
 
 def prepare_local_cube(path):
-    temp_uid = get_folder_hash(path)
+    temp_uid = get_folders_hash([path])
     cubes_storage = storage_path(config.cubes_storage)
     dst = os.path.join(cubes_storage, temp_uid)
     os.symlink(path, dst)

diff --git a/cli/medperf/commands/dataset/create.py b/cli/medperf/commands/dataset/create.py
@@ -9,7 +9,7 @@
 from medperf.utils import (
     remove_path,
     generate_tmp_path,
-    get_folder_hash,
+    get_folders_hash,
     storage_path,
 )
 from medperf.exceptions import InvalidArgumentError
@@ -176,8 +176,8 @@ def get_statistics(self):
 
     def generate_uids(self):
         """Auto-generates dataset UIDs for both input and output paths"""
-        self.in_uid = get_folder_hash(self.data_path)
-        self.generated_uid = get_folder_hash(self.out_datapath)
+        self.in_uid = get_folders_hash([self.data_path, self.labels_path])
+        self.generated_uid = get_folders_hash([self.out_datapath, self.out_labelspath])
 
     def to_permanent_path(self) -> str:
         """Renames the temporary data folder to permanent one using the hash of

diff --git a/cli/medperf/tests/commands/dataset/test_create.py b/cli/medperf/tests/commands/dataset/test_create.py
@@ -195,15 +195,30 @@ def test_fails_if_invalid_params(self, mocker, benchmark_uid, cube_uid, comms, u
         else:
             preparation.validate()
 
-    @pytest.mark.parametrize("in_path", ["data_path", "input_path", "/usr/data/path"])
-    @pytest.mark.parametrize("out_path", ["out_path", "~/.medperf/data/123"])
+    @pytest.mark.parametrize(
+        "in_path",
+        [
+            ["data", "labels"],
+            ["in_data", "in_labels"],
+            ["/usr/data/path", "usr/labels/path"],
+        ],
+    )
+    @pytest.mark.parametrize(
+        "out_path",
+        [
+            ["out_data", "out_labels"],
+            ["~/.medperf/data/123/data", "~/.medperf/data/123/labels"],
+        ],
+    )
     def test_generate_uids_assigns_uids_to_obj_properties(
         self, mocker, in_path, out_path, preparation
     ):
         # Arrange
-        mocker.patch(PATCH_DATAPREP.format("get_folder_hash"), side_effect=lambda x: x)
-        preparation.data_path = in_path
-        preparation.out_datapath = out_path
+        mocker.patch(PATCH_DATAPREP.format("get_folders_hash"), side_effect=lambda x: x)
+        preparation.data_path = in_path[0]
+        preparation.labels_path = in_path[1]
+        preparation.out_datapath = out_path[0]
+        preparation.out_labelspath = out_path[1]
 
         # Act
         preparation.generate_uids()

diff --git a/cli/medperf/tests/test_utils.py b/cli/medperf/tests/test_utils.py
@@ -344,7 +344,7 @@ def test_dict_pretty_print_passes_clean_dict_to_yaml(mocker, ui, dict_with_nones
     spy.assert_called_once_with(exp_dict)
 
 
-def test_get_folder_hash_hashes_all_files_in_folder(mocker, filesystem):
+def test_get_folders_hash_hashes_all_files_in_folder(mocker, filesystem):
     # Arrange
     fs = filesystem[0]
     files = filesystem[1]
@@ -353,13 +353,13 @@ def test_get_folder_hash_hashes_all_files_in_folder(mocker, filesystem):
     spy = mocker.patch(patch_utils.format("get_file_hash"), side_effect=files)
 
     # Act
-    utils.get_folder_hash("test")
+    utils.get_folders_hash(["test"])
 
     # Assert
     spy.assert_has_calls(exp_calls)
 
 
-def test_get_folder_hash_sorts_individual_hashes(mocker, filesystem):
+def test_get_folders_hash_sorts_individual_hashes(mocker, filesystem):
     # Arrange
     fs = filesystem[0]
     files = filesystem[1]
@@ -368,21 +368,21 @@ def test_get_folder_hash_sorts_individual_hashes(mocker, filesystem):
     spy = mocker.patch("builtins.sorted", side_effect=sorted)
 
     # Act
-    utils.get_folder_hash("test")
+    utils.get_folders_hash(["test"])
 
     # Assert
     spy.assert_called_once_with(files)
 
 
-def test_get_folder_hash_returns_expected_hash(mocker, filesystem):
+def test_get_folders_hash_returns_expected_hash(mocker, filesystem):
     # Arrange
     fs = filesystem[0]
     files = filesystem[1]
     mocker.patch("os.walk", return_value=fs)
     mocker.patch(patch_utils.format("get_file_hash"), side_effect=files)
 
     # Act
-    hash = utils.get_folder_hash("test")
+    hash = utils.get_folders_hash(["test"])
 
     # Assert
     assert hash == "b7e9365f1e796ba29e9e6b1b94b5f4cc7238530601fad8ec96ece9fee68c3d7f"

diff --git a/cli/medperf/utils.py b/cli/medperf/utils.py
@@ -382,9 +382,9 @@ def combine_proc_sp_text(proc: spawn) -> str:
     return proc_out
 
 
-def get_folder_hash(path: str) -> str:
-    """Generates a hash for all the contents of the folder. This procedure
-    hashes all of the files in the folder, sorts them and then hashes that list.
+def get_folders_hash(paths: List[str]) -> str:
+    """Generates a hash for all the contents of the fiven folders. This procedure
+    hashes all of the files in all passed folders, sorts them and then hashes that list.
 
     Args:
         path (str): Folder to hash
@@ -393,11 +393,14 @@ def get_folder_hash(path: str) -> str:
         str: sha256 hash of the whole folder
     """
     hashes = []
-    for root, _, files in os.walk(path, topdown=False):
-        for file in files:
-            logging.debug(f"Hashing file {file}")
-            filepath = os.path.join(root, file)
-            hashes.append(get_file_hash(filepath))
+
+    # The hash doesn't depend on the order of paths or folders, as the hashes get sorted after the fact
+    for path in paths:
+        for root, _, files in os.walk(path, topdown=False):
+            for file in files:
+                logging.debug(f"Hashing file {file}")
+                filepath = os.path.join(root, file)
+                hashes.append(get_file_hash(filepath))
 
     hashes = sorted(hashes)
     sha = hashlib.sha256()