From 5a6a1e3f0a72f0ae0749d92c1563b7e2fbe1951f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?=
 <alejandro.aristizabal24@gmail.com>
Date: Fri, 1 Dec 2023 06:11:53 -0500
Subject: [PATCH] Fix dataset hashes not taking labels into account (#493)

* Add Data Preparator cookiecutter template

* Rename cookiecutter folder

* Temporarily remove possibly offending files

* Remove cookicutter conditionals

* Inclube back missing pieces of template

* remove cookiecutter typo

* Use project_name attribute

* Change cookiecutter fields order

* Create empty directories on hook

* Fix empty folders paths

* Create evaluator mlcube cookiecutter template

* Fix JSON Syntax Error

* Update template default values

* Remove reference to undefined template variable

* Implement model mlcube cookiecutter template

* Update cookiecutter variable default values

* Create medperf CLI command for creating MLCubes

* Provide additional options for mlcube create

* Start working on tests

* Add tests for cube create

* Ignore invalid syntax on cookiecutter conditionals

* Ignore more flake8 errors

* Remove unused import

* Empty commit for cloudbuild

* Fix inconsistency with labels paths

* Update mlcube.yaml so it can be commented on docs

* Don't render noqa comments on template

* Remove flake8 specific ignores

* Exclude templates from lint checks

* Remove specific flake8 ignores

* Fix labels_paht being passed in he wrong situation

* Add requirements to cookiecutters

* Set separate labels as true by default

* Remove duplicate templates

* Abstract field-error dict formatting

* Reformat errors dictionary for printing

* Allow passing multiple folders for hashing

* Generate dataset hashes with data and labels

* Fix linter issue

* Fix utils using outdated folder hash function

* Fix tests

* Add note about hash ordering

---------

Co-authored-by: hasan7n <78664424+hasan7n@users.noreply.github.com>
---
 .../commands/compatibility_test/utils.py      |  4 +--
 cli/medperf/commands/dataset/create.py        |  6 ++---
 .../tests/commands/dataset/test_create.py     | 25 +++++++++++++++----
 cli/medperf/tests/test_utils.py               | 12 ++++-----
 cli/medperf/utils.py                          | 19 ++++++++------
 5 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/cli/medperf/commands/compatibility_test/utils.py b/cli/medperf/commands/compatibility_test/utils.py
index 4f8784c7d..a956a5796 100644
--- a/cli/medperf/commands/compatibility_test/utils.py
+++ b/cli/medperf/commands/compatibility_test/utils.py
@@ -1,4 +1,4 @@
-from medperf.utils import storage_path, get_folder_hash
+from medperf.utils import storage_path, get_folders_hash
 from medperf.exceptions import InvalidArgumentError, InvalidEntityError
 
 from medperf.comms.entity_resources import resources
@@ -34,7 +34,7 @@ def download_demo_data(dset_url, dset_hash):
 
 
 def prepare_local_cube(path):
-    temp_uid = get_folder_hash(path)
+    temp_uid = get_folders_hash([path])
     cubes_storage = storage_path(config.cubes_storage)
     dst = os.path.join(cubes_storage, temp_uid)
     os.symlink(path, dst)
diff --git a/cli/medperf/commands/dataset/create.py b/cli/medperf/commands/dataset/create.py
index 7797e6d7e..10acc9c97 100644
--- a/cli/medperf/commands/dataset/create.py
+++ b/cli/medperf/commands/dataset/create.py
@@ -9,7 +9,7 @@
 from medperf.utils import (
     remove_path,
     generate_tmp_path,
-    get_folder_hash,
+    get_folders_hash,
     storage_path,
 )
 from medperf.exceptions import InvalidArgumentError
@@ -176,8 +176,8 @@ def get_statistics(self):
 
     def generate_uids(self):
         """Auto-generates dataset UIDs for both input and output paths"""
-        self.in_uid = get_folder_hash(self.data_path)
-        self.generated_uid = get_folder_hash(self.out_datapath)
+        self.in_uid = get_folders_hash([self.data_path, self.labels_path])
+        self.generated_uid = get_folders_hash([self.out_datapath, self.out_labelspath])
 
     def to_permanent_path(self) -> str:
         """Renames the temporary data folder to permanent one using the hash of
diff --git a/cli/medperf/tests/commands/dataset/test_create.py b/cli/medperf/tests/commands/dataset/test_create.py
index 5cd1f622d..49e82a355 100644
--- a/cli/medperf/tests/commands/dataset/test_create.py
+++ b/cli/medperf/tests/commands/dataset/test_create.py
@@ -195,15 +195,30 @@ def test_fails_if_invalid_params(self, mocker, benchmark_uid, cube_uid, comms, u
         else:
             preparation.validate()
 
-    @pytest.mark.parametrize("in_path", ["data_path", "input_path", "/usr/data/path"])
-    @pytest.mark.parametrize("out_path", ["out_path", "~/.medperf/data/123"])
+    @pytest.mark.parametrize(
+        "in_path",
+        [
+            ["data", "labels"],
+            ["in_data", "in_labels"],
+            ["/usr/data/path", "usr/labels/path"],
+        ],
+    )
+    @pytest.mark.parametrize(
+        "out_path",
+        [
+            ["out_data", "out_labels"],
+            ["~/.medperf/data/123/data", "~/.medperf/data/123/labels"],
+        ],
+    )
     def test_generate_uids_assigns_uids_to_obj_properties(
         self, mocker, in_path, out_path, preparation
     ):
         # Arrange
-        mocker.patch(PATCH_DATAPREP.format("get_folder_hash"), side_effect=lambda x: x)
-        preparation.data_path = in_path
-        preparation.out_datapath = out_path
+        mocker.patch(PATCH_DATAPREP.format("get_folders_hash"), side_effect=lambda x: x)
+        preparation.data_path = in_path[0]
+        preparation.labels_path = in_path[1]
+        preparation.out_datapath = out_path[0]
+        preparation.out_labelspath = out_path[1]
 
         # Act
         preparation.generate_uids()
diff --git a/cli/medperf/tests/test_utils.py b/cli/medperf/tests/test_utils.py
index 9b32e0b99..4130c70ce 100644
--- a/cli/medperf/tests/test_utils.py
+++ b/cli/medperf/tests/test_utils.py
@@ -344,7 +344,7 @@ def test_dict_pretty_print_passes_clean_dict_to_yaml(mocker, ui, dict_with_nones
     spy.assert_called_once_with(exp_dict)
 
 
-def test_get_folder_hash_hashes_all_files_in_folder(mocker, filesystem):
+def test_get_folders_hash_hashes_all_files_in_folder(mocker, filesystem):
     # Arrange
     fs = filesystem[0]
     files = filesystem[1]
@@ -353,13 +353,13 @@ def test_get_folder_hash_hashes_all_files_in_folder(mocker, filesystem):
     spy = mocker.patch(patch_utils.format("get_file_hash"), side_effect=files)
 
     # Act
-    utils.get_folder_hash("test")
+    utils.get_folders_hash(["test"])
 
     # Assert
     spy.assert_has_calls(exp_calls)
 
 
-def test_get_folder_hash_sorts_individual_hashes(mocker, filesystem):
+def test_get_folders_hash_sorts_individual_hashes(mocker, filesystem):
     # Arrange
     fs = filesystem[0]
     files = filesystem[1]
@@ -368,13 +368,13 @@ def test_get_folder_hash_sorts_individual_hashes(mocker, filesystem):
     spy = mocker.patch("builtins.sorted", side_effect=sorted)
 
     # Act
-    utils.get_folder_hash("test")
+    utils.get_folders_hash(["test"])
 
     # Assert
     spy.assert_called_once_with(files)
 
 
-def test_get_folder_hash_returns_expected_hash(mocker, filesystem):
+def test_get_folders_hash_returns_expected_hash(mocker, filesystem):
     # Arrange
     fs = filesystem[0]
     files = filesystem[1]
@@ -382,7 +382,7 @@ def test_get_folder_hash_returns_expected_hash(mocker, filesystem):
     mocker.patch(patch_utils.format("get_file_hash"), side_effect=files)
 
     # Act
-    hash = utils.get_folder_hash("test")
+    hash = utils.get_folders_hash(["test"])
 
     # Assert
     assert hash == "b7e9365f1e796ba29e9e6b1b94b5f4cc7238530601fad8ec96ece9fee68c3d7f"
diff --git a/cli/medperf/utils.py b/cli/medperf/utils.py
index 5e80c26b9..d4c8d7ff3 100644
--- a/cli/medperf/utils.py
+++ b/cli/medperf/utils.py
@@ -382,9 +382,9 @@ def combine_proc_sp_text(proc: spawn) -> str:
     return proc_out
 
 
-def get_folder_hash(path: str) -> str:
-    """Generates a hash for all the contents of the folder. This procedure
-    hashes all of the files in the folder, sorts them and then hashes that list.
+def get_folders_hash(paths: List[str]) -> str:
+    """Generates a hash for all the contents of the fiven folders. This procedure
+    hashes all of the files in all passed folders, sorts them and then hashes that list.
 
     Args:
         path (str): Folder to hash
@@ -393,11 +393,14 @@ def get_folder_hash(path: str) -> str:
         str: sha256 hash of the whole folder
     """
     hashes = []
-    for root, _, files in os.walk(path, topdown=False):
-        for file in files:
-            logging.debug(f"Hashing file {file}")
-            filepath = os.path.join(root, file)
-            hashes.append(get_file_hash(filepath))
+
+    # The hash doesn't depend on the order of paths or folders, as the hashes get sorted after the fact
+    for path in paths:
+        for root, _, files in os.walk(path, topdown=False):
+            for file in files:
+                logging.debug(f"Hashing file {file}")
+                filepath = os.path.join(root, file)
+                hashes.append(get_file_hash(filepath))
 
     hashes = sorted(hashes)
     sha = hashlib.sha256()