From 5a6a1e3f0a72f0ae0749d92c1563b7e2fbe1951f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?= Date: Fri, 1 Dec 2023 06:11:53 -0500 Subject: [PATCH] Fix dataset hashes not taking labels into account (#493) * Add Data Preparator cookiecutter template * Rename cookiecutter folder * Temporarily remove possibly offending files * Remove cookicutter conditionals * Inclube back missing pieces of template * remove cookiecutter typo * Use project_name attribute * Change cookiecutter fields order * Create empty directories on hook * Fix empty folders paths * Create evaluator mlcube cookiecutter template * Fix JSON Syntax Error * Update template default values * Remove reference to undefined template variable * Implement model mlcube cookiecutter template * Update cookiecutter variable default values * Create medperf CLI command for creating MLCubes * Provide additional options for mlcube create * Start working on tests * Add tests for cube create * Ignore invalid syntax on cookiecutter conditionals * Ignore more flake8 errors * Remove unused import * Empty commit for cloudbuild * Fix inconsistency with labels paths * Update mlcube.yaml so it can be commented on docs * Don't render noqa comments on template * Remove flake8 specific ignores * Exclude templates from lint checks * Remove specific flake8 ignores * Fix labels_paht being passed in he wrong situation * Add requirements to cookiecutters * Set separate labels as true by default * Remove duplicate templates * Abstract field-error dict formatting * Reformat errors dictionary for printing * Allow passing multiple folders for hashing * Generate dataset hashes with data and labels * Fix linter issue * Fix utils using outdated folder hash function * Fix tests * Add note about hash ordering --------- Co-authored-by: hasan7n <78664424+hasan7n@users.noreply.github.com> --- .../commands/compatibility_test/utils.py | 4 +-- cli/medperf/commands/dataset/create.py | 6 ++--- .../tests/commands/dataset/test_create.py | 25 +++++++++++++++---- cli/medperf/tests/test_utils.py | 12 ++++----- cli/medperf/utils.py | 19 ++++++++------ 5 files changed, 42 insertions(+), 24 deletions(-) diff --git a/cli/medperf/commands/compatibility_test/utils.py b/cli/medperf/commands/compatibility_test/utils.py index 4f8784c7d..a956a5796 100644 --- a/cli/medperf/commands/compatibility_test/utils.py +++ b/cli/medperf/commands/compatibility_test/utils.py @@ -1,4 +1,4 @@ -from medperf.utils import storage_path, get_folder_hash +from medperf.utils import storage_path, get_folders_hash from medperf.exceptions import InvalidArgumentError, InvalidEntityError from medperf.comms.entity_resources import resources @@ -34,7 +34,7 @@ def download_demo_data(dset_url, dset_hash): def prepare_local_cube(path): - temp_uid = get_folder_hash(path) + temp_uid = get_folders_hash([path]) cubes_storage = storage_path(config.cubes_storage) dst = os.path.join(cubes_storage, temp_uid) os.symlink(path, dst) diff --git a/cli/medperf/commands/dataset/create.py b/cli/medperf/commands/dataset/create.py index 7797e6d7e..10acc9c97 100644 --- a/cli/medperf/commands/dataset/create.py +++ b/cli/medperf/commands/dataset/create.py @@ -9,7 +9,7 @@ from medperf.utils import ( remove_path, generate_tmp_path, - get_folder_hash, + get_folders_hash, storage_path, ) from medperf.exceptions import InvalidArgumentError @@ -176,8 +176,8 @@ def get_statistics(self): def generate_uids(self): """Auto-generates dataset UIDs for both input and output paths""" - self.in_uid = get_folder_hash(self.data_path) - self.generated_uid = get_folder_hash(self.out_datapath) + self.in_uid = get_folders_hash([self.data_path, self.labels_path]) + self.generated_uid = get_folders_hash([self.out_datapath, self.out_labelspath]) def to_permanent_path(self) -> str: """Renames the temporary data folder to permanent one using the hash of diff --git a/cli/medperf/tests/commands/dataset/test_create.py b/cli/medperf/tests/commands/dataset/test_create.py index 5cd1f622d..49e82a355 100644 --- a/cli/medperf/tests/commands/dataset/test_create.py +++ b/cli/medperf/tests/commands/dataset/test_create.py @@ -195,15 +195,30 @@ def test_fails_if_invalid_params(self, mocker, benchmark_uid, cube_uid, comms, u else: preparation.validate() - @pytest.mark.parametrize("in_path", ["data_path", "input_path", "/usr/data/path"]) - @pytest.mark.parametrize("out_path", ["out_path", "~/.medperf/data/123"]) + @pytest.mark.parametrize( + "in_path", + [ + ["data", "labels"], + ["in_data", "in_labels"], + ["/usr/data/path", "usr/labels/path"], + ], + ) + @pytest.mark.parametrize( + "out_path", + [ + ["out_data", "out_labels"], + ["~/.medperf/data/123/data", "~/.medperf/data/123/labels"], + ], + ) def test_generate_uids_assigns_uids_to_obj_properties( self, mocker, in_path, out_path, preparation ): # Arrange - mocker.patch(PATCH_DATAPREP.format("get_folder_hash"), side_effect=lambda x: x) - preparation.data_path = in_path - preparation.out_datapath = out_path + mocker.patch(PATCH_DATAPREP.format("get_folders_hash"), side_effect=lambda x: x) + preparation.data_path = in_path[0] + preparation.labels_path = in_path[1] + preparation.out_datapath = out_path[0] + preparation.out_labelspath = out_path[1] # Act preparation.generate_uids() diff --git a/cli/medperf/tests/test_utils.py b/cli/medperf/tests/test_utils.py index 9b32e0b99..4130c70ce 100644 --- a/cli/medperf/tests/test_utils.py +++ b/cli/medperf/tests/test_utils.py @@ -344,7 +344,7 @@ def test_dict_pretty_print_passes_clean_dict_to_yaml(mocker, ui, dict_with_nones spy.assert_called_once_with(exp_dict) -def test_get_folder_hash_hashes_all_files_in_folder(mocker, filesystem): +def test_get_folders_hash_hashes_all_files_in_folder(mocker, filesystem): # Arrange fs = filesystem[0] files = filesystem[1] @@ -353,13 +353,13 @@ def test_get_folder_hash_hashes_all_files_in_folder(mocker, filesystem): spy = mocker.patch(patch_utils.format("get_file_hash"), side_effect=files) # Act - utils.get_folder_hash("test") + utils.get_folders_hash(["test"]) # Assert spy.assert_has_calls(exp_calls) -def test_get_folder_hash_sorts_individual_hashes(mocker, filesystem): +def test_get_folders_hash_sorts_individual_hashes(mocker, filesystem): # Arrange fs = filesystem[0] files = filesystem[1] @@ -368,13 +368,13 @@ def test_get_folder_hash_sorts_individual_hashes(mocker, filesystem): spy = mocker.patch("builtins.sorted", side_effect=sorted) # Act - utils.get_folder_hash("test") + utils.get_folders_hash(["test"]) # Assert spy.assert_called_once_with(files) -def test_get_folder_hash_returns_expected_hash(mocker, filesystem): +def test_get_folders_hash_returns_expected_hash(mocker, filesystem): # Arrange fs = filesystem[0] files = filesystem[1] @@ -382,7 +382,7 @@ def test_get_folder_hash_returns_expected_hash(mocker, filesystem): mocker.patch(patch_utils.format("get_file_hash"), side_effect=files) # Act - hash = utils.get_folder_hash("test") + hash = utils.get_folders_hash(["test"]) # Assert assert hash == "b7e9365f1e796ba29e9e6b1b94b5f4cc7238530601fad8ec96ece9fee68c3d7f" diff --git a/cli/medperf/utils.py b/cli/medperf/utils.py index 5e80c26b9..d4c8d7ff3 100644 --- a/cli/medperf/utils.py +++ b/cli/medperf/utils.py @@ -382,9 +382,9 @@ def combine_proc_sp_text(proc: spawn) -> str: return proc_out -def get_folder_hash(path: str) -> str: - """Generates a hash for all the contents of the folder. This procedure - hashes all of the files in the folder, sorts them and then hashes that list. +def get_folders_hash(paths: List[str]) -> str: + """Generates a hash for all the contents of the fiven folders. This procedure + hashes all of the files in all passed folders, sorts them and then hashes that list. Args: path (str): Folder to hash @@ -393,11 +393,14 @@ def get_folder_hash(path: str) -> str: str: sha256 hash of the whole folder """ hashes = [] - for root, _, files in os.walk(path, topdown=False): - for file in files: - logging.debug(f"Hashing file {file}") - filepath = os.path.join(root, file) - hashes.append(get_file_hash(filepath)) + + # The hash doesn't depend on the order of paths or folders, as the hashes get sorted after the fact + for path in paths: + for root, _, files in os.walk(path, topdown=False): + for file in files: + logging.debug(f"Hashing file {file}") + filepath = os.path.join(root, file) + hashes.append(get_file_hash(filepath)) hashes = sorted(hashes) sha = hashlib.sha256()