diff --git a/cli/medperf/commands/compatibility_test/utils.py b/cli/medperf/commands/compatibility_test/utils.py index 4f8784c7d..a956a5796 100644 --- a/cli/medperf/commands/compatibility_test/utils.py +++ b/cli/medperf/commands/compatibility_test/utils.py @@ -1,4 +1,4 @@ -from medperf.utils import storage_path, get_folder_hash +from medperf.utils import storage_path, get_folders_hash from medperf.exceptions import InvalidArgumentError, InvalidEntityError from medperf.comms.entity_resources import resources @@ -34,7 +34,7 @@ def download_demo_data(dset_url, dset_hash): def prepare_local_cube(path): - temp_uid = get_folder_hash(path) + temp_uid = get_folders_hash([path]) cubes_storage = storage_path(config.cubes_storage) dst = os.path.join(cubes_storage, temp_uid) os.symlink(path, dst) diff --git a/cli/medperf/commands/dataset/create.py b/cli/medperf/commands/dataset/create.py index 7797e6d7e..10acc9c97 100644 --- a/cli/medperf/commands/dataset/create.py +++ b/cli/medperf/commands/dataset/create.py @@ -9,7 +9,7 @@ from medperf.utils import ( remove_path, generate_tmp_path, - get_folder_hash, + get_folders_hash, storage_path, ) from medperf.exceptions import InvalidArgumentError @@ -176,8 +176,8 @@ def get_statistics(self): def generate_uids(self): """Auto-generates dataset UIDs for both input and output paths""" - self.in_uid = get_folder_hash(self.data_path) - self.generated_uid = get_folder_hash(self.out_datapath) + self.in_uid = get_folders_hash([self.data_path, self.labels_path]) + self.generated_uid = get_folders_hash([self.out_datapath, self.out_labelspath]) def to_permanent_path(self) -> str: """Renames the temporary data folder to permanent one using the hash of diff --git a/cli/medperf/tests/commands/dataset/test_create.py b/cli/medperf/tests/commands/dataset/test_create.py index 5cd1f622d..49e82a355 100644 --- a/cli/medperf/tests/commands/dataset/test_create.py +++ b/cli/medperf/tests/commands/dataset/test_create.py @@ -195,15 +195,30 @@ def test_fails_if_invalid_params(self, mocker, benchmark_uid, cube_uid, comms, u else: preparation.validate() - @pytest.mark.parametrize("in_path", ["data_path", "input_path", "/usr/data/path"]) - @pytest.mark.parametrize("out_path", ["out_path", "~/.medperf/data/123"]) + @pytest.mark.parametrize( + "in_path", + [ + ["data", "labels"], + ["in_data", "in_labels"], + ["/usr/data/path", "usr/labels/path"], + ], + ) + @pytest.mark.parametrize( + "out_path", + [ + ["out_data", "out_labels"], + ["~/.medperf/data/123/data", "~/.medperf/data/123/labels"], + ], + ) def test_generate_uids_assigns_uids_to_obj_properties( self, mocker, in_path, out_path, preparation ): # Arrange - mocker.patch(PATCH_DATAPREP.format("get_folder_hash"), side_effect=lambda x: x) - preparation.data_path = in_path - preparation.out_datapath = out_path + mocker.patch(PATCH_DATAPREP.format("get_folders_hash"), side_effect=lambda x: x) + preparation.data_path = in_path[0] + preparation.labels_path = in_path[1] + preparation.out_datapath = out_path[0] + preparation.out_labelspath = out_path[1] # Act preparation.generate_uids() diff --git a/cli/medperf/tests/test_utils.py b/cli/medperf/tests/test_utils.py index 9b32e0b99..4130c70ce 100644 --- a/cli/medperf/tests/test_utils.py +++ b/cli/medperf/tests/test_utils.py @@ -344,7 +344,7 @@ def test_dict_pretty_print_passes_clean_dict_to_yaml(mocker, ui, dict_with_nones spy.assert_called_once_with(exp_dict) -def test_get_folder_hash_hashes_all_files_in_folder(mocker, filesystem): +def test_get_folders_hash_hashes_all_files_in_folder(mocker, filesystem): # Arrange fs = filesystem[0] files = filesystem[1] @@ -353,13 +353,13 @@ def test_get_folder_hash_hashes_all_files_in_folder(mocker, filesystem): spy = mocker.patch(patch_utils.format("get_file_hash"), side_effect=files) # Act - utils.get_folder_hash("test") + utils.get_folders_hash(["test"]) # Assert spy.assert_has_calls(exp_calls) -def test_get_folder_hash_sorts_individual_hashes(mocker, filesystem): +def test_get_folders_hash_sorts_individual_hashes(mocker, filesystem): # Arrange fs = filesystem[0] files = filesystem[1] @@ -368,13 +368,13 @@ def test_get_folder_hash_sorts_individual_hashes(mocker, filesystem): spy = mocker.patch("builtins.sorted", side_effect=sorted) # Act - utils.get_folder_hash("test") + utils.get_folders_hash(["test"]) # Assert spy.assert_called_once_with(files) -def test_get_folder_hash_returns_expected_hash(mocker, filesystem): +def test_get_folders_hash_returns_expected_hash(mocker, filesystem): # Arrange fs = filesystem[0] files = filesystem[1] @@ -382,7 +382,7 @@ def test_get_folder_hash_returns_expected_hash(mocker, filesystem): mocker.patch(patch_utils.format("get_file_hash"), side_effect=files) # Act - hash = utils.get_folder_hash("test") + hash = utils.get_folders_hash(["test"]) # Assert assert hash == "b7e9365f1e796ba29e9e6b1b94b5f4cc7238530601fad8ec96ece9fee68c3d7f" diff --git a/cli/medperf/utils.py b/cli/medperf/utils.py index 5e80c26b9..d4c8d7ff3 100644 --- a/cli/medperf/utils.py +++ b/cli/medperf/utils.py @@ -382,9 +382,9 @@ def combine_proc_sp_text(proc: spawn) -> str: return proc_out -def get_folder_hash(path: str) -> str: - """Generates a hash for all the contents of the folder. This procedure - hashes all of the files in the folder, sorts them and then hashes that list. +def get_folders_hash(paths: List[str]) -> str: + """Generates a hash for all the contents of the fiven folders. This procedure + hashes all of the files in all passed folders, sorts them and then hashes that list. Args: path (str): Folder to hash @@ -393,11 +393,14 @@ def get_folder_hash(path: str) -> str: str: sha256 hash of the whole folder """ hashes = [] - for root, _, files in os.walk(path, topdown=False): - for file in files: - logging.debug(f"Hashing file {file}") - filepath = os.path.join(root, file) - hashes.append(get_file_hash(filepath)) + + # The hash doesn't depend on the order of paths or folders, as the hashes get sorted after the fact + for path in paths: + for root, _, files in os.walk(path, topdown=False): + for file in files: + logging.debug(f"Hashing file {file}") + filepath = os.path.join(root, file) + hashes.append(get_file_hash(filepath)) hashes = sorted(hashes) sha = hashlib.sha256()