Skip to content

Commit

Permalink
Fix dataset hashes not taking labels into account (#493)
Browse files Browse the repository at this point in the history
* Add Data Preparator cookiecutter template

* Rename cookiecutter folder

* Temporarily remove possibly offending files

* Remove cookicutter conditionals

* Inclube back missing pieces of template

* remove cookiecutter typo

* Use project_name attribute

* Change cookiecutter fields order

* Create empty directories on hook

* Fix empty folders paths

* Create evaluator mlcube cookiecutter template

* Fix JSON Syntax Error

* Update template default values

* Remove reference to undefined template variable

* Implement model mlcube cookiecutter template

* Update cookiecutter variable default values

* Create medperf CLI command for creating MLCubes

* Provide additional options for mlcube create

* Start working on tests

* Add tests for cube create

* Ignore invalid syntax on cookiecutter conditionals

* Ignore more flake8 errors

* Remove unused import

* Empty commit for cloudbuild

* Fix inconsistency with labels paths

* Update mlcube.yaml so it can be commented on docs

* Don't render noqa comments on template

* Remove flake8 specific ignores

* Exclude templates from lint checks

* Remove specific flake8 ignores

* Fix labels_paht being passed in he wrong situation

* Add requirements to cookiecutters

* Set separate labels as true by default

* Remove duplicate templates

* Abstract field-error dict formatting

* Reformat errors dictionary for printing

* Allow passing multiple folders for hashing

* Generate dataset hashes with data and labels

* Fix linter issue

* Fix utils using outdated folder hash function

* Fix tests

* Add note about hash ordering

---------

Co-authored-by: hasan7n <[email protected]>
  • Loading branch information
aristizabal95 and hasan7n authored Dec 1, 2023
1 parent 8c26674 commit 5a6a1e3
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 24 deletions.
4 changes: 2 additions & 2 deletions cli/medperf/commands/compatibility_test/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from medperf.utils import storage_path, get_folder_hash
from medperf.utils import storage_path, get_folders_hash
from medperf.exceptions import InvalidArgumentError, InvalidEntityError

from medperf.comms.entity_resources import resources
Expand Down Expand Up @@ -34,7 +34,7 @@ def download_demo_data(dset_url, dset_hash):


def prepare_local_cube(path):
temp_uid = get_folder_hash(path)
temp_uid = get_folders_hash([path])
cubes_storage = storage_path(config.cubes_storage)
dst = os.path.join(cubes_storage, temp_uid)
os.symlink(path, dst)
Expand Down
6 changes: 3 additions & 3 deletions cli/medperf/commands/dataset/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from medperf.utils import (
remove_path,
generate_tmp_path,
get_folder_hash,
get_folders_hash,
storage_path,
)
from medperf.exceptions import InvalidArgumentError
Expand Down Expand Up @@ -176,8 +176,8 @@ def get_statistics(self):

def generate_uids(self):
"""Auto-generates dataset UIDs for both input and output paths"""
self.in_uid = get_folder_hash(self.data_path)
self.generated_uid = get_folder_hash(self.out_datapath)
self.in_uid = get_folders_hash([self.data_path, self.labels_path])
self.generated_uid = get_folders_hash([self.out_datapath, self.out_labelspath])

def to_permanent_path(self) -> str:
"""Renames the temporary data folder to permanent one using the hash of
Expand Down
25 changes: 20 additions & 5 deletions cli/medperf/tests/commands/dataset/test_create.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,15 +195,30 @@ def test_fails_if_invalid_params(self, mocker, benchmark_uid, cube_uid, comms, u
else:
preparation.validate()

@pytest.mark.parametrize("in_path", ["data_path", "input_path", "/usr/data/path"])
@pytest.mark.parametrize("out_path", ["out_path", "~/.medperf/data/123"])
@pytest.mark.parametrize(
"in_path",
[
["data", "labels"],
["in_data", "in_labels"],
["/usr/data/path", "usr/labels/path"],
],
)
@pytest.mark.parametrize(
"out_path",
[
["out_data", "out_labels"],
["~/.medperf/data/123/data", "~/.medperf/data/123/labels"],
],
)
def test_generate_uids_assigns_uids_to_obj_properties(
self, mocker, in_path, out_path, preparation
):
# Arrange
mocker.patch(PATCH_DATAPREP.format("get_folder_hash"), side_effect=lambda x: x)
preparation.data_path = in_path
preparation.out_datapath = out_path
mocker.patch(PATCH_DATAPREP.format("get_folders_hash"), side_effect=lambda x: x)
preparation.data_path = in_path[0]
preparation.labels_path = in_path[1]
preparation.out_datapath = out_path[0]
preparation.out_labelspath = out_path[1]

# Act
preparation.generate_uids()
Expand Down
12 changes: 6 additions & 6 deletions cli/medperf/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ def test_dict_pretty_print_passes_clean_dict_to_yaml(mocker, ui, dict_with_nones
spy.assert_called_once_with(exp_dict)


def test_get_folder_hash_hashes_all_files_in_folder(mocker, filesystem):
def test_get_folders_hash_hashes_all_files_in_folder(mocker, filesystem):
# Arrange
fs = filesystem[0]
files = filesystem[1]
Expand All @@ -353,13 +353,13 @@ def test_get_folder_hash_hashes_all_files_in_folder(mocker, filesystem):
spy = mocker.patch(patch_utils.format("get_file_hash"), side_effect=files)

# Act
utils.get_folder_hash("test")
utils.get_folders_hash(["test"])

# Assert
spy.assert_has_calls(exp_calls)


def test_get_folder_hash_sorts_individual_hashes(mocker, filesystem):
def test_get_folders_hash_sorts_individual_hashes(mocker, filesystem):
# Arrange
fs = filesystem[0]
files = filesystem[1]
Expand All @@ -368,21 +368,21 @@ def test_get_folder_hash_sorts_individual_hashes(mocker, filesystem):
spy = mocker.patch("builtins.sorted", side_effect=sorted)

# Act
utils.get_folder_hash("test")
utils.get_folders_hash(["test"])

# Assert
spy.assert_called_once_with(files)


def test_get_folder_hash_returns_expected_hash(mocker, filesystem):
def test_get_folders_hash_returns_expected_hash(mocker, filesystem):
# Arrange
fs = filesystem[0]
files = filesystem[1]
mocker.patch("os.walk", return_value=fs)
mocker.patch(patch_utils.format("get_file_hash"), side_effect=files)

# Act
hash = utils.get_folder_hash("test")
hash = utils.get_folders_hash(["test"])

# Assert
assert hash == "b7e9365f1e796ba29e9e6b1b94b5f4cc7238530601fad8ec96ece9fee68c3d7f"
Expand Down
19 changes: 11 additions & 8 deletions cli/medperf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,9 +382,9 @@ def combine_proc_sp_text(proc: spawn) -> str:
return proc_out


def get_folder_hash(path: str) -> str:
"""Generates a hash for all the contents of the folder. This procedure
hashes all of the files in the folder, sorts them and then hashes that list.
def get_folders_hash(paths: List[str]) -> str:
"""Generates a hash for all the contents of the fiven folders. This procedure
hashes all of the files in all passed folders, sorts them and then hashes that list.
Args:
path (str): Folder to hash
Expand All @@ -393,11 +393,14 @@ def get_folder_hash(path: str) -> str:
str: sha256 hash of the whole folder
"""
hashes = []
for root, _, files in os.walk(path, topdown=False):
for file in files:
logging.debug(f"Hashing file {file}")
filepath = os.path.join(root, file)
hashes.append(get_file_hash(filepath))

# The hash doesn't depend on the order of paths or folders, as the hashes get sorted after the fact
for path in paths:
for root, _, files in os.walk(path, topdown=False):
for file in files:
logging.debug(f"Hashing file {file}")
filepath = os.path.join(root, file)
hashes.append(get_file_hash(filepath))

hashes = sorted(hashes)
sha = hashlib.sha256()
Expand Down

0 comments on commit 5a6a1e3

Please sign in to comment.