Merge pull request #19110 from nsoranzo/always_validate_hashes
Always validate hashes when provided
nsoranzo authored Nov 7, 2024
2 parents 8a0fbf6 + f7c5769 commit eb7d89d
Showing 16 changed files with 20 additions and 85 deletions.
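
In short, the opt-in validate_hashes flag is removed from the materialization and fetch code paths: whenever hashes are recorded for a dataset or its source, they are now verified unconditionally. Below is a minimal, hypothetical Python sketch of that behavioral change (simplified names only, not the actual Galaxy code; the real change touches galaxy.model.deferred, galaxy.tools.data_fetch, and the schema models shown in the diff):

import hashlib
from typing import Dict, List


def _verify_one(path: str, hash_function: str, expected_value: str) -> None:
    # Hash function names follow Galaxy's conventions ("MD5", "SHA-1", ...).
    digest = hashlib.new(hash_function.lower().replace("-", ""))
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(65536), b""):
            digest.update(chunk)
    if digest.hexdigest() != expected_value:
        raise ValueError(f"{hash_function} mismatch for {path}")


def materialize_file(path: str, recorded_hashes: List[Dict[str, str]]) -> None:
    # Before this commit (simplified): the loop below only ran when a caller
    # also passed validate_hashes=True.
    # After this commit: any recorded hash is always checked.
    for recorded in recorded_hashes:
        _verify_one(path, recorded["hash_function"], recorded["hash_value"])
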
21 changes: 1 addition & 20 deletions client/src/api/schema/schema.ts
@@ -14105,21 +14105,6 @@ export interface components {
* @description The source of the content. Can be other history element to be copied or library elements.
*/
source: components["schemas"]["DatasetSourceType"];
/**
* Validate hashes
* @description Set to true to enable dataset validation during materialization.
* @default false
*/
validate_hashes: boolean;
};
/** MaterializeDatasetOptions */
MaterializeDatasetOptions: {
/**
* Validate hashes
* @description Set to true to enable dataset validation during materialization.
* @default false
*/
validate_hashes: boolean;
};
/** MessageExceptionModel */
MessageExceptionModel: {
@@ -24485,11 +24470,7 @@ export interface operations {
};
cookie?: never;
};
requestBody?: {
content: {
"application/json": components["schemas"]["MaterializeDatasetOptions"] | null;
};
};
requestBody?: never;
responses: {
/** @description Successful Response */
200: {
4 changes: 1 addition & 3 deletions lib/galaxy/managers/hdas.py
@@ -176,9 +176,7 @@ def materialize(self, request: MaterializeDatasetInstanceTaskRequest, in_place:
else:
dataset_instance = self.ldda_manager.get_accessible(request.content, user)
history = self.app.history_manager.by_id(request.history_id)
new_hda = materializer.ensure_materialized(
dataset_instance, target_history=history, validate_hashes=request.validate_hashes, in_place=in_place
)
new_hda = materializer.ensure_materialized(dataset_instance, target_history=history, in_place=in_place)
if not in_place:
history.add_dataset(new_hda, set_hid=True)
else:
17 changes: 5 additions & 12 deletions lib/galaxy/model/deferred.py
@@ -99,7 +99,6 @@ def ensure_materialized(
self,
dataset_instance: Union[HistoryDatasetAssociation, LibraryDatasetDatasetAssociation],
target_history: Optional[History] = None,
validate_hashes: bool = False,
in_place: bool = False,
) -> HistoryDatasetAssociation:
"""Create a new detached dataset instance from the supplied instance.
@@ -143,9 +142,7 @@ def ensure_materialized(
sa_session.commit()
object_store_populator.set_dataset_object_store_id(materialized_dataset)
try:
path = self._stream_source(
target_source, dataset_instance.datatype, validate_hashes, materialized_dataset_hashes
)
path = self._stream_source(target_source, dataset_instance.datatype, materialized_dataset_hashes)
object_store.update_from_file(materialized_dataset, file_name=path)
materialized_dataset.set_size()
except Exception as e:
@@ -157,9 +154,7 @@
# TODO: optimize this by streaming right to this path...
# TODO: take into acount transform and ensure we are and are not modifying the file as appropriate.
try:
path = self._stream_source(
target_source, dataset_instance.datatype, validate_hashes, materialized_dataset_hashes
)
path = self._stream_source(target_source, dataset_instance.datatype, materialized_dataset_hashes)
shutil.move(path, transient_paths.external_filename)
materialized_dataset.external_filename = transient_paths.external_filename
except Exception as e:
@@ -211,14 +206,12 @@ def ensure_materialized(
materialized_dataset_instance.metadata_deferred = False
return materialized_dataset_instance

def _stream_source(
self, target_source: DatasetSource, datatype, validate_hashes: bool, dataset_hashes: List[DatasetHash]
) -> str:
def _stream_source(self, target_source: DatasetSource, datatype, dataset_hashes: List[DatasetHash]) -> str:
source_uri = target_source.source_uri
if source_uri is None:
raise Exception("Cannot stream from dataset source without specified source_uri")
path = stream_url_to_file(source_uri, file_sources=self._file_sources)
if validate_hashes and target_source.hashes:
if target_source.hashes:
for source_hash in target_source.hashes:
_validate_hash(path, source_hash, "downloaded file")

@@ -244,7 +237,7 @@ def _stream_source(
if datatype_groom:
datatype.groom_dataset_content(path)

if validate_hashes and dataset_hashes:
if dataset_hashes:
for dataset_hash in dataset_hashes:
_validate_hash(path, dataset_hash, "dataset contents")

3 changes: 2 additions & 1 deletion lib/galaxy/model/unittest_utils/store_fixtures.py
@@ -12,6 +12,7 @@
TEST_SOURCE_URI_BAM = "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/1.bam"
TEST_HASH_FUNCTION = "MD5"
TEST_HASH_VALUE = "f568c29421792b1b1df4474dafae01f1"
TEST_HASH_VALUE_BAM = "34693a376f9878cbe53166db083e0e26"
TEST_HISTORY_NAME = "My history in a model store"
TEST_EXTENSION = "bed"
TEST_LIBRARY_NAME = "My cool library"
@@ -339,7 +340,7 @@ def deferred_hda_model_store_dict_bam(
dataset_hash = dict(
model_class="DatasetHash",
hash_function=TEST_HASH_FUNCTION,
hash_value=TEST_HASH_VALUE,
hash_value=TEST_HASH_VALUE_BAM,
extra_files_path=None,
)
dataset_source: Dict[str, Any] = dict(
10 changes: 1 addition & 9 deletions lib/galaxy/schema/schema.py
@@ -3688,15 +3688,7 @@ class PageSummaryBase(Model):
)


class MaterializeDatasetOptions(Model):
validate_hashes: bool = Field(
False,
title="Validate hashes",
description="Set to true to enable dataset validation during materialization.",
)


class MaterializeDatasetInstanceAPIRequest(MaterializeDatasetOptions):
class MaterializeDatasetInstanceAPIRequest(Model):
source: DatasetSourceType = Field(
title="Source",
description="The source of the content. Can be other history element to be copied or library elements.",
5 changes: 0 additions & 5 deletions lib/galaxy/schema/tasks.py
@@ -108,11 +108,6 @@ class MaterializeDatasetInstanceTaskRequest(Model):
"- The decoded id of the HDA\n"
),
)
validate_hashes: bool = Field(
False,
title="Validate hashes",
description="Set to true to enable dataset validation during materialization.",
)


class ComputeDatasetHashTaskRequest(Model):
12 changes: 4 additions & 8 deletions lib/galaxy/tools/data_fetch.py
@@ -260,7 +260,7 @@ def _resolve_item_with_primary(item):
hash_function = hash_dict.get("hash_function")
hash_value = hash_dict.get("hash_value")
try:
_handle_hash_validation(upload_config, hash_function, hash_value, path)
_handle_hash_validation(hash_function, hash_value, path)
except Exception as e:
error_message = str(e)
item["error_message"] = error_message
@@ -501,7 +501,7 @@ def _has_src_to_path(upload_config: "UploadConfig", item: Dict[str, Any], is_dat
for hash_function in HASH_NAMES:
hash_value = item.get(hash_function)
if hash_value:
_handle_hash_validation(upload_config, hash_function, hash_value, path)
_handle_hash_validation(hash_function, hash_value, path)
if name is None:
name = url.split("/")[-1]
elif src == "pasted":
@@ -516,11 +516,8 @@ def _has_src_to_path(upload_config: "UploadConfig", item: Dict[str, Any], is_dat
return name, path


def _handle_hash_validation(
upload_config: "UploadConfig", hash_function: HashFunctionNameEnum, hash_value: str, path: str
):
if upload_config.validate_hashes:
verify_hash(path, hash_func_name=hash_function, hash_value=hash_value, what="upload")
def _handle_hash_validation(hash_function: HashFunctionNameEnum, hash_value: str, path: str):
verify_hash(path, hash_func_name=hash_function, hash_value=hash_value, what="upload")


def _arg_parser():
@@ -566,7 +563,6 @@ def __init__(
self.to_posix_lines = request.get("to_posix_lines", False)
self.space_to_tab = request.get("space_to_tab", False)
self.auto_decompress = request.get("auto_decompress", False)
self.validate_hashes = request.get("validate_hashes", False)
self.deferred = request.get("deferred", False)
self.link_data_only = _link_data_only(request)
self.file_sources_dict = file_sources_dict
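
One practical consequence for callers of the fetch API, reflected in the test changes further down, is that a top-level validate_hashes key is no longer consulted: supplying a hash on a target item is enough to have it verified. A hypothetical payload sketch (history id, URL, and hash value are placeholders):

payload = {
    "history_id": "<history id>",
    "targets": [
        {
            "destination": {"type": "hdas"},
            "items": [
                {
                    "src": "url",
                    "url": "https://example.org/1.bed",  # placeholder URL
                    "MD5": "0123456789abcdef0123456789abcdef",  # placeholder hash, now always verified
                }
            ],
        }
    ],
    # "validate_hashes": True,  # previously required to opt in; no longer read by the fetch tool
}
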
6 changes: 0 additions & 6 deletions lib/galaxy/webapps/galaxy/api/history_contents.py
@@ -54,7 +54,6 @@
HistoryContentType,
MaterializeDatasetInstanceAPIRequest,
MaterializeDatasetInstanceRequest,
MaterializeDatasetOptions,
StoreExportPayload,
UpdateHistoryContentsBatchPayload,
UpdateHistoryContentsPayload,
@@ -1073,17 +1072,12 @@ def materialize_dataset(
history_id: HistoryIDPathParam,
id: HistoryItemIDPathParam,
trans: ProvidesHistoryContext = DependsOnTrans,
materialize_api_payload: Optional[MaterializeDatasetOptions] = Body(None),
) -> AsyncTaskResultSummary:
validate_hashes: bool = (
materialize_api_payload.validate_hashes if materialize_api_payload is not None else False
)
# values are already validated, use model_construct
materialize_request = MaterializeDatasetInstanceRequest.model_construct(
history_id=history_id,
source=DatasetSourceType.hda,
content=id,
validate_hashes=validate_hashes,
)
rval = self.service.materialize(trans, materialize_request)
return rval
1 change: 0 additions & 1 deletion lib/galaxy/webapps/galaxy/services/history_contents.py
@@ -573,7 +573,6 @@ def materialize(
history_id=request.history_id,
source=request.source,
content=request.content,
validate_hashes=request.validate_hashes,
user=trans.async_request_user,
)
results = materialize_task.delay(request=task_request)
1 change: 0 additions & 1 deletion lib/galaxy/workflow/scheduling_manager.py
@@ -349,7 +349,6 @@ def __attempt_materialize(self, workflow_invocation, session) -> bool:
history_id=workflow_invocation.history.id,
source="hda",
content=hda.id,
validate_hashes=True,
)
materialized_okay = self.app.hda_manager.materialize(task_request, in_place=True)
if not materialized_okay:
1 change: 0 additions & 1 deletion lib/galaxy_test/api/test_libraries.py
@@ -340,7 +340,6 @@ def test_fetch_failed_validation(self):
payload = {
"history_id": history_id, # TODO: Shouldn't be needed :(
"targets": targets,
"validate_hashes": True,
}
tool_response = self.dataset_populator.fetch(payload, assert_ok=False)
job = self.dataset_populator.check_run(tool_response)
2 changes: 0 additions & 2 deletions lib/galaxy_test/api/test_tools_upload.py
@@ -1028,7 +1028,6 @@ def test_upload_and_validate_hash_valid(self):
payload = {
"history_id": history_id,
"targets": targets,
"validate_hashes": True,
}
fetch_response = self.dataset_populator.fetch(payload)
self._assert_status_code_is(fetch_response, 200)
@@ -1053,7 +1052,6 @@ def test_upload_and_validate_hash_invalid(self):
payload = {
"history_id": history_id,
"targets": targets,
"validate_hashes": True,
}
fetch_response = self.dataset_populator.fetch(payload, assert_ok=True, wait=False)
self._assert_status_code_is(fetch_response, 200)
7 changes: 1 addition & 6 deletions lib/galaxy_test/base/populators.py
@@ -1023,9 +1023,7 @@ def tools_post(self, payload: dict, url="tools") -> Response:
def describe_tool_execution(self, tool_id: str) -> "DescribeToolExecution":
return DescribeToolExecution(self, tool_id)

def materialize_dataset_instance(
self, history_id: str, id: str, source: str = "hda", validate_hashes: bool = False
):
def materialize_dataset_instance(self, history_id: str, id: str, source: str = "hda"):
payload: Dict[str, Any]
if source == "ldda":
url = f"histories/{history_id}/materialize"
@@ -1036,8 +1034,6 @@ def materialize_dataset_instance(self, history_id: str, id: str, source: str = "hda"):
else:
url = f"histories/{history_id}/contents/datasets/{id}/materialize"
payload = {}
if validate_hashes:
payload["validate_hashes"] = True
create_response = self._post(url, payload, json=True)
api_asserts.assert_status_code_is_ok(create_response)
create_response_json = create_response.json()
@@ -2780,7 +2776,6 @@ def fetch_single_url_to_folder(self, file_type="auto", assert_ok=True):
payload = {
"history_id": history_id, # TODO: Shouldn't be needed :(
"targets": targets,
"validate_hashes": True,
}
return library, self.dataset_populator.fetch(payload, assert_ok=assert_ok)

@@ -89,7 +89,7 @@ def test_materialize_hash_failure(self, history_id: str):
assert deferred_hda["state"] == "deferred"
assert not deferred_hda["deleted"]

self.dataset_populator.materialize_dataset_instance(history_id, deferred_hda["id"], validate_hashes=True)
self.dataset_populator.materialize_dataset_instance(history_id, deferred_hda["id"])
self.dataset_populator.wait_on_history_length(history_id, 2)
new_hda_details = self.dataset_populator.get_history_dataset_details(
history_id, hid=2, assert_ok=False, wait=False
5 changes: 0 additions & 5 deletions test/unit/app/tools/test_data_fetch.py
@@ -52,7 +52,6 @@ def test_simple_path_get(hash_value: str, error_message: Optional[str]):
],
}
],
"validate_hashes": True,
}
execute_context.execute_request(request)
output = _unnamed_output(execute_context)
@@ -111,7 +110,6 @@ def test_correct_md5():
],
}
],
"validate_hashes": True,
}
execute_context.execute_request(request)
output = _unnamed_output(execute_context)
@@ -142,7 +140,6 @@ def test_incorrect_md5():
],
}
],
"validate_hashes": True,
}
execute_context.execute_request(request)
output = _unnamed_output(execute_context)
@@ -175,7 +172,6 @@ def test_correct_sha1():
],
}
],
"validate_hashes": True,
}
execute_context.execute_request(request)
output = _unnamed_output(execute_context)
@@ -206,7 +202,6 @@ def test_incorrect_sha1():
],
}
],
"validate_hashes": True,
}
execute_context.execute_request(request)
output = _unnamed_output(execute_context)
8 changes: 4 additions & 4 deletions test/unit/data/test_dataset_materialization.py
@@ -71,7 +71,7 @@ def test_hash_validate():
_assert_2_bed_metadata(deferred_hda)
assert deferred_hda.dataset.state == "deferred"
materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
materialized_hda = materializer.ensure_materialized(deferred_hda)
materialized_dataset = materialized_hda.dataset
assert materialized_dataset.state == "ok"

@@ -86,7 +86,7 @@ def test_hash_invalid():
_assert_2_bed_metadata(deferred_hda)
assert deferred_hda.dataset.state == "deferred"
materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
materialized_hda = materializer.ensure_materialized(deferred_hda)
materialized_dataset = materialized_hda.dataset
assert materialized_dataset.state == "error"

@@ -103,7 +103,7 @@ def test_hash_validate_source_of_download():
_assert_2_bed_metadata(deferred_hda)
assert deferred_hda.dataset.state == "deferred"
materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
materialized_hda = materializer.ensure_materialized(deferred_hda)
materialized_dataset = materialized_hda.dataset
assert materialized_dataset.state == "ok", materialized_hda.info

@@ -120,7 +120,7 @@ def test_hash_invalid_source_of_download():
_assert_2_bed_metadata(deferred_hda)
assert deferred_hda.dataset.state == "deferred"
materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
materialized_hda = materializer.ensure_materialized(deferred_hda)
materialized_dataset = materialized_hda.dataset
assert materialized_dataset.state == "error", materialized_hda.info

