From f7c5769ca56d781a04fa3b18a271961538bd3ccc Mon Sep 17 00:00:00 2001
From: Nicola Soranzo
Date: Tue, 5 Nov 2024 23:16:09 +0000
Subject: [PATCH] Always validate hashes when provided

Currently, when uploading datasets with associated hashes, these were only
validated if `validate_hashes=True` was also passed in the request. This
commit drops `validate_hashes` completely and always validates hashes when
they are provided.
---
 client/src/api/schema/schema.ts | 21 +------------------
 lib/galaxy/managers/hdas.py | 4 +---
 lib/galaxy/model/deferred.py | 17 +++++----------
 .../model/unittest_utils/store_fixtures.py | 3 ++-
 lib/galaxy/schema/schema.py | 10 +--------
 lib/galaxy/schema/tasks.py | 5 -----
 lib/galaxy/tools/data_fetch.py | 12 ++++-------
 .../webapps/galaxy/api/history_contents.py | 6 ------
 .../galaxy/services/history_contents.py | 1 -
 lib/galaxy/workflow/scheduling_manager.py | 1 -
 lib/galaxy_test/api/test_libraries.py | 1 -
 lib/galaxy_test/api/test_tools_upload.py | 2 --
 lib/galaxy_test/base/populators.py | 7 +------
 ...test_materialize_dataset_instance_tasks.py | 2 +-
 test/unit/app/tools/test_data_fetch.py | 5 -----
 .../unit/data/test_dataset_materialization.py | 8 +++----
 16 files changed, 20 insertions(+), 85 deletions(-)

diff --git a/client/src/api/schema/schema.ts b/client/src/api/schema/schema.ts
index 93d99b75fa0a..53670a59cf17 100644
--- a/client/src/api/schema/schema.ts
+++ b/client/src/api/schema/schema.ts
@@ -14083,21 +14083,6 @@ export interface components {
              * @description The source of the content. Can be other history element to be copied or library elements.
              */
             source: components["schemas"]["DatasetSourceType"];
-            /**
-             * Validate hashes
-             * @description Set to true to enable dataset validation during materialization.
-             * @default false
-             */
-            validate_hashes: boolean;
-        };
-        /** MaterializeDatasetOptions */
-        MaterializeDatasetOptions: {
-            /**
-             * Validate hashes
-             * @description Set to true to enable dataset validation during materialization.
-             * @default false
-             */
-            validate_hashes: boolean;
         };
         /** MessageExceptionModel */
         MessageExceptionModel: {
@@ -24415,11 +24400,7 @@ export interface operations {
             };
             cookie?: never;
         };
-        requestBody?: {
-            content: {
-                "application/json": components["schemas"]["MaterializeDatasetOptions"] | null;
-            };
-        };
+        requestBody?: never;
         responses: {
             /** @description Successful Response */
             200: {
diff --git a/lib/galaxy/managers/hdas.py b/lib/galaxy/managers/hdas.py
index 63fa618545a8..5e2c08347f31 100644
--- a/lib/galaxy/managers/hdas.py
+++ b/lib/galaxy/managers/hdas.py
@@ -176,9 +176,7 @@ def materialize(self, request: MaterializeDatasetInstanceTaskRequest, in_place:
         else:
             dataset_instance = self.ldda_manager.get_accessible(request.content, user)
         history = self.app.history_manager.by_id(request.history_id)
-        new_hda = materializer.ensure_materialized(
-            dataset_instance, target_history=history, validate_hashes=request.validate_hashes, in_place=in_place
-        )
+        new_hda = materializer.ensure_materialized(dataset_instance, target_history=history, in_place=in_place)
         if not in_place:
             history.add_dataset(new_hda, set_hid=True)
         else:
diff --git a/lib/galaxy/model/deferred.py b/lib/galaxy/model/deferred.py
index f9b5f1414d3c..06eb995adc93 100644
--- a/lib/galaxy/model/deferred.py
+++ b/lib/galaxy/model/deferred.py
@@ -99,7 +99,6 @@ def ensure_materialized(
         self,
         dataset_instance: Union[HistoryDatasetAssociation, LibraryDatasetDatasetAssociation],
         target_history: Optional[History] = None,
-        validate_hashes: bool = False,
         in_place: bool = False,
     ) -> HistoryDatasetAssociation:
         """Create a new detached dataset instance from the supplied instance.
@@ -143,9 +142,7 @@ def ensure_materialized(
                 sa_session.commit()
             object_store_populator.set_dataset_object_store_id(materialized_dataset)
             try:
-                path = self._stream_source(
-                    target_source, dataset_instance.datatype, validate_hashes, materialized_dataset_hashes
-                )
+                path = self._stream_source(target_source, dataset_instance.datatype, materialized_dataset_hashes)
                 object_store.update_from_file(materialized_dataset, file_name=path)
                 materialized_dataset.set_size()
             except Exception as e:
@@ -157,9 +154,7 @@ def ensure_materialized(
             # TODO: optimize this by streaming right to this path...
             # TODO: take into acount transform and ensure we are and are not modifying the file as appropriate.
             try:
-                path = self._stream_source(
-                    target_source, dataset_instance.datatype, validate_hashes, materialized_dataset_hashes
-                )
+                path = self._stream_source(target_source, dataset_instance.datatype, materialized_dataset_hashes)
                 shutil.move(path, transient_paths.external_filename)
                 materialized_dataset.external_filename = transient_paths.external_filename
             except Exception as e:
@@ -211,14 +206,12 @@ def ensure_materialized(
             materialized_dataset_instance.metadata_deferred = False
         return materialized_dataset_instance

-    def _stream_source(
-        self, target_source: DatasetSource, datatype, validate_hashes: bool, dataset_hashes: List[DatasetHash]
-    ) -> str:
+    def _stream_source(self, target_source: DatasetSource, datatype, dataset_hashes: List[DatasetHash]) -> str:
         source_uri = target_source.source_uri
         if source_uri is None:
             raise Exception("Cannot stream from dataset source without specified source_uri")
         path = stream_url_to_file(source_uri, file_sources=self._file_sources)
-        if validate_hashes and target_source.hashes:
+        if target_source.hashes:
             for source_hash in target_source.hashes:
                 _validate_hash(path, source_hash, "downloaded file")

@@ -244,7 +237,7 @@ def _stream_source(
         if datatype_groom:
             datatype.groom_dataset_content(path)

-        if validate_hashes and dataset_hashes:
+        if dataset_hashes:
             for dataset_hash in dataset_hashes:
                 _validate_hash(path, dataset_hash, "dataset contents")

diff --git a/lib/galaxy/model/unittest_utils/store_fixtures.py b/lib/galaxy/model/unittest_utils/store_fixtures.py
index c05f0660fcf1..153303c12c81 100644
--- a/lib/galaxy/model/unittest_utils/store_fixtures.py
+++ b/lib/galaxy/model/unittest_utils/store_fixtures.py
@@ -12,6 +12,7 @@
 TEST_SOURCE_URI_BAM = "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/1.bam"
 TEST_HASH_FUNCTION = "MD5"
 TEST_HASH_VALUE = "f568c29421792b1b1df4474dafae01f1"
+TEST_HASH_VALUE_BAM = "34693a376f9878cbe53166db083e0e26"
 TEST_HISTORY_NAME = "My history in a model store"
 TEST_EXTENSION = "bed"
 TEST_LIBRARY_NAME = "My cool library"
@@ -339,7 +340,7 @@ def deferred_hda_model_store_dict_bam(
     dataset_hash = dict(
         model_class="DatasetHash",
         hash_function=TEST_HASH_FUNCTION,
-        hash_value=TEST_HASH_VALUE,
+        hash_value=TEST_HASH_VALUE_BAM,
         extra_files_path=None,
     )
     dataset_source: Dict[str, Any] = dict(
diff --git a/lib/galaxy/schema/schema.py b/lib/galaxy/schema/schema.py
index 10920de2d6d2..ac353a8268d5 100644
--- a/lib/galaxy/schema/schema.py
+++ b/lib/galaxy/schema/schema.py
@@ -3682,15 +3682,7 @@ class PageSummaryBase(Model):
     )


-class MaterializeDatasetOptions(Model):
-    validate_hashes: bool = Field(
-        False,
-        title="Validate hashes",
-        description="Set to true to enable dataset validation during materialization.",
-    )
-
-
-class MaterializeDatasetInstanceAPIRequest(MaterializeDatasetOptions):
+class MaterializeDatasetInstanceAPIRequest(Model):
     source: DatasetSourceType = Field(
         title="Source",
         description="The source of the content. Can be other history element to be copied or library elements.",
diff --git a/lib/galaxy/schema/tasks.py b/lib/galaxy/schema/tasks.py
index d83640f52ffa..313ec9f1dad0 100644
--- a/lib/galaxy/schema/tasks.py
+++ b/lib/galaxy/schema/tasks.py
@@ -108,11 +108,6 @@ class MaterializeDatasetInstanceTaskRequest(Model):
             "- The decoded id of the HDA\n"
         ),
     )
-    validate_hashes: bool = Field(
-        False,
-        title="Validate hashes",
-        description="Set to true to enable dataset validation during materialization.",
-    )


 class ComputeDatasetHashTaskRequest(Model):
diff --git a/lib/galaxy/tools/data_fetch.py b/lib/galaxy/tools/data_fetch.py
index 01d43461f509..bd5f1ae0b8e6 100644
--- a/lib/galaxy/tools/data_fetch.py
+++ b/lib/galaxy/tools/data_fetch.py
@@ -260,7 +260,7 @@ def _resolve_item_with_primary(item):
                 hash_function = hash_dict.get("hash_function")
                 hash_value = hash_dict.get("hash_value")
                 try:
-                    _handle_hash_validation(upload_config, hash_function, hash_value, path)
+                    _handle_hash_validation(hash_function, hash_value, path)
                 except Exception as e:
                     error_message = str(e)
                     item["error_message"] = error_message
@@ -501,7 +501,7 @@ def _has_src_to_path(upload_config: "UploadConfig", item: Dict[str, Any], is_dat
         for hash_function in HASH_NAMES:
             hash_value = item.get(hash_function)
             if hash_value:
-                _handle_hash_validation(upload_config, hash_function, hash_value, path)
+                _handle_hash_validation(hash_function, hash_value, path)
         if name is None:
             name = url.split("/")[-1]
     elif src == "pasted":
@@ -516,11 +516,8 @@ def _has_src_to_path(upload_config: "UploadConfig", item: Dict[str, Any], is_dat
     return name, path


-def _handle_hash_validation(
-    upload_config: "UploadConfig", hash_function: HashFunctionNameEnum, hash_value: str, path: str
-):
-    if upload_config.validate_hashes:
-        verify_hash(path, hash_func_name=hash_function, hash_value=hash_value, what="upload")
+def _handle_hash_validation(hash_function: HashFunctionNameEnum, hash_value: str, path: str):
+    verify_hash(path, hash_func_name=hash_function, hash_value=hash_value, what="upload")


 def _arg_parser():
@@ -566,7 +563,6 @@ def __init__(
         self.to_posix_lines = request.get("to_posix_lines", False)
         self.space_to_tab = request.get("space_to_tab", False)
         self.auto_decompress = request.get("auto_decompress", False)
-        self.validate_hashes = request.get("validate_hashes", False)
         self.deferred = request.get("deferred", False)
         self.link_data_only = _link_data_only(request)
         self.file_sources_dict = file_sources_dict
diff --git a/lib/galaxy/webapps/galaxy/api/history_contents.py b/lib/galaxy/webapps/galaxy/api/history_contents.py
index f5653a6f7dae..2386807211ba 100644
--- a/lib/galaxy/webapps/galaxy/api/history_contents.py
+++ b/lib/galaxy/webapps/galaxy/api/history_contents.py
@@ -54,7 +54,6 @@
     HistoryContentType,
     MaterializeDatasetInstanceAPIRequest,
     MaterializeDatasetInstanceRequest,
-    MaterializeDatasetOptions,
     StoreExportPayload,
     UpdateHistoryContentsBatchPayload,
     UpdateHistoryContentsPayload,
@@ -1073,17 +1072,12 @@ def materialize_dataset(
         history_id: HistoryIDPathParam,
         id: HistoryItemIDPathParam,
         trans: ProvidesHistoryContext = DependsOnTrans,
-        materialize_api_payload: Optional[MaterializeDatasetOptions] = Body(None),
     ) -> AsyncTaskResultSummary:
-        validate_hashes: bool = (
-            materialize_api_payload.validate_hashes if materialize_api_payload is not None else False
-        )
         # values are already validated, use model_construct
         materialize_request = MaterializeDatasetInstanceRequest.model_construct(
             history_id=history_id,
             source=DatasetSourceType.hda,
             content=id,
-            validate_hashes=validate_hashes,
         )
         rval = self.service.materialize(trans, materialize_request)
         return rval
diff --git a/lib/galaxy/webapps/galaxy/services/history_contents.py b/lib/galaxy/webapps/galaxy/services/history_contents.py
index f874b0a71749..6e60c334753a 100644
--- a/lib/galaxy/webapps/galaxy/services/history_contents.py
+++ b/lib/galaxy/webapps/galaxy/services/history_contents.py
@@ -573,7 +573,6 @@ def materialize(
             history_id=request.history_id,
             source=request.source,
             content=request.content,
-            validate_hashes=request.validate_hashes,
             user=trans.async_request_user,
         )
         results = materialize_task.delay(request=task_request)
diff --git a/lib/galaxy/workflow/scheduling_manager.py b/lib/galaxy/workflow/scheduling_manager.py
index 918ca431a0dd..a68f42aa8933 100644
--- a/lib/galaxy/workflow/scheduling_manager.py
+++ b/lib/galaxy/workflow/scheduling_manager.py
@@ -349,7 +349,6 @@ def __attempt_materialize(self, workflow_invocation, session) -> bool:
                     history_id=workflow_invocation.history.id,
                     source="hda",
                     content=hda.id,
-                    validate_hashes=True,
                 )
                 materialized_okay = self.app.hda_manager.materialize(task_request, in_place=True)
                 if not materialized_okay:
diff --git a/lib/galaxy_test/api/test_libraries.py b/lib/galaxy_test/api/test_libraries.py
index ba0bd7850870..75f8f4c30452 100644
--- a/lib/galaxy_test/api/test_libraries.py
+++ b/lib/galaxy_test/api/test_libraries.py
@@ -340,7 +340,6 @@ def test_fetch_failed_validation(self):
         payload = {
             "history_id": history_id,  # TODO: Shouldn't be needed :(
             "targets": targets,
-            "validate_hashes": True,
         }
         tool_response = self.dataset_populator.fetch(payload, assert_ok=False)
         job = self.dataset_populator.check_run(tool_response)
diff --git a/lib/galaxy_test/api/test_tools_upload.py b/lib/galaxy_test/api/test_tools_upload.py
index 15d0198f9c31..13a8ab067654 100644
--- a/lib/galaxy_test/api/test_tools_upload.py
+++ b/lib/galaxy_test/api/test_tools_upload.py
@@ -1025,7 +1025,6 @@ def test_upload_and_validate_hash_valid(self):
         payload = {
             "history_id": history_id,
             "targets": targets,
-            "validate_hashes": True,
         }
         fetch_response = self.dataset_populator.fetch(payload)
         self._assert_status_code_is(fetch_response, 200)
@@ -1050,7 +1049,6 @@ def test_upload_and_validate_hash_invalid(self):
         payload = {
             "history_id": history_id,
             "targets": targets,
-            "validate_hashes": True,
         }
         fetch_response = self.dataset_populator.fetch(payload, assert_ok=True, wait=False)
         self._assert_status_code_is(fetch_response, 200)
diff --git a/lib/galaxy_test/base/populators.py b/lib/galaxy_test/base/populators.py
index 5b8e46998c22..c2504c9c328f 100644
--- a/lib/galaxy_test/base/populators.py
+++ b/lib/galaxy_test/base/populators.py
@@ -1023,9 +1023,7 @@ def tools_post(self, payload: dict, url="tools") -> Response:
     def describe_tool_execution(self, tool_id: str) -> "DescribeToolExecution":
         return DescribeToolExecution(self, tool_id)

-    def materialize_dataset_instance(
-        self, history_id: str, id: str, source: str = "hda", validate_hashes: bool = False
-    ):
+    def materialize_dataset_instance(self, history_id: str, id: str, source: str = "hda"):
         payload: Dict[str, Any]
         if source == "ldda":
             url = f"histories/{history_id}/materialize"
@@ -1036,8 +1034,6 @@ def materialize_dataset_instance(self, history_id: str, id: str, source: str = "hda"):
         else:
             url = f"histories/{history_id}/contents/datasets/{id}/materialize"
             payload = {}
-        if validate_hashes:
-            payload["validate_hashes"] = True
         create_response = self._post(url, payload, json=True)
         api_asserts.assert_status_code_is_ok(create_response)
         create_response_json = create_response.json()
@@ -2780,7 +2776,6 @@ def fetch_single_url_to_folder(self, file_type="auto", assert_ok=True):
         payload = {
             "history_id": history_id,  # TODO: Shouldn't be needed :(
             "targets": targets,
-            "validate_hashes": True,
         }
         return library, self.dataset_populator.fetch(payload, assert_ok=assert_ok)

diff --git a/test/integration/test_materialize_dataset_instance_tasks.py b/test/integration/test_materialize_dataset_instance_tasks.py
index 5d3d209e7221..817098994a5f 100644
--- a/test/integration/test_materialize_dataset_instance_tasks.py
+++ b/test/integration/test_materialize_dataset_instance_tasks.py
@@ -89,7 +89,7 @@ def test_materialize_hash_failure(self, history_id: str):
         assert deferred_hda["state"] == "deferred"
         assert not deferred_hda["deleted"]

-        self.dataset_populator.materialize_dataset_instance(history_id, deferred_hda["id"], validate_hashes=True)
+        self.dataset_populator.materialize_dataset_instance(history_id, deferred_hda["id"])
         self.dataset_populator.wait_on_history_length(history_id, 2)
         new_hda_details = self.dataset_populator.get_history_dataset_details(
             history_id, hid=2, assert_ok=False, wait=False
diff --git a/test/unit/app/tools/test_data_fetch.py b/test/unit/app/tools/test_data_fetch.py
index 8c1a64318de6..667e5e7ebbb1 100644
--- a/test/unit/app/tools/test_data_fetch.py
+++ b/test/unit/app/tools/test_data_fetch.py
@@ -52,7 +52,6 @@ def test_simple_path_get(hash_value: str, error_message: Optional[str]):
                     ],
                 }
             ],
-            "validate_hashes": True,
         }
         execute_context.execute_request(request)
         output = _unnamed_output(execute_context)
@@ -111,7 +110,6 @@ def test_correct_md5():
                     ],
                 }
             ],
-            "validate_hashes": True,
         }
         execute_context.execute_request(request)
         output = _unnamed_output(execute_context)
@@ -142,7 +140,6 @@ def test_incorrect_md5():
                     ],
                 }
             ],
-            "validate_hashes": True,
         }
         execute_context.execute_request(request)
         output = _unnamed_output(execute_context)
@@ -175,7 +172,6 @@ def test_correct_sha1():
                     ],
                 }
             ],
-            "validate_hashes": True,
         }
         execute_context.execute_request(request)
         output = _unnamed_output(execute_context)
@@ -206,7 +202,6 @@ def test_incorrect_sha1():
                     ],
                 }
             ],
-            "validate_hashes": True,
         }
         execute_context.execute_request(request)
         output = _unnamed_output(execute_context)
diff --git a/test/unit/data/test_dataset_materialization.py b/test/unit/data/test_dataset_materialization.py
index e002b439726d..be791a047cce 100644
--- a/test/unit/data/test_dataset_materialization.py
+++ b/test/unit/data/test_dataset_materialization.py
@@ -71,7 +71,7 @@ def test_hash_validate():
     _assert_2_bed_metadata(deferred_hda)
     assert deferred_hda.dataset.state == "deferred"
     materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
-    materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
+    materialized_hda = materializer.ensure_materialized(deferred_hda)
     materialized_dataset = materialized_hda.dataset
     assert materialized_dataset.state == "ok"

@@ -86,7 +86,7 @@ def test_hash_invalid():
     _assert_2_bed_metadata(deferred_hda)
     assert deferred_hda.dataset.state == "deferred"
     materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
-    materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
+    materialized_hda = materializer.ensure_materialized(deferred_hda)
     materialized_dataset = materialized_hda.dataset
     assert materialized_dataset.state == "error"

@@ -103,7 +103,7 @@ def test_hash_validate_source_of_download():
     _assert_2_bed_metadata(deferred_hda)
     assert deferred_hda.dataset.state == "deferred"
     materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
-    materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
+    materialized_hda = materializer.ensure_materialized(deferred_hda)
     materialized_dataset = materialized_hda.dataset
     assert materialized_dataset.state == "ok", materialized_hda.info

@@ -120,7 +120,7 @@ def test_hash_invalid_source_of_download():
     _assert_2_bed_metadata(deferred_hda)
     assert deferred_hda.dataset.state == "deferred"
     materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
-    materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
+    materialized_hda = materializer.ensure_materialized(deferred_hda)
     materialized_dataset = materialized_hda.dataset
     assert materialized_dataset.state == "error", materialized_hda.info
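
For illustration, a minimal sketch of what an upload request looks like after this change, assuming the fetch payload layout exercised in the tests above ("history_id", "targets", per-element "hashes"). The URL, the MD5 value, and the fetch_with_hash helper are hypothetical placeholders, and dataset_populator stands in for the galaxy_test fixture; this is not code from the patch itself.

    from typing import Any, Dict


    def fetch_with_hash(dataset_populator: Any, history_id: str) -> None:
        """Sketch: upload a URL-backed dataset whose provided MD5 is always verified."""
        payload: Dict[str, Any] = {
            "history_id": history_id,
            "targets": [
                {
                    "destination": {"type": "hdas"},
                    "elements": [
                        {
                            "src": "url",
                            "url": "https://example.org/1.bed",  # placeholder source
                            "hashes": [
                                # Supplying a hash is enough: it is validated unconditionally now.
                                {"hash_function": "MD5", "hash_value": "0123456789abcdef0123456789abcdef"},
                            ],
                        }
                    ],
                }
            ],
            # Note: no "validate_hashes" key -- that option no longer exists.
        }
        # With a wrong hash_value the upload ends up in an error state instead of
        # silently succeeding, matching test_upload_and_validate_hash_invalid above.
        dataset_populator.fetch(payload, assert_ok=True)

The same rule applies to deferred datasets: as the unit tests above show, ensure_materialized now checks whatever hashes are recorded for the source or dataset, with no opt-in flag.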