diff --git a/cubi_tk/irods_common.py b/cubi_tk/irods_common.py
index 68fc02d..852ccd9 100644
--- a/cubi_tk/irods_common.py
+++ b/cubi_tk/irods_common.py
@@ -295,7 +295,7 @@ def __init__(
         :param irods_env_path: Path to irods_environment.json
         :type irods_env_path: pathlib.Path, optional
         """
-        super.__init__(ask, irods_env_path)
+        super().__init__(ask, irods_env_path)
         self.hash_scheme = hash_scheme
 
     def retrieve_irods_data_objects(self, irods_path: str) -> Dict[str, List[iRODSDataObject]]:
@@ -308,15 +308,15 @@ def retrieve_irods_data_objects(self, irods_path: str) -> Dict[str, List[iRODSDataObject]]:
         """
         # Connect to iRODS
-        with self.session as irods_session:
+        with self.session as session:
             try:
-                root_coll = irods_session.collections.get(irods_path)
+                root_coll = session.collections.get(irods_path)
 
                 # Get files and run checks
                 logger.info("Querying for data objects")
 
                 if root_coll is not None:
-                    irods_data_objs = self.get_data_objs(root_coll)
+                    irods_data_objs = self._irods_query(session, root_coll)
                     irods_obj_dict = self.parse_irods_collection(irods_data_objs)
                     return irods_obj_dict
@@ -326,24 +326,27 @@ def retrieve_irods_data_objects(self, irods_path: str) -> Dict[str, List[iRODSDataObject]]:
 
         return {}
 
-    def get_data_objs(
-        self, root_coll: iRODSCollection
+    def _irods_query(
+        self,
+        session: iRODSSession,
+        root_coll: iRODSCollection,
     ) -> Dict[str, Union[Dict[str, iRODSDataObject], List[iRODSDataObject]]]:
         """Get data objects recursively under the given iRODS path."""
-        data_objs = dict(files=[], checksums={})
+        ignore_schemes = [k.lower() for k in HASH_SCHEMES if k != self.hash_scheme.upper()]
 
-        irods_sess = root_coll.manager.sess
-        query = irods_sess.query(DataObjectModel, CollectionModel).filter(
+        query = session.query(DataObjectModel, CollectionModel).filter(
             Like(CollectionModel.name, f"{root_coll.path}%")
         )
 
+        data_objs = dict(files=[], checksums={})
         for res in query:
-            # If the 'res' dict is not split into Colllection&Object the resulting iRODSDataObject is not fully functional, likely because a name/path/... attribute is overwritten somewhere
+            # If the 'res' dict is not split into Collection & Object, the resulting iRODSDataObject
+            # is not fully functional, likely because a name/path/... attribute is overwritten somewhere
             coll_res = {k: v for k, v in res.items() if k.icat_id >= 500}
             obj_res = {k: v for k, v in res.items() if k.icat_id < 500}
             coll = iRODSCollection(root_coll.manager, coll_res)
-            obj = iRODSDataObject(irods_sess.data_objects, parent=coll, results=[obj_res])
+            obj = iRODSDataObject(session.data_objects, parent=coll, results=[obj_res])
 
             if obj.path.endswith("." + self.hash_scheme.lower()):
                 data_objs["checksums"][obj.path] = obj
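
For orientation: `_irods_query` separates checksum objects (recognized by the hash
scheme's file extension) from regular data objects, and `parse_irods_collection` then
groups the `files` entries by object name, as exercised by the new test in
tests/test_irods_common.py further down. A minimal sketch of the returned shape,
assuming an MD5 hash scheme; `obj_for`, the `SimpleNamespace` stand-in, and the paths
are invented for illustration and are not part of the code above:

    from types import SimpleNamespace

    def obj_for(path: str) -> SimpleNamespace:
        # Hypothetical stand-in for iRODSDataObject; only .path/.name matter here
        return SimpleNamespace(path=path, name=path.rsplit("/", 1)[-1])

    data_objs = {
        # regular data objects, in query order
        "files": [obj_for("/project/coll1/file1.vcf.gz")],
        # checksum objects, keyed by their iRODS path
        "checksums": {
            "/project/coll1/file1.vcf.gz.md5": obj_for("/project/coll1/file1.vcf.gz.md5")
        },
    }
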
diff --git a/cubi_tk/sodar/pull_data_collection.py b/cubi_tk/sodar/pull_data_collection.py
index e9c6172..e26d02c 100644
--- a/cubi_tk/sodar/pull_data_collection.py
+++ b/cubi_tk/sodar/pull_data_collection.py
@@ -26,7 +26,9 @@ class PullDataCollection(PullDataCommon):
         "dragen": [
-            "**/*_FAM_dragen.fam.hard-filtered.vcf.gz"
+            "**/*_FAM_dragen.fam.hard-filtered.vcf.gz",
             "**/*_FAM_dragen.fam.hard-filtered.vcf.gz.tbi",
-            "**/*dragen.qc-coverage*",
+            "**/*.qc-coverage*.csv",
+            "**/*.ped",
+            "**/*.mapping_metrics.csv",
         ],
     }
@@ -76,9 +78,8 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
             help="UUID from Assay to check. Used to specify target while dealing with multi-assay projects.",
         )
 
-        group_files = parser.add_argument_group(
-            "File Selection", mutually_exclusive=True, required=True
-        )
+        group_files = parser.add_mutually_exclusive_group(required=True)
+
         group_files.add_argument(
             "-p", "--preset", help="Preset to use for file selection.", choices=cls.presets.keys()
         )
@@ -108,7 +109,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
         )
         group_samples.add_argument(
             "--biomedsheet",
-            help="Biomedsheet file for filtering collections. Sets tes-column to 2 and "
+            help="Biomedsheet file for filtering collections. Sets tsv-column to 2 and "
             "tsv-header to 13. Takes precedence over --tsv.",
         )
         group_samples.add_argument(
@@ -133,7 +134,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
         )
         parser.add_argument(
             "--output-regex",
-            nargs="3",
+            nargs=3,
             action="append",
             metavar=("FILEPART", "MATCH", "REPL"),
             default=[],
@@ -206,15 +207,15 @@ def execute(self) -> typing.Optional[int]:
             samples = None
 
         # Find all remote files (iRODS)
-        FileSearcher = RetrieveSodarCollection(
+        filesearcher = RetrieveSodarCollection(
            self.args.sodar_url,
            self.args.sodar_api_token,
            self.args.assay_uuid,
            self.args.project_uuid,
         )
 
-        remote_files_dict = FileSearcher.perform()
-        assay_path = FileSearcher.get_assay_irods_path(self.args.assay_uuid)
+        remote_files_dict = filesearcher.perform()
+        assay_path = filesearcher.get_assay_irods_path(self.args.assay_uuid)
 
         if self.args.all_files:
             file_patterns = []
@@ -223,8 +224,8 @@ def execute(self) -> typing.Optional[int]:
         else:  # self.args.file_pattern
             file_patterns = self.args.file_pattern
 
-        filtered_remote_files_dict = self.filter_irods_collection(
-            remote_files_dict, file_patterns, samples, self.args.substring_match, assay_path
+        filtered_remote_files_dict = self.filter_irods_file_list(
+            remote_files_dict, assay_path, file_patterns, samples, self.args.substring_match
         )
 
         if len(filtered_remote_files_dict) == 0:
@@ -265,8 +266,8 @@ def parse_sample_tsv(tsv_path, sample_col=1, n_header_cols=1, skip_comments=True):
 
         return samples
 
-    def filter_irods_collection(
-        self,
+    @staticmethod
+    def filter_irods_file_list(
         remote_files_dict: Dict[str, List[iRODSDataObject]],
         common_assay_path: str,
         file_patterns: List[str],
@@ -353,6 +354,8 @@ def build_download_jobs(
                     self.args.output_dir, self.args.output_pattern.format(**out_parts)
                 ),
                 irods_obj.path,
+                # # Unclear if this is available or not
+                # irods_obj.size,
             )
             output_list.append(job)
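
The application of the collected `--output-regex` triples lies outside these hunks, so
the following is only a sketch of how each `(FILEPART, MATCH, REPL)` triple presumably
interacts with `--output-pattern`. The `out_parts` values are invented; the triples and
the pattern are taken from `test_build_download_jobs` below:

    import re

    # out_parts as build_download_jobs presumably assembles it; the part names
    # come from the default output pattern, the values are invented.
    out_parts = {
        "collection": "coll1-N1-DNA1",
        "subcollections": "subcol1",
        "filename": "file1.vcf.gz",
    }
    output_regex = [
        ["subcollections", "subcol", "subcollection"],
        ["collection", "-N1-DNA1", ""],
    ]
    # Presumed behavior: each triple rewrites one named part before formatting
    for filepart, match, repl in output_regex:
        out_parts[filepart] = re.sub(match, repl, out_parts[filepart])

    assert "{collection}/{subcollections}/{filename}".format(**out_parts) == (
        "coll1/subcollection1/file1.vcf.gz"
    )
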
diff --git a/tests/test_irods_common.py b/tests/test_irods_common.py
index 156ae24..3f478cc 100644
--- a/tests/test_irods_common.py
+++ b/tests/test_irods_common.py
@@ -4,7 +4,7 @@
 import irods.exception
 import pytest
 
-from cubi_tk.irods_common import TransferJob, iRODSCommon, iRODSTransfer
+from cubi_tk.irods_common import TransferJob, iRODSCommon, iRODSTransfer, iRODSRetrieveCollection
 
 
 def test_transfer_job_bytes(fs):
@@ -167,3 +167,44 @@ def test_irods_transfer_get(mocksession, jobs):
     # download
     mockget.assert_any_call(job.path_remote, job.path_local)
     assert itransfer.size == 222
+
+
+# Test iRODSRetrieveCollection #########
+
+
+# This tests `retrieve_irods_data_objects` and, by extension, `parse_irods_collection`.
+# A test for `_irods_query` would require mocking `session.query` results in a
+# way that allows creation of iRODSDataObject instances from those results.
+@patch("cubi_tk.irods_common.iRODSCommon._init_irods")
+@patch("cubi_tk.irods_common.iRODSRetrieveCollection._irods_query")
+def test_irods_retrieve_data_objects(mockquery, mocksession):
+    # Possible alternative to MagicMocks here:
+    # create a fake iRODSDataObject class with a path attribute
+    mockobj1 = MagicMock()
+    mockobj1.path = "/root/coll1/file1.vcf.gz"
+    mockobj1.name = "file1.vcf.gz"
+    mockobj2 = MagicMock()
+    mockobj2.path = "/root/coll2/file2.vcf.gz"
+    mockobj2.name = "file2.vcf.gz"
+    mockobj3 = MagicMock()
+    mockobj3.path = "/root/coll1/subcol/file1.vcf.gz"
+    mockobj3.name = "file1.vcf.gz"
+
+    mockcksum = MagicMock()
+
+    mockquery.return_value = {
+        "files": [mockobj1, mockobj2, mockobj3],
+        "checksums": {
+            "/root/coll1/file1.vcf.gz": mockcksum,
+            "/root/coll2/file2.vcf.gz": mockcksum,
+            "/root/coll1/subcol/file1.vcf.gz": mockcksum,
+        },
+    }
+
+    mocksession.collections.get.return_value = "path"
+
+    data_objs = iRODSRetrieveCollection().retrieve_irods_data_objects("/fake/path")
+
+    expected_data_objs = {"file1.vcf.gz": [mockobj1, mockobj3], "file2.vcf.gz": [mockobj2]}
+
+    assert data_objs == expected_data_objs
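
Following the comment at the top of the new test, the MagicMocks could be replaced by a
small fake class, analogous to `MockDataObject` in the new test file below. A sketch;
the class name and method set are hypothetical:

    class FakeDataObject:
        # Stand-in for iRODSDataObject; only the attributes this test
        # actually reads (.path, .name) plus equality for the assert
        def __init__(self, path: str):
            self.path = path
            self.name = path.rsplit("/", 1)[-1]

        def __eq__(self, other):
            return self.path == other.path
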
diff --git a/tests/test_sodar_pull_data_collection.py b/tests/test_sodar_pull_data_collection.py
new file mode 100644
index 0000000..89c91d2
--- /dev/null
+++ b/tests/test_sodar_pull_data_collection.py
@@ -0,0 +1,152 @@
+from copy import deepcopy
+from unittest.mock import MagicMock
+
+import pathlib
+import pytest
+import re
+
+from cubi_tk.sodar.pull_data_collection import PullDataCollection
+from cubi_tk.irods_common import TransferJob
+
+
+class MockDataObject:
+    def __init__(self, path):
+        self.path = path
+
+    def __eq__(self, other):
+        return self.path == other.path
+
+    def __repr__(self):
+        return f"MockDataObject(path={self.path})"
+
+
+@pytest.fixture
+def filtered_data_objects():
+    return {
+        "coll1-N1-DNA1": [
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/file1.vcf.gz"),
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol2/file1.vcf.gz"),
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/miscFile.txt"),
+        ],
+        "coll2-N1-DNA1": [
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.vcf.gz"),
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.bam"),
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/miscFile.txt"),
+        ],
+    }
+
+
+def test_filter_irods_file_list(filtered_data_objects):
+    fake_irods_data_dict = {
+        "file1.vcf.gz": [
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/file1.vcf.gz"),
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol2/file1.vcf.gz"),
+        ],
+        "file2.vcf.gz": [
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.vcf.gz"),
+        ],
+        "file2.bam": [
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.bam"),
+        ],
+        "miscFile.txt": [
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/miscFile.txt"),
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/miscFile.txt"),
+        ],
+    }
+
+    kwarg_list = [
+        # No filters at all -> all files
+        {"file_patterns": [], "samples": [], "substring_match": False},
+        # Test that the file pattern filter works
+        {"file_patterns": ["*.vcf.gz"], "samples": [], "substring_match": False},
+        # Test file patterns with multiple patterns, also **/*.X & *.Y
+        {"file_patterns": ["*.vcf.gz", "**/*.txt"], "samples": [], "substring_match": False},
+        # Test that the sample/collection filter works
+        {"file_patterns": [], "samples": ["coll1-N1-DNA1"], "substring_match": False},
+        # Test that substring matching works
+        {"file_patterns": [], "samples": ["coll1"], "substring_match": True},
+    ]
+
+    expected_results = [
+        deepcopy(filtered_data_objects),
+        {
+            k: [v for v in l if v.path.endswith("vcf.gz")]
+            for k, l in deepcopy(filtered_data_objects).items()
+        },
+        {
+            k: [v for v in l if not v.path.endswith("bam")]
+            for k, l in deepcopy(filtered_data_objects).items()
+        },
+        {k: l for k, l in deepcopy(filtered_data_objects).items() if k == "coll1-N1-DNA1"},
+        {k: l for k, l in deepcopy(filtered_data_objects).items() if k == "coll1-N1-DNA1"},
+    ]
+
+    for kwargs, expected in zip(kwarg_list, expected_results):
+        result = PullDataCollection.filter_irods_file_list(
+            fake_irods_data_dict, "/irods/project", **kwargs
+        )
+        assert result == expected
+
+
+def test_build_download_jobs(filtered_data_objects):
+    mockargs = MagicMock()
+    mockargs.output_dir = "/path/to/output"
+    mockargs.output_regex = []  # each entry: [FILEPART, MATCH, REPL]
+    mockargs.output_pattern = "{collection}/{subcollections}/{filename}"
+
+    testinstance = PullDataCollection(mockargs)
+
+    expected_out = [
+        TransferJob(
+            path_remote=obj.path, path_local=obj.path.replace("/irods/project", "/path/to/output")
+        )
+        for k, l in filtered_data_objects.items()
+        for obj in l
+    ]
+    out = testinstance.build_download_jobs(filtered_data_objects, "/irods/project")
+    assert out == expected_out
+
+    # Test with different output pattern
+    mockargs.output_pattern = "{collection}/{filename}"
+    expected_out = [
+        TransferJob(
+            path_remote=obj.path,
+            path_local=re.sub(
+                "/subcol[12]", "", obj.path.replace("/irods/project", "/path/to/output")
+            ),
+        )
+        for k, l in filtered_data_objects.items()
+        for obj in l
+    ]
+    out = testinstance.build_download_jobs(filtered_data_objects, "/irods/project")
+    assert out == expected_out
+
+    # Test with regex
+    mockargs.output_regex = [
+        ["subcollections", "subcol", "subcollection"],
+        ["collection", "-N1-DNA1", ""],
+    ]
+    mockargs.output_pattern = "{collection}/{subcollections}/{filename}"
+    expected_out = [
+        TransferJob(
+            path_remote=obj.path,
+            path_local=obj.path.replace("/irods/project", "/path/to/output")
+            .replace("subcol", "subcollection")
+            .replace("-N1-DNA1", ""),
+        )
+        for k, l in filtered_data_objects.items()
+        for obj in l
+    ]
+    out = testinstance.build_download_jobs(filtered_data_objects, "/irods/project")
+    assert out == expected_out
+
+
+def test_parse_samplesheet():
+    # Test on Biomedsheet
+    samples = PullDataCollection.parse_sample_tsv(
+        pathlib.Path(__file__).resolve().parent / "data" / "pull_sheets" / "sheet.tsv",
+        sample_col=2,
+        n_header_cols=13,
+    )
+
+    assert samples == ["index", "mother", "father"]
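
A natural companion test (not part of this PR) would cover the defaults
`sample_col=1`, `n_header_cols=1` from the signature of `parse_sample_tsv` shown above.
The test name, the inline TSV, the use of pytest's `tmp_path` fixture, and the
assumption that parsing skips one header line and preserves row order are all
illustrative:

    def test_parse_plain_tsv(tmp_path):
        # assumed one-column layout: one header line, then one sample per row
        tsv = tmp_path / "samples.tsv"
        tsv.write_text("sample\nindex\nmother\nfather\n")
        samples = PullDataCollection.parse_sample_tsv(tsv)
        assert samples == ["index", "mother", "father"]
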