Added tests

bihealth · Mar 11, 2024 · b5fe121 · b5fe121
1 parent 1bb597b
commit b5fe121
Show file tree

Hide file tree

Showing 4 changed files with 224 additions and 25 deletions.
diff --git a/cubi_tk/irods_common.py b/cubi_tk/irods_common.py
@@ -295,7 +295,7 @@ def __init__(
         :param irods_env_path: Path to irods_environment.json
         :type irods_env_path: pathlib.Path, optional
         """
-        super.__init__(ask, irods_env_path)
+        super().__init__(ask, irods_env_path)
         self.hash_scheme = hash_scheme
 
     def retrieve_irods_data_objects(self, irods_path: str) -> Dict[str, List[iRODSDataObject]]:
@@ -308,15 +308,15 @@ def retrieve_irods_data_objects(self, irods_path: str) -> Dict[str, List[iRODSDa
         """
 
         # Connect to iRODS
-        with self.session as irods_session:
+        with self.session as session:
             try:
-                root_coll = irods_session.collections.get(irods_path)
+                root_coll = session.collections.get(irods_path)
 
                 # Get files and run checks
                 logger.info("Querying for data objects")
 
                 if root_coll is not None:
-                    irods_data_objs = self.get_data_objs(root_coll)
+                    irods_data_objs = self._irods_query(session, root_coll)
                     irods_obj_dict = self.parse_irods_collection(irods_data_objs)
                     return irods_obj_dict
 
@@ -326,24 +326,27 @@ def retrieve_irods_data_objects(self, irods_path: str) -> Dict[str, List[iRODSDa
 
         return {}
 
-    def get_data_objs(
-        self, root_coll: iRODSCollection
+    def _irods_query(
+        self,
+        session: iRODSSession,
+        root_coll: iRODSCollection,
     ) -> Dict[str, Union[Dict[str, iRODSDataObject], List[iRODSDataObject]]]:
         """Get data objects recursively under the given iRODS path."""
-        data_objs = dict(files=[], checksums={})
+
         ignore_schemes = [k.lower() for k in HASH_SCHEMES if k != self.hash_scheme.upper()]
-        irods_sess = root_coll.manager.sess
 
-        query = irods_sess.query(DataObjectModel, CollectionModel).filter(
+        query = session.query(DataObjectModel, CollectionModel).filter(
             Like(CollectionModel.name, f"{root_coll.path}%")
         )
 
+        data_objs = dict(files=[], checksums={})
         for res in query:
-            # If the 'res' dict is not split into Colllection&Object the resulting iRODSDataObject is not fully functional, likely because a name/path/... attribute is overwritten somewhere
+            # If the 'res' dict is not split into Collection&Object the resulting iRODSDataObject is not fully functional,
+            # likely because a name/path/... attribute is overwritten somewhere
             coll_res = {k: v for k, v in res.items() if k.icat_id >= 500}
             obj_res = {k: v for k, v in res.items() if k.icat_id < 500}
             coll = iRODSCollection(root_coll.manager, coll_res)
-            obj = iRODSDataObject(irods_sess.data_objects, parent=coll, results=[obj_res])
+            obj = iRODSDataObject(session.data_objects, parent=coll, results=[obj_res])
 
             if obj.path.endswith("." + self.hash_scheme.lower()):
                 data_objs["checksums"][obj.path] = obj

diff --git a/cubi_tk/sodar/pull_data_collection.py b/cubi_tk/sodar/pull_data_collection.py
@@ -26,7 +26,9 @@ class PullDataCollection(PullDataCommon):
         "dragen": [
             "**/*_FAM_dragen.fam.hard-filtered.vcf.gz"
             "**/*_FAM_dragen.fam.hard-filtered.vcf.gz.tbi",
-            "**/*dragen.qc-coverage*",
+            "**/*.qc-coverage*.csv",
+            "**/*.ped",
+            "**/*.mapping_metrics.csv",
         ],
     }
 
@@ -76,9 +78,8 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
             help="UUID from Assay to check. Used to specify target while dealing with multi-assay projects.",
         )
 
-        group_files = parser.add_argument_group(
-            "File Selection", mutually_exclusive=True, required=True
-        )
+        group_files = parser.add_mutually_exclusive_group(required=True)
+
         group_files.add_argument(
             "-p", "--preset", help="Preset to use for file selection.", choices=cls.presets.keys()
         )
@@ -108,7 +109,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
         )
         group_samples.add_argument(
             "--biomedsheet",
-            help="Biomedsheet file for filtering collections. Sets tes-column to 2 and "
+            help="Biomedsheet file for filtering collections. Sets tsv-column to 2 and "
             "tsv-header to 13. Takes precedence over --tsv.",
         )
         group_samples.add_argument(
@@ -133,7 +134,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
         )
         parser.add_argument(
             "--output-regex",
-            nargs="3",
+            nargs=3,
             action="append",
             metavar=("FILEPART", "MATCH", "REPL"),
             default=[],
@@ -206,15 +207,15 @@ def execute(self) -> typing.Optional[int]:
             samples = None
 
         # Find all remote files (iRODS)
-        FileSearcher = RetrieveSodarCollection(
+        filesearcher = RetrieveSodarCollection(
             self.args.sodar_url,
             self.args.sodar_api_token,
             self.args.assay_uuid,
             self.args.project_uuid,
         )
 
-        remote_files_dict = FileSearcher.perform()
-        assay_path = FileSearcher.get_assay_irods_path(self.args.assay_uuid)
+        remote_files_dict = filesearcher.perform()
+        assay_path = filesearcher.get_assay_irods_path(self.args.assay_uuid)
 
         if self.args.all_files:
             file_patterns = []
@@ -223,8 +224,8 @@ def execute(self) -> typing.Optional[int]:
         else:  # self.args.file_pattern
             file_patterns = self.args.file_pattern
 
-        filtered_remote_files_dict = self.filter_irods_collection(
-            remote_files_dict, file_patterns, samples, self.args.substring_match, assay_path
+        filtered_remote_files_dict = self.filter_irods_file_list(
+            file_patterns, samples, self.args.substring_match, assay_path
         )
 
         if len(filtered_remote_files_dict) == 0:
@@ -265,8 +266,8 @@ def parse_sample_tsv(tsv_path, sample_col=1, n_header_cols=1, skip_comments=True
 
         return samples
 
-    def filter_irods_collection(
-        self,
+    @staticmethod
+    def filter_irods_file_list(
         remote_files_dict: Dict[str, List[iRODSDataObject]],
         common_assay_path: str,
         file_patterns: List[str],
@@ -353,6 +354,8 @@ def build_download_jobs(
                         self.args.output_dir, self.args.output_pattern.format(**out_parts)
                     ),
                     irods_obj.path,
+                    # # Unclear if this is available or not
+                    # irods_obj.size,
                 )
                 output_list.append(job)
 

diff --git a/tests/test_irods_common.py b/tests/test_irods_common.py
@@ -4,7 +4,7 @@
 import irods.exception
 import pytest
 
-from cubi_tk.irods_common import TransferJob, iRODSCommon, iRODSTransfer
+from cubi_tk.irods_common import TransferJob, iRODSCommon, iRODSTransfer, iRODSRetrieveCollection
 
 
 def test_transfer_job_bytes(fs):
@@ -167,3 +167,44 @@ def test_irods_transfer_get(mocksession, jobs):
         # download
         mockget.assert_any_call(job.path_remote, job.path_local)
     assert itransfer.size == 222
+
+
+# Test iRODSRetrieveCollection #########
+
+
+# This tests `retrieve_irods_data_objects` and by extension `parse_irods_collection`
+# A test for _irods_query would require mocking `session.query` results in a
+# way that allows creation of iRODSDataObject instances from those results
+@patch("cubi_tk.irods_common.iRODSCommon._init_irods")
+@patch("cubi_tk.irods_common.iRODSRetrieveCollection._irods_query")
+def test_irods_retrieve_data_objects(mockquery, mocksession):
+    # Possible alternative to MagicMocks here:
+    # create a fake iRODSDataObject class with a path attribute
+    mockobj1 = MagicMock()
+    mockobj1.path = "/root/coll1/file1.vcf.gz"
+    mockobj1.name = "file1.vcf.gz"
+    mockobj2 = MagicMock()
+    mockobj2.path = "/root/coll2/file2.vcf.gz"
+    mockobj2.name = "file2.vcf.gz"
+    mockobj3 = MagicMock()
+    mockobj3.path = "/root/coll1/subcol/file1.vcf.gz"
+    mockobj3.name = "file1.vcf.gz"
+
+    mockcksum = MagicMock()
+
+    mockquery.return_value = {
+        "files": [mockobj1, mockobj2, mockobj3],
+        "checksums": {
+            "/root/coll1/file1.vcf.gz": mockcksum,
+            "/root/coll2/file2.vcf.gz": mockcksum,
+            "/root/coll1/subcol/file1.vcf.gz": mockcksum,
+        },
+    }
+
+    mocksession.collections.get.return_value = "path"
+
+    data_objs = iRODSRetrieveCollection().retrieve_irods_data_objects("/fake/path")
+
+    expected_data_objs = {"file1.vcf.gz": [mockobj1, mockobj3], "file2.vcf.gz": [mockobj2]}
+
+    assert data_objs == expected_data_objs
diff --git a/tests/test_sodar_pull_data_collection.py b/tests/test_sodar_pull_data_collection.py
@@ -0,0 +1,152 @@
+from copy import deepcopy
+from unittest.mock import MagicMock
+
+import pathlib
+import pytest
+import re
+
+from cubi_tk.sodar.pull_data_collection import PullDataCollection
+from cubi_tk.irods_common import TransferJob
+
+
+class MockDataObject:
+    def __init__(self, path):
+        self.path = path
+
+    def __eq__(self, other):
+        return self.path == other.path
+
+    def __repr__(self):
+        return f"MockDataObject(path={self.path})"
+
+
+@pytest.fixture
+def filtered_data_objects():
+    return {
+        "coll1-N1-DNA1": [
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/file1.vcf.gz"),
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol2/file1.vcf.gz"),
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/miscFile.txt"),
+        ],
+        "coll2-N1-DNA1": [
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.vcf.gz"),
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.bam"),
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/miscFile.txt"),
+        ],
+    }
+
+
+def test_filter_irods_collection(filtered_data_objects):
+    fake_irods_data_dict = {
+        "file1.vcf.gz": [
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/file1.vcf.gz"),
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol2/file1.vcf.gz"),
+        ],
+        "file2.vcf.gz": [
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.vcf.gz"),
+        ],
+        "file2.bam": [
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.bam"),
+        ],
+        "miscFile.txt": [
+            MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/miscFile.txt"),
+            MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/miscFile.txt"),
+        ],
+    }
+
+    kwarg_list = [
+        # No filters at all -> all files
+        {"file_patterns": [], "samples": [], "substring_match": False},
+        # Test filepattern filter works
+        {"file_patterns": ["*.vcf.gz"], "samples": [], "substring_match": False},
+        # Test file pattern with multiple patterns, also **/*.X & *.Y
+        {"file_patterns": ["*.vcf.gz", "**/*.txt"], "samples": [], "substring_match": False},
+        # Test Sample/Collection filter works
+        {"file_patterns": [], "samples": ["coll1-N1-DNA1"], "substring_match": False},
+        # Test substring matching works
+        {"file_patterns": [], "samples": ["coll1"], "substring_match": True},
+    ]
+
+    expected_results = [
+        deepcopy(filtered_data_objects),
+        {
+            k: [v for v in l if v.path.endswith("vcf.gz")]
+            for k, l in deepcopy(filtered_data_objects).items()
+        },
+        {
+            k: [v for v in l if not v.path.endswith("bam")]
+            for k, l in deepcopy(filtered_data_objects).items()
+        },
+        {k: l for k, l in deepcopy(filtered_data_objects).items() if k == "coll1-N1-DNA1"},
+        {k: l for k, l in deepcopy(filtered_data_objects).items() if k == "coll1-N1-DNA1"},
+    ]
+
+    for kwargs, expected in zip(kwarg_list, expected_results):
+        result = PullDataCollection.filter_irods_file_list(
+            fake_irods_data_dict, "/irods/project", **kwargs
+        )
+        assert result == expected
+
+
+def test_build_download_jobs(filtered_data_objects):
+    mockargs = MagicMock()
+    mockargs.output_dir = "/path/to/output"
+    mockargs.output_regex = []  # ['', '', '']
+    mockargs.output_pattern = "{collection}/{subcollections}/{filename}"
+
+    testinstance = PullDataCollection(mockargs)
+
+    expected_out = [
+        TransferJob(
+            path_remote=obj.path, path_local=obj.path.replace("/irods/project", "/path/to/output")
+        )
+        for k, l in filtered_data_objects.items()
+        for obj in l
+    ]
+    out = testinstance.build_download_jobs(filtered_data_objects, "/irods/project")
+    assert out == expected_out
+
+    # Test with different output pattern
+    mockargs.output_pattern = "{collection}/{filename}"
+    expected_out = [
+        TransferJob(
+            path_remote=obj.path,
+            path_local=re.sub(
+                "/subcol[12]", "", obj.path.replace("/irods/project", "/path/to/output")
+            ),
+        )
+        for k, l in filtered_data_objects.items()
+        for obj in l
+    ]
+    out = testinstance.build_download_jobs(filtered_data_objects, "/irods/project")
+    assert out == expected_out
+
+    # Test with regex
+    mockargs.output_regex = [
+        ["subcollections", "subcol", "subcollection"],
+        ["collection", "-N1-DNA1", ""],
+    ]
+    mockargs.output_pattern = "{collection}/{subcollections}/{filename}"
+    expected_out = [
+        TransferJob(
+            path_remote=obj.path,
+            path_local=obj.path.replace("/irods/project", "/path/to/output")
+            .replace("subcol", "subcollection")
+            .replace("-N1-DNA1", ""),
+        )
+        for k, l in filtered_data_objects.items()
+        for obj in l
+    ]
+    out = testinstance.build_download_jobs(filtered_data_objects, "/irods/project")
+    assert out == expected_out
+
+
+def test_parse_samplesheet():
+    # Test on Biomedsheet
+    samples = PullDataCollection.parse_sample_tsv(
+        pathlib.Path(__file__).resolve().parent / "data" / "pull_sheets" / "sheet.tsv",
+        sample_col=2,
+        n_header_cols=13,
+    )
+
+    assert samples == ["index", "mother", "father"]