diff --git a/config/beanstalk.clean.yml b/config/beanstalk.clean.yml index 3a22bb4ba..443e4aa81 100644 --- a/config/beanstalk.clean.yml +++ b/config/beanstalk.clean.yml @@ -17,3 +17,6 @@ beanstalkd: ancestry: submission: ancestry events: ancestry_events + proteomics: + submission: proteomics + events: proteomics_events \ No newline at end of file diff --git a/python/pyproject.toml b/python/pyproject.toml index 3e6430f0f..cc443c85a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["maturin>=0.14,<0.15", "setuptools", "wheel", "Cython"] +requires = ["maturin>=1.3.0,<1.4.0", "setuptools", "wheel", "Cython"] build-backend = "maturin" [project] diff --git a/python/python/bystro/search/index/bystro_file.pyx b/python/python/bystro/search/index/bystro_file.pyx index 9b8f202df..26017005a 100644 --- a/python/python/bystro/search/index/bystro_file.pyx +++ b/python/python/bystro/search/index/bystro_file.pyx @@ -30,14 +30,14 @@ cdef class ReadAnnotationTarball: list paths int id - def __cinit__(self, str index_name, dict delimiters, str tar_path, str annotation_name = 'annotation.tsv.gz', int chunk_size=500): + def __cinit__(self, str index_name, object delimiters, str tar_path, str annotation_name = 'annotation.tsv.gz', int chunk_size=500): self.index_name = index_name self.chunk_size = chunk_size - self.field_separator = delimiters['field'] - self.position_delimiter = delimiters['position'] - self.overlap_delimiter = delimiters['overlap'] - self.value_delimiter = delimiters['value'] - self.empty_field_char = delimiters['empty_field'] + self.field_separator = delimiters.field + self.position_delimiter = delimiters.position + self.overlap_delimiter = delimiters.overlap + self.value_delimiter = delimiters.value + self.empty_field_char = delimiters.empty_field t = tarfile.open(tar_path) for member in t.getmembers(): @@ -117,5 +117,5 @@ cdef class ReadAnnotationTarball: def get_header_fields(self): return self.header_fields -cpdef ReadAnnotationTarball read_annotation_tarball(str index_name, dict delimiters, str tar_path, str annotation_name = 'annotation.tsv.gz', int chunk_size=500): +cpdef ReadAnnotationTarball read_annotation_tarball(str index_name, object delimiters, str tar_path, str annotation_name = 'annotation.tsv.gz', int chunk_size=500): return ReadAnnotationTarball(index_name, delimiters, tar_path, annotation_name, chunk_size) \ No newline at end of file diff --git a/python/python/bystro/search/index/handler.py b/python/python/bystro/search/index/handler.py index a9b1e5ff0..624080807 100644 --- a/python/python/bystro/search/index/handler.py +++ b/python/python/bystro/search/index/handler.py @@ -14,9 +14,9 @@ from bystro.beanstalkd.worker import ProgressPublisher, get_progress_reporter from bystro.search.index.bystro_file import ( # type: ignore # pylint: disable=no-name-in-module,import-error # noqa: E501 - read_annotation_tarball, + read_annotation_tarball, ) -from bystro.search.utils.annotation import get_delimiters +from bystro.search.utils.annotation import DelimitersConfig from bystro.search.utils.opensearch import gather_opensearch_args ray.init(ignore_reinit_error=True, address="auto") @@ -47,7 +47,9 @@ async def index(self, data): self.counter += resp[0] if self.counter >= self.reporter_batch: - await asyncio.to_thread(self.progress_tracker.increment.remote, self.counter) + await asyncio.to_thread( + self.progress_tracker.increment.remote, self.counter + ) self.counter = 0 return resp @@ -96,7 +98,9 @@ async def go( if not 
index_body["settings"].get("number_of_shards"): file_size = os.path.getsize(tar_path) - index_body["settings"]["number_of_shards"] = ceil(float(file_size) / float(1e10)) + index_body["settings"]["number_of_shards"] = ceil( + float(file_size) / float(1e10) + ) try: await client.indices.create(index_name, body=index_body) @@ -106,7 +110,7 @@ async def go( data = read_annotation_tarball( index_name=index_name, tar_path=tar_path, - delimiters=get_delimiters(), + delimiters=DelimitersConfig(), chunk_size=paralleleism_chunk_size, ) @@ -154,7 +158,9 @@ async def go( if __name__ == "__main__": parser = argparse.ArgumentParser(description="Process some config files.") - parser.add_argument("--tar", type=str, help="Path to the tarball containing the annotation") + parser.add_argument( + "--tar", type=str, help="Path to the tarball containing the annotation" + ) parser.add_argument( "--search_conf", diff --git a/python/python/bystro/search/index/tests/test_bystro_file.py b/python/python/bystro/search/index/tests/test_bystro_file.py index b31571f65..97f490171 100644 --- a/python/python/bystro/search/index/tests/test_bystro_file.py +++ b/python/python/bystro/search/index/tests/test_bystro_file.py @@ -6,7 +6,7 @@ from bystro.search.index.bystro_file import ( # type: ignore # pylint: disable=no-name-in-module,import-error # noqa: E501 read_annotation_tarball, ) -from bystro.search.utils.annotation import get_delimiters +from bystro.search.utils.annotation import DelimitersConfig def create_mock_tarball(annotation_content): @@ -27,11 +27,11 @@ def create_mock_tarball(annotation_content): def test_read_annotation_tarball(): - delims = get_delimiters() - delim_v = delims["value"] # e.g. ; - delim_f = delims["field"] # e.g. \t - delim_o = delims["overlap"] # e.g. chr(31) - delim_p = delims["position"] # e.g. | + delims = DelimitersConfig() + delim_v = delims.value # e.g. ; + delim_f = delims.field # e.g. \t + delim_o = delims.overlap # e.g. chr(31) + delim_p = delims.position # e.g. 
| header = f"field1{delim_f}field2{delim_f}field3\n" field1val = f"value1a{delim_v}value1b{delim_p}value2aa{delim_o}value2ab{delim_v}value2b{delim_f}" @@ -70,9 +70,12 @@ def test_read_annotation_tarball(): }, } ] - + import numpy as np + import json result_data = next(reader) assert result_data == expected_data + np.testing.assert_equal(result_data, expected_data) + print(json.dumps(expected_data, indent=4)) # Test the end of the data with pytest.raises(StopIteration): diff --git a/python/python/bystro/search/save/handler.py b/python/python/bystro/search/save/handler.py index 9489e7e05..c67bb0e0e 100644 --- a/python/python/bystro/search/save/handler.py +++ b/python/python/bystro/search/save/handler.py @@ -7,11 +7,13 @@ # TODO 2023-05-08: concatenate chunks in a different ray worker import gzip +import logging import math import os import pathlib import subprocess import traceback +from typing import Any import numpy as np import ray @@ -19,9 +21,16 @@ from opensearchpy import OpenSearch from bystro.beanstalkd.worker import ProgressPublisher, get_progress_reporter -from bystro.search.utils.annotation import AnnotationOutputs, get_delimiters +from bystro.search.utils.annotation import ( + AnnotationOutputs, + DelimitersConfig, +) from bystro.search.utils.messages import SaveJobData from bystro.search.utils.opensearch import gather_opensearch_args +from bystro.utils.compress import GZIP_EXECUTABLE +from bystro.utils.tar import GNU_TAR_EXECUTABLE_NAME + +logger = logging.getLogger(__name__) ray.init(ignore_reinit_error=True, address="auto") @@ -57,8 +66,8 @@ def _get_header(field_names): return parents, children -def _populate_data(field_path, data_for_end_of_path): - if not isinstance(field_path, list) or data_for_end_of_path is None: +def _populate_data(field_path: list[str] | str, data_for_end_of_path: Any): + if not isinstance(data_for_end_of_path, dict): return data_for_end_of_path for child_field in field_path: @@ -70,8 +79,8 @@ def _populate_data(field_path, data_for_end_of_path): return data_for_end_of_path -def _make_output_string(rows: list, delims: dict): - empty_field_char = delims["empty_field"] +def _make_output_string(rows: list, delims: DelimitersConfig): + empty_field_char = delims.empty_field for row_idx, row in enumerate(rows): # pylint:disable=too-many-nested-blocks # Some fields may just be missing; we won't store even the alt/pos [[]] structure for those for i, column in enumerate(row): @@ -99,18 +108,23 @@ def _make_output_string(rows: list, delims: dict): if isinstance(sub, list): inner_values.append( - delims["overlap"].join( - map(lambda x: str(x) if x is not None else empty_field_char, sub) + delims.overlap.join( + map( + lambda x: str(x) + if x is not None + else empty_field_char, + sub, + ) ) ) else: inner_values.append(str(sub)) - column[j] = delims["value"].join(inner_values) + column[j] = delims.value.join(inner_values) - row[i] = delims["position"].join(column) + row[i] = delims.position.join(column) - rows[row_idx] = delims["field"].join(row) + rows[row_idx] = delims.field.join(row) return bytes("\n".join(rows) + "\n", encoding="utf-8") @@ -143,7 +157,9 @@ def _process_query( for doc in resp["hits"]["hits"]: row = np.empty(len(field_names), dtype=object) for y in range(len(field_names)): - row[y] = _populate_data(child_fields[y], doc["_source"].get(parent_fields[y])) + row[y] = _populate_data( + child_fields[y], doc["_source"].get(parent_fields[y]) + ) if row[discordant_idx][0][0] is False: row[discordant_idx][0][0] = 0 @@ -191,7 +207,7 @@ def go( # 
pylint:disable=invalid-name output_dir = os.path.dirname(job_data.outputBasePath) basename = os.path.basename(job_data.outputBasePath) pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) - outputs = AnnotationOutputs.from_path(output_dir, basename, True) + outputs, stats = AnnotationOutputs.from_path(output_dir, basename, compress=True) written_chunks = [os.path.join(output_dir, f"{job_data.indexName}_header")] @@ -203,7 +219,9 @@ def go( # pylint:disable=invalid-name client = OpenSearch(**search_client_args) query = _clean_query(job_data.queryBody) - num_slices = _get_num_slices(client, job_data.indexName, max_query_size, max_slices, query) + num_slices = _get_num_slices( + client, job_data.indexName, max_query_size, max_slices, query + ) pit_id = client.create_point_in_time(index=job_data.indexName, params={"keep_alive": keep_alive})["pit_id"] # type: ignore # noqa: E501 try: reporter = get_progress_reporter(publisher) @@ -212,7 +230,9 @@ def go( # pylint:disable=invalid-name reqs = [] for slice_id in range(num_slices): - written_chunks.append(os.path.join(output_dir, f"{job_data.indexName}_{slice_id}")) + written_chunks.append( + os.path.join(output_dir, f"{job_data.indexName}_{slice_id}") + ) body = query.copy() if num_slices > 1: # Slice queries require max > 1 @@ -223,7 +243,7 @@ def go( # pylint:disable=invalid-name job_data.fieldNames, written_chunks[-1], reporter, - get_delimiters(), + DelimitersConfig(), ) reqs.append(res) results_processed = ray.get(reqs) @@ -234,14 +254,24 @@ def go( # pylint:disable=invalid-name all_chunks = " ".join(written_chunks) annotation_path = os.path.join(output_dir, outputs.annotation) - ret = subprocess.call(f"cat {all_chunks} > {annotation_path}; rm {all_chunks}", shell=True) + ret = subprocess.call( + f"cat {all_chunks} > {annotation_path}; rm {all_chunks}", shell=True + ) if ret != 0: raise IOError(f"Failed to write {annotation_path}") - tarball_name = os.path.basename(outputs.archived) + ret = subprocess.call( + f"{GZIP_EXECUTABLE} -d -c {annotation_path} | {stats.stdin_cli_stats_command}", + shell=True, + ) + if ret != 0: + raise IOError(f"Failed to write statistics for {annotation_path}") + tarball_name = os.path.basename(outputs.archived) + # The web server requires the statistics files at the top level of the output directory, + # but the annotation is too large to justify storing it both inside and outside the archive ret = subprocess.call( - f'cd {output_dir}; tar --exclude ".*" --exclude={tarball_name} -cf {tarball_name} * --remove-files', # noqa: E501 + f'cd {output_dir}; {GNU_TAR_EXECUTABLE_NAME} --exclude ".*" --exclude={tarball_name} -cf {tarball_name} * && rm {annotation_path}', # noqa: E501 shell=True, ) if ret != 0: diff --git a/python/python/bystro/search/save/tests/test_make_output_string.py b/python/python/bystro/search/save/tests/test_make_output_string.py index 64d676185..8f0dc65b5 100644 --- a/python/python/bystro/search/save/tests/test_make_output_string.py +++ b/python/python/bystro/search/save/tests/test_make_output_string.py @@ -1,12 +1,68 @@ -from bystro.search.save.handler import _make_output_string +from bystro.search.save.handler import _make_output_string, _populate_data +from bystro.search.utils.annotation import DelimitersConfig + +delims = DelimitersConfig( + empty_field="!", + overlap="/", + value=";", + position="|", + field="\t", +) + +def test_populate_data(): + data = {"a": {"b": {"c": "value"}}} + + # Positive cases + assert _populate_data(["a", "b", "c"], data) == "value" + assert _populate_data(["a"], data) == {"b": {"c": "value"}} + + # 
Negative cases + assert _populate_data(["a", "b", "d"], data) is None + assert _populate_data("a.b.c", data) is None + assert _populate_data(["a", "b", "c"], None) is None + assert _populate_data(["a.exact"], data) is None + assert _populate_data(["a", "b.exact"], data) is None + + +def test_populate_data_real_example_nested(): + _real_example_nested = { + "nearest": { + "refSeq": { + "name2": [[["GCFC2"]]], + "name": [[["NM_001201334"], ["NM_003203"]]], + "dist": [[["0"]]], + } + } + } + + assert _populate_data(["nearest", "refSeq", "name2"], _real_example_nested) == [ + [["GCFC2"]] + ] + assert _populate_data(["nearest", "refSeq", "name"], _real_example_nested) == [ + [["NM_001201334"], ["NM_003203"]] + ] + assert _populate_data(["nearest", "refSeq", "dist"], _real_example_nested) == [ + [["0"]] + ] + + +def test_populate_data_real_example_scalar(): + _real_example_scalar = { + "chrom": [[["chr2"]]], + "pos": [[["75928300"]]], + "type": [[["SNP"]]] + } + + assert _populate_data(["chrom"], _real_example_scalar) == [[["chr2"]]] + assert _populate_data(["pos"], _real_example_scalar) == [[["75928300"]]] + assert _populate_data(["type"], _real_example_scalar) == [[["SNP"]]] + + # _populate_data considers any non-dict value a leaf value and will return it as is + # and the provided "path" is a label of the leaf's data, a no-op + assert _populate_data("chrom", [[["chr2"]]]) == [[["chr2"]]] + assert _populate_data("pos", [[["75928300"]]]) == [[["75928300"]]] + assert _populate_data("type", [[["SNP"]]]) == [[["SNP"]]] -delims = { - "empty_field": "!", - "overlap": "/", - "value": ";", - "position": "|", - "field": "\t", -} def test_basic_functionality(): rows = [ @@ -15,9 +71,9 @@ def test_basic_functionality(): # column1 [ # position values [ # value values - ["gene1_mrna1", None, "gene1_mrna2"], # overlap values + ["gene1_mrna1", None, "gene1_mrna2"], # overlap values # gene1 has 3 transcripts, 1 of which is non-coding and doens't have an mrna record - ["gene1"] + ["gene1"], ], # 2 value delimited values at the next position in the indel ["position2a", "position2b"], @@ -32,22 +88,19 @@ def test_basic_functionality(): # row2 [ # column1 - [ + [ # Retain backwards compat with scalar values in 2nd dimension - [ - "row2_scalar" - ] + ["row2_scalar"] ] - ] + ], ] - expected = ( - b"gene1_mrna1/!/gene1_mrna2;gene1|position2a;position2b\tcol2_scalar\nrow2_scalar\n" - ) + expected = b"gene1_mrna1/!/gene1_mrna2;gene1|position2a;position2b\tcol2_scalar\nrow2_scalar\n" assert _make_output_string(rows, delims) == expected + def test_empty_list(): rows = [] expected = b"\n" - assert _make_output_string(rows, delims) == expected \ No newline at end of file + assert _make_output_string(rows, delims) == expected diff --git a/python/python/bystro/search/utils/annotation.py b/python/python/bystro/search/utils/annotation.py index 7973617f6..ad9dc90e1 100644 --- a/python/python/bystro/search/utils/annotation.py +++ b/python/python/bystro/search/utils/annotation.py @@ -1,56 +1,107 @@ import os from glob import glob +import logging from os import path +import shutil +from typing import Optional, Any from msgspec import Struct +logger = logging.getLogger(__name__) -class StatisticsOutputs(Struct, frozen=True): + +class FileProcessorsConfig(Struct, frozen=True, forbid_unknown_fields=True): + args: str + program: str + + +class StatisticsOutputExtensions(Struct, frozen=True, forbid_unknown_fields=True): + json: str = "statistics.json" + tsv: str = "statistics.tsv" + qc: str = "statistics.qc.tsv" + + +class 
StatisticsConfig(Struct, frozen=True, forbid_unknown_fields=True): + dbSNPnameField: str = "dbSNP.name" + siteTypeField: str = "refSeq.siteType" + exonicAlleleFunctionField: str = "refSeq.exonicAlleleFunction" + refField: str = "ref" + homozygotesField: str = "homozygotes" + heterozygotesField: str = "heterozygotes" + altField: str = "alt" + programPath: str = "bystro-stats" + outputExtensions: StatisticsOutputExtensions = StatisticsOutputExtensions() + + @staticmethod + def from_dict(annotation_config: dict[str, Any]): + """Get statistics config from a dictionary""" + stats_config: Optional[dict[str, Any]] = annotation_config.get("statistics") + + if stats_config is None: + logger.warning( + "No 'statistics' config found in supplied annotation config, using defaults" + ) + return StatisticsConfig() + + if "outputExtensions" in stats_config: + stats_config["outputExtensions"] = StatisticsOutputExtensions( + **stats_config["outputExtensions"] + ) + + return StatisticsConfig(**stats_config) + + +class StatisticsOutputs(Struct, frozen=True, forbid_unknown_fields=True): """ - Paths to all possible Bystro statistics outputs - - Attributes: - json: str - Basename of the JSON statistics file - tab: str - Basename of the TSV statistics file - qc: str - Basename of the QC statistics file + Paths to all possible Bystro statistics outputs + + Attributes: + json: str + Basename of the JSON statistics file + tab: str + Basename of the TSV statistics file + qc: str + Basename of the QC statistics file """ + json: str tab: str qc: str -class AnnotationOutputs(Struct, frozen=True): +class AnnotationOutputs(Struct, frozen=True, forbid_unknown_fields=True): """ - Paths to all possible Bystro annotation outputs - - Attributes: - output_dir: str - Output directory - archived: str - Basename of the archive - annotation: str - Basename of the annotation TSV file, inside the archive - sampleList: Optional[str] - Basename of the sample list file, inside the archive - log: Basename of the log file, inside the archive - statistics: Optional[StatisticsOutputs] - Basenames of the statistics files, inside the archive + Paths to all possible Bystro annotation outputs + + Attributes: + output_dir: str + Output directory + archived: str + Basename of the archive + annotation: str + Basename of the annotation TSV file, found inside the archive only + sampleList: Optional[str] + Basename of the sample list file, in the archive and output directory + log: str + Basename of the log file, in the archive and output directory + statistics: StatisticsOutputs + Basenames of the statistics files, in the archive and output directory + header: Optional[str] + Basename of the header file, in the archive and output directory """ + archived: str annotation: str sampleList: str log: str - statistics: StatisticsOutputs | None = None + statistics: StatisticsOutputs + header: str | None = None @staticmethod def from_path( output_dir: str, basename: str, compress: bool, - generate_statistics: bool = True, make_dir: bool = True, make_dir_mode: int = 511, ): @@ -71,46 +122,111 @@ def from_path( archived = f"{basename}.tar" - statistics = None - if generate_statistics: - statistics = StatisticsOutputs( - json=f"{basename}.statistics.json", - tab=f"{basename}.statistics.tsv", - qc=f"{basename}.statistics.qc.tsv", - ) + stats = Statistics(output_base_path=os.path.join(output_dir, basename)) + statistics_tarball_members = StatisticsOutputs( + json=f"{os.path.basename(stats.json_output_path)}", + tab=f"{os.path.basename(stats.tsv_output_path)}", + 
qc=f"{os.path.basename(stats.qc_output_path)}", + ) - return AnnotationOutputs( - annotation=annotation, - sampleList=sampleList, - statistics=statistics, - archived=archived, - log=log, + return ( + AnnotationOutputs( + annotation=annotation, + sampleList=sampleList, + statistics=statistics_tarball_members, + archived=archived, + log=log, + ), + stats, ) -_default_delimiters = { - "field": "\t", - "position": "|", - "overlap": chr(31), - "value": ";", - "empty_field": "!", -} +class DelimitersConfig(Struct, frozen=True, forbid_unknown_fields=True): + field: str = "\t" + position: str = "|" + overlap: str = chr(31) + value: str = ";" + empty_field: str = "!" + @staticmethod + def from_dict(annotation_config: dict[str, Any]): + """Get delimiters from a dictionary""" + delim_config: Optional[dict[str, str]] = annotation_config.get("delimiters") -def get_delimiters(annotation_conf: dict | None = None): - if annotation_conf: - return annotation_conf.get("delimiters", _default_delimiters) - return _default_delimiters + if delim_config is None: + logger.warning( + "No 'delimiters' key found in supplied annotation config, using defaults" + ) + return DelimitersConfig() + + return DelimitersConfig(**delim_config) -def get_config_file_path(config_path_base_dir: str, assembly: str, suffix: str = ".y*ml"): +def get_config_file_path( + config_path_base_dir: str, assembly: str, suffix: str = ".y*ml" +): """Get config file path""" paths = glob(path.join(config_path_base_dir, assembly + suffix)) if not paths: - raise ValueError(f"\n\nNo config path found for the assembly {assembly}. Exiting\n\n") + raise ValueError( + f"\n\nNo config path found for the assembly {assembly}. Exiting\n\n" + ) if len(paths) > 1: print("\n\nMore than 1 config path found, choosing first") return paths[0] + + +class Statistics: + def __init__( + self, output_base_path: str, annotation_config: dict[str, Any] | None = None + ): + if annotation_config is None: + self._config = StatisticsConfig() + self._delimiters = DelimitersConfig() + else: + self._config = StatisticsConfig.from_dict(annotation_config) + self._delimiters = DelimitersConfig.from_dict(annotation_config) + + program_path = shutil.which(self._config.programPath) + if not program_path: + raise ValueError( + f"Couldn't find statistics program {self._config.programPath}" + ) + + self.program_path = program_path + self.json_output_path = ( + f"{output_base_path}.{self._config.outputExtensions.json}" + ) + self.tsv_output_path = f"{output_base_path}.{self._config.outputExtensions.tsv}" + self.qc_output_path = f"{output_base_path}.{self._config.outputExtensions.qc}" + + @property + def stdin_cli_stats_command(self) -> str: + value_delim = self._delimiters.value + field_delim = self._delimiters.field + empty_field = self._delimiters.empty_field + + het_field = self._config.heterozygotesField + hom_field = self._config.homozygotesField + site_type_field = self._config.siteTypeField + ea_fun_field = self._config.exonicAlleleFunctionField + ref_field = self._config.refField + alt_field = self._config.altField + dbSNP_field = self._config.dbSNPnameField + + statsProg = self.program_path + + dbSNPpart = f"-dbSnpNameColumn {dbSNP_field}" if dbSNP_field else "" + + return ( + f"{statsProg} -outJsonPath {self.json_output_path} -outTabPath {self.tsv_output_path} " + f"-outQcTabPath {self.qc_output_path} -refColumn {ref_field} " + f"-altColumn {alt_field} -homozygotesColumn {hom_field} " + f"-heterozygotesColumn {het_field} -siteTypeColumn {site_type_field} " + f"{dbSNPpart} 
-emptyField '{empty_field}' " + f"-exonicAlleleFunctionColumn {ea_fun_field} " + f"-primaryDelimiter '{value_delim}' -fieldSeparator '{field_delim}'" + ) diff --git a/python/python/bystro/search/utils/tests/test_annotation.py b/python/python/bystro/search/utils/tests/test_annotation.py new file mode 100644 index 000000000..524a40999 --- /dev/null +++ b/python/python/bystro/search/utils/tests/test_annotation.py @@ -0,0 +1,218 @@ +import pytest + +from bystro.search.utils.annotation import ( + DelimitersConfig, + get_config_file_path, + StatisticsConfig, + StatisticsOutputExtensions, + Statistics, +) + + +def assert_defaults(config: DelimitersConfig): + assert config.field == "\t" + assert config.position == "|" + assert config.overlap == chr(31) + assert config.value == ";" + assert config.empty_field == "!" + + +def test_delimiters_default_values(): + config = DelimitersConfig() + assert_defaults(config) + + +def test_delimiters_from_dict_no_arg(): + with pytest.raises( + TypeError, match=r"missing 1 required positional argument: 'annotation_config'" + ): + # Ignoring type checking because we're testing the error + DelimitersConfig.from_dict() # type: ignore + + +def test_delimiters_from_dict_no_delimiters_key(): + config_dict = {"random_key": "random_value"} + config = DelimitersConfig.from_dict(config_dict) + assert_defaults(config) + + +def test_delimiters_from_dict_unexpected_key(): + annotation_config = {"delimiters": {"random_delim_key": "random_value"}} + with pytest.raises( + TypeError, match=r"Unexpected keyword argument 'random_delim_key'" + ): + DelimitersConfig.from_dict(annotation_config) + + +def test_delimiters_from_dict_with_delimiters_key(): + config_dict = { + "delimiters": { + "field": "x", + "position": "y", + "overlap": "z", + "value": "w", + "empty_field": "v", + } + } + config = DelimitersConfig.from_dict(config_dict) + assert config.field == "x" + assert config.position == "y" + assert config.overlap == "z" + assert config.value == "w" + assert config.empty_field == "v" + + +def test_delimiters_from_dict_partial_delimiters_key(): + config_dict = {"delimiters": {"field": "x", "position": "y"}} + config = DelimitersConfig.from_dict(config_dict) + assert config.field == "x" + assert config.position == "y" + assert config.overlap == chr(31) # default value + assert config.value == ";" # default value + assert config.empty_field == "!" 
# default value + + +def test_delimiters_unexpected_key(): + with pytest.raises( + TypeError, match=r"Unexpected keyword argument 'random_delim_key2'" + ): + # Ignoring type checking because we're testing the error + DelimitersConfig(random_delim_key2= "random_value") # type: ignore + + +def test_get_config_file_path_no_path_found(mocker): + mocker.patch("bystro.search.utils.annotation.glob", return_value=[]) + + with pytest.raises(ValueError, match=r"No config path found for the assembly"): + get_config_file_path("/dummy/path", "dummy_assembly") + + +def test_get_config_file_path_single_path_found(mocker): + mocked_path = "/dummy/path/dummy_assembly.yml" + mocker.patch("bystro.search.utils.annotation.glob", return_value=[mocked_path]) + result = get_config_file_path("/dummy/path", "dummy_assembly") + assert result == mocked_path + + +def test_get_config_file_path_single_path_found_with_extension(mocker): + mocked_path = "/dummy/path/dummy_assembly.blargh" + mocker.patch("bystro.search.utils.annotation.glob", return_value=[mocked_path]) + result = get_config_file_path("/dummy/path", "dummy_assembly", suffix=".blargh") + assert result == mocked_path + + +def test_get_config_file_path_single_path_found_with_extension_wildcard(mocker): + mocked_path = "/dummy/path/dummy_assembly.blargh" + mocker.patch("bystro.search.utils.annotation.glob", return_value=[mocked_path]) + result = get_config_file_path("/dummy/path", "dummy_assembly", suffix=".bl*rgh") + assert result == mocked_path + + +def test_get_config_file_path_multiple_paths_found(mocker, capsys): + mocked_paths = [ + "/dummy/path/dummy_assembly_1.yml", + "/dummy/path/dummy_assembly_2.yml", + ] + mocker.patch("bystro.search.utils.annotation.glob", return_value=mocked_paths) + result = get_config_file_path("/dummy/path", "dummy_assembly") + captured = capsys.readouterr() # Capture the standard output + assert "More than 1 config path found, choosing first" in captured.out + assert result == mocked_paths[0] + + +def test_statistics_output_extensions_defaults(): + extensions = StatisticsOutputExtensions() + assert extensions.json == "statistics.json" + assert extensions.tsv == "statistics.tsv" + assert extensions.qc == "statistics.qc.tsv" + + +def test_statistics_config_defaults(): + config = StatisticsConfig() + assert config.dbSNPnameField == "dbSNP.name" + assert config.siteTypeField == "refSeq.siteType" + assert config.exonicAlleleFunctionField == "refSeq.exonicAlleleFunction" + assert config.refField == "ref" + assert config.homozygotesField == "homozygotes" + assert config.heterozygotesField == "heterozygotes" + assert config.altField == "alt" + assert config.programPath == "bystro-stats" + assert isinstance(config.outputExtensions, StatisticsOutputExtensions) + + +def test_statistics_config_from_dict_none(): + with pytest.raises( + TypeError, match=r"missing 1 required positional argument: 'annotation_config'" + ): + # Ignoring type checking because we're testing the error + StatisticsConfig.from_dict() # type: ignore + + +def test_statistics_config_from_dict_no_statistics_key(): + annotation_config = {"random_key": "random_value"} + config = StatisticsConfig.from_dict(annotation_config) + + assert ( + config.dbSNPnameField == "dbSNP.name" + ) # Again, just checking one representative default value + + +def test_statistics_config_no_statistics_key(): + with pytest.raises( + TypeError, match=r"Unexpected keyword argument 'random_stats_key2'" + ): + StatisticsConfig(random_stats_key2="random_value") # type: ignore + + +def 
test_statistics_config_from_dict_unexpected_key(): + annotation_config = {"statistics": {"random_key": "random_value"}} + with pytest.raises(TypeError, match=r"Unexpected keyword argument 'random_key'"): + StatisticsConfig.from_dict(annotation_config) + + +def test_statistics_config_from_dict_with_statistics_key(): + annotation_config = { + "statistics": { + "dbSNPnameField": "new.dbSNP.name", + "siteTypeField": "new.refSeq.siteType", + "outputExtensions": { + "json": ".new.json", + "tsv": ".new.tsv", + "qc": ".new.qc.tsv", + }, + } + } + config = StatisticsConfig.from_dict(annotation_config) + assert config.dbSNPnameField == "new.dbSNP.name" + assert config.siteTypeField == "new.refSeq.siteType" + assert config.outputExtensions.json == ".new.json" + assert config.outputExtensions.tsv == ".new.tsv" + assert config.outputExtensions.qc == ".new.qc.tsv" + + +def test_statistics_init_program_not_found(mocker): + mocker.patch("shutil.which", return_value=None) + with pytest.raises(ValueError, match=r"Couldn't find statistics program"): + Statistics("/dummy/path", None) + + +def test_statistics_get_stats_arguments(mocker): + # Mocking which to return a valid program path + mocker.patch("shutil.which", return_value="/path/to/bystro-stats") + + statistics = Statistics("/dummy/output_base_path", None) + + # Note: Your actual values might differ based on the defaults + # in StatisticsConfig and DelimitersConfig + expected_args = ( + "/path/to/bystro-stats -outJsonPath /dummy/output_base_path.statistics.json " + "-outTabPath /dummy/output_base_path.statistics.tsv " + "-outQcTabPath /dummy/output_base_path.statistics.qc.tsv " + "-refColumn ref -altColumn alt -homozygotesColumn homozygotes " + "-heterozygotesColumn heterozygotes -siteTypeColumn refSeq.siteType " + "-dbSnpNameColumn dbSNP.name -emptyField '!' 
" + "-exonicAlleleFunctionColumn refSeq.exonicAlleleFunction " + "-primaryDelimiter ';' -fieldSeparator '\t'" + ) + + assert statistics.stdin_cli_stats_command == expected_args diff --git a/python/python/bystro/utils/compress.py b/python/python/bystro/utils/compress.py new file mode 100644 index 000000000..08921f7d4 --- /dev/null +++ b/python/python/bystro/utils/compress.py @@ -0,0 +1,19 @@ +"""Utility functions for safely invoking tar in cross-OS manner.""" +import shutil + + +def _get_gzip_program_path() -> str: + pigz = shutil.which("pigz") + + if pigz: + return pigz + + gzip = shutil.which("gzip") + + if gzip: + return gzip + + raise OSError("Neither gzip nor pigz not found on system") + + +GZIP_EXECUTABLE = _get_gzip_program_path() diff --git a/python/python/bystro/utils/tests/test_compress.py b/python/python/bystro/utils/tests/test_compress.py new file mode 100644 index 000000000..331db1a24 --- /dev/null +++ b/python/python/bystro/utils/tests/test_compress.py @@ -0,0 +1,23 @@ +import pytest +import shutil +from bystro.utils.compress import _get_gzip_program_path + + +def test_pigz_found(monkeypatch): + monkeypatch.setattr( + shutil, "which", lambda x: "/path/to/pigz" if x == "pigz" else None + ) + assert _get_gzip_program_path() == "/path/to/pigz" + + +def test_pigz_not_found_gzip_found(monkeypatch): + monkeypatch.setattr( + shutil, "which", lambda x: "/path/to/gzip" if x == "gzip" else None + ) + assert _get_gzip_program_path() == "/path/to/gzip" + + +def test_neither_found(monkeypatch): + monkeypatch.setattr(shutil, "which", lambda _: None) + with pytest.raises(OSError, match="Neither gzip nor pigz not found on system"): + _get_gzip_program_path() diff --git a/startup.yml b/startup.yml index ec4be407d..43777a2dd 100644 --- a/startup.yml +++ b/startup.yml @@ -1,6 +1,6 @@ apps: - name: BystroAnnotationServer - script: bin/bystro-server.pl + script: perl/bin/bystro-server.pl interpreter: 'perl' args: -q config/beanstalk.yml --debug - name: BystroSaveServer