Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: overloaded 'SeqDict.from_sam()' method to support SAM files (#176) #183

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 25 additions & 9 deletions fgpyo/fasta/sequence_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@
from dataclasses import field
from dataclasses import replace
from enum import unique
from pathlib import Path
from typing import Any
from typing import Dict
from typing import Iterator
Expand All @@ -137,6 +138,8 @@
from typing import Union
from typing import overload

from fgpyo import sam

if sys.version_info[0] == 3 and sys.version_info[1] < 11:
from strenum import StrEnum
else:
Expand Down Expand Up @@ -214,7 +217,7 @@ def parse(value: str) -> "AlternateLocus":
class SequenceMetadata(MutableMapping[Union[Keys, str], str]):
"""Stores information about a single Sequence (ex. chromosome, contig).

Implements the mutable mapping interface, which provide access to the attributes of this
Implements the mutable mapping interface, which provides access to the attributes of this
sequence, including name, length, but not index. When using the mapping interface, for example
getting, setting, deleting, as well as iterating over keys, values, and items, the _values_ will
always be strings (`str` type). For example, the length will be an `str` when accessing via
Expand Down Expand Up @@ -446,28 +449,41 @@ def to_sam_header(

@staticmethod
@overload
def from_sam(data: Path) -> "SequenceDictionary": ...  # pragma: no cover

@staticmethod
@overload
def from_sam(data: pysam.AlignmentFile) -> "SequenceDictionary": ...  # pragma: no cover

@staticmethod
@overload
def from_sam(data: pysam.AlignmentHeader) -> "SequenceDictionary": ...  # pragma: no cover

@staticmethod
@overload
def from_sam(data: List[Dict[str, Any]]) -> "SequenceDictionary": ...  # pragma: no cover

@staticmethod
def from_sam(
    data: Union[Path, pysam.AlignmentFile, pysam.AlignmentHeader, List[Dict[str, Any]]],
) -> "SequenceDictionary":
    """Creates a `SequenceDictionary` from SAM header information.

    Args:
        data: the source of the sequence records, one of:

            - a `Path` to a SAM file, which is opened with `sam.reader` only for as long as
              it takes to read its header;
            - a `pysam.AlignmentFile`, whose `header` is used;
            - a `pysam.AlignmentHeader`;
            - the list of sequences returned by `pysam.AlignmentHeader.to_dict()["SQ"]`.

    Returns:
        A `SequenceDictionary` holding one `SequenceMetadata` per sequence record, with each
        record's index set to its position in the input.
    """
    if isinstance(data, Path):
        # The context manager closes the file as soon as the header has been converted.
        with sam.reader(data, file_type=sam.SamFileType.SAM) as fh:
            return SequenceDictionary.from_sam(fh.header)
    if isinstance(data, pysam.AlignmentFile):
        return SequenceDictionary.from_sam(data.header)
    if isinstance(data, pysam.AlignmentHeader):
        # NOTE(review): pysam appears to omit the "SQ" key when the header has no @SQ records;
        # fall back to an empty list rather than raising a bare `KeyError` in that case.
        return SequenceDictionary.from_sam(data.to_dict().get("SQ", []))

    # Anything else is treated as the list of per-sequence dictionaries.
    infos: List[SequenceMetadata] = [
        SequenceMetadata.from_sam(meta=meta, index=index) for index, meta in enumerate(data)
    ]
    return SequenceDictionary(infos=infos)

# TODO: mypyp doesn't like these
# TODO: mypy doesn't like these
# @overload
# def __getitem__(self, key: str) -> SequenceMetadata: ...
#
Expand Down
4 changes: 4 additions & 0 deletions tests/fgpyo/fasta/data/sequence.dict
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@HD VN:1.5
@SQ SN:chr1 LN:10
@SQ SN:chr2 LN:20 AN:chr3
@RG ID:foo
12 changes: 7 additions & 5 deletions tests/fgpyo/fasta/test_sequence_dictionary.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
from typing import Any
from typing import Dict
from typing import List
Expand All @@ -11,6 +12,8 @@
from fgpyo.fasta.sequence_dictionary import SequenceDictionary
from fgpyo.fasta.sequence_dictionary import SequenceMetadata
from fgpyo.fasta.sequence_dictionary import Topology
from fgpyo.sam import SamFileType
from fgpyo.sam import reader


def test_alternate_locus_raises_start_gt_end() -> None:
Expand Down Expand Up @@ -315,10 +318,6 @@ def test_sequence_dictionary_same_as() -> None:
assert not this.same_as(that)


# to_sam
# from_sam


def test_sequence_dictionary_to_and_from_sam() -> None:
sd = SequenceDictionary(
infos=[
Expand All @@ -333,7 +332,10 @@ def test_sequence_dictionary_to_and_from_sam() -> None:
header = pysam.AlignmentHeader.from_dict(
header_dict={"HD": {"VN": "1.5"}, "SQ": mapping, "RG": [{"ID": "foo"}]}
)

samfile = Path(__file__).parent / "data" / "sequence.dict"
alignment: pysam.AlignmentFile = reader(samfile, file_type=SamFileType.SAM)
assert SequenceDictionary.from_sam(samfile) == sd
assert SequenceDictionary.from_sam(alignment) == sd
assert SequenceDictionary.from_sam(mapping) == sd
assert SequenceDictionary.from_sam(header) == sd
assert sd.to_sam_header(extra_header={"RG": [{"ID": "foo"}]})
Expand Down
Loading