diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py index 4738454..f8d0efd 100644 --- a/fgpyo/sam/__init__.py +++ b/fgpyo/sam/__init__.py @@ -158,6 +158,7 @@ import enum import io import sys +from dataclasses import dataclass from pathlib import Path from typing import IO from typing import Any @@ -360,6 +361,46 @@ class _CigarOpUtil: } +@dataclass(frozen=True) +class RangeOfBases: + """A simple data class for holding offsets into a range of bases in a read. + + Attributes: + start (int): The starting offset (0-based) into the range. + stop (int): The ending (excluded) offset into the range. + + Properties: + slice: A slice of the range. can be used rto extract the aligned bases from a string. + range: The range of bases represented by this object. Can be used to obtain the indexes + into the aligned bases in the read. + __len__: The length of the range. + __iter__: Enables unpacking of start and stop into a tuple by return the iterator of + (start, stop) + + """ + + start: int + stop: int + + @property + def slice(self) -> slice: + """A slice of the range""" + return slice(self.start, self.stop) + + @property + def range(self) -> range: + """The range of bases represented by this object""" + return range(self.start, self.stop) + + def __len__(self) -> int: + """The length of the range""" + return self.stop - self.start + + def __iter__(self): + """enables unpacking of start and stop into a tuple""" + return (self.start, self.stop).__iter__() + + @enum.unique class CigarOp(enum.Enum): """Enumeration of operators that can appear in a Cigar string. @@ -547,7 +588,7 @@ def length_on_target(self) -> int: """Returns the length of the alignment on the target sequence.""" return sum([elem.length_on_target for elem in self.elements]) - def query_alignment_offsets(self, reverse: bool = False) -> Optional[range]: + def query_alignment_offsets(self, reverse: bool = False) -> Optional[RangeOfBases]: """Gets the 0-based, end-exclusive positions of the first and last aligned base in the query. The resulting range will contain the range of positions in the SEQ string for the bases that are aligned. If no bases are aligned, the return value will be None. @@ -556,13 +597,12 @@ def query_alignment_offsets(self, reverse: bool = False) -> Optional[range]: reverse: If True, count from the end of the query. i.e. find the offsets using the reversed elements of the cigar. - Returns: - A range, defining the start and stop offsets of the aligned part - of the query. These offsets are 0-based and open-ended, with respect to the - beginning of the query. (If 'reverse' is True, the offsets are with - respect to the reversed query.) - If no bases are aligned, the return value will be None. + A RangeOfBases object containing the start and stop positions (0-based, end-exclusive) + of the aligned part of the query. These offsets are 0-based and open-ended, with + respect to the beginning of the query. (If 'reverse' is True, the offsets are with + respect to the reversed query.) If no bases are aligned, the return value will be + None. """ start_offset: int = 0 end_offset: int = 0 @@ -583,7 +623,7 @@ def query_alignment_offsets(self, reverse: bool = False) -> Optional[range]: # We have exited the alignment and are in the clipping operators after the alignment break - ret = range(start_offset, end_offset) + ret = RangeOfBases(start_offset, end_offset) if not alignment_began or len(ret) == 0: return None return ret diff --git a/tests/fgpyo/sam/test_cigar.py b/tests/fgpyo/sam/test_cigar.py index 9c08bbe..d744ca6 100644 --- a/tests/fgpyo/sam/test_cigar.py +++ b/tests/fgpyo/sam/test_cigar.py @@ -1,8 +1,10 @@ from typing import Optional +from typing import Tuple import pytest from fgpyo.sam import Cigar +from fgpyo.sam import RangeOfBases cigar = Cigar.from_cigarstring("1M4D45N37X23I11=") @@ -38,50 +40,54 @@ def test_bad_index_raises_type_error(index: int) -> None: @pytest.mark.parametrize( ("cigar_string", "maybe_range"), { - ("10M", range(0, 10)), - ("10M10I", range(0, 20)), - ("10X10I", range(0, 20)), - ("10X10D", range(0, 10)), - ("10=10D", range(0, 10)), - ("10S10M", range(10, 20)), - ("10H10M", range(0, 10)), - ("10H10S10M", range(10, 20)), - ("10H10S10M5S", range(10, 20)), - ("10H10S10M5S10H", range(10, 20)), + ("10M", RangeOfBases(0, 10)), + ("10M10I", RangeOfBases(0, 20)), + ("10X10I", RangeOfBases(0, 20)), + ("10X10D", RangeOfBases(0, 10)), + ("10=10D", RangeOfBases(0, 10)), + ("10S10M", RangeOfBases(10, 20)), + ("10H10M", RangeOfBases(0, 10)), + ("10H10S10M", RangeOfBases(10, 20)), + ("10H10S10M5S", RangeOfBases(10, 20)), + ("10H10S10M5S10H", RangeOfBases(10, 20)), ("10H", None), ("10S", None), ("10S10H", None), ("5H10S10H", None), ("76D", None), - ("76I", range(0, 76)), + ("76I", RangeOfBases(0, 76)), ("10P76S", None), ("50S1000N50S", None), }, ) -def test_get_alignments(cigar_string: str, maybe_range: Optional[range]) -> None: +def test_get_alignments(cigar_string: str, maybe_range: Optional[RangeOfBases]) -> None: cig = Cigar.from_cigarstring(cigar_string) + assert Cigar.query_alignment_offsets(cig, reverse=False) == maybe_range @pytest.mark.parametrize( ("cigar_string", "maybe_range"), { - ("10M", range(0, 10)), - ("10M10I", range(0, 20)), - ("10X10I", range(0, 20)), - ("10X10D", range(0, 10)), - ("10=10D", range(0, 10)), - ("10S10M", range(0, 10)), - ("10H10M", range(0, 10)), - ("10H10S10M", range(0, 10)), - ("10H10S10M5S", range(5, 15)), - ("10H10S10M5S10H", range(5, 15)), + ("10M", RangeOfBases(0, 10)), + ("10M10I", RangeOfBases(0, 20)), + ("10X10I", RangeOfBases(0, 20)), + ("10X10D", RangeOfBases(0, 10)), + ("10=10D", RangeOfBases(0, 10)), + ("10S10M", RangeOfBases(0, 10)), + ("10H10M", RangeOfBases(0, 10)), + ("10H10S10M", RangeOfBases(0, 10)), + ("10H10S10M5S", RangeOfBases(5, 15)), + ("10H10S10M5S10H", RangeOfBases(5, 15)), ("10H", None), ("10S", None), ("10S10H", None), ("5H10S10H", None), }, ) -def test_get_alignments_reversed(cigar_string: str, maybe_range: Optional[range]) -> None: +def test_get_alignments_reversed(cigar_string: str, maybe_range: Optional[Tuple[int, int]]) -> None: cig = Cigar.from_cigarstring(cigar_string) + assert Cigar.query_alignment_offsets(cig, reverse=True) == maybe_range + if maybe_range is not None: + start, stop = maybe_range