diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py index 75d8368..92b35bd 100644 --- a/fgpyo/sam/__init__.py +++ b/fgpyo/sam/__init__.py @@ -368,7 +368,7 @@ class CigarOp(enum.Enum): code (int): The `~pysam` cigar operator code. character (int): The single character cigar operator. consumes_query (bool): True if this operator consumes query bases, False otherwise. - consumes_reference (bool): True if this operator consumes reference bases, False otherwise. + consumes_target (bool): True if this operator consumes target bases, False otherwise. """ M = (0, "M", True, True) #: Match or Mismatch the reference @@ -547,27 +547,24 @@ def length_on_target(self) -> int: """Returns the length of the alignment on the target sequence.""" return sum([elem.length_on_target for elem in self.elements]) - def get_alignment_offsets(self, reverse: bool = False) -> Tuple[int, int]: + def get_query_alignment_offsets(self, reverse: bool = False) -> Optional[range]: + """ + Get the 0-based, end-exclusive positions of the first and last aligned base in the query. + The resulting range will contain the range of positions in the SEQ string for the bases t + hat are aligned. If no bases are aligned, the return value will be None. + + Args: + reverse: If True, count from the end of the query. i.e. find the offsets + using the reversed elements of the cigar. + + + Returns: + A range, defining the start and stop offsets of the aligned part + of the query. These offsets are 0-based and open-ended, with respect to the + beginning of the query. (If 'reverse' is True, the offsets are with + respect to the reversed query.) + If no bases are aligned, the return value will be None. """ - Get the starting and ending offsets for the alignment based on the CIGAR string. - - Args: - reverse: If True, count from the end of the read sequence. - Otherwise (default behavior), count from the beginning of the read sequence. - - Returns: - A tuple (start, end), defining the start and end offsets of the aligned part - of the read. These offsets are 0-based and open-ended, with respect to the - beginning of the read sequence. (If 'reverse' is True, the offsets are with - respect to the end of the read sequence.) - If the Cigar contains no alignment operators that consume sequence bases, or - only clipping operators, the start and end offsets will be the same value - (indicating an empty region). This shared value will be the offset to the first - base consumed by a non-clipping operator or the length of the read sequence if - there is no such base. - - # - """ # TODO: figure out how to remove the '#' from the documentation above without # breaking the build start_offset: int = 0 end_offset: int = 0 @@ -577,6 +574,7 @@ def get_alignment_offsets(self, reverse: bool = False) -> Tuple[int, int]: for element in elements: if element.operator.is_clipping and not alignment_began: # We are in the clipping operators preceding the alignment + # Note: hardclips have length-on-query=0 start_offset += element.length_on_query end_offset += element.length_on_query elif not element.operator.is_clipping: @@ -587,7 +585,10 @@ def get_alignment_offsets(self, reverse: bool = False) -> Tuple[int, int]: # We have exited the alignment and are in the clipping operators after the alignment break - return start_offset, end_offset + ret = range(start_offset, end_offset) + if not alignment_began or len(ret) == 0: + return None + return ret def __getitem__( self, index: Union[int, slice] diff --git a/tests/fgpyo/sam/test_cigar.py b/tests/fgpyo/sam/test_cigar.py index 4c8964e..2515b5c 100644 --- a/tests/fgpyo/sam/test_cigar.py +++ b/tests/fgpyo/sam/test_cigar.py @@ -1,3 +1,5 @@ +from typing import Optional + import pytest from fgpyo.sam import Cigar @@ -34,48 +36,52 @@ def test_bad_index_raises_type_error(index: int) -> None: @pytest.mark.parametrize( - ("cigar_string", "start", "end"), + ("cigar_string", "maybe_range"), { - ("10M", 0, 10), - ("10M10I", 0, 20), - ("10X10I", 0, 20), - ("10X10D", 0, 10), - ("10=10D", 0, 10), - ("10S10M", 10, 20), - ("10H10M", 0, 10), - ("10H10S10M", 10, 20), - ("10H10S10M5S", 10, 20), - ("10H10S10M5S10H", 10, 20), - ("10H", 0, 0), - ("10S", 10, 10), - ("10S10H", 10, 10), - ("5H10S10H", 10, 10), + ("10M", range(0, 10)), + ("10M10I", range(0, 20)), + ("10X10I", range(0, 20)), + ("10X10D", range(0, 10)), + ("10=10D", range(0, 10)), + ("10S10M", range(10, 20)), + ("10H10M", range(0, 10)), + ("10H10S10M", range(10, 20)), + ("10H10S10M5S", range(10, 20)), + ("10H10S10M5S10H", range(10, 20)), + ("10H", None), + ("10S", None), + ("10S10H", None), + ("5H10S10H", None), + ("76D", None), + ("76I", range(0, 76)), + ("10P76S", None), + ("50S1000N50S", None), }, ) -def test_get_alignments(cigar_string: str, start: int, end: int) -> None: +def test_get_alignments(cigar_string: str, maybe_range: Optional[range]) -> None: cig = Cigar.from_cigarstring(cigar_string) - assert Cigar.get_alignment_offsets(cig, reverse=False) == (start, end) + assert Cigar.get_query_alignment_offsets(cig, reverse=False) == maybe_range @pytest.mark.parametrize( - ("cigar_string", "start", "end"), + ("cigar_string", "maybe_range"), { - ("10M", 0, 10), - ("10M10I", 0, 20), - ("10X10I", 0, 20), - ("10X10D", 0, 10), - ("10=10D", 0, 10), - ("10S10M", 0, 10), - ("10H10M", 0, 10), - ("10H10S10M", 0, 10), - ("10H10S10M5S", 5, 15), - ("10H10S10M5S10H", 5, 15), - ("10H", 0, 0), - ("10S", 10, 10), - ("10S10H", 10, 10), - ("5H10S10H", 10, 10), + ("10M", range(0, 10)), + ("10M10I", range(0, 20)), + ("10X10I", range(0, 20)), + ("10X10D", range(0, 10)), + ("10=10D", range(0, 10)), + ("10S10M", range(0, 10)), + ("10H10M", range(0, 10)), + ("10H10S10M", range(0, 10)), + ("10H10S10M5S", range(5, 15)), + ("10H10S10M5S10H", range(5, 15)), + ("10H", None), + ("10S", None), + ("10S10H", None), + ("5H10S10H", None), }, ) -def test_get_alignments_reversed(cigar_string: str, start: int, end: int) -> None: +def test_get_alignments_reversed(cigar_string: str, maybe_range: Optional[range]) -> None: cig = Cigar.from_cigarstring(cigar_string) - assert Cigar.get_alignment_offsets(cig, reverse=True) == (start, end) + assert Cigar.get_query_alignment_offsets(cig, reverse=True) == maybe_range