Skip to content

Commit

Permalink
responding to review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
yfarjoun committed Sep 19, 2024
1 parent f371b6c commit e041787
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 56 deletions.
45 changes: 23 additions & 22 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ class CigarOp(enum.Enum):
code (int): The `~pysam` cigar operator code.
character (int): The single character cigar operator.
consumes_query (bool): True if this operator consumes query bases, False otherwise.
consumes_reference (bool): True if this operator consumes reference bases, False otherwise.
consumes_target (bool): True if this operator consumes target bases, False otherwise.
"""

M = (0, "M", True, True) #: Match or Mismatch the reference
Expand Down Expand Up @@ -547,27 +547,24 @@ def length_on_target(self) -> int:
"""Returns the length of the alignment on the target sequence."""
return sum([elem.length_on_target for elem in self.elements])

def get_alignment_offsets(self, reverse: bool = False) -> Tuple[int, int]:
def get_query_alignment_offsets(self, reverse: bool = False) -> Optional[range]:
"""
Get the 0-based, end-exclusive positions of the first and last aligned base in the query.
The resulting range will contain the range of positions in the SEQ string for the bases t
hat are aligned. If no bases are aligned, the return value will be None.
Args:
reverse: If True, count from the end of the query. i.e. find the offsets
using the reversed elements of the cigar.
Returns:
A range, defining the start and stop offsets of the aligned part
of the query. These offsets are 0-based and open-ended, with respect to the
beginning of the query. (If 'reverse' is True, the offsets are with
respect to the reversed query.)
If no bases are aligned, the return value will be None.
"""
Get the starting and ending offsets for the alignment based on the CIGAR string.
Args:
reverse: If True, count from the end of the read sequence.
Otherwise (default behavior), count from the beginning of the read sequence.
Returns:
A tuple (start, end), defining the start and end offsets of the aligned part
of the read. These offsets are 0-based and open-ended, with respect to the
beginning of the read sequence. (If 'reverse' is True, the offsets are with
respect to the end of the read sequence.)
If the Cigar contains no alignment operators that consume sequence bases, or
only clipping operators, the start and end offsets will be the same value
(indicating an empty region). This shared value will be the offset to the first
base consumed by a non-clipping operator or the length of the read sequence if
there is no such base.
#
""" # TODO: figure out how to remove the '#' from the documentation above without
# breaking the build
start_offset: int = 0
end_offset: int = 0
Expand All @@ -577,6 +574,7 @@ def get_alignment_offsets(self, reverse: bool = False) -> Tuple[int, int]:
for element in elements:
if element.operator.is_clipping and not alignment_began:
# We are in the clipping operators preceding the alignment
# Note: hardclips have length-on-query=0
start_offset += element.length_on_query
end_offset += element.length_on_query
elif not element.operator.is_clipping:
Expand All @@ -587,7 +585,10 @@ def get_alignment_offsets(self, reverse: bool = False) -> Tuple[int, int]:
# We have exited the alignment and are in the clipping operators after the alignment
break

return start_offset, end_offset
ret = range(start_offset, end_offset)
if not alignment_began or len(ret) == 0:
return None
return ret

def __getitem__(
self, index: Union[int, slice]
Expand Down
74 changes: 40 additions & 34 deletions tests/fgpyo/sam/test_cigar.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Optional

import pytest

from fgpyo.sam import Cigar
Expand Down Expand Up @@ -34,48 +36,52 @@ def test_bad_index_raises_type_error(index: int) -> None:


@pytest.mark.parametrize(
("cigar_string", "start", "end"),
("cigar_string", "maybe_range"),
{
("10M", 0, 10),
("10M10I", 0, 20),
("10X10I", 0, 20),
("10X10D", 0, 10),
("10=10D", 0, 10),
("10S10M", 10, 20),
("10H10M", 0, 10),
("10H10S10M", 10, 20),
("10H10S10M5S", 10, 20),
("10H10S10M5S10H", 10, 20),
("10H", 0, 0),
("10S", 10, 10),
("10S10H", 10, 10),
("5H10S10H", 10, 10),
("10M", range(0, 10)),
("10M10I", range(0, 20)),
("10X10I", range(0, 20)),
("10X10D", range(0, 10)),
("10=10D", range(0, 10)),
("10S10M", range(10, 20)),
("10H10M", range(0, 10)),
("10H10S10M", range(10, 20)),
("10H10S10M5S", range(10, 20)),
("10H10S10M5S10H", range(10, 20)),
("10H", None),
("10S", None),
("10S10H", None),
("5H10S10H", None),
("76D", None),
("76I", range(0, 76)),
("10P76S", None),
("50S1000N50S", None),
},
)
def test_get_alignments(cigar_string: str, start: int, end: int) -> None:
def test_get_alignments(cigar_string: str, maybe_range: Optional[range]) -> None:
cig = Cigar.from_cigarstring(cigar_string)
assert Cigar.get_alignment_offsets(cig, reverse=False) == (start, end)
assert Cigar.get_query_alignment_offsets(cig, reverse=False) == maybe_range


@pytest.mark.parametrize(
("cigar_string", "start", "end"),
("cigar_string", "maybe_range"),
{
("10M", 0, 10),
("10M10I", 0, 20),
("10X10I", 0, 20),
("10X10D", 0, 10),
("10=10D", 0, 10),
("10S10M", 0, 10),
("10H10M", 0, 10),
("10H10S10M", 0, 10),
("10H10S10M5S", 5, 15),
("10H10S10M5S10H", 5, 15),
("10H", 0, 0),
("10S", 10, 10),
("10S10H", 10, 10),
("5H10S10H", 10, 10),
("10M", range(0, 10)),
("10M10I", range(0, 20)),
("10X10I", range(0, 20)),
("10X10D", range(0, 10)),
("10=10D", range(0, 10)),
("10S10M", range(0, 10)),
("10H10M", range(0, 10)),
("10H10S10M", range(0, 10)),
("10H10S10M5S", range(5, 15)),
("10H10S10M5S10H", range(5, 15)),
("10H", None),
("10S", None),
("10S10H", None),
("5H10S10H", None),
},
)
def test_get_alignments_reversed(cigar_string: str, start: int, end: int) -> None:
def test_get_alignments_reversed(cigar_string: str, maybe_range: Optional[range]) -> None:
cig = Cigar.from_cigarstring(cigar_string)
assert Cigar.get_alignment_offsets(cig, reverse=True) == (start, end)
assert Cigar.get_query_alignment_offsets(cig, reverse=True) == maybe_range

0 comments on commit e041787

Please sign in to comment.