Skip to content

Commit

Permalink
added "RangeOfBases class to hold the results" (with some utility met…
Browse files Browse the repository at this point in the history
…hods)
  • Loading branch information
yfarjoun committed Sep 23, 2024
1 parent 6124684 commit 5c8a76a
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 31 deletions.
56 changes: 48 additions & 8 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@
import enum
import io
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import IO
from typing import Any
Expand Down Expand Up @@ -360,6 +361,46 @@ class _CigarOpUtil:
}


@dataclass(frozen=True)
class RangeOfBases:
    """A half-open ``[start, stop)`` pair of 0-based offsets into the bases of a read.

    Instances are immutable (the dataclass is frozen) and compare equal when both ``start``
    and ``stop`` are equal.

    Note:
        Because ``__len__`` is defined and ``__bool__`` is not, an empty range is falsy.
        Use ``is not None`` (rather than truthiness) when testing an optional value.

    Attributes:
        start (int): The starting offset (0-based, inclusive) of the range.
        stop (int): The ending offset (0-based, exclusive) of the range.

    Properties:
        slice: ``slice(start, stop)``. Can be used to extract the aligned bases from a string.
        range: ``range(start, stop)``. Can be used to obtain the indexes of the aligned bases
            in the read.

    Special methods:
        __len__: The number of offsets in the range; 0 when ``stop <= start``.
        __iter__: Iterates over ``(start, stop)`` so that an instance can be unpacked with
            ``start, stop = range_of_bases``.
    """

    start: int
    stop: int

    @property
    def slice(self) -> slice:
        """A ``slice`` object covering ``start`` (inclusive) to ``stop`` (exclusive)."""
        return slice(self.start, self.stop)

    @property
    def range(self) -> range:
        """A ``range`` object covering ``start`` (inclusive) to ``stop`` (exclusive)."""
        return range(self.start, self.stop)

    def __len__(self) -> int:
        """The number of offsets in the range; never negative."""
        # Python requires ``__len__`` to return a non-negative int; an unclamped difference
        # would make ``len()`` raise ValueError when ``stop < start``. Clamping to zero keeps
        # ``len(self)`` in agreement with ``len(self.range)``, which is empty for such bounds.
        return max(0, self.stop - self.start)

    def __iter__(self):
        """Iterate over ``start`` then ``stop``, enabling tuple-style unpacking."""
        return iter((self.start, self.stop))


@enum.unique
class CigarOp(enum.Enum):
"""Enumeration of operators that can appear in a Cigar string.
Expand Down Expand Up @@ -547,7 +588,7 @@ def length_on_target(self) -> int:
"""Returns the length of the alignment on the target sequence."""
return sum([elem.length_on_target for elem in self.elements])

def query_alignment_offsets(self, reverse: bool = False) -> Optional[range]:
def query_alignment_offsets(self, reverse: bool = False) -> Optional[RangeOfBases]:
"""Gets the 0-based, end-exclusive positions of the first and last aligned base in the
query. The resulting range will contain the range of positions in the SEQ string for
the bases that are aligned. If no bases are aligned, the return value will be None.
Expand All @@ -556,13 +597,12 @@ def query_alignment_offsets(self, reverse: bool = False) -> Optional[range]:
reverse: If True, count from the end of the query. i.e. find the offsets
using the reversed elements of the cigar.
Returns:
A range, defining the start and stop offsets of the aligned part
of the query. These offsets are 0-based and open-ended, with respect to the
beginning of the query. (If 'reverse' is True, the offsets are with
respect to the reversed query.)
If no bases are aligned, the return value will be None.
A RangeOfBases object containing the start and stop positions (0-based, end-exclusive)
of the aligned part of the query. These offsets are 0-based and open-ended, with
respect to the beginning of the query. (If 'reverse' is True, the offsets are with
respect to the reversed query.) If no bases are aligned, the return value will be
None.
"""
start_offset: int = 0
end_offset: int = 0
Expand All @@ -583,7 +623,7 @@ def query_alignment_offsets(self, reverse: bool = False) -> Optional[range]:
# We have exited the alignment and are in the clipping operators after the alignment
break

ret = range(start_offset, end_offset)
ret = RangeOfBases(start_offset, end_offset)
if not alignment_began or len(ret) == 0:
return None
return ret
Expand Down
52 changes: 29 additions & 23 deletions tests/fgpyo/sam/test_cigar.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from typing import Optional
from typing import Tuple

import pytest

from fgpyo.sam import Cigar
from fgpyo.sam import RangeOfBases

cigar = Cigar.from_cigarstring("1M4D45N37X23I11=")

Expand Down Expand Up @@ -38,50 +40,54 @@ def test_bad_index_raises_type_error(index: int) -> None:
@pytest.mark.parametrize(
("cigar_string", "maybe_range"),
{
("10M", range(0, 10)),
("10M10I", range(0, 20)),
("10X10I", range(0, 20)),
("10X10D", range(0, 10)),
("10=10D", range(0, 10)),
("10S10M", range(10, 20)),
("10H10M", range(0, 10)),
("10H10S10M", range(10, 20)),
("10H10S10M5S", range(10, 20)),
("10H10S10M5S10H", range(10, 20)),
("10M", RangeOfBases(0, 10)),
("10M10I", RangeOfBases(0, 20)),
("10X10I", RangeOfBases(0, 20)),
("10X10D", RangeOfBases(0, 10)),
("10=10D", RangeOfBases(0, 10)),
("10S10M", RangeOfBases(10, 20)),
("10H10M", RangeOfBases(0, 10)),
("10H10S10M", RangeOfBases(10, 20)),
("10H10S10M5S", RangeOfBases(10, 20)),
("10H10S10M5S10H", RangeOfBases(10, 20)),
("10H", None),
("10S", None),
("10S10H", None),
("5H10S10H", None),
("76D", None),
("76I", range(0, 76)),
("76I", RangeOfBases(0, 76)),
("10P76S", None),
("50S1000N50S", None),
},
)
def test_get_alignments(cigar_string: str, maybe_range: Optional[range]) -> None:
def test_get_alignments(cigar_string: str, maybe_range: Optional[RangeOfBases]) -> None:
cig = Cigar.from_cigarstring(cigar_string)

assert Cigar.query_alignment_offsets(cig, reverse=False) == maybe_range


@pytest.mark.parametrize(
("cigar_string", "maybe_range"),
{
("10M", range(0, 10)),
("10M10I", range(0, 20)),
("10X10I", range(0, 20)),
("10X10D", range(0, 10)),
("10=10D", range(0, 10)),
("10S10M", range(0, 10)),
("10H10M", range(0, 10)),
("10H10S10M", range(0, 10)),
("10H10S10M5S", range(5, 15)),
("10H10S10M5S10H", range(5, 15)),
("10M", RangeOfBases(0, 10)),
("10M10I", RangeOfBases(0, 20)),
("10X10I", RangeOfBases(0, 20)),
("10X10D", RangeOfBases(0, 10)),
("10=10D", RangeOfBases(0, 10)),
("10S10M", RangeOfBases(0, 10)),
("10H10M", RangeOfBases(0, 10)),
("10H10S10M", RangeOfBases(0, 10)),
("10H10S10M5S", RangeOfBases(5, 15)),
("10H10S10M5S10H", RangeOfBases(5, 15)),
("10H", None),
("10S", None),
("10S10H", None),
("5H10S10H", None),
},
)
def test_get_alignments_reversed(cigar_string: str, maybe_range: Optional[range]) -> None:
def test_get_alignments_reversed(cigar_string: str, maybe_range: Optional[Tuple[int, int]]) -> None:
cig = Cigar.from_cigarstring(cigar_string)

assert Cigar.query_alignment_offsets(cig, reverse=True) == maybe_range
if maybe_range is not None:
start, stop = maybe_range

0 comments on commit 5c8a76a

Please sign in to comment.