Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add cigar utility method get_alignment_offsets #188

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
41 changes: 41 additions & 0 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,47 @@ def length_on_target(self) -> int:
"""Returns the length of the alignment on the target sequence."""
return sum([elem.length_on_target for elem in self.elements])

def query_alignment_offsets(self, reverse: bool = False) -> Optional[range]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tfenne I disagree with the suggestion to use a range object.

  1. This is an unconventional way to use range
  2. The stated purpose of the function is to return the start and end positions of the alignment within the query sequence. I assume that in most contexts where this function will be used, the start and end positions will be used to slice the query sequence. range cannot be used to slice a string, nor does it support tuple unpacking.

I think it would be more ergonomic to return a tuple[int, int], though I wouldn't mind seeing a use case where range provides advantages. (Nor would I be opposed to using a NamedTuple with start and end attributes)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not opinionated here, but am noticing that range does have start-stop access, so in that regard it will be equivalent to the named NamedTuple.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(almost stop!=end)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but you can't unpack the range

i.e.

# this is valid
from typing import NamedTuple

class Offsets(NamedTuple):
    start: int
    end: int
    
offsets = Offsets(start=1, end=200)

start, end = offsets

# this is invalid
offsets = range(1, 200)
start, end = offsets

So when calling the method, you can either accept the tuple or unpack it immediately, depending on which makes more sense in your context

e.g. I imagine most use cases will look like the following

start, end = cigar.alignment_offsets()
aligned_sequence = query[start:end]

"""Gets the 0-based, end-exclusive positions of the first and last aligned base in the query.
The resulting range will contain the range of positions in the SEQ string for the bases
that are aligned. If no bases are aligned, the return value will be None.

Args:
reverse: If True, count from the end of the query. i.e. find the offsets
using the reversed elements of the cigar.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change


Returns:
A range, defining the start and stop offsets of the aligned part
of the query. These offsets are 0-based and open-ended, with respect to the
beginning of the query. (If 'reverse' is True, the offsets are with
respect to the reversed query.)
If no bases are aligned, the return value will be None.
"""
start_offset: int = 0
end_offset: int = 0
element: CigarElement
alignment_began = False
elements = self.elements if not reverse else reversed(self.elements)
for element in elements:
if element.operator.is_clipping and not alignment_began:
# We are in the clipping operators preceding the alignment
# Note: hardclips have length-on-query=0
start_offset += element.length_on_query
end_offset += element.length_on_query
elif not element.operator.is_clipping:
# We are within the alignment
alignment_began = True
end_offset += element.length_on_query
Comment on lines +618 to +621
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So ... I think you have to consider edge-case cigars here. What does this function report for:

  • 76D
  • 76I
  • 10P76S
  • 50S1000N50S

else:
# We have exited the alignment and are in the clipping operators after the alignment
break

ret = range(start_offset, end_offset)
if not alignment_began or len(ret) == 0:
return None
return ret

def __getitem__(
self, index: Union[int, slice]
) -> Union[CigarElement, Tuple[CigarElement, ...]]:
Expand Down
54 changes: 54 additions & 0 deletions tests/fgpyo/sam/test_cigar.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Optional

import pytest

from fgpyo.sam import Cigar
Expand Down Expand Up @@ -31,3 +33,55 @@ def test_bad_index_raises_index_error(index: int) -> None:
def test_bad_index_raises_type_error(index: int) -> None:
with pytest.raises(TypeError):
cigar[index]


@pytest.mark.parametrize(
("cigar_string", "maybe_range"),
{
("10M", range(0, 10)),
("10M10I", range(0, 20)),
("10X10I", range(0, 20)),
("10X10D", range(0, 10)),
("10=10D", range(0, 10)),
("10S10M", range(10, 20)),
("10H10M", range(0, 10)),
("10H10S10M", range(10, 20)),
("10H10S10M5S", range(10, 20)),
("10H10S10M5S10H", range(10, 20)),
("10H", None),
("10S", None),
("10S10H", None),
("5H10S10H", None),
("76D", None),
("76I", range(0, 76)),
("10P76S", None),
("50S1000N50S", None),
},
)
def test_get_alignments(cigar_string: str, maybe_range: Optional[range]) -> None:
cig = Cigar.from_cigarstring(cigar_string)
assert Cigar.query_alignment_offsets(cig, reverse=False) == maybe_range


@pytest.mark.parametrize(
("cigar_string", "maybe_range"),
{
("10M", range(0, 10)),
("10M10I", range(0, 20)),
("10X10I", range(0, 20)),
("10X10D", range(0, 10)),
("10=10D", range(0, 10)),
("10S10M", range(0, 10)),
("10H10M", range(0, 10)),
("10H10S10M", range(0, 10)),
("10H10S10M5S", range(5, 15)),
("10H10S10M5S10H", range(5, 15)),
("10H", None),
("10S", None),
("10S10H", None),
("5H10S10H", None),
},
)
def test_get_alignments_reversed(cigar_string: str, maybe_range: Optional[range]) -> None:
cig = Cigar.from_cigarstring(cigar_string)
assert Cigar.query_alignment_offsets(cig, reverse=True) == maybe_range
Loading