Skip to content

Commit

Permalink
update (biopython#4432)
Browse files Browse the repository at this point in the history
  • Loading branch information
mdehoon authored Aug 23, 2023
1 parent e02bd4e commit d416809
Show file tree
Hide file tree
Showing 3 changed files with 283 additions and 3 deletions.
130 changes: 129 additions & 1 deletion Bio/Align/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,7 @@ def __add__(self, other):
if len(self) != len(other):
raise ValueError(
"When adding two alignments they must have the same length"
" (i.e. same number or rows)"
" (i.e. same number of rows)"
)
merged = (left + right for left, right in zip(self, other))
# Take any common annotation:
Expand Down Expand Up @@ -1138,6 +1138,134 @@ def __array__(self, dtype=None):
data = np.array(data, dtype)
return data

def __add__(self, other):
"""Combine two alignments by adding them row-wise.
For example,
>>> import numpy as np
>>> from Bio.Seq import Seq
>>> from Bio.SeqRecord import SeqRecord
>>> from Bio.Align import Alignment
>>> a1 = SeqRecord(Seq("AAAAC"), id="Alpha")
>>> b1 = SeqRecord(Seq("AAAC"), id="Beta")
>>> c1 = SeqRecord(Seq("AAAAG"), id="Gamma")
>>> a2 = SeqRecord(Seq("GTT"), id="Alpha")
>>> b2 = SeqRecord(Seq("TT"), id="Beta")
>>> c2 = SeqRecord(Seq("GT"), id="Gamma")
>>> left = Alignment([a1, b1, c1],
... coordinates=np.array([[0, 3, 4, 5],
... [0, 3, 3, 4],
... [0, 3, 4, 5]]))
>>> left.annotations = {"tool": "demo", "name": "start"}
>>> left.column_annotations = {"stats": "CCCXC"}
>>> right = Alignment([a2, b2, c2],
... coordinates=np.array([[0, 1, 2, 3],
... [0, 0, 1, 2],
... [0, 1, 1, 2]]))
>>> right.annotations = {"tool": "demo", "name": "end"}
>>> right.column_annotations = {"stats": "CXC"}
Now, let's look at these two alignments:
>>> print(left)
Alpha 0 AAAAC 5
Beta 0 AAA-C 4
Gamma 0 AAAAG 5
<BLANKLINE>
>>> print(right)
Alpha 0 GTT 3
Beta 0 -TT 2
Gamma 0 G-T 2
<BLANKLINE>
And add them:
>>> combined = left + right
>>> print(combined)
Alpha 0 AAAACGTT 8
Beta 0 AAA-C-TT 6
Gamma 0 AAAAGG-T 7
<BLANKLINE>
For this to work, both alignments must have the same number of sequences
(here they both have 3 rows):
>>> len(left)
3
>>> len(right)
3
>>> len(combined)
3
The sequences are SeqRecord objects, and these can be added together. Refer
to the SeqRecord documentation for details of how the annotation is handled. This
example is a special case in that both original alignments shared the same names,
meaning when the rows are added they also get the same name.
Any common annotations are preserved, but differing annotation is lost. This is
the same behavior used in the SeqRecord annotations and is designed to prevent
accidental propagation of inappropriate values:
>>> combined.annotations
{'tool': 'demo'}
Similarly any common per-column-annotations are combined:
>>> combined.column_annotations
{'stats': 'CCCXCCXC'}
"""
if not isinstance(other, Alignment):
raise NotImplementedError
if len(self) != len(other):
raise ValueError(
"When adding two alignments they must have the same length"
" (i.e. same number of rows)"
)
starts1 = self.coordinates[:, 0]
ends1 = self.coordinates[:, -1]
sequences1 = self.sequences
starts2 = other.coordinates[:, 0]
ends2 = other.coordinates[:, -1]
sequences2 = other.sequences
sequences = []
for start1, end1, seq1, start2, end2, seq2 in zip(
starts1, ends1, sequences1, starts2, ends2, sequences2
):
sequence = seq1[start1:end1] + seq2[start2:end2]
sequences.append(sequence)
offset = starts2 - ends1 + starts1
coordinates1 = self.coordinates - starts1[:, None]
coordinates2 = other.coordinates - offset[:, None]
coordinates = np.append(coordinates1, coordinates2, axis=1)
alignment = Alignment(sequences, coordinates)
# Take any common annotation:
annotations = {}
try:
for k, v in self.annotations.items():
try:
if other.annotations[k] == v:
annotations[k] = v
except KeyError:
continue
except AttributeError:
pass
else:
alignment.annotations = annotations
column_annotations = {}
try:
for k, v in self.column_annotations.items():
try:
column_annotations[k] = v + other.column_annotations[k]
except KeyError:
continue
except AttributeError:
pass
else:
alignment.column_annotations = column_annotations
return alignment

@property
def frequencies(self):
"""Return the frequency of each letter in each column of the alignment.
Expand Down
78 changes: 78 additions & 0 deletions Doc/Tutorial/chapter_align.tex
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,84 @@ \subsection{Reverse-complementing the alignment}
\end{minted}
Reverse-complementing an alignment preserves its column annotations (in reverse order), but discards all other annotations.

\subsection{Adding alignments}

Alignments can be added together to form an extended alignment if they have the same number of rows. As an example, let's first create two alignments:
%cont-doctest
\begin{minted}{pycon}
>>> from Bio.Seq import Seq
>>> from Bio.SeqRecord import SeqRecord
>>> a1 = SeqRecord(Seq("AAAAC"), id="Alpha")
>>> b1 = SeqRecord(Seq("AAAC"), id="Beta")
>>> c1 = SeqRecord(Seq("AAAAG"), id="Gamma")
>>> a2 = SeqRecord(Seq("GTT"), id="Alpha")
>>> b2 = SeqRecord(Seq("TT"), id="Beta")
>>> c2 = SeqRecord(Seq("GT"), id="Gamma")
>>> left = Alignment(
... [a1, b1, c1], coordinates=np.array([[0, 3, 4, 5], [0, 3, 3, 4], [0, 3, 4, 5]])
... )
>>> left.annotations = {"tool": "demo", "name": "start"}
>>> left.column_annotations = {"stats": "CCCXC"}
>>> right = Alignment(
... [a2, b2, c2], coordinates=np.array([[0, 1, 2, 3], [0, 0, 1, 2], [0, 1, 1, 2]])
... )
>>> right.annotations = {"tool": "demo", "name": "end"}
>>> right.column_annotations = {"stats": "CXC"}
\end{minted}
Now, let's look at these two alignments:
%cont-doctest
\begin{minted}{pycon}
>>> print(left)
Alpha 0 AAAAC 5
Beta 0 AAA-C 4
Gamma 0 AAAAG 5
<BLANKLINE>
>>> print(right)
Alpha 0 GTT 3
Beta 0 -TT 2
Gamma 0 G-T 2
<BLANKLINE>
\end{minted}
Adding the two alignments will combine the two alignments row-wise:
%cont-doctest
\begin{minted}{pycon}
>>> combined = left + right
>>> print(combined)
Alpha 0 AAAACGTT 8
Beta 0 AAA-C-TT 6
Gamma 0 AAAAGG-T 7
<BLANKLINE>
\end{minted}
For this to work, both alignments must have the same number of sequences
(here they both have 3 rows):
%cont-doctest
\begin{minted}{pycon}
>>> len(left)
3
>>> len(right)
3
>>> len(combined)
3
\end{minted}
The sequences are \verb|SeqRecord| objects, which can be added together. Refer
to Chapter~\ref{chapter:seq_annot} for details of how the annotation is handled.
This example is a special case in that both original alignments shared the same names, meaning when the rows are added they also get the same name.

Any common annotations are preserved, but differing annotation is lost. This is
the same behavior used in the \verb|SeqRecord| annotations and is designed to
prevent accidental propagation of inappropriate values:
%cont-doctest
\begin{minted}{pycon}
>>> combined.annotations
{'tool': 'demo'}
\end{minted}
Similarly any common per-column-annotations are combined:
%cont-doctest
\begin{minted}{pycon}
>>> combined.column_annotations
{'stats': 'CCCXCCXC'}
\end{minted}

\subsection{Mapping alignments}

Suppose you have a pairwise alignment of a transcript to a chromosome:
Expand Down
78 changes: 76 additions & 2 deletions Tests/test_Align_Alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -1381,13 +1381,48 @@ def test_reverse_complement(self):
)
self.assertEqual(rc_alignment.column_annotations["letter"], "LKJIHGFEDCBA")

def test_add(self):
target = Seq("ACTAGG")
query = Seq("ACCTACG")
sequences = (target, query)
coordinates = np.array([[0, 2, 2, 6], [0, 2, 3, 7]])
alignment1 = Align.Alignment(sequences, coordinates)
target = Seq("CGTGGGG")
query = Seq("CGG")
sequences = (target, query)
coordinates = np.array([[0, 2, 3, 4, 7], [0, 2, 2, 3, 3]])
alignment2 = Align.Alignment(sequences, coordinates)
self.assertEqual(
str(alignment1),
"""\
target 0 AC-TAGG 6
0 ||-||.| 7
query 0 ACCTACG 7
""",
)
self.assertEqual(
str(alignment2),
"""\
target 0 CGTGGGG 7
0 ||-|--- 7
query 0 CG-G--- 3
""",
)
self.assertEqual(
str(alignment1 + alignment2),
"""\
target 0 AC-TAGGCGTGGGG 13
0 ||-||.|||-|--- 14
query 0 ACCTACGCG-G--- 10
""",
)


class TestMultipleAlignment(unittest.TestCase):
def setUp(self):
path = "Clustalw/opuntia.aln"
with open(path) as stream:
alignments = Align.parse(stream, "clustal")
self.alignment = next(alignments)
self.alignment = Align.read(stream, "clustal")

def tearDown(self):
del self.alignment
Expand Down Expand Up @@ -2334,6 +2369,45 @@ def test_substitutions(self):
self.assertAlmostEqual(m["C", "T"], 14.0)
self.assertAlmostEqual(m["T", "C"], 14.0)

def test_add(self):
self.assertEqual(
str(self.alignment[:, 50:60]),
"""\
gi|627328 50 TATATA---- 56
gi|627328 50 TATATATA-- 58
gi|627328 50 TATATA---- 56
gi|627328 50 TATATA---- 56
gi|627329 50 TATATATATA 60
gi|627328 50 TATATATATA 60
gi|627329 50 TATATATATA 60
""",
)
self.assertEqual(
str(self.alignment[:, 65:75]),
"""\
gi|627328 56 -ATATATTTC 65
gi|627328 58 -ATATATTTC 67
gi|627328 56 -ATATATTTC 65
gi|627328 56 -ATATATTTA 65
gi|627329 60 -ATATATTTC 69
gi|627328 60 -ATATATTTC 69
gi|627329 65 AATATATTTC 75
""",
)
alignment = self.alignment[:, 50:60] + self.alignment[:, 65:75]
self.assertEqual(
str(alignment),
"""\
gi|627328 0 TATATA-----ATATATTTC 15
gi|627328 0 TATATATA---ATATATTTC 17
gi|627328 0 TATATA-----ATATATTTC 15
gi|627328 0 TATATA-----ATATATTTA 15
gi|627329 0 TATATATATA-ATATATTTC 19
gi|627328 0 TATATATATA-ATATATTTC 19
gi|627329 0 TATATATATAAATATATTTC 20
""",
)


class TestAlignment_format(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit d416809

Please sign in to comment.