From 9e76a7024510aa152bf1c5da762d625a4b399b80 Mon Sep 17 00:00:00 2001
From: Nils Homer <nilshomer@gmail.com>
Date: Wed, 14 Aug 2024 17:02:58 -0700
Subject: [PATCH] feat: add methods for computing run lengths of repeats in a
 sequence

The `longest_dinucleotide_run_length` method is optimized for dinucleotides,
whereas the `longest_multinucleotide_run_length` method is general for
any repeat of length N.
---
 fgpyo/sequence.py            | 104 ++++++++++++++++++++++++++++-------
 tests/fgpyo/test_sequence.py |  66 +++++++++++++++++++++-
 2 files changed, 149 insertions(+), 21 deletions(-)

diff --git a/fgpyo/sequence.py b/fgpyo/sequence.py
index d8ab32e..b85ea1c 100644
--- a/fgpyo/sequence.py
+++ b/fgpyo/sequence.py
@@ -76,29 +76,11 @@ def reverse_complement(bases: str) -> str:
     return "".join([_COMPLEMENTS[b] for b in bases[::-1]])
 
 
-def longest_hp_length(bases: str) -> int:
-    """Calculates the length of the longest homopolymer in the input sequence."""
-    max_hp = 0
-    i = 0
-    # NB: if we have found a homopolymer of length `max_hp`, then we do not need
-    # to examine the last `max_hp` bases since we'll never find a longer one.
-    bases_len = len(bases)
-    while i < bases_len - max_hp:
-        base = bases[i]
-        j = i + 1
-        while j < bases_len and bases[j] == base:
-            j += 1
-        max_hp = max(max_hp, j - i)
-        # skip over all the bases in the current homopolymer
-        i = j
-    return max_hp
-
-
 def gc_content(bases: str) -> float:
     """Calculates the fraction of G and C bases in a sequence."""
     if len(bases) == 0:
         return 0
-    gc_count = sum(1 for base in bases if base == "C" or base == "G" or base == "c" or base == "g")
+    gc_count = sum(1 for base in bases if base in "CGcg")
     return gc_count / len(bases)
 
 
@@ -164,3 +146,87 @@ def levenshtein(string1: str, string2: str) -> int:
                     matrix[i + 1][j + 1],  # Substitution
                 )
     return matrix[0][0]
+
+
+def longest_homopolymer_length(bases: str, min_hp: int = 0) -> int:
+    """Calculates the length of the longest homopolymer in the input sequence.
+
+    Args:
+        bases: the bases over which to compute
+        min_hp: do not search for homopolymers of this length or shorter
+
+    Return:
+        the length of the longest homopolymer
+    """
+    i = 0
+    # NB: if we have found a homopolymer of length `min_hp`, then we do not need
+    # to examine the last `min_hp` bases since we'll never find a longer one.
+    bases_len = len(bases)
+    while i < bases_len - min_hp:
+        base = bases[i].upper()
+        j = i + 1
+        while j < bases_len and bases[j] == base:
+            j += 1
+        min_hp = max(min_hp, j - i)
+        # skip over all the bases in the current homopolymer
+        i = j
+    return min_hp
+
+
+def longest_dinucleotide_run_length(bases: str) -> int:
+    """Number of bases in the longest dinucleotide run in a primer.
+
+    A dinucleotide run is when two nucleotides are repeated in tandem. For example,
+    TTGG (length = 4) or AACCAACCAA (length = 10). If there are no such runs, returns 0.
+
+    Args:
+        bases: the bases over which to compute
+
+    Return:
+        the number of bases in the longest dinuc repeat (NOT the number of repeat units)
+    """
+    return longest_multinucleotide_run_length(bases=bases, repeat_unit_length=2)
+
+
+def longest_multinucleotide_run_length(bases: str, repeat_unit_length: int) -> int:
+    """Number of bases in the longest multi-nucleotide run.
+
+    A multi-nucleotide run is when N nucleotides are repeated in tandem. For example,
+    TTGG (length = 4, N=2) or TAGTAGTAG (length = 9, N = 3). If there are no such runs,
+    returns 0.
+
+    Args:
+        bases: the bases over which to compute
+        repeat_unit_length: the length of the multi-nucleotide repetitive unit (must be > 0)
+
+    Returns:
+        the number of bases in the longest multinucleotide repeat (NOT the number of repeat units)
+    """
+    if repeat_unit_length <= 0:
+        raise ValueError(f"repeat_unit_length must be >= 0, found: {repeat_unit_length}")
+    elif len(bases) < repeat_unit_length:
+        return 0
+    elif len(bases) == repeat_unit_length:
+        return repeat_unit_length
+    elif repeat_unit_length == 1:
+        return longest_homopolymer_length(bases=bases)
+
+    best_length: int = 0
+    start = 0  # the start index of the current multi-nucleotide run
+    while start < len(bases) - 1:
+        # get the dinuc bases
+        dinuc = bases[start : start + repeat_unit_length].upper()
+        # keep going while there are more di-nucs
+        end = start + repeat_unit_length
+        while end < len(bases) - 1 and dinuc == bases[end : end + repeat_unit_length].upper():
+            end += repeat_unit_length
+        cur_length = end - start
+        # update the longest total run length
+        best_length = max(best_length, cur_length)
+        # move to the next start
+        if cur_length <= repeat_unit_length:  # only one repeat unit found, move the start by 1bp
+            start += 1
+        else:  # multiple repeats found, skip to the last base of the current run
+            start += cur_length - 1
+
+    return best_length
diff --git a/tests/fgpyo/test_sequence.py b/tests/fgpyo/test_sequence.py
index d810ea0..0f605b6 100644
--- a/tests/fgpyo/test_sequence.py
+++ b/tests/fgpyo/test_sequence.py
@@ -1,11 +1,16 @@
 """Tests for `~fgpyo.sequence`"""
 
+from typing import List
+from typing import Tuple
+
 import pytest
 
 from fgpyo.sequence import gc_content
 from fgpyo.sequence import hamming
 from fgpyo.sequence import levenshtein
-from fgpyo.sequence import longest_hp_length
+from fgpyo.sequence import longest_dinucleotide_run_length
+from fgpyo.sequence import longest_homopolymer_length
+from fgpyo.sequence import longest_multinucleotide_run_length
 from fgpyo.sequence import reverse_complement
 
 
@@ -39,7 +44,7 @@ def test_reverse_complement(bases: str, expected_rev_comp: str) -> None:
     ],
 )
 def test_homopolymer(bases: str, expected_hp_len: int) -> None:
-    assert longest_hp_length(bases) == expected_hp_len
+    assert longest_homopolymer_length(bases) == expected_hp_len
 
 
 @pytest.mark.parametrize(
@@ -111,3 +116,60 @@ def test_hamming_with_invalid_strings(string1: str, string2: str) -> None:
 )
 def test_levenshtein_dynamic(string1: str, string2: str, levenshtein_distance: int) -> None:
     assert levenshtein(string1, string2) == levenshtein_distance
+
+
+MULTINUCLEOTIDE_TEST_CASES: List[Tuple[str, int, int]] = [
+    ("", 2, 0),
+    ("", 1, 0),
+    ("A", 1, 1),
+    ("A", 2, 0),
+    ("ACGT", 2, 2),
+    ("AACCGGTT", 2, 2),
+    ("TTTTCCCGGA", 2, 4),
+    ("ACCGGGTTTT", 2, 4),
+    ("GTGTGTAAAA", 2, 6),
+    ("GTGTGTGAAAA", 2, 6),
+    ("GTGTGTACACACAC", 2, 8),
+    ("GTGTGTGACACACAC", 2, 8),
+    ("AAACTCTCTCGG", 2, 6),
+    ("AACTCTCTCGGG", 2, 6),
+    ("TGTATATATA", 2, 8),
+    ("TGTATATATAC", 2, 8),
+    ("TGATATATATC", 2, 8),
+    ("TGATATATAC", 2, 6),
+    ("TAGTAGTAG", 3, 9),
+    ("TATAGTAGTAGTA", 3, 9),
+    ("TACGTACGTACTG", 4, 8),
+    ("TAGTAGTAGTAG", 3, 12),
+    ("TAGTAGTAGTAG", 6, 12),
+    ("TAGTAGTAGTAG", 12, 12),
+    ("AACCGGTT", 1, 2),
+    ("AACCGGTT", 3, 3),
+    ("TAGTAGTAGTAG", 5, 5),
+    ("TAGTAGTAGTAG", 7, 7),
+    ("TTTAAAAAAAAAATTT", 5, 10),
+    ("ACGACCATATatatatatatGC", 2, 14),
+    ("ACGACCATATatatatatatATGC", 2, 16),
+    ("ttgaTtaCaGATTACAgattacacc", 7, 21),
+]
+
+DINUCLEOTIDE_TEST_CASES: List[Tuple[str, int]] = [
+    (bases, expected_length)
+    for bases, repeat_unit_length, expected_length in MULTINUCLEOTIDE_TEST_CASES
+    if repeat_unit_length == 2
+]
+
+
+@pytest.mark.parametrize("bases, expected_length", DINUCLEOTIDE_TEST_CASES)
+def test_longest_dinucleotide_run_length(bases: str, expected_length: int) -> None:
+    assert expected_length == longest_dinucleotide_run_length(bases=bases)
+
+
+@pytest.mark.parametrize("bases, repeat_unit_length, expected_length", MULTINUCLEOTIDE_TEST_CASES)
+def test_longest_multinucleotide_run_length(
+    bases: str, repeat_unit_length: int, expected_length: int
+) -> None:
+    assert expected_length == 0 or expected_length % repeat_unit_length == 0
+    assert expected_length == longest_multinucleotide_run_length(
+        bases, repeat_unit_length=repeat_unit_length
+    )