From 9e76a7024510aa152bf1c5da762d625a4b399b80 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Wed, 14 Aug 2024 17:02:58 -0700 Subject: [PATCH] feat: add methods for computing run lengths of repeats in a sequence The `longest_dinucleotide_run_length` method is optimized for dinucleotides, whereas the `longest_multinucleotide_run_length` method is general for any repeat of length N. --- fgpyo/sequence.py | 104 ++++++++++++++++++++++++++++------- tests/fgpyo/test_sequence.py | 66 +++++++++++++++++++++- 2 files changed, 149 insertions(+), 21 deletions(-) diff --git a/fgpyo/sequence.py b/fgpyo/sequence.py index d8ab32e..b85ea1c 100644 --- a/fgpyo/sequence.py +++ b/fgpyo/sequence.py @@ -76,29 +76,11 @@ def reverse_complement(bases: str) -> str: return "".join([_COMPLEMENTS[b] for b in bases[::-1]]) -def longest_hp_length(bases: str) -> int: - """Calculates the length of the longest homopolymer in the input sequence.""" - max_hp = 0 - i = 0 - # NB: if we have found a homopolymer of length `max_hp`, then we do not need - # to examine the last `max_hp` bases since we'll never find a longer one. - bases_len = len(bases) - while i < bases_len - max_hp: - base = bases[i] - j = i + 1 - while j < bases_len and bases[j] == base: - j += 1 - max_hp = max(max_hp, j - i) - # skip over all the bases in the current homopolymer - i = j - return max_hp - - def gc_content(bases: str) -> float: """Calculates the fraction of G and C bases in a sequence.""" if len(bases) == 0: return 0 - gc_count = sum(1 for base in bases if base == "C" or base == "G" or base == "c" or base == "g") + gc_count = sum(1 for base in bases if base in "CGcg") return gc_count / len(bases) @@ -164,3 +146,87 @@ def levenshtein(string1: str, string2: str) -> int: matrix[i + 1][j + 1], # Substitution ) return matrix[0][0] + + +def longest_homopolymer_length(bases: str, min_hp: int = 0) -> int: + """Calculates the length of the longest homopolymer in the input sequence. + + Args: + bases: the bases over which to compute + min_hp: do not search for homopolymers of this length or shorter + + Return: + the length of the longest homopolymer + """ + i = 0 + # NB: if we have found a homopolymer of length `min_hp`, then we do not need + # to examine the last `min_hp` bases since we'll never find a longer one. + bases_len = len(bases) + while i < bases_len - min_hp: + base = bases[i].upper() + j = i + 1 + while j < bases_len and bases[j] == base: + j += 1 + min_hp = max(min_hp, j - i) + # skip over all the bases in the current homopolymer + i = j + return min_hp + + +def longest_dinucleotide_run_length(bases: str) -> int: + """Number of bases in the longest dinucleotide run in a primer. + + A dinucleotide run is when two nucleotides are repeated in tandem. For example, + TTGG (length = 4) or AACCAACCAA (length = 10). If there are no such runs, returns 0. + + Args: + bases: the bases over which to compute + + Return: + the number of bases in the longest dinuc repeat (NOT the number of repeat units) + """ + return longest_multinucleotide_run_length(bases=bases, repeat_unit_length=2) + + +def longest_multinucleotide_run_length(bases: str, repeat_unit_length: int) -> int: + """Number of bases in the longest multi-nucleotide run. + + A multi-nucleotide run is when N nucleotides are repeated in tandem. For example, + TTGG (length = 4, N=2) or TAGTAGTAG (length = 9, N = 3). If there are no such runs, + returns 0. + + Args: + bases: the bases over which to compute + repeat_unit_length: the length of the multi-nucleotide repetitive unit (must be > 0) + + Returns: + the number of bases in the longest multinucleotide repeat (NOT the number of repeat units) + """ + if repeat_unit_length <= 0: + raise ValueError(f"repeat_unit_length must be >= 0, found: {repeat_unit_length}") + elif len(bases) < repeat_unit_length: + return 0 + elif len(bases) == repeat_unit_length: + return repeat_unit_length + elif repeat_unit_length == 1: + return longest_homopolymer_length(bases=bases) + + best_length: int = 0 + start = 0 # the start index of the current multi-nucleotide run + while start < len(bases) - 1: + # get the dinuc bases + dinuc = bases[start : start + repeat_unit_length].upper() + # keep going while there are more di-nucs + end = start + repeat_unit_length + while end < len(bases) - 1 and dinuc == bases[end : end + repeat_unit_length].upper(): + end += repeat_unit_length + cur_length = end - start + # update the longest total run length + best_length = max(best_length, cur_length) + # move to the next start + if cur_length <= repeat_unit_length: # only one repeat unit found, move the start by 1bp + start += 1 + else: # multiple repeats found, skip to the last base of the current run + start += cur_length - 1 + + return best_length diff --git a/tests/fgpyo/test_sequence.py b/tests/fgpyo/test_sequence.py index d810ea0..0f605b6 100644 --- a/tests/fgpyo/test_sequence.py +++ b/tests/fgpyo/test_sequence.py @@ -1,11 +1,16 @@ """Tests for `~fgpyo.sequence`""" +from typing import List +from typing import Tuple + import pytest from fgpyo.sequence import gc_content from fgpyo.sequence import hamming from fgpyo.sequence import levenshtein -from fgpyo.sequence import longest_hp_length +from fgpyo.sequence import longest_dinucleotide_run_length +from fgpyo.sequence import longest_homopolymer_length +from fgpyo.sequence import longest_multinucleotide_run_length from fgpyo.sequence import reverse_complement @@ -39,7 +44,7 @@ def test_reverse_complement(bases: str, expected_rev_comp: str) -> None: ], ) def test_homopolymer(bases: str, expected_hp_len: int) -> None: - assert longest_hp_length(bases) == expected_hp_len + assert longest_homopolymer_length(bases) == expected_hp_len @pytest.mark.parametrize( @@ -111,3 +116,60 @@ def test_hamming_with_invalid_strings(string1: str, string2: str) -> None: ) def test_levenshtein_dynamic(string1: str, string2: str, levenshtein_distance: int) -> None: assert levenshtein(string1, string2) == levenshtein_distance + + +MULTINUCLEOTIDE_TEST_CASES: List[Tuple[str, int, int]] = [ + ("", 2, 0), + ("", 1, 0), + ("A", 1, 1), + ("A", 2, 0), + ("ACGT", 2, 2), + ("AACCGGTT", 2, 2), + ("TTTTCCCGGA", 2, 4), + ("ACCGGGTTTT", 2, 4), + ("GTGTGTAAAA", 2, 6), + ("GTGTGTGAAAA", 2, 6), + ("GTGTGTACACACAC", 2, 8), + ("GTGTGTGACACACAC", 2, 8), + ("AAACTCTCTCGG", 2, 6), + ("AACTCTCTCGGG", 2, 6), + ("TGTATATATA", 2, 8), + ("TGTATATATAC", 2, 8), + ("TGATATATATC", 2, 8), + ("TGATATATAC", 2, 6), + ("TAGTAGTAG", 3, 9), + ("TATAGTAGTAGTA", 3, 9), + ("TACGTACGTACTG", 4, 8), + ("TAGTAGTAGTAG", 3, 12), + ("TAGTAGTAGTAG", 6, 12), + ("TAGTAGTAGTAG", 12, 12), + ("AACCGGTT", 1, 2), + ("AACCGGTT", 3, 3), + ("TAGTAGTAGTAG", 5, 5), + ("TAGTAGTAGTAG", 7, 7), + ("TTTAAAAAAAAAATTT", 5, 10), + ("ACGACCATATatatatatatGC", 2, 14), + ("ACGACCATATatatatatatATGC", 2, 16), + ("ttgaTtaCaGATTACAgattacacc", 7, 21), +] + +DINUCLEOTIDE_TEST_CASES: List[Tuple[str, int]] = [ + (bases, expected_length) + for bases, repeat_unit_length, expected_length in MULTINUCLEOTIDE_TEST_CASES + if repeat_unit_length == 2 +] + + +@pytest.mark.parametrize("bases, expected_length", DINUCLEOTIDE_TEST_CASES) +def test_longest_dinucleotide_run_length(bases: str, expected_length: int) -> None: + assert expected_length == longest_dinucleotide_run_length(bases=bases) + + +@pytest.mark.parametrize("bases, repeat_unit_length, expected_length", MULTINUCLEOTIDE_TEST_CASES) +def test_longest_multinucleotide_run_length( + bases: str, repeat_unit_length: int, expected_length: int +) -> None: + assert expected_length == 0 or expected_length % repeat_unit_length == 0 + assert expected_length == longest_multinucleotide_run_length( + bases, repeat_unit_length=repeat_unit_length + )