allow user to set splice site filtration strategy

ablab · Apr 9, 2024 · 666b79b · 666b79b
1 parent 6117cff
commit 666b79b
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -522,6 +522,15 @@ The main explanation that some aligner report a lot of false unspliced alignment
 for ONT reads.
 
 
+`--report_canonical`
+    Strategy for reporting novel transcripts based on canonical splice sites, should be one of:
+
+* `only_canonical` - report novel transcripts, which contain only canonical splice sites;
+* `only_stranded` - report novel transcripts, for which the strand can be unambiguously derived using splice sites and 
+presence of a polyA tail, allowing some splice sites to be non-canonical (default);
+* `all` -- report all transcript model regardless of their splice sites.
+
+
 `--polya_requirement` Strategy for using polyA tails during transcript model construction, should be one of:
 
 * `default` - polyA tails are required if at least 70% of the reads have polyA tail; 
@@ -541,9 +550,6 @@ We recommend _not_ to modify these options unless you are clearly aware of their
 `--no_secondary`
     Ignore secondary alignments.
 
-`--report_unstranded`
-    Report transcripts for which the strand cannot be detected using canonical splice sites.
-
 `--aligner`
     Force to use this alignment method, can be `starlong` or `minimap2`; `minimap2` is currently used as default. Make sure the specified aligner is in the `$PATH` variable.
 

diff --git a/isoquant.py b/isoquant.py
@@ -30,6 +30,7 @@
     DataSetReadMapper
 )
 from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies
+from src.graph_based_model_construction import StrandnessReportingLevel
 from src.long_read_assigner import AmbiguityResolvingMethod
 from src.long_read_counter import COUNTING_STRATEGIES
 from src.input_data_storage import InputDataStorage
@@ -150,20 +151,22 @@ def add_hidden_option(*args, **kwargs):  # show command only with --full-help
                           help="gene quantification strategy", type=str, default="with_inconsistent")
     add_additional_option("--report_novel_unspliced", "-u", type=bool_str,
                           help="report novel monoexonic transcripts (true/false), "
-                               "default: False for ONT, True for other data types")
+                               "default: false for ONT, true for other data types")
+    add_additional_option("--report_canonical",  type=str, choices=[e.name for e in StrandnessReportingLevel],
+                          help="reporting level for novel transcripts based on canonical splice sites " +
+                               "/".join([e.name for e in StrandnessReportingLevel]) +
+                               "default: " + StrandnessReportingLevel.only_canonical.name,
+                          default=StrandnessReportingLevel.only_canonical.name)
     add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies],
-                          help="require polyA tails to be present when reporting transcripts (default/never/always), "
+                          help="require polyA tails to be present when reporting transcripts " +
+                               "/".join([e.name for e in StrandnessReportingLevel]) +
                                "default: require polyA only when polyA percentage is >= 70%%",
                           default=PolyAUsageStrategies.default.name)
     # OUTPUT PROPERTIES
     pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int,
                                      default="16")
     pipeline_args_group.add_argument('--check_canonical', action='store_true', default=False,
                                      help="report whether splice junctions are canonical")
-    add_additional_option_to_group(pipeline_args_group, "--report_unstranded",
-                                   help="report transcripts for which the strand cannot be detected "
-                                        "using canonical splice sites",
-                                   action='store_true', default=False)
     pipeline_args_group.add_argument("--sqanti_output", help="produce SQANTI-like TSV output",
                                      action='store_true', default=False)
     pipeline_args_group.add_argument("--count_exons", help="perform exon and intron counting",
@@ -650,6 +653,7 @@ def set_model_construction_options(args):
     args.require_monointronic_polya = strategy.require_monointronic_polya
     args.require_monoexonic_polya = strategy.require_monoexonic_polya
     args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement]
+    args.report_canonical_strategy = StrandnessReportingLevel[args.report_canonical]
 
 
 def set_configs_directory(args):

diff --git a/src/gene_info.py b/src/gene_info.py
@@ -635,7 +635,7 @@ def set_strand(self, intron, strand=None):
         elif self.chr_record:
             self.strand_dict[intron] = get_intron_strand(intron, self.chr_record)
 
-    def get_strand(self, introns, has_polya=False, has_polyt=False):
+    def count_canonical_sites(self, introns):
         count_fwd = 0
         count_rev = 0
         for intron in introns:
@@ -648,6 +648,19 @@ def get_strand(self, introns, has_polya=False, has_polyt=False):
                 count_fwd += 1
             elif strand == '-':
                 count_rev += 1
+        return count_fwd, count_rev
+
+    # all splice sites must be canonical from the same strand, not just the majority
+    def get_clean_strand(self, introns):
+        count_fwd, count_rev = self.count_canonical_sites(introns)
+        if count_fwd == 0 and count_rev > 0:
+            return  '-'
+        elif count_fwd > 0 and count_rev == 0:
+            return '+'
+        return '.'
+
+    def get_strand(self, introns, has_polya=False, has_polyt=False):
+        count_fwd, count_rev = self.count_canonical_sites(introns)
         if count_fwd == count_rev:
             if has_polya and not has_polyt:
                 return '+'

diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py
@@ -7,6 +7,7 @@
 import logging
 from collections import defaultdict
 from functools import cmp_to_key
+from enum import unique, Enum
 
 from .common import (
     AtomicCounter,
@@ -30,6 +31,13 @@
 logger = logging.getLogger('IsoQuant')
 
 
+@unique
+class StrandnessReportingLevel(Enum):
+    only_canonical = 1
+    only_stranded = 2
+    all = 3
+
+
 class GraphBasedModelConstructor:
     transcript_id_counter = AtomicCounter()
     transcript_prefix = "transcript"
@@ -370,19 +378,22 @@ def construct_fl_isoforms(self):
                 has_polya = path[-1][0] == VERTEX_polya
                 polya_site = has_polya or has_polyt
                 transcript_strand = self.strand_detector.get_strand(intron_path, has_polya, has_polyt)
-                transcript_ss_strand = self.strand_detector.get_strand(intron_path)
+                transcript_clean_strand = self.strand_detector.get_clean_strand(intron_path)
 
                 #logger.debug("uuu Novel isoform %s has coverage: %d cutoff = %d, component cov = %d, max_coverage = %d"
                 #             % (new_transcript_id, count, novel_isoform_cutoff, component_coverage, self.intron_graph.max_coverage))
                 if count < novel_isoform_cutoff:
                     # logger.debug("uuu Novel isoform %s has low coverage: %d\t%d" % (new_transcript_id, count, novel_isoform_cutoff))
                     pass
                 elif (len(novel_exons) == 2 and
-                      ((self.params.require_monointronic_polya and not polya_site) or transcript_ss_strand == '.')):
+                      ((self.params.require_monointronic_polya and not polya_site) or transcript_clean_strand == '.')):
                     # logger.debug("uuu Avoiding single intron %s isoform: %d\t%s" % (new_transcript_id, count, str(path)))
                     pass
-                elif transcript_strand == '.' and not self.params.report_unstranded:
-                    logger.info("Avoiding unreliable transcript with %d exons" % len(novel_exons))
+                elif ((self.params.report_canonical_strategy == StrandnessReportingLevel.only_canonical
+                       and transcript_clean_strand == '.')
+                      or (self.params.report_canonical_strategy == StrandnessReportingLevel.only_stranded
+                          and transcript_strand == '.')):
+                    logger.debug("Avoiding unreliable transcript with %d exons" % len(novel_exons))
                     pass
                 else:
                     if self.params.use_technical_replicas and \