diff --git a/README.md b/README.md index dce014ad..4eb69179 100644 --- a/README.md +++ b/README.md @@ -522,6 +522,15 @@ The main explanation that some aligner report a lot of false unspliced alignment for ONT reads. +`--report_canonical` + Strategy for reporting novel transcripts based on canonical splice sites, should be one of: + +* `only_canonical` - report novel transcripts, which contain only canonical splice sites; +* `only_stranded` - report novel transcripts, for which the strand can be unambiguously derived using splice sites and +presence of a polyA tail, allowing some splice sites to be non-canonical (default); +* `all` -- report all transcript model regardless of their splice sites. + + `--polya_requirement` Strategy for using polyA tails during transcript model construction, should be one of: * `default` - polyA tails are required if at least 70% of the reads have polyA tail; @@ -541,9 +550,6 @@ We recommend _not_ to modify these options unless you are clearly aware of their `--no_secondary` Ignore secondary alignments. -`--report_unstranded` - Report transcripts for which the strand cannot be detected using canonical splice sites. - `--aligner` Force to use this alignment method, can be `starlong` or `minimap2`; `minimap2` is currently used as default. Make sure the specified aligner is in the `$PATH` variable. diff --git a/isoquant.py b/isoquant.py index 0dff1955..8e473f65 100755 --- a/isoquant.py +++ b/isoquant.py @@ -30,6 +30,7 @@ DataSetReadMapper ) from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies +from src.graph_based_model_construction import StrandnessReportingLevel from src.long_read_assigner import AmbiguityResolvingMethod from src.long_read_counter import COUNTING_STRATEGIES from src.input_data_storage import InputDataStorage @@ -150,9 +151,15 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help="gene quantification strategy", type=str, default="with_inconsistent") add_additional_option("--report_novel_unspliced", "-u", type=bool_str, help="report novel monoexonic transcripts (true/false), " - "default: False for ONT, True for other data types") + "default: false for ONT, true for other data types") + add_additional_option("--report_canonical", type=str, choices=[e.name for e in StrandnessReportingLevel], + help="reporting level for novel transcripts based on canonical splice sites " + + "/".join([e.name for e in StrandnessReportingLevel]) + + "default: " + StrandnessReportingLevel.only_canonical.name, + default=StrandnessReportingLevel.only_canonical.name) add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies], - help="require polyA tails to be present when reporting transcripts (default/never/always), " + help="require polyA tails to be present when reporting transcripts " + + "/".join([e.name for e in StrandnessReportingLevel]) + "default: require polyA only when polyA percentage is >= 70%%", default=PolyAUsageStrategies.default.name) # OUTPUT PROPERTIES @@ -160,10 +167,6 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help default="16") pipeline_args_group.add_argument('--check_canonical', action='store_true', default=False, help="report whether splice junctions are canonical") - add_additional_option_to_group(pipeline_args_group, "--report_unstranded", - help="report transcripts for which the strand cannot be detected " - "using canonical splice sites", - action='store_true', default=False) pipeline_args_group.add_argument("--sqanti_output", help="produce SQANTI-like TSV output", action='store_true', default=False) pipeline_args_group.add_argument("--count_exons", help="perform exon and intron counting", @@ -650,6 +653,7 @@ def set_model_construction_options(args): args.require_monointronic_polya = strategy.require_monointronic_polya args.require_monoexonic_polya = strategy.require_monoexonic_polya args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement] + args.report_canonical_strategy = StrandnessReportingLevel[args.report_canonical] def set_configs_directory(args): diff --git a/src/gene_info.py b/src/gene_info.py index 88ce30f2..2a0e318d 100644 --- a/src/gene_info.py +++ b/src/gene_info.py @@ -635,7 +635,7 @@ def set_strand(self, intron, strand=None): elif self.chr_record: self.strand_dict[intron] = get_intron_strand(intron, self.chr_record) - def get_strand(self, introns, has_polya=False, has_polyt=False): + def count_canonical_sites(self, introns): count_fwd = 0 count_rev = 0 for intron in introns: @@ -648,6 +648,19 @@ def get_strand(self, introns, has_polya=False, has_polyt=False): count_fwd += 1 elif strand == '-': count_rev += 1 + return count_fwd, count_rev + + # all splice sites must be canonical from the same strand, not just the majority + def get_clean_strand(self, introns): + count_fwd, count_rev = self.count_canonical_sites(introns) + if count_fwd == 0 and count_rev > 0: + return '-' + elif count_fwd > 0 and count_rev == 0: + return '+' + return '.' + + def get_strand(self, introns, has_polya=False, has_polyt=False): + count_fwd, count_rev = self.count_canonical_sites(introns) if count_fwd == count_rev: if has_polya and not has_polyt: return '+' diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 6520418e..fb135e45 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -7,6 +7,7 @@ import logging from collections import defaultdict from functools import cmp_to_key +from enum import unique, Enum from .common import ( AtomicCounter, @@ -30,6 +31,13 @@ logger = logging.getLogger('IsoQuant') +@unique +class StrandnessReportingLevel(Enum): + only_canonical = 1 + only_stranded = 2 + all = 3 + + class GraphBasedModelConstructor: transcript_id_counter = AtomicCounter() transcript_prefix = "transcript" @@ -370,7 +378,7 @@ def construct_fl_isoforms(self): has_polya = path[-1][0] == VERTEX_polya polya_site = has_polya or has_polyt transcript_strand = self.strand_detector.get_strand(intron_path, has_polya, has_polyt) - transcript_ss_strand = self.strand_detector.get_strand(intron_path) + transcript_clean_strand = self.strand_detector.get_clean_strand(intron_path) #logger.debug("uuu Novel isoform %s has coverage: %d cutoff = %d, component cov = %d, max_coverage = %d" # % (new_transcript_id, count, novel_isoform_cutoff, component_coverage, self.intron_graph.max_coverage)) @@ -378,11 +386,14 @@ def construct_fl_isoforms(self): # logger.debug("uuu Novel isoform %s has low coverage: %d\t%d" % (new_transcript_id, count, novel_isoform_cutoff)) pass elif (len(novel_exons) == 2 and - ((self.params.require_monointronic_polya and not polya_site) or transcript_ss_strand == '.')): + ((self.params.require_monointronic_polya and not polya_site) or transcript_clean_strand == '.')): # logger.debug("uuu Avoiding single intron %s isoform: %d\t%s" % (new_transcript_id, count, str(path))) pass - elif transcript_strand == '.' and not self.params.report_unstranded: - logger.info("Avoiding unreliable transcript with %d exons" % len(novel_exons)) + elif ((self.params.report_canonical_strategy == StrandnessReportingLevel.only_canonical + and transcript_clean_strand == '.') + or (self.params.report_canonical_strategy == StrandnessReportingLevel.only_stranded + and transcript_strand == '.')): + logger.debug("Avoiding unreliable transcript with %d exons" % len(novel_exons)) pass else: if self.params.use_technical_replicas and \