ablab · rkpfeil · Aug 21, 2023 · Jun 20, 2023 · Jun 1, 2023 · Jun 5, 2023
diff --git a/isoquant.py b/isoquant.py
@@ -87,6 +87,10 @@ def add_hidden_option(*args, **kwargs):  # show command only with --full-help
     input_args.add_argument('--fastq_list', type=str, help='text file with list of FASTQ files, one file per line'
                                                            ', leave empty line between samples')
 
+    # TODO: add nargs="+" to support multiple files
+    parser.add_argument('--illumina_bam', type=str, help='sorted and indexed file with Illumina '
+                                                         'reads from the same sample')
+
     parser.add_argument('--prefix', '-p', type=str,
                         help='experiment name; to be used for folder and file naming; default is OUT', default="OUT")
     parser.add_argument('--labels', '-l', nargs='+', type=str,
@@ -350,36 +354,43 @@ def check_input_files(args):
                 if args.input_data.input_type == "save":
                     saves = glob.glob(in_file + "*")
                     if not saves:
-                        print("ERROR! Input files " + in_file + "* do not exist")
+                        logger.critical("Input files " + in_file + "* do not exist")
                     continue
                 if not os.path.isfile(in_file):
-                    print("ERROR! Input file " + in_file + " does not exist")
+                    logger.critical("Input file " + in_file + " does not exist")
                     exit(-1)
                 if args.input_data.input_type == "bam":
                     bamfile_in = pysam.AlignmentFile(in_file, "rb")
                     if not bamfile_in.has_index():
-                        print("ERROR! BAM file " + in_file + " is not indexed, run samtools sort and samtools index")
+                        logger.critical("BAM file " + in_file + " is not indexed, run samtools sort and samtools index")
                         exit(-1)
                     bamfile_in.close()
 
+    if args.illumina_bam is not None:
+        bamfile_in = pysam.AlignmentFile(args.illumina_bam, "rb")
+        if not bamfile_in.has_index():
+            logger.critical("BAM file " + args.illumina_bam + " is not indexed, run samtools sort and samtools index")
+            exit(-1)
+        bamfile_in.close()
+
     if args.cage is not None:
         logger.critical("CAGE data is not supported yet")
         exit(-1)
         if not os.path.isfile(args.cage):
-            print("ERROR! Bed file with CAGE peaks " + args.cage + " does not exist")
+            logger.critical("Bed file with CAGE peaks " + args.cage + " does not exist")
             exit(-1)
 
     if args.genedb is not None:
         if not os.path.isfile(args.genedb):
-            print("ERROR! Gene database " + args.genedb + " does not exist")
+            logger.critical("Gene database " + args.genedb + " does not exist")
             exit(-1)
     else:
         args.no_junc_bed = True
 
     if args.read_assignments is not None:
         for r in args.read_assignments:
             if not glob.glob(r + "*"):
-                print("No files found with prefix " + str(r))
+                logger.critical("No files found with prefix " + str(r))
                 exit(-1)
 
 

diff --git a/misc/correction_only.py b/misc/correction_only.py
@@ -0,0 +1,101 @@
+import argparse
+import pysam
+import gffutils
+import os
+
+from src.illumina_exon_corrector import IlluminaExonCorrector
+from src.alignment_info import AlignmentInfo
+from src.common import junctions_from_blocks, overlaps
+from src.short_utils import get_region_from_db
+from src.correction_stats import CorrectionStats, Stats
+from src.gtf2db import gtf2db
+
+def parse_args():
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("--short", "-s", help="input sam/bam file containing alignment of the short reads", 
+                        type=str, dest = "short", required = True)
+    parser.add_argument("--ont", "-i", help="input sam/bam file containing alignment of the long reads", 
+                        type=str, dest = "ont", required = True)
+    parser.add_argument("--reference", "-r", help="input reference gtf file",
+                        type=str, dest = "ref", required = True)
+    parser.add_argument("--output", "-o", help="output folder location",
+                        type=str, dest = "out", required = True)
+    return parser.parse_args()
+
+def get_reference_db(ref_file, out_folder):
+    if(ref_file[-2:] != "db"):
+        gtf = ref_file
+        args.output_exists = os.path.exists(out_folder)
+        if not args.output_exists:
+            os.makedirs(out_folder)
+        ref = os.path.join(out_folder, "mouse_reference.db")
+        gtf2db(gtf, ref, True)
+    else:
+        ref = ref_file
+    return gffutils.FeatureDB(ref)
+
+
+args = parse_args()
+short_file = args.short
+long_file = pysam.AlignmentFile(args.ont, "rb")
+ref_db = get_reference_db(args.ref, args.out)
+gene_list = list(ref_db.features_of_type('gene', order_by=('seqid', 'start')))
+correction = CorrectionStats(ref_db, long_file)
+categorized = []
+current_region = (0,0)
+chromosome = 0
+genes = []
+before = 0
+after = 0
+for gene in gene_list:
+    if overlaps((gene.start,gene.end), current_region) and gene.seqid == chromosome:
+        genes.append(gene)
+        current_region = (current_region[0], max(current_region[1], gene.end))
+    else:
+        if genes :
+            corrector = IlluminaExonCorrector(chromosome, current_region[0], current_region[1], short_file)
+            reference_introns = get_region_from_db(ref_db, (chromosome, current_region[0], current_region[1]))
+            for alignment in long_file.fetch(chromosome, start = current_region[0], stop = current_region[1]):
+                ai = AlignmentInfo(alignment)
+                if not ai.read_exons:
+                    print("Read has no aligned exons")
+                    continue
+                exons = ai.read_exons
+                introns = set(junctions_from_blocks(exons))
+                corrected_exons = corrector.correct_read(ai)
+                corrected_introns = set(junctions_from_blocks(corrected_exons))
+                before += len(introns.intersection(reference_introns))
+                after += len(corrected_introns.intersection(reference_introns))
+                print("Chromosome:", chromosome)
+                #categorized.append(correction.read_stats(introns, corrected_introns, alignment))
+                categorized.extend(correction.intron_stats(junctions_from_blocks(exons), junctions_from_blocks(corrected_exons), alignment))
+                print("corrected exons:", corrected_exons)
+        genes = [gene]
+        current_region = (gene.start, gene.end)
+        chromosome = gene.seqid
+if genes:
+    corrector = IlluminaExonCorrector(chromosome, current_region[0], current_region[1], short_file)
+    reference_introns = get_region_from_db(ref_db, (chromosome, current_region[0], current_region[1]))
+    for alignment in long_file.fetch(chromosome, start = current_region[0], stop = current_region[1]):
+        ai = AlignmentInfo(alignment)
+        if not ai.read_exons:
+            print("Read has no aligned exons")
+            continue
+        exons = ai.read_exons
+        introns = set(junctions_from_blocks(exons))
+        corrected_exons = corrector.correct_read(ai)
+        corrected_introns = set(junctions_from_blocks(corrected_exons))
+        before += len(introns.intersection(reference_introns))
+        after += len(corrected_introns.intersection(reference_introns))
+        print("Chromosome:", chromosome)
+        #categorized.append(correction.read_stats(introns, corrected_introns, alignment))
+        categorized.extend(correction.intron_stats(junctions_from_blocks(exons), junctions_from_blocks(corrected_exons), alignment))
+        print("corrected exons:", corrected_exons)
+
+print("Number of correct introns before correction:", before)
+print("Number of correct introns after correction:", after)
+print("Number of true positive reads:", categorized.count(Stats.true_positive))
+print("Number of false positive reads:", categorized.count(Stats.false_positive))
+print("Number of true negative reads:", categorized.count(Stats.true_negative))
+print("Number of false negative reads:", categorized.count(Stats.false_negative))
+
diff --git a/src/alignment_processor.py b/src/alignment_processor.py
@@ -25,6 +25,8 @@
 from .polya_verification import PolyAFixer
 from .exon_corrector import ExonCorrector
 from .alignment_info import AlignmentInfo
+from .illumina_exon_corrector import IlluminaExonCorrector, VoidExonCorrector
+
 
 logger = logging.getLogger('IsoQuant')
 
@@ -264,14 +266,19 @@ def process_alignments_in_region(self, current_region, alignment_storage):
         logger.debug("Processing region %s" % str(current_region))
         gene_info = self.get_gene_info_for_region(current_region)
         if gene_info.empty():
-            assignment_storage = self.process_intergenic(alignment_storage)
+            assignment_storage = self.process_intergenic(alignment_storage, current_region)
         else:
             assignment_storage = self.process_genic(alignment_storage, gene_info)
 
         return gene_info, assignment_storage
 
-    def process_intergenic(self, alignment_storage):
+    def process_intergenic(self, alignment_storage, region):
         assignment_storage = []
+        if self.params.illumina_bam is not None:
+            corrector = IlluminaExonCorrector(self.chr_id, region[0], region[1], self.params.illumina_bam)
+        else:
+            corrector = VoidExonCorrector()
+
         for bam_index, alignment in alignment_storage:
             if alignment.reference_id == -1 or alignment.is_supplementary or \
                     (self.params.no_secondary and alignment.is_secondary):
@@ -307,7 +314,7 @@ def process_intergenic(self, alignment_storage):
             read_assignment.polya_info = alignment_info.polya_info
             read_assignment.cage_found = len(alignment_info.cage_hits) > 0
             read_assignment.exons = alignment_info.read_exons
-            read_assignment.corrected_exons = alignment_info.read_exons
+            read_assignment.corrected_exons = corrector.correct_read(alignment_info)
             read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)
 
             read_assignment.read_group = self.read_groupper.get_group_id(alignment, self.bam_merger.bam_pairs[bam_index][1])
@@ -482,4 +489,4 @@ def check_antisense(self, read_assignment):
             if match.match_classification not in \
                     [MatchClassification.novel_in_catalog, MatchClassification.novel_not_in_catalog]:
                 match.match_classification = MatchClassification.antisense
-            match.match_subclassifications.append(MatchEvent(MatchEventSubtype.antisense))
+            match.match_subclassifications.append(MatchEvent(MatchEventSubtype.antisense))
diff --git a/src/correction_stats.py b/src/correction_stats.py
@@ -0,0 +1,150 @@
+from enum import Enum, unique
+
+from .common import junctions_from_blocks, overlaps
+
+
+@unique
+class Stats(Enum):
+    true_negative = 1  #correct before and not changed
+    false_negative = 2 #either changed but not correct after or not correct before and not changed
+    true_positive = 3  #changed and it is correct after
+    false_positive = 4 #changed and it is not correct after but was correct before
+
+
+class CorrectionStats:
+
+    def __init__(self, reference_db, long_reads):
+
+        self.reference = reference_db
+        self.long_reads = long_reads
+
+        self.name_map = self.map_names(self.reference, self.long_reads)
+
+    def map_names(self, ref, long_reads):
+        transcripts = list(ref.features_of_type('transcript', order_by=('seqid', 'start')))
+        names = {}
+        for alignment in long_reads.fetch():
+            name = alignment.query_name
+            name = name.split('_')[0]
+            if not name in names.keys():
+                for t in transcripts:
+                    if name in t["transcript_id"][0]:
+                        names[name] = t
+                        break
+        return names
+
+    def get_introns_from_transcript(self, transcript):
+        exons = []
+        for e in self.reference.children(transcript, order_by='start'):
+            if e.featuretype == 'exon':
+                    exons.append((e.start, e.end))
+        introns= junctions_from_blocks(exons)
+        introns = set(introns)
+        return introns
+
+    def read_stats(self, introns, corrected_introns, alignment):
+        name = alignment.query_name
+        name = name.split('_')[0]
+        transcript = self.name_map[name]
+        reference_introns = self.get_introns_from_transcript(transcript)
+        n_ref = len(reference_introns)
+        n_before = len(introns.intersection(reference_introns))
+        n_after = len(corrected_introns.intersection(reference_introns))
+        changed = not(introns == corrected_introns)
+        correct_before = (n_before == len(introns))
+        correct_after = (n_after == len(corrected_introns))
+        if changed:
+            if correct_after:
+                return Stats.true_positive
+            elif not correct_before:
+                #if len(introns.intersection(reference_introns)) > len(corrected_introns.intersection(reference_introns)):
+                #    print("False Positive in Read", alignment.query_name)
+                #    print("Before:", sorted(introns))
+                #    print("After:", sorted(corrected_introns))
+                #    return Stats.false_positive
+                #else:
+                print("Change but wrong before and after in Read", alignment.query_name, "Correct before:", len(introns.intersection(reference_introns)), "Correct after:", len(corrected_introns.intersection(reference_introns)))
+                return Stats.false_negative
+            else:
+                print("False Positive in Read", alignment.query_name)
+                print("Before:", sorted(introns))
+                print("After:", sorted(corrected_introns))
+                return Stats.false_positive
+        else:
+            if correct_before:
+                return Stats.true_negative
+            elif n_before != 0:
+                print("No correction but wrong Introns in Read", alignment.query_name)
+                return Stats.false_negative
+            else:
+                return Stats.true_negative
+
+    def stats_single(self, before, after, reference_introns):
+        if before in reference_introns:
+            print("False positive, before:", before, "after:", after)
+            return Stats.false_positive
+        elif after in reference_introns:
+            print("True positive, before:", before, "after:", after)
+            return Stats.true_positive
+        else:
+            print("False negative with change, before:", before, "after:", after)
+            return Stats.false_negative
+
+    def intron_stats(self, introns, corrected_introns, alignment):
+        classification = []
+        name = alignment.query_name
+        name = name.split('_')[0]
+        transcript = self.name_map[name]
+        reference_introns = self.get_introns_from_transcript(transcript)
+        n_ref = len(reference_introns)
+        changed = not(introns == corrected_introns)
+        unchanged_introns = [intron for intron in introns if intron in corrected_introns]
+        #unchanged_introns = introns.intersection(corrected_introns)
+        for intron in unchanged_introns:
+            if intron in reference_introns:
+                print("True negative", intron)
+                classification.append(Stats.true_negative)
+            else:
+                classification.append(Stats.false_negative)
+        #diff_before = list(sorted(introns - unchanged_introns))
+        #diff_after = list(sorted(corrected_introns - unchanged_introns))
+        diff_before = [intron for intron in introns if not intron in unchanged_introns]
+        diff_after = [intron for intron in corrected_introns if not intron in unchanged_introns]
+        if len(diff_before) == len(diff_after):
+            for i in range(len(diff_before)):
+                classification.append(self.stats_single(diff_before[i], diff_after[i], reference_introns))
+        elif len(diff_before) < len(diff_after):
+            j = 0
+            for i in range(len(diff_before)):
+                b = diff_before[i]
+                a = diff_after[i+j]
+                if b[0] == a[0] and b[1] == a[1]-4:
+                    classification.append(self.stats_single(b, a, reference_introns))
+                elif b[1] == a[1] and a[0] == b[0]-4:
+                    classification.append(self.stats_single(b, a, reference_introns))
+                else:
+                    print(len(diff_before))
+                    print(len(diff_after))
+                    print(i)
+                    print(j)
+                    right = diff_after[i+j+1]
+                    if overlaps(a, b) and overlaps(b, right):
+                        j = j + 1
+                        if b in reference_introns:
+                            classification.append(Stats.false_positive)
+                            print("False positive, before:", b, "after:", a, right)
+                        elif a in reference_introns and right in reference_introns:
+                            classification.append(Stats.true_positive)
+                            print("True positive, before:", b, "after:", a, right)
+                        else:
+                            classification.append(Stats.false_negative)
+                            print("False negative with change, before:", b, "after:", a, right)
+                    elif overlaps(a, b):
+                        classification.append(self.stats_single(b, a, reference_introns))
+                    elif overlaps(b, right):
+                        classification.append(self.stats_single(b, right, reference_introns))
+        else:
+            print("more exons before than after")
+            print(diff_before)
+            print(diff_after)
+        return classification