ablab · heidi-holappa · Aug 8, 2023 · Aug 17, 2023 · Aug 21, 2023 · Aug 22, 2023
diff --git a/src/alignment_processor.py b/src/alignment_processor.py
@@ -307,6 +307,7 @@ def process_intergenic(self, alignment_storage):
             read_assignment.polya_info = alignment_info.polya_info
             read_assignment.cage_found = len(alignment_info.cage_hits) > 0
             read_assignment.exons = alignment_info.read_exons
+            read_assignment.cigartuples = alignment.cigartuples
             read_assignment.corrected_exons = alignment_info.read_exons
             read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)
 
@@ -358,6 +359,7 @@ def process_genic(self, alignment_storage, gene_info):
             read_assignment.polya_info = alignment_info.polya_info
             read_assignment.cage_found = len(alignment_info.cage_hits) > 0
             read_assignment.exons = alignment_info.read_exons
+            read_assignment.cigartuples = alignment.cigartuples
             read_assignment.corrected_exons = exon_corrector.correct_assigned_read(alignment_info,
                                                                                    read_assignment)
             read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)

diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py
@@ -26,6 +26,11 @@
 from .long_read_profiles import CombinedProfileConstructor
 from .polya_finder import PolyAInfo
 
+from .transcript_splice_site_corrector import (
+    count_deletions_for_splice_site_locations, 
+    correct_splice_site_errors,
+    generate_updated_exon_list
+    )
 
 logger = logging.getLogger('IsoQuant')
 
@@ -130,6 +135,7 @@ def process(self, read_assignment_storage):
         self.construct_assignment_based_isoforms(read_assignment_storage)
         self.assign_reads_to_models(read_assignment_storage)
         self.filter_transcripts()
+        self.correct_transcripts()
 
         if self.params.genedb:
             self.create_extended_annotation()
@@ -198,6 +204,81 @@ def compare_models_with_known(self):
             model.add_additional_attribute("alternatives", event_string)
             self.transcript2transcript.append(assignment)
 
+    def correct_transcripts(self):
+        """Apply splice site correction to every stored transcript model.
+
+        For each model in ``self.transcript_model_storage`` the reads stored
+        under its transcript id in ``self.transcript_read_ids`` are passed,
+        together with the model's exon blocks, to
+        ``correct_transcript_splice_sites``. The model's ``exon_blocks`` are
+        replaced in place only when that call returns a non-empty result.
+        """
+        for model in self.transcript_model_storage:
+            exons = model.exon_blocks
+            corrected_exons = self.correct_transcript_splice_sites(
+                exons, self.transcript_read_ids[model.transcript_id])
+            if not corrected_exons:
+                # Nothing was corrected for this model: keep its exons as they are
+                continue
+            logger.debug(f"correct_transcripts. Corrected exons: {corrected_exons}, original exons: {exons}")
+            model.exon_blocks = corrected_exons
+
+    def correct_transcript_splice_sites(self, exons: list, assigned_reads: list):
+        """Try to correct the splice sites of a single transcript model.
+
+        :param exons: list of coordinate pairs
+        :param assigned_reads: list of ReadAssignment; the strand,
+            corrected_exons and cigartuples attributes of the reads are used
+        :return: a list of corrected exons if correction takes place,
+            None otherwise
+
+        self.chr_record (FASTA record, i.e. a single chromosome from a
+        reference) is passed on to correct_splice_site_errors.
+        """
+        # Constants
+        ACCEPTED_DEL_CASES = [3, 4, 5, 6]
+        SUPPORTED_STRANDS = ['+', '-']
+        THRESHOLD_CASES_AT_LOCATION = 0.7
+        MIN_N_OF_ALIGNED_READS = 5
+        WINDOW_SIZE = 8
+
+        MORE_CONSERVATIVE_STRATEGY = False
+
+        # Without reads there is neither a strand to look at nor any
+        # deletion to count, so no correction can take place
+        if not assigned_reads:
+            return None
+
+        # The strand is taken from the first assigned read only
+        # NOTE(review): presumably all reads of a model share it - confirm
+        strand = assigned_reads[0].strand
+        if strand not in SUPPORTED_STRANDS:
+            return None
+
+        # Passed to count_deletions_for_splice_site_locations for every read;
+        # its return value is not used, so it presumably fills this dict in place
+        splice_site_cases = {}
+        for read_assignment in assigned_reads:
+            cigartuples = read_assignment.cigartuples
+            if not cigartuples:
+                # No CIGAR stored for this read: skip it before touching its exons
+                continue
+            read_exons = read_assignment.corrected_exons
+            count_deletions_for_splice_site_locations(
+                read_exons[0][0],
+                read_exons[-1][1],
+                cigartuples,
+                exons,
+                splice_site_cases,
+                WINDOW_SIZE)
+
+        corrected_exons = correct_splice_site_errors(
+            splice_site_cases,
+            MIN_N_OF_ALIGNED_READS,
+            ACCEPTED_DEL_CASES,
+            THRESHOLD_CASES_AT_LOCATION,
+            MORE_CONSERVATIVE_STRATEGY,
+            strand,
+            self.chr_record
+        )
+
+        if not corrected_exons:
+            return None
+
+        return generate_updated_exon_list(
+            splice_site_cases,
+            corrected_exons,
+            exons
+        )
+
+
     def filter_transcripts(self):
         filtered_storage = []
         confirmed_transcipt_ids = set()

diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py
@@ -477,6 +477,7 @@ def __init__(self, read_id, assignment_type, match=None):
         self.assignment_id = ReadAssignment.assignment_id_generator.increment()
         self.read_id = read_id
         self.exons = None
+        self.cigartuples = None
         self.corrected_exons = None
         self.corrected_introns = None
         self.gene_info = None
@@ -507,6 +508,9 @@ def deserialize(cls, infile, gene_info):
         read_assignment.assignment_id = read_int(infile)
         read_assignment.read_id = read_string(infile)
         read_assignment.exons = read_list_of_pairs(infile, read_int)
+        read_assignment.cigartuples = read_list_of_pairs(infile, read_int)
+        if not read_assignment.cigartuples:
+            read_assignment.cigartuples = None
         read_assignment.corrected_exons = read_list_of_pairs(infile, read_int)
         read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)
         read_assignment.gene_info = gene_info
@@ -532,6 +536,10 @@ def serialize(self, outfile):
         write_int(self.assignment_id, outfile)
         write_string(self.read_id, outfile)
         write_list_of_pairs(self.exons, outfile, write_int)
+        if self.cigartuples is None:
+            write_list_of_pairs([], outfile, write_int)
+        else:
+            write_list_of_pairs(self.cigartuples, outfile, write_int)
         write_list_of_pairs(self.corrected_exons, outfile, write_int)
         write_bool_array([self.multimapper, self.polyA_found, self.cage_found], outfile)
         write_int_neg(self.polya_info.external_polya_pos, outfile)