Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Keep cigartuples #108

Open
wants to merge 46 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
1b26e04
keep cigartuples in read assignment
andrewprzh Aug 8, 2023
27f52fe
template for transcript correction
andrewprzh Aug 17, 2023
65f324d
Add initial implementation for transcript_splice_site_corrector and u…
heidi-holappa Aug 21, 2023
bd52a52
Fix issues with two unittests
heidi-holappa Aug 22, 2023
99c9d39
Fix issue with two datastructures having the same var name
heidi-holappa Aug 22, 2023
98619db
Refactor code into separate functions
heidi-holappa Aug 22, 2023
321be9c
Refactor code into separate functions
heidi-holappa Aug 22, 2023
f30996b
Expand tests for untested functions
heidi-holappa Aug 22, 2023
3f8aa16
Expand unittests
heidi-holappa Aug 22, 2023
3061f0f
expand unittests
heidi-holappa Aug 22, 2023
8eefa31
Fix key-issue in splice_site_dict
heidi-holappa Aug 22, 2023
cc53125
Add threshold verification to conservative strategy
heidi-holappa Aug 22, 2023
af81624
Add constant for threshold to args
heidi-holappa Aug 22, 2023
2740474
Update function name
heidi-holappa Aug 22, 2023
e2e3ed5
fix cigartuples, can be None sometimes
andrewprzh Aug 22, 2023
07ab569
Add logger.debug to see corrected exons
heidi-holappa Aug 23, 2023
c8efee0
Merge branch 'keep_cigartuples' of github.com:ablab/IsoQuant into kee…
heidi-holappa Aug 23, 2023
7953ff3
Add logger.debug to see corrected exons
heidi-holappa Aug 23, 2023
bec61f8
Add logger.debug to see corrected exons
heidi-holappa Aug 23, 2023
8e46fc1
Add logger.debug to see corrected exons
heidi-holappa Aug 23, 2023
0359146
Add logger.debug to see indel calc
heidi-holappa Aug 23, 2023
bc91708
Add debugging to see matching cases list
heidi-holappa Aug 23, 2023
054aa1c
Add debugger to cases with no cigartuples
heidi-holappa Aug 23, 2023
5ccbc81
add debug line to verify if cigartuples are found on some reads
heidi-holappa Aug 24, 2023
6e217ec
Move debug to correct transcripts
heidi-holappa Aug 25, 2023
82fdc3a
Move debug to correct transcripts
heidi-holappa Aug 25, 2023
20440ae
Move debug to correct transcripts
heidi-holappa Aug 25, 2023
83f5ae6
Move debug to correct transcripts
heidi-holappa Aug 25, 2023
f8f363f
Move debug to correct transcripts
heidi-holappa Aug 25, 2023
b37f59b
Fix bug with dict key ref
heidi-holappa Aug 25, 2023
8c240f4
Add test for GraphBasedModelConstructor
heidi-holappa Aug 25, 2023
2360b62
Check for abs value
heidi-holappa Aug 25, 2023
898f75c
Expand unittests for GraphBaseModelConstructor method
heidi-holappa Aug 25, 2023
05de4f3
Improve debugger stdouts
heidi-holappa Aug 25, 2023
c9d09b6
Update unittest after changing constant positioning
heidi-holappa Aug 28, 2023
35a0942
Move WINDOW_SIZE to main func
heidi-holappa Aug 28, 2023
9e2c29d
Move const WINDOW_SIZE upper
heidi-holappa Aug 28, 2023
5c22578
Change division to multiplication
heidi-holappa Aug 28, 2023
46d01bb
Expand tests
heidi-holappa Aug 28, 2023
5a90864
Shorten key name
heidi-holappa Aug 29, 2023
47ca8b3
Update tests after key name change
heidi-holappa Aug 29, 2023
b825a5c
fix cigartuples exactly where they needed to be
andrewprzh Aug 29, 2023
3c07e17
Change idx correction for FASTA extract
heidi-holappa Aug 30, 2023
ce52d63
Fix unittests after fixing issue with chr_record idx-correction
heidi-holappa Aug 30, 2023
aa4f08a
Merge branch 'keep_cigartuples' of github.com:ablab/IsoQuant into kee…
heidi-holappa Aug 30, 2023
fb7db12
Remove unneeded logger.debugs
heidi-holappa Aug 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/alignment_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ def process_intergenic(self, alignment_storage):
read_assignment.polya_info = alignment_info.polya_info
read_assignment.cage_found = len(alignment_info.cage_hits) > 0
read_assignment.exons = alignment_info.read_exons
read_assignment.cigartuples = alignment.cigartuples
read_assignment.corrected_exons = alignment_info.read_exons
read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)

Expand Down Expand Up @@ -358,6 +359,7 @@ def process_genic(self, alignment_storage, gene_info):
read_assignment.polya_info = alignment_info.polya_info
read_assignment.cage_found = len(alignment_info.cage_hits) > 0
read_assignment.exons = alignment_info.read_exons
read_assignment.cigartuples = alignment.cigartuples
read_assignment.corrected_exons = exon_corrector.correct_assigned_read(alignment_info,
read_assignment)
read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)
Expand Down
81 changes: 81 additions & 0 deletions src/graph_based_model_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@
from .long_read_profiles import CombinedProfileConstructor
from .polya_finder import PolyAInfo

from .transcript_splice_site_corrector import (
count_deletions_for_splice_site_locations,
correct_splice_site_errors,
generate_updated_exon_list
)

logger = logging.getLogger('IsoQuant')

Expand Down Expand Up @@ -130,6 +135,7 @@ def process(self, read_assignment_storage):
self.construct_assignment_based_isoforms(read_assignment_storage)
self.assign_reads_to_models(read_assignment_storage)
self.filter_transcripts()
self.correct_transcripts()

if self.params.genedb:
self.create_extended_annotation()
Expand Down Expand Up @@ -198,6 +204,81 @@ def compare_models_with_known(self):
model.add_additional_attribute("alternatives", event_string)
self.transcript2transcript.append(assignment)

def correct_transcripts(self):
for model in self.transcript_model_storage:
exons = model.exon_blocks
assigned_reads = self.transcript_read_ids[model.transcript_id]
corrected_exons = self.correct_transcript_splice_sites(exons, assigned_reads)
if corrected_exons:
logger.debug(f"correct_transcripts. Corrected exons: {corrected_exons}, original exons: {exons}")
model.exon_blocks = corrected_exons

def correct_transcript_splice_sites(self, exons: list, assigned_reads: list):
# exons: list of coordinate pairs
# assigned_reads: list of ReadAssignment, contains read_id and cigartuples
# self.chr_record - FASTA recored, i.e. a single chromosome from a reference
# returns: a list of corrected exons if correction takes place, None - otherwise
# TODO Heidi: insert your code here

# Constants
ACCEPTED_DEL_CASES = [3, 4, 5, 6]
SUPPORTED_STRANDS = ['+', '-']
THRESHOLD_CASES_AT_LOCATION = 0.7
MIN_N_OF_ALIGNED_READS = 5
WINDOW_SIZE = 8

MORE_CONSERVATIVE_STRATEGY = False


strand = assigned_reads[0].strand
if strand not in SUPPORTED_STRANDS:
return None

splice_site_cases = {}
# Iterate assigned_reads list and count deletions for splice site locations
for read_assignment in assigned_reads:
read_start = read_assignment.corrected_exons[0][0]
read_end = read_assignment.corrected_exons[-1][1]
cigartuples = read_assignment.cigartuples
if not cigartuples:
# logger.debug(f"Heidi: No cigar tuples for read {read_assignment.read_id}")
continue
# logger.debug(f"Heidi: Cigar tuples for read {read_assignment.read_id}: {cigartuples}")
count_deletions_for_splice_site_locations(
read_start,
read_end,
cigartuples,
exons,
splice_site_cases,
WINDOW_SIZE)



corrected_exons = correct_splice_site_errors(
splice_site_cases,
MIN_N_OF_ALIGNED_READS,
ACCEPTED_DEL_CASES,
THRESHOLD_CASES_AT_LOCATION,
MORE_CONSERVATIVE_STRATEGY,
strand,
self.chr_record
)

if not corrected_exons:
return None

cases = [str(exon) + ": " + str(splice_site_cases[exon]) for exon in corrected_exons]


updated_exons = generate_updated_exon_list(
splice_site_cases,
corrected_exons,
exons
)

return updated_exons


def filter_transcripts(self):
filtered_storage = []
confirmed_transcipt_ids = set()
Expand Down
8 changes: 8 additions & 0 deletions src/isoform_assignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,7 @@ def __init__(self, read_id, assignment_type, match=None):
self.assignment_id = ReadAssignment.assignment_id_generator.increment()
self.read_id = read_id
self.exons = None
self.cigartuples = None
self.corrected_exons = None
self.corrected_introns = None
self.gene_info = None
Expand Down Expand Up @@ -507,6 +508,9 @@ def deserialize(cls, infile, gene_info):
read_assignment.assignment_id = read_int(infile)
read_assignment.read_id = read_string(infile)
read_assignment.exons = read_list_of_pairs(infile, read_int)
read_assignment.cigartuples = read_list_of_pairs(infile, read_int)
if not read_assignment.cigartuples:
read_assignment.cigartuples = None
read_assignment.corrected_exons = read_list_of_pairs(infile, read_int)
read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)
read_assignment.gene_info = gene_info
Expand All @@ -532,6 +536,10 @@ def serialize(self, outfile):
write_int(self.assignment_id, outfile)
write_string(self.read_id, outfile)
write_list_of_pairs(self.exons, outfile, write_int)
if self.cigartuples is None:
write_list_of_pairs([], outfile, write_int)
else:
write_list_of_pairs(self.cigartuples, outfile, write_int)
write_list_of_pairs(self.corrected_exons, outfile, write_int)
write_bool_array([self.multimapper, self.polyA_found, self.cage_found], outfile)
write_int_neg(self.polya_info.external_polya_pos, outfile)
Expand Down
Loading