Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Short reads #105

Merged
merged 48 commits into from
Aug 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
6aa06a0
add option for illumina reads
andrewprzh Jun 20, 2023
9b3716d
Functionalities for comparing introns from IsoQuant and Illumina reads
rkpfeil Jun 1, 2023
7cb37e3
Changed default file name for database
rkpfeil Jun 5, 2023
57301ef
Output of number of equal counts
rkpfeil Jun 5, 2023
93dfa19
added first correction of introns with short reads
rkpfeil Jun 19, 2023
7bc9a1a
Cosmetic changes
rkpfeil Jun 21, 2023
bd3e294
moved get introns function into the corrector
rkpfeil Jun 21, 2023
f5aa6a0
add correction step to process_intergenic
rkpfeil Jun 21, 2023
c27b933
changed correct_exons to correct_read
rkpfeil Jun 21, 2023
1b5777f
adding second criterion for correcting introns, test
rkpfeil Jun 28, 2023
345e80e
Debugging
rkpfeil Jun 28, 2023
685c7d3
Debugging
rkpfeil Jun 28, 2023
5c94942
debuggung
rkpfeil Jun 28, 2023
231e270
debugging
rkpfeil Jun 28, 2023
c744960
debugging
rkpfeil Jun 28, 2023
6d5340d
debugging
rkpfeil Jun 28, 2023
cbf0b46
added check for introns from short reads longer than the long read
rkpfeil Jun 28, 2023
1262ef8
Removing wrong correction
rkpfeil Jun 29, 2023
bd7ac21
third try for correcting based on offsets
rkpfeil Jun 29, 2023
a1a3e11
correcting through count and difference
rkpfeil Jun 29, 2023
0083e7b
correction of difference 4
rkpfeil Jun 30, 2023
74a0ff1
Added statistics for reads and introns. Included skipped exons. Curre…
rkpfeil Jul 20, 2023
e469724
reverting to version 3 to run on server
rkpfeil Jul 21, 2023
311debc
switching to read statistics on server
rkpfeil Jul 31, 2023
d54ce8b
fixed reference db creation
rkpfeil Jul 31, 2023
96b7ed9
fixed reference db creation
rkpfeil Jul 31, 2023
55f7036
reverting to version 4. intron stats fixed
rkpfeil Jul 31, 2023
1e15f80
corrected false brackets
rkpfeil Jul 31, 2023
248e67f
version 5 for stats
rkpfeil Jul 31, 2023
f90fa8c
version 6 for stats
rkpfeil Jul 31, 2023
d990593
version 7 for stats
rkpfeil Aug 1, 2023
7ed19d1
version 8 for stats
rkpfeil Aug 1, 2023
3cb0173
version 9 for stats
rkpfeil Aug 1, 2023
814e6be
Version 10 for stats
rkpfeil Aug 1, 2023
035883f
version 11
rkpfeil Aug 2, 2023
cd736ad
version 2 for stats
rkpfeil Aug 2, 2023
1f580ac
version 1 for stats
rkpfeil Aug 2, 2023
fa08e9a
version 11 and intron statistics fix
rkpfeil Aug 3, 2023
5538d2e
Added first test. Reworking the code slightly
rkpfeil Aug 7, 2023
24994e9
added tests and toy data
rkpfeil Aug 10, 2023
4704979
fixed wrong file name in test
rkpfeil Aug 10, 2023
4e9a218
Added comments and removed unnecessary lines
rkpfeil Aug 16, 2023
b057b6d
removed unnecessary folder
rkpfeil Aug 17, 2023
6ac68e2
fixing errors
rkpfeil Aug 17, 2023
68e7385
moving and renaming to avoid being seen as tests
rkpfeil Aug 18, 2023
781218d
Fixed file calling in tests
rkpfeil Aug 18, 2023
7f73d83
whitespace correction
rkpfeil Aug 21, 2023
4a38053
whitespace correction
rkpfeil Aug 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions isoquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help
input_args.add_argument('--fastq_list', type=str, help='text file with list of FASTQ files, one file per line'
', leave empty line between samples')

# TODO: add nargs="+" to support multiple files
parser.add_argument('--illumina_bam', type=str, help='sorted and indexed file with Illumina '
'reads from the same sample')

parser.add_argument('--prefix', '-p', type=str,
help='experiment name; to be used for folder and file naming; default is OUT', default="OUT")
parser.add_argument('--labels', '-l', nargs='+', type=str,
Expand Down Expand Up @@ -350,36 +354,43 @@ def check_input_files(args):
if args.input_data.input_type == "save":
saves = glob.glob(in_file + "*")
if not saves:
print("ERROR! Input files " + in_file + "* do not exist")
logger.critical("Input files " + in_file + "* do not exist")
continue
if not os.path.isfile(in_file):
print("ERROR! Input file " + in_file + " does not exist")
logger.critical("Input file " + in_file + " does not exist")
exit(-1)
if args.input_data.input_type == "bam":
bamfile_in = pysam.AlignmentFile(in_file, "rb")
if not bamfile_in.has_index():
print("ERROR! BAM file " + in_file + " is not indexed, run samtools sort and samtools index")
logger.critical("BAM file " + in_file + " is not indexed, run samtools sort and samtools index")
exit(-1)
bamfile_in.close()

if args.illumina_bam is not None:
bamfile_in = pysam.AlignmentFile(args.illumina_bam, "rb")
if not bamfile_in.has_index():
logger.critical("BAM file " + args.illumina_bam + " is not indexed, run samtools sort and samtools index")
exit(-1)
bamfile_in.close()

if args.cage is not None:
logger.critical("CAGE data is not supported yet")
exit(-1)
if not os.path.isfile(args.cage):
print("ERROR! Bed file with CAGE peaks " + args.cage + " does not exist")
logger.critical("Bed file with CAGE peaks " + args.cage + " does not exist")
exit(-1)

if args.genedb is not None:
if not os.path.isfile(args.genedb):
print("ERROR! Gene database " + args.genedb + " does not exist")
logger.critical("Gene database " + args.genedb + " does not exist")
exit(-1)
else:
args.no_junc_bed = True

if args.read_assignments is not None:
for r in args.read_assignments:
if not glob.glob(r + "*"):
print("No files found with prefix " + str(r))
logger.critical("No files found with prefix " + str(r))
exit(-1)


Expand Down
101 changes: 101 additions & 0 deletions misc/correction_only.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import argparse
import pysam
import gffutils
import os

from src.illumina_exon_corrector import IlluminaExonCorrector
from src.alignment_info import AlignmentInfo
from src.common import junctions_from_blocks, overlaps
from src.short_utils import get_region_from_db
from src.correction_stats import CorrectionStats, Stats
from src.gtf2db import gtf2db

def parse_args():
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("--short", "-s", help="input sam/bam file containing alignment of the short reads",
type=str, dest = "short", required = True)
parser.add_argument("--ont", "-i", help="input sam/bam file containing alignment of the long reads",
type=str, dest = "ont", required = True)
parser.add_argument("--reference", "-r", help="input reference gtf file",
type=str, dest = "ref", required = True)
parser.add_argument("--output", "-o", help="output folder location",
type=str, dest = "out", required = True)
return parser.parse_args()

def get_reference_db(ref_file, out_folder):
if(ref_file[-2:] != "db"):
gtf = ref_file
args.output_exists = os.path.exists(out_folder)
if not args.output_exists:
os.makedirs(out_folder)
ref = os.path.join(out_folder, "mouse_reference.db")
gtf2db(gtf, ref, True)
else:
ref = ref_file
return gffutils.FeatureDB(ref)


args = parse_args()
short_file = args.short
long_file = pysam.AlignmentFile(args.ont, "rb")
ref_db = get_reference_db(args.ref, args.out)
gene_list = list(ref_db.features_of_type('gene', order_by=('seqid', 'start')))
correction = CorrectionStats(ref_db, long_file)
categorized = []
current_region = (0,0)
chromosome = 0
genes = []
before = 0
after = 0
for gene in gene_list:
if overlaps((gene.start,gene.end), current_region) and gene.seqid == chromosome:
genes.append(gene)
current_region = (current_region[0], max(current_region[1], gene.end))
else:
if genes :
corrector = IlluminaExonCorrector(chromosome, current_region[0], current_region[1], short_file)
reference_introns = get_region_from_db(ref_db, (chromosome, current_region[0], current_region[1]))
for alignment in long_file.fetch(chromosome, start = current_region[0], stop = current_region[1]):
ai = AlignmentInfo(alignment)
if not ai.read_exons:
print("Read has no aligned exons")
continue
exons = ai.read_exons
introns = set(junctions_from_blocks(exons))
corrected_exons = corrector.correct_read(ai)
corrected_introns = set(junctions_from_blocks(corrected_exons))
before += len(introns.intersection(reference_introns))
after += len(corrected_introns.intersection(reference_introns))
print("Chromosome:", chromosome)
#categorized.append(correction.read_stats(introns, corrected_introns, alignment))
categorized.extend(correction.intron_stats(junctions_from_blocks(exons), junctions_from_blocks(corrected_exons), alignment))
print("corrected exons:", corrected_exons)
genes = [gene]
current_region = (gene.start, gene.end)
chromosome = gene.seqid
if genes:
corrector = IlluminaExonCorrector(chromosome, current_region[0], current_region[1], short_file)
reference_introns = get_region_from_db(ref_db, (chromosome, current_region[0], current_region[1]))
for alignment in long_file.fetch(chromosome, start = current_region[0], stop = current_region[1]):
ai = AlignmentInfo(alignment)
if not ai.read_exons:
print("Read has no aligned exons")
continue
exons = ai.read_exons
introns = set(junctions_from_blocks(exons))
corrected_exons = corrector.correct_read(ai)
corrected_introns = set(junctions_from_blocks(corrected_exons))
before += len(introns.intersection(reference_introns))
after += len(corrected_introns.intersection(reference_introns))
print("Chromosome:", chromosome)
#categorized.append(correction.read_stats(introns, corrected_introns, alignment))
categorized.extend(correction.intron_stats(junctions_from_blocks(exons), junctions_from_blocks(corrected_exons), alignment))
print("corrected exons:", corrected_exons)

print("Number of correct introns before correction:", before)
print("Number of correct introns after correction:", after)
print("Number of true positive reads:", categorized.count(Stats.true_positive))
print("Number of false positive reads:", categorized.count(Stats.false_positive))
print("Number of true negative reads:", categorized.count(Stats.true_negative))
print("Number of false negative reads:", categorized.count(Stats.false_negative))

15 changes: 11 additions & 4 deletions src/alignment_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
from .polya_verification import PolyAFixer
from .exon_corrector import ExonCorrector
from .alignment_info import AlignmentInfo
from .illumina_exon_corrector import IlluminaExonCorrector, VoidExonCorrector


logger = logging.getLogger('IsoQuant')

Expand Down Expand Up @@ -264,14 +266,19 @@ def process_alignments_in_region(self, current_region, alignment_storage):
logger.debug("Processing region %s" % str(current_region))
gene_info = self.get_gene_info_for_region(current_region)
if gene_info.empty():
assignment_storage = self.process_intergenic(alignment_storage)
assignment_storage = self.process_intergenic(alignment_storage, current_region)
else:
assignment_storage = self.process_genic(alignment_storage, gene_info)

return gene_info, assignment_storage

def process_intergenic(self, alignment_storage):
def process_intergenic(self, alignment_storage, region):
assignment_storage = []
if self.params.illumina_bam is not None:
corrector = IlluminaExonCorrector(self.chr_id, region[0], region[1], self.params.illumina_bam)
else:
corrector = VoidExonCorrector()

for bam_index, alignment in alignment_storage:
if alignment.reference_id == -1 or alignment.is_supplementary or \
(self.params.no_secondary and alignment.is_secondary):
Expand Down Expand Up @@ -307,7 +314,7 @@ def process_intergenic(self, alignment_storage):
read_assignment.polya_info = alignment_info.polya_info
read_assignment.cage_found = len(alignment_info.cage_hits) > 0
read_assignment.exons = alignment_info.read_exons
read_assignment.corrected_exons = alignment_info.read_exons
read_assignment.corrected_exons = corrector.correct_read(alignment_info)
read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)

read_assignment.read_group = self.read_groupper.get_group_id(alignment, self.bam_merger.bam_pairs[bam_index][1])
Expand Down Expand Up @@ -482,4 +489,4 @@ def check_antisense(self, read_assignment):
if match.match_classification not in \
[MatchClassification.novel_in_catalog, MatchClassification.novel_not_in_catalog]:
match.match_classification = MatchClassification.antisense
match.match_subclassifications.append(MatchEvent(MatchEventSubtype.antisense))
match.match_subclassifications.append(MatchEvent(MatchEventSubtype.antisense))
150 changes: 150 additions & 0 deletions src/correction_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from enum import Enum, unique

from .common import junctions_from_blocks, overlaps

rkpfeil marked this conversation as resolved.
Show resolved Hide resolved

@unique
class Stats(Enum):
true_negative = 1 #correct before and not changed
false_negative = 2 #either changed but not correct after or not correct before and not changed
true_positive = 3 #changed and it is correct after
false_positive = 4 #changed and it is not correct after but was correct before


class CorrectionStats:

def __init__(self, reference_db, long_reads):

self.reference = reference_db
self.long_reads = long_reads

self.name_map = self.map_names(self.reference, self.long_reads)

def map_names(self, ref, long_reads):
transcripts = list(ref.features_of_type('transcript', order_by=('seqid', 'start')))
names = {}
for alignment in long_reads.fetch():
name = alignment.query_name
name = name.split('_')[0]
if not name in names.keys():
for t in transcripts:
if name in t["transcript_id"][0]:
names[name] = t
break
return names

def get_introns_from_transcript(self, transcript):
exons = []
for e in self.reference.children(transcript, order_by='start'):
if e.featuretype == 'exon':
exons.append((e.start, e.end))
introns= junctions_from_blocks(exons)
introns = set(introns)
return introns

def read_stats(self, introns, corrected_introns, alignment):
name = alignment.query_name
name = name.split('_')[0]
transcript = self.name_map[name]
reference_introns = self.get_introns_from_transcript(transcript)
n_ref = len(reference_introns)
n_before = len(introns.intersection(reference_introns))
n_after = len(corrected_introns.intersection(reference_introns))
changed = not(introns == corrected_introns)
correct_before = (n_before == len(introns))
correct_after = (n_after == len(corrected_introns))
if changed:
if correct_after:
return Stats.true_positive
elif not correct_before:
#if len(introns.intersection(reference_introns)) > len(corrected_introns.intersection(reference_introns)):
# print("False Positive in Read", alignment.query_name)
# print("Before:", sorted(introns))
# print("After:", sorted(corrected_introns))
# return Stats.false_positive
#else:
print("Change but wrong before and after in Read", alignment.query_name, "Correct before:", len(introns.intersection(reference_introns)), "Correct after:", len(corrected_introns.intersection(reference_introns)))
return Stats.false_negative
else:
print("False Positive in Read", alignment.query_name)
print("Before:", sorted(introns))
print("After:", sorted(corrected_introns))
return Stats.false_positive
else:
if correct_before:
return Stats.true_negative
elif n_before != 0:
print("No correction but wrong Introns in Read", alignment.query_name)
return Stats.false_negative
else:
return Stats.true_negative

def stats_single(self, before, after, reference_introns):
if before in reference_introns:
print("False positive, before:", before, "after:", after)
return Stats.false_positive
elif after in reference_introns:
print("True positive, before:", before, "after:", after)
return Stats.true_positive
else:
print("False negative with change, before:", before, "after:", after)
return Stats.false_negative

def intron_stats(self, introns, corrected_introns, alignment):
classification = []
name = alignment.query_name
name = name.split('_')[0]
transcript = self.name_map[name]
reference_introns = self.get_introns_from_transcript(transcript)
n_ref = len(reference_introns)
changed = not(introns == corrected_introns)
unchanged_introns = [intron for intron in introns if intron in corrected_introns]
#unchanged_introns = introns.intersection(corrected_introns)
for intron in unchanged_introns:
if intron in reference_introns:
print("True negative", intron)
classification.append(Stats.true_negative)
else:
classification.append(Stats.false_negative)
#diff_before = list(sorted(introns - unchanged_introns))
#diff_after = list(sorted(corrected_introns - unchanged_introns))
diff_before = [intron for intron in introns if not intron in unchanged_introns]
diff_after = [intron for intron in corrected_introns if not intron in unchanged_introns]
if len(diff_before) == len(diff_after):
for i in range(len(diff_before)):
classification.append(self.stats_single(diff_before[i], diff_after[i], reference_introns))
elif len(diff_before) < len(diff_after):
j = 0
for i in range(len(diff_before)):
b = diff_before[i]
a = diff_after[i+j]
if b[0] == a[0] and b[1] == a[1]-4:
classification.append(self.stats_single(b, a, reference_introns))
elif b[1] == a[1] and a[0] == b[0]-4:
classification.append(self.stats_single(b, a, reference_introns))
else:
print(len(diff_before))
print(len(diff_after))
print(i)
print(j)
right = diff_after[i+j+1]
if overlaps(a, b) and overlaps(b, right):
j = j + 1
if b in reference_introns:
classification.append(Stats.false_positive)
print("False positive, before:", b, "after:", a, right)
elif a in reference_introns and right in reference_introns:
classification.append(Stats.true_positive)
print("True positive, before:", b, "after:", a, right)
else:
classification.append(Stats.false_negative)
print("False negative with change, before:", b, "after:", a, right)
elif overlaps(a, b):
classification.append(self.stats_single(b, a, reference_introns))
elif overlaps(b, right):
classification.append(self.stats_single(b, right, reference_introns))
else:
print("more exons before than after")
print(diff_before)
print(diff_after)
return classification
Loading