forked from FRED-2/OptiType
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hlatyper.py
597 lines (468 loc) · 27.6 KB
/
hlatyper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
import pandas as pd
import numpy as np
import re
import pylab
import warnings
from collections import OrderedDict
from Bio import SeqIO
from datetime import datetime
def now(start=datetime.now()):
# function argument NOT to be ever set! It gets initialized at function definition time
# so we can calculate difference without further hassle.
return str(datetime.now()-start)[:-4]
def memoize(f): # from http://code.activestate.com/recipes/578231-probably-the-fastest-memoization-decorator-in-the-/
class MemoDict(dict):
def __missing__(self, key):
ret = self[key] = f(key)
return ret
return MemoDict().__getitem__
# if we used MDINSHP=X chars this could be a full cigar parser, however, we only need M and D to
# determine a read's span on the reference sequence to create a coverage plot.
CIGAR_SLICER = re.compile(r'[0-9]+[MD]')
@memoize
def length_on_reference(cigar_string):
return sum([int(p[:-1]) for p in re.findall(CIGAR_SLICER, cigar_string)])
def feature_order(feature):
# to use it in sorted(..., key=feature_order) this returns:
# 0 for ('UTR', 1)
# 2 for exon, 1
# 3 for intron, 1
# 4 for exon, 2
# 5 for intron, 2
# ...
# 999 for UTR, 2
feat_type, feat_number = feature
assert feat_type in ('intron', 'exon', 'UTR'), 'Unknown feature in list (function accepts intron/exon/UTR)'
assert isinstance(feat_number, int), 'Feature number has to be integer'
return 0 if feature==('UTR', 1) else 999 if feature==('UTR', 2) else (feat_number*2 + (feat_type=='intron'))
def store_dataframes(out_hdf, **kwargs):
# DataFrames to serialize have to be passed by keyword arguments. An argument matrix1=DataFrame(...)
# will be written into table 'matrix1' in the HDF file.
complevel = kwargs.pop('complevel', 9) # default complevel & complib values if
complib = kwargs.pop('complib', 'zlib') # not explicitly asked for as arguments
if kwargs.get("verbosity", 0):
print now(), 'Storing %d DataFrames in file %s with compression settings %d %s...' % (len(kwargs), out_hdf, complevel, complib)
store = pd.HDFStore(out_hdf, complevel=complevel, complib=complib) # TODO: WRITE ONLY? it probably appends now
for table_name, dataframe in kwargs.iteritems():
store[table_name] = dataframe
store.close()
if kwargs.get("verbosity", 0):
print now(), 'DataFrames stored in file.'
def load_hdf(in_hdf, as_dict=False, *args): # isn't really neccesary, but makes a read-only flag on it to be sure
store = pd.HDFStore(in_hdf, 'r')
if len(args):
if as_dict:
to_return = {table: store[table] for table in args}
else:
to_return = tuple((store[table] for table in args))
store.close()
return to_return
else:
return store # store doesn't get closed! Either the user closes it manually or gets closed on exit
def sam_to_hdf(samfile, **kwargs):
if kwargs.get("verbosity", 0):
print now(), 'Loading alleles and read IDs from %s...' % samfile
# run through the SAM file once to see how many reads and alleles we are dealing with
# for a one-step DataFrame initialization instead of slow growing
read_ids, allele_ids = [], []
first_hit_row = True
total_hits = 0
with open(samfile, 'r') as f:
last_read_id = None
for line in f:
if line.startswith('@'):
if line.startswith('@SQ'):
allele_ids.append(line.split('\t')[1][3:]) # SN:HLA:HLA00001
continue
total_hits += 1
read_id = line.split('\t')[0]
if last_read_id != read_id:
read_ids.append(read_id)
last_read_id = read_id
if first_hit_row: # analyze SAM file structure and find MD tag column (describes mismatches).
first_hit_row = False
columns = line.split()
try:
md_index = map(lambda x: x.startswith('MD:'), columns).index(True)
except ValueError:
# TODO: we don't really handle the case if MD-tag is not present, code will fail later
print '\tNo MD-tag found in SAM file!'
md_index = None
try:
nm_index = map(lambda x: x.startswith('NM:'), columns).index(True)
except ValueError:
# TODO: we don't really handle the case if NM-tag is not present, code will fail later
print '\tNo NM-tag found in SAM file!'
nm_index = None
if kwargs.get("verbosity", 0):
print now(), '%d alleles and %d reads found.' % (len(allele_ids), len(read_ids))
print now(), 'Initializing mapping matrix...'
# major performance increase if we initialize a numpy zero-matrix and pass that in the constructor
# than if we just let pandas initialize its default NaN matrix
matrix_pos = pd.DataFrame(np.zeros((len(read_ids), len(allele_ids)), dtype=int), columns=allele_ids, index=read_ids)
matrix_etc = pd.DataFrame(np.zeros((len(read_ids), len(allele_ids)), dtype=int), columns=allele_ids, index=read_ids)
if kwargs.get("verbosity", 0):
print now(), '%dx%d mapping matrix initialized. Populating %d hits from SAM file...' % (len(read_ids), len(allele_ids), total_hits)
milestones = [x * total_hits / 10 for x in range(1, 11)] # for progress bar
with open(samfile, 'r') as f:
# NM, CIGAR and MD fields are extremely redundant. We'll build an ordered set of their triplets
# and create a second matrix identical to the first one where fields reference positions in this
# ordered set of hit descriptors. Reason to use OrderedDict: each key triplet's value is their
# position index in the set. So if I do a lookup on a triplet, I can tell its index, and later on
# I can retreive the triplet from its index by nm_cigar_mismatch_set.keys()[index]
# A minor hack: to be able to initialize a fast numpy zero matrix and not a slow pandas NaN matrix
# I add a first dummy element to this set (index 0) that correspond to unset matrix fields
# and the actual triplets will start from 1.
nm_cigar_mismatch_set = OrderedDict()
nm_cigar_mismatch_set[(0, '', '')] = 0
counter = 0
percent = 0
for line in f:
if line.startswith('@'):
continue
fields = line.strip().split('\t')
read_id, allele_id, position, cigar, nm, mismatches = (fields[i] for i in (0, 2, 3, 5, nm_index, md_index))
position = int(position)
nm = int(nm[5:]) # NM:i:2 --> 2
mismatches = mismatches[5:] # MD:Z:1T32A67 --> 1T32A67
matrix_pos[allele_id][read_id] = position # SAM indexes from 1, so null elements are not hits
if (nm, cigar, mismatches) not in nm_cigar_mismatch_set:
descriptor_index = len(nm_cigar_mismatch_set)
nm_cigar_mismatch_set[(nm, cigar, mismatches)] = descriptor_index
else:
descriptor_index = nm_cigar_mismatch_set[(nm, cigar, mismatches)]
matrix_etc[allele_id][read_id] = descriptor_index
counter += 1
if counter in milestones:
percent += 10
if kwargs.get("verbosity", 0):
print '\t%d%% completed' % percent
if kwargs.get("verbosity", 0):
print now(), '%d elements filled. Matrix sparsity: 1 in %.2f' % (counter, matrix_pos.shape[0]*matrix_pos.shape[1]/float(counter))
# convert HLA:HLA00001 identifiers to HLA00001
matrix_pos.rename(columns=lambda x: x.replace('HLA:', ''), inplace=True)
matrix_etc.rename(columns=lambda x: x.replace('HLA:', ''), inplace=True)
# make a DataFrame from descriptor triplet set
hit_descriptors = pd.DataFrame(nm_cigar_mismatch_set.keys(), columns=['NM', 'CIGAR', 'MD'])
return matrix_pos, matrix_etc, hit_descriptors
def get_compact_model(hit_df, to_bool=False):
# turn a hit matrix dataframe (can be mapping position matrix if used with to_bool=True)
# into a smaller matrix DF that removes duplicate rows and creates the "occurence" vector
# with the number of rows the representative read represents
hit_df = hit_df.ix[hit_df.any(axis=1)] # remove all-zero rows
if to_bool:
hit_df = hit_df.applymap(bool)
occurence = {r[0]: len(r) for r in hit_df.groupby(hit_df.columns.tolist()).groups.itervalues()}
unique_mtx = hit_df.drop_duplicates()
return unique_mtx, occurence
def mtx_to_sparse_dict(hit_df):
# takes a hit matrix and creates a dictionary of (read, allele):1 tuples corresponding to hits
# (needed by OptiType)
all_hits = {}
for read_id, alleles in hit_df.iterrows():
hit_alleles = alleles[alleles!=0].index
for hit_allele in hit_alleles:
all_hits[(read_id, hit_allele)] = 1
return all_hits
#return {(read, allele): 1 for read in hit_df.index for allele in hit_df.columns if hit_df[allele][read]>0} # awfully inefficient
def create_allele_dataframes(imgt_dat, fasta_gen, fasta_nuc, **kwargs):
if kwargs.get("verbosity", 0):
print now(), 'Loading IMGT allele dat file...'
alleles = OrderedDict()
with open(imgt_dat, 'rU') as handle:
for i, record in enumerate(SeqIO.parse(handle, "imgt")):
alleles[record.id] = record
if kwargs.get("verbosity", 0):
print now(), 'Initializing allele DataFrame...'
'''
id HLA000001
type A*01:01:01:01
locus A
class I --- I, II, TAP, MIC, other
flags 0 --- problems with allele stored here, see below
len_gen 3503
len_nuc 1098
full_gen 1
full_nuc 1
flagging: sum of the below codes
+1 if HLA type ends in a letter (N, Q, etc.)
+2 if CDS annotation != exon annotation
+4 if features don't add up to gen sequence
+8 if exons don't add up to nuc sequence
'''
allele_info = 'id type 4digit locus flags len_dat len_gen len_nuc full_gen full_nuc'
table = pd.DataFrame(index=alleles.keys(), columns=allele_info.split())
sequences = []
if kwargs.get("verbosity", 0):
print now(), 'Filling DataFrame with allele data...'
all_features = [] # contains tuples: (HLA id, feature type, feature number, feature start, feature end)
for allele in alleles.itervalues():
allele_type = allele.description.replace('HLA-', '').split(',')[0]
table.ix[allele.id]['id'] = allele.id
table.ix[allele.id]['type'] = allele_type
table.ix[allele.id]['4digit'] = ':'.join(allele_type.split(':')[:2])
table.ix[allele.id]['locus'] = allele_type.split('*')[0]
table.ix[allele.id]['flags'] = 0 if allele_type[-1].isdigit() else 1
table.ix[allele.id]['len_dat'] = len(str(allele.seq))
table.ix[allele.id]['len_gen'] = 0 # TODO: IT STILL DOESNT SOLVE PERFORMANCEWARNING!
table.ix[allele.id]['len_nuc'] = 0 # we initialize these nulls so that we don't get a
table.ix[allele.id]['full_gen'] = 0 # PerformanceWarning + pickling when storing HDF
table.ix[allele.id]['full_nuc'] = 0 # because of NaNs (they don't map to ctypes)
sequences.append((allele.id, 'dat', str(allele.seq)))
# number of features in 2013-04-30 hla.dat:
# 9296 source (total # of alleles)
# 9291 CDS, 5 gene (HLA-P pseudogenes)
# 24493 exons, 3697 introns, 1027 UTRs
# CDS matches exons in 99+% of cases, some have a single base-pair extension at the end for some
# weird single-base exons or such (all on unimportant pseudogene loci)
# so we extract exons, introns and UTRs
features = [f for f in allele.features if f.type in ('exon', 'intron', 'UTR')]
for feature in features:
if feature.type in ('exon', 'intron'):
feature_num = int(feature.qualifiers['number'][0])
else: # UTR
# UTR either starts at the beginning or ends at the end
assert feature.location.start == 0 or feature.location.end == len(allele.seq)
feature_num = 1 if feature.location.start == 0 else 2 # 1 if 5' UTR, 2 if 3' UTR
all_features.append(
(allele.id, feature.type, feature_num,
int(feature.location.start), int(feature.location.end), len(feature))
)
# small sanity check, can be commented out
cds = [f for f in allele.features if f.type == 'CDS']
if cds:
if sum(map(len, [f for f in features if f.type == 'exon'])) != len(cds[0]):
print "\tCDS length doesn't match sum of exons for", allele.id, allele_type
table.ix[allele.id]['flags'] += 2
else:
print "\tNo CDS found for", allele.id, allele_type
table.ix[allele.id]['flags'] += 2
if kwargs.get("verbosity", 0):
print now(), 'Loading gen and nuc files...'
with open(fasta_gen, 'r') as fasta_gen:
for record in SeqIO.parse(fasta_gen, 'fasta'):
allele_id = record.id.replace('HLA:', '')
table.ix[allele_id]['len_gen'] = len(record.seq)
sequences.append((allele_id, 'gen', str(record.seq)))
with open(fasta_nuc, 'r') as fasta_nuc:
for record in SeqIO.parse(fasta_nuc, 'fasta'):
allele_id = record.id.replace('HLA:', '')
table.ix[allele_id]['len_nuc'] = len(record.seq)
sequences.append((allele_id, 'nuc', str(record.seq)))
# convert list of tuples into DataFrame for features and sequences
all_features = pd.DataFrame(all_features, columns=['id', 'feature', 'number', 'start', 'end', 'length'])
sequences = pd.DataFrame(sequences, columns=['id', 'source', 'sequence'])
joined = pd.merge(table, all_features, how='inner', on='id')
exons_for_locus = {}
for i_locus, i_group in joined.groupby('locus'):
exons_for_locus[i_locus] = i_group[i_group['feature']=='exon']['number'].max()
# for i_id, i_group in joined.groupby('id'):
# max_exons = exons_for_locus[table.ix[i_id]['locus']]
# if len(i_group) >= 2*max_exons-1:
# print i_id, 'is fully annotated on all exons and introns and UTRs'
if kwargs.get("verbosity", 0):
print now(), 'Checking dat features vs gen/nuc sequences...'
for allele, features in joined.groupby('id'):
row = features.irow(0) # first row of the features subtable. Contains all allele information because of the join
sum_features_length = features['length'].sum()
sum_exons_length = features.ix[features['feature']=='exon']['length'].sum()
if row['len_gen']>0 and row['len_gen'] != sum_features_length:
print "\tFeature lengths don't add up to gen sequence length", allele, row['len_gen'], sum_features_length, row['type']
table.ix[allele]['flags'] += 4
if row['len_nuc']>0 and row['len_nuc'] != sum_exons_length:
print "\tExon lengths don't add up to nuc sequence length", allele, row['len_nuc'], sum_exons_length, row['type']
table.ix[allele]['flags'] += 8
if kwargs.get("verbosity", 0):
print now(), 'Sanity check finished...'
return table, all_features, sequences
def prune_identical_alleles(binary_mtx, report_groups=False):
# return binary_mtx.transpose().drop_duplicates().transpose()
# # faster:
hash_columns = binary_mtx.transpose().dot(np.random.rand(binary_mtx.shape[0]))
if report_groups:
grouper = hash_columns.groupby(hash_columns)
groups = {g[1].index[0]: g[1].index.tolist() for g in grouper}
alleles_to_keep = hash_columns.drop_duplicates().index # relying on keeping the first representative
# TODO: maybe return sets of alleles that were collapsed into the representative (identical patterned sets)
return binary_mtx[alleles_to_keep] if not report_groups else (binary_mtx[alleles_to_keep], groups)
def prune_identical_reads(binary_mtx):
# almost the same as ht.get_compact_model() except it doesn't return an occurence vector.
# It should only be used to compactify a matrix before passing it to the function finding
# overshadowed alleles as it uses a super expensive square product operation.
# Final compactifying should be later done on the original binary matrix.
#
# return binary_mtx.drop_duplicates()
# # faster:
reads_to_keep = binary_mtx.dot(np.random.rand(binary_mtx.shape[1])).drop_duplicates().index
return binary_mtx.ix[reads_to_keep]
def prune_overshadowed_alleles(binary_mtx):
# USES COVARIANCE which can be slow for huge matrices.
# For a 1000reads x 3600alleles matrix it takes 15 seconds.
# So you should always prune identical columns and rows before to give it a matrix as small as possible.
# So ideal usage:
# prune_overshadowed_alleles(prune_identical_alleles(prune_identical_reads(binary_mtx)))
bb = binary_mtx.applymap(int) # make sure dot products are not boolean but numbers
covariance = bb.transpose().dot(bb)
diagonal = pd.Series([covariance[ii][ii] for ii in covariance.columns], index=covariance.columns)
new_covariance = covariance[covariance.columns]
for ii in new_covariance.columns:
new_covariance[ii][ii] = 0
overshadowed = []
for ii in new_covariance.columns:
potential_superiors = new_covariance[ii][new_covariance[ii]==diagonal[ii]].index
if any(diagonal[potential_superiors] > diagonal[ii]):
overshadowed.append(ii)
non_overshadowed = covariance.columns.diff(overshadowed)
return non_overshadowed
def create_paired_matrix(binary_1, binary_2, id_cleaning=None):
# id_cleaning is a function object that turns a read ID that contains pair member information (like XYZ_1234:5678/1) into an
# ID that identifies the pair itself (like XYZ_1234:5678) so we can match the two DF's IDs. In the above case a suitable
# id_cleaning function would be lambda x: x[:-2] (gets rid of /1 and /2 from the end). Sometimes it's trickier as pair member
# information can be somewhere in the middle, and sometimes it's not even required at all as both pair members have the same ID
# just in two different files (1000 genomes).
if id_cleaning is not None:
binary_1.index = map(id_cleaning, binary_1.index)
binary_2.index = map(id_cleaning, binary_2.index)
common_read_ids = binary_1.index.intersection(binary_2.index)
b_1 = binary_1.ix[common_read_ids]
b_2 = binary_2.ix[common_read_ids]
return b_1 * b_2 # elementwise AND
def get_features(allele_id, features, feature_list):
# allele_id can be like HLA12345_HLA67890: then we take features from HLA12345 first if possible, then from HLA67890
# for sequence reconstruction (in that case HLA67890 should be a nearest neighbor of HLA12345)
if '_' in allele_id:
partial_allele, complete_allele = allele_id.split('_')
else:
complete_allele = allele_id
feats_complete = {(of['feature'], of['number']): of for _, of in features.ix[features['id']==complete_allele].iterrows()}
feats_partial = {(of['feature'], of['number']): of for _, of in features.ix[features['id']==partial_allele].iterrows()} if '_' in allele_id else feats_complete
feats_to_include = []
for feat in sorted(feature_list, key=feature_order):
if feat in feats_partial:
feats_to_include.append(feats_partial[feat])
elif feat in feats_complete:
feats_to_include.append(feats_complete[feat])
else:
warnings.warn('Feature %s not found for allele %s' % (feat, allele_id))
return pd.DataFrame(feats_to_include)
def calculate_coverage(alignment, features, alleles_to_plot, features_used):
assert len(alignment) in (3, 6, 7), ("Alignment tuple either has to have 3, 6 or 7 elements. First six: pos, etc, desc "
"once or twice depending on single or paired end, and an optional binary DF at the end for PROPER paired-end plotting")
if len(alignment) == 3:
matrix_pos, matrix_etc, hit_descriptors = alignment[:3]
pos = matrix_pos[alleles_to_plot]
etc = matrix_etc[alleles_to_plot]
hit = hit_descriptors # TODO: just want a shorter name
hit_counts = pos.applymap(bool).sum(axis=1) # how many of the selected alleles does a particular read hit? (unambiguous hit or not)
# if only a single allele is fed to this function that has no hits at all, we still want to create a null-matrix
# for it. Its initialization depends on max_ambiguity (it's the size of one of its dimensions) so we set this to 1 at least.
max_ambiguity = max(1, hit_counts.max())
to_process = [(pos, etc, hit, hit_counts)]
elif len(alignment) == 6:
matrix_pos1, matrix_etc1, hit_descriptors1, matrix_pos2, matrix_etc2, hit_descriptors2 = alignment
pos1 = matrix_pos1[alleles_to_plot]
etc1 = matrix_etc1[alleles_to_plot]
hit1 = hit_descriptors1
pos2 = matrix_pos2[alleles_to_plot]
etc2 = matrix_etc2[alleles_to_plot]
hit2 = hit_descriptors2
hit_counts1 = pos1.applymap(bool).sum(axis=1)
hit_counts2 = pos2.applymap(bool).sum(axis=1)
max_ambiguity = max(1, hit_counts1.max(), hit_counts2.max())
to_process = [(pos1, etc1, hit1, hit_counts1), (pos2, etc2, hit2, hit_counts2)]
else:
matrix_pos1, matrix_etc1, hit_descriptors1, matrix_pos2, matrix_etc2, hit_descriptors2, binary = alignment
binx = binary[alleles_to_plot]
pos1 = matrix_pos1[alleles_to_plot].ix[binx.index] * binx
pos2 = matrix_pos2[alleles_to_plot].ix[binx.index] * binx
etc1 = matrix_etc1[alleles_to_plot].ix[binx.index] * binx
etc2 = matrix_etc2[alleles_to_plot].ix[binx.index] * binx
hit1 = hit_descriptors1
hit2 = hit_descriptors2
hit_counts = binx.sum(axis=1)
max_ambiguity = max(1, hit_counts.max())
to_process = [(pos1, etc1, hit1, hit_counts), (pos2, etc2, hit2, hit_counts)]
coverage_matrices = []
for allele in alleles_to_plot:
allele_features = get_features(allele, features, features_used)
allele_length = allele_features['length'].sum()
# typically 1-3 rows x sequence_length columns, plus a dimension for perfect hits vs hits with mismatches.
# Unique hits go in first row, reads hitting 2 alleles in second, etc.
# Nth row stands for reads hitting N alleles (ie. read might have come from one of N alleles)
coverage = np.zeros((2, max_ambiguity, allele_length), dtype=int)
# to_process is a list containing tuples with mapping/position/hit property dataframes to superimpose.
# For single-end plotting to_process has just one element. For paired end, to_process has two elements that
# were pre-processed to only plot reads where both pairs map, etc. but the point is that superimposing the two
# will provide the correct result.
for pos, etc, hit, hit_counts in to_process:
reads = pos[pos[allele]!=0].index # reads hitting allele: coverage plot will be built using these
for i_pos, i_cigar, i_mismatches, i_hitcount in zip(
pos.ix[reads][allele],
etc.ix[reads][allele].map(hit['CIGAR']),
etc.ix[reads][allele].map(hit['NM']),
hit_counts[reads]):
read_length = length_on_reference(i_cigar) # this does not neccessarily equal actual read length because of indel mismatches
coverage[bool(i_mismatches)][i_hitcount-1][i_pos-1:i_pos-1+read_length] += 1
coverage_matrices.append((allele, coverage))
return coverage_matrices
def plot_coverage(outfile, coverage_matrices, allele_data, features, features_used, columns=2):
def start_end_zeros(cov_array):
# the areaplot polygon gets screwed up if the series don't end/start with zero
# this adds a zero to the sides to circumvent that
return np.append(np.append([0], cov_array), [0])
def allele_sorter(allele_cov_mtx_tuple):
allele, _ = allele_cov_mtx_tuple
return allele_data.ix[allele.split('_')[0]]['type']
def get_allele_locus(allele):
return allele_data.ix[allele.split('_')[0]]['locus']
number_of_loci = len(set((get_allele_locus(allele) for allele, _ in coverage_matrices)))
dpi = 50
box_size = (7, 3)
# subplot_rows = len(coverage_matrices)/columns + bool(len(coverage_matrices) % columns) # instead of float division + rounding up
# subplot_rows = 3 # TODO
subplot_rows = number_of_loci
area_colors = [(.4, .7, .4), (.6, .9, .6), (.9, .3, .3), (.9, .6, .6)] # colors: unique perfect hits, shared perfect hits, unique mismatch hits, shared mismatch hits
figure = pylab.figure(figsize=(box_size[0]*columns, box_size[1]*subplot_rows), dpi=dpi) # TODO: dpi doesn't seem to do shit. Is it stuck in 100?
coverage_matrices = sorted(coverage_matrices, key=allele_sorter) # so that the A alleles come first, then B, and so on.
prev_locus = ''
i_subplot = 0
for allele, coverage in coverage_matrices:
if '_' in allele:
partial, complete = allele.split('_')
plot_title = '%s (introns from %s)' % (allele_data.ix[partial]['type'], allele_data.ix[complete]['type']) # , allele for debugging (original ID)
else:
plot_title = allele_data.ix[allele]['type'] # + allele for debugging original ID
if prev_locus != get_allele_locus(allele): # new locus, start new row
i_subplot = i_subplot + columns - ((i_subplot-1) % columns)
else:
i_subplot += 1
#print i_subplot
prev_locus = get_allele_locus(allele)
plot = figure.add_subplot(subplot_rows, columns, i_subplot, adjustable='box')
_, max_ambig, seq_length = coverage.shape # first dimension size is 2 (perfect vs mismatch)
shared_weighting = np.reciprocal(np.arange(max_ambig)+1.0) # --> 1, 1/2, 1/3...
shared_weighting[0] = 0 # --> 0, 1/2, 1/3, so the unique part doesn't get mixed in
unique_perfect = start_end_zeros(coverage[0][0])
unique_mismatch = start_end_zeros(coverage[1][0])
shared_perfect = start_end_zeros(shared_weighting.dot(coverage[0]))
shared_mismatch = start_end_zeros(shared_weighting.dot(coverage[1]))
# Exon annotation
i_start = 1 # position of last feature's end. It's one because we padded with zeros above
for _, ft in get_features(allele, features, features_used).iterrows():
if ft['feature'] == 'exon':
plot.axvspan(i_start, i_start + ft['length'], facecolor='blue', alpha=0.1, linewidth=0, zorder=1)
i_start += ft['length']
areas = plot.stackplot(np.arange(seq_length+2), unique_perfect+0.001, shared_perfect, unique_mismatch, # seq_length+2 because of the zero padding at either end
shared_mismatch, linewidth=0, colors=area_colors, zorder=5) # +0.001 so zeros are not -inf on the logplot, but somewhere well below the cutoff line. Makes everything nicer
for aa in areas:
# if you output to pdf it strangely doesn't respect linewidth=0 perfectly, you'll have
# to set line colors identical to area color to avoid having a black line between them
aa.set_edgecolor(aa.get_facecolor())
plot.tick_params(axis='both', labelsize=10, direction='out', which='both', top=False)
plot.text(.015, 0.97, plot_title, horizontalalignment='left', verticalalignment='top', transform=plot.transAxes, fontsize=10, zorder=6)
_, _, _, y2 = plot.axis()
plot.axis((0, seq_length, 1, y2)) # enforce y axis minimum at 10^0. This corresponds to zero coverage because of the +1 above
plot.set_yscale('log')
plot.set_ylim(bottom=0.5)
figure.tight_layout()
figure.savefig(outfile)