-
Notifications
You must be signed in to change notification settings - Fork 1
/
missing-uniprot.py
49 lines (46 loc) · 1.25 KB
/
missing-uniprot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from sys import *
import csv
# Input files, all read from the current working directory.
REFSEQ_ITEMS_FILE = 'refseqp-wd.tab'            # TSV with 'item' and 'refseq' columns
UNIPROT_REFSEQ_FILE = 'uniprot-refseq.tab'      # TSV with 'uniprot' and 'refseq' columns
TARGET_ITEMS_FILE = 'wd-refseq-without-uniprot'  # one item id per line


def _unique_mapping(pairs):
    """Build a dict from (key, value) pairs, dropping ambiguous keys.

    A key is kept only when every pair carrying it has the same value.
    A key seen with two or more different values is removed entirely,
    because there is no way to tell which value is the right one.
    """
    mapping = {}
    ambiguous = set()
    for key, value in pairs:
        # setdefault returns the value already stored for this same key
        # (or stores and returns the new one), so the comparison is always
        # between two values of the same key.
        if mapping.setdefault(key, value) != value:
            ambiguous.add(key)
    for key in ambiguous:
        del mapping[key]
    return mapping


def _item_refseq_pairs(rows):
    """Yield (item id, RefSeq id) for every row that has both columns.

    The item id is the part of the 'item' column after its last '/'
    (the whole value when it contains no '/').
    """
    for row in rows:
        refseq = row.get('refseq')
        item_url = row.get('item')
        if not refseq or not item_url:
            # Short or incomplete row: nothing usable to map.
            continue
        yield item_url.rpartition('/')[2], refseq


def _refseq_uniprot_pairs(rows):
    """Yield (RefSeq id, UniProt id) for every usable row.

    Rows whose UniProt id contains '-' are skipped (presumably
    isoform-specific accessions -- confirm against the data source).
    The RefSeq id is cut at its first '.', so 'NP_1.2' becomes 'NP_1'.
    """
    for row in rows:
        uniprot = row.get('uniprot')
        refseq = row.get('refseq')
        if not uniprot or not refseq:
            continue
        if '-' in uniprot:
            continue
        yield refseq.partition('.')[0], uniprot


def _load_mapping(filename, to_pairs):
    """Read a tab-separated file with a header row into an unambiguous dict.

    `to_pairs` turns the DictReader rows into (key, value) pairs; keys with
    conflicting values are dropped (see `_unique_mapping`).
    """
    # newline='' is what the csv module asks for when it is given a file.
    with open(filename, 'r', newline='') as handle:
        reader = csv.DictReader(handle, delimiter='\t')
        return _unique_mapping(to_pairs(reader))


def _statements(item_ids, refs, unips):
    """Yield one '<item>|P352|"<uniprot>"' line per resolvable item.

    An item is resolvable when `refs` gives it a RefSeq id and `unips`
    gives that RefSeq id a UniProt id. Items are visited in sorted order
    so that the output is reproducible from run to run.
    """
    for item_id in sorted(item_ids):
        refseq = refs.get(item_id)
        if refseq is None:
            continue
        uniprot = unips.get(refseq)
        if uniprot is not None:
            yield '{}|P352|"{}"'.format(item_id, uniprot)


def main():
    """Print P352 statements for the items listed in TARGET_ITEMS_FILE.

    Chains item -> RefSeq id -> UniProt id using the two TSV files and
    prints one line per item for which both steps resolve unambiguously.
    The output looks like QuickStatements input -- TODO confirm the
    intended consumer.
    """
    refs = _load_mapping(REFSEQ_ITEMS_FILE, _item_refseq_pairs)
    unips = _load_mapping(UNIPROT_REFSEQ_FILE, _refseq_uniprot_pairs)
    with open(TARGET_ITEMS_FILE, 'r') as handle:
        item_ids = {line.rstrip() for line in handle}
    for statement in _statements(item_ids, refs, unips):
        print(statement)


if __name__ == '__main__':
    main()