-
Notifications
You must be signed in to change notification settings - Fork 0
/
refseq_exclusion.py
executable file
·109 lines (98 loc) · 4.21 KB
/
refseq_exclusion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
## Import required modules
import sys
import os
import xml.etree.ElementTree as ET
def main():
## Global variables
help_string = """
Synopsis:
python refseq_exclusion.py [-h] [--version] [<output options>] input-file
Description:
This utility extracts which assemblies in downloaded NCBI data were excluded
from RefSeq and the reason they were excluded. You must provide the path to
an assembly_result.xml file corresponding to your NCBI data. The XML file can
be downloaded from the 'Send to:' menu in the upper right of your NCBI search
results page with the 'File' destination and 'XML' format options selected.
Options:
-h | --help
Display this help text and exit.
--version
Display version information and exit.
-l | --list <identifier>
Outputs a simple list of the specified identifier of excluded assemblies.
-r | --reason <phrase>
Filters output to only show assemblies excluded for the specified reason.
"""
version_string = """
Last updated June 21 2019
"""
flags = {
'list': False,
'reason': False
}
options = {
'identifier': "",
'reason': ""
}
## Handle arguments
args = sys.argv[1:]
while len(args) > 0 and args[0][0] == "-":
arg = args.pop(0)
if arg == "-h" or arg == "--help":
print(help_string)
sys.exit(0)
elif arg == "--version":
print(version_string)
sys.exit(0)
elif arg in ["-l", "--list"]:
flags['list'] = True
options['identifier'] = args.pop(0)
elif arg in ["-r", "--reason"]:
flags['reason'] = True
options['reason'] = args.pop(0)
elif arg not in ["-", "--"]:
sys.exit("Invalid option: " + arg)
if len(args) < 1:
sys.exit("Missing input file argument.\nUse the --help option to learn more.")
if len(args) > 1:
sys.exit("Extra argument(s) detected.\nUse the --help option to learn more.")
target = args.pop(0)
if not os.path.isfile(target):
sys.exit("Specified file does not exist: " + target)
if not flags['list']:
print("Genome assemblies excluded from RefSeq")
print("Target file: " + os.path.abspath(target))
if flags['reason']: print("Specified reason: " + options['reason'])
print("------------------------------------------------")
find_exclusions(target, flags, options)
def find_exclusions(target_file, flags, arguments):
with open(target_file,'r') as f:
# Add a <data> tag as a wrapper to provide a single root node.
text = "<data>" + f.read() + "</data>"
root = ET.fromstring(text)
del text # So we don't keep two copies of the file in memory
## Iterate through files looking for excluded assemblies.
counter = 0
for DS in root.findall("DocumentSummary"): # For each DocumentSummary node...
exclusion = DS.find("ExclFromRefSeq") # Find the exclusion node
if exclusion.text is not None: # If the node has any contents...
# Get a list of the exclusion reasons
reason_list = [reason.text for reason in exclusion.findall("string")]
# If we're filtering and our specified reason isn't in the list, skip
if flags['reason'] and (arguments['reason'] not in reason_list): continue
# Format output depending on the flags
if flags['list']:
print(DS.find(arguments['identifier']).text)
else:
counter += 1
print("Organism: " + DS.find("Organism").text)
print("Assembly Name: " + DS.find("AssemblyName").text)
print("Accession: " + DS.find("AssemblyAccession").text)
print("Reasons for exclusion from RefSeq:")
for reason in reason_list:
print(" " + reason)
print("")
if not flags['list']: print("Total excluded entries: " + str(counter))
if __name__ == "__main__":
main()