-
Notifications
You must be signed in to change notification settings - Fork 0
/
add-info-to-gff3.py
executable file
·84 lines (76 loc) · 3.42 KB
/
add-info-to-gff3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
## Import required modules
import os
import sys
import pandas as pd
def main():
help_string = """
Synopsis:
python add-info-to-gff3 <GFF3-feature-file> <tRNAs-file> <gene-cluster-file>
Description:
This is an auxiliary utility for the script anvi-script-get-annotated-genome.sh
and is not generally intended for independent use. For example, it doesn't
have any input validation to speak of, relying on the fact that the script
above will only pass well-formed input. For that script to work, both scripts
must be in the same directory.
"""
version_string = """
Last updated 23 October 2021
"""
## Handle arguments
if '-h' in sys.argv or '--help' in sys.argv:
print(help_string)
sys.exit(1)
if '-v' in sys.argv or '--version' in sys.argv:
print(version_string)
sys.exit(1)
gff3_path = sys.argv[1]
trna_path = sys.argv[2]
gc_path = None
if len(sys.argv) > 3:
gc_path = sys.argv[3]
temp_path = gff3_path + ".tmp"
# Import the additional data files into a pandas dataframe
print("Reading in additional data files...")
trna_df = pd.read_table(trna_path, dtype=str)
gc_df = None
if gc_path != None:
gc_df = pd.read_csv(gc_path, sep='\t', dtype=str)
# Open the input file and output temp file, then write the header to the output.
print("Beginning writing process...")
gff3_in = open(gff3_path, 'r')
gff3_out = open(temp_path, 'w')
gff3_out.write(gff3_in.readline())
# Find additional data for each gene and write the updated line to the temp file.
for line in gff3_in:
# As of anvio v7.1, the GFF3 file uses ID=<project-name>___<gene_caller_id> where
# <project-name> was specified during the creation of the contigs db we are using.
# In genes with no function, the line ends after the ID. If there is a function,
# there is a semicolon after the ID.
start = line.find("___") + 3 # Get the index of the start of the ID number
end = line.find(";") # Try to get the index of the end of the ID number
if end > 0: # If there was a semicolon...
gene_id = line[start:end]
else: # If there wasn't...
gene_id = line[start:].strip() # Strip the \n off the end of the line
# The Transfer_RNAs annotation source includes semicolons in the function, but GFF3 reserves them as special
trna = "!!!".join(trna_df[trna_df['gene_callers_id']==gene_id]['function'].to_numpy()).replace(";"," %3B ")
accession = "!!!".join(trna_df[trna_df['gene_callers_id']==gene_id]['accession'].to_numpy())
new_attributes = ""
if accession: # If there is a value, add fields for it.
new_attributes = ";Name=" + accession + ";product=" + trna
# Start to build the output line
out_line = line.rstrip()
out_line = out_line + new_attributes
if gc_path != None:
gene_cluster = "!!!".join(gc_df[gc_df['gene_caller_id']==gene_id]['gene_cluster_id'].to_numpy())
out_line = out_line + ";Alias=" + gene_cluster
out_line = out_line + "\n"
gff3_out.write(out_line)
# Close relevant files and replace the old feature annotation file with the new one.
print("Cleaning up...")
gff3_in.close()
gff3_out.close()
os.replace(temp_path, gff3_path)
if __name__ == "__main__":
main()