arq5x · pfpjs · Jan 19, 2019 · Jan 20, 2019 · Jan 23, 2019
diff --git a/gemini/annotation_provenance/gene_table/README b/gemini/annotation_provenance/gene_table/README
@@ -1,5 +1,5 @@
 # Ensembl biomart download: 
-# Ensembl gene version: Ensembl 75
+# Ensembl gene version: Ensembl 95
 ############################################################
 1) Retrieve all gene ids: query1.pl -> mart_export1
 2) Retrieve CDS length: query2.pl -> mart_export2
@@ -16,11 +16,11 @@
 #processing to add None for blank columns
 ##########################################
 
-a) cat mart_export1 | awk -F'\t' '{ OFS = "\t" }; {for(n=1; n<=NF; n++) sub(/^$/, "None", $n); print $0}' > ensembl75_1
-b) cat mart_export2 | awk -F'\t' '{ OFS = "\t" }; {for(n=1; n<=NF; n++) sub(/^$/, "None", $n); print $0}' > ensembl75_2
-c) cat mart_export3 | awk -F'\t' '{ OFS = "\t" }; {for(n=1; n<=NF; n++) sub(/^$/, "None", $n); print $0}' > ensembl75_3
+a) cat mart_export1 | awk -F'\t' '{ OFS = "\t" }; {for(n=1; n<=NF; n++) sub(/^$/, "None", $n); print $0}' > ensembl_1
+b) cat mart_export2 | awk -F'\t' '{ OFS = "\t" }; {for(n=1; n<=NF; n++) sub(/^$/, "None", $n); print $0}' > ensembl_2
+c) cat mart_export3 | awk -F'\t' '{ OFS = "\t" }; {for(n=1; n<=NF; n++) sub(/^$/, "None", $n); print $0}' > ensembl_3
 d) cat HGNC_download | awk -F'\t' '{ OFS = "\t" }; {for(n=1; n<=NF; n++) sub(/^$/, "None", $n); print $0}' > hgnc_file
-e) cat HMD_HumanPhenotype.rpt | awk -F'\t' '{ OFS = "\t" }; {for(n=1; n<=NF; n++) sub(/^$/, "None", $n); print $0}' > HMD_HumanPhenotype
+e) cat anno_files/HMD_HumanPhenotype.rpt | awk -F'\t' '{ OFS = "\t" }; {for(n=1; n<=NF; n++) sub(/^$/, "None", $n); print $0}' > HMD_HumanPhenotype
 ###########################################################################
 # Linking CDS length to gene ids (i.e. joining on a & b)
 # Connector for both these files is the ensembl transcript id
@@ -96,5 +96,5 @@ python combined_gene_table.py (outfiles: detailed_gene_table, summary_gene_table
   "transcript_max_start","Phenotype_id")					 
 
 
-# rm gene_table ensembl_75* ensembl_format hgnc_file HMD_HumanPhenotype
+# rm gene_table ensembl_* hgnc_file HMD_HumanPhenotype
 
diff --git a/gemini/annotation_provenance/gene_table/combined_gene_table.py b/gemini/annotation_provenance/gene_table/combined_gene_table.py
@@ -7,10 +7,10 @@
 from collections import defaultdict
 
 
-filename = 'detailed_gene_table_v75'
+filename = 'detailed_gene_table'
 detailed_out = open(filename, 'w')
 
-file = 'summary_gene_table_v75'
+file = 'summary_gene_table'
 summary_out = open(file, 'w')
 
 # write out files for detailed and summary gene table
@@ -35,7 +35,7 @@
 lines_seen = set()
 
 
-for line in open("genic_intolerance_dataset2", 'r'):
+for line in open("anno_files/genic_intolerance_dataset2", 'r'):
     if line.startswith("#") is False:
         field = line.strip().split("\t")
         name = str(field[0])