[AD] workflow changes to handle multiple contigs
koadman authored and koadman committed Aug 30, 2020
1 parent f1a4afc commit ddfa754
Showing 3 changed files with 42 additions and 19 deletions.
14 changes: 11 additions & 3 deletions bin/bam_per_contig.py
@@ -8,15 +8,20 @@
 import sys
 import gfapy
 import os
+import json

 gfa = gfapy.Gfa.from_file(sys.argv[1])
 bam = pysam.AlignmentFile(sys.argv[2], "rb" )
-base_name = sys.argv[3]
+contig_map = sys.argv[3]

 out_bams = {}
 out_read_count = {}
+id = 0
+contig_IDs = {}
 for seg in gfa.segments:
-    seg_bam = seg.name+'.bam'
+    seg_bam = 'contig_'+str(id)+'.bam'
+    contig_IDs[seg.name]='contig_'+str(id)
+    id+=1
     out_bams[seg.name] = pysam.AlignmentFile(seg_bam, "wb", template=bam)
     out_read_count[seg.name] = 0

@@ -28,5 +33,8 @@
 for contig in out_bams:
     out_bams[contig].close()
     if out_read_count[contig] == 0:
-        del_bam = contig+'.bam'
+        del_bam = contig_IDs[contig]+'.bam'
         os.remove(del_bam)
+
+with open(contig_map, 'w') as json_file:
+    json.dump(contig_IDs, json_file)
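
The map written above pairs each GFA segment name with a stable contig_<N> ID, so per-segment BAMs get predictable names and downstream steps can recover the original segment name. A minimal sketch of that round trip; the segment names (edge_1, edge_2) are hypothetical and only the file layout mirrors the diff:

import json

# Illustrative contents of id_map.json; segment names are made up.
contig_IDs = {"edge_1": "contig_0", "edge_2": "contig_1"}
with open("id_map.json", "w") as json_file:
    json.dump(contig_IDs, json_file)

# A downstream step (see get_arcs.py below) inverts the map to recover the
# GFA segment name from a contig_N ID.
with open("id_map.json") as json_file:
    id_map = json.load(json_file)
inverse = {v: k for k, v in id_map.items()}
assert inverse["contig_0"] == "edge_1"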
16 changes: 12 additions & 4 deletions bin/get_arcs.py
@@ -9,11 +9,20 @@

 CONTIG_MARGIN = 200
 MIN_SINGLECOPY_LENGTH = 20000
-target_contig = "edge_3"
 target_left = 0
-target_right = 50000
+target_right = 500000000

 gfa = gfapy.Gfa.from_file(sys.argv[1])
 vcf_in = pysam.VariantFile(sys.argv[2])
+bam = pysam.AlignmentFile(sys.argv[3], "rb" )
+id_map_file = open(sys.argv[4], 'r')
+target_contig_id = sys.argv[5]
+
+# find the target contig
+id_map_json = json.load(id_map_file)
+for contig in id_map_json:
+    if id_map_json[contig] == target_contig_id:
+        target_contig = contig
+
 # get the reference sequences
 ref_seqs = {}
@@ -36,7 +45,6 @@

print("found "+str(len(variants))+" variants")

bam = pysam.AlignmentFile(sys.argv[3], "rb" )
local_var_counts = {}
hic_var_counts = {}
HIC_MIN_DIST = 800 # reads separated by more than this many bp are treated as Hi-C pairs
@@ -144,7 +152,7 @@

 dat['subsample'] = 10

-with open('standat.json', 'w') as json_file:
+with open(target_contig_id+'.standat.json', 'w') as json_file:
     json.dump(dat, json_file)

 #l = 0
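The lookup loop added above resolves a contig_<N> ID back to its GFA segment name, and the Stan data now goes to a per-contig file. A compact sketch of the same logic, assuming the id_map.json format from bam_per_contig.py; the file contents and contig ID are illustrative:

import json

target_contig_id = "contig_3"  # hypothetical value of sys.argv[5]
with open("id_map.json") as id_map_file:
    id_map_json = json.load(id_map_file)

# equivalent of the lookup loop in the diff, as a single expression
target_contig = next(name for name, cid in id_map_json.items()
                     if cid == target_contig_id)

dat = {"subsample": 10}  # stand-in for the real Stan data dict
with open(target_contig_id + ".standat.json", "w") as json_file:
    json.dump(dat, json_file)  # e.g. contig_3.standat.json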
31 changes: 19 additions & 12 deletions bin/strain3C
@@ -22,8 +22,9 @@ input:
 file(bam) from bam
 output:
 file('*.bam') into contig_bams mode('flatten')
+file('id_map.json') into contig_ID_map
 """
-bam_per_contig.py ${gfa} ${bam} contig_
+bam_per_contig.py ${gfa} ${bam} id_map.json
 """
 }

@@ -33,42 +34,48 @@
 file(bam) from contig_bams
 file(vcf) from vcf
 file(vcf_tbi) from vcf_tbi
+file(id_map) from contig_ID_map
 output:
-file("*.json") into standat1 mode('flatten')
-file("*.json") into standat2 mode('flatten')
+file("contig*.json") into standat mode('flatten')
+file("contig*.json") into standat2 mode('flatten')
 """
-get_arcs.py ${gfa} ${vcf} ${bam}
+CONTIG=`basename ${bam} .bam`
+get_arcs.py ${gfa} ${vcf} ${bam} ${id_map} \$CONTIG
 """
 }

 process estimate_strain_counts {
 input:
-file(stanjson) from standat1
+file(stanjson) from standat
 each rep from Channel.from(1..params.strain_count_replicates)
 output:
-file('*.csv') into strain_estimate_out
+set file('*.csv') into strain_estimate_out
 """
 BNAME=`basename ${stanjson} .json`
 add_straincount.py ${stanjson} ${params.maxstrains} \$BNAME.${rep}.json
 genotypes3C variational adapt engaged=0 eta=1 tol_rel_obj=0.001 output_samples=1000 data file=\$BNAME.${rep}.json output file=${stanjson}.${rep}.csv
 """
 }

-strain_count_models = strain_estimate_out.collect()
+strain_estimate_out.map{ f1 -> [f1.getName().toString().split('.standat.')[0], f1] }.groupTuple().set{ strain_count_models }
+
+//strain_count_models = strain_estimate_out.collect()
 process get_strain_count {
 input:
-file(strain_models) from strain_count_models
+set contig_name,file('*') from strain_count_models
 output:
-file('strain_count.txt') into strain_count_file
+set contig_name,file('strain_count.txt') into strain_count_file
 """
-select_strain_count.R ${strain_models} ${params.abundance_noise_threshold}
+select_strain_count.R *.csv ${params.abundance_noise_threshold}
 """
 }

+standat2.map{ f1 -> [f1.getName().toString().split('.standat.')[0], f1] }.groupTuple().set{ standat3 }
+strain_count_file.join(standat3).set{ scsd }
+
 process infer_contig_haplotypes {
 input:
-file(stanjson) from standat2
-file(esc) from strain_count_file
+set contig_name, file(esc), file(stanjson) from scsd
 each rep from Channel.from(1..params.replicates)
 output:
 file('*.csv') into model_out
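The channel wiring above is the core of the multi-contig change: every per-replicate CSV and every Stan data file carries its contig ID in the file name, so both streams can be re-keyed by splitting on '.standat.', grouped per contig, and joined before haplotype inference. A Python rendering of that keying and grouping, with invented file names; Nextflow's map/groupTuple/join do this natively:

from collections import defaultdict

# Invented example names following the contig_<N>.standat... pattern.
csv_files = ["contig_0.standat.json.1.csv", "contig_0.standat.json.2.csv",
             "contig_1.standat.json.1.csv"]
standat_files = ["contig_0.standat.json", "contig_1.standat.json"]

def contig_key(name):
    # mirrors f1.getName().toString().split('.standat.')[0] in the workflow
    return name.split(".standat.")[0]

# groupTuple: collect each contig's replicate CSVs under its contig key
strain_count_models = defaultdict(list)
for f in csv_files:
    strain_count_models[contig_key(f)].append(f)

# standat3: re-key the Stan data files the same way
standat3 = {contig_key(f): f for f in standat_files}

# join: pair each contig's files (the real workflow joins the selected
# strain count from get_strain_count, not the raw CSVs)
scsd = {k: (strain_count_models[k], standat3[k]) for k in standat3}
assert contig_key("contig_0.standat.json.1.csv") == "contig_0"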
