diff --git a/bin/bam_per_contig.py b/bin/bam_per_contig.py index 1474aa9..ad0be68 100755 --- a/bin/bam_per_contig.py +++ b/bin/bam_per_contig.py @@ -8,15 +8,20 @@ import sys import gfapy import os +import json gfa = gfapy.Gfa.from_file(sys.argv[1]) bam = pysam.AlignmentFile(sys.argv[2], "rb" ) -base_name = sys.argv[3] +contig_map = sys.argv[3] out_bams = {} out_read_count = {} +id = 0 +contig_IDs = {} for seg in gfa.segments: - seg_bam = seg.name+'.bam' + seg_bam = 'contig_'+str(id)+'.bam' + contig_IDs[seg.name]='contig_'+str(id) + id+=1 out_bams[seg.name] = pysam.AlignmentFile(seg_bam, "wb", template=bam) out_read_count[seg.name] = 0 @@ -28,5 +33,8 @@ for contig in out_bams: out_bams[contig].close() if out_read_count[contig] == 0: - del_bam = contig+'.bam' + del_bam = contig_IDs[contig]+'.bam' os.remove(del_bam) + +with open(contig_map, 'w') as json_file: + json.dump(contig_IDs, json_file) diff --git a/bin/get_arcs.py b/bin/get_arcs.py index fd11de2..b7f8cb3 100755 --- a/bin/get_arcs.py +++ b/bin/get_arcs.py @@ -9,11 +9,20 @@ CONTIG_MARGIN = 200 MIN_SINGLECOPY_LENGTH = 20000 -target_contig = "edge_3" target_left = 0 -target_right = 50000 +target_right = 500000000 + gfa = gfapy.Gfa.from_file(sys.argv[1]) vcf_in = pysam.VariantFile(sys.argv[2]) +bam = pysam.AlignmentFile(sys.argv[3], "rb" ) +id_map_file = open(sys.argv[4], 'r') +target_contig_id = sys.argv[5] + +# find the target contig +id_map_json = json.load(id_map_file) +for contig in id_map_json: + if id_map_json[contig] == target_contig_id: + target_contig = contig # get the reference sequences ref_seqs = {} @@ -36,7 +45,6 @@ print("found "+str(len(variants))+" variants") -bam = pysam.AlignmentFile(sys.argv[3], "rb" ) local_var_counts = {} hic_var_counts = {} HIC_MIN_DIST = 800 # reads separated by more than this many bp are treated as Hi-C pairs @@ -144,7 +152,7 @@ dat['subsample'] = 10 -with open('standat.json', 'w') as json_file: +with open(target_contig_id+'.standat.json', 'w') as json_file: json.dump(dat, json_file) #l = 0 diff --git a/bin/strain3C b/bin/strain3C index 3dd3051..52ded18 100755 --- a/bin/strain3C +++ b/bin/strain3C @@ -22,8 +22,9 @@ input: file(bam) from bam output: file('*.bam') into contig_bams mode('flatten') + file('id_map.json') into contig_ID_map """ -bam_per_contig.py ${gfa} ${bam} contig_ +bam_per_contig.py ${gfa} ${bam} id_map.json """ } @@ -33,20 +34,22 @@ input: file(bam) from contig_bams file(vcf) from vcf file(vcf_tbi) from vcf_tbi + file(id_map) from contig_ID_map output: - file("*.json") into standat1 mode('flatten') - file("*.json") into standat2 mode('flatten') + file("contig*.json") into standat mode('flatten') + file("contig*.json") into standat2 mode('flatten') """ -get_arcs.py ${gfa} ${vcf} ${bam} +CONTIG=`basename ${bam} .bam` + get_arcs.py ${gfa} ${vcf} ${bam} ${id_map} \$CONTIG """ } process estimate_strain_counts { input: - file(stanjson) from standat1 + file(stanjson) from standat each rep from Channel.from(1..params.strain_count_replicates) output: - file('*.csv') into strain_estimate_out + set file('*.csv') into strain_estimate_out """ BNAME=`basename ${stanjson} .json` add_straincount.py ${stanjson} ${params.maxstrains} \$BNAME.${rep}.json @@ -54,21 +57,25 @@ genotypes3C variational adapt engaged=0 eta=1 tol_rel_obj=0.001 output_samples=1 """ } -strain_count_models = strain_estimate_out.collect() +strain_estimate_out.map{ f1 -> [f1.getName().toString().split('.standat.')[0], f1] }.groupTuple().set{ strain_count_models } + +//strain_count_models = strain_estimate_out.collect() process get_strain_count { input: - file(strain_models) from strain_count_models + set contig_name,file('*') from strain_count_models output: - file('strain_count.txt') into strain_count_file + set contig_name,file('strain_count.txt') into strain_count_file """ - select_strain_count.R ${strain_models} ${params.abundance_noise_threshold} + select_strain_count.R *.csv ${params.abundance_noise_threshold} """ } +standat2.map{ f1 -> [f1.getName().toString().split('.standat.')[0], f1] }.groupTuple().set{ standat3 } +strain_count_file.join(standat3).set{ scsd } + process infer_contig_haplotypes { input: - file(stanjson) from standat2 - file(esc) from strain_count_file + set contig_name, file(esc), file(stanjson) from scsd each rep from Channel.from(1..params.replicates) output: file('*.csv') into model_out