[AD] workflow changes to handle multiple contigs
koadman authored and koadman committed Aug 30, 2020
1 parent f1a4afc commit ddfa754
Showing 3 changed files with 42 additions and 19 deletions.
14 changes: 11 additions & 3 deletions bin/bam_per_contig.py
@@ -8,15 +8,20 @@
 import sys
 import gfapy
 import os
+import json

 gfa = gfapy.Gfa.from_file(sys.argv[1])
 bam = pysam.AlignmentFile(sys.argv[2], "rb" )
-base_name = sys.argv[3]
+contig_map = sys.argv[3]

 out_bams = {}
 out_read_count = {}
+id = 0
+contig_IDs = {}
 for seg in gfa.segments:
-    seg_bam = seg.name+'.bam'
+    seg_bam = 'contig_'+str(id)+'.bam'
+    contig_IDs[seg.name]='contig_'+str(id)
+    id+=1
     out_bams[seg.name] = pysam.AlignmentFile(seg_bam, "wb", template=bam)
     out_read_count[seg.name] = 0

@@ -28,5 +33,8 @@
 for contig in out_bams:
     out_bams[contig].close()
     if out_read_count[contig] == 0:
-        del_bam = contig+'.bam'
+        del_bam = contig_IDs[contig]+'.bam'
         os.remove(del_bam)
+
+with open(contig_map, 'w') as json_file:
+    json.dump(contig_IDs, json_file)
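
The map written above pairs each GFA segment name with a stable contig_<N> ID, so per-segment BAMs get predictable names and downstream steps can recover the original segment name. A minimal sketch of that round trip; the segment names (edge_1, edge_2) are hypothetical and only the file layout mirrors the diff:

import json

# Illustrative contents of id_map.json; segment names are made up.
contig_IDs = {"edge_1": "contig_0", "edge_2": "contig_1"}
with open("id_map.json", "w") as json_file:
    json.dump(contig_IDs, json_file)

# A downstream step (see get_arcs.py below) inverts the map to recover the
# GFA segment name from a contig_N ID.
with open("id_map.json") as json_file:
    id_map = json.load(json_file)
inverse = {v: k for k, v in id_map.items()}
assert inverse["contig_0"] == "edge_1"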
16 changes: 12 additions & 4 deletions bin/get_arcs.py
@@ -9,11 +9,20 @@

 CONTIG_MARGIN = 200
 MIN_SINGLECOPY_LENGTH = 20000
-target_contig = "edge_3"
 target_left = 0
-target_right = 50000
+target_right = 500000000

 gfa = gfapy.Gfa.from_file(sys.argv[1])
 vcf_in = pysam.VariantFile(sys.argv[2])
+bam = pysam.AlignmentFile(sys.argv[3], "rb" )
+id_map_file = open(sys.argv[4], 'r')
+target_contig_id = sys.argv[5]
+
+# find the target contig
+id_map_json = json.load(id_map_file)
+for contig in id_map_json:
+    if id_map_json[contig] == target_contig_id:
+        target_contig = contig
+
 # get the reference sequences
 ref_seqs = {}
@@ -36,7 +45,6 @@

print("found "+str(len(variants))+" variants")

bam = pysam.AlignmentFile(sys.argv[3], "rb" )
local_var_counts = {}
hic_var_counts = {}
HIC_MIN_DIST = 800 # reads separated by more than this many bp are treated as Hi-C pairs
@@ -144,7 +152,7 @@

 dat['subsample'] = 10

-with open('standat.json', 'w') as json_file:
+with open(target_contig_id+'.standat.json', 'w') as json_file:
     json.dump(dat, json_file)

 #l = 0
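The lookup loop added above resolves a contig_<N> ID back to its GFA segment name, and the Stan data now goes to a per-contig file. A compact sketch of the same logic, assuming the id_map.json format from bam_per_contig.py; the file contents and contig ID are illustrative:

import json

target_contig_id = "contig_3"  # hypothetical value of sys.argv[5]
with open("id_map.json") as id_map_file:
    id_map_json = json.load(id_map_file)

# equivalent of the lookup loop in the diff, as a single expression
target_contig = next(name for name, cid in id_map_json.items()
                     if cid == target_contig_id)

dat = {"subsample": 10}  # stand-in for the real Stan data dict
with open(target_contig_id + ".standat.json", "w") as json_file:
    json.dump(dat, json_file)  # e.g. contig_3.standat.json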
31 changes: 19 additions & 12 deletions bin/strain3C
@@ -22,8 +22,9 @@ input:
 file(bam) from bam
 output:
 file('*.bam') into contig_bams mode('flatten')
+file('id_map.json') into contig_ID_map
 """
-bam_per_contig.py ${gfa} ${bam} contig_
+bam_per_contig.py ${gfa} ${bam} id_map.json
 """
 }

@@ -33,42 +34,48 @@
 file(bam) from contig_bams
 file(vcf) from vcf
 file(vcf_tbi) from vcf_tbi
+file(id_map) from contig_ID_map
 output:
-file("*.json") into standat1 mode('flatten')
-file("*.json") into standat2 mode('flatten')
+file("contig*.json") into standat mode('flatten')
+file("contig*.json") into standat2 mode('flatten')
 """
-get_arcs.py ${gfa} ${vcf} ${bam}
+CONTIG=`basename ${bam} .bam`
+get_arcs.py ${gfa} ${vcf} ${bam} ${id_map} \$CONTIG
 """
 }

 process estimate_strain_counts {
 input:
-file(stanjson) from standat1
+file(stanjson) from standat
 each rep from Channel.from(1..params.strain_count_replicates)
 output:
-file('*.csv') into strain_estimate_out
+set file('*.csv') into strain_estimate_out
 """
 BNAME=`basename ${stanjson} .json`
 add_straincount.py ${stanjson} ${params.maxstrains} \$BNAME.${rep}.json
 genotypes3C variational adapt engaged=0 eta=1 tol_rel_obj=0.001 output_samples=1000 data file=\$BNAME.${rep}.json output file=${stanjson}.${rep}.csv
 """
 }

-strain_count_models = strain_estimate_out.collect()
+strain_estimate_out.map{ f1 -> [f1.getName().toString().split('.standat.')[0], f1] }.groupTuple().set{ strain_count_models }
+
+//strain_count_models = strain_estimate_out.collect()
 process get_strain_count {
 input:
-file(strain_models) from strain_count_models
+set contig_name,file('*') from strain_count_models
 output:
-file('strain_count.txt') into strain_count_file
+set contig_name,file('strain_count.txt') into strain_count_file
 """
-select_strain_count.R ${strain_models} ${params.abundance_noise_threshold}
+select_strain_count.R *.csv ${params.abundance_noise_threshold}
 """
 }

+standat2.map{ f1 -> [f1.getName().toString().split('.standat.')[0], f1] }.groupTuple().set{ standat3 }
+strain_count_file.join(standat3).set{ scsd }
+
 process infer_contig_haplotypes {
 input:
-file(stanjson) from standat2
-file(esc) from strain_count_file
+set contig_name, file(esc), file(stanjson) from scsd
 each rep from Channel.from(1..params.replicates)
 output:
 file('*.csv') into model_out
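The channel wiring above is the core of the multi-contig change: every per-replicate CSV and every Stan data file carries its contig ID in the file name, so both streams can be re-keyed by splitting on '.standat.', grouped per contig, and joined before haplotype inference. A Python rendering of that keying and grouping, with invented file names; Nextflow's map/groupTuple/join do this natively:

from collections import defaultdict

# Invented example names following the contig_<N>.standat... pattern.
csv_files = ["contig_0.standat.json.1.csv", "contig_0.standat.json.2.csv",
             "contig_1.standat.json.1.csv"]
standat_files = ["contig_0.standat.json", "contig_1.standat.json"]

def contig_key(name):
    # mirrors f1.getName().toString().split('.standat.')[0] in the workflow
    return name.split(".standat.")[0]

# groupTuple: collect each contig's replicate CSVs under its contig key
strain_count_models = defaultdict(list)
for f in csv_files:
    strain_count_models[contig_key(f)].append(f)

# standat3: re-key the Stan data files the same way
standat3 = {contig_key(f): f for f in standat_files}

# join: pair each contig's files (the real workflow joins the selected
# strain count from get_strain_count, not the raw CSVs)
scsd = {k: (strain_count_models[k], standat3[k]) for k in standat3}
assert contig_key("contig_0.standat.json.1.csv") == "contig_0"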
