# NOTE: GitHub web-page scraping artifacts (navigation text and a gutter of
# bare line numbers) were removed from the top of this file. The WDL source
# proper begins at the `version 1.0` statement below.
version 1.0
## Copyright Broad Institute, 2019
##
## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
## from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
## When executed the workflow scatters the HaplotypeCaller tool over a sample
## using an intervals list file. The output file produced will be a
## single gvcf file which can be used by the joint-discovery workflow.
##
## Requirements/expectations :
## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
## - Set of variant calling intervals lists for the scatter, provided in a file
##
## Outputs :
## - One GVCF file and its index
##
## Cromwell version support
## - Successfully tested on v53
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the dockers
## for detailed licensing information pertaining to the included programs.
# WORKFLOW DEFINITION
# Runs GATK4 HaplotypeCaller on a single sample, scattered over a list of
# calling-interval files, and merges the per-interval results into one
# (G)VCF plus index. CRAM inputs are converted to BAM first.
workflow HaplotypeCallerGvcf_GATK4 {
input {
File input_bam
File input_bam_index
File ref_dict
File ref_fasta
File ref_fasta_index
# File containing one intervals-list path per line (consumed by read_lines).
File scattered_calling_intervals_list
# true -> emit a GVCF (-ERC GVCF); false -> emit a plain VCF.
Boolean make_gvcf = true
# true -> also produce HaplotypeCaller's realigned-reads bamout.
Boolean make_bamout = false
String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.2.0.0"
String gatk_path = "/gatk/gatk"
String gitc_docker = "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.7-1603303710"
String samtools_path = "samtools"
}
Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)
# Is the input a CRAM file? Decided purely by file extension: strip everything
# up to and including the last '.' and compare the remainder to "cram".
Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram"
String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam")
String vcf_basename = sample_basename
String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz"
String output_filename = vcf_basename + output_suffix
# We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller.
# If we take the number we are scattering by and reduce by 20 we will have enough disk space
# to account for the fact that the data is quite uneven across the shards.
Int potential_hc_divisor = length(scattered_calling_intervals) - 20
# Clamp to at least 1 so the per-shard disk estimate never divides by zero/negative.
Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1
# CRAM inputs are converted to an indexed BAM before variant calling.
if ( is_cram ) {
call CramToBamTask {
input:
input_cram = input_bam,
sample_name = sample_basename,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
docker = gitc_docker,
samtools_path = samtools_path
}
}
# Call variants in parallel over grouped calling intervals
scatter (interval_file in scattered_calling_intervals) {
# Generate GVCF by interval
call HaplotypeCaller {
input:
# select_first prefers the CRAM-converted BAM when it exists (CramToBamTask
# only runs inside the conditional, so its outputs are optional here).
input_bam = select_first([CramToBamTask.output_bam, input_bam]),
input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
interval_list = interval_file,
output_filename = output_filename,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
hc_scatter = hc_divisor,
make_gvcf = make_gvcf,
make_bamout = make_bamout,
docker = gatk_docker,
gatk_path = gatk_path
}
}
# Merge per-interval GVCFs
call MergeGVCFs {
input:
input_vcfs = HaplotypeCaller.output_vcf,
input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
output_filename = output_filename,
docker = gatk_docker,
gatk_path = gatk_path
}
# Outputs that will be retained when execution is complete
output {
File output_vcf = MergeGVCFs.output_vcf
File output_vcf_index = MergeGVCFs.output_vcf_index
}
}
# TASK DEFINITIONS
# Converts a CRAM to BAM with samtools (decode against the reference, re-encode
# as BAM), then indexes the BAM and renames the index to the <name>.bai
# convention expected downstream.
task CramToBamTask {
input {
# Command parameters
File ref_fasta
File ref_fasta_index
File ref_dict
File input_cram
String sample_name
# Runtime parameters
String docker
Int? machine_mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? preemptible_attempts
String samtools_path
}
# Estimated size of the converted BAM: assumes the CRAM is ~60% of the BAM
# size (BAM ~= CRAM / 0.60) — heuristic; actual ratio varies with the data.
Float output_bam_size = size(input_cram, "GB") / 0.60
Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
# Disk must hold the input CRAM, the converted BAM, and the reference,
# plus 20 GB of slack.
Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20
# The final `mv` renames samtools' default <name>.bam.bai index to <name>.bai,
# matching this task's declared output.
command {
set -e
set -o pipefail
~{samtools_path} view -h -T ~{ref_fasta} ~{input_cram} |
~{samtools_path} view -b -o ~{sample_name}.bam -
~{samtools_path} index -b ~{sample_name}.bam
mv ~{sample_name}.bam.bai ~{sample_name}.bai
}
runtime {
docker: docker
memory: select_first([machine_mem_gb, 15]) + " GB"
disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
preemptible: select_first([preemptible_attempts, 3])
}
output {
File output_bam = "~{sample_name}.bam"
File output_bai = "~{sample_name}.bai"
}
}
# HaplotypeCaller per-sample in GVCF mode
# Runs GATK4 HaplotypeCaller on one sample over a single interval list,
# producing a (G)VCF shard plus index, and optionally a bamout of the
# realigned reads HaplotypeCaller used for calling.
task HaplotypeCaller {
  input {
    # Command parameters
    File input_bam
    File input_bam_index
    File interval_list
    String output_filename
    File ref_dict
    File ref_fasta
    File ref_fasta_index
    Float? contamination        # fraction of contamination to correct for; 0 when unset
    Boolean make_gvcf           # true -> -ERC GVCF plus AS_StandardAnnotation
    Boolean make_bamout         # true -> also emit the -bamout debug BAM
    Int hc_scatter              # scatter width; pro-rates the per-shard disk estimate
    String? gcs_project_for_requester_pays
    String gatk_path
    String? java_options
    # Runtime parameters
    String docker
    Int? mem_gb
    Int? disk_space_gb
    Boolean use_ssd = false
    Int? preemptible_attempts
  }
  String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])
  Int machine_mem_gb = select_first([mem_gb, 7])
  # Leave 1 GB of headroom between the JVM heap and the machine's memory.
  Int command_mem_gb = machine_mem_gb - 1
  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  # Per-shard share of the BAM (+30 GB slack, divided across the scatter),
  # plus the reference and 20 GB of headroom.
  Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20
  # FIX: strip the suffix the workflow actually appends (".g.vcf.gz" /
  # ".vcf.gz"); the previous ".gvcf"/".vcf" suffixes never matched, so the
  # bamout was named "<sample>.g.vcf.gz.bamout.bam". basename() falls back to
  # the full name when the suffix does not match, so other callers are safe.
  String vcf_basename = if make_gvcf then basename(output_filename, ".g.vcf.gz") else basename(output_filename, ".vcf.gz")
  String bamout_arg = if make_bamout then "-bamout ~{vcf_basename}.bamout.bam" else ""
  parameter_meta {
    input_bam: {
      description: "a bam file",
      localization_optional: true
    }
    input_bam_index: {
      description: "an index file for the bam input",
      localization_optional: true
    }
  }
  command {
    set -e
    ~{gatk_path} --java-options "-Xmx~{command_mem_gb}G ~{java_opt}" \
      HaplotypeCaller \
      -R ~{ref_fasta} \
      -I ~{input_bam} \
      -L ~{interval_list} \
      -O ~{output_filename} \
      -contamination ~{default="0" contamination} \
      -G StandardAnnotation -G StandardHCAnnotation ~{true="-G AS_StandardAnnotation" false="" make_gvcf} \
      -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \
      ~{true="-ERC GVCF" false="" make_gvcf} \
      ~{if defined(gcs_project_for_requester_pays) then "--gcs-project-for-requester-pays ~{gcs_project_for_requester_pays}" else ""} \
      ~{bamout_arg}
    # Cromwell doesn't like optional task outputs, so we have to touch this file.
    touch ~{vcf_basename}.bamout.bam
  }
  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }
  output {
    File output_vcf = "~{output_filename}"
    File output_vcf_index = "~{output_filename}.tbi"
    # Empty (touched) file when make_bamout is false.
    File bamout = "~{vcf_basename}.bamout.bam"
  }
}
# Merge GVCFs generated per-interval for the same sample
# Merges the per-interval (G)VCF shards for one sample into a single file
# (plus .tbi index) using Picard MergeVcfs via the GATK launcher.
task MergeGVCFs {
input {
# Command parameters
Array[File] input_vcfs
# Indexes are declared as inputs so they get localized next to their VCFs;
# the command does not reference them directly — presumably MergeVcfs finds
# them by adjacency (TODO confirm).
Array[File] input_vcfs_indexes
String output_filename
String gatk_path
# Runtime parameters
String docker
Int? mem_gb
Int? disk_space_gb
Int? preemptible_attempts
}
Boolean use_ssd = false
Int machine_mem_gb = select_first([mem_gb, 3])
# Leave 1 GB of headroom between the JVM heap and the machine's memory.
Int command_mem_gb = machine_mem_gb - 1
command {
set -e
~{gatk_path} --java-options "-Xmx~{command_mem_gb}G" \
MergeVcfs \
--INPUT ~{sep=' --INPUT ' input_vcfs} \
--OUTPUT ~{output_filename}
}
runtime {
docker: docker
memory: machine_mem_gb + " GB"
disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD"
preemptible: select_first([preemptible_attempts, 3])
}
output {
File output_vcf = "~{output_filename}"
File output_vcf_index = "~{output_filename}.tbi"
}
}