Skip to content

Commit

Permalink
Test Parabricks DeepVariant germline workflow with best performance o…
Browse files Browse the repository at this point in the history
…ptions.
  • Loading branch information
skchronicles committed Jul 10, 2024
1 parent 5912bcd commit c8b3b05
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 12 deletions.
14 changes: 14 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ rule all:
join(workpath, "refs", genome + "{ext}"),
ext=bwa_index_extensions
),
# GATK germline pipeline output files,
# Run Parabricks germline pipeline with 1 GPU and normal memory,
# Output files of `rule parabricks_gatk_germline_1gpu_normal_memory`
# in file `rules/germline.smk`
Expand Down Expand Up @@ -126,13 +127,26 @@ rule all:
join(workpath, "gatk_germline", "4gpu_normal_memory_optimized", "{name}", "{name}.bam"),
name=samples
),
# DeepVariant germline pipeline output files,
# Run Parabricks DeepVariant germline pipeline with 1 GPU and normal memory
# and the recommended set of options to gain the best performance/runtimes,
# Output files of `rule parabricks_deepvariant_germline_1gpu_normal_memory_optimized`
# in file `rules/germline.smk`
expand(
join(workpath, "deepvariant_germline", "1gpu_normal_memory_optimized", "{name}", "{name}.bam"),
name=samples
),
# bgzip compress and index VCF files,
# Output files of `rule bgzip_index_vcf`
expand(
join(workpath, "gatk_germline", "{benchmark_configuration}", "{name}", "{name}.vcf.gz"),
benchmark_configuration=benchmark_configurations,
name=samples
),
expand(
join(workpath, "deepvariant_germline", "1gpu_normal_memory_optimized", "{name}", "{name}.vcf.gz"),
name=samples
),


# Import rules
Expand Down
100 changes: 88 additions & 12 deletions workflow/rules/germline.smk
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ rule parabricks_gatk_germline_1gpu_normal_memory:
GIAB Sample fastq file (scatter-per-sample).
@Outputs:
BAM file,
GVCF file,
VCF file,
Recal table
"""
input:
Expand Down Expand Up @@ -82,7 +82,7 @@ rule parabricks_gatk_germline_1gpu_normal_memory_optimized:
GIAB Sample fastq file (scatter-per-sample).
@Outputs:
BAM file,
GVCF file,
VCF file,
Recal table
"""
input:
Expand Down Expand Up @@ -157,7 +157,7 @@ rule parabricks_gatk_germline_1gpu_low_memory:
GIAB Sample fastq file (scatter-per-sample).
@Outputs:
BAM file,
GVCF file,
VCF file,
Recal table
"""
input:
Expand Down Expand Up @@ -227,7 +227,7 @@ rule parabricks_gatk_germline_1gpu_high_memory:
GIAB Sample fastq file (scatter-per-sample).
@Outputs:
BAM file,
GVCF file,
VCF file,
Recal table
"""
input:
Expand Down Expand Up @@ -282,6 +282,82 @@ rule parabricks_gatk_germline_1gpu_high_memory:
"""


rule parabricks_deepvariant_germline_1gpu_normal_memory_optimized:
    """Benchmarking Parabricks DeepVariant Germline pipeline with 1 GPU and a normal allotment
    of main/system memory using the recommended set of options for the best performance.
    NOTE: The limit memory option needs to be toned down to allow for sufficient system memory to
    be available for the GPU. Internal testing has shown that parabricks germline pipeline tends
    to use more than the allocated memory (even with the --memory-limit option).
    @Inputs:
        GIAB Sample paired-end fastq files, R1 and R2 (scatter-per-sample).
    @Outputs:
        BAM file,
        VCF file,
        Recal table
    """
    input:
        # BWA index files and the linked reference genome; the index files
        # are listed so they exist before the job starts, only `lnk` is
        # passed to pbrun.
        idxs = expand(join(workpath, "refs", genome + "{ext}"), ext=bwa_index_extensions),
        lnk  = join(workpath, "refs", genome),
        # Paired-end reads for the sample
        r1   = join(workpath,"{name}.R1.fastq.gz"),
        r2   = join(workpath,"{name}.R2.fastq.gz"),
    output:
        # NOTE(review): the `{sample}` directory wildcard is not used by any
        # input; the targets requested in `rule all` set it to the same
        # value as `{name}`.
        bam   = join(workpath, "deepvariant_germline", "1gpu_normal_memory_optimized", "{sample}", "{name}.bam"),
        vcf   = join(workpath, "deepvariant_germline", "1gpu_normal_memory_optimized", "{sample}", "{name}.vcf"),
        recal = join(workpath, "deepvariant_germline", "1gpu_normal_memory_optimized", "{sample}", "{name}.recal"),
    params:
        # Rule specific parameters,
        # sample name is used to fill in the read group
        sample = "{name}",
        # Job submission parameters
        rname = "pb_dv_germline_1gpu_normal_memory_optimized",
        mem   = allocated("mem", "1-gpu_normal-memory_optimized", cluster),
        gres  = allocated("gres", "1-gpu_normal-memory_optimized", cluster),
        time  = allocated("time", "1-gpu_normal-memory_optimized", cluster),
        partition = allocated("partition", "1-gpu_normal-memory_optimized", cluster),
        # Singularity options
        bindpaths = ','.join(bindpaths),
        tmpdir = tmpdir,
        sif = config['images']['parabricks'],
        # Parabricks options,
        # --memory-limit is set to half of the allocated job memory
        # (see NOTE above); the trailing "g"/"G" unit suffix is stripped
        # from the allocation before it is converted to an integer.
        RUNNING_MEMORY_GB = int(
            int(allocated("mem", "1-gpu_normal-memory_optimized", cluster).lower().rstrip("g")) / 2
        ),
        KNOWN_INDELS_1 = config['references']['GATK_KNOWN_INDELS'],
        KNOWN_INDELS_2 = config['references']['OTHER_KNOWN_INDELS'],
    threads: int(allocated("threads", "1-gpu_normal-memory_optimized", cluster))
    shell: """
    # Run Parabricks germline pipeline with
    # default acceleration options and the
    # recommended set of options for best
    # performance
    singularity exec \\
        -c \\
        --nv  \\
        --env TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=268435456 \\
        -B {params.bindpaths},{params.tmpdir}:/tmp \\
        {params.sif} \\
            pbrun deepvariant_germline \\
                --mode shortread \\
                --ref {input.lnk} \\
                --in-fq {input.r1} {input.r2} "@RG\\tID:{params.sample}\\tSM:{params.sample}\\tPL:illumina\\tLB:{params.sample}\\tPU:{params.sample}\\tCN:ncbr\\tDS:wgs" \\
                --knownSites {params.KNOWN_INDELS_1} \\
                --knownSites {params.KNOWN_INDELS_2} \\
                --out-bam {output.bam} \\
                --out-variants {output.vcf} \\
                --out-recal-file {output.recal} \\
                --bwa-options="-M" \\
                --monitor-usage \\
                --memory-limit {params.RUNNING_MEMORY_GB} \\
                --tmp-dir /tmp \\
                --num-cpu-threads-per-stage {threads} \\
                --bwa-cpu-thread-pool {threads} \\
                --num-streams-per-gpu 4 \\
                --gpusort \\
                --gpuwrite \\
                --fq2bamfast \\
                --keep-tmp
    """


# Rules utilizing more than one A100 GPU,
# This set of rules use 2 GPUs with different
# memory allocations and performance options
Expand All @@ -295,7 +371,7 @@ rule parabricks_gatk_germline_2gpu_normal_memory:
GIAB Sample fastq file (scatter-per-sample).
@Outputs:
BAM file,
GVCF file,
VCF file,
Recal table
"""
input:
Expand Down Expand Up @@ -361,7 +437,7 @@ rule parabricks_gatk_germline_2gpu_normal_memory_optimized:
GIAB Sample fastq file (scatter-per-sample).
@Outputs:
BAM file,
GVCF file,
VCF file,
Recal table
"""
input:
Expand Down Expand Up @@ -437,7 +513,7 @@ rule parabricks_gatk_germline_2gpu_low_memory:
GIAB Sample fastq file (scatter-per-sample).
@Outputs:
BAM file,
GVCF file,
VCF file,
Recal table
"""
input:
Expand Down Expand Up @@ -508,7 +584,7 @@ rule parabricks_gatk_germline_4gpu_normal_memory:
GIAB Sample fastq file (scatter-per-sample).
@Outputs:
BAM file,
GVCF file,
VCF file,
Recal table
"""
input:
Expand Down Expand Up @@ -574,7 +650,7 @@ rule parabricks_gatk_germline_4gpu_normal_memory_optimized:
GIAB Sample fastq file (scatter-per-sample).
@Outputs:
BAM file,
GVCF file,
VCF file,
Recal table
"""
input:
Expand Down Expand Up @@ -649,10 +725,10 @@ rule bgzip_index_vcf:
tabix index file
"""
input:
vcf = join(workpath, "gatk_germline", "{benchmark_configuration}", "{sample}", "{name}.vcf"),
vcf = join(workpath, "{tested_tool}", "{benchmark_configuration}", "{sample}", "{name}.vcf"),
output:
vcf = join(workpath, "gatk_germline", "{benchmark_configuration}", "{sample}", "{name}.vcf.gz"),
idx = join(workpath, "gatk_germline", "{benchmark_configuration}", "{sample}", "{name}.vcf.gz.tbi"),
vcf = join(workpath, "{tested_tool}", "{benchmark_configuration}", "{sample}", "{name}.vcf.gz"),
idx = join(workpath, "{tested_tool}", "{benchmark_configuration}", "{sample}", "{name}.vcf.gz.tbi"),
params:
# Job submission parameters
rname = "bgzip_index_vcf",
Expand Down

0 comments on commit c8b3b05

Please sign in to comment.