From 5912bcd6f8fed016a0524629cc24570aeda40df2 Mon Sep 17 00:00:00 2001
From: skchronicles
Date: Mon, 8 Jul 2024 16:54:37 -0400
Subject: [PATCH] Adding step to compress and index VCF

---
Notes (text in this section is not part of the commit message):
  Registers cluster resources for `bgzip_index_vcf`, adds the `gatk`
  container image, and declares the list of benchmark configurations.

 config/cluster.json         |  7 +++++++
 config/containers.json      |  3 ++-
 workflow/Snakefile          | 22 ++++++++++++++++++++++
 workflow/rules/germline.smk | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/config/cluster.json b/config/cluster.json
index efbf169..f1c53d8 100644
--- a/config/cluster.json
+++ b/config/cluster.json
@@ -75,5 +75,12 @@
         "partition": "gpu",
         "time": "0-02:00:00",
         "gres": "gpu:a100:4,lscratch:500"
+    },
+    "bgzip_index_vcf": {
+        "threads": "2",
+        "mem": "16g",
+        "partition": "norm",
+        "time": "0-02:00:00",
+        "gres": "lscratch:20"
     }
 }
\ No newline at end of file
diff --git a/config/containers.json b/config/containers.json
index d0cb43f..e9bd396 100644
--- a/config/containers.json
+++ b/config/containers.json
@@ -1,6 +1,7 @@
 {
     "images": {
         "parabricks": "docker://nvcr.io/nvidia/clara/clara-parabricks:4.3.1-1",
-        "bwa": "docker://biocontainers/bwa:v0.7.17_cv1"
+        "bwa": "docker://biocontainers/bwa:v0.7.17_cv1",
+        "gatk": "docker://skchronicles/genome-seek:v0.1.0"
     }
 }
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 88a95ff..bb956a6 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -27,6 +27,21 @@ bwa_index_extensions = [
     ".amb", ".ann", ".bwt", ".pac", ".sa"
 ]
 
+# Benchmark configurations for
+# different GPU, memory, and
+# performance settings
+benchmark_configurations = [
+    "1gpu_normal_memory",
+    "1gpu_normal_memory_optimized",
+    "1gpu_low_memory",
+    "1gpu_high_memory",
+    "2gpu_normal_memory",
+    "2gpu_normal_memory_optimized",
+    "2gpu_low_memory",
+    "4gpu_normal_memory",
+    "4gpu_normal_memory_optimized"
+]
+
 # Read in resource information,
 # containing information about
 # threads, mem, walltimes, etc.
@@ -111,6 +126,13 @@ rule all:
             join(workpath, "gatk_germline", "4gpu_normal_memory_optimized", "{name}", "{name}.bam"),
             name=samples
         ),
+        # bgzip compress and index VCF files,
+        # Output files of `rule bgzip_index_vcf`
+        expand(
+            join(workpath, "gatk_germline", "{benchmark_configuration}", "{name}", "{name}.vcf.gz"),
+            benchmark_configuration=benchmark_configurations,
+            name=samples
+        ),
 
 
 # Import rules
diff --git a/workflow/rules/germline.smk b/workflow/rules/germline.smk
index 580c1cf..8a1f59a 100644
--- a/workflow/rules/germline.smk
+++ b/workflow/rules/germline.smk
@@ -638,3 +638,37 @@ rule parabricks_gatk_germline_4gpu_normal_memory_optimized:
         --fq2bamfast \\
         --keep-tmp
     """
+
+
+rule bgzip_index_vcf:
+    """Data processing step to bgzip-compress a VCF file and index it with tabix.
+    @Inputs:
+        Uncompressed VCF file (one per benchmark configuration and sample)
+    @Outputs:
+        bgzip compressed VCF file (written using the allocated threads),
+        tabix index file (.tbi) of the compressed VCF
+    """
+    input:
+        vcf = join(workpath, "gatk_germline", "{benchmark_configuration}", "{sample}", "{name}.vcf"),
+    output:
+        vcf = join(workpath, "gatk_germline", "{benchmark_configuration}", "{sample}", "{name}.vcf.gz"),
+        idx = join(workpath, "gatk_germline", "{benchmark_configuration}", "{sample}", "{name}.vcf.gz.tbi"),
+    params:
+        # Job submission parameters
+        rname = "bgzip_index_vcf",
+        mem = allocated("mem", "bgzip_index_vcf", cluster),
+        gres = allocated("gres", "bgzip_index_vcf", cluster),
+        time = allocated("time", "bgzip_index_vcf", cluster),
+        partition = allocated("partition", "bgzip_index_vcf", cluster),
+    threads: int(allocated("threads", "bgzip_index_vcf", cluster)),
+    container: config['images']['gatk'],
+    shell: """
+    # Bgzip compress the VCF file with the threads reserved for this job
+    bgzip \\
+        --threads {threads} -c {input.vcf} \\
+    > {output.vcf}
+    # Create a tabix index (.tbi) for the compressed VCF
+    tabix \\
+        -p vcf \\
+        {output.vcf}
+    """
\ No newline at end of file