diff --git a/config/cluster.json b/config/cluster.json
index 000d26e..efbf169 100644
--- a/config/cluster.json
+++ b/config/cluster.json
@@ -61,5 +61,19 @@
         "partition": "gpu",
         "time": "0-04:00:00",
         "gres": "gpu:a100:2,lscratch:500"
+    },
+    "4-gpu_normal-memory": {
+        "threads": "64",
+        "mem": "240g",
+        "partition": "gpu",
+        "time": "0-02:00:00",
+        "gres": "gpu:a100:4,lscratch:500"
+    },
+    "4-gpu_normal-memory_optimized": {
+        "threads": "64",
+        "mem": "240g",
+        "partition": "gpu",
+        "time": "0-02:00:00",
+        "gres": "gpu:a100:4,lscratch:500"
     }
 }
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 1a67eb1..88a95ff 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -96,6 +96,21 @@ rule all:
             join(workpath, "gatk_germline", "2gpu_low_memory", "{name}", "{name}.bam"),
             name=samples
         ),
+        # Run Parabricks germline pipeline with 4 GPUs and normal memory,
+        # Output files of `rule parabricks_gatk_germline_4gpu_normal_memory`
+        # in file `rules/germline.smk`
+        expand(
+            join(workpath, "gatk_germline", "4gpu_normal_memory", "{name}", "{name}.bam"),
+            name=samples
+        ),
+        # Run Parabricks germline pipeline with 4 GPUs and normal memory and
+        # a recommended set of options to gain the best performance/runtimes,
+        # Output files of `rule parabricks_gatk_germline_4gpu_normal_memory_optimized`
+        # in file `rules/germline.smk`
+        expand(
+            join(workpath, "gatk_germline", "4gpu_normal_memory_optimized", "{name}", "{name}.bam"),
+            name=samples
+        ),
 
 
 # Import rules 
diff --git a/workflow/rules/germline.smk b/workflow/rules/germline.smk
index 203f129..02370b4 100644
--- a/workflow/rules/germline.smk
+++ b/workflow/rules/germline.smk
@@ -93,7 +93,8 @@ rule parabricks_gatk_germline_1gpu_normal_memory_optimized:
         r2   = join(workpath,"{name}.R2.fastq.gz"),
     output:
         bam   = join(workpath, "gatk_germline", "1gpu_normal_memory_optimized", "{sample}", "{name}.bam"),
-        gvcf  = join(workpath, "gatk_germline", "1gpu_normal_memory_optimized", "{sample}", "{name}.g.vcf.gz"),
+        # Gzipped VCF cannot be created with --run-partition option
+        gvcf  = join(workpath, "gatk_germline", "1gpu_normal_memory_optimized", "{sample}", "{name}.g.vcf"),
         recal = join(workpath, "gatk_germline", "1gpu_normal_memory_optimized", "{sample}", "{name}.recal"),
     params:
         # Rule specific parameters
@@ -375,7 +376,8 @@ rule parabricks_gatk_germline_2gpu_normal_memory_optimized:
         r2   = join(workpath,"{name}.R2.fastq.gz"),
     output:
         bam   = join(workpath, "gatk_germline", "2gpu_normal_memory_optimized", "{sample}", "{name}.bam"),
-        gvcf  = join(workpath, "gatk_germline", "2gpu_normal_memory_optimized", "{sample}", "{name}.g.vcf.gz"),
+        # Gzipped VCF cannot be created with --run-partition option
+        gvcf  = join(workpath, "gatk_germline", "2gpu_normal_memory_optimized", "{sample}", "{name}.g.vcf"),
         recal = join(workpath, "gatk_germline", "2gpu_normal_memory_optimized", "{sample}", "{name}.recal"),
     params:
         # Rule specific parameters
@@ -498,3 +500,151 @@ rule parabricks_gatk_germline_2gpu_low_memory:
                 --low-memory \\
                 --htvc-low-memory
     """
+
+# Rule utilizing 4 A100 GPUs,
+# NOTE: each A100 node has 4 GPUs
+# so we are allocating an entire 
+# node for this rule
+rule parabricks_gatk_germline_4gpu_normal_memory:
+    """Benchmark the Parabricks GATK germline pipeline on 4 GPUs with a normal allotment
+    of main memory. NOTE: the value passed to --memory-limit is set 12 GB below the
+    allocated memory so that sufficient system memory remains available for the GPUs.
+    Internal testing has shown that the parabricks germline pipeline tends to use more
+    than the allocated memory (even with the --memory-limit option).
+    @Inputs:
+        GIAB sample paired-end FASTQ files, R1 and R2 (scatter-per-sample).
+    @Outputs:
+        BAM file,
+        GVCF file,
+        Recal table
+    """
+    input:
+        idxs = expand(join(workpath, "refs", genome + "{ext}"), ext=bwa_index_extensions),
+        lnk  = join(workpath, "refs", genome),
+        r1   = join(workpath,"{name}.R1.fastq.gz"),
+        r2   = join(workpath,"{name}.R2.fastq.gz"),
+    output:
+        bam   = join(workpath, "gatk_germline", "4gpu_normal_memory", "{sample}", "{name}.bam"),
+        gvcf  = join(workpath, "gatk_germline", "4gpu_normal_memory", "{sample}", "{name}.g.vcf.gz"),
+        recal = join(workpath, "gatk_germline", "4gpu_normal_memory", "{sample}", "{name}.recal"),
+    params:
+        # Rule specific parameters
+        sample = "{name}",
+        # Job submission parameters
+        rname = "pb_germline_4gpu_normal_memory",
+        mem   = allocated("mem",  "4-gpu_normal-memory", cluster),
+        gres  = allocated("gres", "4-gpu_normal-memory", cluster),
+        time  = allocated("time", "4-gpu_normal-memory", cluster),
+        partition = allocated("partition", "4-gpu_normal-memory", cluster),
+        # Singularity options
+        bindpaths = ','.join(bindpaths),
+        tmpdir    = tmpdir,
+        sif       = config['images']['parabricks'],
+        # Parabricks options; memory limit = allocated GB minus 12 GB of headroom
+        RUNNING_MEMORY_GB = int(
+            allocated("mem", "4-gpu_normal-memory", cluster).lower().rstrip("g")
+        ) - 12,
+        KNOWN_INDELS_1 = config['references']['GATK_KNOWN_INDELS'],
+        KNOWN_INDELS_2 = config['references']['OTHER_KNOWN_INDELS'],
+    threads: int(allocated("threads", "4-gpu_normal-memory", cluster))
+    shell: """
+    # Run Parabricks germline pipeline with
+    # default acceleration options
+    singularity exec \\
+        -c \\
+        --nv  \\
+        -B {params.bindpaths},{params.tmpdir}:/tmp \\
+        {params.sif} \\
+             pbrun germline \\
+                --ref {input.lnk} \\
+                --in-fq {input.r1} {input.r2} "@RG\\tID:{params.sample}\\tSM:{params.sample}\\tPL:illumina\\tLB:{params.sample}\\tPU:{params.sample}\\tCN:ncbr\\tDS:wgs" \\
+                --knownSites {params.KNOWN_INDELS_1} \\
+                --knownSites {params.KNOWN_INDELS_2} \\
+                --out-bam {output.bam} \\
+                --out-variants {output.gvcf} \\
+                --out-recal-file {output.recal} \\
+                --gvcf \\
+                --bwa-options="-M" \\
+                --monitor-usage \\
+                --memory-limit {params.RUNNING_MEMORY_GB} \\
+                --tmp-dir /tmp
+    """
+
+
+rule parabricks_gatk_germline_4gpu_normal_memory_optimized:
+    """Benchmark the Parabricks GATK germline pipeline on 4 GPUs with a normal allotment
+    of main memory, using the recommended set of options to gain the best performance.
+    NOTE: the value passed to --memory-limit is set 12 GB below the allocated memory so
+    that sufficient system memory remains available for the GPUs. Internal testing has
+    shown that the parabricks germline pipeline tends to use more than the allocated
+    memory (even with the --memory-limit option).
+    @Inputs:
+        GIAB sample paired-end FASTQ files, R1 and R2 (scatter-per-sample).
+    @Outputs:
+        BAM file,
+        GVCF file (uncompressed),
+        Recal table
+    """
+    input:
+        idxs = expand(join(workpath, "refs", genome + "{ext}"), ext=bwa_index_extensions),
+        lnk  = join(workpath, "refs", genome),
+        r1   = join(workpath,"{name}.R1.fastq.gz"),
+        r2   = join(workpath,"{name}.R2.fastq.gz"),
+    output:
+        bam   = join(workpath, "gatk_germline", "4gpu_normal_memory_optimized", "{sample}", "{name}.bam"),
+        # Gzipped VCF cannot be created with --run-partition option
+        gvcf  = join(workpath, "gatk_germline", "4gpu_normal_memory_optimized", "{sample}", "{name}.g.vcf"),
+        recal = join(workpath, "gatk_germline", "4gpu_normal_memory_optimized", "{sample}", "{name}.recal"),
+    params:
+        # Rule specific parameters
+        sample = "{name}",
+        # Job submission parameters
+        rname = "pb_germline_4gpu_normal_memory_optimized",
+        mem   = allocated("mem",  "4-gpu_normal-memory_optimized", cluster),
+        gres  = allocated("gres", "4-gpu_normal-memory_optimized", cluster),
+        time  = allocated("time", "4-gpu_normal-memory_optimized", cluster),
+        partition = allocated("partition", "4-gpu_normal-memory_optimized", cluster),
+        # Singularity options
+        bindpaths = ','.join(bindpaths),
+        tmpdir    = tmpdir,
+        sif       = config['images']['parabricks'],
+        # Parabricks options; memory limit = allocated GB minus 12 GB of headroom
+        RUNNING_MEMORY_GB = int(
+            allocated("mem", "4-gpu_normal-memory_optimized", cluster).lower().rstrip("g")
+        ) - 12,
+        KNOWN_INDELS_1 = config['references']['GATK_KNOWN_INDELS'],
+        KNOWN_INDELS_2 = config['references']['OTHER_KNOWN_INDELS'],
+    threads: int(allocated("threads", "4-gpu_normal-memory_optimized", cluster))
+    shell: """
+    # Run Parabricks germline pipeline with
+    # default acceleration options and the 
+    # recommended set of options for best
+    # performance
+    singularity exec \\
+        -c \\
+        --nv  \\
+        --env TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=268435456 \\
+        -B {params.bindpaths},{params.tmpdir}:/tmp \\
+        {params.sif} \\
+             pbrun germline \\
+                --ref {input.lnk} \\
+                --in-fq {input.r1} {input.r2} "@RG\\tID:{params.sample}\\tSM:{params.sample}\\tPL:illumina\\tLB:{params.sample}\\tPU:{params.sample}\\tCN:ncbr\\tDS:wgs" \\
+                --knownSites {params.KNOWN_INDELS_1} \\
+                --knownSites {params.KNOWN_INDELS_2} \\
+                --out-bam {output.bam} \\
+                --out-variants {output.gvcf} \\
+                --out-recal-file {output.recal} \\
+                --gvcf \\
+                --bwa-options="-M" \\
+                --monitor-usage \\
+                --memory-limit {params.RUNNING_MEMORY_GB} \\
+                --tmp-dir /tmp \\
+                --num-cpu-threads-per-stage {threads} \\
+                --bwa-cpu-thread-pool {threads} \\
+                --run-partition \\
+                --read-from-tmp-dir \\
+                --gpusort \\
+                --gpuwrite \\
+                --fq2bamfast \\
+                --keep-tmp
+    """