Merge pull request #439 from broadinstitute/fnegrete_test
Add workflow for QIIME2 classification of 16S reads
golu099 authored Dec 21, 2022
2 parents 5c0ebd7 + 7b809b0 commit 9605025
Showing 7 changed files with 382 additions and 0 deletions.
10 changes: 10 additions & 0 deletions .dockstore.yml
@@ -349,3 +349,13 @@ workflows:
primaryDescriptorPath: /pipes/WDL/workflows/trimal.wdl
testParameterFiles:
- empty.json
- name: amplicon16S_analysis
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/amplicon16S_analysis.wdl
testParameterFiles:
- empty.json
- name: qiime_import_bam
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/qiime_import_bam.wdl
testParameterFiles:
- empty.json
295 changes: 295 additions & 0 deletions pipes/WDL/tasks/tasks_16S_amplicon.wdl
@@ -0,0 +1,295 @@
version 1.0

task qiime_import_from_bam {
meta {
description: "Parsing demultiplexed fastq BAM files into qiime readable files."
}
input {
File reads_bam
String sample_name
Int memory_mb = 2000
Int cpu = 1
Int disk_size_gb = ceil(2*size(reads_bam, "GiB")) + 5
String docker = "quay.io/broadinstitute/qiime2:conda"
}
parameter_meta {
reads_bam: {description: "Input BAM file"}
}

command <<<
set -ex -o pipefail
# Part 1A | BAM -> FASTQ (samtools)
samtools fastq -1 $(pwd)/R1.fastq.gz -2 $(pwd)/R2.fastq.gz -0 /dev/null ~{reads_bam}
# Replace underscores with dashes in the sample name (underscores can cause problems in QIIME 2 sample IDs)
NEWSAMPLENAME=$(echo "~{sample_name}" | perl -lape 's/[_]/-/g')
# Record the normalized sample name for downstream bookkeeping
echo ${NEWSAMPLENAME} > NEWSAMPLENAME.txt
# Write a tab-separated manifest.tsv with columns: sample-id, forward read path, reverse read path
echo -e "sample-id\tforward-absolute-filepath\treverse-absolute-filepath" > manifest.tsv
echo -e "$NEWSAMPLENAME\t$(pwd)/R1.fastq.gz\t$(pwd)/R2.fastq.gz" >> manifest.tsv

# FASTQ -> QZA (qiime tools import)
qiime tools import \
--type 'SampleData[PairedEndSequencesWithQuality]' \
--input-path manifest.tsv \
--input-format PairedEndFastqManifestPhred33V2 \
--output-path "~{sample_name}.qza"
>>>

output {
File reads_qza = "~{sample_name}.qza"
String samplename_master_sheet = read_string("NEWSAMPLENAME.txt")
}
runtime {
docker: docker
memory: "${memory_mb} MiB"
cpu: cpu
disk: disk_size_gb + " GB"
disks: "local-disk " + disk_size_gb + " HDD"
}
}

#Part 1 | Step 2: cutadapt: trim adapter and primer sequences from the reads
task trim_reads {

meta {
description:"Removing adapter sequences, primers, and other unwanted sequence from sequence data."
}

input {
File reads_qza

String qza_basename = basename(reads_qza, '.qza')
String forward_adapter = "CTGCTGCCTCCCGTAGGAGT"
String reverse_adapter = "AGAGTTTGATCCTGGCTCAG"
Int min_length = 1
Boolean keep_untrimmed_reads = false
Int memory_mb = 2000
Int cpu = 1
Int disk_size_gb = ceil(2*size(reads_qza, "GiB")) + 5
String docker = "quay.io/broadinstitute/qiime2:conda"
}

command <<<
set -ex -o pipefail
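# Note: keep_untrimmed_reads=true maps to --p-no-discard-untrimmed (reads without a
# matching primer are kept); false maps to --p-discard-untrimmed (they are dropped)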
qiime cutadapt trim-paired \
--i-demultiplexed-sequences "~{reads_qza}" \
--p-front-f "~{forward_adapter}" \
--p-front-r "~{reverse_adapter}" \
~{"--p-minimum-length " + min_length} \
~{true='--p-no-discard-untrimmed' false='--p-discard-untrimmed' keep_untrimmed_reads} \
--o-trimmed-sequences "~{qza_basename}_trimmed.qza"

# Summarize and visualize the trimmed reads
qiime demux summarize \
--i-data "~{qza_basename}_trimmed.qza" \
--o-visualization "~{qza_basename}_trim_summary.qzv"
>>>

output {
# trimmed paired-end reads, input to vsearch join-pairs
File trimmed_reads_qza = "~{qza_basename}_trimmed.qza"
File trimmed_visualization = "~{qza_basename}_trim_summary.qzv"
}

runtime {
docker: docker
memory: "${memory_mb} MiB"
cpu: cpu
disk: disk_size_gb + " GB"
disks: "local-disk " + disk_size_gb + " HDD"
}
}

#Part 1 | Step 3: vsearch: merge paired-end reads
task join_paired_ends {
meta {
description: "Join paired-end sequence reads using vseach's merge_pairs function."
}
input {
# Trimmed paired-end reads to be merged
File trimmed_reads_qza
String reads_basename = basename(trimmed_reads_qza, '.qza')
Int memory_mb = 2000
Int cpu = 1
Int disk_size_gb = ceil(2*size(trimmed_reads_qza, "GiB")) + 5
String docker = "quay.io/broadinstitute/qiime2:conda"
}

command <<<
set -ex -o pipefail
qiime vsearch join-pairs \
--i-demultiplexed-seqs ~{trimmed_reads_qza} \
--o-joined-sequences "~{reads_basename}_joined.qza"

qiime demux summarize \
--i-data "~{reads_basename}_joined.qza" \
--o-visualization "~{reads_basename}_visualization.qzv"
>>>
output {
File joined_end_reads_qza = "~{reads_basename}_joined.qza"
File joined_end_visualization = "~{reads_basename}_visualization.qzv"
}
runtime {
docker: docker
memory: "${memory_mb} MiB"
cpu: cpu
disk: disk_size_gb + " GB"
disks: "local-disk " + disk_size_gb + " HDD"
}
}

task deblur {

meta {
description: "Perform sequence quality control for Illumina data using the Deblur workflow with a 16S reference as a positive filter."
}
input {
File joined_end_reads_qza
String joined_end_basename = basename(joined_end_reads_qza, '.qza')
Int trim_length_var = 300
Int memory_mb = 2000
Int cpu = 1
Int disk_size_gb = ceil(2*size(joined_end_reads_qza, "GiB")) + 5
String docker = "quay.io/broadinstitute/qiime2:conda"
}
command <<<
set -ex -o pipefail
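# Deblur trims every read to trim_length_var and discards reads shorter than that,
# so the trim length should not exceed the typical joined-read length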

qiime deblur denoise-16S \
--i-demultiplexed-seqs ~{joined_end_reads_qza} \
~{"--p-trim-length " + trim_length_var} \
--p-sample-stats \
--o-representative-sequences "~{joined_end_basename}_rep_seqs.qza" \
--o-table "~{joined_end_basename}_table.qza" \
--o-stats "~{joined_end_basename}_stats.qza"

# Summarize the feature table (number of features per sample)
qiime feature-table summarize \
--i-table "~{joined_end_basename}_table.qza" \
--o-visualization "~{joined_end_basename}_table.qzv"
#Generate visualization of deblur stats
qiime deblur visualize-stats \
--i-deblur-stats "~{joined_end_basename}_stats.qza" \
--o-visualization "~{joined_end_basename}_stats.qzv"
>>>
output {
File representative_seqs_qza = "~{joined_end_basename}_rep_seqs.qza"
File representative_table_qza = "~{joined_end_basename}_table.qza"
File feature_table = "~{joined_end_basename}_table.qzv"
File visualize_stats = "~{joined_end_basename}_stats.qzv"

}
runtime {
docker: docker
memory: "${memory_mb} MiB"
cpu: cpu
disk: disk_size_gb + " GB"
disks: "local-disk " + disk_size_gb + " HDD"
}
}
task train_classifier {
meta {
description: "Train a naive Bayes classifier to classify V1-2 amplicon sequences."
}
input {
File otu_ref
File taxonomy_ref
String forward_adapter
String reverse_adapter
Int min_length = 100
Int max_length = 500
String otu_basename = basename(otu_ref, '.qza')
Int memory_mb = 2000
Int cpu = 1
Int disk_size_gb = ceil(2*size(otu_ref, "GiB")) + 5
String docker = "quay.io/broadinstitute/qiime2:conda"
}
command <<<
set -ex -o pipefail
CONDA_ENV_NAME=$(conda info --envs -q | awk -F" " '/qiime.*/{ print $1 }')
conda activate ${CONDA_ENV_NAME}
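# NOTE: this assumes conda is initialized in the image's shell; if activation fails,
# "source activate ${CONDA_ENV_NAME}" may be needed instead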

qiime tools import \
--type 'FeatureData[Sequence]' \
--input-path ~{otu_ref} \
--output-path "~{otu_basename}_seqs.qza"

qiime tools import \
--type 'FeatureData[Taxonomy]' \
--input-format HeaderlessTSVTaxonomyFormat \
--input-path ~{taxonomy_ref} \
--output-path "~{otu_basename}_tax.qza"

qiime feature-classifier extract-reads \
--i-sequences "~{otu_basename}_seqs.qza" \
--p-f-primer "~{forward_adapter}" \
--p-r-primer "~{reverse_adapter}" \
~{"--p-min-length " + min_length} \
~{"--p-max-length " + max_length} \
--o-reads "~{otu_basename}_v1-2-ref-seqs.qza"

qiime feature-classifier fit-classifier-naive-bayes \
--i-reference-reads "~{otu_basename}_v1-2-ref-seqs.qza" \
--i-reference-taxonomy "~{otu_basename}_tax.qza" \
--o-classifier "~{otu_basename}_v1-2-classifier.qza"
>>>
output {
File trained_classifier = "~{otu_basename}_v1-2-classifier.qza"
}
runtime {
docker: docker
memory: "${memory_mb} MiB"
cpu: cpu
disk: disk_size_gb + " GB"
disks: "local-disk " + disk_size_gb + " HDD"
}
}
task tax_analysis {
meta {
description: "Protocol describes performing a taxonomic classification with a naive bayes classifier that has been trained on the V1-2 regions amplified by our primers."
}
input {
File trained_classifier
File representative_seqs_qza
File representative_table_qza
String basename = basename(trained_classifier, '.qza')
Int memory_mb = 7000
Int cpu = 1
Int disk_size_gb = 375
String docker = "quay.io/broadinstitute/qiime2:conda"
}
command <<<
set -ex -o pipefail
qiime feature-classifier classify-sklearn \
--i-classifier ~{trained_classifier} \
--i-reads ~{representative_seqs_qza} \
--o-classification "~{basename}_tax.qza"

qiime feature-table tabulate-seqs \
--i-data ~{representative_seqs_qza} \
--o-visualization "~{basename}_rep_seqs.qzv"

qiime taxa barplot \
--i-table ~{representative_table_qza} \
--i-taxonomy "~{basename}_tax.qza" \
--o-visualization "~{basename}_bar_plots.qzv"
>>>
output {
File rep_seq_list = "~{basename}_rep_seqs.qzv"
File tax_classification_graph = "~{basename}_bar_plots.qzv"
}
runtime {
docker: docker
memory: "7 GB"
cpu: cpu
disk: disk_size_gb + " GB"
disks: "local-disk " + disk_size_gb + " HDD"
}
}
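Neither workflow in this commit calls train_classifier. A minimal wrapper sketch (hypothetical; the workflow name below is illustrative and not part of this PR) could look like:

version 1.0

import "../tasks/tasks_16S_amplicon.wdl" as qiime

workflow train_16S_classifier {
    input {
        File otu_ref
        File taxonomy_ref
        String forward_adapter
        String reverse_adapter
    }
    call qiime.train_classifier {
        input:
            otu_ref = otu_ref,
            taxonomy_ref = taxonomy_ref,
            forward_adapter = forward_adapter,
            reverse_adapter = reverse_adapter
    }
    output {
        File trained_classifier = train_classifier.trained_classifier
    }
}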
48 changes: 48 additions & 0 deletions pipes/WDL/workflows/amplicon16S_analysis.wdl
@@ -0,0 +1,48 @@
version 1.0

import "../tasks/tasks_16S_amplicon.wdl" as qiime

workflow amplicon16S_analysis {

meta {
description: "Running 16S amplicon (from BAM format) sequencing analysis with qiime."
author: "fnegrete"
email: "[email protected]"
allowNestedInputs: true
}
input {
File reads_bam
File trained_classifier
String sample_name
Boolean keep_untrimmed_reads
}

call qiime.qiime_import_from_bam {
input:
reads_bam = reads_bam,
sample_name = sample_name
}
#__________________________________________
call qiime.trim_reads {
input:
reads_qza = qiime_import_from_bam.reads_qza,
keep_untrimmed_reads = keep_untrimmed_reads
}
#__________________________________________
call qiime.join_paired_ends {
input:
trimmed_reads_qza = trim_reads.trimmed_reads_qza
}
#_________________________________________
call qiime.deblur {
input:
joined_end_reads_qza = join_paired_ends.joined_end_reads_qza
}
#_________________________________________
call qiime.tax_analysis {
input:
trained_classifier = trained_classifier,
representative_seqs_qza = deblur.representative_seqs_qza,
representative_table_qza = deblur.representative_table_qza
}
}
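An example inputs JSON for this workflow might look like the following sketch (reads_bam matches the test file used below; the classifier path is a placeholder, since no trained classifier ships with this commit):

{
  "amplicon16S_analysis.reads_bam": "test/input/G5012.3.subset.bam",
  "amplicon16S_analysis.trained_classifier": "/path/to/v1-2-classifier.qza",
  "amplicon16S_analysis.sample_name": "G5012.3",
  "amplicon16S_analysis.keep_untrimmed_reads": false
}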
23 changes: 23 additions & 0 deletions pipes/WDL/workflows/qiime_import_bam.wdl
@@ -0,0 +1,23 @@
version 1.0

import "../tasks/tasks_16S_amplicon.wdl" as infile

workflow qiime_import_bam {

meta {
description: "Import BAM files into QIIME 2."
author: "fnegrete"
email: "[email protected]"
allowNestedInputs: true
}
input {
File reads_bam
String sample_name
}

call infile.qiime_import_from_bam {
input:
reads_bam = reads_bam,
sample_name = sample_name
}
}
4 changes: 4 additions & 0 deletions test/input/WDL/test_inputs-qiime_import_bam-local.json
@@ -0,0 +1,4 @@
{
"qiime_import_bam.reads_bam": "test/input/G5012.3.subset.bam",
"qiime_import_bam.sample_name": "G5012.3.subset.bam"
}
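Assuming miniwdl is available locally, this test input can be exercised with something like:

miniwdl run pipes/WDL/workflows/qiime_import_bam.wdl \
    -i test/input/WDL/test_inputs-qiime_import_bam-local.json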
