From f46a2c7abadb6e14fb26be4e1a78279b001b4d83 Mon Sep 17 00:00:00 2001
From: "Zachary S.L. Foster"
Date: Thu, 18 Jan 2024 12:03:45 -0800
Subject: [PATCH] updated documentation

---
 README.md                           | 24 ++++++++++----
 assets/main_report_old/index.qmd    |  3 +-
 assets/main_report_old/packages.bib | 31 ++++++++++++++----
 workflows/pathogensurveillance.nf   | 50 ++++++++++++++---------------
 4 files changed, 70 insertions(+), 38 deletions(-)

diff --git a/README.md b/README.md
index d1369690..bf24fa5a 100644
--- a/README.md
+++ b/README.md
@@ -22,14 +22,18 @@
 **nf-core/pathogensurveillance** is a population genomic pipeline for pathogen diagnosis, variant detection, and biosurveillance.
-The pipeline accepts the paths to raw reads for one or more organisms and creates reports in the form of interactive HTML reports or PDF documents.
-Significant features include the ability to analyze unidentified eukaryotic and prokaryotic samples, creation of reports for multiple user-defined groupings of samples, automated discovery and downloading of reference assemblies from NCBI RefSeq, and rapid initial identification based on k-mer sketches followed by a more robust core genome phylogeny.
+The pipeline accepts a CSV file listing the paths to raw reads for one or more organisms and creates reports as interactive HTML or PDF documents.
+Significant features include the ability to analyze unidentified eukaryotic and prokaryotic samples, creation of reports for multiple user-defined groupings of samples, automated discovery and downloading of reference assemblies from NCBI RefSeq, and rapid initial identification based on k-mer sketches followed by a more robust core genome phylogeny and a SNP-based phylogeny.
 
-The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
+The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner.
+It uses Docker/Singularity containers, making installation trivial and results highly reproducible.
+The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process, which makes it much easier to maintain and update software dependencies.
+Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
 
-On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world data sets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/pathogensurveillance/results).
+On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure.
+This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world data sets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/pathogensurveillance/results).
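[Editor's note: the README change above highlights that input is given as a CSV samplesheet. For orientation, here is a minimal sketch of what such a samplesheet could look like; the column names (`sample`, `fastq_1`, `fastq_2`) and paths are illustrative assumptions, not the pipeline's documented schema. See the usage docs linked below for the authoritative format.]

```csv
sample,fastq_1,fastq_2
isolate1,data/isolate1_R1.fastq.gz,data/isolate1_R2.fastq.gz
isolate2,data/isolate2_R1.fastq.gz,data/isolate2_R2.fastq.gz
```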
 
 ## Pipeline summary
 
@@ -44,7 +48,7 @@ On release, automated continuous integration tests run the pipeline on a full-si
 3. Download the pipeline and test it on a minimal dataset with a single command:
 
    ```bash
-   nextflow run nf-core/pathogensurveillance -profile test,YOURPROFILE --outdir <OUTDIR>
+   nextflow run nf-core/pathogensurveillance -profile test,YOURPROFILE --outdir <OUTDIR> -resume
    ```
 
    Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string.
 
@@ -59,9 +63,17 @@ On release, automated continuous integration tests run the pipeline on a full-si
 
    ```bash
-   nextflow run nf-core/pathogensurveillance --input samplesheet.csv --outdir <OUTDIR> --genome GRCh37 -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
+   nextflow run nf-core/pathogensurveillance --input samplesheet.csv --outdir <OUTDIR> -profile <docker/singularity/podman/shifter/charliecloud/conda/institute> -resume
    ```
 
+You can also try running a small example dataset hosted with the source code using the following command (no need to download anything):
+
+```bash
+nextflow run nf-core/pathogensurveillance --input https://raw.githubusercontent.com/grunwaldlab/pathogensurveillance/master/test/data/metadata_small.csv --outdir test_out --download_bakta_db true -profile docker -resume
+```
+
 ## Documentation
 
 The nf-core/pathogensurveillance pipeline comes with documentation about the pipeline [usage](https://nf-co.re/pathogensurveillance/usage), [parameters](https://nf-co.re/pathogensurveillance/parameters) and [output](https://nf-co.re/pathogensurveillance/output).
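[Editor's note: as the parameter check in `workflows/pathogensurveillance.nf` further down enforces, every run needs either `--bakta_db` or `--download_bakta_db true`. A sketch of the two invocations follows; `/path/to/bakta_db` is a placeholder, not a real path.]

```bash
# Option 1: reuse an existing local Bakta database (path is a placeholder).
nextflow run nf-core/pathogensurveillance --input samplesheet.csv --outdir <OUTDIR> \
    --bakta_db /path/to/bakta_db -profile docker -resume

# Option 2: let the pipeline download the Bakta database itself.
nextflow run nf-core/pathogensurveillance --input samplesheet.csv --outdir <OUTDIR> \
    --download_bakta_db true -profile docker -resume
```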
diff --git a/assets/main_report_old/index.qmd b/assets/main_report_old/index.qmd
index 6f255f0d..e3631f62 100644
--- a/assets/main_report_old/index.qmd
+++ b/assets/main_report_old/index.qmd
@@ -7,7 +7,7 @@ execute:
 ---
 
 ```{r knitr_settings}
-knitr::opts_chunk$set(echo = FALSE, fig.width = 10, warning = FALSE)
+knitr::opts_chunk$set(echo = TRUE, fig.width = 10, warning = FALSE)
 ```
 
 ```{r load_libraries, warning=FALSE, message=FALSE}
@@ -607,6 +607,7 @@ Please revise as needed. I think we need a better approach to coming up with SNP
 **Martha: I cannot figure out why the Poppr samples/nodes legend has a redundant node. I have not seen this with other datasets, and used pretty much the same code.**
 
 ```{r poppr msn, height=10, eval = nrow(samp_data) > 2}
+filter_level = 5
 mat <- match(indNames(snp_aln.gi), samp_data$sample)
 samp_data <- samp_data[mat, ]
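[Editor's note: the `poppr msn` chunk above builds a minimum spanning network from the SNP alignment (`snp_aln.gi`). For readers unfamiliar with that workflow, here is a minimal, self-contained sketch of the general pattern using poppr's bundled example data; it is an illustration of the technique, not the report's actual code.]

```r
library(poppr)

data(partial_clone)                   # small example genind object shipped with poppr
dist_mat <- diss.dist(partial_clone)  # pairwise dissimilarity matrix from the genotypes
msn <- poppr.msn(partial_clone, dist_mat, showplot = FALSE)
plot_poppr_msn(partial_clone, msn)    # draw the minimum spanning network
```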
diff --git a/assets/main_report_old/packages.bib b/assets/main_report_old/packages.bib
index 4afe14ea..a805d267 100644
--- a/assets/main_report_old/packages.bib
+++ b/assets/main_report_old/packages.bib
@@ -32,11 +32,20 @@ @Manual{R-base
   url = {https://www.R-project.org/},
 }
 
+@Manual{R-bookdown,
+  title = {bookdown: Authoring Books and Technical Documents with R Markdown},
+  author = {Yihui Xie},
+  year = {2023},
+  note = {R package version 0.36,
+https://pkgs.rstudio.com/bookdown/},
+  url = {https://github.com/rstudio/bookdown},
+}
+
 @Manual{R-dplyr,
   title = {dplyr: A Grammar of Data Manipulation},
   author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller and Davis Vaughan},
   year = {2023},
-  note = {R package version 1.1.3},
+  note = {R package version 1.1.4},
   url = {https://dplyr.tidyverse.org},
 }
 
@@ -78,7 +87,7 @@ @Manual{R-ggtree
   title = {ggtree: an R package for visualization of tree and annotation data},
   author = {Guangchuang Yu and Tommy Tsan-Yuk Lam and Shuangbin Xu},
   year = {2023},
-  note = {R package version 3.10.0},
+  note = {R package version 3.8.2},
   url = {https://bioconductor.org/packages/ggtree},
   doi = {10.18129/B9.bioc.ggtree},
 }
 
@@ -174,10 +183,10 @@ @Manual{R-plotly
 @Manual{R-poppr,
   title = {poppr: Genetic Analysis of Populations with Mixed Reproduction},
   author = {Zhian N. Kamvar and Javier F. Tabima and Jonah C. Brooks and David Folarin},
-  year = {2023},
   note = {R package version 2.9.4,
 https://github.com/grunwaldlab/poppr/},
   url = {https://grunwaldlab.github.io/poppr/},
+  year = {2023},
 }
 
 @Manual{R-purrr,
@@ -208,8 +217,8 @@ @Manual{R-rmarkdown
 @Manual{R-stringr,
   title = {stringr: Simple, Consistent Wrappers for Common String Operations},
   author = {Hadley Wickham},
-  year = {2022},
-  note = {R package version 1.5.0,
+  year = {2023},
+  note = {R package version 1.5.1,
 https://github.com/tidyverse/stringr},
   url = {https://stringr.tidyverse.org},
 }
 
@@ -344,6 +353,16 @@ @Article{ape2019
   doi = {10.1093/bioinformatics/bty633},
 }
 
+@Book{bookdown2016,
+  title = {bookdown: Authoring Books and Technical Documents with {R} Markdown},
+  author = {Yihui Xie},
+  publisher = {Chapman and Hall/CRC},
+  address = {Boca Raton, Florida},
+  year = {2016},
+  isbn = {978-1138700109},
+  url = {https://bookdown.org/yihui/bookdown},
+}
+
 @Book{ggplot22016,
   author = {Hadley Wickham},
   title = {ggplot2: Elegant Graphics for Data Analysis},
@@ -365,7 +384,7 @@ @Book{ggtree2022a
 
 @Article{ggtree2022b,
   title = {Ggtree: A serialized data object for visualization of a phylogenetic tree and annotation data},
-  author = {Shuangbin Xu and Lin Li and Xiao Luo and Meijun Chen and Wenli Tang and Li Zhan and Zehan Dai and {Tommy T. Lam} and Yi Guan and Guangchuang Yu},
+  author = {Shuangbin Xu and Lin Li and Xiao Luo and Meijun Chen and Wenli Tang and Li Zhan and Zehan Dai and Tommy T. Lam and Yi Guan and Guangchuang Yu},
   year = {2022},
   journal = {iMeta},
   volume = {1},
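[Editor's note: these `packages.bib` entries follow the style emitted by `knitr::write_bib()`, so version bumps like dplyr 1.1.3 to 1.1.4 are likely the result of regenerating the file against the installed packages. A sketch of how such a regeneration might look; the package list and file name are illustrative, not taken from the repository.]

```r
# Regenerate citation entries for the report's R dependencies.
knitr::write_bib(c("dplyr", "stringr", "ggtree", "poppr", "bookdown"),
                 file = "packages.bib")
```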
diff --git a/workflows/pathogensurveillance.nf b/workflows/pathogensurveillance.nf
index d0b26881..374a039b 100644
--- a/workflows/pathogensurveillance.nf
+++ b/workflows/pathogensurveillance.nf
@@ -17,7 +17,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified.' }
 if (!params.bakta_db && !params.download_bakta_db ) {
-    exit 1, "No bakta database specified. Use either '--bakta_db' to point to a local bakta database or use '--bakta_db_download true' to download the Bakta database."
+    exit 1, "No bakta database specified. Use either '--bakta_db' to point to a local bakta database or use '--download_bakta_db true' to download the Bakta database."
 }
 
@@ -130,12 +130,12 @@ workflow PATHOGENSURVEILLANCE {
     messages = messages.mix(VARIANT_ANALYSIS.out.messages)
 
     // Assemble and annotate bacterial genomes
-    GENOME_ASSEMBLY (
-        ASSIGN_REFERENCES.out.sample_data
+    GENOME_ASSEMBLY (
+        ASSIGN_REFERENCES.out.sample_data
             .combine(COARSE_SAMPLE_TAXONOMY.out.kingdom, by: 0)
-            .combine(COARSE_SAMPLE_TAXONOMY.out.depth, by: 0)
-    )
-    ch_versions = ch_versions.mix(GENOME_ASSEMBLY.out.versions)
+            .combine(COARSE_SAMPLE_TAXONOMY.out.depth, by: 0)
+    )
+    ch_versions = ch_versions.mix(GENOME_ASSEMBLY.out.versions)
 
     // Create core gene phylogeny for bacterial samples
     ref_gffs = DOWNLOAD_REFERENCES.out.assem_samp_combos
@@ -146,10 +146,10 @@
         .combine(GENOME_ASSEMBLY.out.gff, by: 0)           // [val(meta), [file(fastq)], val(ref_meta), file(reference), val(group_meta), file(gff)]
         .combine(ref_gffs, by: 0)                          // [val(meta), [file(fastq)], val(ref_meta), file(reference), val(group_meta), file(gff), [file(ref_gff)] ]
         .combine(COARSE_SAMPLE_TAXONOMY.out.depth, by:0)   // [val(meta), [file(fastq)], val(ref_meta), file(reference), val(group_meta), file(gff), [file(ref_gff)], val(depth)]
-        .map { [it[0], it[5], it[4], it[6], it[7]] }       // [ val(meta), file(gff), val(group_meta), [file(ref_gff)], val(depth) ]
-    CORE_GENOME_PHYLOGENY (
-        gff_and_group,
-        INPUT_CHECK.out.csv
+        .map { [it[0], it[5], it[4], it[6], it[7]] }       // [ val(meta), file(gff), val(group_meta), [file(ref_gff)], val(depth) ]
+    CORE_GENOME_PHYLOGENY (
+        gff_and_group,
+        INPUT_CHECK.out.csv
     )
     ch_versions = ch_versions.mix(CORE_GENOME_PHYLOGENY.out.versions)
     messages = messages.mix(CORE_GENOME_PHYLOGENY.out.messages)
 
@@ -159,21 +159,21 @@
         .map { [it[1], it[0]] }                                // val(meta), val(ref_meta)
         .groupTuple()                                          // val(meta), [val(ref_meta)]
     busco_input = ASSIGN_REFERENCES.out.sample_data            // val(meta), [file(fastq)], val(ref_meta), file(reference), val(group_meta)
-        .combine(ref_metas, by: 0)                             // val(meta), [file(fastq)], val(ref_meta), file(reference), val(group_meta), [val(ref_meta)]
+        .combine(ref_metas, by: 0)                             // val(meta), [file(fastq)], val(ref_meta), file(reference), val(group_meta), [val(ref_meta)]
         .combine(COARSE_SAMPLE_TAXONOMY.out.kingdom, by: 0)    // val(meta), [file(fastq)], val(ref_meta), file(reference), val(group_meta), [val(ref_meta)], val(kingdom)
         .combine(COARSE_SAMPLE_TAXONOMY.out.depth, by:0)       // val(meta), [file(fastq)], val(ref_meta), file(reference), val(group_meta), [val(ref_meta)], val(kingdom), val(depth)
-        .map { it[0..1] + it[4..7] }                           // val(meta), [file(fastq)], val(group_meta), [val(ref_meta)], val(kingdom), val(depth)
+        .map { it[0..1] + it[4..7] }                           // val(meta), [file(fastq)], val(group_meta), [val(ref_meta)], val(kingdom), val(depth)
     BUSCO_PHYLOGENY (
         busco_input,
         DOWNLOAD_REFERENCES.out.sequence
     )
 
     // Save version info
-    CUSTOM_DUMPSOFTWAREVERSIONS (
+    CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )
-
+
     // MultiQC
     workflow_summary = WorkflowPathogensurveillance.paramsSummaryMultiqc(workflow, summary_params)
     ch_workflow_summary = Channel.value(workflow_summary)
@@ -195,12 +195,12 @@
     )
     multiqc_report = MULTIQC.out.report.toList()
     ch_versions = ch_versions.mix(MULTIQC.out.versions)
-
+
     // Save error/warning/message info
-    RECORD_MESSAGES (
+    RECORD_MESSAGES (
         messages.collect(sort:true, flat:false)
     )
-
+
     // Create main summary report
     report_samp_data = ASSIGN_REFERENCES.out.sample_data       // meta, fastq, ref_meta, reference, group_meta
         .combine(COARSE_SAMPLE_TAXONOMY.out.hits, by:0)        // meta, fastq, ref_meta, reference, group_meta, sendsketch
@@ -229,18 +229,18 @@
             it[7],
             it[8] == null ? [] : it[9]
         ] }  // group_meta, [ref_meta], [sendsketch], [quast], [vcf], [align], [tree], ani_matrix, core_phylo
-
-    MAIN_REPORT (
-        report_in,
-        INPUT_CHECK.out.csv,
-        DOWNLOAD_REFERENCES.out.stats,
-        MULTIQC.out.data,
-        MULTIQC.out.plots,
+
+    MAIN_REPORT (
+        report_in,
+        INPUT_CHECK.out.csv,
+        DOWNLOAD_REFERENCES.out.stats,
+        MULTIQC.out.data,
+        MULTIQC.out.plots,
         MULTIQC.out.report,
         CUSTOM_DUMPSOFTWAREVERSIONS.out.yml,
         RECORD_MESSAGES.out.tsv,
         Channel.fromPath("${projectDir}/assets/main_report", checkIfExists: true)
-    )
+    )
 }
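[Editor's note: much of the diff above reformats channel-wiring code built from Nextflow's `combine` and `map` operators. For readers new to that idiom, here is a minimal, runnable sketch of the pattern; the channel names, sample IDs, and file names are invented for illustration and are not pipeline code.]

```nextflow
// Join two channels on a shared sample ID, then reshape the tuples,
// mirroring the .combine(..., by: 0) / .map { ... } chains above.
workflow {
    ch_reads   = Channel.of( ['samp1', 'samp1.fastq.gz'], ['samp2', 'samp2.fastq.gz'] )
    ch_kingdom = Channel.of( ['samp1', 'Bacteria'],       ['samp2', 'Eukaryota'] )

    ch_reads
        .combine(ch_kingdom, by: 0)                    // match tuples whose first element (the ID) agrees: [id, fastq, kingdom]
        .map { id, fastq, kingdom -> [ id, kingdom ] } // keep only the fields needed downstream
        .view()                                        // prints e.g. [samp1, Bacteria]
}
```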