diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 65b078e4..ff6537e3 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -12,12 +12,12 @@ jobs: name: Run AWS full tests if: github.repository == 'nf-core/scrnaseq' runs-on: ubuntu-latest + strategy: + matrix: + aligner: ["alevin", "kallisto", "star", "cellranger", "universc"] steps: - name: Launch workflow via tower uses: seqeralabs/action-tower-launch@v2 - # TODO nf-core: You can customise AWS full pipeline tests as required - # Add full size test data (but still relatively small datasets for few samples) - # on the `test_full.config` test runs with only one set of parameters with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 060785d7..3807805b 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -9,6 +9,9 @@ jobs: name: Run AWS tests if: github.repository == 'nf-core/scrnaseq' runs-on: ubuntu-latest + strategy: + matrix: + aligner: ["alevin", "kallisto", "star", "cellranger", "universc"] steps: # Launch workflow using Tower CLI tool action - name: Launch workflow via tower @@ -21,10 +24,10 @@ jobs: workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/scrnaseq/work-${{ github.sha }} parameters: | { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/scrnaseq/results-test-${{ github.sha }}" + "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/scrnaseq/results-test-${{ github.sha }}/aligner_${{ matrix.aligner }}", + "aligner": "${{ matrix.aligner }}" } profiles: test - - uses: actions/upload-artifact@v3 with: name: Tower debug log file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index de0c4594..368bd21a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ env: NXF_ANSI_LOG: false concurrency: - group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true jobs: @@ -26,7 +26,21 @@ jobs: NXF_VER: - "23.04.0" - "latest-everything" + profile: + [ + "test,docker --aligner alevin", + "test,docker --aligner kallisto", + "test,docker --aligner star", + "test,docker --aligner cellranger -stub", + "test,docker --aligner universc -stub", + ] steps: + - name: Free some space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" - name: Check out pipeline code uses: actions/checkout@v4 @@ -36,8 +50,7 @@ jobs: version: "${{ matrix.NXF_VER }}" - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }} --outdir ./results diff --git a/.gitignore b/.gitignore index 5124c9ac..5d402163 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,6 @@ results/ testing/ testing* *.pyc +log/ +reports/ +testme.sh diff --git a/.nf-core.yml b/.nf-core.yml index 3805dc81..738ad918 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1 +1,5 @@ repository_type: pipeline +lint: + template_strings: False + files_unchanged: + - .github/ISSUE_TEMPLATE/bug_report.yml diff --git 
a/CHANGELOG.md b/CHANGELOG.md index 8839e0f7..c732afa7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,14 +3,102 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v2.5.0dev - [date] +## [Unreleased] -Initial release of nf-core/scrnaseq, created with the [nf-core](https://nf-co.re/) template. +- Better support for custom protocols ([#273](https://github.com/nf-core/scrnaseq/pull/273)). + - The universc protocol is now specified via the `--protocol` flag + - Any protocol specified is now passed to the respective aligner + - Added a section to the documentation -### `Added` +## v2.4.1 - 2023-09-28 -### `Fixed` +- Fix whitelist logic for dropseq ([#267](https://github.com/nf-core/scrnaseq/pull/267)) +- Fix false-positive filename check in cellranger module ([#261](https://github.com/nf-core/scrnaseq/pull/261)) +- Template update to v2.10 ([#269](https://github.com/nf-core/scrnaseq/pull/269)) -### `Dependencies` +## v2.4.0 - 2023-08-16 Lime Platinum Crab -### `Deprecated` +- Fix typo causing empty version information for mtx_conversion subworkflow ([#254](https://github.com/nf-core/scrnaseq/pull/254)) +- Add `singularity.registry = 'quay.io'` and bump NF version to 23.04.0 ([#237](https://github.com/nf-core/scrnaseq/pull/237)) +- Fixed issue with file collisions while using cellranger ([#232](https://github.com/nf-core/scrnaseq/pull/232)) +- Fix issue where multiqc inputs tried to access objects that did not exist ([#239](https://github.com/nf-core/scrnaseq/pull/239)) +- Removed `public_aws_ecr` profile ([#242](https://github.com/nf-core/scrnaseq/pull/242)) +- Include cellranger in MultiQC report ([#244](https://github.com/nf-core/scrnaseq/pull/244)) +- Nf-core template update to v2.9 ([#245](https://github.com/nf-core/scrnaseq/pull/245)) +- Update cellranger and fastqc module ([#246](https://github.com/nf-core/scrnaseq/pull/246)). + The [updated cellranger module](https://github.com/nf-core/modules/pull/3537) now automatically renames input FASTQ + files to match the expected naming conventions. + +## v2.3.2 - 2023-06-07 Sepia Samarium Salmon + +- Move containers for pipeline to quay.io ([#233](https://github.com/nf-core/scrnaseq/pull/233)) + +## v2.3.1 - 2023-06-02 Yellow Strontium Pinscher + +- Add `public_aws_ecr` config for using the AWS mirror of containers where possible ([#225](https://github.com/nf-core/scrnaseq/pull/225)) + +## v2.3.0 Steelblue Waspaloy Dachshund + +- Fix problem in the samplesheet check related to the number of columns ([#211](https://github.com/nf-core/scrnaseq/issues/211)) +- Fixed bug in starsolo output cardinality. + +## v2.2.0 + +- Added support to output 10x count files in text format. +- Add gene symbols to count matrices +- Added the UniverSC aligner, an open-source version of Cell Ranger with a wrapper supporting 40 technologies +- Update cellranger to v7.1.0 ([#205](https://github.com/nf-core/scrnaseq/pull/205)). + +### Fixes + +- Autocanceling previous CI runs when new changes are pushed.
+- Fixed [#193](https://github.com/nf-core/scrnaseq/issues/193) by updating the Seurat container directive +- Fixed [#177](https://github.com/nf-core/scrnaseq/issues/177) by adjusting the channels generation and usage when skipping fastqc +- Fixed [#173](https://github.com/nf-core/scrnaseq/issues/173) by adjusting parameter type and adding them to modules.config +- Fixed [#170](https://github.com/nf-core/scrnaseq/issues/170) by adding UniverSC subworkflow using new module +- Fixed [#196](https://github.com/nf-core/scrnaseq/issues/196) by adjusting runtime requirements for AlevinQC +- Fixed [#191](https://github.com/nf-core/scrnaseq/issues/191) by updating simpleAF containers to latest version + +## v2.1.0 - 2022-10-06 "Green Mercury Siberian Husky" + +- Alevin workflow updated to use Alevin-Fry via simpleaf - thanks to @rob-p for supporting this and @fmalmeida for implementing the support + +### Fixes + +- Fixed Kallistobustools workflow [#123](https://github.com/nf-core/scrnaseq/issues/123) by upgrading to nf-core/modules module +- Fixed matrix conversion error when running STAR with --soloFeatures GeneFull [#135](https://github.com/nf-core/scrnaseq/pull/135) +- Fixed seurat matrix conversion error when running with conda profile [#136](https://github.com/nf-core/scrnaseq/pull/136) +- Fixed Kallistobustools module [#116](https://github.com/nf-core/scrnaseq/issues/116) by updating the nf-core module and making sure conversion modules take into account the different outputs produced by kallisto standard and non-standard workflows. +- Updated pipeline template to [nf-core/tools 2.6](https://github.com/nf-core/tools/releases/tag/2.6) + +## v2.0.0 - 2022-06-17 "Gray Nickel Beagle" + +- Pipeline ported to dsl2 +- Template update with latest nf-core/tools v2.1 +- Added cellranger v7.0.0 subworkflow +- Added full size tests + +### Fixes + +- Make sure pipeline runs on multiple samples [#77](https://github.com/nf-core/scrnaseq/pull/77) +- Fix issue where STARsolo always uses 10XV2 chemistry [#60](https://github.com/nf-core/scrnaseq/issues/60) + +## v1.1.0 - 2021-03-24 "Olive Mercury Corgi" + +- Template update with latest nf-core/tools v1.13.2 +- Parameters JSON Schema added [#42](https://github.com/nf-core/scrnaseq/issues/42) +- [#25](https://github.com/nf-core/scrnaseq/issues/25) Fix small documentation error with wrong parameter for txp2gene + +### Fixes + +- [#20](https://github.com/nf-core/scrnaseq/issues/20) Fix Transcriptome Fasta argument not detected well +- [#21](https://github.com/nf-core/scrnaseq/issues/21) Fix `--kallisto_index` being ignored + +## v1.0.0 - 2019-11-28 "Tiny Aluminium Crab" + +Initial release of nf-core/scrnaseq, created with the [nf-core](http://nf-co.re/) template. +This includes the following workflow options: + +- Salmon Alevin + AlevinQC +- STARSolo +- Kallisto / BUStools diff --git a/CITATIONS.md b/CITATIONS.md index bd70fa4e..867bde34 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,6 +18,21 @@ > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +* [Alevin](https://doi.org/10.1186/s13059-019-1670-y) + + > Srivastava, A., Malik, L., Smith, T. et al. Alevin efficiently estimates accurate gene abundances from dscRNA-seq data. Genome Biol 20, 65 (2019).
+ +* [Salmon](https://www.nature.com/articles/nmeth.4197) + + > Patro, R., Duggal, G., Love, M. et al. Salmon provides fast and bias-aware quantification of transcript expression. Nat Methods 14, 417–419 (2017). + +* [Kallisto/Bustools](https://www.nature.com/articles/s41587-021-00870-2) + + > Melsted, P., Booeshaghi, A.S., Liu, L. et al. Modular, efficient and constant-memory single-cell RNA-seq preprocessing. Nat Biotechnol 39, 813–818 (2021). + +* [STARsolo](https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1) + + > Benjamin Kaminow, Dinar Yunusov, Alexander Dobin. STARsolo: accurate, fast and versatile mapping/quantification of single-cell and single-nucleus RNA-seq data. bioRxiv 2021.05.05.442755 (2021). + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index 4736c821..bf9dd2b8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # ![nf-core/scrnaseq](docs/images/nf-core-scrnaseq_logo_light.png#gh-light-mode-only) ![nf-core/scrnaseq](docs/images/nf-core-scrnaseq_logo_dark.png#gh-dark-mode-only) [![GitHub Actions CI Status](https://github.com/nf-core/scrnaseq/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/scrnaseq/actions?query=workflow%3A%22nf-core+CI%22) -[![GitHub Actions Linting Status](https://github.com/nf-core/scrnaseq/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/scrnaseq/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/scrnaseq/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) +[![GitHub Actions Linting Status](https://github.com/nf-core/scrnaseq/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/scrnaseq/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/scrnaseq/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3568187-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.3568187) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) @@ -13,50 +13,49 @@ ## Introduction -**nf-core/scrnaseq** is a bioinformatics pipeline that ... +**nf-core/scrnaseq** is a bioinformatics best-practice analysis pipeline for processing 10x Genomics single-cell RNA-seq data. - +This is a community effort in building a pipeline capable of supporting: - - +- Alevin-Fry + AlevinQC +- STARSolo +- Kallisto + BUStools +- Cellranger +- UniverSC -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +## Documentation + +The nf-core/scrnaseq pipeline comes with documentation about the pipeline [usage](https://nf-co.re/scrnaseq/usage), [parameters](https://nf-co.re/scrnaseq/parameters) and [output](https://nf-co.re/scrnaseq/output). + +![scrnaseq workflow](docs/images/scrnaseq_pipeline_v1.0_metro_clean.png) ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set up Nextflow.
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - - Now, you can run the pipeline using: - - ```bash nextflow run nf-core/scrnaseq \ -profile <docker/singularity/.../institute> \ --input samplesheet.csv \ + --genome_fasta GRCm38.p6.genome.chr19.fa \ + --gtf gencode.vM19.annotation.chr19.gtf \ + --protocol 10XV2 \ + --aligner <alevin/kallisto/star/cellranger/universc> \ + --outdir <OUTDIR> ``` @@ -66,6 +65,21 @@ nextflow run nf-core/scrnaseq \ For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/scrnaseq/usage) and the [parameter documentation](https://nf-co.re/scrnaseq/parameters). +## Decision Tree for users + +The nf-core/scrnaseq pipeline features several paths to analyze your single-cell data. Additional analysis types, e.g. multi-ome, are planned for future releases. To aid users, we have added a decision tree that helps to decide which type of analysis to run and how to choose appropriate parameters for it. + +```mermaid graph TD A[sc RNA] -->|alevin-fry| B(h5ad/seurat/mtx matrices) A[sc RNA] -->|CellRanger| B(h5ad/seurat/mtx matrices) A[sc RNA] -->|kbpython| B(h5ad/seurat/mtx matrices) A[sc RNA] -->|STARsolo| B(h5ad/seurat/mtx matrices) A[sc RNA] -->|Universc| B(h5ad/seurat/mtx matrices) +``` + +Options for the respective alignment methods can be found [here](https://github.com/nf-core/scrnaseq/blob/dev/docs/usage.md#aligning-options) to help choose between them. + ## Pipeline output To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/scrnaseq/results) tab on the nf-core website pipeline page. @@ -74,11 +88,14 @@ For more details about the output files and reports, please refer to the ## Credits -nf-core/scrnaseq was originally written by Bailey PJ, Botvinnik O, Marques de Almeida F, Peltzer A, Sturm G. +nf-core/scrnaseq was originally written by Bailey PJ, Botvinnik O, Marques de Almeida F, Gabernet G, Peltzer A, Sturm G. We thank the following people for their extensive assistance in the development of this pipeline: - +- @heylf +- @KevinMenden +- @FloWuenne +- @rob-p ## Contributions and Support @@ -88,17 +105,10 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations - - +If you use nf-core/scrnaseq for your analysis, please cite it using the following DOI: [10.5281/zenodo.3568187](https://doi.org/10.5281/zenodo.3568187) - +The basic benchmarks that were used as motivation for incorporating the three available modular workflows can be found in [this publication](https://www.biorxiv.org/content/10.1101/673285v2). -An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. - -You can cite the `nf-core` publication as follows: +We offer several paths for the processing of scRNAseq data, so it remains up to the user to decide which workflow to choose for a particular analysis question. -> **The nf-core framework for community-curated bioinformatics pipelines.** -> -> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. -> -> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). +An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.
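All routes in the decision tree above converge on the same h5ad/Seurat/mtx matrices, written by the `mtx_conversions` step configured in `conf/modules.config` further down this diff. A minimal Python sketch of picking the analysis up from there, assuming scanpy is installed, that the run used `--aligner star`, and that a combined `.h5ad` exists under `<outdir>/star/mtx_conversions/` (the exact filename is an assumption and may vary between pipeline versions):

```python
# Hypothetical downstream inspection of the pipeline's combined AnnData output.
# The path below is an assumption; adjust it to your --outdir and --aligner.
import scanpy as sc

adata = sc.read_h5ad("results/star/mtx_conversions/combined_matrix.h5ad")
print(adata)  # n_obs (cells) x n_vars (genes) summary

# concat_h5ad.py (added in this diff) labels each cell with its originating
# sample, so per-sample cell counts are immediately available:
print(adata.obs["sample"].value_counts())
```

The per-sample `.rds` objects written by `mtx_to_seurat.R` serve the same role on the R side; all five aligner routes end in interchangeable matrix formats.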
diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index c79ee0c1..107f3120 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,7 +3,6 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/scrnaseq Methods Description" section_href: "https://github.com/nf-core/scrnaseq" plot_type: "html" -## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |

<h4>Methods</h4>

diff --git a/assets/protocols.json b/assets/protocols.json new file mode 100644 index 00000000..23ff1328 --- /dev/null +++ b/assets/protocols.json @@ -0,0 +1,90 @@ +{ + "alevin": { + "10XV1": { + "protocol": "10xv1", + "whitelist": "assets/whitelist/10x_V1_barcode_whitelist.txt.gz" + }, + "10XV2": { + "protocol": "10xv2", + "whitelist": "assets/whitelist/10x_V2_barcode_whitelist.txt.gz" + }, + "10XV3": { + "protocol": "10xv3", + "whitelist": "assets/whitelist/10x_V3_barcode_whitelist.txt.gz" + }, + "dropseq": { + "protocol": "dropseq" + } + }, + "cellranger": { + "auto": { + "protocol": "auto" + }, + "10XV1": { + "protocol": "SC3Pv1" + }, + "10XV2": { + "protocol": "SC3Pv2" + }, + "10XV3": { + "protocol": "SC3Pv3" + } + }, + "star": { + "10XV1": { + "protocol": "CB_UMI_Simple", + "extra_args": "--soloUMIlen 10", + "whitelist": "assets/whitelist/10x_V1_barcode_whitelist.txt.gz" + }, + "10XV2": { + "protocol": "CB_UMI_Simple", + "extra_args": "--soloUMIlen 10", + "whitelist": "assets/whitelist/10x_V2_barcode_whitelist.txt.gz" + }, + "10XV3": { + "protocol": "CB_UMI_Simple", + "extra_args": "--soloUMIlen 12", + "whitelist": "assets/whitelist/10x_V3_barcode_whitelist.txt.gz" + }, + "dropseq": { + "protocol": "CB_UMI_Simple" + }, + "smartseq": { + "protocol": "SmartSeq" + } + }, + "kallisto": { + "10XV1": { + "protocol": "10XV1" + }, + "10XV2": { + "protocol": "10XV2" + }, + "10XV3": { + "protocol": "10XV3" + }, + "dropseq": { + "protocol": "DROPSEQ" + }, + "smartseq": { + "protocol": "SMARTSEQ" + } + }, + "universc": { + "auto": { + "protocol": "10x" + }, + "10XV1": { + "protocol": "10x-v1" + }, + "10XV2": { + "protocol": "10x-v2" + }, + "10XV3": { + "protocol": "10x-v3" + }, + "dropseq": { + "protocol": "dropseq" + } + } +} diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab7..d4623eea 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,4 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +sample,fastq_1,fastq_2,protocol,expected_cells +Sample_X,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz,"10xV2","5000" +Sample_Y,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R1_001.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R2_001.fastq.gz,"10xV2","5000" +Sample_Y,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R1_001.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R2_001.fastq.gz,"10xV2","5000" diff --git a/assets/whitelist/10x_V1_barcode_whitelist.txt.gz b/assets/whitelist/10x_V1_barcode_whitelist.txt.gz new file mode 100644 index 00000000..bf838d62 Binary files /dev/null and b/assets/whitelist/10x_V1_barcode_whitelist.txt.gz differ diff --git a/assets/whitelist/10x_V2_barcode_whitelist.txt.gz b/assets/whitelist/10x_V2_barcode_whitelist.txt.gz new file mode 100644 index 00000000..e9203f21 Binary files /dev/null and b/assets/whitelist/10x_V2_barcode_whitelist.txt.gz differ diff --git a/assets/whitelist/10x_V3_barcode_whitelist.txt.gz 
b/assets/whitelist/10x_V3_barcode_whitelist.txt.gz new file mode 100644 index 00000000..836b665d Binary files /dev/null and b/assets/whitelist/10x_V3_barcode_whitelist.txt.gz differ diff --git a/bin/alevin_qc.r b/bin/alevin_qc.r new file mode 100755 index 00000000..ec1362f9 --- /dev/null +++ b/bin/alevin_qc.r @@ -0,0 +1,20 @@ +#!/usr/bin/env Rscript + +# Command line argument processing +args <- commandArgs(trailingOnly=TRUE) + +if (length(args) < 3) { + stop("Usage: alevin_qc.r <baseDir> <sampleId> <outDir>", call.=FALSE) +} + +require(alevinQC) +require(tximport) + +baseDir <- args[1] +sampleId <- args[2] +outDir <- args[3] + +alevinQCReport(baseDir = baseDir, sampleId = sampleId, + outputFile = "alevinReport.html", + outputFormat = "html_document", + outputDir = outDir, forceOverwrite = TRUE) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 4a758fe0..47d1b446 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv """Provide a command line tool to validate and transform tabular samplesheets.""" @@ -14,122 +15,6 @@ logger = logging.getLogger() -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) - - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly.
- row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. - - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" - - def read_head(handle, num_lines=10): """Read the specified number of lines from the current position in the file.""" lines = [] @@ -140,6 +25,14 @@ def read_head(handle, num_lines=10): return "".join(lines) +def print_error(error, context="Line", context_str=""): + error_str = f"ERROR: Please check samplesheet -> {error}" + if context != "" and context_str != "": + error_str = f"ERROR: Please check samplesheet -> {error}\n{context.strip()}: '{context_str.strip()}'" + print(error_str) + sys.exit(1) + + def sniff_format(handle): """ Detect the tabular format. @@ -188,32 +81,118 @@ def check_samplesheet(file_in, file_out): https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. 
- if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + + sample_mapping_dict = {} + with open(file_in, "r") as fin: + ## Check header + MIN_COLS = 2 + MIN_HEADER = ["sample", "fastq_1", "fastq_2"] + OPT_HEADER = ["expected_cells", "seq_center"] + header = [x.strip('"') for x in fin.readline().strip().split(",")] + + unknown_header = 0 + min_header_count = 0 + colmap = {"sample": 0, "fastq_1": 1, "fastq_2": 2} + i = 0 + for h in header: + if h not in MIN_HEADER and h not in OPT_HEADER: + unknown_header = 1 + if h in MIN_HEADER: + min_header_count = min_header_count + 1 + colmap[h] = i + i = i + 1 + if min_header_count < len(MIN_HEADER): + # The original check also tested unknown_header, but given the if/else logic only min_header_count needs to be validated + given = ",".join(header) + wanted = ",".join(MIN_HEADER) + print(f"ERROR: Please check samplesheet header -> {given} != {wanted}") sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) + + ## Check sample entries + for line in fin: + lspl = [x.strip().strip('"') for x in line.strip().split(",")] + + # Check valid number of columns per row + if len(lspl) < len(header): + print_error( + "Invalid number of columns (minimum = {})!".format(len(header)), + "Line", + line, + ) + num_cols = len([x for x in lspl if x]) + if num_cols < MIN_COLS: + print_error( + "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), + "Line", + line, + ) + + ## Check sample name entries + sample, fastq_1, fastq_2 = lspl[: len(MIN_HEADER)] + sample = sample.replace(" ", "_") + if not sample: + print_error("Sample entry has not been specified!", "Line", line) + + ## Check expected cells is an integer if present + expected_cells = "" + if "expected_cells" in header: + expected_cells = lspl[colmap["expected_cells"]] + if not is_integer(expected_cells): + print_error("Expected cells must be an integer", "Line", line) + + ## If present, replace spaces with _ in sequencing center name + seq_center = "" + if "seq_center" in header: + seq_center = lspl[colmap["seq_center"]] + seq_center = seq_center.replace(" ", "_") + + ## Check FastQ file extension + for fastq in [fastq_1, fastq_2]: + if fastq: + if fastq.find(" ") != -1: + print_error("FastQ file contains spaces!", "Line", line) + if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"): + print_error( + "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", + "Line", + line, + ) + + ## Auto-detect paired-end/single-end + sample_info = [] ## [single_end, fastq_1, fastq_2] + if sample and fastq_1 and fastq_2: ## Paired-end short reads + sample_info = ["0", fastq_1, fastq_2, expected_cells, seq_center] + elif sample and fastq_1 and not fastq_2: ## Single-end short reads + sample_info = ["1", fastq_1, fastq_2, expected_cells, seq_center] + else: + print_error("Invalid
combination of columns provided!", "Line", line) + + ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2, expected_cells, seq_center ] } + if sample not in sample_mapping_dict: + sample_mapping_dict[sample] = [sample_info] + else: + # duplicate rows are tolerated on purpose; the strict check is disabled: + # print_error("Samplesheet contains duplicate rows!", "Line", line) + sample_mapping_dict[sample].append(sample_info) + + ## Write validated samplesheet with appropriate columns + if len(sample_mapping_dict) > 0: + with open(file_out, "w") as fout: + fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2", "expected_cells", "seq_center"]) + "\n") + for sample in sorted(sample_mapping_dict.keys()): + ## Check that multiple runs of the same sample are of the same datatype + if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]): + print_error( + "Multiple runs of a sample must be of the same datatype!", + "Sample: {}".format(sample), + ) + + for idx, val in enumerate(sample_mapping_dict[sample]): + fout.write(",".join(["{}".format(sample)] + val) + "\n") + else: + print_error("No entries to process!", "Samplesheet: {}".format(file_in)) def parse_args(argv=None): @@ -244,6 +223,15 @@ return parser.parse_args(argv) +def is_integer(n): + try: + float(n) + except ValueError: + return False + else: + return float(n).is_integer() + + def main(argv=None): """Coordinate argument parsing and program execution.""" args = parse_args(argv) diff --git a/bin/concat_h5ad.py b/bin/concat_h5ad.py new file mode 100755 index 00000000..e38ca80e --- /dev/null +++ b/bin/concat_h5ad.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +import scanpy as sc, anndata as ad, pandas as pd +from pathlib import Path +import argparse + + +def read_samplesheet(samplesheet): + df = pd.read_csv(samplesheet) + df = df.set_index("sample") + + # samplesheet may contain replicates, when it has, + # group information from replicates and collapse with commas + # only keep unique values using set() + df = df.groupby(["sample"]).agg(lambda column: ",".join(set(column))) + + return df + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Concatenates h5ad files and merges metadata from samplesheet") + + parser.add_argument("-i", "--input", dest="input", help="Path to samplesheet.csv") + parser.add_argument("-o", "--out", dest="out", help="Output path.") + parser.add_argument( + "-s", + "--suffix", + dest="suffix", + help="Suffix of matrices to remove and get sample name", + ) + + args = vars(parser.parse_args()) + + # Open samplesheet as dataframe + df_samplesheet = read_samplesheet(args["input"]) + + # find all h5ad and append to dict + dict_of_h5ad = {str(path).replace(args["suffix"], ""): sc.read_h5ad(path) for path in Path(".").rglob("*.h5ad")} + + # concat h5ad files + adata = ad.concat(dict_of_h5ad, label="sample", merge="unique", index_unique="_") + + # merge with data.frame, on sample information + adata.obs = adata.obs.join(df_samplesheet, on="sample") + adata.write_h5ad(args["out"], compression="gzip") + + print("Wrote h5ad file to {}".format(args["out"])) diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py new file mode 100755 index 00000000..485f8392 --- /dev/null +++ b/bin/filter_gtf_for_genes_in_genome.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# Script originally written by Pranathi Vemuri (github.com/pranathivemuri) +# modified by Harshil Patel
(github.com/drpatelh) + +from __future__ import print_function +import logging +from itertools import groupby +import argparse + +# Create a logger +logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") +logger = logging.getLogger(__file__) +logger.setLevel(logging.INFO) + + +def is_header(line): + return line[0] == ">" + + +def extract_fasta_seq_names(fasta_name): + """ + modified from Brent Pedersen + Correct Way To Parse A Fasta File In Python + given a fasta file. yield tuples of header, sequence + from https://www.biostars.org/p/710/ + """ + # first open the file outside + fh = open(fasta_name) + + # ditch the boolean (x[0]) and just keep the header or sequence since + # we know they alternate. + faiter = (x[1] for x in groupby(fh, is_header)) + + for i, header in enumerate(faiter): + line = next(header) + if is_header(line): + # drop the ">" + headerStr = line[1:].strip().split()[0] + yield headerStr + + +def extract_genes_in_genome(fasta, gtf_in, gtf_out): + seq_names_in_genome = set(extract_fasta_seq_names(fasta)) + logger.info("Extracted chromosome sequence names from: %s" % fasta) + logger.info("All chromosome names: " + ", ".join(sorted(x for x in seq_names_in_genome))) + seq_names_in_gtf = set([]) + + n_total_lines = 0 + n_lines_in_genome = 0 + with open(gtf_out, "w") as f: + with open(gtf_in) as g: + for line in g.readlines(): + n_total_lines += 1 + seq_name_gtf = line.split("\t")[0] + seq_names_in_gtf.add(seq_name_gtf) + if seq_name_gtf in seq_names_in_genome: + n_lines_in_genome += 1 + f.write(line) + logger.info( + "Extracted %d / %d lines from %s matching sequences in %s" % (n_lines_in_genome, n_total_lines, gtf_in, fasta) + ) + logger.info("All sequence IDs from GTF: " + ", ".join(sorted(x for x in seq_names_in_gtf))) + + logger.info("Wrote matching lines to %s" % gtf_out) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="""Filter GTF only for features in the genome""") + parser.add_argument("--gtf", type=str, help="GTF file") + parser.add_argument("--fasta", type=str, help="Genome fasta file") + parser.add_argument( + "-o", + "--output", + dest="output", + default="genes_in_genome.gtf", + type=str, + help="GTF features on fasta genome sequences", + ) + + args = parser.parse_args() + extract_genes_in_genome(args.fasta, args.gtf, args.output) diff --git a/bin/mtx_to_h5ad.py b/bin/mtx_to_h5ad.py new file mode 100755 index 00000000..40e1e025 --- /dev/null +++ b/bin/mtx_to_h5ad.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +import scanpy as sc +import pandas as pd +import argparse +import os +from scipy import io +from anndata import AnnData + + +def _10x_h5_to_adata(mtx_h5: str, sample: str): + adata = sc.read_10x_h5(mtx_h5) + adata.var["gene_symbols"] = adata.var_names + adata.var.set_index("gene_ids", inplace=True) + adata.obs["sample"] = sample + + # reorder columns for 10x mtx files + adata.var = adata.var[["gene_symbols", "feature_types", "genome"]] + + return adata + + +def _mtx_to_adata( + mtx_file: str, + barcode_file: str, + feature_file: str, + sample: str, + aligner: str, +): + adata = sc.read_mtx(mtx_file) + if ( + aligner == "star" + ): # for some reason star matrix comes transposed and doesn't fit when values are appended directly + adata = adata.transpose() + + adata.obs_names = pd.read_csv(barcode_file, header=None, sep="\t")[0].values + adata.var_names = pd.read_csv(feature_file, header=None, sep="\t")[0].values + adata.obs["sample"] = sample + + return adata + + +def input_to_adata( + input_data: str,
barcode_file: str, + feature_file: str, + sample: str, + aligner: str, + txp2gene: str, + star_index: str, + verbose: bool = True, +): + if verbose and (txp2gene or star_index): + print("Reading in {}".format(input_data)) + + if aligner == "cellranger": + adata = _10x_h5_to_adata(input_data, sample) + else: + adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample, aligner) + + if verbose and (txp2gene or star_index): + print("Reading in {}".format(txp2gene)) + + if txp2gene: + t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2]) + elif star_index: + t2g = pd.read_table( + f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1] + ) + + if txp2gene or star_index: + t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id") + adata.var["gene_symbol"] = t2g["gene_symbol"] + + return adata + + +def write_counts( + adata: AnnData, + out: str, + verbose: bool = False, +): + pd.DataFrame(adata.obs.index).to_csv(os.path.join(out, "barcodes.tsv"), sep="\t", index=False, header=None) + pd.DataFrame(adata.var).to_csv(os.path.join(out, "features.tsv"), sep="\t", index=True, header=None) + io.mmwrite(os.path.join(out, "matrix.mtx"), adata.X.T, field="integer") + + if verbose: + print("Wrote features.tsv, barcodes.tsv, and matrix.mtx files to {}".format(out)) + + +def dump_versions(task_process): + import pkg_resources + + with open("versions.yml", "w") as f: + f.write(f"{task_process}:\n\t") + f.write("\n\t".join([f"{pkg.key}: {pkg.version}" for pkg in pkg_resources.working_set])) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Converts mtx output to h5ad.") + + parser.add_argument("-i", "--input_data", dest="input_data", help="Path to either mtx or mtx h5 file.") + parser.add_argument("-v", "--verbose", dest="verbose", help="Toggle verbose messages", default=False) + parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.", nargs="?", const="") + parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.", nargs="?", const="") + parser.add_argument("-s", "--sample", dest="sample", help="Sample name") + parser.add_argument("-o", "--out", dest="out", help="Output path.") + parser.add_argument("-a", "--aligner", dest="aligner", help="Which aligner has been used?") + parser.add_argument("--task_process", dest="task_process", help="Task process name.") + parser.add_argument("--txp2gene", dest="txp2gene", help="Transcript to gene (t2g) file.", nargs="?", const="") + parser.add_argument( + "--star_index", dest="star_index", help="Star index folder containing geneInfo.tab.", nargs="?", const="" + ) + + args = vars(parser.parse_args()) + + # create the output directory + os.makedirs(os.path.dirname(args["out"]), exist_ok=True) + + adata = input_to_adata( + input_data=args["input_data"], + barcode_file=args["barcode"], + feature_file=args["feature"], + sample=args["sample"], + aligner=args["aligner"], + txp2gene=args["txp2gene"], + star_index=args["star_index"], + verbose=args["verbose"], + ) + + write_counts(adata=adata, out=args["sample"], verbose=args["verbose"]) + + adata.write_h5ad(args["out"], compression="gzip") + + print("Wrote h5ad file to {}".format(args["out"])) + + dump_versions(task_process=args["task_process"]) diff --git a/bin/mtx_to_seurat.R b/bin/mtx_to_seurat.R new file mode 100755 index 00000000..f2680838 --- /dev/null +++ b/bin/mtx_to_seurat.R @@ -0,0 +1,37 @@ +#!/usr/bin/env Rscript
+library(Seurat) + +args <- commandArgs(trailingOnly=TRUE) + +mtx_file <- args[1] +barcode_file <- args[2] +feature_file <- args[3] +out.file <- args[4] +aligner <- args[5] + +if(aligner %in% c("kallisto", "alevin")) { + # for kallisto and alevin, the features file contains only one column and matrix needs to be transposed + expression.matrix <- ReadMtx( + mtx = mtx_file, features = feature_file, cells = barcode_file, feature.column = 1, mtx.transpose = TRUE + ) +} else { + expression.matrix <- ReadMtx( + mtx = mtx_file, features = feature_file, cells = barcode_file + ) +} + +seurat.object <- CreateSeuratObject(counts = expression.matrix) + +dir.create(basename(dirname(out.file)), showWarnings = FALSE) + +saveRDS(seurat.object, file = out.file) + + +yaml::write_yaml( +list( + 'MTX_TO_SEURAT'=list( + 'Seurat' = paste(packageVersion('Seurat'), collapse='.') + ) +), +"versions.yml" +) diff --git a/bin/postprocessing.r b/bin/postprocessing.r new file mode 100755 index 00000000..556f82e2 --- /dev/null +++ b/bin/postprocessing.r @@ -0,0 +1,51 @@ +#!/usr/bin/env Rscript + +args = commandArgs(trailingOnly=TRUE) +if (length(args) < 1) { + stop("Usage: postprocessing.r <base path>", call.=FALSE) +} + +base.path <- args[1] + + +# Parts of the function are taken from Seurat's Read10x parsing function +ReadAlevin <- function( base.path = NULL ){ + if (! dir.exists(base.path )){ + stop("Directory provided does not exist") + } + + barcode.loc <- paste0( base.path, "alevin/quants_mat_rows.txt" ) + gene.loc <- paste0( base.path, "alevin/quants_mat_cols.txt" ) + matrix.loc <- paste0( base.path, "alevin/quants_mat.csv" ) + if (!file.exists( barcode.loc )){ + stop("Barcode file missing") + } + if (! file.exists(gene.loc) ){ + stop("Gene name file missing") + } + if (! file.exists(matrix.loc )){ + stop("Expression matrix file missing") + } + matrix <- as.matrix(read.csv( matrix.loc, header=FALSE)) + matrix <- t(matrix[, 1:(ncol(matrix) - 1)]) + + cell.names <- readLines( barcode.loc ) + gene.names <- readLines( gene.loc ) + + colnames(matrix) <- cell.names + rownames(matrix) <- gene.names + matrix[is.na(matrix)] <- 0 + return(matrix) +} + +require("Seurat") + +alv.data <- ReadAlevin(base.path) +dat <- CreateSeuratObject(raw.data = alv.data, min.cells = 3, min.genes = 200, project = "10X_rnaseq") +dat <- NormalizeData(object = dat, normalization.method = "LogNormalize", scale.factor = 10000) +dat <- FindVariableGenes(object = dat, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5) +dat <- ScaleData(object = dat) +dat <- RunPCA(object = dat, pc.genes = dat@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5) +dat <- FindClusters(object = dat, reduction.type = "pca", dims.use = 1:10, resolution = 0.6, print.output = 0, save.SNN = TRUE) +dat <- RunTSNE(object = dat, dims.use = 1:10, do.fast = TRUE) +TSNEPlot(object = dat) diff --git a/bin/t2g.py b/bin/t2g.py new file mode 100755 index 00000000..2e1953c4 --- /dev/null +++ b/bin/t2g.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# This was downloaded on 2019-06-23 from https://github.com/bustools/getting_started/releases/ +# All credit goes to the original authors from the Kallisto/BUStools team! +# BSD 2-Clause License +# +# Copyright (c) 2017, Nicolas Bray, Harold Pimentel, Páll Melsted and Lior Pachter +# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys, argparse + + +def create_transcript_list(input, use_name=True, use_version=False): + r = {} + for line in input: + if len(line) == 0 or line[0] == "#": + continue + l = line.strip().split("\t") + if l[2] == "transcript": + info = l[8] + d = {} + for x in info.split("; "): + x = x.strip() + p = x.find(" ") + if p == -1: + continue + k = x[:p] + p = x.find('"', p) + p2 = x.find('"', p + 1) + v = x[p + 1 : p2] + d[k] = v + + if "transcript_id" not in d or "gene_id" not in d: + continue + + tid = d["transcript_id"].split(".")[0] + gid = d["gene_id"].split(".")[0] + if use_version: + if "transcript_version" not in d or "gene_version" not in d: + continue + + tid += "." + d["transcript_version"] + gid += "." + d["gene_version"] + gname = None + if use_name: + if "gene_name" not in d: + continue + gname = d["gene_name"] + + if tid in r: + continue + + r[tid] = (gid, gname) + return r + + +def print_output(output, r, use_name=True): + for tid in r: + if use_name: + output.write("%s\t%s\t%s\n" % (tid, r[tid][0], r[tid][1])) + else: + output.write("%s\t%s\n" % (tid, r[tid][0])) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + add_help=True, + description="Creates transcript to gene info from GTF files\nreads from standard input and writes to standard output", + ) + parser.add_argument( + "--use_version", + "-v", + action="store_true", + help="Use version numbers in transcript and gene ids", + ) + parser.add_argument("--skip_gene_names", "-s", action="store_true", help="Do not output gene names") + args = parser.parse_args() + + input = sys.stdin + r = create_transcript_list(input, use_name=not args.skip_gene_names, use_version=args.use_version) + output = sys.stdout + print_output(output, r) diff --git a/conf/base.config b/conf/base.config index d6df6f7e..8c6f6db9 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,7 +10,6 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } @@ -24,7 +23,6 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. 
// If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { check_max( 1 , 'cpus' ) } @@ -62,4 +60,8 @@ process { withName:CUSTOM_DUMPSOFTWAREVERSIONS { cache = false } + //Fix for issue 196 + withName: 'NFCORE_SCRNASEQ:SCRNASEQ:SCRNASEQ_ALEVIN:ALEVINQC' { + time = '20.h' + } } diff --git a/conf/modules.config b/conf/modules.config index d91c6aba..f4cea8ad 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -11,7 +11,6 @@ */ process { - publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, @@ -25,11 +24,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - - withName: FASTQC { - ext.args = '--quiet' - } - withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -47,4 +41,137 @@ process { ] } + withName: 'MTX_TO_H5AD|CONCAT_H5AD|MTX_TO_SEURAT' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/mtx_conversions" }, + mode: params.publish_dir_mode + ] + } + withName: 'GTF_GENE_FILTER' { + publishDir = [ + path: { "${params.outdir}/gtf_filter" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: false + ] + } +} + +if(params.aligner == "cellranger") { + process { + withName: CELLRANGER_MKGTF { + publishDir = [ + path: "${params.outdir}/${params.aligner}/mkgtf", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene" + } + withName: CELLRANGER_MKREF { + publishDir = [ + path: "${params.outdir}/${params.aligner}/mkref", + mode: params.publish_dir_mode + ] + } + withName: CELLRANGER_COUNT { + publishDir = [ + path: "${params.outdir}/${params.aligner}/count", + mode: params.publish_dir_mode + ] + ext.args = {"--chemistry ${meta.chemistry} " + (meta.expected_cells ? "--expect-cells ${meta.expected_cells}" : '')} + } + } +} + +if(params.aligner == "universc") { + process { + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: CELLRANGER_MKGTF { + publishDir = [ + path: "${params.outdir}/cellranger/mkgtf", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + ext.args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene" + container = "nf-core/universc:1.2.5.1" + } + withName: CELLRANGER_MKREF { + publishDir = [ + path: "${params.outdir}/cellranger/mkref", + mode: params.publish_dir_mode + ] + container = "nf-core/universc:1.2.5.1" + } + withName: UNIVERSC { + publishDir = [ + path: "${params.outdir}/universc", + mode: params.publish_dir_mode + ] + } + } +} + +if (params.aligner == "alevin") { + process { + withName: GFFREAD_TXP2GENE { + ext.args = "--table transcript_id,gene_id" + ext.prefix = { "${gff.baseName}_gffread" } + } + withName: 'SIMPLEAF_INDEX' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + enabled: params.save_reference + ] + ext.args = { "--rlen ${params.simpleaf_rlen}" } + } + withName: 'SIMPLEAF_QUANT' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode + ] + ext.args = "-r cr-like" + } + } +} + +if (params.aligner == "star") { + process { + withName: STAR_ALIGN { + ext.args = "--readFilesCommand zcat --runDirPerm All_RWX --outWigType bedGraph --twopassMode Basic --outSAMtype BAM SortedByCoordinate" + } + withName: STAR_GENOMEGENERATE { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/genome_generate" }, + mode: params.publish_dir_mode, + enabled: params.save_reference + ] + } + withName: STAR_ALIGN { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/${meta.id}" }, + mode: params.publish_dir_mode + ] + } + } +} + +if (params.aligner == 'kallisto') { + process { + withName: KALLISTOBUSTOOLS_REF { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + enabled: params.save_reference + ] + } + withName: KALLISTOBUSTOOLS_COUNT { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode + ] + ext.args = "--workflow ${params.kb_workflow}" + } + } } diff --git a/conf/test.config b/conf/test.config index 05a1d648..45ee54c8 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,10 +20,19 @@ params { max_time = '6.h' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/samplesheet-2-0.csv' // Genome references - genome = 'R64-1-1' + fasta = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/GRCm38.p6.genome.chr19.fa' + gtf = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/gencode.vM19.annotation.chr19.gtf' + aligner = 'star' + protocol = '10XV2' + + validationSchemaIgnoreParams = 'genomes' +} + +process { + withName: '.*:CELLRANGER_COUNT' { + maxForks = 1 + } } diff --git a/conf/test_full.config b/conf/test_full.config index e50f487c..dd838978 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -2,23 +2,25 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Nextflow config file for running full-size tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a full size pipeline test. 
+ Defines input files and everything required to run a full-size pipeline test. Use as follows: nextflow run nf-core/scrnaseq -profile test_full,<docker/singularity> --outdir <OUTDIR> ---------------------------------------------------------------------------------------- */ params { - config_profile_name = 'Full test profile' + config_profile_name = 'Full Test profile' config_profile_description = 'Full test dataset to check pipeline function' - // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/samplesheet_2.0_full.csv' // Genome references - genome = 'R64-1-1' + genome = 'GRCh38' + aligner = 'star' + protocol = '10XV2' + + validationSchemaIgnoreParams = 'genomes' } diff --git a/docs/images/nfcore-scrnaseq_logo.png b/docs/images/nfcore-scrnaseq_logo.png new file mode 100644 index 00000000..23bad886 Binary files /dev/null and b/docs/images/nfcore-scrnaseq_logo.png differ diff --git a/docs/images/rnaseq_pipeline_V1.0-metro_clean.svg b/docs/images/rnaseq_pipeline_V1.0-metro_clean.svg new file mode 100644 index 00000000..a831236f --- /dev/null +++ b/docs/images/rnaseq_pipeline_V1.0-metro_clean.svg @@ -0,0 +1,3471 @@ + [flattened SVG text omitted: a metro-map style diagram of the workflows, showing the Cell Ranger / Cell Ranger Arc / Cell Ranger ATAC, UniverSC, Salmon-Alevin (with alevinQC), Kallisto (kallistobustools ref/count) and STARsolo routes, taking a csv samplesheet plus fastq input and a genome fasta + gtf reference annotation through reference indexing and count-matrix generation to mtx-to-h5ad conversion, FastQC, demultiplexing (cellsnp-lite, vireo) and the MultiQC report] diff --git a/docs/images/scrnaseq_pipeline_v1.0_metro_clean.png b/docs/images/scrnaseq_pipeline_v1.0_metro_clean.png new file mode 100644 index 00000000..74692e2f Binary files /dev/null and b/docs/images/scrnaseq_pipeline_v1.0_metro_clean.png differ diff --git
a/docs/images/scrnaseq_pipeline_v2.0_metro_clean.png b/docs/images/scrnaseq_pipeline_v2.0_metro_clean.png new file mode 100644 index 00000000..7329ee16 Binary files /dev/null and b/docs/images/scrnaseq_pipeline_v2.0_metro_clean.png differ diff --git a/docs/output.md b/docs/output.md index 22a1a79f..c1e2b013 100644 --- a/docs/output.md +++ b/docs/output.md @@ -4,42 +4,138 @@ This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - - - ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [nf-core/scrnaseq: Output](#nf-corescrnaseq-output) + - [:warning: Please read this documentation on the nf-core website: https://nf-co.re/scrnaseq/output](#warning-please-read-this-documentation-on-the-nf-core-website-httpsnf-corescrnaseqoutput) + - [Introduction](#introduction) + - [Pipeline overview](#pipeline-overview) + - [FastQC](#fastqc) + - [Kallisto & Bustools Results](#kallisto--bustools-results) + - [STARsolo](#starsolo) + - [Salmon Alevin & AlevinQC](#salmon-alevin--alevinqc) + - [Cellranger](#cellranger) + - [UniverSC](#universc) + - [Other output data](#other-output-data) + - [MultiQC](#multiqc) + - [Pipeline information](#pipeline-information) -### FastQC +## FastQC -
-Output files +See [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc) for details about FastQC. -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +The pipeline analyzes the raw data and generates a FastQC report for each input file. All reports are collected in the MultiQC report. -
+**Output directory: `results/fastqc`** -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +- `.html` + - Contains the FastQC report. +- `.zip` + - Contains additional information, such as individual plots, and FastQC raw data. -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +:::note +The FastQC plots displayed in the MultiQC report shows the unprocessed reads. Depending on the protocol, +they may contain barcodes, UMIs and adapter sequences. +::: -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +## Kallisto & Bustools Results -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +See [Kallisto](https://pachterlab.github.io/kallisto/about) for details about Kallisto and [Bustools](https://bustools.github.io/) for more information on BusTools. -:::note -The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. -::: +The pipeline can analyze data from single cell rnaseq experiments and generates a set of folders with respective outputs from various steps of the analysis. For a detailed summary what the pipeline does specifically, please follow the [excellent tutorial](https://www.kallistobus.tools/getting_started.html) that also describes specific steps for downstream analysis of the generated matrices. + +**Output directory: `results/kallisto`** + +- `raw_bus` + - Contains the unconverted BUS formatted pseudo aligned data +- `sort_bus` + - Contains the same BUS formatted data, sorted and corrected with the supplied barcode whitelist +- `kallisto_gene_map` + - Contains the converted GTF gene map that is used by BUSTools for downstream analysis +- `bustools_counts` + - Contains two subdirectories + - `eqcount`: Containing the Transcript Compatibility Count (TCC) Matrix (`tcc.mtx`) + - `genecount`: Containing the Gene Count Matrix (`gene.mtx`) +- `bustools_metrics` \* Contains the JSON metrics generated by BUStools + +For details on how to load these into R and perform further downstream analysis, please refer to the [BusTools HowTo](https://github.com/BUStools/getting_started/blob/master/getting_started.ipynb). 
+ +**Output directory: `results/reference_genome`**
+
+- `kallisto_index`
+ - Contains the index of the supplied (genome/transcriptome) fasta file
+
+## STARsolo
+
+**Output directory: `results/star`**
+
+- Files will be organized in one directory per sample
+- Contains the mapped BAM files and output metrics created by STARsolo
+
+**Output directory: `results/reference_genome`**
+
+- `star_index`
+ - Contains the index of the supplied genome fasta file
+
+## Salmon Alevin & AlevinQC
+
+**Output directory: `results/alevin`**
+
+- `alevin`
+ - Contains the created Salmon Alevin pseudo-aligned output
+- `alevinqc`
+ - Contains the QC report for the aforementioned Salmon Alevin output data
+
+**Output directory: `results/reference_genome`**
+
+- `salmon_index`
+ - Contains the indexed reference transcriptome for Salmon Alevin
+- `alevin/txp2gene.tsv`
+ - The transcriptome-to-gene mapping TSV file utilized by Salmon Alevin
+
+## Cellranger
+
+Cell Ranger is a set of analysis scripts that processes 10X Chromium single cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis. See [Cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) for more information on Cellranger.
+
+**Output directory: `results/cellranger`**
+
+- Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by Cellranger
+
+## UniverSC
+
+UniverSC is a wrapper that calls an open-source implementation of Cell Ranger v3.0.2 and adjusts run parameters for compatibility with a wide range of technologies.
+Custom inputs and at least 40 preset technologies are supported. UniverSC is developed independently of 10X Genomics, and its software is not subject
+to the 10X Genomics End User License Agreement, which restricts usage on other platforms. Therefore, in principle, UniverSC can be run on any scRNA-Seq technology
+without restrictions to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis.
+See [UniverSC](https://github.com/minoda-lab/universc) for more information on UniverSC.
+
+UniverSC has been published in _Nature Communications_:
+
+Battenberg, K., Kelly, S.T., Ras, R.A., Hetherington, N.A., Hayashi, K., and Minoda, A. (2022) A flexible cross-platform single-cell data processing pipeline. Nat Commun 13(1): 1-7. https://doi.org/10.1038/s41467-022-34681-z
+
+**Output directory: `results/universc`**
+
+- Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by the open-source implementation of Cell Ranger run via UniverSC
+
+## Other output data
+
+**Output directory: `results/reference_genome`**
+
+- `barcodes`
+ - Contains the utilized cell barcode whitelists (if applicable)
+- `extract_transcriptome`
+ - When supplied with a `--fasta` genome fasta, this contains the extracted transcriptome
+ - The GTF file supplied with `--gtf` is used to extract the transcriptome positions appropriately
+
+**Output directory: `results/${params.aligner}/mtx_conversions`**
+
+- `*_matrix.h5ad`
+ - `.mtx` files converted to [AnnData](https://anndata.readthedocs.io/en/latest/) objects in `.h5ad` format, using the [scanpy package](https://scanpy.readthedocs.io/en/stable/).
+ - One per sample, plus a single `combined_matrix.h5ad` with all samples concatenated together
-### MultiQC +## MultiQC
Output files diff --git a/docs/usage.md b/docs/usage.md index e9b83795..2b0b9ce2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,16 +1,10 @@ # nf-core/scrnaseq: Usage -## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/scrnaseq/usage](https://nf-co.re/scrnaseq/usage) - > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ -## Introduction - - - ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with at least 3 columns, and a header row as shown in the examples below. ```bash --input '[path to samplesheet file]' @@ -29,38 +23,103 @@ CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz ### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +There is a strict requirement for the first 3 columns to match those defined in the table below. -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +| Column | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Required. Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | +| `fastq_1` | Required. Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `fastq_2` | Required. Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `expected_cells` | Optional. Number of cells expected for a sample. Must be an integer. If multiple rows are provided for the same sample, this must be the same number for all rows, i.e. the total number of expected cells for the sample. | +| `seq_center` | Optional. Sequencing center for the sample. If multiple rows are provided for the same sample, this must be the same string for all rows. Samples sequenced at different centers are considered different samples and must have different identifiers. Used for STARsolo BAM outputs only. Overrides `params.seq_center`. | + +An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. 
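For example, a samplesheet using all five columns could look like this (sample names and file names are purely illustrative):

```csv title="samplesheet.csv"
sample,fastq_1,fastq_2,expected_cells,seq_center
PBMC_1,PBMC_1_S1_L001_R1_001.fastq.gz,PBMC_1_S1_L001_R2_001.fastq.gz,10000,CENTER_A
PBMC_1,PBMC_1_S1_L002_R1_001.fastq.gz,PBMC_1_S1_L002_R2_001.fastq.gz,10000,CENTER_A
PBMC_2,PBMC_2_S1_L001_R1_001.fastq.gz,PBMC_2_S1_L001_R2_001.fastq.gz,5000,CENTER_B
```

Note how both `PBMC_1` rows carry the same `expected_cells` and `seq_center` values, as required for multiple sequencing runs of the same sample.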
+ +### Expected cells
+
+This parameter is currently supported by
+
+- [Salmon Alevin](https://salmon.readthedocs.io/en/latest/alevin.html#expectcells)
+- [STARsolo](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md)
+- [Cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger)
+
+Note that since cellranger v7, it is **no longer recommended** to supply the `--expected-cells` parameter.
+
+## Aligner options
+
+By default, the pipeline uses [Salmon Alevin](https://salmon.readthedocs.io/en/latest/alevin.html) (i.e. `--aligner alevin`) to perform pseudo-alignment of reads to the reference and the downstream quantification. QC reports are then generated with AlevinQC.
+
+Other aligner options for running the pipeline are:
+
+- [Kallisto](https://pachterlab.github.io/kallisto/about) & [BUStools](https://bustools.github.io/), where kallisto is used for alignment and bustools is used for downstream analysis
+ - `--aligner kallisto`
+- [STARsolo](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md) to perform both alignment and downstream analysis.
+ - `--aligner star`
+- [Cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) to perform both alignment and downstream analysis.
+ - `--aligner cellranger`
+- [UniverSC](https://github.com/minoda-lab/universc) to run an open-source version of Cell Ranger on any technology
+ - `--aligner universc`
+
+### If using cellranger or universc
+
+This pipeline automatically renames input FASTQ files to follow the
+[naming convention by 10x](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input):
-```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+```
+[Sample Name]_S1_L00[Lane Number]_[Read Type]_001.fastq.gz
+```
-| Column | Description |
-| --------- | ----------- |
-| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
+For more details, see
-An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
+- [this issue](https://github.com/nf-core/scrnaseq/issues/241), discussing various mechanisms to deal with non-conformant filenames
+- [the README of the cellranger/count module](https://github.com/nf-core/modules/blob/master/modules/nf-core/cellranger/count/README.md), which demonstrates that renaming files does not affect the results.
+- [the code for renaming files in the cellranger/count module](https://github.com/nf-core/modules/blob/master/modules/nf-core/cellranger/count/templates/cellranger_count.py)
+- [the code for renaming files in UniverSC](https://github.com/minoda-lab/universc/blob/99a20652430c1dc9f962536a2793536f643810b7/launch_universc.sh#L1411-L1609)
+
+As a sanity check, we verify that the filenames of a pair of FASTQ files differ only by `R1`/`R2`.
+
+### Support for different scRNA-seq protocols
+
+The single-cell protocol used in the experiment can be specified using the `--protocol` flag.
+For cellranger, it is recommended to stick with the default value `'auto'` for automatic detection of the protocol.
+For all other aligners, you need to specify the protocol manually.
+
+The three 10x Genomics protocols 3' v1 (`10XV1`), 3' v2 (`10XV2`) and 3' v3 (`10XV3`) are universally supported
+by all aligners in the pipeline and mapped to the correct options automatically. If the protocol is unknown to the
+nf-core pipeline, the value specified to `--protocol` is passed to the aligner verbatim to support additional protocols.
+
+Here are some hints on running the various aligners with different protocols:
+
+#### Kallisto/bustools
+
+The command `kb --list` shows all supported, preconfigured protocols. Additionally, a custom technology string such as
+`0,0,16:0,16,26:1,0,0` can be specified:
+
+> Additionally kallisto bus will accept a string specifying a new technology in the format of bc:umi:seq where each of bc,umi and seq are a triplet of integers separated by a comma, denoting the file index, start and stop of the sequence used. For example to specify the 10xV2 technology we would use 0,0,16:0,16,26:1,0,0
+
+For more details, please refer to the [Kallisto/bustools documentation](https://pachterlab.github.io/kallisto/manual#bus).
+
+#### Alevin/fry
+
+Alevin/fry also supports custom chemistries in a slightly different format, e.g. `1{b[16]u[12]x:}2{r:}`.
+
+For more details, see the [simpleaf documentation](https://simpleaf.readthedocs.io/en/latest/quant-command.html#a-note-on-the-chemistry-flag).
+
+#### UniverSC
+
+See the [UniverSC GitHub page](https://github.com/minoda-lab/universc#pre-set-configurations) for all supported protocols.
+
+Currently, only 3' scRNA-Seq parameters are supported in Nextflow, although chemistry parameters for 5' scRNA-Seq and full-length scRNA-Seq libraries are supported by the container.
 ## Running the pipeline
-The typical command for running the pipeline is as follows:
+A typical minimal command for running the pipeline is as follows:
 ```bash
-nextflow run nf-core/scrnaseq --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker
+nextflow run nf-core/scrnaseq --input ./samplesheet.csv --outdir ./results --genome GRCh38 -profile docker
 ```
-This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
+This will launch the pipeline with the `docker` configuration profile and the default `--type` and `--barcode_whitelist`. See below for more information about profiles and these options.
 Note that the pipeline will create the following files in your working directory:
@@ -108,7 +167,7 @@ nextflow pull nf-core/scrnaseq
 It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since.
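For example, a fully reproducible invocation that pins the pipeline version and sets the aligner and protocol explicitly could look like the sketch below; the release number and file paths are illustrative only, so pick a real version from the releases page as described next:

```bash
# Hypothetical example: pin a pipeline release with -r and avoid relying on
# defaults for the aligner and protocol. Version and paths are illustrative.
nextflow run nf-core/scrnaseq \
    -r 2.3.2 \
    -profile docker \
    --input ./samplesheet.csv \
    --outdir ./results \
    --fasta ./genome.fa \
    --gtf ./genes.gtf \
    --aligner star \
    --protocol 10XV2
```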
-First, go to the [nf-core/scrnaseq releases page](https://github.com/nf-core/scrnaseq/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag.
+First, go to the [nf-core/scrnaseq releases page](https://github.com/nf-core/scrnaseq/releases) and find the latest version number - numeric only (eg. `1.0.0`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.0.0`.
 This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports.
@@ -126,6 +185,8 @@ These options are part of Nextflow and use a _single_ hyphen (pipeline parameter
 ### `-profile`
+Note that multiple profiles can be loaded in a comma-separated list, for example `-profile test,docker` - the order of arguments is important!
+
 Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments.
 Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below.
diff --git a/lib/Utils.groovy b/lib/Utils.groovy index 8d030f4e..2848f116 100644 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -13,10 +13,10 @@ class Utils {
 Yaml parser = new Yaml()
 def channels = []
 try {
- def config = parser.load("conda config --show channels".execute().text)
+ def config = parser.load('conda config --show channels'.execute().text)
 channels = config.channels
- } catch(NullPointerException | IOException e) {
- log.warn "Could not verify conda channel configuration."
+ } catch (NullPointerException | IOException e) {
+ log.warn 'Could not verify conda channel configuration.'
return } diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index bb00019b..6ec444fb 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -11,9 +11,8 @@ class WorkflowMain { // public static String citation(workflow) { return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + - // TODO nf-core: Add Zenodo DOI for pipeline after first release - //"* The pipeline\n" + - //" https://doi.org/10.5281/zenodo.XXXXXXX\n\n" + + "* The pipeline\n" + + " https://doi.org/10.5281/zenodo.3568187\n\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + diff --git a/lib/WorkflowScrnaseq.groovy b/lib/WorkflowScrnaseq.groovy index 04860c20..e4273887 100755 --- a/lib/WorkflowScrnaseq.groovy +++ b/lib/WorkflowScrnaseq.groovy @@ -4,6 +4,8 @@ import nextflow.Nextflow import groovy.text.SimpleTemplateEngine +import groovy.json.JsonSlurper + class WorkflowScrnaseq { @@ -11,9 +13,11 @@ class WorkflowScrnaseq { // Check and validate parameters // public static void initialise(params, log) { + genomeExists(params, log) - genomeExistsError(params, log) - + if (!params.input) { + Nextflow.error "Please provide an input samplesheet with --input" + } if (!params.fasta) { Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." @@ -33,16 +37,16 @@ class WorkflowScrnaseq { for (param in group_params.keySet()) { summary_section += "
    <dt>$param</dt><dd><samp>${group_params.get(param) ?: 'N/A'}</samp></dd>\n"
 }
- summary_section += "    </dl>\n"
+ summary_section += '    </dl>\n'
 }
 }
- String yaml_file_text = "id: '${workflow.manifest.name.replace('/','-')}-summary'\n"
+ String yaml_file_text = "id: '${workflow.manifest.name.replace('/', '-')}-summary'\n"
 yaml_file_text += "description: ' - this information is collected when the pipeline is started.'\n"
 yaml_file_text += "section_name: '${workflow.manifest.name} Workflow Summary'\n"
 yaml_file_text += "section_href: 'https://github.com/${workflow.manifest.name}'\n"
 yaml_file_text += "plot_type: 'html'\n"
- yaml_file_text += "data: |\n"
+ yaml_file_text += 'data: |\n'
 yaml_file_text += "${summary_section}"
 return yaml_file_text
 }
@@ -53,7 +57,7 @@ class WorkflowScrnaseq {
 public static String toolCitationText(params) {
- // TODO nf-core: Optionally add in-text citation tools to this list.
+ // TODO: Optionally add in-text citation tools to this list.
 // Can use ternary operators to dynamically construct based on conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "",
 // Uncomment function in methodsDescriptionText to render in MultiQC report
 def citation_text = [
@@ -108,8 +112,7 @@ class WorkflowScrnaseq {
 //
 // Exit pipeline if incorrect --genome key provided
- //
- private static void genomeExistsError(params, log) {
+ static void genomeExists(params, log) {
 if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
 def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
 " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" +
@@ -119,4 +122,22 @@ class WorkflowScrnaseq {
 Nextflow.error(error_string)
 }
 }
+
+ //
+ // Retrieve the aligner-specific protocol based on the specified protocol.
+ // Returns a map ["protocol": protocol, "extra_args": <extra args>, "whitelist": <path to whitelist>]
+ // extra_args and whitelist are optional.
+ public static Map getProtocol(workflow, log, aligner, protocol) {
+ def jsonSlurper = new JsonSlurper()
+ def json = new File("${workflow.projectDir}/assets/protocols.json").text
+ def protocols = jsonSlurper.parseText(json)
+ def aligner_map = protocols[aligner]
+ if (aligner_map.containsKey(protocol)) {
+ return aligner_map[protocol]
+ } else {
+ log.warn("Protocol '${protocol}' not recognized by the pipeline.
Passing on the protocol to the aligner unmodified.") + return ["protocol": protocol] + } + } + } diff --git a/main.nf b/main.nf index dad8ed70..670b64ce 100644 --- a/main.nf +++ b/main.nf @@ -13,20 +13,12 @@ nextflow.enable.dsl = 2 /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - GENOME PARAMETER VALUES + VALIDATE & PRINT PARAMETER SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// TODO nf-core: Remove this line if you don't need a FASTA file -// This is an example of how to use getGenomeAttribute() to fetch parameters -// from igenomes.config using `--genome` params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE & PRINT PARAMETER SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ +params.gtf = WorkflowMain.getGenomeAttribute(params, 'gtf') include { validateParameters; paramsHelp } from 'plugin/nf-validation' @@ -55,10 +47,11 @@ WorkflowMain.initialise(workflow, params, log) include { SCRNASEQ } from './workflows/scrnaseq' // -// WORKFLOW: Run main nf-core/scrnaseq analysis pipeline +// WORKFLOW: Run main scrnaseq analysis pipeline // -workflow NFCORE_SCRNASEQ { - SCRNASEQ () + +workflow NFCORE_SCRNASEQ{ + SCRNASEQ() } /* diff --git a/modules.json b/modules.json index 47755872..5b4e4a3f 100644 --- a/modules.json +++ b/modules.json @@ -5,19 +5,64 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "cellranger/count": { + "branch": "master", + "git_sha": "5df79e0383386a9e43462a6e81bf978ce0a6db09", + "installed_by": ["modules"] + }, + "cellranger/mkgtf": { + "branch": "master", + "git_sha": "716ef3019b66772a817b417078edce2f7b337858", + "installed_by": ["modules"] + }, + "cellranger/mkref": { + "branch": "master", + "git_sha": "716ef3019b66772a817b417078edce2f7b337858", + "installed_by": ["modules"] + }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", + "git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4", "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "git_sha": "9a4517e720bc812e95b56d23d15a1653b6db4f53", + "installed_by": ["modules"] + }, + "gffread": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "gunzip": { + "branch": "master", + "git_sha": "e06548bfa36ee31869b81041879dd6b3a83b1d57", + "installed_by": ["modules"] + }, + "kallistobustools/count": { + "branch": "master", + "git_sha": "de204d3c950f091336539ad74f0e47ddffe69ed4", + "installed_by": ["modules"] + }, + "kallistobustools/ref": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "4ab13872435962dadc239979554d13709e20bf29", + "git_sha": "a6e11ac655e744f7ebc724be669dd568ffdc0e80", + "installed_by": ["modules"] + }, + "star/genomegenerate": { + "branch": "master", + "git_sha": "cc08a888069f67cab8120259bddab8032d4c0fe3", + "installed_by": ["modules"] + }, + "universc": { + "branch": "master", + "git_sha": "cf67a6d7d043e2bd6a3099be84c72046fc71508f", "installed_by": ["modules"] } } diff --git a/modules/local/alevinqc.nf b/modules/local/alevinqc.nf new file mode 100644 index 00000000..9000d79e --- /dev/null +++ b/modules/local/alevinqc.nf @@ -0,0 
+1,46 @@ +process ALEVINQC {
+ tag "$meta.id"
+ label 'process_low'
+
+ // The alevinqc 1.14.0 container is broken, missing some libraries - thus reverting this to the previous 1.12.1 version
+ conda "bioconda::bioconductor-alevinqc=1.12.1"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/bioconductor-alevinqc:1.12.1--r41h9f5acd7_0' :
+ 'biocontainers/bioconductor-alevinqc:1.12.1--r41h9f5acd7_0' }"
+
+ input:
+ tuple val(meta), path(alevin_results)
+
+ output:
+ tuple val(meta), path("alevin_report_${meta.id}.html"), emit: report
+ path "versions.yml", emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ #!/usr/bin/env Rscript
+ require(alevinQC)
+ alevinFryQCReport(
+ mapDir = "${alevin_results}/af_map",
+ quantDir = "${alevin_results}/af_quant",
+ permitDir = "${alevin_results}/af_quant",
+ sampleId = "${prefix}",
+ outputFile = "alevin_report_${meta.id}.html",
+ outputFormat = "html_document",
+ outputDir = "./",
+ forceOverwrite = TRUE
+ )
+
+ yaml::write_yaml(
+ list(
+ '${task.process}' = list(
+ 'alevinqc' = paste(packageVersion('alevinQC'), collapse='.')
+ )
+ ),
+ "versions.yml"
+ )
+ """
+} diff --git a/modules/local/concat_h5ad.nf b/modules/local/concat_h5ad.nf new file mode 100644 index 00000000..96920f9e --- /dev/null +++ b/modules/local/concat_h5ad.nf @@ -0,0 +1,31 @@
+process CONCAT_H5AD {
+ label 'process_medium'
+
+ conda "conda-forge::scanpy conda-forge::python-igraph conda-forge::leidenalg"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/scanpy:1.7.2--pyhdfd78af_0' :
+ 'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }"
+
+ input:
+ path h5ad
+ path samplesheet
+
+ output:
+ path "*.h5ad", emit: h5ad
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ """
+ concat_h5ad.py \\
+ --input $samplesheet \\
+ --out combined_matrix.h5ad \\
+ --suffix "_matrix.h5ad"
+ """
+
+ stub:
+ """
+ touch combined_matrix.h5ad
+ """
+} diff --git a/modules/local/gene_map.nf b/modules/local/gene_map.nf new file mode 100644 index 00000000..9fd29e0a --- /dev/null +++ b/modules/local/gene_map.nf @@ -0,0 +1,34 @@
+/*
+ * Generate a transcripts-to-genes mapping from the supplied GTF for BUStools
+ */
+process GENE_MAP {
+ tag "$gtf"
+ label 'process_low'
+
+ conda "conda-forge::python=3.8.3"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' :
+ 'biocontainers/python:3.8.3' }"
+
+ input:
+ path gtf
+
+ output:
+ path "transcripts_to_genes.txt" , emit: gene_map
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ if ("${gtf}".endsWith('.gz')) {
+ name = "${gtf.baseName}"
+ unzip = "gunzip -f ${gtf}"
+ } else {
+ unzip = ""
+ name = "${gtf}"
+ }
+ """
+ $unzip
+ cat $name | t2g.py --use_version > transcripts_to_genes.txt
+ """
+} diff --git a/modules/local/gffread_transcriptome.nf b/modules/local/gffread_transcriptome.nf new file mode 100644 index 00000000..ab573b07 --- /dev/null +++ b/modules/local/gffread_transcriptome.nf @@ -0,0 +1,30 @@
+process GFFREAD_TRANSCRIPTOME {
+ tag "${genome_fasta}"
+ label 'process_low'
+
+ conda "bioconda::gffread=0.12.7"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/gffread:0.12.7--hd03093a_1' : + 'biocontainers/gffread:0.12.7--hd03093a_1' }" + + input: + path genome_fasta + path gtf + + output: + path "${genome_fasta}.transcriptome.fa", emit: transcriptome_extracted + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + gffread -F $gtf -w "${genome_fasta}.transcriptome.fa" -g $genome_fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffread: \$(gffread --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/local/gtf_gene_filter.nf b/modules/local/gtf_gene_filter.nf new file mode 100644 index 00000000..063bd228 --- /dev/null +++ b/modules/local/gtf_gene_filter.nf @@ -0,0 +1,32 @@ +process GTF_GENE_FILTER { + tag "$fasta" + label 'process_low' + + conda "conda-forge::python=3.9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + path fasta + path gtf + + output: + path "*.gtf" , emit: gtf + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // filter_gtf_for_genes_in_genome.py is bundled with the pipeline, in nf-core/rnaseq/bin/ + """ + filter_gtf_for_genes_in_genome.py \\ + --gtf $gtf \\ + --fasta $fasta \\ + -o ${fasta.baseName}_genes.gtf + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf new file mode 100644 index 00000000..7961e057 --- /dev/null +++ b/modules/local/mtx_to_h5ad.nf @@ -0,0 +1,91 @@ +process MTX_TO_H5AD { + tag "$meta.id" + label 'process_medium' + + conda "conda-forge::scanpy conda-forge::python-igraph conda-forge::leidenalg" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/scanpy:1.7.2--pyhdfd78af_0' : + 'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }" + + input: + // inputs from cellranger nf-core module does not come in a single sample dir + // for each sample, the sub-folders and files come directly in array. 
+ tuple val(meta), path(inputs) + path txp2gene + path star_index + + output: + path "${meta.id}/*h5ad", emit: h5ad + path "${meta.id}/*", emit: counts + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // def file paths for aligners (except cellranger) + if (params.aligner == 'kallisto') { + mtx_matrix = "*count/counts_unfiltered/*.mtx" + barcodes_tsv = "*count/counts_unfiltered/*.barcodes.txt" + features_tsv = "*count/counts_unfiltered/*.genes.txt" + } else if (params.aligner == 'alevin') { + mtx_matrix = "*_alevin_results/af_quant/alevin/quants_mat.mtx" + barcodes_tsv = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt" + features_tsv = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt" + } else if (params.aligner == 'star') { + mtx_matrix = "*.Solo.out/Gene*/filtered/matrix.mtx.gz" + barcodes_tsv = "*.Solo.out/Gene*/filtered/barcodes.tsv.gz" + features_tsv = "*.Solo.out/Gene*/filtered/features.tsv.gz" + } + + // + // run script + // + if (params.aligner == 'cellranger') + """ + # convert file types + mtx_to_h5ad.py \\ + --aligner ${params.aligner} \\ + --input filtered_feature_bc_matrix.h5 \\ + --sample ${meta.id} \\ + --out ${meta.id}/${meta.id}_matrix.h5ad + """ + + else if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') + """ + # convert file types + for input_type in spliced unspliced ; do + mtx_to_h5ad.py \\ + --aligner ${params.aligner} \\ + --sample ${meta.id} \\ + --input *count/counts_unfiltered/\${input_type}.mtx \\ + --barcode *count/counts_unfiltered/\${input_type}.barcodes.txt \\ + --feature *count/counts_unfiltered/\${input_type}.genes.txt \\ + --txp2gene ${txp2gene} \\ + --star_index ${star_index} \\ + --out ${meta.id}/${meta.id}_\${input_type}_matrix.h5ad ; + done + """ + + else + """ + # convert file types + mtx_to_h5ad.py \\ + --task_process ${task.process} \\ + --aligner ${params.aligner} \\ + --sample ${meta.id} \\ + --input $mtx_matrix \\ + --barcode $barcodes_tsv \\ + --feature $features_tsv \\ + --txp2gene ${txp2gene} \\ + --star_index ${star_index} \\ + --out ${meta.id}/${meta.id}_matrix.h5ad + """ + + stub: + """ + mkdir ${meta.id} + touch ${meta.id}/${meta.id}_matrix.h5ad + touch versions.yml + """ +} diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf new file mode 100644 index 00000000..4351f4b3 --- /dev/null +++ b/modules/local/mtx_to_seurat.nf @@ -0,0 +1,72 @@ +process MTX_TO_SEURAT { + tag "$meta.id" + label 'process_medium' + + conda "r-seurat" + container "nf-core/seurat:4.3.0" + + input: + // inputs from cellranger nf-core module does not come in a single sample dir + // for each sample, the sub-folders and files come directly in array. 
+ tuple val(meta), path(inputs)
+
+ output:
+ path "${meta.id}/*.rds", emit: seuratObjects
+ path "versions.yml", emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def aligner = params.aligner
+ if (params.aligner == "cellranger") {
+ matrix = "matrix.mtx.gz"
+ barcodes = "barcodes.tsv.gz"
+ features = "features.tsv.gz"
+ } else if (params.aligner == "kallisto") {
+ matrix = "*count/counts_unfiltered/*.mtx"
+ barcodes = "*count/counts_unfiltered/*.barcodes.txt"
+ features = "*count/counts_unfiltered/*.genes.txt"
+ } else if (params.aligner == "alevin") {
+ matrix = "*_alevin_results/af_quant/alevin/quants_mat.mtx"
+ barcodes = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt"
+ features = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt"
+ } else if (params.aligner == 'star') {
+ matrix = "*.Solo.out/Gene*/filtered/matrix.mtx.gz"
+ barcodes = "*.Solo.out/Gene*/filtered/barcodes.tsv.gz"
+ features = "*.Solo.out/Gene*/filtered/features.tsv.gz"
+ }
+
+ // mkdir must be part of the emitted script: a standalone string literal
+ // before the if/else would be discarded, so create the output directory
+ // at the start of each branch instead
+ if (params.aligner == 'kallisto' && params.kb_workflow != 'standard')
+ """
+ mkdir ${meta.id}
+
+ # convert file types
+ for input_type in spliced unspliced ; do
+ mtx_to_seurat.R \\
+ *count/counts_unfiltered/\${input_type}.mtx \\
+ *count/counts_unfiltered/\${input_type}.barcodes.txt \\
+ *count/counts_unfiltered/\${input_type}.genes.txt \\
+ ${meta.id}/${meta.id}_\${input_type}_matrix.rds \\
+ ${aligner}
+ done
+ """
+
+ else
+ """
+ mkdir ${meta.id}
+
+ mtx_to_seurat.R \\
+ $matrix \\
+ $barcodes \\
+ $features \\
+ ${meta.id}/${meta.id}_matrix.rds \\
+ ${aligner}
+ """
+
+ stub:
+ """
+ mkdir ${meta.id}
+ touch ${meta.id}/${meta.id}_matrix.rds
+ touch versions.yml
+ """
+} diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index d3934fdd..feaf3dfc 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -1,6 +1,6 @@
 process SAMPLESHEET_CHECK {
 tag "$samplesheet"
- label 'process_single'
+ label 'process_low'
 conda "conda-forge::python=3.8.3"
 container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/modules/local/simpleaf_index.nf b/modules/local/simpleaf_index.nf new file mode 100644 index 00000000..5e8f5c42 --- /dev/null +++ b/modules/local/simpleaf_index.nf @@ -0,0 +1,49 @@
+process SIMPLEAF_INDEX {
+ tag "$transcript_gtf"
+ label "process_medium"
+
+ conda 'bioconda::simpleaf=0.10.0-1'
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/simpleaf:0.10.0--h9f5acd7_1' :
+ 'biocontainers/simpleaf:0.10.0--h9f5acd7_1' }"
+
+ input:
+ path genome_fasta
+ path transcript_fasta
+ path transcript_gtf
+
+ output:
+ path "salmon/index" , emit: index
+ path "salmon/ref/*_t2g_3col.tsv" , emit: transcript_tsv
+ path "versions.yml" , emit: versions
+ path "salmon" , emit: salmon
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def seq_inputs = (params.transcript_fasta) ? "--refseq $transcript_fasta" : "--gtf $transcript_gtf"
+ """
+ # export required var
+ export ALEVIN_FRY_HOME=.
+ + # prep simpleaf + simpleaf set-paths + + # run simpleaf index + simpleaf \\ + index \\ + --threads $task.cpus \\ + --fasta $genome_fasta \\ + $seq_inputs \\ + $args \\ + -o salmon + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2) + salmon: \$(salmon --version | sed -e "s/salmon //g") + END_VERSIONS + """ +} diff --git a/modules/local/simpleaf_quant.nf b/modules/local/simpleaf_quant.nf new file mode 100644 index 00000000..f350acf3 --- /dev/null +++ b/modules/local/simpleaf_quant.nf @@ -0,0 +1,83 @@ +process SIMPLEAF_QUANT { + tag "$meta.id" + label 'process_high' + + conda 'bioconda::simpleaf=0.10.0-1' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/simpleaf:0.10.0--h9f5acd7_1' : + 'biocontainers/simpleaf:0.10.0--h9f5acd7_1' }" + + input: + // + // Input reads are expected to come as: [ meta, [ pair1_read1, pair1_read2, pair2_read1, pair2_read2 ] ] + // Input array for a sample is created in the same order reads appear in samplesheet as pairs from replicates are appended to array. + // + tuple val(meta), path(reads) + path index + path txp2gene + val protocol + path whitelist + + output: + tuple val(meta), path("*_alevin_results"), emit: alevin_results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args_list = args.tokenize() + def prefix = task.ext.prefix ?: "${meta.id}" + + // + // check if users are using one of the mutually excludable parameters: + // e.g -k,--knee | -e,--expect-cells | -f, --forced-cells + // + unzip_whitelist = "" + unfiltered_command = "" + save_whitelist = "" + if (!(args_list.any { it in ['-k', '--knee', '-e', '--expect-cells', '-f', '--forced-cells']} || meta.expected_cells)) { + if (whitelist) { + unzip_whitelist = "gzip -dcf $whitelist > whitelist.uncompressed.txt" + unfiltered_command = "-u whitelist.uncompressed.txt" + save_whitelist = "mv whitelist.uncompressed.txt ${prefix}_alevin_results/" + } + } + + // expected cells + def expect_cells = meta.expected_cells ? "--expect-cells $meta.expected_cells" : '' + + // separate forward from reverse pairs + def (forward, reverse) = reads.collate(2).transpose() + """ + # export required var + export ALEVIN_FRY_HOME=. + + # prep simpleaf + simpleaf set-paths + + # run simpleaf quant + $unzip_whitelist + simpleaf quant \\ + -1 ${forward.join( "," )} \\ + -2 ${reverse.join( "," )} \\ + -i ${index} \\ + -o ${prefix}_alevin_results \\ + -m $txp2gene \\ + -t $task.cpus \\ + -c "$protocol" \\ + $expect_cells \\ + $unfiltered_command \\ + $args + + $save_whitelist + [[ ! -f ${prefix}_alevin_results/af_quant/all_freq.bin ]] && cp ${prefix}_alevin_results/af_quant/permit_freq.bin ${prefix}_alevin_results/af_quant/all_freq.bin + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2) + salmon: \$(salmon --version | sed -e "s/salmon //g") + END_VERSIONS + """ +} diff --git a/modules/local/star_align.nf b/modules/local/star_align.nf new file mode 100644 index 00000000..a7dfab3d --- /dev/null +++ b/modules/local/star_align.nf @@ -0,0 +1,91 @@ +process STAR_ALIGN { + tag "$meta.id" + label 'process_high' + + conda 'bioconda::star=2.7.10b' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/star:2.7.10b--h9ee0642_0' : + 'biocontainers/star:2.7.10b--h9ee0642_0' }" + + input: + // + // Input reads are expected to come as: [ meta, [ pair1_read1, pair1_read2, pair2_read1, pair2_read2 ] ] + // Input array for a sample is created in the same order reads appear in samplesheet as pairs from replicates are appended to array. + // + tuple val(meta), path(reads) + tuple val(meta2), path(index) + path gtf + path whitelist + val protocol + val star_feature + val other_10x_parameters + + output: + tuple val(meta), path('*d.out.bam') , emit: bam + tuple val(meta), path('*.Solo.out') , emit: counts + tuple val(meta), path('*Log.final.out') , emit: log_final + tuple val(meta), path('*Log.out') , emit: log_out + tuple val(meta), path('*Log.progress.out'), emit: log_progress + path "versions.yml" , emit: versions + + tuple val(meta), path('*sortedByCoord.out.bam') , optional:true, emit: bam_sorted + tuple val(meta), path('*toTranscriptome.out.bam'), optional:true, emit: bam_transcript + tuple val(meta), path('*Aligned.unsort.out.bam') , optional:true, emit: bam_unsorted + tuple val(meta), path('*fastq.gz') , optional:true, emit: fastq + tuple val(meta), path('*.tab') , optional:true, emit: tab + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def ignore_gtf = params.star_ignore_sjdbgtf ? '' : "--sjdbGTFfile $gtf" + def seq_center = meta.seq_center ? "--outSAMattrRGline ID:$prefix 'CN:$meta.seq_center' 'SM:$prefix'" : "--outSAMattrRGline ID:$prefix 'SM:$prefix'" + def out_sam_type = (args.contains('--outSAMtype')) ? '' : '--outSAMtype BAM Unsorted' + def mv_unsorted_bam = (args.contains('--outSAMtype BAM Unsorted SortedByCoordinate')) ? "mv ${prefix}.Aligned.out.bam ${prefix}.Aligned.unsort.out.bam" : '' + // def read_pair = params.protocol.contains("chromium") ? "${reads[1]} ${reads[0]}" : "${reads[0]} ${reads[1]}" -- commented out to be removed is it is not being used + + // default values max percentile for UMI count 0.99 and max to min ratio for UMI count 10 taken from STARsolo usage + def cell_filter = meta.expected_cells ? "--soloCellFilter CellRanger2.2 $meta.expected_cells 0.99 10" : '' + + // separate forward from reverse pairs + def (forward, reverse) = reads.collate(2).transpose() + """ + STAR \\ + --genomeDir $index \\ + --readFilesIn ${reverse.join( "," )} ${forward.join( "," )} \\ + --runThreadN $task.cpus \\ + --outFileNamePrefix $prefix. 
\\ + --soloCBwhitelist <(gzip -cdf $whitelist) \\ + --soloType $protocol \\ + --soloFeatures $star_feature \\ + $other_10x_parameters \\ + $out_sam_type \\ + $ignore_gtf \\ + $seq_center \\ + $cell_filter \\ + $args \\ + + $mv_unsorted_bam + + if [ -f ${prefix}.Unmapped.out.mate1 ]; then + mv ${prefix}.Unmapped.out.mate1 ${prefix}.unmapped_1.fastq + gzip ${prefix}.unmapped_1.fastq + fi + if [ -f ${prefix}.Unmapped.out.mate2 ]; then + mv ${prefix}.Unmapped.out.mate2 ${prefix}.unmapped_2.fastq + gzip ${prefix}.unmapped_2.fastq + fi + + if [ -d ${prefix}.Solo.out ]; then + # Backslashes still need to be escaped (https://github.com/nextflow-io/nextflow/issues/67) + find ${prefix}.Solo.out \\( -name "*.tsv" -o -name "*.mtx" \\) -exec gzip {} \\; + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellranger/count/README.md b/modules/nf-core/cellranger/count/README.md new file mode 100644 index 00000000..feafbfa9 --- /dev/null +++ b/modules/nf-core/cellranger/count/README.md @@ -0,0 +1,99 @@ +# Automatically renaming files for Cellranger + +See also https://github.com/nf-core/scrnaseq/issues/241 and the [Cellranger documentation](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input). + +## Motivation + +Cellranger (and also Spaceranger, and probably other 10x pipelines) rely on the following input + +- `--fastqs`, to point to a directory with fastq files that are named according to `{sample_name}_S{i}_L00{j}_{R1,R2}_001.fastq.gz`. +- `--sample`, with a sample name (that is the prefix of all associated fastq files) + +In the easiest case, `--fastqs` points to a directory that contains all fastqs of a single sample that already follow the naming conventions. However, it's not always easy: + +1. the directory may contain fastqs for multiple samples + - this is not a problem for cellranger, it will automatically choose the correct fastq files via the `--sample` flag as long as they follow the naming convention, **but** + - if we stage the whole folder (or all files in that folder) using nextflow, it breaks caching in that if an additional sample (or any other file) gets added to the folder, the cache gets invalidated for _all_ samples. +2. the sample has been sequenced across multiple flow cells + - In this case, we need to specify multiple input folders. Cellranger allows passing a _list of directories_ e.g. `--fastqs=/path/dir1,path/dir2` + - Staging all _files_ in these folders into a single directory using nextflow doesn't do the job, as there may be duplicate file names across the different folders. +3. the raw sequencing data may have been downloaded from a sequence database and doesn't follow the naming convention anymore. + - In that case we need to rename the files to follow the bcl2fastq naming convention. + +## Solution + +Rename files automatically to match the `{sample_name}_S{i}_L00{j}_{R1,R2}_001.fastq.gz` pattern. We assume that +files in the input channel are ordered according to `file1 R1, file1 R2, file2 R1, file2 R2, ...`. +If the files are explicitly listed in a samplesheet, we have this order anyway. If the files follow any sensible naming +convention, this order can be achieved by sorting by filename. + +## Does renaming the files change the result? + +No. Here's how the same dataset was tested in four different scenarios. The result is the same down +to identical md5sums of the output files. 
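Before walking through the test, it helps to see the renaming scheme itself. A hypothetical mapping (file names invented for illustration; per the module template shown below, the lane counter starts at `L000` and increments once per R1/R2 pair):

```
fastq_001/mysample_a_R1_001.fastq.gz  ->  fastq_all/mysample_S1_L000_R1_001.fastq.gz
fastq_002/mysample_a_R2_001.fastq.gz  ->  fastq_all/mysample_S1_L000_R2_001.fastq.gz
fastq_003/mysample_b_R1_001.fastq.gz  ->  fastq_all/mysample_S1_L001_R1_001.fastq.gz
fastq_004/mysample_b_R2_001.fastq.gz  ->  fastq_all/mysample_S1_L001_R2_001.fastq.gz
```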
+ +### Prepare datasets
+
+1. Download dataset:
+
+```
+curl https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/6.1.2/10k_PBMC_3p_nextgem_Chromium_X_intron/10k_PBMC_3p_nextgem_Chromium_X_intron_fastqs.tar | tar -xv
+INPUT_DIR=10k_PBMC_3p_nextgem_Chromium_X_fastqs
+# remove index files
+rm $INPUT_DIR/*_I{1,2}_*.fastq.gz
+```
+
+2. Simulate scenario where files were generated using multiple flow cells:
+
+```
+MFC=fastq_multiflowcell
+mkdir -p $MFC/fc{1,2,3,4}
+for f in $INPUT_DIR/*.fastq.gz; do ln $f $MFC; done
+for i in $(seq 4); do mv $MFC/*L00$i* $MFC/fc$i; done
+for i in $(seq 4); do rename L00$i L001 $MFC/fc$i/*; done
+```
+
+3. Simulate scenario where sample was multiplexed:
+
+```
+MPX=fastq_multiplexed
+mkdir $MPX
+for f in $INPUT_DIR/*.fastq.gz; do ln $f $MPX; done
+for i in $(seq 4); do rename S1_L00$i S${i}_L001 $MPX/*; done
+```
+
+4. Concatenate files:
+
+```
+mkdir fastq_cat
+cat $INPUT_DIR/*_R1_*.fastq.gz > fastq_cat/10k_PBMC_3p_nextgem_Chromium_X_S1_L001_R1_001.fastq.gz
+cat $INPUT_DIR/*_R2_*.fastq.gz > fastq_cat/10k_PBMC_3p_nextgem_Chromium_X_S1_L001_R2_001.fastq.gz
+```
+
+### Run cellranger
+
+```
+singularity pull docker://docker.io/nfcore/cellranger:7.1.0
+cd $INPUT_DIR && singularity exec -B $(pwd):$(pwd) -B /cfs:/cfs docker://docker.io/nfcore/cellranger:7.1.0 cellranger count --localcores=8 --id=10k_PBMC_3p_nextgem_Chromium_X --fastqs=. --transcriptome=/cfs/sturmgre/tmp/cellranger-ref/refdata-gex-GRCh38-2020-A --localmem=64
+cd $MFC && singularity exec -B $(pwd):$(pwd) -B /cfs:/cfs docker://docker.io/nfcore/cellranger:7.1.0 cellranger count --localcores=8 --id=10k_PBMC_3p_nextgem_Chromium_X --fastqs=fc1,fc2,fc3,fc4 --transcriptome=/cfs/sturmgre/tmp/cellranger-ref/refdata-gex-GRCh38-2020-A --localmem=64
+cd $MPX && singularity exec -B $(pwd):$(pwd) -B /cfs:/cfs docker://docker.io/nfcore/cellranger:7.1.0 cellranger count --localcores=8 --id=10k_PBMC_3p_nextgem_Chromium_X --fastqs=. --transcriptome=/cfs/sturmgre/tmp/cellranger-ref/refdata-gex-GRCh38-2020-A --localmem=64
+cd fastq_cat && singularity exec -B $(pwd):$(pwd) -B /cfs:/cfs docker://docker.io/nfcore/cellranger:7.1.0 cellranger count --localcores=8 --id=10k_PBMC_3p_nextgem_Chromium_X --fastqs=.
--transcriptome=/cfs/sturmgre/tmp/cellranger-ref/refdata-gex-GRCh38-2020-A --localmem=64 +``` + +### Check results + +``` +> md5sum **/outs/*.h5 | sort +2a7a5a022f01cb8d95965980f0d95ca5 10k_PBMC_3p_nextgem_Chromium_X_fastqs/10k_PBMC_3p_nextgem_Chromium_X/outs/raw_feature_bc_matrix.h5 +2a7a5a022f01cb8d95965980f0d95ca5 fastq_cat/10k_PBMC_3p_nextgem_Chromium_X/outs/raw_feature_bc_matrix.h5 +2a7a5a022f01cb8d95965980f0d95ca5 fastq_multiflowcell/10k_PBMC_3p_nextgem_Chromium_X/outs/raw_feature_bc_matrix.h5 +2a7a5a022f01cb8d95965980f0d95ca5 fastq_multiplexed/10k_PBMC_3p_nextgem_Chromium_X/outs/raw_feature_bc_matrix.h5 +533f4156f2d90152594ebc587b7fde0a 10k_PBMC_3p_nextgem_Chromium_X_fastqs/10k_PBMC_3p_nextgem_Chromium_X/outs/filtered_feature_bc_matrix.h5 +533f4156f2d90152594ebc587b7fde0a fastq_cat/10k_PBMC_3p_nextgem_Chromium_X/outs/filtered_feature_bc_matrix.h5 +533f4156f2d90152594ebc587b7fde0a fastq_multiflowcell/10k_PBMC_3p_nextgem_Chromium_X/outs/filtered_feature_bc_matrix.h5 +533f4156f2d90152594ebc587b7fde0a fastq_multiplexed/10k_PBMC_3p_nextgem_Chromium_X/outs/filtered_feature_bc_matrix.h5 +cd33d3aa95c0d5cda7cf50908c5399c1 10k_PBMC_3p_nextgem_Chromium_X_fastqs/10k_PBMC_3p_nextgem_Chromium_X/outs/molecule_info.h5 +cd33d3aa95c0d5cda7cf50908c5399c1 fastq_cat/10k_PBMC_3p_nextgem_Chromium_X/outs/molecule_info.h5 +cd33d3aa95c0d5cda7cf50908c5399c1 fastq_multiflowcell/10k_PBMC_3p_nextgem_Chromium_X/outs/molecule_info.h5 +cd33d3aa95c0d5cda7cf50908c5399c1 fastq_multiplexed/10k_PBMC_3p_nextgem_Chromium_X/outs/molecule_info.h5 +``` diff --git a/modules/nf-core/cellranger/count/main.nf b/modules/nf-core/cellranger/count/main.nf new file mode 100644 index 00000000..d7a191fc --- /dev/null +++ b/modules/nf-core/cellranger/count/main.nf @@ -0,0 +1,42 @@ +process CELLRANGER_COUNT { + tag "$meta.id" + label 'process_high' + + container "nf-core/cellranger:7.1.0" + + input: + tuple val(meta), path(reads, stageAs: "fastq_???/*") + path reference + + output: + tuple val(meta), path("**/outs/**"), emit: outs + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "CELLRANGER_COUNT module does not support Conda. Please use Docker / Singularity / Podman instead." + } + args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + template "cellranger_count.py" + + stub: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "CELLRANGER_COUNT module does not support Conda. Please use Docker / Singularity / Podman instead." + } + def prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p "${prefix}/outs/" + touch ${prefix}/outs/fake_file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellranger: \$(echo \$( cellranger --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellranger/count/meta.yml b/modules/nf-core/cellranger/count/meta.yml new file mode 100644 index 00000000..c7d82bbc --- /dev/null +++ b/modules/nf-core/cellranger/count/meta.yml @@ -0,0 +1,49 @@ +name: cellranger_count +description: Module to use Cell Ranger's pipelines analyze sequencing data produced from Chromium Single Cell Gene Expression. 
+keywords: + - align + - count + - reference +tools: + - cellranger: + description: Cell Ranger by 10x Genomics is a set of analysis pipelines that process Chromium single-cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis, and more. + homepage: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger + documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov + tool_dev_url: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov + + licence: 10x Genomics EULA +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files. The order of the input files MUST be ["sample1 R1", "sample1 R2", "sample2, R1", + "sample2, R2", ...]. This can usually be achieved by sorting the input files by file name. + + Background: 10x data is always paired-end with R1 containing cell barcode and UMI + and R2 containing the actual read sequence. Cell Ranger requires files to adhere to the following file-name + convention: `${Sample_Name}_S1_L00${Lane_Number}_${R1,R2}_001.fastq.gz`. This module automatically + renames files to match this convention based on the order of input files to avoid various + issues (see https://github.com/nf-core/scrnaseq/issues/241). To avoid mistakes, the module + throws an error if a pair of R1 and R2 fastq files does not have the same filename except for the "_R1"/"_R2" part. + Renaming the files does not affect the results (see README.md for detailed tests). + pattern: "*{R1,R2}*.fastq.gz" + - reference: + type: directory + description: Folder containing all the reference indices needed by Cell Ranger +output: + - outs: + type: file + description: Files containing the outputs of Cell Ranger, see official 10X Genomics documentation for a complete list + pattern: "${meta.id}/outs/*" + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@ggabernet" + - "@Emiller88" diff --git a/modules/nf-core/cellranger/count/templates/cellranger_count.py b/modules/nf-core/cellranger/count/templates/cellranger_count.py new file mode 100644 index 00000000..4bfb9f4f --- /dev/null +++ b/modules/nf-core/cellranger/count/templates/cellranger_count.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +""" +Automatically rename staged files for input into cellranger count. + +Copyright (c) Gregor Sturm 2023 - MIT License +""" +from subprocess import run +from pathlib import Path +from textwrap import dedent +import shlex +import re + + +def chunk_iter(seq, size): + """iterate over `seq` in chunks of `size`""" + return (seq[pos : pos + size] for pos in range(0, len(seq), size)) + + +sample_id = "${meta.id}" + +# get fastqs, ordered by path. Files are staged into +# - "fastq_001/{original_name.fastq.gz}" +# - "fastq_002/{oritinal_name.fastq.gz}" +# - ... +# Since we require fastq files in the input channel to be ordered such that a R1/R2 pair +# of files follows each other, ordering will get us a sequence of [R1, R2, R1, R2, ...] 
+fastqs = sorted(Path(".").glob("fastq_*/*"))
+assert len(fastqs) % 2 == 0
+
+# target directory in which the renamed fastqs will be placed
+fastq_all = Path("./fastq_all")
+fastq_all.mkdir(exist_ok=True)
+
+# Match R1 in the filename, but only if it is surrounded by non-alphanumeric characters:
+# match "file_R1.fastq.gz", "file.R1_000.fastq.gz", etc., but
+# do not match "SRR12345", "file_INFIXR12", etc.
+filename_pattern = r"([^a-zA-Z0-9])R1([^a-zA-Z0-9])"
+
+for i, (r1, r2) in enumerate(chunk_iter(fastqs, 2)):
+    # double escapes are required because nextflow processes this python 'template'
+    if re.sub(filename_pattern, r"\\1R2\\2", r1.name) != r2.name:
+        raise AssertionError(
+            dedent(
+                f"""\
+                We expect R1 and R2 of the same sample to have the same filename except for R1/R2.
+                This has been checked by replacing "R1" with "R2" in the first filename and comparing it to the second filename.
+                If you believe this check shouldn't have failed on your filenames, please report an issue on GitHub!
+
+                Files involved:
+                  - {r1}
+                  - {r2}
+                """
+            )
+        )
+    r1.rename(fastq_all / f"{sample_id}_S1_L{i:03d}_R1_001.fastq.gz")
+    r2.rename(fastq_all / f"{sample_id}_S1_L{i:03d}_R2_001.fastq.gz")
+
+run(
+    # fmt: off
+    [
+        "cellranger", "count",
+        "--id", "${prefix}",
+        "--fastqs", str(fastq_all),
+        "--transcriptome", "${reference.name}",
+        "--localcores", "${task.cpus}",
+        "--localmem", "${task.memory.toGiga()}",
+        *shlex.split("""${args}""")
+    ],
+    # fmt: on
+    check=True,
+)
+
+# Output version information
+version = run(
+    ["cellranger", "-V"],
+    text=True,
+    check=True,
+    capture_output=True,
+).stdout.replace("cellranger cellranger-", "")
+
+# alas, no `pyyaml` pre-installed in the cellranger container
+with open("versions.yml", "w") as f:
+    f.write('"${task.process}":\\n')
+    f.write(f'  cellranger: "{version}"\\n')
diff --git a/modules/nf-core/cellranger/mkgtf/main.nf b/modules/nf-core/cellranger/mkgtf/main.nf
new file mode 100644
index 00000000..e0b0dd67
--- /dev/null
+++ b/modules/nf-core/cellranger/mkgtf/main.nf
@@ -0,0 +1,36 @@
+process CELLRANGER_MKGTF {
+    tag "$gtf"
+    label 'process_low'
+
+    container "nf-core/cellranger:7.1.0"
+
+    input:
+    path gtf
+
+    output:
+    path "*.gtf"        , emit: gtf
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Exit if running this module with -profile conda / -profile mamba
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "CELLRANGER_MKGTF module does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${gtf.baseName}.filtered"
+    """
+    cellranger \\
+        mkgtf \\
+        $gtf \\
+        ${prefix}.gtf \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        cellranger: \$(echo \$( cellranger --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/cellranger/mkgtf/meta.yml b/modules/nf-core/cellranger/mkgtf/meta.yml
new file mode 100644
index 00000000..e226e42d
--- /dev/null
+++ b/modules/nf-core/cellranger/mkgtf/meta.yml
@@ -0,0 +1,31 @@
+name: cellranger_mkgtf
+description: Module to build a filtered GTF needed by the 10x Genomics Cell Ranger tool. Uses the cellranger mkgtf command.
+keywords:
+  - reference
+  - mkgtf
+  - index
+tools:
+  - cellranger:
+      description: Cell Ranger by 10x Genomics is a set of analysis pipelines that process Chromium single-cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis, and more.
+      homepage: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger
+      documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov
+      tool_dev_url: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov
+
+      licence: 10x Genomics EULA
+input:
+  - gtf:
+      type: file
+      description: The reference GTF transcriptome file
+      pattern: "*.gtf"
+output:
+  - gtf:
+      type: file
+      description: The filtered GTF transcriptome file
+      pattern: "*.filtered.gtf"
+  - versions:
+      type: file
+      description: File containing software version
+      pattern: "versions.yml"
+authors:
+  - "@ggabernet"
+  - "@Emiller88"
diff --git a/modules/nf-core/cellranger/mkref/main.nf b/modules/nf-core/cellranger/mkref/main.nf
new file mode 100644
index 00000000..986891b8
--- /dev/null
+++ b/modules/nf-core/cellranger/mkref/main.nf
@@ -0,0 +1,38 @@
+process CELLRANGER_MKREF {
+    tag "$fasta"
+    label 'process_high'
+
+    container "nf-core/cellranger:7.1.0"
+
+    input:
+    path fasta
+    path gtf
+    val reference_name
+
+    output:
+    path "${reference_name}", emit: reference
+    path "versions.yml"     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Exit if running this module with -profile conda / -profile mamba
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "CELLRANGER_MKREF module does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+    def args = task.ext.args ?: ''
+    """
+    cellranger \\
+        mkref \\
+        --genome=$reference_name \\
+        --fasta=$fasta \\
+        --genes=$gtf \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        cellranger: \$(echo \$( cellranger --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/cellranger/mkref/meta.yml b/modules/nf-core/cellranger/mkref/meta.yml
new file mode 100644
index 00000000..1ad5d6e3
--- /dev/null
+++ b/modules/nf-core/cellranger/mkref/meta.yml
@@ -0,0 +1,37 @@
+name: cellranger_mkref
+description: Module to build the reference needed by the 10x Genomics Cell Ranger tool. Uses the cellranger mkref command.
+keywords:
+  - reference
+  - mkref
+  - index
+tools:
+  - cellranger:
+      description: Cell Ranger by 10x Genomics is a set of analysis pipelines that process Chromium single-cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis, and more.
+      homepage: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger
+      documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov
+      tool_dev_url: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov
+
+      licence: 10x Genomics EULA
+input:
+  - fasta:
+      type: file
+      description: Reference genome FASTA file
+      pattern: "*.{fasta,fa}"
+  - gtf:
+      type: file
+      description: Reference transcriptome GTF file
+      pattern: "*.gtf"
+  - reference_name:
+      type: string
+      description: The name to give the new reference folder
+      pattern: str
+output:
+  - reference:
+      type: directory
+      description: Folder containing all the reference indices needed by Cell Ranger
+  - versions:
+      type: file
+      description: File containing software version
+      pattern: "versions.yml"
+authors:
+  - "@ggabernet"
diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf
index 7685b33c..c9d014b1 100644
--- a/modules/nf-core/custom/dumpsoftwareversions/main.nf
+++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf
@@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS {
     label 'process_single'
 
     // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container
-    conda "${moduleDir}/environment.yml"
+    conda "bioconda::multiqc=1.15"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' :
-        'biocontainers/multiqc:1.17--pyhdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' :
+        'biocontainers/multiqc:1.15--pyhdfd78af_0' }"
 
     input:
     path versions
diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml
index 5f15a5fd..c32657de 100644
--- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml
+++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml
@@ -1,4 +1,4 @@
-# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
 name: custom_dumpsoftwareversions
 description: Custom module used to dump software versions within the nf-core pipeline template
 keywords:
@@ -16,6 +16,7 @@ input:
       type: file
      description: YML file containing software versions
       pattern: "*.yml"
+
 output:
   - yml:
       type: file
@@ -29,9 +30,7 @@ output:
       type: file
       description: File containing software versions
       pattern: "versions.yml"
+
 authors:
   - "@drpatelh"
   - "@grst"
-maintainers:
-  - "@drpatelh"
-  - "@grst"
diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py
index e55b8d43..da033408 100755
--- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py
+++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py
@@ -4,11 +4,10 @@
 
 """Provide functions to merge multiple versions.yml files."""
 
+import yaml
 import platform
 from textwrap import dedent
 
-import yaml
-
 
 def _make_versions_html(versions):
     """Generate a tabular HTML output of all versions for MultiQC."""
diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf
index 9e19a74c..249f9064 100644
--- a/modules/nf-core/fastqc/main.nf
+++ b/modules/nf-core/fastqc/main.nf
@@ -2,10 +2,10 @@ process FASTQC {
     tag "$meta.id"
     label 'process_medium'
 
-    conda "${moduleDir}/environment.yml"
+    conda "bioconda::fastqc=0.11.9"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' :
-        'biocontainers/fastqc:0.12.1--hdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' :
+        'biocontainers/fastqc:0.11.9--0' }"
 
     input:
     tuple val(meta), path(reads)
@@ -37,7 +37,7 @@ process FASTQC {
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' )
+        fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" )
     END_VERSIONS
     """
 
@@ -49,7 +49,7 @@ process FASTQC {
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' )
+        fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" )
     END_VERSIONS
     """
 }
diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml
index ee5507e0..4da5bb5a 100644
--- a/modules/nf-core/fastqc/meta.yml
+++ b/modules/nf-core/fastqc/meta.yml
@@ -50,8 +50,3 @@ authors:
   - "@grst"
   - "@ewels"
   - "@FelixKrueger"
-maintainers:
-  - "@drpatelh"
-  - "@grst"
-  - "@ewels"
-  - "@FelixKrueger"
diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test
index b9e8f926..3961de60 100644
--- a/modules/nf-core/fastqc/tests/main.nf.test
+++ b/modules/nf-core/fastqc/tests/main.nf.test
@@ -1,18 +1,13 @@
 nextflow_process {
 
     name "Test Process FASTQC"
-    script "../main.nf"
+    script "modules/nf-core/fastqc/main.nf"
     process "FASTQC"
-    tag "modules"
-    tag "modules_nfcore"
     tag "fastqc"
 
     test("Single-Read") {
         when {
-            params {
-                outdir = "$outputDir"
-            }
             process {
                 """
                 input[0] = [
@@ -26,84 +21,12 @@ nextflow_process {
         }
 
         then {
-            assertAll (
-                { assert process.success },
-                // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it.
-                // looks like this:
-                // <div id="header_filename">Mon 2 Oct 2023<br/>test.gz</div>
-                // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039
-                { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" },
-                { assert path(process.out.html.get(0).get(1)).getText().contains("<tr><td>File type</td><td>Conventional base calls</td></tr>") },
-                { assert snapshot(process.out.versions).match("versions") },
-                { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" }
-            )
+            assert process.success
+            assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html"
+            assert path(process.out.html.get(0).get(1)).getText().contains("<tr><td>File type</td><td>Conventional base calls</td></tr>")
+            assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip"
         }
-    }
 
-// TODO
-// //
-// // Test with paired-end data
-// //
-// workflow test_fastqc_paired_end {
-//     input = [
-//         [id: 'test', single_end: false], // meta map
-//         [
-//             file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true),
-//             file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true)
-//         ]
-//     ]

-//     FASTQC ( input )
-// }

-// //
-// // Test with interleaved data
-// //
-// workflow test_fastqc_interleaved {
-//     input = [
-//         [id: 'test', single_end: false], // meta map
-//         file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true)
-//     ]

-//     FASTQC ( input )
-// }

-// //
-// // Test with bam data
-// //
-// workflow test_fastqc_bam {
-//     input = [
-//         [id: 'test', single_end: false], // meta map
-//         file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true)
-//     ]

-//     FASTQC ( input )
-// }

-// //
-// // Test with multiple samples
-// //
-// workflow test_fastqc_multiple {
-//     input = [
-//         [id: 'test', single_end: false], // meta map
-//         [
-//             file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true),
-//             file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true),
-//             file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true),
-//             file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true)
-//         ]
-//     ]

-//     FASTQC ( input )
-// }

-// //
-// // Test with custom prefix
-// //
-// workflow test_fastqc_custom_prefix {
-//     input = [
-//         [ id:'mysample', single_end:true ], // meta map
-//         file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)
-//     ]
+    }

-//     FASTQC ( input )
-// }
 }
diff --git a/modules/nf-core/gffread/main.nf b/modules/nf-core/gffread/main.nf
new file mode 100644
index 00000000..f4472b0e
--- /dev/null
+++ b/modules/nf-core/gffread/main.nf
@@ -0,0 +1,33 @@
+process GFFREAD {
+    tag "$gff"
+    label 'process_low'
+
+    conda "bioconda::gffread=0.12.1"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gffread:0.12.1--h8b12597_0' :
+        'biocontainers/gffread:0.12.1--h8b12597_0' }"
+
+    input:
+    path gff
+
+    output:
+    path "*.gtf"        , emit: gtf
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${gff.baseName}"
+    """
+    gffread \\
+        $gff \\
+        $args \\
+        -o ${prefix}.gtf
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gffread: \$(gffread --version 2>&1)
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gffread/meta.yml b/modules/nf-core/gffread/meta.yml
new file mode 100644
index 00000000..20335747
--- /dev/null
+++ b/modules/nf-core/gffread/meta.yml
@@ -0,0 +1,33 @@
+name: gffread
+description: Validate, filter, convert and perform various other operations on GFF files
+keywords:
+  - gff
+  - conversion
+  - validation
+tools:
+  - gffread:
+      description: GFF/GTF utility providing format conversions, region filtering, FASTA sequence extraction and more.
+      homepage: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread
+      documentation: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread
+      tool_dev_url: https://github.com/gpertea/gffread
+      doi: 10.12688/f1000research.23297.1
+      licence: ["MIT"]
+
+input:
+  - gff:
+      type: file
+      description: A reference file in either the GFF3, GFF2 or GTF format.
+      pattern: "*.{gff,gtf}"
+
+output:
+  - gtf:
+      type: file
+      description: GTF file resulting from the conversion of the GFF input file
+      pattern: "*.{gtf}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@emiller88"
diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf
new file mode 100644
index 00000000..73bf08cd
--- /dev/null
+++ b/modules/nf-core/gunzip/main.nf
@@ -0,0 +1,48 @@
+process GUNZIP {
+    tag "$archive"
+    label 'process_single'
+
+    conda "conda-forge::sed=4.7"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
+        'nf-core/ubuntu:20.04' }"
+
+    input:
+    tuple val(meta), path(archive)
+
+    output:
+    tuple val(meta), path("$gunzip"), emit: gunzip
+    path "versions.yml"             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    gunzip = archive.toString() - '.gz'
+    """
+    # Not calling gunzip itself because it creates files
+    # with the original group ownership rather than the
+    # default one for that user / the work directory
+    gzip \\
+        -cd \\
+        $args \\
+        $archive \\
+        > $gunzip
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    gunzip = archive.toString() - '.gz'
+    """
+    touch $gunzip
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml
new file mode 100644
index 00000000..4cdcdf4c
--- /dev/null
+++ b/modules/nf-core/gunzip/meta.yml
@@ -0,0 +1,35 @@
+name: gunzip
+description: Compresses and decompresses files.
+keywords:
+  - gunzip
+  - compression
+  - decompression
+tools:
+  - gunzip:
+      description: |
+        gzip is a file format and a software application used for file compression and decompression.
+      documentation: https://www.gnu.org/software/gzip/manual/gzip.html
+      licence: ["GPL-3.0-or-later"]
+input:
+  - meta:
+      type: map
+      description: |
+        Optional groovy Map containing meta information
+        e.g. [ id:'test', single_end:false ]
+  - archive:
+      type: file
+      description: File to be compressed/uncompressed
+      pattern: "*.*"
+output:
+  - gunzip:
+      type: file
+      description: Compressed/uncompressed file
+      pattern: "*.*"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@joseespinosa"
+  - "@drpatelh"
+  - "@jfy133"
diff --git a/modules/nf-core/kallistobustools/count/main.nf b/modules/nf-core/kallistobustools/count/main.nf
new file mode 100644
index 00000000..b7942fc2
--- /dev/null
+++ b/modules/nf-core/kallistobustools/count/main.nf
@@ -0,0 +1,51 @@
+process KALLISTOBUSTOOLS_COUNT {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::kb-python=0.27.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/kb-python:0.27.2--pyhdfd78af_0' :
+        'biocontainers/kb-python:0.27.2--pyhdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    path index
+    path t2g
+    path t1c
+    path t2c
+    val technology
+
+    output:
+    tuple val(meta), path ("*.count"), emit: count
+    path "versions.yml"              , emit: versions
+    path "*.count/*/*.mtx"           , emit: matrix // Ensure that kallisto finished and produced outputs
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def cdna = t1c ? "-c1 $t1c" : ''
+    def introns = t2c ? "-c2 $t2c" : ''
+    def memory = task.memory.toGiga() - 1
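+    // the memory limit passed to `kb count -m` below is set 1 GB below the task
+    // allocation, evidently to leave headroom for the wrapper process itself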
+    """
+    kb \\
+        count \\
+        -t $task.cpus \\
+        -i $index \\
+        -g $t2g \\
+        $cdna \\
+        $introns \\
+        -x $technology \\
+        $args \\
+        -o ${prefix}.count \\
+        ${reads.join( " " )} \\
+        -m ${memory}G
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        kallistobustools: \$(echo \$(kb --version 2>&1) | sed 's/^.*kb_python //;s/positional arguments.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/kallistobustools/count/meta.yml b/modules/nf-core/kallistobustools/count/meta.yml
new file mode 100644
index 00000000..f25b7bc4
--- /dev/null
+++ b/modules/nf-core/kallistobustools/count/meta.yml
@@ -0,0 +1,69 @@
+name: kallistobustools_count
+description: Quantifies scRNA-seq data from fastq files using kb-python.
+keywords:
+  - scRNA-seq
+  - count
+  - single-cell
+  - kallisto
+  - bustools
+tools:
+  - kb:
+      description: kallisto and bustools are wrapped in an easy-to-use program called kb
+      homepage: https://www.kallistobus.tools/
+      documentation: https://kb-python.readthedocs.io/en/latest/index.html
+      tool_dev_url: https://github.com/pachterlab/kb_python
+      licence: MIT License
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+  - index:
+      type: file
+      description: kb-ref index file (.idx)
+      pattern: "*.{idx}"
+  - t2g:
+      type: file
+      description: t2g file from kallisto
+      pattern: "*t2g.txt"
+  - t1c:
+      type: file
+      description: kb ref's c1 spliced_t2c file
+      pattern: "*.{cdna_t2c.txt}"
+  - t2c:
+      type: file
+      description: kb ref's c2 unspliced_t2c file
+      pattern: "*.{introns_t2c.txt}"
+  - workflow_mode:
+      type: string
+      description: String value defining workflow to use, can be one of "standard", "lamanno", "nucleus" or "kite"
+      pattern: "{standard,lamanno,nucleus,kite}"
+  - technology:
+      type: string
+      description: String value defining the sequencing technology used.
+      pattern: "{10XV1,10XV2,10XV3,CELSEQ,CELSEQ2,DROPSEQ,INDROPSV1,INDROPSV2,INDROPSV3,SCRUBSEQ,SURECELL,SMARTSEQ}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test']
+  - count:
+      type: file
+      description: kb count output folder
+      pattern: "*.{count}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@flowuenne"
diff --git a/modules/nf-core/kallistobustools/ref/main.nf b/modules/nf-core/kallistobustools/ref/main.nf
new file mode 100644
index 00000000..9d7f1741
--- /dev/null
+++ b/modules/nf-core/kallistobustools/ref/main.nf
@@ -0,0 +1,65 @@
+process KALLISTOBUSTOOLS_REF {
+    tag "$fasta"
+    label 'process_medium'
+
+    conda "bioconda::kb-python=0.27.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/kb-python:0.27.2--pyhdfd78af_0' :
+        'biocontainers/kb-python:0.27.2--pyhdfd78af_0' }"
+
+    input:
+    path fasta
+    path gtf
+    val workflow_mode
+
+    output:
+    path "versions.yml"   , emit: versions
+    path "kb_ref_out.idx" , emit: index
+    path "t2g.txt"        , emit: t2g
+    path "cdna.fa"        , emit: cdna
+    path "intron.fa"      , optional:true, emit: intron
+    path "cdna_t2c.txt"   , optional:true, emit: cdna_t2c
+    path "intron_t2c.txt" , optional:true, emit: intron_t2c
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    if (workflow_mode == "standard") {
+        """
+        kb \\
+            ref \\
+            -i kb_ref_out.idx \\
+            -g t2g.txt \\
+            -f1 cdna.fa \\
+            --workflow $workflow_mode \\
+            $fasta \\
+            $gtf
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            kallistobustools: \$(echo \$(kb --version 2>&1) | sed 's/^.*kb_python //;s/positional arguments.*\$//')
+        END_VERSIONS
+        """
+    } else {
+        """
+        kb \\
+            ref \\
+            -i kb_ref_out.idx \\
+            -g t2g.txt \\
+            -f1 cdna.fa \\
+            -f2 intron.fa \\
+            -c1 cdna_t2c.txt \\
+            -c2 intron_t2c.txt \\
+            --workflow $workflow_mode \\
+            $fasta \\
+            $gtf
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            kallistobustools: \$(echo \$(kb --version 2>&1) | sed 's/^.*kb_python //;s/positional arguments.*\$//')
+        END_VERSIONS
+        """
+    }
+}
diff --git a/modules/nf-core/kallistobustools/ref/meta.yml b/modules/nf-core/kallistobustools/ref/meta.yml
new file mode 100644
index 00000000..aca61082
--- /dev/null
+++ b/modules/nf-core/kallistobustools/ref/meta.yml
@@ -0,0 +1,60 @@
+name: kallistobustools_ref
+description: Index creation for kb count quantification of single-cell data.
+keywords:
+  - kallisto-bustools
+  - index
+tools:
+  - kb:
+      description: kallisto|bustools (kb) is a tool developed for fast and efficient processing of single-cell OMICS data.
+      homepage: https://www.kallistobus.tools/
+      documentation: https://kb-python.readthedocs.io/en/latest/index.html
+      tool_dev_url: https://github.com/pachterlab/kb_python
+      doi: "10.22002/D1.1876"
+      licence: MIT License
+
+input:
+  - fasta:
+      type: file
+      description: Genomic DNA fasta file
+      pattern: "*.{fasta,fasta.gz}"
+  - gtf:
+      type: file
+      description: Genomic gtf file
+      pattern: "*.{gtf,gtf.gz}"
+  - workflow_mode:
+      type: string
+      description: String value defining workflow to use, can be one of "standard", "lamanno", "nucleus"
+      pattern: "{standard,lamanno,nucleus}"
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - kb_ref_idx:
+      type: file
+      description: Index file from kb ref.
+      pattern: "*.{idx}"
+  - t2g:
+      type: file
+      description: Transcript to gene table
+      pattern: "*t2g.{txt}"
+  - cdna:
+      type: file
+      description: cDNA FASTA file
+      pattern: "*cdna.{fa}"
+  - intron:
+      type: file
+      description: Intron FASTA file
+      pattern: "*intron.{fa}"
+  - cdna_t2c:
+      type: file
+      description: cDNA transcript to capture file
+      pattern: "*cdna_t2c.{txt}"
+  - intron_t2c:
+      type: file
+      description: Intron transcript to capture file
+      pattern: "*intron_t2c.{txt}"
+
+authors:
+  - "@flowuenne"
diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf
index 00cc48d2..65d7dd0d 100644
--- a/modules/nf-core/multiqc/main.nf
+++ b/modules/nf-core/multiqc/main.nf
@@ -1,10 +1,10 @@
 process MULTIQC {
     label 'process_single'
 
-    conda "${moduleDir}/environment.yml"
+    conda "bioconda::multiqc=1.15"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/multiqc:1.18--pyhdfd78af_0' :
-        'biocontainers/multiqc:1.18--pyhdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' :
+        'biocontainers/multiqc:1.15--pyhdfd78af_0' }"
 
     input:
     path multiqc_files, stageAs: "?/*"
@@ -25,14 +25,12 @@ process MULTIQC {
     def args = task.ext.args ?: ''
     def config = multiqc_config ? "--config $multiqc_config" : ''
     def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : ''
-    def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : ''
     """
     multiqc \\
         --force \\
         $args \\
         $config \\
         $extra_config \\
-        $logo \\
         .
 
     cat <<-END_VERSIONS > versions.yml
diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml
index f1aa660e..f93b5ee5 100644
--- a/modules/nf-core/multiqc/meta.yml
+++ b/modules/nf-core/multiqc/meta.yml
@@ -1,5 +1,5 @@
-# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
-name: multiqc
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: MultiQC
 description: Aggregate results from bioinformatics analyses across many samples into a single report
 keywords:
   - QC
@@ -13,6 +13,7 @@ tools:
       homepage: https://multiqc.info/
       documentation: https://multiqc.info/docs/
       licence: ["GPL-3.0-or-later"]
+
 input:
   - multiqc_files:
       type: file
@@ -30,6 +31,7 @@ input:
       type: file
       description: Optional logo file for MultiQC
       pattern: "*.{png}"
+
 output:
   - report:
       type: file
@@ -52,8 +54,3 @@ authors:
   - "@bunop"
   - "@drpatelh"
   - "@jfy133"
-maintainers:
-  - "@abhi18av"
-  - "@bunop"
-  - "@drpatelh"
-  - "@jfy133"
diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf
new file mode 100644
index 00000000..43424042
--- /dev/null
+++ b/modules/nf-core/star/genomegenerate/main.nf
@@ -0,0 +1,96 @@
+process STAR_GENOMEGENERATE {
+    tag "$fasta"
+    label 'process_high'
+
+    conda "bioconda::star=2.7.10a bioconda::samtools=1.16.1 conda-forge::gawk=5.1.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' :
+        'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+    tuple val(meta2), path(gtf)
+
+    output:
+    tuple val(meta), path("star") , emit: index
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args_list = args.tokenize()
+    def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : ''
"--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + if (args_list.contains('--genomeSAindexNbases')) { + """ + mkdir star + STAR \\ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ + --sjdbGTFfile $gtf \\ + --runThreadN $task.cpus \\ + $memory \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } else { + """ + samtools faidx $fasta + NUM_BASES=`gawk '{sum = sum + \$2}END{if ((log(sum)/log(2))/2 - 1 > 14) {printf "%.0f", 14} else {printf "%.0f", (log(sum)/log(2))/2 - 1}}' ${fasta}.fai` + + mkdir star + STAR \\ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ + --sjdbGTFfile $gtf \\ + --runThreadN $task.cpus \\ + --genomeSAindexNbases \$NUM_BASES \\ + $memory \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } + + stub: + """ + mkdir star + touch star/Genome + touch star/Log.out + touch star/SA + touch star/SAindex + touch star/chrLength.txt + touch star/chrName.txt + touch star/chrNameLength.txt + touch star/chrStart.txt + touch star/exonGeTrInfo.tab + touch star/exonInfo.tab + touch star/geneInfo.tab + touch star/genomeParameters.txt + touch star/sjdbInfo.txt + touch star/sjdbList.fromGTF.out.tab + touch star/sjdbList.out.tab + touch star/transcriptInfo.tab + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/star/genomegenerate/meta.yml b/modules/nf-core/star/genomegenerate/meta.yml new file mode 100644 index 00000000..eba2d9cf --- /dev/null +++ b/modules/nf-core/star/genomegenerate/meta.yml @@ -0,0 +1,52 @@ +name: star_genomegenerate +description: Create index for STAR +keywords: + - index + - fasta + - genome + - reference +tools: + - star: + description: | + STAR is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: https://github.com/alexdobin/STAR + manual: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf + doi: 10.1093/bioinformatics/bts635 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Fasta file of the reference genome + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - gtf: + type: file + description: GTF file of the reference genome + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
+
+        mkdir star
+        STAR \\
+            --runMode genomeGenerate \\
+            --genomeDir star/ \\
+            --genomeFastaFiles $fasta \\
+            --sjdbGTFfile $gtf \\
+            --runThreadN $task.cpus \\
+            --genomeSAindexNbases \$NUM_BASES \\
+            $memory \\
+            $args
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            star: \$(STAR --version | sed -e "s/STAR_//g")
+            samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+            gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//')
+        END_VERSIONS
+        """
+    }
+
+    stub:
+    """
+    mkdir star
+    touch star/Genome
+    touch star/Log.out
+    touch star/SA
+    touch star/SAindex
+    touch star/chrLength.txt
+    touch star/chrName.txt
+    touch star/chrNameLength.txt
+    touch star/chrStart.txt
+    touch star/exonGeTrInfo.tab
+    touch star/exonInfo.tab
+    touch star/geneInfo.tab
+    touch star/genomeParameters.txt
+    touch star/sjdbInfo.txt
+    touch star/sjdbList.fromGTF.out.tab
+    touch star/sjdbList.out.tab
+    touch star/transcriptInfo.tab
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        star: \$(STAR --version | sed -e "s/STAR_//g")
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+        gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/star/genomegenerate/meta.yml b/modules/nf-core/star/genomegenerate/meta.yml
new file mode 100644
index 00000000..eba2d9cf
--- /dev/null
+++ b/modules/nf-core/star/genomegenerate/meta.yml
@@ -0,0 +1,52 @@
+name: star_genomegenerate
+description: Create index for STAR
+keywords:
+  - index
+  - fasta
+  - genome
+  - reference
+tools:
+  - star:
+      description: |
+        STAR is a software package for mapping DNA sequences against
+        a large reference genome, such as the human genome.
+      homepage: https://github.com/alexdobin/STAR
+      manual: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf
+      doi: 10.1093/bioinformatics/bts635
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: Fasta file of the reference genome
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'test' ]
+  - gtf:
+      type: file
+      description: GTF file of the reference genome
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - index:
+      type: directory
+      description: Folder containing the star index files
+      pattern: "star"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@kevinmenden"
+  - "@drpatelh"
diff --git a/modules/nf-core/universc/CITATION.cff b/modules/nf-core/universc/CITATION.cff
new file mode 100644
index 00000000..b00957d1
--- /dev/null
+++ b/modules/nf-core/universc/CITATION.cff
@@ -0,0 +1,51 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+  - given-names: "S. Thomas"
+    family-names: "Kelly"
+    email: "tom.kelly@riken.jp"
+    affiliation: "Center for Integrative Medical Sciences, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan"
+    orcid: "https://orcid.org/0000-0003-3904-6690"
+  - family-names: "Battenberg"
+    given-names: "Kai"
+    email: "kai.battenberg@riken.jp"
+    affiliation: "Center for Sustainable Resource Science, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan"
+    orcid: "http://orcid.org/0000-0001-7517-2657"
+version: 1.2.5.1
+doi: 10.1101/2021.01.19.427209
+date-released: 2021-02-14
+url: "https://github.com/minoda-lab/universc"
+preferred-citation:
+  type: article
+  authors:
+    - given-names: "S. Thomas"
+      family-names: "Kelly"
+      email: "tom.kelly@riken.jp"
+      affiliation: "Center for Integrative Medical Sciences, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan"
+      orcid: "https://orcid.org/0000-0003-3904-6690"
+    - family-names: "Battenberg"
+      given-names: "Kai"
+      email: "kai.battenberg@riken.jp"
+      affiliation: "Center for Sustainable Resource Science, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan"
+      orcid: "https://orcid.org/0000-0001-7517-2657"
+    - family-names: "Hetherington"
+      given-names: "Nicola A."
+      affiliation: "Center for Integrative Medical Sciences, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan"
+      orcid: "http://orcid.org/0000-0001-8802-2906"
+    - family-names: "Hayashi"
+      given-names: "Makoto"
+      affiliation: "Center for Sustainable Resource Science, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan"
+      orcid: "http://orcid.org/0000-0001-6389-4265"
+    - given-names: "Aki"
+      family-names: "Minoda"
+      email: "akiko.minoda@riken.jp"
+      affiliation: "Center for Integrative Medical Sciences, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan"
+      orcid: "http://orcid.org/0000-0002-2927-5791"
+  doi: "10.1101/2021.01.19.427209"
+  title: "UniverSC: a flexible cross-platform single-cell data processing pipeline"
+  year: "2021"
+  journal: "bioRxiv"
+  start: 2021.01.19.427209
+  volume:
+  issue:
+  month: 1
diff --git a/modules/nf-core/universc/CITATION.md b/modules/nf-core/universc/CITATION.md
new file mode 100644
index 00000000..4f420bb8
--- /dev/null
+++ b/modules/nf-core/universc/CITATION.md
@@ -0,0 +1,37 @@
+### Citation
+
+A submission to a journal and bioRxiv is in progress. Please cite these when
+they are available. Currently, the package can be cited
+as follows:
+
+Kelly, S.T., Battenberg, K., Hetherington, N.A., Hayashi, M., and Minoda, A. (2021)
+UniverSC: a flexible cross-platform single-cell data processing pipeline.
+bioRxiv 2021.01.19.427209; doi: [https://doi.org/10.1101/2021.01.19.427209](https://doi.org/10.1101/2021.01.19.427209)
+package version 1.2.5.1. [https://github.com/minoda-lab/universc](https://github.com/minoda-lab/universc)
+
+```
+@article {Kelly2021.01.19.427209,
+    author = {Kelly, S. Thomas and Battenberg, Kai and Hetherington, Nicola A. and Hayashi, Makoto and Minoda, Aki},
+    title = {{UniverSC}: a flexible cross-platform single-cell data processing pipeline},
+    elocation-id = {2021.01.19.427209},
+    year = {2021},
+    doi = {10.1101/2021.01.19.427209},
+    publisher = {Cold Spring Harbor Laboratory},
+    abstract = {Single-cell RNA-sequencing analysis to quantify RNA molecules in individual cells has become popular owing to the large amount of information one can obtain from each experiment. We have developed UniverSC (https://github.com/minoda-lab/universc), a universal single-cell processing tool that supports any UMI-based platform. Our command-line tool enables consistent and comprehensive integration, comparison, and evaluation across data generated from a wide range of platforms.Competing Interest StatementThe authors have declared no competing interest.},
+    eprint = {https://www.biorxiv.org/content/early/2021/01/19/2021.01.19.427209.full.pdf},
+    journal = {{bioRxiv}},
+    note = {package version 1.2.5.1},
+    URL = {https://github.com/minoda-lab/universc},
+}
+
+```
+
+```
+@Manual{,
+    title = {{UniverSC}: a flexible cross-platform single-cell data processing pipeline},
+    author = {S. Thomas Kelly, Kai Battenberg, Nicola A. Hetherington, Makoto Hayashi, and Aki Minoda},
+    year = {2021},
+    note = {package version 1.2.5.1},
+    url = {https://github.com/minoda-lab/universc},
+  }
+```
diff --git a/modules/nf-core/universc/README.md b/modules/nf-core/universc/README.md
new file mode 100644
index 00000000..793f4f13
--- /dev/null
+++ b/modules/nf-core/universc/README.md
@@ -0,0 +1,116 @@
+# UniverSC
+
+## Single-cell processing across technologies
+
+UniverSC is an open-source single-cell pipeline that runs across platforms on various technologies.
+
+## Maintainers
+
+Tom Kelly (RIKEN, IMS)
+
+Kai Battenberg (RIKEN CSRS/IMS)
+
+Contact: \<firstname\>.\<lastname\>[at]riken.jp
+
+## Implementation
+
+This container runs Cell Ranger v3.0.2, installed from source on GitHub under the MIT License, with
+modifications for compatibility with updated dependencies. All software is installed from
+open-source repositories and available for reuse.
+
+It is _not_ subject to the 10X Genomics End User License Agreement (EULA).
+This version allows running Cell Ranger v3.0.2 on data generated from any experimental platform
+without restrictions. However, updating to newer versions of Cell Ranger subject to the
+10X EULA is not possible without the agreement of 10X Genomics.
+
+To comply with licensing and respect 10X Genomics Trademarks, the 10X Genomics logo
+has been removed from HTML reports, the tool has been renamed, and proprietary
+closed-source tools to build Cloupe files are disabled.
+
+It is still sufficient to generate summary reports and count matrices in the Cell Ranger
+output format, compatible with the single-cell analysis tools for 10x Genomics data
+available in Python and R packages.
+
+## Usage
+
+### Generating References
+
+The Cell Ranger modules can be used to generate reference indexes to run UniverSC.
+Note that UniverSC requires the open-source version v3.0.2 of Cell Ranger included
+in the nf-core/universc Docker image. The same modules can be run with the same parameters provided
+that the container is changed in the process configurations (modify nextflow.config).
+
+```
+process {
+
+...
+    withName: CELLRANGER_MKGTF {
+        container = "nf-core/universc:1.2.5.1"
+    }
+    withName: CELLRANGER_MKREF {
+        container = "nf-core/universc:1.2.5.1"
+    }
+...
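+    // any further Cell Ranger-based processes could be remapped to the same
+    // open-source image with additional `withName` blocks of this shape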
+}
+```
+
+This will generate a compatible index for UniverSC using the same version of the
+STAR aligner and a permissive software license without an EULA.
+
+### Container settings
+
+The cellranger install directory must have write permissions to run UniverSC.
+To run in docker or podman use the `--user root` option in container parameters
+and for singularity use the `--writable` parameter.
+
+These are set as defaults in universc/main.nf:
+
+```
+    container "nf-core/universc:1.2.5.1"
+    if (workflow.containerEngine == 'docker'){
+        containerOptions = "--privileged"
+    }
+    if (workflow.containerEngine == 'podman'){
+        containerOptions = "--runtime /usr/bin/crun --userns=keep-id --user root --systemd=always"
+    }
+    if (workflow.containerEngine == 'singularity'){
+        containerOptions = "--writable"
+    }
+```
+
+Select the container engine with `nextflow -profile docker` or set the environment variable
+as one of the following before running nextflow.
+
+```
+export PROFILE="docker"
+export PROFILE="podman"
+export PROFILE="singularity"
+```
+
+Note that due to dependencies installed in a docker image, it is not possible to use conda environments.
+
+## Disclaimer
+
+We are third-party developers not affiliated with 10X Genomics or any other vendor of
+single-cell technologies. We are releasing this code under an open-source license; it calls Cell Ranger
+as an external dependency.
+
+## Licensing
+
+This package is provided open-source on a GPL-3 license. This means that you are free to use and
+modify this code provided that derived works also contain this license.
+
+## Updating the package
+
+The tomkellygenetics/universc container is automatically updated as tomkellygenetics/universc:latest.
+
+A stable release is mirrored at nf-core/universc:1.2.5.1 and will be updated as needed.
+
+To build an updated container use the Dockerfile provided here:
+
+[https://github.com/minoda-lab/universc/blob/master/Dockerfile](https://github.com/minoda-lab/universc/blob/master/Dockerfile)
+
+Note that this uses a custom base image which is built with an open-source implementation of
+Cell Ranger v3.0.2 under the MIT License and relies on Python 2. The build file can be found here:
+
+[https://github.com/TomKellyGenetics/cellranger_clean/blob/master/Dockerfile](https://github.com/TomKellyGenetics/cellranger_clean/blob/master/Dockerfile)
diff --git a/modules/nf-core/universc/main.nf b/modules/nf-core/universc/main.nf
new file mode 100644
index 00000000..de8394a0
--- /dev/null
+++ b/modules/nf-core/universc/main.nf
@@ -0,0 +1,80 @@
+process UNIVERSC {
+    tag "$meta.id"
+    label 'process_medium'
+
+    container "nf-core/universc:1.2.5.1"
+    if (workflow.containerEngine == 'docker'){
+        containerOptions = "--privileged"
+    }
+    if (workflow.containerEngine == 'podman'){
+        containerOptions = "--runtime crun --userns=keep-id --systemd=always"
+    }
+    if (workflow.containerEngine == 'singularity'){
+        containerOptions = "-B /var/tmp --writable-tmpfs"
+        params.singularity_autoMounts = true
+    }
+
+    input:
+    tuple val(meta), path(reads)
+    path reference
+
+
+    output:
+    tuple val(meta), path("sample-${meta.id}/outs/*"), emit: outs
+    path "versions.yml"                              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Exit if running this module with -profile conda / -profile mamba
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "UNIVERSC module does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+    def args = task.ext.args ?: ''
+    def sample_arg = meta.samples.unique().join(",")
+    def reference_name = reference.name
+    def input_reads = meta.single_end ? "--file $reads" : "-R1 ${reads[0]} -R2 ${reads[1]}"
+    """
+    universc \\
+        --id 'sample-${meta.id}' \\
+        ${input_reads} \\
+        --technology '${meta.technology}' \\
+        --chemistry '${meta.chemistry}' \\
+        --reference ${reference_name} \\
+        --description ${sample_arg} \\
+        --jobmode "local" \\
+        --localcores ${task.cpus} \\
+        --localmem ${task.memory.toGiga()} \\
+        --per-cell-data \\
+        $args 1> _log 2> _err
+
+    # save log files
+    echo !! > sample-${meta.id}/outs/_invocation
+    cp _log sample-${meta.id}/outs/_log
+    cp _err sample-${meta.id}/outs/_err
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        cellranger: \$(echo \$(cellranger count --version 2>&1 | head -n 2 | tail -n 1 | sed 's/^.* //g' | sed 's/(//g' | sed 's/)//g' ))
+        universc: \$(echo \$(bash /universc/launch_universc.sh --version | grep version | grep universc | sed 's/^.* //g' ))
+    END_VERSIONS
+    """
+
+
+    stub:
+    // Exit if running this module with -profile conda / -profile mamba
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "UNIVERSC module does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+    """
+    mkdir -p "sample-${meta.id}/outs/"
+    touch sample-${meta.id}/outs/fake_file.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        cellranger: \$(echo \$(cellranger count --version 2>&1 | head -n 2 | tail -n 1 | sed 's/^.* //g' | sed 's/(//g' | sed 's/)//g' ))
+        universc: \$(echo \$(bash /universc/launch_universc.sh --version | grep version | grep universc | sed 's/^.* //g' ))
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/universc/meta.yml b/modules/nf-core/universc/meta.yml
new file mode 100644
index 00000000..681bb849
--- /dev/null
+++ b/modules/nf-core/universc/meta.yml
@@ -0,0 +1,42 @@
+name: "universc"
+description: Module to run UniverSC, an open-source pipeline to demultiplex and process single-cell RNA-Seq data
+keywords:
+  - demultiplex
+  - align
+  - single-cell
+  - scRNA-Seq
+  - count
+  - umi
+tools:
+  - "universc":
+      description: "UniverSC: a flexible cross-platform single-cell data processing pipeline"
+      homepage: "https://hub.docker.com/r/tomkellygenetics/universc"
+      documentation: "https://raw.githubusercontent.com/minoda-lab/universc/master/man/launch_universc.sh"
+      tool_dev_url: "https://github.com/minoda-lab/universc"
+      doi: "10.1101/2021.01.19.427209"
+      licence: ["GPL-3.0-or-later"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: FASTQ or FASTQ.GZ file, list of 2 files for paired-end data
+      pattern: "*.{fastq,fq,fastq.gz,fq.gz}"
+
+output:
+  - outs:
+      type: file
+      description: Files containing the outputs of Cell Ranger
+      pattern: "sample-${meta.id}/outs/*"
+  - versions:
+      type: file
+      description: File containing software version
+      pattern: "versions.yml"
+
+authors:
+  - "@kbattenb"
+  - "@tomkellygenetics"
diff --git a/nextflow.config b/nextflow.config
index 68563110..181aa724 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -9,16 +9,50 @@
 
 // Global default params, used in configs
 params {
 
-    // TODO nf-core: Specify your pipeline's command line flags
-    // Input options
-    input                      = null
+    // generic options
+    aligner                    = 'alevin'
+    outdir                     = null
+    input                      = null
+    save_reference             = false
+    protocol                   = 'auto'
+
+    // reference files
+    genome                     = null
+    transcript_fasta           = null
+
+    // salmon alevin parameters (simpleaf)
+    simpleaf_rlen              = 91
+    barcode_whitelist          = null
+    txp2gene                   = null
+    salmon_index               = null
+
+    // kallisto bustools parameters
+    kallisto_gene_map          = null
+    kallisto_index             = null
+    kb_workflow                = "standard"
+
+    // STARsolo parameters
+    star_index                 = null
+    star_ignore_sjdbgtf        = null
+    seq_center                 = null
+    star_feature               = "Gene"
+
+    // Cellranger parameters
+    cellranger_index           = null
+
+    // UniverSC parameters
+    universc_index             = null
+
+    // Template Boilerplate options
+    skip_multiqc               = false
+
     // References
     genome                     = null
     igenomes_base              = 's3://ngi-igenomes/igenomes/'
     igenomes_ignore            = false
-
-    // MultiQC options
+    // QC and MultiQC options
+    skip_fastqc                = false
     multiqc_config             = null
     multiqc_title              = null
     multiqc_logo               = null
@@ -26,7 +60,6 @@ params {
     multiqc_methods_description = null
 
     // Boilerplate options
-    outdir                     = null
     publish_dir_mode           = 'copy'
     email                      = null
     email_on_fail              = null
@@ -43,7 +76,7 @@ params {
     custom_config_base         = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}"
     config_profile_contact     = null
    config_profile_url         = null
-
+
     // Max resource options
     // Defaults only, expecting to be overwritten
@@ -234,7 +267,7 @@ manifest {
    mainScript      = 'main.nf'
    nextflowVersion = '!>=23.04.0'
    version         = '2.5.0dev'
-    doi             = ''
+    doi             = '10.5281/zenodo.3568187'
 }
 
 // Load modules.config for DSL2 module specific options
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 37c2128c..d3d5fec6 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -19,8 +19,7 @@
                     "mimetype": "text/csv",
                     "pattern": "^\\S+\\.csv$",
                     "description": "Path to comma-separated file containing information about the samples in the experiment.",
-                    "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/scrnaseq/usage#samplesheet-input).",
-                    "fa_icon": "fas fa-file-csv"
+                    "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row. See [usage docs](https://nf-co.re/scrnaseq/usage#samplesheet-input)."
                },
                "outdir": {
                    "type": "string",
@@ -42,6 +41,52 @@
                }
            }
        },
+        "mandatory_arguments": {
+            "title": "Mandatory arguments",
+            "type": "object",
+            "description": "",
+            "default": "",
+            "properties": {
+                "barcode_whitelist": {
+                    "type": "string",
+                    "description": "If not using the 10X Genomics platform, a custom barcode whitelist can be used with `--barcode_whitelist`.",
+                    "fa_icon": "fas fa-barcode"
+                },
+                "aligner": {
+                    "type": "string",
+                    "description": "Name of the tool to use for scRNA (pseudo-) alignment.",
+                    "default": "alevin",
+                    "help_text": "The workflow can handle the following methods:\n\n- Kallisto/Bustools\n- Salmon Alevin + AlevinQC\n- STARsolo\n- Cell Ranger\n- UniverSC\n\nTo choose which one to use, please specify either `alevin`, `star`, `kallisto`, `cellranger` or `universc` as a parameter option for `--aligner`. By default, the pipeline runs the `alevin` option. Note that specifying another aligner option also requires choosing appropriate parameters (see below) for the selected option.",
+                    "fa_icon": "fas fa-align-center",
+                    "enum": ["kallisto", "star", "alevin", "cellranger", "universc"]
+                },
+                "protocol": {
+                    "type": "string",
+                    "description": "The protocol that was used to generate the single cell data, e.g. 10x Genomics v2 Chemistry.\n\n Can be 'auto' (cellranger only), '10XV1', '10XV2', '10XV3', or any other protocol string that will get directly passed to the respective aligner.",
+                    "help_text": "The default is to auto-detect the protocol when running cellranger. For all other aligners the protocol MUST be manually specified. \n\n The following protocols are recognized by the pipeline and mapped to the corresponding protocol name of the respective aligner: '10XV1', '10XV2', '10XV3'. \n\nAny other protocol value is passed verbatim to the aligner to support other sequencing platforms. See the [kallisto](https://pachterlab.github.io/kallisto/manual#bus), [simpleaf](https://simpleaf.readthedocs.io/en/latest/quant-command.html#a-note-on-the-chemistry-flag), [starsolo](https://gensoft.pasteur.fr/docs/STAR/2.7.9a/STARsolo.html), and [universc](https://github.com/minoda-lab/universc#pre-set-configurations) documentations for more details.",
+                    "default": "auto",
+                    "fa_icon": "fas fa-cogs"
+                }
+            },
+            "fa_icon": "fas fa-terminal"
+        },
+        "skip_tools": {
+            "title": "Skip Tools",
+            "type": "object",
+            "description": "This section can be used to disable certain tools in the pipeline",
+            "default": "",
+            "fa_icon": "fas fa-forward",
+            "properties": {
+                "skip_multiqc": {
+                    "type": "boolean",
+                    "description": "Skip MultiQC Report"
+                },
+                "skip_fastqc": {
+                    "type": "boolean",
+                    "description": "Skip FastQC"
+                }
+            }
+        },
        "reference_genome_options": {
            "title": "Reference genome options",
            "type": "object",
@@ -70,6 +115,134 @@
                    "fa_icon": "fas fa-ban",
                    "hidden": true,
                    "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`."
+                },
+                "transcript_fasta": {
+                    "type": "string",
+                    "description": "A cDNA FASTA file",
+                    "fa_icon": "fas fa-dna"
+                },
+                "gtf": {
+                    "type": "string",
+                    "description": "Reference GTF annotation file",
+                    "fa_icon": "fas fa-code-branch"
+                },
+                "save_reference": {
+                    "type": "boolean",
+                    "description": "Specify this parameter to save the indices created (STAR, Kallisto, Salmon) to the results.",
+                    "fa_icon": "fas fa-bookmark"
+                },
+                "igenomes_base": {
+                    "type": "string",
+                    "format": "directory-path",
+                    "description": "Directory / URL base for iGenomes references.",
+                    "default": "s3://ngi-igenomes/igenomes",
+                    "fa_icon": "fas fa-cloud-download-alt",
+                    "hidden": true
+                }
+            }
+        },
+        "alevin_options": {
+            "title": "Alevin Options",
+            "type": "object",
+            "description": "",
+            "default": "",
+            "properties": {
+                "salmon_index": {
+                    "type": "string",
+                    "description": "This can be used to specify a precomputed Salmon index in the pipeline, in order to skip the generation of required indices by Salmon itself.",
+                    "fa_icon": "fas fa-fish"
+                },
+                "txp2gene": {
+                    "type": "string",
+                    "description": "Path to transcript to gene mapping file. This allows the specification of a transcript to gene mapping file for Salmon Alevin and AlevinQC.",
+                    "help_text": "> This is not the same as the `kallisto_gene_map` parameter below and is only used by the Salmon Alevin workflow.",
+                    "fa_icon": "fas fa-map-marked-alt"
+                },
+                "simpleaf_rlen": {
+                    "type": "integer",
+                    "default": 91,
+                    "description": "The target read length the index will be built for, using simpleaf.",
+                    "fa_icon": "fas fa-map-marked-alt"
+                }
+            }
+        },
+        "starsolo_options": {
+            "title": "STARSolo Options",
+            "type": "object",
+            "description": "",
+            "default": "",
+            "properties": {
+                "star_index": {
+                    "type": "string",
+                    "description": "Specify a path to the precomputed STAR index.",
+                    "help_text": "> NB: This has to be computed with STAR Version 2.7 or later, as STARsolo was only first supported by STAR Version 2.7.",
+                    "fa_icon": "fas fa-asterisk"
+                },
+                "star_ignore_sjdbgtf": {
+                    "type": "string",
+                    "description": "Ignore the SJDB GTF file."
+                },
+                "seq_center": {
+                    "type": "string",
+                    "description": "Name of sequencing center for BAM read group tag."
+                },
+                "star_feature": {
+                    "type": "string",
+                    "default": "Gene",
+                    "enum": ["Gene", "GeneFull", "Gene Velocyto"],
+                    "description": "Quantification type of different transcriptomic feature. Use `GeneFull` on pre-mRNA count for single-nucleus RNA-seq reads. Use `Gene Velocyto` to generate RNA velocity matrix.",
+                    "fa_icon": "fas fa-asterisk"
+                }
+            },
+            "fa_icon": "fas fa-star"
+        },
+        "kallisto_bus_options": {
+            "title": "Kallisto/BUS Options",
+            "type": "object",
+            "description": "Params related to Kallisto/BUS tool",
+            "default": "",
+            "fa_icon": "fas fa-fish",
+            "properties": {
+                "kallisto_gene_map": {
+                    "type": "string",
+                    "description": "Specify a Kallisto gene mapping file here. If you don't, this will be automatically created in the Kallisto workflow when specifying a valid `--gtf` file.",
+                    "fa_icon": "fas fa-fish"
+                },
+                "kallisto_index": {
+                    "type": "string",
+                    "description": "Specify a path to the precomputed Kallisto index.",
+                    "fa_icon": "fas fa-fish"
+                },
+                "kb_workflow": {
+                    "type": "string",
+                    "default": "standard",
+                    "description": "Type of workflow. Use `lamanno` for RNA velocity based on La Manno et al. 2018 logic. Use `nucleus` for RNA velocity on single-nucleus RNA-seq reads. Use `kite` for feature barcoding. Use `kite: 10xFB` for 10x Genomics Feature Barcoding technology. (default: standard)",
(default: standard)", + "fa_icon": "fas fa-fish", + "enum": ["standard", "lamanno", "nucleus", "kite", "kite: 10xFB"] + } + } + }, + "cellranger_options": { + "title": "Cellranger Options", + "type": "object", + "description": "Params related to the Cellranger pipeline", + "default": "", + "properties": { + "cellranger_index": { + "type": "string", + "description": "Specify a pre-calculated cellranger index. Readily prepared indexes can be obtained from the 10x Genomics website. " + } + } + }, + "universc_options": { + "title": "UniverSC Options", + "type": "object", + "description": "Params related to the Cellranger pipeline", + "default": "", + "properties": { + "universc_index": { + "type": "string", + "description": "Specify a pre-calculated cellranger index. Readily prepared indexes can be obtained from the 10x Genomics website." } } }, @@ -272,9 +445,30 @@ { "$ref": "#/definitions/input_output_options" }, + { + "$ref": "#/definitions/mandatory_arguments" + }, + { + "$ref": "#/definitions/skip_tools" + }, { "$ref": "#/definitions/reference_genome_options" }, + { + "$ref": "#/definitions/alevin_options" + }, + { + "$ref": "#/definitions/starsolo_options" + }, + { + "$ref": "#/definitions/kallisto_bus_options" + }, + { + "$ref": "#/definitions/cellranger_options" + }, + { + "$ref": "#/definitions/universc_options" + }, { "$ref": "#/definitions/institutional_config_options" }, diff --git a/subworkflows/local/alevin.nf b/subworkflows/local/alevin.nf new file mode 100644 index 00000000..764c08f8 --- /dev/null +++ b/subworkflows/local/alevin.nf @@ -0,0 +1,71 @@ +/* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ +include { GFFREAD_TRANSCRIPTOME } from '../../modules/local/gffread_transcriptome' +include { ALEVINQC } from '../../modules/local/alevinqc' +include { SIMPLEAF_INDEX } from '../../modules/local/simpleaf_index' +include { SIMPLEAF_QUANT } from '../../modules/local/simpleaf_quant' + +/* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ +include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { GFFREAD as GFFREAD_TXP2GENE } from '../../modules/nf-core/gffread/main' + +def multiqc_report = [] + +workflow SCRNASEQ_ALEVIN { + + take: + genome_fasta + gtf + transcript_fasta + salmon_index + txp2gene + barcode_whitelist + protocol + ch_fastq + + + main: + ch_versions = Channel.empty() + + assert (genome_fasta && gtf && salmon_index && txp2gene) || (genome_fasta && gtf) || (genome_fasta && gtf && transcript_fasta && txp2gene): + """Must provide a genome fasta file ('--fasta') and a gtf file ('--gtf'), or a genome fasta file + and a transcriptome fasta file ('--transcript_fasta`) if no index and txp2gene is given!""".stripIndent() + + /* + * Build salmon index + */ + if (!salmon_index) { + SIMPLEAF_INDEX( genome_fasta, transcript_fasta, gtf ) + salmon_index = SIMPLEAF_INDEX.out.index.collect() + transcript_tsv = SIMPLEAF_INDEX.out.transcript_tsv.collect() + ch_versions = ch_versions.mix(SIMPLEAF_INDEX.out.versions) + + if (!txp2gene) { + txp2gene = SIMPLEAF_INDEX.out.transcript_tsv + } + } + + + + /* + * Perform quantification with salmon alevin + */ + SIMPLEAF_QUANT ( + ch_fastq, + salmon_index, + txp2gene, + protocol, + barcode_whitelist + ) + ch_versions = ch_versions.mix(SIMPLEAF_QUANT.out.versions) + + /* + * Run alevinQC + */ + ALEVINQC( SIMPLEAF_QUANT.out.alevin_results ) + ch_versions = ch_versions.mix(ALEVINQC.out.versions) + + emit: + ch_versions + alevin_results = SIMPLEAF_QUANT.out.alevin_results + alevinqc = ALEVINQC.out.report +} diff --git 
diff --git a/subworkflows/local/align_cellranger.nf b/subworkflows/local/align_cellranger.nf
new file mode 100644
index 00000000..bfdd533e
--- /dev/null
+++ b/subworkflows/local/align_cellranger.nf
@@ -0,0 +1,47 @@
+/*
+ * Alignment with Cellranger
+ */
+
+include { CELLRANGER_MKGTF } from "../../modules/nf-core/cellranger/mkgtf/main.nf"
+include { CELLRANGER_MKREF } from "../../modules/nf-core/cellranger/mkref/main.nf"
+include { CELLRANGER_COUNT } from "../../modules/nf-core/cellranger/count/main.nf"
+
+// Build a cellranger reference (if none is supplied) and quantify reads with cellranger count
+workflow CELLRANGER_ALIGN {
+    take:
+    fasta
+    gtf
+    cellranger_index
+    ch_fastq
+    protocol
+
+    main:
+    ch_versions = Channel.empty()
+
+    assert cellranger_index || (fasta && gtf):
+        "Must provide either a cellranger index or both a fasta file ('--fasta') and a gtf file ('--gtf')."
+
+    if (!cellranger_index) {
+        // Filter GTF based on gene biotypes passed in params.modules
+        CELLRANGER_MKGTF( gtf )
+        ch_versions = ch_versions.mix(CELLRANGER_MKGTF.out.versions)
+
+        // Make reference genome
+        CELLRANGER_MKREF( fasta, CELLRANGER_MKGTF.out.gtf, "cellranger_reference" )
+        ch_versions = ch_versions.mix(CELLRANGER_MKREF.out.versions)
+        cellranger_index = CELLRANGER_MKREF.out.reference
+    }
+
+    // Obtain read counts
+    CELLRANGER_COUNT (
+        // TODO what is `gem` and why is it needed?
+        ch_fastq.map{ meta, reads -> [meta + ["chemistry": protocol, "gem": meta.id, "samples": [meta.id]], reads] },
+        cellranger_index
+    )
+    ch_versions = ch_versions.mix(CELLRANGER_COUNT.out.versions)
+
+    emit:
+    ch_versions
+    cellranger_out = CELLRANGER_COUNT.out.outs
+    star_index = cellranger_index // reused downstream under the generic star_index name
+}
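The `.map` feeding CELLRANGER_COUNT enriches each sample's meta map in place. A worked example of what one tuple looks like afterwards (sample name and chemistry string are hypothetical):

    workflow {
        def protocol = 'SC3Pv3'   // hypothetical chemistry value
        Channel.of( [ [id: 'pbmc_1k', single_end: false], ['r1.fq.gz', 'r2.fq.gz'] ] )
            .map { meta, reads -> [ meta + ["chemistry": protocol, "gem": meta.id, "samples": [meta.id]], reads ] }
            .view()
        // [[id:pbmc_1k, single_end:false, chemistry:SC3Pv3, gem:pbmc_1k, samples:[pbmc_1k]], [r1.fq.gz, r2.fq.gz]]
    }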
diff --git a/subworkflows/local/align_universc.nf b/subworkflows/local/align_universc.nf
new file mode 100644
index 00000000..f4e4038e
--- /dev/null
+++ b/subworkflows/local/align_universc.nf
@@ -0,0 +1,47 @@
+/*
+ * Alignment with the open-source Cellranger implementation, called via UniverSC
+ */
+
+include { CELLRANGER_MKGTF } from "../../modules/nf-core/cellranger/mkgtf/main.nf"
+include { CELLRANGER_MKREF } from "../../modules/nf-core/cellranger/mkref/main.nf"
+include { UNIVERSC } from "../../modules/nf-core/universc/main.nf"
+
+// Build a cellranger-compatible reference (if none is supplied) and quantify reads with UniverSC
+workflow UNIVERSC_ALIGN {
+    take:
+    fasta
+    gtf
+    universc_index
+    universc_technology
+    ch_fastq
+
+    main:
+    ch_versions = Channel.empty()
+
+    assert universc_index || (fasta && gtf):
+        "Must provide either a cellranger index or both a fasta file ('--fasta') and a gtf file ('--gtf')."
+
+    if (!universc_index) {
+        // Filter GTF based on gene biotypes passed in params.modules
+        CELLRANGER_MKGTF( gtf )
+        ch_versions = ch_versions.mix(CELLRANGER_MKGTF.out.versions)
+
+        // Make reference genome
+        CELLRANGER_MKREF( fasta, CELLRANGER_MKGTF.out.gtf, "cellranger_reference" )
+        ch_versions = ch_versions.mix(CELLRANGER_MKREF.out.versions)
+        universc_index = CELLRANGER_MKREF.out.reference
+    }
+
+    // Obtain read counts
+    UNIVERSC (
+        // TODO add technology and chemistry input parameters and set defaults
+        ch_fastq.map{ meta, reads -> [meta + ["id": meta.id, "samples": [meta.id], "technology": [universc_technology], "single_end": false, "strandedness": "forward"], reads] },
+        universc_index
+    )
+    ch_versions = ch_versions.mix(UNIVERSC.out.versions)
+
+    emit:
+    ch_versions
+    universc_out = UNIVERSC.out.outs
+    star_index = universc_index
+}
diff --git a/subworkflows/local/fastqc.nf b/subworkflows/local/fastqc.nf
new file mode 100644
index 00000000..f18214a1
--- /dev/null
+++ b/subworkflows/local/fastqc.nf
@@ -0,0 +1,38 @@
+//
+// Run FastQC on the input reads
+//
+include { FASTQC } from '../../modules/nf-core/fastqc/main'
+
+workflow FASTQC_CHECK {
+    take:
+    ch_fastq
+
+    main:
+    ch_fastq
+        .map { ch -> [ ch[0], ch[1] ] }
+        .set { ch_fastq }
+
+    /*
+     * FastQ QC using FASTQC
+     */
+    FASTQC ( ch_fastq )
+    fastqc_zip = FASTQC.out.zip
+    fastqc_html = FASTQC.out.html
+
+    fastqc_zip
+        .map { it -> [ it[1] ] }
+        .set { fastqc_zip_only }
+    fastqc_html
+        .map { it -> [ it[1] ] }
+        .set { fastqc_html_only }
+
+    fastqc_multiqc = Channel.empty()
+    fastqc_multiqc = fastqc_multiqc.mix( fastqc_zip_only, fastqc_html_only )
+    fastqc_version = FASTQC.out.versions
+
+    emit:
+    fastqc_zip
+    fastqc_html
+    fastqc_version
+    fastqc_multiqc
+}
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 0aecf87f..f5a11b18 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -1,3 +1,4 @@
+
 //
 // Check input samplesheet and get read channels
 //
@@ -13,6 +14,8 @@ workflow INPUT_CHECK {
         .csv
         .splitCsv ( header:true, sep:',' )
         .map { create_fastq_channel(it) }
+        .groupTuple(by: [0]) // group replicate files together; channel becomes [ val(meta), [ [reads_rep1], [reads_repN] ] ]
+        .map { meta, reads -> [ meta, reads.flatten() ] } // flatten so reads are one list, as nf-core modules expect: [ val(meta), [ reads ] ]
         .set { reads }

     emit:
@@ -20,12 +23,15 @@ workflow INPUT_CHECK {
     versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
 }

+
 // Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
 def create_fastq_channel(LinkedHashMap row) {
     // create meta map
     def meta = [:]
-    meta.id         = row.sample
-    meta.single_end = row.single_end.toBoolean()
+    meta.id             = row.sample
+    meta.single_end     = row.single_end.toBoolean()
+    meta.expected_cells = row.expected_cells != null ? row.expected_cells : null
+    meta.seq_center     = row.seq_center ? row.seq_center : params.seq_center

     // add path(s) of the fastq file(s) to the meta map
     def fastq_meta = []
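The new `groupTuple`/`flatten` pair is what merges technical replicates of one sample into a single reads list. A self-contained sketch with hypothetical lane files:

    workflow {
        Channel.of(
                [ [id: 'S1'], ['S1_L001_R1.fq.gz', 'S1_L001_R2.fq.gz'] ],
                [ [id: 'S1'], ['S1_L002_R1.fq.gz', 'S1_L002_R2.fq.gz'] ] )
            .groupTuple(by: [0])                              // [ [id:S1], [ [L001 pair], [L002 pair] ] ]
            .map { meta, reads -> [ meta, reads.flatten() ] } // [ [id:S1], [all four files] ]
            .view()
    }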
diff --git a/subworkflows/local/kallisto_bustools.nf b/subworkflows/local/kallisto_bustools.nf
new file mode 100644
index 00000000..3210e47a
--- /dev/null
+++ b/subworkflows/local/kallisto_bustools.nf
@@ -0,0 +1,74 @@
+/* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */
+include { GENE_MAP } from '../../modules/local/gene_map'
+include { KALLISTOBUSTOOLS_COUNT } from '../../modules/nf-core/kallistobustools/count/main'
+
+/* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */
+include { GUNZIP } from '../../modules/nf-core/gunzip/main'
+include { KALLISTOBUSTOOLS_REF } from '../../modules/nf-core/kallistobustools/ref/main'
+
+def multiqc_report = []
+
+workflow KALLISTO_BUSTOOLS {
+    take:
+    genome_fasta
+    gtf
+    kallisto_index
+    txp2gene
+    protocol
+    kb_workflow
+    ch_fastq
+
+    main:
+    ch_versions = Channel.empty()
+
+    assert kallisto_index || (genome_fasta && gtf):
+        "Must provide a genome fasta file ('--fasta') and a gtf file ('--gtf') if no index is given!"
+
+    assert txp2gene || gtf:
+        "Must provide either a GTF file ('--gtf') or a kallisto gene map ('--kallisto_gene_map') to align with kallisto bustools!"
+
+    // Default the optional t2c channels so they are defined even when a precomputed index is supplied
+    t1c = []
+    t2c = []
+
+    /*
+     * Generate Kallisto Gene Map if not supplied and index is given
+     * If no index is given, the gene map will be generated in the 'kb ref' step
+     */
+    if (!txp2gene && kallisto_index) {
+        GENE_MAP( gtf )
+        txp2gene = GENE_MAP.out.gene_map
+        ch_versions = ch_versions.mix(GENE_MAP.out.versions)
+    }
+
+    /*
+     * Generate kallisto index
+     */
+    if (!kallisto_index) {
+        KALLISTOBUSTOOLS_REF( genome_fasta, gtf, kb_workflow )
+        txp2gene = KALLISTOBUSTOOLS_REF.out.t2g.collect()
+        kallisto_index = KALLISTOBUSTOOLS_REF.out.index.collect()
+        ch_versions = ch_versions.mix(KALLISTOBUSTOOLS_REF.out.versions)
+        t1c = KALLISTOBUSTOOLS_REF.out.cdna_t2c.ifEmpty{ [] }
+        t2c = KALLISTOBUSTOOLS_REF.out.intron_t2c.ifEmpty{ [] }
+    }
+
+    /*
+     * Quantification with kallistobustools count
+     */
+    KALLISTOBUSTOOLS_COUNT(
+        ch_fastq,
+        kallisto_index,
+        txp2gene,
+        t1c,
+        t2c,
+        protocol
+    )
+
+    ch_versions = ch_versions.mix(KALLISTOBUSTOOLS_COUNT.out.versions)
+
+    emit:
+    ch_versions
+    counts = KALLISTOBUSTOOLS_COUNT.out.count
+    txp2gene = txp2gene.collect()
+}
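The `ifEmpty{ [] }` on `cdna_t2c`/`intron_t2c` above matters more than it looks: for the `standard` workflow, `kb ref` emits no t2c files, and without the empty-list fallback the count process would never receive a value and thus never run. The idiom in isolation:

    workflow {
        // an empty channel substituted with an empty list emits exactly one item
        Channel.empty().ifEmpty([]).view()   // prints: []
    }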
diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf
new file mode 100644
index 00000000..1a8381ce
--- /dev/null
+++ b/subworkflows/local/mtx_conversion.nf
@@ -0,0 +1,56 @@
+/* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */
+include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad.nf'
+include { CONCAT_H5AD } from '../../modules/local/concat_h5ad.nf'
+include { MTX_TO_SEURAT } from '../../modules/local/mtx_to_seurat.nf'
+
+workflow MTX_CONVERSION {
+
+    take:
+    mtx_matrices
+    samplesheet
+    txp2gene
+    star_index
+
+    main:
+    ch_versions = Channel.empty()
+
+    // The cellranger module output contains too many files, which can cause path collisions; filter to the ones we need.
+    if ( params.aligner == "cellranger" ) {
+        mtx_matrices = mtx_matrices.map { meta, mtx_files ->
+            [ meta, mtx_files.findAll { it.toString().contains("filtered_feature_bc_matrix") } ]
+        }
+        .filter { meta, mtx_files -> mtx_files } // Remove any that are missing the relevant files
+    }
+
+    //
+    // Convert matrix to h5ad
+    //
+    MTX_TO_H5AD (
+        mtx_matrices,
+        txp2gene,
+        star_index
+    )
+
+    //
+    // Concatenate the sample-specific h5ad files into one
+    //
+    CONCAT_H5AD (
+        MTX_TO_H5AD.out.h5ad.collect(), // gather all sample-specific files
+        samplesheet
+    )
+
+    //
+    // Convert matrix to Seurat objects
+    //
+    MTX_TO_SEURAT (
+        mtx_matrices
+    )
+
+    // TODO CONCAT_H5AD and MTX_TO_H5AD should also have a versions.yml output
+    ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions, MTX_TO_SEURAT.out.versions)
+
+    emit:
+    ch_versions
+    counts = MTX_TO_H5AD.out.counts
+
+}
diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf
new file mode 100644
index 00000000..47b2e757
--- /dev/null
+++ b/subworkflows/local/starsolo.nf
@@ -0,0 +1,64 @@
+/* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */
+include { STAR_ALIGN } from '../../modules/local/star_align'
+
+/* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */
+include { GUNZIP } from '../../modules/nf-core/gunzip/main'
+include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate/main'
+
+
+def multiqc_report = []
+
+workflow STARSOLO {
+    take:
+    genome_fasta
+    gtf
+    star_index
+    protocol
+    barcode_whitelist
+    ch_fastq
+    star_feature
+    other_10x_parameters
+
+    main:
+    ch_versions = Channel.empty()
+
+    assert star_index || (genome_fasta && gtf):
+        "Must provide a genome fasta file ('--fasta') and a gtf file ('--gtf') if no index is given!"
+
+    assert gtf: "Must provide a gtf file ('--gtf') for STARsolo"
+
+    /*
+     * Build STAR index if not supplied
+     */
+    if (!star_index) {
+        STAR_GENOMEGENERATE(
+            genome_fasta.map{ f -> [[id: f.baseName], f] },
+            gtf.map{ g -> [[id: g.baseName], g] }
+        )
+        star_index = STAR_GENOMEGENERATE.out.index.collect()
+        ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
+    }
+
+    /*
+     * Perform mapping with STAR
+     */
+    STAR_ALIGN(
+        ch_fastq,
+        star_index,
+        gtf,
+        barcode_whitelist,
+        protocol,
+        star_feature,
+        other_10x_parameters
+    )
+    ch_versions = ch_versions.mix(STAR_ALIGN.out.versions)
+
+
+    emit:
+    ch_versions
+    // strip the meta map off the star index before emitting
+    star_index = star_index.map{ meta, index -> index }
+    star_result = STAR_ALIGN.out.tab
+    star_counts = STAR_ALIGN.out.counts
+    for_multiqc = STAR_ALIGN.out.log_final
+}
diff --git a/tower.yml b/tower.yml
index 787aedfe..999e82d3 100644
--- a/tower.yml
+++ b/tower.yml
@@ -1,5 +1,11 @@
 reports:
   multiqc_report.html:
     display: "MultiQC HTML report"
-  samplesheet.csv:
-    display: "Auto-created samplesheet with collated metadata and FASTQ paths"
+  "**/fastqc/*_fastqc.html":
+    display: "FastQC HTML report"
+  "**/pipeline_info/execution_timeline_*.html":
+    display: "Execution timeline report"
+  "**/pipeline_info/execution_report_*.html":
+    display: "Execution overview report"
+  "**/star/**/*.Log.final.out":
+    display: "STAR per-sample report"
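The `findAll` filter in `mtx_conversion.nf` above is plain Groovy list filtering; a worked example against a hypothetical cellranger `outs` listing:

    workflow {
        def outs = [
            'sample1/outs/web_summary.html',
            'sample1/outs/raw_feature_bc_matrix/matrix.mtx.gz',
            'sample1/outs/filtered_feature_bc_matrix/matrix.mtx.gz'
        ]
        // keep only the filtered matrix files that the conversion modules need
        println outs.findAll { it.contains('filtered_feature_bc_matrix') }
        // -> [sample1/outs/filtered_feature_bc_matrix/matrix.mtx.gz]
    }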
diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf
index 83b66d9d..aeed5a0a 100644
--- a/workflows/scrnaseq.nf
+++ b/workflows/scrnaseq.nf
@@ -6,14 +6,16 @@
 
 include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation'
 
-def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs)
-def citation = '\n' + WorkflowMain.citation(workflow) + '\n'
 def summary_params = paramsSummaryMap(workflow)
 
-// Print parameter summary log to screen
-log.info logo + paramsSummaryLog(workflow) + citation
+def checkPathParamList = [
+    params.input, params.multiqc_config, params.fasta, params.gtf,
+    params.transcript_fasta, params.salmon_index, params.kallisto_index,
+    params.star_index, params.txp2gene, params.barcode_whitelist, params.cellranger_index,
+    params.universc_index
+]
+for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
-WorkflowScrnaseq.initialise(params, log)
 
 /*
@@ -35,8 +37,15 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-include { INPUT_CHECK } from '../subworkflows/local/input_check'
-
+include { INPUT_CHECK } from '../subworkflows/local/input_check'
+include { FASTQC_CHECK } from '../subworkflows/local/fastqc'
+include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kallisto_bustools'
+include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin'
+include { STARSOLO } from '../subworkflows/local/starsolo'
+include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger"
+include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc"
+include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion"
+include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter'
@@ -46,9 +55,8 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check'
 //
 // MODULE: Installed directly from nf-core/modules
 //
-include { FASTQC } from '../modules/nf-core/fastqc/main'
-include { MULTIQC } from '../modules/nf-core/multiqc/main'
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
+include { MULTIQC } from '../modules/nf-core/multiqc/main'
@@ -57,31 +65,167 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft
 */
 
 // Info required for completion email and summary
-def multiqc_report = []
+// TODO: Are these channels still necessary?
+ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true)
+ch_output_docs_images = file("$projectDir/docs/images/", checkIfExists: true)
+protocol_config = WorkflowScrnaseq.getProtocol(workflow, log, params.aligner, params.protocol)
+if (protocol_config['protocol'] == 'auto' && params.aligner != "cellranger") {
+    error "Only cellranger supports `protocol = 'auto'`. Please specify the protocol manually!"
+}
+
+// general input and params
+ch_input = file(params.input)
+ch_genome_fasta = Channel.value(params.fasta ? file(params.fasta) : [])
+ch_gtf = params.gtf ? file(params.gtf) : []
+ch_transcript_fasta = params.transcript_fasta ? file(params.transcript_fasta) : []
+ch_txp2gene = params.txp2gene ? file(params.txp2gene) : []
+ch_multiqc_alevin = Channel.empty()
+ch_multiqc_star = Channel.empty()
+ch_multiqc_cellranger = Channel.empty()
+if (params.barcode_whitelist) {
+    ch_barcode_whitelist = file(params.barcode_whitelist)
+} else if (protocol_config.containsKey("whitelist")) {
+    ch_barcode_whitelist = file("$projectDir/${protocol_config['whitelist']}")
+} else {
+    ch_barcode_whitelist = []
+}
+
+
+// kallisto params
+ch_kallisto_index = params.kallisto_index ? file(params.kallisto_index) : []
+kb_workflow = params.kb_workflow
+
+// salmon params
+ch_salmon_index = params.salmon_index ? file(params.salmon_index) : []
+
+// star params
+ch_star_index = params.star_index ? file(params.star_index) : []
+star_feature = params.star_feature
+
+// cellranger params
+ch_cellranger_index = params.cellranger_index ? file(params.cellranger_index) : []
+
+// universc params
+ch_universc_index = params.universc_index ? file(params.universc_index) : []
 
 workflow SCRNASEQ {
 
-    ch_versions = Channel.empty()
+    ch_versions = Channel.empty()
+    ch_mtx_matrices = Channel.empty()
+
+    // Check input files and stage input data
+    ch_fastq = INPUT_CHECK( ch_input ).reads
 
-    //
-    // SUBWORKFLOW: Read in samplesheet, validate and stage input files
-    //
-    INPUT_CHECK (
-        file(params.input)
-    )
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
     // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input")
     // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/
     // ! There is currently no tooling to help you write a sample sheet schema
 
-    //
-    // MODULE: Run FastQC
-    //
-    FASTQC (
-        INPUT_CHECK.out.reads
+    // Run FastQC
+    ch_multiqc_fastqc = Channel.empty()
+    if (!params.skip_fastqc) {
+        FASTQC_CHECK ( ch_fastq )
+        ch_versions = ch_versions.mix(FASTQC_CHECK.out.fastqc_version)
+        ch_multiqc_fastqc = FASTQC_CHECK.out.fastqc_zip
+    } else {
+        ch_multiqc_fastqc = Channel.empty()
+    }
+
+    ch_filter_gtf = GTF_GENE_FILTER ( ch_genome_fasta, ch_gtf ).gtf
+
+    // Run kallisto bustools pipeline
+    if (params.aligner == "kallisto") {
+        KALLISTO_BUSTOOLS(
+            ch_genome_fasta,
+            ch_filter_gtf,
+            ch_kallisto_index,
+            ch_txp2gene,
+            protocol_config['protocol'],
+            kb_workflow,
+            ch_fastq
+        )
+        ch_versions = ch_versions.mix(KALLISTO_BUSTOOLS.out.ch_versions)
+        ch_mtx_matrices = ch_mtx_matrices.mix(KALLISTO_BUSTOOLS.out.counts)
+        ch_txp2gene = KALLISTO_BUSTOOLS.out.txp2gene
+    }
+
+    // Run salmon alevin pipeline
+    if (params.aligner == "alevin") {
+        SCRNASEQ_ALEVIN(
+            ch_genome_fasta,
+            ch_filter_gtf,
+            ch_transcript_fasta,
+            ch_salmon_index,
+            ch_txp2gene,
+            ch_barcode_whitelist,
+            protocol_config['protocol'],
+            ch_fastq
+        )
+        ch_versions = ch_versions.mix(SCRNASEQ_ALEVIN.out.ch_versions)
+        ch_multiqc_alevin = SCRNASEQ_ALEVIN.out.alevin_results
+        ch_mtx_matrices = ch_mtx_matrices.mix(SCRNASEQ_ALEVIN.out.alevin_results)
+    }
+
+    // Run STARsolo pipeline
+    if (params.aligner == "star") {
+        STARSOLO(
+            ch_genome_fasta,
+            ch_filter_gtf,
+            ch_star_index,
+            protocol_config['protocol'],
+            ch_barcode_whitelist,
+            ch_fastq,
+            star_feature,
+            protocol_config.get('extra_args', "")
+        )
+        ch_versions = ch_versions.mix(STARSOLO.out.ch_versions)
+        ch_mtx_matrices = ch_mtx_matrices.mix(STARSOLO.out.star_counts)
+        ch_star_index = STARSOLO.out.star_index
+        ch_multiqc_star = STARSOLO.out.for_multiqc
+    }
+
+    // Run cellranger pipeline
+    if (params.aligner == "cellranger") {
+        CELLRANGER_ALIGN(
+            ch_genome_fasta,
+            ch_filter_gtf,
+            ch_cellranger_index,
+            ch_fastq,
+            protocol_config['protocol']
+        )
+        ch_versions = ch_versions.mix(CELLRANGER_ALIGN.out.ch_versions)
+        ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_out)
+        ch_star_index = CELLRANGER_ALIGN.out.star_index
+        ch_multiqc_cellranger = CELLRANGER_ALIGN.out.cellranger_out.map{
+            meta, outs -> outs.findAll{ it -> it.name == "web_summary.html" }
+        }
+    }
+
+    // Run universc pipeline
+    if (params.aligner == "universc") {
+        UNIVERSC_ALIGN(
+            ch_genome_fasta,
+            ch_filter_gtf,
+            ch_universc_index,
+            protocol_config['protocol'],
+            ch_fastq
+        )
+        ch_versions = ch_versions.mix(UNIVERSC_ALIGN.out.ch_versions)
+        ch_mtx_matrices = ch_mtx_matrices.mix(UNIVERSC_ALIGN.out.universc_out)
+    }
+
+    // Run mtx to h5ad conversion subworkflow
+    MTX_CONVERSION (
+        ch_mtx_matrices,
+        ch_input,
+        ch_txp2gene,
+        ch_star_index
     )
-    ch_versions = ch_versions.mix(FASTQC.out.versions.first())
+    // Add versions from the MTX conversion workflow too
+    ch_versions = ch_versions.mix(MTX_CONVERSION.out.ch_versions)
+
+
+    // collect software versions
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )
@@ -95,11 +239,18 @@ workflow SCRNASEQ {
     methods_description    = WorkflowScrnaseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params)
     ch_methods_description = Channel.value(methods_description)
 
+    ch_multiqc_fastqc.dump(tag: 'fastqc', pretty: true)
+    ch_multiqc_alevin.dump(tag: 'alevin', pretty: true)
+    ch_multiqc_star.dump(tag: 'star', pretty: true)
+
     ch_multiqc_files = Channel.empty()
     ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
-    ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
+    ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_fastqc.collect{ meta, qcfile -> qcfile }.ifEmpty([]))
+    ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_alevin.collect{ meta, qcfile -> qcfile }.ifEmpty([]))
+    ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_star.collect{ meta, qcfile -> qcfile }.ifEmpty([]))
+    ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_cellranger.collect().ifEmpty([]))
 
     MULTIQC (
         ch_multiqc_files.collect(),