Merge pull request #274 from heylf/cellrangerarc

Adding cellranger-arc (multiome scRNA-seq + scATAC support)
nf-core · Dec 22, 2023 · bf280ce · bf280ce
2 parents 7c77143 + c6152b0
commit bf280ce
Show file tree

Hide file tree

Showing 24 changed files with 702 additions and 109 deletions.
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -87,7 +87,8 @@ def check_samplesheet(file_in, file_out):
         ## Check header
         MIN_COLS = 2
         MIN_HEADER = ["sample", "fastq_1", "fastq_2"]
-        OPT_HEADER = ["expected_cells", "seq_center"]
+        OPT_HEADER = ["expected_cells", "seq_center", "fastq_barcode", "sample_type"]
+        SAMPLE_TYPES = ["gex", "atac"]
         header = [x.strip('"') for x in fin.readline().strip().split(",")]
 
         unknown_header = 0
@@ -101,8 +102,7 @@ def check_samplesheet(file_in, file_out):
                 min_header_count = min_header_count + 1
             colmap[h] = i
             i = i + 1
-        if min_header_count < len(MIN_HEADER):
-            # code was checking for unknown_header or min_header_count however looking at the ifelse, unknown_header does not seem that it should be tested
+        if unknown_header or min_header_count < len(MIN_HEADER):
             given = ",".join(header)
             wanted = ",".join(MIN_HEADER)
             print(f"ERROR: Please check samplesheet header -> {given} != {wanted}")
@@ -147,7 +147,26 @@ def check_samplesheet(file_in, file_out):
                 seq_center = seq_center.replace(" ", "_")
 
             ## Check FastQ file extension
-            for fastq in [fastq_1, fastq_2]:
+            fastq_list = [fastq_1, fastq_2]
+
+            fastq_barcode = ""
+            if "fastq_barcode" in header:
+                fastq_barcode = lspl[colmap["fastq_barcode"]]
+                fastq_list.append(fastq_barcode)
+
+            sample_type = ""
+            if "sample_type" in header:
+                sample_type = lspl[colmap["sample_type"]]
+                if sample_type not in SAMPLE_TYPES:
+                    print_error(
+                        "Sample type {} is not supported! Please specify either {}".format(
+                            sample_type, " or ".join(SAMPLE_TYPES)
+                        ),
+                        "Line",
+                        line,
+                    )
+
+            for fastq in fastq_list:
                 if fastq:
                     if fastq.find(" ") != -1:
                         print_error("FastQ file contains spaces!", "Line", line)
@@ -161,9 +180,9 @@ def check_samplesheet(file_in, file_out):
             ## Auto-detect paired-end/single-end
             sample_info = []  ## [single_end, fastq_1, fastq_2]
             if sample and fastq_1 and fastq_2:  ## Paired-end short reads
-                sample_info = ["0", fastq_1, fastq_2, expected_cells, seq_center]
+                sample_info = ["0", fastq_1, fastq_2, expected_cells, seq_center, fastq_barcode, sample_type]
             elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
-                sample_info = ["1", fastq_1, fastq_2, expected_cells, seq_center]
+                sample_info = ["1", fastq_1, fastq_2, expected_cells, seq_center, fastq_barcode, sample_type]
             else:
                 print_error("Invalid combination of columns provided!", "Line", line)
 
@@ -180,7 +199,21 @@ def check_samplesheet(file_in, file_out):
     ## Write validated samplesheet with appropriate columns
     if len(sample_mapping_dict) > 0:
         with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2", "expected_cells", "seq_center"]) + "\n")
+            fout.write(
+                ",".join(
+                    [
+                        "sample",
+                        "single_end",
+                        "fastq_1",
+                        "fastq_2",
+                        "expected_cells",
+                        "seq_center",
+                        "fastq_barcode",
+                        "sample_type",
+                    ]
+                )
+                + "\n"
+            )
             for sample in sorted(sample_mapping_dict.keys()):
                 ## Check that multiple runs of the same sample are of the same datatype
                 if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):

diff --git a/bin/generate_lib_csv.py b/bin/generate_lib_csv.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+import argparse
+import os
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Generate the lib.csv for cellranger-arc.")
+
+    parser.add_argument("-t", "--sample_types", dest="sample_types", help="Comma seperated list of sample types.")
+    parser.add_argument("-n", "--sample_names", dest="sample_names", help="Comma seperated list of sample names.")
+    parser.add_argument("-f", "--fastq_folder", dest="fastq_folder", help="Folder of FASTQ files.")
+    parser.add_argument("-o", "--out", dest="out", help="Output path.")
+
+    args = vars(parser.parse_args())
+
+    print(args)
+
+    sample_types = args["sample_types"].split(",")
+    sample_names = args["sample_names"].split(",")
+    unique_samples_names = set(sample_names)
+
+    lib_csv = open(args["out"], "w")
+    lib_csv.write("fastqs,sample,library_type")
+
+    for i in range(0, len(sample_types)):
+        if sample_names[i] in unique_samples_names:
+            unique_samples_names.remove(
+                sample_names[i]
+            )  # this has to be done to account for different Lane files (e.g., L002)
+            if sample_types[i] == "gex":
+                lib_csv.write("\n{},{},{}".format(args["fastq_folder"], sample_names[i], "Gene Expression"))
+            else:
+                lib_csv.write("\n{},{},{}".format(args["fastq_folder"], sample_names[i], "Chromatin Accessibility"))
+
+    lib_csv.close()
+
+    print("Wrote lib.csv file to {}".format(args["out"]))
diff --git a/conf/modules.config b/conf/modules.config
@@ -83,6 +83,32 @@ if(params.aligner == "cellranger") {
     }
 }
 
+if(params.aligner == "cellrangerarc") {
+    process {
+        withName: CELLRANGERARC_MKGTF {
+            publishDir = [
+                path: "${params.outdir}/${params.aligner}/mkgtf",
+                mode: params.publish_dir_mode,
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+            ext.args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene"
+        }
+        withName: CELLRANGERARC_MKREF {
+            publishDir = [
+                path: "${params.outdir}/${params.aligner}/mkref",
+                mode: params.publish_dir_mode
+            ]
+        }
+        withName: CELLRANGERARC_COUNT {
+            publishDir = [
+                path: "${params.outdir}/${params.aligner}/count",
+                mode: params.publish_dir_mode
+            ]
+            ext.args = {meta.expected_cells ? "--expect-cells ${meta.expected_cells}" : ''}
+        }
+    }
+}
+
 if(params.aligner == "universc") {
     process {
         publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }

diff --git a/docs/output.md b/docs/output.md
@@ -17,6 +17,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
   - [STARsolo](#starsolo)
   - [Salmon Alevin & AlevinQC](#salmon-alevin--alevinqc)
   - [Cellranger](#cellranger)
+  - [Cellranger ARC](#cellranger-arc)
   - [UniverSC](#universc)
   - [Other output data](#other-output-data)
   - [MultiQC](#multiqc)
@@ -103,6 +104,14 @@ Cell Ranger is a set of analysis scripts that processes 10X Chromium single cell
 
 - Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by Cellranger
 
+## Cellranger ARC
+
+Cell Ranger ARC is a set of analysis pipelines that process Chromium Single Cell Multiome ATAC + Gene Expression sequencing data to generate a variety of analyses pertaining to gene expression (GEX), chromatin accessibility, and their linkage. Furthermore, since the ATAC and GEX measurements are on the very same cell, we are able to perform analyses that link chromatin accessibility and GEX. See [Cellranger ARC](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc) for more information on Cellranger.
+
+**Output directory: `results/cellrangerarc`**
+
+- Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by Cellranger ARC
+
 ## UniverSC
 
 UniverSC is a wrapper that calls an open-source implementation of Cell Ranger v3.0.2 and adjusts run parameters for compatibility with a wide ranger of technologies.

diff --git a/docs/usage.md b/docs/usage.md
@@ -111,6 +111,53 @@ See the [UniverSC GitHub page](https://github.com/minoda-lab/universc#pre-set-co
 
 Currently only 3\' scRNA-Seq parameters are supported in nextflow, although chemistry parameters for 5\' scRNA-Seq and full-length scRNA-Seq libraries are supported by teh container.
 
+### If using cellranger-arc
+
+#### Automatic file name detection
+
+This pipeline currently **does not** automatically renames input FASTQ files to follow the
+[naming convention by 10x](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input):
+
+```
+[Sample Name]_S1_L00[Lane Number]_[Read Type]_001.fastq.gz
+```
+
+Thus please make sure your files follow this naming convention.
+
+#### Sample sheet definition
+
+If you are using cellranger-arc you have to add the column _sample_type_ (atac for scATAC or gex for scRNA) and _fastq_barcode_ (part of the scATAC data) to your samplesheet as an input.
+
+**Beware of the following points:**
+
+- It is important that you give your scRNA and scATAC different [Sample Name]s.
+- Check first which file is your barcode fastq file for your scATAC data ([see](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/using/fastq-input)).
+- If you have more than one sequencing run then you have to give them another suffix (e.g., rep\*) to your [Sample Name] ([see](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/fastq-input#atac_quick_start)).
+
+An example samplesheet for a dataset called test_scARC that has two sequencing runs for the scATAC and one seqeuncing run
+from two lanes for the scRNA could look like this:
+
+```csv
+sample,fastq_1,fastq_2,fastq_barcode,sample_type
+test_scARC,path/test_scARC_atac_rep1_S1_L001_R1_001.fastq.gz,path/test_scARC_atac_rep1_S1_L001_R2_001.fastq.gz,path/test_scARC_atac_rep1_S1_L001_I2_001.fastq.gz,atac
+test_scARC,path/test_scARC_atac_rep2_S2_L001_R1_001.fastq.gz,path/test_scARC_atac_rep2_S2_L001_R2_001.fastq.gz,path/test_scARC_atac_rep2_S2_L001_I2_001.fastq.gz,atac
+test_scARC,path/test_scARC_gex_S1_L001_R1_001.fastq.gz,path/test_scARC_gex_S1_L001_R2_001.fastq.gz,,gex
+test_scARC,path/test_scARC_gex_S1_L002_R1_001.fastq.gz,path/test_scARC_gex_S1_L002_R2_001.fastq.gz,,gex
+```
+
+#### Config file and index
+
+Cellranger-arc needs a reference index directory that you can provide with `--cellranger_index`. Be aware, you can use
+for cellranger-arc the same index you use for cellranger ([see](https://kb.10xgenomics.com/hc/en-us/articles/4408281606797-Are-the-references-interchangeable-between-pipelines)).
+Yet, a cellranger-arc index might include additional data (e.g., TF binding motifs). Therefore, please first check if
+you have to create a new cellranger-arc index ([see here](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/advanced/references) for
+more information)
+
+If you decide to create a cellranger-arc index, then you need to create a config file to generate the index. The pipeline
+can do this autmatically for you if you provide a `--fasta`, `--gtf`, and an optional `--motif` file. However, you can
+also decide to provide your own config file with `--cellrangerarc_config`, then you also have to specify with `--cellrangerarc_reference`
+the reference genome name that you have used and stated as _genome:_ in your config file.
+
 ## Running the pipeline
 
 The minimum typical command for running the pipeline is as follows:

diff --git a/modules.json b/modules.json
@@ -64,6 +64,11 @@
                         "branch": "master",
                         "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
                         "installed_by": ["modules"]
+                    },
+                    "unzip": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
                     }
                 }
             }

diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf
@@ -41,11 +41,11 @@ process MTX_TO_H5AD {
     //
     // run script
     //
-    if (params.aligner == 'cellranger')
+    if (params.aligner in [ 'cellranger', 'cellrangerarc' ])
     """
     # convert file types
     mtx_to_h5ad.py \\
-        --aligner ${params.aligner} \\
+        --aligner cellranger \\
         --input filtered_feature_bc_matrix.h5 \\
         --sample ${meta.id} \\
         --out ${meta.id}/${meta.id}_matrix.h5ad

diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf
@@ -19,10 +19,10 @@ process MTX_TO_SEURAT {
 
     script:
     def aligner = params.aligner
-    if (params.aligner == "cellranger") {
-        matrix   = "matrix.mtx.gz"
-        barcodes = "barcodes.tsv.gz"
-        features = "features.tsv.gz"
+    if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) {
+        matrix   = "filtered_feature_bc_matrix/matrix.mtx.gz"
+        barcodes = "filtered_feature_bc_matrix/barcodes.tsv.gz"
+        features = "filtered_feature_bc_matrix/features.tsv.gz"
     } else if (params.aligner == "kallisto") {
         matrix   = "*count/counts_unfiltered/*.mtx"
         barcodes = "*count/counts_unfiltered/*.barcodes.txt"

diff --git a/modules/nf-core/cellranger/count/templates/cellranger_count.py b/modules/nf-core/cellranger/count/templates/cellranger_count.py
diff --git a/modules/nf-core/cellrangerarc/Dockerfile b/modules/nf-core/cellrangerarc/Dockerfile