Skip to content

Commit

Permalink
Merge pull request #274 from heylf/cellrangerarc
Browse files Browse the repository at this point in the history
Adding cellranger-arc (multiome scRNA-seq + scATAC support)
  • Loading branch information
grst authored Dec 22, 2023
2 parents 7c77143 + c6152b0 commit bf280ce
Show file tree
Hide file tree
Showing 24 changed files with 702 additions and 109 deletions.
47 changes: 40 additions & 7 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ def check_samplesheet(file_in, file_out):
## Check header
MIN_COLS = 2
MIN_HEADER = ["sample", "fastq_1", "fastq_2"]
OPT_HEADER = ["expected_cells", "seq_center"]
OPT_HEADER = ["expected_cells", "seq_center", "fastq_barcode", "sample_type"]
SAMPLE_TYPES = ["gex", "atac"]
header = [x.strip('"') for x in fin.readline().strip().split(",")]

unknown_header = 0
Expand All @@ -101,8 +102,7 @@ def check_samplesheet(file_in, file_out):
min_header_count = min_header_count + 1
colmap[h] = i
i = i + 1
if min_header_count < len(MIN_HEADER):
# code was checking for unknown_header or min_header_count however looking at the ifelse, unknown_header does not seem that it should be tested
if unknown_header or min_header_count < len(MIN_HEADER):
given = ",".join(header)
wanted = ",".join(MIN_HEADER)
print(f"ERROR: Please check samplesheet header -> {given} != {wanted}")
Expand Down Expand Up @@ -147,7 +147,26 @@ def check_samplesheet(file_in, file_out):
seq_center = seq_center.replace(" ", "_")

## Check FastQ file extension
for fastq in [fastq_1, fastq_2]:
fastq_list = [fastq_1, fastq_2]

fastq_barcode = ""
if "fastq_barcode" in header:
fastq_barcode = lspl[colmap["fastq_barcode"]]
fastq_list.append(fastq_barcode)

sample_type = ""
if "sample_type" in header:
sample_type = lspl[colmap["sample_type"]]
if sample_type not in SAMPLE_TYPES:
print_error(
"Sample type {} is not supported! Please specify either {}".format(
sample_type, " or ".join(SAMPLE_TYPES)
),
"Line",
line,
)

for fastq in fastq_list:
if fastq:
if fastq.find(" ") != -1:
print_error("FastQ file contains spaces!", "Line", line)
Expand All @@ -161,9 +180,9 @@ def check_samplesheet(file_in, file_out):
## Auto-detect paired-end/single-end
sample_info = [] ## [single_end, fastq_1, fastq_2]
if sample and fastq_1 and fastq_2: ## Paired-end short reads
sample_info = ["0", fastq_1, fastq_2, expected_cells, seq_center]
sample_info = ["0", fastq_1, fastq_2, expected_cells, seq_center, fastq_barcode, sample_type]
elif sample and fastq_1 and not fastq_2: ## Single-end short reads
sample_info = ["1", fastq_1, fastq_2, expected_cells, seq_center]
sample_info = ["1", fastq_1, fastq_2, expected_cells, seq_center, fastq_barcode, sample_type]
else:
print_error("Invalid combination of columns provided!", "Line", line)

Expand All @@ -180,7 +199,21 @@ def check_samplesheet(file_in, file_out):
## Write validated samplesheet with appropriate columns
if len(sample_mapping_dict) > 0:
with open(file_out, "w") as fout:
fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2", "expected_cells", "seq_center"]) + "\n")
fout.write(
",".join(
[
"sample",
"single_end",
"fastq_1",
"fastq_2",
"expected_cells",
"seq_center",
"fastq_barcode",
"sample_type",
]
)
+ "\n"
)
for sample in sorted(sample_mapping_dict.keys()):
## Check that multiple runs of the same sample are of the same datatype
if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
Expand Down
36 changes: 36 additions & 0 deletions bin/generate_lib_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python
import argparse
import os

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generate the lib.csv for cellranger-arc.")

parser.add_argument("-t", "--sample_types", dest="sample_types", help="Comma seperated list of sample types.")
parser.add_argument("-n", "--sample_names", dest="sample_names", help="Comma seperated list of sample names.")
parser.add_argument("-f", "--fastq_folder", dest="fastq_folder", help="Folder of FASTQ files.")
parser.add_argument("-o", "--out", dest="out", help="Output path.")

args = vars(parser.parse_args())

print(args)

sample_types = args["sample_types"].split(",")
sample_names = args["sample_names"].split(",")
unique_samples_names = set(sample_names)

lib_csv = open(args["out"], "w")
lib_csv.write("fastqs,sample,library_type")

for i in range(0, len(sample_types)):
if sample_names[i] in unique_samples_names:
unique_samples_names.remove(
sample_names[i]
) # this has to be done to account for different Lane files (e.g., L002)
if sample_types[i] == "gex":
lib_csv.write("\n{},{},{}".format(args["fastq_folder"], sample_names[i], "Gene Expression"))
else:
lib_csv.write("\n{},{},{}".format(args["fastq_folder"], sample_names[i], "Chromatin Accessibility"))

lib_csv.close()

print("Wrote lib.csv file to {}".format(args["out"]))
26 changes: 26 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,32 @@ if(params.aligner == "cellranger") {
}
}

if(params.aligner == "cellrangerarc") {
process {
withName: CELLRANGERARC_MKGTF {
publishDir = [
path: "${params.outdir}/${params.aligner}/mkgtf",
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
ext.args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene"
}
withName: CELLRANGERARC_MKREF {
publishDir = [
path: "${params.outdir}/${params.aligner}/mkref",
mode: params.publish_dir_mode
]
}
withName: CELLRANGERARC_COUNT {
publishDir = [
path: "${params.outdir}/${params.aligner}/count",
mode: params.publish_dir_mode
]
ext.args = {meta.expected_cells ? "--expect-cells ${meta.expected_cells}" : ''}
}
}
}

if(params.aligner == "universc") {
process {
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
Expand Down
9 changes: 9 additions & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [STARsolo](#starsolo)
- [Salmon Alevin & AlevinQC](#salmon-alevin--alevinqc)
- [Cellranger](#cellranger)
- [Cellranger ARC](#cellranger-arc)
- [UniverSC](#universc)
- [Other output data](#other-output-data)
- [MultiQC](#multiqc)
Expand Down Expand Up @@ -103,6 +104,14 @@ Cell Ranger is a set of analysis scripts that processes 10X Chromium single cell

- Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by Cellranger

## Cellranger ARC

Cell Ranger ARC is a set of analysis pipelines that process Chromium Single Cell Multiome ATAC + Gene Expression sequencing data to generate a variety of analyses pertaining to gene expression (GEX), chromatin accessibility, and their linkage. Furthermore, since the ATAC and GEX measurements are on the very same cell, we are able to perform analyses that link chromatin accessibility and GEX. See [Cellranger ARC](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc) for more information on Cellranger.

**Output directory: `results/cellrangerarc`**

- Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by Cellranger ARC

## UniverSC

UniverSC is a wrapper that calls an open-source implementation of Cell Ranger v3.0.2 and adjusts run parameters for compatibility with a wide ranger of technologies.
Expand Down
47 changes: 47 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,53 @@ See the [UniverSC GitHub page](https://github.com/minoda-lab/universc#pre-set-co

Currently only 3\' scRNA-Seq parameters are supported in nextflow, although chemistry parameters for 5\' scRNA-Seq and full-length scRNA-Seq libraries are supported by teh container.

### If using cellranger-arc

#### Automatic file name detection

This pipeline currently **does not** automatically renames input FASTQ files to follow the
[naming convention by 10x](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input):

```
[Sample Name]_S1_L00[Lane Number]_[Read Type]_001.fastq.gz
```

Thus please make sure your files follow this naming convention.

#### Sample sheet definition

If you are using cellranger-arc you have to add the column _sample_type_ (atac for scATAC or gex for scRNA) and _fastq_barcode_ (part of the scATAC data) to your samplesheet as an input.

**Beware of the following points:**

- It is important that you give your scRNA and scATAC different [Sample Name]s.
- Check first which file is your barcode fastq file for your scATAC data ([see](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/using/fastq-input)).
- If you have more than one sequencing run then you have to give them another suffix (e.g., rep\*) to your [Sample Name] ([see](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/fastq-input#atac_quick_start)).

An example samplesheet for a dataset called test_scARC that has two sequencing runs for the scATAC and one seqeuncing run
from two lanes for the scRNA could look like this:

```csv
sample,fastq_1,fastq_2,fastq_barcode,sample_type
test_scARC,path/test_scARC_atac_rep1_S1_L001_R1_001.fastq.gz,path/test_scARC_atac_rep1_S1_L001_R2_001.fastq.gz,path/test_scARC_atac_rep1_S1_L001_I2_001.fastq.gz,atac
test_scARC,path/test_scARC_atac_rep2_S2_L001_R1_001.fastq.gz,path/test_scARC_atac_rep2_S2_L001_R2_001.fastq.gz,path/test_scARC_atac_rep2_S2_L001_I2_001.fastq.gz,atac
test_scARC,path/test_scARC_gex_S1_L001_R1_001.fastq.gz,path/test_scARC_gex_S1_L001_R2_001.fastq.gz,,gex
test_scARC,path/test_scARC_gex_S1_L002_R1_001.fastq.gz,path/test_scARC_gex_S1_L002_R2_001.fastq.gz,,gex
```

#### Config file and index

Cellranger-arc needs a reference index directory that you can provide with `--cellranger_index`. Be aware, you can use
for cellranger-arc the same index you use for cellranger ([see](https://kb.10xgenomics.com/hc/en-us/articles/4408281606797-Are-the-references-interchangeable-between-pipelines)).
Yet, a cellranger-arc index might include additional data (e.g., TF binding motifs). Therefore, please first check if
you have to create a new cellranger-arc index ([see here](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/advanced/references) for
more information)

If you decide to create a cellranger-arc index, then you need to create a config file to generate the index. The pipeline
can do this autmatically for you if you provide a `--fasta`, `--gtf`, and an optional `--motif` file. However, you can
also decide to provide your own config file with `--cellrangerarc_config`, then you also have to specify with `--cellrangerarc_reference`
the reference genome name that you have used and stated as _genome:_ in your config file.

## Running the pipeline

The minimum typical command for running the pipeline is as follows:
Expand Down
5 changes: 5 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"unzip": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions modules/local/mtx_to_h5ad.nf
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@ process MTX_TO_H5AD {
//
// run script
//
if (params.aligner == 'cellranger')
if (params.aligner in [ 'cellranger', 'cellrangerarc' ])
"""
# convert file types
mtx_to_h5ad.py \\
--aligner ${params.aligner} \\
--aligner cellranger \\
--input filtered_feature_bc_matrix.h5 \\
--sample ${meta.id} \\
--out ${meta.id}/${meta.id}_matrix.h5ad
Expand Down
8 changes: 4 additions & 4 deletions modules/local/mtx_to_seurat.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ process MTX_TO_SEURAT {

script:
def aligner = params.aligner
if (params.aligner == "cellranger") {
matrix = "matrix.mtx.gz"
barcodes = "barcodes.tsv.gz"
features = "features.tsv.gz"
if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) {
matrix = "filtered_feature_bc_matrix/matrix.mtx.gz"
barcodes = "filtered_feature_bc_matrix/barcodes.tsv.gz"
features = "filtered_feature_bc_matrix/features.tsv.gz"
} else if (params.aligner == "kallisto") {
matrix = "*count/counts_unfiltered/*.mtx"
barcodes = "*count/counts_unfiltered/*.barcodes.txt"
Expand Down
84 changes: 0 additions & 84 deletions modules/nf-core/cellranger/count/templates/cellranger_count.py

This file was deleted.

28 changes: 28 additions & 0 deletions modules/nf-core/cellrangerarc/Dockerfile

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit bf280ce

Please sign in to comment.