From ecc9ab712b94e175a2c9e7c79b365faa98a3df44 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 14 Sep 2023 13:12:46 +0200
Subject: [PATCH] fix: simpler three prime QuantSeq cutadapt setup (#78)

I have tested this on actual QuantSeq data, and the cutadapt setup in
this PR yields mostly the same results that the complicated previous
three step setup recommended by Lexogen gave. It seems to be a bit more
aggressive in the poly-A tail removal, but this should not affect
results much. And it greatly simplifies the workflow setup.
---
 .test/3-prime-config/config/config.yaml | 11 ++--
 config/README.md                        | 72 +++++++++++++++++---
 workflow/Snakefile                      |  1 -
 workflow/rules/trim.smk                 | 43 +++++++-----
 workflow/rules/trim_3prime.smk          | 87 -------------------------
 5 files changed, 98 insertions(+), 116 deletions(-)
 delete mode 100644 workflow/rules/trim_3prime.smk

diff --git a/.test/3-prime-config/config/config.yaml b/.test/3-prime-config/config/config.yaml
index 1a621d9b..f4ff4fa3 100644
--- a/.test/3-prime-config/config/config.yaml
+++ b/.test/3-prime-config/config/config.yaml
@@ -120,8 +120,11 @@ params:
   # the reverse reads of paired end sequencing:
   # * https://cutadapt.readthedocs.io/en/stable/guide.html#trimming-paired-end-reads
   cutadapt-se:
-    adapters: "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
-    extra: "-q 20"
+    # This setup is for Lexogen QuantSeq FWD data, based on (but simplfied):
+    # https://faqs.lexogen.com/faq/what-is-the-adapter-sequence-i-need-to-use-for-t-1
+    # For more details, see the QuantSeq section in the `config/README.md` file.
+    adapters: "-a 'r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=7;max_error_rate=0.005'"
+    extra: "--minimum-length 33 --nextseq-trim=20 --poly-a"
   # reasoning behind parameters:
   #   For reads that are produced by 3’-end sequencing, depending on the protocol, it might be recommended to remove some leading bases (e.g. see https://www.nature.com/articles/s41598-019-55434-x#Sec10)
   #   * `--minimum-length 33`:
@@ -134,5 +137,5 @@ params:
   #   * `--minimum-overlap 7`: the cutadapt default minimum overlap of `5` did trimming on the level
   #     of expected adapter matches by chance
   cutadapt-pe: 
-    adapters: "-a ACGGATCGATCGATCGATCGAT -g GGATCGATCGATCGATCGAT -A ACGGATCGATCGATCGATCGAT -G GGATCGATCGATCGATCGAT"
-    extra: "--minimum-length 33 -e 0.005 --overlap 7"
\ No newline at end of file
+    adapters: ""
+    extra: ""
\ No newline at end of file
diff --git a/config/README.md b/config/README.md
index 1aef098b..bc994104 100644
--- a/config/README.md
+++ b/config/README.md
@@ -1,16 +1,72 @@
 # General settings
 
-To configure this workflow, modify ``config/config.yaml`` according to your needs, following the explanations provided in the file.
+To configure this workflow, modify the following files to reflect your dataset and differential expression analysis model:
+* `config/samples.tsv`: samples sheet with covariates and conditions
+* `config/units.tsv`: (sequencing) units sheet with raw data paths
+* `config/config.yaml`: general workflow configuration and differential expression model setup
 
-# Sample and unit sheet
+## samples sheet
 
-* Add samples to `config/samples.tsv`. For each sample, the column `sample` and any number of covariates and conditions have to be specified.
-  These columns can be used in the model specification section within `config/config.yaml` to define the differential expression analysis of sleuth, including batch effect modeling.
-  Effect size estimates are calculated as so-called beta-values by sleuth. For binary comparisons, they resemble a log2 fold change. The primary comparison variable should be encoded in a way such that the group of samples that shall be the numerator in the fold change calculation have the lexicographically greater value. For example, you could simply use values like `0` and `1` to achieve this.
-* For each sample, add one or more sequencing units (runs, lanes or replicates) to the unit sheet `config/units.tsv`.
-  For each unit, provide either one (column `fq1`) or two (columns `fq1`, `fq2`) FASTQ files (these can point to anywhere in your system). 
-  If only one FASTQ file is provided (single-end data), you also need to specify `fragment_len_mean` and `fragment_len_sd`, which should usually be available from your sequencing provider.
+For each biological sample, add a line to the sample sheet in `config/samples.tsv`.
+The column `sample` is required and gives the sample name.
+Additional columns can specify covariates (including batch effects) and conditions.
+These columns can then be used in the `diffexp: models:` specification section in `config/config.yaml` (see below)
 
 Missing values can be specified by empty columns or by writing `NA`.
 
+## units sheet
+
+For each sample, add one or more sequencing unit lines (runs, lanes or replicates) to the unit sheet in `config/units.tsv`.
+For each unit, provide either of the following:
+* The path to two pairead-end FASTQ files in the columns `fq1`, `fq2`.
+* The path to a single-end FASTQ file in the column `fq1`.
+  For single-end data, you also need to specify `fragment_len_mean` and `fragment_len_sd`, which should usually be available from your sequencing provider.
+
+Missing values can be specified by empty columns or by writing `NA`.
+
+## config.yaml
+
+This file contains the general workflow configuration and the setup for the differential expression analysis performed by sleuth.
+Configurable options should be explained in the comments above the respective entry or right here in this `config/README.md` section.
+If something is unclear, don't hesitate to [file an issue in the `rna-seq-kallisto-sleuth` GitHub repository](https://github.com/snakemake-workflows/rna-seq-kallisto-sleuth/issues/new/choose).
+
+### differential expression model setup
+
+The core functionality of this workflow is provided by the software [`sleuth`](https://pachterlab.github.io/sleuth/about).
+You can use it to test for differential expression of genes or transcripts between two or more subgroups of samples.
+
+#### main sleuth model
+
+The main idea of sleuth's internal model, is to test a `full:` model (containing (a) variable(s) of interest AND batch effects) against a `reduced:` model (containing ONLY the batch effects).
+So these are the most important entries to set up under any model that you specify via `diffexp: models:`.
+If you don't know any batch effects, the `reduced:` model will have to be `~1`.
+Otherwise it will be the tilde followed by an addition of the names of any columns that contain batch effects, for example: `reduced: ~batch_effect_1 + batch_effect_2`.
+The full model than additionally includes variables of interest, so fore example: `full: ~variable_of_interest + batch_effect_1 + batch_effect_2`.
+
+#### sleuth effect sizes
+
+Effect size estimates are calculated as so-called beta-values by `sleuth`.
+For binary comparisons (your variable of interest has two factor levels), they resemble a log2 fold change.
+To know which variable of interest to use for the effect size calculation, you need to provide its column name as the `primary_variable:`.
+And for sleuth to know what level of that variable of interest to use as the base level, specify the respective entry as the `base_level:`.
+
+### Lexogen 3' QuantSeq data analysis
+
+For Lexogen 3' QuantSeq data analysis, please set `experiment: 3-prime-rna-seq: activate: true` in the `config/config.yaml` file.
+For more information information on Lexogen QuantSeq 3' sequencing, see: https://www.lexogen.com/quantseq-3mrna-sequencing/
+In addition, for Lexogen 3' FWD QuantSeq data, we recommend setting the `params: cutadapt-se:` with:
+```
+    adapters: "-a 'r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=7;max_error_rate=0.005'"
+    extra: "--minimum-length 33 --nextseq-trim=20 --poly-a"
+```
+This is an adaptation of the [Lexogen read preprocessing recommendations for 3' FWD QuantSeq data](https://faqs.lexogen.com/faq/what-is-the-adapter-sequence-i-need-to-use-for-t-1).
+Changes to the recommendations are motivated as follows:
+* `-m`: We switched to the easier to read `--minimum-length` and apply this minimum length globally. In addition, we increase the minimum length to a default of 33 that makes more sense for kallisto quantification.
+* `-O`: Instead of this option, minimum overlap is specified per expression.
+* `-a "polyA=A{20}"`: We replace this with `cutudapt`s dedicated option for `--poly-a` tail removal (which is run after adapter trimming).
+* `-a "QUALITY=G{20}"`: We replace this with `cutudapt`s dedicated option for the removal artifactual trailing `G`s in NextSeq and NovaSeq data: `--nextseq-trim=20`.
+* `-n`: With the dedicated `cutadapt` options getting applied in the right order, this option is not needed any more.
+* `-a "r1adapter=A{18}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=3;max_error_rate=0.100000"`: We remove A{18}, as this is handled by `--poly-a`. We increase `min_overlap` to 7 and set the `max_error_rate` to the Illumina error rate of about 0.005, both to avoid spurious adapter matches being removed.
+* `-g "r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=20"`: This is not needed any more, as `-a` option will lead to complete removal of read sequence if adapter is found at the start of the read, see: https://cutadapt.readthedocs.io/en/stable/guide.html#rightmost
+* `--discard-trimmed`: We omit this, as the `-a` with the adapter sequence will lead to complete read sequence removal if adapter is found at start, and the `--minimum-length` will then discard such empty reads.
 
diff --git a/workflow/Snakefile b/workflow/Snakefile
index ce03a28f..1d3c8401 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -16,7 +16,6 @@ container: "docker://continuumio/miniconda3"
 
 include: "rules/common.smk"
 include: "rules/trim.smk"
-include: "rules/trim_3prime.smk"
 include: "rules/qc_3prime.smk"
 include: "rules/ref.smk"
 include: "rules/ref_3prime.smk"
diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk
index 160be3b4..d134e2d0 100644
--- a/workflow/rules/trim.smk
+++ b/workflow/rules/trim.smk
@@ -12,22 +12,33 @@ rule cutadapt_pe:
     log:
         "results/logs/cutadapt/{sample}-{unit}.log",
     wrapper:
-        "v1.22.0/bio/cutadapt/pe"
+        "v2.6.0/bio/cutadapt/pe"
 
 
-if not is_3prime_experiment:
+rule cutadapt:
+    input:
+        get_fastqs,
+    output:
+        fastq="results/trimmed/{sample}-{unit}.fastq.gz",
+        qc="results/trimmed/{sample}-{unit}.qc.txt",
+    threads: 8
+    params:
+        adapters=config["params"]["cutadapt-se"]["adapters"],
+        extra=config["params"]["cutadapt-se"]["extra"],
+    log:
+        "results/logs/cutadapt/{sample}-{unit}.log",
+    wrapper:
+        "v2.6.0/bio/cutadapt/se"
+
 
-    rule cutadapt:
-        input:
-            get_fastqs,
-        output:
-            fastq="results/trimmed/{sample}-{unit}.fastq.gz",
-            qc="results/trimmed/{sample}-{unit}.qc.txt",
-        threads: 8
-        params:
-            adapters=config["params"]["cutadapt-se"]["adapters"],
-            extra=config["params"]["cutadapt-se"]["extra"],
-        log:
-            "results/logs/cutadapt/{sample}-{unit}.log",
-        wrapper:
-            "v1.22.0/bio/cutadapt/se"
+rule max_read_length:
+    input:
+        get_all_fastqs,
+    output:
+        "results/stats/max-read-length.json",
+    log:
+        "logs/max-read-length.log",
+    conda:
+        "../envs/pysam.yaml"
+    script:
+        "../scripts/get-max-read-length.py"
diff --git a/workflow/rules/trim_3prime.smk b/workflow/rules/trim_3prime.smk
deleted file mode 100644
index a038bfdf..00000000
--- a/workflow/rules/trim_3prime.smk
+++ /dev/null
@@ -1,87 +0,0 @@
-if is_3prime_experiment and three_prime_vendor == "lexogen":
-    # Rule cutadapt1 checks and removes poly-A tails and sequence qualilty score <20.
-    # reasoning behind parameters:
-    #   * `-m 20`:
-    #   * Discards any read under 20bp by -m 20
-    #   * `-O 20 -a ""polyA=A{20}"" -a ""QUALITY=G{20}"" -n 2`:
-    #   * Removes reads having polyA (18bp) or polyG (18bp) and repeat the process twice by -n 2
-    # In short, it removes reads that are predominantly polyA or polyG.
-
-    # Rule cutadapt2 checks if reads contains the polyA-stretch + adapter at the 3'
-    # end with minimum overlap 3bp, maximum error of 1bp every 10bp
-    # reasoning behind parameters:
-    #   * `-m 20`:
-    #   * Discards any read under 20bp by -m 20
-    #   * `--nextseq-trim=10`
-    #   * Quality score<20 is trimmed, based on NextSeq/NovaSeq 2-color SBS system
-    #   * `-a A{18}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=3;max_error_rate=0.100000`
-
-    # Rule cutadapt3 removes final set of adapters from 5'end and discards reads based on trimmed read length
-    # reasoning behind parameters:
-    #   * `-m 20`:
-    #   * Discards any read under 20bp by -m 20
-    #   * `-O 20 -g r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=20 --discard-trimmed"`
-    #   * Trim the 5' end of adapter with miminum overlap of 20bp and discard its read by --discard-trimmed
-
-    # https://faqs.lexogen.com/faq/what-is-the-adapter-sequence-i-need-to-use-for-t-1
-    rule cutadapt1:
-        input:
-            get_fastqs,
-        output:
-            fastq="results/trimmed/{sample}-{unit}.1.1.fastq.gz",
-            qc="results/trimmed/{sample}-{unit}.1.1.qc.txt",
-        params:
-            extra=lambda w: str(
-                "-m 20 -O 20 -a " "polyA=A{20}" " -a " "QUALITY=G{20}" " -n 2"
-            ),
-        log:
-            "results/logs/cutadapt/{sample}-{unit}.1.1.log",
-        wrapper:
-            "v1.14.1/bio/cutadapt/se"
-
-    rule cutadapt2:
-        input:
-            fastq="results/trimmed/{sample}-{unit}.1.1.fastq.gz",
-        output:
-            fastq="results/trimmed/{sample}-{unit}.2.1.fastq.gz",
-            qc="results/trimmed/{sample}-{unit}.2.1.qc.txt",
-        params:
-            adapters=lambda w: str(
-                '-m 20 -O 3 --nextseq-trim=10 -a "'
-                'A{18}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=3;max_error_rate=0.100000"'
-                ""
-            ),
-        log:
-            "results/logs/cutadapt/{sample}-{unit}.2.1.log",
-        wrapper:
-            "v1.14.1/bio/cutadapt/se"
-
-    rule cutadapt3:
-        input:
-            fastq="results/trimmed/{sample}-{unit}.2.1.fastq.gz",
-        output:
-            fastq="results/trimmed/{sample}-{unit}.fastq.gz",
-            qc="results/trimmed/{sample}-{unit}.qc.txt",
-        params:
-            adapters=lambda w: str(
-                '-m 20 -O 20 -g "'
-                'r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=20"'
-                " --discard-trimmed"
-            ),
-        log:
-            "results/logs/cutadapt/{sample}-{unit}.log",
-        wrapper:
-            "v1.14.1/bio/cutadapt/se"
-
-
-rule max_read_length:
-    input:
-        get_all_fastqs,
-    output:
-        "results/stats/max-read-length.json",
-    log:
-        "logs/max-read-length.log",
-    conda:
-        "../envs/pysam.yaml"
-    script:
-        "../scripts/get-max-read-length.py"