snakemake-workflows · dlaehnemann · Aug 15, 2024 · Apr 24, 2024 · Apr 24, 2024 · May 24, 2024
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@
 !config/units.tsv
 !LICENSE
 !README.md
+local/*
 resources
 resources/*
 results

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
@@ -16,17 +16,25 @@ resources:
     # ensembl species name
     species: homo_sapiens
     # ensembl release version
-    release: "104"
+    release: "112"
     # genome build
     build: GRCh38
     # pfam release to use for annotation of domains in differential splicing analysis
     pfam: "33.0"
+    # Choose strategy for selecting representative transcripts for each gene.
+    # Possible values:
+    #   - canonical (use the canonical transcript from ensembl, only works for human at the moment)
+    #   - mostsignificant (use the most significant transcript)
+    #   - path/to/any/file.txt (a path to a file with ensembl transcript IDs to use;
+    #     the user has to ensure that there is only one ID per gene given)
     representative_transcripts: canonical
   ontology:
     # gene ontology to download, used e.g. in goatools
     gene_ontology: "http://current.geneontology.org/ontology/go-basic.obo"
 
 pca:
+  # If set to true, samples with NA values in the specified covariate column will be removed for PCA computation.
+  pca_exclude_NAs: false
   labels:
     # columns of sample sheet to use for PCA
     - condition
@@ -96,11 +104,26 @@ enrichment:
     # the species specified by resources -> ref -> species above
     pathway_database: "panther"
 
+meta_comparisons:
+  # comparison is only run if set to `true`
+  activate: false
+  # Define the comparisons of interest here
+  comparisons:
+    # Choose any name for a comparison. You can add as many comparisons as you want.
+    model_X_vs_model_Y:
+      items:
+        # Define the two underlying models for the comparison. Both models must be defined under diffexp -> models in this config.
+        # Items must be of the form <arbitrary label>: <existing diffexp model from config>
+        X: model_X
+        Y: model_Y
+      # Define label for datavzrd report
+      label: model X vs. model Y
+
 report:
   # make this `true`, to get excel files for download in the snakemake
   # report, BUT: this can drastically increase the runtime of datavzrd report
   # generation, especially on larger cohorts
-  offer_excel: true
+  offer_excel: false
 
 bootstrap_plots:
   # desired false discovery rate for bootstrap plots, i.e. a lower FDR will result in fewer boxplots generated

diff --git a/.test/three_prime/config/config.yaml b/.test/three_prime/config/config.yaml
@@ -11,8 +11,6 @@ experiment:
     vendor: lexogen
     plot-qc: all
 
-
-
 resources:
   ref:
     # ensembl species name
@@ -23,12 +21,20 @@ resources:
     build: GRCh38
     # pfam release to use for annotation of domains in differential splicing analysis
     pfam: "33.0"
+    # Choose strategy for selecting representative transcripts for each gene.
+    # Possible values:
+    #   - canonical (use the canonical transcript from ensembl, only works for human at the moment)
+    #   - mostsignificant (use the most significant transcript)
+    #   - path/to/any/file.txt (a path to a file with ensembl transcript IDs to use;
+    #     the user has to ensure that there is only one ID per gene given)
     representative_transcripts: canonical
   ontology:
     # gene ontology to download, used e.g. in goatools
     gene_ontology: "http://current.geneontology.org/ontology/go-basic.obo"
 
 pca:
+  # If set to true, samples with NA values in the specified covariate column will be removed for PCA computation.
+  pca_exclude_NAs: false
   labels:
     # columns of sample sheet to use for PCA
     - condition
@@ -97,6 +103,24 @@ enrichment:
     # pathway database to use in SPIA, needs to be available for
     # the species specified by resources -> ref -> species above
     pathway_database: "panther"
+    # OrgDb genome-wide annotation package (https://www.bioconductor.org/packages/release/BiocViews.html#___OrgDb) for the species under consideration.
+    # Only required if you want a gene analysis for your pathways; otherwise set this to NA.
+    orgDb: org.Hs.eg.db
+
+meta_comparisons:
+  # comparison is only run if set to `true`
+  activate: false
+  # Define the comparisons of interest here
+  comparisons:
+    # Choose any name for a comparison. You can add as many comparisons as you want.
+    model_X_vs_model_Y:
+      items:
+        # Define the two underlying models for the comparison. Both models must be defined under diffexp -> models in this config.
+        # Items must be of the form <arbitrary label>: <existing diffexp model from config>
+        X: model_X
+        Y: model_Y
+      # Define label for datavzrd report
+      label: model X vs. model Y
 
 bootstrap_plots:
   # desired false discovery rate for bootstrap plots, i.e. a lower FDR will result in fewer boxplots generated

diff --git a/config/README.md b/config/README.md
@@ -80,3 +80,9 @@ Changes to the recommendations are motivated as follows:
 * `-a "r1adapter=A{18}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=3;max_error_rate=0.100000"`: We remove A{18}, as this is handled by `--poly-a`. We increase `min_overlap` to 7 and set the `max_error_rate` to the Illumina error rate of about 0.005, both to avoid spurious adapter matches being removed.
 * `-g "r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=20"`: This is not needed any more, as `-a` option will lead to complete removal of read sequence if adapter is found at the start of the read, see: https://cutadapt.readthedocs.io/en/stable/guide.html#rightmost
 * `--discard-trimmed`: We omit this, as the `-a` with the adapter sequence will lead to complete read sequence removal if adapter is found at start, and the `--minimum-length` will then discard such empty reads.
+
+#### meta comparisons
+Meta comparisons allow for comparing two full models against each other.
+In the resulting plot, the axes represent the log2-fold changes (beta-scores) for the two models, with each point representing a gene.
+Points on the diagonal indicate no difference between the comparisons, while deviations from the diagonal suggest differences in gene expression between the treatments.
+For more details, see the comments in the `config.yaml`.
diff --git a/config/config.yaml b/config/config.yaml
@@ -11,8 +11,6 @@ experiment:
     # this allows to plot QC of aligned read postion for specific transcripts (or 'all' transcripts)
     plot-qc: all
 
-
-
 resources:
   ref:
     # ensembl species name
@@ -35,6 +33,8 @@ resources:
     gene_ontology: "http://current.geneontology.org/ontology/go-basic.obo"
 
 pca:
+  # If set to true, samples with NA values in the specified covariate column will be removed for PCA computation.
+  pca_exclude_NAs: false
   labels:
     # columns of sample sheet to use for PCA
     - condition
@@ -105,6 +105,27 @@ enrichment:
     # the species specified by resources -> ref -> species above
     pathway_database: "reactome"
 
+meta_comparisons:
+  # comparison is only run if set to `true`
+  activate: false
+  # Define the comparisons of interest here
+  comparisons:
+    # Choose any name for a comparison. You can add as many comparisons as you want.
+    model_X_vs_model_Y:
+      items:
+        # Define the two underlying models for the comparison. Both models must be defined under diffexp -> models in this config.
+        # Items must be of the form <arbitrary label for plot-axis>: <existing diffexp model from config>
+        X: model_X
+        Y: model_Y
+      # Define label for datavzrd report
+      label: model X vs. model Y
+
+report:
+  # make this `true`, to get excel files for download in the snakemake
+  # report, BUT: this can drastically increase the runtime of datavzrd report
+  # generation, especially on larger cohorts
+  offer_excel: false
+
 bootstrap_plots:
   # desired false discovery rate for bootstrap plots, i.e. a lower FDR will result in fewer boxplots generated
   FDR: 0.01

diff --git a/config/samples.tsv b/config/samples.tsv
@@ -2,4 +2,4 @@ sample	condition	batch_effect
 A	treated	batch1
 B	untreated	batch1
 C	treated	batch2
-D	untreated	batch2
+D	untreated	batch2
diff --git a/config/units.tsv b/config/units.tsv
@@ -3,4 +3,4 @@ A	1			raw/a.chr21.1.fq	raw/a.chr21.2.fq
 B	1			raw/b.chr21.1.fq	raw/b.chr21.2.fq
 B	2	300	14	raw/b.chr21.1.fq	
 C	1			raw/a.chr21.1.fq	raw/a.chr21.2.fq
-D	1			raw/b.chr21.1.fq	raw/b.chr21.2.fq
+D	1			raw/b.chr21.1.fq	raw/b.chr21.2.fq
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -26,6 +26,7 @@ include: "rules/diffsplice.smk"
 include: "rules/enrichment.smk"
 include: "rules/datavzrd.smk"
 include: "rules/bam.smk"
+include: "rules/meta_comparisons.smk"
 
 
 rule all:

diff --git a/workflow/envs/pandas.yaml b/workflow/envs/pandas.yaml
@@ -0,0 +1,4 @@
+channels:
+  - conda-forge
+dependencies:
+  - pandas =2.2.1
diff --git a/workflow/envs/polars.yaml b/workflow/envs/polars.yaml
@@ -0,0 +1,4 @@
+channels:
+  - conda-forge
+dependencies:
+  - polars =1.2.1
diff --git a/workflow/envs/pystats.yaml b/workflow/envs/pystats.yaml
@@ -0,0 +1,16 @@
+channels:
+  - conda-forge
+  - nodefaults
+dependencies:
+  - polars =0.20.28
+  - pyreadr =0.5
+  - altair =5.2
+  - pyarrow =16.1
+  - vegafusion =1.6
+  - vegafusion-python-embed =1.6
+  - vl-convert-python =1.2
+  - jupyter_core =5.7
+  - ipykernel =6.29
+  - nbconvert =7.14
+  - notebook =7.0
+  - jupyterlab_code_formatter =1.4
diff --git a/workflow/report/meta_compare.rst b/workflow/report/meta_compare.rst
@@ -0,0 +1,6 @@
+Meta comparisons for {{ snakemake.wildcards.meta_comp }}.
+The axes represent the log2-fold changes (beta-scores) for the two models, with each point representing a gene.
+Points on the diagonal indicate no difference between the comparisons, while deviations from the diagonal suggest differences in gene expression between the treatments.
+The color encodes the corresponding q-value.
+Clicking on a point displays its label.
+Holding the Shift key allows you to select or deselect labels for multiple genes.
diff --git a/workflow/report/units.rst b/workflow/report/units.rst
@@ -0,0 +1 @@
+Unit sheet containing all considered units, which can be multiple units for a single sample (for example, when the same biological sample was sequenced across multiple lanes and demultiplexed into separate lane-specific fastq files). The annotations in this file determine how the workflow internally handles units.
diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst
@@ -1,3 +1,2 @@
 After adapter removal with `Cutadapt <http://cutadapt.readthedocs.io>`_, transcripts were quantified with `Kallisto <https://pachterlab.github.io/kallisto/>`_.
 Integrated normalization and differential expression analysis was conducted with `Sleuth <https://pachterlab.github.io/sleuth>`_ following standard procedure as outlined in the manual.
-For sample metadata, see {{ snakemake.config["samples"] }}_.
diff --git a/workflow/resources/custom_vega_plots/circle_diagram_genes.json b/workflow/resources/custom_vega_plots/circle_diagram_genes.json
@@ -0,0 +1,99 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
+    "width": 35,
+    "height": 35,
+    "data": {
+        "values": []
+    },
+    "layer": [
+        {
+            "mark": "arc",
+            "encoding": {
+                "theta": {
+                    "field": "amount",
+                    "type": "quantitative"
+                },
+                "color": {
+                    "field": "category",
+                    "type": "nominal",
+                    "scale": {
+                        "domain": [
+                            "DE_genes",
+                            "genes"
+                        ],
+                        "range": [
+                            "#f2e34c",
+                            "#31a354"
+                        ]
+                    },
+                    "legend": null
+                },
+                "tooltip": [
+                    {
+                        "field": "category",
+                        "type": "nominal"
+                    },
+                    {
+                        "field": "amount",
+                        "type": "quantitative"
+                    }
+                ]
+            }
+        },
+        {
+            "mark": {
+                "type": "text",
+                "baseline": "middle",
+                "align": "center",
+                "dx": 2,
+                "fontSize": 9,
+                "color": "white"
+            },
+            "encoding": {
+                "text": {
+                    "field": "percentage",
+                    "type": "quantitative",
+                    "format": "0.2%"
+                }
+            }
+        },
+        {
+            "transform": [
+                {
+                    "pivot": "category",
+                    "value": "amount",
+                    "groupby": [
+                        "percentage"
+                    ]
+                }
+            ],
+            "mark": "rule",
+            "encoding": {
+                "tooltip": [
+                    {
+                        "field": "genes",
+                        "type": "nominal"
+                    },
+                    {
+                        "field": "DE_genes",
+                        "type": "quantitative"
+                    }
+                ]
+            },
+            "params": [
+                {
+                    "name": "hover",
+                    "select": {
+                        "type": "point",
+                        "fields": [
+                            "percentage"
+                        ],
+                        "nearest": true,
+                        "on": "mouseover",
+                        "clear": "mouseout"
+                    }
+                }
+            ]
+        }
+    ]
+}