feat: sort datavzrd tables (#115)

In this pull request we - Introduce pi_value scores for all meta_comparisons, go_enrichment, diffexp and pathways - Sort the tables according to their pi_value - Show datapoints in the meta comparisons where only one model has values - Improve table descriptions - Improve lazy df use in polars - Small bug fixes --------- Co-authored-by: Addimator <adpri100hhu.de> Co-authored-by: David Laehnemann <[email protected]> Co-authored-by: Johannes Köster <[email protected]> Co-authored-by: Johannes Koester <[email protected]>
snakemake-workflows · Sep 19, 2024 · 1ccc3fa · 1ccc3fa
1 parent c1702b0
commit 1ccc3fa
Show file tree

Hide file tree

Showing 12 changed files with 308 additions and 102 deletions.
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
@@ -60,11 +60,6 @@ diffexp:
       # in the primary_variable sample sheet column and will be considered as
       # denominator in the fold change/effect size estimation).
       base_level: untreated
-    model_Y:
-      full: ~condition + batch_effect
-      reduced: ~batch_effect
-      primary_variable: condition
-      base_level: untreated
   # significance level to use for volcano, ma- and qq-plots
   sig-level:
     volcano-plot: 0.05
@@ -111,7 +106,7 @@ enrichment:
 
 meta_comparisons:
   # comparison is only run if set to `true`
-  activate: true
+  activate: false
   # Define here the comparisons of interest
   comparisons:
     # Define any name for comparison. You can add as many comparisons as you want

diff --git a/workflow/resources/datavzrd/diffexp-template.yaml b/workflow/resources/datavzrd/diffexp-template.yaml
@@ -134,7 +134,7 @@ views:
           display-mode: hidden
         regex('qval_.+'):
           display-mode: hidden
-        ?f"regex('signed_pi_value_{params.primary_variable}.+')":
+        regex('signed_pi_value_.+'):
           display-mode: normal
           plot:
             heatmap:
@@ -147,33 +147,8 @@ views:
                 - -1
                 - 0
                 - 1
-        ?f"regex('signed_pi_value_(?!{params.primary_variable}).+')":
-          display-mode: detail
-          plot:
-            heatmap:
-              scale: linear
-              range:
-                - "#e6550d"
-                - "white"
-                - "#6baed6"
-              domain:
-                - -1
-                - 0
-                - 1
-        ?f"regex('^b_{params.primary_variable}(?!.*_se$)')":
-          plot:
-            heatmap:
-              scale: linear
-              range:
-                - "#e6550d"
-                - "white"
-                - "#6baed6"
-              domain:
-                - -1
-                - 0
-                - 1
-        ?f"regex('^b_(?!{params.primary_variable})(?!.*_se$)')":
-          display-mode: detail
+        regex('^b_(?!.*_se$)'):
+          display-mode: normal
           plot:
             heatmap:
               scale: linear

diff --git a/workflow/resources/datavzrd/go-enrichment-template.yaml b/workflow/resources/datavzrd/go-enrichment-template.yaml
@@ -12,7 +12,8 @@ views:
     dataset: go_enrichment
     desc: |
       Gene ontology (GO) term enrichment analysis was performed with goatools on differentially expressed genes as determined by the sleuth model.
-      The resulting table includes several columns with key information and is sorted according to the 'p-value' column in ascending order. The "term" column lists the specific GO term, while the "class" column indicates the classification of the GO term (biological process/cellular component/molecular function). The column "FDR" is the p-value obtained from false discovery. The "enrichment" column compares the ratio of differentially expressed genes in the current study to the ratio of total genes present in the current analysis. Lastly, the "study_items" shows the name of each differentially expressed gene together with its corresponding "b" value which is the fold change seperated.
+      The resulting table includes several columns with key information and is sorted in descending order by the values of the pi-value score of sum of fold changes affected (as proposed by [Xiao et al. 2014](https://dx.doi.org/10.1093/bioinformatics/btr671)).
+      The "term" column lists the specific GO term, while the "class" column indicates the classification of the GO term (biological process/cellular component/molecular function). The column "FDR" is the p-value obtained from false discovery. The "enrichment" column compares the ratio of differentially expressed genes in the current study to the ratio of total genes present in the current analysis. The "study_items" shows the name of each differentially expressed gene together with its corresponding "b" value which is the fold change separated. Lastly, the "effect" column shows the sum of the absolute values of fold changes in the study items.
       This analysis helps to identify and understand the biological significance of gene expression changes.
     page-size: 25
     render-table:
@@ -83,7 +84,33 @@ views:
                 - "#bcbd22"
               domain:
                 - 0.0
-                - 1.0   
+                - 1.0
+        effect:
+          display-mode: normal
+          plot:
+            heatmap:
+              scale: linear
+              range:
+                - "#e6550d"
+                - "white"
+                - "#6baed6"
+              domain:
+                - -300
+                - 0
+                - 300        
+        pi_value:
+          display-mode: normal
+          plot:
+            heatmap:
+              scale: linear
+              range:
+                - "#e6550d"
+                - "white"
+                - "#6baed6"
+              domain:
+                - -300
+                - 0
+                - 300
         study_items: 
           display-mode: detail 
           custom-plot:

diff --git a/workflow/resources/datavzrd/meta_comparison-diffexp-template.yaml b/workflow/resources/datavzrd/meta_comparison-diffexp-template.yaml
@@ -11,7 +11,8 @@ views:
   comparison_table:
     dataset: comparison_table
     desc: |
-      Table for values of meta comparison. The table is sorted in descending order by the absolute value of the difference of the effects.
+      Table for values of meta comparison.       
+      The table is sorted in descending order by the pi-value score of the difference in effects of the fold change of the primary variable of the model under consideration (as proposed by [Xiao et al. 2014](https://dx.doi.org/10.1093/bioinformatics/btr671)).
     page-size: 25
     render-table:
       columns:
@@ -22,7 +23,7 @@ views:
               url: f"https://www.ensembl.org/{str.capitalize(?params.species)}/Transcript/Summary?t={{target_id}}"
         target_id:
           display-mode: hidden
-        min q-value:
+        qval_min:
           plot:
             heatmap:
               scale: linear
@@ -43,7 +44,7 @@ views:
                 - "#fdae6b"
               domain:
                 - 0
-                - 300
+                - 30
         regex('effect\s+\w+\s+\(beta\s+score\)'):
           display-mode: normal
           plot:
@@ -57,6 +58,19 @@ views:
                 - -8
                 - 0
                 - 8
+        pi_value:
+          display-mode: normal
+          plot:
+            heatmap:
+              scale: linear
+              range:
+                - "#e6550d"
+                - "white"
+                - "#6baed6"
+              domain:
+                - -300
+                - 0
+                - 300
   comparison_plot:
     desc: |
       The axes represent the log2-fold changes (beta-scores) for the two models, with each point representing a gene. Points on the diagonal indicate no difference between the comparisons, while deviations from the diagonal suggest differences in gene expression fold changes between the treatments.

diff --git a/workflow/resources/datavzrd/meta_comparison-go_terms-template.yaml b/workflow/resources/datavzrd/meta_comparison-go_terms-template.yaml
@@ -11,7 +11,8 @@ views:
   comparison_table:
     dataset: comparison_table
     desc: |
-        Table for values of meta comparison. The table is sorted in descending order based on the absolute value of the maximum difference between the positive effects and the negative effects.
+      Table for values of meta comparison. 
+      The table is sorted in descending order by the pi-value score of the maximum difference in positive and negative total effects of fold changes influenced by the enrichment (as proposed by [Xiao et al. 2014](https://dx.doi.org/10.1093/bioinformatics/btr671)).
     page-size: 25
     render-table:
       columns:
@@ -66,6 +67,19 @@ views:
               domain:
                 - 0
                 - -300
+        pi_value:
+          display-mode: normal
+          plot:
+            heatmap:
+              scale: linear
+              range:
+                - "#e6550d"
+                - "white"
+                - "#6baed6"
+              domain:
+                - -300
+                - 0
+                - 300
   comparison_plot:
     desc: |
       The left side shows the summed positive beta-scores within a GO-term and the right side shows the summed negative beta-scores, with each point representing a GO-term. This illustrates whether a GO-term is enriched positively, negatively, or in both directions.

diff --git a/workflow/resources/datavzrd/meta_comparison-pathways-template.yaml b/workflow/resources/datavzrd/meta_comparison-pathways-template.yaml
@@ -11,7 +11,8 @@ views:
   comparison_table:
     dataset: comparison_table
     desc: |
-      Table for values of meta comparison. The table is sorted in descending order by the absolute value of the difference of the effects.
+      Table for values of meta comparison. 
+      The table is sorted in descending order by the pi-value score of the difference in effects of the total perturbation accumulation (as proposed by [Xiao et al. 2014](https://dx.doi.org/10.1093/bioinformatics/btr671)).
     page-size: 25
     render-table:
       columns:
@@ -34,6 +35,20 @@ views:
                 - 0.25
         pathway id:
           display-mode: hidden
+        pi_value:
+          label: signed_pi_value
+          display-mode: normal
+          plot:
+            heatmap:
+              scale: linear
+              range:
+                - "#e6550d"
+                - "white"
+                - "#6baed6"
+              domain:
+                - -300
+                - 0
+                - 300
         difference:
           plot:
             heatmap:

diff --git a/workflow/resources/datavzrd/spia-template.yaml b/workflow/resources/datavzrd/spia-template.yaml
@@ -12,7 +12,8 @@ views:
     dataset: spia_table
     desc: |
       The SPIA pathway impact analysis results are summarized in a table with the following columns. The "Name" column lists the name of the pathway under investigation. The "total perturbation accumulation" column quantifies the overall perturbation or disruption occurring within each pathway. The "Status" column reflects the activity or response of the pathway. The "gene_ratio" column presents the number of differentially expressed genes present in the pathway compared to the total number of genes in the pathway. The "pathway id" column provides a unique identification number for each pathway. 
-      This analysis aids in understanding the impact and activity of various pathways in the context of the studied biological system. The table is sorted in descending order by the absolute values of the signed versions of the pi-value score of the total pertubation accumulation (as proposed  by `Xiao et al. 2014 <https://dx.doi.org/10.1093/bioinformatics/`). The sign reflects the sign of the effect (i.e. positive for upregulation, negative for downregulation). The column "study_items" (click on the "+") shows the name of each differentially expressed gene together with its corresponding fold change. 
+      This analysis aids in understanding the impact and activity of various pathways in the context of the studied biological system. 
+      The table is sorted in descending order by the absolute values of the signed versions of the pi-value score of the total perturbation accumulation (as proposed by [Xiao et al. 2014](https://dx.doi.org/10.1093/bioinformatics/btr671)). The sign reflects the sign of the effect (i.e. positive for upregulation, negative for downregulation). The column "study_items" (click on the "+") shows the name of each differentially expressed gene together with its corresponding fold change.
     page-size: 25
     render-table:
       columns:
@@ -31,6 +32,19 @@ views:
           display-mode: hidden
         number of DE genes per pathway:
           display-mode: hidden
+        signed_pi_value:
+          display-mode: normal
+          plot:
+            heatmap:
+              scale: linear
+              range:
+                - "#e6550d"
+                - "white"
+                - "#6baed6"
+              domain:
+                - -300
+                - 0
+                - 300
         gene_ratio:
           custom-plot:
             data: |

diff --git a/workflow/scripts/compare_diffexp.py b/workflow/scripts/compare_diffexp.py
@@ -27,12 +27,14 @@ def prepare(df):
     .with_columns(
         pl.min_horizontal("qval", "qval_y").alias("qval_min"),
     )
+    .with_columns(
+        pl.min_horizontal("pval", "pval_y").alias("pval_min"),
+    )
     .filter(pl.col("qval_min") <= 0.05)
     .rename(
         {
             "beta": effect_x,
             "beta_y": effect_y,
-            "qval_min": "min q-value",
         }
     )
     .collect()
@@ -44,11 +46,25 @@ def prepare(df):
 combined = combined.with_columns(
     abs(pl.col(effect_x) - pl.col(effect_y)).alias("difference")
 )
-combined_sorted = combined.sort("difference", descending=True)
-combined_pd = combined_sorted.select(
-    ["ext_gene", "target_id", "min q-value", effect_x, effect_y, "difference"]
-).to_pandas()
-combined_pd.to_csv(snakemake.output[0], sep="\t", index=False)
+combined = (
+    combined.with_columns(
+        (-pl.col("pval_min").log(base=10) * pl.col("difference")).alias("pi_value")
+    )
+    .sort(pl.col("pi_value").abs(), descending=True)
+    .select(
+        pl.col(
+            "ext_gene",
+            "target_id",
+            "qval_min",
+            effect_x,
+            effect_y,
+            "difference",
+            "pi_value",
+        )
+    )
+    .to_pandas()
+)
+combined.to_csv(snakemake.output[0], sep="\t", index=False)
 
 
 # we cannot use vegafusion here because it makes the point selection impossible since
@@ -59,12 +75,12 @@ def prepare(df):
 point_selector = alt.selection_point(fields=["ext_gene"], empty=False)
 
 points = (
-    alt.Chart(combined_pd)
+    alt.Chart(combined)
     .mark_circle(size=15, tooltip={"content": "data"})
     .encode(
         alt.X(effect_x),
         alt.Y(effect_y),
-        alt.Color("min q-value", scale=alt.Scale(scheme="viridis")),
+        alt.Color("qval_min", scale=alt.Scale(scheme="viridis")),
         opacity=alt.value(0.5),
     )
 )
@@ -84,7 +100,7 @@ def prepare(df):
 )
 
 text_background = (
-    alt.Chart(combined_pd)
+    alt.Chart(combined)
     .mark_text(
         align="left",
         baseline="middle",
@@ -102,7 +118,7 @@ def prepare(df):
 )
 
 text = (
-    alt.Chart(combined_pd)
+    alt.Chart(combined)
     .mark_text(
         align="left",
         baseline="middle",