From e44d424aede76e5443f194dd42154256e0826241 Mon Sep 17 00:00:00 2001
From: MANUEL PHILIP <manuelphilip91@gmail.com>
Date: Wed, 14 Jun 2023 20:11:44 +0200
Subject: [PATCH] fix: Updated the get-transcript-info.R file and its
 dependencies (#73)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Johannes Köster <johannes.koester@uni-due.de>
Co-authored-by: Manuel Philip <manuel@sleepy.uk-essen.de>
Co-authored-by: Manuel Philip <manuel@bashful.uk-essen.de>
Co-authored-by: Johannes Köster <johannes.koester@tu-dortmund.de>
Co-authored-by: Manuel Philip <mphilip@c41.ikim.uk-essen.de>
---
 workflow/envs/biomart.yaml                    |   1 +
 .../resources/datavzrd/diffexp-template.yaml  |   9 ++
 workflow/rules/quant_3prime.smk               |   2 +-
 workflow/rules/ref.smk                        |   1 +
 workflow/scripts/get-transcript-info.R        | 142 +++++++++++-------
 5 files changed, 98 insertions(+), 57 deletions(-)

diff --git a/workflow/envs/biomart.yaml b/workflow/envs/biomart.yaml
index 1bb4dfb0..4a5f3189 100644
--- a/workflow/envs/biomart.yaml
+++ b/workflow/envs/biomart.yaml
@@ -4,3 +4,4 @@ channels:
 dependencies:
   - bioconductor-biomart =2.46
   - r-tidyverse =1.3
+  - r-dplyr =1.0.9
diff --git a/workflow/resources/datavzrd/diffexp-template.yaml b/workflow/resources/datavzrd/diffexp-template.yaml
index a31dbdeb..59ba3770 100644
--- a/workflow/resources/datavzrd/diffexp-template.yaml
+++ b/workflow/resources/datavzrd/diffexp-template.yaml
@@ -92,6 +92,15 @@ views:
               domain:
                 - 0.0
                 - 1.0
+        chromosome_name:
+          optional: true
+          display-mode: hidden
+        transcript_mane_select:
+          optional: true
+          display-mode: hidden
+        ensembl_transcript_id_version:
+          optional: true
+          display-mode: hidden
         test_stat:
           display-mode: hidden 
         rss:
diff --git a/workflow/rules/quant_3prime.smk b/workflow/rules/quant_3prime.smk
index 0b4541a2..fc0aa482 100644
--- a/workflow/rules/quant_3prime.smk
+++ b/workflow/rules/quant_3prime.smk
@@ -15,7 +15,7 @@ if is_3prime_experiment:
 
     rule kallisto_3prime_index:
         input:
-            fasta="resources/transcriptome.3prime.fasta",
+            fasta="resources/transcriptome_clean.3prime.fasta",
         output:
             index="results/kallisto_3prime/transcripts.3prime.idx",
         log:
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 0d18b84e..d361fd50 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -36,6 +36,7 @@ rule get_transcript_info:
     params:
         species=get_bioc_species_name(),
         version=config["resources"]["ref"]["release"],
+        three_prime_activated=is_3prime_experiment,
     log:
         "logs/get_transcript_info.log",
     conda:
diff --git a/workflow/scripts/get-transcript-info.R b/workflow/scripts/get-transcript-info.R
index ef94e573..c9734839 100644
--- a/workflow/scripts/get-transcript-info.R
+++ b/workflow/scripts/get-transcript-info.R
@@ -1,16 +1,17 @@
-log <- file(snakemake@log[[1]], open="wt")
-sink(log)
-sink(log, type="message")
+ log <- file(snakemake@log[[1]], open="wt")
+ sink(log)
+ sink(log, type="message")
 
 library("biomaRt")
 library("tidyverse")
+library("dplyr")
 
 # this variable holds a mirror name until
-# useEnsembl succeeds ("www" is last, because 
+# useEnsembl succeeds ("www" is last, because
 # of very frequent "Internal Server Error"s)
 mart <- "useast"
 rounds <- 0
-while ( class(mart)[[1]] != "Mart" ) {
+while (class(mart)[[1]] != "Mart") {
   mart <- tryCatch(
     {
       # done here, because error function does not
@@ -39,69 +40,98 @@ while ( class(mart)[[1]] != "Mart" ) {
       }
       # hop to next mirror
       mart <- switch(mart,
-                     useast = "uswest",
-                     uswest = "asia",
-                     asia = "www",
-                     www = {
-                       # wait before starting another round through the mirrors,
-                       # hoping that intermittent problems disappear
-                       Sys.sleep(30)
-                       "useast"
-                     }
-              )
+        useast = "uswest",
+        uswest = "asia",
+        asia = "www",
+        www = {
+          # wait before starting another round through the mirrors,
+          # hoping that intermittent problems disappear
+          Sys.sleep(30)
+          "useast"
+        }
+      )
     }
   )
 }
+three_prime_activated <- snakemake@params[["three_prime_activated"]]
 
 attributes <- c("ensembl_transcript_id",
                 "ensembl_gene_id",
                 "external_gene_name",
                 "description")
-
-has_canonical <- "transcript_is_canonical" %in% biomaRt::listAttributes(mart=mart)$name
-
-if (has_canonical) {
-  attributes <- c(attributes, "transcript_is_canonical")
+has_canonical <-
+  "transcript_is_canonical" %in% biomaRt::listAttributes(mart = mart)$name
+#Check if three_prime_activated is activated or else if transcipts are cononical
+if (has_canonical && three_prime_activated) {
+  attributes <- c(attributes, "transcript_is_canonical", "chromosome_name",
+    "transcript_mane_select", "ensembl_transcript_id_version")
+  has_mane_select <-
+  "transcript_mane_select" %in% biomaRt::listAttributes(mart = mart)$name
+}else if (has_canonical) {
+     attributes <- c(attributes, "transcript_is_canonical")
 }
-
 t2g <- biomaRt::getBM(
-            attributes = attributes,
-            mart = mart,
-            useCache = FALSE
-            )
-if (!has_canonical) {
-  t2g <- t2g %>% add_column(transcript_is_canonical = NA)
+attributes = attributes,
+mart = mart,
+useCache = FALSE
+)
+# Set columns as NA if three_prime_activated is set to false or if the transcipts are not canonical
+if (!has_canonical || !three_prime_activated) {
+  t2g <- t2g %>% add_column(chromosome_name = NA, transcript_mane_select = NA,
+      ensembl_transcript_id_version = NA)
+}else if (!has_canonical) {
+   t2g <- t2g %>% add_column(transcript_is_canonical = NA)
 }
-
 t2g <- t2g %>%
-        rename( target_id = ensembl_transcript_id,
-                ens_gene = ensembl_gene_id,
-                ext_gene = external_gene_name,
-                gene_desc = description,
-                canonical = transcript_is_canonical
-                ) %>%
-        mutate_at(
-          vars(gene_desc),
-          function(values) { str_trim(map(values, function (v) { str_split(v, r"{\[}")[[1]][1]})) } # remove trailing source annotation (e.g. [Source:HGNC Symbol;Acc:HGNC:5])
-        ) %>%
-        mutate_at(
-          vars(canonical),
-          function(values) {
-            as_vector(
-              map(
-                str_trim(values), 
-                function(v) {
-                  if (is.na(v)) {
-                    NA
-                  } else if (v == "1") { 
-                    TRUE 
-                  } else if (v == "0") {
-                    FALSE
-                  }
-                }
-              )
-            )
+  rename(
+    target_id = ensembl_transcript_id,
+    ens_gene = ensembl_gene_id,
+    ext_gene = external_gene_name,
+    gene_desc = description,
+    canonical = transcript_is_canonical,
+    chromosome_name = chromosome_name,
+    transcript_mane_select = transcript_mane_select,
+    ensembl_transcript_id_version = ensembl_transcript_id_version,
+  ) %>%
+  mutate_at(
+    vars(gene_desc),
+    function(values) {
+      str_trim(map(values, function(v) {
+        str_split(v, r"{\[}")[[1]][1]
+      }))
+    } # remove trailing source annotation (e.g. [Source:HGNC Symbol;Acc:HGNC:5])
+  ) %>%
+  mutate_at(
+    vars(canonical),
+    function(values) {
+      as_vector(
+        map(
+          str_trim(values),
+          function(v) {
+            if (is.na(v)) {
+              NA
+            } else if (v == "1") {
+              TRUE
+            } else if (v == "0") {
+              FALSE
+            }
           }
         )
-
+      )
+    }
+  )
+# Check if 3-prime-rna-seq is activated, filter transcipts that are mane selected and filter chromosomes that are defined as "patch" 
+if (three_prime_activated) {
+  if (has_mane_select) {
+    t2g <- t2g %>%
+    filter(!str_detect(chromosome_name, "patch|PATCH")) %>%
+    filter(str_detect(transcript_mane_select, ""))
+  }else {
+    stop(
+      str_c(
+        "needed mane_selected column in biomart if three prime mode is activated"
+      )
+    )
+  }
+}
 write_rds(t2g, file = snakemake@output[[1]], compress = "gz")
\ No newline at end of file