From e44d424aede76e5443f194dd42154256e0826241 Mon Sep 17 00:00:00 2001 From: MANUEL PHILIP Date: Wed, 14 Jun 2023 20:11:44 +0200 Subject: [PATCH] fix: Updated the get-transcript-info.R file and its dependencies (#73) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Köster Co-authored-by: Manuel Philip Co-authored-by: Manuel Philip Co-authored-by: Johannes Köster Co-authored-by: Manuel Philip --- workflow/envs/biomart.yaml | 1 + .../resources/datavzrd/diffexp-template.yaml | 9 ++ workflow/rules/quant_3prime.smk | 2 +- workflow/rules/ref.smk | 1 + workflow/scripts/get-transcript-info.R | 142 +++++++++++------- 5 files changed, 98 insertions(+), 57 deletions(-) diff --git a/workflow/envs/biomart.yaml b/workflow/envs/biomart.yaml index 1bb4dfb0..4a5f3189 100644 --- a/workflow/envs/biomart.yaml +++ b/workflow/envs/biomart.yaml @@ -4,3 +4,4 @@ channels: dependencies: - bioconductor-biomart =2.46 - r-tidyverse =1.3 + - r-dplyr =1.0.9 diff --git a/workflow/resources/datavzrd/diffexp-template.yaml b/workflow/resources/datavzrd/diffexp-template.yaml index a31dbdeb..59ba3770 100644 --- a/workflow/resources/datavzrd/diffexp-template.yaml +++ b/workflow/resources/datavzrd/diffexp-template.yaml @@ -92,6 +92,15 @@ views: domain: - 0.0 - 1.0 + chromosome_name: + optional: true + display-mode: hidden + transcript_mane_select: + optional: true + display-mode: hidden + ensembl_transcript_id_version: + optional: true + display-mode: hidden test_stat: display-mode: hidden rss: diff --git a/workflow/rules/quant_3prime.smk b/workflow/rules/quant_3prime.smk index 0b4541a2..fc0aa482 100644 --- a/workflow/rules/quant_3prime.smk +++ b/workflow/rules/quant_3prime.smk @@ -15,7 +15,7 @@ if is_3prime_experiment: rule kallisto_3prime_index: input: - fasta="resources/transcriptome.3prime.fasta", + fasta="resources/transcriptome_clean.3prime.fasta", output: index="results/kallisto_3prime/transcripts.3prime.idx", log: diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 0d18b84e..d361fd50 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -36,6 +36,7 @@ rule get_transcript_info: params: species=get_bioc_species_name(), version=config["resources"]["ref"]["release"], + three_prime_activated=is_3prime_experiment, log: "logs/get_transcript_info.log", conda: diff --git a/workflow/scripts/get-transcript-info.R b/workflow/scripts/get-transcript-info.R index ef94e573..c9734839 100644 --- a/workflow/scripts/get-transcript-info.R +++ b/workflow/scripts/get-transcript-info.R @@ -1,16 +1,17 @@ -log <- file(snakemake@log[[1]], open="wt") -sink(log) -sink(log, type="message") + log <- file(snakemake@log[[1]], open="wt") + sink(log) + sink(log, type="message") library("biomaRt") library("tidyverse") +library("dplyr") # this variable holds a mirror name until -# useEnsembl succeeds ("www" is last, because +# useEnsembl succeeds ("www" is last, because # of very frequent "Internal Server Error"s) mart <- "useast" rounds <- 0 -while ( class(mart)[[1]] != "Mart" ) { +while (class(mart)[[1]] != "Mart") { mart <- tryCatch( { # done here, because error function does not @@ -39,69 +40,98 @@ while ( class(mart)[[1]] != "Mart" ) { } # hop to next mirror mart <- switch(mart, - useast = "uswest", - uswest = "asia", - asia = "www", - www = { - # wait before starting another round through the mirrors, - # hoping that intermittent problems disappear - Sys.sleep(30) - "useast" - } - ) + useast = "uswest", + uswest = "asia", + asia = "www", + www = { + # wait before starting another round through the mirrors, + # hoping that intermittent problems disappear + Sys.sleep(30) + "useast" + } + ) } ) } +three_prime_activated <- snakemake@params[["three_prime_activated"]] attributes <- c("ensembl_transcript_id", "ensembl_gene_id", "external_gene_name", "description") - -has_canonical <- "transcript_is_canonical" %in% biomaRt::listAttributes(mart=mart)$name - -if (has_canonical) { - attributes <- c(attributes, "transcript_is_canonical") +has_canonical <- + "transcript_is_canonical" %in% biomaRt::listAttributes(mart = mart)$name +#Check if three_prime_activated is activated or else if transcipts are cononical +if (has_canonical && three_prime_activated) { + attributes <- c(attributes, "transcript_is_canonical", "chromosome_name", + "transcript_mane_select", "ensembl_transcript_id_version") + has_mane_select <- + "transcript_mane_select" %in% biomaRt::listAttributes(mart = mart)$name +}else if (has_canonical) { + attributes <- c(attributes, "transcript_is_canonical") } - t2g <- biomaRt::getBM( - attributes = attributes, - mart = mart, - useCache = FALSE - ) -if (!has_canonical) { - t2g <- t2g %>% add_column(transcript_is_canonical = NA) +attributes = attributes, +mart = mart, +useCache = FALSE +) +# Set columns as NA if three_prime_activated is set to false or if the transcipts are not canonical +if (!has_canonical || !three_prime_activated) { + t2g <- t2g %>% add_column(chromosome_name = NA, transcript_mane_select = NA, + ensembl_transcript_id_version = NA) +}else if (!has_canonical) { + t2g <- t2g %>% add_column(transcript_is_canonical = NA) } - t2g <- t2g %>% - rename( target_id = ensembl_transcript_id, - ens_gene = ensembl_gene_id, - ext_gene = external_gene_name, - gene_desc = description, - canonical = transcript_is_canonical - ) %>% - mutate_at( - vars(gene_desc), - function(values) { str_trim(map(values, function (v) { str_split(v, r"{\[}")[[1]][1]})) } # remove trailing source annotation (e.g. [Source:HGNC Symbol;Acc:HGNC:5]) - ) %>% - mutate_at( - vars(canonical), - function(values) { - as_vector( - map( - str_trim(values), - function(v) { - if (is.na(v)) { - NA - } else if (v == "1") { - TRUE - } else if (v == "0") { - FALSE - } - } - ) - ) + rename( + target_id = ensembl_transcript_id, + ens_gene = ensembl_gene_id, + ext_gene = external_gene_name, + gene_desc = description, + canonical = transcript_is_canonical, + chromosome_name = chromosome_name, + transcript_mane_select = transcript_mane_select, + ensembl_transcript_id_version = ensembl_transcript_id_version, + ) %>% + mutate_at( + vars(gene_desc), + function(values) { + str_trim(map(values, function(v) { + str_split(v, r"{\[}")[[1]][1] + })) + } # remove trailing source annotation (e.g. [Source:HGNC Symbol;Acc:HGNC:5]) + ) %>% + mutate_at( + vars(canonical), + function(values) { + as_vector( + map( + str_trim(values), + function(v) { + if (is.na(v)) { + NA + } else if (v == "1") { + TRUE + } else if (v == "0") { + FALSE + } } ) - + ) + } + ) +# Check if 3-prime-rna-seq is activated, filter transcipts that are mane selected and filter chromosomes that are defined as "patch" +if (three_prime_activated) { + if (has_mane_select) { + t2g <- t2g %>% + filter(!str_detect(chromosome_name, "patch|PATCH")) %>% + filter(str_detect(transcript_mane_select, "")) + }else { + stop( + str_c( + "needed mane_selected column in biomart if three prime mode is activated" + ) + ) + } +} write_rds(t2g, file = snakemake@output[[1]], compress = "gz") \ No newline at end of file