gather isa and rmat stats... scripts added

CCBR · Jan 4, 2022 · 6544a9c · 6544a9c
1 parent c09ca2a
commit 6544a9c
Show file tree

Hide file tree

Showing 5 changed files with 261 additions and 1 deletion.
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -24,6 +24,8 @@ rule all:
         expand(join(WORKDIR,"rsem","isoformcounts","{sample}","{sample}.RSEM.isoforms.results"),sample=SAMPLES),
         # rmats output
         expand(join(RESULTSDIR,"{contrast}","rmats","summary.txt"),contrast=CONTRASTS),
+        join(RESULTSDIR,"rmats.results.xlsx"),
         # IsoformSwitchAnalyzeR output
-        expand(join(RESULTSDIR,"{contrast}","isa","SplicingSummary.pdf"),contrast=CONTRASTS)
+        expand(join(RESULTSDIR,"{contrast}","isa","SplicingSummary.pdf"),contrast=CONTRASTS),
+        join(RESULTSDIR,"isa.results.xlsx")
 
diff --git a/workflow/rules/isa.smk b/workflow/rules/isa.smk
@@ -60,3 +60,18 @@ Rscript {params.rscript} \
  -o {params.outdir}
 """
 
+localrules: gather_isa_stats
+
+rule gather_isa_stats:
+    input:
+        expand(join(RESULTSDIR,"{contrast}","isa","SplicingSummary.pdf"),contrast=CONTRASTS)
+    output:
+        xlsx=join(RESULTSDIR,"isa.results.xlsx")
+    params:
+        rscript=join(SCRIPTSDIR,"gather_isa_stats.R"),
+        resultsdir=RESULTSDIR
+    envmodules: TOOLS["R"]["version"]
+    shell:"""
+set -exuf -o pipefail
+Rscript {params.rscript} -d {params.resultsdir}
+"""
diff --git a/workflow/rules/rmats.smk b/workflow/rules/rmats.smk
@@ -45,3 +45,19 @@ python ${{RMATS_SRC}}/rmats.py \
     --libType "fr-secondstrand" \
     --tmp /lscratch/${{SLURM_JOB_ID}}/
 """
+
+localrules: gather_rmats_stats
+
+rule gather_rmats_stats:
+    input:
+        expand(join(RESULTSDIR,"{contrast}","rmats","summary.txt"),contrast=CONTRASTS)
+    output:
+        xlsx=join(RESULTSDIR,"rmats.results.xlsx")
+    params:
+        rscript=join(SCRIPTSDIR,"gather_rmats_stats.R"),
+        resultsdir=RESULTSDIR
+    envmodules: TOOLS["R"]["version"]
+    shell:"""
+set -exuf -o pipefail
+Rscript {params.rscript} -d {params.resultsdir}
+"""
diff --git a/workflow/scripts/gather_isa_stats.R b/workflow/scripts/gather_isa_stats.R
@@ -0,0 +1,95 @@
+#!/usr/bin/env Rscript
+# This script is used to concatenate (row bind) all results from IsoformSwitchAnalyzer.
+# ISA generated 4 types of files:
+# 1. top_switched_genes.tsv which are sorted by gene_switch_q_value
+# 2. top_switched_isoforms.tsv which are sorted by isoform_switch_q_value
+# 3. splicingEnrichment.tsv which are sorted by propUpQval
+# 4. genomeWideSplicing.tsv which are sorted by wilcoxQval
+#
+# Eg:
+#
+# Rscript gather_isa_stats.R \
+#  -d /data/Ziegelbauer_lab/ccbr1140/samples_1/results \
+#  -p top_switched_genes.tsv \
+#  -s gene_switch_q_value \
+#  -o top_switched_genes.all_samples.tsv
+
+#
+suppressPackageStartupMessages(library("argparse"))
+
+# create parser object
+parser <- ArgumentParser()
+
+# specify our desired options 
+# by default ArgumentParser will add an help option 
+
+parser$add_argument("-d", "--dirname", 
+                    type="character", 
+                    help="results dir of the pipeline or dir where to search recursively for files with given pattern",
+                    required=TRUE)
+
+
+args <- parser$parse_args()
+
+suppressPackageStartupMessages(library("tidyverse"))
+suppressPackageStartupMessages(library("openxlsx"))
+
+wd=args$dirname
+setwd(wd)
+
+globpatterns=c("top_switched_genes.tsv","top_switched_isoforms.tsv","splicingEnrichment.tsv","genomeWideSplicing.tsv")
+sortcolumns=c("gene_switch_q_value","isoform_switch_q_value","propUpQval","wilcoxQval")
+outfiles=c("top_switched_genes.all_samples.tsv","top_switched_isoforms.all_samples.tsv","splicingEnrichment.all_samples.tsv","genomeWideSplicing.all_samples.tsv")
+
+for (i in 1:length(globpatterns)){
+  glob_pattern=globpatterns[i]
+  sort_column=sortcolumns[i]
+  outfile=outfiles[i]
+
+  files=list.files(pattern = glob_pattern,recursive = TRUE)
+  files
+
+  read_input_file<-function(fp){
+    df=read.csv(fp,header=TRUE,sep="\t",
+                check.names = FALSE,
+                comment.char = "#",
+                strip.white = TRUE)
+    return(df)
+  }
+
+  for (i in 1:length(files)){
+    x=read_input_file(files[i])
+    if (i==1){
+      d=x
+    } else {
+      d=rbind(d,x)
+    }
+    rm(x)
+  }
+  rm(i)
+
+  d=d[order(d[,c(sort_column)]),]
+  write.table(d,
+              file=outfile,
+              row.names = FALSE,
+              col.names = TRUE,
+              quote = FALSE,
+              sep="\t")
+}
+
+read2df=function(f){
+  d=read.csv(f,header=TRUE,
+             sep="\t",
+             check.names=FALSE,
+             strip.white = TRUE,
+             comment.char = "#")
+  return(as.data.frame(d))
+}
+
+list_of_datasets<-list("top_switched_genes"=read2df("top_switched_genes.all_samples.tsv"),
+                       "top_switched_isoforms"=read2df("top_switched_isoforms.all_samples.tsv"),
+                       "splicingEnrichment"=read2df("splicingEnrichment.all_samples.tsv"),
+                       "genomeWideSplicing"=read2df("genomeWideSplicing.all_samples.tsv"))
+
+
+write.xlsx(list_of_datasets, file = "isa.results.xlsx", overwrite = TRUE)
diff --git a/workflow/scripts/gather_rmats_stats.R b/workflow/scripts/gather_rmats_stats.R
@@ -0,0 +1,132 @@
+#!/usr/bin/env Rscript
+
+suppressPackageStartupMessages(library("argparse"))
+
+# create parser object
+parser <- ArgumentParser()
+
+# specify our desired options 
+# by default ArgumentParser will add an help option 
+
+parser$add_argument("-d", "--dirname", 
+                    type="character", 
+                    help="results dir of the pipeline or dir where to search recursively for files with given pattern",
+                    required=TRUE)
+
+
+args <- parser$parse_args()
+
+
+suppressPackageStartupMessages(library("tidyverse"))
+suppressPackageStartupMessages(library("openxlsx"))
+
+wd=args$dirname
+setwd(wd)
+
+# summary files only
+
+glob_pattern="summary.txt"
+sort_column="SignificantEventsJCEC"
+
+
+files=list.files(pattern = glob_pattern,recursive = TRUE)
+files
+# functions
+get_contrast<-function(fp){
+  return(unlist(strsplit(fp,split="/"))[1])
+}
+read_input_file<-function(fp,contrast_name){
+  df=read.csv(fp,header=TRUE,sep="\t",
+           check.names = FALSE,
+           comment.char = "#",
+           strip.white = TRUE)
+  df$contrast_name=contrast_name
+  return(df)
+}
+
+
+contrasts=unlist(lapply(files, get_contrast))
+
+for (i in 1:length(contrasts)){
+  x=read_input_file(files[i],contrasts[i])
+  if (i==1){
+    d=x
+  } else {
+    d=rbind(d,x)
+  }
+  rm(x)
+}
+rm(i)
+
+d <- arrange(d,desc(SignificantEventsJCEC))
+write.table(d,file="summarys.tsv",sep = "\t",quote = FALSE,row.names = FALSE,col.names = TRUE)
+
+
+# loop througth ASE events
+
+event_types=c("RI","A3SS","A5SS","MXE","SE")
+
+for (event_type in event_types){
+glob_pattern=paste(event_type,"MATS.JCEC.txt",sep=".")
+# sort_column="SignificantEventsJCEC"
+
+
+files=list.files(pattern = glob_pattern,recursive = TRUE)
+files
+# functions
+get_contrast<-function(fp){
+  return(unlist(strsplit(fp,split="/"))[1])
+}
+read_input_file<-function(fp,contrast_name){
+  print("Reading file")
+  print(fp)
+  df=read.csv(fp,header=TRUE,sep="\t",
+           check.names = FALSE,
+           comment.char = "#",
+           strip.white = TRUE)
+  if (nrow(df)!=0){
+    df$contrast_name=contrast_name
+  }
+  return(df)
+}
+
+
+
+contrasts=unlist(lapply(files, get_contrast))
+
+for (i in 1:length(contrasts)){
+  x=read_input_file(files[i],contrasts[i])
+  if (i==1){
+    d=x
+  } else {
+    if (nrow(d)!=0){
+      d=rbind(d,x)
+    }
+  }
+  rm(x)
+}
+rm(i)
+
+d<-d[order(d$FDR),]
+head(d)
+write.table(d,file=paste(event_type,"JCEC.all_contrasts.tsv",sep="."),sep = "\t",quote = FALSE,row.names = FALSE,col.names = TRUE)
+rm(d)
+}
+
+read2df=function(f){
+  d=read.csv(f,header=TRUE,
+             sep="\t",
+             check.names=FALSE,
+             strip.white = TRUE,
+             comment.char = "#")
+  return(as.data.frame(d))
+}
+list_of_datasets<-list("Summary"=read2df("summarys.tsv"),
+                       "RI"=read2df("RI.JCEC.all_contrasts.tsv"),
+                       "SE"=read2df("SE.JCEC.all_contrasts.tsv"),
+                       "A3SS"=read2df("A3SS.JCEC.all_contrasts.tsv"),
+                       "A5SS"=read2df("A5SS.JCEC.all_contrasts.tsv"),
+                       "MXE"=read2df("MXE.JCEC.all_contrasts.tsv"))
+
+
+write.xlsx(list_of_datasets, file = "rmats.results.xlsx", overwrite = TRUE)