07-event-models.Rmd

---
title: "Events"
output: pdf_document
---

```{r}
library(ggplot2)
library(glmnet)
```


# event-specific models
```{r, fig.width=20, fig.height=10}
# Load the objects stored in 'aggregating.rda'.
# NOTE(review): event_dt and cCREs are used below without being defined in this
# file - presumably they come from this load; confirm.
load('aggregating.rda')
sig_threshold <- .05

event_models <- readRDS('event_models.rds')

# Sample count per event model; the names of the list end up in the ID column.
nsamples_dt <- rbindlist(
  lapply(event_models, function(model) list(nsamples = model$nsamples)),
  idcol = 'ID'
)
ids_with_big_nsamples <- nsamples_dt[nsamples >= 200, as.integer(ID)]

# Stack the coefficient tables of all fits whose name ends in "OLS", one
# sub-table per fit, over all events. Events without a `cvfit` entry or without
# OLS fits contribute nothing.
coef_dt <- rbindlist(
  lapply(event_models, function(model) {
    if (!"cvfit" %in% names(model)) {
      return(NULL)
    }
    ols_fits <- model$cvfit[endsWith(names(model$cvfit), "OLS")]
    if (length(ols_fits) == 0) {
      return(NULL)
    }
    per_fit <- lapply(names(ols_fits), function(fit_name) {
      coef_mat <- coef(summary(ols_fits[[fit_name]]))
      # estimate and p-value are read from columns 1 and 4 below
      if (ncol(coef_mat) != 4) stop('something did not work')
      data.table(
        feature = rownames(coef_mat),
        coefs = coef_mat[, 1],
        pvalue = coef_mat[, 4],
        # the part of the fit name before the first "::"
        random = strsplit(fit_name, "::", fixed = TRUE)[[1]][1]
      )
    })
    rbindlist(per_fit)
  }),
  idcol = 'ID'
)

# Drop the intercepts, then BH-adjust the p-values within each event.
coef_dt <- coef_dt[feature != '(Intercept)']
coef_dt[, padj := p.adjust(pvalue, 'BH'), by = ID]
# coef_dt[, padj:=p.adjust(pvalue, 'BH')]
# coef_dt[, padj_BF:=p.adjust(pvalue, 'bonferroni')]
coef_dt[, ID := as.integer(ID)]
coef_dt[event_dt, on = .(ID), `Event Type` := `Event Type`]
# coef_dt[, coef_rank:=rank(-coefs), by=.(ID)]

# Annotate each feature: strip backticks, read the trailing "_<digits>" suffix
# as a row index into cCREs (features without such a suffix become NA), and
# take the part before the first ';' as the mark.
coef_dt[, feature := gsub('`', '', feature, fixed = TRUE)]
coef_dt[, cCRE_id := as.integer(gsub('^.+_(\\d+)$', '\\1', feature))]
coef_dt[
  ,
  c('cCRE_type', 'cCRE_accession') := cCREs[cCRE_id, .(cCRE_type, accession)]
]
coef_dt[, mark := tstrsplit(feature, ';', fixed = TRUE, keep = 1)]
# For cCRE features, replace everything after the first '_' by the cCRE type.
coef_dt[
  !is.na(cCRE_type),
  feature := paste(
    tstrsplit(feature, '_', fixed = TRUE, keep = 1)[[1]],
    cCRE_type,
    sep = ';'
  )
]
coef_dt[is.na(cCRE_type), cCRE_type := 'no_cCRE']
coef_dt[, cCRE_type := as.factor(cCRE_type)]

# Significant coefficients, keeping only events in which 'gene_expression' is
# not itself among the significant features.
sig_coef_dt <- coef_dt[
  padj < sig_threshold,
  if (!'gene_expression' %in% feature) .SD,
  by = .(ID)
]
sig_coef_dt[, positive := coefs > 0]
# ggplot(sig_coef_dt, aes(y = coefs, x = tidytext::reorder_within(feature, -coefs, `Event Type`, median), color = random))  + geom_boxplot() + #geom_jitter() + 
#   labs(x='feature', title = 'nonzero coeffficients for each event with an enhancer in 5kb window') + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = .5)) + tidytext::scale_x_reordered() + facet_wrap(~ `Event Type`, scales='free', nrow = 2)
# ggplot(sig_coef_dt#feature != '(Intercept)' & ID %in% coef_dt[!is.na(cCRE_type), ID]
#        , aes(x = coefs, color = mark)) + geom_density() + theme_bw() + theme(axis.text.x = element_text(
#          angle = 90,
#          hjust = 1,
#          vjust = .5
#        )) + facet_wrap(~ `Event Type` + cCRE_type, nrow = 2, scales = 'free_y')
# ggplot(sig_coef_dt,
#        aes(
#          y = coefs,
#          x = tidytext::reorder_within(mark,-coefs, list(`Event Type`, cCRE_type), median),
#          color = random
#        ))  + geom_boxplot() + #geom_jitter() +
#   labs(x = 'feature', title = 'nonzero coeffficients for each event with an enhancer in 5kb window') + theme_bw() + theme(axis.text.x = element_text(
#     angle = 90,
#     hjust = 1,
#     vjust = .5
#   )) + tidytext::scale_x_reordered() + facet_wrap(
#     ~ `Event Type` + cCRE_type,
#     nrow = 2,
#     scales = 'free',
#     drop = FALSE
#   )
```


```{r, eval= FALSE}
# Export the accession lists as plain text, one accession per line.
# NOTE(review): presumably these files are read by
# 07-event-specific-get-tf.R, which is sourced below - confirm.
all_cCRE_accessions <- cCREs[to(cCRE_hits)[from(cCRE_hits) %in% keep_rows], unique(accession)]
# Passing the path lets writeLines() open and close the file itself, so no
# connection stays open if writing fails.
writeLines(all_cCRE_accessions, "all_cCRE_accessions.txt")

# Features that are not linked to a cCRE have an NA accession in coef_dt;
# exclude them so that no literal "NA" line ends up in the file.
nonzero_cCRE_accessions <- coef_dt[!is.na(cCRE_accession), unique(cCRE_accession)]
writeLines(nonzero_cCRE_accessions, "nonzero_cCRE_accessions.txt")

# Only (re)build the TF table when it is not there yet.
if (!file.exists('tf_dt.csv')) {
  source('07-event-specific-get-tf.R')
}
```

```{r, fig.width=20, fig.height=20}
# Accessions of all cCREs whose overlapping event is listed in keep_rows.
all_cCRE_accessions <- unique(
  cCREs[to(cCRE_hits)[from(cCRE_hits) %in% keep_rows], accession]
)

# Distinct (accession, Event Type) pairs for the same overlaps.
cCRE2event <- unique(
  data.table(
    accession = cCREs[to(cCRE_hits)[from(cCRE_hits) %in% keep_rows], accession],
    `Event Type` = event_dt[
      from(cCRE_hits)[from(cCRE_hits) %in% keep_rows],
      `Event Type`
    ]
  )
)

# De-duplicated TF table; strings are read as factors.
tf_dt <- unique(fread('tf_dt.csv', stringsAsFactors = TRUE))
# Every cCRE accession must be present in the TF table.
stopifnot(all(all_cCRE_accessions %in% tf_dt[["accession"]]))

# Logical accession x TF matrix: TRUE where tf_dt has at least one row for the
# (accession, TF) pair.
tf_wide <- dcast(
  tf_dt,
  accession ~ name,
  value.var = 'n',
  fun.aggregate = function(x) length(x) > 0
)

# tf_dt <- tf_dt[name != 'EMPTY']
# tf_dt[, percentage:=n/total]
# tf_dt <- na.omit(tf_dt[cCRE2event, on=.(accession), allow.cartesian=TRUE], cols='name')
# 
# # tfs_by_accession <- tf_dt[, list(tfs = list(name)), by=.(accession)]
# # coef_dt[alpha == '1' & coefs != 0 & feature != '(Intercept)' & ID %in% coef_dt[!is.na(cCRE_type), ID]]
# # coef_dt[tfs_by_accession, on=c(cCRE_accession='accession'), tfs:=tfs]
# cols_to_tf <- c('coefs', 'cCRE_type', 'mark', 'padj', 'ID', 'random')
# tf_dt[coef_dt, on=c(accession='cCRE_accession'), (cols_to_tf):=mget(cols_to_tf)]
# 
# tf_agg <- tf_dt[padj < sig_threshold, if(.N>2) .(nobs=.N, all_neg=all(coefs<0), all_pos=all(coefs>0), min=min(coefs), median=median(coefs), mean=mean(coefs), max=max(coefs)), by=.(`Event Type`, cCRE_type, mark, name)]
# tf_agg[, .(n_all_neg=sum(all_neg), n_all_pos=sum(all_pos), n_both=sum(!all_neg & !all_pos)), by=.(`Event Type`, cCRE_type, mark)]
# ggplot(melt(tf_agg[, .(n_all_neg=sum(all_neg), n_all_pos=sum(all_pos), n_both=sum(!all_neg & !all_pos)), by=.(`Event Type`, cCRE_type, mark)], id.vars = c('Event Type', 'cCRE_type', 'mark')), aes(x=variable, y = value, fill = mark)) + geom_col(position = 'dodge') + facet_wrap(~ `Event Type` + cCRE_type, nrow= 2)
# 
# # ggplot(tf_dt[n> 1 & cCRE_type == 'dELS' & `Event Type` == 'SE'], aes(x = reorder(name, coefs, mean), y = coefs)) + geom_boxplot() + facet_wrap(~ mark, ncol = 1, scales = 'free_y') + theme_bw() + theme(axis.text.x = element_text(
# #     angle = 90,
# #     hjust = 1,
# #     vjust = .5
# #   ))

```

```{r}
# Packages for the LOLA region-set enrichment run in the next chunk.
library(simpleCache)
library(qvalue)
library(LOLA)
# Load the LOLA region database from a hard-coded absolute path; the directory
# names suggest the hg38 LOLACore collection. The path has to be adapted when
# this notebook is run on another machine.
regionDB <- loadRegionDB("/nfs/data/references/LOLA/LOLACore/hg38")
```


```{r}
library(plotly)
library(ggrepel)
# For each event type in `to_analyze` and each value of `random`, test every
# TF for over-representation among the cCREs that carry a significant
# coefficient (one-sided Fisher test per TF, BH-adjusted across TFs), plot the
# results, and run LOLA on the same foreground/universe.
for (event in to_analyze) {
  # Universe for this event type: cCREs overlapping a retained event of that
  # type. The filter uses the loop variable so each event type gets its own
  # universe.
  event_ids <- event_dt[ID %in% keep_rows & `Event Type` == event, ID]
  event_accessions <- cCREs[to(cCRE_hits)[from(cCRE_hits) %in% event_ids], unique(accession)]
  event_wide <- tf_wide[accession %in% event_accessions]
  tf_names <- tf_dt[, levels(name)]

  for (this_random in c("FALSE", "TRUE")) {
    # Stratifying by mark / coefficient sign is currently disabled:
    # for (this_mark in sig_coef_dt[, unique(mark)]) {
    #   for (this_positive in sig_coef_dt[, unique(positive)]) {
    significant_cCREs <- sig_coef_dt[
      `Event Type` == event & random == this_random,
      # & this_mark == mark & this_positive == positive
      unique(cCRE_accession)
    ]
    # Flag the universe rows that belong to the foreground set.
    event_wide[, significant := accession %in% significant_cCREs]

    contingency_list <- pbmcapply::pbmclapply(tf_names, function(this_tf) {
      # Fixed factor levels guarantee a 2x2 table with every count in the
      # right cell, even if the TF column or `significant` is constant.
      tbl <- table(
        tf = factor(event_wide[[this_tf]], levels = c(FALSE, TRUE)),
        significant = factor(event_wide$significant, levels = c(FALSE, TRUE))
      )
      htest <- fisher.test(tbl, alternative = 'greater')
      # Cell counts in reversed column-major order, i.e. starting with
      # (TF present, significant).
      counts <- rev(as.vector(tbl))
      names(counts) <- c("n_tf_sig", "n_other_sig", "n_tf_nonsig", "n_other_nonsig")
      c(
        list(
          p.value = htest$p.value,
          conf.int.lower = htest$conf.int[1],
          conf.int.upper = htest$conf.int[2],
          odds.ratio = unname(htest$estimate)
        ),
        as.list(counts)
      )
    })
    names(contingency_list) <- tf_names
    enrichment_table <- rbindlist(contingency_list, idcol = 'tf')
    enrichment_table[, p.adjust := p.adjust(p.value, method = 'BH')]
    print(enrichment_table[p.adjust <= sig_threshold])

    if (enrichment_table[, all(odds.ratio == 0)]) {
      print(paste(event, "and", this_random, "has only 0 odds ratios"))
      # Skip only this combination; the other `random` setting is still run.
      next
    }

    # The title only names variables that exist while the mark / sign loops
    # above are disabled.
    plot_title <- paste(event, "Random:", this_random)
    print(
      ggplot(
        enrichment_table,
        aes(
          y = -log10(p.adjust),
          x = log2(odds.ratio),
          label = tf,
          color = p.adjust <= sig_threshold
        )
      ) +
        geom_hline(yintercept = -log10(sig_threshold), color = "grey", linetype = "dashed") +
        geom_point() +
        theme_bw() +
        geom_label_repel(max.overlaps = 20) +
        labs(title = plot_title)
    )
    print(
      ggplot(enrichment_table, aes(x = log2(odds.ratio), fill = p.adjust <= sig_threshold)) +
        geom_histogram(binwidth = .1) +
        theme_bw() +
        labs(title = plot_title)
    )

    # LOLA enrichment of the significant cCREs against the event's universe.
    lola_res <- runLOLA(
      cCRE_gr[cCREs[accession %in% significant_cCREs, which = TRUE]],
      userUniverse = cCRE_gr[cCREs[accession %in% event_accessions, which = TRUE]],
      regionDB = regionDB
    )
    sig_lola <- lola_res[qValue <= sig_threshold]
    print(sig_lola)
    # Nothing to plot when no region set passes the threshold.
    if (nrow(sig_lola) > 0) {
      print(plotTopLOLAEnrichments(sig_lola))
    }
    #   }
    # }
  }
}
```

```{r, eval=FALSE}
library(biomaRt)
library(gprofiler2)
# TFs with an odds ratio of at least 2 in the current enrichment_table.
enrichment_tfs <- enrichment_table[odds.ratio >= 2, tf]

# Show gene name and description of those TFs from Ensembl.
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
getBM(
  attributes = c('external_gene_name', 'description'),
  filters = 'external_gene_name',
  values = enrichment_tfs,
  mart = ensembl
)

# GO enrichment of the selected TFs against the background of all TFs in tf_dt.
gostres <- gost(
  query = enrichment_tfs,
  organism = "hsapiens",
  ordered_query = FALSE,
  multi_query = FALSE,
  significant = FALSE,
  exclude_iea = FALSE,
  measure_underrepresentation = FALSE,
  evcodes = FALSE,
  user_threshold = sig_threshold,
  correction_method = "fdr",
  domain_scope = "custom",
  custom_bg = tf_dt[, unique(name)],
  numeric_ns = "",
  sources = c('GO'),
  as_short_link = FALSE
)
gostplot(gostres, capped = TRUE, interactive = FALSE)
```

```{r, fig.width=10, fig.height=7}
library(gprofiler2)
library(UpSetR)
library(grid)

# UpSet plots of TFs per event type, cCRE type and coefficient sign, showing in
# which marks a TF has all-negative / all-positive coefficients.
# NOTE(review): `tf_agg` is not created by any active code visible in this
# file (its construction is commented out in an earlier chunk), so it must
# already exist in the session - confirm.
lhs <- c("Event Type", "name", "cCRE_type")
tf_agg_melt <- melt(
  tf_agg,
  id.vars = c(lhs, 'mark'),
  measure.vars = c('all_neg', 'all_pos'),
  variable.name = 'pos_neg',
  value.name = 'pos_neg_value',
  variable.factor = FALSE
)
# Overall wide table; it is overwritten by the per-subset table inside the loop.
tf_agg_wide <- dcast(
  tf_agg_melt,
  `Event Type` + name + cCRE_type + pos_neg ~ mark,
  fun.aggregate = as.integer,
  value.var = 'pos_neg_value',
  fill = 0L
)

for (this_event in tf_agg[, unique(`Event Type`)]) {
  for (this_cCRE_type in tf_agg[, unique(cCRE_type)]) {
    for (this_pos_neg in tf_agg_melt[, unique(pos_neg)]) {
      melt_sub <- tf_agg_melt[cCRE_type == this_cCRE_type & `Event Type` == this_event & this_pos_neg == pos_neg & pos_neg_value == TRUE]
      if (nrow(melt_sub) == 0) {
        next
      }
      # One 0/1 column per mark for the TFs of this subset.
      tf_agg_wide <- dcast(
        melt_sub,
        `Event Type` + name + cCRE_type ~ mark,
        fun.aggregate = as.integer,
        value.var = 'pos_neg_value',
        fill = 0L
      )
      plot_title <- paste(this_event, this_cCRE_type, this_pos_neg)

      tryCatch({
        upset_plot <- upset(tf_agg_wide, sets.bar.color = "#56B4E9", order.by = "freq", nsets = 7)
        print(upset_plot)
        grid.text(plot_title, x = 0.65, y = 0.95, gp = gpar(fontsize = 20))
      }, error = function(e) {
        if (conditionMessage(e) == "'x' must be an array of at least two dimensions") {
          # This error is handled by falling back to a bar chart when exactly
          # one mark column is present.
          cols_to_keep <- names(tf_agg_wide)[!names(tf_agg_wide) %in% lhs]
          if (length(cols_to_keep) == 1) {
            print(ggplot(tf_agg_wide, aes(x = get(cols_to_keep))) + geom_bar() + labs(title = plot_title, x = cols_to_keep))
          }
        } else {
          # Report unexpected failures instead of dropping into the debugger,
          # which does nothing useful in a non-interactive render.
          warning("UpSet plot failed for ", plot_title, ": ", conditionMessage(e), call. = FALSE)
        }
      })

      # GO enrichment of this subset's TFs against all TFs in tf_dt.
      gostres <- gost(
        query = tf_agg_wide[, name],
        organism = "hsapiens", ordered_query = FALSE,
        multi_query = FALSE, significant = FALSE, exclude_iea = FALSE,
        measure_underrepresentation = FALSE, evcodes = FALSE,
        user_threshold = sig_threshold, correction_method = "fdr",
        domain_scope = "custom", custom_bg = tf_dt[, unique(name)],
        numeric_ns = "", sources = c('GO'), as_short_link = FALSE
      )
      # if(!is.null(gostres))
      #   print(gostplot(gostres, capped = TRUE, interactive = FALSE))
    }
  }
}
```

```{r, eval=FALSE}
library(AnnotationDbi)
library(org.Hs.eg.db)
library(GO.db)
# Identifier column requested from org.Hs.eg.db for both gene lists.
protein_id <- 'SYMBOL'

# first get splicing factors
go_id_splicing <- AnnotationDbi::select(
  GO.db,
  keys = "mRNA splicing, via spliceosome",
  columns = "GOID",
  keytype = "TERM"
)[["GOID"]]
splicing_factors <- unique(
  AnnotationDbi::select(
    org.Hs.eg.db,
    keys = go_id_splicing,
    columns = protein_id,
    keytype = "GOALL"
  )[[protein_id]]
)
# here we manually produced data/splicing_factor_mapping.tab

# now get tfs
go_id_transcription_factor <- AnnotationDbi::select(
  GO.db,
  keys = "DNA-binding transcription factor activity",
  columns = "GOID",
  keytype = "TERM"
)[["GOID"]]
transcription_factors <- unique(
  AnnotationDbi::select(
    org.Hs.eg.db,
    keys = go_id_transcription_factor,
    columns = protein_id,
    keytype = "GOALL"
  )[[protein_id]]
)
# here we manually produced data/transcription_factor_mapping.tab
# here we manually produced data/transcription_factor_mapping.tab
```

```{r, fig.width=10, eval=FALSE}
#, fig.width=15, fig.height=10}
#, fig.height=35, fig.width=14}
# Non-zero, non-intercept coefficients of events that are listed in
# ids_non_zero_enhancers but not in gene_expr_ids (both defined elsewhere).
non_zero_enhancers_wo_gene_expr <- coef_dt[
  ID %in% ids_non_zero_enhancers &
    !(ID %in% gene_expr_ids) &
    coefs != 0 &
    feature != '(Intercept)'
]
print(paste(non_zero_enhancers_wo_gene_expr[, uniqueN(ID)], 'events left'))

# Order the x axis with the features containing 'enhancer' first.
all_features <- coef_dt[, unique(feature)]
enhancer_ids <- grep('enhancer', all_features, fixed = TRUE)
enhancer_features <- all_features[enhancer_ids]
# setdiff() keeps every remaining feature even when nothing matches
# 'enhancer'; negative indexing with an empty index vector would return no
# features at all and turn the whole axis into NA.
feature_levels <- c(enhancer_features, setdiff(all_features, enhancer_features))

ggplot(
  non_zero_enhancers_wo_gene_expr,
  aes(y = coefs, x = factor(feature, levels = feature_levels), color = `Event Type`)
) +
  geom_boxplot() +
  geom_jitter() +
  labs(x = 'feature', title = 'nonzero coeffficients for each event with an enhancer in 5kb window and > 300 samples') +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) #+ facet_wrap(~ ID, scales='free_x')
```

```{r, eval=FALSE}
# Ad-hoc inspection of the events selected in non_zero_enhancers_wo_gene_expr;
# every expression below is only printed.

# Sanity check: selecting events by ID membership and by using the sorted IDs
# as row positions should give the same rows (i.e. ID equals the row number).
all.equal(event_dt[ID %in% non_zero_enhancers_wo_gene_expr[, unique(ID)]], event_dt[non_zero_enhancers_wo_gene_expr[, sort(unique(ID))]])
# gene_id of those events, and the same events as entries of event_gr.
event_dt[non_zero_enhancers_wo_gene_expr[, sort(unique(ID))], 'gene_id']
event_gr[non_zero_enhancers_wo_gene_expr[, sort(unique(ID))]]
# Enhancer annotation from a BED file (NOTE(review): "F5" presumably stands
# for FANTOM5 - confirm).
enhancers <- rtracklayer::import('data/F5.hg38.enhancers.bed')
# Overlaps between events and enhancers, allowing a gap of up to `vicinity`
# (defined elsewhere) and ignoring strand.
enhancer_hits <- findOverlaps(event_gr, enhancers, maxgap = vicinity, ignore.strand=TRUE)
# Enhancers hit by any of the selected events.
enhancers[to(enhancer_hits[from(enhancer_hits) %in% non_zero_enhancers_wo_gene_expr[, unique(ID)]])]
# Hits of one hard-coded event index.
enhancer_hits[from(enhancer_hits) == 719]
head(enhancers)
```