02_network_modeling.Rmd

---
title: "Network modeling"
output: github_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, cache = TRUE, autodep = TRUE)
refined = 'intermediates'
private = 'private'
data = 'data'
library(readr)
library(dplyr)
library(stringr)
library(broom)
library(survival)
library(icenReg)
```

These analyses were used to generate the network plots in figure 6, as well as the per-scenario result tables written to `intermediates/network_results`.

# Analysis of CST Occurrence vs Immunological Parameters - quasi-Poisson Duration

Iterates through every pairwise combination of CST and immunological parameter (either IST or metacluster abundance at one of the three timepoints) and tests for a significant association between the CST and the immune parameter in both adjusted and unadjusted models.

```{r, echo = FALSE, include = FALSE}

# Fit every (response, variable-of-interest) pair and collect the coefficient
# table and the null-vs-full model comparison for each pair.
#
# Arguments:
#   joined_tbl      data frame holding responses, predictors and covariates.
#   ist_fields      names of the columns tested one at a time as `voi`.
#   microbe_fields  names of the columns used one at a time as `response`.
#   null_formula    formula of the null model (should include all confounders);
#                   the full model is the null model plus `voi`.
#   regression_fcn  function(formula, data) returning a fit that supports
#                   anova() and tidy().
#
# Every column of joined_tbl that is not named in ist_fields or microbe_fields
# is carried along as a covariate, and rows with an NA in ANY column are
# dropped before fitting.
#
# Returns one tibble: the tidy() rows of each full model (tagged with
# `response`, `voi` and `produced_warning`) plus one `term == 'anova'` row per
# pair holding the p-value of the null-vs-full comparison. Pairs that are
# skipped or fail to fit contribute no rows.
pairwise_regression = function(joined_tbl, ist_fields, microbe_fields, null_formula, regression_fcn){
  ist_tbl <- joined_tbl[ist_fields]
  response_tbl <- joined_tbl[microbe_fields]
  extra_tbl <- joined_tbl[setdiff(names(joined_tbl), c(ist_fields, microbe_fields))]

  full_formula <- update(null_formula, . ~ . + voi)
  # quietly() captures warnings so they can be reported per fit instead of
  # being printed for every pair
  warn_trapped_fcn <- purrr::quietly(regression_fcn)

  # one slot per pair; slots of skipped / failed pairs stay NULL and are
  # ignored by bind_rows()
  all_results <- vector('list', ncol(response_tbl) * ncol(ist_tbl))
  i <- 0L

  for (cst_i in seq_len(ncol(response_tbl))) {
    response_nm <- names(response_tbl)[cst_i]
    for (voi_i in names(ist_tbl)) {
      i <- i + 1L
      this_data <- tibble(response = response_tbl[[cst_i]], voi = ist_tbl[[voi_i]]) %>%
        bind_cols(extra_tbl) %>%
        na.omit()

      if (is.numeric(this_data$voi)) {
        # var() is NA when fewer than two complete rows remain; treat that
        # like a constant predictor instead of failing on `if (NA)`
        voi_var <- var(this_data$voi)
        if (is.na(voi_var) || voi_var < 1e-6) next
        this_data$voi <- scale(this_data$voi)
      }

      pair_result <- tryCatch({
        fit.null <- warn_trapped_fcn(null_formula, this_data)
        fit.full <- warn_trapped_fcn(full_formula, this_data)
        model_test <- anova(fit.null$result, fit.full$result, test = "Chisq")
        full_results <- tidy(fit.full$result) %>%
          mutate(response = response_nm,
                 voi = voi_i,
                 produced_warning = length(fit.full$warnings) > 0)
        # row 2 of the anova table is the full-vs-null comparison
        anova_p <- model_test$`Pr(>Chi)`[2]
        anova_result <- tibble(term = 'anova', p.value = anova_p, response = response_nm, voi = voi_i)
        bind_rows(full_results, anova_result)
      }, error = function(e) {
        # best effort: report which pair failed and keep going
        message("pairwise_regression: fit failed for response '", response_nm,
                "' and voi '", voi_i, "': ", conditionMessage(e))
        NULL
      })

      if (!is.null(pair_result)) all_results[[i]] <- pair_result
    }
  }
  bind_rows(all_results)
}


# tidy() method for fits of class "loglogistic aft": reshapes the
# `summaryParameters` table of summary(x) into the estimate / std.error /
# statistic / p.value / term layout.
`tidy.loglogistic aft` <- function(x, ...) {
  param_tbl <- as.data.frame(summary(x)$summaryParameters)
  # the parameter names live in the row names; expose them as a column
  param_tbl[["term"]] <- rownames(param_tbl)
  select(
    param_tbl,
    estimate = Estimate,
    std.error = Std.Error,
    statistic = `z-value`,
    p.value = p,
    term
  )
}

# anova() method for fits of class "loglogistic aft": likelihood-ratio test of
# `object` (null model) against the first model passed in `...` (full model).
#
# Reads `$llk` (log-likelihood) and `$var` (one row per parameter) from both
# fits. Further arguments in `...` (e.g. `test = "Chisq"`) are ignored.
#
# Returns a two-row data frame with columns `stat` (null minus full
# log-likelihood), `dof` and `Pr(>Chi)`. Both rows are identical: the result is
# padded so callers can read the second row, analogous to anova.glm.
`anova.loglogistic aft` = function(object, ...){
  others <- list(...)
  if (length(others) < 1L) {
    stop("anova for 'loglogistic aft' fits needs a second (full) model to compare against",
         call. = FALSE)
  }
  full <- others[[1]]

  ll_diff <- object$llk - full$llk
  dof <- nrow(full$var) - nrow(object$var)
  # upper tail computed directly: 1 - pchisq(...) rounds to exactly 0 for
  # strongly significant comparisons
  p_value <- pchisq(-2 * ll_diff, df = dof, lower.tail = FALSE)

  res <- data.frame(stat = ll_diff, dof = dof, `Pr(>Chi)` = p_value, check.names = FALSE)
  rbind(res, res) # pad so that the second row can be indexed, analogous to anova.glm
}


```

```{r, results = 'asis'}
# Grid of scenarios to fit: every combination of sampling site, association
# type and covariate specification (site varies fastest).
to_fit <- expand.grid(
  # microbiome sampling site
  site = c('REC', 'NAS'),
  # type of association:
  #   logit         = odds of CST ever
  #   duration      = number of days in CST
  #   time_to_event = rate of first entry to CST
  model_type = c('logit', 'duration', 'time_to_event'),
  # covariates: mod_only = MOD; baseline = MOD, GAB; full = MOD, GAB, ABX, MILK
  specification = c('mod_only', 'baseline', 'full'),
  stringsAsFactors = FALSE
)

# Regression wrappers with the (formula, data) signature expected by
# pairwise_regression(); both fit arm::bayesglm and differ only in the family.

# Binomial model (model_type 'logit').
logit <- function(form_, data_) {
  arm::bayesglm(form_, family = 'binomial', data = data_)
}

# Quasi-Poisson model (model_type 'duration').
duration <- function(form_, data_) {
  arm::bayesglm(form_, family = 'quasipoisson', data = data_)
}
# Interval-censored log-logistic AFT model (model_type 'time_to_event').
# Rows with a missing response are dropped, then `response` is replaced by an
# interval2-type Surv object spanning PrevSampleDOL to the original response
# before the model is fitted with icenReg::ic_par.
time_to_event <- function(form_, data_) {
  data_ <- dplyr::filter(data_, !is.na(response))
  data_$response <- Surv(
    time = data_$PrevSampleDOL,
    time2 = data_$response,
    type = "interval2"
  )
  ic_par(form_, data = data_, model = "aft", dist = "loglogistic")
}

# Render the scenario grid as a table (this chunk is knitted with results = 'asis').
knitr::kable(to_fit)
```

Scenarios to fit.

```{r}
# Antibiotic exposure: one row per Subject with the number of systemic
# antibiotics split by the `discharge` flag (FALSE -> abx_hospital,
# TRUE -> abx_discharge). abx_hospital is Z-scored, abx_discharge is centered.
# `id_cols` is passed by name: supplying it by position is deprecated in
# tidyr >= 1.3.0.
nabx_polish = read_csv('data/antibiotic_exposure.csv') %>%
  tidyr::pivot_wider(id_cols = Subject, names_from = 'discharge', values_from = 'Number of systemic antibiotic') %>%
  rename(abx_hospital = 'FALSE', abx_discharge = 'TRUE') %>%
  mutate(abx_hospital = (abx_hospital - mean(abx_hospital))/sd(abx_hospital),
         abx_discharge = abx_discharge - mean(abx_discharge))


# Per-subject milk_months value
milk_months = read_csv(file.path('intermediates', 'milk_subject.csv')) %>%
  select(Subject, milk_months)

# Perinatal human-milk indicator, renamed to a syntactic column name
hospital_humilk = read_csv('data/milk_hospital.csv') %>%
  rename(milk_perinatal = `Any Human Milk Perinatal`)

# Combine both milk tables on Subject (the key is spelled out rather than
# inferred) and center milk_months, ignoring subjects missing from either file.
milk = full_join(milk_months, hospital_humilk, by = 'Subject') %>%
  mutate(milk_months = milk_months - mean(milk_months, na.rm = TRUE))
```
                                                          
Set up confounders (antibiotics and milk).  Both are centered.  Hospital antibiotics (days) are additionally Z-scored, so their coefficient represents the effect of a one-SD increase.


```{r}
# Fit every scenario in `to_fit` (site x model_type x specification) with
# pairwise_regression() and write one CSV of results per scenario.
network_results_path = file.path(refined, 'network_results')
if(!dir.exists(network_results_path)) dir.create(network_results_path)

# One element per row of to_fit, in the same order
fit_scenario_result = list()
for(i in seq_len(nrow(to_fit))){
  this_fit = to_fit[i,]
  # Site-specific mapping table; gaBirth is shifted by 37 so that 0 corresponds to gaBirth == 37
  ist_extra_tbl <- read_tsv(file.path(refined, sprintf("%s_Surv_Mapping.txt", this_fit$site))) %>% mutate(gaBirth = gaBirth - 37)
  
  # Get the response data for this model type
  if(this_fit$model_type == 'logit'){ # Logistic
     # every column except Subject and Total is recoded to TRUE where it equals 'Yes'
     microbe_tbl <- read_tsv(file.path(refined, sprintf("%s_Logit_Input.txt", this_fit$site))) %>%
       mutate_at(.vars = vars(-Subject, -Total), .funs = function(x) x=='Yes')
  } else if(this_fit$model_type == 'duration'){  # Duration
    microbe_tbl =  read_tsv(file.path(refined, sprintf("%s_Duration_Input.txt", this_fit$site)))
    
  } else if(this_fit$model_type == 'time_to_event'){ #Survival
    microbe_tbl = read_tsv(file.path(refined, sprintf("%s_Surv_Input.txt",  this_fit$site)))
  }
  
  # Responses joined to the immune / covariate table; this is the table that gets modeled
  joined_tbl = left_join(microbe_tbl, ist_extra_tbl, by = 'Subject')
  # Look up the regression wrapper (logit / duration / time_to_event) by name
  regr_fcn = get(this_fit$model_type)
  
  # Null-model formula: mod_only is the starting point, the other specifications add covariates
  this_formula = formula(response ~  MOD)
  if(this_fit$specification == 'full'){
    this_formula = update(this_formula, . ~ . +  milk_perinatal + milk_months + abx_hospital + abx_discharge + gaBirth)
  } else if(this_fit$specification == 'baseline'){
     this_formula = update(this_formula, . ~ . + gaBirth)
  }
  
  # Total used as offset
  if(this_fit$model_type %in% c('logit', 'duration')){
    this_formula = update(this_formula, . ~. + offset(log(Total)))
    
  } else{
    # time_to_event: joined_tbl was already built above, so these two
    # reassignments only affect the column-name selection below
    ist_extra_tbl = left_join(microbe_tbl[c('PrevSampleDOL', 'Subject')], ist_extra_tbl, by = 'Subject')
    microbe_tbl = select(microbe_tbl, -PrevSampleDOL)
  }
  
  # Predictor columns are picked by name pattern (TPHE / CD4 / CD8 / ICS),
  # response columns by site prefix (REC / NAS)
  ist_fields = ist_extra_tbl %>% select(contains('TPHE'), contains('CD4'), contains('CD8'), contains('ICS')) %>% names()
  microbe_fields = microbe_tbl  %>% select(contains('REC'), contains('NAS')) %>% names()
  # Add the milk and antibiotic confounders (joined on their shared columns)
  joined_tbl2 = joined_tbl %>%
    left_join(milk) %>%
    left_join(nabx_polish)
  
  message("Fitting ", paste0(this_fit, collapse = "-"))
  fit_scenario_result[[i]] = pairwise_regression(joined_tbl2, ist_fields = ist_fields, microbe_fields = microbe_fields, null_formula = this_formula, regression_fcn = regr_fcn)
  # File name is <site>_<model_type>_<specification>_results.csv
  result_file = stringr::str_c(do.call(paste, c(list(sep = '_'), this_fit)), "_results.csv")
   
  write_csv(fit_scenario_result[[i]], file.path(network_results_path, result_file))
 
}

```

Fit all the scenarios.

# Top interactions

```{r, fig.width = 8, fig.height = 8}
# Limit x to the interval [-modulus, modulus]; values outside are replaced by
# the nearest bound, NA entries are left as they are.
clamp <- function(x, modulus = 5) {
  too_low <- which(x < -modulus)
  x <- replace(x, too_low, -modulus)
  too_high <- which(x > modulus)
  replace(x, too_high, modulus)
}

# Sign of the element of x with the largest absolute value (NAs are ignored).
# Returns NA_real_ when x is empty or entirely NA, so the result always has
# length one and can be used inside summarize().
sign_max = function(x) {
  idx <- which.max(abs(x))
  if (length(idx) == 0L) return(NA_real_)
  sign(x[idx])
}


library(ggplot2)
# Stack the per-scenario results; `scenario` is the position in
# fit_scenario_result and is used to attach the matching row of to_fit.
all_fits = bind_rows(fit_scenario_result, .id = 'scenario') %>%
  left_join(to_fit %>% mutate(scenario = as.character(seq_along(site))))

# Expanded variable names: 'TPHE_C<n>' becomes 'TPHE_CD4_C<n>', and a
# '_CD4_C<n>' / '_CD8_C<n>' part not preceded by 'TPHE' gets an '_ICS' prefix.
all_fits  = all_fits %>%
  mutate(voi_full = str_replace(voi, 'TPHE_(C[1-9]+)', 'TPHE_CD4_\\1'),
         voi_full = str_replace(voi_full, '(?<!TPHE)(_CD[48]_C[1-9]+)', '_ICS\\1'))

# FDR adjustment of the anova p-values within each model_type x specification
# (the result stays grouped by those two columns).
per_type_fdr = filter(all_fits, term == 'anova')  %>%
  group_by(model_type, specification) %>%
  mutate(fdr = p.adjust(p.value, 'fdr'))

# Direction of each association: sign of the largest-magnitude `voi` coefficient
per_type_sign = all_fits %>%
  filter(stringr::str_detect(term, stringr::fixed('voi'))) %>%
  group_by(model_type, specification, voi, response) %>%
  summarize(sign_max = sign_max(estimate))

all_fits = all_fits %>%
  left_join(per_type_sign) %>%
  left_join(per_type_fdr[c('fdr', intersect(names(per_type_sign), names(per_type_fdr)))])

# Smallest anova p-value per site x model_type
top = all_fits %>%
  filter(term == 'anova') %>%
  group_by(site, model_type) %>%
  arrange(p.value) %>%
  do(head(., n = 1))

# All coefficients of those top pairs, without the anova and intercept rows
top_coefs = semi_join(all_fits, top[c('scenario', 'voi', 'response')]) %>%
  anti_join(tibble(term = c('anova', '(Intercept)')))


ggplot(top_coefs, aes(x = term, y = estimate, ymin = estimate - std.error, ymax = estimate + std.error, color = clamp(-log10(p.value)))) +
  geom_pointrange() +
  facet_wrap(~model_type  + voi + response, scales = 'free') +
  coord_flip()


of_interest = tibble(response = c("NAS_8", 'REC_10'), voi = c('TPHE_IST_Disch', 'ICS_IST_1YR'))

# Output path passed positionally: the `path` argument name is deprecated in
# readr >= 1.4 (renamed to `file`); positional use works with either version.
write_csv(semi_join(all_fits, of_interest), 'intermediates/selected_network_effects.csv')
```

[Coefficients / etc for NAS_8 and REC_10](intermediates/selected_network_effects.csv)

[Full results](intermediates/network_results)

# Network plot figures

```{r}
# Convert a data frame to a matrix, using column `rownames_from` as the row
# names and every other column as a matrix column. NA entries become 0.
# Columns are selected list-style so that a single remaining column keeps its
# name and 2-d shape on plain data frames as well as tibbles.
enmatrix = function(x, rownames_from){
  value_cols <- setdiff(names(x), rownames_from)
  y <- as.matrix(x[value_cols])
  rownames(y) <- x[[rownames_from]]
  y[is.na(y)] <- 0
  y
}

# Signed, clamped -log10 p-value per (voi, response) pair; zeroed_pval is set
# to 0 for pairs that do not pass FDR < 0.1.
per_type_fdr = left_join(per_type_fdr, per_type_sign) %>%
  mutate(signed_pval = clamp(-log10(p.value), 8) * sign_max,
         zeroed_pval = signed_pval * (fdr< .1))


# One nested table per model_type x specification
per_type_nest = per_type_fdr %>%
  select(voi_full, response, zeroed_pval, model_type, specification) %>%
  tidyr::nest(data = c(zeroed_pval, voi_full, response))

# Adjacency matrix per scenario: rows are voi_full, columns are responses.
# `id_cols` is passed by name: supplying it by position is deprecated in
# tidyr >= 1.3.0.
per_type_nest$adj_matrix = purrr::map(
  per_type_nest$data,
  ~ tidyr::pivot_wider(.x, id_cols = voi_full, names_from = response, values_from = zeroed_pval) %>%
    enmatrix('voi_full')
)

# Number of non-zero entries in each adjacency matrix
per_type_nest %>%
  rowwise() %>%
  mutate(n_edges = sum(abs(adj_matrix)>0)) %>%
  select(model_type, specification, n_edges)

```

```{r networks, dev = c('png', 'pdf'), fig.width = 6, fig.height = 6}
# Lookup from metacluster family to the suffix / prefix used in variable names
mc_suffix = dplyr::tibble(Family = c('TPHE4', 'TPHE8', 'ICS4', 'ICS8'), 
                   suffix = c('t4', 't8', 'i4', 'i8'),
                   voi3_prefix = c('TPHE_CD4', 'TPHE_CD8', 'ICS_CD4', 'ICS_CD8'))
# Descriptive names for T cell subpop
# NOTE(review): `select(-X5)` relies on the unnamed fifth column being called
# X5; newer readr releases may name it differently (e.g. `...5`) - confirm
# against the installed readr.
metacluster_rn = readr::read_csv('intermediates/Metacluster Identities.csv') %>%
  select(-X5) %>%
  mutate(Family = Family %>% toupper(), Family = str_replace_all(Family, ' ', '')) %>%
  left_join(mc_suffix) %>%
  mutate(marker = str_c('Meta.Cluster_', Cluster, '_', suffix),
         voi3 = str_c(voi3_prefix, '_C', Cluster),
         Identity = make.unique(Identity))
#Bipartite Graph of CST vs Immunome Results

#Load packages (Probably don't even need half of them - I tried a lot of things before I settled on this relatively simple solution.)
library(ggplot2)
library(network)
library(GGally)
library(RColorBrewer)
library(tidyverse)

#Define constants

set.seed(11)

rec_csts = c("REC_4", "REC_1", "REC_2", "REC_9", "REC_10", "REC_5", "REC_8", "REC_6", "REC_3", "REC_13", "REC_7", "REC_11", "REC_12")
nas_csts = c("NAS_4", "NAS_1", "NAS_2", "NAS_9", "NAS_10", "NAS_5", "NAS_8", "NAS_6", "NAS_3", "NAS_13", "NAS_7", "NAS_11", "NAS_12")
#tests = c("logit", "surv", "binom", "alt_binom")
# tests = c("logit", "alt_surv")

# CSTs highlighted in the 'selected nodes' sub-network (duration model, full specification)
to_plot = tibble(
  response = c('REC_10', 'NAS_3', 'NAS_8', 'NAS_9'),
  model_type = 'duration',
  specification = 'full')
  
# Non-zero associations involving those CSTs
to_plot_response = semi_join(per_type_nest, to_plot) %>%  select(-adj_matrix) %>% unnest(cols = c(data)) %>% ungroup() %>% 
  inner_join(to_plot) %>% filter(!is.na(zeroed_pval) & abs(zeroed_pval) > 0) 

#Make a color gradient that will be used to color the edges
# (loop-invariant, so defined once before the loop)
rbPalFun0 <- colorRamp(c('darkblue', 'grey70',  'red'), space = 'Lab')

#Map edge weights (signed, clamped -log10 p-values) to colors in the gradient:
# weights are rescaled symmetrically around 0 onto [0, 1]
rbPalFun = function(x){
  sx = x*.5/max(abs(x)) + .5
  #sx = (x-max(abs(weights))/(max(weights)-min(weights)))
  rgb(rbPalFun0(sx)/255)
}

#Loop through the scenarios (model_type x specification) and draw one network for each
for (test in seq_len(nrow(per_type_nest))){
  
  #Adjacency matrix of immune parameters (rows) vs CSTs (columns) for this scenario
  adj.matrix = per_type_nest$adj_matrix[[test]]
    
  #Use that adjacency matrix to construct a network object that is bipartite
  net.obj <- network(adj.matrix, matrix.type = "bipartite", ignore.eval = FALSE, names.eval = "weights")
  
  #Nothing to draw when no association survived; skip before touching edge attributes
  if(sum(network::has.edges(net.obj)) == 0) next
  
  #Color the CST vertices according to the body site they represent
  network::set.vertex.attribute(net.obj, "color", ifelse(net.obj %v% "vertex.names" %in% rec_csts, "red", ifelse(net.obj %v% "vertex.names" %in% nas_csts, "blue", ifelse(grepl("TPHE", net.obj %v% "vertex.names"), "orange", "green"))))
  
  #Shape the vertices according to the visit they represent
  network::set.vertex.attribute(net.obj, "shape", ifelse(grepl("Birth", net.obj %v% "vertex.names"), 19, ifelse(grepl("Disch", net.obj %v% "vertex.names"), 17, ifelse(grepl("OneYear", net.obj %v% "vertex.names"), 15, 18))))
  
  #Make non-significant associations invisible
  # (the edge values are stored under "weights", see names.eval above)
  network::set.edge.attribute(net.obj, "alpha", ifelse(net.obj %e% "weights" == 0.0, 0, 1))
  
  #Keep only vertices that take part in at least one edge
  connected.net = network::get.inducedSubgraph(net.obj, which(network::has.edges(net.obj)))
  
  #Edge colors are computed from the network that is actually drawn, so the
  #color vector cannot get out of step with its edges
  weights = network::get.edge.attribute(connected.net, "weights")
  ecols <- rbPalFun(weights)
  
  print(ggnet2(connected.net, color = "color", shape = "shape", edge.color = ecols, label = TRUE, label.size = 3, fontface = "bold", edge.size = 1,layout.par = list(niter = 1000))  + 
          ggtitle(paste0(per_type_nest[test,'model_type'], ':', per_type_nest[test,'specification'])))
  
  #For the duration/full scenario also keep the sub-network around the selected
  #CSTs and their direct neighbors; it is drawn in the next chunk
  if(per_type_nest[test,'model_type'] == 'duration' && per_type_nest[test, 'specification'] == 'full'){
    to_plot_v = which((connected.net %v% "vertex.names") %in% to_plot$response)
    to_plot_neigh =  do.call(c, lapply(to_plot_v, network::get.neighborhood, x = connected.net))
    #subnet = network::get.inducedSubgraph(net.obj,  to_plot_v, alters = setdiff(seq_along(net.obj %v% "vertex.names"), to_plot_v))
    subnet = network::get.inducedSubgraph(connected.net,  union(to_plot_v, to_plot_neigh))
    ecols_sub <- rbPalFun(subnet %eattr% "weights")
  }

}

#ggplot2.multiplot(nas_graph, rec_graph, thr_graph, cols = 2)

#ggsave(paste(workDir, sprintf("%s_graph.pdf", site), sep = "/"), width = 15, height = 15, units = "in")

```


```{r  core_network, dev = c('png', 'pdf'), fig.width = 6, fig.height = 6}
 # Draw the sub-network around the selected CSTs (built in the previous chunk)
print(
  ggnet2(
    subnet,
    color = "color",
    shape = "shape",
    edge.color = ecols_sub,
    label = TRUE,
    label.size = 3,
    fontface = "bold",
    edge.size = 1,
    layout.par = list(niter = 1000)
  ) +
    ggtitle('duration:full (selected nodes)')
)

# Reusable pipeline that prepares coefficient rows for plotting:
#   - keeps the terms that involve `voi`
#   - is_anova: TRUE for every term other than the plain 'voi' term
#   - voi2 / voi3: display name, with the time-point part removed in voi3
#   - tp: time point parsed from the variable name
#   - cut_fdr: -log10(FDR) binned into [0,1], (1,2], (2,Inf] (missing FDR counts as 1)
#   - metacluster_nm: descriptive metacluster identity when available, else voi3
cleanup_coefs <- . %>%
  filter(str_detect(term, 'voi')) %>%
  mutate(
    is_anova = !str_detect(term, '^voi$'),
    voi2 = ifelse(is_anova, str_replace(term, 'voi', ''), voi_full),
    tp = str_extract(voi, 'OneYear|Disch|Birth'),
    voi3 = str_replace(voi2, '_?(OneYear(_)?|Disch(arge)?(_)?|Birth(_)?)', ''),
    cut_fdr = cut(
      -log10(ifelse(is.na(fdr), 1, fdr)),
      breaks = c(0, 1, 2, Inf),
      include.lowest = TRUE
    )
  ) %>%
  left_join(metacluster_rn) %>%
  mutate(metacluster_nm = coalesce(Identity, voi3))

# Every scenario-matched coefficient for any response or predictor that appears
# in a selected association
to_plot_coefse_all <- all_fits %>%
  semi_join(to_plot_response, by = c('model_type', 'specification', 'response')) %>%
  semi_join(to_plot_response, by = 'voi_full') %>%
  cleanup_coefs

# Only the selected associations themselves; metacluster names ordered by voi3
to_plot_coefse_sub <- all_fits %>%
  semi_join(to_plot_response) %>%
  cleanup_coefs %>%
  mutate(metacluster_nm = factor(metacluster_nm, levels = unique(metacluster_nm[order(voi3)])))


# Coefficient plot: point estimate (clamped to +/-2) with a +/-1.96 SE range
plt <- ggplot(
  to_plot_coefse_all,
  aes(
    y = clamp(estimate, 2),
    ymin = estimate-1.96*std.error,
    ymax = estimate + std.error*1.96,
    x = voi3,
    color = tp
  )
) +
  facet_grid(is_anova ~ response, space = 'free_y', scales = 'free') +
  xlab('') +
  ylab('Log Fold Change') +
  geom_pointrange(position = position_dodge(width = .4)) +
  geom_hline(yintercept = 0, lty = 2) +
  theme_minimal() +
  coord_flip(ylim = c(-2, 2)) +
  scale_color_discrete('Time point')

# Same plot with transparency driven by the FDR bin
plt +
  aes(alpha = cut_fdr) +
  scale_alpha_manual(values = c('[0,1]'=.3, '(1,2]'=.7, '(2,Inf]'=1))


# Same plot restricted to the selected associations
plt %+% to_plot_coefse_sub
```


```{r core_networks_mc, opts.label='core_networks', fig.width = 8, dev = c('png', 'pdf')}
# Coefficient plot for the selected associations, with the x aesthetic switched
# from voi3 to the descriptive metacluster name.
(plt  %+% to_plot_coefse_sub) + aes(x = metacluster_nm)
```

```{r flow_mfi}
# Load the saved metacluster MFI object
fsom_expr = readRDS('flowcytometry/intermediates/metacluster_mfi.rds')
# Stack the tables in fsom_expr$descaled (list names go into `population`),
# reshape every column except cell_clustering and population to long format,
# and drop missing values.
descaled = bind_rows(fsom_expr$descaled, .id = 'population') %>% pivot_longer(cols = c(-cell_clustering, -population)) %>% filter(!is.na(value))


```

# Targeted analysis of Alloiococcus abundance, Tphe5, and acute illness


```{r}
library(readr)
library(forcats)
library(lme4)
library(geepack)

#Read in mapping file with metadata including Alloiococcus abundance, Tphe5 at birth or discharge, and acute illness
md.nas <- read_delim(file.path(refined, "NAS_Focused_Mapping.txt"), "\t", escape_double = FALSE, trim_ws = TRUE, guess_max = 3600)
#Use the first column as row names and drop it from the table
rns <- md.nas[[1]]
md.nas <- md.nas[ , 2:ncol(md.nas)]
rownames(md.nas) <- rns
#Turn it into a dataframe
md.nas <- data.frame(md.nas)
#Extract only post-discharge samples
md.post <- md.nas[which(md.nas$PostInitialDischarge == "Yes"), ]

#Center and scale numerical variables and convert categorical variables to factors
md.post$Subject <- factor(md.post$Subject)
md.post$GAB <- c(scale(md.post$gaBirth, center = TRUE, scale = TRUE))
md.post$nDOL <- c(scale(md.post$DOL, center = TRUE, scale = TRUE))
#Collapse the two non-vertex delivery modes into a single level
md.post$MOD <- factor(md.post$MOD) %>% fct_collapse(not_vaginal_vertex = c("Vaginal_Breech", "Caesarean_Section"))
md.post$IllnessVisit <- factor(md.post$IllnessVisit)
md.post$TPHE_5 <- factor(md.post$TPHE_5) #This is a binary variable with an affirmative value if the subject exhibited TPHE 5 at either birth or discharge
md.post$allo_counts <- round(md.post$Reads*md.post$Alloiococcus) #Convert Alloiococcus relative abundance to counts
#Attach the milk and antibiotic confounders
md.post = left_join(md.post, milk, by = 'Subject') %>%
  left_join(nabx_polish, by = 'Subject')
md.post$Subject = factor(md.post$Subject)
#geeglm() identifies clusters from runs of consecutive rows with the same id,
#so each subject's samples must be contiguous; order() is stable, so the
#within-subject row order is unchanged
md.post <- md.post[order(md.post$Subject), ]

#Test Alloiococcus and TPHE_5 as predictors of acute illness, both by themselves and jointly, controlling for confounders.
summary(glmer(IllnessVisit ~ Alloiococcus + nDOL + GAB + MOD + milk_months + milk_perinatal + abx_hospital + abx_discharge + (1|Subject), data = md.post, family = binomial))
summary(glmer(IllnessVisit ~ TPHE_5 + nDOL + GAB + MOD + (1|Subject), data = md.post, family = binomial))
summary(glmer(IllnessVisit ~ Alloiococcus + TPHE_5 + nDOL + GAB + MOD + (1|Subject), data = md.post, family = binomial))

#Test TPHE_5 and acute illness as predictors of Alloiococcus abundance, both by themselves and jointly, controlling for confounders.
#Poisson GEE with an exchangeable working correlation within subject and log(Reads) as offset
summary(geeglm(allo_counts ~ DOL*gaBirth + MOD + TPHE_5, id = Subject, corstr = "exchangeable", family = poisson(link = 'log'), data = md.post, offset = log(Reads)))
summary(geeglm(allo_counts ~ DOL*gaBirth + MOD + IllnessVisit +  milk_perinatal + abx_hospital + abx_discharge, id = Subject, corstr = "exchangeable", family = poisson(link = 'log'), data = md.post, offset = log(Reads)))
summary(geeglm(allo_counts ~ DOL*gaBirth + MOD + TPHE_5 + IllnessVisit, id = Subject, corstr = "exchangeable", family = poisson(link = 'log'), data = md.post, offset = log(Reads)))

```