update regression

greenelab · Mar 22, 2021 · a61d678 · a61d678
1 parent 1e45ddc
commit a61d678
Show file tree

Hide file tree

Showing 31 changed files with 8,387 additions and 7,155 deletions.
diff --git a/10.visualize-gender.Rmd b/10.visualize-gender.Rmd
@@ -112,11 +112,31 @@ iscb_pubmed_sum %>%
   summarise(prob_female_avg = mean(mean_prob))
 ```
 
-
 ### Supplementary Figure S1 {#sup_fig_s1}
-Increasing trend of honorees who were women in each honor category, especially in the group of ISCB Fellows, which markedly increased after 2015. 
 
-```{r eval=FALSE}
+Additional fig. 1 with separated keynote speakers and fellows
+
+```{r}
+iscb_pubmed %>%
+  ungroup() %>% 
+  mutate(type2 = case_when(
+    journal == 'ISCB Fellow' ~ 'ISCB Fellows',
+    type == 'Keynote speakers/Fellows' ~ 'Keynote speakers',
+    TRUE ~ 'Pubmed authors'
+  )) %>% 
+  group_by(type2, year, gender) %>% 
+  summarise(
+    mean_prob = mean(weighted_probs),
+    se_prob = sqrt(var(probabilities) * sum(weight^2)/(sum(weight)^2)),
+    me_prob = alpha_threshold * se_prob,
+    .groups = 'drop'
+  ) %>% 
+  gender_breakdown('main', fct_rev(type2))
+```
+
+<!-- Increasing trend of honorees who were women in each honor category, especially in the group of ISCB Fellows, which markedly increased after 2015.  -->
+
+```{r eval=FALSE, include=FALSE}
 # By conference:
 # fig_1d <- bind_rows(iscb_gender) %>%
 #   gender_breakdown(category = 'sub', journal) +
@@ -150,8 +170,9 @@ get_p <- function(inte, colu){
 ```{r}
 iscb_lm <- iscb_pubmed %>% 
        filter(gender == 'probability_female', !is.na(weighted_probs)) %>% 
-  mutate(type = as.factor(type),
-         type = relevel(type, ref = 'Pubmed authors'))
+  mutate(type = as.factor(type)) %>% 
+  mutate(type = relevel(type, ref = 'Pubmed authors'),
+         year = as.factor(year))
 
 main_lm <- glm(type ~ year + weighted_probs, 
      data = iscb_lm, family = 'binomial')
@@ -160,15 +181,35 @@ summary(main_lm)
 ```
 
 The two groups of scientists did not have a significant association with the gender predicted from fore names (_P_ = `r get_p(main_lm, 'p.value')`).
-Interaction terms do not predict `probabilities` over and above the main effect of group of scientists and year.
+Interaction terms do not predict `type` over and above the main effect of gender probability and year.
 
 ```{r}
-inte_lm <- glm(type ~ year * weighted_probs, 
-   data = iscb_lm, family = 'binomial')
+scaled_iscb <- iscb_lm
+# scaled_iscb$s_prob <- scale(scaled_iscb$weighted_probs, scale = F)
+# scaled_iscb$s_year <- scale(scaled_iscb$year, scale = F)
+main_lm <- glm(type ~ year + weighted_probs, 
+     data = scaled_iscb %>% mutate(year = as.factor(year)), 
+     family = 'binomial')
+
+summary(main_lm)
+inte_lm <- glm(
+  # type ~ scale(year, scale = F) * scale(weighted_probs, scale = F),
+  # type ~ s_year * s_prob, 
+  type ~ year * weighted_probs,
+   data = scaled_iscb %>% mutate(year = as.factor(year))
+   , 
+   family = 'binomial')
 summary(inte_lm)
 anova(main_lm, inte_lm, test = 'Chisq')
+mean(scaled_iscb$year)
+mean(scaled_iscb$weighted_probs)
 ```
 
+```{r}
+# inte_lm <- glm(type ~ (year * weighted_probs), 
+#    data = iscb_lm, 
+#    family = 'binomial')
+```
 
 ```{r}
 sessionInfo()

diff --git a/11.visualize-name-origins.Rmd b/11.visualize-name-origins.Rmd
@@ -169,49 +169,51 @@ ggsave('figs/region_breakdown.svg', fig_4, width = 6.7, height = 5.5)
 
 ```{r}
 iscb_lm <- iscb_pubmed_oth %>% 
-       ungroup() %>% 
-       mutate(year = c(scale(year(year))),
-              type = as.factor(type) %>% relevel(ref = 'Pubmed authors'))
+  ungroup() %>% 
+  mutate(
+    # year = c(scale(year(year))),
+    year = as.factor(year),
+    type = as.factor(type) %>% relevel(ref = 'Pubmed authors'))
 main_lm <- function(regioni){
-  glm(type ~ weighted_probs + year, 
-     data = iscb_lm %>% 
-       filter(region == regioni, !is.na(probabilities)) ,
-     family = 'binomial')
+  glm(type ~ year + weighted_probs, 
+      data = iscb_lm %>% 
+        filter(region == regioni, !is.na(probabilities)) ,
+      family = 'binomial')
 }
 
 inte_lm <- function(regioni){
   glm(type ~ year * weighted_probs, 
       data = iscb_lm %>% 
-  filter(region == regioni, !is.na(weighted_probs)),
+        filter(region == regioni, !is.na(weighted_probs)),
       family = 'binomial')
 }
 
 main_list <- lapply(large_regions, main_lm)
-inte_list <- lapply(large_regions, inte_lm)
 names(main_list) <- large_regions
 lapply(main_list, summary)
-lapply(inte_list, summary)
 
+inte_list <- lapply(large_regions, inte_lm)
+lapply(inte_list, summary)
 for (i in 1:4){
   print(anova(main_list[[i]], inte_list[[i]], test = 'Chisq'))
 }
-
 ```
+Interaction terms do not predict `type` over and above the main effect of name origin probability and year (_p_ > 0.01).
 
 ```{r echo = F}
 get_p <- function(i, colu){
   broom::tidy(main_list[[i]]) %>% 
     filter(term == 'weighted_probs') %>% 
-    pull(colu) %>% 
-    sprintf("%0.5g", .)
+    pull(colu) 
 }
+
+print_p <- function(x) sprintf("%0.5g", x)
 ```
 
 ## Conclusion
-
-A name coming from the group of honorees has significantly higher probability of being Celtic/English, $\beta_\textrm{Celtic/English} =$ `r get_p(1, 'estimate')` (_P_ = `r get_p(1, 'p.value')`), and lower probability of being East Asian, $\beta_\textrm{East Asian} =$ `r get_p(2, 'estimate')` (_P_ = `r get_p(2, 'p.value')`). The two groups of scientists did not have a significant association with names predicted to be European and in Other categories (_P_ = `r get_p(3, 'p.value')` and _P_ = `r get_p(4, 'p.value')`, respectively).
-
-
+A Celtic/English name has `r exp(get_p(1, 'estimate'))` the odds of being selected as an honoree, significantly higher compared to other names ($\beta_\textrm{Celtic/English} =$ `r print_p(get_p(1, 'estimate'))`, _P_ = `r print_p(get_p(1, 'p.value'))`).
+An East Asian name has `r exp(get_p(2, 'estimate'))` the odds of being selected as an honoree, significantly lower than to other names ($\beta_\textrm{East Asian} =$ `r print_p(get_p(2, 'estimate'))`, _P_ = `r print_p(get_p(2, 'p.value'))`).
+The two groups of scientists did not have a significant association with names predicted to be European (_P_ = `r print_p(get_p(3, 'p.value'))`) or in Other categories (_P_ = `r print_p(get_p(4, 'p.value'))`).
 
 ### Supplementary Figure S5 {#sup_fig_s5}
 It's difficult to come to a conclusion for other regions with so few data points and the imperfect accuracy of our prediction.

diff --git a/14.us-name-origin.Rmd b/14.us-name-origin.Rmd
@@ -121,47 +121,51 @@ ggsave('figs/us_name_origin.svg', fig_us_name_origin, width = 6.5, height = 5.5)
 ```{r}
 iscb_lm <- iscb_pubmed_oth %>% 
   ungroup() %>% 
-  mutate(year = c(scale(year)),
-         type = relevel(as.factor(type), ref = 'Pubmed authors'))
+  mutate(
+    # year = c(scale(year)),
+    year = as.factor(year),
+    type = relevel(as.factor(type), ref = 'Pubmed authors'))
 main_lm <- function(regioni){
   glm(type ~ year + weighted_probs, 
       data = iscb_lm %>% 
-  filter(region == regioni, !is.na(weighted_probs)),
+        filter(region == regioni, !is.na(weighted_probs)),
       family = 'binomial')
 }
 
 inte_lm <- function(regioni){
-  glm(type ~ year * weighted_probs, 
+  glm(type ~ weighted_probs*year, 
       data = iscb_lm %>% 
   filter(region == regioni, !is.na(weighted_probs)),
       family = 'binomial')
 }
 
 main_list <- lapply(large_regions, main_lm)
-inte_list <- lapply(large_regions, inte_lm)
 
 names(main_list) <- large_regions
 lapply(main_list, summary)
-lapply(inte_list, summary)
 
+inte_list <- lapply(large_regions, inte_lm)
+lapply(inte_list, summary)
 for (i in 1:4){
   print(anova(main_list[[i]], inte_list[[i]], test = 'Chisq'))
 }
 ```
+Interaction terms do not predict `type` over and above the main effect of name origin probability and year (_p_ > 0.01).
 
 ```{r echo = F}
-get_p <- function(i, colu){
+get_exp <- function(i, colu){
   broom::tidy(main_list[[i]]) %>% 
     filter(term == 'weighted_probs') %>% 
-    pull(colu) %>% 
-    sprintf("%0.5g", .)
+    pull(colu) 
 }
+
+print_p <- function(x) sprintf("%0.5g", x)
 ```
 
 ## Conclusion
 
-A name coming from the group of honorees has significantly lower probability of being East Asian, $\beta_\textrm{East Asian} =$ `r get_p(2, 'estimate')` (_P_ = `r get_p(2, 'p.value')`). The two groups of scientists did not have a significant association with names predicted to be Celtic/English (_P_ = `r get_p(1, 'p.value')`), European (_P_ = `r get_p(3, 'p.value')`), or in Other categories (_P_ = `r get_p(4, 'p.value')`).
-
+An East Asian name has `r exp(get_exp(2, 'estimate'))` the odds of being selected as an honoree, significantly lower compared to other names ($\beta_\textrm{East Asian} =$ `r print_p(get_exp(2, 'estimate'))`, _P_ = `r print_p(get_exp(2, 'p.value'))`).
+The two groups of scientists did not have a significant association with names predicted to be Celtic/English (_P_ = `r print_p(get_exp(1, 'p.value'))`), European (_P_ = `r print_p(get_exp(3, 'p.value'))`), or in Other categories (_P_ = `r print_p(get_exp(4, 'p.value'))`).
 
 ## Supplement
 

diff --git a/docs/091.draw-roc.html b/docs/091.draw-roc.html
diff --git a/docs/092.save-raws-to-Rdata.html b/docs/092.save-raws-to-Rdata.html
diff --git a/docs/093.summary-stats.html b/docs/093.summary-stats.html
diff --git a/docs/10.visualize-gender.html b/docs/10.visualize-gender.html
diff --git a/docs/11.visualize-name-origins.html b/docs/11.visualize-name-origins.html
diff --git a/docs/12.analyze-affiliation.html b/docs/12.analyze-affiliation.html
diff --git a/docs/13.us-race-analysis.html b/docs/13.us-race-analysis.html
diff --git a/docs/14.us-name-origin.html b/docs/14.us-name-origin.html
diff --git a/docs/15.analyze-2020.html b/docs/15.analyze-2020.html
diff --git a/figs/2020-01-31_groupings.png b/figs/2020-01-31_groupings.png