From a61d67806a5212488b4c574078f80467cbd8cf5d Mon Sep 17 00:00:00 2001 From: lelaboratoire Date: Mon, 22 Mar 2021 10:32:35 -0400 Subject: [PATCH] update regression --- 10.visualize-gender.Rmd | 57 +- 11.visualize-name-origins.Rmd | 36 +- 14.us-name-origin.Rmd | 26 +- docs/091.draw-roc.html | 433 ++-- docs/092.save-raws-to-Rdata.html | 298 +-- docs/093.summary-stats.html | 541 ++--- docs/10.visualize-gender.html | 452 ++-- docs/11.visualize-name-origins.html | 736 +++++-- docs/12.analyze-affiliation.html | 225 +- docs/13.us-race-analysis.html | 45 +- docs/14.us-name-origin.html | 732 +++++-- docs/15.analyze-2020.html | 214 +- figs/2020-01-31_groupings.png | Bin 522764 -> 433620 bytes figs/2020-01-31_groupings.svg | 2991 +++++++++++++-------------- figs/enrichment-plot.png | Bin 207316 -> 150904 bytes figs/fig_s5.png | Bin 343732 -> 207401 bytes figs/fig_s5.svg | 1768 ++++++++-------- figs/fig_s7.png | Bin 346822 -> 205525 bytes figs/fig_s7.svg | 1728 ++++++++-------- figs/gender_breakdown.png | Bin 82564 -> 55286 bytes figs/gender_breakdown.svg | 364 ++-- figs/region_breakdown.png | Bin 278745 -> 173042 bytes figs/region_breakdown.svg | 1850 ++++++++--------- figs/us_name_origin.png | Bin 276554 -> 169206 bytes figs/us_name_origin.svg | 1828 ++++++++-------- figs/us_racial_makeup.png | Bin 245733 -> 152094 bytes figs/us_racial_makeup.svg | 1206 +++++------ figs/versions.pdf | Bin 51132 -> 51294 bytes figs/versions.png | Bin 782920 -> 786394 bytes figs/versions.pptx | Bin 45986 -> 45768 bytes knit-all.R | 12 +- 31 files changed, 8387 insertions(+), 7155 deletions(-) diff --git a/10.visualize-gender.Rmd b/10.visualize-gender.Rmd index 46c7d4c..3bec7dd 100644 --- a/10.visualize-gender.Rmd +++ b/10.visualize-gender.Rmd @@ -112,11 +112,31 @@ iscb_pubmed_sum %>% summarise(prob_female_avg = mean(mean_prob)) ``` - ### Supplementary Figure S1 {#sup_fig_s1} -Increasing trend of honorees who were women in each honor category, especially in the group of ISCB Fellows, which markedly increased after 2015. -```{r eval=FALSE} +Additional fig. 1 with separated keynote speakers and fellows + +```{r} +iscb_pubmed %>% + ungroup() %>% + mutate(type2 = case_when( + journal == 'ISCB Fellow' ~ 'ISCB Fellows', + type == 'Keynote speakers/Fellows' ~ 'Keynote speakers', + TRUE ~ 'Pubmed authors' + )) %>% + group_by(type2, year, gender) %>% + summarise( + mean_prob = mean(weighted_probs), + se_prob = sqrt(var(probabilities) * sum(weight^2)/(sum(weight)^2)), + me_prob = alpha_threshold * se_prob, + .groups = 'drop' + ) %>% + gender_breakdown('main', fct_rev(type2)) +``` + + + +```{r eval=FALSE, include=FALSE} # By conference: # fig_1d <- bind_rows(iscb_gender) %>% # gender_breakdown(category = 'sub', journal) + @@ -150,8 +170,9 @@ get_p <- function(inte, colu){ ```{r} iscb_lm <- iscb_pubmed %>% filter(gender == 'probability_female', !is.na(weighted_probs)) %>% - mutate(type = as.factor(type), - type = relevel(type, ref = 'Pubmed authors')) + mutate(type = as.factor(type)) %>% + mutate(type = relevel(type, ref = 'Pubmed authors'), + year = as.factor(year)) main_lm <- glm(type ~ year + weighted_probs, data = iscb_lm, family = 'binomial') @@ -160,15 +181,35 @@ summary(main_lm) ``` The two groups of scientists did not have a significant association with the gender predicted from fore names (_P_ = `r get_p(main_lm, 'p.value')`). -Interaction terms do not predict `probabilities` over and above the main effect of group of scientists and year. +Interaction terms do not predict `type` over and above the main effect of gender probability and year. ```{r} -inte_lm <- glm(type ~ year * weighted_probs, - data = iscb_lm, family = 'binomial') +scaled_iscb <- iscb_lm +# scaled_iscb$s_prob <- scale(scaled_iscb$weighted_probs, scale = F) +# scaled_iscb$s_year <- scale(scaled_iscb$year, scale = F) +main_lm <- glm(type ~ year + weighted_probs, + data = scaled_iscb %>% mutate(year = as.factor(year)), + family = 'binomial') + +summary(main_lm) +inte_lm <- glm( + # type ~ scale(year, scale = F) * scale(weighted_probs, scale = F), + # type ~ s_year * s_prob, + type ~ year * weighted_probs, + data = scaled_iscb %>% mutate(year = as.factor(year)) + , + family = 'binomial') summary(inte_lm) anova(main_lm, inte_lm, test = 'Chisq') +mean(scaled_iscb$year) +mean(scaled_iscb$weighted_probs) ``` +```{r} +# inte_lm <- glm(type ~ (year * weighted_probs), +# data = iscb_lm, +# family = 'binomial') +``` ```{r} sessionInfo() diff --git a/11.visualize-name-origins.Rmd b/11.visualize-name-origins.Rmd index f1a951f..4286182 100644 --- a/11.visualize-name-origins.Rmd +++ b/11.visualize-name-origins.Rmd @@ -169,49 +169,51 @@ ggsave('figs/region_breakdown.svg', fig_4, width = 6.7, height = 5.5) ```{r} iscb_lm <- iscb_pubmed_oth %>% - ungroup() %>% - mutate(year = c(scale(year(year))), - type = as.factor(type) %>% relevel(ref = 'Pubmed authors')) + ungroup() %>% + mutate( + # year = c(scale(year(year))), + year = as.factor(year), + type = as.factor(type) %>% relevel(ref = 'Pubmed authors')) main_lm <- function(regioni){ - glm(type ~ weighted_probs + year, - data = iscb_lm %>% - filter(region == regioni, !is.na(probabilities)) , - family = 'binomial') + glm(type ~ year + weighted_probs, + data = iscb_lm %>% + filter(region == regioni, !is.na(probabilities)) , + family = 'binomial') } inte_lm <- function(regioni){ glm(type ~ year * weighted_probs, data = iscb_lm %>% - filter(region == regioni, !is.na(weighted_probs)), + filter(region == regioni, !is.na(weighted_probs)), family = 'binomial') } main_list <- lapply(large_regions, main_lm) -inte_list <- lapply(large_regions, inte_lm) names(main_list) <- large_regions lapply(main_list, summary) -lapply(inte_list, summary) +inte_list <- lapply(large_regions, inte_lm) +lapply(inte_list, summary) for (i in 1:4){ print(anova(main_list[[i]], inte_list[[i]], test = 'Chisq')) } - ``` +Interaction terms do not predict `type` over and above the main effect of name origin probability and year (_p_ > 0.01). ```{r echo = F} get_p <- function(i, colu){ broom::tidy(main_list[[i]]) %>% filter(term == 'weighted_probs') %>% - pull(colu) %>% - sprintf("%0.5g", .) + pull(colu) } + +print_p <- function(x) sprintf("%0.5g", x) ``` ## Conclusion - -A name coming from the group of honorees has significantly higher probability of being Celtic/English, $\beta_\textrm{Celtic/English} =$ `r get_p(1, 'estimate')` (_P_ = `r get_p(1, 'p.value')`), and lower probability of being East Asian, $\beta_\textrm{East Asian} =$ `r get_p(2, 'estimate')` (_P_ = `r get_p(2, 'p.value')`). The two groups of scientists did not have a significant association with names predicted to be European and in Other categories (_P_ = `r get_p(3, 'p.value')` and _P_ = `r get_p(4, 'p.value')`, respectively). - - +A Celtic/English name has `r exp(get_p(1, 'estimate'))` the odds of being selected as an honoree, significantly higher compared to other names ($\beta_\textrm{Celtic/English} =$ `r print_p(get_p(1, 'estimate'))`, _P_ = `r print_p(get_p(1, 'p.value'))`). +An East Asian name has `r exp(get_p(2, 'estimate'))` the odds of being selected as an honoree, significantly lower than to other names ($\beta_\textrm{East Asian} =$ `r print_p(get_p(2, 'estimate'))`, _P_ = `r print_p(get_p(2, 'p.value'))`). +The two groups of scientists did not have a significant association with names predicted to be European (_P_ = `r print_p(get_p(3, 'p.value'))`) or in Other categories (_P_ = `r print_p(get_p(4, 'p.value'))`). ### Supplementary Figure S5 {#sup_fig_s5} It's difficult to come to a conclusion for other regions with so few data points and the imperfect accuracy of our prediction. diff --git a/14.us-name-origin.Rmd b/14.us-name-origin.Rmd index b46f3d0..7e6b080 100644 --- a/14.us-name-origin.Rmd +++ b/14.us-name-origin.Rmd @@ -121,47 +121,51 @@ ggsave('figs/us_name_origin.svg', fig_us_name_origin, width = 6.5, height = 5.5) ```{r} iscb_lm <- iscb_pubmed_oth %>% ungroup() %>% - mutate(year = c(scale(year)), - type = relevel(as.factor(type), ref = 'Pubmed authors')) + mutate( + # year = c(scale(year)), + year = as.factor(year), + type = relevel(as.factor(type), ref = 'Pubmed authors')) main_lm <- function(regioni){ glm(type ~ year + weighted_probs, data = iscb_lm %>% - filter(region == regioni, !is.na(weighted_probs)), + filter(region == regioni, !is.na(weighted_probs)), family = 'binomial') } inte_lm <- function(regioni){ - glm(type ~ year * weighted_probs, + glm(type ~ weighted_probs*year, data = iscb_lm %>% filter(region == regioni, !is.na(weighted_probs)), family = 'binomial') } main_list <- lapply(large_regions, main_lm) -inte_list <- lapply(large_regions, inte_lm) names(main_list) <- large_regions lapply(main_list, summary) -lapply(inte_list, summary) +inte_list <- lapply(large_regions, inte_lm) +lapply(inte_list, summary) for (i in 1:4){ print(anova(main_list[[i]], inte_list[[i]], test = 'Chisq')) } ``` +Interaction terms do not predict `type` over and above the main effect of name origin probability and year (_p_ > 0.01). ```{r echo = F} -get_p <- function(i, colu){ +get_exp <- function(i, colu){ broom::tidy(main_list[[i]]) %>% filter(term == 'weighted_probs') %>% - pull(colu) %>% - sprintf("%0.5g", .) + pull(colu) } + +print_p <- function(x) sprintf("%0.5g", x) ``` ## Conclusion -A name coming from the group of honorees has significantly lower probability of being East Asian, $\beta_\textrm{East Asian} =$ `r get_p(2, 'estimate')` (_P_ = `r get_p(2, 'p.value')`). The two groups of scientists did not have a significant association with names predicted to be Celtic/English (_P_ = `r get_p(1, 'p.value')`), European (_P_ = `r get_p(3, 'p.value')`), or in Other categories (_P_ = `r get_p(4, 'p.value')`). - +An East Asian name has `r exp(get_exp(2, 'estimate'))` the odds of being selected as an honoree, significantly lower compared to other names ($\beta_\textrm{East Asian} =$ `r print_p(get_exp(2, 'estimate'))`, _P_ = `r print_p(get_exp(2, 'p.value'))`). +The two groups of scientists did not have a significant association with names predicted to be Celtic/English (_P_ = `r print_p(get_exp(1, 'p.value'))`), European (_P_ = `r print_p(get_exp(3, 'p.value'))`), or in Other categories (_P_ = `r print_p(get_exp(4, 'p.value'))`). ## Supplement diff --git a/docs/091.draw-roc.html b/docs/091.draw-roc.html index 90ab6fe..c2826b5 100644 --- a/docs/091.draw-roc.html +++ b/docs/091.draw-roc.html @@ -13,19 +13,6 @@ Plotting ROC curves - - + + + - + @@ -1527,6 +1583,7 @@ max-height: 500px; min-height: 44px; overflow-y: auto; + background: white; border: 1px solid #ddd; border-radius: 4px; } @@ -1655,22 +1712,22 @@ -
-
+
+
-
+
-