Skip to content

Commit

Permalink
Raw probabilities (#16)
Browse files Browse the repository at this point in the history
* use raw probabilities
  • Loading branch information
trangdata authored Jun 10, 2021
1 parent 1924eb8 commit 3bd34a6
Show file tree
Hide file tree
Showing 24 changed files with 1,113 additions and 1,071 deletions.
36 changes: 14 additions & 22 deletions 10.visualize-gender.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -65,21 +65,13 @@ iscb_pubmed <- iscb_gender_df %>%
values_to = "probabilities"
) %>%
filter(!is.na(probabilities)) %>%
group_by(type, year, gender) %>%
mutate(
pmc_citations_year = mean(adjusted_citations),
weight = adjusted_citations / pmc_citations_year,
weighted_probs = probabilities * weight
# weight = 1
)
group_by(type, year, gender)
iscb_pubmed_sum <- iscb_pubmed %>%
summarise(
# n = n(),
mean_prob = mean(weighted_probs),
# mean_prob = mean(probabilities, na.rm = T),
# sd_prob = sd(probabilities, na.rm = T),
se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)),
mean_prob = mean(probabilities, na.rm = T),
se_prob = sd(probabilities, na.rm = T),
# n = mean(n),
me_prob = alpha_threshold * se_prob,
.groups = "drop"
Expand All @@ -102,7 +94,7 @@ fig_1 <- iscb_pubmed_sum %>%
# group_by(year, type, gender) %>%
gender_breakdown("main", fct_rev(type))
fig_1
ggsave("figs/gender_breakdown.png", fig_1, width = 5, height = 2.5)
ggsave("figs/gender_breakdown.png", fig_1, width = 5, height = 2.5, dpi = 600)
ggsave("figs/gender_breakdown.svg", fig_1, width = 5, height = 2.5)
```

Expand Down Expand Up @@ -131,8 +123,8 @@ fig_1d <- iscb_pubmed %>%
) %>%
group_by(type2, year, gender) %>%
summarise(
mean_prob = mean(weighted_probs),
se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)),
mean_prob = mean(probabilities),
se_prob = sd(probabilities)/sqrt(n()),
me_prob = alpha_threshold * se_prob,
.groups = "drop"
) %>%
Expand Down Expand Up @@ -173,49 +165,49 @@ iscb_pubmed_sum %>%
```{r echo = F}
get_p <- function(inte, colu) {
broom::tidy(inte) %>%
filter(term == "weighted_probs") %>%
filter(term == "probabilities") %>%
pull(colu) %>%
sprintf("%0.5g", .)
}
```

```{r}
iscb_lm <- iscb_pubmed %>%
filter(gender == "probability_female", !is.na(weighted_probs)) %>%
filter(gender == "probability_female", !is.na(probabilities)) %>%
mutate(type = as.factor(type)) %>%
mutate(type = type %>% relevel(ref = "Pubmed authors"))
```

```{r}
scaled_iscb <- iscb_lm %>%
filter(year(year) >= 2002)
# scaled_iscb$s_prob <- scale(scaled_iscb$weighted_probs, scale = F)
# scaled_iscb$s_prob <- scale(scaled_iscb$probabilities, scale = F)
# scaled_iscb$s_year <- scale(scaled_iscb$year, scale = F)
main_lm <- glm(type ~ year + weighted_probs,
main_lm <- glm(type ~ year + probabilities,
data = scaled_iscb, # %>% mutate(year = as.factor(year))
family = "binomial"
)
broom::tidy(main_lm)
inte_lm <- glm(
# type ~ scale(year, scale = F) * scale(weighted_probs, scale = F),
# type ~ scale(year, scale = F) * scale(probabilities, scale = F),
# type ~ s_year * s_prob,
type ~ year * weighted_probs,
type ~ year * probabilities,
data = scaled_iscb, # %>% mutate(year = as.factor(year))
family = "binomial"
)
broom::tidy(inte_lm)
anova(main_lm, inte_lm, test = "Chisq")
# mean(scaled_iscb$year)
# mean(scaled_iscb$weighted_probs)
# mean(scaled_iscb$probabilities)
```

The two groups of scientists did not have a significant association with the gender predicted from fore names (_P_ = `r get_p(main_lm, 'p.value')`).
Interaction terms do not predict `type` over and above the main effect of gender probability and year.

```{r include=FALSE, eval=FALSE}
# inte_lm <- glm(type ~ (year * weighted_probs),
# inte_lm <- glm(type ~ (year * probabilities),
# data = iscb_lm,
# family = 'binomial')
```
Expand Down
34 changes: 12 additions & 22 deletions 11.visualize-name-origins.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,12 @@ iscb_pubmed_oth <- iscb_nat_df %>%
values_to = "probabilities"
) %>%
filter(!is.na(probabilities)) %>%
group_by(type, year, region) %>%
mutate(
pmc_citations_year = mean(adjusted_citations),
weight = adjusted_citations / pmc_citations_year,
weighted_probs = probabilities * weight
)
group_by(type, year, region)
iscb_pubmed_sum_oth <- iscb_pubmed_oth %>%
summarise(
mean_prob = mean(weighted_probs),
se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)),
mean_prob = mean(probabilities),
se_prob = sd(probabilities)/sqrt(n()),
me_prob = alpha_threshold * se_prob,
.groups = "drop"
)
Expand All @@ -127,7 +122,7 @@ for (conf in my_confs) {
iscb_nat[[i]] <- iscb_pubmed_oth %>%
filter(region != "OtherCategories", type != "Pubmed authors" & journal == conf) %>%
group_by(type, year, region, journal) %>%
summarise(mean_prob = mean(weighted_probs), .groups = "drop")
summarise(mean_prob = mean(probabilities), .groups = "drop")
}
```

Expand Down Expand Up @@ -169,7 +164,7 @@ fig_4b <- iscb_pubmed_sum_oth %>%
fig_4 <- cowplot::plot_grid(fig_4a, fig_4b, labels = "AUTO", ncol = 1, rel_heights = c(1.3, 1))
fig_4
ggsave("figs/region_breakdown.png", fig_4, width = 6.7, height = 5.5)
ggsave("figs/region_breakdown.png", fig_4, width = 6.7, height = 5.5, dpi = 600)
ggsave("figs/region_breakdown.svg", fig_4, width = 6.7, height = 5.5)
```

Expand All @@ -185,17 +180,17 @@ iscb_lm <- iscb_pubmed_oth %>%
type = as.factor(type) %>% relevel(ref = "Pubmed authors")
)
main_lm <- function(regioni) {
glm(type ~ year + weighted_probs,
glm(type ~ year + probabilities,
data = iscb_lm %>%
filter(region == regioni, !is.na(probabilities), year(year) >= 2002),
family = "binomial"
)
}
inte_lm <- function(regioni) {
glm(type ~ year * weighted_probs,
glm(type ~ year * probabilities,
data = iscb_lm %>%
filter(region == regioni, !is.na(weighted_probs), year(year) >= 2002),
filter(region == regioni, !is.na(probabilities), year(year) >= 2002),
family = "binomial"
)
}
Expand All @@ -215,7 +210,7 @@ Interaction terms do not predict `type` over and above the main effect of name o
```{r echo = F}
get_p <- function(i, colu) {
broom::tidy(main_list[[i]]) %>%
filter(term == "weighted_probs") %>%
filter(term == "probabilities") %>%
pull(colu)
}
Expand Down Expand Up @@ -326,21 +321,16 @@ iscb_pubmed_oth_lag <- iscb_nat_df %>%
values_to = "probabilities"
) %>%
filter(!is.na(probabilities), year(year) >= 2002) %>%
group_by(type, year, region) %>%
mutate(
pmc_citations_year = mean(adjusted_citations),
weight = adjusted_citations / pmc_citations_year,
weighted_probs = probabilities * weight
)
group_by(type, year, region)
iscb_lm_lag <- iscb_pubmed_oth_lag %>%
ungroup() %>%
mutate(type = as.factor(type) %>% relevel(ref = "Pubmed authors"))
main_lm <- function(regioni) {
glm(type ~ year + weighted_probs,
glm(type ~ year + probabilities,
data = iscb_lm_lag %>%
filter(region == regioni, !is.na(weighted_probs)),
filter(region == regioni, !is.na(probabilities)),
family = "binomial"
)
}
Expand Down
2 changes: 1 addition & 1 deletion 12.analyze-affiliation.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ enrichment_plot_right <- plot_obs_exp_right %>%
enrichment_plot <- cowplot::plot_grid(enrichment_plot_left, enrichment_plot_right,
rel_widths = c(1, 1.3))
enrichment_plot
ggsave('figs/enrichment-plot.png', enrichment_plot, width = 5.5, height = 3.5)
ggsave('figs/enrichment-plot.png', enrichment_plot, width = 5.5, height = 3.5, dpi = 600)
```

Expand Down
23 changes: 9 additions & 14 deletions 14.us-name-origin.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,12 @@ iscb_pubmed_oth <- iscb_nat_df %>%
values_to = "probabilities"
) %>%
filter(!is.na(probabilities)) %>%
group_by(type, year, region) %>%
mutate(
pmc_citations_year = mean(adjusted_citations),
weight = adjusted_citations / pmc_citations_year,
weighted_probs = probabilities * weight
)
group_by(type, year, region)
iscb_pubmed_sum_oth <- iscb_pubmed_oth %>%
summarise(
mean_prob = mean(weighted_probs),
se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)),
mean_prob = mean(probabilities),
se_prob = sd(probabilities)/sqrt(n()),
me_prob = alpha_threshold * se_prob,
.groups = "drop"
)
Expand Down Expand Up @@ -119,7 +114,7 @@ fig_us_name_originb <- iscb_pubmed_sum_oth %>%
fig_us_name_origin <- cowplot::plot_grid(fig_us_name_origina, fig_us_name_originb, labels = "AUTO", ncol = 1, rel_heights = c(1.3, 1))
fig_us_name_origin
ggsave("figs/us_name_origin.png", fig_us_name_origin, width = 6.5, height = 5.5)
ggsave("figs/us_name_origin.png", fig_us_name_origin, width = 6.5, height = 5.5, dpi = 600)
ggsave("figs/us_name_origin.svg", fig_us_name_origin, width = 6.5, height = 5.5)
```

Expand All @@ -134,17 +129,17 @@ iscb_lm <- iscb_pubmed_oth %>%
type = relevel(as.factor(type), ref = "Pubmed authors")
)
main_lm <- function(regioni) {
glm(type ~ year + weighted_probs,
glm(type ~ year + probabilities,
data = iscb_lm %>%
filter(region == regioni, !is.na(weighted_probs), year(year) >= 2002),
filter(region == regioni, !is.na(probabilities), year(year) >= 2002),
family = "binomial"
)
}
inte_lm <- function(regioni) {
glm(type ~ weighted_probs * year,
glm(type ~ probabilities * year,
data = iscb_lm %>%
filter(region == regioni, !is.na(weighted_probs), year(year) >= 2002),
filter(region == regioni, !is.na(probabilities), year(year) >= 2002),
family = "binomial"
)
}
Expand All @@ -165,7 +160,7 @@ Interaction terms do not predict `type` over and above the main effect of name o
```{r echo = F}
get_exp <- function(i, colu) {
broom::tidy(main_list[[i]]) %>%
filter(term == "weighted_probs") %>%
filter(term == "probabilities") %>%
pull(colu)
}
Expand Down
1 change: 1 addition & 0 deletions _output.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ html_document:
toc: true
toc_float: true
code_download: true
dpi: 600
Loading

0 comments on commit 3bd34a6

Please sign in to comment.