-
Notifications
You must be signed in to change notification settings - Fork 0
/
R-error-analysis-viz.R
145 lines (118 loc) · 6.66 KB
/
R-error-analysis-viz.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
library(lme4)
library(sjPlot)
library(tidyverse)
library(stringr)
library(arrow)
setwd("/Users/moritzlaurer/Dropbox/PhD/Papers/meta-metrics/meta-metrics-repo/")
d = read_parquet("./results/df_test_concat.parquet.gzip") |>
# this analysis is only conducted on experiments with biased training data
filter(data_train_biased == TRUE) |>
# new columns: biased_row is 1 if the text in df_test came from the biased group df_train was sampled from
# using (partial) string matching here because group_members_train are 3 countries like "USA|DEU|FRA" and group_members_test is only one string like "USA"
#mutate(biased_row = as.numeric(group_members_test == group_members_train), error = as.numeric(label_pred != label_gold)) |>
mutate(biased_row = as.numeric(str_detect(group_members_train, fixed(group_members_test)))) |>
# add error column
mutate(error = as.numeric(label_pred != label_gold)) |>
# clean classifier names
mutate(classifier = recode(method, 'nli_short' = 'BERT-NLI', 'nli_void' = 'BERT-NLI-void', 'standard_dl' = 'BERT-base', 'classical_ml' = 'logistic reg.')) |>
mutate(group_col = recode(group_col, 'pres_party' = 'party', 'ISO_A3' = 'country_3', 'country_iso' = 'country', 'parfam_text' = 'party_fam')) |>
mutate(dataset = recode(dataset, 'cap-merge' = 'CAP-2', 'cap-sotu' = 'CAP-SotU', 'coronanet' = 'CoronaNet', 'pimpo' = 'PImPo')) |>
mutate(training_run = file_name)
# sample for faster testing
#set.seed(42)
# Get stratified sample
#d <- d %>%
# group_by(training_run) %>%
# sample_n(100) %>%
# ungroup() # Remove grouping
#m1 = glmer(error ~ -1 + classifier*biased_row + (1 | training_run), family=binomial, data=d)
#pred <- ggeffects::ggpredict(m1, terms = c("classifier", "biased_row")) |>
# as_tibble() |>
# mutate(intest=if_else(group==0, "Yes", "No"))
get_plotdata = function(dataset) {
# model with intercept to get confidence intervals
m1 = glmer(error ~ -1 + classifier*biased_row + (1 | training_run), family=binomial, data=dataset)
pred <- ggeffects::ggpredict(m1, terms = c("classifier", "biased_row")) |>
as_tibble() |>
mutate(intest=if_else(group==1, "Yes", "No"))
# model without intercept for more interpretable outputs
m2 = glmer(error ~ -1 + classifier + classifier:biased_row + (1 | training_run), family=binomial, data=dataset)
summary(m2)$coefficients |>
as.data.frame() |>
rownames_to_column("param") |>
as_tibble() |>
filter(str_detect(param, ":biased_row")) |>
mutate(x=str_remove_all(param, "^classifier|:biased_row$"),
predicted=exp(Estimate),
conf.low=exp(Estimate-`Std. Error`), conf.high=exp(Estimate+`Std. Error`)) |>
select(x, predicted, conf.low, conf.high) |>
add_column(intest="Odds ratio") |>
bind_rows(pred) |>
select(-std.error, -group) |>
mutate(z=if_else(intest=='Odds ratio', 'Odds ratio of error on\ngroup seen during training\nvs. unseen groups', 'Probability of making an error ') |> fct_rev(),
x=fct_reorder(x, predicted))
}
p_tot = get_plotdata(d)
# give explicit order to legend
p_tot$intest <- factor(p_tot$intest, levels = c("Yes", "No", "Odds ratio"))
## aggregate plot
ggplot(p_tot, aes(y=x, yend=x, x=predicted, xend=conf.high, color=intest)) +
geom_point(data=filter(p_tot, intest=="Yes"), position=position_nudge(y=.1)) +
geom_point(data=filter(p_tot, intest=="No"), position=position_nudge(y=-.1)) +
geom_point(data=filter(p_tot, intest=="Odds ratio")) +
geom_segment(data=filter(p_tot, intest=="Yes"), aes(x=conf.low), position=position_nudge(y=.1)) +
geom_segment(data=filter(p_tot, intest=="No"), aes(x=conf.low), position=position_nudge(y=-.1)) +
geom_segment(data=filter(p_tot, intest=="Odds ratio"), aes(x=conf.low)) +
geom_vline(data=filter(p_tot, intest=="Odds ratio") |> add_column(xx=1),
mapping=aes(xintercept=xx), color="grey", lty=2) +
theme_classic() +
#scale_color_discrete(name="Test text from same group as training texts?", breaks=c("No", "Yes", "Odds ratio")) +
scale_color_manual(name="Test text from same group as training texts?",
values=c("Yes"="#F8766D", "No"="#619CFF", "Odds ratio"="#00BA38"),
breaks=c("Yes", "No", "Odds ratio")) +
theme(panel.grid.major.y = element_line(),
legend.position = "bottom") +
xlab("") +
ylab("") +
facet_grid(cols=vars(z), scales = "free", space="free")
## disaggregated analysis
d <- d |> mutate(label=str_c(dataset, group_col, sep="-"))
get_plotdata_subset <- function(lbl) {
dataset = dataset = d |> filter(label == lbl)
get_plotdata(dataset) |> add_column(label=lbl)
}
dsets = map(unique(d$label), get_plotdata_subset, .progress = TRUE) |> list_rbind()
dsets <- dsets |> mutate(dataset=str_extract(label, "(.*)-", group=1),
group=str_extract(label, ".*-(.*)", group=1))
# give explicit order to legend
dsets$intest <- factor(dsets$intest, levels = c("Yes", "No", "Odds ratio"))
# adjust order of classifiers
dsets$x <- factor(as.factor(dsets$x), levels = c("BERT-NLI", "BERT-NLI-void", "BERT-base", "logistic reg."))
# disaggregated plot
plot_disaggregated <- ggplot(dsets, aes(y=x, yend=x, x=predicted, xend=conf.high, color=intest)) +
geom_point(data=filter(dsets, intest=="Yes"), position=position_nudge(y=.1)) +
geom_point(data=filter(dsets, intest=="No"), position=position_nudge(y=-.1)) +
geom_point(data=filter(dsets, intest=="Odds ratio")) +
geom_segment(data=filter(dsets, intest=="Yes"), aes(x=conf.low), position=position_nudge(y=.1)) +
geom_segment(data=filter(dsets, intest=="No"), aes(x=conf.low), position=position_nudge(y=-.1)) +
geom_segment(data=filter(dsets, intest=="Odds ratio"), aes(x=conf.low)) +
geom_vline(data=filter(dsets, intest=="Odds ratio") |> add_column(xx=1),
mapping=aes(xintercept=xx), color="grey", lty=2) +
theme_classic() +
#scale_color_discrete(name="Test data point from group in training set?") +
scale_color_manual(name="Test text from same group as training texts?",
values=c("Yes"="#F8766D", "No"="#619CFF", "Odds ratio"="#00BA38"),
breaks=c("Yes", "No", "Odds ratio")) +
theme(panel.grid.major.y = element_line(),
legend.position = "bottom") +
xlab("") +
ylab("") +
ggh4x::facet_nested(dataset+group ~ z, scales = "free", space="free")
# adjust x-axis for odds ratio plot
# convert to ggplotGrob and inspect
plot_disaggregated_g <- ggplotGrob(plot_disaggregated)
plot_disaggregated_g$widths
# identify the two panels. They should have a 'null' dimension as they're relative. g$widths[5] was the width of the first panel, so let's increase it
plot_disaggregated_g$widths[5] <- unit(2, 'null')
# plot
grid::grid.draw(plot_disaggregated_g)