-
Notifications
You must be signed in to change notification settings - Fork 2
/
evaluation.py
167 lines (123 loc) · 5.63 KB
/
evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import torch
import numpy as np
from torch.utils.data import DataLoader
from utils import collate_fn, DataCollatorForSeq2Cls
from sklearn.metrics import roc_auc_score
import os
def merge_keys(l, keys):
new_dict = {}
for key in keys:
new_dict[key] = []
for i in l:
new_dict[key] += i[key]
return new_dict
def evaluate_ood(args, model, tokenizer, features, ood, tag, keys=None):
if keys is None:
keys = ['cosine', 'maha', 'energy','softmax']
dataloader = DataLoader(features, batch_size=args.val_batch_size, collate_fn=DataCollatorForSeq2Cls(tokenizer))
in_scores = []
for batch in dataloader:
model.eval()
batch = {key: value.to(model.device) for key, value in batch.items()}
with torch.no_grad():
ood_keys = model.compute_ood(**batch)
in_scores.append(ood_keys)
in_scores = merge_keys(in_scores, keys)
dataloader = DataLoader(ood, batch_size=args.val_batch_size, collate_fn=DataCollatorForSeq2Cls(tokenizer))
out_scores = []
for batch in dataloader:
model.eval()
batch = {key: value.to(model.device) for key, value in batch.items()}
with torch.no_grad():
ood_keys = model.compute_ood(**batch)
out_scores.append(ood_keys)
out_scores = merge_keys(out_scores, keys)
outputs = {}
for key in keys:
# ins = np.array(in_scores[key], dtype=np.float64)
# outs = np.array(out_scores[key], dtype=np.float64)
# inl = np.ones_like(ins).astype(np.int64)
# outl = np.zeros_like(outs).astype(np.int64)
# scores = np.concatenate([ins, outs], axis=0)
# labels_in = np.concatenate([inl, outl], axis=0)
# labels_out = np.concatenate([np.zeros_like(ins).astype(np.int64), np.ones_like(outs).astype(np.int64)], axis=0)
if key == "mix":
auroc_in, aupr_in, fpr_95_in = get_measures(
np.array([1 if i != 150 else 0 for i in in_scores[key]], dtype=np.float64),
np.array([0 if i == 150 else 1 for i in out_scores[key]], dtype=np.float64))
auroc_out, aupr_out, fpr_95_out = get_measures(
np.array([1 if i == 150 else 0 for i in out_scores[key]], dtype=np.float64) * -1,
np.array([0 if i != 150 else 1 for i in in_scores[key]], dtype=np.float64) * -1)
else:
auroc_in, aupr_in, fpr_95_in = get_measures(np.array(in_scores[key], dtype=np.float64),
np.array(out_scores[key], dtype=np.float64))
outputs[tag + "_" + key + "_auroc_IN"] = auroc_in
outputs[tag + "_" + key + "_fpr95_IN"] = fpr_95_in
outputs[tag + "_" + key + "_aupr_IN"] = aupr_in
# outputs[tag + "_" + key + "_auroc_OUT"] = auroc_out
# outputs[tag + "_" + key + "_fpr95_OUT"] = fpr_95_out
# outputs[tag + "_" + key + "_aupr_OUT"] = aupr_out
return outputs
def get_auroc(key, prediction):
new_key = np.copy(key)
new_key[key == 0] = 0
new_key[key > 0] = 1
return roc_auc_score(new_key, prediction)
def get_fpr_95(key, prediction):
new_key = np.copy(key)
new_key[key == 0] = 0
new_key[key > 0] = 1
score = fpr_and_fdr_at_recall(new_key, prediction)
return score
def stable_cumsum(arr, rtol=1e-05, atol=1e-08):
out = np.cumsum(arr, dtype=np.float64)
expected = np.sum(arr, dtype=np.float64)
if not np.allclose(out[-1], expected, rtol=rtol, atol=atol):
raise RuntimeError('cumsum was found to be unstable: '
'its last element does not correspond to sum')
return out
def fpr_and_fdr_at_recall(y_true, y_score, recall_level=0.95, pos_label=1.):
y_true = (y_true == pos_label)
desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
y_score = y_score[desc_score_indices]
y_true = y_true[desc_score_indices]
distinct_value_indices = np.where(np.diff(y_score))[0]
threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]
tps = stable_cumsum(y_true)[threshold_idxs]
fps = 1 + threshold_idxs - tps
thresholds = y_score[threshold_idxs]
recall = tps / tps[-1]
last_ind = tps.searchsorted(tps[-1])
sl = slice(last_ind, None, -1)
recall, fps, tps, thresholds = np.r_[recall[sl], 1], np.r_[fps[sl], 0], np.r_[tps[sl], 0], thresholds[sl]
cutoff = np.argmin(np.abs(recall - recall_level))
return fps[cutoff] / (np.sum(np.logical_not(y_true)))
import sklearn.metrics as sk
def get_measures(_pos, _neg):
# pos = np.array(_pos[:]).reshape((-1, 1))
# neg = np.array(_neg[:]).reshape((-1, 1))
# examples = np.squeeze(np.vstack((pos, neg)))
# labels = np.zeros(len(examples), dtype=np.int32)
# labels[:len(pos)] += 1
ins = _pos
outs = _neg
inl = np.ones_like(ins).astype(np.int64)
outl = np.zeros_like(outs).astype(np.int64)
scores = np.concatenate([ins, outs], axis=0)
labels = np.concatenate([inl, outl], axis=0)
auroc = get_auroc(labels, scores)
aupr = sk.average_precision_score(labels, scores)
fpr = get_fpr_95(labels, scores)
return auroc, aupr, fpr
def plot_density(in_dis, out_dis, save_path, colors, tag='energy'):
import seaborn as sns
import matplotlib.pyplot as plt
f, ax = plt.subplots()
sns.kdeplot(in_dis, fill=True, color=colors[0],label='ID')
sns.kdeplot(out_dis, fill=True, color=colors[1], label='OOD')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
savename = os.path.join(save_path, f'{tag}_distribution.png')
# plt.legend(loc='best', frameon=False)
plt.savefig(savename, bbox_inches='tight', dpi=400)
plt.show()