# run_model.py
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import torch
from InstructABSA.data_prep import DatasetLoader
from InstructABSA.utils import T5Generator, T5Classifier
from InstructABSA.config import Config
from instructions import InstructionsHandler
try:
    use_mps = bool(torch.has_mps)
except AttributeError:
    use_mps = False
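# Note: torch.has_mps is deprecated in recent PyTorch releases; an equivalent
# check there (assuming PyTorch >= 1.12) would be:
#     use_mps = torch.backends.mps.is_available()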
# Set Global Values
config = Config()
instruct_handler = InstructionsHandler()
if config.inst_type == 1:
    instruct_handler.load_instruction_set1()
else:
    instruct_handler.load_instruction_set2()
print('Task: ', config.task)
if config.mode == 'train':
    if config.id_tr_data_path is None:
        raise Exception('Please provide a training data path for mode=train.')
if config.mode == 'eval':
    if config.id_te_data_path is None and config.ood_te_data_path is None:
        raise Exception('Please provide a testing data path for mode=eval.')
if config.experiment_name is not None and config.mode == 'train':
    print('Experiment Name: ', config.experiment_name)
    model_checkpoint = config.model_checkpoint
    model_out_path = config.output_dir
    model_out_path = os.path.join(model_out_path, config.task, f"{model_checkpoint.replace('/', '')}-{config.experiment_name}")
else:
    model_checkpoint = config.model_checkpoint
    model_out_path = config.model_checkpoint
print('Mode set to: ', 'training' if config.mode == 'train'
      else ('inference' if config.mode == 'eval' else 'individual sample inference'))
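# Example invocation for training (flag names are illustrative, following the
# project's README style; see InstructABSA/config.py for the exact argument
# names the Config parser expects):
#     python run_model.py -mode train -task ate \
#         -model_checkpoint allenai/tk-instruct-base-def-pos \
#         -experiment_name demo -id_tr_data_path ./data/train.csv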
# Load the data
id_tr_data_path = config.id_tr_data_path
ood_tr_data_path = config.ood_tr_data_path
id_te_data_path = config.id_te_data_path
ood_te_data_path = config.ood_te_data_path
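# Each CSV is read as-is; the task-specific formatting calls further below
# expect (at least) 'raw_text' and 'aspectTerms' columns, where each
# 'aspectTerms' entry holds the aspect 'term' (and, for the joint task,
# its 'polarity').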
if config.mode != 'cli':
    id_tr_df, id_te_df = None, None
    ood_tr_df, ood_te_df = None, None
    if id_tr_data_path is not None:
        id_tr_df = pd.read_csv(id_tr_data_path)
    if id_te_data_path is not None:
        id_te_df = pd.read_csv(id_te_data_path)
    if ood_tr_data_path is not None:
        ood_tr_df = pd.read_csv(ood_tr_data_path)
    if ood_te_data_path is not None:
        ood_te_df = pd.read_csv(ood_te_data_path)
    print('Loaded data...')
else:
    print('Running inference on input: ', config.test_input)
# Training arguments
training_args = {
    'output_dir': model_out_path,
    'evaluation_strategy': config.evaluation_strategy if config.id_te_data_path is not None else 'no',
    'learning_rate': config.learning_rate,
    'per_device_train_batch_size': config.per_device_train_batch_size,
    'per_device_eval_batch_size': config.per_device_eval_batch_size,
    'num_train_epochs': config.num_train_epochs,
    'weight_decay': config.weight_decay,
    'warmup_ratio': config.warmup_ratio,
    'save_strategy': config.save_strategy,
    'load_best_model_at_end': config.load_best_model_at_end,
    'push_to_hub': config.push_to_hub,
    'eval_accumulation_steps': config.eval_accumulation_steps,
    'predict_with_generate': config.predict_with_generate,
    'use_mps_device': use_mps
}
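# These keys mirror Hugging Face's (Seq2Seq)TrainingArguments; in particular,
# 'predict_with_generate' is specific to Seq2SeqTrainingArguments.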
# Create T5 model object
print('Instruction key: ', config.set_instruction_key)
if config.set_instruction_key == 1:
    indomain = 'bos_instruct1'
    outdomain = 'bos_instruct2'
else:
    indomain = 'bos_instruct2'
    outdomain = 'bos_instruct1'
if config.task == 'ate':
    t5_exp = T5Generator(model_checkpoint)
    bos_instruction_id = instruct_handler.ate[indomain]
    if ood_tr_data_path is not None or ood_te_data_path is not None:
        bos_instruction_ood = instruct_handler.ate[outdomain]
    eos_instruction = instruct_handler.ate['eos_instruct']
if config.task == 'atsc':
    t5_exp = T5Classifier(model_checkpoint)
    bos_instruction_id = instruct_handler.atsc[indomain]
    if ood_tr_data_path is not None or ood_te_data_path is not None:
        bos_instruction_ood = instruct_handler.atsc[outdomain]
    delim_instruction = instruct_handler.atsc['delim_instruct']
    eos_instruction = instruct_handler.atsc['eos_instruct']
if config.task == 'joint':
    t5_exp = T5Generator(model_checkpoint)
    bos_instruction_id = instruct_handler.joint[indomain]
    if ood_tr_data_path is not None or ood_te_data_path is not None:
        bos_instruction_ood = instruct_handler.joint[outdomain]
    eos_instruction = instruct_handler.joint['eos_instruct']
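# At this point t5_exp wraps the chosen checkpoint, bos_instruction_* hold the
# task prompt prepended to each input, and eos_instruction the suffix appended
# after it (see the model_input construction in the CLI branch below).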
if config.mode != 'cli':
    # Load the datasets and convert them to the instruction format
    loader = DatasetLoader(id_tr_df, id_te_df, ood_tr_df, ood_te_df, config.sample_size)
    if config.task == 'ate':
        if loader.train_df_id is not None:
            loader.train_df_id = loader.create_data_in_ate_format(loader.train_df_id, 'term', 'raw_text', 'aspectTerms', bos_instruction_id, eos_instruction)
        if loader.test_df_id is not None:
            loader.test_df_id = loader.create_data_in_ate_format(loader.test_df_id, 'term', 'raw_text', 'aspectTerms', bos_instruction_id, eos_instruction)
        if loader.train_df_ood is not None:
            loader.train_df_ood = loader.create_data_in_ate_format(loader.train_df_ood, 'term', 'raw_text', 'aspectTerms', bos_instruction_ood, eos_instruction)
        if loader.test_df_ood is not None:
            loader.test_df_ood = loader.create_data_in_ate_format(loader.test_df_ood, 'term', 'raw_text', 'aspectTerms', bos_instruction_ood, eos_instruction)
    elif config.task == 'atsc':
        if loader.train_df_id is not None:
            loader.train_df_id = loader.create_data_in_atsc_format(loader.train_df_id, 'aspectTerms', 'term', 'raw_text', 'aspect', bos_instruction_id, delim_instruction, eos_instruction)
        if loader.test_df_id is not None:
            loader.test_df_id = loader.create_data_in_atsc_format(loader.test_df_id, 'aspectTerms', 'term', 'raw_text', 'aspect', bos_instruction_id, delim_instruction, eos_instruction)
        if loader.train_df_ood is not None:
            loader.train_df_ood = loader.create_data_in_atsc_format(loader.train_df_ood, 'aspectTerms', 'term', 'raw_text', 'aspect', bos_instruction_ood, delim_instruction, eos_instruction)
        if loader.test_df_ood is not None:
            loader.test_df_ood = loader.create_data_in_atsc_format(loader.test_df_ood, 'aspectTerms', 'term', 'raw_text', 'aspect', bos_instruction_ood, delim_instruction, eos_instruction)
    elif config.task == 'joint':
        if loader.train_df_id is not None:
            loader.train_df_id = loader.create_data_in_joint_task_format(loader.train_df_id, 'term', 'polarity', 'raw_text', 'aspectTerms', bos_instruction_id, eos_instruction)
        if loader.test_df_id is not None:
            loader.test_df_id = loader.create_data_in_joint_task_format(loader.test_df_id, 'term', 'polarity', 'raw_text', 'aspectTerms', bos_instruction_id, eos_instruction)
        if loader.train_df_ood is not None:
            loader.train_df_ood = loader.create_data_in_joint_task_format(loader.train_df_ood, 'term', 'polarity', 'raw_text', 'aspectTerms', bos_instruction_ood, eos_instruction)
        if loader.test_df_ood is not None:
            loader.test_df_ood = loader.create_data_in_joint_task_format(loader.test_df_ood, 'term', 'polarity', 'raw_text', 'aspectTerms', bos_instruction_ood, eos_instruction)

    # Tokenize dataset
    id_ds, id_tokenized_ds, ood_ds, ood_tokenized_ds = loader.set_data_for_training_semeval(t5_exp.tokenize_function_inputs)
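    # set_data_for_training_semeval returns both the raw and tokenized HF
    # datasets for the in-domain (id_*) and out-of-domain (ood_*) data; splits
    # that were not supplied are simply absent, hence the
    # .get('train')/.get('test') checks in the eval branch below.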
    if config.mode == 'train':
        # Train model
        model_trainer = t5_exp.train(id_tokenized_ds, **training_args)
        print('Model saved at: ', model_out_path)
    elif config.mode == 'eval':
        # Get prediction labels
        print('Model loaded from: ', model_checkpoint)
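        # The four blocks below follow the same pattern for each available
        # split (in-domain/OOD x train/test): generate predictions, save them
        # alongside the gold labels, and report precision/recall/F1 (plus
        # accuracy for the 'atsc' task).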
if id_tokenized_ds.get("train") is not None:
id_tr_pred_labels = t5_exp.get_labels(tokenized_dataset = id_tokenized_ds, sample_set = 'train',
batch_size=config.per_device_eval_batch_size,
max_length = config.max_token_length)
id_tr_df = pd.DataFrame(id_ds['train'])[['text', 'labels']]
id_tr_df['labels'] = id_tr_df['labels'].apply(lambda x: x.strip())
id_tr_df['pred_labels'] = id_tr_pred_labels
id_tr_df.to_csv(os.path.join(config.output_path, f'{config.experiment_name}_id_train.csv'), index=False)
print('*****Train Metrics*****')
precision, recall, f1, accuracy = t5_exp.get_metrics(id_tr_df['labels'], id_tr_pred_labels)
print('Precision: ', precision)
print('Recall: ', recall)
print('F1-Score: ', f1)
if config.task == 'atsc':
print('Accuracy: ', accuracy)
if id_tokenized_ds.get("test") is not None:
id_te_pred_labels = t5_exp.get_labels(tokenized_dataset = id_tokenized_ds, sample_set = 'test',
batch_size=config.per_device_eval_batch_size,
max_length = config.max_token_length)
id_te_df = pd.DataFrame(id_ds['test'])[['text', 'labels']]
id_te_df['labels'] = id_te_df['labels'].apply(lambda x: x.strip())
id_te_df['pred_labels'] = id_te_pred_labels
id_te_df.to_csv(os.path.join(config.output_path, f'{config.experiment_name}_id_test.csv'), index=False)
print('*****Test Metrics*****')
precision, recall, f1, accuracy = t5_exp.get_metrics(id_te_df['labels'], id_te_pred_labels)
print('Precision: ', precision)
print('Recall: ', recall)
print('F1-Score: ', f1)
if config.task == 'atsc':
print('Accuracy: ', accuracy)
if ood_tokenized_ds.get("train") is not None:
ood_tr_pred_labels = t5_exp.get_labels(tokenized_dataset = ood_tokenized_ds, sample_set = 'train',
batch_size=config.per_device_eval_batch_size,
max_length = config.max_token_length)
ood_tr_df = pd.DataFrame(ood_ds['train'])[['text', 'labels']]
ood_tr_df['labels'] = ood_tr_df['labels'].apply(lambda x: x.strip())
ood_tr_df['pred_labels'] = ood_tr_pred_labels
ood_tr_df.to_csv(os.path.join(config.output_path, f'{config.experiment_name}_ood_train.csv'), index=False)
print('*****Train Metrics - OOD*****')
precision, recall, f1, accuracy = t5_exp.get_metrics(ood_tr_df['labels'], ood_tr_pred_labels)
print('Precision: ', precision)
print('Recall: ', precision)
print('F1-Score: ', precision)
if config.task == 'atsc':
print('Accuracy: ', accuracy)
if ood_tokenized_ds.get("test") is not None:
ood_te_pred_labels = t5_exp.get_labels(tokenized_dataset = ood_tokenized_ds, sample_set = 'test',
batch_size=config.per_device_eval_batch_size,
max_length = config.max_token_length)
ood_te_df = pd.DataFrame(ood_ds['test'])[['text', 'labels']]
ood_te_df['labels'] = ood_te_df['labels'].apply(lambda x: x.strip())
ood_te_df['pred_labels'] = ood_te_pred_labels
ood_te_df.to_csv(os.path.join(config.output_path, f'{config.experiment_name}_ood_test.csv'), index=False)
print('*****Test Metrics - OOD*****')
precision, recall, f1, accuracy = t5_exp.get_metrics(ood_te_df['labels'], ood_te_pred_labels)
print('Precision: ', precision)
print('Recall: ', precision)
print('F1-Score: ', precision)
if config.task == 'atsc':
print('Accuracy: ', accuracy)
else:
    print('Model loaded from: ', model_checkpoint)
    if config.task == 'atsc':
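        # For 'atsc' the CLI input is expected as 'sentence|aspect term',
        # e.g. "The food was great but the service was slow.|service".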
        config.test_input, aspect_term = config.test_input.split('|')[0], config.test_input.split('|')[1]
        model_input = bos_instruction_id + config.test_input + f'. The aspect term is: {aspect_term}' + eos_instruction
    else:
        model_input = bos_instruction_id + config.test_input + eos_instruction
    input_ids = t5_exp.tokenizer(model_input, return_tensors="pt").input_ids
    outputs = t5_exp.model.generate(input_ids, max_length=config.max_token_length)
    print('Model output: ', t5_exp.tokenizer.decode(outputs[0], skip_special_tokens=True))