Fix cnn_rest and associated files to use config and use from_pretrained.
tmills committed Sep 24, 2024
1 parent 8dfcba0 commit b4f9a35
Showing 3 changed files with 33 additions and 17 deletions.
src/cnlpt/BaselineModels.py (5 changes: 3 additions & 2 deletions)
@@ -2,10 +2,11 @@
 
 import torch
 import torch.nn.functional as F
+from huggingface_hub import PyTorchModelHubMixin
 from torch import nn
 
 
-class CnnSentenceClassifier(nn.Module):
+class CnnSentenceClassifier(nn.Module, PyTorchModelHubMixin):
     def __init__(
         self,
         vocab_size,
@@ -110,7 +111,7 @@ def forward(
         return loss, logits
 
 
-class LstmSentenceClassifier(nn.Module):
+class LstmSentenceClassifier(nn.Module, PyTorchModelHubMixin):
     def __init__(
         self,
         vocab_size,
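Note on the change above: mixing in PyTorchModelHubMixin alongside nn.Module is what gives these classifiers the save_pretrained()/from_pretrained() methods relied on elsewhere in this commit. A minimal sketch of that pattern follows, assuming a recent huggingface_hub release; TinyClassifier and the "tiny-clf" directory are made-up names for illustration, not part of cnlpt.

import torch
from huggingface_hub import PyTorchModelHubMixin
from torch import nn


class TinyClassifier(nn.Module, PyTorchModelHubMixin):
    # Hypothetical stand-in for CnnSentenceClassifier / LstmSentenceClassifier.
    def __init__(self, vocab_size: int = 100, num_labels: int = 2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, 16)
        self.out = nn.Linear(16, num_labels)

    def forward(self, input_ids):
        return self.out(self.embed(input_ids).mean(dim=1))


model = TinyClassifier(vocab_size=500, num_labels=3)
model.save_pretrained("tiny-clf")  # writes the model weights to a local directory

# Keyword arguments given to from_pretrained are forwarded to __init__, which is
# how cnn_rest.py can supply vocab_size/task_names/num_labels_dict at load time.
reloaded = TinyClassifier.from_pretrained("tiny-clf", vocab_size=500, num_labels=3)
logits = reloaded(torch.randint(0, 500, (1, 8)))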
src/cnlpt/api/cnn_rest.py (15 changes: 6 additions & 9 deletions)
@@ -38,6 +38,8 @@
 logger = logging.getLogger("CNN_REST_Processor")
 logger.setLevel(logging.DEBUG)
 
+max_seq_length = 128
+
 
 @app.on_event("startup")
 async def startup_event():
@@ -49,15 +51,12 @@ async def startup_event():
     num_labels_dict = {
         task: len(values) for task, values in conf_dict["label_dictionary"].items()
     }
-    model = CnnSentenceClassifier(
-        len(tokenizer),
+    model = CnnSentenceClassifier.from_pretrained(
+        model_name,
+        vocab_size=len(tokenizer),
         task_names=conf_dict["task_names"],
         num_labels_dict=num_labels_dict,
-        embed_dims=conf_dict["cnn_embed_dim"],
-        num_filters=conf_dict["num_filters"],
-        filters=conf_dict["filters"],
     )
-    model.load_state_dict(torch.load(join(model_name, "pytorch_model.bin")))
 
     app.state.model = model.to("cuda")
     app.state.tokenizer = tokenizer
@@ -67,9 +66,7 @@ async def startup_event():
 @app.post("/cnn/classify")
 async def process(doc: UnannotatedDocument):
     instances = [doc.doc_text]
-    dataset = get_dataset(
-        instances, app.state.tokenizer, max_length=app.state.conf_dict["max_seq_length"]
-    )
+    dataset = get_dataset(instances, app.state.tokenizer, max_length=max_seq_length)
     _, logits = app.state.model.forward(
         input_ids=torch.LongTensor(dataset["input_ids"]).to("cuda"),
         attention_mask=torch.LongTensor(dataset["attention_mask"]).to("cuda"),
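The startup code above leans on a config.json written at training time (see the train_system.py changes below). As a rough illustration of that round trip, here is how the per-task label counts passed to from_pretrained are derived from the stored label dictionary; the task name, label values, and hyperparameter values are invented for the example.

import json
import tempfile
from os.path import join

# Invented config contents; a real file is produced by train_system.py.
model_dir = tempfile.mkdtemp()
with open(join(model_dir, "config.json"), "w") as f:
    json.dump(
        {
            "task_names": ["Negation"],
            "label_dictionary": {"Negation": ["-1", "1"]},
            "cnn_embed_dim": 100,
            "num_filters": 25,
            "filters": [1, 2, 3],
        },
        f,
    )

with open(join(model_dir, "config.json")) as f:
    conf_dict = json.load(f)

# Same derivation as in startup_event(): one label count per task, which is then
# passed to CnnSentenceClassifier.from_pretrained as num_labels_dict.
num_labels_dict = {
    task: len(values) for task, values in conf_dict["label_dictionary"].items()
}
assert num_labels_dict == {"Negation": 2}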
src/cnlpt/train_system.py (30 changes: 24 additions & 6 deletions)
@@ -524,6 +524,7 @@ def main(
             bias_fit=training_args.bias_fit,
         )
 
+    model_type = type(model)
     output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
     if training_args.do_train:
         # TODO: This assumes that if there are multiple training sets, they all have the same length, but
@@ -660,7 +661,12 @@ def compute_metrics_fn(p: EvalPrediction):
                     ),
                     "w",
                 ) as f:
-                    json.dump(model_args.to_dict(), f)
+                    config_dict = model_args.to_dict()
+                    config_dict[
+                        "label_dictionary"
+                    ] = dataset.get_labels()
+                    config_dict["task_names"] = task_names
+                    json.dump(config_dict, f)
             for task_ind, task_name in enumerate(metrics):
                 with open(output_eval_file, "w") as writer:
                     # logger.info("***** Eval results for task %s *****" % (task_name))
@@ -712,10 +718,13 @@ def compute_metrics_fn(p: EvalPrediction):
             trainer.save_model()
             tokenizer.save_pretrained(training_args.output_dir)
             if model_name == "cnn" or model_name == "lstm":
+                config_dict = model_args.to_dict()
+                config_dict["label_dictionary"] = dataset.get_labels()
+                config_dict["task_names"] = task_names
                 with open(
                     os.path.join(training_args.output_dir, "config.json"), "w"
                 ) as f:
-                    json.dump(model_args, f)
+                    json.dump(config_dict, f)
 
     # Evaluation
     eval_results = {}
@@ -751,10 +760,19 @@ def compute_metrics_fn(p: EvalPrediction):
                     writer.write(f"{key} : {value} \n")
         # here we probably want separate predictions for each dataset:
         if training_args.load_best_model_at_end:
-            model.load_state_dict(
-                torch.load(join(training_args.output_dir, "pytorch_model.bin"))
-            ) # load best model
-            trainer = Trainer( # maake trainer from best model
+            model_path = training_args.output_dir
+            if model_name == "cnn" or model_name == "lstm":
+                # non-HF models need manually passed config args
+                model = model_type.from_pretrained(
+                    model_path,
+                    vocab_size=len(tokenizer),
+                    task_names=task_names,
+                    num_labels_dict=num_labels,
+                )
+            else:
+                model = model_type.from_pretrained(model_path)
+
+            trainer = Trainer( # make trainer from best model
                 model=model,
                 args=training_args,
                 train_dataset=dataset.processed_dataset.get("train", None),
