From cd47f552a6a75dba5ff0f38a2ef9fbafbd076086 Mon Sep 17 00:00:00 2001
From: zhanghan17
Date: Thu, 20 May 2021 17:48:28 +0800
Subject: [PATCH] change the path of ernie-gram and fix mrc

---
 .../.meta/ernie-gram.jpeg                          | Bin
 {ernie_gram => ernie-gram}/README.en.md            |  2 +-
 {ernie_gram => ernie-gram}/README.md               |  0
 {ernie_gram => ernie-gram}/README.zh.md            |  2 +-
 {ernie_gram => ernie-gram}/__init__.py             |  0
 .../finetune_classifier_distributed.py             |  6 +-
 {ernie_gram => ernie-gram}/finetune_mrc.py         | 56 +++++++++++++-----
 {ernie_gram => ernie-gram}/finetune_ner.py         |  6 +-
 {ernie_gram => ernie-gram}/mrc/__init__.py         |  0
 {ernie_gram => ernie-gram}/mrc/mrc_metrics.py      |  5 +-
 {ernie_gram => ernie-gram}/mrc/mrc_reader.py       |  0
 {ernie_gram => ernie-gram}/optimization.py         |  4 +-
 {ernie_gram => ernie-gram}/run_cls.sh              |  2 +-
 {ernie_gram => ernie-gram}/run_mrc.sh              |  2 +-
 {ernie_gram => ernie-gram}/run_ner.sh              |  2 +-
 {ernie_gram => ernie-gram}/task_configs/cmrc_conf  |  0
 .../task_configs/msra_ner_conf                     |  0
 {ernie_gram => ernie-gram}/task_configs/xnli_conf  |  0
 {ernie_gram => ernie-gram}/utils.py                |  0
 ernie/file_utils.py                                |  9 +--
 20 files changed, 65 insertions(+), 31 deletions(-)
 rename {ernie_gram => ernie-gram}/.meta/ernie-gram.jpeg (100%)
 rename {ernie_gram => ernie-gram}/README.en.md (98%)
 rename {ernie_gram => ernie-gram}/README.md (100%)
 rename {ernie_gram => ernie-gram}/README.zh.md (99%)
 rename {ernie_gram => ernie-gram}/__init__.py (100%)
 rename {ernie_gram => ernie-gram}/finetune_classifier_distributed.py (98%)
 rename {ernie_gram => ernie-gram}/finetune_mrc.py (82%)
 rename {ernie_gram => ernie-gram}/finetune_ner.py (98%)
 rename {ernie_gram => ernie-gram}/mrc/__init__.py (100%)
 rename {ernie_gram => ernie-gram}/mrc/mrc_metrics.py (99%)
 rename {ernie_gram => ernie-gram}/mrc/mrc_reader.py (100%)
 rename {ernie_gram => ernie-gram}/optimization.py (97%)
 rename {ernie_gram => ernie-gram}/run_cls.sh (77%)
 rename {ernie_gram => ernie-gram}/run_mrc.sh (75%)
 rename {ernie_gram => ernie-gram}/run_ner.sh (71%)
 rename {ernie_gram => ernie-gram}/task_configs/cmrc_conf (100%)
 rename {ernie_gram => ernie-gram}/task_configs/msra_ner_conf (100%)
 rename {ernie_gram => ernie-gram}/task_configs/xnli_conf (100%)
 rename {ernie_gram => ernie-gram}/utils.py (100%)

diff --git a/ernie_gram/.meta/ernie-gram.jpeg b/ernie-gram/.meta/ernie-gram.jpeg
similarity index 100%
rename from ernie_gram/.meta/ernie-gram.jpeg
rename to ernie-gram/.meta/ernie-gram.jpeg
diff --git a/ernie_gram/README.en.md b/ernie-gram/README.en.md
similarity index 98%
rename from ernie_gram/README.en.md
rename to ernie-gram/README.en.md
index 8a077da54f53e..bb864b00b089e 100644
--- a/ernie_gram/README.en.md
+++ b/ernie-gram/README.en.md
@@ -36,7 +36,7 @@ wget https://ernie-github.cdn.bcebos.com/data-xnli.tar.gz
 tar xf data-xnli.tar.gz
 cd ..
 #demo for NLI task
-sh ernie_gram/run_cls.sh ernie_gram/task_configs/xnli_conf
+sh run_cls.sh task_configs/xnli_conf
 ```
 
 ### Setup
diff --git a/ernie_gram/README.md b/ernie-gram/README.md
similarity index 100%
rename from ernie_gram/README.md
rename to ernie-gram/README.md
diff --git a/ernie_gram/README.zh.md b/ernie-gram/README.zh.md
similarity index 99%
rename from ernie_gram/README.zh.md
rename to ernie-gram/README.zh.md
index bcb1cb22edcc6..d23ce493b16bd 100644
--- a/ernie_gram/README.zh.md
+++ b/ernie-gram/README.zh.md
@@ -38,7 +38,7 @@ wget https://ernie-github.cdn.bcebos.com/data-xnli.tar.gz
 tar xf data-xnli.tar.gz
 cd ..
 #demo for NLI task
-sh ernie_gram/run_cls.sh ernie_gram/task_configs/xnli_conf
+sh run_cls.sh task_configs/xnli_conf
 ```
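Note: the directory rename is what forces every import rewrite in this patch. `ernie_gram` is a valid Python identifier, so `from ernie_gram.optimization import AdamW` worked from the repo root; `ernie-gram` is not, so the scripts switch to directory-local imports plus `sys.path.append("../")`, and the README demos above now launch from inside the directory. A minimal, self-contained sketch of that reasoning (illustrative only, not part of the diff):

```python
# A hyphen cannot appear in an import statement, so after the rename the old
# package-style imports are syntactically impossible:
print("ernie_gram".isidentifier())  # True  -> `import ernie_gram` was legal
print("ernie-gram".isidentifier())  # False -> `import ernie-gram` is a SyntaxError

# The patch's workaround, added near the top of each script: run from inside
# ernie-gram/ and extend sys.path so the sibling `ernie` package still resolves.
import sys
sys.path.append("../")
```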
diff --git a/ernie_gram/__init__.py b/ernie-gram/__init__.py
similarity index 100%
rename from ernie_gram/__init__.py
rename to ernie-gram/__init__.py
diff --git a/ernie_gram/finetune_classifier_distributed.py b/ernie-gram/finetune_classifier_distributed.py
similarity index 98%
rename from ernie_gram/finetune_classifier_distributed.py
rename to ernie-gram/finetune_classifier_distributed.py
index 4ddd7c5eeaf89..f35a0a72c63d3 100644
--- a/ernie_gram/finetune_classifier_distributed.py
+++ b/ernie-gram/finetune_classifier_distributed.py
@@ -23,6 +23,8 @@ import numpy as np
 import logging
 #from visualdl import LogWriter
+import sys
+sys.path.append("../")
 
 from pathlib import Path
 import paddle as P
@@ -32,8 +34,8 @@
 #from model.bert import BertConfig, BertModelLayer
 from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification
 from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer
-from ernie_gram.optimization import AdamW
-from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
+from optimization import AdamW
+from utils import create_if_not_exists, get_warmup_and_linear_decay
 
 log.setLevel(logging.DEBUG)
 logging.getLogger().setLevel(logging.DEBUG)
diff --git a/ernie_gram/finetune_mrc.py b/ernie-gram/finetune_mrc.py
similarity index 82%
rename from ernie_gram/finetune_mrc.py
rename to ernie-gram/finetune_mrc.py
index 9d2369742161a..89a9d70935b14 100644
--- a/ernie_gram/finetune_mrc.py
+++ b/ernie-gram/finetune_mrc.py
@@ -30,6 +30,8 @@ import argparse
 from functools import partial
 from io import open
+import sys
+sys.path.append("../")
 
 import numpy as np
 import logging
@@ -38,22 +40,23 @@
 from propeller import log
 import propeller.paddle as propeller
 
-from ernie_gram.optimization import AdamW
+from optimization import AdamW
 from ernie.modeling_ernie import ErnieModel, ErnieModelForQuestionAnswering
 from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer
 #from ernie.optimization import AdamW, LinearDecay
-from ernie_gram.mrc import mrc_reader
-from ernie_gram.mrc import mrc_metrics
-from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
+from mrc import mrc_reader
+from mrc import mrc_metrics
+from utils import create_if_not_exists, get_warmup_and_linear_decay
 
 log.setLevel(logging.DEBUG)
 logging.getLogger().setLevel(logging.DEBUG)
 
 
-def evaluate(model, ds, all_examples, all_features, tokenizer, args):
-    dev_file = json.loads(open(args.dev_file, encoding='utf8').read())
+def evaluate(model, ds, all_examples, all_features, tokenizer, args, is_test=False):
+    dev_file = args.dev_file if not is_test else args.test_file
+    dev_file = json.loads(open(dev_file, encoding='utf8').read())
     with P.no_grad():
         log.debug('start eval')
         model.eval()
@@ -84,8 +87,8 @@ def evaluate(model, ds, all_examples, all_features, tokenizer, args):
     return f1, em
 
 
-def train(model, train_dataset, dev_dataset, dev_examples, dev_features, 
-          tokenizer, args):
+def train(model, train_dataset, dev_dataset, dev_examples, dev_features,
+          tokenizer, args, test_dataset=None, test_examples=None, test_features=None, do_test=False):
     model = P.DataParallel(model)
 
     max_steps = args.max_steps
@@ -142,10 +145,14 @@ def train(model, train_dataset, dev_dataset, dev_examples, dev_features,
                 log.debug(msg)
 
             if env.dev_id == 0 and step % 100==0 and step:
-                print(step)
                 f1, em = evaluate(model, dev_dataset, dev_examples, dev_features,
                                   tokenizer, args)
-                log.debug('[step %d] eval result: f1 %.5f em %.5f' %
+                log.debug('[step %d] dev eval result: f1 %.5f em %.5f' %
+                          (step, f1, em))
+                if do_test:
+                    f1, em = evaluate(model, test_dataset, test_examples,
+                                      test_features, tokenizer, args, True)
+                    log.debug('[step %d] test eval result: f1 %.5f em %.5f' %
                           (step, f1, em))
             if env.dev_id == 0 and args.save_dir is not None:
                 P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
@@ -177,7 +184,12 @@
         type=str,
         required=True,
         help='data directory includes train / develop data')
-    parser.add_argument('--warmup_proportion', type=float, default=0.0)
+    parser.add_argument(
+        '--test_file',
+        type=str,
+        default=None,
+        help='test data file; enables test-set evaluation when given')
+    parser.add_argument('--warmup_proportion', type=float, default=0.1)
     parser.add_argument('--lr', type=float, default=3e-5, help='learning rate')
     parser.add_argument(
         '--save_dir', type=Path, required=True, help='model output directory')
@@ -216,6 +228,10 @@
     dev_examples = mrc_reader.read_files(args.dev_file, is_training=False)
     dev_features = mrc_reader.convert_example_to_features(
         dev_examples, args.max_seqlen, tokenizer, is_training=False)
+    if args.test_file:
+        test_examples = mrc_reader.read_files(args.test_file, is_training=False)
+        test_features = mrc_reader.convert_example_to_features(
+            test_examples, args.max_seqlen, tokenizer, is_training=False)
 
     log.info('train examples: %d, features: %d' %
              (len(train_examples), len(train_features)))
@@ -235,16 +251,28 @@ def map_fn(unique_id, example_index, doc_span_index, tokens,
     dev_dataset = propeller.data.Dataset.from_list(dev_features).map(
         map_fn).padded_batch(args.bsz)
-
     model = ErnieModelForQuestionAnswering.from_pretrained(
         args.from_pretrained, name='')
 
-    train(model, train_dataset, dev_dataset, dev_examples, dev_features,
+    if args.test_file:
+        test_dataset = propeller.data.Dataset.from_list(test_features).map(
+            map_fn).padded_batch(args.bsz)
+        train(model, train_dataset, dev_dataset, dev_examples, dev_features,
+              tokenizer, args, test_dataset, test_examples, test_features, True)
+
+    else:
+        train(model, train_dataset, dev_dataset, dev_examples, dev_features,
               tokenizer, args)
+
+
     if env.dev_id == 0:
         f1, em = evaluate(model, dev_dataset, dev_examples, dev_features,
                           tokenizer, args)
-        log.debug('final eval result: f1 %.5f em %.5f' % (f1, em))
+        log.debug('final dev eval result: f1 %.5f em %.5f' % (f1, em))
+        if args.test_file:
+            f1, em = evaluate(model, test_dataset, test_examples, test_features,
+                              tokenizer, args, True)
+            log.debug('final test eval result: f1 %.5f em %.5f' % (f1, em))
 
     if env.dev_id == 0 and args.save_dir is not None:
         P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
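Note on the finetune_mrc.py changes above: instead of duplicating the evaluation path, the patch routes an optional test set through the existing `evaluate`/`train` functions via an `is_test`/`do_test` flag (the positional `True` at the call sites is `is_test`; passing it as a keyword would read more clearly). A self-contained reduction of that control flow, with the Paddle/propeller machinery stubbed out for illustration:

```python
# Illustrative reduction of the patch's dev/test dispatch; the real functions
# also take the model, datasets, features and tokenizer.
def evaluate(args, is_test=False):
    # pick the input file from the flag instead of duplicating the eval logic
    eval_file = args["test_file"] if is_test else args["dev_file"]
    print("evaluating on", eval_file)
    return 0.0, 0.0  # placeholder (f1, em)

def train(args, do_test=False):
    f1, em = evaluate(args)              # periodic dev evaluation, as before
    if do_test:                          # test evaluation only when wired in
        f1, em = evaluate(args, is_test=True)

args = {"dev_file": "dev.json", "test_file": "test.json"}
train(args, do_test=args["test_file"] is not None)
```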
diff --git a/ernie_gram/finetune_ner.py b/ernie-gram/finetune_ner.py
similarity index 98%
rename from ernie_gram/finetune_ner.py
rename to ernie-gram/finetune_ner.py
index a59a815097e6f..6a2913abf3fbb 100644
--- a/ernie_gram/finetune_ner.py
+++ b/ernie-gram/finetune_ner.py
@@ -29,6 +29,8 @@ import multiprocessing
 import pickle
 import logging
+import sys
+sys.path.append("../")
 
 from sklearn.metrics import f1_score
 import paddle as P
@@ -39,10 +41,10 @@
 log.setLevel(logging.DEBUG)
 logging.getLogger().setLevel(logging.DEBUG)
 
-from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
+from utils import create_if_not_exists, get_warmup_and_linear_decay
 from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification, ErnieModelForTokenClassification
 from ernie.tokenizing_ernie import ErnieTokenizer
-from ernie_gram.optimization import AdamW
+from optimization import AdamW
 
 parser = propeller.ArgumentParser('NER model with ERNIE')
 parser.add_argument('--max_seqlen', type=int, default=256)
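Note: the MRC script's default `--warmup_proportion` also moves from 0.0 to 0.1 in this patch, so the schedule built from the shared `get_warmup_and_linear_decay` helper (imported by all three fine-tuning scripts) now warms up by default. The helper's body is not shown in this diff; the sketch below is a common shape for such a schedule, under the assumption that it returns a step-to-LR-multiplier function:

```python
# Assumed implementation sketch of a warmup-then-linear-decay multiplier; the
# repo's actual utils.get_warmup_and_linear_decay may differ in detail.
def get_warmup_and_linear_decay(max_steps, warmup_steps):
    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)  # ramp 0 -> 1 during warmup
        # then decay linearly to 0 at max_steps
        return max(0.0, (max_steps - step) / max(1, max_steps - warmup_steps))
    return lr_lambda

# e.g. max_steps=1000 with the new default warmup_proportion=0.1:
sched = get_warmup_and_linear_decay(1000, int(0.1 * 1000))
assert sched(50) == 0.5    # halfway through warmup
assert sched(1000) == 0.0  # fully decayed at the last step
```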
diff --git a/ernie_gram/mrc/__init__.py b/ernie-gram/mrc/__init__.py
similarity index 100%
rename from ernie_gram/mrc/__init__.py
rename to ernie-gram/mrc/__init__.py
diff --git a/ernie_gram/mrc/mrc_metrics.py b/ernie-gram/mrc/mrc_metrics.py
similarity index 99%
rename from ernie_gram/mrc/mrc_metrics.py
rename to ernie-gram/mrc/mrc_metrics.py
index a94859c3be422..366917e954eca 100644
--- a/ernie_gram/mrc/mrc_metrics.py
+++ b/ernie-gram/mrc/mrc_metrics.py
@@ -349,7 +349,10 @@ def make_results(vocab, all_examples, all_features, all_results, n_best_size,
 
     unique_id_to_result = {}
     for result in all_results:
-        unique_id_to_result[result.unique_id] = result
+        try:
+            unique_id_to_result[result.unique_id] = result
+        except Exception:
+            continue
 
     _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
         "PrelimPrediction", [
diff --git a/ernie_gram/mrc/mrc_reader.py b/ernie-gram/mrc/mrc_reader.py
similarity index 100%
rename from ernie_gram/mrc/mrc_reader.py
rename to ernie-gram/mrc/mrc_reader.py
diff --git a/ernie_gram/optimization.py b/ernie-gram/optimization.py
similarity index 97%
rename from ernie_gram/optimization.py
rename to ernie-gram/optimization.py
index fe5e36118c855..3ebcbe705716d 100644
--- a/ernie_gram/optimization.py
+++ b/ernie-gram/optimization.py
@@ -25,11 +25,13 @@ import numpy as np
 import paddle as P
 import paddle.distributed.fleet as fleet
+import sys
+sys.path.append("../")
 from propeller.paddle.train.hooks import RunHook
 import paddle.fluid as F
 log = logging.getLogger(__name__)
 
-from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
+from utils import create_if_not_exists, get_warmup_and_linear_decay
+
 
 class AdamW(P.optimizer.AdamW):
     """AdamW object for dygraph"""
diff --git a/ernie_gram/run_cls.sh b/ernie-gram/run_cls.sh
similarity index 77%
rename from ernie_gram/run_cls.sh
rename to ernie-gram/run_cls.sh
index b8587e6cb2fde..c46ae2044568e 100644
--- a/ernie_gram/run_cls.sh
+++ b/ernie-gram/run_cls.sh
@@ -1,6 +1,6 @@
 source $1
 
-python3 -m paddle.distributed.launch ./ernie_gram/finetune_classifier_distributed.py \
+python3 -m paddle.distributed.launch ./finetune_classifier_distributed.py \
     --data_dir $data_dir \
     --max_steps $max_steps \
     --bsz $bsz \
diff --git a/ernie_gram/run_mrc.sh b/ernie-gram/run_mrc.sh
similarity index 75%
rename from ernie_gram/run_mrc.sh
rename to ernie-gram/run_mrc.sh
index 0f3980ab1cdf8..e180e8ddf6495 100644
--- a/ernie_gram/run_mrc.sh
+++ b/ernie-gram/run_mrc.sh
@@ -1,6 +1,6 @@
 source $1
 export CUDA_VISIBLE_DEVICES=0
-python3 -m paddle.distributed.launch ./ernie_gram/finetune_mrc.py \
+python3 -m paddle.distributed.launch ./finetune_mrc.py \
     --train_file $train_file \
     --dev_file $dev_file \
     --max_steps $max_steps \
diff --git a/ernie_gram/run_ner.sh b/ernie-gram/run_ner.sh
similarity index 71%
rename from ernie_gram/run_ner.sh
rename to ernie-gram/run_ner.sh
index 11604b6dd4ce0..be9a5663c7645 100644
--- a/ernie_gram/run_ner.sh
+++ b/ernie-gram/run_ner.sh
@@ -1,6 +1,6 @@
 source $1
 
-python3 -m paddle.distributed.launch ./ernie_gram/finetune_ner.py \
+python3 -m paddle.distributed.launch ./finetune_ner.py \
    --data_dir $data_dir \
    --max_steps $max_steps \
    --epoch $epoch \
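Note on the mrc_metrics.py hunk above: the patch wraps the result-indexing loop in a try/except so a malformed result skips rather than crashing `make_results`. Since a dict assignment keyed by a namedtuple field only raises in unusual cases, an explicit filter states the same intent (drop results without a usable id) more visibly. A hypothetical, self-contained equivalent; the `RawResult` shape here is assumed from similar BERT-derived metrics code, not taken from this repo:

```python
import collections

# Assumed result shape; the real mrc_metrics.py result may carry more fields.
RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])

all_results = [RawResult(1000, [0.1], [0.2]), RawResult(None, [], [])]

# Explicit-filter equivalent of the guarded loop: keep only results that
# actually carry a unique_id, and let unexpected errors stay loud.
unique_id_to_result = {r.unique_id: r for r in all_results
                       if r.unique_id is not None}
print(sorted(unique_id_to_result))  # [1000]
```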
diff --git a/ernie_gram/task_configs/cmrc_conf b/ernie-gram/task_configs/cmrc_conf
similarity index 100%
rename from ernie_gram/task_configs/cmrc_conf
rename to ernie-gram/task_configs/cmrc_conf
diff --git a/ernie_gram/task_configs/msra_ner_conf b/ernie-gram/task_configs/msra_ner_conf
similarity index 100%
rename from ernie_gram/task_configs/msra_ner_conf
rename to ernie-gram/task_configs/msra_ner_conf
diff --git a/ernie_gram/task_configs/xnli_conf b/ernie-gram/task_configs/xnli_conf
similarity index 100%
rename from ernie_gram/task_configs/xnli_conf
rename to ernie-gram/task_configs/xnli_conf
diff --git a/ernie_gram/utils.py b/ernie-gram/utils.py
similarity index 100%
rename from ernie_gram/utils.py
rename to ernie-gram/utils.py
diff --git a/ernie/file_utils.py b/ernie/file_utils.py
index 4715094548102..03e2784f78c9e 100644
--- a/ernie/file_utils.py
+++ b/ernie/file_utils.py
@@ -68,12 +68,9 @@ def _fetch_from_remote(url,
             f = done_file.open('wb')
             f.close()
     else:
-        while True:
-            if done_file.exists():
-                break
-            else:
-                time.sleep(1)
-
+        while not done_file.exists():
+            time.sleep(1)
+
     log.debug('%s cached in %s' % (url, cached_dir))
     return cached_dir_model
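Note on the file_utils.py hunk above: the `while True` / `if` / `break` / `else` polling loop collapses to a single condition with identical behavior. One caveat both versions share: if the downloading process dies before it creates the done-file, the waiter spins forever. A sketch of the same pattern with a bounded wait; the timeout is an illustrative addition, not part of the patch:

```python
import time
from pathlib import Path

def wait_for_done_file(done_file: Path, timeout_s: float = 600.0,
                       poll_s: float = 1.0) -> None:
    """Poll for `done_file` as ernie/file_utils.py does, but give up eventually."""
    deadline = time.monotonic() + timeout_s
    while not done_file.exists():
        if time.monotonic() > deadline:
            raise TimeoutError(f"{done_file} not created within {timeout_s}s")
        time.sleep(poll_s)

# usage: wait_for_done_file(Path("/tmp/cache/model.done"))
```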