From 8102ef18f34e5e6bcec767483786e2c0a95e0892 Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 12:25:42 +0300 Subject: [PATCH 01/12] add github ci --- .github/workflows/codecov.yml | 37 +++++++++++++++++++++++++++++++++++ .github/workflows/linter.yml | 37 +++++++++++++++++++++++++++++++++++ .github/workflows/tests.yml | 30 ++++++++++++++++++++++++++++ .pre-commit-config.yaml | 16 +++++++-------- 4 files changed, 112 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/codecov.yml create mode 100644 .github/workflows/linter.yml create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml new file mode 100644 index 0000000..6a2b7a6 --- /dev/null +++ b/.github/workflows/codecov.yml @@ -0,0 +1,37 @@ +# This workflow will install Python dependencies and run codecov +# https://github.com/codecov/codecov-action#example-workflowyml-with-codecov-action + +name: codecov + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + steps: + - uses: actions/checkout@master + - name: Set up Python + uses: actions/setup-python@master + with: + python-version: 3.7 + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -r requirements.txt + pip install pytest pytest-cov + - name: Generate coverage report + run: pytest --cov=./ --cov-report=xml + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + flags: unittests + env_vars: OS,PYTHON + fail_ci_if_error: true + verbose: true diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml new file mode 100644 index 0000000..3895dc9 --- /dev/null +++ b/.github/workflows/linter.yml @@ -0,0 +1,37 @@ +# This workflow will install Python dependencies and run linter +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +# TODO: update linters + +name: linter + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Install dependencies + run: | + pip install --upgrade pip + pip install isort black flake8 mypy + - name: Code format check with isort + run: isort --check-only --profile black . + - name: Code format check with black + run: black --check . + - name: Code format check with flake8 + run: flake8 --ignore E501,E203,W503 . + - name: Type check with mypy + run: mypy --ignore-missing-imports . diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..5b3f266 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,30 @@ +# This workflow will install Python dependencies and run tests with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: tests + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ['3.7', '3.8', '3.9', '3.10'] + os: [ubuntu-latest, macOS-latest, add windows-latest] + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -r requirements.txt + - name: Unittests + run: python -m unittest discover diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2685945..f409b34 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,11 +29,11 @@ repos: rev: v0.961 hooks: - id: mypy -- repo: local - hooks: - - id: unittest - name: unittest - entry: venv/bin/python -m unittest discover - language: python - always_run: true - pass_filenames: false +# - repo: local +# hooks: +# - id: unittest +# name: unittest +# entry: venv/bin/python -m unittest discover +# language: python +# always_run: true +# pass_filenames: false From 4f5f4eb658d252b37c18e2b03987c329175bc4bc Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 12:26:25 +0300 Subject: [PATCH 02/12] fix imports --- qaner/__init__.py | 2 +- qaner/dataset.py | 3 ++- qaner/inference.py | 9 +++++---- qaner/inference_utils.py | 3 ++- qaner/metrics.py | 5 +++-- qaner/train.py | 11 ++++++----- qaner/train_utils.py | 7 ++++--- 7 files changed, 23 insertions(+), 17 deletions(-) diff --git a/qaner/__init__.py b/qaner/__init__.py index 4640904..3dc1f76 100644 --- a/qaner/__init__.py +++ b/qaner/__init__.py @@ -1 +1 @@ -# TODO +__version__ = "0.1.0" diff --git a/qaner/dataset.py b/qaner/dataset.py index 6e31194..9910dc0 100644 --- a/qaner/dataset.py +++ b/qaner/dataset.py @@ -2,9 +2,10 @@ import torch import transformers -from data_utils import Instance, Span from tqdm import tqdm +from qaner.data_utils import Instance, Span + # TODO: add documentation class Dataset(torch.utils.data.Dataset): diff --git a/qaner/inference.py b/qaner/inference.py index 4067ba2..e72bb98 100644 --- a/qaner/inference.py +++ b/qaner/inference.py @@ -2,11 +2,12 @@ from typing import Any, Dict import torch -from arg_parse import get_inference_args -from data_utils import Instance -from inference_utils import get_top_valid_spans from transformers import AutoModelForQuestionAnswering, AutoTokenizer -from utils import set_global_seed + +from qaner.arg_parse import get_inference_args +from qaner.data_utils import Instance +from qaner.inference_utils import get_top_valid_spans +from qaner.utils import set_global_seed # TODO: add batch inference diff --git a/qaner/inference_utils.py b/qaner/inference_utils.py index c19b2e3..a5ca560 100644 --- a/qaner/inference_utils.py +++ b/qaner/inference_utils.py @@ -3,7 +3,8 @@ import numpy as np import torch import transformers -from data_utils import Span + +from qaner.data_utils import Span def get_top_valid_spans( diff --git a/qaner/metrics.py b/qaner/metrics.py index 1b60a67..de63963 100644 --- a/qaner/metrics.py +++ b/qaner/metrics.py @@ -1,7 +1,8 @@ from typing import Dict, List import numpy as np -from data_utils import Span + +from qaner.data_utils import Span # TODO: add metrics over label types @@ -33,7 +34,7 @@ def compute_metrics( confusion_matrix_pred_denominator = np.zeros(len(entity_mapper)) for span_true, span_pred in zip(spans_true_batch, spans_pred_batch_top_1): - span_pred = span_pred[0] + span_pred = span_pred[0] # type: ignore i = entity_mapper[span_true.label] j = entity_mapper[span_pred.label] # type: ignore diff --git a/qaner/train.py b/qaner/train.py index 39b35c9..f0747bb 100644 --- a/qaner/train.py +++ b/qaner/train.py @@ -1,13 +1,14 @@ import json import torch -from arg_parse import get_train_args -from data_utils import prepare_sentences_and_spans, read_bio_markup -from dataset import Collator, Dataset from torch.utils.tensorboard import SummaryWriter -from train_utils import train from transformers import AutoModelForQuestionAnswering, AutoTokenizer -from utils import set_global_seed + +from qaner.arg_parse import get_train_args +from qaner.data_utils import prepare_sentences_and_spans, read_bio_markup +from qaner.dataset import Collator, Dataset +from qaner.train_utils import train +from qaner.utils import set_global_seed if __name__ == "__main__": diff --git a/qaner/train_utils.py b/qaner/train_utils.py index c1b1fe6..b20d202 100644 --- a/qaner/train_utils.py +++ b/qaner/train_utils.py @@ -2,13 +2,14 @@ import numpy as np import torch -from data_utils import Span -from inference_utils import get_top_valid_spans -from metrics import compute_metrics from torch.utils.tensorboard import SummaryWriter from tqdm import tqdm from transformers import AutoModelForQuestionAnswering +from qaner.data_utils import Span +from qaner.inference_utils import get_top_valid_spans +from qaner.metrics import compute_metrics + # TODO: add metrics calculation def train( From 728414ecee8c83d3de6cb310cd73b03efe0a05aa Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 12:29:03 +0300 Subject: [PATCH 03/12] add badges --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 1c5dc13..861bcce 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,17 @@ +[![tests](https://github.com/dayyass/qaner/actions/workflows/tests.yml/badge.svg)](https://github.com/dayyass/qaner/actions/workflows/tests.yml) +[![linter](https://github.com/dayyass/qaner/actions/workflows/linter.yml/badge.svg)](https://github.com/dayyass/qaner/actions/workflows/linter.yml) +[![codecov](https://codecov.io/gh/dayyass/qaner/branch/main/graph/badge.svg?token=S3UKX8BFP3)](https://codecov.io/gh/dayyass/qaner) + +[![python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://github.com/dayyass/qaner#requirements) +[![release (latest by date)](https://img.shields.io/github/v/release/dayyass/qaner)](https://github.com/dayyass/qaner/releases/latest) +[![license](https://img.shields.io/github/license/dayyass/qaner?color=blue)](https://github.com/dayyass/qaner/blob/main/LICENSE) + +[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-black)](https://github.com/dayyass/qaner/blob/main/.pre-commit-config.yaml) +[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) + +[![pypi version](https://img.shields.io/pypi/v/qaner)](https://pypi.org/project/qaner) +[![pypi downloads](https://img.shields.io/pypi/dm/qaner)](https://pypi.org/project/qaner) + # QaNER Unofficial implementation of [*QaNER: Prompting Question Answering Models for Few-shot Named Entity Recognition*](https://arxiv.org/abs/2203.01543). From 1d4d8e97aa00cad9824f18f1486b6b18e676a0ef Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 12:29:34 +0300 Subject: [PATCH 04/12] add setup.py --- setup.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..7a885d6 --- /dev/null +++ b/setup.py @@ -0,0 +1,27 @@ +from setuptools import setup + +from qaner import __version__ + +with open("README.md", mode="r", encoding="utf-8") as fp: + long_description = fp.read() + + +setup( + name="qaner", + version=__version__, + description="Unofficial implementation of QaNER: Prompting Question Answering Models for Few-shot Named Entity Recognition.", + long_description=long_description, + long_description_content_type="text/markdown", + author="Dani El-Ayyass", + author_email="dayyass@yandex.ru", + license_files=["LICENSE"], + url="https://github.com/dayyass/QaNER", + packages=["qaner"], + install_requires=[ + "numpy==1.21.6", + "tensorboard==2.9.0", + "torch==1.8.1", + "tqdm==4.64.0", + "transformers==4.19.2", + ], +) From eb59fca71f49f9069e6a5320ce35d976618b8ee0 Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 12:29:54 +0300 Subject: [PATCH 05/12] add setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7a885d6..5606fb1 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ author="Dani El-Ayyass", author_email="dayyass@yandex.ru", license_files=["LICENSE"], - url="https://github.com/dayyass/QaNER", + url="https://github.com/dayyass/qaner", packages=["qaner"], install_requires=[ "numpy==1.21.6", From 70e81efc5230f56da5c00e77f8a49340638465bb Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 12:36:21 +0300 Subject: [PATCH 06/12] fix tests --- tests/test_dataset.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 5327aee..03bd9ac 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,5 +1,4 @@ import json -import sys import unittest from typing import List @@ -7,11 +6,9 @@ from tqdm import tqdm from transformers import AutoTokenizer, BatchEncoding -sys.path.append("qaner") # TODO: fix it - -from data_utils import prepare_sentences_and_spans, read_bio_markup # noqa: E402 -from dataset import Collator, Dataset, Instance, Span # noqa: E402 -from utils import set_global_seed # noqa: E402 +from qaner.data_utils import prepare_sentences_and_spans, read_bio_markup +from qaner.dataset import Collator, Dataset, Instance, Span +from qaner.utils import set_global_seed def validate_spans( From b4c2f893871426a6167452b57522281ff7c47be1 Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 12:47:25 +0300 Subject: [PATCH 07/12] add console train and inference --- qaner/inference.py | 14 +++++++++++++- qaner/train.py | 15 ++++++++++++++- setup.py | 6 ++++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/qaner/inference.py b/qaner/inference.py index e72bb98..8287d9f 100644 --- a/qaner/inference.py +++ b/qaner/inference.py @@ -72,7 +72,13 @@ def predict( return prediction -if __name__ == "__main__": +def main() -> int: + """ + Main inference function. + + Returns: + int: exit code. + """ # argparse args = get_inference_args() @@ -114,3 +120,9 @@ def predict( print(f"\nquestion: {prediction.question}\n") print(f"context: {prediction.context}") print(f"\nanswer: {prediction.answer}\n") + + return 0 + + +if __name__ == "__main__": + main() diff --git a/qaner/train.py b/qaner/train.py index f0747bb..166d61b 100644 --- a/qaner/train.py +++ b/qaner/train.py @@ -10,7 +10,14 @@ from qaner.train_utils import train from qaner.utils import set_global_seed -if __name__ == "__main__": + +def main() -> int: + """ + Main train function. + + Returns: + int: exit code. + """ # argparse args = get_train_args() @@ -106,3 +113,9 @@ model.save_pretrained(args.path_to_save_model) tokenizer.save_pretrained(args.path_to_save_model) + + return 0 + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 5606fb1..9385cd8 100644 --- a/setup.py +++ b/setup.py @@ -17,6 +17,12 @@ license_files=["LICENSE"], url="https://github.com/dayyass/qaner", packages=["qaner"], + entry_points={ + "console_scripts": [ + "qaner-train = qaner.train:main", + "qaner-inference = qaner.inference:main", + ], + }, install_requires=[ "numpy==1.21.6", "tensorboard==2.9.0", From 224ad0a71064fc47182befd194990eca29d1d861 Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 12:53:41 +0300 Subject: [PATCH 08/12] minor changes --- .gitignore | 2 ++ README.md | 15 ++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 88abad1..d1c4a7e 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ venv runs dayyass + +qaner.egg-info diff --git a/README.md b/README.md index 861bcce..e51ee78 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,12 @@ Unofficial implementation of [*QaNER: Prompting Question Answering Models for Fe You can adopt this pipeline for arbitrary [BIO-markup](https://github.com/dayyass/QaNER/tree/main/data/conll2003) data. -### CoNLL-2003 +## Installation +``` +pip install qaner +``` + +## CoNLL-2003 Pipeline results on CoNLL-2003 dataset: - [Metrics](https://tensorboard.dev/experiment/FEsbNJdmSd2LGVhga8Ku0Q/) - [Trained Hugging Face model](https://huggingface.co/dayyass/qaner-conll-bert-base-uncased) @@ -26,7 +31,7 @@ Pipeline results on CoNLL-2003 dataset: ### Training Script for training QaNER model: ``` -python qaner/train.py \ +qaner-train \ --bert_model_name 'bert-base-uncased' \ --path_to_prompt_mapper 'data/conll2003/prompt_mapper.json' \ --path_to_train_data 'data/conll2003/train.bio' \ @@ -56,7 +61,7 @@ Optional arguments: ### Infrerence Script for inference trained QaNER model: ``` -python qaner/inference.py \ +qaner-inference \ --context 'EU rejects German call to boycott British lamb .' \ --question 'What is the organization?' \ --path_to_prompt_mapper 'data/conll2003/prompt_mapper.json' \ @@ -92,10 +97,10 @@ Possible inference questions for CoNLL-2003: - What is the organization? (ORG) - What is the miscellaneous entity? (MISC) -### Requirements +## Requirements Python >= 3.7 -### Citation +## Citation ```bibtex @misc{liu2022qaner, title = {QaNER: Prompting Question Answering Models for Few-shot Named Entity Recognition}, From 49b2aa6ead528277f22b63ce2b235515184a9027 Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 12:56:16 +0300 Subject: [PATCH 09/12] remove python 3.10 --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5b3f266..1acf6be 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -14,7 +14,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] + python-version: ['3.7', '3.8', '3.9'] os: [ubuntu-latest, macOS-latest, add windows-latest] steps: - uses: actions/checkout@v2 From 84ece0e9160d792f452b2c31fdfa6f0e9ef52f9e Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 12:57:07 +0300 Subject: [PATCH 10/12] update release version --- qaner/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qaner/__init__.py b/qaner/__init__.py index 3dc1f76..485f44a 100644 --- a/qaner/__init__.py +++ b/qaner/__init__.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.1.1" From 94e38608d9eebcaa26fc689f75c7ec654dcb5cb4 Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 12:58:47 +0300 Subject: [PATCH 11/12] remove codecov badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e51ee78..3132e1a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![tests](https://github.com/dayyass/qaner/actions/workflows/tests.yml/badge.svg)](https://github.com/dayyass/qaner/actions/workflows/tests.yml) [![linter](https://github.com/dayyass/qaner/actions/workflows/linter.yml/badge.svg)](https://github.com/dayyass/qaner/actions/workflows/linter.yml) -[![codecov](https://codecov.io/gh/dayyass/qaner/branch/main/graph/badge.svg?token=S3UKX8BFP3)](https://codecov.io/gh/dayyass/qaner) + [![python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://github.com/dayyass/qaner#requirements) [![release (latest by date)](https://img.shields.io/github/v/release/dayyass/qaner)](https://github.com/dayyass/qaner/releases/latest) From 6ffa055b6ecb4be7a04a8e10269699c56d71870f Mon Sep 17 00:00:00 2001 From: Dani El-Ayyass Date: Mon, 18 Jul 2022 13:12:33 +0300 Subject: [PATCH 12/12] hotfix --- qaner/inference_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qaner/inference_utils.py b/qaner/inference_utils.py index a5ca560..c6ff958 100644 --- a/qaner/inference_utils.py +++ b/qaner/inference_utils.py @@ -86,7 +86,7 @@ def get_top_valid_spans( span = Span( token=context[start_context_char_char:end_context_char_char], label=inv_prompt_mapper[ # TODO: add inference exception - question_list[i].lstrip("What is the ").rstrip("?") + question_list[i].split(r"What is the ")[-1].rstrip(r"?") ], start_context_char_pos=start_context_char_char, end_context_char_pos=end_context_char_char,