From d0201e5cd26c89a1db237eedb6d7bf348a06c7d4 Mon Sep 17 00:00:00 2001
From: Nikita Shevtsov <61932814+Travvy88@users.noreply.github.com>
Date: Tue, 1 Aug 2023 18:23:56 +0300
Subject: [PATCH] TesseractTextDetector added (#4)

Co-authored-by: Nikita Shevtsov
Co-authored-by: Nasty
---
Review notes (this area is ignored by `git am`):
* Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch; blank context lines are inferred from the hunk line counts
  and should be confirmed against the target tree.
* tesseract_detector_recognizer.py: the added loop unpacked both the
  recognized word and the box width into `w`, so `text=` received the width.
  The word is now bound to `word`; the hunk line counts are unchanged.

 .github/check_version.py                      | 23 ++++-----------
 CHANGELOG.md                                  |  4 +++
 VERSION                                       |  2 +-
 dedocutils/text_detection/__init__.py         |  3 +-
 .../text_detection/tesseract_text_detector.py | 28 +++++++++++++++++++
 .../tesseract_detector_recognizer.py          | 15 +++++-----
 .../tesseract_text_recognizer.py              |  6 ++--
 tests/unit_tests/test_classes.py              |  9 +++++-
 8 files changed, 60 insertions(+), 30 deletions(-)
 create mode 100644 dedocutils/text_detection/tesseract_text_detector.py

diff --git a/.github/check_version.py b/.github/check_version.py
index 0721f41..b42092d 100644
--- a/.github/check_version.py
+++ b/.github/check_version.py
@@ -1,21 +1,7 @@
 import argparse
 import re
-from typing import Pattern
-
-
-def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern) -> bool:
-    match = regexp.match(version)
-
-    if match is None:
-        print("New version doesn't match the pattern")  # noqa
-        return False
-
-    if not (tag.startswith("v") and tag[1:] == version):
-        print("Tag value should be equal to version with `v` in the beginning")  # noqa
-        return False
-
-    return old_version < version
 
+from pkg_resources import parse_version
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -27,7 +13,10 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern
     print(f"Old version: {args.old_version}, new version: {args.new_version}, tag: {args.tag}")  # noqa
 
     version_pattern = re.compile(r"^\d+\.\d+(\.\d+)?$")
-    correct = is_correct_version(args.new_version, args.tag, args.old_version, version_pattern)
+    match = version_pattern.match(args.new_version)
+
+    assert match is not None, "New version doesn't match the pattern"
+    assert args.tag.startswith("v") and args.tag[1:] == args.new_version, "Tag value should be equal to version with `v` in the beginning"
+    assert parse_version(args.old_version) < parse_version(args.new_version), "New version should be greater than old version"
 
-    assert correct
     print("Version is correct")  # noqa
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 16f1940..ef49763 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 Changelog
 =========
 
+v0.2 (2023-08-01)
+-------------------
+* TesseractTextDetector is added
+
 v0.1 (2023-07-26)
 -------------------
 * First version of the library
diff --git a/VERSION b/VERSION
index ceab6e1..2f45361 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.1
\ No newline at end of file
+0.2
\ No newline at end of file
diff --git a/dedocutils/text_detection/__init__.py b/dedocutils/text_detection/__init__.py
index f7307a1..e478006 100644
--- a/dedocutils/text_detection/__init__.py
+++ b/dedocutils/text_detection/__init__.py
@@ -1,4 +1,5 @@
 from .abstract_text_detector import AbstractTextDetector
 from .doctr_text_detector.doctr_text_detector import DoctrTextDetector
+from .tesseract_text_detector import TesseractTextDetector
 
-__all__ = ["AbstractTextDetector", "DoctrTextDetector"]
+__all__ = ["AbstractTextDetector", "DoctrTextDetector", "TesseractTextDetector"]
diff --git a/dedocutils/text_detection/tesseract_text_detector.py b/dedocutils/text_detection/tesseract_text_detector.py
new file mode 100644
index 0000000..38cb678
--- /dev/null
+++ b/dedocutils/text_detection/tesseract_text_detector.py
@@ -0,0 +1,28 @@
+from typing import List, Optional
+
+import numpy as np
+import pytesseract
+
+from dedocutils.data_structures import BBox
+from dedocutils.text_detection import AbstractTextDetector
+
+
+class TesseractTextDetector(AbstractTextDetector):
+    def __init__(self, config: str = "--psm 3") -> None:
+        self.config = config
+
+    def detect(self, image: np.ndarray, parameters: Optional[dict] = None) -> List[BBox]:
+        parameters = {} if parameters is None else parameters
+        lang = parameters.get("language", "rus+eng")
+
+        data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=self.config)
+
+        left, top, width, height, levels = data["left"], data["top"], data["width"], data["height"], data["level"]
+
+        bboxes = []
+        for x, y, w, h, level in zip(left, top, width, height, levels):
+            if level == 5:
+                bbox = BBox(x_top_left=x, y_top_left=y, width=w, height=h)
+                bboxes.append(bbox)
+
+        return bboxes
diff --git a/dedocutils/text_detection_recognition/tesseract_detector_recognizer.py b/dedocutils/text_detection_recognition/tesseract_detector_recognizer.py
index f87433c..fbce753 100644
--- a/dedocutils/text_detection_recognition/tesseract_detector_recognizer.py
+++ b/dedocutils/text_detection_recognition/tesseract_detector_recognizer.py
@@ -10,15 +10,15 @@
 
 
 class TesseractDetectorRecognizer(AbstractDetectorRecognizer):
-    def __init__(self, config: Optional[str] = None) -> None:
-        self.config = config if config is not None else "--psm 3"
+    def __init__(self, config: str = "--psm 3") -> None:
+        self.config = config
 
     def detect_recognize(self, image: np.ndarray, parameters: Optional[dict] = None) -> List[TextWithBBox]:
         parameters = {} if parameters is None else parameters
         lang = parameters.get("language", "rus+eng")
 
         data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=self.config)
-        words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
+        words, left, top, width, height, levels = data["text"], data["left"], data["top"], data["width"], data["height"], data["level"]
 
         # filter empty words and corresponding coordinates
         irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
@@ -27,12 +27,13 @@ def detect_recognize(self, image: np.ndarray, parameters: Optional[dict] = None)
         top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
         width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
         height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
-
+        levels = [level for idx, level in enumerate(levels) if idx not in irrelevant_indices]
         assert len(words) == len(left) == len(top) == len(width) == len(height), "Number of words and their coordinates should be equal"
 
         text_with_bbox_list = []
-        for w, x, y, w, h in zip(words, left, top, width, height):
-            twb = TextWithBBox(text=w, bbox=BBox(x_top_left=x, y_top_left=y, width=w, height=h))
-            text_with_bbox_list.append(twb)
+        for word, x, y, w, h, level in zip(words, left, top, width, height, levels):
+            if level == 5:
+                twb = TextWithBBox(text=word, bbox=BBox(x_top_left=x, y_top_left=y, width=w, height=h))
+                text_with_bbox_list.append(twb)
 
         return text_with_bbox_list
diff --git a/dedocutils/text_recognition/tesseract_text_recognizer.py b/dedocutils/text_recognition/tesseract_text_recognizer.py
index 00ea0d7..2960b71 100644
--- a/dedocutils/text_recognition/tesseract_text_recognizer.py
+++ b/dedocutils/text_recognition/tesseract_text_recognizer.py
@@ -8,12 +8,12 @@
 
 
 class TesseractTextRecognizer(AbstractTextRecognizer):
-    def __init__(self, config: Optional[str] = None) -> None:
-        self.config = config if config is not None else "--psm 6"
+    def __init__(self, config: str = "--psm 6") -> None:
+        self.config = config
 
     def recognize(self, image: np.ndarray, parameters: Optional[dict] = None) -> str:
         parameters = {} if parameters is None else parameters
         lang = parameters.get("language", "rus+eng")
 
-        text = pytesseract.pytesseract.image_to_string(image, lang=lang, config=self.config)
+        text = pytesseract.image_to_string(image, lang=lang, config=self.config)
         return text
diff --git a/tests/unit_tests/test_classes.py b/tests/unit_tests/test_classes.py
index f199805..0ae29b8 100644
--- a/tests/unit_tests/test_classes.py
+++ b/tests/unit_tests/test_classes.py
@@ -3,7 +3,7 @@
 
 import cv2
 
-from dedocutils.text_detection import DoctrTextDetector
+from dedocutils.text_detection import DoctrTextDetector, TesseractTextDetector
 
 
 class TestClasses(unittest.TestCase):
@@ -13,3 +13,10 @@ def test_text_detection(self) -> None:
         text_detector = DoctrTextDetector()
         bboxes = text_detector.detect(cv2.imread(file_path))
         self.assertTrue(len(bboxes) > 0)
+
+    @unittest.skip
+    def test_tesseract_text_detector(self) -> None:
+        file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "document_example.png"))
+        text_detector = TesseractTextDetector()
+        bboxes = text_detector.detect(cv2.imread(file_path))
+        self.assertTrue(len(bboxes) > 0)