From d0201e5cd26c89a1db237eedb6d7bf348a06c7d4 Mon Sep 17 00:00:00 2001
From: Nikita Shevtsov <61932814+Travvy88@users.noreply.github.com>
Date: Tue, 1 Aug 2023 18:23:56 +0300
Subject: [PATCH] TesseractTextDetector added (#4)

Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru>
Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
---
 .github/check_version.py                      | 23 ++++-----------
 CHANGELOG.md                                  |  4 +++
 VERSION                                       |  2 +-
 dedocutils/text_detection/__init__.py         |  3 +-
 .../text_detection/tesseract_text_detector.py | 28 +++++++++++++++++++
 .../tesseract_detector_recognizer.py          | 15 +++++-----
 .../tesseract_text_recognizer.py              |  6 ++--
 tests/unit_tests/test_classes.py              |  9 +++++-
 8 files changed, 60 insertions(+), 30 deletions(-)
 create mode 100644 dedocutils/text_detection/tesseract_text_detector.py

diff --git a/.github/check_version.py b/.github/check_version.py
index 0721f41..b42092d 100644
--- a/.github/check_version.py
+++ b/.github/check_version.py
@@ -1,21 +1,7 @@
 import argparse
 import re
-from typing import Pattern
-
-
-def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern) -> bool:
-    match = regexp.match(version)
-
-    if match is None:
-        print("New version doesn't match the pattern")  # noqa
-        return False
-
-    if not (tag.startswith("v") and tag[1:] == version):
-        print("Tag value should be equal to version with `v` in the beginning")  # noqa
-        return False
-
-    return old_version < version
 
+from pkg_resources import parse_version
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -27,7 +13,10 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern
     print(f"Old version: {args.old_version}, new version: {args.new_version}, tag: {args.tag}")  # noqa
 
     version_pattern = re.compile(r"^\d+\.\d+(\.\d+)?$")
-    correct = is_correct_version(args.new_version, args.tag, args.old_version, version_pattern)
+    match = version_pattern.match(args.new_version)
+
+    assert match is not None, "New version doesn't match the pattern"
+    assert args.tag.startswith("v") and args.tag[1:] == args.new_version, "Tag value should be equal to version with `v` in the beginning"
+    assert parse_version(args.old_version) < parse_version(args.new_version), "New version should be greater than old version"
 
-    assert correct
     print("Version is correct")  # noqa
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 16f1940..ef49763 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 Changelog
 =========
 
+v0.2 (2023-08-01)
+-------------------
+* TesseractTextDetector is added
+
 v0.1 (2023-07-26)
 -------------------
 * First version of the library
diff --git a/VERSION b/VERSION
index ceab6e1..2f45361 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.1
\ No newline at end of file
+0.2
\ No newline at end of file
diff --git a/dedocutils/text_detection/__init__.py b/dedocutils/text_detection/__init__.py
index f7307a1..e478006 100644
--- a/dedocutils/text_detection/__init__.py
+++ b/dedocutils/text_detection/__init__.py
@@ -1,4 +1,5 @@
 from .abstract_text_detector import AbstractTextDetector
 from .doctr_text_detector.doctr_text_detector import DoctrTextDetector
+from .tesseract_text_detector import TesseractTextDetector
 
-__all__ = ["AbstractTextDetector", "DoctrTextDetector"]
+__all__ = ["AbstractTextDetector", "DoctrTextDetector", "TesseractTextDetector"]
diff --git a/dedocutils/text_detection/tesseract_text_detector.py b/dedocutils/text_detection/tesseract_text_detector.py
new file mode 100644
index 0000000..38cb678
--- /dev/null
+++ b/dedocutils/text_detection/tesseract_text_detector.py
@@ -0,0 +1,28 @@
+from typing import List, Optional
+
+import numpy as np
+import pytesseract
+
+from dedocutils.data_structures import BBox
+from dedocutils.text_detection import AbstractTextDetector
+
+
+class TesseractTextDetector(AbstractTextDetector):  # text detector built on pytesseract.image_to_data
+    def __init__(self, config: str = "--psm 3") -> None:  # config is passed unchanged to pytesseract in detect()
+        self.config = config
+
+    def detect(self, image: np.ndarray, parameters: Optional[dict] = None) -> List[BBox]:  # one BBox per level-5 entry
+        parameters = {} if parameters is None else parameters
+        lang = parameters.get("language", "rus+eng")  # only the "language" key is read; default is "rus+eng"
+
+        data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=self.config)
+
+        left, top, width, height, levels = data["left"], data["top"], data["width"], data["height"], data["level"]
+
+        bboxes = []
+        for x, y, w, h, level in zip(left, top, width, height, levels):
+            if level == 5:  # NOTE(review): presumably Tesseract's word level in the TSV output - confirm
+                bbox = BBox(x_top_left=x, y_top_left=y, width=w, height=h)
+                bboxes.append(bbox)
+
+        return bboxes
diff --git a/dedocutils/text_detection_recognition/tesseract_detector_recognizer.py b/dedocutils/text_detection_recognition/tesseract_detector_recognizer.py
index f87433c..fbce753 100644
--- a/dedocutils/text_detection_recognition/tesseract_detector_recognizer.py
+++ b/dedocutils/text_detection_recognition/tesseract_detector_recognizer.py
@@ -10,15 +10,15 @@
 
 class TesseractDetectorRecognizer(AbstractDetectorRecognizer):
 
-    def __init__(self, config: Optional[str] = None) -> None:
-        self.config = config if config is not None else "--psm 3"
+    def __init__(self, config: str = "--psm 3") -> None:
+        self.config = config
 
     def detect_recognize(self, image: np.ndarray, parameters: Optional[dict] = None) -> List[TextWithBBox]:
         parameters = {} if parameters is None else parameters
         lang = parameters.get("language", "rus+eng")
 
         data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=self.config)
-        words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
+        words, left, top, width, height, levels = data["text"], data["left"], data["top"], data["width"], data["height"], data["level"]
 
         # filter empty words and corresponding coordinates
         irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
@@ -27,12 +27,13 @@ def detect_recognize(self, image: np.ndarray, parameters: Optional[dict] = None)
         top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
         width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
         height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
-
+        levels = [level for idx, level in enumerate(levels) if idx not in irrelevant_indices]
         assert len(words) == len(left) == len(top) == len(width) == len(height), "Number of words and their coordinates should be equal"
 
         text_with_bbox_list = []
-        for w, x, y, w, h in zip(words, left, top, width, height):
-            twb = TextWithBBox(text=w, bbox=BBox(x_top_left=x, y_top_left=y, width=w, height=h))
-            text_with_bbox_list.append(twb)
+        for word, x, y, w, h, level in zip(words, left, top, width, height, levels):  # "word" must not reuse the width name "w"
+            if level == 5:  # NOTE(review): presumably Tesseract's word level in the TSV output - confirm
+                twb = TextWithBBox(text=word, bbox=BBox(x_top_left=x, y_top_left=y, width=w, height=h))
+                text_with_bbox_list.append(twb)
 
         return text_with_bbox_list
diff --git a/dedocutils/text_recognition/tesseract_text_recognizer.py b/dedocutils/text_recognition/tesseract_text_recognizer.py
index 00ea0d7..2960b71 100644
--- a/dedocutils/text_recognition/tesseract_text_recognizer.py
+++ b/dedocutils/text_recognition/tesseract_text_recognizer.py
@@ -8,12 +8,12 @@
 
 class TesseractTextRecognizer(AbstractTextRecognizer):
 
-    def __init__(self, config: Optional[str] = None) -> None:
-        self.config = config if config is not None else "--psm 6"
+    def __init__(self, config: str = "--psm 6") -> None:
+        self.config = config
 
     def recognize(self, image: np.ndarray, parameters: Optional[dict] = None) -> str:
         parameters = {} if parameters is None else parameters
         lang = parameters.get("language", "rus+eng")
 
-        text = pytesseract.pytesseract.image_to_string(image, lang=lang, config=self.config)
+        text = pytesseract.image_to_string(image, lang=lang, config=self.config)
         return text
diff --git a/tests/unit_tests/test_classes.py b/tests/unit_tests/test_classes.py
index f199805..0ae29b8 100644
--- a/tests/unit_tests/test_classes.py
+++ b/tests/unit_tests/test_classes.py
@@ -3,7 +3,7 @@
 
 import cv2
 
-from dedocutils.text_detection import DoctrTextDetector
+from dedocutils.text_detection import DoctrTextDetector, TesseractTextDetector
 
 
 class TestClasses(unittest.TestCase):
@@ -13,3 +13,10 @@ def test_text_detection(self) -> None:
         text_detector = DoctrTextDetector()
         bboxes = text_detector.detect(cv2.imread(file_path))
         self.assertTrue(len(bboxes) > 0)
+
+    @unittest.skip  # NOTE(review): skipped unconditionally with no reason given - presumably needs a Tesseract install; confirm
+    def test_tesseract_text_detector(self) -> None:  # smoke test: the sample image must yield at least one bbox
+        file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "document_example.png"))
+        text_detector = TesseractTextDetector()
+        bboxes = text_detector.detect(cv2.imread(file_path))
+        self.assertTrue(len(bboxes) > 0)