Skip to content

Commit

Permalink
TesseractTextDetector added (#4)
Browse files Browse the repository at this point in the history
Co-authored-by: Nikita Shevtsov <[email protected]>
Co-authored-by: Nasty <[email protected]>
  • Loading branch information
3 people authored Aug 1, 2023
1 parent e2f5553 commit d0201e5
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 30 deletions.
23 changes: 6 additions & 17 deletions .github/check_version.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,7 @@
import argparse
import re
from typing import Pattern


def _version_key(version: str) -> tuple:
    """
    Convert a dotted numeric version string into a tuple that compares numerically.

    Trailing zero components are dropped so that "0.2" and "0.2.0" produce the same key.

    :param version: version string such as "0.10" or "1.2.3"
    :return: tuple of integer components, e.g. (0, 10) or (1, 2, 3)
    :raises ValueError: if any dot-separated component is not an integer
    """
    parts = [int(part) for part in version.split(".")]
    while parts and parts[-1] == 0:
        parts.pop()
    return tuple(parts)


def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern) -> bool:
    """
    Check that a new release version is well-formed, matches its tag and is newer than the old one.

    :param version: new version string, must match `regexp`
    :param tag: release tag, expected to be the new version prefixed with "v"
    :param old_version: previously released version string
    :param regexp: compiled pattern the new version has to match
    :return: True if all three checks pass, False otherwise (a message is printed for the first two checks)
    """
    if regexp.match(version) is None:
        print("New version doesn't match the pattern")  # noqa
        return False

    if not (tag.startswith("v") and tag[1:] == version):
        print("Tag value should be equal to version with `v` in the beginning")  # noqa
        return False

    try:
        # compare component-wise as integers: plain string comparison orders "0.10" before "0.9"
        return _version_key(old_version) < _version_key(version)
    except ValueError:
        # at least one version is not purely numeric: keep the plain string comparison
        return old_version < version

from pkg_resources import parse_version

if __name__ == "__main__":
parser = argparse.ArgumentParser()
Expand All @@ -27,7 +13,10 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern
print(f"Old version: {args.old_version}, new version: {args.new_version}, tag: {args.tag}") # noqa

version_pattern = re.compile(r"^\d+\.\d+(\.\d+)?$")
correct = is_correct_version(args.new_version, args.tag, args.old_version, version_pattern)
match = version_pattern.match(args.new_version)

assert match is not None, "New version doesn't match the pattern"
assert args.tag.startswith("v") and args.tag[1:] == args.new_version, "Tag value should be equal to version with `v` in the beginning"
assert parse_version(args.old_version) < parse_version(args.new_version), "New version should be greater than old version"

assert correct
print("Version is correct") # noqa
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Changelog
=========

v0.2 (2023-08-01)
-------------------
* TesseractTextDetector is added

v0.1 (2023-07-26)
-------------------
* First version of the library
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.1
0.2
3 changes: 2 additions & 1 deletion dedocutils/text_detection/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .abstract_text_detector import AbstractTextDetector
from .doctr_text_detector.doctr_text_detector import DoctrTextDetector
from .tesseract_text_detector import TesseractTextDetector

__all__ = ["AbstractTextDetector", "DoctrTextDetector"]
__all__ = ["AbstractTextDetector", "DoctrTextDetector", "TesseractTextDetector"]
28 changes: 28 additions & 0 deletions dedocutils/text_detection/tesseract_text_detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from typing import List, Optional

import numpy as np
import pytesseract

from dedocutils.data_structures import BBox
from dedocutils.text_detection import AbstractTextDetector


class TesseractTextDetector(AbstractTextDetector):
    """
    Text detector that runs Tesseract through pytesseract and returns bounding boxes of the found words.
    """

    def __init__(self, config: str = "--psm 3") -> None:
        """
        :param config: option string forwarded as `config` to every pytesseract call
        """
        self.config = config

    def detect(self, image: np.ndarray, parameters: Optional[dict] = None) -> List[BBox]:
        """
        Find word bounding boxes on the image.

        :param image: image to run the detection on
        :param parameters: optional settings; the "language" key selects Tesseract languages ("rus+eng" by default)
        :return: list of bounding boxes, one for each word-level entry in the Tesseract output
        """
        if parameters is None:
            parameters = {}
        language = parameters.get("language", "rus+eng")

        ocr_data = pytesseract.image_to_data(image, lang=language, output_type="dict", config=self.config)
        rows = zip(ocr_data["left"], ocr_data["top"], ocr_data["width"], ocr_data["height"], ocr_data["level"])

        # Tesseract reports page/block/paragraph/line/word entries as levels 1-5, only words (level 5) are kept
        return [BBox(x_top_left=x, y_top_left=y, width=w, height=h) for x, y, w, h, level in rows if level == 5]
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@

class TesseractDetectorRecognizer(AbstractDetectorRecognizer):

def __init__(self, config: Optional[str] = None) -> None:
self.config = config if config is not None else "--psm 3"
def __init__(self, config: str = "--psm 3") -> None:
self.config = config

def detect_recognize(self, image: np.ndarray, parameters: Optional[dict] = None) -> List[TextWithBBox]:
parameters = {} if parameters is None else parameters
lang = parameters.get("language", "rus+eng")

data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=self.config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
words, left, top, width, height, levels = data["text"], data["left"], data["top"], data["width"], data["height"], data["level"]

# filter empty words and corresponding coordinates
irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
Expand All @@ -27,12 +27,13 @@ def detect_recognize(self, image: np.ndarray, parameters: Optional[dict] = None)
top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]

levels = [level for idx, level in enumerate(levels) if idx not in irrelevant_indices]
assert len(words) == len(left) == len(top) == len(width) == len(height), "Number of words and their coordinates should be equal"

text_with_bbox_list = []
for w, x, y, w, h in zip(words, left, top, width, height):
twb = TextWithBBox(text=w, bbox=BBox(x_top_left=x, y_top_left=y, width=w, height=h))
text_with_bbox_list.append(twb)
for w, x, y, w, h, level in zip(words, left, top, width, height, levels):
if level == 5:
twb = TextWithBBox(text=w, bbox=BBox(x_top_left=x, y_top_left=y, width=w, height=h))
text_with_bbox_list.append(twb)

return text_with_bbox_list
6 changes: 3 additions & 3 deletions dedocutils/text_recognition/tesseract_text_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@

class TesseractTextRecognizer(AbstractTextRecognizer):

def __init__(self, config: Optional[str] = None) -> None:
self.config = config if config is not None else "--psm 6"
def __init__(self, config: str = "--psm 6") -> None:
self.config = config

def recognize(self, image: np.ndarray, parameters: Optional[dict] = None) -> str:
parameters = {} if parameters is None else parameters
lang = parameters.get("language", "rus+eng")

text = pytesseract.pytesseract.image_to_string(image, lang=lang, config=self.config)
text = pytesseract.image_to_string(image, lang=lang, config=self.config)
return text
9 changes: 8 additions & 1 deletion tests/unit_tests/test_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import cv2

from dedocutils.text_detection import DoctrTextDetector
from dedocutils.text_detection import DoctrTextDetector, TesseractTextDetector


class TestClasses(unittest.TestCase):
Expand All @@ -13,3 +13,10 @@ def test_text_detection(self) -> None:
text_detector = DoctrTextDetector()
bboxes = text_detector.detect(cv2.imread(file_path))
self.assertTrue(len(bboxes) > 0)

# NOTE(review): skipped without a stated reason - presumably it needs a local Tesseract installation; confirm
@unittest.skip
def test_tesseract_text_detector(self) -> None:
    """Check that TesseractTextDetector finds at least one bounding box on the example document image."""
    tests_dir = os.path.join(os.path.dirname(__file__), "..")
    image_path = os.path.abspath(os.path.join(tests_dir, "data", "document_example.png"))

    detector = TesseractTextDetector()
    found_boxes = detector.detect(cv2.imread(image_path))

    self.assertTrue(len(found_boxes) > 0)

0 comments on commit d0201e5

Please sign in to comment.