Skip to content

Commit

Permalink
move tesseract env; move constant
Browse files: browse the repository at this point in the history
  • Loading branch information
yuming-long committed Oct 4, 2023
1 parent cd82e31 commit 5cdf327
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 3 deletions.
8 changes: 5 additions & 3 deletions unstructured/partition/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
# unstructured.documents.elements.Image
from PIL import Image as PILImage
from PIL import ImageSequence

from unstructured.partition.utils.constants import OCRMode
from unstructured_inference.inference.elements import (
Rectangle,
TextRegion,
Expand All @@ -24,8 +22,12 @@
from unstructured_pytesseract import Output

from unstructured.logger import logger
from unstructured.partition.utils.constants import SUBREGION_THRESHOLD_FOR_OCR, OCRMode

SUBREGION_THRESHOLD_FOR_OCR = 0.5
# Force tesseract to be single-threaded;
# otherwise we see major performance problems.
if "OMP_THREAD_LIMIT" not in os.environ:
    os.environ["OMP_THREAD_LIMIT"] = "1"


def process_data_with_ocr(
Expand Down
2 changes: 2 additions & 0 deletions unstructured/partition/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ class OCRMode(Enum):

SORT_MODE_XY_CUT = "xy-cut"
SORT_MODE_BASIC = "basic"

SUBREGION_THRESHOLD_FOR_OCR = 0.5

0 comments on commit 5cdf327

Please sign in to comment.