Skip to content

Commit

Permalink
assertions for all segmentation tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
bertsky committed Apr 30, 2024
1 parent b70e864 commit 10a21ff
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 0 deletions.
16 changes: 16 additions & 0 deletions test/test_segment_line.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from ocrd_tesserocr import TesserocrSegmentRegion
from ocrd_tesserocr import TesserocrSegmentLine
from ocrd_tesserocr import TesserocrSegment
from ocrd_modelfactory import page_from_file
from ocrd_utils import MIMETYPE_PAGE

def test_run_modular(workspace_herold_small):
TesserocrSegmentRegion(
Expand All @@ -14,6 +16,13 @@ def test_run_modular(workspace_herold_small):
input_file_grp="OCR-D-SEG-BLOCK",
output_file_grp="OCR-D-SEG-LINE"
).process()
out_files = list(workspace_herold_small.find_files(
fileGrp="OCR-D-SEG-LINE", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_lines = out_pcgts.get_Page().get_AllTextLines()
assert len(out_lines)
workspace_herold_small.save_mets()

def test_run_allinone(workspace_herold_small):
Expand All @@ -22,4 +31,11 @@ def test_run_allinone(workspace_herold_small):
input_file_grp="OCR-D-IMG",
output_file_grp="OCR-D-SEG"
).process()
out_files = list(workspace_herold_small.find_files(
fileGrp="OCR-D-SEG", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_lines = out_pcgts.get_Page().get_AllTextLines()
assert len(out_lines)
workspace_herold_small.save_mets()
30 changes: 30 additions & 0 deletions test/test_segment_region.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@
from ocrd_tesserocr import TesserocrSegmentRegion
from ocrd_modelfactory import page_from_file
from ocrd_utils import MIMETYPE_PAGE

def test_run(workspace_herold_small):
    """Run region segmentation and check that the PAGE output is usable.

    Processes file group ``OCR-D-IMG`` into ``OCR-D-SEG-BLOCK`` with
    ``TesserocrSegmentRegion``, then asserts that at least one PAGE file
    exists in the output group for page ``PHYS_0001``, that the first such
    file parses, and that it yields at least one region when queried for
    class ``Text``. Finally calls ``save_mets()`` on the workspace.
    """
    processor = TesserocrSegmentRegion(
        workspace_herold_small,
        input_file_grp="OCR-D-IMG",
        output_file_grp="OCR-D-SEG-BLOCK",
    )
    processor.process()

    # Materialize the lookup result so it can be measured and indexed.
    page_files = list(
        workspace_herold_small.find_files(
            fileGrp="OCR-D-SEG-BLOCK",
            pageId="PHYS_0001",
            mimetype=MIMETYPE_PAGE,
        )
    )
    assert len(page_files) > 0

    pcgts = page_from_file(page_files[0])
    assert pcgts is not None

    page = pcgts.get_Page()
    text_regions = page.get_AllRegions(classes=['Text'])
    assert len(text_regions) > 0

    workspace_herold_small.save_mets()

def test_run_shrink(workspace_herold_small):
Expand All @@ -15,6 +24,13 @@ def test_run_shrink(workspace_herold_small):
output_file_grp="OCR-D-SEG-BLOCK",
parameter={'shrink_polygons': True}
).process()
out_files = list(workspace_herold_small.find_files(
fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text'])
assert len(out_blocks)
workspace_herold_small.save_mets()

def test_run_sparse(workspace_herold_small):
Expand All @@ -24,6 +40,13 @@ def test_run_sparse(workspace_herold_small):
output_file_grp="OCR-D-SEG-BLOCK",
parameter={'sparse_text': True}
).process()
out_files = list(workspace_herold_small.find_files(
fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text'])
assert len(out_blocks)
workspace_herold_small.save_mets()

def test_run_staves(workspace_herold_small):
Expand All @@ -33,4 +56,11 @@ def test_run_staves(workspace_herold_small):
output_file_grp="OCR-D-SEG-BLOCK",
parameter={'find_staves': True, 'find_tables': False}
).process()
out_files = list(workspace_herold_small.find_files(
fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text'])
assert len(out_blocks)
workspace_herold_small.save_mets()
10 changes: 10 additions & 0 deletions test/test_segment_word.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from ocrd_tesserocr import TesserocrSegmentRegion
from ocrd_tesserocr import TesserocrSegmentLine
from ocrd_tesserocr import TesserocrSegmentWord
from ocrd_modelfactory import page_from_file
from ocrd_utils import MIMETYPE_PAGE

def test_run_modular(workspace_kant_binarized):
TesserocrSegmentRegion(
Expand All @@ -18,4 +20,12 @@ def test_run_modular(workspace_kant_binarized):
input_file_grp="OCR-D-SEG-LINE",
output_file_grp="OCR-D-SEG-WORD"
).process()
out_files = list(workspace_kant_binarized.find_files(
fileGrp="OCR-D-SEG-WORD", pageId="P_0017", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_lines = out_pcgts.get_Page().get_AllTextLines()
assert len(out_lines)
assert all(len(line.get_Word()) for line in out_lines)
workspace_kant_binarized.save_mets()

0 comments on commit 10a21ff

Please sign in to comment.