Skip to content

Commit

Permalink
Merge pull request #208 from bertsky/subclass-docstrings
Browse files Browse the repository at this point in the history
Subclass docstrings
  • Loading branch information
bertsky committed May 1, 2024
2 parents ef3762e + 10a21ff commit 4e42262
Show file tree
Hide file tree
Showing 11 changed files with 132 additions and 12 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ test: test/assets deps-test
#$(PYTHON) -m pytest -n auto --continue-on-collection-errors test $(PYTEST_ARGS)
# workaround for pytest-xdist not isolating setenv calls in click.CliRunner from each other:
$(PYTHON) -m pytest --continue-on-collection-errors test/test_cli.py $(PYTEST_ARGS)
$(PYTHON) -m pytest --continue-on-collection-errors test/test_{segment_{region,line,word},recognize}.py $(PYTEST_ARGS)
$(PYTHON) -m pytest --continue-on-collection-errors test/test_{segment_{region,table,line,word},recognize}.py $(PYTEST_ARGS)

# Run unit tests and determine test coverage
coverage:
Expand Down
4 changes: 3 additions & 1 deletion ocrd_tesserocr/segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def __init__(self, *args, **kwargs):
assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid
self.logger = getLogger('processor.TesserocrSegment')

TesserocrSegment.process.__doc__ = """Performs region and line segmentation with Tesseract on the workspace.
def process(self):
"""Performs region and line segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
and remove any existing Region and ReadingOrder elements.
Expand Down Expand Up @@ -58,3 +59,4 @@ def __init__(self, *args, **kwargs):
Produce a new output file by serialising the resulting hierarchy.
"""
super(TesserocrSegment, self).process()
4 changes: 3 additions & 1 deletion ocrd_tesserocr/segment_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs):
assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid
self.logger = getLogger('processor.TesserocrSegmentLine')

TesserocrSegmentLine.process.__doc__ = """Performs (text) line segmentation with Tesseract on the workspace.
def process(self):
"""Performs (text) line segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the (text) region level,
Expand All @@ -40,3 +41,4 @@ def __init__(self, *args, **kwargs):
Produce a new output file by serialising the resulting hierarchy.
"""
super(TesserocrSegmentLine, self).process()
4 changes: 3 additions & 1 deletion ocrd_tesserocr/segment_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ def __init__(self, *args, **kwargs):
assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid
self.logger = getLogger('processor.TesserocrSegmentRegion')

TesserocrSegmentRegion.process.__doc__ = """Performs region segmentation with Tesseract on the workspace.
def process(self):
"""Performs region segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
and remove any existing Region and ReadingOrder elements
Expand All @@ -48,3 +49,4 @@ def __init__(self, *args, **kwargs):
Produce a new output file by serialising the resulting hierarchy.
"""
super(TesserocrSegmentRegion, self).process()
4 changes: 3 additions & 1 deletion ocrd_tesserocr/segment_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs):
assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid
self.logger = getLogger('processor.TesserocrSegmentTable')

TesserocrSegmentTable.process.__doc__ = """Performs table cell segmentation with Tesseract on the workspace.
def process(self):
"""Performs table cell segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the region level
Expand All @@ -36,3 +37,4 @@ def __init__(self, *args, **kwargs):
Produce a new output file by serialising the resulting hierarchy.
"""
super(TesserocrSegmentTable, self).process()
4 changes: 3 additions & 1 deletion ocrd_tesserocr/segment_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs):
assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid
self.logger = getLogger('processor.TesserocrSegmentWord')

TesserocrSegmentWord.process.__doc__ = """Performs word segmentation with Tesseract on the workspace.
def process(self):
"""Performs word segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the textline level,
Expand All @@ -39,3 +40,4 @@ def __init__(self, *args, **kwargs):
Produce a new output file by serialising the resulting hierarchy.
"""
super(TesserocrSegmentWord, self).process()
19 changes: 13 additions & 6 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,23 @@

METS_KANT_BINARIZED = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
METS_GUTACHTEN = assets.url_of('gutachten/data/mets.xml')

@fixture
def workspace_kant_binarized():
def workspace_kant_binarized(tmpdir):
initLogging()
with pushd_popd(tempdir=True) as tempdir:
yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir, download=True)
with pushd_popd(tmpdir):
yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tmpdir, download=True)

@fixture
def workspace_herold_small():
def workspace_herold_small(tmpdir):
initLogging()
with pushd_popd(tempdir=True) as tempdir:
yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir, download=True)
with pushd_popd(tmpdir):
yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tmpdir, download=True)

@fixture
def workspace_gutachten(tmpdir):
    """Yield a workspace built from the ``gutachten`` METS inside ``tmpdir``.

    Logging is initialised first; the working directory is switched to
    ``tmpdir`` via ``pushd_popd`` for as long as the fixture is in use,
    and the workspace is resolved with ``download=True``.
    """
    initLogging()
    with pushd_popd(tmpdir):
        resolver = Resolver()
        workspace = resolver.workspace_from_url(
            METS_GUTACHTEN,
            dst_dir=tmpdir,
            download=True,
        )
        yield workspace

16 changes: 16 additions & 0 deletions test/test_segment_line.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from ocrd_tesserocr import TesserocrSegmentRegion
from ocrd_tesserocr import TesserocrSegmentLine
from ocrd_tesserocr import TesserocrSegment
from ocrd_modelfactory import page_from_file
from ocrd_utils import MIMETYPE_PAGE

def test_run_modular(workspace_herold_small):
TesserocrSegmentRegion(
Expand All @@ -14,6 +16,13 @@ def test_run_modular(workspace_herold_small):
input_file_grp="OCR-D-SEG-BLOCK",
output_file_grp="OCR-D-SEG-LINE"
).process()
out_files = list(workspace_herold_small.find_files(
fileGrp="OCR-D-SEG-LINE", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_lines = out_pcgts.get_Page().get_AllTextLines()
assert len(out_lines)
workspace_herold_small.save_mets()

def test_run_allinone(workspace_herold_small):
Expand All @@ -22,4 +31,11 @@ def test_run_allinone(workspace_herold_small):
input_file_grp="OCR-D-IMG",
output_file_grp="OCR-D-SEG"
).process()
out_files = list(workspace_herold_small.find_files(
fileGrp="OCR-D-SEG", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_lines = out_pcgts.get_Page().get_AllTextLines()
assert len(out_lines)
workspace_herold_small.save_mets()
30 changes: 30 additions & 0 deletions test/test_segment_region.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@
from ocrd_tesserocr import TesserocrSegmentRegion
from ocrd_modelfactory import page_from_file
from ocrd_utils import MIMETYPE_PAGE

def test_run(workspace_herold_small):
    """Run region segmentation and check the PAGE result has text regions.

    Processes ``OCR-D-IMG`` into ``OCR-D-SEG-BLOCK``, then loads the first
    PAGE file found for page ``PHYS_0001`` and asserts that it contains at
    least one region of class ``Text`` before saving the METS.
    """
    TesserocrSegmentRegion(
        workspace_herold_small,
        input_file_grp="OCR-D-IMG",
        output_file_grp="OCR-D-SEG-BLOCK",
    ).process()

    # The processor must have written a PAGE file for the page.
    page_files = list(
        workspace_herold_small.find_files(
            fileGrp="OCR-D-SEG-BLOCK",
            pageId="PHYS_0001",
            mimetype=MIMETYPE_PAGE,
        )
    )
    assert len(page_files) > 0

    pcgts = page_from_file(page_files[0])
    assert pcgts is not None

    # Segmentation must have produced at least one text region.
    text_regions = pcgts.get_Page().get_AllRegions(classes=['Text'])
    assert len(text_regions) > 0

    workspace_herold_small.save_mets()

def test_run_shrink(workspace_herold_small):
Expand All @@ -15,6 +24,13 @@ def test_run_shrink(workspace_herold_small):
output_file_grp="OCR-D-SEG-BLOCK",
parameter={'shrink_polygons': True}
).process()
out_files = list(workspace_herold_small.find_files(
fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text'])
assert len(out_blocks)
workspace_herold_small.save_mets()

def test_run_sparse(workspace_herold_small):
Expand All @@ -24,6 +40,13 @@ def test_run_sparse(workspace_herold_small):
output_file_grp="OCR-D-SEG-BLOCK",
parameter={'sparse_text': True}
).process()
out_files = list(workspace_herold_small.find_files(
fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text'])
assert len(out_blocks)
workspace_herold_small.save_mets()

def test_run_staves(workspace_herold_small):
Expand All @@ -33,4 +56,11 @@ def test_run_staves(workspace_herold_small):
output_file_grp="OCR-D-SEG-BLOCK",
parameter={'find_staves': True, 'find_tables': False}
).process()
out_files = list(workspace_herold_small.find_files(
fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text'])
assert len(out_blocks)
workspace_herold_small.save_mets()
47 changes: 47 additions & 0 deletions test/test_segment_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from ocrd_tesserocr import TesserocrSegment, TesserocrSegmentRegion, TesserocrSegmentTable
from ocrd_modelfactory import page_from_file
from ocrd_utils import MIMETYPE_PAGE

def test_run_modular(workspace_gutachten):
    """Chain region and table segmentation, then check for table regions.

    Region segmentation runs on ``IMG`` with ``find_tables`` and
    ``overwrite_regions`` enabled; table segmentation then runs on its
    output. The first PAGE file found in ``OCR-D-SEG-CELL`` for page
    ``PHYS_1`` must contain at least one region of class ``Table``.
    """
    TesserocrSegmentRegion(
        workspace_gutachten,
        input_file_grp="IMG",
        output_file_grp="OCR-D-SEG-BLOCK",
        parameter={'find_tables': True, 'overwrite_regions': True},
    ).process()
    TesserocrSegmentTable(
        workspace_gutachten,
        input_file_grp="OCR-D-SEG-BLOCK",
        output_file_grp="OCR-D-SEG-CELL",
    ).process()

    # The second step must have written a PAGE file for the page.
    page_files = list(
        workspace_gutachten.find_files(
            fileGrp="OCR-D-SEG-CELL",
            pageId="PHYS_1",
            mimetype=MIMETYPE_PAGE,
        )
    )
    assert len(page_files) > 0

    pcgts = page_from_file(page_files[0])
    assert pcgts is not None

    # Table regions must be present in the final result.
    table_regions = pcgts.get_Page().get_AllRegions(classes=['Table'])
    assert len(table_regions) > 0

    workspace_gutachten.save_mets()

def test_run_allinone(workspace_gutachten):
    """Run all-in-one segmentation, then table segmentation, and check tables.

    ``TesserocrSegment`` runs on ``IMG`` with ``find_tables`` enabled;
    ``TesserocrSegmentTable`` then runs on its output with
    ``overwrite_cells`` enabled. The first PAGE file found in
    ``OCR-D-SEG-CELL`` for page ``PHYS_1`` must contain at least one
    region of class ``Table``.
    """
    TesserocrSegment(
        workspace_gutachten,
        input_file_grp="IMG",
        output_file_grp="OCR-D-SEG",
        parameter={'find_tables': True}  # , 'textequiv_level': 'cell'
    ).process()
    TesserocrSegmentTable(
        workspace_gutachten,
        input_file_grp="OCR-D-SEG",
        output_file_grp="OCR-D-SEG-CELL",
        parameter={'overwrite_cells': True},
    ).process()

    # The second step must have written a PAGE file for the page.
    page_files = list(
        workspace_gutachten.find_files(
            fileGrp="OCR-D-SEG-CELL",
            pageId="PHYS_1",
            mimetype=MIMETYPE_PAGE,
        )
    )
    assert len(page_files) > 0

    pcgts = page_from_file(page_files[0])
    assert pcgts is not None

    # Table regions must be present in the final result.
    table_regions = pcgts.get_Page().get_AllRegions(classes=['Table'])
    assert len(table_regions) > 0

    workspace_gutachten.save_mets()

10 changes: 10 additions & 0 deletions test/test_segment_word.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from ocrd_tesserocr import TesserocrSegmentRegion
from ocrd_tesserocr import TesserocrSegmentLine
from ocrd_tesserocr import TesserocrSegmentWord
from ocrd_modelfactory import page_from_file
from ocrd_utils import MIMETYPE_PAGE

def test_run_modular(workspace_kant_binarized):
TesserocrSegmentRegion(
Expand All @@ -18,4 +20,12 @@ def test_run_modular(workspace_kant_binarized):
input_file_grp="OCR-D-SEG-LINE",
output_file_grp="OCR-D-SEG-WORD"
).process()
out_files = list(workspace_kant_binarized.find_files(
fileGrp="OCR-D-SEG-WORD", pageId="P_0017", mimetype=MIMETYPE_PAGE))
assert len(out_files)
out_pcgts = page_from_file(out_files[0])
assert out_pcgts is not None
out_lines = out_pcgts.get_Page().get_AllTextLines()
assert len(out_lines)
assert all(len(line.get_Word()) for line in out_lines)
workspace_kant_binarized.save_mets()

0 comments on commit 4e42262

Please sign in to comment.