Merge pull request #208 from bertsky/subclass-docstrings

Subclass docstrings
OCR-D · May 1, 2024 · 4e42262
2 parents ef3762e + 10a21ff
commit 4e42262
Show file tree

Hide file tree

Showing 11 changed files with 132 additions and 12 deletions.
diff --git a/Makefile b/Makefile
@@ -145,7 +145,7 @@ test: test/assets deps-test
 	#$(PYTHON) -m pytest -n auto --continue-on-collection-errors test $(PYTEST_ARGS)
 	# workaround for pytest-xdist not isolating setenv calls in click.CliRunner from each other:
 	$(PYTHON) -m pytest --continue-on-collection-errors test/test_cli.py $(PYTEST_ARGS)
-	$(PYTHON) -m pytest --continue-on-collection-errors test/test_{segment_{region,line,word},recognize}.py $(PYTEST_ARGS)
+	$(PYTHON) -m pytest --continue-on-collection-errors test/test_{segment_{region,table,line,word},recognize}.py $(PYTEST_ARGS)
 
 # Run unit tests and determine test coverage
 coverage:

diff --git a/ocrd_tesserocr/segment.py b/ocrd_tesserocr/segment.py
@@ -21,7 +21,8 @@ def __init__(self, *args, **kwargs):
             assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid
             self.logger = getLogger('processor.TesserocrSegment')
 
-TesserocrSegment.process.__doc__ = """Performs region and line segmentation with Tesseract on the workspace.
+    def process(self):
+        """Performs region and line segmentation with Tesseract on the workspace.
         
         Open and deserialize PAGE input files and their respective images,
         and remove any existing Region and ReadingOrder elements.
@@ -58,3 +59,4 @@ def __init__(self, *args, **kwargs):
         
         Produce a new output file by serialising the resulting hierarchy.
         """
+        super(TesserocrSegment, self).process()
diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py
@@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs):
             assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid
             self.logger = getLogger('processor.TesserocrSegmentLine')
 
-TesserocrSegmentLine.process.__doc__ = """Performs (text) line segmentation with Tesseract on the workspace.
+    def process(self):
+        """Performs (text) line segmentation with Tesseract on the workspace.
         
         Open and deserialize PAGE input files and their respective images,
         then iterate over the element hierarchy down to the (text) region level,
@@ -40,3 +41,4 @@ def __init__(self, *args, **kwargs):
         
         Produce a new output file by serialising the resulting hierarchy.
         """
+        super(TesserocrSegmentLine, self).process()
diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py
@@ -24,7 +24,8 @@ def __init__(self, *args, **kwargs):
             assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid
             self.logger = getLogger('processor.TesserocrSegmentRegion')
 
-TesserocrSegmentRegion.process.__doc__ = """Performs region segmentation with Tesseract on the workspace.
+    def process(self):
+        """Performs region segmentation with Tesseract on the workspace.
         
         Open and deserialize PAGE input files and their respective images,
         and remove any existing Region and ReadingOrder elements
@@ -48,3 +49,4 @@ def __init__(self, *args, **kwargs):
         
         Produce a new output file by serialising the resulting hierarchy.
         """
+        super(TesserocrSegmentRegion, self).process()
diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py
@@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs):
             assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid
             self.logger = getLogger('processor.TesserocrSegmentTable')
 
-TesserocrSegmentTable.process.__doc__ = """Performs table cell segmentation with Tesseract on the workspace.
+    def process(self):
+        """Performs table cell segmentation with Tesseract on the workspace.
         
         Open and deserialize PAGE input files and their respective images,
         then iterate over the element hierarchy down to the region level
@@ -36,3 +37,4 @@ def __init__(self, *args, **kwargs):
         
         Produce a new output file by serialising the resulting hierarchy.
         """
+        super(TesserocrSegmentTable, self).process()
diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py
@@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs):
             assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid
             self.logger = getLogger('processor.TesserocrSegmentWord')
 
-TesserocrSegmentWord.process.__doc__ = """Performs word segmentation with Tesseract on the workspace.
+    def process(self):
+        """Performs word segmentation with Tesseract on the workspace.
         
         Open and deserialize PAGE input files and their respective images,
         then iterate over the element hierarchy down to the textline level,
@@ -39,3 +40,4 @@ def __init__(self, *args, **kwargs):
         
         Produce a new output file by serialising the resulting hierarchy.
         """
+        super(TesserocrSegmentWord, self).process()
diff --git a/test/conftest.py b/test/conftest.py
@@ -6,16 +6,23 @@
 
 METS_KANT_BINARIZED = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
 METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
+METS_GUTACHTEN = assets.url_of('gutachten/data/mets.xml')
 
 @fixture
-def workspace_kant_binarized():
+def workspace_kant_binarized(tmpdir):
     initLogging()
-    with pushd_popd(tempdir=True) as tempdir:
-        yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir, download=True)
+    with pushd_popd(tmpdir):
+        yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tmpdir, download=True)
 
 @fixture
-def workspace_herold_small():
+def workspace_herold_small(tmpdir):
     initLogging()
-    with pushd_popd(tempdir=True) as tempdir:
-        yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir, download=True)
+    with pushd_popd(tmpdir):
+        yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tmpdir, download=True)
+
+@fixture  # yields a workspace built from METS_GUTACHTEN (the 'gutachten' asset's mets.xml)
+def workspace_gutachten(tmpdir):  # tmpdir: presumably pytest's per-test temporary directory fixture — confirm
+    initLogging()  # logging is set up first, before the workspace is created
+    with pushd_popd(tmpdir):  # presumably makes tmpdir the working directory for the duration of the test — confirm
+        yield Resolver().workspace_from_url(METS_GUTACHTEN, dst_dir=tmpdir, download=True)  # dst_dir is tmpdir, with download=True
 
diff --git a/test/test_segment_line.py b/test/test_segment_line.py
@@ -1,6 +1,8 @@
 from ocrd_tesserocr import TesserocrSegmentRegion
 from ocrd_tesserocr import TesserocrSegmentLine
 from ocrd_tesserocr import TesserocrSegment
+from ocrd_modelfactory import page_from_file
+from ocrd_utils import MIMETYPE_PAGE
 
 def test_run_modular(workspace_herold_small):
     TesserocrSegmentRegion(
@@ -14,6 +16,13 @@ def test_run_modular(workspace_herold_small):
         input_file_grp="OCR-D-SEG-BLOCK",
         output_file_grp="OCR-D-SEG-LINE"
     ).process()
+    out_files = list(workspace_herold_small.find_files(
+        fileGrp="OCR-D-SEG-LINE", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
+    assert len(out_files)
+    out_pcgts = page_from_file(out_files[0])
+    assert out_pcgts is not None
+    out_lines = out_pcgts.get_Page().get_AllTextLines()
+    assert len(out_lines)
     workspace_herold_small.save_mets()
 
 def test_run_allinone(workspace_herold_small):
@@ -22,4 +31,11 @@ def test_run_allinone(workspace_herold_small):
         input_file_grp="OCR-D-IMG",
         output_file_grp="OCR-D-SEG"
     ).process()
+    out_files = list(workspace_herold_small.find_files(
+        fileGrp="OCR-D-SEG", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
+    assert len(out_files)
+    out_pcgts = page_from_file(out_files[0])
+    assert out_pcgts is not None
+    out_lines = out_pcgts.get_Page().get_AllTextLines()
+    assert len(out_lines)
     workspace_herold_small.save_mets()
diff --git a/test/test_segment_region.py b/test/test_segment_region.py
@@ -1,11 +1,20 @@
 from ocrd_tesserocr import TesserocrSegmentRegion
+from ocrd_modelfactory import page_from_file
+from ocrd_utils import MIMETYPE_PAGE
 
 def test_run(workspace_herold_small):
     TesserocrSegmentRegion(
         workspace_herold_small,
         input_file_grp="OCR-D-IMG",
         output_file_grp="OCR-D-SEG-BLOCK"
     ).process()
+    out_files = list(workspace_herold_small.find_files(  # look up the output group's files for page PHYS_0001
+        fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
+    assert len(out_files)  # at least one file of type MIMETYPE_PAGE must exist in the output group
+    out_pcgts = page_from_file(out_files[0])  # parse the first match
+    assert out_pcgts is not None
+    out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text'])  # restrict to regions of class 'Text'
+    assert len(out_blocks)  # the result must contain at least one such region
     workspace_herold_small.save_mets()
 
 def test_run_shrink(workspace_herold_small):
@@ -15,6 +24,13 @@ def test_run_shrink(workspace_herold_small):
         output_file_grp="OCR-D-SEG-BLOCK",
         parameter={'shrink_polygons': True}
     ).process()
+    out_files = list(workspace_herold_small.find_files(
+        fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
+    assert len(out_files)
+    out_pcgts = page_from_file(out_files[0])
+    assert out_pcgts is not None
+    out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text'])
+    assert len(out_blocks)
     workspace_herold_small.save_mets()
 
 def test_run_sparse(workspace_herold_small):
@@ -24,6 +40,13 @@ def test_run_sparse(workspace_herold_small):
         output_file_grp="OCR-D-SEG-BLOCK",
         parameter={'sparse_text': True}
     ).process()
+    out_files = list(workspace_herold_small.find_files(
+        fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
+    assert len(out_files)
+    out_pcgts = page_from_file(out_files[0])
+    assert out_pcgts is not None
+    out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text'])
+    assert len(out_blocks)
     workspace_herold_small.save_mets()
 
 def test_run_staves(workspace_herold_small):
@@ -33,4 +56,11 @@ def test_run_staves(workspace_herold_small):
         output_file_grp="OCR-D-SEG-BLOCK",
         parameter={'find_staves': True, 'find_tables': False}
     ).process()
+    out_files = list(workspace_herold_small.find_files(
+        fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE))
+    assert len(out_files)
+    out_pcgts = page_from_file(out_files[0])
+    assert out_pcgts is not None
+    out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text'])
+    assert len(out_blocks)
     workspace_herold_small.save_mets()
diff --git a/test/test_segment_table.py b/test/test_segment_table.py
@@ -0,0 +1,47 @@
+from ocrd_tesserocr import TesserocrSegment, TesserocrSegmentRegion, TesserocrSegmentTable
+from ocrd_modelfactory import page_from_file
+from ocrd_utils import MIMETYPE_PAGE
+
+def test_run_modular(workspace_gutachten):  # two steps: TesserocrSegmentRegion, then TesserocrSegmentTable on its output
+    TesserocrSegmentRegion(
+        workspace_gutachten,
+        input_file_grp="IMG",
+        output_file_grp="OCR-D-SEG-BLOCK",
+        parameter={'find_tables': True, 'overwrite_regions': True}  # presumably so table regions exist for the next step — confirm
+    ).process()
+    TesserocrSegmentTable(
+        workspace_gutachten,
+        input_file_grp="OCR-D-SEG-BLOCK",  # consumes the group written by the step above
+        output_file_grp="OCR-D-SEG-CELL"
+    ).process()
+    out_files = list(workspace_gutachten.find_files(  # look up the final output group's files for page PHYS_1
+        fileGrp="OCR-D-SEG-CELL", pageId="PHYS_1", mimetype=MIMETYPE_PAGE))
+    assert len(out_files)  # at least one file of type MIMETYPE_PAGE must exist in the output group
+    out_pcgts = page_from_file(out_files[0])  # parse the first match
+    assert out_pcgts is not None
+    out_tables = out_pcgts.get_Page().get_AllRegions(classes=['Table'])  # restrict to regions of class 'Table'
+    assert len(out_tables)  # the result must contain at least one table region
+    workspace_gutachten.save_mets()
+
+def test_run_allinone(workspace_gutachten):  # TesserocrSegment in one pass, then TesserocrSegmentTable on its output
+    TesserocrSegment(
+        workspace_gutachten,
+        input_file_grp="IMG",
+        output_file_grp="OCR-D-SEG",
+        parameter={'find_tables': True}  # 'textequiv_level': 'cell' is left disabled here — TODO confirm whether it is still needed
+    ).process()
+    TesserocrSegmentTable(
+        workspace_gutachten,
+        input_file_grp="OCR-D-SEG",  # consumes the group written by the step above
+        output_file_grp="OCR-D-SEG-CELL",
+        parameter={'overwrite_cells': True}  # presumably because the first pass may already have produced cells — confirm
+    ).process()
+    out_files = list(workspace_gutachten.find_files(  # look up the final output group's files for page PHYS_1
+        fileGrp="OCR-D-SEG-CELL", pageId="PHYS_1", mimetype=MIMETYPE_PAGE))
+    assert len(out_files)  # at least one file of type MIMETYPE_PAGE must exist in the output group
+    out_pcgts = page_from_file(out_files[0])  # parse the first match
+    assert out_pcgts is not None
+    out_tables = out_pcgts.get_Page().get_AllRegions(classes=['Table'])  # restrict to regions of class 'Table'
+    assert len(out_tables)  # the result must contain at least one table region
+    workspace_gutachten.save_mets()
+
diff --git a/test/test_segment_word.py b/test/test_segment_word.py
@@ -1,6 +1,8 @@
 from ocrd_tesserocr import TesserocrSegmentRegion
 from ocrd_tesserocr import TesserocrSegmentLine
 from ocrd_tesserocr import TesserocrSegmentWord
+from ocrd_modelfactory import page_from_file
+from ocrd_utils import MIMETYPE_PAGE
 
 def test_run_modular(workspace_kant_binarized):
     TesserocrSegmentRegion(
@@ -18,4 +20,12 @@ def test_run_modular(workspace_kant_binarized):
         input_file_grp="OCR-D-SEG-LINE",
         output_file_grp="OCR-D-SEG-WORD"
     ).process()
+    out_files = list(workspace_kant_binarized.find_files(
+        fileGrp="OCR-D-SEG-WORD", pageId="P_0017", mimetype=MIMETYPE_PAGE))
+    assert len(out_files)
+    out_pcgts = page_from_file(out_files[0])
+    assert out_pcgts is not None
+    out_lines = out_pcgts.get_Page().get_AllTextLines()
+    assert len(out_lines)
+    assert all(len(line.get_Word()) for line in out_lines)
     workspace_kant_binarized.save_mets()