From a8252b56bb35c584eefdf1c9ffdb73642ab23463 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 Apr 2024 12:59:42 +0200 Subject: [PATCH 1/4] process: delegate instead of overwrite --- ocrd_tesserocr/segment.py | 4 +++- ocrd_tesserocr/segment_line.py | 4 +++- ocrd_tesserocr/segment_region.py | 4 +++- ocrd_tesserocr/segment_table.py | 4 +++- ocrd_tesserocr/segment_word.py | 4 +++- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/ocrd_tesserocr/segment.py b/ocrd_tesserocr/segment.py index a299089..f60e913 100644 --- a/ocrd_tesserocr/segment.py +++ b/ocrd_tesserocr/segment.py @@ -21,7 +21,8 @@ def __init__(self, *args, **kwargs): assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid self.logger = getLogger('processor.TesserocrSegment') -TesserocrSegment.process.__doc__ = """Performs region and line segmentation with Tesseract on the workspace. + def process(self): + """Performs region and line segmentation with Tesseract on the workspace. Open and deserialize PAGE input files and their respective images, and remove any existing Region and ReadingOrder elements. @@ -58,3 +59,4 @@ def __init__(self, *args, **kwargs): Produce a new output file by serialising the resulting hierarchy. """ + super(TesserocrSegment, self).process() diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 72137d8..626f859 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs): assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid self.logger = getLogger('processor.TesserocrSegmentLine') -TesserocrSegmentLine.process.__doc__ = """Performs (text) line segmentation with Tesseract on the workspace. + def process(self): + """Performs (text) line segmentation with Tesseract on the workspace. Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the (text) region level, @@ -40,3 +41,4 @@ def __init__(self, *args, **kwargs): Produce a new output file by serialising the resulting hierarchy. """ + super(TesserocrSegmentLine, self).process() diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index a223c16..f04b0dd 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -24,7 +24,8 @@ def __init__(self, *args, **kwargs): assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid self.logger = getLogger('processor.TesserocrSegmentRegion') -TesserocrSegmentRegion.process.__doc__ = """Performs region segmentation with Tesseract on the workspace. + def process(self): + """Performs region segmentation with Tesseract on the workspace. Open and deserialize PAGE input files and their respective images, and remove any existing Region and ReadingOrder elements @@ -48,3 +49,4 @@ def __init__(self, *args, **kwargs): Produce a new output file by serialising the resulting hierarchy. """ + super(TesserocrSegmentRegion, self).process() diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index edc85e5..3aa753c 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs): assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid self.logger = getLogger('processor.TesserocrSegmentTable') -TesserocrSegmentTable.process.__doc__ = """Performs table cell segmentation with Tesseract on the workspace. + def process(self): + """Performs table cell segmentation with Tesseract on the workspace. Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the region level @@ -36,3 +37,4 @@ def __init__(self, *args, **kwargs): Produce a new output file by serialising the resulting hierarchy. """ + super(TesserocrSegmentTable, self).process() diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 19134fe..ebe8f49 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs): assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid self.logger = getLogger('processor.TesserocrSegmentWord') -TesserocrSegmentWord.process.__doc__ = """Performs word segmentation with Tesseract on the workspace. + def process(self): + """Performs word segmentation with Tesseract on the workspace. Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the textline level, @@ -39,3 +40,4 @@ def __init__(self, *args, **kwargs): Produce a new output file by serialising the resulting hierarchy. """ + super(TesserocrSegmentWord, self).process() From 289d41e5fc189b19864394ee2424930d9aeca59f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 1 May 2024 00:08:13 +0200 Subject: [PATCH 2/4] conftest: use tmpdir fixture, add sample --- test/conftest.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 54cba29..fd09f74 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -6,16 +6,23 @@ METS_KANT_BINARIZED = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml') METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml') +METS_GUTACHTEN = assets.url_of('gutachten/data/mets.xml') @fixture -def workspace_kant_binarized(): +def workspace_kant_binarized(tmpdir): initLogging() - with pushd_popd(tempdir=True) as tempdir: - yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir, download=True) + with pushd_popd(tmpdir): + yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tmpdir, download=True) @fixture -def workspace_herold_small(): +def workspace_herold_small(tmpdir): initLogging() - with pushd_popd(tempdir=True) as tempdir: - yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir, download=True) + with pushd_popd(tmpdir): + yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tmpdir, download=True) + +@fixture +def workspace_gutachten(tmpdir): + initLogging() + with pushd_popd(tmpdir): + yield Resolver().workspace_from_url(METS_GUTACHTEN, dst_dir=tmpdir, download=True) From b70e864d28922e1492c7e36a0626783dc5a66f21 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 1 May 2024 00:09:01 +0200 Subject: [PATCH 3/4] add test_segment_table --- Makefile | 2 +- test/test_segment_table.py | 47 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 test/test_segment_table.py diff --git a/Makefile b/Makefile index 62d994e..b58d2da 100644 --- a/Makefile +++ b/Makefile @@ -145,7 +145,7 @@ test: test/assets deps-test #$(PYTHON) -m pytest -n auto --continue-on-collection-errors test $(PYTEST_ARGS) # workaround for pytest-xdist not isolating setenv calls in click.CliRunner from each other: $(PYTHON) -m pytest --continue-on-collection-errors test/test_cli.py $(PYTEST_ARGS) - $(PYTHON) -m pytest --continue-on-collection-errors test/test_{segment_{region,line,word},recognize}.py $(PYTEST_ARGS) + $(PYTHON) -m pytest --continue-on-collection-errors test/test_{segment_{region,table,line,word},recognize}.py $(PYTEST_ARGS) # Run unit tests and determine test coverage coverage: diff --git a/test/test_segment_table.py b/test/test_segment_table.py new file mode 100644 index 0000000..34ecfca --- /dev/null +++ b/test/test_segment_table.py @@ -0,0 +1,47 @@ +from ocrd_tesserocr import TesserocrSegment, TesserocrSegmentRegion, TesserocrSegmentTable +from ocrd_modelfactory import page_from_file +from ocrd_utils import MIMETYPE_PAGE + +def test_run_modular(workspace_gutachten): + TesserocrSegmentRegion( + workspace_gutachten, + input_file_grp="IMG", + output_file_grp="OCR-D-SEG-BLOCK", + parameter={'find_tables': True, 'overwrite_regions': True} + ).process() + TesserocrSegmentTable( + workspace_gutachten, + input_file_grp="OCR-D-SEG-BLOCK", + output_file_grp="OCR-D-SEG-CELL" + ).process() + out_files = list(workspace_gutachten.find_files( + fileGrp="OCR-D-SEG-CELL", pageId="PHYS_1", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_tables = out_pcgts.get_Page().get_AllRegions(classes=['Table']) + assert len(out_tables) + workspace_gutachten.save_mets() + +def test_run_allinone(workspace_gutachten): + TesserocrSegment( + workspace_gutachten, + input_file_grp="IMG", + output_file_grp="OCR-D-SEG", + parameter={'find_tables': True} # , 'textequiv_level': 'cell' + ).process() + TesserocrSegmentTable( + workspace_gutachten, + input_file_grp="OCR-D-SEG", + output_file_grp="OCR-D-SEG-CELL", + parameter={'overwrite_cells': True} + ).process() + out_files = list(workspace_gutachten.find_files( + fileGrp="OCR-D-SEG-CELL", pageId="PHYS_1", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_tables = out_pcgts.get_Page().get_AllRegions(classes=['Table']) + assert len(out_tables) + workspace_gutachten.save_mets() + From 10a21ffb6a6f59bf6720d0e7e711f52ad1e59826 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 1 May 2024 00:31:45 +0200 Subject: [PATCH 4/4] assertions for all segmentation tests --- test/test_segment_line.py | 16 ++++++++++++++++ test/test_segment_region.py | 30 ++++++++++++++++++++++++++++++ test/test_segment_word.py | 10 ++++++++++ 3 files changed, 56 insertions(+) diff --git a/test/test_segment_line.py b/test/test_segment_line.py index 99602f2..f63bc6a 100644 --- a/test/test_segment_line.py +++ b/test/test_segment_line.py @@ -1,6 +1,8 @@ from ocrd_tesserocr import TesserocrSegmentRegion from ocrd_tesserocr import TesserocrSegmentLine from ocrd_tesserocr import TesserocrSegment +from ocrd_modelfactory import page_from_file +from ocrd_utils import MIMETYPE_PAGE def test_run_modular(workspace_herold_small): TesserocrSegmentRegion( @@ -14,6 +16,13 @@ def test_run_modular(workspace_herold_small): input_file_grp="OCR-D-SEG-BLOCK", output_file_grp="OCR-D-SEG-LINE" ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-LINE", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines) workspace_herold_small.save_mets() def test_run_allinone(workspace_herold_small): @@ -22,4 +31,11 @@ def test_run_allinone(workspace_herold_small): input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-SEG" ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines) workspace_herold_small.save_mets() diff --git a/test/test_segment_region.py b/test/test_segment_region.py index c250ebd..50f7ed4 100644 --- a/test/test_segment_region.py +++ b/test/test_segment_region.py @@ -1,4 +1,6 @@ from ocrd_tesserocr import TesserocrSegmentRegion +from ocrd_modelfactory import page_from_file +from ocrd_utils import MIMETYPE_PAGE def test_run(workspace_herold_small): TesserocrSegmentRegion( @@ -6,6 +8,13 @@ def test_run(workspace_herold_small): input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-SEG-BLOCK" ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() def test_run_shrink(workspace_herold_small): @@ -15,6 +24,13 @@ def test_run_shrink(workspace_herold_small): output_file_grp="OCR-D-SEG-BLOCK", parameter={'shrink_polygons': True} ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() def test_run_sparse(workspace_herold_small): @@ -24,6 +40,13 @@ def test_run_sparse(workspace_herold_small): output_file_grp="OCR-D-SEG-BLOCK", parameter={'sparse_text': True} ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() def test_run_staves(workspace_herold_small): @@ -33,4 +56,11 @@ def test_run_staves(workspace_herold_small): output_file_grp="OCR-D-SEG-BLOCK", parameter={'find_staves': True, 'find_tables': False} ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() diff --git a/test/test_segment_word.py b/test/test_segment_word.py index 86fcc6b..86fc28d 100644 --- a/test/test_segment_word.py +++ b/test/test_segment_word.py @@ -1,6 +1,8 @@ from ocrd_tesserocr import TesserocrSegmentRegion from ocrd_tesserocr import TesserocrSegmentLine from ocrd_tesserocr import TesserocrSegmentWord +from ocrd_modelfactory import page_from_file +from ocrd_utils import MIMETYPE_PAGE def test_run_modular(workspace_kant_binarized): TesserocrSegmentRegion( @@ -18,4 +20,12 @@ def test_run_modular(workspace_kant_binarized): input_file_grp="OCR-D-SEG-LINE", output_file_grp="OCR-D-SEG-WORD" ).process() + out_files = list(workspace_kant_binarized.find_files( + fileGrp="OCR-D-SEG-WORD", pageId="P_0017", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines) + assert all(len(line.get_Word()) for line in out_lines) workspace_kant_binarized.save_mets()