diff --git a/Makefile b/Makefile index 62d994e..b58d2da 100644 --- a/Makefile +++ b/Makefile @@ -145,7 +145,7 @@ test: test/assets deps-test #$(PYTHON) -m pytest -n auto --continue-on-collection-errors test $(PYTEST_ARGS) # workaround for pytest-xdist not isolating setenv calls in click.CliRunner from each other: $(PYTHON) -m pytest --continue-on-collection-errors test/test_cli.py $(PYTEST_ARGS) - $(PYTHON) -m pytest --continue-on-collection-errors test/test_{segment_{region,line,word},recognize}.py $(PYTEST_ARGS) + $(PYTHON) -m pytest --continue-on-collection-errors test/test_{segment_{region,table,line,word},recognize}.py $(PYTEST_ARGS) # Run unit tests and determine test coverage coverage: diff --git a/ocrd_tesserocr/segment.py b/ocrd_tesserocr/segment.py index a299089..f60e913 100644 --- a/ocrd_tesserocr/segment.py +++ b/ocrd_tesserocr/segment.py @@ -21,7 +21,8 @@ def __init__(self, *args, **kwargs): assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid self.logger = getLogger('processor.TesserocrSegment') -TesserocrSegment.process.__doc__ = """Performs region and line segmentation with Tesseract on the workspace. + def process(self): + """Performs region and line segmentation with Tesseract on the workspace. Open and deserialize PAGE input files and their respective images, and remove any existing Region and ReadingOrder elements. @@ -58,3 +59,4 @@ def __init__(self, *args, **kwargs): Produce a new output file by serialising the resulting hierarchy. """ + super(TesserocrSegment, self).process() diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 72137d8..626f859 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs): assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid self.logger = getLogger('processor.TesserocrSegmentLine') -TesserocrSegmentLine.process.__doc__ = """Performs (text) line segmentation with Tesseract on the workspace. + def process(self): + """Performs (text) line segmentation with Tesseract on the workspace. Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the (text) region level, @@ -40,3 +41,4 @@ def __init__(self, *args, **kwargs): Produce a new output file by serialising the resulting hierarchy. """ + super(TesserocrSegmentLine, self).process() diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index a223c16..f04b0dd 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -24,7 +24,8 @@ def __init__(self, *args, **kwargs): assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid self.logger = getLogger('processor.TesserocrSegmentRegion') -TesserocrSegmentRegion.process.__doc__ = """Performs region segmentation with Tesseract on the workspace. + def process(self): + """Performs region segmentation with Tesseract on the workspace. Open and deserialize PAGE input files and their respective images, and remove any existing Region and ReadingOrder elements @@ -48,3 +49,4 @@ def __init__(self, *args, **kwargs): Produce a new output file by serialising the resulting hierarchy. """ + super(TesserocrSegmentRegion, self).process() diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index edc85e5..3aa753c 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs): assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid self.logger = getLogger('processor.TesserocrSegmentTable') -TesserocrSegmentTable.process.__doc__ = """Performs table cell segmentation with Tesseract on the workspace. + def process(self): + """Performs table cell segmentation with Tesseract on the workspace. Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the region level @@ -36,3 +37,4 @@ def __init__(self, *args, **kwargs): Produce a new output file by serialising the resulting hierarchy. """ + super(TesserocrSegmentTable, self).process() diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 19134fe..ebe8f49 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -22,7 +22,8 @@ def __init__(self, *args, **kwargs): assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid self.logger = getLogger('processor.TesserocrSegmentWord') -TesserocrSegmentWord.process.__doc__ = """Performs word segmentation with Tesseract on the workspace. + def process(self): + """Performs word segmentation with Tesseract on the workspace. Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the textline level, @@ -39,3 +40,4 @@ def __init__(self, *args, **kwargs): Produce a new output file by serialising the resulting hierarchy. """ + super(TesserocrSegmentWord, self).process() diff --git a/test/conftest.py b/test/conftest.py index 54cba29..fd09f74 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -6,16 +6,23 @@ METS_KANT_BINARIZED = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml') METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml') +METS_GUTACHTEN = assets.url_of('gutachten/data/mets.xml') @fixture -def workspace_kant_binarized(): +def workspace_kant_binarized(tmpdir): initLogging() - with pushd_popd(tempdir=True) as tempdir: - yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tempdir, download=True) + with pushd_popd(tmpdir): + yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tmpdir, download=True) @fixture -def workspace_herold_small(): +def workspace_herold_small(tmpdir): initLogging() - with pushd_popd(tempdir=True) as tempdir: - yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tempdir, download=True) + with pushd_popd(tmpdir): + yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tmpdir, download=True) + +@fixture +def workspace_gutachten(tmpdir): + initLogging() + with pushd_popd(tmpdir): + yield Resolver().workspace_from_url(METS_GUTACHTEN, dst_dir=tmpdir, download=True) diff --git a/test/test_segment_line.py b/test/test_segment_line.py index 99602f2..f63bc6a 100644 --- a/test/test_segment_line.py +++ b/test/test_segment_line.py @@ -1,6 +1,8 @@ from ocrd_tesserocr import TesserocrSegmentRegion from ocrd_tesserocr import TesserocrSegmentLine from ocrd_tesserocr import TesserocrSegment +from ocrd_modelfactory import page_from_file +from ocrd_utils import MIMETYPE_PAGE def test_run_modular(workspace_herold_small): TesserocrSegmentRegion( @@ -14,6 +16,13 @@ def test_run_modular(workspace_herold_small): input_file_grp="OCR-D-SEG-BLOCK", output_file_grp="OCR-D-SEG-LINE" ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-LINE", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines) workspace_herold_small.save_mets() def test_run_allinone(workspace_herold_small): @@ -22,4 +31,11 @@ def test_run_allinone(workspace_herold_small): input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-SEG" ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines) workspace_herold_small.save_mets() diff --git a/test/test_segment_region.py b/test/test_segment_region.py index c250ebd..50f7ed4 100644 --- a/test/test_segment_region.py +++ b/test/test_segment_region.py @@ -1,4 +1,6 @@ from ocrd_tesserocr import TesserocrSegmentRegion +from ocrd_modelfactory import page_from_file +from ocrd_utils import MIMETYPE_PAGE def test_run(workspace_herold_small): TesserocrSegmentRegion( @@ -6,6 +8,13 @@ def test_run(workspace_herold_small): input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-SEG-BLOCK" ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() def test_run_shrink(workspace_herold_small): @@ -15,6 +24,13 @@ def test_run_shrink(workspace_herold_small): output_file_grp="OCR-D-SEG-BLOCK", parameter={'shrink_polygons': True} ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() def test_run_sparse(workspace_herold_small): @@ -24,6 +40,13 @@ def test_run_sparse(workspace_herold_small): output_file_grp="OCR-D-SEG-BLOCK", parameter={'sparse_text': True} ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() def test_run_staves(workspace_herold_small): @@ -33,4 +56,11 @@ def test_run_staves(workspace_herold_small): output_file_grp="OCR-D-SEG-BLOCK", parameter={'find_staves': True, 'find_tables': False} ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() diff --git a/test/test_segment_table.py b/test/test_segment_table.py new file mode 100644 index 0000000..34ecfca --- /dev/null +++ b/test/test_segment_table.py @@ -0,0 +1,47 @@ +from ocrd_tesserocr import TesserocrSegment, TesserocrSegmentRegion, TesserocrSegmentTable +from ocrd_modelfactory import page_from_file +from ocrd_utils import MIMETYPE_PAGE + +def test_run_modular(workspace_gutachten): + TesserocrSegmentRegion( + workspace_gutachten, + input_file_grp="IMG", + output_file_grp="OCR-D-SEG-BLOCK", + parameter={'find_tables': True, 'overwrite_regions': True} + ).process() + TesserocrSegmentTable( + workspace_gutachten, + input_file_grp="OCR-D-SEG-BLOCK", + output_file_grp="OCR-D-SEG-CELL" + ).process() + out_files = list(workspace_gutachten.find_files( + fileGrp="OCR-D-SEG-CELL", pageId="PHYS_1", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_tables = out_pcgts.get_Page().get_AllRegions(classes=['Table']) + assert len(out_tables) + workspace_gutachten.save_mets() + +def test_run_allinone(workspace_gutachten): + TesserocrSegment( + workspace_gutachten, + input_file_grp="IMG", + output_file_grp="OCR-D-SEG", + parameter={'find_tables': True} # , 'textequiv_level': 'cell' + ).process() + TesserocrSegmentTable( + workspace_gutachten, + input_file_grp="OCR-D-SEG", + output_file_grp="OCR-D-SEG-CELL", + parameter={'overwrite_cells': True} + ).process() + out_files = list(workspace_gutachten.find_files( + fileGrp="OCR-D-SEG-CELL", pageId="PHYS_1", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_tables = out_pcgts.get_Page().get_AllRegions(classes=['Table']) + assert len(out_tables) + workspace_gutachten.save_mets() + diff --git a/test/test_segment_word.py b/test/test_segment_word.py index 86fcc6b..86fc28d 100644 --- a/test/test_segment_word.py +++ b/test/test_segment_word.py @@ -1,6 +1,8 @@ from ocrd_tesserocr import TesserocrSegmentRegion from ocrd_tesserocr import TesserocrSegmentLine from ocrd_tesserocr import TesserocrSegmentWord +from ocrd_modelfactory import page_from_file +from ocrd_utils import MIMETYPE_PAGE def test_run_modular(workspace_kant_binarized): TesserocrSegmentRegion( @@ -18,4 +20,12 @@ def test_run_modular(workspace_kant_binarized): input_file_grp="OCR-D-SEG-LINE", output_file_grp="OCR-D-SEG-WORD" ).process() + out_files = list(workspace_kant_binarized.find_files( + fileGrp="OCR-D-SEG-WORD", pageId="P_0017", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines) + assert all(len(line.get_Word()) for line in out_lines) workspace_kant_binarized.save_mets()