From fd15e2a04120e6753518d0c8670a38f883fbcb15 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 14:44:06 +0200 Subject: [PATCH] tests: add actual assertions --- tests/test_binarize.py | 22 ++++++++++++++++++++-- tests/test_recognize.py | 14 +++++++++++++- tests/test_segment.py | 23 ++++++++++++++++++++--- 3 files changed, 53 insertions(+), 6 deletions(-) diff --git a/tests/test_binarize.py b/tests/test_binarize.py index 2f8a522..da9adea 100644 --- a/tests/test_binarize.py +++ b/tests/test_binarize.py @@ -1,8 +1,13 @@ # pylint: disable=import-error import json +import os from ocrd import run_processor +from ocrd_utils import MIMETYPE_PAGE +from ocrd_models.constants import NAMESPACES +from ocrd_modelfactory import page_from_file + from ocrd_kraken.binarize import KrakenBinarize from .assets import assets @@ -10,6 +15,17 @@ PARAM_JSON = assets.url_of('param-binarize.json') +def analyse_result(ws, level): + assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-BIN-KRAKEN')) + out_files = list(ws.find_files(fileGrp="OCR-D-BIN-KRAKEN", mimetype=MIMETYPE_PAGE)) + assert len(out_files), "found no output PAGE file" + out_images = list(ws.find_files(fileGrp="OCR-D-BIN-KRAKEN", mimetype="//^image/.*")) + assert len(out_images), "found no output image file" + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_images = out_pcgts.etree.xpath('//page:%s/page:AlternativeImage[contains(@comments,"binarized")]' % level, namespaces=NAMESPACES) + assert len(out_images) > 0, "found no binarized AlternativeImages in output PAGE file" + def test_param_json(workspace_sbb): run_processor(KrakenBinarize, input_file_grp="OCR-D-IMG", @@ -19,6 +35,7 @@ def test_param_json(workspace_sbb): ) ws = workspace_sbb['workspace'] ws.save_mets() + analyse_result(ws, 'Page') def test_binarize_regions(workspace_aufklaerung): run_processor(KrakenBinarize, @@ -29,7 +46,7 @@ def test_binarize_regions(workspace_aufklaerung): ) ws = workspace_aufklaerung['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + analyse_result(ws, 'TextRegion') def test_binarize_lines(workspace_aufklaerung): run_processor(KrakenBinarize, @@ -40,4 +57,5 @@ def test_binarize_lines(workspace_aufklaerung): ) ws = workspace_aufklaerung['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + analyse_result(ws, 'TextLine') + diff --git a/tests/test_recognize.py b/tests/test_recognize.py index 3ebeb71..8354a0e 100644 --- a/tests/test_recognize.py +++ b/tests/test_recognize.py @@ -1,6 +1,12 @@ # pylint: disable=import-error +import os + from ocrd import run_processor +from ocrd_utils import MIMETYPE_PAGE +from ocrd_models.constants import NAMESPACES +from ocrd_modelfactory import page_from_file + from ocrd_kraken.recognize import KrakenRecognize from ocrd_kraken.binarize import KrakenBinarize @@ -21,4 +27,10 @@ def test_recognize(workspace_aufklaerung): ) ws = workspace_aufklaerung['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-OCR-KRAKEN')) + results = ws.find_files(file_grp='OCR-D-OCR-KRAKEN', mimetype=MIMETYPE_PAGE) + result0 = next(results, False) + assert result0, "found no output PAGE file" + result0 = page_from_file(result0) + text0 = result0.etree.xpath('//page:Glyph/page:TextEquiv/page:Unicode', namespaces=NAMESPACES) + assert len(text0) > 0, "found no glyph text in output PAGE file" diff --git a/tests/test_segment.py b/tests/test_segment.py index ec7e1ee..6c00880 100644 --- a/tests/test_segment.py +++ b/tests/test_segment.py @@ -1,10 +1,27 @@ # pylint: disable=import-error +import os + from ocrd import run_processor +from ocrd_utils import MIMETYPE_PAGE +from ocrd_models.constants import NAMESPACES +from ocrd_modelfactory import page_from_file + from ocrd_kraken.segment import KrakenSegment from ocrd_kraken.binarize import KrakenBinarize +def analyse_result(ws): + assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-SEG-LINE-KRAKEN')) + out_files = list(ws.find_files(fileGrp="OCR-D-SEG-LINE-KRAKEN", mimetype=MIMETYPE_PAGE)) + assert len(out_files), "found no output PAGE file" + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_regions = out_pcgts.etree.xpath('//page:TextRegion/page:Coords', namespaces=NAMESPACES) + assert len(out_regions) > 0, "found no text regions in output PAGE file" + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines), "found no text lines in output PAGE file" + def test_run_blla(workspace_aufklaerung): run_processor(KrakenSegment, input_file_grp="OCR-D-IMG", @@ -14,7 +31,7 @@ def test_run_blla(workspace_aufklaerung): ) ws = workspace_aufklaerung['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + analyse_result(ws) def test_run_blla_regionlevel(workspace_aufklaerung_region): run_processor(KrakenSegment, @@ -27,7 +44,7 @@ def test_run_blla_regionlevel(workspace_aufklaerung_region): ) ws = workspace_aufklaerung_region['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + analyse_result(ws) def test_run_legacy(workspace_aufklaerung): # legacy segmentation requires binarized images @@ -45,4 +62,4 @@ def test_run_legacy(workspace_aufklaerung): ) ws = workspace_aufklaerung['workspace'] ws.save_mets() - # FIXME: add result assertions (find_files, parsing PAGE etc) + analyse_result(ws)