Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Processor result object #8

Merged
merged 16 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,9 @@ repo/assets repo/spec: always-update

.PHONY: spec
# Copy JSON Schema, OpenAPI from OCR-D/spec
spec: repo/spec
cp repo/spec/ocrd_tool.schema.yml ocrd_validators/ocrd_validators/ocrd_tool.schema.yml
cp repo/spec/bagit-profile.yml ocrd_validators/ocrd_validators/bagit-profile.yml
spec: # repo/spec
cp repo/spec/ocrd_tool.schema.yml src/ocrd_validators/ocrd_tool.schema.yml
cp repo/spec/bagit-profile.yml src/ocrd_validators/bagit-profile.yml

#
# Assets
Expand Down
39 changes: 20 additions & 19 deletions src/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,16 @@
import os
from os import getcwd
from pathlib import Path
from typing import Optional
from typing import List, Optional
import sys
import inspect
import tarfile
import io
from deprecated import deprecated

from ocrd.workspace import Workspace
from ocrd_models.ocrd_file import OcrdFile
from ocrd_models.ocrd_process_result import OcrdProcessResult
from ocrd_utils import (
VERSION as OCRD_VERSION,
MIMETYPE_PAGE,
Expand Down Expand Up @@ -309,7 +311,7 @@ def process_workspace(self, workspace: Workspace) -> None:
# fall back to deprecated method
self.process()

def process_page_file(self, *input_files) -> None:
def process_page_file(self, *input_files : OcrdFile) -> None:
"""
Process the given ``input_files`` of the :py:attr:`workspace`,
representing one physical page (passed as one opened
Expand All @@ -321,7 +323,7 @@ def process_page_file(self, *input_files) -> None:
to handle cases like multiple fileGrps, non-PAGE input etc.)
"""
log = getLogger('ocrd.processor.base')
input_pcgts = [None] * len(input_files)
input_pcgts : List[OcrdPage] = [None] * len(input_files)
page_id = input_files[0].pageId
for i, input_file in enumerate(input_files):
# FIXME: what about non-PAGE input like image or JSON ???
kba marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -331,28 +333,25 @@ def process_page_file(self, *input_files) -> None:
except ValueError as e:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
except ValueError as e:
except (AssertionError, ValueError) as e:

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this ever happen, ie. can page_from_file(with_etree=False) ever return anything other than a PcGtsType? I think if that was ever the case, we'd want that AssertionError to be raised because then we'd have broken something.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right – it cannot happen. But then what is the assertion good for – satisfying the type checker?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

First, my curiosity that I understand the behavior correctly. But secondly, yes, the type checker ;)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But reading this again, I should have used OcrdPage not PcGtsType, which is just an alias but we use OcrdPage in the method typing.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Feel free to change in OCR-D#1240.

log.info("non-PAGE input for page %s: %s", page_id, e)
output_file_id = make_file_id(input_files[0], self.output_file_grp)
output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id)
if isinstance(output_pcgts, (list, tuple)):
output_images = output_pcgts[1:]
output_pcgts = output_pcgts[0]
for output_image_pil, output_image_id, output_image_path in output_images:
self.workspace.save_image_file(
output_image_pil,
output_image_id,
self.output_file_grp,
page_id=page_id,
file_path=output_image_path)
output_pcgts.set_pcGtsId(output_file_id)
self.add_metadata(output_pcgts)
result = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id)
for output_image_pil, output_image_id, output_image_path in result.images:
self.workspace.save_image_file(
output_image_pil,
output_image_id,
self.output_file_grp,
page_id=page_id,
file_path=output_image_path)
result.pcgts.set_pcGtsId(output_file_id)
self.add_metadata(result.pcgts)
# FIXME: what about non-PAGE output like JSON ???
self.workspace.add_file(file_id=output_file_id,
file_grp=self.output_file_grp,
page_id=page_id,
local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
mimetype=MIMETYPE_PAGE,
content=to_xml(output_pcgts))
content=to_xml(result.pcgts))

def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage:
def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdProcessResult:
"""
Process the given ``input_pcgts`` of the :py:attr:`workspace`,
representing one physical page (passed as one parsed
Expand All @@ -374,7 +373,9 @@ def add_metadata(self, pcgts: OcrdPage) -> None:
Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.
"""
pcgts.get_Metadata().add_MetadataItem(
metadata_obj = pcgts.get_Metadata()
assert metadata_obj is not None
metadata_obj.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=self.ocrd_tool['executable'],
Expand Down
10 changes: 6 additions & 4 deletions src/ocrd/processor/builtin/dummy_processor.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# pylint: disable=missing-module-docstring,invalid-name
from os.path import join, basename
from typing import Optional

import click

from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_models.ocrd_page import to_xml
from ocrd_models.ocrd_page import OcrdPage, to_xml
kba marked this conversation as resolved.
Show resolved Hide resolved
from ocrd_models.ocrd_process_result import OcrdProcessResult
from ocrd_utils import (
getLogger,
assert_file_grp_cardinality,
Expand All @@ -24,9 +26,9 @@ class DummyProcessor(Processor):
Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
"""

def process_page_pcgts(self, *input_pcgts, output_file_id=None, page_id=None):
def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdProcessResult:
# nothing to do here
return input_pcgts[0]
return OcrdProcessResult(input_pcgts[0])

def process_page_file(self, *input_files):
LOG = getLogger('ocrd.dummy')
Expand All @@ -48,7 +50,7 @@ def process_page_file(self, *input_files):
content=content)
file_id = file_id + '_PAGE'
pcgts = page_from_file(output_file)
pcgts = self.process_page_pcgts(pcgts)
pcgts = self.process_page_pcgts(pcgts).pcgts
pcgts.set_pcGtsId(file_id)
self.add_metadata(pcgts)
LOG.info("Add PAGE-XML %s generated for %s", file_id, output_file)
Expand Down
2 changes: 1 addition & 1 deletion src/ocrd/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1073,7 +1073,7 @@ def image_from_segment(self, segment, parent_image, parent_coords,
return segment_image, segment_coords

# pylint: disable=redefined-builtin
def save_image_file(self, image : Image,
def save_image_file(self, image : Image.Image,
file_id : str,
file_grp : str,
file_path : Optional[str] = None,
Expand Down
2 changes: 1 addition & 1 deletion src/ocrd_modelfactory/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def page_from_image(input_file, with_tree=False):
revmap = dict(((node, element) for element, node in mapping.items()))
return pcgts, etree, mapping, revmap

def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET.Element, dict, dict]]:
def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET._Element, dict, dict]]:
"""
Create :py:class:`~ocrd_models.ocrd_page.OcrdPage`
from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path
Expand Down
1 change: 1 addition & 0 deletions src/ocrd_models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
from .ocrd_mets import OcrdMets
from .ocrd_xml_base import OcrdXmlDocument
from .report import ValidationReport
from .ocrd_process_result import OcrdProcessResult