bertsky · kba · Aug 15, 2024 · Aug 14, 2024 · Aug 14, 2024 · Aug 14, 2024
diff --git a/Makefile b/Makefile
@@ -238,9 +238,9 @@ repo/assets repo/spec: always-update
 
 .PHONY: spec
 # Copy JSON Schema, OpenAPI from OCR-D/spec
-spec: repo/spec
-	cp repo/spec/ocrd_tool.schema.yml ocrd_validators/ocrd_validators/ocrd_tool.schema.yml
-	cp repo/spec/bagit-profile.yml ocrd_validators/ocrd_validators/bagit-profile.yml
+spec: # repo/spec
+	cp repo/spec/ocrd_tool.schema.yml src/ocrd_validators/ocrd_tool.schema.yml
+	cp repo/spec/bagit-profile.yml src/ocrd_validators/bagit-profile.yml
 
 #
 # Assets

diff --git a/repo/spec b/repo/spec
diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py
@@ -9,20 +9,23 @@
     'run_processor'
 ]
 
-from os.path import exists
+from os.path import exists, join
 from shutil import copyfileobj
 import json
 import os
 from os import getcwd
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Union
 import sys
 import inspect
 import tarfile
 import io
 from deprecated import deprecated
 
 from ocrd.workspace import Workspace
+from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile
+from ocrd.processor.ocrd_page_result import OcrdPageResult
+from ocrd_models.ocrd_page_generateds import PcGtsType
 from ocrd_utils import (
     VERSION as OCRD_VERSION,
     MIMETYPE_PAGE,
@@ -198,10 +201,11 @@ def verify(self):
         assert self.output_file_grp is not None
         input_file_grps = self.input_file_grp.split(',')
         output_file_grps = self.output_file_grp.split(',')
-        def assert_file_grp_cardinality(grps, spec, msg):
+        def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
             if isinstance(spec, int) and spec > 0:
                 assert len(grps) == spec, msg % (len(grps), str(spec))
             else:
+                assert isinstance(spec, list)
                 minimum = spec[0]
                 maximum = spec[1]
                 if minimum > 0:
@@ -289,7 +293,7 @@ def process_workspace(self, workspace: Workspace) -> None:
                     # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                     # - transient (I/O or OOM) error → maybe sleep, retry
                     # - persistent (data) error → skip / dummy / raise
-                    input_files = [None] * len(input_file_tuple)
+                    input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple)
                     for i, input_file in enumerate(input_file_tuple):
                         if i == 0:
                             log.info("processing page %s", input_file.pageId)
@@ -309,7 +313,7 @@ def process_workspace(self, workspace: Workspace) -> None:
                 # fall back to deprecated method
                 self.process()
 
-    def process_page_file(self, *input_files) -> None:
+    def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None:
         """
         Process the given ``input_files`` of the :py:attr:`workspace`,
         representing one physical page (passed as one opened
@@ -321,49 +325,56 @@ def process_page_file(self, *input_files) -> None:
         to handle cases like multiple fileGrps, non-PAGE input etc.)
         """
         log = getLogger('ocrd.processor.base')
-        input_pcgts = [None] * len(input_files)
+        input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
+        assert isinstance(input_files[0], (OcrdFile, ClientSideOcrdFile))
         page_id = input_files[0].pageId
         for i, input_file in enumerate(input_files):
+            assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile))
             # FIXME: what about non-PAGE input like image or JSON ???
             log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId)
             try:
-                input_pcgts[i] = page_from_file(input_file)
+                page_ = page_from_file(input_file)
+                assert isinstance(page_, PcGtsType)
+                input_pcgts[i] = page_
             except ValueError as e:
-            except ValueError as e:
+            except (AssertionError, ValueError) as e:
-            except ValueError as e:
+            except (AssertionError, ValueError) as e:
                 log.info("non-PAGE input for page %s: %s", page_id, e)
         output_file_id = make_file_id(input_files[0], self.output_file_grp)
-        output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id)
-        if isinstance(output_pcgts, (list, tuple)):
-            output_images = output_pcgts[1:]
-            output_pcgts = output_pcgts[0]
-            for output_image_pil, output_image_id, output_image_path in output_images:
-                self.workspace.save_image_file(
-                    output_image_pil,
-                    output_image_id,
-                    self.output_file_grp,
-                    page_id=page_id,
-                    file_path=output_image_path)
-        output_pcgts.set_pcGtsId(output_file_id)
-        self.add_metadata(output_pcgts)
+        result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
+        for image_result in result.images:
+            image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
+            image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
+            image_result.alternative_image.set_filename(image_file_path)
+            self.workspace.save_image_file(
+                image_result.pil,
+                image_file_id,
+                self.output_file_grp,
+                page_id=page_id,
+                file_path=image_file_path)
+        result.pcgts.set_pcGtsId(output_file_id)
+        self.add_metadata(result.pcgts)
         # FIXME: what about non-PAGE output like JSON ???
         self.workspace.add_file(file_id=output_file_id,
                                 file_grp=self.output_file_grp,
                                 page_id=page_id,
                                 local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
                                 mimetype=MIMETYPE_PAGE,
-                                content=to_xml(output_pcgts))
+                                content=to_xml(result.pcgts))
 
-    def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage:
+    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
         """
         Process the given ``input_pcgts`` of the :py:attr:`workspace`,
         representing one physical page (passed as one parsed
         :py:class:`~ocrd_models.OcrdPage` per input fileGrp)
         under the given :py:attr:`parameter`, and return the
-        resulting :py:class:`~ocrd_models.OcrdPage`.
+        resulting :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult`.
 
-        Optionally, return a list or tuple of the :py:class:`~ocrd_models.OcrdPage`
-        and one or more lists or tuples of :py:class:`PIL.Image` (image data),
-        :py:class:str (file ID) and :py:class:str (file path) of derived images
-        to be annotated along with the resulting PAGE file.
+        Optionally, add to the ``images`` attribute of the resulting
+        :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult` instances
+        of :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResultImage`,
+        which have required fields for ``pil`` (:py:class:`PIL.Image` image data),
+        ``file_id_suffix`` (used for generating IDs of saved images) and
+        ``file_path`` (the path used in the AlternativeImage and for saving the
+        file).
 
         (This contains the main functionality and must be overridden by subclasses.)
         """
@@ -374,7 +385,9 @@ def add_metadata(self, pcgts: OcrdPage) -> None:
         Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
         the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.
         """
-        pcgts.get_Metadata().add_MetadataItem(
+        metadata_obj = pcgts.get_Metadata()
+        assert metadata_obj is not None
+        metadata_obj.add_MetadataItem(
                 MetadataItemType(type_="processingStep",
                     name=self.ocrd_tool['steps'][0],
                     value=self.ocrd_tool['executable'],

diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py
@@ -1,14 +1,17 @@
 # pylint: disable=missing-module-docstring,invalid-name
 from os.path import join, basename
+from typing import Optional, Union
 
 import click
 
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
-from ocrd_models.ocrd_page import to_xml
+from ocrd.processor.ocrd_page_result import OcrdPageResult
+from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile
+from ocrd_models.ocrd_page import OcrdPage, to_xml
+from ocrd_models.ocrd_page_generateds import PcGtsType
 from ocrd_utils import (
     getLogger,
-    assert_file_grp_cardinality,
     make_file_id,
     MIME_TO_EXT,
     MIMETYPE_PAGE,
@@ -24,13 +27,16 @@ class DummyProcessor(Processor):
     Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
     """
 
-    def process_page_pcgts(self, *input_pcgts, output_file_id=None, page_id=None):
+    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
+        assert input_pcgts[0]
         # nothing to do here
-        return input_pcgts[0]
+        return OcrdPageResult(input_pcgts[0])
 
-    def process_page_file(self, *input_files):
+    def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None:
         LOG = getLogger('ocrd.dummy')
         input_file = input_files[0]
+        assert input_file
+        assert input_file.local_filename
         if self.parameter['copy_files'] and input_file.mimetype != MIMETYPE_PAGE:
             # we need to mimic the actual copying in addition to the PAGE boilerplate
             file_id = make_file_id(input_file, self.output_file_grp)
@@ -48,7 +54,8 @@ def process_page_file(self, *input_files):
                     content=content)
             file_id = file_id + '_PAGE'
             pcgts = page_from_file(output_file)
-            pcgts = self.process_page_pcgts(pcgts)
+            assert isinstance(pcgts, PcGtsType)
+            pcgts = self.process_page_pcgts(pcgts).pcgts
             pcgts.set_pcGtsId(file_id)
             self.add_metadata(pcgts)
             LOG.info("Add PAGE-XML %s generated for %s", file_id, output_file)

diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py
@@ -0,0 +1,17 @@
+from dataclasses import dataclass, field
+from typing import List
+from ocrd_models.ocrd_page import OcrdPage
+from PIL.Image import Image
+
+from ocrd_models.ocrd_page_generateds import AlternativeImageType
+
+@dataclass
+class OcrdPageResultImage():
+    pil : Image
+    file_id_suffix : str
+    alternative_image : AlternativeImageType
+
+@dataclass
+class OcrdPageResult():
+    pcgts : OcrdPage
+    images : List[OcrdPageResultImage] = field(default_factory=list)
diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py
@@ -1073,7 +1073,7 @@ def image_from_segment(self, segment, parent_image, parent_coords,
         return segment_image, segment_coords
 
     # pylint: disable=redefined-builtin
-    def save_image_file(self, image : Image,
+    def save_image_file(self, image : Image.Image,
                         file_id : str,
                         file_grp : str,
                         file_path : Optional[str] = None,

diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py
@@ -79,7 +79,7 @@ def page_from_image(input_file, with_tree=False):
     revmap = dict(((node, element) for element, node in mapping.items()))
     return pcgts, etree, mapping, revmap
 
-def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET.Element, dict, dict]]:
+def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET._Element, dict, dict]]:
     """
     Create :py:class:`~ocrd_models.ocrd_page.OcrdPage`
     from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path

diff --git a/src/ocrd_validators/ocrd_tool.schema.yml b/src/ocrd_validators/ocrd_tool.schema.yml
@@ -11,7 +11,7 @@ properties:
     type: string
     pattern: '^[0-9]+\.[0-9]+\.[0-9]+$'
   git_url:
-    description: Github/Gitlab URL
+    description: GitHub/GitLab URL
     type: string
     format: url
   dockerhub:
@@ -37,46 +37,41 @@ properties:
             type: string
           input_file_grp:
             deprecated: true
-            description: Input fileGrp@USE this tool expects by default
+            description: (DEPRECATED) Input fileGrp@USE this tool expects by default
             type: array
             items:
               type: string
               # pattern: '^OCR-D-[A-Z0-9-]+$'
           output_file_grp:
             deprecated: true
-            description: Output fileGrp@USE this tool produces by default
+            description: (DEPRECATED) Output fileGrp@USE this tool produces by default
             type: array
             items:
               type: string
               # pattern: '^OCR-D-[A-Z0-9-]+$'
           input_file_grp_cardinality:
             description: Number of (comma-separated) input fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited)
             oneOf:
-              - items:
+              - type: number
+                multipleOf: 1
+              - type: array
+                items:
                   type: number
                   multipleOf: 1
-              - items:
-                  type: array
-                  items:
-                    type: number
-                    multipleOf: 1
-                  minItems: 2
-                  maxItems: 2
+                minItems: 2
+                maxItems: 2
             default: 1
-            additionalProperties: false
           output_file_grp_cardinality:
             description: Number of (comma-separated) output fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited)
             oneOf:
-              - items:
+              - type: number
+                multipleOf: 1
+              - type: array
+                items:
                   type: number
                   multipleOf: 1
-              - items:
-                  type: array
-                  items:
-                    type: number
-                    multipleOf: 1
-                  minItems: 2
-                  maxItems: 2
+                minItems: 2
+                maxItems: 2
             default: 1
           parameters:
             description: Object describing the parameters of a tool. Keys are parameter names, values sub-schemas.
@@ -152,9 +147,9 @@ properties:
                     description: "If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change."
                     default: false
           description:
-            description: Concise description what the tool does
+            description: Concise description of what the tool does
           categories:
-            description: Tools belong to this categories, representing modules within the OCR-D project structure
+            description: Tools belong to these categories, representing modules within the OCR-D project structure
             type: array
             items:
               type: string
@@ -229,12 +224,12 @@ properties:
                   default: 'as-is'
                 path_in_archive:
                   type: string
-                  description: if type is archive, the resource is at this location in the archive
+                  description: If type is archive, the resource is at this location in the archive
                   default: '.'
                 version_range:
                   type: string
                   description: Range of supported versions, syntax like in PEP 440
                   default: '>= 0.0.1'
                 size:
                   type: number
-                  description: Size of the resource in bytes
+                  description: "Size of the resource in bytes to be retrieved (for archives: size of the archive)"