Merge pull request #43 from bertsky/segment-regions

Segment regions, incremental segmentation via masking
OCR-D · May 29, 2024 · e62d7e4 · e62d7e4
2 parents bdbe6fc + 636ca99
commit e62d7e4
Show file tree

Hide file tree

Showing 11 changed files with 428 additions and 155 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -19,7 +19,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.9, '3.10', '3.11']
+        python-version: [3.8, 3.9, '3.10', '3.11']
 
     steps:
       - uses: actions/checkout@v4
@@ -35,14 +35,10 @@ jobs:
       - name: Build
         run: |
           python3 --version
-          python3 -m venv venv
-          source venv/bin/activate
           make deps deps-test
           make install
           pip check
           ocrd resmgr download ocrd-kraken-segment blla.mlmodel
           ocrd resmgr download ocrd-kraken-recognize en_best.mlmodel
       - name: Test
-        run: |
-          source venv/bin/activate
-          make test
+        run: make test
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,11 +5,39 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+Fixed:
+
+  * recognize: improve baseline/polygon robustness
+
+Changed:
+
+  * segment/recognize: adapt and bump to Kraken v5
+  * adapt to Python importlib instead of pkg_resources
+  * update tests/CI
+
+## [0.4.0] - 2024-02-11
+
+Fixed:
+
+  * binarize: OCR-D conformity (PAGE output, AlternativeImage input/output)
+  * docstrings
+
+Added:
+
+  * recognize: param `overwrite_text`
+  * segment: param `overwrite_segments`
+  * segment: param `level-of-operation` (now supports `table` and `region`, too)
+
+Changed:
+
+  * binarize: :fire: renamed `level-of-operation=block` to `region`
+  * segment: existing segmentation will be masked away (unless `overwrite_segments`)
+
 ## [0.3.1] - 2023-08-17
 
 Fixed:
 
-  * recognize: only apply `one_channel_mode` (whether to use `binarized` input)  
+  * recognize: only apply `one_channel_mode` (whether to use `binarized` input)
     if the model has only one input channel
   * recognize: project text results to region level in order
   * recognize: iterate line results via proper word splitting

diff --git a/README.md b/README.md
@@ -83,13 +83,14 @@ Available [OCR-D processors](https://ocr-d.de/en/spec/cli) are:
 
 - [ocrd-kraken-binarize](ocrd_kraken/binarize.py) (nlbin – not recommended)  
   - adds `AlternativeImage` files (per page, region or line) to the output fileGrp
-- [ocrd-kraken-segment](ocrd_kraken/segment.py) (all-in-one segmentation – recommended for handwriting and simply layouted prints)  
-  - adds `TextRegion`s, `TableRegion`s, `ImageRegion`s, `MathsRegion`s, `NoiseRegion`s, `ReadingOrder` and `AlternativeImage` to `Page` (depending on model training)
-  - adds `TextLine`s to `TextRegion`s, including their `Baseline`
+- [ocrd-kraken-segment](ocrd_kraken/segment.py) (all-in-one segmentation – recommended for handwriting and prints with simple layouts, or as pure line segmentation)  
+  - adds `TextRegion`s to `Page` (if `level-of-operation=page`) or to `TableRegion`s (if `table`)
+  - adds `TextLine`s (with `Baseline`) to `TextRegion`s (for all `level-of-operation`)
+  - masks existing segments during detection (unless `overwrite_segments`)
 - [ocrd-kraken-recognize](ocrd_kraken/recognize.py) (benefits from annotated `Baseline`s, falls back to center-normalized bboxes)
   - adds `Word`s to `TextLine`s
   - adds `Glyph`s to `Word`s
-  - adds `TextEquiv`
+  - adds `TextEquiv` (removing existing `TextEquiv` if `overwrite_text`)
 
 ## Testing
 

diff --git a/ocrd_kraken/binarize.py b/ocrd_kraken/binarize.py
@@ -1,8 +1,9 @@
 from __future__ import absolute_import
-import io
+import os
 import kraken.binarization
 from ocrd import Processor
-from ocrd_utils import getLogger, polygon_from_points, concat_padded
+from ocrd_utils import getLogger, make_file_id, MIMETYPE_PAGE
+from ocrd_models.ocrd_page import AlternativeImageType, to_xml
 from ocrd_modelfactory import page_from_file
 
 from ocrd_kraken.config import OCRD_TOOL
@@ -16,51 +17,83 @@ def __init__(self, *args, **kwargs):
         super(KrakenBinarize, self).__init__(*args, **kwargs)
 
     def process(self):
-        """
-        Performs the binarization.
+        """Binarize the pages/regions/lines with Kraken.
+
+        Open and deserialise PAGE input files and their respective images,
+        then iterate over the element hierarchy down to the requested
+        ``level-of-operation``.
+
+        Next, for each file, crop each segment image according to the layout
+        annotation (via coordinates into the higher-level image, or from the
+        alternative image), and determine the threshold for binarization 
+        (via Ocropy nlbin). Apply results to the image and export it.
+
+        Add the new image file to the workspace along with the output fileGrp,
+        and using a file ID with suffix ``.IMG-BIN`` along with further
+        identification of the input element.
+
+        Reference each new image in the AlternativeImage of the element.
+
+        Produce a new output file by serialising the resulting hierarchy.
         """
         log = getLogger('processor.KrakenBinarize')
         log.debug('Level of operation: "%s"', self.parameter['level-of-operation'])
         log.debug('Input file group %s', self.input_file_grp)
         log.debug('Input files %s', [str(f) for f in self.input_files])
         for (n, input_file) in enumerate(self.input_files):
-            log.info("INPUT FILE %i / %s", n, input_file)
+            log.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
+            file_id = make_file_id(input_file, self.output_file_grp)
             pcgts = page_from_file(self.workspace.download_file(input_file))
-            image_url = pcgts.get_Page().imageFilename
-            log.info("pcgts %s", pcgts)
+            page = pcgts.get_Page()
+            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
+            self.add_metadata(pcgts)
+
+            page_image, page_coords, page_image_info = self.workspace.image_from_page(
+                page, page_id, feature_filter='binarized')
             if self.parameter['level-of-operation'] == 'page':
-                log.info("About to binarize page '%s'", pcgts.pcGtsId)
-                image = self.workspace.resolve_image_as_pil(image_url)
-                bin_image = kraken.binarization.nlbin(image)
-                bin_image_bytes = io.BytesIO()
-                bin_image.save(bin_image_bytes, format='PNG')
-                ID = concat_padded(self.output_file_grp, n)
-                self.workspace.add_file(
+                log.info("Binarizing page '%s'", page_id)
+                bin_image = kraken.binarization.nlbin(page_image)
+                file_path = self.workspace.save_image_file(
+                    bin_image, file_id + '.IMG-BIN',
                     self.output_file_grp,
-                    pageId=input_file.pageId,
-                    ID=ID,
-                    mimetype='image/png',
-                    local_filename="%s/%s" % (self.output_file_grp, ID),
-                    content=bin_image_bytes.getvalue())
+                    page_id=input_file.pageId)
+                page.add_AlternativeImage(AlternativeImageType(
+                    filename=file_path,
+                    comments=page_coords['features'] + ',binarized'))
             else:
-                for region in pcgts.get_Page().get_TextRegion():
-                    if self.parameter['level-of-operation'] == 'block':
-                        log.info("About to binarize region '%s'", region.id)
-                        image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(region.get_Coords().points))
+                for region in page.get_AllRegions(classes=['Text']):
+                    region_image, region_coords = self.workspace.image_from_segment(
+                        region, page_image, page_coords, feature_filter='binarized')
+                    if self.parameter['level-of-operation'] == 'region':
+                        log.info("Binarizing region '%s'", region.id)
+                        bin_image = kraken.binarization.nlbin(region_image)
+                        file_path = self.workspace.save_image_file(
+                            bin_image, file_id + '_' + region.id + '.IMG-BIN',
+                            self.output_file_grp,
+                            page_id=input_file.pageId)
+                        region.add_AlternativeImage(AlternativeImageType(
+                            filename=file_path,
+                            comments=region_coords['features'] + ',binarized'))
                     else:
-                        textlines = region.get_TextLine()
-                        log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
-                        for (line_no, line) in enumerate(textlines):
-                            log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
-                            image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
-                            bin_image = kraken.binarization.nlbin(image)
-                            bin_image_bytes = io.BytesIO()
-                            bin_image.save(bin_image_bytes, format='PNG')
-                            ID = concat_padded(self.output_file_grp, n, region.id, line_no)
-                            self.workspace.add_file(
+                        for line in region.get_TextLine():
+                            line_image, line_coords = self.workspace.image_from_segment(
+                                line, region_image, region_coords, feature_filter='binarized')
+                            log.info("Binarizing line '%s'", line.id)
+                            bin_image = kraken.binarization.nlbin(line_image)
+                            file_path = self.workspace.save_image_file(
+                                bin_image, file_id + '_' + region.id + '_' + line.id + '.IMG-BIN',
                                 self.output_file_grp,
-                                pageId=input_file.pageId,
-                                ID=ID,
-                                local_filename="%s/%s" % (self.output_file_grp, ID),
-                                mimetype='image/png',
-                                content=bin_image_bytes.getvalue())
+                                page_id=input_file.pageId)
+                            line.add_AlternativeImage(AlternativeImageType(
+                                filename=file_path,
+                                comments=line_coords['features'] + ',binarized'))
+            # update METS (add the PAGE file):
+            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
+            pcgts.set_pcGtsId(file_id)
+            out = self.workspace.add_file(
+                ID=file_id,
+                file_grp=self.output_file_grp,
+                pageId=input_file.pageId,
+                local_filename=file_path,
+                mimetype=MIMETYPE_PAGE,
+                content=to_xml(pcgts))
diff --git a/ocrd_kraken/config.py b/ocrd_kraken/config.py
@@ -1,5 +1,5 @@
 import json
-from pkg_resources import resource_filename
+from ocrd_utils import resource_filename
 
-with open(resource_filename(__name__, 'ocrd-tool.json'), 'r', encoding='utf-8') as f:
+with open(resource_filename('ocrd_kraken', 'ocrd-tool.json'), 'r', encoding='utf-8') as f:
     OCRD_TOOL = json.load(f)
diff --git a/ocrd_kraken/ocrd-tool.json b/ocrd_kraken/ocrd-tool.json
@@ -1,6 +1,6 @@
 {
   "git_url": "https://github.com/OCR-D/ocrd_kraken",
-  "version": "0.3.1",
+  "version": "0.4.0",
   "tools": {
     "ocrd-kraken-binarize": {
       "executable": "ocrd-kraken-binarize",
@@ -15,10 +15,10 @@
       "description": "Binarize images with kraken",
       "parameters": {
         "level-of-operation": {
-          "description": "level-of-operation",
+          "description": "segment hierarchy level to operate on",
           "type": "string",
           "default": "page",
-          "enum": ["page", "block", "line"]
+          "enum": ["page", "region", "line"]
         }
       }
     },
@@ -30,10 +30,22 @@
         "Layout analysis"
       ],
       "steps": [
-        "layout/segmentation/region"
+        "layout/segmentation/region",
+        "layout/segmentation/line"
       ],
-      "description": "Block segmentation with kraken",
+      "description": "Layout segmentation with Kraken",
       "parameters": {
+        "level-of-operation": {
+          "description": "segment hierarchy level to operate on (page into regions+lines, or regions into lines)",
+          "type": "string",
+          "default": "page",
+          "enum": ["page", "table", "region"]
+        },
+        "overwrite_segments": {
+          "description": "remove any existing regions/lines", 
+          "type": "boolean", 
+          "default": false
+        },
         "text_direction": {
           "type": "string", 
           "description": "Sets principal text direction", 
@@ -103,7 +115,14 @@
           "size": 5047020,
           "name": "blla.mlmodel",
           "parameter_usage": "without-extension",
-          "description": "Pretrained baseline segmentation model"
+          "description": "Pretrained region+baseline segmentation model (trained on handwriting)"
+        },
+        {
+          "url": "https://ub-backup.bib.uni-mannheim.de/~stweil/tesstrain/kraken/ubma_segmentation/ubma_segmentation.mlmodel",
+          "size": 5047020,
+          "name": "ubma_segmentation.mlmodel",
+          "parameter_usage": "without-extension",
+          "description": "region+baseline segmentation model trained by UBMA (on print)"
         }
       ]
     },
@@ -113,8 +132,13 @@
       "output_file_grp": ["OCR-D-OCR-KRAK"],
       "categories": ["Text recognition and optimization"],
       "steps": ["recognition/text-recognition"],
-      "description": "OCR with kraken",
+      "description": "Text recognition with Kraken",
       "parameters": {
+        "overwrite_text": {
+          "description": "remove any existing TextEquiv", 
+          "type": "boolean", 
+          "default": false
+        },
         "model": {
           "description": "OCR model to recognize with",
           "type": "string",