Skip to content

Commit

Permalink
Merge pull request #43 from bertsky/segment-regions
Browse files Browse the repository at this point in the history
Segment regions, incremental segmentation via masking
  • Loading branch information
bertsky authored May 29, 2024
2 parents bdbe6fc + 636ca99 commit e62d7e4
Show file tree
Hide file tree
Showing 11 changed files with 428 additions and 155 deletions.
8 changes: 2 additions & 6 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.9, '3.10', '3.11']
python-version: [3.8, 3.9, '3.10', '3.11']

steps:
- uses: actions/checkout@v4
Expand All @@ -35,14 +35,10 @@ jobs:
- name: Build
run: |
python3 --version
python3 -m venv venv
source venv/bin/activate
make deps deps-test
make install
pip check
ocrd resmgr download ocrd-kraken-segment blla.mlmodel
ocrd resmgr download ocrd-kraken-recognize en_best.mlmodel
- name: Test
run: |
source venv/bin/activate
make test
run: make test
30 changes: 29 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,39 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

Fixed:

* recognize: improve baseline/polygon robustness

Changed:

* segment/recognize: adapt and bump to Kraken v5
* adapt to Python importlib instead of pkg_resources
* update tests/CI

## [0.4.0] - 2024-02-11

Fixed:

* binarize: OCR-D conformity (PAGE output, AlternativeImage input/output)
* docstrings

Added:

* recognize: param `overwrite_text`
* segment: param `overwrite_segments`
* segment: param `level-of-operation` (now supports `table` and `region`, too)

Changed:

* binarize: :fire: renamed `level-of-operation=block` to `region`
* segment: existing segmentation will be masked away (unless `overwrite_segments`)

## [0.3.1] - 2023-08-17

Fixed:

* recognize: only apply `one_channel_mode` (whether to use `binarized` input)
* recognize: only apply `one_channel_mode` (whether to use `binarized` input)
if the model has only one input channel
* recognize: project text results to region level in order
* recognize: iterate line results via proper word splitting
Expand Down
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,14 @@ Available [OCR-D processors](https://ocr-d.de/en/spec/cli) are:

- [ocrd-kraken-binarize](ocrd_kraken/binarize.py) (nlbin – not recommended)
- adds `AlternativeImage` files (per page, region or line) to the output fileGrp
- [ocrd-kraken-segment](ocrd_kraken/segment.py) (all-in-one segmentation – recommended for handwriting and prints with simple layouts)
- adds `TextRegion`s, `TableRegion`s, `ImageRegion`s, `MathsRegion`s, `NoiseRegion`s, `ReadingOrder` and `AlternativeImage` to `Page` (depending on model training)
- adds `TextLine`s to `TextRegion`s, including their `Baseline`
- [ocrd-kraken-segment](ocrd_kraken/segment.py) (all-in-one segmentation – recommended for handwriting and prints with simple layouts, or as pure line segmentation)
- adds `TextRegion`s to `Page` (if `level-of-operation=page`) or `TableRegion`s (if `table`)
- adds `TextLine`s (with `Baseline`) to `TextRegion`s (for all `level-of-operation`)
- masks existing segments during detection (unless `overwrite_segments`)
- [ocrd-kraken-recognize](ocrd_kraken/recognize.py) (benefits from annotated `Baseline`s, falls back to center-normalized bboxes)
- adds `Word`s to `TextLine`s
- adds `Glyph`s to `Word`s
- adds `TextEquiv`
- adds `TextEquiv` (removing existing `TextEquiv` if `overwrite_text`)

## Testing

Expand Down
109 changes: 71 additions & 38 deletions ocrd_kraken/binarize.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from __future__ import absolute_import
import io
import os
import kraken.binarization
from ocrd import Processor
from ocrd_utils import getLogger, polygon_from_points, concat_padded
from ocrd_utils import getLogger, make_file_id, MIMETYPE_PAGE
from ocrd_models.ocrd_page import AlternativeImageType, to_xml
from ocrd_modelfactory import page_from_file

from ocrd_kraken.config import OCRD_TOOL
Expand All @@ -16,51 +17,83 @@ def __init__(self, *args, **kwargs):
super(KrakenBinarize, self).__init__(*args, **kwargs)

def process(self):
"""
Performs the binarization.
"""Binarize the pages/regions/lines with Kraken.
Open and deserialise PAGE input files and their respective images,
then iterate over the element hierarchy down to the requested
``level-of-operation``.
Next, for each file, crop each segment image according to the layout
annotation (via coordinates into the higher-level image, or from the
alternative image), and determine the threshold for binarization
(via Ocropy nlbin). Apply results to the image and export it.
Add the new image file to the workspace under the output fileGrp,
using a file ID with suffix ``.IMG-BIN`` plus further
identification of the input element.
Reference each new image in the AlternativeImage of the element.
Produce a new output file by serialising the resulting hierarchy.
"""
log = getLogger('processor.KrakenBinarize')
log.debug('Level of operation: "%s"', self.parameter['level-of-operation'])
log.debug('Input file group %s', self.input_file_grp)
log.debug('Input files %s', [str(f) for f in self.input_files])
for (n, input_file) in enumerate(self.input_files):
log.info("INPUT FILE %i / %s", n, input_file)
log.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
file_id = make_file_id(input_file, self.output_file_grp)
pcgts = page_from_file(self.workspace.download_file(input_file))
image_url = pcgts.get_Page().imageFilename
log.info("pcgts %s", pcgts)
page = pcgts.get_Page()
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
self.add_metadata(pcgts)

page_image, page_coords, page_image_info = self.workspace.image_from_page(
page, page_id, feature_filter='binarized')
if self.parameter['level-of-operation'] == 'page':
log.info("About to binarize page '%s'", pcgts.pcGtsId)
image = self.workspace.resolve_image_as_pil(image_url)
bin_image = kraken.binarization.nlbin(image)
bin_image_bytes = io.BytesIO()
bin_image.save(bin_image_bytes, format='PNG')
ID = concat_padded(self.output_file_grp, n)
self.workspace.add_file(
log.info("Binarizing page '%s'", page_id)
bin_image = kraken.binarization.nlbin(page_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '.IMG-BIN',
self.output_file_grp,
pageId=input_file.pageId,
ID=ID,
mimetype='image/png',
local_filename="%s/%s" % (self.output_file_grp, ID),
content=bin_image_bytes.getvalue())
page_id=input_file.pageId)
page.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=page_coords['features'] + ',binarized'))
else:
for region in pcgts.get_Page().get_TextRegion():
if self.parameter['level-of-operation'] == 'block':
log.info("About to binarize region '%s'", region.id)
image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(region.get_Coords().points))
for region in page.get_AllRegions(classes=['Text']):
region_image, region_coords = self.workspace.image_from_segment(
region, page_image, page_coords, feature_filter='binarized')
if self.parameter['level-of-operation'] == 'region':
log.info("Binarizing region '%s'", region.id)
bin_image = kraken.binarization.nlbin(region_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '_' + region.id + '.IMG-BIN',
self.output_file_grp,
page_id=input_file.pageId)
region.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=region_coords['features'] + ',binarized'))
else:
textlines = region.get_TextLine()
log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
for (line_no, line) in enumerate(textlines):
log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
bin_image = kraken.binarization.nlbin(image)
bin_image_bytes = io.BytesIO()
bin_image.save(bin_image_bytes, format='PNG')
ID = concat_padded(self.output_file_grp, n, region.id, line_no)
self.workspace.add_file(
for line in region.get_TextLine():
line_image, line_coords = self.workspace.image_from_segment(
line, region_image, region_coords, feature_filter='binarized')
log.info("Binarizing line '%s'", line.id)
bin_image = kraken.binarization.nlbin(line_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '_' + region.id + '_' + line.id + '.IMG-BIN',
self.output_file_grp,
pageId=input_file.pageId,
ID=ID,
local_filename="%s/%s" % (self.output_file_grp, ID),
mimetype='image/png',
content=bin_image_bytes.getvalue())
page_id=input_file.pageId)
line.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=line_coords['features'] + ',binarized'))
# update METS (add the PAGE file):
file_path = os.path.join(self.output_file_grp, file_id + '.xml')
pcgts.set_pcGtsId(file_id)
out = self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
local_filename=file_path,
mimetype=MIMETYPE_PAGE,
content=to_xml(pcgts))
4 changes: 2 additions & 2 deletions ocrd_kraken/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import json
from pkg_resources import resource_filename
from ocrd_utils import resource_filename

with open(resource_filename(__name__, 'ocrd-tool.json'), 'r', encoding='utf-8') as f:
with open(resource_filename('ocrd_kraken', 'ocrd-tool.json'), 'r', encoding='utf-8') as f:
OCRD_TOOL = json.load(f)
38 changes: 31 additions & 7 deletions ocrd_kraken/ocrd-tool.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"git_url": "https://github.com/OCR-D/ocrd_kraken",
"version": "0.3.1",
"version": "0.4.0",
"tools": {
"ocrd-kraken-binarize": {
"executable": "ocrd-kraken-binarize",
Expand All @@ -15,10 +15,10 @@
"description": "Binarize images with kraken",
"parameters": {
"level-of-operation": {
"description": "level-of-operation",
"description": "segment hierarchy level to operate on",
"type": "string",
"default": "page",
"enum": ["page", "block", "line"]
"enum": ["page", "region", "line"]
}
}
},
Expand All @@ -30,10 +30,22 @@
"Layout analysis"
],
"steps": [
"layout/segmentation/region"
"layout/segmentation/region",
"layout/segmentation/line"
],
"description": "Block segmentation with kraken",
"description": "Layout segmentation with Kraken",
"parameters": {
"level-of-operation": {
"description": "segment hierarchy level to operate on (page into regions+lines, or regions into lines)",
"type": "string",
"default": "page",
"enum": ["page", "table", "region"]
},
"overwrite_segments": {
"description": "remove any existing regions/lines",
"type": "boolean",
"default": false
},
"text_direction": {
"type": "string",
"description": "Sets principal text direction",
Expand Down Expand Up @@ -103,7 +115,14 @@
"size": 5047020,
"name": "blla.mlmodel",
"parameter_usage": "without-extension",
"description": "Pretrained baseline segmentation model"
"description": "Pretrained region+baseline segmentation model (trained on handwriting)"
},
{
"url": "https://ub-backup.bib.uni-mannheim.de/~stweil/tesstrain/kraken/ubma_segmentation/ubma_segmentation.mlmodel",
"size": 5047020,
"name": "ubma_segmentation.mlmodel",
"parameter_usage": "without-extension",
"description": "region+baseline segmentation model trained by UBMA (on print)"
}
]
},
Expand All @@ -113,8 +132,13 @@
"output_file_grp": ["OCR-D-OCR-KRAK"],
"categories": ["Text recognition and optimization"],
"steps": ["recognition/text-recognition"],
"description": "OCR with kraken",
"description": "Text recognition with Kraken",
"parameters": {
"overwrite_text": {
"description": "remove any existing TextEquiv",
"type": "boolean",
"default": false
},
"model": {
"description": "OCR model to recognize with",
"type": "string",
Expand Down
Loading

0 comments on commit e62d7e4

Please sign in to comment.