Skip to content

Commit

Permalink
Merge pull request #206 from hakonhagland/remove_span4
Browse files Browse the repository at this point in the history
Add script to remove undefined span tags
  • Loading branch information
lisajulia authored Mar 15, 2024
2 parents ab4570b + c410cfc commit ff907fb
Show file tree
Hide file tree
Showing 2 changed files with 167 additions and 0 deletions.
1 change: 1 addition & 0 deletions scripts/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ fodt-remove-elements = "fodt.splitter:remove_elements"
fodt-remove-fonts = "fodt.remove_fonts:remove_fonts"
fodt-remove-lines = "fodt.remove_lines:remove_lines"
fodt-remove-span-tags = "fodt.remove_span_tags:remove_version_span_tags"
fodt-remove-undefined-span-tags = "fodt.remove_undefined_spans:remove_undefined_span_tags"
fodt-set-font-decls = "fodt.set_fonts:set_font_decls"
fodt-set-keyword-status = "fodt.add_keyword_status:set_keyword_status"
fodt-split-all = "fodt.split_all:split_all"
Expand Down
166 changes: 166 additions & 0 deletions scripts/python/src/fodt/remove_undefined_spans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import io
import logging
import xml.sax
import xml.sax.handler
import xml.sax.xmlreader
import xml.sax.saxutils
from pathlib import Path

import click

from fodt.constants import ClickOptions
from fodt.xml_helpers import XMLHelper


class RemoveSpanHandler(xml.sax.handler.ContentHandler):
    """SAX handler that re-serializes a document while dropping undefined spans.

    Within the sections before office:body starts, the style:name attribute of
    all elements is recorded (all elements, to be sure not to miss any).
    Then, while parsing the office:body section, the start and end tags of every
    text:span element whose text:style-name attribute is not in the recorded set
    are dropped. Character data and child elements of a dropped span are still
    written to the output.

    Open spans are tracked on a stack, so an undefined span is removed even
    when it is nested inside another span that is itself being removed.

    NOTE: This will not remove span tags outside the office:body section yet.
    For example inside the office:master-styles section.
    """

    def __init__(self) -> None:
        super().__init__()
        self.content = io.StringIO()
        self.in_body = False
        # True while the most recent start tag has been written without its
        # terminating ">" (so that an empty element can be closed with "/>").
        self.start_tag_open = False
        self.styles: set[str] = set()  # All style names found before the office:body section
        self.removed_styles: set[str] = set()  # All style names that are removed
        self.num_removed_spans = 0  # Number of removed span tags
        # One entry per currently open text:span inside office:body. True means
        # the tags of that span are being dropped from the output.
        self._span_removed: list[bool] = []

    def characters(self, content: str) -> None:
        """Write escaped character data, first closing a pending start tag."""
        if self.start_tag_open:
            # NOTE: characters() is only called if there is content between the start
            # tag and the end tag. If there is no content, characters() is not called.
            self.content.write(">")
            self.start_tag_open = False
        self.content.write(XMLHelper.escape(content))

    def endElement(self, name: str) -> None:
        """Write the end tag for ``name`` unless it closes a removed span."""
        if name == "office:body":
            self.in_body = False
        elif self.in_body and name == "text:span" and self._span_removed:
            # The top of the stack tells whether the matching start tag was dropped.
            if self._span_removed.pop():
                self.num_removed_spans += 1
                return  # remove this tag
        if self.start_tag_open:
            # Nothing was written since the start tag: emit an empty element.
            self.content.write("/>")
            self.start_tag_open = False
        else:
            self.content.write(XMLHelper.endtag(name))

    def get_content(self) -> str:
        """Return everything written to the output buffer so far."""
        return self.content.getvalue()

    def get_location(self) -> str:
        """Return "[line:column]" from the locator, or "unknown location"."""
        if hasattr(self, "locator"):
            return f"[{self.locator.getLineNumber()}:{self.locator.getColumnNumber()}]"
        else:
            return "unknown location"

    def get_removed_styles(self) -> set[str]:
        """Return the style names of all spans that were removed."""
        return self.removed_styles

    def get_num_removed_spans(self) -> int:
        """Return the number of removed spans (counted at their end tag)."""
        return self.num_removed_spans

    # This callback is used for debugging, it can be used to print
    # line numbers in the XML file
    def setDocumentLocator(self, locator):
        self.locator = locator

    def startDocument(self) -> None:
        """Start the output with the XML header provided by XMLHelper."""
        self.content.write(XMLHelper.header)

    def startElement(self, name: str, attrs: xml.sax.xmlreader.AttributesImpl) -> None:
        """Record style names before the body; drop undefined spans inside it."""
        if self.start_tag_open:
            # A child element begins, so the parent's start tag must be closed.
            self.content.write(">")
            self.start_tag_open = False
        if name == "office:body":
            self.in_body = True
        elif not self.in_body:  # Before the office:body section
            style_name = attrs.get("style:name")
            if style_name:
                self.styles.add(style_name)
        elif name == "text:span":
            span_style_name = attrs.get("text:style-name")
            # A span without a (non-empty) style name is always kept.
            remove = bool(span_style_name) and span_style_name not in self.styles
            self._span_removed.append(remove)
            if remove:
                self.removed_styles.add(span_style_name)
                return  # remove this tag
        self.start_tag_open = True
        self.content.write(XMLHelper.starttag(name, attrs, close_tag=False))


class RemoveSpanTags:
def __init__(self, maindir: str, filename: str|None, max_files: int|None) -> None:
self.maindir = Path(maindir)
self.filename = filename
self.max_files = max_files

def remove_span_tags(self) -> None:
if self.filename:
self.remove_span_tags_from_file(self.maindir / self.filename)
else:
self.remove_span_tags_from_all_files()

def remove_span_tags_from_all_files(self) -> None:
for i, filename in enumerate(self.maindir.rglob("*.fodt"), start=1):
if self.max_files and i > self.max_files:
break
logging.info(f"Processing file {i}: {filename}")
self.remove_span_tags_from_file(filename)

def remove_span_tags_from_file(self, filename: Path) -> None:
parser = xml.sax.make_parser()
handler = RemoveSpanHandler()
parser.setContentHandler(handler)
parser.parse(filename)
removed_styles = handler.get_removed_styles()
num_removed_spans = handler.get_num_removed_spans()
if len(removed_styles) > 0:
with open(filename, "w", encoding='utf8') as f:
f.write(handler.get_content())
logging.info(f"Removed {num_removed_spans} span tags from {filename}")

# USAGE:
#
# fodt-remove-undefined-span-tags \
# --maindir=<main directory> \
# --filename=<filename> \
# --max-files=<max files>
#
# DESCRIPTION:
#
# Removes undefined span tags from a given .fodt file, or, if --filename is not
# specified, from all .fodt subdocuments in the specified main directory.
# A span tag is considered undefined if it refers to a style name that is not
# defined in the document.
# If the --filename option is not given, the --max-files option can be used to
# limit the number of files that will be processed. If not given, all files are processed.
#
@click.command()
@ClickOptions.maindir(required=False)
@click.option(
    "--filename",
    required=False,
    type=str,
    help="Name of the file to process.",
)
@click.option(
    "--max-files",
    required=False,
    type=int,
    help="Maximum number of files to process.",
)
def remove_undefined_span_tags(
    maindir: str,
    filename: str|None,
    max_files: int|None,
) -> None:
    """Remove unused span tags from .fodt subdocuments."""
    # Command-line entry point: enable INFO logging so the per-file progress
    # and removal counts are shown, then hand the options to RemoveSpanTags.
    logging.basicConfig(level=logging.INFO)
    remover = RemoveSpanTags(maindir, filename, max_files)
    remover.remove_span_tags()

0 comments on commit ff907fb

Please sign in to comment.