Skip to content

Commit

Permalink
Merge pull request #173 from hakonhagland/remove_span
Browse files Browse the repository at this point in the history
Created script to remove revision span tags
  • Loading branch information
lisajulia authored Mar 8, 2024
2 parents 9cb8879 + 22720b6 commit 5a7b0c2
Show file tree
Hide file tree
Showing 2 changed files with 350 additions and 0 deletions.
1 change: 1 addition & 0 deletions scripts/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ fodt-remove-chapters = "fodt.splitter:remove_chapters"
fodt-remove-elements = "fodt.splitter:remove_elements"
fodt-remove-fonts = "fodt.remove_fonts:remove_fonts"
fodt-remove-lines = "fodt.remove_lines:remove_lines"
fodt-remove-span_tags = "fodt.remove_span_tags:remove_version_span_tags"
fodt-set-font-decls = "fodt.set_fonts:set_font_decls"
fodt-set-keyword-status = "fodt.add_keyword_status:set_keyword_status"
fodt-split-all = "fodt.split_all:split_all"
Expand Down
349 changes: 349 additions & 0 deletions scripts/python/src/fodt/remove_span_tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
import io
import logging
import xml.sax
import xml.sax.handler
import xml.sax.xmlreader
import xml.sax.saxutils
from pathlib import Path

import click

from fodt.constants import ClickOptions
from fodt.xml_helpers import XMLHelper

class RemoveEmptyLinesHandler(xml.sax.handler.ContentHandler):
# Within the office:automatic-styles section, remove empty lines left behind
# by the RemovedStylesHandler.

def __init__(self) -> None:
self.content = io.StringIO()
self.in_auto_styles = False
self.in_tag = False # Within a tag in the automatic-styles section
self.tag_recursion = 0 # Recursion level within the tag
self.start_tag_open = False # For empty tags, do not close with />
self.save_space = "" # See comments for characters() below

def characters(self, content: str):
# Check if we are at the top level of the automatic-styles section
if self.in_auto_styles and not self.in_tag:
# This should only happen after an end tag and before a start tag
if content.isspace():
# We need to accumulate the content since characters() does not
# provide all the space between an end tag of an element and the
# beginning of the start tag of an element in on call, it chunks
# the space into multiple calls..
# Then, we will actually remove the space in the beginning of
# startElement()..
self.save_space += content
return # Do not write the space to the content yet..
if self.start_tag_open:
# NOTE: characters() is only called if there is content between the start
# tag and the end tag. If there is no content, characters() is not called.
self.content.write(">")
self.start_tag_open = False
self.content.write(XMLHelper.escape(content))

def endElement(self, name: str):
if self.in_auto_styles:
if self.in_tag:
self.tag_recursion -= 1
if self.tag_recursion == 0:
# We are at the top level of the automatic-styles section
self.in_tag = False
if name == "office:automatic-styles":
self.in_auto_styles = False
if self.start_tag_open:
self.content.write("/>")
self.start_tag_open = False
else:
self.content.write(XMLHelper.endtag(name))

def get_content(self) -> str:
return self.content.getvalue()

# This callback is used for debugging, it can be used to print
# line numbers in the XML file
def setDocumentLocator(self, locator):
self.locator = locator

def startDocument(self):
self.content.write(XMLHelper.header)

def startElement(self, name:str, attrs: xml.sax.xmlreader.AttributesImpl):
if self.start_tag_open:
self.content.write(">") # Close the start tag
self.start_tag_open = False
if self.in_auto_styles and not self.in_tag:
if len(self.save_space) > 0:
if '\n' in self.save_space:
index = self.save_space.rfind("\n")
if index != -1:
# Remove the space left behind by the removed style tag
self.save_space = self.save_space[index:]
self.content.write(self.save_space)
self.save_space = ""
if name == "office:automatic-styles":
self.in_auto_styles = True
elif self.in_auto_styles:
self.in_tag = True
self.tag_recursion += 1
self.start_tag_open = True
self.content.write(XMLHelper.starttag(name, attrs, close_tag=False))


class RemoveStylesHandler(xml.sax.handler.ContentHandler):
# Within the office:automatic-styles section, remove the rsid styles that
# corresponds to the span tags that were removed in the previous iteration.

def __init__(self, removed_styles: set[str]) -> None:
self.removed_styles = removed_styles
self.content = io.StringIO()
self.in_auto_styles = False
self.start_tag_open = False # For empty tags, do not close with />
# NOTE: Assume that the style:style tag does not contain nested style:style tags
# so we omit recursion handling here..
self.in_style = False

def characters(self, content: str):
if self.in_style:
return # remove this tag
if self.start_tag_open:
# NOTE: characters() is only called if there is content between the start
# tag and the end tag. If there is no content, characters() is not called.
self.content.write(">")
self.start_tag_open = False
self.content.write(XMLHelper.escape(content))

def endElement(self, name: str):
if self.in_style:
if name == "style:style":
self.in_style = False
return # remove this tag
if name == "office:automatic-styles":
self.in_auto_styles = False
if self.start_tag_open:
self.content.write("/>")
self.start_tag_open = False
else:
self.content.write(XMLHelper.endtag(name))

def get_content(self) -> str:
return self.content.getvalue()

def startDocument(self):
self.content.write(XMLHelper.header)

def startElement(self, name:str, attrs: xml.sax.xmlreader.AttributesImpl):
if self.in_style:
return # remove this tag
if self.start_tag_open:
self.content.write(">")
self.start_tag_open = False
if name == "office:automatic-styles":
self.in_auto_styles = True
elif self.in_auto_styles and name == "style:style":
if attrs.get("style:name"):
style_name = attrs.get("style:name")
if style_name in self.removed_styles:
self.in_style = True
return # remove this tag
self.start_tag_open = True
self.content.write(XMLHelper.starttag(name, attrs, close_tag=False))


class RemoveSpanHandler(xml.sax.handler.ContentHandler):
# Within the office:automatic-styles section, record the style:name attribute of
# all the style:style elements that has an inner style:text-properties element with
# attribute officeooo:rsid.
# Then when parsing the office:body section, remove all text:span elements that
# have the style-name attribute set to one of the recorded style:name values.
# NOTE: This will not remove span tags outside the office:body section yet. For
# example inside the office:master-styles section.
def __init__(self) -> None:
self.content = io.StringIO()
self.in_body = False
self.in_span = False
self.in_style = False
self.in_auto_styles = False
self.start_tag_open = False # For empty tags, do not close with />
self.rsid_styles = set() # All rsid style names
self.removed_styles = set() # All rsid style names that are removed
# NOTE: we do not handle nested spans, so only the outermost span is
# currently removed
self.span_recursion = 0

def characters(self, content: str):
if self.start_tag_open:
# NOTE: characters() is only called if there is content between the start
# tag and the end tag. If there is no content, characters() is not called.
self.content.write(">")
self.start_tag_open = False
self.content.write(XMLHelper.escape(content))

def endElement(self, name: str):
if name == "office:automatic-styles":
self.in_auto_styles = False
elif self.in_style and name == "style:style":
self.in_style = False
elif name == "office:body":
self.in_body = False
elif self.in_body and name == "text:span" and self.in_span:
if self.span_recursion > 0:
self.span_recursion -= 1
else:
self.in_span = False
return # remove this tag
if self.start_tag_open:
self.content.write("/>")
self.start_tag_open = False
else:
self.content.write(XMLHelper.endtag(name))

def get_content(self) -> str:
return self.content.getvalue()

def get_location(self) -> str:
if hasattr(self, "locator"):
return f"[{self.locator.getLineNumber()}:{self.locator.getColumnNumber()}]"
else:
return "unknown location"

def get_removed_styles(self) -> set[str]:
return self.removed_styles

# This callback is used for debugging, it can be used to print
# line numbers in the XML file
def setDocumentLocator(self, locator):
self.locator = locator

def startDocument(self):
self.content.write(XMLHelper.header)

def startElement(self, name:str, attrs: xml.sax.xmlreader.AttributesImpl):
if self.start_tag_open:
self.content.write(">")
self.start_tag_open = False
if name == "office:automatic-styles":
self.in_auto_styles = True
elif self.in_auto_styles and name == "style:style":
if attrs.get("style:name"):
self.style_name = attrs.get("style:name")
#logging.info(f"style_name: {self.style_name}")
self.in_style = True
elif self.in_style and name == "style:text-properties":
if attrs.get("officeooo:rsid"):
#logging.info(f"adding style_name: {self.style_name}")
self.rsid_styles.add(self.style_name)
elif name == "office:body":
self.in_body = True
elif self.in_body and name == "text:span":
if self.in_span:
self.span_recursion += 1
elif attrs.get("text:style-name"):
span_style_name = attrs.get("text:style-name")
if span_style_name in self.rsid_styles:
self.in_span = True
self.span_recursion = 0
self.removed_styles.add(span_style_name)
#logging.info(f"removing span: {span_style_name}")
return # remove this tag
self.start_tag_open = True
self.content.write(XMLHelper.starttag(name, attrs, close_tag=False))


class RemoveSpanTags:
def __init__(self, maindir: str, filename: str|None, max_files: int|None) -> None:
self.maindir = Path(maindir)
self.filename = filename
self.max_files = max_files

def remove_empty_lines(self, filename: Path) -> None:
# Remove empty lines from the automtic-styles section
parser = xml.sax.make_parser()
handler = RemoveEmptyLinesHandler()
parser.setContentHandler(handler)
parser.parse(filename)
with open(filename, "w", encoding='utf8') as f:
f.write(handler.get_content())

def remove_span_tags(self) -> None:
if self.filename:
self.remove_span_tags_and_styles_from_file(self.maindir / self.filename)
else:
self.remove_span_tags_from_all_files()

def remove_span_tags_from_all_files(self) -> None:
for i, filename in enumerate(self.maindir.rglob("*.fodt"), start=1):
if self.max_files and i > self.max_files:
break
logging.info(f"Processing file {i}: {filename}")
self.remove_span_tags_and_styles_from_file(filename)

def remove_rsid_styles(self, filename: Path, removed_styles: set[str]) -> None:
parser = xml.sax.make_parser()
handler = RemoveStylesHandler(removed_styles)
parser.setContentHandler(handler)
parser.parse(filename)
with open(filename, "w", encoding='utf8') as f:
f.write(handler.get_content())

def remove_span_tags_and_styles_from_file(self, filename: Path) -> None:
# We do this in three steps:
# - first we remove the span tags, then
# - we remove the corresponding rsid styles from the automatic-styles section, and
# - finally we remove whitespace empty lines from the file.
#
# Why do we need step #3? Because empty lines can be created due to the
# indentation of the removed style tags in step #2 above.
removed_styles = self.remove_span_tags_from_file(filename)
if len(removed_styles) > 0:
self.remove_rsid_styles(filename, removed_styles)
self.remove_empty_lines(filename)
logging.info(f"Removed {len(removed_styles)} styles")
else:
logging.info(f"No styles removed")

def remove_span_tags_from_file(self, filename: Path) -> set[str]:
parser = xml.sax.make_parser()
handler = RemoveSpanHandler()
parser.setContentHandler(handler)
parser.parse(filename)
removed_styles = handler.get_removed_styles()
if len(removed_styles) > 0:
with open(filename, "w", encoding='utf8') as f:
f.write(handler.get_content())
return removed_styles

# USAGE:
#
# fodt-remove-version-span-tags \
# --maindir=<main directory> \
# --filename=<filename> \
# --max-files=<max files>
#
# DESCRIPTION:
#
# Removes the version span tags from a given .fodt file, or, if --filename is not
# specified, from all .fodt subdocuments in the specified main directory.
# In case, --filename option is not given, the max-files option can be used to
# limit the number of files that will be processed. If not given, all files are processed.
#
# For background information on why this is needed see:
#
# https://ask.libreoffice.org/t/where-do-the-text-style-name-tnn-span-tags-come-from-and-how-do-i-get-rid-of-them/31681
#
@click.command()
@ClickOptions.maindir(required=False)
@click.option("--filename", type=str, help="Name of the file to process.", required=False)
@click.option(
"--max-files",
type=int,
help="Maximum number of files to process.",
required=False,
)
def remove_version_span_tags(
maindir: str, filename: str|None, max_files: int|None
) -> None:
"""Remove version span tags from all .fodt subdocuments."""
logging.basicConfig(level=logging.INFO)
RemoveSpanTags(maindir, filename, max_files).remove_span_tags()

0 comments on commit 5a7b0c2

Please sign in to comment.