Skip to content

Commit

Permalink
rfctr: remove obsolete python-docx monkey-patch
Browse files Browse the repository at this point in the history
`python-docx` v1.0.0 includes hyperlink text in `Paragraph.text` so this
monkey-patch is no longer required.
  • Loading branch information
scanny committed Oct 3, 2023
1 parent 0799e4a commit fbc4738
Showing 1 changed file with 1 addition and 43 deletions.
44 changes: 1 addition & 43 deletions unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,14 @@
import docx
from docx.document import Document
from docx.enum.section import WD_SECTION_START
from docx.oxml.ns import nsmap, qn
from docx.oxml.ns import nsmap
from docx.oxml.section import CT_SectPr
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.oxml.text.run import CT_R
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.section import Section, _Footer, _Header
from docx.table import Table as DocxTable
from docx.text.pagebreak import RenderedPageBreak
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from lxml import etree
from typing_extensions import TypeAlias

Expand Down Expand Up @@ -202,8 +199,6 @@ class _DocxPartitioner:
# domain-specific knowledge to be comfortable here and is of general use so welcome in the
# library.

# TODO: Move Paragraph._get_paragraph_runs() monkey-patch upstream to `python-docx`.

def __init__(
self,
filename: Optional[str],
Expand Down Expand Up @@ -813,40 +808,3 @@ def _sectPrs(self) -> Sequence[CT_SectPr]:
return self._sectPr.xpath(
"/w:document/w:body/w:p/w:pPr/w:sectPr | /w:document/w:body/w:sectPr",
)


# == monkey-patch docx.text.Paragraph.runs ===========================================


def _get_paragraph_runs(paragraph: Paragraph) -> Sequence[Run]:
    """Return every run in `paragraph`, including runs nested inside hyperlinks.

    The stock `python-docx` runs accessor skips over hyperlinks; this variant also
    descends into `w:hyperlink` children so the runs they contain are reported too, in
    the order they are encountered.

    Args:
        paragraph (Paragraph): A Paragraph object.

    Returns:
        list: A list of Run objects.
    """

    def _iter_runs(element: BaseOxmlElement) -> Iterator[Run]:
        """Generate a `Run` for each `w:r` found under `element`, descending into hyperlinks."""
        for item in element:
            tag = item.tag
            if tag == qn("w:hyperlink"):
                # -- a hyperlink wraps runs of its own, so recurse into it --
                yield from _iter_runs(item)
            elif tag == qn("w:r"):
                # -- a run element; its proxy is always parented to the paragraph --
                yield Run(cast(CT_R, item), paragraph)

    return list(_iter_runs(paragraph._element))


# -- Replace `Paragraph.runs` with a read-only property that delegates to
# -- `_get_paragraph_runs()`, so runs nested inside hyperlinks are included. This is a
# -- module-level side effect: it applies to every `Paragraph` once this module is imported.
Paragraph.runs = property( # pyright: ignore[reportGeneralTypeIssues]
    lambda self: _get_paragraph_runs(self),
)

# ====================================================================================

0 comments on commit fbc4738

Please sign in to comment.