rfctr: remove obsolete python-docx monkey-patch

As of v1.0.0, `python-docx` includes hyperlink text in `Paragraph.text`, so this monkey-patch is no longer required.
Unstructured-IO · Oct 3, 2023 · fbc4738
1 parent 0799e4a
commit fbc4738
Showing 1 changed file with 1 addition and 43 deletions.
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -25,17 +25,14 @@
 import docx
 from docx.document import Document
 from docx.enum.section import WD_SECTION_START
-from docx.oxml.ns import nsmap, qn
+from docx.oxml.ns import nsmap
 from docx.oxml.section import CT_SectPr
 from docx.oxml.table import CT_Tbl
 from docx.oxml.text.paragraph import CT_P
-from docx.oxml.text.run import CT_R
-from docx.oxml.xmlchemy import BaseOxmlElement
 from docx.section import Section, _Footer, _Header
 from docx.table import Table as DocxTable
 from docx.text.pagebreak import RenderedPageBreak
 from docx.text.paragraph import Paragraph
-from docx.text.run import Run
 from lxml import etree
 from typing_extensions import TypeAlias
 
@@ -202,8 +199,6 @@ class _DocxPartitioner:
     #       domain-specific knowledge to comfortable here and is of general use so welcome in the
     #       library.
 
-    # TODO: Move Paragraph._get_paragraph_runs() monkey-patch upstream to `python-docx`.
-
     def __init__(
         self,
         filename: Optional[str],
@@ -813,40 +808,3 @@ def _sectPrs(self) -> Sequence[CT_SectPr]:
         return self._sectPr.xpath(
             "/w:document/w:body/w:p/w:pPr/w:sectPr | /w:document/w:body/w:sectPr",
         )
-
-
-# == monkey-patch docx.text.Paragraph.runs ===========================================
-
-
-def _get_paragraph_runs(paragraph: Paragraph) -> Sequence[Run]:
-    """Gets all runs in paragraph, including hyperlinks python-docx skips.
-
-    Without this, the default runs function skips over hyperlinks.
-
-    Args:
-        paragraph (Paragraph): A Paragraph object.
-
-    Returns:
-        list: A list of Run objects.
-    """
-
-    def _get_runs(node: BaseOxmlElement, parent: Paragraph) -> Iterator[Run]:
-        """Recursively get runs."""
-        for child in node:
-            # -- the Paragraph has runs as direct children --
-            if child.tag == qn("w:r"):
-                yield Run(cast(CT_R, child), parent)
-                continue
-            # -- but it also has hyperlink children that themselves contain runs, so
-            # -- recurse into those
-            if child.tag == qn("w:hyperlink"):
-                yield from _get_runs(child, parent)
-
-    return list(_get_runs(paragraph._element, paragraph))
-
-
-Paragraph.runs = property(  # pyright: ignore[reportGeneralTypeIssues]
-    lambda self: _get_paragraph_runs(self),
-)
-
-# ====================================================================================