Added support for escape character in markdown text (#1224)

* Added support for escape character in markdown text * updated documentation * Fixed escape character handling & added more tests * rephrased docu * updated documentation --------- Co-authored-by: KingOfKaste <[email protected]>
py-pdf · Jul 20, 2024 · 5ffab70 · 5ffab70
1 parent 7c91959
commit 5ffab70
Show file tree

Hide file tree

Showing 12 changed files with 187 additions and 15 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 
 ## [2.7.10] - Not released yet
 ### Added
+* support for escaping markers in markdown text with a backslash (`\`) - _cf._ [issue #1215](https://github.com/py-pdf/fpdf2/issues/1215)
 * Wrapping words on spaces now considers all common space symbols in addition to regular spaces (' '), addressing issues with word-wrapping for languages like Thai, as per [#1190](https://github.com/py-pdf/fpdf2/issues/1190) and [#1191](https://github.com/py-pdf/fpdf2/pull/1191).
 * [`Templates`](https://py-pdf.github.io/fpdf2/fpdf/Templates.html) can now be also defined in JSON files.
 * support to optionally set `wrapmode` in templates (default `"WORD"` can optionally be set to `"CHAR"` to support wrapping on characters for scripts like Chinese or Japanese) - _cf._ [#1159](https://github.com/py-pdf/fpdf2/issues/1159) - thanks to @carlhiggs

diff --git a/docs/TextStyling.md b/docs/TextStyling.md
@@ -181,20 +181,25 @@ An optional `markdown=True` parameter can be passed to the [`cell()`](fpdf/fpdf.
 & [`multi_cell()`](fpdf/fpdf.html#fpdf.fpdf.FPDF.multi_cell) methods
 in order to enable basic Markdown-like styling: `**bold**, __italics__, --underlined--`.
 
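+If the printable text contains a character sequence that would otherwise be interpreted as a formatting marker, it can be escaped by prefixing it with a backslash (`\`). As with Python string escapes, a doubled backslash before a marker yields one literal backslash and leaves the marker active. Note that in a regular (non-raw) Python string literal each backslash must itself be written as `\\` (see the example below).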
+
 Bold & italics require using dedicated fonts for each style.
 
 For the standard fonts (Courier, Helvetica & Times), those dedicated fonts are configured by default:
 
 ```python
 from fpdf import FPDF
 
-pdf = fpdf.FPDF()
+pdf = FPDF()
 pdf.add_page()
-pdf.set_font("Times", size=60)
-pdf.cell(text="**Lorem** __Ipsum__ --dolor--", markdown=True)
+pdf.set_font("Times", size=50)
+pdf.cell(text="**Lorem** __Ipsum__ --dolor--", markdown=True, new_x='LEFT', new_y='NEXT')
+pdf.cell(text="\\**Lorem\\** \\\\__Ipsum\\\\__ --dolor--", markdown=True)
 pdf.output("markdown-styled.pdf")
 ```
 
+![Rendering of the markdown example above](markdown-style.png)
+
 Using other fonts means that their variants (bold, italics)
 must be registered using `add_font` with `style="B"` and `style="I"`.
 Several unit tests in `test/text/` demonstrate that:

diff --git a/docs/markdown-style.png b/docs/markdown-style.png
diff --git a/fpdf/fpdf.py b/fpdf/fpdf.py
@@ -204,6 +204,7 @@ class FPDF(GraphicsStateMixin, TextRegionMixin):
     MARKDOWN_BOLD_MARKER = "**"
     MARKDOWN_ITALICS_MARKER = "__"
     MARKDOWN_UNDERLINE_MARKER = "--"
+    MARKDOWN_ESCAPE_CHARACTER = "\\"
     MARKDOWN_LINK_REGEX = re.compile(r"^\[([^][]+)\]\(([^()]+)\)(.*)$", re.DOTALL)
     MARKDOWN_LINK_COLOR = None
     MARKDOWN_LINK_UNDERLINE = True
@@ -2936,7 +2937,7 @@ def cell(
                 (identifier returned by `FPDF.add_link`) or external URL.
             center (bool): center the cell horizontally on the page.
             markdown (bool): enable minimal markdown-like markup to render part
-                of text as bold / italics / underlined. Default to False.
+                of text as bold / italics / underlined. Supports `\\` as escape character. Default to False.
             txt (str): [**DEPRECATED since v2.7.6**] String to print. Default value: empty string.
 
         Returns: a boolean indicating if page break was triggered
@@ -3455,6 +3456,7 @@ def frag():
             font_glyphs = self.current_font.cmap
         else:
             font_glyphs = []
+        num_escape_chars = 0
 
         while text:
             is_marker = text[:2] in (
@@ -3480,16 +3482,27 @@ def frag():
                     and (not txt_frag or txt_frag[-1] != half_marker)
                     and (len(text) < 3 or text[2] != half_marker)
                 ):
-                    if txt_frag:
-                        yield frag()
-                    if text[:2] == self.MARKDOWN_BOLD_MARKER:
-                        in_bold = not in_bold
-                    if text[:2] == self.MARKDOWN_ITALICS_MARKER:
-                        in_italics = not in_italics
-                    if text[:2] == self.MARKDOWN_UNDERLINE_MARKER:
-                        in_underline = not in_underline
-                    text = text[2:]
-                    continue
+                    txt_frag = (
+                        txt_frag[: -((num_escape_chars + 1) // 2)]
+                        if num_escape_chars > 0
+                        else txt_frag
+                    )
+                    if num_escape_chars % 2 == 0:
+                        if txt_frag:
+                            yield frag()
+                        if text[:2] == self.MARKDOWN_BOLD_MARKER:
+                            in_bold = not in_bold
+                        if text[:2] == self.MARKDOWN_ITALICS_MARKER:
+                            in_italics = not in_italics
+                        if text[:2] == self.MARKDOWN_UNDERLINE_MARKER:
+                            in_underline = not in_underline
+                        text = text[2:]
+                        continue
+                num_escape_chars = (
+                    num_escape_chars + 1
+                    if text[0] == self.MARKDOWN_ESCAPE_CHARACTER
+                    else 0
+                )
                 is_link = self.MARKDOWN_LINK_REGEX.match(text)
                 if is_link:
                     link_text, link_dest, text = is_link.groups()
@@ -3673,7 +3686,7 @@ def multi_cell(
             ln (int): **DEPRECATED since 2.5.1**: Use `new_x` and `new_y` instead.
             max_line_height (float): optional maximum height of each sub-cell generated
             markdown (bool): enable minimal markdown-like markup to render part
-                of text as bold / italics / underlined. Default to False.
+                of text as bold / italics / underlined. Supports `\\` as escape character. Default to False.
             print_sh (bool): Treat a soft-hyphen (\\u00ad) as a normal printable
                 character, instead of a line breaking opportunity. Default value: False
             wrapmode (fpdf.enums.WrapMode): "WORD" for word based line wrapping (default),

diff --git a/test/text/cell_markdown_bold_italic_escaped.pdf b/test/text/cell_markdown_bold_italic_escaped.pdf
diff --git a/test/text/cell_markdown_escaped.pdf b/test/text/cell_markdown_escaped.pdf
diff --git a/test/text/cell_markdown_with_ttf_fonts_escaped.pdf b/test/text/cell_markdown_with_ttf_fonts_escaped.pdf
diff --git a/test/text/multi_cell_markdown_escaped.pdf b/test/text/multi_cell_markdown_escaped.pdf
diff --git a/test/text/multi_cell_markdown_with_ttf_fonts_escaped.pdf b/test/text/multi_cell_markdown_with_ttf_fonts_escaped.pdf
diff --git a/test/text/test_cell.py b/test/text/test_cell.py
@@ -173,6 +173,14 @@ def test_cell_markdown(tmp_path):
     assert_pdf_equal(pdf, HERE / "cell_markdown.pdf", tmp_path)
 
 
+def test_cell_markdown_escaped(tmp_path):
+    """Render one cell mixing active markers with backslash-prefixed ones."""
+    # Each "\\" in this literal is a single backslash in the text given to cell().
+    text = "**Lo\\rem** \\__Ipsum\\__ \\\\--dolor\\\\--"
+    doc = FPDF()
+    doc.add_page()
+    doc.set_font("Times", size=40)
+    doc.cell(text=text, markdown=True)
+    assert_pdf_equal(doc, HERE / "cell_markdown_escaped.pdf", tmp_path)
+
+
 def test_cell_markdown_bold_italic(tmp_path):
     # issue 1094
     pdf = FPDF()
@@ -182,6 +190,14 @@ def test_cell_markdown_bold_italic(tmp_path):
     assert_pdf_equal(pdf, HERE / "cell_markdown_bold_italic.pdf", tmp_path)
 
 
+def test_cell_markdown_bold_italic_escaped(tmp_path):
+    """Render bold+italics text that contains backslash-prefixed underline markers."""
+    text = "**__Lorem \\--Ipsum\\--__**"
+    doc = FPDF()
+    doc.add_page()
+    doc.set_font("Times", size=40)
+    doc.cell(text=text, markdown=True)
+    reference = HERE / "cell_markdown_bold_italic_escaped.pdf"
+    assert_pdf_equal(doc, reference, tmp_path)
+
+
 def test_cell_markdown_with_ttf_fonts(tmp_path):
     pdf = FPDF()
     pdf.add_page()
@@ -193,6 +209,17 @@ def test_cell_markdown_with_ttf_fonts(tmp_path):
     assert_pdf_equal(pdf, HERE / "cell_markdown_with_ttf_fonts.pdf", tmp_path)
 
 
+def test_cell_markdown_with_ttf_fonts_escaped(tmp_path):
+    """Same text as the core-font escaped-cell test, rendered with Roboto TTF fonts."""
+    doc = FPDF()
+    doc.add_page()
+    # Register the regular, bold and italic variants, in that order.
+    for style, font_file in (
+        ("", "Roboto-Regular.ttf"),
+        ("B", "Roboto-Bold.ttf"),
+        ("I", "Roboto-Italic.ttf"),
+    ):
+        doc.add_font("Roboto", style, FONTS_DIR / font_file)
+    doc.set_font("Roboto", size=40)
+    text = "**Lo\\rem** \\__Ipsum\\__ \\\\--dolor\\\\--"
+    doc.cell(text=text, markdown=True)
+    reference = HERE / "cell_markdown_with_ttf_fonts_escaped.pdf"
+    assert_pdf_equal(doc, reference, tmp_path)
+
+
 def test_cell_markdown_missing_ttf_font():
     pdf = FPDF()
     pdf.add_page()

diff --git a/test/text/test_markdown_parse.py b/test/text/test_markdown_parse.py
@@ -26,6 +26,62 @@ def test_markdown_parse_simple_ok():
     assert frags == expected
 
 
+def test_markdown_parse_simple_ok_escaped():
+    """Check fragments produced for backslashes placed before, inside and after markers."""
+    # A single backslash before each marker: markers come out as literal text.
+    frags = tuple(
+        FPDF()._parse_chars(
+            "\\**bold\\**, \\__italics\\__ and \\--underlined\\-- escaped", True
+        )
+    )
+    expected = (
+        Fragment("**bold**, __italics__ and --underlined-- escaped", GSTATE, k=PDF.k),
+    )
+    assert frags == expected
+    # Same kind of input, written as a raw string literal.
+    frags = tuple(
+        FPDF()._parse_chars(
+            r"raw \**bold\**, \__italics\__ and \--underlined\-- escaped", True
+        )
+    )
+    expected = (
+        Fragment(
+            "raw **bold**, __italics__ and --underlined-- escaped", GSTATE, k=PDF.k
+        ),
+    )
+    assert frags == expected
+    # A backslash between the two halves of a marker: text is returned unchanged.
+    frags = tuple(FPDF()._parse_chars("escape *\\*between marker*\\*", True))
+    expected = (Fragment("escape *\\*between marker*\\*", GSTATE, k=PDF.k),)
+    assert frags == expected
+    # A backslash right after a marker: the marker stays active and the
+    # backslash is kept as ordinary text.
+    frags = tuple(FPDF()._parse_chars("escape **\\after marker**\\", True))
+    expected = (
+        Fragment("escape ", GSTATE, k=PDF.k),
+        Fragment("\\after marker", GSTATE_B, k=PDF.k),
+        Fragment("\\", GSTATE, k=PDF.k),
+    )
+    # This comparison was missing, so the scenario above was never verified.
+    assert frags == expected
+
+
+def test_markdown_unrelated_escape():
+    """Backslashes that do not precede a marker are passed through unchanged."""
+    cases = (
+        ("unrelated \\ escape \\**bold\\**", "unrelated \\ escape **bold**"),
+        (
+            "unrelated \\\\ double escape \\**bold\\**",
+            "unrelated \\\\ double escape **bold**",
+        ),
+    )
+    for markdown_text, rendered in cases:
+        parsed = tuple(FPDF()._parse_chars(markdown_text, True))
+        assert parsed == (Fragment(rendered, GSTATE, k=PDF.k),)
+
+
+def test_markdown_parse_multiple_escape():
+    """Two backslashes before a marker keep it active; three make it literal."""
+    double_escaped = tuple(
+        FPDF()._parse_chars("\\\\**bold\\\\** double escaped", True)
+    )
+    assert double_escaped == (
+        Fragment("\\", GSTATE, k=PDF.k),
+        Fragment("bold\\", GSTATE_B, k=PDF.k),
+        Fragment(" double escaped", GSTATE, k=PDF.k),
+    )
+    triple_escaped = tuple(
+        FPDF()._parse_chars("\\\\\\**triple bold\\\\\\** escaped", True)
+    )
+    assert triple_escaped == (
+        Fragment("\\**triple bold\\** escaped", GSTATE, k=PDF.k),
+    )
+
+
 def test_markdown_parse_overlapping():
     frags = tuple(FPDF()._parse_chars("**bold __italics__**", True))
     expected = (
@@ -35,6 +91,12 @@ def test_markdown_parse_overlapping():
     assert frags == expected
 
 
+def test_markdown_parse_overlapping_escaped():
+    """Escaped italics markers inside a bold span stay literal in the bold text."""
+    parsed = tuple(FPDF()._parse_chars("**bold \\__italics\\__**", True))
+    assert parsed == (Fragment("bold __italics__", GSTATE_B, k=PDF.k),)
+
+
 def test_markdown_parse_crossing_markers():
     frags = tuple(FPDF()._parse_chars("**bold __and** italics__", True))
     expected = (
@@ -45,6 +107,15 @@ def test_markdown_parse_crossing_markers():
     assert frags == expected
 
 
+def test_markdown_parse_crossing_markers_escaped():
+    """An escaped bold marker inside an italics span does not end the bold style."""
+    parsed = tuple(FPDF()._parse_chars("**bold __and\\** italics__", True))
+    assert parsed == (
+        Fragment("bold ", GSTATE_B, k=PDF.k),
+        Fragment("and** italics", GSTATE_BI, k=PDF.k),
+    )
+
+
 def test_markdown_parse_unterminated():
     frags = tuple(FPDF()._parse_chars("**bold __italics__", True))
     expected = (
@@ -54,6 +125,15 @@ def test_markdown_parse_unterminated():
     assert frags == expected
 
 
+def test_markdown_parse_unterminated_escaped():
+    """With its closing marker escaped, bold stays on for the rest of the text."""
+    parsed = tuple(FPDF()._parse_chars("**bold\\** __italics__", True))
+    assert parsed == (
+        Fragment("bold** ", GSTATE_B, k=PDF.k),
+        Fragment("italics", GSTATE_BI, k=PDF.k),
+    )
+
+
 def test_markdown_parse_line_of_markers():
     frags = tuple(FPDF()._parse_chars("*** woops", True))
     expected = (Fragment("*** woops", GSTATE, k=PDF.k),)
@@ -72,6 +152,15 @@ def test_markdown_parse_line_of_markers():
     assert frags == expected
 
 
+def test_markdown_parse_line_of_markers_escaped():
+    """Check parsing when a backslash sits before or inside a run of four asterisks."""
+    cases = (
+        ("\\****BOLD**", "\\****BOLD"),
+        ("*\\***BOLD**", "*\\***BOLD"),
+    )
+    for markdown_text, rendered in cases:
+        parsed = tuple(FPDF()._parse_chars(markdown_text, True))
+        assert parsed == (Fragment(rendered, GSTATE, k=PDF.k),)
+
+
 def test_markdown_parse_newline_after_markdown_link():  # issue 916
     text = "[fpdf2](https://py-pdf.github.io/fpdf2/)\nGo visit it!"
     frags = tuple(FPDF()._parse_chars(text, True))

diff --git a/test/text/test_multi_cell_markdown.py b/test/text/test_multi_cell_markdown.py
@@ -25,6 +25,22 @@ def test_multi_cell_markdown(tmp_path):
     assert_pdf_equal(pdf, HERE / "multi_cell_markdown.pdf", tmp_path)
 
 
+def test_multi_cell_markdown_escaped(tmp_path):
+    """Render wrapped text with escaped markers, without and then with align="L"."""
+    doc = fpdf.FPDF()
+    doc.add_page()
+    doc.set_font("Times", "", 32)
+    # Some text where styling occurs over line breaks:
+    text = (
+        "Lorem ipsum \\ dolor amet, \\**consectetur adipiscing\\** elit,"
+        " sed do eiusmod \\\\__tempor incididunt\\\\__ ut labore et dolore --magna aliqua--."
+    )
+    shared_args = {"text": text, "markdown": True}
+    # This is tricky to get working well
+    doc.multi_cell(w=doc.epw, **shared_args)
+    doc.ln()
+    doc.multi_cell(w=doc.epw, align="L", **shared_args)
+    reference = HERE / "multi_cell_markdown_escaped.pdf"
+    assert_pdf_equal(doc, reference, tmp_path)
+
+
 def test_multi_cell_markdown_with_ttf_fonts(tmp_path):
     pdf = fpdf.FPDF()
     pdf.add_page()
@@ -44,6 +60,27 @@ def test_multi_cell_markdown_with_ttf_fonts(tmp_path):
     assert_pdf_equal(pdf, HERE / "multi_cell_markdown_with_ttf_fonts.pdf", tmp_path)
 
 
+def test_multi_cell_markdown_with_ttf_fonts_escaped(tmp_path):
+    """Render wrapped text with escaped markers using the Roboto TTF fonts."""
+    doc = fpdf.FPDF()
+    doc.add_page()
+    # Register the regular, bold and italic variants, in that order.
+    for style, font_file in (
+        ("", "Roboto-Regular.ttf"),
+        ("B", "Roboto-Bold.ttf"),
+        ("I", "Roboto-Italic.ttf"),
+    ):
+        doc.add_font("Roboto", style, FONTS_DIR / font_file)
+    doc.set_font("Roboto", size=32)
+    # Some text where styling occurs over line breaks:
+    text = (
+        "Lorem ipsum \\ dolor, \\**consectetur adipiscing\\** elit,"
+        " eiusmod \\\\__tempor incididunt\\\\__ ut labore et dolore --magna aliqua--."
+    )
+    shared_args = {"text": text, "markdown": True}
+    # This is tricky to get working well
+    doc.multi_cell(w=doc.epw, **shared_args)
+    doc.ln()
+    doc.multi_cell(w=doc.epw, align="L", **shared_args)
+    reference = HERE / "multi_cell_markdown_with_ttf_fonts_escaped.pdf"
+    assert_pdf_equal(doc, reference, tmp_path)
+
+
 def test_multi_cell_markdown_missing_ttf_font():
     pdf = fpdf.FPDF()
     pdf.add_page()