fix and catch unintended uses of inline HTML (#1716)

* fix missing text in "KMS Provider" section of the Client Side Encryption spec due to less-than symbol
* manual audit of all less-than symbols followed by a letter
* reformat python scripts using "black"
* let scripts detect fenced code inside block-quotes
* add a pre-commit script to check HTML tags against allowed patterns
* replace HTTP links with equivalent HTTPS
mongodb · Nov 5, 2024 · f8dbd24 · f8dbd24
1 parent 21c1427
commit f8dbd24
Show file tree

Hide file tree

Showing 13 changed files with 190 additions and 115 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -45,6 +45,11 @@ repos:
     types: [markdown]
     language: system
     entry: python3 scripts/check_links.py
+  - id: markdown-html-check
+    name: markdown-html-check
+    types: [markdown]
+    language: system
+    entry: python3 scripts/check_md_html.py
 
 - repo: https://github.com/tcort/markdown-link-check
   rev: v3.12.2

diff --git a/scripts/check_links.py b/scripts/check_links.py
@@ -1,23 +1,29 @@
-import sys
+import sys, re
+
 fname = sys.argv[-1]
 
+# Roughly detect fenced code even inside block quotes
+fenced_code = re.compile(r"^\s*(>\s+)*```")
+
 # Check for markdown links that got improperly line wrapped.
 in_code_block = False
 with open(fname) as fid:
     for line in fid:
         # Ignore code blocks.
-        if line.strip().startswith('```'):
+        if fenced_code.match(line):
             in_code_block = not in_code_block
         if in_code_block:
             continue
-        id0 = line.index('[') if '[' in line else -1
-        id1 = line.index(']') if ']' in line else -1
-        id2 = line.index('(') if '(' in line else -1
-        id3 = line.index(')') if ')' in line else -1
+        id0 = line.index("[") if "[" in line else -1
+        id1 = line.index("]") if "]" in line else -1
+        id2 = line.index("(") if "(" in line else -1
+        id3 = line.index(")") if ")" in line else -1
         if id1 == -1 or id2 == -1 or id3 == -1:
             continue
         if id2 < id1 or id3 < id2:
             continue
         if id0 == -1:
-            print('*** Malformed link in line:', line, fname)
-            sys.exit(1)
+            print("*** Malformed link in line:", line, fname)
+            sys.exit(1)
+
+assert not in_code_block
diff --git a/scripts/check_md_html.py b/scripts/check_md_html.py
@@ -0,0 +1,55 @@
+import sys, re
+
+# Check for allowed HTML elements in markdown.
+# Ignores inline and fenced code, but intentionally doesn't ignore backslash
+# escaping. (For compatibility, we want to avoid unintentional inline HTML
+# even on markdown implementations where "\<" escapes are not supported.)
+
+disallowed_re = re.compile(
+    r"""
+    (?:[^`]*`[^`]+`)*[^`]*  # Skip inline code, but scan all text around it
+    <(?!
+        - |
+        /p> |
+        /span> |
+        /sub> |
+        /sup> |
+        /table> |
+        /td> |
+        /tr> |
+        \d |
+        \s |
+        \w+@(\w+\.)+\w+> | # Cover email addresses in license files
+        = |
+        br> |
+        https:// |         # Cover HTTPS links but not HTTP
+        p> |
+        span[\s>] |
+        sub> |
+        sup> |
+        table[\s>] |
+        td[\s>] |
+        tr> |
+        !-- )
+    """,
+    re.VERBOSE,
+)
+
+# Roughly detect fenced code even inside block quotes
+fenced_code = re.compile(r"^\s*(>\s+)*```")
+
+# pre-commit passes every matching file in one invocation, so check them all.
+for fname in sys.argv[1:]:
+    in_code_block = False
+    with open(fname) as fid:
+        for line in fid:
+            # Ignore code blocks.
+            if fenced_code.match(line):
+                in_code_block = not in_code_block
+            if in_code_block:
+                continue
+            if disallowed_re.match(line):
+                print("*** Markdown contains unexpected HTML in line:", line, fname)
+                sys.exit(1)
+    if in_code_block:  # Explicit check: a bare assert is stripped under -O
+        sys.exit("*** Unterminated fenced code block in " + fname)
diff --git a/scripts/generate_index.py b/scripts/generate_index.py
@@ -1,5 +1,6 @@
 import os
 from pathlib import Path
+
 source = Path(__file__).resolve().parent.parent / "source"
 source = source.resolve()
 info = {}
@@ -9,21 +10,21 @@
         continue
     if "node_modules" in relpath:
         continue
-    if p.name in ['index.md']:
+    if p.name in ["index.md"]:
         continue
-    fpath = relpath + '/' + p.name
+    fpath = relpath + "/" + p.name
     name = None
     with p.open() as fid:
         for line in fid:
             if line.startswith("# "):
-                name = line.replace('# ', '').strip()
+                name = line.replace("# ", "").strip()
                 break
     if name is None:
-        raise ValueError(f'Could not find name for {fpath}')
+        raise ValueError(f"Could not find name for {fpath}")
     info[name] = fpath
 
 index_file = source / "index.md"
 with index_file.open("w") as fid:
-    fid.write('# MongoDB Specifications\n\n')
+    fid.write("# MongoDB Specifications\n\n")
     for name in sorted(info):
-        fid.write(f'- [{name}]({info[name]})\n')
+        fid.write(f"- [{name}]({info[name]})\n")
diff --git a/scripts/migrate_to_md.py b/scripts/migrate_to_md.py
@@ -7,18 +7,20 @@
 import subprocess
 
 if len(sys.argv) < 2:
-    print('Must provide a path to an RST file')
+    print("Must provide a path to an RST file")
     sys.exit(1)
 
 path = Path(sys.argv[1])
 
 # Ensure git history for the md file.
-md_file = str(path).replace('.rst', '.md')
-subprocess.check_call(['git', 'mv', path, md_file])
-subprocess.check_call(['git', 'add', md_file])
-subprocess.check_call(['git', 'commit', '--no-verify', '-m', f'Rename {path} to {md_file}'])
-subprocess.check_call(['git', 'checkout', 'HEAD~1', path])
-subprocess.check_call(['git', 'add', path])
+md_file = str(path).replace(".rst", ".md")
+subprocess.check_call(["git", "mv", path, md_file])
+subprocess.check_call(["git", "add", md_file])
+subprocess.check_call(
+    ["git", "commit", "--no-verify", "-m", f"Rename {path} to {md_file}"]
+)
+subprocess.check_call(["git", "checkout", "HEAD~1", path])
+subprocess.check_call(["git", "add", path])
 
 # Get the contents of the file.
 with path.open() as fid:
@@ -31,45 +33,47 @@
 """
 
 # Update the RST file with a stub pointer to the MD file.
-if not path.name == 'README.rst':
+if not path.name == "README.rst":
     new_body = TEMPLATE.format(os.path.basename(md_file))
-    with path.open('w') as fid:
-        fid.write(''.join(new_body))
+    with path.open("w") as fid:
+        fid.write("".join(new_body))
 
 # Pre-process the file.
-for (i, line) in enumerate(lines):
+for i, line in enumerate(lines):
     # Replace curly quotes with regular quotes.
-    line = line.replace('”', '"')
-    line = line.replace('“', '"')
-    line = line.replace('’', "'")
-    line = line.replace('‘', "'")
+    line = line.replace("”", '"')
+    line = line.replace("“", '"')
+    line = line.replace("’", "'")
+    line = line.replace("‘", "'")
     lines[i] = line
 
     # Replace the colon fence blocks with bullets,
     # e.g. :Status:, :deprecated:, :changed:.
     # This also includes the changelog entries.
-    match = re.match(r':(\S+):(.*)', line)
+    match = re.match(r":(\S+):(.*)", line)
     if match:
         name, value = match.groups()
-        lines[i] = f'- {name.capitalize()}:{value}\n'
+        lines[i] = f"- {name.capitalize()}:{value}\n"
 
     # Handle "":Minimum Server Version:"" as a block quote.
-    if line.strip().startswith(':Minimum Server Version:'):
-        lines[i] = '- ' + line.strip()[1:] + ''
+    if line.strip().startswith(":Minimum Server Version:"):
+        lines[i] = "- " + line.strip()[1:] + ""
 
     # Remove the "".. contents::" block - handled by GitHub UI.
-    if line.strip() == '.. contents::':
-        lines[i] = ''
+    if line.strip() == ".. contents::":
+        lines[i] = ""
 
 # Run pandoc and capture output.
-proc = subprocess.Popen(['pandoc', '-f', 'rst', '-t', 'gfm'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-data = ''.join(lines).encode('utf8')
+proc = subprocess.Popen(
+    ["pandoc", "-f", "rst", "-t", "gfm"], stdin=subprocess.PIPE, stdout=subprocess.PIPE
+)
+data = "".join(lines).encode("utf8")
 outs, _ = proc.communicate(data)
-data = outs.decode('utf8')
+data = outs.decode("utf8")
 
 # Fix the strings that were missing backticks.
-data = re.sub(r'<span\W+class="title-ref">', '`', data, flags=re.MULTILINE)
-data = data.replace('</span>', '`')
+data = re.sub(r'<span\W+class="title-ref">', "`", data, flags=re.MULTILINE)
+data = data.replace("</span>", "`")
 
 # Handle div blocks that were created.
 # These are admonition blocks, convert to new GFM format.
@@ -79,55 +83,55 @@
 in_changelog_first = False
 lines = data.splitlines()
 new_lines = []
-for (i, line) in enumerate(lines):
-    match = re.match(r'<div class="(\S+)">',line)
+for i, line in enumerate(lines):
+    match = re.match(r'<div class="(\S+)">', line)
     if not in_block_outer and match:
         in_block_outer = True
-        new_lines.append(f'> [!{match.groups()[0].upper()}]')
+        new_lines.append(f"> [!{match.groups()[0].upper()}]")
         continue
-    if line.strip() == '</div>':
+    if line.strip() == "</div>":
         if in_block_outer:
             in_block_outer = False
             in_block_inner = True
         elif in_block_inner:
             in_block_inner = False
         continue
     if in_block_inner:
-        line = '> ' + line.strip()
+        line = "> " + line.strip()
 
     if in_changelog_first:
-        today = datetime.date.today().strftime('%Y-%m-%d')
-        line = f'\n- {today}: Migrated from reStructuredText to Markdown.'
+        today = datetime.date.today().strftime("%Y-%m-%d")
+        line = f"\n- {today}: Migrated from reStructuredText to Markdown."
         in_changelog_first = False
 
-    if line.strip() == '## Changelog':
+    if line.strip() == "## Changelog":
         in_changelog_first = True
 
     if not in_block_outer:
-        new_lines.append(line)    
+        new_lines.append(line)
 
 
 # Write the new content to the markdown file.
-with open(md_file, 'w') as fid:
-    fid.write('\n'.join(new_lines))
+with open(md_file, "w") as fid:
+    fid.write("\n".join(new_lines))
 
 # Handle links in other files.
-# We accept relative path links or links to master 
+# We accept relative path links or links to master
 # (https://github.com/mongodb/specifications/blob/master/source/...)
 # and rewrite them to use appropriate md links.
 # If the link is malformed we ignore and print an error.
 target = path.name
 curr = path
-while curr.parent.name != 'source':
-    target = f'{curr.parent.name}/{target}'
+while curr.parent.name != "source":
+    target = f"{curr.parent.name}/{target}"
     curr = curr.parent
-suffix = fr'\S*/{target}'
-rel_pattern = re.compile(fr'(\.\.{suffix})')
-md_pattern = re.compile(fr'(\(http{suffix})')
-html_pattern = re.compile(f'(http{suffix})')
-abs_pattern = re.compile(f'(/source{suffix})')
+suffix = rf"\S*/{target}"
+rel_pattern = re.compile(rf"(\.\.{suffix})")
+md_pattern = re.compile(rf"(\(http{suffix})")
+html_pattern = re.compile(f"(http{suffix})")
+abs_pattern = re.compile(f"(/source{suffix})")
 for p in Path("source").rglob("*"):
-    if p.suffix not in ['.rst', '.md']:
+    if p.suffix not in [".rst", ".md"]:
         continue
     with p.open() as fid:
         lines = fid.readlines()
@@ -141,16 +145,20 @@
             new_line = line.replace(matchstr, relpath)
         elif re.search(md_pattern, line):
             matchstr = re.search(md_pattern, line).groups()[0]
-            if not matchstr.startswith('(https://github.com/mongodb/specifications/blob/master/source'):
-                print('*** Error in link: ', matchstr, p)
+            if not matchstr.startswith(
+                "(https://github.com/mongodb/specifications/blob/master/source"
+            ):
+                print("*** Error in link: ", matchstr, p)
             else:
-                new_line = line.replace(matchstr, f'({relpath}')
+                new_line = line.replace(matchstr, f"({relpath}")
         elif re.search(html_pattern, line):
             matchstr = re.search(html_pattern, line).groups()[0]
-            if not matchstr.startswith('https://github.com/mongodb/specifications/blob/master/source'):
-                print('*** Error in link: ', matchstr, p)
+            if not matchstr.startswith(
+                "https://github.com/mongodb/specifications/blob/master/source"
+            ):
+                print("*** Error in link: ", matchstr, p)
             else:
-                new_line = line.replace(matchstr, f'{relpath}')
+                new_line = line.replace(matchstr, f"{relpath}")
         elif re.search(abs_pattern, line):
             matchstr = re.search(abs_pattern, line).groups()[0]
             new_line = line.replace(matchstr, relpath)
@@ -160,11 +168,11 @@
         new_lines.append(new_line)
 
     if changed_lines:
-        with p.open('w') as fid:
+        with p.open("w") as fid:
             fid.writelines(new_lines)
-        print('-' * 80)
-        print(f'Updated link(s) in {p}...')
-        print('    ' + '\n   '.join(changed_lines))
+        print("-" * 80)
+        print(f"Updated link(s) in {p}...")
+        print("    " + "\n   ".join(changed_lines))
 
-print('Created markdown file:')
+print("Created markdown file:")
 print(md_file)