Skip to content

Commit

Permalink
fix and catch unintended uses of inline HTML (#1716)
Browse files Browse the repository at this point in the history
* fix missing text in "KMS Provider" section of the Client Side Encryption spec due to less-than symbol
* manual audit of all less-than symbols followed by a letter
* reformat python scripts using "black"
* let scripts detect fenced code inside block-quotes
* add a pre-commit script to check HTML tags against allowed patterns
* replace HTTP links with equivalent HTTPS
  • Loading branch information
mdbmes authored Nov 5, 2024
1 parent 21c1427 commit f8dbd24
Show file tree
Hide file tree
Showing 13 changed files with 190 additions and 115 deletions.
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ repos:
types: [markdown]
language: system
entry: python3 scripts/check_links.py
- id: markdown-html-check
name: markdown-html-check
types: [markdown]
language: system
entry: python3 scripts/check_md_html.py

- repo: https://github.com/tcort/markdown-link-check
rev: v3.12.2
Expand Down
22 changes: 14 additions & 8 deletions scripts/check_links.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,29 @@
import sys
import sys, re

fname = sys.argv[-1]

# Roughly detect fenced code even inside block quotes
fenced_code = re.compile(r"^\s*(>\s+)*```")

# Check for markdown links that got improperly line wrapped.
in_code_block = False
with open(fname) as fid:
for line in fid:
# Ignore code blocks.
if line.strip().startswith('```'):
if fenced_code.match(line):
in_code_block = not in_code_block
if in_code_block:
continue
id0 = line.index('[') if '[' in line else -1
id1 = line.index(']') if ']' in line else -1
id2 = line.index('(') if '(' in line else -1
id3 = line.index(')') if ')' in line else -1
id0 = line.index("[") if "[" in line else -1
id1 = line.index("]") if "]" in line else -1
id2 = line.index("(") if "(" in line else -1
id3 = line.index(")") if ")" in line else -1
if id1 == -1 or id2 == -1 or id3 == -1:
continue
if id2 < id1 or id3 < id2:
continue
if id0 == -1:
print('*** Malformed link in line:', line, fname)
sys.exit(1)
print("*** Malformed link in line:", line, fname)
sys.exit(1)

assert not in_code_block
55 changes: 55 additions & 0 deletions scripts/check_md_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Pre-commit check: reject inline HTML in markdown unless it is allow-listed.

Usage: python3 check_md_html.py FILE [FILE ...]

Every file named on the command line is scanned. Each offending line is
reported, and the process exits with status 1 if any file contains
unexpected HTML or an unterminated fenced code block.
"""
import re
import sys

# Check for allowed HTML elements in markdown.
# Ignores inline and fenced code, but intentionally doesn't ignore backslash
# escaping. (For compatibility, we want to avoid unintentional inline HTML
# even on markdown implementations where "\<" escapes are not supported.)
#
# The pattern is applied with ``match`` (anchored at the start of the line).
# It walks over any number of "plain text + inline code span" pairs and then
# over trailing plain text, so a "<" anywhere outside inline code is examined,
# including one that follows text after a code span. Scanning stops at a
# backtick that does not open a well-formed single-line span; the rest of
# such a line is not inspected.
disallowed_re = re.compile(
    r"""
    (?: [^`]* `[^`]+` )*    # plain text followed by an inline code span, repeated
    [^`]*                   # plain text outside inline code
    <(?!
        -                       |
        /p>                     |
        /span>                  |
        /sub>                   |
        /sup>                   |
        /table>                 |
        /td>                    |
        /tr>                    |
        \d                      |
        \s                      |
        \w+@(\w+\.)+\w+>        |   # Cover email addresses in license files
        =                       |
        br>                     |
        https://                |   # Cover HTTPS links but not HTTP
        p>                      |
        span[\s>]               |
        sub>                    |
        sup>                    |
        table[\s>]              |
        td[\s>]                 |
        tr>                     |
        !-- )
    """,
    re.VERBOSE,
)

# Roughly detect fenced code even inside block quotes
fenced_code = re.compile(r"^\s*(>\s+)*```")


def find_unexpected_html(lines):
    """Yield every line of *lines* that contains disallowed inline HTML.

    Lines inside fenced code blocks (including the opening fence line) are
    skipped. The closing fence line is still checked, which is harmless
    because a bare fence never matches ``disallowed_re``.

    Raises:
        ValueError: if the input ends while a fenced code block is still
            open. This is raised after all offending lines have been yielded.
    """
    in_code_block = False
    for line in lines:
        # Each fence line toggles the state; opening and closing fences are
        # not distinguished.
        if fenced_code.match(line):
            in_code_block = not in_code_block
        if in_code_block:
            continue
        if disallowed_re.match(line):
            yield line
    # Explicit error instead of ``assert`` so the check survives ``python -O``.
    if in_code_block:
        raise ValueError("Markdown contains an unterminated fenced code block")


def main(argv):
    """Check each markdown file named in *argv*; return a process exit status.

    pre-commit passes a batch of filenames to the hook, so every argument is
    checked rather than only the last one.
    """
    failed = False
    for fname in argv:
        # Read as UTF-8 rather than relying on the platform default encoding.
        with open(fname, encoding="utf-8") as fid:
            try:
                for line in find_unexpected_html(fid):
                    print("*** Markdown contains unexpected HTML in line:", line, fname)
                    failed = True
            except ValueError as err:
                print(f"*** {err}:", fname)
                failed = True
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
13 changes: 7 additions & 6 deletions scripts/generate_index.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from pathlib import Path

source = Path(__file__).resolve().parent.parent / "source"
source = source.resolve()
info = {}
Expand All @@ -9,21 +10,21 @@
continue
if "node_modules" in relpath:
continue
if p.name in ['index.md']:
if p.name in ["index.md"]:
continue
fpath = relpath + '/' + p.name
fpath = relpath + "/" + p.name
name = None
with p.open() as fid:
for line in fid:
if line.startswith("# "):
name = line.replace('# ', '').strip()
name = line.replace("# ", "").strip()
break
if name is None:
raise ValueError(f'Could not find name for {fpath}')
raise ValueError(f"Could not find name for {fpath}")
info[name] = fpath

index_file = source / "index.md"
with index_file.open("w") as fid:
fid.write('# MongoDB Specifications\n\n')
fid.write("# MongoDB Specifications\n\n")
for name in sorted(info):
fid.write(f'- [{name}]({info[name]})\n')
fid.write(f"- [{name}]({info[name]})\n")
122 changes: 65 additions & 57 deletions scripts/migrate_to_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,20 @@
import subprocess

if len(sys.argv) < 2:
print('Must provide a path to an RST file')
print("Must provide a path to an RST file")
sys.exit(1)

path = Path(sys.argv[1])

# Ensure git history for the md file.
md_file = str(path).replace('.rst', '.md')
subprocess.check_call(['git', 'mv', path, md_file])
subprocess.check_call(['git', 'add', md_file])
subprocess.check_call(['git', 'commit', '--no-verify', '-m', f'Rename {path} to {md_file}'])
subprocess.check_call(['git', 'checkout', 'HEAD~1', path])
subprocess.check_call(['git', 'add', path])
md_file = str(path).replace(".rst", ".md")
subprocess.check_call(["git", "mv", path, md_file])
subprocess.check_call(["git", "add", md_file])
subprocess.check_call(
["git", "commit", "--no-verify", "-m", f"Rename {path} to {md_file}"]
)
subprocess.check_call(["git", "checkout", "HEAD~1", path])
subprocess.check_call(["git", "add", path])

# Get the contents of the file.
with path.open() as fid:
Expand All @@ -31,45 +33,47 @@
"""

# Update the RST file with a stub pointer to the MD file.
if not path.name == 'README.rst':
if not path.name == "README.rst":
new_body = TEMPLATE.format(os.path.basename(md_file))
with path.open('w') as fid:
fid.write(''.join(new_body))
with path.open("w") as fid:
fid.write("".join(new_body))

# Pre-process the file.
for (i, line) in enumerate(lines):
for i, line in enumerate(lines):
# Replace curly quotes with regular quotes.
line = line.replace('”', '"')
line = line.replace('“', '"')
line = line.replace('’', "'")
line = line.replace('‘', "'")
line = line.replace("”", '"')
line = line.replace("“", '"')
line = line.replace("’", "'")
line = line.replace("‘", "'")
lines[i] = line

# Replace the colon fence blocks with bullets,
# e.g. :Status:, :deprecated:, :changed:.
# This also includes the changelog entries.
match = re.match(r':(\S+):(.*)', line)
match = re.match(r":(\S+):(.*)", line)
if match:
name, value = match.groups()
lines[i] = f'- {name.capitalize()}:{value}\n'
lines[i] = f"- {name.capitalize()}:{value}\n"

# Handle ":Minimum Server Version:" as a bullet item.
if line.strip().startswith(':Minimum Server Version:'):
lines[i] = '- ' + line.strip()[1:] + ''
if line.strip().startswith(":Minimum Server Version:"):
lines[i] = "- " + line.strip()[1:] + ""

# Remove the ".. contents::" block - handled by GitHub UI.
if line.strip() == '.. contents::':
lines[i] = ''
if line.strip() == ".. contents::":
lines[i] = ""

# Run pandoc and capture output.
proc = subprocess.Popen(['pandoc', '-f', 'rst', '-t', 'gfm'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
data = ''.join(lines).encode('utf8')
proc = subprocess.Popen(
["pandoc", "-f", "rst", "-t", "gfm"], stdin=subprocess.PIPE, stdout=subprocess.PIPE
)
data = "".join(lines).encode("utf8")
outs, _ = proc.communicate(data)
data = outs.decode('utf8')
data = outs.decode("utf8")

# Fix the strings that were missing backticks.
data = re.sub(r'<span\W+class="title-ref">', '`', data, flags=re.MULTILINE)
data = data.replace('</span>', '`')
data = re.sub(r'<span\W+class="title-ref">', "`", data, flags=re.MULTILINE)
data = data.replace("</span>", "`")

# Handle div blocks that were created.
# These are admonition blocks, convert to new GFM format.
Expand All @@ -79,55 +83,55 @@
in_changelog_first = False
lines = data.splitlines()
new_lines = []
for (i, line) in enumerate(lines):
match = re.match(r'<div class="(\S+)">',line)
for i, line in enumerate(lines):
match = re.match(r'<div class="(\S+)">', line)
if not in_block_outer and match:
in_block_outer = True
new_lines.append(f'> [!{match.groups()[0].upper()}]')
new_lines.append(f"> [!{match.groups()[0].upper()}]")
continue
if line.strip() == '</div>':
if line.strip() == "</div>":
if in_block_outer:
in_block_outer = False
in_block_inner = True
elif in_block_inner:
in_block_inner = False
continue
if in_block_inner:
line = '> ' + line.strip()
line = "> " + line.strip()

if in_changelog_first:
today = datetime.date.today().strftime('%Y-%m-%d')
line = f'\n- {today}: Migrated from reStructuredText to Markdown.'
today = datetime.date.today().strftime("%Y-%m-%d")
line = f"\n- {today}: Migrated from reStructuredText to Markdown."
in_changelog_first = False

if line.strip() == '## Changelog':
if line.strip() == "## Changelog":
in_changelog_first = True

if not in_block_outer:
new_lines.append(line)
new_lines.append(line)


# Write the new content to the markdown file.
with open(md_file, 'w') as fid:
fid.write('\n'.join(new_lines))
with open(md_file, "w") as fid:
fid.write("\n".join(new_lines))

# Handle links in other files.
# We accept relative path links or links to master
# We accept relative path links or links to master
# (https://github.com/mongodb/specifications/blob/master/source/...)
# and rewrite them to use appropriate md links.
# If the link is malformed we ignore and print an error.
target = path.name
curr = path
while curr.parent.name != 'source':
target = f'{curr.parent.name}/{target}'
while curr.parent.name != "source":
target = f"{curr.parent.name}/{target}"
curr = curr.parent
suffix = fr'\S*/{target}'
rel_pattern = re.compile(fr'(\.\.{suffix})')
md_pattern = re.compile(fr'(\(http{suffix})')
html_pattern = re.compile(f'(http{suffix})')
abs_pattern = re.compile(f'(/source{suffix})')
suffix = rf"\S*/{target}"
rel_pattern = re.compile(rf"(\.\.{suffix})")
md_pattern = re.compile(rf"(\(http{suffix})")
html_pattern = re.compile(f"(http{suffix})")
abs_pattern = re.compile(f"(/source{suffix})")
for p in Path("source").rglob("*"):
if p.suffix not in ['.rst', '.md']:
if p.suffix not in [".rst", ".md"]:
continue
with p.open() as fid:
lines = fid.readlines()
Expand All @@ -141,16 +145,20 @@
new_line = line.replace(matchstr, relpath)
elif re.search(md_pattern, line):
matchstr = re.search(md_pattern, line).groups()[0]
if not matchstr.startswith('(https://github.com/mongodb/specifications/blob/master/source'):
print('*** Error in link: ', matchstr, p)
if not matchstr.startswith(
"(https://github.com/mongodb/specifications/blob/master/source"
):
print("*** Error in link: ", matchstr, p)
else:
new_line = line.replace(matchstr, f'({relpath}')
new_line = line.replace(matchstr, f"({relpath}")
elif re.search(html_pattern, line):
matchstr = re.search(html_pattern, line).groups()[0]
if not matchstr.startswith('https://github.com/mongodb/specifications/blob/master/source'):
print('*** Error in link: ', matchstr, p)
if not matchstr.startswith(
"https://github.com/mongodb/specifications/blob/master/source"
):
print("*** Error in link: ", matchstr, p)
else:
new_line = line.replace(matchstr, f'{relpath}')
new_line = line.replace(matchstr, f"{relpath}")
elif re.search(abs_pattern, line):
matchstr = re.search(abs_pattern, line).groups()[0]
new_line = line.replace(matchstr, relpath)
Expand All @@ -160,11 +168,11 @@
new_lines.append(new_line)

if changed_lines:
with p.open('w') as fid:
with p.open("w") as fid:
fid.writelines(new_lines)
print('-' * 80)
print(f'Updated link(s) in {p}...')
print(' ' + '\n '.join(changed_lines))
print("-" * 80)
print(f"Updated link(s) in {p}...")
print(" " + "\n ".join(changed_lines))

print('Created markdown file:')
print("Created markdown file:")
print(md_file)
Loading

0 comments on commit f8dbd24

Please sign in to comment.