diff --git a/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py b/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py index 1885e110..a2aa3b5b 100644 --- a/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py +++ b/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py @@ -1,3 +1,4 @@ +import re from markdown import Extension, Markdown from markdown.preprocessors import Preprocessor from markdown.blockprocessors import BlockProcessor @@ -6,15 +7,29 @@ MARK_PREVENT_RECURSION: str = "\t\t\t\r\r\rMARK_PREVENT_RECURSION\r\r\r\t\t\t" +MARK_CONTINUE: str = "\t\t\t\r\r\rMARK_CONTINUE\r\r\r\t\t\t" + +# @see: markdown.util.HTML_PLACEHOLDER_RE +# PYTHON_MARKDOWN_HTML_PLACEHOLDER_RE: re.Pattern[str] = re.compile( +# "\u0002wzxhzdk:%s\u0003" % r"([0-9]+)" +# ) + class MainExtension(Extension): def extendMarkdown(self, md: Markdown): - meta: dict = {"document_offsets": [], "used_document_offsets": {}} + meta: dict = { + "document_offsets": [], + "used_document_offsets": {}, + "last_parent": None, + } md.preprocessors.register( CalculateDocumentOffsetPreprocessor(md, meta), "capture_document", 1000 ) # Highest priority is required because we need to calc words offset from original document + md.preprocessors.register( + FixDocumentOffsetPreprocessor(md, meta), "fix_document", 0 + ) # Lowest priority is required because we need to fix the offset after all other block processors md.parser.blockprocessors.register( - OffsetsInjectionBlockProcessor(md.parser, meta), "mark_words", 100 + OffsetsInjectionBlockProcessor(md.parser, meta), "mark_words", 200 ) # high priority, usually larger than every other block processor @@ -32,6 +47,9 @@ def run(self, lines: list[str]) -> list[str]: for line in lines: # Skip empty lines if len(line) == 0: + store: tuple[str, int, int] = (line, offset, offset + 1) + self.meta["document_offsets"].append(store) + self.meta["used_document_offsets"][store] = False offset += 1 continue # store the line and offset @@ -43,6 +61,102 @@ def run(self, lines: list[str]) -> list[str]: return lines +class FixDocumentOffsetPreprocessor(Preprocessor): + """ + A preprocessor to fix the offset of each line after the 3rd party extension processed the document + """ + + def __init__(self, md: Markdown, meta: dict): + super(FixDocumentOffsetPreprocessor, self).__init__(md) + self.meta = meta + + def run(self, lines: list[str]) -> list[str]: + document_offsets: list[tuple[str, int, int]] = self.meta["document_offsets"] + + # 最后一次成功匹配的文档偏移量字典索引末,开区间 + last_success_match_end: int = 0 + num_lines: int = 0 + num_document_offsets: int = 0 + while num_document_offsets < len(document_offsets) and num_lines < len(lines): + line = lines[num_lines] + document_offset: tuple[str, int, int] = document_offsets[ + num_document_offsets + ] + + # 如果精准匹配 + if document_offset[0] == line: + # 匹配该行 + self.match(line, num_document_offsets, num_document_offsets + 1) + # 如果上次成功匹配的原文档偏移量未连续,匹配当前行到这部分未连续的原文档偏移量 + if num_document_offsets > last_success_match_end and num_lines > 0: + self.match( + lines[num_lines - 1], + last_success_match_end, + num_document_offsets, + ) + last_success_match_end = num_document_offsets + 1 + num_lines += 1 + num_document_offsets += 1 + # 如果未能精准匹配,查找该行在原文档偏移量字典中的位置 + else: + remain: list[str] = [ + line for line, _, _ in document_offsets[num_document_offsets:] + ] + # 如果存在这样的行 + if line in remain: + # 找到第一次匹配的位置,匹配该行到此处 + idx = remain.index(line) + num_document_offsets + self.match(line, idx, idx + 1) + # 如果上次成功匹配的原文档偏移量未连续,匹配当前行到这部分未连续的原文档偏移量 + if idx > last_success_match_end and num_lines > 0: + self.match(lines[num_lines - 1], last_success_match_end, idx) + last_success_match_end = idx + 1 + num_lines += 1 + num_document_offsets = idx + 1 + # 如果未找到匹配的位置,继续查找下一行 + else: + num_lines += 1 + + # 如果行匹配完成,但原文档偏移量未匹配完成,匹配剩余的原文档偏移量 + if last_success_match_end < len(document_offsets): + self.match( + lines[num_lines - 1], last_success_match_end, len(document_offsets) + ) + + return lines + + def match( + self, + matched_line: str, + num_document_offsets_start: int, + num_document_offsets_end: int, + ): + """ + 将单个匹配行设置到多个原文档偏移量字典,索引范围为[num_document_offsets_start, num_document_offsets_end) + """ + document_offsets: list[tuple[str, int, int]] = self.meta["document_offsets"] + used_document_offsets: dict[tuple[str, int, int], bool] = self.meta[ + "used_document_offsets" + ] + for i in range(num_document_offsets_start, num_document_offsets_end): + document_offset = document_offsets[i] + # 如果是第一个匹配的原文档偏移量,设置为匹配行,否则设置为 MARK_CONTINUE + if i == num_document_offsets_start: + document_offsets[i] = ( + matched_line, + document_offset[1], + document_offset[2], + ) + else: + document_offsets[i] = ( + MARK_CONTINUE, + document_offset[1], + document_offset[2], + ) + del used_document_offsets[document_offset] + used_document_offsets[document_offsets[i]] = False + + class OffsetsInjectionBlockProcessor(BlockProcessor): """ A block processor to mark the words in the document and inject the offset of the block to the HTML element @@ -80,10 +194,18 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: used: dict[tuple[str, int, int], bool] = {} # Search for the block fragment in the document_offsets for store in self.meta["document_offsets"]: + # Skip empty lines + if len(store[0]) == 0: + continue # If already used, skip if self.meta["used_document_offsets"][store]: continue (line, offset, end_offset) = store + # 如果收到 MARK_CONTINUE 标记,直接认为该标记之前的行是连续的 + if line == MARK_CONTINUE: + end = end_offset + used[store] = True + continue # If found one if line in block: # If the line already scanned (usually some lines with same content in different place), skip @@ -111,9 +233,15 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: if start is not None and end is not None: blocks.pop(0) self.meta["used_document_offsets"].update(used) - # append MARK_PREVENT_RECURSION to tail of the block to prevent recursion, we don't use a handled flaglist because we don't know if there's some same block in the document + # append MARK_PREVENT_RECURSION to tail of the block to prevent recursion, we don't use a handled + # flaglist because we don't know if there's some same block in the document self.parser.parseBlocks(parent, [block + MARK_PREVENT_RECURSION]) + # fix multi blocks in same parents + if self.meta["last_parent"] == parent[-1]: + parent[-1].set("data-original-document-end", str(end)) + return True parent[-1].set("data-original-document-start", str(start)) parent[-1].set("data-original-document-end", str(end)) + self.meta["last_parent"] = parent[-1] return True return False diff --git a/python-markdown-extension/test/__main__.py b/python-markdown-extension/test/__main__.py index eb15716d..93f2eb00 100644 --- a/python-markdown-extension/test/__main__.py +++ b/python-markdown-extension/test/__main__.py @@ -112,8 +112,6 @@ def __init__(self, case, test_case: unittest.TestCase): }, }, ) - print(self.case["document"]) - print(self.result) self.test_case = test_case def test(self): @@ -190,11 +188,11 @@ def test_normal(self): case = { "document": textwrap.dedent("""\ # Lorem ipsum - + Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna. - + ## Morbi neque lectus - + Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit."""), "expected": [ {"tag": "h1", "offset": (0, 13)}, @@ -277,32 +275,30 @@ def test_oi_wiki_index(self): }, { "tag": "p", - "offset": (780, 844), + "offset": (780, 1101), # FIXME: Correct one is (780, 1101) }, - # there's a div tag and a script tag in the document, and they will not be considered. ], } Tester(case, self).test() def test_oi_wiki_search_dfs(self): case = { - # I HATE TEXT BLOCKS "document": textwrap.dedent("""\ ## 引入 - + DFS 为图论中的概念,详见 [DFS(图论)](../graph/dfs.md) 页面。在 **搜索算法** 中,该词常常指利用递归函数方便地实现暴力枚举的算法,与图论中的 DFS 算法有一定相似之处,但并不完全相同。 - + ## 解释 - + 考虑这个例子: - + ???+ note "例题" 把正整数 $n$ 分解为 $3$ 个不同的正整数,如 $6=1+2+3$,排在后面的数必须大于等于前面的数,输出所有方案。 - + 对于这个问题,如果不知道搜索,应该怎么办呢? - + 当然是三重循环,参考代码如下: - + ???+ note "实现" === "C++" ```cpp @@ -311,7 +307,7 @@ def test_oi_wiki_search_dfs(self): for (int k = j; k <= n; ++k) if (i + j + k == n) printf("%d = %d + %d + %d\\n", n, i, j, k); ``` - + === "Python" ```python for i in range(1, n + 1): @@ -320,7 +316,7 @@ def test_oi_wiki_search_dfs(self): if i + j + k == n: print("%d = %d + %d + %d" % (n, i, j, k)) ``` - + === "Java" ```Java for (int i = 1; i < n + 1; i++) { @@ -331,7 +327,7 @@ def test_oi_wiki_search_dfs(self): } } ``` - + 那如果是分解成四个整数呢?再加一重循环?"""), "expected": [ { @@ -350,7 +346,10 @@ def test_oi_wiki_search_dfs(self): "tag": "p", "offset": (126, 133), }, - #
has been ignored + { + "tag": "details", + "offset": (135, 215), + }, { "tag": "p", "offset": (217, 239), @@ -359,7 +358,10 @@ def test_oi_wiki_search_dfs(self): "tag": "p", "offset": (241, 256), }, - #
has been ignored + { + "tag": "details", + "offset": (258, 1092), + }, { "tag": "p", "offset": (1094, 1114),