Skip to content

Commit

Permalink
refactor: heuristics marking
Browse files Browse the repository at this point in the history
  • Loading branch information
shaokeyibb committed Jul 4, 2024
1 parent e357c04 commit 5bc32d4
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 23 deletions.
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from markdown import Extension, Markdown
from markdown.preprocessors import Preprocessor
from markdown.blockprocessors import BlockProcessor
Expand All @@ -6,15 +7,29 @@

MARK_PREVENT_RECURSION: str = "\t\t\t\r\r\rMARK_PREVENT_RECURSION\r\r\r\t\t\t"

MARK_CONTINUE: str = "\t\t\t\r\r\rMARK_CONTINUE\r\r\r\t\t\t"

# @see: markdown.util.HTML_PLACEHOLDER_RE
# PYTHON_MARKDOWN_HTML_PLACEHOLDER_RE: re.Pattern[str] = re.compile(
# "\u0002wzxhzdk:%s\u0003" % r"([0-9]+)"
# )


class MainExtension(Extension):
    """Markdown extension wiring together the offset-tracking pipeline.

    Registers two preprocessors and one block processor that share a single
    mutable ``meta`` dict, used to map rendered HTML elements back to their
    character offsets in the original source document.
    """

    def extendMarkdown(self, md: Markdown):
        """Register the processors on *md* with carefully chosen priorities.

        Args:
            md: The Markdown parser instance being extended.
        """
        # Shared mutable state passed to every processor:
        #   document_offsets:       list of (line, start, end) tuples per source line
        #   used_document_offsets:  maps each tuple -> whether it was consumed
        #   last_parent:            last element annotated, to merge multi-block parents
        meta: dict = {
            "document_offsets": [],
            "used_document_offsets": {},
            "last_parent": None,
        }
        md.preprocessors.register(
            CalculateDocumentOffsetPreprocessor(md, meta), "capture_document", 1000
        )  # Highest priority is required because we need to calc words offset from original document
        md.preprocessors.register(
            FixDocumentOffsetPreprocessor(md, meta), "fix_document", 0
        )  # Lowest priority is required because we need to fix the offset after all other preprocessors
        md.parser.blockprocessors.register(
            OffsetsInjectionBlockProcessor(md.parser, meta), "mark_words", 200
        )  # high priority, usually larger than every other block processor


Expand All @@ -32,6 +47,9 @@ def run(self, lines: list[str]) -> list[str]:
for line in lines:
# Skip empty lines
if len(line) == 0:
store: tuple[str, int, int] = (line, offset, offset + 1)
self.meta["document_offsets"].append(store)
self.meta["used_document_offsets"][store] = False
offset += 1
continue
# store the line and offset
Expand All @@ -43,6 +61,102 @@ def run(self, lines: list[str]) -> list[str]:
return lines


class FixDocumentOffsetPreprocessor(Preprocessor):
    """
    A preprocessor to fix the offset of each line after the 3rd party extension processed the document.

    Other preprocessors may insert, drop or rewrite lines, so the
    (line, start, end) tuples captured earlier in ``meta["document_offsets"]``
    can drift out of sync with the actual line list.  This pass re-aligns the
    two sequences: each processed line is matched against the recorded
    offsets, and offsets that no longer correspond one-to-one to a line are
    collapsed onto their matching line (the extras marked ``MARK_CONTINUE``).
    """

    def __init__(self, md: Markdown, meta: dict):
        # meta is the dict shared by all processors registered by MainExtension;
        # this pass reads and rewrites meta["document_offsets"] /
        # meta["used_document_offsets"] in place.
        super(FixDocumentOffsetPreprocessor, self).__init__(md)
        self.meta = meta

    def run(self, lines: list[str]) -> list[str]:
        """Re-align ``meta["document_offsets"]`` with *lines*.

        Walks both sequences in lockstep, greedily matching lines to recorded
        offsets, and returns *lines* unchanged — only the shared meta dict is
        mutated.
        """
        document_offsets: list[tuple[str, int, int]] = self.meta["document_offsets"]

        # End index (exclusive) of the last successfully matched run of
        # document offsets — offsets below this index are already re-aligned.
        last_success_match_end: int = 0
        num_lines: int = 0
        num_document_offsets: int = 0
        while num_document_offsets < len(document_offsets) and num_lines < len(lines):
            line = lines[num_lines]
            document_offset: tuple[str, int, int] = document_offsets[
                num_document_offsets
            ]

            # Exact match between the current line and the current offset entry.
            if document_offset[0] == line:
                # Match this line to the current offset entry.
                self.match(line, num_document_offsets, num_document_offsets + 1)
                # If there is a gap of unmatched offsets since the last success,
                # attribute that gap to the previous line.
                if num_document_offsets > last_success_match_end and num_lines > 0:
                    self.match(
                        lines[num_lines - 1],
                        last_success_match_end,
                        num_document_offsets,
                    )
                last_success_match_end = num_document_offsets + 1
                num_lines += 1
                num_document_offsets += 1
            # No exact match: search for this line further ahead in the
            # remaining document offsets.
            else:
                remain: list[str] = [
                    line for line, _, _ in document_offsets[num_document_offsets:]
                ]
                # The line occurs somewhere in the remaining offsets.
                if line in remain:
                    # Find the first occurrence and match the line there.
                    idx = remain.index(line) + num_document_offsets
                    self.match(line, idx, idx + 1)
                    # As above: attribute any skipped-over offsets to the
                    # previous line.
                    if idx > last_success_match_end and num_lines > 0:
                        self.match(lines[num_lines - 1], last_success_match_end, idx)
                    last_success_match_end = idx + 1
                    num_lines += 1
                    num_document_offsets = idx + 1
                # Line not found among remaining offsets: move on to the next
                # line and retry against the same offset.
                else:
                    num_lines += 1

        # All lines consumed but offsets remain: fold the leftover offsets
        # into the last line examined.
        # NOTE(review): if no line ever matched, num_lines may still be 0 and
        # lines[num_lines - 1] indexes the LAST line — verify this is intended.
        if last_success_match_end < len(document_offsets):
            self.match(
                lines[num_lines - 1], last_success_match_end, len(document_offsets)
            )

        return lines

    def match(
        self,
        matched_line: str,
        num_document_offsets_start: int,
        num_document_offsets_end: int,
    ):
        """
        Assign a single matched line to a range of document-offset entries,
        covering indices [num_document_offsets_start, num_document_offsets_end).

        The first entry in the range keeps its offsets but takes the matched
        line's text; every following entry is rewritten to ``MARK_CONTINUE`` so
        later consumers treat it as a continuation of the first.  The
        ``used_document_offsets`` bookkeeping dict is re-keyed accordingly.
        """
        document_offsets: list[tuple[str, int, int]] = self.meta["document_offsets"]
        used_document_offsets: dict[tuple[str, int, int], bool] = self.meta[
            "used_document_offsets"
        ]
        for i in range(num_document_offsets_start, num_document_offsets_end):
            document_offset = document_offsets[i]
            # First entry of the range holds the matched text; the rest become
            # MARK_CONTINUE placeholders (offsets are preserved either way).
            if i == num_document_offsets_start:
                document_offsets[i] = (
                    matched_line,
                    document_offset[1],
                    document_offset[2],
                )
            else:
                document_offsets[i] = (
                    MARK_CONTINUE,
                    document_offset[1],
                    document_offset[2],
                )
            del used_document_offsets[document_offset]
            used_document_offsets[document_offsets[i]] = False


class OffsetsInjectionBlockProcessor(BlockProcessor):
"""
A block processor to mark the words in the document and inject the offset of the block to the HTML element
Expand Down Expand Up @@ -80,10 +194,18 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
used: dict[tuple[str, int, int], bool] = {}
# Search for the block fragment in the document_offsets
for store in self.meta["document_offsets"]:
# Skip empty lines
if len(store[0]) == 0:
continue
# If already used, skip
if self.meta["used_document_offsets"][store]:
continue
(line, offset, end_offset) = store
# 如果收到 MARK_CONTINUE 标记,直接认为该标记之前的行是连续的
if line == MARK_CONTINUE:
end = end_offset
used[store] = True
continue
# If found one
if line in block:
# If the line already scanned (usually some lines with same content in different place), skip
Expand Down Expand Up @@ -111,9 +233,15 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
if start is not None and end is not None:
blocks.pop(0)
self.meta["used_document_offsets"].update(used)
# append MARK_PREVENT_RECURSION to tail of the block to prevent recursion, we don't use a handled flaglist because we don't know if there's some same block in the document
# append MARK_PREVENT_RECURSION to tail of the block to prevent recursion, we don't use a handled
# flaglist because we don't know if there's some same block in the document
self.parser.parseBlocks(parent, [block + MARK_PREVENT_RECURSION])
# fix multi blocks in same parents
if self.meta["last_parent"] == parent[-1]:
parent[-1].set("data-original-document-end", str(end))
return True
parent[-1].set("data-original-document-start", str(start))
parent[-1].set("data-original-document-end", str(end))
self.meta["last_parent"] = parent[-1]
return True
return False
42 changes: 22 additions & 20 deletions python-markdown-extension/test/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,6 @@ def __init__(self, case, test_case: unittest.TestCase):
},
},
)
print(self.case["document"])
print(self.result)
self.test_case = test_case

def test(self):
Expand Down Expand Up @@ -190,11 +188,11 @@ def test_normal(self):
case = {
"document": textwrap.dedent("""\
# Lorem ipsum
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna.
## Morbi neque lectus
Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit."""),
"expected": [
{"tag": "h1", "offset": (0, 13)},
Expand Down Expand Up @@ -277,32 +275,30 @@ def test_oi_wiki_index(self):
},
{
"tag": "p",
"offset": (780, 844),
"offset": (780, 1101),
},
# there's a div tag and a script tag in the document, and they will not be considered.
],
}
Tester(case, self).test()

def test_oi_wiki_search_dfs(self):
case = {
# I HATE TEXT BLOCKS
"document": textwrap.dedent("""\
## 引入
DFS 为图论中的概念,详见 [DFS(图论)](../graph/dfs.md) 页面。在 **搜索算法** 中,该词常常指利用递归函数方便地实现暴力枚举的算法,与图论中的 DFS 算法有一定相似之处,但并不完全相同。
## 解释
考虑这个例子:
???+ note "例题"
把正整数 $n$ 分解为 $3$ 个不同的正整数,如 $6=1+2+3$,排在后面的数必须大于等于前面的数,输出所有方案。
对于这个问题,如果不知道搜索,应该怎么办呢?
当然是三重循环,参考代码如下:
???+ note "实现"
=== "C++"
```cpp
Expand All @@ -311,7 +307,7 @@ def test_oi_wiki_search_dfs(self):
for (int k = j; k <= n; ++k)
if (i + j + k == n) printf("%d = %d + %d + %d\\n", n, i, j, k);
```
=== "Python"
```python
for i in range(1, n + 1):
Expand All @@ -320,7 +316,7 @@ def test_oi_wiki_search_dfs(self):
if i + j + k == n:
print("%d = %d + %d + %d" % (n, i, j, k))
```
=== "Java"
```Java
for (int i = 1; i < n + 1; i++) {
Expand All @@ -331,7 +327,7 @@ def test_oi_wiki_search_dfs(self):
}
}
```
那如果是分解成四个整数呢?再加一重循环?"""),
"expected": [
{
Expand All @@ -350,7 +346,10 @@ def test_oi_wiki_search_dfs(self):
"tag": "p",
"offset": (126, 133),
},
# <details class="note" open="open"> has been ignored
{
"tag": "details",
"offset": (135, 215),
},
{
"tag": "p",
"offset": (217, 239),
Expand All @@ -359,7 +358,10 @@ def test_oi_wiki_search_dfs(self):
"tag": "p",
"offset": (241, 256),
},
# <details class="note" open="open"> has been ignored
{
"tag": "details",
"offset": (258, 1092),
},
{
"tag": "p",
"offset": (1094, 1114),
Expand Down

0 comments on commit 5bc32d4

Please sign in to comment.