Skip to content

Commit

Permalink
refactor: heuristics marking
Browse files Browse the repository at this point in the history
  • Loading branch information
shaokeyibb committed Jul 4, 2024
1 parent e357c04 commit 5bc32d4
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 23 deletions.
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from markdown import Extension, Markdown
from markdown.preprocessors import Preprocessor
from markdown.blockprocessors import BlockProcessor
Expand All @@ -6,15 +7,29 @@

MARK_PREVENT_RECURSION: str = "\t\t\t\r\r\rMARK_PREVENT_RECURSION\r\r\r\t\t\t"

MARK_CONTINUE: str = "\t\t\t\r\r\rMARK_CONTINUE\r\r\r\t\t\t"

# @see: markdown.util.HTML_PLACEHOLDER_RE
# PYTHON_MARKDOWN_HTML_PLACEHOLDER_RE: re.Pattern[str] = re.compile(
# "\u0002wzxhzdk:%s\u0003" % r"([0-9]+)"
# )


class MainExtension(Extension):
    """Markdown extension wiring together the offset-tracking pipeline.

    Registers two preprocessors and one block processor that share a single
    mutable ``meta`` dict, used to map rendered HTML elements back to their
    character offsets in the original source document.
    """

    def extendMarkdown(self, md: Markdown):
        """Register the processors on *md* with carefully chosen priorities.

        Args:
            md: The Markdown parser instance being extended.
        """
        # Shared mutable state passed to every processor:
        #   document_offsets:       list of (line, start, end) tuples per source line
        #   used_document_offsets:  maps each tuple -> whether it was consumed
        #   last_parent:            last element annotated, to merge multi-block parents
        meta: dict = {
            "document_offsets": [],
            "used_document_offsets": {},
            "last_parent": None,
        }
        md.preprocessors.register(
            CalculateDocumentOffsetPreprocessor(md, meta), "capture_document", 1000
        )  # Highest priority is required because we need to calc words offset from original document
        md.preprocessors.register(
            FixDocumentOffsetPreprocessor(md, meta), "fix_document", 0
        )  # Lowest priority is required because we need to fix the offset after all other preprocessors
        md.parser.blockprocessors.register(
            OffsetsInjectionBlockProcessor(md.parser, meta), "mark_words", 200
        )  # high priority, usually larger than every other block processor


Expand All @@ -32,6 +47,9 @@ def run(self, lines: list[str]) -> list[str]:
for line in lines:
# Skip empty lines
if len(line) == 0:
store: tuple[str, int, int] = (line, offset, offset + 1)
self.meta["document_offsets"].append(store)
self.meta["used_document_offsets"][store] = False
offset += 1
continue
# store the line and offset
Expand All @@ -43,6 +61,102 @@ def run(self, lines: list[str]) -> list[str]:
return lines


class FixDocumentOffsetPreprocessor(Preprocessor):
    """
    A preprocessor to fix the offset of each line after the 3rd party extension processed the document.

    Other preprocessors may insert, drop or rewrite lines, so the
    (line, start, end) tuples captured earlier in ``meta["document_offsets"]``
    can drift out of sync with the actual line list.  This pass re-aligns the
    two sequences: each processed line is matched against the recorded
    offsets, and offsets that no longer correspond one-to-one to a line are
    collapsed onto their matching line (the extras marked ``MARK_CONTINUE``).
    """

    def __init__(self, md: Markdown, meta: dict):
        # meta is the dict shared by all processors registered by MainExtension;
        # this pass reads and rewrites meta["document_offsets"] /
        # meta["used_document_offsets"] in place.
        super(FixDocumentOffsetPreprocessor, self).__init__(md)
        self.meta = meta

    def run(self, lines: list[str]) -> list[str]:
        """Re-align ``meta["document_offsets"]`` with *lines*.

        Walks both sequences in lockstep, greedily matching lines to recorded
        offsets, and returns *lines* unchanged — only the shared meta dict is
        mutated.
        """
        document_offsets: list[tuple[str, int, int]] = self.meta["document_offsets"]

        # End index (exclusive) of the last successfully matched run of
        # document offsets — offsets below this index are already re-aligned.
        last_success_match_end: int = 0
        num_lines: int = 0
        num_document_offsets: int = 0
        while num_document_offsets < len(document_offsets) and num_lines < len(lines):
            line = lines[num_lines]
            document_offset: tuple[str, int, int] = document_offsets[
                num_document_offsets
            ]

            # Exact match between the current line and the current offset entry.
            if document_offset[0] == line:
                # Match this line to the current offset entry.
                self.match(line, num_document_offsets, num_document_offsets + 1)
                # If there is a gap of unmatched offsets since the last success,
                # attribute that gap to the previous line.
                if num_document_offsets > last_success_match_end and num_lines > 0:
                    self.match(
                        lines[num_lines - 1],
                        last_success_match_end,
                        num_document_offsets,
                    )
                last_success_match_end = num_document_offsets + 1
                num_lines += 1
                num_document_offsets += 1
            # No exact match: search for this line further ahead in the
            # remaining document offsets.
            else:
                remain: list[str] = [
                    line for line, _, _ in document_offsets[num_document_offsets:]
                ]
                # The line occurs somewhere in the remaining offsets.
                if line in remain:
                    # Find the first occurrence and match the line there.
                    idx = remain.index(line) + num_document_offsets
                    self.match(line, idx, idx + 1)
                    # As above: attribute any skipped-over offsets to the
                    # previous line.
                    if idx > last_success_match_end and num_lines > 0:
                        self.match(lines[num_lines - 1], last_success_match_end, idx)
                    last_success_match_end = idx + 1
                    num_lines += 1
                    num_document_offsets = idx + 1
                # Line not found among remaining offsets: move on to the next
                # line and retry against the same offset.
                else:
                    num_lines += 1

        # All lines consumed but offsets remain: fold the leftover offsets
        # into the last line examined.
        # NOTE(review): if no line ever matched, num_lines may still be 0 and
        # lines[num_lines - 1] indexes the LAST line — verify this is intended.
        if last_success_match_end < len(document_offsets):
            self.match(
                lines[num_lines - 1], last_success_match_end, len(document_offsets)
            )

        return lines

    def match(
        self,
        matched_line: str,
        num_document_offsets_start: int,
        num_document_offsets_end: int,
    ):
        """
        Assign a single matched line to a range of document-offset entries,
        covering indices [num_document_offsets_start, num_document_offsets_end).

        The first entry in the range keeps its offsets but takes the matched
        line's text; every following entry is rewritten to ``MARK_CONTINUE`` so
        later consumers treat it as a continuation of the first.  The
        ``used_document_offsets`` bookkeeping dict is re-keyed accordingly.
        """
        document_offsets: list[tuple[str, int, int]] = self.meta["document_offsets"]
        used_document_offsets: dict[tuple[str, int, int], bool] = self.meta[
            "used_document_offsets"
        ]
        for i in range(num_document_offsets_start, num_document_offsets_end):
            document_offset = document_offsets[i]
            # First entry of the range holds the matched text; the rest become
            # MARK_CONTINUE placeholders (offsets are preserved either way).
            if i == num_document_offsets_start:
                document_offsets[i] = (
                    matched_line,
                    document_offset[1],
                    document_offset[2],
                )
            else:
                document_offsets[i] = (
                    MARK_CONTINUE,
                    document_offset[1],
                    document_offset[2],
                )
            del used_document_offsets[document_offset]
            used_document_offsets[document_offsets[i]] = False


class OffsetsInjectionBlockProcessor(BlockProcessor):
"""
A block processor to mark the words in the document and inject the offset of the block to the HTML element
Expand Down Expand Up @@ -80,10 +194,18 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
used: dict[tuple[str, int, int], bool] = {}
# Search for the block fragment in the document_offsets
for store in self.meta["document_offsets"]:
# Skip empty lines
if len(store[0]) == 0:
continue
# If already used, skip
if self.meta["used_document_offsets"][store]:
continue
(line, offset, end_offset) = store
# 如果收到 MARK_CONTINUE 标记,直接认为该标记之前的行是连续的
if line == MARK_CONTINUE:
end = end_offset
used[store] = True
continue
# If found one
if line in block:
# If the line already scanned (usually some lines with same content in different place), skip
Expand Down Expand Up @@ -111,9 +233,15 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
if start is not None and end is not None:
blocks.pop(0)
self.meta["used_document_offsets"].update(used)
# append MARK_PREVENT_RECURSION to tail of the block to prevent recursion, we don't use a handled flaglist because we don't know if there's some same block in the document
# append MARK_PREVENT_RECURSION to tail of the block to prevent recursion, we don't use a handled
# flaglist because we don't know if there's some same block in the document
self.parser.parseBlocks(parent, [block + MARK_PREVENT_RECURSION])
# fix multi blocks in same parents
if self.meta["last_parent"] == parent[-1]:
parent[-1].set("data-original-document-end", str(end))
return True
parent[-1].set("data-original-document-start", str(start))
parent[-1].set("data-original-document-end", str(end))
self.meta["last_parent"] = parent[-1]
return True
return False
42 changes: 22 additions & 20 deletions python-markdown-extension/test/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,6 @@ def __init__(self, case, test_case: unittest.TestCase):
},
},
)
print(self.case["document"])
print(self.result)
self.test_case = test_case

def test(self):
Expand Down Expand Up @@ -190,11 +188,11 @@ def test_normal(self):
case = {
"document": textwrap.dedent("""\
# Lorem ipsum
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna.
## Morbi neque lectus
Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit."""),
"expected": [
{"tag": "h1", "offset": (0, 13)},
Expand Down Expand Up @@ -277,32 +275,30 @@ def test_oi_wiki_index(self):
},
{
"tag": "p",
"offset": (780, 844),
"offset": (780, 1101),
},
# there's a div tag and a script tag in the document, and they will not be considered.
],
}
Tester(case, self).test()

def test_oi_wiki_search_dfs(self):
case = {
# I HATE TEXT BLOCKS
"document": textwrap.dedent("""\
## 引入
DFS 为图论中的概念,详见 [DFS(图论)](../graph/dfs.md) 页面。在 **搜索算法** 中,该词常常指利用递归函数方便地实现暴力枚举的算法,与图论中的 DFS 算法有一定相似之处,但并不完全相同。
## 解释
考虑这个例子:
???+ note "例题"
把正整数 $n$ 分解为 $3$ 个不同的正整数,如 $6=1+2+3$,排在后面的数必须大于等于前面的数,输出所有方案。
对于这个问题,如果不知道搜索,应该怎么办呢?
当然是三重循环,参考代码如下:
???+ note "实现"
=== "C++"
```cpp
Expand All @@ -311,7 +307,7 @@ def test_oi_wiki_search_dfs(self):
for (int k = j; k <= n; ++k)
if (i + j + k == n) printf("%d = %d + %d + %d\\n", n, i, j, k);
```
=== "Python"
```python
for i in range(1, n + 1):
Expand All @@ -320,7 +316,7 @@ def test_oi_wiki_search_dfs(self):
if i + j + k == n:
print("%d = %d + %d + %d" % (n, i, j, k))
```
=== "Java"
```Java
for (int i = 1; i < n + 1; i++) {
Expand All @@ -331,7 +327,7 @@ def test_oi_wiki_search_dfs(self):
}
}
```
那如果是分解成四个整数呢?再加一重循环?"""),
"expected": [
{
Expand All @@ -350,7 +346,10 @@ def test_oi_wiki_search_dfs(self):
"tag": "p",
"offset": (126, 133),
},
# <details class="note" open="open"> has been ignored
{
"tag": "details",
"offset": (135, 215),
},
{
"tag": "p",
"offset": (217, 239),
Expand All @@ -359,7 +358,10 @@ def test_oi_wiki_search_dfs(self):
"tag": "p",
"offset": (241, 256),
},
# <details class="note" open="open"> has been ignored
{
"tag": "details",
"offset": (258, 1092),
},
{
"tag": "p",
"offset": (1094, 1114),
Expand Down

0 comments on commit 5bc32d4

Please sign in to comment.