Skip to content

Commit

Permalink
v2.1.16: 优化搜索功能,增加错误提示检测并简化正则表达式,更新JM发布页 (#102)
Browse files Browse the repository at this point in the history
  • Loading branch information
hect0x7 authored Aug 24, 2023
1 parent 18ba10e commit 6948743
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 10 deletions.
2 changes: 1 addition & 1 deletion src/jmcomic/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
# 被依赖方 <--- 使用方
# config <--- entity <--- toolkit <--- client <--- option <--- downloader

__version__ = '2.1.15'
__version__ = '2.1.16'

from .api import *
3 changes: 3 additions & 0 deletions src/jmcomic/jm_client_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,9 @@ def headers_key_ts(self):
"accept-encoding": "gzip",
}, key_ts

def debug_topic_request(self) -> str:
    """Return the fixed topic string ``'api'``.

    NOTE(review): presumably this labels the client's request debug
    output; the consumer of the value is not visible here, so confirm
    against the callers.
    """
    return 'api'


class AsyncSaveImageClient(JmImageClient):

Expand Down
16 changes: 13 additions & 3 deletions src/jmcomic/jm_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,15 @@ def default_postman_constructor(session, **kwargs):
return Postmans.new_postman(**kwargs)


def default_raise_regex_error(msg, *_args, **_kwargs):
    """Raise ``AssertionError`` carrying ``msg``.

    Any additional positional or keyword arguments are accepted and
    ignored, so call sites may pass extra context without this default
    implementation needing to handle it.

    :param msg: the message used to construct the ``AssertionError``
    :raises AssertionError: always
    """
    raise AssertionError(msg)


class JmModuleConfig:
# 网站相关
PROT = "https://"
JM_REDIRECT_URL = f'{PROT}jm365.work/3YeBdF' # 永久網域,怕走失的小伙伴收藏起来
JM_PUB_URL = f'{PROT}jmcomic2.bet'
JM_PUB_URL = f'{PROT}jmcomic.ltd'
JM_CDN_IMAGE_URL_TEMPLATE = PROT + 'cdn-msp.{domain}/media/photos/{photo_id}/{index:05}{suffix}' # index 从1开始
JM_IMAGE_SUFFIX = ['.jpg', '.webp', '.png', '.gif']

Expand Down Expand Up @@ -62,6 +66,9 @@ class JmModuleConfig:
debug_executor = default_jm_debug
# postman构造函数
postman_constructor = default_postman_constructor
# 网页正则表达式解析失败时,执行抛出异常的函数,可以替换掉用于debug
raise_regex_error_executor = default_raise_regex_error

# debug开关标记
enable_jm_debug = True

Expand Down Expand Up @@ -163,7 +170,7 @@ def get_jmcomic_url(cls, postman=None):
postman = postman or cls.new_postman(session=True)

url = postman.with_redirect_catching().get(cls.JM_REDIRECT_URL)
cls.jm_debug('获取禁漫地址', f'[{cls.JM_REDIRECT_URL}] → [{url}]')
cls.jm_debug('获取禁漫URL', f'[{cls.JM_REDIRECT_URL}] → [{url}]')
return url

@classmethod
Expand All @@ -181,7 +188,10 @@ def get_jmcomic_domain_all(cls, postman=None):
raise AssertionError(resp.text)

from .jm_toolkit import JmcomicText
return JmcomicText.analyse_jm_pub_html(resp.text)
domain_list = JmcomicText.analyse_jm_pub_html(resp.text)

cls.jm_debug('获取禁漫全部域名', f'[{resp.url}] → {domain_list}')
return domain_list

album_comment_headers = {
'authority': '18comic.vip',
Expand Down
29 changes: 23 additions & 6 deletions src/jmcomic/jm_toolkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,12 @@ def match_field(field_key: str, pattern: Union[Pattern, List[Pattern]], text):
field_value = match_field(field_name, pattern_value, html)

if field_value is None:
write_text('./resp.txt', html) # debug
raise AssertionError(f"文本没有匹配上字段:字段名为'{field_name}',pattern: [{pattern_value.pattern}]")
JmModuleConfig.raise_regex_error_executor(
f"文本没有匹配上字段:字段名为'{field_name}',pattern: [{pattern_value.pattern}]",
html,
field_name,
pattern_value
)

# 保存字段
field_dict[field_name] = field_value
Expand All @@ -167,9 +171,7 @@ def format_url(cls, path, domain=None):

class JmSearchSupport:
# 用来缩减html的长度
pattern_html_search_shorten_for = compile('<div class="well well-sm">([\s\S]*)'
'<div class="row">[\s\S]*'
'<div class="bot-per visible-xs visible-sm">')
pattern_html_search_shorten_for = compile('<div class="well well-sm">([\s\S]*)<div class="row">')

# 用来提取搜索页面的album的信息
pattern_html_search_album_info_list = compile(
Expand All @@ -185,9 +187,24 @@ class JmSearchSupport:
# 用来查找tag列表
pattern_html_search_tag_list = compile('<a href=".*?">(.*?)</a>')

# 查找错误,例如 [错误,關鍵字過短,請至少輸入兩個字以上。]
pattern_html_search_error = compile('<fieldset>\n<legend>(.*?)</legend>\n<div class=.*?>\n(.*?)\n</div>\n</fieldset>')

@classmethod
def analyse_jm_search_html(cls, html: str) -> JmSearchPage:
html = cls.pattern_html_search_shorten_for.search(html)[0]
# 检查是否失败
match = cls.pattern_html_search_error.search(html)
if match is not None:
topic, reason = match[1], match[2]
JmModuleConfig.raise_regex_error_executor(f'{topic}: {reason}', html)

# 缩小文本范围
match = cls.pattern_html_search_shorten_for.search(html)
if match is None:
JmModuleConfig.raise_regex_error_executor('未匹配到搜索结果', html)
html = match[0]

# 提取结果
album_info_list = cls.pattern_html_search_album_info_list.findall(html)

for i, (album_id, title, *args) in enumerate(album_info_list):
Expand Down

0 comments on commit 6948743

Please sign in to comment.