From 083b3860d424888e1ac8339e40a271d08cafecf0 Mon Sep 17 00:00:00 2001 From: Yinan Qin <1522846127@qq.com> Date: Mon, 22 Aug 2022 21:24:55 +0800 Subject: [PATCH] Add Chinese support. --- core/chinese2digit.py | 90 +++++++++++++++++++++++++++++++++++++++++++ core/utils.py | 19 ++++++--- 2 files changed, 103 insertions(+), 6 deletions(-) create mode 100644 core/chinese2digit.py diff --git a/core/chinese2digit.py b/core/chinese2digit.py new file mode 100644 index 0000000..72a4d06 --- /dev/null +++ b/core/chinese2digit.py @@ -0,0 +1,90 @@ +from decimal import * + + +def chinese2digit(cn): + """中文转数字 + + :param cn: 中文字符串 + :return: 数字 + >>> chinese2digit('十一') + 11 + >>> chinese2digit('九万八千零七十六点五四三二一') + Decimal('98076.54321') + """ + CN_NUM = { + '〇': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, + '零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, + '貮': 2, '两': 2 + } + CN_UNIT = { + '十': 10, '拾': 10, '百': 100, '佰': 100, '千': 1000, '仟': 1000, '万': 10000, '萬': 10000, + '亿': 100000000, '億': 100000000, '兆': 1000000000000 + } + + cn = cn.split('点') + integer = list(cn[0]) # 整数部分 + decimal = list(cn[1]) if len(cn) == 2 else [] # 小数部分 + unit = 0 # 当前单位 + parse = [] # 解析数组 + while integer: + x = integer.pop() + if x in CN_UNIT: + # 当前字符是单位 + unit = CN_UNIT.get(x) + if unit == 10000: + parse.append('w') # 万位 + unit = 1 + elif unit == 100000000: + parse.append('y') # 亿位 + unit = 1 + elif unit == 1000000000000: # 兆位 + parse.append('z') + unit = 1 + continue + else: + # 当前字符是数字 + dig = CN_NUM.get(x) + if unit: + dig = dig * unit + unit = 0 + parse.append(dig) + + if unit == 10: # 处理10-19的数字 + parse.append(10) + + result = 0 + tmp = 0 + while parse: + x = parse.pop() + if x == 'w': + tmp *= 10000 + result += tmp + tmp = 0 + elif x == 'y': + tmp *= 100000000 + result += tmp + tmp = 0 + elif x == 'z': + tmp *= 1000000000000 + result += tmp + tmp = 0 + else: + tmp += x + result += tmp + + if decimal: + unit = 0.1 + getcontext().prec = len(decimal) # 小数精度 + result = Decimal(float(result)) + tmp = Decimal(0) + for x in decimal: + dig = CN_NUM.get(x) + tmp += Decimal(str(dig)) * Decimal(str(unit)) + unit *= 0.1 + getcontext().prec = len(result.to_eng_string()) + len(decimal) # 完整精度 + result += tmp + return result + +# ———————————————— +# 版权声明:本文为CSDN博主「XerCis」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。 +# 原文链接:https://blog.csdn.net/lly1122334/article/details/107004681 diff --git a/core/utils.py b/core/utils.py index 03440e0..97ddea8 100644 --- a/core/utils.py +++ b/core/utils.py @@ -2,6 +2,7 @@ import re import shutil import tkinter as tk +from core.chinese2digit import chinese2digit as c2d def split_txt(input_file, output_path, logger) -> None: @@ -28,17 +29,23 @@ def split_txt(input_file, output_path, logger) -> None: break line = line.rstrip('\r\n') - pattern = r'[第章回部节集卷] *[\d一二三四五六七八九十零〇百千两]+ *[第章回部节集卷] ' - chapter = re.search(pattern, line) - if chapter is not None: - logger.insert(tk.INSERT, "[ info ] 找到:%s" % str(chapter[0]) + "\n") - chapter = re.search(r"[\d一二三四五六七八九十零〇百千两]+", chapter[0]) + pattern = r'[第章回部节集卷] *[\d一二三四五六七八九十零〇百千两]+ *[第章回部节集卷]( |、)' + chapter_org = re.search(pattern, line) + if chapter_org is not None: + logger.insert(tk.INSERT, "[ info ] 找到:%s" % str(chapter_org[0]) + "\n") + chapter = re.search(r"[\d]+", chapter_org[0]) + if not chapter: + chapter = re.search(r"[一二三四五六七八九十零〇百千两]+", chapter_org[0]) + chapter = c2d(chapter[0]) + else: + chapter = chapter[0] + # Find new Chapter if save_file is not None: save_file.close() save_file_path = os.path.join(output_path, "temp") - save_file_path = os.path.join(save_file_path, "%s.txt" % str(chapter[0])) + save_file_path = os.path.join(save_file_path, "%s.txt" % str(chapter)) save_file = open(save_file_path, mode='a', encoding='utf-8') save_file.write(line) save_file.write("\n")