Skip to content

Commit

Permalink
Add Chinese support.
Browse files Browse the repository at this point in the history
  • Loading branch information
Yinan Qin committed Aug 22, 2022
1 parent 7314e1b commit 083b386
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 6 deletions.
90 changes: 90 additions & 0 deletions core/chinese2digit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from decimal import *


def chinese2digit(cn):
    """Convert a Chinese numeral string to a number.

    The text before '点' is parsed as the integer part and the text after it
    as a sequence of fractional digits.

    :param cn: Chinese numeral string, e.g. '十一' or '三点一四'.
    :return: an ``int`` when there is no fractional part, otherwise an exact
        ``Decimal``.
    :raises ValueError: if the string contains more than one '点' or a
        character that is not a known digit/unit.

    >>> chinese2digit('十一')
    11
    >>> chinese2digit('九万八千零七十六点五四三二一')
    Decimal('98076.54321')
    """
    digits = {
        '〇': 0, '一': 1, '二': 2, '三': 3, '四': 4,
        '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
        '零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4,
        '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9,
        '貮': 2, '两': 2,
    }
    # Units that scale a single digit inside a four-digit section.
    small_units = {
        '十': 10, '拾': 10, '百': 100, '佰': 100, '千': 1000, '仟': 1000,
    }
    # Units that scale a whole section (everything written to their left).
    big_units = {
        '万': 10000, '萬': 10000,
        '亿': 100000000, '億': 100000000,
        '兆': 1000000000000,
    }

    parts = cn.split('点')
    if len(parts) > 2:
        raise ValueError("more than one decimal point in %r" % (cn,))
    integer_part = parts[0]
    fraction_part = parts[1] if len(parts) == 2 else ''

    # Pass 1: scan right-to-left so each small unit is seen before the digit
    # it multiplies. Tokens are (is_big_unit, value) pairs.
    tokens = []
    pending = 0  # small unit waiting for the digit on its left
    for ch in reversed(integer_part):
        if ch in big_units:
            if pending == 10:
                # A bare '十' stands for 10 (implied leading one).
                tokens.append((False, 10))
            tokens.append((True, big_units[ch]))
            pending = 0
        elif ch in small_units:
            if pending == 10:
                # e.g. '二百十五': keep the ten instead of overwriting it.
                tokens.append((False, 10))
            pending = small_units[ch]
        elif ch in digits:
            tokens.append((False, digits[ch] * (pending or 1)))
            pending = 0
        else:
            raise ValueError(
                "unsupported character %r in %r" % (ch, cn))
    if pending == 10:
        # Leading '十', as in 10-19 or '十万'.
        tokens.append((False, 10))

    # Pass 2: evaluate left-to-right (tokens were collected in reverse).
    total = 0
    section = 0  # value accumulated since the last big unit
    for is_big, value in reversed(tokens):
        if is_big:
            # A big unit scales the current section plus whatever smaller
            # remainder is already in the total, so '一万亿' is
            # (1 * 10**4) * 10**8 rather than 10**4 + 0.
            low = total % value
            total += (low + section) * value - low
            section = 0
        else:
            section += value
    total += section

    if not fraction_part:
        return total

    fraction_digits = []
    for ch in fraction_part:
        if ch not in digits:
            raise ValueError(
                "unsupported character %r in fractional part of %r" % (ch, cn))
        fraction_digits.append(str(digits[ch]))
    # Building the Decimal from a string is exact and does not depend on (or
    # modify) the thread's decimal context.
    return Decimal('{}.{}'.format(total, ''.join(fraction_digits)))

# ————————————————
# 版权声明:本文为CSDN博主「XerCis」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
# 原文链接:https://blog.csdn.net/lly1122334/article/details/107004681
19 changes: 13 additions & 6 deletions core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
import shutil
import tkinter as tk
from core.chinese2digit import chinese2digit as c2d


def split_txt(input_file, output_path, logger) -> None:
Expand All @@ -28,17 +29,23 @@ def split_txt(input_file, output_path, logger) -> None:
break
line = line.rstrip('\r\n')

pattern = r'[第章回部节集卷] *[\d一二三四五六七八九十零〇百千两]+ *[第章回部节集卷] '
chapter = re.search(pattern, line)
if chapter is not None:
logger.insert(tk.INSERT, "[ info ] 找到:%s" % str(chapter[0]) + "\n")
chapter = re.search(r"[\d一二三四五六七八九十零〇百千两]+", chapter[0])
pattern = r'[第章回部节集卷] *[\d一二三四五六七八九十零〇百千两]+ *[第章回部节集卷]( |、)'
chapter_org = re.search(pattern, line)
if chapter_org is not None:
logger.insert(tk.INSERT, "[ info ] 找到:%s" % str(chapter_org[0]) + "\n")
chapter = re.search(r"[\d]+", chapter_org[0])
if not chapter:
chapter = re.search(r"[一二三四五六七八九十零〇百千两]+", chapter_org[0])
chapter = c2d(chapter[0])
else:
chapter = chapter[0]

# Find new Chapter
if save_file is not None:
save_file.close()

save_file_path = os.path.join(output_path, "temp")
save_file_path = os.path.join(save_file_path, "%s.txt" % str(chapter[0]))
save_file_path = os.path.join(save_file_path, "%s.txt" % str(chapter))
save_file = open(save_file_path, mode='a', encoding='utf-8')
save_file.write(line)
save_file.write("\n")
Expand Down

0 comments on commit 083b386

Please sign in to comment.