From 083b3860d424888e1ac8339e40a271d08cafecf0 Mon Sep 17 00:00:00 2001
From: Yinan Qin <1522846127@qq.com>
Date: Mon, 22 Aug 2022 21:24:55 +0800
Subject: [PATCH] Add Chinese support.

---
 core/chinese2digit.py | 90 +++++++++++++++++++++++++++++++++++++++++++
 core/utils.py         | 19 ++++++---
 2 files changed, 103 insertions(+), 6 deletions(-)
 create mode 100644 core/chinese2digit.py

diff --git a/core/chinese2digit.py b/core/chinese2digit.py
new file mode 100644
index 0000000..72a4d06
--- /dev/null
+++ b/core/chinese2digit.py
@@ -0,0 +1,90 @@
+from decimal import *
+
+
+def chinese2digit(cn):
+    """Convert a Chinese numeral string to a number.
+
+    Characters are scanned right to left; 万/亿/兆 close a section whose
+    value is scaled by that unit. A unit-free run of several digits
+    (e.g. '二〇二二') is read positionally.
+    If '点' occurs more than once, the fractional part is ignored.
+
+    :param cn: Chinese numeral, optionally with a fraction after '点'
+    :return: int, or an exact Decimal when a fractional part is present
+    :raises ValueError: if a character is not a known digit or unit
+    >>> chinese2digit('十一')
+    11
+    >>> chinese2digit('九万八千零七十六点五四三二一')
+    Decimal('98076.54321')
+    >>> chinese2digit('一亿二千万')
+    120000000
+    >>> chinese2digit('二〇二二')
+    2022
+    """
+    CN_NUM = {
+        '〇': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
+        '零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9,
+        '貮': 2, '两': 2
+    }
+    CN_UNIT = {
+        '十': 10, '拾': 10, '百': 100, '佰': 100, '千': 1000, '仟': 1000, '万': 10000, '萬': 10000,
+        '亿': 100000000, '億': 100000000, '兆': 1000000000000
+    }
+    SECTION_UNITS = (10000, 100000000, 1000000000000)  # 万, 亿, 兆
+
+    def digit(ch):
+        # Fail with a clear error instead of propagating None downstream.
+        if ch not in CN_NUM:
+            raise ValueError('invalid Chinese numeral character: %r' % ch)
+        return CN_NUM[ch]
+
+    parts = cn.split('点')
+    integer = list(parts[0])  # integer part
+    decimal = list(parts[1]) if len(parts) == 2 else []  # fractional part
+
+    if len(integer) > 1 and not any(ch in CN_UNIT for ch in integer):
+        # No unit characters at all: positional digits, not a sum.
+        result = int(''.join(str(digit(ch)) for ch in integer))
+    else:
+        unit = 0  # multiplier waiting for the next digit (0 = none)
+        parse = []  # (is_section, value) pairs, collected right to left
+        while integer:
+            x = integer.pop()
+            # A unit scales the digit on its left, i.e. the next one popped.
+            if x in CN_UNIT:
+                unit = CN_UNIT[x]
+                if unit in SECTION_UNITS:
+                    # Section marker; the digit before it is taken as-is.
+                    parse.append((True, unit))
+                    unit = 1
+            else:
+                value = digit(x)
+                if unit:
+                    value *= unit
+                    unit = 0
+                parse.append((False, value))
+        if unit == 10:  # leading bare '十', as in 10-19
+            parse.append((False, 10))
+
+        # Replay left to right, folding each finished section into the total.
+        result = 0
+        tmp = 0  # value of the section currently being accumulated
+        while parse:
+            is_section, value = parse.pop()
+            if is_section:
+                result += tmp * value
+                tmp = 0
+            else:
+                tmp += value
+        result += tmp
+
+    if decimal:
+        # Build the Decimal from text: exact for any magnitude, and it
+        # leaves the thread's decimal context precision untouched.
+        fraction = ''.join(str(digit(ch)) for ch in decimal)
+        result = Decimal('%d.%s' % (result, fraction))
+    return result
+
+# ————————————————
+# 版权声明：本文为CSDN博主「XerCis」的原创文章，遵循CC 4.0 BY-SA版权协议，转载请附上原文出处链接及本声明。
+# 原文链接：https://blog.csdn.net/lly1122334/article/details/107004681
diff --git a/core/utils.py b/core/utils.py
index 03440e0..97ddea8 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -2,6 +2,7 @@
 import re
 import shutil
 import tkinter as tk
+from core.chinese2digit import chinese2digit as c2d
 
 
 def split_txt(input_file, output_path, logger) -> None:
@@ -28,17 +29,23 @@ def split_txt(input_file, output_path, logger) -> None:
                 break
             line = line.rstrip('\r\n')
 
-            pattern = r'[第章回部节集卷] *[\d一二三四五六七八九十零〇百千两]+ *[第章回部节集卷] '
-            chapter = re.search(pattern, line)
-            if chapter is not None:
-                logger.insert(tk.INSERT, "[ info ] 找到：%s" % str(chapter[0]) + "\n")
-                chapter = re.search(r"[\d一二三四五六七八九十零〇百千两]+", chapter[0])
+            pattern = r'[第章回部节集卷] *[\d一二三四五六七八九十零〇百千两]+ *[第章回部节集卷]( |、)'
+            chapter_org = re.search(pattern, line)
+            if chapter_org is not None:
+                logger.insert(tk.INSERT, "[ info ] 找到：%s" % str(chapter_org[0]) + "\n")
+                chapter = re.search(r"[\d]+", chapter_org[0])
+                if not chapter:
+                    chapter = re.search(r"[一二三四五六七八九十零〇百千两]+", chapter_org[0])
+                    chapter = c2d(chapter[0])
+                else:
+                    chapter = chapter[0]
+
                 # Find new Chapter
                 if save_file is not None:
                     save_file.close()
 
                 save_file_path = os.path.join(output_path, "temp")
-                save_file_path = os.path.join(save_file_path, "%s.txt" % str(chapter[0]))
+                save_file_path = os.path.join(save_file_path, "%s.txt" % str(chapter))
                 save_file = open(save_file_path, mode='a', encoding='utf-8')
                 save_file.write(line)
                 save_file.write("\n")