为客户端加入了三种热词功能：中文、英文、自定义

改进了对中文数字的搜索，当数字的左侧或者右侧有英文时，就一定会被选中。改进了中英空格排版，能够正常输出 iPhone 4s 这样的词语。
HaujetZhao · May 31, 2023 · a7131c3 · a7131c3
1 parent 2476458
commit a7131c3
Show file tree

Hide file tree

Showing 13 changed files with 691 additions and 31 deletions.
diff --git a/assets/image-20230531220203415.png b/assets/image-20230531220203415.png
diff --git a/assets/image-20230531221314983.png b/assets/image-20230531221314983.png
diff --git a/chinese_itn.py b/chinese_itn.py
@@ -16,30 +16,56 @@
 __all__ = ['chinese_to_num']
 
 import re
+from string import ascii_letters
 
 
 
 # 常见的跟在数字后面的单位
-common_units = '个只分万秒'       
+common_units = r'个只分万亿秒'       
 
 # 总模式，筛选出可能需要替换的内容
+# 测试链接  https://regex101.com/r/tFqg9S/3
 pattern = re.compile(f"""(?ix)          # i 表示忽略大小写，x 表示开启注释模式
-    (
-        (
-            [零幺一二两三四五六七八九十百千万亿点比]
-            |(分之)
-            |(?<=[一二两三四五六七八九十])[年月日号{common_units}]
-        ){{2,}}
+([a-z]\s*)?
+(
+  (
+    [零幺一二两三四五六七八九十百千万点比]
+    |[零一二三四五六七八九十][ ]
+    |(?<=[一二两三四五六七八九十])[年月日号]
+    |(分之)
+  )+
+  (
+    (?<=[一二两三四五六七八九十])[a-zA-Z年月日号{common_units}]
+    |(?<=[一二两三四五六七八九十]\s)[a-zA-Z]
+  )?
+  (?(1)
+  |(?(5)
+    |(
+      [零幺一二两三四五六七八九十百千万亿点比]
+      |(分之)
     )
+  )+
+  )
+)
+
 """)
+# pattern = re.compile(f"""(?ix)          # i 表示忽略大小写，x 表示开启注释模式
+#     (
+#         (
+#             [零幺一二两三四五六七八九十百千万亿点比]
+#             |(分之)
+#             |(?<=[一二两三四五六七八九十])[年月日号{common_units}]
+#         ){{2,}}
+#     )
+# """)
 
 # 细分匹配不同的数字类型
 
 # 纯数字序号
-pure_num = re.compile(f'[零幺一二三四五六七八九]+(点[零幺一二三四五六七八九]+)*[{common_units}]?')
+pure_num = re.compile(f'[零幺一二三四五六七八九]+(点[零幺一二三四五六七八九]+)* *[a-zA-Z{common_units}]?')
 
 # 数值
-value_num = re.compile(f"十?(零?[一二两三四五六七八九十][十百千万]{{1,2}})*零?[一二三四五六七八九]?(点[零一二三四五六七八九]+)?[{common_units}]?")
+value_num = re.compile(f"十?(零?[一二两三四五六七八九十][十百千万]{{1,2}})*零?[一二三四五六七八九]?(点[零一二三四五六七八九]+)? *[a-zA-Z{common_units}]?")
 
 # 百分值
 percent_value = re.compile('(?<![一二三四五六七八九])(百分之)[零一二三四五六七八九十百千万]+(点)?(?(2)[零一二三四五六七八九]+)')
@@ -97,15 +123,15 @@
 def strip_unit(original):
     '''把数字后面跟着的单位剥离开'''
     unit = ''       
-    stripped = original.strip(common_units)
+    stripped = original.strip(common_units + ascii_letters).strip()
     if stripped != original: 
         unit = original[len(stripped):]
     return stripped, unit
 
-def convert_pure_num(original):
+def convert_pure_num(original, strict=False):
     '''把中文数字转为对应的阿拉伯数字'''
     stripped, unit = strip_unit(original)
-    if stripped in ['一']:
+    if stripped in ['一'] and not strict:
         return original
     converted = []
     for c in stripped:
@@ -120,7 +146,7 @@ def convert_value_num(original):
     int_part, decimal_part = stripped.split("点")   # 分离小数
 
     # 计算整数部分的值
-    value, temp, base = 0, 0, 0
+    value, temp, base = 0, 0, 1
     for c in int_part:
         if c == '十' : 
             temp = 10 if temp==0 else value_mapper[c]*temp
@@ -142,7 +168,7 @@ def convert_value_num(original):
     final = str(value)
 
     # 小数部分，就是纯数字，直接映射即可
-    decimal_str = convert_pure_num(decimal_part)
+    decimal_str = convert_pure_num(decimal_part, strict=True)
     if decimal_str: final += '.' + decimal_str
     final += unit
 
@@ -193,12 +219,13 @@ def convert_date_value(original):
 
 
 def replace(original):
-    original = original.group()
+    head = original.group(1)
+    original = original.group(2)
     try:
-        if pure_num.fullmatch(original.strip('个只分万')):
+        if pure_num.fullmatch(original.strip(common_units)):
             num_type = '纯数字'
             final = convert_pure_num(original)
-        elif value_num.fullmatch(original.strip('个只分万')):
+        elif value_num.fullmatch(original.strip(common_units)):
             num_type = '数值'
             final = convert_value_num(original)
         elif percent_value.fullmatch(original):
@@ -218,6 +245,7 @@ def replace(original):
             final = convert_date_value(original)
         else:
             final = original
+        final = head + final
     except:
         num_type = '未知'
         final = original
@@ -230,7 +258,7 @@ def chinese_to_num(original):
 if __name__ == "__main__":
 
     # groups = []
-    # with open('./测试集.txt', 'r', encoding="utf-8", newline='') as f:
+    # with open('./old/测试集.txt', 'r', encoding="utf-8", newline='') as f:
     #     lines = f.readlines()
     #     for i in range(0, len(lines), 5):
     #         original = lines[i].split(maxsplit=2)[1]
@@ -244,4 +272,4 @@ def chinese_to_num(original):
     #     print(f'\n{original=}')
     #     print(f'{reference=}') 
     #     print(f'{answer=   }') 
-    print(chinese_to_num('一万三千六'))
+    print(chinese_to_num(' samsung s 八'))
diff --git a/core_client.py b/core_client.py
@@ -1,6 +1,6 @@
 # coding: utf-8
 
-from os import path, mkdir; 
+from os import path, sep, mkdir; 
 if 'BASE_DIR' not in globals():
     BASE_DIR = path.dirname(__file__); 
 print(f'当前基文件夹：{BASE_DIR}')
@@ -17,6 +17,10 @@
 import sounddevice as sd
 import websockets
 
+import hot_sub_zh   # 中文热词替换模块
+import hot_sub_en   # 英文热词替换模块
+import hot_sub_rule   # 自定义规则替换
+
 
 
 # ============================全局变量和检查区====================================
@@ -31,8 +35,13 @@
 save_audio = True           # 是否保存录音文件
 trash_punc = '，。,.'        # 识别结果要消除的末尾标点
 
-# todo 热词替换功能
-# 英文字母拼接
+hot_zh = True              # 是否启用中文热词替换，中文热词存储在 hot-zh.txt 文件里
+hot_sub_zh.多音字 = True    # True 表示多音字匹配
+hot_sub_zh.声调  = False    # False 表示忽略声调区别，这样「黄章」就能匹配「慌张」
+
+hot_en = True              # 是否启用英文热词替换，英文热词存储在 hot-en.txt 文件里
+
+hot_rule = True            # 是否启用自定义规则替换，自定义规则存储在 hot-rule.txt 文件里
 
 # ============================快捷键名字参考====================================
 
@@ -181,6 +190,13 @@ async def recognize():
 
         break
 
+    # 热词替换
+    if hot_zh: 
+        decoding_results = hot_sub_zh.热词替换(decoding_results)
+    if hot_en: 
+        decoding_results = hot_sub_en.热词替换(decoding_results)
+    if hot_rule: 
+        decoding_results = hot_sub_rule.热词替换(decoding_results)
 
     # 打印结果
     keyboard.write(decoding_results)
@@ -253,9 +269,9 @@ def record_open():
     # 显示录音所用的音频设备
     try:
         device = sd.query_devices(kind='input')
-        print(f'\n使用默认音频设备：{device["name"]}\n')
+        print(f'\n使用默认音频设备：{device["name"]}')
     except UnicodeDecodeError:
-        print("\n由于编码问题，暂时无法获得麦克风设备名字\n")
+        print("\n由于编码问题，暂时无法获得麦克风设备名字")
 
     # 打开音频流
     stream = sd.InputStream(
@@ -268,13 +284,44 @@ def record_open():
 
     return stream
 
+def init_hot_words():
+    global BASE_DIR, hot_zh, hot_en, hot_rule
+
+    path_zh = BASE_DIR + sep + "hot-zh.txt"
+    path_en = BASE_DIR + sep + "hot-en.txt"
+    path_rule = BASE_DIR + sep + "hot-rule.txt"
+
+    if hot_zh:
+        if not path.exists(path_zh):
+            with open(path_zh, "w", encoding="utf-8") as f:
+                f.write('# 在此文件放置中文热词，每行一个，开头带井号表示注释，会被省略')
+        with open(path_zh, "r", encoding="utf-8") as f: 
+            num_hot_zh = hot_sub_zh.更新热词词典(f.read())
+        print(f'\n\x9b32m已载入 {num_hot_zh:5} 条中文热词\x9b0m')
+    if hot_en:
+        if not path.exists(path_en):
+            with open(path_en, "w", encoding='utf-8') as f:
+                f.write('# 在此文件放置英文热词 \n# Put English hot words here, one per line. Line starts with # will be ignored. ')
+        with open(path_en, "r", encoding="utf-8") as f: 
+            num_hot_en = hot_sub_en.更新热词词典(f.read())
+        print(f'\x9b32m已载入 {num_hot_en:5} 条英文热词\x9b0m')
+    if hot_rule:
+        if not path.exists(path_rule):
+            with open(path_rule, "w", encoding='utf-8') as f:
+                f.write('# 在此文件放置自定义规则，规则是每行一条的文本，以 # 开头的会被忽略，将查找和匹配用等号隔开，文本两边的空格会被省略。例如：\n\n毫安时 = mAh\n赫兹 = Hz')
+        with open(path_rule, "r", encoding="utf-8") as f: 
+            num_hot_rule = hot_sub_rule.更新热词词典(f.read())
+        print(f'\x9b32m已载入 {num_hot_rule:5} 条自定义替换规则\x9b0m\n')
+
+
+
 def show_tips():
     print(f'服务端地址：\x9b33m{addr}:{port}\x9b0m')
     print(f'''
-项目地址：\x9b36mhttps://github.com/HaujetZhao/CapsWriter-Offline\x9b0m
-
 当前所用快捷键：{shortcut}
 
+项目地址：\x9b36mhttps://github.com/HaujetZhao/CapsWriter-Offline\x9b0m
+
 你好，这是 \x9b33mCapsWriter 简陋的离线版\x9b0m，一个语音输入工具。
 使用步骤：
     1. 运行 Server 端，它会载入 Paraformer 模型识别模型（这会占用1GB的内存）
@@ -287,6 +334,7 @@ def show_tips():
     3. 本地模型对算力要求非常低，基本无需担心性能问题
     4. 为方便用户检查录音质量、识别效果，脚本默认开启了保存录音，所有都被保存在了 audios 文件夹
     5. 默认的快捷键是 {shortcut}，你可以打开 core_client.py 进行修改
+    6. 你可以在  hot-en.txt  hot-zh.txt  hot-rule.txt  中添加热词，客户端会在启动时载入热词
     ''')
 
 
@@ -306,6 +354,12 @@ async def main():
     # 快捷键绑定到函数
     keyboard.hook_key(shortcut, shortcut_handler)
 
+    # 载入热词
+    try:
+        init_hot_words()
+    except Exception as e:
+        print(f'载入热词失败，常见原因一般是热词文件没有使用 UTF-8 编码\n{e}')
+
     # 打印说明
     show_tips()
 
@@ -320,3 +374,4 @@ async def main():
         asyncio.run(main())
     except KeyboardInterrupt:
         print(f'再见！')
+        exit()
diff --git a/core_server.py b/core_server.py
@@ -52,19 +52,27 @@ class args:
 
 # ========================================================================
 
-en_in_zh = re.compile(r"([\u4e00-\u9fa5]|[a-zA-Z]+ )?([a-zA-Z ]+)([\u4e00-\u9fa5]|[a-zA-Z]+)?")
+en_in_zh = re.compile(r"""(?ix)    # i 表示忽略大小写，x 表示开启注释模式
+    ([\u4e00-\u9fa5]|[a-z0-9]+ )?      # 左侧是中文，或者英文加空格
+    ([a-z0-9 ]+)                    # 中间是一个或多个「英文数字加空格」
+    ([\u4e00-\u9fa5]|[a-z0-9]+)?       # 右侧是中文，或者英文数字（不带空格）
+""")
 
 def adjust_space(original: re.Match):
     # 如果拼写字母中间有空格，就把空格都去掉
     if original.group(2):
-        final = re.sub(r'(\b\w) (?!\w{2})', r'\1', original.group(2)).strip()
+        final = re.sub(r'((\d) )?(\b\w) ?(?!\w{2})', r'\2\3', original.group(2)).strip()
+        # 测试地址 https://regex101.com/r/1Vtu7V/1
+        # final = re.sub(r'(\b\w) (?!\w{2})', r'\1', original.group(2)).strip()
 
-    # 如果英文的左边有汉字，给中英之间加上空格
+    # 如果英文的左边有汉字或英文（不是数字），给两组之间加上空格
     if original.group(1):
-        final = original.group(1).rstrip() + ' ' + final
+        if not re.match(r'.*\d', original.group(1)):
+            final = original.group(1).rstrip() + ' ' + final
     # 如果英文左边的汉字被前一个组消费了，就要手动去看一下前一个字是不是中文
     elif re.match(r'[\u4e00-\u9fa5]', original.string[original.start(2) - 1]): 
         final = ' ' + final
+
 
     # 如果英文的右边有汉字，给中英之间加上空格
     if original.group(3):