Skip to content

Commit

Permalink
为客户端加入了三种热词功能:中文、英文、自定义
Browse files Browse the repository at this point in the history
改进了对中文数字的搜索,当数字的左侧或者右侧有英文时,就一定会被选中。

改进了中英空格排版,能够正常输出 iPhone 4s 这样的词语。
  • Loading branch information
HaujetZhao committed May 31, 2023
1 parent 2476458 commit a7131c3
Show file tree
Hide file tree
Showing 13 changed files with 691 additions and 31 deletions.
Binary file added assets/image-20230531220203415.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/image-20230531221314983.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
66 changes: 47 additions & 19 deletions chinese_itn.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,30 +16,56 @@
__all__ = ['chinese_to_num']

import re
from string import ascii_letters



# 常见的跟在数字后面的单位
common_units = '个只分万秒'
common_units = r'个只分万亿秒'

# 总模式,筛选出可能需要替换的内容
# 测试链接 https://regex101.com/r/tFqg9S/3
pattern = re.compile(f"""(?ix) # i 表示忽略大小写,x 表示开启注释模式
(
(
[零幺一二两三四五六七八九十百千万亿点比]
|(分之)
|(?<=[一二两三四五六七八九十])[年月日号{common_units}]
){{2,}}
([a-z]\s*)?
(
(
[零幺一二两三四五六七八九十百千万点比]
|[零一二三四五六七八九十][ ]
|(?<=[一二两三四五六七八九十])[年月日号]
|(分之)
)+
(
(?<=[一二两三四五六七八九十])[a-zA-Z年月日号{common_units}]
|(?<=[一二两三四五六七八九十]\s)[a-zA-Z]
)?
(?(1)
|(?(5)
|(
[零幺一二两三四五六七八九十百千万亿点比]
|(分之)
)
)+
)
)
""")
# pattern = re.compile(f"""(?ix) # i 表示忽略大小写,x 表示开启注释模式
# (
# (
# [零幺一二两三四五六七八九十百千万亿点比]
# |(分之)
# |(?<=[一二两三四五六七八九十])[年月日号{common_units}]
# ){{2,}}
# )
# """)

# 细分匹配不同的数字类型

# 纯数字序号
pure_num = re.compile(f'[零幺一二三四五六七八九]+(点[零幺一二三四五六七八九]+)*[{common_units}]?')
pure_num = re.compile(f'[零幺一二三四五六七八九]+(点[零幺一二三四五六七八九]+)* *[a-zA-Z{common_units}]?')

# 数值
value_num = re.compile(f"十?(零?[一二两三四五六七八九十][十百千万]{{1,2}})*零?[一二三四五六七八九]?(点[零一二三四五六七八九]+)?[{common_units}]?")
value_num = re.compile(f"十?(零?[一二两三四五六七八九十][十百千万]{{1,2}})*零?[一二三四五六七八九]?(点[零一二三四五六七八九]+)? *[a-zA-Z{common_units}]?")

# 百分值
percent_value = re.compile('(?<![一二三四五六七八九])(百分之)[零一二三四五六七八九十百千万]+(点)?(?(2)[零一二三四五六七八九]+)')
Expand Down Expand Up @@ -97,15 +123,15 @@
def strip_unit(original):
    """Split a trailing unit off a Chinese-number string.

    Trailing characters that belong to ``common_units`` or are ASCII letters,
    together with any whitespace that separates them from the number, are
    treated as the unit.

    Args:
        original: The matched text, e.g. a number optionally followed by a unit.

    Returns:
        A ``(stripped, unit)`` tuple where ``stripped + unit == original``.
        ``unit`` is ``''`` when nothing was removed.
    """
    # Only the right-hand side is stripped: the unit is recovered below as the
    # suffix ``original[len(stripped):]``, which is only correct when nothing
    # has been removed from the front of the string.
    stripped = original.rstrip(common_units + ascii_letters).rstrip()
    unit = original[len(stripped):]
    return stripped, unit

def convert_pure_num(original):
def convert_pure_num(original, strict=False):
'''把中文数字转为对应的阿拉伯数字'''
stripped, unit = strip_unit(original)
if stripped in ['一']:
if stripped in ['一'] and not strict:
return original
converted = []
for c in stripped:
Expand All @@ -120,7 +146,7 @@ def convert_value_num(original):
int_part, decimal_part = stripped.split("点") # 分离小数

# 计算整数部分的值
value, temp, base = 0, 0, 0
value, temp, base = 0, 0, 1
for c in int_part:
if c == '十' :
temp = 10 if temp==0 else value_mapper[c]*temp
Expand All @@ -142,7 +168,7 @@ def convert_value_num(original):
final = str(value)

# 小数部分,就是纯数字,直接映射即可
decimal_str = convert_pure_num(decimal_part)
decimal_str = convert_pure_num(decimal_part, strict=True)
if decimal_str: final += '.' + decimal_str
final += unit

Expand Down Expand Up @@ -193,12 +219,13 @@ def convert_date_value(original):


def replace(original):
original = original.group()
head = original.group(1)
original = original.group(2)
try:
if pure_num.fullmatch(original.strip('个只分万')):
if pure_num.fullmatch(original.strip(common_units)):
num_type = '纯数字'
final = convert_pure_num(original)
elif value_num.fullmatch(original.strip('个只分万')):
elif value_num.fullmatch(original.strip(common_units)):
num_type = '数值'
final = convert_value_num(original)
elif percent_value.fullmatch(original):
Expand All @@ -218,6 +245,7 @@ def replace(original):
final = convert_date_value(original)
else:
final = original
final = head + final
except:
num_type = '未知'
final = original
Expand All @@ -230,7 +258,7 @@ def chinese_to_num(original):
if __name__ == "__main__":

# groups = []
# with open('./测试集.txt', 'r', encoding="utf-8", newline='') as f:
# with open('./old/测试集.txt', 'r', encoding="utf-8", newline='') as f:
# lines = f.readlines()
# for i in range(0, len(lines), 5):
# original = lines[i].split(maxsplit=2)[1]
Expand All @@ -244,4 +272,4 @@ def chinese_to_num(original):
# print(f'\n{original=}')
# print(f'{reference=}')
# print(f'{answer= }')
print(chinese_to_num('一万三千六'))
print(chinese_to_num(' samsung s 八'))
69 changes: 62 additions & 7 deletions core_client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# coding: utf-8

from os import path, mkdir;
from os import path, sep, mkdir;
if 'BASE_DIR' not in globals():
BASE_DIR = path.dirname(__file__);
print(f'当前基文件夹:{BASE_DIR}')
Expand All @@ -17,6 +17,10 @@
import sounddevice as sd
import websockets

import hot_sub_zh # 中文热词替换模块
import hot_sub_en # 英文热词替换模块
import hot_sub_rule # 自定义规则替换



# ============================全局变量和检查区====================================
Expand All @@ -31,8 +35,13 @@
save_audio = True # 是否保存录音文件
trash_punc = ',。,.' # 识别结果要消除的末尾标点

# todo 热词替换功能
# 英文字母拼接
hot_zh = True # Enable Chinese hot-word replacement; the hot words are stored in hot-zh.txt
hot_sub_zh.多音字 = True # True enables matching of polyphonic characters
hot_sub_zh.声调 = False # False ignores tone differences, so 「黄章」 can match 「慌张」

hot_en = True # Enable English hot-word replacement; the hot words are stored in hot-en.txt

hot_rule = True # Enable custom replacement rules; the rules are stored in hot-rule.txt

# ============================快捷键名字参考====================================

Expand Down Expand Up @@ -181,6 +190,13 @@ async def recognize():

break

# 热词替换
if hot_zh:
decoding_results = hot_sub_zh.热词替换(decoding_results)
if hot_en:
decoding_results = hot_sub_en.热词替换(decoding_results)
if hot_rule:
decoding_results = hot_sub_rule.热词替换(decoding_results)

# 打印结果
keyboard.write(decoding_results)
Expand Down Expand Up @@ -253,9 +269,9 @@ def record_open():
# 显示录音所用的音频设备
try:
device = sd.query_devices(kind='input')
print(f'\n使用默认音频设备:{device["name"]}\n')
print(f'\n使用默认音频设备:{device["name"]}')
except UnicodeDecodeError:
print("\n由于编码问题,暂时无法获得麦克风设备名字\n")
print("\n由于编码问题,暂时无法获得麦克风设备名字")

# 打开音频流
stream = sd.InputStream(
Expand All @@ -268,13 +284,44 @@ def record_open():

return stream

def _load_hot_file(file_path, template, update):
    """Create ``file_path`` from ``template`` if missing, then pass its text to ``update``.

    Returns whatever ``update`` returns (the caller prints it as the number of
    loaded entries).
    """
    if not path.exists(file_path):
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(template)
    with open(file_path, "r", encoding="utf-8") as f:
        return update(f.read())


def init_hot_words():
    """Load the enabled hot-word files from ``BASE_DIR`` and report how many entries were loaded.

    For each enabled feature (``hot_zh``, ``hot_en``, ``hot_rule``) the matching
    file (``hot-zh.txt``, ``hot-en.txt``, ``hot-rule.txt``) is created with an
    explanatory header if it does not exist, read as UTF-8 and handed to the
    corresponding ``hot_sub_*`` module.

    Raises:
        OSError / UnicodeDecodeError: propagated from opening or decoding a
        hot-word file; the caller is expected to handle them.
    """
    # path.join is used instead of ``BASE_DIR + sep + name`` so that an empty
    # BASE_DIR (dirname of a relative __file__) resolves to the current
    # directory rather than to the filesystem root.
    if hot_zh:
        num_hot_zh = _load_hot_file(
            path.join(BASE_DIR, "hot-zh.txt"),
            '# 在此文件放置中文热词,每行一个,开头带井号表示注释,会被省略',
            hot_sub_zh.更新热词词典,
        )
        print(f'\n\x9b32m已载入 {num_hot_zh:5} 条中文热词\x9b0m')
    if hot_en:
        num_hot_en = _load_hot_file(
            path.join(BASE_DIR, "hot-en.txt"),
            '# 在此文件放置英文热词 \n# Put English hot words here, one per line. Line starts with # will be ignored. ',
            hot_sub_en.更新热词词典,
        )
        print(f'\x9b32m已载入 {num_hot_en:5} 条英文热词\x9b0m')
    if hot_rule:
        num_hot_rule = _load_hot_file(
            path.join(BASE_DIR, "hot-rule.txt"),
            '# 在此文件放置自定义规则,规则是每行一条的文本,以 # 开头的会被忽略,将查找和匹配用等号隔开,文本两边的空格会被省略。例如:\n\n毫安时 = mAh\n赫兹 = Hz',
            hot_sub_rule.更新热词词典,
        )
        print(f'\x9b32m已载入 {num_hot_rule:5} 条自定义替换规则\x9b0m\n')



def show_tips():
print(f'服务端地址:\x9b33m{addr}:{port}\x9b0m')
print(f'''
项目地址:\x9b36mhttps://github.com/HaujetZhao/CapsWriter-Offline\x9b0m
当前所用快捷键:{shortcut}
项目地址:\x9b36mhttps://github.com/HaujetZhao/CapsWriter-Offline\x9b0m
你好,这是 \x9b33mCapsWriter 简陋的离线版\x9b0m,一个语音输入工具。
使用步骤:
1. 运行 Server 端,它会载入 Paraformer 模型识别模型(这会占用1GB的内存)
Expand All @@ -287,6 +334,7 @@ def show_tips():
3. 本地模型对算力要求非常低,基本无需担心性能问题
4. 为方便用户检查录音质量、识别效果,脚本默认开启了保存录音,所有都被保存在了 audios 文件夹
5. 默认的快捷键是 {shortcut},你可以打开 core_client.py 进行修改
6. 你可以在 hot-en.txt hot-zh.txt hot-rule.txt 中添加热词,客户端会在启动时载入热词
''')


Expand All @@ -306,6 +354,12 @@ async def main():
# 快捷键绑定到函数
keyboard.hook_key(shortcut, shortcut_handler)

# 载入热词
try:
init_hot_words()
except Exception as e:
print(f'载入热词失败,常见原因一般是热词文件没有使用 UTF-8 编码\n{e}')

# 打印说明
show_tips()

Expand All @@ -320,3 +374,4 @@ async def main():
asyncio.run(main())
except KeyboardInterrupt:
print(f'再见!')
exit()
16 changes: 12 additions & 4 deletions core_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,19 +52,27 @@ class args:

# ========================================================================

en_in_zh = re.compile(r"([\u4e00-\u9fa5]|[a-zA-Z]+ )?([a-zA-Z ]+)([\u4e00-\u9fa5]|[a-zA-Z]+)?")
en_in_zh = re.compile(r"""(?ix) # i 表示忽略大小写,x 表示开启注释模式
([\u4e00-\u9fa5]|[a-z0-9]+ )? # 左侧是中文,或者英文加空格
([a-z0-9 ]+) # 中间是一个或多个「英文数字加空格」
([\u4e00-\u9fa5]|[a-z0-9]+)? # 右是中文,或者英文加空格
""")

def adjust_space(original: re.Match):
# 如果拼写字母中间有空格,就把空格都去掉
if original.group(2):
final = re.sub(r'(\b\w) (?!\w{2})', r'\1', original.group(2)).strip()
final = re.sub(r'((\d) )?(\b\w) ?(?!\w{2})', r'\2\3', original.group(2)).strip()
# 测试地址 https://regex101.com/r/1Vtu7V/1
# final = re.sub(r'(\b\w) (?!\w{2})', r'\1', original.group(2)).strip()

# 如果英文的左边有汉字,给中英之间加上空格
# 如果英文的左边有汉字或英文(不是数字),给两组之间加上空格
if original.group(1):
final = original.group(1).rstrip() + ' ' + final
if not re.match(r'.*\d', original.group(1)):
final = original.group(1).rstrip() + ' ' + final
# 如果英文左边的汉字被前一个组消费了,就要手动去看一下前一个字是不是中文
elif re.match(r'[\u4e00-\u9fa5]', original.string[original.start(2) - 1]):
final = ' ' + final


# 如果英文的右边有汉字,给中英之间加上空格
if original.group(3):
Expand Down
Loading

0 comments on commit a7131c3

Please sign in to comment.