-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #207 from wangxinbiao/main
feat:generate Q&A pairs from text by zhipuai
- Loading branch information
Showing
4 changed files
with
101 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
96 changes: 96 additions & 0 deletions
96
data-process/data_manipulation/transform/text/QA_transform.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# Copyright 2023 KubeAGI. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import zhipuai | ||
import re | ||
|
||
from common import ( | ||
config | ||
) | ||
|
||
### | ||
# QA生成 | ||
# @author: wangxinbiao | ||
# @date: 2023-11-15 16:30:01 | ||
# modify history | ||
# ==== 2023-11-15 16:30:01 ==== | ||
# author: wangxinbiao | ||
# content: | ||
# 1) 基本功能实现 | ||
### | ||
|
||
async def generate_QA(opt=None):
    """Generate question/answer pairs from a piece of text via the zhipuai API.

    The input text is appended to a fixed Chinese instruction prompt asking
    the model for up to 25 ``Q<n>: ... / A<n>: ...`` pairs, the prompt is sent
    to the ``chatglm_6b`` model, and the reply is parsed by
    ``formatSplitText``.

    Args:
        opt: Mapping of options. Must contain the key ``'text'`` holding the
            source text to generate questions from.

    Returns:
        Whatever ``formatSplitText`` returns for the model's reply (a list of
        ``{'q': ..., 'a': ...}`` dicts).

    Raises:
        KeyError: If ``opt`` has no ``'text'`` entry, or if the API response
            does not contain the expected ``data`` / ``choices`` / ``content``
            structure.
    """
    # Use a None sentinel instead of a shared mutable ``{}`` default.
    if opt is None:
        opt = {}

    zhipuai.api_key = config.zhipuai_api_key

    text = opt['text']
    content = """
我会给你一段文本,它们可能包含多个主题内容,学习它们,并整理学习成果,要求为:
1. 提出最多 25 个问题。
2. 给出每个问题的答案。
3. 答案要详细完整,答案可以包含普通文字、链接、代码、表格、公示、媒体链接等 markdown 元素。
4. 按格式返回多个问题和答案:
Q1: 问题。
A1: 答案。
Q2:
A2:
……
我的文本:
"""

    content = content + text

    # NOTE(review): this call is not awaited, so it runs synchronously and
    # blocks the event loop for the duration of the request — consider
    # moving it to an executor if this coroutine is served concurrently.
    response = zhipuai.model_api.invoke(
        model="chatglm_6b",
        prompt=[{"role": "user", "content": content}],
        top_p=0.7,
        temperature=0.9,
    )

    # The first choice's content holds the raw "Q1: ... A1: ..." text;
    # turn it into structured Q&A pairs.
    result = await formatSplitText(response['data']['choices'][0]['content'])

    return result
|
||
|
||
### | ||
# 对QA进行格式化 | ||
# @author: wangxinbiao | ||
# @date: 2023-11-15 16:30:01 | ||
# modify history | ||
# ==== 2023-11-15 16:30:01 ==== | ||
# author: wangxinbiao | ||
# content: | ||
# 1) 基本功能实现 | ||
### | ||
async def formatSplitText(text):
    """Split model output of the form ``Q1: ... A1: ... Q2: ...`` into pairs.

    Args:
        text: Raw text containing numbered ``Q<n>:`` / ``A<n>:`` markers.
            Literal backslash-n sequences (the two characters ``\\`` and
            ``n``) are removed before parsing.

    Returns:
        A list of ``{'q': question, 'a': answer}`` dicts in the order they
        appear. Pairs with an empty question or answer are skipped. Empty or
        ``None`` input yields an empty list.
    """
    if not text:
        return []

    # An answer runs until the next full "Q<number>:" marker or the end of
    # the text. Anchoring the lookahead on the whole marker (rather than a
    # bare "Q") keeps answers containing the letter "Q" — e.g. "SQL" — from
    # being cut short.
    pattern = re.compile(r'Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q\d+:|$)')

    # Drop literal "\n" escape sequences embedded in the text.
    text = text.replace('\\n', '')

    result = []
    for question, answer in pattern.findall(text):
        # The lazy answer group keeps whitespace that precedes the next
        # marker; trim it so answers are clean.
        answer = answer.strip()
        if question and answer:
            result.append({
                'q': question,
                'a': answer
            })

    return result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,4 +4,5 @@ sanic==23.6.0 | |
sanic_cors==2.2.0 | ||
aiohttp==3.8.6 | ||
ulid==1.1 | ||
minio==7.1.17 | ||
minio==7.1.17 | ||
zhipuai==1.0.7 |