diff --git a/data-process/data_manipulation/common/config.py b/data-process/data_manipulation/common/config.py index def88e1aa..c086fba28 100644 --- a/data-process/data_manipulation/common/config.py +++ b/data-process/data_manipulation/common/config.py @@ -19,3 +19,6 @@ minio_api_url = os.getenv('MINIO_API_URL', '192.168.90.31:9000') # 如果使用HTTP,将secure设置为False;如果使用HTTPS,将其设置为True minio_secure = os.getenv('MINIO_SECURE', False) + +# zhipuai api_key +zhipuai_api_key = os.getenv('ZHIPUAI_API_KEY', 'xxxxx') \ No newline at end of file diff --git a/data-process/data_manipulation/file_handle/csv_handle.py b/data-process/data_manipulation/file_handle/csv_handle.py index dca6d0c90..e689bdc99 100644 --- a/data-process/data_manipulation/file_handle/csv_handle.py +++ b/data-process/data_manipulation/file_handle/csv_handle.py @@ -72,7 +72,6 @@ async def text_manipulate(opt={}): logger.info('data') - clean_text_list = [] logger.info("start text manipulate!") text_data = data['prompt'] diff --git a/data-process/data_manipulation/transform/text/QA_transform.py b/data-process/data_manipulation/transform/text/QA_transform.py new file mode 100644 index 000000000..81391a5a4 --- /dev/null +++ b/data-process/data_manipulation/transform/text/QA_transform.py @@ -0,0 +1,96 @@ +# Copyright 2023 KubeAGI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import zhipuai +import re + +from common import ( + config +) + +### +# QA生成 +# @author: wangxinbiao +# @date: 2023-11-15 16:30:01 +# modify history +# ==== 2023-11-15 16:30:01 ==== +# author: wangxinbiao +# content: +# 1) 基本功能实现 +### + +async def generate_QA(opt={}): + zhipuai.api_key = config.zhipuai_api_key + + text = opt['text'] + content = """ + 我会给你一段文本,它们可能包含多个主题内容,学习它们,并整理学习成果,要求为: + 1. 提出最多 25 个问题。 + 2. 给出每个问题的答案。 + 3. 答案要详细完整,答案可以包含普通文字、链接、代码、表格、公示、媒体链接等 markdown 元素。 + 4. 按格式返回多个问题和答案: + + Q1: 问题。 + A1: 答案。 + Q2: + A2: + …… + + 我的文本: + """ + + content = content + text + + response = zhipuai.model_api.invoke( + model="chatglm_6b", + prompt=[{"role": "user", "content": content}], + top_p=0.7, + temperature=0.9, + ) + + # 格式化后的QA对 + result = await formatSplitText(response['data']['choices'][0]['content']) + + return result + + +### +# 对QA进行格式化 +# @author: wangxinbiao +# @date: 2023-11-15 16:30:01 +# modify history +# ==== 2023-11-15 16:30:01 ==== +# author: wangxinbiao +# content: +# 1) 基本功能实现 +### +async def formatSplitText(text): + + pattern = re.compile(r'Q\d+:(\s*)(.*?)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q|$)') + + # 移除换行符 + text = text.replace('\\n', '') + matches = pattern.findall(text) + + result = [] + for match in matches: + q = match[1] + a = match[4] + if q and a: + result.append({ + 'q': q, + 'a': a + }) + + return result \ No newline at end of file diff --git a/data-process/requirements.txt b/data-process/requirements.txt index 74a6e3d9d..b45e4c6e8 100644 --- a/data-process/requirements.txt +++ b/data-process/requirements.txt @@ -4,4 +4,5 @@ sanic==23.6.0 sanic_cors==2.2.0 aiohttp==3.8.6 ulid==1.1 -minio==7.1.17 \ No newline at end of file +minio==7.1.17 +zhipuai==1.0.7 \ No newline at end of file