Merge pull request #236 from wangxinbiao/main
feat:chunks pdf files to generate Q&A
bjwswang authored Nov 22, 2023
2 parents 5ba5f5d + 12f620b commit 31f0f88
Showing 12 changed files with 2,074 additions and 146 deletions.
3 changes: 3 additions & 0 deletions data-processing/data_manipulation/common/config.py
@@ -22,3 +22,6 @@

# zhipuai api_key
zhipuai_api_key = os.getenv('ZHIPUAI_API_KEY', 'xxxxx')

knowledge_chunk_size = os.getenv("KNOWLEDGE_CHUNK_SIZE", 500)
knowledge_chunk_overlap = os.getenv("KNOWLEDGE_CHUNK_OVERLAP", 50)
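
Note: os.getenv returns a string whenever the environment variable is actually set, so these values can reach the text splitter as str rather than int. A minimal hardening sketch (the int() coercion is an assumption, not part of this diff):

knowledge_chunk_size = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 500))
knowledge_chunk_overlap = int(os.getenv("KNOWLEDGE_CHUNK_OVERLAP", 50))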
16 changes: 8 additions & 8 deletions data-processing/data_manipulation/file_handle/csv_handle.py
@@ -65,8 +65,6 @@ async def text_manipulate(opt={}):
    # Read the contents of the CSV file
data = pd.read_csv(file_path)

logger.info('data')

logger.info("start text manipulate!")
text_data = data['prompt']

@@ -81,7 +79,7 @@ async def text_manipulate(opt={}):
return clean_result

text_data = clean_result['data']

    # Save the cleaned file with the 'final' phase name
new_file_name = await file_utils.get_file_name({
'file_name': file_name,
@@ -117,6 +115,8 @@ async def text_manipulate(opt={}):
# content:
# 1) Implemented basic functionality
###


async def data_clean(opt={}):
logger.info("csv text data clean start!")
support_type = opt['support_type']
@@ -138,7 +138,9 @@ async def data_clean(opt={}):
}

clean_data.append(result['data'])

data = clean_data
data.insert(0, ['prompt'])

    # Save the file with the 'middle' phase name
file_name = await file_utils.get_file_name({
@@ -171,6 +173,8 @@ async def data_clean(opt={}):
# content:
# 1) Implemented basic functionality
###


async def remove_invisible_characters(opt={}):
return await clean_transform.remove_invisible_characters({
'text': opt['text']
@@ -221,10 +225,6 @@ async def save_csv(opt={}):

with open(file_path, 'w', newline='') as file:
writer = csv.writer(file)

writer.writerow(['prompt'])

for row in data:
writer.writerow([row])
writer.writerows(data)

return file_path
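
With the switch to writer.writerows(data), save_csv no longer writes a 'prompt' header itself; callers must now include the header as the first row (data_clean inserts ['prompt'], and generate_QA in pdf_handle.py starts with ['q', 'a']). A hedged usage sketch with illustrative rows:

# Each element of data is one CSV row; the header travels as row 0.
await csv_handle.save_csv({
    'file_name': 'example.csv',   # hypothetical file name
    'phase_value': 'final',
    'data': [['q', 'a'], ['What does this PR add?', 'PDF chunking and QA generation.']]
})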
237 changes: 237 additions & 0 deletions data-processing/data_manipulation/file_handle/pdf_handle.py
@@ -0,0 +1,237 @@
# Copyright 2023 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

###
# PDF file processing
# @author: wangxinbiao
# @date: 2023-11-01 16:43:01
# modify history
# ==== 2023-11-01 16:43:01 ====
# author: wangxinbiao
# content:
# 1) Implemented basic functionality
###

import logging
import os
import pandas as pd

from common import config
from file_handle import csv_handle
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import SpacyTextSplitter
from pypdf import PdfReader
from transform.text import clean_transform, privacy_transform, QA_transform
from utils import file_utils


logger = logging.getLogger('pdf_handle')

###
# Text data processing
# @author: wangxinbiao
# @date: 2023-11-17 16:14:01
# modify history
# ==== 2023-11-17 16:14:01 ====
# author: wangxinbiao
# content:
# 1) Implemented basic functionality
###


async def text_manipulate(request, opt={}):
logger.info("pdf text manipulate!")

"""
数据处理逻辑:
处理某条数据时,如果某个方式(比如:去除不可见字符)处理失败了,则直接结束,不在处理,整个文件都视作处理失败
"""

try:

file_name = opt['file_name']
support_type = opt['support_type']

pdf_file_path = await file_utils.get_temp_file_path()
file_path = pdf_file_path + 'original/' + file_name


        # Read the contents of the PDF file
content = await get_content({
"file_path": file_path
})

logger.info("start text manipulate!")

        # Data cleaning
clean_result = await data_clean({
'support_type': support_type,
'file_name': file_name,
'data': content
})

if clean_result['status'] != 200:
return clean_result

content = clean_result['data']

        # Privacy removal (placeholder, no logic yet)


        # QA splitting
if 'qa_split' in support_type:
qa_data = await generate_QA(request, {
'support_type': support_type,
'data': content
})

            # Save the generated QA data as a CSV file
new_file_name = await file_utils.get_file_name({
'file_name': file_name,
'handle_name': 'final'
})

file_name_without_extension = file_name.rsplit('.', 1)[0]

await csv_handle.save_csv({
'file_name': file_name_without_extension + '.csv',
'phase_value': 'final',
'data': qa_data
})

return {
'status': 200,
'message': '',
'data': ''
}
    except Exception as ex:
        # Surface the failure instead of silently swallowing the exception
        logger.error("pdf text manipulate failed: %s", ex)
        return {
            'status': 400,
            'message': str(ex),
            'data': ''
        }

###
# Data anomaly cleaning
# @author: wangxinbiao
# @date: 2023-11-17 16:14:01
# modify history
# ==== 2023-11-17 16:14:01 ====
# author: wangxinbiao
# content:
# 1) Implemented basic functionality
###


async def data_clean(opt={}):
logger.info("pdf text data clean start!")
support_type = opt['support_type']
data = opt['data']

    # Remove invisible characters
if 'remove_invisible_characters' in support_type:
result = await clean_transform.remove_invisible_characters({
'text': data
})

if result['status'] != 200:
return {
'status': 400,
                'message': 'Failed to remove invisible characters',
'data': ''
}

data = result['data']

logger.info("pdf text data clean stop!")

return {
'status': 200,
'message': '',
'data': data
}


###
# Get the PDF content
# @author: wangxinbiao
# @date: 2023-11-17 16:14:01
# modify history
# ==== 2023-11-17 16:14:01 ====
# author: wangxinbiao
# content:
# 1) Implemented basic functionality
###


async def get_content(opt={}):
file_path = opt['file_path']

reader = PdfReader(file_path)
pages = reader.pages
content = ""
for page in pages:
content += page.extract_text()

return content
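
A caveat on the loop above: page.extract_text() returns an empty string for scanned or image-only pages, so content can come back empty. A defensive sketch (the empty-text check and ValueError are my own illustration, not part of this diff):

from pypdf import PdfReader

reader = PdfReader(file_path)
content = "".join(page.extract_text() or "" for page in reader.pages)
if not content.strip():
    # Image-only PDFs have no text layer; fail loudly instead of feeding
    # an empty string into the cleaning and QA pipeline.
    raise ValueError("no extractable text in PDF")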

###
# QA splitting
# @author: wangxinbiao
# @date: 2023-11-17 16:14:01
# modify history
# ==== 2023-11-17 16:14:01 ====
# author: wangxinbiao
# content:
# 1) Implemented basic functionality
###


async def generate_QA(request, opt={}):
request_json = request.json

    # Split the text into chunks
chunk_size = config.knowledge_chunk_size
if "chunk_size" in request_json:
chunk_size = request_json['chunk_size']

chunk_overlap = config.knowledge_chunk_overlap
if "chunk_overlap" in request_json:
chunk_overlap = request_json['chunk_overlap']

separator = "\n\n"

text_splitter = SpacyTextSplitter(
separator=separator,
pipeline="zh_core_web_sm",
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
texts = text_splitter.split_text(opt['data'])

    # Generate QA pairs
qa_list = [['q', 'a']]

for item in texts:
text = item.replace("\n", "")
data = await QA_transform.generate_QA({
'text': text
})

qa_list.extend(data)

return qa_list
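
For orientation, here is the chunking step above in isolation as a minimal sketch; the input text is illustrative, the sizes mirror the defaults in common/config.py, and zh_core_web_sm must already be installed (see the requirements note below):

from langchain.text_splitter import SpacyTextSplitter

splitter = SpacyTextSplitter(
    separator="\n\n",
    pipeline="zh_core_web_sm",  # Chinese spaCy pipeline, as in generate_QA
    chunk_size=500,
    chunk_overlap=50,
)
chunks = splitter.split_text(pdf_text)  # pdf_text: e.g. the output of get_content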
3 changes: 2 additions & 1 deletion data-processing/data_manipulation/server.py
@@ -72,7 +72,8 @@ async def text_manipulate(request):
Args:
        type: which processing steps to apply to the text data
        file_path: path of the text file
        bucket_name: MinIO bucket name
        folder_prefix: folder (directory) prefix inside MinIO
Returns:
@@ -28,7 +28,7 @@
import os

import pandas as pd
from file_handle import csv_handle
from file_handle import csv_handle, pdf_handle
from minio import Minio
from minio.commonconfig import Tags
from minio.error import S3Error
@@ -80,6 +80,13 @@ async def text_manipulate(request):
'support_type': support_type
})

elif file_extension in ['pdf']:
            # Handle PDF files
result = await pdf_handle.text_manipulate(request, {
'file_name': item,
'support_type': support_type
})

        # Upload the cleaned files to MinIO
        # Upload the files under the middle folder and add tags
tags = Tags(for_object=True)
@@ -135,14 +142,16 @@ async def download(opt={}):
for obj in objects:
file_name = obj.object_name[len(folder_prefix):]

data = minio_client.get_object(bucket_name, obj.object_name)
df = pd.read_csv(data)
csv_file_path = await file_utils.get_temp_file_path()

        # Create the directory if it does not exist
directory_path = csv_file_path + 'original'
if not os.path.exists(directory_path):
os.makedirs(directory_path)

file_path = directory_path + '/' + file_name

await csv_handle.save_csv({
'file_name': file_name,
'phase_value': 'original',
'data': df['prompt']
})
minio_client.fget_object(bucket_name, obj.object_name, file_path)
file_names.append(file_name)

return file_names
@@ -87,9 +87,6 @@ async def formatSplitText(text):
q = match[1]
a = match[4]
if q and a:
result.append({
'q': q,
'a': a
})
result.append([q, a])

return result
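
Returning [q, a] lists instead of dicts lets formatSplitText's output flow straight into csv.writer.writerows via save_csv. A hedged illustration with made-up values:

# formatSplitText now yields rows like:
result = [['What is chunk_size?', 'The maximum length of each text chunk.']]
# generate_QA prepends the header, so the CSV payload becomes:
qa_list = [['q', 'a']] + result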
5 changes: 4 additions & 1 deletion data-processing/requirements.txt
@@ -5,4 +5,7 @@ sanic_cors==2.2.0
aiohttp==3.8.6
ulid==1.1
minio==7.1.17
zhipuai==1.0.7
zhipuai==1.0.7
langchain==0.0.336
spacy==3.5.4
pypdf==3.17.1
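
One assumption in pdf_handle.py is that the zh_core_web_sm spaCy model is already available; spacy==3.5.4 alone does not install it. A hedged setup sketch using spaCy's programmatic downloader (equivalent to running `python -m spacy download zh_core_web_sm`):

import spacy

# Fetch the Chinese pipeline used by SpacyTextSplitter, if it is missing.
try:
    spacy.load("zh_core_web_sm")
except OSError:
    spacy.cli.download("zh_core_web_sm")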
4 changes: 4 additions & 0 deletions gqlgen.yaml
@@ -141,6 +141,10 @@ models:
resolver: true
listKnowledgeBases:
resolver: true
DataProcessQuery:
fields:
allDataProcessListByPage:
resolver: true
DatasetQuery:
fields:
getDataset: