Merge pull request kubeagi#872 from wangxinbiao/main
feat:add document chunk
bjwswang authored Mar 18, 2024
2 parents f40665f + fc03782 commit 8f790fc
Showing 13 changed files with 292 additions and 61 deletions.
126 changes: 125 additions & 1 deletion apiserver/graph/generated/generated.go

Some generated files are not rendered by default.

4 changes: 4 additions & 0 deletions apiserver/graph/generated/models_gen.go

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions apiserver/graph/schema/dataprocessing.gql
@@ -95,6 +95,8 @@ query dataProcessDetails($input: DataProcessDetailsInput){
       enable
       zh_name
       description
+      chunk_size
+      chunk_overlap
       llm_config {
         name
         namespace
4 changes: 4 additions & 0 deletions apiserver/graph/schema/dataprocessing.graphqls
@@ -62,6 +62,8 @@ input FileItem {
 # Data processing configuration item
 input DataProcessConfigItem {
   type: String!
+  chunk_size: Int
+  chunk_overlap: Int
   llm_config: LLMConfigItem
   remove_duplicate_config: RemoveDuplicateConfig
 }
@@ -231,6 +233,8 @@ type DataProcessConfigChildren {
   enable: String
   zh_name: String
   description: String
+  chunk_size: Int
+  chunk_overlap: Int
   llm_config: LLMConfig
   preview: [DataProcessConfigpreView]
   file_progress: [DataProcessConfigpreFileProgress]
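With these schema additions, a task's processing configuration can carry per-item chunking parameters. A minimal sketch of what a DataProcessConfigItem payload might look like on the client side; the field names follow the schema above, while the concrete values and semantics noted in the comments are illustrative assumptions, not taken from this diff:

    import json

    # Hypothetical DataProcessConfigItem payload; values are made up.
    config_item = {
        "type": "qa_split",
        "chunk_size": 500,    # optional Int, presumably the max chunk length
        "chunk_overlap": 50,  # optional Int, presumably shared by adjacent chunks
        "llm_config": {"name": "my-llm", "namespace": "arcadia"},
    }

    print(json.dumps(config_item, ensure_ascii=False, indent=2))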
4 changes: 2 additions & 2 deletions deploy/charts/arcadia/templates/pg-init-data-configmap.yaml
@@ -305,7 +305,7 @@ data:
     task_id character varying(32) COLLATE pg_catalog."default",
     log_id character varying(32) COLLATE pg_catalog."default",
     log_datetime character varying(32) COLLATE pg_catalog."default",
-    file_name character varying(64) COLLATE pg_catalog."default",
+    file_name character varying(512) COLLATE pg_catalog."default",
     stage_name character varying(1024) COLLATE pg_catalog."default",
     stage_status character varying(64) COLLATE pg_catalog."default",
     stage_detail text COLLATE pg_catalog."default",
@@ -385,7 +385,7 @@ data:
     task_id varchar(32),
     document_id varchar(32),
     document_chunk_id varchar(32),
-    file_name varchar(64),
+    file_name varchar(512),
     question text,
     answer text,
     question_vector vector,
4 changes: 2 additions & 2 deletions pypi/data-processing/db-scripts/init-database-schema.sql
@@ -300,7 +300,7 @@
     task_id character varying(32) COLLATE pg_catalog."default",
     log_id character varying(32) COLLATE pg_catalog."default",
     log_datetime character varying(32) COLLATE pg_catalog."default",
-    file_name character varying(64) COLLATE pg_catalog."default",
+    file_name character varying(512) COLLATE pg_catalog."default",
     stage_name character varying(1024) COLLATE pg_catalog."default",
     stage_status character varying(64) COLLATE pg_catalog."default",
     stage_detail text COLLATE pg_catalog."default",
@@ -380,7 +380,7 @@
     task_id varchar(32),
     document_id varchar(32),
     document_chunk_id varchar(32),
-    file_name varchar(64),
+    file_name varchar(512),
     question text,
     answer text,
     question_vector vector,
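Both copies of the init script only affect freshly initialized databases; an existing deployment would presumably need an in-place migration to pick up the wider column. A hedged sketch, in which the connection string and table name are placeholders (the real table names live in the init scripts and are only partially visible in this diff):

    import psycopg2

    # Placeholder connection string and table name -- adjust to your deployment.
    conn = psycopg2.connect("dbname=arcadia user=postgres host=localhost")
    table = "public.my_task_log_table"  # hypothetical name, not from this diff

    with conn, conn.cursor() as cur:
        # Widen file_name from varchar(64) to varchar(512) to match the new schema.
        cur.execute(f"ALTER TABLE {table} ALTER COLUMN file_name TYPE varchar(512)")
    conn.close()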
31 changes: 13 additions & 18 deletions pypi/data-processing/src/data_store_process/minio_store_process.py
@@ -149,8 +149,6 @@ async def text_manipulate(
         if file_extension in ["pdf"]:
             # Process PDF files
             pdf_handle = PDFHandle(
-                chunk_size=req_json.get("chunk_size"),
-                chunk_overlap=req_json.get("chunk_overlap"),
                 file_name=file_name,
                 document_id=item.get("document_id"),
                 support_type=support_type,
@@ -163,8 +161,6 @@
         elif file_extension in ["docx"]:
             # Process .docx files
             result = word_handle.docx_manipulate(
-                chunk_size=req_json.get("chunk_size"),
-                chunk_overlap=req_json.get("chunk_overlap"),
                 file_name=file_name,
                 document_id=item.get("document_id"),
                 support_type=support_type,
@@ -175,8 +171,6 @@
         elif file_extension == "web":
             # Process .web files
             result = await web_handle.web_manipulate(
-                chunk_size=req_json.get("chunk_size"),
-                chunk_overlap=req_json.get("chunk_overlap"),
                 file_name=file_name,
                 document_id=item.get("document_id"),
                 support_type=support_type,
@@ -510,19 +504,20 @@ def text_manipulate_retry(req_json, pool):
     data_process_stage_log_db_operate.insert(insert_stage_log_params, pool=pool)

     # insert QA list to detail preview
-    logger.debug(
-        f"{log_tag_const.MINIO_STORE_PROCESS} Insert QA list for detail preview."
-    )
-    list_qa_params = {"task_id": task_id}
-    list_qa_res = data_process_detail_db_operate.top_n_list_qa_for_preview(
-        list_qa_params, pool=pool
-    )
+    if any(d.get("type") == "qa_split" for d in support_type):
+        logger.debug(
+            f"{log_tag_const.MINIO_STORE_PROCESS} Insert QA list for detail preview."
+        )
+        list_qa_params = {"task_id": task_id}
+        list_qa_res = data_process_detail_db_operate.top_n_list_qa_for_preview(
+            list_qa_params, pool=pool
+        )

-    for item in list_qa_res.get("data"):
-        item["transform_type"] = "qa_split"
-        item["pre_content"] = item["question"]
-        item["post_content"] = item["answer"]
-        data_process_detail_preview_db_operate.insert(item, pool=pool)
+        for item in list_qa_res.get("data"):
+            item["transform_type"] = "qa_split"
+            item["pre_content"] = item["question"]
+            item["post_content"] = item["answer"]
+            data_process_detail_preview_db_operate.insert(item, pool=pool)

     # Upload the cleaned files to MinIO
     # Upload the files under the final folder and add tags
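Two things change here: the handlers no longer read chunk_size / chunk_overlap from the top-level request (per the schema change above, those values presumably now travel inside each DataProcessConfigItem), and the retry path only rebuilds the QA preview when a qa_split transform is actually configured. A standalone sketch of the new guard, with a support_type shape inferred from the any() expression; the example entries are assumptions:

    # support_type is a list of config dicts, each carrying at least a "type" key.
    support_type = [
        {"type": "document_chunk"},  # illustrative entry, not from this diff
        {"type": "qa_split"},
    ]

    # Without a qa_split transform there are no QA pairs to preview, so the
    # insert block is skipped entirely instead of running unconditionally.
    if any(d.get("type") == "qa_split" for d in support_type):
        print("would fetch top-N QA pairs and insert preview rows")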
@@ -182,3 +182,41 @@ def list_by_status(req_json, pool):

     res = postgresql_pool_client.execute_query(pool, sql, params)
     return res
+
+
+def top_n_list_for_preview(req_json, pool):
+    """List chunk info with task id for preview.
+
+    req_json is a dictionary object, for example:
+    {
+        "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS",
+        "file_name": "MyFile.pdf"
+    }
+    pool: database connection pool
+    """
+    params = {"task_id": req_json["task_id"]}
+
+    sql = """
+        select
+            id,
+            document_id,
+            status,
+            task_id,
+            content,
+            meta_info,
+            page_number,
+            create_datetime,
+            create_user,
+            create_program,
+            update_datetime,
+            update_user,
+            update_program
+        from
+            public.data_process_task_document_chunk
+        where
+            task_id = %(task_id)s
+        order by random()
+        limit 10
+    """.strip()
+
+    res = postgresql_pool_client.execute_query(pool, sql, params)
+    return res
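A usage sketch for the new helper, which samples up to 10 random chunks of a task for preview. Only top_n_list_for_preview itself and the {"data": [...]} result shape (seen in the retry code above) come from this diff; the module import path and pool setup are assumptions for illustration:

    # Hypothetical caller; the import path is assumed, not from this diff.
    from database_operate import data_process_document_chunk_db_operate

    req_json = {"task_id": "01HGWBE48DT3ADE9ZKA62SW4WS"}
    res = data_process_document_chunk_db_operate.top_n_list_for_preview(
        req_json, pool=pool  # pool: an existing postgresql_pool_client pool
    )

    for chunk in res.get("data") or []:
        print(chunk["id"], chunk["page_number"], chunk["content"][:80])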