Skip to content

Commit

Permalink
Merge pull request #260 from wangxinbiao/main
Browse files Browse the repository at this point in the history
feat:Add Data Processing List, Add, Delete APIs, and Dockerfile
  • Loading branch information
bjwswang authored Nov 22, 2023
2 parents 31f0f88 + f71511a commit fe574c5
Show file tree
Hide file tree
Showing 15 changed files with 891 additions and 62 deletions.
4 changes: 3 additions & 1 deletion data-processing/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ __pycache__

mock_data

log
log

file_handle/temp_file
47 changes: 47 additions & 0 deletions data-processing/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
FROM python:3.10.13-slim

ENV TZ=Asia/Shanghai

# Use the Tsinghua mirror for faster apt access from CN networks.
RUN sed -i 's/deb.debian.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list.d/debian.sources

RUN export DEBIAN_FRONTEND=noninteractive \
    && apt-get update \
    && apt-get install -y tzdata python3-distutils curl python3-pip wget \
    && ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && dpkg-reconfigure --frontend noninteractive tzdata \
    && rm -rf /var/lib/apt/lists/*
    # rm of the apt lists keeps this layer (and the image) small

# Pre-install the spaCy Chinese model wheel, then delete the wheel so it
# does not linger in the layer.
RUN wget https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.5.0/zh_core_web_sm-3.5.0-py3-none-any.whl -O /tmp/zh_core_web_sm-3.5.0-py3-none-any.whl \
    && pip3 install --no-cache-dir /tmp/zh_core_web_sm-3.5.0-py3-none-any.whl -i https://pypi.org/simple \
    && rm /tmp/zh_core_web_sm-3.5.0-py3-none-any.whl

# Placeholder credentials/config — override these at deploy time.
ENV MINIO_ACCESSKEY=minio_accesskey
ENV MINIO_SECRETKEY=minio_secretkey
ENV MINIO_API_URL=localhost:9000
ENV MINIO_SECURE=False

ENV ZHIPUAI_API_KEY=xxxxx

ENV KNOWLEDGE_CHUNK_SIZE=500
ENV KNOWLEDGE_CHUNK_OVERLAP=50

ENV PG_HOST=localhost
ENV PG_PORT=5432
ENV PG_USER=postgres
ENV PG_PASSWORD=xxxxx
ENV PG_DATABASE=data_process

EXPOSE 28888

WORKDIR /arcadia_app

# Install Python dependencies BEFORE copying the source tree so that code
# changes do not invalidate the (slow) pip layer cache.
COPY requirements.txt /arcadia_app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# COPY is preferred over ADD for plain files/directories.
COPY . /arcadia_app/

# Executable bit is all that is needed; 777 is unnecessarily permissive.
RUN chmod 755 /arcadia_app/entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]


14 changes: 0 additions & 14 deletions data-processing/Dockerfile.base

This file was deleted.

11 changes: 9 additions & 2 deletions data-processing/data_manipulation/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,14 @@
# Whether to talk to MinIO over TLS. Environment values are strings, so a
# value of "False" would be truthy — parse the common truthy spellings
# explicitly instead of relying on string truthiness.
minio_secure = os.getenv('MINIO_SECURE', 'False').lower() in ('true', '1', 'yes')

# zhipuai api_key
zhipuai_api_key = os.getenv('ZHIPUAI_API_KEY', 'xxxxxx')

# Text-chunking parameters. Env values arrive as strings, so cast to int
# (the defaults are already ints, which int() passes through unchanged).
knowledge_chunk_size = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 500))
knowledge_chunk_overlap = int(os.getenv("KNOWLEDGE_CHUNK_OVERLAP", 50))

# PostgreSQL connection settings
pg_host = os.getenv("PG_HOST", "localhost")
pg_port = int(os.getenv("PG_PORT", 5432))
pg_user = os.getenv("PG_USER", "postgres")
pg_password = os.getenv("PG_PASSWORD", "xxxxx")
pg_database = os.getenv("PG_DATABASE", "data_process")
39 changes: 39 additions & 0 deletions data-processing/data_manipulation/common/special_characters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright 2023 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import string

import emoji

# Characters treated as "special" during text cleaning: ASCII punctuation,
# digits and whitespace, a hand-picked set of unicode symbols, and every emoji.
MAIN_SPECIAL_CHARACTERS = string.punctuation + string.digits + string.whitespace
OTHER_SPECIAL_CHARACTERS = (
    "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
    "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
    "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
    "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
    "」﴾》"
)
# Every emoji known to the emoji package (iterating EMOJI_DATA yields its keys).
EMOJI = [*emoji.EMOJI_DATA]
SPECIAL_CHARACTERS = set(MAIN_SPECIAL_CHARACTERS) | set(OTHER_SPECIAL_CHARACTERS)
SPECIAL_CHARACTERS.update(EMOJI)

# various whitespaces for whitespace normalization
# whitespaces in unicode can be found here:
# https://en.wikipedia.org/wiki/Whitespace_character
VARIOUS_WHITESPACES = {
    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
    ' ', ' ', ' ', ' ', '​', '‌', '‍', '⁠', '', '„'
}
207 changes: 207 additions & 0 deletions data-processing/data_manipulation/db/data_process_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
# Copyright 2023 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


###
# Data processing task (database access)
# @author: wangxinbiao
# @date: 2023-11-21 13:57:01
# modify history
# ==== 2023-11-21 13:57:01 ====
# author: wangxinbiao
# content:
# 1) Initial implementation
###

import ulid
import ujson

from datetime import datetime
from utils import pg_utils
from sanic.response import json


async def list_by_page(request, opt=None):
    """Return one page of data-process tasks whose name matches a keyword.

    The request body must contain 'keyword', 'page' (assumed 1-based —
    TODO confirm against the caller) and 'pageSize'. opt must supply
    'conn', an open PostgreSQL connection. Returns a sanic JSON response
    with the matching rows.
    """
    # None default avoids the shared-mutable-default-argument pitfall.
    opt = {} if opt is None else opt
    conn = opt['conn']

    req_json = request.json

    page = int(req_json['page'])
    page_size = int(req_json['pageSize'])

    params = {
        'keyword': '%' + req_json['keyword'] + '%',
        # OFFSET counts rows, not pages: skip the rows of all preceding
        # pages. (Previously the raw page number was used as the offset.)
        'offset': max(page - 1, 0) * page_size,
        'pageSize': page_size
    }

    sql = """
        select
          id,
          name,
          status,
          pre_data_set_name,
          pre_data_set_version,
          post_data_set_name,
          post_data_set_version,
          start_datetime
        from
          public.data_process_task
        where
          name like %(keyword)s
        limit %(pageSize)s offset %(offset)s
    """.strip()

    res = await pg_utils.execute_sql(conn, sql, params)
    return json(res)


async def list_by_count(request, opt={}):
    """Count the data-process tasks whose name matches the request keyword."""
    connection = opt['conn']
    body = request.json

    count_params = {'keyword': '%' + body['keyword'] + '%'}

    count_sql = """
        select
          count(*)
        from
          public.data_process_task
        where
          name like %(keyword)s
    """.strip()

    result = await pg_utils.execute_count_sql(connection, count_sql, count_params)
    return json(result)


async def add(request, opt=None):
    """Insert a new data-process task row.

    The request body supplies the task fields; opt must provide 'conn'
    (an open PostgreSQL connection) and 'id' (the pre-generated task id).
    Returns the result of pg_utils.execute_insert_sql.
    """
    # None default avoids the shared-mutable-default-argument pitfall.
    opt = {} if opt is None else opt
    conn = opt['conn']

    req_json = request.json

    now = datetime.now()
    user = 'admin'
    program = '数据处理任务-新增'

    params = {
        'id': opt['id'],
        'name': req_json['name'],
        'file_type': req_json['file_type'],
        'status': 'processing',
        'pre_data_set_name': req_json['pre_data_set_name'],
        'pre_data_set_version': req_json['pre_data_set_version'],
        # JSON-typed columns are stored as serialized text.
        'file_names': ujson.dumps(req_json['file_names']),
        'post_data_set_name': req_json['post_data_set_name'],
        'post_data_set_version': req_json['post_data_set_version'],
        'data_process_config_info': ujson.dumps(req_json['data_process_config_info']),
        'start_datetime': now,
        'create_datetime': now,
        'create_user': user,
        'create_program': program,
        'update_datetime': now,
        'update_user': user,
        'update_program': program
    }

    # NOTE: the VALUES list must mirror the column list exactly; the
    # previous version swapped create_user/create_program and
    # update_user/update_program, writing each value into the other's column.
    sql = """
        insert into public.data_process_task (
          id,
          name,
          file_type,
          status,
          pre_data_set_name,
          pre_data_set_version,
          file_names,
          post_data_set_name,
          post_data_set_version,
          data_process_config_info,
          start_datetime,
          create_datetime,
          create_user,
          create_program,
          update_datetime,
          update_user,
          update_program
        )
        values (
          %(id)s,
          %(name)s,
          %(file_type)s,
          %(status)s,
          %(pre_data_set_name)s,
          %(pre_data_set_version)s,
          %(file_names)s,
          %(post_data_set_name)s,
          %(post_data_set_version)s,
          %(data_process_config_info)s,
          %(start_datetime)s,
          %(create_datetime)s,
          %(create_user)s,
          %(create_program)s,
          %(update_datetime)s,
          %(update_user)s,
          %(update_program)s
        )
    """.strip()

    return await pg_utils.execute_insert_sql(conn, sql, params)


async def delete_by_id(request, opt={}):
    """Delete a data-process task row identified by the 'id' in the request body."""
    connection = opt['conn']
    body = request.json

    delete_sql = """
        delete from public.data_process_task
        where
          id = %(id)s
    """.strip()

    result = await pg_utils.execute_delete_sql(connection, delete_sql, {'id': body['id']})
    return json(result)


async def update_status_by_id(opt=None):
    """Update the status (and audit columns) of a data-process task row.

    opt must provide 'conn' (an open PostgreSQL connection), 'id' and the
    new 'status'. Returns a sanic JSON response with the update result.
    """
    # None default avoids the shared-mutable-default-argument pitfall.
    opt = {} if opt is None else opt
    conn = opt['conn']

    now = datetime.now()
    user = 'admin'
    program = '修改任务状态'

    params = {
        'id': opt['id'],
        'status': opt['status'],
        'update_datetime': now,
        'update_program': program,
        'update_user': user
    }

    # Fixes two defects in the previous version: it targeted the wrong
    # table (public.dataset instead of public.data_process_task, the table
    # every other query in this module uses) and was missing the comma
    # after the status assignment, which made the statement invalid SQL.
    sql = """
        UPDATE public.data_process_task set
          status = %(status)s,
          update_datetime = %(update_datetime)s,
          update_program = %(update_program)s,
          update_user = %(update_user)s
        WHERE
          id = %(id)s
    """.strip()

    res = await pg_utils.execute_update_sql(conn, sql, params)
    return json(res)
2 changes: 2 additions & 0 deletions data-processing/data_manipulation/file_handle/csv_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import logging
import os

import asyncio

import pandas as pd
import ulid
from transform.text import clean_transform, privacy_transform
Expand Down
Loading

0 comments on commit fe574c5

Please sign in to comment.