From f052495a7043d4d62a8d519488c80f271bf0784f Mon Sep 17 00:00:00 2001 From: bjwswang Date: Thu, 18 Jan 2024 08:58:21 +0000 Subject: [PATCH] chore: rename data-processing to pypi Signed-off-by: bjwswang --- .github/workflows/image_build.yml | 4 +- data-processing/.gitignore | 9 -- data-processing/README.md | 123 ------------------ data-processing/entrypoint.sh | 2 - pypi/data-processing/.gitignore | 9 ++ .../data-processing}/Dockerfile | 0 pypi/data-processing/README.md | 56 ++++++++ .../db-scripts/init-database-schema.sql | 0 pypi/data-processing/entrypoint.sh | 2 + .../data-processing}/requirements.txt | 0 .../data-processing/src}/common/config.py | 9 +- .../data-processing/src}/common/const.py | 0 .../src}/common/log_tag_const.py | 0 .../src}/common/special_characters.py | 0 .../controller/data_process_controller.py | 3 +- .../data_store_clients/minio_store_client.py | 5 +- .../data_store_process/minio_store_process.py | 22 ++-- .../postgresql_pool_client.py | 3 +- .../data_process_db_operate.py | 3 +- .../data_process_detail_db_operate.py | 0 .../data_process_detail_preview_db_operate.py | 0 .../data_process_document_chunk_db_operate.py | 2 + .../data_process_document_db_operate.py | 2 + .../data_process_log_db_operate.py | 1 + .../data_process_stage_log_db_operate.py | 1 + .../src}/file_handle/common_handle.py | 16 ++- .../src}/file_handle/csv_handle.py | 1 + .../src}/file_handle/pdf_handle.py | 9 +- .../src}/file_handle/word_handle.py | 7 +- .../data-processing/src}/kube/client.py | 9 +- .../src}/kube/custom_resources.py | 0 .../data-processing/src}/kube/dataset_cr.py | 0 .../data-processing/src}/kube/minio_cr.py | 1 + .../data-processing/src}/kube/model_cr.py | 3 +- .../src}/kube/postgresql_cr.py | 1 + .../src}/llm_api_service/base_qa_provider.py | 0 .../llm_api_service/qa_provider_open_ai.py | 11 +- .../qa_provider_zhi_pu_ai_online.py | 5 +- .../src}/llm_prompt_template/llm_prompt.py | 0 .../src}/parallel/thread_parallel.py | 0 .../data-processing/src}/server.py | 7 +- .../src}/service/data_process_service.py | 5 +- .../src}/transform/text/clean_transform.py | 3 +- .../transform/text/duplicates_transform.py | 0 .../transform/text/filtration_transform.py | 0 .../src}/transform/text/privacy_transform.py | 0 .../src}/transform/text/support_type.py | 0 .../data-processing/src}/utils/class_utils.py | 0 .../data-processing/src}/utils/csv_utils.py | 0 .../src}/utils/date_time_utils.py | 0 .../data-processing/src}/utils/docx_utils.py | 1 + .../data-processing/src}/utils/file_utils.py | 0 .../data-processing/src}/utils/json_utils.py | 0 .../data-processing/src}/utils/log_utils.py | 0 .../data-processing/src}/utils/pdf_utils.py | 0 .../data-processing/src}/utils/sanic_utils.py | 3 +- pypi/ragas_once/README.md | 3 + 57 files changed, 148 insertions(+), 193 deletions(-) delete mode 100644 data-processing/.gitignore delete mode 100644 data-processing/README.md delete mode 100755 data-processing/entrypoint.sh create mode 100644 pypi/data-processing/.gitignore rename {data-processing => pypi/data-processing}/Dockerfile (100%) create mode 100644 pypi/data-processing/README.md rename {data-processing => pypi/data-processing}/db-scripts/init-database-schema.sql (100%) create mode 100755 pypi/data-processing/entrypoint.sh rename {data-processing => pypi/data-processing}/requirements.txt (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/common/config.py (98%) rename {data-processing/data_manipulation => pypi/data-processing/src}/common/const.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/common/log_tag_const.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/common/special_characters.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/controller/data_process_controller.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/data_store_clients/minio_store_client.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/data_store_process/minio_store_process.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/database_clients/postgresql_pool_client.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/database_operate/data_process_db_operate.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/database_operate/data_process_detail_db_operate.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/database_operate/data_process_detail_preview_db_operate.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/database_operate/data_process_document_chunk_db_operate.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/database_operate/data_process_document_db_operate.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/database_operate/data_process_log_db_operate.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/database_operate/data_process_stage_log_db_operate.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/file_handle/common_handle.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/file_handle/csv_handle.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/file_handle/pdf_handle.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/file_handle/word_handle.py (98%) rename {data-processing/data_manipulation => pypi/data-processing/src}/kube/client.py (97%) rename {data-processing/data_manipulation => pypi/data-processing/src}/kube/custom_resources.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/kube/dataset_cr.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/kube/minio_cr.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/kube/model_cr.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/kube/postgresql_cr.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/llm_api_service/base_qa_provider.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/llm_api_service/qa_provider_open_ai.py (97%) rename {data-processing/data_manipulation => pypi/data-processing/src}/llm_api_service/qa_provider_zhi_pu_ai_online.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/llm_prompt_template/llm_prompt.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/parallel/thread_parallel.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/server.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/service/data_process_service.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/transform/text/clean_transform.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/transform/text/duplicates_transform.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/transform/text/filtration_transform.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/transform/text/privacy_transform.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/transform/text/support_type.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/utils/class_utils.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/utils/csv_utils.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/utils/date_time_utils.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/utils/docx_utils.py (99%) rename {data-processing/data_manipulation => pypi/data-processing/src}/utils/file_utils.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/utils/json_utils.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/utils/log_utils.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/utils/pdf_utils.py (100%) rename {data-processing/data_manipulation => pypi/data-processing/src}/utils/sanic_utils.py (99%) create mode 100644 pypi/ragas_once/README.md diff --git a/.github/workflows/image_build.yml b/.github/workflows/image_build.yml index 0d3ca4de2..a60325f6e 100644 --- a/.github/workflows/image_build.yml +++ b/.github/workflows/image_build.yml @@ -78,8 +78,8 @@ jobs: - name: Build data processing image uses: docker/build-push-action@v5 with: - context: ./data-processing - file: ./data-processing/Dockerfile + context: ./pypi + file: ./pypi/data-processing/Dockerfile platforms: linux/amd64,linux/arm64 tags: | kubeagi/data-processing:latest diff --git a/data-processing/.gitignore b/data-processing/.gitignore deleted file mode 100644 index 650de6cc2..000000000 --- a/data-processing/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -# python -__pycache__ -.ipynb_checkpoints - -data_manipulation/mock_data - -data_manipulation/log - -data_manipulation/file_handle/temp_file \ No newline at end of file diff --git a/data-processing/README.md b/data-processing/README.md deleted file mode 100644 index e6e7aace9..000000000 --- a/data-processing/README.md +++ /dev/null @@ -1,123 +0,0 @@ -# Data Processing - -## Current Version Main Features - -Data Processing is used for data processing through MinIO, databases, Web APIs, etc. The data types handled include: -- txt -- json -- doc -- html -- excel -- csv -- pdf -- markdown -- ppt - -### Current Text Type Processing - -The data processing process includes: cleaning abnormal data, filtering, de-duplication, and anonymization. - -## Design - -![Design](../assets/data_process.drawio.png) - -## Local Development -### Software Requirements - -Before setting up the local data-process environment, please make sure the following software is installed: - -- Python 3.10.x - -### Environment Setup - -Install the Python dependencies in the requirements.txt file - -### Running - -Run the server.py file in the data_manipulation directory - -# isort -isort is a tool for sorting imports alphabetically within your Python code. It helps maintain a consistent and clean import order. - -## install -```shell -pip install isort -``` - -## isort a file -```shell -isort server.py -``` - -## isort a directory -```shell -isort data_manipulation -``` - - -# config.yml -## dev phase -The example config.yml is as the following: -```yaml -minio: - access_key: '${MINIO_ACCESSKEY: hpU4SCmj5jixxx}' - secret_key: '${MINIO_SECRETKEY: xxx}' - api_url: '${MINIO_API_URL: 172.22.96.136.nip.io}' - secure: '${MINIO_SECURE: True}' - dataset_prefix: '${MINIO_DATASET_PREFIX: dataset}' - -llm: - qa_retry_count: '${LLM_QA_RETRY_COUNT: 100}' - -knowledge: - chunk_size: '${KNOWLEDGE_CHUNK_SIZE: 500}' - chunk_overlap: '${KNOWLEDGE_CHUNK_OVERLAP: 50}' - -backendPg: - host: '${PG_HOST: localhost}' - port: '${PG_PORT: 5432}' - user: '${PG_USER: postgres}' - password: '${PG_PASSWORD: 123456}' - database: '${PG_DATABASE: arcadia}' - -kubernetes: - default_config: '${DEFAULT_CONFIG: arcadia-config}' - pod_namespace: '${POD_NAMESPACE: arcadia}' -``` - -\${MINIO_ACCESSKEY: hpU4SCmj5jixxx} - -MINIO_ACCESSKEY is the environment variable name. - -hpU4SCmj5jixxx is the default value if the environment variable is not set. - - -## release phase -The example config.yml is as the following: -```yaml -minio: - access_key: 'hpU4SCmj5jixxx' - secret_key: 'minio_sk' - api_url: '172.22.96.136.nip.io' - secure: 'True' - dataset_prefix: 'dataset' - -llm: - qa_retry_count: '100' - -knowledge: - chunk_size: '500' - chunk_overlap: '50' - -backendPg: - host: 'localhost' - port: '5432' - user: 'admin' - password: '123456' - database: 'arcadia' - -kubernetes: - default_config: 'arcadia-config' - pod_namespace: 'arcadia' -``` -In the K8s, you can use the config map to point to the /arcadia_app/data_manipulation/config.yml file. diff --git a/data-processing/entrypoint.sh b/data-processing/entrypoint.sh deleted file mode 100755 index c5e1cdee8..000000000 --- a/data-processing/entrypoint.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -python /arcadia_app/data_manipulation/server.py \ No newline at end of file diff --git a/pypi/data-processing/.gitignore b/pypi/data-processing/.gitignore new file mode 100644 index 000000000..3246da537 --- /dev/null +++ b/pypi/data-processing/.gitignore @@ -0,0 +1,9 @@ +# python +__pycache__ +.ipynb_checkpoints + +data-processing/src/mock_data + +data-processing/src/log + +data-processing/src/file_handle/temp_file \ No newline at end of file diff --git a/data-processing/Dockerfile b/pypi/data-processing/Dockerfile similarity index 100% rename from data-processing/Dockerfile rename to pypi/data-processing/Dockerfile diff --git a/pypi/data-processing/README.md b/pypi/data-processing/README.md new file mode 100644 index 000000000..8ff1e59ca --- /dev/null +++ b/pypi/data-processing/README.md @@ -0,0 +1,56 @@ +# Data Processing + +## Current Version Main Features + +Data Processing is used for data processing through MinIO, databases, Web APIs, etc. The data types handled include: +- txt +- json +- doc +- html +- excel +- csv +- pdf +- markdown +- ppt + +### Current Text Type Processing + +The data processing process includes: cleaning abnormal data, filtering, de-duplication, and anonymization. + +## Design + +![Design](../assets/data_process.drawio.png) + +## Local Development +### Software Requirements + +Before setting up the local data-process environment, please make sure the following software is installed: + +- Python 3.10.x + +### Environment Setup + +Install the Python dependencies in the requirements.txt file + +### Running + +Run the server.py file in the src directory + +# isort +isort is a tool for sorting imports alphabetically within your Python code. It helps maintain a consistent and clean import order. + +## install +```shell +pip install isort +``` + +## isort a file +```shell +isort src/server.py +``` + +## isort a directory +```shell +isort . +``` + diff --git a/data-processing/db-scripts/init-database-schema.sql b/pypi/data-processing/db-scripts/init-database-schema.sql similarity index 100% rename from data-processing/db-scripts/init-database-schema.sql rename to pypi/data-processing/db-scripts/init-database-schema.sql diff --git a/pypi/data-processing/entrypoint.sh b/pypi/data-processing/entrypoint.sh new file mode 100755 index 000000000..c00d401bf --- /dev/null +++ b/pypi/data-processing/entrypoint.sh @@ -0,0 +1,2 @@ +#!/bin/sh +python /arcadia_app/src/server.py \ No newline at end of file diff --git a/data-processing/requirements.txt b/pypi/data-processing/requirements.txt similarity index 100% rename from data-processing/requirements.txt rename to pypi/data-processing/requirements.txt diff --git a/data-processing/data_manipulation/common/config.py b/pypi/data-processing/src/common/config.py similarity index 98% rename from data-processing/data_manipulation/common/config.py rename to pypi/data-processing/src/common/config.py index 58024538d..0c06325e8 100644 --- a/data-processing/data_manipulation/common/config.py +++ b/pypi/data-processing/src/common/config.py @@ -15,16 +15,13 @@ import logging import os -from pathlib import Path import traceback +from pathlib import Path + import yaml +from kube import minio_cr, model_cr, postgresql_cr from utils.class_utils import Singleton -from kube import ( - minio_cr, - model_cr, - postgresql_cr -) from . import log_tag_const diff --git a/data-processing/data_manipulation/common/const.py b/pypi/data-processing/src/common/const.py similarity index 100% rename from data-processing/data_manipulation/common/const.py rename to pypi/data-processing/src/common/const.py diff --git a/data-processing/data_manipulation/common/log_tag_const.py b/pypi/data-processing/src/common/log_tag_const.py similarity index 100% rename from data-processing/data_manipulation/common/log_tag_const.py rename to pypi/data-processing/src/common/log_tag_const.py diff --git a/data-processing/data_manipulation/common/special_characters.py b/pypi/data-processing/src/common/special_characters.py similarity index 100% rename from data-processing/data_manipulation/common/special_characters.py rename to pypi/data-processing/src/common/special_characters.py diff --git a/data-processing/data_manipulation/controller/data_process_controller.py b/pypi/data-processing/src/controller/data_process_controller.py similarity index 99% rename from data-processing/data_manipulation/controller/data_process_controller.py rename to pypi/data-processing/src/controller/data_process_controller.py index a816d0692..fd1676591 100644 --- a/data-processing/data_manipulation/controller/data_process_controller.py +++ b/pypi/data-processing/src/controller/data_process_controller.py @@ -13,9 +13,10 @@ # limitations under the License. -from file_handle import pdf_handle from sanic import Blueprint from sanic.response import json + +from file_handle import pdf_handle from service import data_process_service from transform.text import support_type diff --git a/data-processing/data_manipulation/data_store_clients/minio_store_client.py b/pypi/data-processing/src/data_store_clients/minio_store_client.py similarity index 99% rename from data-processing/data_manipulation/data_store_clients/minio_store_client.py rename to pypi/data-processing/src/data_store_clients/minio_store_client.py index 37d9c5ecd..95328c0e5 100644 --- a/data-processing/data_manipulation/data_store_clients/minio_store_client.py +++ b/pypi/data-processing/src/data_store_clients/minio_store_client.py @@ -17,11 +17,12 @@ import traceback import urllib3 -from common import log_tag_const -from common.config import config from minio import Minio from minio.commonconfig import Tags from minio.error import S3Error + +from common import log_tag_const +from common.config import config from utils import file_utils logger = logging.getLogger(__name__) diff --git a/data-processing/data_manipulation/data_store_process/minio_store_process.py b/pypi/data-processing/src/data_store_process/minio_store_process.py similarity index 99% rename from data-processing/data_manipulation/data_store_process/minio_store_process.py rename to pypi/data-processing/src/data_store_process/minio_store_process.py index 5b554663e..74a7422ef 100644 --- a/data-processing/data_manipulation/data_store_process/minio_store_process.py +++ b/pypi/data-processing/src/data_store_process/minio_store_process.py @@ -16,28 +16,26 @@ import io import logging import os -import ulid import traceback -import ujson +from pathlib import Path import pandas as pd -from common import log_tag_const, const +import ujson +import ulid + +from common import const, log_tag_const from common.config import config from data_store_clients import minio_store_client from database_operate import (data_process_db_operate, - data_process_document_db_operate, data_process_detail_db_operate, data_process_detail_preview_db_operate, + data_process_document_chunk_db_operate, + data_process_document_db_operate, data_process_log_db_operate, - data_process_stage_log_db_operate, - data_process_document_chunk_db_operate) -from file_handle import (csv_handle, - pdf_handle, - word_handle, - common_handle) + data_process_stage_log_db_operate) +from file_handle import common_handle, csv_handle, pdf_handle, word_handle from kube import dataset_cr -from utils import file_utils, date_time_utils, json_utils -from pathlib import Path +from utils import date_time_utils, file_utils, json_utils logger = logging.getLogger(__name__) diff --git a/data-processing/data_manipulation/database_clients/postgresql_pool_client.py b/pypi/data-processing/src/database_clients/postgresql_pool_client.py similarity index 99% rename from data-processing/data_manipulation/database_clients/postgresql_pool_client.py rename to pypi/data-processing/src/database_clients/postgresql_pool_client.py index a6ba3c464..abffa855d 100644 --- a/data-processing/data_manipulation/database_clients/postgresql_pool_client.py +++ b/pypi/data-processing/src/database_clients/postgresql_pool_client.py @@ -16,9 +16,10 @@ import traceback import psycopg2.extras -from common import log_tag_const from dbutils.pooled_db import PooledDB +from common import log_tag_const + logger = logging.getLogger(__name__) diff --git a/data-processing/data_manipulation/database_operate/data_process_db_operate.py b/pypi/data-processing/src/database_operate/data_process_db_operate.py similarity index 99% rename from data-processing/data_manipulation/database_operate/data_process_db_operate.py rename to pypi/data-processing/src/database_operate/data_process_db_operate.py index 92dee60de..2621f2ab7 100644 --- a/data-processing/data_manipulation/database_operate/data_process_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_db_operate.py @@ -15,8 +15,9 @@ import ujson import ulid -from database_clients import postgresql_pool_client from sanic.response import json + +from database_clients import postgresql_pool_client from utils import date_time_utils diff --git a/data-processing/data_manipulation/database_operate/data_process_detail_db_operate.py b/pypi/data-processing/src/database_operate/data_process_detail_db_operate.py similarity index 100% rename from data-processing/data_manipulation/database_operate/data_process_detail_db_operate.py rename to pypi/data-processing/src/database_operate/data_process_detail_db_operate.py diff --git a/data-processing/data_manipulation/database_operate/data_process_detail_preview_db_operate.py b/pypi/data-processing/src/database_operate/data_process_detail_preview_db_operate.py similarity index 100% rename from data-processing/data_manipulation/database_operate/data_process_detail_preview_db_operate.py rename to pypi/data-processing/src/database_operate/data_process_detail_preview_db_operate.py diff --git a/data-processing/data_manipulation/database_operate/data_process_document_chunk_db_operate.py b/pypi/data-processing/src/database_operate/data_process_document_chunk_db_operate.py similarity index 99% rename from data-processing/data_manipulation/database_operate/data_process_document_chunk_db_operate.py rename to pypi/data-processing/src/database_operate/data_process_document_chunk_db_operate.py index 95611dac2..509b32737 100644 --- a/data-processing/data_manipulation/database_operate/data_process_document_chunk_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_document_chunk_db_operate.py @@ -13,9 +13,11 @@ # limitations under the License. import ulid + from database_clients import postgresql_pool_client from utils import date_time_utils + def add( req_json, pool diff --git a/data-processing/data_manipulation/database_operate/data_process_document_db_operate.py b/pypi/data-processing/src/database_operate/data_process_document_db_operate.py similarity index 99% rename from data-processing/data_manipulation/database_operate/data_process_document_db_operate.py rename to pypi/data-processing/src/database_operate/data_process_document_db_operate.py index c0bea85ab..2a27bbcd0 100644 --- a/data-processing/data_manipulation/database_operate/data_process_document_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_document_db_operate.py @@ -13,9 +13,11 @@ # limitations under the License. import ulid + from database_clients import postgresql_pool_client from utils import date_time_utils + def add( req_json, pool diff --git a/data-processing/data_manipulation/database_operate/data_process_log_db_operate.py b/pypi/data-processing/src/database_operate/data_process_log_db_operate.py similarity index 99% rename from data-processing/data_manipulation/database_operate/data_process_log_db_operate.py rename to pypi/data-processing/src/database_operate/data_process_log_db_operate.py index c50549265..f74d412c4 100644 --- a/data-processing/data_manipulation/database_operate/data_process_log_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_log_db_operate.py @@ -14,6 +14,7 @@ import ujson + from database_clients import postgresql_pool_client from utils import date_time_utils diff --git a/data-processing/data_manipulation/database_operate/data_process_stage_log_db_operate.py b/pypi/data-processing/src/database_operate/data_process_stage_log_db_operate.py similarity index 99% rename from data-processing/data_manipulation/database_operate/data_process_stage_log_db_operate.py rename to pypi/data-processing/src/database_operate/data_process_stage_log_db_operate.py index 4c3463381..4f1a3800b 100644 --- a/data-processing/data_manipulation/database_operate/data_process_stage_log_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_stage_log_db_operate.py @@ -14,6 +14,7 @@ import ulid + from database_clients import postgresql_pool_client from utils import date_time_utils diff --git a/data-processing/data_manipulation/file_handle/common_handle.py b/pypi/data-processing/src/file_handle/common_handle.py similarity index 99% rename from data-processing/data_manipulation/file_handle/common_handle.py rename to pypi/data-processing/src/file_handle/common_handle.py index 45ec1d63f..e252501ff 100644 --- a/data-processing/data_manipulation/file_handle/common_handle.py +++ b/pypi/data-processing/src/file_handle/common_handle.py @@ -13,24 +13,26 @@ # limitations under the License. +import base64 import logging import os import traceback -import base64 import pandas as pd import ulid +from langchain.text_splitter import SpacyTextSplitter + from common import log_tag_const from common.config import config from database_operate import (data_process_detail_db_operate, - data_process_document_db_operate, - data_process_document_chunk_db_operate) -from langchain.text_splitter import SpacyTextSplitter + data_process_document_chunk_db_operate, + data_process_document_db_operate) +from kube import model_cr from llm_api_service.qa_provider_open_ai import QAProviderOpenAI -from llm_api_service.qa_provider_zhi_pu_ai_online import QAProviderZhiPuAIOnline +from llm_api_service.qa_provider_zhi_pu_ai_online import \ + QAProviderZhiPuAIOnline from transform.text import clean_transform, privacy_transform -from utils import csv_utils, file_utils, docx_utils, date_time_utils -from kube import model_cr +from utils import csv_utils, date_time_utils, docx_utils, file_utils logger = logging.getLogger(__name__) diff --git a/data-processing/data_manipulation/file_handle/csv_handle.py b/pypi/data-processing/src/file_handle/csv_handle.py similarity index 99% rename from data-processing/data_manipulation/file_handle/csv_handle.py rename to pypi/data-processing/src/file_handle/csv_handle.py index 071be31f8..339f1e8fd 100644 --- a/data-processing/data_manipulation/file_handle/csv_handle.py +++ b/pypi/data-processing/src/file_handle/csv_handle.py @@ -18,6 +18,7 @@ import pandas as pd import ulid + from common import log_tag_const from transform.text import clean_transform, privacy_transform from utils import csv_utils, date_time_utils, file_utils diff --git a/data-processing/data_manipulation/file_handle/pdf_handle.py b/pypi/data-processing/src/file_handle/pdf_handle.py similarity index 99% rename from data-processing/data_manipulation/file_handle/pdf_handle.py rename to pypi/data-processing/src/file_handle/pdf_handle.py index f9816aead..32dc03378 100644 --- a/data-processing/data_manipulation/file_handle/pdf_handle.py +++ b/pypi/data-processing/src/file_handle/pdf_handle.py @@ -15,16 +15,17 @@ import logging import traceback -import ulid + import ujson +import ulid +from langchain.document_loaders import PyPDFLoader +from langchain.text_splitter import SpacyTextSplitter from common import log_tag_const from common.config import config +from database_operate import data_process_document_chunk_db_operate from file_handle import common_handle from utils import file_utils -from langchain.document_loaders import PyPDFLoader -from langchain.text_splitter import SpacyTextSplitter -from database_operate import data_process_document_chunk_db_operate logger = logging.getLogger(__name__) diff --git a/data-processing/data_manipulation/file_handle/word_handle.py b/pypi/data-processing/src/file_handle/word_handle.py similarity index 98% rename from data-processing/data_manipulation/file_handle/word_handle.py rename to pypi/data-processing/src/file_handle/word_handle.py index 8e8207700..ffacfc0c1 100644 --- a/data-processing/data_manipulation/file_handle/word_handle.py +++ b/pypi/data-processing/src/file_handle/word_handle.py @@ -15,14 +15,15 @@ import logging import traceback + import ulid +from langchain.text_splitter import SpacyTextSplitter from common import log_tag_const from common.config import config -from file_handle import common_handle -from utils import file_utils, docx_utils -from langchain.text_splitter import SpacyTextSplitter from database_operate import data_process_document_chunk_db_operate +from file_handle import common_handle +from utils import docx_utils, file_utils logger = logging.getLogger(__name__) diff --git a/data-processing/data_manipulation/kube/client.py b/pypi/data-processing/src/kube/client.py similarity index 97% rename from data-processing/data_manipulation/kube/client.py rename to pypi/data-processing/src/kube/client.py index 921c2ad31..9d79c055d 100644 --- a/data-processing/data_manipulation/kube/client.py +++ b/pypi/data-processing/src/kube/client.py @@ -17,14 +17,15 @@ import os import traceback -from common import log_tag_const from kubernetes import client, config -from kubernetes.client import CustomObjectsApi, CoreV1Api +from kubernetes.client import CoreV1Api, CustomObjectsApi + +from common import log_tag_const from .custom_resources import (arcadia_resource_datasets, arcadia_resource_datasources, - arcadia_resource_versioneddatasets, - arcadia_resource_models) + arcadia_resource_models, + arcadia_resource_versioneddatasets) logger = logging.getLogger(__name__) diff --git a/data-processing/data_manipulation/kube/custom_resources.py b/pypi/data-processing/src/kube/custom_resources.py similarity index 100% rename from data-processing/data_manipulation/kube/custom_resources.py rename to pypi/data-processing/src/kube/custom_resources.py diff --git a/data-processing/data_manipulation/kube/dataset_cr.py b/pypi/data-processing/src/kube/dataset_cr.py similarity index 100% rename from data-processing/data_manipulation/kube/dataset_cr.py rename to pypi/data-processing/src/kube/dataset_cr.py diff --git a/data-processing/data_manipulation/kube/minio_cr.py b/pypi/data-processing/src/kube/minio_cr.py similarity index 99% rename from data-processing/data_manipulation/kube/minio_cr.py rename to pypi/data-processing/src/kube/minio_cr.py index 0cf484d2c..0cc226202 100644 --- a/data-processing/data_manipulation/kube/minio_cr.py +++ b/pypi/data-processing/src/kube/minio_cr.py @@ -15,6 +15,7 @@ import base64 import logging import traceback + import yaml from . import client diff --git a/data-processing/data_manipulation/kube/model_cr.py b/pypi/data-processing/src/kube/model_cr.py similarity index 99% rename from data-processing/data_manipulation/kube/model_cr.py rename to pypi/data-processing/src/kube/model_cr.py index ed55bab19..171125d48 100644 --- a/data-processing/data_manipulation/kube/model_cr.py +++ b/pypi/data-processing/src/kube/model_cr.py @@ -13,9 +13,10 @@ # limitations under the License. import logging -import yaml import traceback +import yaml + from utils import date_time_utils from . import client diff --git a/data-processing/data_manipulation/kube/postgresql_cr.py b/pypi/data-processing/src/kube/postgresql_cr.py similarity index 99% rename from data-processing/data_manipulation/kube/postgresql_cr.py rename to pypi/data-processing/src/kube/postgresql_cr.py index 426748cb7..bc2606d75 100644 --- a/data-processing/data_manipulation/kube/postgresql_cr.py +++ b/pypi/data-processing/src/kube/postgresql_cr.py @@ -14,6 +14,7 @@ import logging import traceback + import yaml from . import client diff --git a/data-processing/data_manipulation/llm_api_service/base_qa_provider.py b/pypi/data-processing/src/llm_api_service/base_qa_provider.py similarity index 100% rename from data-processing/data_manipulation/llm_api_service/base_qa_provider.py rename to pypi/data-processing/src/llm_api_service/base_qa_provider.py diff --git a/data-processing/data_manipulation/llm_api_service/qa_provider_open_ai.py b/pypi/data-processing/src/llm_api_service/qa_provider_open_ai.py similarity index 97% rename from data-processing/data_manipulation/llm_api_service/qa_provider_open_ai.py rename to pypi/data-processing/src/llm_api_service/qa_provider_open_ai.py index d7668d993..cefb1c726 100644 --- a/data-processing/data_manipulation/llm_api_service/qa_provider_open_ai.py +++ b/pypi/data-processing/src/llm_api_service/qa_provider_open_ai.py @@ -18,14 +18,13 @@ import time import traceback +from langchain import LLMChain +from langchain.chat_models import ChatOpenAI +from langchain.prompts.chat import (ChatPromptTemplate, + HumanMessagePromptTemplate) + from common import log_tag_const from common.config import config -from langchain.chat_models import ChatOpenAI -from langchain import LLMChain -from langchain.prompts.chat import ( - ChatPromptTemplate, - HumanMessagePromptTemplate, -) from llm_prompt_template import llm_prompt from .base_qa_provider import BaseQAProvider diff --git a/data-processing/data_manipulation/llm_api_service/qa_provider_zhi_pu_ai_online.py b/pypi/data-processing/src/llm_api_service/qa_provider_zhi_pu_ai_online.py similarity index 99% rename from data-processing/data_manipulation/llm_api_service/qa_provider_zhi_pu_ai_online.py rename to pypi/data-processing/src/llm_api_service/qa_provider_zhi_pu_ai_online.py index 078c2877b..8ca966059 100644 --- a/data-processing/data_manipulation/llm_api_service/qa_provider_zhi_pu_ai_online.py +++ b/pypi/data-processing/src/llm_api_service/qa_provider_zhi_pu_ai_online.py @@ -15,11 +15,12 @@ import logging import re -import traceback import time +import traceback import zhipuai -from common import log_tag_const, const + +from common import const, log_tag_const from common.config import config from llm_prompt_template import llm_prompt diff --git a/data-processing/data_manipulation/llm_prompt_template/llm_prompt.py b/pypi/data-processing/src/llm_prompt_template/llm_prompt.py similarity index 100% rename from data-processing/data_manipulation/llm_prompt_template/llm_prompt.py rename to pypi/data-processing/src/llm_prompt_template/llm_prompt.py diff --git a/data-processing/data_manipulation/parallel/thread_parallel.py b/pypi/data-processing/src/parallel/thread_parallel.py similarity index 100% rename from data-processing/data_manipulation/parallel/thread_parallel.py rename to pypi/data-processing/src/parallel/thread_parallel.py diff --git a/data-processing/data_manipulation/server.py b/pypi/data-processing/src/server.py similarity index 99% rename from data-processing/data_manipulation/server.py rename to pypi/data-processing/src/server.py index c69b2e781..0b168d84b 100644 --- a/data-processing/data_manipulation/server.py +++ b/pypi/data-processing/src/server.py @@ -18,13 +18,14 @@ import time import psycopg2 +from sanic import Sanic +from sanic.response import json +from sanic_cors import CORS + from common import log_tag_const from common.config import config from controller import data_process_controller from database_clients import postgresql_pool_client -from sanic import Sanic -from sanic.response import json -from sanic_cors import CORS from utils import log_utils, sanic_utils # Initialize the log config diff --git a/data-processing/data_manipulation/service/data_process_service.py b/pypi/data-processing/src/service/data_process_service.py similarity index 99% rename from data-processing/data_manipulation/service/data_process_service.py rename to pypi/data-processing/src/service/data_process_service.py index a60f9c60f..6939ed8b1 100644 --- a/data-processing/data_manipulation/service/data_process_service.py +++ b/pypi/data-processing/src/service/data_process_service.py @@ -18,18 +18,19 @@ import traceback import ulid + from common import log_tag_const from data_store_process import minio_store_process from database_operate import (data_process_db_operate, data_process_detail_db_operate, - data_process_document_db_operate, data_process_detail_preview_db_operate, data_process_document_chunk_db_operate, + data_process_document_db_operate, data_process_log_db_operate, data_process_stage_log_db_operate) +from kube import model_cr from parallel import thread_parallel from utils import date_time_utils -from kube import model_cr logger = logging.getLogger(__name__) diff --git a/data-processing/data_manipulation/transform/text/clean_transform.py b/pypi/data-processing/src/transform/text/clean_transform.py similarity index 99% rename from data-processing/data_manipulation/transform/text/clean_transform.py rename to pypi/data-processing/src/transform/text/clean_transform.py index 9b8e18660..8deb471ed 100644 --- a/data-processing/data_manipulation/transform/text/clean_transform.py +++ b/pypi/data-processing/src/transform/text/clean_transform.py @@ -19,9 +19,10 @@ import ftfy import opencc -from common import log_tag_const, special_characters from selectolax.parser import HTMLParser +from common import log_tag_const, special_characters + logger = logging.getLogger(__name__) diff --git a/data-processing/data_manipulation/transform/text/duplicates_transform.py b/pypi/data-processing/src/transform/text/duplicates_transform.py similarity index 100% rename from data-processing/data_manipulation/transform/text/duplicates_transform.py rename to pypi/data-processing/src/transform/text/duplicates_transform.py diff --git a/data-processing/data_manipulation/transform/text/filtration_transform.py b/pypi/data-processing/src/transform/text/filtration_transform.py similarity index 100% rename from data-processing/data_manipulation/transform/text/filtration_transform.py rename to pypi/data-processing/src/transform/text/filtration_transform.py diff --git a/data-processing/data_manipulation/transform/text/privacy_transform.py b/pypi/data-processing/src/transform/text/privacy_transform.py similarity index 100% rename from data-processing/data_manipulation/transform/text/privacy_transform.py rename to pypi/data-processing/src/transform/text/privacy_transform.py diff --git a/data-processing/data_manipulation/transform/text/support_type.py b/pypi/data-processing/src/transform/text/support_type.py similarity index 100% rename from data-processing/data_manipulation/transform/text/support_type.py rename to pypi/data-processing/src/transform/text/support_type.py diff --git a/data-processing/data_manipulation/utils/class_utils.py b/pypi/data-processing/src/utils/class_utils.py similarity index 100% rename from data-processing/data_manipulation/utils/class_utils.py rename to pypi/data-processing/src/utils/class_utils.py diff --git a/data-processing/data_manipulation/utils/csv_utils.py b/pypi/data-processing/src/utils/csv_utils.py similarity index 100% rename from data-processing/data_manipulation/utils/csv_utils.py rename to pypi/data-processing/src/utils/csv_utils.py diff --git a/data-processing/data_manipulation/utils/date_time_utils.py b/pypi/data-processing/src/utils/date_time_utils.py similarity index 100% rename from data-processing/data_manipulation/utils/date_time_utils.py rename to pypi/data-processing/src/utils/date_time_utils.py diff --git a/data-processing/data_manipulation/utils/docx_utils.py b/pypi/data-processing/src/utils/docx_utils.py similarity index 99% rename from data-processing/data_manipulation/utils/docx_utils.py rename to pypi/data-processing/src/utils/docx_utils.py index 18929dbdb..2e8e1591e 100644 --- a/data-processing/data_manipulation/utils/docx_utils.py +++ b/pypi/data-processing/src/utils/docx_utils.py @@ -14,6 +14,7 @@ import docx + def get_content( file_path ): diff --git a/data-processing/data_manipulation/utils/file_utils.py b/pypi/data-processing/src/utils/file_utils.py similarity index 100% rename from data-processing/data_manipulation/utils/file_utils.py rename to pypi/data-processing/src/utils/file_utils.py diff --git a/data-processing/data_manipulation/utils/json_utils.py b/pypi/data-processing/src/utils/json_utils.py similarity index 100% rename from data-processing/data_manipulation/utils/json_utils.py rename to pypi/data-processing/src/utils/json_utils.py diff --git a/data-processing/data_manipulation/utils/log_utils.py b/pypi/data-processing/src/utils/log_utils.py similarity index 100% rename from data-processing/data_manipulation/utils/log_utils.py rename to pypi/data-processing/src/utils/log_utils.py diff --git a/data-processing/data_manipulation/utils/pdf_utils.py b/pypi/data-processing/src/utils/pdf_utils.py similarity index 100% rename from data-processing/data_manipulation/utils/pdf_utils.py rename to pypi/data-processing/src/utils/pdf_utils.py diff --git a/data-processing/data_manipulation/utils/sanic_utils.py b/pypi/data-processing/src/utils/sanic_utils.py similarity index 99% rename from data-processing/data_manipulation/utils/sanic_utils.py rename to pypi/data-processing/src/utils/sanic_utils.py index 5c910c006..b0e83f006 100644 --- a/data-processing/data_manipulation/utils/sanic_utils.py +++ b/pypi/data-processing/src/utils/sanic_utils.py @@ -16,10 +16,11 @@ import logging import traceback -from common import log_tag_const from sanic.handlers import ErrorHandler from sanic.response import json +from common import log_tag_const + logger = logging.getLogger(__name__) diff --git a/pypi/ragas_once/README.md b/pypi/ragas_once/README.md new file mode 100644 index 000000000..1001aaddf --- /dev/null +++ b/pypi/ragas_once/README.md @@ -0,0 +1,3 @@ +# ragas-once + +A one-stop cli for ragas(RAG Evaluation) \ No newline at end of file