Skip to content

Commit

Permalink
Merge pull request #576 from bjwswang/pypi
Browse files Browse the repository at this point in the history
chore: rename data-processing to pypi
  • Loading branch information
bjwswang authored Jan 19, 2024
2 parents 402461d + f052495 commit f5c3828
Show file tree
Hide file tree
Showing 57 changed files with 148 additions and 193 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/image_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ jobs:
- name: Build data processing image
uses: docker/build-push-action@v5
with:
context: ./data-processing
file: ./data-processing/Dockerfile
context: ./pypi
file: ./pypi/data-processing/Dockerfile
platforms: linux/amd64,linux/arm64
tags: |
kubeagi/data-processing:latest
Expand Down
9 changes: 0 additions & 9 deletions data-processing/.gitignore

This file was deleted.

123 changes: 0 additions & 123 deletions data-processing/README.md

This file was deleted.

2 changes: 0 additions & 2 deletions data-processing/entrypoint.sh

This file was deleted.

9 changes: 9 additions & 0 deletions pypi/data-processing/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# python
__pycache__
.ipynb_checkpoints

data-processing/src/mock_data

data-processing/src/log

data-processing/src/file_handle/temp_file
File renamed without changes.
56 changes: 56 additions & 0 deletions pypi/data-processing/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Data Processing

## Current Version Main Features

Data Processing processes data obtained through MinIO, databases, Web APIs, etc. The data types handled include:
- txt
- json
- doc
- html
- excel
- csv
- pdf
- markdown
- ppt

### Current Text Type Processing

The data processing workflow includes: cleaning abnormal data, filtering, de-duplication, and anonymization.

## Design

![Design](../assets/data_process.drawio.png)

## Local Development
### Software Requirements

Before setting up the local data-processing environment, please make sure the following software is installed:

- Python 3.10.x

### Environment Setup

Install the Python dependencies listed in the requirements.txt file, e.g. `pip install -r requirements.txt`.

### Running

Run the server.py file in the src directory, e.g. `python src/server.py`.

# isort
isort is a tool for sorting imports alphabetically within your Python code. It helps maintain a consistent and clean import order.

## install
```shell
pip install isort
```

## isort a file
```shell
isort src/server.py
```

## isort a directory
```shell
isort .
```

2 changes: 2 additions & 0 deletions pypi/data-processing/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/sh
python /arcadia_app/src/server.py
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,13 @@

import logging
import os
from pathlib import Path
import traceback
from pathlib import Path

import yaml

from kube import minio_cr, model_cr, postgresql_cr
from utils.class_utils import Singleton
from kube import (
minio_cr,
model_cr,
postgresql_cr
)

from . import log_tag_const

Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@
# limitations under the License.


from file_handle import pdf_handle
from sanic import Blueprint
from sanic.response import json

from file_handle import pdf_handle
from service import data_process_service
from transform.text import support_type

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@
import traceback

import urllib3
from common import log_tag_const
from common.config import config
from minio import Minio
from minio.commonconfig import Tags
from minio.error import S3Error

from common import log_tag_const
from common.config import config
from utils import file_utils

logger = logging.getLogger(__name__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,26 @@
import io
import logging
import os
import ulid
import traceback
import ujson
from pathlib import Path

import pandas as pd
from common import log_tag_const, const
import ujson
import ulid

from common import const, log_tag_const
from common.config import config
from data_store_clients import minio_store_client
from database_operate import (data_process_db_operate,
data_process_document_db_operate,
data_process_detail_db_operate,
data_process_detail_preview_db_operate,
data_process_document_chunk_db_operate,
data_process_document_db_operate,
data_process_log_db_operate,
data_process_stage_log_db_operate,
data_process_document_chunk_db_operate)
from file_handle import (csv_handle,
pdf_handle,
word_handle,
common_handle)
data_process_stage_log_db_operate)
from file_handle import common_handle, csv_handle, pdf_handle, word_handle
from kube import dataset_cr
from utils import file_utils, date_time_utils, json_utils
from pathlib import Path
from utils import date_time_utils, file_utils, json_utils

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
import traceback

import psycopg2.extras
from common import log_tag_const
from dbutils.pooled_db import PooledDB

from common import log_tag_const

logger = logging.getLogger(__name__)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@

import ujson
import ulid
from database_clients import postgresql_pool_client
from sanic.response import json

from database_clients import postgresql_pool_client
from utils import date_time_utils


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
# limitations under the License.

import ulid

from database_clients import postgresql_pool_client
from utils import date_time_utils


def add(
req_json,
pool
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
# limitations under the License.

import ulid

from database_clients import postgresql_pool_client
from utils import date_time_utils


def add(
req_json,
pool
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@


import ujson

from database_clients import postgresql_pool_client
from utils import date_time_utils

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@


import ulid

from database_clients import postgresql_pool_client
from utils import date_time_utils

Expand Down
Loading

0 comments on commit f5c3828

Please sign in to comment.