diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 000000000..c6dcdc0e8 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,634 @@ +[MAIN] + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Clear in-memory caches upon conclusion of linting. Useful if running pylint +# in a server-like mode. +clear-cache-post-run=no + +# Load and enable all available extensions. Use --list-extensions to see a list +# all available extensions. +#enable-all-extensions= + +# In error mode, messages with a category besides ERROR or FATAL are +# suppressed, and no reports are done by default. Error mode is compatible with +# disabling specific errors. +#errors-only= + +# Always return a 0 (non-error) status code, even if lint errors are found. +# This is primarily useful in continuous integration scripts. +#exit-zero= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-allow-list= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +extension-pkg-whitelist= + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + +# Specify a score threshold under which the program will exit with error. +fail-under=10 + +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +#from-stdin= + +# Files or directories to be skipped. They should be base names, not paths. +ignore=CVS + +# Add files or directories matching the regular expressions patterns to the +# ignore-list. The regex matches against paths and can be in Posix or Windows +# format. Because '\\' represents the directory delimiter on Windows systems, +# it can't be used as an escape character. +ignore-paths= + +# Files or directories matching the regular expression patterns are skipped. +# The regex matches against base names, not paths. The default value ignores +# Emacs file locks +ignore-patterns=^\.# + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use, and will cap the count on Windows to +# avoid hangs. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. 
+limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.11 + +# Discover python modules and packages in the file system subtree. +recursive=no + +# Add paths to the list of the source roots. Supports globbing patterns. The +# source root is an absolute path or a path relative to the current working +# directory used to determine a package namespace for modules located under the +# source root. +source-roots= + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# In verbose mode, extra non-checker-related info will be displayed. +#verbose= + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. If left empty, argument names will be checked with the set +# naming style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. +#class-attribute-rgx= + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +#class-const-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. If left empty, class names will be checked with the set naming style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. If left empty, function names will be checked with the set +# naming style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. 
+good-names=i, + j, + k, + ex, + Run, + _ + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. If left empty, method names will be checked with the set naming style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. If left empty, module names will be checked with the set naming style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Regular expression matching correct type alias names. If left empty, type +# alias names will be checked with the set naming style. +#typealias-rgx= + +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +#typevar-rgx= + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. If left empty, variable names will be checked with the set +# naming style. +#variable-rgx= + + +[CLASSES] + +# Warn about protected attribute access inside special methods +check-protected-access-in-special-methods=no + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + asyncSetUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[DESIGN] + +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +exclude-too-few-public-methods= + +# List of qualified class names to ignore when counting class parents (see +# R0901) +ignored-parents= + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). 
+max-parents=7
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when caught.
+overgeneral-exceptions=builtins.BaseException,builtins.Exception
+
+
+[FORMAT]
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+# tab).
+indent-string='    '
+
+# Maximum number of characters on a single line.
+max-line-length=100
+
+# Maximum number of lines in a module.
+max-module-lines=1000
+
+# Allow the body of a class to be on the same line as the declaration if body
+# contains single statement.
+single-line-class-stmt=no
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+
+[IMPORTS]
+
+# List of modules that can be imported at any level, not just the top level
+# one.
+allow-any-import-level=
+
+# Allow explicit reexports by alias from a package __init__.
+allow-reexport-from-package=no
+
+# Allow wildcard imports from modules that define __all__.
+allow-wildcard-with-all=no
+
+# Deprecated modules which should not be used, separated by a comma.
+deprecated-modules=
+
+# Output a graph (.gv or any supported image format) of external dependencies
+# to the given file (report RP0402 must not be disabled).
+ext-import-graph=
+
+# Output a graph (.gv or any supported image format) of all (i.e. internal and
+# external) dependencies to the given file (report RP0402 must not be
+# disabled).
+import-graph=
+
+# Output a graph (.gv or any supported image format) of internal dependencies
+# to the given file (report RP0402 must not be disabled).
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+# Couples of modules and preferred modules, separated by a comma.
+preferred-modules=
+
+
+[LOGGING]
+
+# The type of string formatting that logging methods do. `old` means using %
+# formatting, `new` is for `{}` formatting.
+logging-format-style=old
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format.
+logging-modules=logging
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
+# UNDEFINED.
+confidence=HIGH,
+           CONTROL_FLOW,
+           INFERENCE,
+           INFERENCE_FAILURE,
+           UNDEFINED
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then re-enable specific checks.
For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + use-implicit-booleaness-not-comparison-to-string, + use-implicit-booleaness-not-comparison-to-zero + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable= + + +[METHOD_ARGS] + +# List of qualified names (i.e., library.method) which require a timeout +# parameter e.g. 'requests.api.get,requests.api.post' +timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +notes-rgx= + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit,argparse.parse_error + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each +# category, as well as 'statement' which is the total number of statements +# analyzed. This score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +msg-template= + +# Set the output format. Available formats are: text, parseable, colorized, +# json2 (improved json format), json (old json format) and msvs (visual +# studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +#output-format= + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[SIMILARITIES] + +# Comments are removed from the similarity computation +ignore-comments=yes + +# Docstrings are removed from the similarity computation +ignore-docstrings=yes + +# Imports are removed from the similarity computation +ignore-imports=yes + +# Signatures are removed from the similarity computation +ignore-signatures=yes + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. No available dictionaries : You need to install +# both the python package and the system dependency for enchant to work. 
+spelling-dict= + +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. +spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +spelling-store-unknown-words=no + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins=no-member, + not-async-context-manager, + not-context-manager, + attribute-defined-outside-init + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# Regex pattern to define which classes are considered mixins. +mixin-class-rgx=.*[Mm]ixin + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. 
+allow-global-unused-variables=yes
+
+# List of names allowed to shadow builtins
+allowed-redefined-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,
+          _cb
+
+# A regular expression matching the name of dummy variables (i.e. expected to
+# not be used).
+dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+# Argument names that match this expression will be ignored.
+ignored-argument-names=_.*|^ignored_|^unused_
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
diff --git a/Makefile b/Makefile
index c6bb577bf..841ccbcc2 100644
--- a/Makefile
+++ b/Makefile
@@ -252,4 +252,12 @@ arctl: fmt vet ## Build manager binary.
 gql-gen:
 	@go run github.com/99designs/gqlgen@v0.17.40 generate
 build-graphql-server: gql-gen
-	@CGO_ENABLED=0 GOOS=linux go build -o bin/graphql-server graphql-server/go-server/main.go
\ No newline at end of file
+	@CGO_ENABLED=0 GOOS=linux go build -o bin/graphql-server graphql-server/go-server/main.go
+
+
+# Commands for Data-Processing
+DATA_PROCESSING_IMAGE ?= kubebb/dp-base
+
+.PHONY: docker-build-dp-base
+docker-build-dp-base:
+	docker build -f ./data-process/Dockerfile.base -t $(DATA_PROCESSING_IMAGE):$(VERSION) ./data-process/
diff --git a/data-process/Dockerfile.base b/data-process/Dockerfile.base
new file mode 100644
index 000000000..4ab63694d
--- /dev/null
+++ b/data-process/Dockerfile.base
@@ -0,0 +1,14 @@
+FROM python:3.10.13-slim
+
+ENV TZ=Asia/Shanghai
+
+RUN sed -i 's/deb.debian.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list.d/debian.sources
+
+RUN export DEBIAN_FRONTEND=noninteractive \
+    && apt-get update \
+    && apt-get install -y tzdata \
+    && ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
+    && dpkg-reconfigure --frontend noninteractive tzdata \
+    && apt-get install -y python3-distutils curl python3-pip
+
+WORKDIR /happy_work_space
\ No newline at end of file
diff --git a/data-process/README.md b/data-process/README.md
index ab0485585..e248bafb9 100644
--- a/data-process/README.md
+++ b/data-process/README.md
@@ -1,2 +1,49 @@
-# Data Process
-The current documentation is only available in Chinese. Please refer to the content in .zh.md for specific details.
\ No newline at end of file
+# Data Processing
+
+## Main Features of the Current Version
+
+Data Processing handles data drawn from MinIO, databases, Web APIs, and similar sources. The supported data types include:
+- txt
+- json
+- doc
+- html
+- excel
+- csv
+- pdf
+- markdown
+- ppt
+
+### Current Text Data Processing
+
+Processing text data currently includes cleaning abnormal data, filtering, de-duplication, and anonymization.
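+
+Once the service is running (see Local Development below), you can exercise it over its HTTP API. A minimal sketch: the port matches the default in `server.py`, while the bucket name, folder prefix, and `support_type` payload are placeholders; the accepted processing types come from `transform.text.support_type.support_types`:
+
+```shell
+# List the text processing types the service supports
+curl -X POST http://localhost:28888/text-process-type
+
+# Clean the files stored under a MinIO prefix (placeholder values)
+curl -X POST http://localhost:28888/text-manipulate \
+  -H 'Content-Type: application/json' \
+  -d '{"bucket_name": "my-bucket", "folder_prefix": "dataset", "support_type": []}'
+```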
+ +## Design + +![Design](../assets/data_process.drawio.png) + +## Local Development +### Software Requirements + +Before setting up the local data-process environment, please make sure the following software is installed: + +- Python 3.10.x + +### Environment Setup + +Install the Python dependencies in the requirements.txt file + +### Running + +Run the server.py file in the data_manipulation directory \ No newline at end of file diff --git a/data-process/data_manipulation/common/config.py b/data-process/data_manipulation/common/config.py index c086fba28..6a21b72a8 100644 --- a/data-process/data_manipulation/common/config.py +++ b/data-process/data_manipulation/common/config.py @@ -21,4 +21,4 @@ minio_secure = os.getenv('MINIO_SECURE', False) # zhipuai api_key -zhipuai_api_key = os.getenv('ZHIPUAI_API_KEY', 'xxxxx') \ No newline at end of file +zhipuai_api_key = os.getenv('ZHIPUAI_API_KEY', 'xxxxx') diff --git a/data-process/data_manipulation/file_handle/csv_handle.py b/data-process/data_manipulation/file_handle/csv_handle.py index e689bdc99..d1a8de73f 100644 --- a/data-process/data_manipulation/file_handle/csv_handle.py +++ b/data-process/data_manipulation/file_handle/csv_handle.py @@ -24,20 +24,13 @@ ### import csv -import ulid -import pandas as pd -import os import logging +import os -from transform.text import ( - clean_transform, - privacy_transform -) - -from utils import ( - date_time_utils, - file_utils -) +import pandas as pd +import ulid +from transform.text import clean_transform, privacy_transform +from utils import date_time_utils, file_utils logger = logging.getLogger('csv_handle') @@ -51,6 +44,8 @@ # content: # 1) 基本功能实现 ### + + async def text_manipulate(opt={}): logger.info("csv text manipulate!") @@ -59,7 +54,7 @@ async def text_manipulate(opt={}): 处理某条数据时,如果某个方式(比如:去除不可见字符)处理失败了,则直接结束,不在处理,整个文件都视作处理失败 """ - + try: file_name = opt['file_name'] support_type = opt['support_type'] @@ -87,7 +82,6 @@ async def text_manipulate(opt={}): text_data = clean_result['data'] - # 将清洗后的文件保存为final new_file_name = await file_utils.get_file_name({ 'file_name': file_name, @@ -159,7 +153,7 @@ async def data_clean(opt={}): }) logger.info("csv text data clean stop!") - + return { 'status': 200, 'message': '', @@ -179,8 +173,8 @@ async def data_clean(opt={}): ### async def remove_invisible_characters(opt={}): return await clean_transform.remove_invisible_characters({ - 'text': opt['text'] - }) + 'text': opt['text'] + }) ### # 去除邮箱地址 @@ -192,10 +186,12 @@ async def remove_invisible_characters(opt={}): # content: # 1) 基本功能实现 ### + + async def remove_email(opt={}): return await privacy_transform.remove_email({ - 'text': opt['text'] - }) + 'text': opt['text'] + }) ### # 将数据存到CSV中 @@ -207,6 +203,8 @@ async def remove_email(opt={}): # content: # 1) 基本功能实现 ### + + async def save_csv(opt={}): file_name = opt['file_name'] phase_value = opt['phase_value'] diff --git a/data-process/data_manipulation/server.py b/data-process/data_manipulation/server.py index 0059e6340..bd652a425 100644 --- a/data-process/data_manipulation/server.py +++ b/data-process/data_manipulation/server.py @@ -24,36 +24,22 @@ # 1) 基本功能实现 ### -from sanic import Sanic -from sanic.response import json, text -from sanic_cors import CORS, cross_origin -from sanic.exceptions import NotFound - import asyncio -import aiohttp - -import sys - import logging - -from service import ( - minio_store_process_service -) - -from transform.text import ( - support_type -) - -from utils import ( - log_utils -) +from sanic import Sanic +from 
sanic.response import json +from sanic_cors import CORS +from service import minio_store_process_service +from transform.text import support_type +from utils import log_utils ### # 初始化日志配置 ### log_utils.init_config({ - 'source_type': 'manipulate_server' + 'source_type': 'manipulate_server', + 'log_dir': "log" }) @@ -62,7 +48,7 @@ app = Sanic(name='data_manipulate') CORS(app) -app.config['REQUEST_MAX_SIZE'] = 1024 * 1024 * 1024 # 1G +app.config['REQUEST_MAX_SIZE'] = 1024 * 1024 * 1024 # 1G app.config['REQUEST_TIMEOUT'] = 60 * 60 * 60 app.config['RESPONSE_TIMEOUT'] = 60 * 60 * 60 app.config['KEEP_ALIVE_TIMEOUT'] = 60 * 60 * 60 @@ -77,6 +63,8 @@ # content: # 1) 基本功能实现 ### + + @app.route('text-manipulate', methods=['POST']) async def text_manipulate(request): """ @@ -87,7 +75,7 @@ async def text_manipulate(request): file_path: 文本路径 Returns: - + """ await asyncio.create_task( @@ -110,13 +98,14 @@ async def text_manipulate(request): # content: # 1) 基本功能实现 ### + + @app.route('text-process-type', methods=['POST']) async def text_process_type(request): """ 获取数据处理支持的类型 Args: - Returns: json: 支持的类型 @@ -127,11 +116,10 @@ async def text_process_type(request): 'message': '', 'data': support_type.support_types }) - if __name__ == '__main__': app.run(host='0.0.0.0', port=28888, access_log=True, debug=True, - workers=2) \ No newline at end of file + workers=2) diff --git a/data-process/data_manipulation/service/minio_store_process_service.py b/data-process/data_manipulation/service/minio_store_process_service.py index beba735ba..c506e0885 100644 --- a/data-process/data_manipulation/service/minio_store_process_service.py +++ b/data-process/data_manipulation/service/minio_store_process_service.py @@ -23,24 +23,17 @@ # 1) 基本功能实现 ### -from sanic.response import json, raw -from minio import Minio -from minio.commonconfig import Tags -from minio.error import S3Error -import pandas as pd import io -import os - import logging +import os -from file_handle import ( - csv_handle -) - -from utils import ( - minio_utils, - file_utils -) +import pandas as pd +from file_handle import csv_handle +from minio import Minio +from minio.commonconfig import Tags +from minio.error import S3Error +from sanic.response import json, raw +from utils import file_utils, minio_utils logger = logging.getLogger('minio_store_process_service') @@ -54,6 +47,8 @@ # content: # 1) 基本功能实现 ### + + async def text_manipulate(request): request_json = request.json @@ -63,7 +58,7 @@ async def text_manipulate(request): # create minio client minio_client = await minio_utils.create_client() - + # 查询存储桶下的所有对象 objects = minio_client.list_objects(bucket_name, prefix=folder_prefix) @@ -81,9 +76,9 @@ async def text_manipulate(request): if file_extension in ['csv']: # 处理CSV文件 result = await csv_handle.text_manipulate({ - 'file_name': item, - 'support_type': support_type - }) + 'file_name': item, + 'support_type': support_type + }) # 将清洗后的文件上传到MinIO中 # 上传middle文件夹下的文件,并添加tag @@ -112,7 +107,7 @@ async def text_manipulate(request): for item in file_names: remove_file_path = await file_utils.get_temp_file_path() await file_utils.delete_file(remove_file_path + 'original/' + item) - + return json({ 'status': 200, 'message': '', @@ -129,6 +124,8 @@ async def text_manipulate(request): # content: # 1) 基本功能实现 ### + + async def download(opt={}): objects = opt['objects'] minio_client = opt['minio_client'] @@ -160,17 +157,21 @@ async def download(opt={}): # content: # 1) 基本功能实现 ### + + async def upload_files_to_minio_with_tags(minio_client, local_folder, minio_bucket, 
minio_prefix="", tags=None): for root, dirs, files in os.walk(local_folder): for file in files: local_file_path = os.path.join(root, file) - minio_object_name = os.path.join(minio_prefix, os.path.relpath(local_file_path, local_folder)) - + minio_object_name = os.path.join( + minio_prefix, os.path.relpath(local_file_path, local_folder)) + try: - minio_client.fput_object(minio_bucket, minio_object_name, local_file_path, tags=tags) - + minio_client.fput_object( + minio_bucket, minio_object_name, local_file_path, tags=tags) + # 删除本地文件 await file_utils.delete_file(local_file_path) except S3Error as e: - logger.error(f"Error uploading {minio_object_name} to {minio_bucket}: {e}") - + logger.error( + f"Error uploading {minio_object_name} to {minio_bucket}: {e}") diff --git a/data-process/data_manipulation/transform/text/QA_transform.py b/data-process/data_manipulation/transform/text/QA_transform.py index 81391a5a4..85a607bc1 100644 --- a/data-process/data_manipulation/transform/text/QA_transform.py +++ b/data-process/data_manipulation/transform/text/QA_transform.py @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import zhipuai import re -from common import ( - config -) +import zhipuai +from common import config ### # QA生成 @@ -30,6 +28,7 @@ # 1) 基本功能实现 ### + async def generate_QA(opt={}): zhipuai.api_key = config.zhipuai_api_key @@ -76,7 +75,7 @@ async def generate_QA(opt={}): # 1) 基本功能实现 ### async def formatSplitText(text): - + pattern = re.compile(r'Q\d+:(\s*)(.*?)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q|$)') # 移除换行符 @@ -93,4 +92,4 @@ async def formatSplitText(text): 'a': a }) - return result \ No newline at end of file + return result diff --git a/data-process/data_manipulation/transform/text/clean_transform.py b/data-process/data_manipulation/transform/text/clean_transform.py index 5e50dc3c0..1fa729891 100644 --- a/data-process/data_manipulation/transform/text/clean_transform.py +++ b/data-process/data_manipulation/transform/text/clean_transform.py @@ -26,6 +26,7 @@ import re + ### # 去除不可见字符 # @author: wangxinbiao @@ -38,9 +39,10 @@ ### async def remove_invisible_characters(opt={}): text = opt['text'] - + try: - clean_text = re.sub(r'[\x00-\x1F\x7F-\x9F\xAD\r\n\t\b\x0B\x1C\x1D\x1E]', '', text) + clean_text = re.sub( + r'[\x00-\x1F\x7F-\x9F\xAD\r\n\t\b\x0B\x1C\x1D\x1E]', '', text) return { 'status': 200, 'message': '', diff --git a/data-process/data_manipulation/transform/text/duplicates_transform.py b/data-process/data_manipulation/transform/text/duplicates_transform.py index 69962ef96..91739e76e 100644 --- a/data-process/data_manipulation/transform/text/duplicates_transform.py +++ b/data-process/data_manipulation/transform/text/duplicates_transform.py @@ -25,4 +25,3 @@ ### async def remove_duplicates(opt={}): return opt['text'] - diff --git a/data-process/data_manipulation/transform/text/filtration_transform.py b/data-process/data_manipulation/transform/text/filtration_transform.py index ba692eff0..b67f27f16 100644 --- a/data-process/data_manipulation/transform/text/filtration_transform.py +++ b/data-process/data_manipulation/transform/text/filtration_transform.py @@ -26,6 +26,7 @@ import re + ### # 检查文档的词数目 # @author: wangxinbiao @@ -38,5 +39,3 @@ ### async def word_count(opt={}): return 49 - - diff --git a/data-process/data_manipulation/transform/text/privacy_transform.py b/data-process/data_manipulation/transform/text/privacy_transform.py index 6b3e6df1d..f5277c285 100644 --- 
a/data-process/data_manipulation/transform/text/privacy_transform.py +++ b/data-process/data_manipulation/transform/text/privacy_transform.py @@ -26,6 +26,7 @@ import re + ### # 去除邮箱地址 # @author: wangxinbiao @@ -38,7 +39,7 @@ ### async def remove_email(opt={}): text = opt['text'] - + try: email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' @@ -50,4 +51,3 @@ async def remove_email(opt={}): except Exception as ex: return '' - \ No newline at end of file diff --git a/data-process/data_manipulation/utils/date_time_utils.py b/data-process/data_manipulation/utils/date_time_utils.py index 3bd3121bb..364b87584 100644 --- a/data-process/data_manipulation/utils/date_time_utils.py +++ b/data-process/data_manipulation/utils/date_time_utils.py @@ -19,6 +19,7 @@ def now_str(): return f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S.%f}" + def now_str_for_day(): return f"{datetime.datetime.now():%Y-%m-%d}" @@ -37,7 +38,7 @@ def timestamp_to_str_second(timestamp): def chage_datetime_fromat(opt={}): my_date_time = datetime.datetime.strptime( - opt['date_time'], - opt['from_format']) + opt['date_time'], + opt['from_format']) - return my_date_time.strftime(opt.get('to_format', '%Y-%m-%d %H:%M:%S')) \ No newline at end of file + return my_date_time.strftime(opt.get('to_format', '%Y-%m-%d %H:%M:%S')) diff --git a/data-process/data_manipulation/utils/file_utils.py b/data-process/data_manipulation/utils/file_utils.py index d4c50afce..16bcee21e 100644 --- a/data-process/data_manipulation/utils/file_utils.py +++ b/data-process/data_manipulation/utils/file_utils.py @@ -19,6 +19,7 @@ import os + ### # 生成文件名称 # @author: wangxinbiao @@ -48,8 +49,10 @@ async def get_file_name(opt={}): # content: # 1) 基本功能实现 ### + + async def get_temp_file_path(): - current_directory = os.getcwd() + current_directory = os.getcwd() csv_file_path = os.path.join(current_directory, 'file_handle/temp_file/') @@ -65,5 +68,7 @@ async def get_temp_file_path(): # content: # 1) 基本功能实现 ### + + async def delete_file(file_path): os.remove(file_path) diff --git a/data-process/data_manipulation/utils/json_utils.py b/data-process/data_manipulation/utils/json_utils.py index ed97a60ce..18c40487d 100644 --- a/data-process/data_manipulation/utils/json_utils.py +++ b/data-process/data_manipulation/utils/json_utils.py @@ -13,17 +13,18 @@ # limitations under the License. 
-import ujson
 from pathlib import Path
 
+import ujson
+
 
 def pretty_print(opt={}):
     data = opt.get('data', {})
 
     print(ujson.dumps(data,
-          ensure_ascii=False,
-          escape_forward_slashes=False,
-          indent=4))
+                      ensure_ascii=False,
+                      escape_forward_slashes=False,
+                      indent=4))
 
 
 def get_str_empty(opt={}):
@@ -38,21 +39,19 @@ def write_json_file(opt={}):
     file_name = Path(opt['file_name'])
 
-    with open(file_name, 'w', encoding = 'utf-8') as outfile:
-        dump(opt['data'], outfile, opt)
+    with open(file_name, 'w', encoding='utf-8') as outfile:
+        dump(opt['data'], outfile, opt)
 
 
 def read_json_file(opt={}):
     file_name = Path(opt['file_name'])
 
     json_result = None
-    with open(file_name, 'r', encoding = 'utf-8') as f:
+    with open(file_name, 'r', encoding='utf-8') as f:
         json_result = ujson.load(f)
 
     return json_result
 
-
-
 def dumps(json_data, opt={}):
     indent = opt.get('indent', 2)
     ensure_ascii = opt.get('ensure_ascii', False)
@@ -73,4 +72,4 @@ def dump(json_data, file, opt={}):
          file,
          indent=indent,
          ensure_ascii=ensure_ascii,
-         escape_forward_slashes=escape_forward_slashes)
\ No newline at end of file
+         escape_forward_slashes=escape_forward_slashes)
diff --git a/data-process/data_manipulation/utils/log_utils.py b/data-process/data_manipulation/utils/log_utils.py
index 4d97b19b2..8648ec625 100644
--- a/data-process/data_manipulation/utils/log_utils.py
+++ b/data-process/data_manipulation/utils/log_utils.py
@@ -12,32 +12,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-import logging
-from logging.handlers import TimedRotatingFileHandler, RotatingFileHandler
+import os
 import datetime
+import logging
+from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler
+
 
-def init_config (opt={}):
+def init_config(opt={}):
     source_type = opt['source_type']
+    log_dir = opt['log_dir']
+    os.makedirs(log_dir, exist_ok=True)
 
     ###
     # 配置全局日志配置
     ###
     file_handler = TimedRotatingFileHandler(
-            f'log/{source_type}/{source_type}_{datetime.datetime.now().strftime("%Y-%m-%d")}.log',
-            when="midnight",
-            interval=1,
+        f'{log_dir}/{source_type}_{datetime.datetime.now().strftime("%Y-%m-%d")}.log',
+        when="midnight",
+        interval=1,
         backupCount=30
-    ) # 按天生成日志文件,最多保存30天的日志文件
+    )  # 按天生成日志文件,最多保存30天的日志文件
     file_handler.setLevel(logging.DEBUG)
 
     # 将error和critical级别的日志单独存放
     error_file_handler = TimedRotatingFileHandler(
-            f'log/{source_type}/error/{source_type}_{datetime.datetime.now().strftime("%Y-%m-%d")}.err.log',
-            when="midnight",
-            interval=1,
+        f'{log_dir}/{source_type}_{datetime.datetime.now().strftime("%Y-%m-%d")}.err.log',
+        when="midnight",
+        interval=1,
         backupCount=30
-    ) # 按天生成日志文件,最多保存30天的日志文件
+    )  # 按天生成日志文件,最多保存30天的日志文件
     error_file_handler.suffix = "%Y-%m-%d"  # 文件名的时间格式
     error_file_handler.setLevel(logging.ERROR)
@@ -50,4 +53,4 @@
         error_file_handler,
         logging.StreamHandler()
     ]
-    )
\ No newline at end of file
+    )
diff --git a/data-process/data_manipulation/utils/minio_utils.py b/data-process/data_manipulation/utils/minio_utils.py
index 8dedcc5f8..2d4350216 100644
--- a/data-process/data_manipulation/utils/minio_utils.py
+++ b/data-process/data_manipulation/utils/minio_utils.py
@@ -16,11 +16,9 @@
 # MinIO
 ###
 
+from common import config
 from minio import Minio
 
-from common import (
-    config
-)
 
 async def create_client():
     return Minio(
diff --git a/data-process/docker/base/Dockerfile.base b/data-process/docker/base/Dockerfile.base
deleted file mode 100644
index ec52b9bef..000000000
--- a/data-process/docker/base/Dockerfile.base
+++
/dev/null @@ -1,43 +0,0 @@ -FROM ubuntu:20.04 - -ENV TIME_ZONE Asia/Shanghai - -RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \ -sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list && \ -sed -i s@/security.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list && \ -cat /etc/apt/sources.list.bak >> /etc/apt/sources.list && \ -apt-get clean && \ -apt-get update && \ -DEBIAN_FRONTEND="noninteractive" apt-get install -y vim build-essential zlib1g-dev libncurses5-dev python3.10 python3-tk libgdbm-dev libgdbm-compat-dev libnss3-dev libssl-dev libreadline-dev libbz2-dev libgdbm-dev liblzma-dev openssl uuid-dev libffi-dev libsqlite3-dev wget tzdata \ -libatk1.0-0 libatk-bridge2.0-0 libcups2 libdbus-1-3 libxkbcommon0 libatspi2.0-0 libxdamage1 libgbm1 libpango-1.0-0 libcairo2 libasound2 tesseract-ocr libtesseract-dev tesseract-ocr-chi-sim && \ -ln -snf /user/share/zoneinfo/$TIME_ZONE /etc/localtime && echo $TIME_ZONE > /etc/timezone && \ -dpkg-reconfigure -f noninteractive tzdata - -RUN cd /opt/local -wget https://www.python.org/ftp/python/3.10.13/Python-3.10.13.tar.xz - - -RUN cd /opt/local/Python-3.10.13 && \ -./configure --enable-optimizations --enable-loadable-sqlite-extensions && \ -make -j 8 && \ -make altinstall - -ENV LD_LIBRARY_PATH /usr/local/lib:$LD_LIBRARY_PATH -ENV PYTHON_VERSION 3.10 -RUN cd /usr/local/bin && \ -ln -sf python$PYTHON_VERSION python3 && \ -ln -sf pip$PYTHON_VERSION pip3 && \ -echo "/usr/local/lib" > /etc/ld.so.conf.d/local.conf - - -RUN mkdir -p /happy_work_space -WORKDIR /happy_work_space - -VOLUME ["/happy_work_space"] - -RUN python3.10 -m pip install --upgrade pip setuptools - -ADD requirements.txt / -RUN python3.10 -m pip install -U -r /requirements.txt - -ADD lzma.py /usr/local/lib/python3.10/lzma.py \ No newline at end of file diff --git a/data-process/docker/base/build_image.sh b/data-process/docker/base/build_image.sh deleted file mode 100644 index 917a245f7..000000000 --- a/data-process/docker/base/build_image.sh +++ /dev/null @@ -1,6 +0,0 @@ -set e - -release_image="python:3.10.13" - - -docker build -f ./Dockerfile.base -t ${release_image} --build-arg GIT_VERSION="$gitVersion" . \ No newline at end of file diff --git a/data-process/docker/base/lzma.py b/data-process/docker/base/lzma.py deleted file mode 100644 index acb07805b..000000000 --- a/data-process/docker/base/lzma.py +++ /dev/null @@ -1,352 +0,0 @@ -"""Interface to the liblzma compression library. - -This module provides a class for reading and writing compressed files, -classes for incremental (de)compression, and convenience functions for -one-shot (de)compression. - -These classes and functions support both the XZ and legacy LZMA -container formats, as well as raw compressed data streams. 
-""" - -__all__ = [ - "CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256", - "CHECK_ID_MAX", "CHECK_UNKNOWN", - "FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86", "FILTER_IA64", - "FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC", "FILTER_SPARC", - "FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW", - "MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4", - "MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME", - - "LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError", - "open", "compress", "decompress", "is_check_supported", -] - -import builtins -import io -import os -try: - from _lzma import * - from _lzma import _encode_filter_properties, _decode_filter_properties -except ImportError: - from backports.lzma import * - from backports.lzma import _encode_filter_properties, _decode_filter_properties - -import _compression - - -_MODE_CLOSED = 0 -_MODE_READ = 1 -# Value 2 no longer used -_MODE_WRITE = 3 - - -class LZMAFile(_compression.BaseStream): - - """A file object providing transparent LZMA (de)compression. - - An LZMAFile can act as a wrapper for an existing file object, or - refer directly to a named file on disk. - - Note that LZMAFile provides a *binary* file interface - data read - is returned as bytes, and data to be written must be given as bytes. - """ - - def __init__(self, filename=None, mode="r", *, - format=None, check=-1, preset=None, filters=None): - """Open an LZMA-compressed file in binary mode. - - filename can be either an actual file name (given as a str, - bytes, or PathLike object), in which case the named file is - opened, or it can be an existing file object to read from or - write to. - - mode can be "r" for reading (default), "w" for (over)writing, - "x" for creating exclusively, or "a" for appending. These can - equivalently be given as "rb", "wb", "xb" and "ab" respectively. - - format specifies the container format to use for the file. - If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the - default is FORMAT_XZ. - - check specifies the integrity check to use. This argument can - only be used when opening a file for writing. For FORMAT_XZ, - the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not - support integrity checks - for these formats, check must be - omitted, or be CHECK_NONE. - - When opening a file for reading, the *preset* argument is not - meaningful, and should be omitted. The *filters* argument should - also be omitted, except when format is FORMAT_RAW (in which case - it is required). - - When opening a file for writing, the settings used by the - compressor can be specified either as a preset compression - level (with the *preset* argument), or in detail as a custom - filter chain (with the *filters* argument). For FORMAT_XZ and - FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset - level. For FORMAT_RAW, the caller must always specify a filter - chain; the raw compressor does not support preset compression - levels. - - preset (if provided) should be an integer in the range 0-9, - optionally OR-ed with the constant PRESET_EXTREME. - - filters (if provided) should be a sequence of dicts. Each dict - should have an entry for "id" indicating ID of the filter, plus - additional entries for options to the filter. 
- """ - self._fp = None - self._closefp = False - self._mode = _MODE_CLOSED - - if mode in ("r", "rb"): - if check != -1: - raise ValueError("Cannot specify an integrity check " - "when opening a file for reading") - if preset is not None: - raise ValueError("Cannot specify a preset compression " - "level when opening a file for reading") - if format is None: - format = FORMAT_AUTO - mode_code = _MODE_READ - elif mode in ("w", "wb", "a", "ab", "x", "xb"): - if format is None: - format = FORMAT_XZ - mode_code = _MODE_WRITE - self._compressor = LZMACompressor(format=format, check=check, - preset=preset, filters=filters) - self._pos = 0 - else: - raise ValueError("Invalid mode: {!r}".format(mode)) - - if isinstance(filename, (str, bytes, os.PathLike)): - if "b" not in mode: - mode += "b" - self._fp = builtins.open(filename, mode) - self._closefp = True - self._mode = mode_code - elif hasattr(filename, "read") or hasattr(filename, "write"): - self._fp = filename - self._mode = mode_code - else: - raise TypeError("filename must be a str, bytes, file or PathLike object") - - if self._mode == _MODE_READ: - raw = _compression.DecompressReader(self._fp, LZMADecompressor, - trailing_error=LZMAError, format=format, filters=filters) - self._buffer = io.BufferedReader(raw) - - def close(self): - """Flush and close the file. - - May be called more than once without error. Once the file is - closed, any other operation on it will raise a ValueError. - """ - if self._mode == _MODE_CLOSED: - return - try: - if self._mode == _MODE_READ: - self._buffer.close() - self._buffer = None - elif self._mode == _MODE_WRITE: - self._fp.write(self._compressor.flush()) - self._compressor = None - finally: - try: - if self._closefp: - self._fp.close() - finally: - self._fp = None - self._closefp = False - self._mode = _MODE_CLOSED - - @property - def closed(self): - """True if this file is closed.""" - return self._mode == _MODE_CLOSED - - def fileno(self): - """Return the file descriptor for the underlying file.""" - self._check_not_closed() - return self._fp.fileno() - - def seekable(self): - """Return whether the file supports seeking.""" - return self.readable() and self._buffer.seekable() - - def readable(self): - """Return whether the file was opened for reading.""" - self._check_not_closed() - return self._mode == _MODE_READ - - def writable(self): - """Return whether the file was opened for writing.""" - self._check_not_closed() - return self._mode == _MODE_WRITE - - def peek(self, size=-1): - """Return buffered data without advancing the file position. - - Always returns at least one byte of data, unless at EOF. - The exact number of bytes returned is unspecified. - """ - self._check_can_read() - # Relies on the undocumented fact that BufferedReader.peek() always - # returns at least one byte (except at EOF) - return self._buffer.peek(size) - - def read(self, size=-1): - """Read up to size uncompressed bytes from the file. - - If size is negative or omitted, read until EOF is reached. - Returns b"" if the file is already at EOF. - """ - self._check_can_read() - return self._buffer.read(size) - - def read1(self, size=-1): - """Read up to size uncompressed bytes, while trying to avoid - making multiple reads from the underlying stream. Reads up to a - buffer's worth of data if size is negative. - - Returns b"" if the file is at EOF. 
- """ - self._check_can_read() - if size < 0: - size = io.DEFAULT_BUFFER_SIZE - return self._buffer.read1(size) - - def readline(self, size=-1): - """Read a line of uncompressed bytes from the file. - - The terminating newline (if present) is retained. If size is - non-negative, no more than size bytes will be read (in which - case the line may be incomplete). Returns b'' if already at EOF. - """ - self._check_can_read() - return self._buffer.readline(size) - - def write(self, data): - """Write a bytes object to the file. - - Returns the number of uncompressed bytes written, which is - always len(data). Note that due to buffering, the file on disk - may not reflect the data written until close() is called. - """ - self._check_can_write() - compressed = self._compressor.compress(data) - self._fp.write(compressed) - self._pos += len(data) - return len(data) - - def seek(self, offset, whence=io.SEEK_SET): - """Change the file position. - - The new position is specified by offset, relative to the - position indicated by whence. Possible values for whence are: - - 0: start of stream (default): offset must not be negative - 1: current stream position - 2: end of stream; offset must not be positive - - Returns the new file position. - - Note that seeking is emulated, so depending on the parameters, - this operation may be extremely slow. - """ - self._check_can_seek() - return self._buffer.seek(offset, whence) - - def tell(self): - """Return the current file position.""" - self._check_not_closed() - if self._mode == _MODE_READ: - return self._buffer.tell() - return self._pos - - -def open(filename, mode="rb", *, - format=None, check=-1, preset=None, filters=None, - encoding=None, errors=None, newline=None): - """Open an LZMA-compressed file in binary or text mode. - - filename can be either an actual file name (given as a str, bytes, - or PathLike object), in which case the named file is opened, or it - can be an existing file object to read from or write to. - - The mode argument can be "r", "rb" (default), "w", "wb", "x", "xb", - "a", or "ab" for binary mode, or "rt", "wt", "xt", or "at" for text - mode. - - The format, check, preset and filters arguments specify the - compression settings, as for LZMACompressor, LZMADecompressor and - LZMAFile. - - For binary mode, this function is equivalent to the LZMAFile - constructor: LZMAFile(filename, mode, ...). In this case, the - encoding, errors and newline arguments must not be provided. - - For text mode, an LZMAFile object is created, and wrapped in an - io.TextIOWrapper instance with the specified encoding, error - handling behavior, and line ending(s). - - """ - if "t" in mode: - if "b" in mode: - raise ValueError("Invalid mode: %r" % (mode,)) - else: - if encoding is not None: - raise ValueError("Argument 'encoding' not supported in binary mode") - if errors is not None: - raise ValueError("Argument 'errors' not supported in binary mode") - if newline is not None: - raise ValueError("Argument 'newline' not supported in binary mode") - - lz_mode = mode.replace("t", "") - binary_file = LZMAFile(filename, lz_mode, format=format, check=check, - preset=preset, filters=filters) - - if "t" in mode: - return io.TextIOWrapper(binary_file, encoding, errors, newline) - else: - return binary_file - - -def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None): - """Compress a block of data. - - Refer to LZMACompressor's docstring for a description of the - optional arguments *format*, *check*, *preset* and *filters*. 
- - For incremental compression, use an LZMACompressor instead. - """ - comp = LZMACompressor(format, check, preset, filters) - return comp.compress(data) + comp.flush() - - -def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None): - """Decompress a block of data. - - Refer to LZMADecompressor's docstring for a description of the - optional arguments *format*, *check* and *filters*. - - For incremental decompression, use an LZMADecompressor instead. - """ - results = [] - while True: - decomp = LZMADecompressor(format, memlimit, filters) - try: - res = decomp.decompress(data) - except LZMAError: - if results: - break # Leftover data is not a valid LZMA/XZ stream; ignore it. - else: - raise # Error on the first iteration; bail out. - results.append(res) - if not decomp.eof: - raise LZMAError("Compressed data ended before the " - "end-of-stream marker was reached") - data = decomp.unused_data - if not data: - break - return b"".join(results) \ No newline at end of file diff --git a/data-process/docker/base/requirements.txt b/data-process/docker/base/requirements.txt deleted file mode 100644 index 023663fd2..000000000 --- a/data-process/docker/base/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -backports.lzma==0.0.14 \ No newline at end of file diff --git a/llms/Dockerfile.server b/llms/Dockerfile.server index 7f29f6168..1e504629e 100644 --- a/llms/Dockerfile.server +++ b/llms/Dockerfile.server @@ -2,8 +2,7 @@ FROM python:3.9-slim ENV TZ=Asia/Shanghai -RUN sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list -RUN sed -i 's/security.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list +RUN sed -i 's/deb.debian.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list.d/debian.sources RUN export DEBIAN_FRONTEND=noninteractive \ && apt-get update \