diff --git a/Makefile b/Makefile
index e020f07e30..9dc2ba765e 100644
--- a/Makefile
+++ b/Makefile
@@ -401,6 +401,12 @@ docker-build:
 docker-start-bash:
 	docker run -ti --rm ${DOCKER_IMAGE}
 
+.PHONY: docker-start-dev
+docker-start-dev:
+	docker run --rm \
+	-v ${CURRENT_DIR}:/mnt/local_unstructured \
+	-ti ${DOCKER_IMAGE}
+
 .PHONY: docker-test
 docker-test:
 	docker run --rm \
diff --git a/README.md b/README.md
index dacd07dec0..5bcb951456 100644
--- a/README.md
+++ b/README.md
@@ -165,6 +165,14 @@ If using the optional `pre-commit`, you'll just need to install the hooks with `
 `pre-commit` package is installed as part of `make install` mentioned above. Finally, if you decided
 to use `pre-commit` you can also uninstall the hooks with `pre-commit uninstall`.
 
+In addition to developing in your local OS, we also provide a Docker-based helper that supplies a development environment:
+
+```bash
+make docker-start-dev
+```
+
+This starts a Docker container with your local repo mounted at `/mnt/local_unstructured`. The image lets you develop without worrying about your OS's compatibility with the repo and its dependencies.
+
 ## :clap: Quick Tour
 
 ### Documentation
diff --git a/scripts/performance/README.md b/scripts/performance/README.md
index c87ecfe4fd..514cf8464e 100644
--- a/scripts/performance/README.md
+++ b/scripts/performance/README.md
@@ -1,5 +1,5 @@
 # Performance
-This is a collection of tools helpful for inspecting and tracking performance of the Unstructured library. 
+This is a collection of tools helpful for inspecting and tracking performance of the Unstructured library.
 
 The benchmarking script allows a user to track performance time to partitioning results against a fixed set of test documents and store those results with indication of architecture, instance type, and git hash, in S3.
 
@@ -7,8 +7,14 @@ The profiling script allows a user to inspect how time and memory are spent
 ## Install
 Benchmarking requires no additional dependencies and should work without any initial setup.
 
-Profiling has a few dependencies which can be installed with: 
-`pip install -r scripts/performance/requirements.txt`
+Profiling has a few dependencies, which can be installed with:
+
+```bash
+pip install -r scripts/performance/requirements.txt
+npm install -g speedscope
+```
+
+The second command installs `speedscope`, a tool for viewing `py-spy` profiling results locally. Alternatively, you can drop a `*.speedscope` profile result into https://www.speedscope.app/ to view it online.
 
 ## Run
 ### Benchmark
@@ -17,7 +23,6 @@ Export / assign desired environment variable settings:
 - NUM_ITERATIONS: Number of iterations for benchmark (e.g., 100) (default: 3)
 - INSTANCE_TYPE: Type of benchmark instance (e.g., "c5.xlarge") (default: unspecified)
 - PUBLISH_RESULTS: Set to true to publish results to S3 bucket (default: false)
--
 
 Usage: `./scripts/performance/benchmark.sh`
 
 ### Profile
@@ -25,11 +31,15 @@ Usage: `./scripts/performance/benchmark.sh`
 Export / assign desired environment variable settings:
 - DOCKER_TEST: Set to true to run profiling inside a Docker container (default: false)
 
-Usage: `./scripts/performance/profile.sh`
+Usage:
+
+**On Linux**: `./scripts/performance/profile.sh`
+
+**On macOS**: `sudo -E ./scripts/performance/profile.sh` (`py-spy` requires root privileges to run on macOS)
+
 - Run the script and choose the profiling mode: 'run' or 'view'.
 - In the 'run' mode, you can profile custom files or select existing test files.
 - In the 'view' mode, you can view previously generated profiling results.
 - The script supports time profiling with cProfile and memory profiling with memray.
 - Users can choose different visualization options such as flamegraphs, tables, trees, summaries, and statistics.
 - Test documents are synced from an S3 bucket to a local directory before running the profiles
-
diff --git a/scripts/performance/profile.sh b/scripts/performance/profile.sh
index 0712f8ec02..d109797cad 100755
--- a/scripts/performance/profile.sh
+++ b/scripts/performance/profile.sh
@@ -5,7 +5,7 @@
 # Environment Variables:
 # - DOCKER_TEST: Set to true to run profiling inside a Docker container (default: false)
 
-# Usage: 
+# Usage:
 # - Run the script and choose the profiling mode: 'run' or 'view'.
 # - In the 'run' mode, you can profile custom files or select existing test files.
 # - In the 'view' mode, you can view previously generated profiling results.
@@ -34,7 +34,7 @@ check_python_module() {
   fi
 }
 
 validate_dependencies() {
-  check_python_module memray 
+  check_python_module memray
   check_python_module flameprof
 }
@@ -117,7 +117,7 @@ view_profile_headless() {
 
 view_profile_with_head() {
   while true; do
-    read -r -p "Choose profile type: (1) time (2) memory (b) back, (q) quit: " -n 1 profile_type
+    read -r -p "Choose profile type: (1) time (2) memory (3) speedscope (b) back, (q) quit: " -n 1 profile_type
     echo
 
     if [[ $profile_type == "b" ]]; then
@@ -131,6 +131,8 @@
       extension=".prof"
     elif [[ $profile_type == "2" ]]; then
       extension=".bin"
+    elif [[ $profile_type == "3" ]]; then
+      extension=".speedscope"
     else
       echo "Invalid profile type. Please try again."
       continue
@@ -143,7 +145,9 @@
       continue
     fi
 
-    if [[ $profile_type == "2" ]]; then
+    if [[ $profile_type == "3" ]]; then
+      speedscope "$result_file"
+    elif [[ $profile_type == "2" ]]; then
       while true; do
         read -r -p "Choose visualization type: (1) flamegraph (2) table (3) tree (4) summary (5) stats (b) back, (q) quit: " -n 1 visualization_type
         echo
@@ -293,7 +297,7 @@ run_profile() {
 
   # Pick the strategy
   while true; do
-    read -r -p "Choose a strategy: 1) auto, (2) fast, (3) hi_res, (b) back, (q) quit: " -n 1 strategy_option
+    read -r -p "Choose a strategy: (1) auto, (2) fast, (3) hi_res, (4) ocr_only, (b) back, (q) quit: " -n 1 strategy_option
     echo
 
     if [[ $strategy_option == "b" ]]; then
@@ -315,6 +319,10 @@
       strategy="hi_res"
       break
       ;;
+    "4")
+      strategy="ocr_only"
+      break
+      ;;
     *)
       echo "Invalid strategy option. Please try again."
       ;;
@@ -325,8 +333,11 @@
     python3 -m cProfile -s cumulative -o "$PROFILE_RESULTS_DIR/${test_file##*/}.prof" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"
     echo "Running memory profile..."
     python3 -m memray run -o "$PROFILE_RESULTS_DIR/${test_file##*/}.bin" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"
+    echo "Running py-spy for detailed runtime profiling (this can take some time)..."
+    py-spy record --subprocesses -i -o "$PROFILE_RESULTS_DIR/${test_file##*/}.speedscope" --format speedscope -- python3 -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"
     echo "Profiling completed."
echo "Viewing results for $test_file" + echo "The py-spy produced speedscope profile can be viewed on https://www.speedscope.app or locally by installing via 'npm install -g speedscope'" result_file=$PROFILE_RESULTS_DIR/$(basename "$test_file") view_profile "${result_file}.bin" # Go directly to view mode done @@ -336,7 +347,7 @@ while true; do if [[ -n "$1" ]]; then mode="$1" fi - + if [[ -z $result_file ]]; then read -r -p "Choose mode: (1) run, (2) view, (q) quit: " -n 1 mode echo diff --git a/scripts/performance/requirements.txt b/scripts/performance/requirements.txt index 23b1f9aa31..dadec17aa3 100644 --- a/scripts/performance/requirements.txt +++ b/scripts/performance/requirements.txt @@ -1,3 +1,4 @@ flameprof>=0.4 memray>=1.7.0 snakeviz>=2.2.0 +py-spy>=0.3.14 diff --git a/scripts/performance/run_partition.py b/scripts/performance/run_partition.py index 3336ca4878..4da380f02e 100644 --- a/scripts/performance/run_partition.py +++ b/scripts/performance/run_partition.py @@ -1,3 +1,4 @@ +import os import sys from unstructured.partition.auto import partition @@ -12,6 +13,11 @@ file_path = sys.argv[1] strategy = sys.argv[2] - result = partition(file_path, strategy=strategy) + model_name = None + if len(sys.argv) > 3: + model_name = sys.argv[3] + else: + model_name = os.environ.get("PARTITION_MODEL_NAME") + result = partition(file_path, strategy=strategy, model_name=model_name) # access element in the return value to make sure we got something back, otherwise error result[1]