From 11cdd8d71f61e8549056baf2c7bbe687579d58ff Mon Sep 17 00:00:00 2001 From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:47:24 -0400 Subject: [PATCH] roman/drop downloads in ingest tests (#1614) ### Description In an effort to mitigate resource consumption when running CI tests, cleanup download dir for ingest tests after each one. --- .github/workflows/ci.yml | 1 + .github/workflows/ingest-test-fixtures-update-pr.yml | 1 + CHANGELOG.md | 2 +- test_unstructured_ingest/test-ingest-airtable-diff.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-airtable-large.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-azure.sh | 5 ++++- test_unstructured_ingest/test-ingest-biomed-api.sh | 5 ++++- test_unstructured_ingest/test-ingest-biomed-path.sh | 5 ++++- test_unstructured_ingest/test-ingest-box.sh | 9 ++++++++- .../test-ingest-confluence-diff.sh | 9 ++++++++- .../test-ingest-confluence-large.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-delta-table.sh | 4 ++++ test_unstructured_ingest/test-ingest-discord.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-dropbox.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-elasticsearch.sh | 4 ++++ test_unstructured_ingest/test-ingest-gcs.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-github.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-gitlab.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-google-drive.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-jira.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-notion.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-onedrive.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-outlook.sh | 9 ++++++++- .../test-ingest-pdf-fast-reprocess.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-s3.sh | 5 ++++- test_unstructured_ingest/test-ingest-salesforce.sh | 9 ++++++++- .../test-ingest-sharepoint-embed-cog-index.sh | 9 +++++++++ test_unstructured_ingest/test-ingest-sharepoint.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-slack.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-wikipedia.sh | 10 +++++++++- test_unstructured_ingest/test-ingest.sh | 8 ++++---- unstructured/__version__.py | 2 +- unstructured/ingest/runner/box.py | 2 +- 33 files changed, 203 insertions(+), 31 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c5ff1e2920..9d4f066fec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -293,6 +293,7 @@ jobs: AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} TABLE_OCR: "tesseract" ENTIRE_PAGE_OCR: "tesseract" + CI: "true" run: | source .venv/bin/activate sudo apt-get update diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 63b97fd1f4..724a893128 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -91,6 +91,7 @@ jobs: TABLE_OCR: "tesseract" ENTIRE_PAGE_OCR: "tesseract" OVERWRITE_FIXTURES: "true" + CI: "true" run: | source .venv/bin/activate sudo apt-get update diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f1a1d4778..009b31a377 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.19-dev3 +## 0.10.19-dev4 ### Enhancements diff --git a/test_unstructured_ingest/test-ingest-airtable-diff.sh b/test_unstructured_ingest/test-ingest-airtable-diff.sh index 11727e298a..8c69a31146 100755 --- a/test_unstructured_ingest/test-ingest-airtable-diff.sh +++ b/test_unstructured_ingest/test-ingest-airtable-diff.sh @@ -10,11 +10,18 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=airtable-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +CI=${CI:-"false"} max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT VARIED_DATA_BASE_ID="app5YQxSfp220fWtm" VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88" diff --git a/test_unstructured_ingest/test-ingest-airtable-large.sh b/test_unstructured_ingest/test-ingest-airtable-large.sh index a5a26be1cf..b87e728187 100755 --- a/test_unstructured_ingest/test-ingest-airtable-large.sh +++ b/test_unstructured_ingest/test-ingest-airtable-large.sh @@ -12,10 +12,17 @@ OUTPUT_FOLDER_NAME=airtable-large OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set." diff --git a/test_unstructured_ingest/test-ingest-azure.sh b/test_unstructured_ingest/test-ingest-azure.sh index 38e27294d9..9fdb9dd5e5 100755 --- a/test_unstructured_ingest/test-ingest-azure.sh +++ b/test_unstructured_ingest/test-ingest-azure.sh @@ -11,7 +11,10 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" +} +trap cleanup EXIT PYTHONPATH=. ./unstructured/ingest/main.py \ azure \ diff --git a/test_unstructured_ingest/test-ingest-biomed-api.sh b/test_unstructured_ingest/test-ingest-biomed-api.sh index 0f09757d62..bf0de6998f 100755 --- a/test_unstructured_ingest/test-ingest-biomed-api.sh +++ b/test_unstructured_ingest/test-ingest-biomed-api.sh @@ -12,7 +12,10 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" +} +trap cleanup EXIT "$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k diff --git a/test_unstructured_ingest/test-ingest-biomed-path.sh b/test_unstructured_ingest/test-ingest-biomed-path.sh index 49d2f2f72c..b726364ef3 100755 --- a/test_unstructured_ingest/test-ingest-biomed-path.sh +++ b/test_unstructured_ingest/test-ingest-biomed-path.sh @@ -12,7 +12,10 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" +} +trap cleanup EXIT "$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k diff --git a/test_unstructured_ingest/test-ingest-box.sh b/test_unstructured_ingest/test-ingest-box.sh index 08e6803066..43a8ad38ff 100755 --- a/test_unstructured_ingest/test-ingest-box.sh +++ b/test_unstructured_ingest/test-ingest-box.sh @@ -11,10 +11,17 @@ OUTPUT_FOLDER_NAME=box OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set." diff --git a/test_unstructured_ingest/test-ingest-confluence-diff.sh b/test_unstructured_ingest/test-ingest-confluence-diff.sh index d785ff3a18..c9c0c21483 100755 --- a/test_unstructured_ingest/test-ingest-confluence-diff.sh +++ b/test_unstructured_ingest/test-ingest-confluence-diff.sh @@ -10,10 +10,17 @@ OUTPUT_FOLDER_NAME=confluence-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." diff --git a/test_unstructured_ingest/test-ingest-confluence-large.sh b/test_unstructured_ingest/test-ingest-confluence-large.sh index 7a5114e340..c1196bdd3d 100755 --- a/test_unstructured_ingest/test-ingest-confluence-large.sh +++ b/test_unstructured_ingest/test-ingest-confluence-large.sh @@ -12,10 +12,17 @@ OUTPUT_FOLDER_NAME=confluence-large OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." diff --git a/test_unstructured_ingest/test-ingest-delta-table.sh b/test_unstructured_ingest/test-ingest-delta-table.sh index d019017a7a..d4c79a8f0d 100755 --- a/test_unstructured_ingest/test-ingest-delta-table.sh +++ b/test_unstructured_ingest/test-ingest-delta-table.sh @@ -9,6 +9,7 @@ OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set." @@ -21,6 +22,9 @@ source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$DESTINATION_TABLE" cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi } trap cleanup EXIT diff --git a/test_unstructured_ingest/test-ingest-discord.sh b/test_unstructured_ingest/test-ingest-discord.sh index b55e37ab9f..7aedb2b352 100755 --- a/test_unstructured_ingest/test-ingest-discord.sh +++ b/test_unstructured_ingest/test-ingest-discord.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=discord OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$DISCORD_TOKEN" ]; then echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set." diff --git a/test_unstructured_ingest/test-ingest-dropbox.sh b/test_unstructured_ingest/test-ingest-dropbox.sh index e58f5c6389..b591f0cdd8 100755 --- a/test_unstructured_ingest/test-ingest-dropbox.sh +++ b/test_unstructured_ingest/test-ingest-dropbox.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=dropbox OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" diff --git a/test_unstructured_ingest/test-ingest-elasticsearch.sh b/test_unstructured_ingest/test-ingest-elasticsearch.sh index 33a755b222..530ddf1bed 100755 --- a/test_unstructured_ingest/test-ingest-elasticsearch.sh +++ b/test_unstructured_ingest/test-ingest-elasticsearch.sh @@ -9,6 +9,7 @@ OUTPUT_FOLDER_NAME=elasticsearch OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh @@ -21,6 +22,9 @@ function cleanup() { fi cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi } trap cleanup EXIT diff --git a/test_unstructured_ingest/test-ingest-gcs.sh b/test_unstructured_ingest/test-ingest-gcs.sh index 5827105dfb..dd43710941 100755 --- a/test_unstructured_ingest/test-ingest-gcs.sh +++ b/test_unstructured_ingest/test-ingest-gcs.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=gcs OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." diff --git a/test_unstructured_ingest/test-ingest-github.sh b/test_unstructured_ingest/test-ingest-github.sh index a81be26732..4061bea956 100755 --- a/test_unstructured_ingest/test-ingest-github.sh +++ b/test_unstructured_ingest/test-ingest-github.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=github OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none} diff --git a/test_unstructured_ingest/test-ingest-gitlab.sh b/test_unstructured_ingest/test-ingest-gitlab.sh index d8e7ce5fe9..1a9031c7a7 100755 --- a/test_unstructured_ingest/test-ingest-gitlab.sh +++ b/test_unstructured_ingest/test-ingest-gitlab.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=gitlab OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT PYTHONPATH=. ./unstructured/ingest/main.py \ gitlab \ diff --git a/test_unstructured_ingest/test-ingest-google-drive.sh b/test_unstructured_ingest/test-ingest-google-drive.sh index 12d802fe48..218a5cfe0a 100755 --- a/test_unstructured_ingest/test-ingest-google-drive.sh +++ b/test_unstructured_ingest/test-ingest-google-drive.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=google-drive OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." diff --git a/test_unstructured_ingest/test-ingest-jira.sh b/test_unstructured_ingest/test-ingest-jira.sh index 3982141cb8..173fc4f94b 100755 --- a/test_unstructured_ingest/test-ingest-jira.sh +++ b/test_unstructured_ingest/test-ingest-jira.sh @@ -9,10 +9,17 @@ OUTPUT_FOLDER_NAME=jira-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set." diff --git a/test_unstructured_ingest/test-ingest-notion.sh b/test_unstructured_ingest/test-ingest-notion.sh index b7e9c399f4..2a83a47bb3 100755 --- a/test_unstructured_ingest/test-ingest-notion.sh +++ b/test_unstructured_ingest/test-ingest-notion.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=notion OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$NOTION_API_KEY" ]; then echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set." diff --git a/test_unstructured_ingest/test-ingest-onedrive.sh b/test_unstructured_ingest/test-ingest-onedrive.sh index 6e683351ed..290643815d 100755 --- a/test_unstructured_ingest/test-ingest-onedrive.sh +++ b/test_unstructured_ingest/test-ingest-onedrive.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=onedrive OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set." diff --git a/test_unstructured_ingest/test-ingest-outlook.sh b/test_unstructured_ingest/test-ingest-outlook.sh index fdc3e90bfc..384287e7ea 100755 --- a/test_unstructured_ingest/test-ingest-outlook.sh +++ b/test_unstructured_ingest/test-ingest-outlook.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=outlook OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set." diff --git a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh index a17c91b806..96acee7bd3 100755 --- a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh @@ -9,10 +9,17 @@ OUTPUT_FOLDER_NAME=pdf-fast-reprocess OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME INPUT_PATH=$SCRIPT_DIR/download max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$INPUT_PATH" + fi +} +trap cleanup EXIT echo "REPROCESS INPUT PATH" ls "$INPUT_PATH" diff --git a/test_unstructured_ingest/test-ingest-s3.sh b/test_unstructured_ingest/test-ingest-s3.sh index c48941ab12..214a70ab71 100755 --- a/test_unstructured_ingest/test-ingest-s3.sh +++ b/test_unstructured_ingest/test-ingest-s3.sh @@ -12,7 +12,10 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" +} +trap cleanup EXIT "$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k diff --git a/test_unstructured_ingest/test-ingest-salesforce.sh b/test_unstructured_ingest/test-ingest-salesforce.sh index a9ee1a106c..04f686e1d9 100755 --- a/test_unstructured_ingest/test-ingest-salesforce.sh +++ b/test_unstructured_ingest/test-ingest-salesforce.sh @@ -11,10 +11,17 @@ OUTPUT_FOLDER_NAME=salesforce OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set." diff --git a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh index 5ea8b9b416..738848e008 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh @@ -11,6 +11,7 @@ DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(date +%s)" # The vector configs on the schema currently only exist on versions: # 2023-07-01-Preview, 2021-04-30-Preview, 2020-06-30-Preview API_VERSION=2023-07-01-Preview +CI=${CI:-"false"} if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ] ; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." @@ -27,6 +28,9 @@ if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then exit 0 fi +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh + function cleanup { response_code=$(curl -s -o /dev/null -w "%{http_code}" \ "https://utic-test-ingest-fixtures.search.windows.net/indexes/$DESTINATION_INDEX?api-version=$API_VERSION" \ @@ -41,6 +45,11 @@ function cleanup { else echo "Index $DESTINATION_INDEX does not exist, nothing to delete" fi + + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi } trap cleanup EXIT diff --git a/test_unstructured_ingest/test-ingest-sharepoint.sh b/test_unstructured_ingest/test-ingest-sharepoint.sh index 46fd041a8d..8eefa87a60 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=Sharepoint OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." diff --git a/test_unstructured_ingest/test-ingest-slack.sh b/test_unstructured_ingest/test-ingest-slack.sh index e8974e1502..ff51d63692 100755 --- a/test_unstructured_ingest/test-ingest-slack.sh +++ b/test_unstructured_ingest/test-ingest-slack.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=slack OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$SLACK_TOKEN" ]; then echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set." diff --git a/test_unstructured_ingest/test-ingest-wikipedia.sh b/test_unstructured_ingest/test-ingest-wikipedia.sh index eb168aa731..1dc5e428b4 100755 --- a/test_unstructured_ingest/test-ingest-wikipedia.sh +++ b/test_unstructured_ingest/test-ingest-wikipedia.sh @@ -8,9 +8,17 @@ OUTPUT_FOLDER_NAME=wikipedia OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} + # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT PYTHONPATH=. ./unstructured/ingest/main.py \ wikipedia \ diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index 926821943e..56568b37f0 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -11,6 +11,10 @@ export OMP_THREAD_LIMIT=1 scripts=( 'test-ingest-s3.sh' 'test-ingest-azure.sh' +'test-ingest-biomed-api.sh' +'test-ingest-biomed-path.sh' +## NOTE(yuming): The following test should be put after any tests with --preserve-downloads option +'test-ingest-pdf-fast-reprocess.sh' 'test-ingest-box.sh' 'test-ingest-discord.sh' 'test-ingest-dropbox.sh' @@ -18,8 +22,6 @@ scripts=( 'test-ingest-gitlab.sh' 'test-ingest-google-drive.sh' 'test-ingest-wikipedia.sh' -'test-ingest-biomed-api.sh' -'test-ingest-biomed-path.sh' 'test-ingest-local.sh' 'test-ingest-slack.sh' 'test-ingest-against-api.sh' @@ -39,8 +41,6 @@ scripts=( 'test-ingest-delta-table.sh' 'test-ingest-salesforce.sh' 'test-ingest-jira.sh' -## NOTE(yuming): The following test should be put after any tests with --preserve-downloads option -'test-ingest-pdf-fast-reprocess.sh' 'test-ingest-sharepoint.sh' ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a84e0335aa..884933d470 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev3" # pragma: no cover +__version__ = "0.10.19-dev4" # pragma: no cover diff --git a/unstructured/ingest/runner/box.py b/unstructured/ingest/runner/box.py index 7ac9d44d7e..1856f075ca 100644 --- a/unstructured/ingest/runner/box.py +++ b/unstructured/ingest/runner/box.py @@ -23,7 +23,7 @@ def box( ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) read_config.download_dir = update_download_dir_remote_url( - connector_name="azure", + connector_name="box", read_config=read_config, remote_url=remote_url, logger=logger,