From 92692ad8d7d5001601dd88fef869a29660f492cb Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Mon, 4 Sep 2023 13:52:32 -0700 Subject: [PATCH] fix: wrapped error handling for connectors (#1262) The CustomError that we use to wrap custom ingest errors inherits from BaseException rather than Exception (as we should, per specification [here](https://docs.python.org/3/library/exceptions.html#BaseException)). This resulted in exceptions not properly raising as expected. This PR changes the inheritance which resolves the known issue. Additionally, our base definition for get_file on IngestDoc was wrapped with SourceConnectionError, however this must be explicitly decorating each subclass definition in order to function. This PR does that. ## Testing Some unit test coverage was added for the error wrapping class, however this wasn't properly recreating the issue we are seeing when running ingest tests. To recreate that issue one can intentionally raise an exception in the [partition_file](https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/interfaces.py#L214C9-L214C23) definition and then run any ingest test. Prior to this change: the code and logs suggest that everything ran without exception, but the partitioned output was not generated (as a result the test will fail without any clues as to what went wrong). With this update, the expected custom partition error, error message, and stack trace will be visible. --------- Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com> --- CHANGELOG.md | 7 ++++- test_unstructured_ingest/unit/test_error.py | 29 +++++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/ingest/connector/airtable.py | 2 ++ unstructured/ingest/connector/azure.py | 2 ++ unstructured/ingest/connector/biomed.py | 2 ++ unstructured/ingest/connector/box.py | 2 ++ unstructured/ingest/connector/confluence.py | 2 ++ unstructured/ingest/connector/delta_table.py | 2 ++ unstructured/ingest/connector/discord.py | 2 ++ unstructured/ingest/connector/dropbox.py | 2 ++ .../ingest/connector/elasticsearch.py | 2 ++ unstructured/ingest/connector/fsspec.py | 2 ++ unstructured/ingest/connector/gcs.py | 2 ++ unstructured/ingest/connector/git.py | 2 ++ unstructured/ingest/connector/github.py | 2 ++ unstructured/ingest/connector/gitlab.py | 2 ++ unstructured/ingest/connector/google_drive.py | 2 ++ unstructured/ingest/connector/onedrive.py | 3 ++ unstructured/ingest/connector/outlook.py | 2 ++ unstructured/ingest/connector/reddit.py | 2 ++ unstructured/ingest/connector/s3.py | 2 ++ unstructured/ingest/connector/s3_2.py | 2 ++ unstructured/ingest/connector/sharepoint.py | 2 ++ unstructured/ingest/connector/slack.py | 2 ++ unstructured/ingest/connector/wikipedia.py | 2 ++ unstructured/ingest/error.py | 6 ++-- 27 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 test_unstructured_ingest/unit/test_error.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 609e673239..fbf5a00915 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,13 @@ -## 0.10.13-dev0 +## 0.10.13-dev1 ### Enhancements * Updated documentation: Added back support doc types for partitioning, more Python codes in the API page, RAG definition, and use case. +### Fixes + +* Ingest error handling to properly raise errors when wrapped + ## 0.10.12 @@ -21,6 +25,7 @@ ### Fixes + * Bump unstructured-inference * Avoid divide-by-zero errors swith `safe_division` (0.5.21) diff --git a/test_unstructured_ingest/unit/test_error.py b/test_unstructured_ingest/unit/test_error.py new file mode 100644 index 0000000000..3caed5399c --- /dev/null +++ b/test_unstructured_ingest/unit/test_error.py @@ -0,0 +1,29 @@ +import traceback + +import pytest + +from unstructured.ingest.error import ( + DestinationConnectionError, + PartitionError, + SourceConnectionError, +) + + +@pytest.mark.parametrize( + ("error_class", "exception_type", "error_message"), + [ + (SourceConnectionError, ValueError, "Simulated connection error"), + (DestinationConnectionError, RuntimeError, "Simulated connection error"), + (PartitionError, FileNotFoundError, "Simulated partition error"), + ], +) +def test_custom_error_decorator(error_class, exception_type, error_message): + @error_class.wrap + def simulate_error(): + raise exception_type(error_message) + + with pytest.raises(error_class) as context: + simulate_error() + + expected_error_string = error_class.error_string.format(error_message) + assert str(context.value) == expected_error_string diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ba254c7ae9..9497f9dce4 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.13-dev0" # pragma: no cover +__version__ = "0.10.13-dev1" # pragma: no cover diff --git a/unstructured/ingest/connector/airtable.py b/unstructured/ingest/connector/airtable.py index 731fa04e92..20727c099e 100644 --- a/unstructured/ingest/connector/airtable.py +++ b/unstructured/ingest/connector/airtable.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -65,6 +66,7 @@ def _output_filename(self): output_file = f"{self.file_meta.table_id}.json" return Path(self.standard_config.output_dir) / self.file_meta.base_id / output_file + @SourceConnectionError.wrap @requires_dependencies(["pyairtable", "pandas"], extras="airtable") @BaseIngestDoc.skip_if_file_exists def get_file(self): diff --git a/unstructured/ingest/connector/azure.py b/unstructured/ingest/connector/azure.py index d382d0965d..6589e1a39a 100644 --- a/unstructured/ingest/connector/azure.py +++ b/unstructured/ingest/connector/azure.py @@ -6,6 +6,7 @@ FsspecIngestDoc, SimpleFsspecConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import StandardConnectorConfig from unstructured.utils import requires_dependencies @@ -19,6 +20,7 @@ class SimpleAzureBlobStorageConfig(SimpleFsspecConfig): class AzureBlobStorageIngestDoc(FsspecIngestDoc): registry_name: str = "azure" + @SourceConnectionError.wrap @requires_dependencies(["adlfs", "fsspec"], extras="azure") def get_file(self): super().get_file() diff --git a/unstructured/ingest/connector/biomed.py b/unstructured/ingest/connector/biomed.py index 3e68dbd51b..a7d686e1aa 100644 --- a/unstructured/ingest/connector/biomed.py +++ b/unstructured/ingest/connector/biomed.py @@ -10,6 +10,7 @@ from requests.adapters import HTTPAdapter from urllib3.util import Retry +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -129,6 +130,7 @@ def cleanup_file(self): logger.debug(f"Cleaning up {self}") Path.unlink(self.filename) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): download_path = self.file_meta.download_filepath # type: ignore diff --git a/unstructured/ingest/connector/box.py b/unstructured/ingest/connector/box.py index ac5d5366ad..f95c508d5d 100644 --- a/unstructured/ingest/connector/box.py +++ b/unstructured/ingest/connector/box.py @@ -16,6 +16,7 @@ FsspecIngestDoc, SimpleFsspecConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import StandardConnectorConfig from unstructured.utils import requires_dependencies @@ -46,6 +47,7 @@ class BoxIngestDoc(FsspecIngestDoc): config: SimpleBoxConfig registry_name: str = "box" + @SourceConnectionError.wrap @requires_dependencies(["boxfs", "fsspec"], extras="box") def get_file(self): super().get_file() diff --git a/unstructured/ingest/connector/confluence.py b/unstructured/ingest/connector/confluence.py index bbec761128..82569d0152 100644 --- a/unstructured/ingest/connector/confluence.py +++ b/unstructured/ingest/connector/confluence.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -100,6 +101,7 @@ def _output_filename(self): output_file = f"{self.file_meta.document_id}.json" return Path(self.standard_config.output_dir) / self.file_meta.space_id / output_file + @SourceConnectionError.wrap @requires_dependencies(["atlassian"], extras="Confluence") @BaseIngestDoc.skip_if_file_exists def get_file(self): diff --git a/unstructured/ingest/connector/delta_table.py b/unstructured/ingest/connector/delta_table.py index f2d8bb57b6..1eef5691e4 100644 --- a/unstructured/ingest/connector/delta_table.py +++ b/unstructured/ingest/connector/delta_table.py @@ -6,6 +6,7 @@ import pandas as pd +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -73,6 +74,7 @@ def _create_full_tmp_dir_path(self): self.filename.parent.mkdir(parents=True, exist_ok=True) self._output_filename.parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(["fsspec"], extras="delta-table") def get_file(self): diff --git a/unstructured/ingest/connector/discord.py b/unstructured/ingest/connector/discord.py index bea9f25fe1..7cda1d77d9 100644 --- a/unstructured/ingest/connector/discord.py +++ b/unstructured/ingest/connector/discord.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import List, Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -74,6 +75,7 @@ def _output_filename(self): def _create_full_tmp_dir_path(self): self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(dependencies=["discord"], extras="discord") def get_file(self): diff --git a/unstructured/ingest/connector/dropbox.py b/unstructured/ingest/connector/dropbox.py index 1d4891dd6b..335946ef2c 100644 --- a/unstructured/ingest/connector/dropbox.py +++ b/unstructured/ingest/connector/dropbox.py @@ -18,6 +18,7 @@ FsspecIngestDoc, SimpleFsspecConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import StandardConnectorConfig from unstructured.utils import requires_dependencies @@ -35,6 +36,7 @@ class SimpleDropboxConfig(SimpleFsspecConfig): class DropboxIngestDoc(FsspecIngestDoc): registry_name: str = "dropbox" + @SourceConnectionError.wrap @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") def get_file(self): super().get_file() diff --git a/unstructured/ingest/connector/elasticsearch.py b/unstructured/ingest/connector/elasticsearch.py index de00c254f6..fe0044ab7e 100644 --- a/unstructured/ingest/connector/elasticsearch.py +++ b/unstructured/ingest/connector/elasticsearch.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -102,6 +103,7 @@ def _concatenate_dict_fields(self, dictionary, seperator="\n"): concatenated_values = seperator.join(values) return concatenated_values + @SourceConnectionError.wrap @requires_dependencies(["elasticsearch", "jq"], extras="elasticsearch") @BaseIngestDoc.skip_if_file_exists def get_file(self): diff --git a/unstructured/ingest/connector/fsspec.py b/unstructured/ingest/connector/fsspec.py index d5b1c01853..c455124898 100644 --- a/unstructured/ingest/connector/fsspec.py +++ b/unstructured/ingest/connector/fsspec.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Type +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -101,6 +102,7 @@ def _create_full_tmp_dir_path(self): """Includes "directories" in the object path""" self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): """Fetches the file from the current filesystem and stores it locally.""" diff --git a/unstructured/ingest/connector/gcs.py b/unstructured/ingest/connector/gcs.py index da69f6b47d..0540d2618b 100644 --- a/unstructured/ingest/connector/gcs.py +++ b/unstructured/ingest/connector/gcs.py @@ -6,6 +6,7 @@ FsspecIngestDoc, SimpleFsspecConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import StandardConnectorConfig from unstructured.utils import requires_dependencies @@ -20,6 +21,7 @@ class GcsIngestDoc(FsspecIngestDoc): config: SimpleGcsConfig registry_name: str = "gcs" + @SourceConnectionError.wrap @requires_dependencies(["gcsfs", "fsspec"], extras="gcs") def get_file(self): super().get_file() diff --git a/unstructured/ingest/connector/git.py b/unstructured/ingest/connector/git.py index 85268a9203..5c39cebb8c 100644 --- a/unstructured/ingest/connector/git.py +++ b/unstructured/ingest/connector/git.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -40,6 +41,7 @@ def _create_full_tmp_dir_path(self): """includes directories in in the gitlab repository""" self.filename.parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): """Fetches the "remote" doc and stores it locally on the filesystem.""" diff --git a/unstructured/ingest/connector/github.py b/unstructured/ingest/connector/github.py index 5a1e20cd91..a90e10b3f8 100644 --- a/unstructured/ingest/connector/github.py +++ b/unstructured/ingest/connector/github.py @@ -9,6 +9,7 @@ GitIngestDoc, SimpleGitConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.logger import logger from unstructured.utils import requires_dependencies @@ -37,6 +38,7 @@ def __post_init__(self): # If there's no issues, store the core repository info self.repo_path = parsed_gh_url.path + @SourceConnectionError.wrap @requires_dependencies(["github"], extras="github") def _get_repo(self) -> "Repository": from github import Github diff --git a/unstructured/ingest/connector/gitlab.py b/unstructured/ingest/connector/gitlab.py index 82e6591489..67c85663cd 100644 --- a/unstructured/ingest/connector/gitlab.py +++ b/unstructured/ingest/connector/gitlab.py @@ -7,6 +7,7 @@ GitIngestDoc, SimpleGitConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.utils import requires_dependencies if TYPE_CHECKING: @@ -26,6 +27,7 @@ def __post_init__(self): while self.repo_path.startswith("/"): self.repo_path = self.repo_path[1:] + @SourceConnectionError.wrap @requires_dependencies(["gitlab"], extras="gitlab") def _get_project(self) -> "Project": from gitlab import Gitlab diff --git a/unstructured/ingest/connector/google_drive.py b/unstructured/ingest/connector/google_drive.py index 6e6a588879..019f1a6d68 100644 --- a/unstructured/ingest/connector/google_drive.py +++ b/unstructured/ingest/connector/google_drive.py @@ -8,6 +8,7 @@ from unstructured.file_utils.filetype import EXT_TO_FILETYPE from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -114,6 +115,7 @@ def filename(self): def _output_filename(self): return Path(f"{self.file_meta.get('output_filepath')}.json").resolve() + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(["googleapiclient"], extras="google-drive") def get_file(self): diff --git a/unstructured/ingest/connector/onedrive.py b/unstructured/ingest/connector/onedrive.py index e6b31ee9ff..7f94441aa9 100644 --- a/unstructured/ingest/connector/onedrive.py +++ b/unstructured/ingest/connector/onedrive.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, List, Optional from unstructured.file_utils.filetype import EXT_TO_FILETYPE +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -38,6 +39,7 @@ def __post_init__(self): ) self.token_factory = self._acquire_token + @SourceConnectionError.wrap @requires_dependencies(["msal"]) def _acquire_token(self): from msal import ConfidentialClientApplication @@ -103,6 +105,7 @@ def filename(self): def _output_filename(self): return Path(self.output_filepath).resolve() + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(["office365"], extras="onedrive") def get_file(self): diff --git a/unstructured/ingest/connector/outlook.py b/unstructured/ingest/connector/outlook.py index 7fa6b99f8b..6da1447221 100644 --- a/unstructured/ingest/connector/outlook.py +++ b/unstructured/ingest/connector/outlook.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import List, Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -109,6 +110,7 @@ def filename(self): def _output_filename(self): return Path(self.output_filepath).resolve() + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(["office365"], extras="outlook") def get_file(self): diff --git a/unstructured/ingest/connector/reddit.py b/unstructured/ingest/connector/reddit.py index c4d620fc6f..9a7742fd69 100644 --- a/unstructured/ingest/connector/reddit.py +++ b/unstructured/ingest/connector/reddit.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -49,6 +50,7 @@ def _output_filename(self): def _create_full_tmp_dir_path(self): self.filename.parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): """Fetches the "remote" doc and stores it locally on the filesystem.""" diff --git a/unstructured/ingest/connector/s3.py b/unstructured/ingest/connector/s3.py index 9724d81e89..8e8416bdbd 100644 --- a/unstructured/ingest/connector/s3.py +++ b/unstructured/ingest/connector/s3.py @@ -6,6 +6,7 @@ FsspecIngestDoc, SimpleFsspecConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import StandardConnectorConfig from unstructured.utils import requires_dependencies @@ -20,6 +21,7 @@ class S3IngestDoc(FsspecIngestDoc): remote_file_path: str registry_name: str = "s3" + @SourceConnectionError.wrap @requires_dependencies(["s3fs", "fsspec"], extras="s3") def get_file(self): super().get_file() diff --git a/unstructured/ingest/connector/s3_2.py b/unstructured/ingest/connector/s3_2.py index 51219ee7f6..35d7601d02 100644 --- a/unstructured/ingest/connector/s3_2.py +++ b/unstructured/ingest/connector/s3_2.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import Type +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces2 import ( BaseConnectorConfig, BaseDestinationConnector, @@ -104,6 +105,7 @@ def _create_full_tmp_dir_path(self): """Includes "directories" in the object path""" self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): """Fetches the file from the current filesystem and stores it locally.""" diff --git a/unstructured/ingest/connector/sharepoint.py b/unstructured/ingest/connector/sharepoint.py index 23b0dbb296..5906d5c4a8 100644 --- a/unstructured/ingest/connector/sharepoint.py +++ b/unstructured/ingest/connector/sharepoint.py @@ -5,6 +5,7 @@ from urllib.parse import urlparse from unstructured.file_utils.filetype import EXT_TO_FILETYPE +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -123,6 +124,7 @@ def _get_page(self): return logger.info(f"File downloaded: {self.filename}") + @SourceConnectionError.wrap @requires_dependencies(["office365"], extras="sharepoint") def _get_file(self): from office365.runtime.auth.client_credential import ClientCredential diff --git a/unstructured/ingest/connector/slack.py b/unstructured/ingest/connector/slack.py index 775ae8d5f6..067dfae3d5 100644 --- a/unstructured/ingest/connector/slack.py +++ b/unstructured/ingest/connector/slack.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import List, Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -88,6 +89,7 @@ def _output_filename(self): def _create_full_tmp_dir_path(self): self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(dependencies=["slack_sdk"], extras="slack") def get_file(self): diff --git a/unstructured/ingest/connector/wikipedia.py b/unstructured/ingest/connector/wikipedia.py index b6946dea31..eb21ddf6c2 100644 --- a/unstructured/ingest/connector/wikipedia.py +++ b/unstructured/ingest/connector/wikipedia.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import TYPE_CHECKING +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -53,6 +54,7 @@ def _output_filename(self): def _create_full_tmp_dir_path(self): self.filename.parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): """Fetches the "remote" doc and stores it locally on the filesystem.""" diff --git a/unstructured/ingest/error.py b/unstructured/ingest/error.py index 1fe272e1f4..f08d9cdd0a 100644 --- a/unstructured/ingest/error.py +++ b/unstructured/ingest/error.py @@ -2,7 +2,7 @@ from functools import wraps -class CustomError(BaseException, ABC): +class CustomError(Exception, ABC): error_string: str @classmethod @@ -19,8 +19,8 @@ def wrapper(*args, **kwargs): return f(*args, **kwargs) except BaseException as error: if not isinstance(error, cls): - raise cls(cls.error_string.format(str(error))) - raise error + raise cls(cls.error_string.format(str(error))) from error + raise return wrapper