diff --git a/CHANGELOG.md b/CHANGELOG.md index 609e673239..fbf5a00915 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,13 @@ -## 0.10.13-dev0 +## 0.10.13-dev1 ### Enhancements * Updated documentation: Added back support doc types for partitioning, more Python codes in the API page, RAG definition, and use case. +### Fixes + +* Ingest error handling to properly raise errors when wrapped + ## 0.10.12 @@ -21,6 +25,7 @@ ### Fixes + * Bump unstructured-inference * Avoid divide-by-zero errors swith `safe_division` (0.5.21) diff --git a/test_unstructured_ingest/unit/test_error.py b/test_unstructured_ingest/unit/test_error.py new file mode 100644 index 0000000000..3caed5399c --- /dev/null +++ b/test_unstructured_ingest/unit/test_error.py @@ -0,0 +1,29 @@ +import traceback + +import pytest + +from unstructured.ingest.error import ( + DestinationConnectionError, + PartitionError, + SourceConnectionError, +) + + +@pytest.mark.parametrize( + ("error_class", "exception_type", "error_message"), + [ + (SourceConnectionError, ValueError, "Simulated connection error"), + (DestinationConnectionError, RuntimeError, "Simulated connection error"), + (PartitionError, FileNotFoundError, "Simulated partition error"), + ], +) +def test_custom_error_decorator(error_class, exception_type, error_message): + @error_class.wrap + def simulate_error(): + raise exception_type(error_message) + + with pytest.raises(error_class) as context: + simulate_error() + + expected_error_string = error_class.error_string.format(error_message) + assert str(context.value) == expected_error_string diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ba254c7ae9..9497f9dce4 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.13-dev0" # pragma: no cover +__version__ = "0.10.13-dev1" # pragma: no cover diff --git a/unstructured/ingest/connector/airtable.py b/unstructured/ingest/connector/airtable.py index 731fa04e92..20727c099e 100644 --- a/unstructured/ingest/connector/airtable.py +++ b/unstructured/ingest/connector/airtable.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -65,6 +66,7 @@ def _output_filename(self): output_file = f"{self.file_meta.table_id}.json" return Path(self.standard_config.output_dir) / self.file_meta.base_id / output_file + @SourceConnectionError.wrap @requires_dependencies(["pyairtable", "pandas"], extras="airtable") @BaseIngestDoc.skip_if_file_exists def get_file(self): diff --git a/unstructured/ingest/connector/azure.py b/unstructured/ingest/connector/azure.py index d382d0965d..6589e1a39a 100644 --- a/unstructured/ingest/connector/azure.py +++ b/unstructured/ingest/connector/azure.py @@ -6,6 +6,7 @@ FsspecIngestDoc, SimpleFsspecConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import StandardConnectorConfig from unstructured.utils import requires_dependencies @@ -19,6 +20,7 @@ class SimpleAzureBlobStorageConfig(SimpleFsspecConfig): class AzureBlobStorageIngestDoc(FsspecIngestDoc): registry_name: str = "azure" + @SourceConnectionError.wrap @requires_dependencies(["adlfs", "fsspec"], extras="azure") def get_file(self): super().get_file() diff --git a/unstructured/ingest/connector/biomed.py b/unstructured/ingest/connector/biomed.py index 3e68dbd51b..a7d686e1aa 100644 --- a/unstructured/ingest/connector/biomed.py +++ b/unstructured/ingest/connector/biomed.py @@ -10,6 +10,7 @@ from requests.adapters import HTTPAdapter from urllib3.util import Retry +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -129,6 +130,7 @@ def cleanup_file(self): logger.debug(f"Cleaning up {self}") Path.unlink(self.filename) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): download_path = self.file_meta.download_filepath # type: ignore diff --git a/unstructured/ingest/connector/box.py b/unstructured/ingest/connector/box.py index ac5d5366ad..f95c508d5d 100644 --- a/unstructured/ingest/connector/box.py +++ b/unstructured/ingest/connector/box.py @@ -16,6 +16,7 @@ FsspecIngestDoc, SimpleFsspecConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import StandardConnectorConfig from unstructured.utils import requires_dependencies @@ -46,6 +47,7 @@ class BoxIngestDoc(FsspecIngestDoc): config: SimpleBoxConfig registry_name: str = "box" + @SourceConnectionError.wrap @requires_dependencies(["boxfs", "fsspec"], extras="box") def get_file(self): super().get_file() diff --git a/unstructured/ingest/connector/confluence.py b/unstructured/ingest/connector/confluence.py index bbec761128..82569d0152 100644 --- a/unstructured/ingest/connector/confluence.py +++ b/unstructured/ingest/connector/confluence.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -100,6 +101,7 @@ def _output_filename(self): output_file = f"{self.file_meta.document_id}.json" return Path(self.standard_config.output_dir) / self.file_meta.space_id / output_file + @SourceConnectionError.wrap @requires_dependencies(["atlassian"], extras="Confluence") @BaseIngestDoc.skip_if_file_exists def get_file(self): diff --git a/unstructured/ingest/connector/delta_table.py b/unstructured/ingest/connector/delta_table.py index f2d8bb57b6..1eef5691e4 100644 --- a/unstructured/ingest/connector/delta_table.py +++ b/unstructured/ingest/connector/delta_table.py @@ -6,6 +6,7 @@ import pandas as pd +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -73,6 +74,7 @@ def _create_full_tmp_dir_path(self): self.filename.parent.mkdir(parents=True, exist_ok=True) self._output_filename.parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(["fsspec"], extras="delta-table") def get_file(self): diff --git a/unstructured/ingest/connector/discord.py b/unstructured/ingest/connector/discord.py index bea9f25fe1..7cda1d77d9 100644 --- a/unstructured/ingest/connector/discord.py +++ b/unstructured/ingest/connector/discord.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import List, Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -74,6 +75,7 @@ def _output_filename(self): def _create_full_tmp_dir_path(self): self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(dependencies=["discord"], extras="discord") def get_file(self): diff --git a/unstructured/ingest/connector/dropbox.py b/unstructured/ingest/connector/dropbox.py index 1d4891dd6b..335946ef2c 100644 --- a/unstructured/ingest/connector/dropbox.py +++ b/unstructured/ingest/connector/dropbox.py @@ -18,6 +18,7 @@ FsspecIngestDoc, SimpleFsspecConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import StandardConnectorConfig from unstructured.utils import requires_dependencies @@ -35,6 +36,7 @@ class SimpleDropboxConfig(SimpleFsspecConfig): class DropboxIngestDoc(FsspecIngestDoc): registry_name: str = "dropbox" + @SourceConnectionError.wrap @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") def get_file(self): super().get_file() diff --git a/unstructured/ingest/connector/elasticsearch.py b/unstructured/ingest/connector/elasticsearch.py index de00c254f6..fe0044ab7e 100644 --- a/unstructured/ingest/connector/elasticsearch.py +++ b/unstructured/ingest/connector/elasticsearch.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -102,6 +103,7 @@ def _concatenate_dict_fields(self, dictionary, seperator="\n"): concatenated_values = seperator.join(values) return concatenated_values + @SourceConnectionError.wrap @requires_dependencies(["elasticsearch", "jq"], extras="elasticsearch") @BaseIngestDoc.skip_if_file_exists def get_file(self): diff --git a/unstructured/ingest/connector/fsspec.py b/unstructured/ingest/connector/fsspec.py index d5b1c01853..c455124898 100644 --- a/unstructured/ingest/connector/fsspec.py +++ b/unstructured/ingest/connector/fsspec.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Type +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -101,6 +102,7 @@ def _create_full_tmp_dir_path(self): """Includes "directories" in the object path""" self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): """Fetches the file from the current filesystem and stores it locally.""" diff --git a/unstructured/ingest/connector/gcs.py b/unstructured/ingest/connector/gcs.py index da69f6b47d..0540d2618b 100644 --- a/unstructured/ingest/connector/gcs.py +++ b/unstructured/ingest/connector/gcs.py @@ -6,6 +6,7 @@ FsspecIngestDoc, SimpleFsspecConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import StandardConnectorConfig from unstructured.utils import requires_dependencies @@ -20,6 +21,7 @@ class GcsIngestDoc(FsspecIngestDoc): config: SimpleGcsConfig registry_name: str = "gcs" + @SourceConnectionError.wrap @requires_dependencies(["gcsfs", "fsspec"], extras="gcs") def get_file(self): super().get_file() diff --git a/unstructured/ingest/connector/git.py b/unstructured/ingest/connector/git.py index 85268a9203..5c39cebb8c 100644 --- a/unstructured/ingest/connector/git.py +++ b/unstructured/ingest/connector/git.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -40,6 +41,7 @@ def _create_full_tmp_dir_path(self): """includes directories in in the gitlab repository""" self.filename.parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): """Fetches the "remote" doc and stores it locally on the filesystem.""" diff --git a/unstructured/ingest/connector/github.py b/unstructured/ingest/connector/github.py index 5a1e20cd91..a90e10b3f8 100644 --- a/unstructured/ingest/connector/github.py +++ b/unstructured/ingest/connector/github.py @@ -9,6 +9,7 @@ GitIngestDoc, SimpleGitConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.logger import logger from unstructured.utils import requires_dependencies @@ -37,6 +38,7 @@ def __post_init__(self): # If there's no issues, store the core repository info self.repo_path = parsed_gh_url.path + @SourceConnectionError.wrap @requires_dependencies(["github"], extras="github") def _get_repo(self) -> "Repository": from github import Github diff --git a/unstructured/ingest/connector/gitlab.py b/unstructured/ingest/connector/gitlab.py index 82e6591489..67c85663cd 100644 --- a/unstructured/ingest/connector/gitlab.py +++ b/unstructured/ingest/connector/gitlab.py @@ -7,6 +7,7 @@ GitIngestDoc, SimpleGitConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.utils import requires_dependencies if TYPE_CHECKING: @@ -26,6 +27,7 @@ def __post_init__(self): while self.repo_path.startswith("/"): self.repo_path = self.repo_path[1:] + @SourceConnectionError.wrap @requires_dependencies(["gitlab"], extras="gitlab") def _get_project(self) -> "Project": from gitlab import Gitlab diff --git a/unstructured/ingest/connector/google_drive.py b/unstructured/ingest/connector/google_drive.py index 6e6a588879..019f1a6d68 100644 --- a/unstructured/ingest/connector/google_drive.py +++ b/unstructured/ingest/connector/google_drive.py @@ -8,6 +8,7 @@ from unstructured.file_utils.filetype import EXT_TO_FILETYPE from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -114,6 +115,7 @@ def filename(self): def _output_filename(self): return Path(f"{self.file_meta.get('output_filepath')}.json").resolve() + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(["googleapiclient"], extras="google-drive") def get_file(self): diff --git a/unstructured/ingest/connector/onedrive.py b/unstructured/ingest/connector/onedrive.py index e6b31ee9ff..7f94441aa9 100644 --- a/unstructured/ingest/connector/onedrive.py +++ b/unstructured/ingest/connector/onedrive.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, List, Optional from unstructured.file_utils.filetype import EXT_TO_FILETYPE +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -38,6 +39,7 @@ def __post_init__(self): ) self.token_factory = self._acquire_token + @SourceConnectionError.wrap @requires_dependencies(["msal"]) def _acquire_token(self): from msal import ConfidentialClientApplication @@ -103,6 +105,7 @@ def filename(self): def _output_filename(self): return Path(self.output_filepath).resolve() + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(["office365"], extras="onedrive") def get_file(self): diff --git a/unstructured/ingest/connector/outlook.py b/unstructured/ingest/connector/outlook.py index 7fa6b99f8b..6da1447221 100644 --- a/unstructured/ingest/connector/outlook.py +++ b/unstructured/ingest/connector/outlook.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import List, Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -109,6 +110,7 @@ def filename(self): def _output_filename(self): return Path(self.output_filepath).resolve() + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(["office365"], extras="outlook") def get_file(self): diff --git a/unstructured/ingest/connector/reddit.py b/unstructured/ingest/connector/reddit.py index c4d620fc6f..9a7742fd69 100644 --- a/unstructured/ingest/connector/reddit.py +++ b/unstructured/ingest/connector/reddit.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -49,6 +50,7 @@ def _output_filename(self): def _create_full_tmp_dir_path(self): self.filename.parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): """Fetches the "remote" doc and stores it locally on the filesystem.""" diff --git a/unstructured/ingest/connector/s3.py b/unstructured/ingest/connector/s3.py index 9724d81e89..8e8416bdbd 100644 --- a/unstructured/ingest/connector/s3.py +++ b/unstructured/ingest/connector/s3.py @@ -6,6 +6,7 @@ FsspecIngestDoc, SimpleFsspecConfig, ) +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import StandardConnectorConfig from unstructured.utils import requires_dependencies @@ -20,6 +21,7 @@ class S3IngestDoc(FsspecIngestDoc): remote_file_path: str registry_name: str = "s3" + @SourceConnectionError.wrap @requires_dependencies(["s3fs", "fsspec"], extras="s3") def get_file(self): super().get_file() diff --git a/unstructured/ingest/connector/s3_2.py b/unstructured/ingest/connector/s3_2.py index 51219ee7f6..35d7601d02 100644 --- a/unstructured/ingest/connector/s3_2.py +++ b/unstructured/ingest/connector/s3_2.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import Type +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces2 import ( BaseConnectorConfig, BaseDestinationConnector, @@ -104,6 +105,7 @@ def _create_full_tmp_dir_path(self): """Includes "directories" in the object path""" self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): """Fetches the file from the current filesystem and stores it locally.""" diff --git a/unstructured/ingest/connector/sharepoint.py b/unstructured/ingest/connector/sharepoint.py index 23b0dbb296..5906d5c4a8 100644 --- a/unstructured/ingest/connector/sharepoint.py +++ b/unstructured/ingest/connector/sharepoint.py @@ -5,6 +5,7 @@ from urllib.parse import urlparse from unstructured.file_utils.filetype import EXT_TO_FILETYPE +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -123,6 +124,7 @@ def _get_page(self): return logger.info(f"File downloaded: {self.filename}") + @SourceConnectionError.wrap @requires_dependencies(["office365"], extras="sharepoint") def _get_file(self): from office365.runtime.auth.client_credential import ClientCredential diff --git a/unstructured/ingest/connector/slack.py b/unstructured/ingest/connector/slack.py index 775ae8d5f6..067dfae3d5 100644 --- a/unstructured/ingest/connector/slack.py +++ b/unstructured/ingest/connector/slack.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import List, Optional +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -88,6 +89,7 @@ def _output_filename(self): def _create_full_tmp_dir_path(self): self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists @requires_dependencies(dependencies=["slack_sdk"], extras="slack") def get_file(self): diff --git a/unstructured/ingest/connector/wikipedia.py b/unstructured/ingest/connector/wikipedia.py index b6946dea31..eb21ddf6c2 100644 --- a/unstructured/ingest/connector/wikipedia.py +++ b/unstructured/ingest/connector/wikipedia.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import TYPE_CHECKING +from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnector, BaseConnectorConfig, @@ -53,6 +54,7 @@ def _output_filename(self): def _create_full_tmp_dir_path(self): self.filename.parent.mkdir(parents=True, exist_ok=True) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): """Fetches the "remote" doc and stores it locally on the filesystem.""" diff --git a/unstructured/ingest/error.py b/unstructured/ingest/error.py index 1fe272e1f4..f08d9cdd0a 100644 --- a/unstructured/ingest/error.py +++ b/unstructured/ingest/error.py @@ -2,7 +2,7 @@ from functools import wraps -class CustomError(BaseException, ABC): +class CustomError(Exception, ABC): error_string: str @classmethod @@ -19,8 +19,8 @@ def wrapper(*args, **kwargs): return f(*args, **kwargs) except BaseException as error: if not isinstance(error, cls): - raise cls(cls.error_string.format(str(error))) - raise error + raise cls(cls.error_string.format(str(error))) from error + raise return wrapper