Skip to content

Commit

Permalink
fix: wrapped error handling for connectors (#1262)
Browse files Browse the repository at this point in the history
The CustomError that we use to wrap custom ingest errors inherits from
BaseException rather than Exception (as we should, per specification
[here](https://docs.python.org/3/library/exceptions.html#BaseException)).
This resulted in exceptions not properly raising as expected. This PR
changes the inheritance which resolves the known issue.

Additionally, our base definition for get_file on IngestDoc was wrapped
with SourceConnectionError, however this must be explicitly decorating
each subclass definition in order to function. This PR does that.

## Testing
Some unit test coverage was added for the error wrapping class, however
this wasn't properly recreating the issue we are seeing when running
ingest tests.

To recreate that issue one can intentionally raise an exception in the
[partition_file](https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/interfaces.py#L214C9-L214C23)
definition and then run any ingest test. Prior to this change: the code
and logs suggest that everything ran without exception, but the
partitioned output was not generated (as a result the test will fail
without any clues as to what went wrong). With this update, the expected
custom partition error, error message, and stack trace will be visible.

---------

Co-authored-by: Ahmet Melek <[email protected]>
  • Loading branch information
ryannikolaidis and ahmetmeleq authored Sep 4, 2023
1 parent 95b6295 commit 92692ad
Show file tree
Hide file tree
Showing 27 changed files with 86 additions and 5 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
## 0.10.13-dev0
## 0.10.13-dev1

### Enhancements

* Updated documentation: Added back support doc types for partitioning, more Python codes in the API page, RAG definition, and use case.

### Fixes

* Ingest error handling to properly raise errors when wrapped


## 0.10.12

Expand All @@ -21,6 +25,7 @@

### Fixes


* Bump unstructured-inference
* Avoid divide-by-zero errors swith `safe_division` (0.5.21)

Expand Down
29 changes: 29 additions & 0 deletions test_unstructured_ingest/unit/test_error.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import traceback

import pytest

from unstructured.ingest.error import (
DestinationConnectionError,
PartitionError,
SourceConnectionError,
)


@pytest.mark.parametrize(
("error_class", "exception_type", "error_message"),
[
(SourceConnectionError, ValueError, "Simulated connection error"),
(DestinationConnectionError, RuntimeError, "Simulated connection error"),
(PartitionError, FileNotFoundError, "Simulated partition error"),
],
)
def test_custom_error_decorator(error_class, exception_type, error_message):
@error_class.wrap
def simulate_error():
raise exception_type(error_message)

with pytest.raises(error_class) as context:
simulate_error()

expected_error_string = error_class.error_string.format(error_message)
assert str(context.value) == expected_error_string
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.13-dev0" # pragma: no cover
__version__ = "0.10.13-dev1" # pragma: no cover
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/airtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from pathlib import Path
from typing import Optional

from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -65,6 +66,7 @@ def _output_filename(self):
output_file = f"{self.file_meta.table_id}.json"
return Path(self.standard_config.output_dir) / self.file_meta.base_id / output_file

@SourceConnectionError.wrap
@requires_dependencies(["pyairtable", "pandas"], extras="airtable")
@BaseIngestDoc.skip_if_file_exists
def get_file(self):
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
FsspecIngestDoc,
SimpleFsspecConfig,
)
from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import StandardConnectorConfig
from unstructured.utils import requires_dependencies

Expand All @@ -19,6 +20,7 @@ class SimpleAzureBlobStorageConfig(SimpleFsspecConfig):
class AzureBlobStorageIngestDoc(FsspecIngestDoc):
registry_name: str = "azure"

@SourceConnectionError.wrap
@requires_dependencies(["adlfs", "fsspec"], extras="azure")
def get_file(self):
super().get_file()
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/biomed.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -129,6 +130,7 @@ def cleanup_file(self):
logger.debug(f"Cleaning up {self}")
Path.unlink(self.filename)

@SourceConnectionError.wrap
@BaseIngestDoc.skip_if_file_exists
def get_file(self):
download_path = self.file_meta.download_filepath # type: ignore
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/box.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
FsspecIngestDoc,
SimpleFsspecConfig,
)
from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import StandardConnectorConfig
from unstructured.utils import requires_dependencies

Expand Down Expand Up @@ -46,6 +47,7 @@ class BoxIngestDoc(FsspecIngestDoc):
config: SimpleBoxConfig
registry_name: str = "box"

@SourceConnectionError.wrap
@requires_dependencies(["boxfs", "fsspec"], extras="box")
def get_file(self):
super().get_file()
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/confluence.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from typing import Optional

from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -100,6 +101,7 @@ def _output_filename(self):
output_file = f"{self.file_meta.document_id}.json"
return Path(self.standard_config.output_dir) / self.file_meta.space_id / output_file

@SourceConnectionError.wrap
@requires_dependencies(["atlassian"], extras="Confluence")
@BaseIngestDoc.skip_if_file_exists
def get_file(self):
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/delta_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import pandas as pd

from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -73,6 +74,7 @@ def _create_full_tmp_dir_path(self):
self.filename.parent.mkdir(parents=True, exist_ok=True)
self._output_filename.parent.mkdir(parents=True, exist_ok=True)

@SourceConnectionError.wrap
@BaseIngestDoc.skip_if_file_exists
@requires_dependencies(["fsspec"], extras="delta-table")
def get_file(self):
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/discord.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from typing import List, Optional

from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -74,6 +75,7 @@ def _output_filename(self):
def _create_full_tmp_dir_path(self):
self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

@SourceConnectionError.wrap
@BaseIngestDoc.skip_if_file_exists
@requires_dependencies(dependencies=["discord"], extras="discord")
def get_file(self):
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/dropbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
FsspecIngestDoc,
SimpleFsspecConfig,
)
from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import StandardConnectorConfig
from unstructured.utils import requires_dependencies

Expand All @@ -35,6 +36,7 @@ class SimpleDropboxConfig(SimpleFsspecConfig):
class DropboxIngestDoc(FsspecIngestDoc):
registry_name: str = "dropbox"

@SourceConnectionError.wrap
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
def get_file(self):
super().get_file()
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path
from typing import Optional

from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -102,6 +103,7 @@ def _concatenate_dict_fields(self, dictionary, seperator="\n"):
concatenated_values = seperator.join(values)
return concatenated_values

@SourceConnectionError.wrap
@requires_dependencies(["elasticsearch", "jq"], extras="elasticsearch")
@BaseIngestDoc.skip_if_file_exists
def get_file(self):
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from typing import Type

from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -101,6 +102,7 @@ def _create_full_tmp_dir_path(self):
"""Includes "directories" in the object path"""
self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

@SourceConnectionError.wrap
@BaseIngestDoc.skip_if_file_exists
def get_file(self):
"""Fetches the file from the current filesystem and stores it locally."""
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/gcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
FsspecIngestDoc,
SimpleFsspecConfig,
)
from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import StandardConnectorConfig
from unstructured.utils import requires_dependencies

Expand All @@ -20,6 +21,7 @@ class GcsIngestDoc(FsspecIngestDoc):
config: SimpleGcsConfig
registry_name: str = "gcs"

@SourceConnectionError.wrap
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
def get_file(self):
super().get_file()
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from typing import Optional

from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -40,6 +41,7 @@ def _create_full_tmp_dir_path(self):
"""includes directories in in the gitlab repository"""
self.filename.parent.mkdir(parents=True, exist_ok=True)

@SourceConnectionError.wrap
@BaseIngestDoc.skip_if_file_exists
def get_file(self):
"""Fetches the "remote" doc and stores it locally on the filesystem."""
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
GitIngestDoc,
SimpleGitConfig,
)
from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.logger import logger
from unstructured.utils import requires_dependencies

Expand Down Expand Up @@ -37,6 +38,7 @@ def __post_init__(self):
# If there's no issues, store the core repository info
self.repo_path = parsed_gh_url.path

@SourceConnectionError.wrap
@requires_dependencies(["github"], extras="github")
def _get_repo(self) -> "Repository":
from github import Github
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/gitlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
GitIngestDoc,
SimpleGitConfig,
)
from unstructured.ingest.error import SourceConnectionError
from unstructured.utils import requires_dependencies

if TYPE_CHECKING:
Expand All @@ -26,6 +27,7 @@ def __post_init__(self):
while self.repo_path.startswith("/"):
self.repo_path = self.repo_path[1:]

@SourceConnectionError.wrap
@requires_dependencies(["gitlab"], extras="gitlab")
def _get_project(self) -> "Project":
from gitlab import Gitlab
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/google_drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -114,6 +115,7 @@ def filename(self):
def _output_filename(self):
return Path(f"{self.file_meta.get('output_filepath')}.json").resolve()

@SourceConnectionError.wrap
@BaseIngestDoc.skip_if_file_exists
@requires_dependencies(["googleapiclient"], extras="google-drive")
def get_file(self):
Expand Down
3 changes: 3 additions & 0 deletions unstructured/ingest/connector/onedrive.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import TYPE_CHECKING, List, Optional

from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -38,6 +39,7 @@ def __post_init__(self):
)
self.token_factory = self._acquire_token

@SourceConnectionError.wrap
@requires_dependencies(["msal"])
def _acquire_token(self):
from msal import ConfidentialClientApplication
Expand Down Expand Up @@ -103,6 +105,7 @@ def filename(self):
def _output_filename(self):
return Path(self.output_filepath).resolve()

@SourceConnectionError.wrap
@BaseIngestDoc.skip_if_file_exists
@requires_dependencies(["office365"], extras="onedrive")
def get_file(self):
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/outlook.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pathlib import Path
from typing import List, Optional

from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -109,6 +110,7 @@ def filename(self):
def _output_filename(self):
return Path(self.output_filepath).resolve()

@SourceConnectionError.wrap
@BaseIngestDoc.skip_if_file_exists
@requires_dependencies(["office365"], extras="outlook")
def get_file(self):
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/reddit.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from pathlib import Path
from typing import TYPE_CHECKING, Optional

from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -49,6 +50,7 @@ def _output_filename(self):
def _create_full_tmp_dir_path(self):
self.filename.parent.mkdir(parents=True, exist_ok=True)

@SourceConnectionError.wrap
@BaseIngestDoc.skip_if_file_exists
def get_file(self):
"""Fetches the "remote" doc and stores it locally on the filesystem."""
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
FsspecIngestDoc,
SimpleFsspecConfig,
)
from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import StandardConnectorConfig
from unstructured.utils import requires_dependencies

Expand All @@ -20,6 +21,7 @@ class S3IngestDoc(FsspecIngestDoc):
remote_file_path: str
registry_name: str = "s3"

@SourceConnectionError.wrap
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
def get_file(self):
super().get_file()
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/s3_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path
from typing import Type

from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces2 import (
BaseConnectorConfig,
BaseDestinationConnector,
Expand Down Expand Up @@ -104,6 +105,7 @@ def _create_full_tmp_dir_path(self):
"""Includes "directories" in the object path"""
self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

@SourceConnectionError.wrap
@BaseIngestDoc.skip_if_file_exists
def get_file(self):
"""Fetches the file from the current filesystem and stores it locally."""
Expand Down
2 changes: 2 additions & 0 deletions unstructured/ingest/connector/sharepoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from urllib.parse import urlparse

from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
Expand Down Expand Up @@ -123,6 +124,7 @@ def _get_page(self):
return
logger.info(f"File downloaded: {self.filename}")

@SourceConnectionError.wrap
@requires_dependencies(["office365"], extras="sharepoint")
def _get_file(self):
from office365.runtime.auth.client_credential import ClientCredential
Expand Down
Loading

0 comments on commit 92692ad

Please sign in to comment.