diff --git a/CHANGELOG.md b/CHANGELOG.md
index 330070d7f7..65a462ca81 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.10.17-dev13
+## 0.10.17-dev14
 
 ### Enhancements
 
@@ -10,6 +10,7 @@
 * **PPTX partitioner refactored in preparation for enhancement.** Behavior should be unchanged except that shapes enclosed in a group-shape are now included, as many levels deep as required (a group-shape can itself contain a group-shape).
 * **Embeddings support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally create embeddings from the elements it pulls out during partition and upload those embeddings to Azure Cognitive Search index.
 * **Improves hierarchy from docx files by leveraging natural hierarchies built into docx documents** Hierarchy can now be detected from an indentation level for list bullets/numbers and by style name (e.g. Heading 1, List Bullet 2, List Number).
+* **Chunking support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally chunk the elements it pulls out during partition via the unstructured chunking brick. This can be used as a stage before creating embeddings.
 
 ### Features
 
diff --git a/test_unstructured_ingest/files/azure_cognitive_index_schema.json b/test_unstructured_ingest/files/azure_cognitive_index_schema.json
index 2abdc7b1d4..d77fd8da32 100644
--- a/test_unstructured_ingest/files/azure_cognitive_index_schema.json
+++ b/test_unstructured_ingest/files/azure_cognitive_index_schema.json
@@ -109,6 +109,10 @@
         }
       ]
     },
+    {
+      "name": "languages",
+      "type": "Collection(Edm.String)"
+    },
     {
       "name": "page_number",
       "type": "Edm.String"
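For reviewers who haven't used the chunking brick the new changelog entry refers to: `chunk_by_title` folds partitioned elements into title-delimited sections. A minimal sketch of the stage this PR adds to the SharePoint pipeline, with made-up sample elements (a real run gets them from partitioning):

```python
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import NarrativeText, Title

# Hypothetical partition output, invented for this sketch.
elements = [
    Title("Quarterly Report"),
    NarrativeText("Revenue grew 12% quarter over quarter."),
    Title("Outlook"),
    NarrativeText("Guidance for Q4 remains unchanged."),
]

# Same knobs the new --chunk-* CLI flags control (defaults shown).
chunks = chunk_by_title(
    elements,
    multipage_sections=True,
    combine_under_n_chars=500,
    new_after_n_chars=1500,
)
# Sections this small fall under combine_under_n_chars, so they should
# merge into a single composite chunk.
print(len(chunks), [type(c).__name__ for c in chunks])
```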
diff --git a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh
index af9d8f33ae..5ea8b9b416 100755
--- a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh
+++ b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh
@@ -77,6 +77,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
   --path "Shared Documents" \
   --recursive \
   --embedding-api-key "$OPENAI_API_KEY" \
+  --chunk-elements \
+  --chunk-multipage-sections \
   azure-cognitive-search \
   --key "$AZURE_SEARCH_API_KEY" \
   --endpoint "$AZURE_SEARCH_ENDPOINT" \
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 7490d266e7..f87f0c2764 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.17-dev13"  # pragma: no cover
+__version__ = "0.10.17-dev14"  # pragma: no cover
diff --git a/unstructured/embed/openai.py b/unstructured/embed/openai.py
index dd5a360970..b79763f8ec 100644
--- a/unstructured/embed/openai.py
+++ b/unstructured/embed/openai.py
@@ -1,5 +1,5 @@
 import types
-from typing import List, Optional
+from typing import List
 
 import numpy as np
 
@@ -12,7 +12,7 @@
 
 
 class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
-    def __init__(self, api_key: str, model_name: Optional[str] = "text-embedding-ada-002"):
+    def __init__(self, api_key: str, model_name: str = "text-embedding-ada-002"):
         self.api_key = api_key
         self.model_name = model_name
         self.initialize()
diff --git a/unstructured/ingest/cli/cmds/azure_cognitive_search.py b/unstructured/ingest/cli/cmds/azure_cognitive_search.py
index 22eded4373..241a66b2ba 100644
--- a/unstructured/ingest/cli/cmds/azure_cognitive_search.py
+++ b/unstructured/ingest/cli/cmds/azure_cognitive_search.py
@@ -9,6 +9,7 @@
     log_options,
 )
 from unstructured.ingest.cli.interfaces import (
+    CliChunkingConfig,
     CliEmbeddingsConfig,
     CliMixin,
     CliPartitionConfig,
@@ -74,6 +75,7 @@ def azure_cognitive_search_dest(ctx: click.Context, **options):
         read_config = CliReadConfig.from_dict(parent_options)
         partition_config = CliPartitionConfig.from_dict(parent_options)
         embedding_config = CliEmbeddingsConfig.from_dict(parent_options)
+        chunking_config = CliChunkingConfig.from_dict(parent_options)
         # Run for schema validation
         AzureCognitiveSearchCliWriteConfig.from_dict(options)
         runner = runner_map[source_cmd]
@@ -93,6 +95,7 @@ def azure_cognitive_search_dest(ctx: click.Context, **options):
             writer_type="azure_cognitive_search",
             writer_kwargs=options,
             embedding_config=embedding_config,
+            chunking_config=chunking_config,
         )
         runner_instance.run(
             **parent_options,
diff --git a/unstructured/ingest/cli/cmds/sharepoint.py b/unstructured/ingest/cli/cmds/sharepoint.py
index 2457f474c8..5027fe3a80 100644
--- a/unstructured/ingest/cli/cmds/sharepoint.py
+++ b/unstructured/ingest/cli/cmds/sharepoint.py
@@ -9,6 +9,7 @@
     log_options,
 )
 from unstructured.ingest.cli.interfaces import (
+    CliChunkingConfig,
     CliEmbeddingsConfig,
     CliMixin,
     CliPartitionConfig,
@@ -86,6 +87,7 @@ def sharepoint_source(ctx: click.Context, **options):
         read_config = CliReadConfig.from_dict(options)
         partition_config = CliPartitionConfig.from_dict(options)
         embedding_config = CliEmbeddingsConfig.from_dict(options)
+        chunking_config = CliChunkingConfig.from_dict(options)
         # Run for schema validation
         SharepointCliConfig.from_dict(options)
         sharepoint_runner = SharePoint(
@@ -93,6 +95,7 @@ def sharepoint_source(ctx: click.Context, **options):
             partition_config=partition_config,
             verbose=verbose,
             embedding_config=embedding_config,
+            chunking_config=chunking_config,
         )
         sharepoint_runner.run(**options)
     except Exception as e:
@@ -109,5 +112,6 @@ def get_source_cmd() -> click.Group:
     CliReadConfig.add_cli_options(cmd)
     CliPartitionConfig.add_cli_options(cmd)
     CliEmbeddingsConfig.add_cli_options(cmd)
+    CliChunkingConfig.add_cli_options(cmd)
     cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False))
     return cmd
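Both commands hand the full click options dict to `CliChunkingConfig.from_dict`, which keeps only the chunking keys and strips their prefix. A standalone replica of that key munging, so the mapping is visible without running the CLI (the helper name and sample dict are invented for illustration):

```python
# Illustrative stand-in for CliChunkingConfig.from_dict's key handling.
def strip_chunk_prefix(options: dict) -> dict:
    options = dict(options)  # work on a copy in this sketch
    picked = {}
    if "chunk_elements" in options:
        # The config field is literally named chunk_elements, so keep it whole.
        picked["chunk_elements"] = options.pop("chunk_elements")
    # Every other chunk_-prefixed option maps to a field without the prefix.
    picked.update(
        {k[len("chunk_"):]: v for k, v in options.items() if k.startswith("chunk_")},
    )
    return picked

# click lowercases option names and swaps dashes for underscores, so the
# flags added by this PR arrive roughly like this:
cli_options = {
    "path": "Shared Documents",  # non-chunking option, filtered out
    "chunk_elements": True,
    "chunk_multipage_sections": True,
    "chunk_combine_under_n_chars": 500,
    "chunk_new_after_n_chars": 1500,
}
print(strip_chunk_prefix(cli_options))
# {'chunk_elements': True, 'multipage_sections': True,
#  'combine_under_n_chars': 500, 'new_after_n_chars': 1500}
```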
diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py
index 2190744b5b..7ec4660a6f 100644
--- a/unstructured/ingest/cli/interfaces.py
+++ b/unstructured/ingest/cli/interfaces.py
@@ -4,7 +4,13 @@
 from dataclasses_json.core import Json, _decode_dataclass
 
 from unstructured.ingest.cli.cmds.utils import DelimitedString
-from unstructured.ingest.interfaces import BaseConfig, EmbeddingConfig, PartitionConfig, ReadConfig
+from unstructured.ingest.interfaces import (
+    BaseConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    PartitionConfig,
+    ReadConfig,
+)
 
 
 class CliMixin:
@@ -225,3 +231,61 @@ def from_dict(
                 return None
             return _decode_dataclass(cls, new_kvs, infer_missing)
         return _decode_dataclass(cls, kvs, infer_missing)
+
+
+class CliChunkingConfig(ChunkingConfig, CliMixin):
+    @staticmethod
+    def add_cli_options(cmd: click.Command) -> None:
+        options = [
+            click.Option(
+                ["--chunk-elements"],
+                is_flag=True,
+                default=False,
+            ),
+            click.Option(
+                ["--chunk-multipage-sections"],
+                is_flag=True,
+                default=False,
+            ),
+            click.Option(
+                ["--chunk-combine-under-n-chars"],
+                type=int,
+                default=500,
+                show_default=True,
+            ),
+            click.Option(
+                ["--chunk-new-after-n-chars"],
+                type=int,
+                default=1500,
+                show_default=True,
+            ),
+        ]
+        cmd.params.extend(options)
+
+    @classmethod
+    def from_dict(
+        cls,
+        kvs: Json,
+        *,
+        infer_missing=False,
+    ):
+        """
+        Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
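`add_cli_options` attaches the new flags by extending `cmd.params` after the command object already exists, the same pattern `get_source_cmd` uses for `--verbose`. A self-contained sketch of that pattern with a toy command (the command and its options here are hypothetical, not part of the PR):

```python
import click
from click.testing import CliRunner

@click.command()
def ingest(**options):
    # Echo the parsed options so the wiring is visible.
    click.echo(options)

# Attach options after the fact, as CliChunkingConfig.add_cli_options does.
ingest.params.extend(
    [
        click.Option(["--chunk-elements"], is_flag=True, default=False),
        click.Option(["--chunk-new-after-n-chars"], type=int, default=1500, show_default=True),
    ],
)

result = CliRunner().invoke(ingest, ["--chunk-elements"])
print(result.output)
# e.g. {'chunk_elements': True, 'chunk_new_after_n_chars': 1500}
```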
+        This allows CLI arguments to be prepended with chunk_ during CLI invocation but
+        doesn't require that as part of the field names in this class
+        """
+        if isinstance(kvs, dict):
+            new_kvs = {}
+            if "chunk_elements" in kvs:
+                new_kvs["chunk_elements"] = kvs.pop("chunk_elements")
+            new_kvs.update(
+                {
+                    k[len("chunk_") :]: v  # noqa: E203
+                    for k, v in kvs.items()
+                    if k.startswith("chunk_")
+                },
+            )
+            if len(new_kvs.keys()) == 0:
+                return None
+            return _decode_dataclass(cls, new_kvs, infer_missing)
+        return _decode_dataclass(cls, kvs, infer_missing)
diff --git a/unstructured/ingest/connector/sharepoint.py b/unstructured/ingest/connector/sharepoint.py
index 0dacea83d4..9fdcf87c9e 100644
--- a/unstructured/ingest/connector/sharepoint.py
+++ b/unstructured/ingest/connector/sharepoint.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from urllib.parse import urlparse
 
+from unstructured.documents.elements import Element
 from unstructured.embed.interfaces import BaseEmbeddingEncoder
 from unstructured.file_utils.filetype import EXT_TO_FILETYPE
 from unstructured.ingest.error import SourceConnectionError
@@ -12,6 +13,7 @@
     BaseConnectorConfig,
     BaseIngestDoc,
     BaseSourceConnector,
+    ChunkingConfig,
     EmbeddingConfig,
     IngestDocCleanupMixin,
     SourceConnectorCleanupMixin,
@@ -69,6 +71,19 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
     file_path: str
     registry_name: str = "sharepoint"
     embedding_config: t.Optional[EmbeddingConfig] = None
+    chunking_config: t.Optional[ChunkingConfig] = None
+
+    def run_chunking(self, elements: t.List[Element]) -> t.List[Element]:
+        if self.chunking_config:
+            logger.info(
+                "Running chunking to split up elements with config: "
+                f"{self.chunking_config.to_dict()}",
+            )
+            chunked_elements = self.chunking_config.chunk(elements=elements)
+            logger.info(f"chunked {len(elements)} elements into {len(chunked_elements)}")
+            return chunked_elements
+        else:
+            return elements
 
     @property
     def embedder(self) -> t.Optional[BaseEmbeddingEncoder]:
@@ -244,6 +259,7 @@ def get_file(self):
 class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
     connector_config: SimpleSharepointConfig
     embedding_config: t.Optional[EmbeddingConfig] = None
+    chunking_config: t.Optional[ChunkingConfig] = None
 
     @requires_dependencies(["office365"], extras="sharepoint")
     def _list_files(self, folder, recursive) -> t.List["File"]:
@@ -283,6 +299,7 @@ def _prepare_ingest_doc(self, obj: t.Union["File", "SitePage"], base_url, is_pag
             is_page=is_page,
             file_path=file_path,
             embedding_config=self.embedding_config,
+            chunking_config=self.chunking_config,
         )
 
     @requires_dependencies(["office365"], extras="sharepoint")
diff --git a/unstructured/ingest/doc_processor/generalized.py b/unstructured/ingest/doc_processor/generalized.py
index 849b53853c..f44b2fa8f4 100644
--- a/unstructured/ingest/doc_processor/generalized.py
+++ b/unstructured/ingest/doc_processor/generalized.py
@@ -62,8 +62,9 @@ def process_document(ingest_doc_json: str, **partition_kwargs) -> Optional[List[
         doc.write_result()
     except Exception:
         # TODO(crag) save the exception instead of print?
-        logger.error(f"Failed to process {doc}", exc_info=True)
+        logger.error(f"Failed to process {doc}")
+        raise
     finally:
         if doc:
             doc.cleanup_file()
-    return isd_elems_no_filename
\ No newline at end of file
+    return isd_elems_no_filename
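The `generalized.py` hunk stops swallowing per-document failures: the error is still logged, then re-raised with a bare `raise` so the original exception and traceback reach the process pool. A toy reproduction of why the bare form matters (names invented):

```python
import logging

logger = logging.getLogger(__name__)

def process_document() -> None:
    try:
        raise ValueError("partitioning blew up")  # stand-in for a real failure
    except Exception:
        logger.error("Failed to process doc")
        raise  # re-raises the original ValueError, traceback intact

try:
    process_document()
except ValueError as exc:
    # The caller still sees the original exception type and message;
    # `raise Exception` here would have surfaced a bare, message-less Exception.
    print(type(exc).__name__, exc)  # ValueError partitioning blew up
```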
- logger.error(f"Failed to process {doc}", exc_info=True) + logger.error(f"Failed to process {doc}") + raise Exception finally: if doc: doc.cleanup_file() - return isd_elems_no_filename + return isd_elems_no_filename diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index c708938bfd..c76fdfb783 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -13,7 +13,8 @@ import requests from dataclasses_json import DataClassJsonMixin -from unstructured.documents.elements import DataSourceMetadata +from unstructured.chunking.title import chunk_by_title +from unstructured.documents.elements import DataSourceMetadata, Element from unstructured.embed.interfaces import BaseEmbeddingEncoder from unstructured.embed.openai import OpenAIEmbeddingEncoder from unstructured.ingest.error import PartitionError, SourceConnectionError @@ -78,6 +79,25 @@ def get_embedder(self) -> BaseEmbeddingEncoder: return OpenAIEmbeddingEncoder(**kwargs) +@dataclass +class ChunkingConfig(BaseConfig): + chunk_elements: bool = False + multipage_sections: bool = True + combine_under_n_chars: int = 500 + new_after_n_chars: int = 1500 + + def chunk(self, elements: t.List[Element]) -> t.List[Element]: + if self.chunk_elements: + return chunk_by_title( + elements=elements, + multipage_sections=self.multipage_sections, + combine_under_n_chars=self.combine_under_n_chars, + new_after_n_chars=self.new_after_n_chars, + ) + else: + return elements + + @dataclass class WriteConfig(BaseConfig): pass @@ -115,6 +135,9 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._date_processed = None + def run_chunking(self, elements: t.List[Element]) -> t.List[Element]: + return elements + @property def embedder(self) -> t.Optional[BaseEmbeddingEncoder]: return None @@ -263,6 +286,7 @@ def partition_file(self, **partition_kwargs) -> t.List[t.Dict[str, t.Any]]: if response.status_code != 200: raise RuntimeError(f"Caught {response.status_code} from API: {response.text}") elements = elements_from_json(text=json.dumps(response.json())) + elements = self.run_chunking(elements=elements) if self.embedder: logger.info("Running embedder to add vector content to elements") elements = self.embedder.embed_documents(elements) diff --git a/unstructured/ingest/runner/base_runner.py b/unstructured/ingest/runner/base_runner.py index 772e282f0d..c12bdce1e0 100644 --- a/unstructured/ingest/runner/base_runner.py +++ b/unstructured/ingest/runner/base_runner.py @@ -2,7 +2,12 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from unstructured.ingest.interfaces import EmbeddingConfig, PartitionConfig, ReadConfig +from unstructured.ingest.interfaces import ( + ChunkingConfig, + EmbeddingConfig, + PartitionConfig, + ReadConfig, +) @dataclass @@ -13,6 +18,7 @@ class Runner(ABC): writer_type: t.Optional[str] = None writer_kwargs: t.Optional[dict] = None embedding_config: t.Optional[EmbeddingConfig] = None + chunking_config: t.Optional[ChunkingConfig] = None @abstractmethod def run(self, *args, **kwargs): diff --git a/unstructured/ingest/runner/sharepoint.py b/unstructured/ingest/runner/sharepoint.py index a20e64bdf8..d5ab2ec940 100644 --- a/unstructured/ingest/runner/sharepoint.py +++ b/unstructured/ingest/runner/sharepoint.py @@ -51,6 +51,7 @@ def run( read_config=self.read_config, partition_config=self.partition_config, embedding_config=self.embedding_config, + chunking_config=self.chunking_config, ) dest_doc_connector = None