Skip to content

Commit

Permalink
Chunking support for SharePoint Connector (#1548)
Browse files Browse the repository at this point in the history
### Description
Optionally adds in chunking to the CLI which adds a flag to trigger
chunking and exposes the parameters used by the `chunk_by_title` method.
Runs chunking before the embedding step.


Opened to replace original PR
#1531
  • Loading branch information
rbiseck3 authored Sep 27, 2023
1 parent b283962 commit 9836235
Show file tree
Hide file tree
Showing 13 changed files with 137 additions and 10 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.10.17-dev13
## 0.10.17-dev14

### Enhancements

Expand All @@ -10,6 +10,7 @@
* **PPTX partitioner refactored in preparation for enhancement.** Behavior should be unchanged except that shapes enclosed in a group-shape are now included, as many levels deep as required (a group-shape can itself contain a group-shape).
* **Embeddings support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally create embeddings from the elements it pulls out during partition and upload those embeddings to Azure Cognitive Search index.
* **Improves hierarchy from docx files by leveraging natural hierarchies built into docx documents** Hierarchy can now be detected from an indentation level for list bullets/numbers and by style name (e.g. Heading 1, List Bullet 2, List Number).
* **Chunking support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally chunk the elements pulled out during partition via the chunking unstructured brick. This can be used as a stage before creating embeddings.

### Features

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@
}
]
},
{
"name": "languages",
"type": "Collection(Edm.String)"
},
{
"name": "page_number",
"type": "Edm.String"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--path "Shared Documents" \
--recursive \
--embedding-api-key "$OPENAI_API_KEY" \
--chunk-elements \
--chunk-multipage-sections \
azure-cognitive-search \
--key "$AZURE_SEARCH_API_KEY" \
--endpoint "$AZURE_SEARCH_ENDPOINT" \
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.17-dev13" # pragma: no cover
__version__ = "0.10.17-dev14" # pragma: no cover
4 changes: 2 additions & 2 deletions unstructured/embed/openai.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import types
from typing import List, Optional
from typing import List

import numpy as np

Expand All @@ -12,7 +12,7 @@


class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
def __init__(self, api_key: str, model_name: Optional[str] = "text-embedding-ada-002"):
def __init__(self, api_key: str, model_name: str = "text-embedding-ada-002"):
self.api_key = api_key
self.model_name = model_name
self.initialize()
Expand Down
3 changes: 3 additions & 0 deletions unstructured/ingest/cli/cmds/azure_cognitive_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
log_options,
)
from unstructured.ingest.cli.interfaces import (
CliChunkingConfig,
CliEmbeddingsConfig,
CliMixin,
CliPartitionConfig,
Expand Down Expand Up @@ -74,6 +75,7 @@ def azure_cognitive_search_dest(ctx: click.Context, **options):
read_config = CliReadConfig.from_dict(parent_options)
partition_config = CliPartitionConfig.from_dict(parent_options)
embedding_config = CliEmbeddingsConfig.from_dict(parent_options)
chunking_config = CliChunkingConfig.from_dict(parent_options)
# Run for schema validation
AzureCognitiveSearchCliWriteConfig.from_dict(options)
runner = runner_map[source_cmd]
Expand All @@ -93,6 +95,7 @@ def azure_cognitive_search_dest(ctx: click.Context, **options):
writer_type="azure_cognitive_search",
writer_kwargs=options,
embedding_config=embedding_config,
chunking_config=chunking_config,
)
runner_instance.run(
**parent_options,
Expand Down
4 changes: 4 additions & 0 deletions unstructured/ingest/cli/cmds/sharepoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
log_options,
)
from unstructured.ingest.cli.interfaces import (
CliChunkingConfig,
CliEmbeddingsConfig,
CliMixin,
CliPartitionConfig,
Expand Down Expand Up @@ -86,13 +87,15 @@ def sharepoint_source(ctx: click.Context, **options):
read_config = CliReadConfig.from_dict(options)
partition_config = CliPartitionConfig.from_dict(options)
embedding_config = CliEmbeddingsConfig.from_dict(options)
chunking_config = CliChunkingConfig.from_dict(options)
# Run for schema validation
SharepointCliConfig.from_dict(options)
sharepoint_runner = SharePoint(
read_config=read_config,
partition_config=partition_config,
verbose=verbose,
embedding_config=embedding_config,
chunking_config=chunking_config,
)
sharepoint_runner.run(**options)
except Exception as e:
Expand All @@ -109,5 +112,6 @@ def get_source_cmd() -> click.Group:
CliReadConfig.add_cli_options(cmd)
CliPartitionConfig.add_cli_options(cmd)
CliEmbeddingsConfig.add_cli_options(cmd)
CliChunkingConfig.add_cli_options(cmd)
cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False))
return cmd
68 changes: 66 additions & 2 deletions unstructured/ingest/cli/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,13 @@
from dataclasses_json.core import Json, _decode_dataclass

from unstructured.ingest.cli.cmds.utils import DelimitedString
from unstructured.ingest.interfaces import BaseConfig, EmbeddingConfig, PartitionConfig, ReadConfig
from unstructured.ingest.interfaces import (
BaseConfig,
ChunkingConfig,
EmbeddingConfig,
PartitionConfig,
ReadConfig,
)


class CliMixin:
Expand Down Expand Up @@ -212,7 +218,7 @@ def from_dict(
):
"""
Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
This allows CLI arguments to be prepended with embedding_ during CLI invocation but
This allows CLI arguments to be prepended with embedding_ during CLI invocation but
doesn't require that as part of the field names in this class
"""
if isinstance(kvs, dict):
Expand All @@ -225,3 +231,61 @@ def from_dict(
return None
return _decode_dataclass(cls, new_kvs, infer_missing)
return _decode_dataclass(cls, kvs, infer_missing)


class CliChunkingConfig(ChunkingConfig, CliMixin):
    """CLI adapter for ChunkingConfig: registers the --chunk-* options and maps
    the resulting dict keys back onto ChunkingConfig field names."""

    @staticmethod
    def add_cli_options(cmd: click.Command) -> None:
        """Attach the chunking-related options to *cmd*.

        All options carry the "chunk-" prefix to avoid collisions with other
        CLI params; from_dict() below strips that prefix again.
        """
        options = [
            click.Option(
                ["--chunk-elements"],
                is_flag=True,
                default=False,
            ),
            click.Option(
                ["--chunk-multipage-sections"],
                is_flag=True,
                default=False,
            ),
            click.Option(
                ["--chunk-combine-under-n-chars"],
                type=int,
                default=500,
                show_default=True,
            ),
            click.Option(
                ["--chunk-new-after-n-chars"],
                type=int,
                default=1500,
                show_default=True,
            ),
        ]
        cmd.params.extend(options)

    @classmethod
    def from_dict(
        cls,
        kvs: Json,
        *,
        infer_missing=False,
    ):
        """
        Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
        This allows CLI arguments to be prepended with chunk_ during CLI invocation but
        doesn't require that as part of the field names in this class
        """
        if isinstance(kvs, dict):
            new_kvs = {}
            # chunk_elements is itself a ChunkingConfig field name, so keep it
            # whole rather than stripping its prefix (which would produce the
            # non-existent field "elements").
            if "chunk_elements" in kvs:
                new_kvs["chunk_elements"] = kvs.pop("chunk_elements")
            # BUG FIX: the options registered above yield keys prefixed
            # "chunk_" (e.g. chunk_multipage_sections), not "chunking_".
            # Filtering on the wrong prefix silently dropped every option
            # except chunk_elements, so user-supplied values never reached
            # ChunkingConfig and its dataclass defaults were always used.
            new_kvs.update(
                {
                    k[len("chunk_") :]: v  # noqa: E203
                    for k, v in kvs.items()
                    if k.startswith("chunk_")
                },
            )
            # No chunk_* keys at all means chunking was not requested.
            if len(new_kvs.keys()) == 0:
                return None
            return _decode_dataclass(cls, new_kvs, infer_missing)
        return _decode_dataclass(cls, kvs, infer_missing)
17 changes: 17 additions & 0 deletions unstructured/ingest/connector/sharepoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
from pathlib import Path
from urllib.parse import urlparse

from unstructured.documents.elements import Element
from unstructured.embed.interfaces import BaseEmbeddingEncoder
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import (
BaseConnectorConfig,
BaseIngestDoc,
BaseSourceConnector,
ChunkingConfig,
EmbeddingConfig,
IngestDocCleanupMixin,
SourceConnectorCleanupMixin,
Expand Down Expand Up @@ -69,6 +71,19 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
file_path: str
registry_name: str = "sharepoint"
embedding_config: t.Optional[EmbeddingConfig] = None
chunking_config: t.Optional[ChunkingConfig] = None

def run_chunking(self, elements: t.List[Element]) -> t.List[Element]:
    """Split *elements* via the configured chunker; no-op when chunking is not configured."""
    config = self.chunking_config
    if not config:
        # No chunking requested for this ingest doc — pass elements through.
        return elements
    logger.info(
        "Running chunking to split up elements with config: "
        f"{config.to_dict()}",
    )
    chunked = config.chunk(elements=elements)
    logger.info(f"chunked {len(elements)} elements into {len(chunked)}")
    return chunked

@property
def embedder(self) -> t.Optional[BaseEmbeddingEncoder]:
Expand Down Expand Up @@ -244,6 +259,7 @@ def get_file(self):
class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
connector_config: SimpleSharepointConfig
embedding_config: t.Optional[EmbeddingConfig] = None
chunking_config: t.Optional[ChunkingConfig] = None

@requires_dependencies(["office365"], extras="sharepoint")
def _list_files(self, folder, recursive) -> t.List["File"]:
Expand Down Expand Up @@ -283,6 +299,7 @@ def _prepare_ingest_doc(self, obj: t.Union["File", "SitePage"], base_url, is_pag
is_page=is_page,
file_path=file_path,
embedding_config=self.embedding_config,
chunking_config=self.chunking_config,
)

@requires_dependencies(["office365"], extras="sharepoint")
Expand Down
5 changes: 3 additions & 2 deletions unstructured/ingest/doc_processor/generalized.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ def process_document(ingest_doc_json: str, **partition_kwargs) -> Optional[List[
doc.write_result()
except Exception:
# TODO(crag) save the exception instead of print?
logger.error(f"Failed to process {doc}", exc_info=True)
logger.error(f"Failed to process {doc}")
raise Exception
finally:
if doc:
doc.cleanup_file()
return isd_elems_no_filename
return isd_elems_no_filename
26 changes: 25 additions & 1 deletion unstructured/ingest/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
import requests
from dataclasses_json import DataClassJsonMixin

from unstructured.documents.elements import DataSourceMetadata
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata, Element
from unstructured.embed.interfaces import BaseEmbeddingEncoder
from unstructured.embed.openai import OpenAIEmbeddingEncoder
from unstructured.ingest.error import PartitionError, SourceConnectionError
Expand Down Expand Up @@ -78,6 +79,25 @@ def get_embedder(self) -> BaseEmbeddingEncoder:
return OpenAIEmbeddingEncoder(**kwargs)


@dataclass
class ChunkingConfig(BaseConfig):
    """Parameters for the optional chunk_by_title pass run before embedding."""

    # Master switch: when False, chunk() passes elements through untouched.
    chunk_elements: bool = False
    multipage_sections: bool = True
    combine_under_n_chars: int = 500
    new_after_n_chars: int = 1500

    def chunk(self, elements: t.List[Element]) -> t.List[Element]:
        """Return *elements* chunked by title when enabled, otherwise unchanged."""
        if not self.chunk_elements:
            return elements
        return chunk_by_title(
            elements=elements,
            multipage_sections=self.multipage_sections,
            combine_under_n_chars=self.combine_under_n_chars,
            new_after_n_chars=self.new_after_n_chars,
        )


@dataclass
class WriteConfig(BaseConfig):
pass
Expand Down Expand Up @@ -115,6 +135,9 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._date_processed = None

def run_chunking(self, elements: t.List[Element]) -> t.List[Element]:
    """No-op chunking hook; connector subclasses (e.g. SharePoint) override this
    to split elements with their chunking config."""
    return elements

@property
def embedder(self) -> t.Optional[BaseEmbeddingEncoder]:
return None
Expand Down Expand Up @@ -263,6 +286,7 @@ def partition_file(self, **partition_kwargs) -> t.List[t.Dict[str, t.Any]]:
if response.status_code != 200:
raise RuntimeError(f"Caught {response.status_code} from API: {response.text}")
elements = elements_from_json(text=json.dumps(response.json()))
elements = self.run_chunking(elements=elements)
if self.embedder:
logger.info("Running embedder to add vector content to elements")
elements = self.embedder.embed_documents(elements)
Expand Down
8 changes: 7 additions & 1 deletion unstructured/ingest/runner/base_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass

from unstructured.ingest.interfaces import EmbeddingConfig, PartitionConfig, ReadConfig
from unstructured.ingest.interfaces import (
ChunkingConfig,
EmbeddingConfig,
PartitionConfig,
ReadConfig,
)


@dataclass
Expand All @@ -13,6 +18,7 @@ class Runner(ABC):
writer_type: t.Optional[str] = None
writer_kwargs: t.Optional[dict] = None
embedding_config: t.Optional[EmbeddingConfig] = None
chunking_config: t.Optional[ChunkingConfig] = None

@abstractmethod
def run(self, *args, **kwargs):
Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/runner/sharepoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def run(
read_config=self.read_config,
partition_config=self.partition_config,
embedding_config=self.embedding_config,
chunking_config=self.chunking_config,
)

dest_doc_connector = None
Expand Down

0 comments on commit 9836235

Please sign in to comment.