Roman/azure cognitive embeddings (#1524)
### Description
This PR is twofold:

**Embeddings:**
* Embeddings are incorporated into the SharePoint source connector, which
will now call out to OpenAI and create embeddings if the flag is passed
in and an API key is provided. A sketch of that flow follows below.
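
A minimal sketch of the connector-side flow just described, assuming langchain's `OpenAIEmbeddings` wrapper; the flag and attribute names here are illustrative, not the connector's actual API:

```python
from langchain.embeddings import OpenAIEmbeddings


def maybe_embed(elements, embedding_api_key=None):
    """Attach OpenAI embeddings to partitioned elements when a key is given."""
    if not embedding_api_key:
        return elements  # no flag/key: pass elements through unchanged
    embedder = OpenAIEmbeddings(openai_api_key=embedding_api_key)
    vectors = embedder.embed_documents([el.text for el in elements])
    for el, vector in zip(elements, vectors):
        el.embeddings = vector  # serialized later by the patched to_dict()
    return elements
```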

**Writing vector content (embeddings) to Azure cognitive search index:**
* The schema for the index expected to exist in Azure has been updated
to include the vector field type, and a test script has been added that
takes the new content produced by the SharePoint connector and pushes
the embedding content to the index. A sketch of such a write follows below.
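
A hypothetical shape of that destination write, using the `azure-search-documents` SDK; the index name, key field, and document shape are assumptions, and the connector's real code may differ:

```python
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

client = SearchClient(
    endpoint="https://<service>.search.windows.net",
    index_name="<index-name>",
    credential=AzureKeyCredential("<api-key>"),
)
client.upload_documents(
    documents=[
        {
            "id": "element-0001",  # key field name assumed
            "text": "Element text pulled from SharePoint",
            "embeddings": [0.0] * 1536,  # placeholder 1536-dim vector
        }
    ]
)
```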

Some important notes about other changes in here:
* The embedding code had to be updated to patch the `to_dict` method on
elements so that `embeddings` is included in the dict output when present.
While the code originally attached the embedding content to each element,
it was lost when `to_dict` was called to save the content as JSON. A sketch
of such a patch follows below.
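
A minimal sketch of that kind of patch, assuming an element whose `to_dict()` returns a plain dict; the PR's actual patching code may differ:

```python
from functools import wraps


def patch_to_dict(element):
    """Wrap element.to_dict so embeddings survive JSON serialization."""
    original_to_dict = element.to_dict

    @wraps(original_to_dict)
    def to_dict_with_embeddings(*args, **kwargs):
        data = original_to_dict(*args, **kwargs)
        if getattr(element, "embeddings", None) is not None:
            data["embeddings"] = element.embeddings
        return data

    element.to_dict = to_dict_with_embeddings
    return element
```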
rbiseck3 authored Sep 26, 2023
1 parent d8a36af commit 5c7b4f5
Showing 31 changed files with 569 additions and 127 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@
* **Improves salesforce partitioning** Partitions Salesforce data as xml instead of text for improved detail and flexibility. Partitions htmlbody instead of textbody for Salesforce emails. Importance: Allows all Salesforce fields to be ingested and gives Salesforce emails more detailed partitioning.
* **Add document level language detection functionality.** Introduces the "auto" default for the languages param, which then detects the languages present in the document using the `langdetect` package. Adds the document languages as ISO 639-3 codes to the element metadata. Implemented only for the partition_text function to start.
* **PPTX partitioner refactored in preparation for enhancement.** Behavior should be unchanged except that shapes enclosed in a group-shape are now included, as many levels deep as required (a group-shape can itself contain a group-shape).
* **Embeddings support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally create embeddings from the elements it pulls out during partitioning and upload those embeddings to an Azure Cognitive Search index.

### Features

17 changes: 16 additions & 1 deletion docs/requirements.txt
@@ -24,6 +24,7 @@ charset-normalizer==3.2.0
# requests
docutils==0.18.1
# via
# myst-parser
# sphinx
# sphinx-rtd-theme
# sphinx-tabs
@@ -38,10 +39,21 @@ imagesize==1.4.1
importlib-metadata==6.8.0
# via sphinx
jinja2==3.1.2
# via sphinx
# via
# myst-parser
# sphinx
markdown-it-py==3.0.0
# via
# mdit-py-plugins
# myst-parser
markupsafe==2.1.3
# via jinja2
mdit-py-plugins==0.4.0
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
myst-parser==2.0.0
# via -r requirements/build.in
packaging==23.1
# via
# -c requirements/base.txt
Expand All @@ -53,6 +65,8 @@ pygments==2.16.1
# sphinx-tabs
pytz==2023.3.post1
# via babel
pyyaml==6.0.1
# via myst-parser
requests==2.31.0
# via
# -c requirements/base.txt
Expand All @@ -67,6 +81,7 @@ sphinx==6.2.1
# via
# -r requirements/build.in
# furo
# myst-parser
# sphinx-basic-ng
# sphinx-rtd-theme
# sphinx-tabs
@@ -16,6 +16,12 @@
"name": "text",
"type": "Edm.String"
},
{
"name": "embeddings",
"type": "Collection(Edm.Single)",
"dimensions": 400,
"vectorSearchConfiguration": "embeddings-config"
},
{
"name": "type",
"type": "Edm.String"
@@ -162,5 +168,19 @@
}
]
}
]
],
"vectorSearch": {
"algorithmConfigurations": [
{
"name": "embeddings-config",
"kind": "hnsw",
"hnswParameters": {
"metric": "cosine",
"m": 4,
"efConstruction": 400,
"efSearch": 500
}
}
]
}
}
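
For reference, `hnsw` selects the Hierarchical Navigable Small World approximate-nearest-neighbor algorithm: `m` bounds the number of graph links per node, while `efConstruction` and `efSearch` trade index-build and query cost for recall. Below is a minimal sketch of pushing a schema file like the one above to a search service over REST, assuming the `2023-07-01-Preview` API version (which understands `vectorSearchConfiguration`) and placeholder environment variables:

```python
import json
import os

import requests


def create_index(schema_path: str) -> None:
    """PUT an index schema to Azure Cognitive Search (an idempotent upsert)."""
    service = os.environ["AZURE_SEARCH_SERVICE"]  # placeholder env vars
    api_key = os.environ["AZURE_SEARCH_API_KEY"]
    with open(schema_path) as f:
        schema = json.load(f)  # assumes a top-level "name" in the schema JSON
    url = (
        f"https://{service}.search.windows.net/indexes/{schema['name']}"
        "?api-version=2023-07-01-Preview"
    )
    response = requests.put(
        url,
        headers={"Content-Type": "application/json", "api-key": api_key},
        data=json.dumps(schema),
    )
    response.raise_for_status()
```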
2 changes: 1 addition & 1 deletion requirements/base.txt
@@ -16,7 +16,7 @@ charset-normalizer==3.2.0
# via requests
click==8.1.7
# via nltk
dataclasses-json==0.6.0
dataclasses-json==0.6.1
# via -r requirements/base.in
emoji==2.8.0
# via -r requirements/base.in
2 changes: 1 addition & 1 deletion requirements/build.txt
@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --config=pyproject.toml requirements/build.in
# pip-compile requirements/build.in
#
alabaster==0.7.13
# via sphinx
2 changes: 2 additions & 0 deletions requirements/constraints.in
@@ -39,3 +39,5 @@ matplotlib==3.7.2
# NOTE(crag) - pin to available pandas for python 3.8 (at least in CI)
fsspec==2023.9.1
pandas<2.0.4
# langchain limits this to 3.1.7
anyio==3.1.7
2 changes: 1 addition & 1 deletion requirements/dev.txt
@@ -360,7 +360,7 @@ tornado==6.3.3
# jupyterlab
# notebook
# terminado
traitlets==5.10.0
traitlets==5.10.1
# via
# comm
# ipykernel
2 changes: 1 addition & 1 deletion requirements/extra-pdf-image.txt
@@ -41,7 +41,7 @@ fsspec==2023.9.1
# via
# -c requirements/constraints.in
# huggingface-hub
huggingface-hub==0.17.2
huggingface-hub==0.17.3
# via
# timm
# transformers
2 changes: 1 addition & 1 deletion requirements/huggingface.txt
@@ -26,7 +26,7 @@ fsspec==2023.9.1
# via
# -c requirements/constraints.in
# huggingface-hub
huggingface-hub==0.17.2
huggingface-hub==0.17.3
# via transformers
idna==3.4
# via
2 changes: 1 addition & 1 deletion requirements/ingest-azure.txt
@@ -23,7 +23,7 @@ azure-datalake-store==0.0.53
# via adlfs
azure-identity==1.14.0
# via adlfs
azure-storage-blob==12.18.1
azure-storage-blob==12.18.2
# via adlfs
certifi==2023.7.22
# via
2 changes: 1 addition & 1 deletion requirements/ingest-delta-table.txt
@@ -4,7 +4,7 @@
#
# pip-compile requirements/ingest-delta-table.in
#
deltalake==0.10.2
deltalake==0.11.0
# via -r requirements/ingest-delta-table.in
fsspec==2023.9.1
# via
20 changes: 10 additions & 10 deletions requirements/ingest-notion.txt
@@ -4,33 +4,33 @@
#
# pip-compile requirements/ingest-notion.in
#
anyio==4.0.0
# via httpcore
certifi==2023.7.22
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# httpcore
# httpx
exceptiongroup==1.1.3
# via anyio
h11==0.14.0
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# httpx
h11==0.12.0
# via httpcore
htmlbuilder==1.0.0
# via -r requirements/ingest-notion.in
httpcore==0.18.0
httpcore==0.13.3
# via httpx
httpx==0.25.0
httpx==0.20.0
# via notion-client
idna==3.4
# via
# -c requirements/base.txt
# anyio
# httpx
# rfc3986
notion-client==2.0.0
# via -r requirements/ingest-notion.in
rfc3986[idna2008]==1.5.0
# via httpx
sniffio==1.3.0
# via
# anyio
# httpcore
# httpx
5 changes: 5 additions & 0 deletions requirements/ingest-openai.in
@@ -0,0 +1,5 @@
-c constraints.in
-c base.txt
langchain
tiktoken
openai
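
Presumably `tiktoken` is here for token counting around the embedding calls; a minimal example of what it provides (`cl100k_base` is the encoding used by OpenAI's embedding models):

```python
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode("Element text pulled from SharePoint"))
```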
115 changes: 115 additions & 0 deletions requirements/ingest-openai.txt
@@ -0,0 +1,115 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-openai.in
#
aiohttp==3.8.5
# via
# langchain
# openai
aiosignal==1.3.1
# via aiohttp
async-timeout==4.0.3
# via
# aiohttp
# langchain
attrs==23.1.0
# via aiohttp
certifi==2023.7.22
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# aiohttp
# requests
dataclasses-json==0.6.1
# via
# -c requirements/base.txt
# langchain
frozenlist==1.4.0
# via
# aiohttp
# aiosignal
idna==3.4
# via
# -c requirements/base.txt
# requests
# yarl
langchain==0.0.298
# via -r requirements/ingest-openai.in
langsmith==0.0.40
# via langchain
marshmallow==3.20.1
# via
# -c requirements/base.txt
# dataclasses-json
multidict==6.0.4
# via
# aiohttp
# yarl
mypy-extensions==1.0.0
# via
# -c requirements/base.txt
# typing-inspect
numexpr==2.8.6
# via langchain
numpy==1.24.4
# via
# -c requirements/constraints.in
# langchain
# numexpr
openai==0.28.1
# via -r requirements/ingest-openai.in
packaging==23.1
# via
# -c requirements/base.txt
# marshmallow
pydantic==1.10.12
# via
# -c requirements/constraints.in
# langchain
# langsmith
pyyaml==6.0.1
# via langchain
regex==2023.8.8
# via
# -c requirements/base.txt
# tiktoken
requests==2.31.0
# via
# -c requirements/base.txt
# langchain
# langsmith
# openai
# tiktoken
sqlalchemy==2.0.21
# via langchain
tenacity==8.2.3
# via langchain
tiktoken==0.5.1
# via -r requirements/ingest-openai.in
tqdm==4.66.1
# via
# -c requirements/base.txt
# openai
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# pydantic
# sqlalchemy
# typing-inspect
typing-inspect==0.9.0
# via
# -c requirements/base.txt
# dataclasses-json
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
yarl==1.9.2
# via aiohttp
1 change: 1 addition & 0 deletions setup.py
@@ -157,6 +157,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
"huggingface": load_requirements("requirements/huggingface.in"),
"local-inference": all_doc_reqs,
"paddleocr": load_requirements("requirements/extra-paddleocr.in"),
"openai": load_requirements("requirements/ingest-openai.in"),
},
package_dir={"unstructured": "unstructured"},
package_data={"unstructured": ["nlp/*.txt"]},
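
With this extra registered, the new embedding dependencies can be installed via `pip install "unstructured[openai]"`.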
26 changes: 25 additions & 1 deletion test_unstructured_ingest/files/azure_cognitive_index_schema.json
@@ -15,6 +15,12 @@
"name": "text",
"type": "Edm.String"
},
{
"name": "embeddings",
"type": "Collection(Edm.Single)",
"dimensions": 1536,
"vectorSearchConfiguration": "embeddings-config"
},
{
"name": "type",
"type": "Edm.String"
@@ -107,6 +113,10 @@
"name": "page_number",
"type": "Edm.String"
},
{
"name": "page_name",
"type": "Edm.String"
},
{
"name": "url",
"type": "Edm.String"
@@ -161,5 +171,19 @@
}
]
}
]
],
"vectorSearch": {
"algorithmConfigurations": [
{
"name": "embeddings-config",
"kind": "hnsw",
"hnswParameters": {
"metric": "cosine",
"m": 4,
"efConstruction": 400,
"efSearch": 500
}
}
]
}
}
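
As a smoke test of the configuration above, a pure-vector query can be issued against the `embeddings` field. A sketch using the same preview REST API, where the service/index names, key, and query vector are all placeholders:

```python
import requests


def vector_search(service: str, index: str, api_key: str, vector: list) -> dict:
    """Run a k-nearest-neighbor query against the index's vector field."""
    url = (
        f"https://{service}.search.windows.net/indexes/{index}/docs/search"
        "?api-version=2023-07-01-Preview"
    )
    payload = {
        "vectors": [{"value": vector, "fields": "embeddings", "k": 5}],
        "select": "text",
    }
    response = requests.post(
        url,
        headers={"Content-Type": "application/json", "api-key": api_key},
        json=payload,
    )
    response.raise_for_status()
    return response.json()
```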