From fbdb9a05f9b9cef1210032abe6c36ba2fcb49e6f Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci
Date: Fri, 22 Dec 2023 15:52:22 +0100
Subject: [PATCH] Pinecone Document Store - minimal implementation (#81)

* Add PineconeDocumentStore
* adapt to Document refactoring
* start improving existing tests
* try to setup a testing workflow
* fix some format errors
* adapt to new structure
* adapt pyproject; rm about
* fix workflow
* add hatch-vcs
* simplification - first draft
* simplified tests
* make workflow read the api key
* rm score when filtering docs
* increase wait time
* improve api key reading; more tests
* improvements from PR review
* test simplification
* test simplification 2
* fix
* std ds tests want valueerror
* put tests together
* format
* add fallback for namespace in _embedding_retrieval
* try to parallelize tests
* better try
* labeler
* format fix
* Apply suggestions from code review

Co-authored-by: Massimiliano Pippi

* Revert "Apply suggestions from code review"

This reverts commit f42c54080d12d34b5f304c66d0c32aa726a073af.

* improve document conversion
* rm deepcopy
* missing return
* fix fmt
* copy metadata
* fmt
* mv comment
* improve tests
* readmes

---------

Co-authored-by: vrunm <97465624+vrunm@users.noreply.github.com>
Co-authored-by: Massimiliano Pippi
---
 .github/labeler.yml                           |   5 +
 .github/workflows/pinecone.yml                |  51 ++++
 README.md                                     |   3 +-
 integrations/pinecone/README.md               |  24 ++
 integrations/pinecone/pyproject.toml          | 186 ++++++++++++
 .../src/pinecone_haystack/__init__.py         |   6 +
 .../src/pinecone_haystack/document_store.py   | 268 ++++++++++++++++++
 .../pinecone/src/pinecone_haystack/errors.py  |  10 +
 integrations/pinecone/tests/__init__.py       |   3 +
 integrations/pinecone/tests/conftest.py       |  54 ++++
 .../pinecone/tests/test_document_store.py     | 113 ++++++++
 11 files changed, 722 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/pinecone.yml
 create mode 100644 integrations/pinecone/README.md
 create mode 100644 integrations/pinecone/pyproject.toml
 create mode 100644 integrations/pinecone/src/pinecone_haystack/__init__.py
 create mode 100644 integrations/pinecone/src/pinecone_haystack/document_store.py
 create mode 100644 integrations/pinecone/src/pinecone_haystack/errors.py
 create mode 100644 integrations/pinecone/tests/__init__.py
 create mode 100644 integrations/pinecone/tests/conftest.py
 create mode 100644 integrations/pinecone/tests/test_document_store.py

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 3c1a626c7..151deead6 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -44,6 +44,11 @@ integration:qdrant:
     - any-glob-to-any-file: "integrations/qdrant/**/*"
     - any-glob-to-any-file: ".github/workflows/qdrant.yml"
 
+integration:pinecone:
+  - changed-files:
+      - any-glob-to-any-file: "integrations/pinecone/**/*"
+      - any-glob-to-any-file: ".github/workflows/pinecone.yml"
+
 integration:unstructured-fileconverter:
   - changed-files:
     - any-glob-to-any-file: "integrations/unstructured/fileconverter/**/*"
diff --git a/.github/workflows/pinecone.yml b/.github/workflows/pinecone.yml
new file mode 100644
index 000000000..fe1b1d456
--- /dev/null
+++ b/.github/workflows/pinecone.yml
@@ -0,0 +1,51 @@
+# This workflow comes from https://github.com/ofek/hatch-mypyc
+# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
+name: Test / pinecone
+
+on:
+  schedule:
+    - cron: "0 0 * * *"
+  pull_request:
+    paths:
+      - "integrations/pinecone/**"
+      - ".github/workflows/pinecone.yml"
+
+concurrency:
+  group: pinecone-${{ github.head_ref }}
+  cancel-in-progress: true
+
+env:
+  PYTHONUNBUFFERED: "1"
+  FORCE_COLOR: "1"
+  PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
+
+jobs:
+  run:
+    name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        # Pinecone tests are time-consuming, so the matrix is limited to Python 3.9 and 3.10
+        os: [ubuntu-latest]
+        python-version: ["3.9", "3.10"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Hatch
+        run: pip install --upgrade hatch
+
+      - name: Lint
+        working-directory: integrations/pinecone
+        if: matrix.python-version == '3.9'
+        run: hatch run lint:all
+
+      - name: Run tests
+        working-directory: integrations/pinecone
+        run: hatch run cov
diff --git a/README.md b/README.md
index 4de6092c9..bc9d1b6d1 100644
--- a/README.md
+++ b/README.md
@@ -71,4 +71,5 @@ deepset-haystack
 | [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) |
 | [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) |
 | [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) |
-| [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml) |
+| [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml)
+| [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) |
diff --git a/integrations/pinecone/README.md b/integrations/pinecone/README.md
new file mode 100644
index 000000000..bf48e1e66
--- /dev/null
+++ b/integrations/pinecone/README.md
@@ -0,0 +1,24 @@
+[![test](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml)
+
+[![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg)](https://pypi.org/project/pinecone-haystack)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pinecone-haystack.svg)](https://pypi.org/project/pinecone-haystack)
+
+# Pinecone Document Store
+
+A Document Store for Haystack 2.x that uses Pinecone as the backend.
+
+## Installation
+
+```console
+pip install pinecone-haystack
+```
+
+## Testing
+
+```console
+hatch run test
+```
+
+## License
+
+`pinecone-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
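The README stops at installation, so here is a minimal usage sketch of the store this patch introduces. It assumes `PINECONE_API_KEY` is set in the environment (the store reads it automatically); the environment, index, and namespace names and the 768-dimension embedding are illustrative:

```python
from haystack import Document

from pinecone_haystack import PineconeDocumentStore

# Connects to (or creates) the "default" index;
# the API key is read from the PINECONE_API_KEY environment variable.
document_store = PineconeDocumentStore(
    environment="gcp-starter",  # illustrative Pinecone environment
    index="default",
    namespace="default",
    dimension=768,
)

# Documents without an embedding are stored with a dummy vector
# (see _convert_documents_to_pinecone_format in document_store.py).
document_store.write_documents(
    [
        Document(content="My first document", embedding=[0.1] * 768),
        Document(content="No embedding yet"),
    ]
)

print(document_store.count_documents())
```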
diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml
new file mode 100644
index 000000000..069dba1be
--- /dev/null
+++ b/integrations/pinecone/pyproject.toml
@@ -0,0 +1,186 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[project]
+name = "pinecone_haystack"
+dynamic = ["version"]
+description = ''
+readme = "README.md"
+requires-python = ">=3.8"
+license = "Apache-2.0"
+keywords = []
+authors = [
+  { name = "deepset GmbH", email = "info@deepset.ai" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
+]
+dependencies = [
+  "haystack-ai",
+  "pinecone-client",
+]
+
+[project.urls]
+Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/pinecone#readme"
+Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
+Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/pinecone"
+
+[tool.hatch.version]
+source = "vcs"
+tag-pattern = 'integrations\/pinecone-v(?P<version>.*)'
+
+[tool.hatch.version.raw-options]
+root = "../.."
+git_describe_command = 'git describe --tags --match="integrations/pinecone-v[0-9]*"'
+
+[tool.hatch.envs.default]
+dependencies = [
+  "coverage[toml]>=6.5",
+  "pytest",
+  "pytest-xdist",
+]
+[tool.hatch.envs.default.scripts]
+# Pinecone tests are slow (they make HTTP requests), so we run them in parallel
+# with pytest-xdist (https://pytest-xdist.readthedocs.io/en/stable/distribution.html)
+test = "pytest -n auto --maxprocesses=3 {args:tests}"
+test-cov = "coverage run -m pytest -n auto --maxprocesses=3 {args:tests}"
+cov-report = [
+  "- coverage combine",
+  "coverage report",
+]
+cov = [
+  "test-cov",
+  "cov-report",
+]
+
+[[tool.hatch.envs.all.matrix]]
+python = ["3.8", "3.9", "3.10", "3.11"]
+
+[tool.hatch.envs.lint]
+detached = true
+dependencies = [
+  "black>=23.1.0",
+  "mypy>=1.0.0",
+  "ruff>=0.0.243",
+  "numpy",
+]
+[tool.hatch.envs.lint.scripts]
+typing = "mypy --install-types --non-interactive {args:src/pinecone_haystack tests}"
+style = [
+  "ruff {args:.}",
+  "black --check --diff {args:.}",
+]
+fmt = [
+  "black {args:.}",
+  "ruff --fix {args:.}",
+  "style",
+]
+all = [
+  "style",
+  "typing",
+]
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.black]
+target-version = ["py37"]
+line-length = 120
+skip-string-normalization = true
+
+[tool.ruff]
+target-version = "py37"
+line-length = 120
+select = [
+  "A",
+  "ARG",
+  "B",
+  "C",
+  "DTZ",
+  "E",
+  "EM",
+  "F",
+  "FBT",
+  "I",
+  "ICN",
+  "ISC",
+  "N",
+  "PLC",
+  "PLE",
+  "PLR",
+  "PLW",
+  "Q",
+  "RUF",
+  "S",
+  "T",
+  "TID",
+  "UP",
+  "W",
+  "YTT",
+]
+ignore = [
+  # Allow non-abstract empty methods in abstract base classes
+  "B027",
+  # Allow boolean positional values in function calls, like `dict.get(... True)`
+  "FBT003",
+  # Ignore checks for possible passwords
+  "S105", "S106", "S107",
+  # Ignore complexity
+  "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
+]
+unfixable = [
+  # Don't touch unused imports
+  "F401",
+]
+
+[tool.ruff.isort]
+known-first-party = ["pinecone_haystack"]
+
+[tool.ruff.flake8-tidy-imports]
+ban-relative-imports = "all"
+
+[tool.ruff.per-file-ignores]
+# Tests can use magic values, assertions, and relative imports
+"tests/**/*" = ["PLR2004", "S101", "TID252"]
+
+[tool.coverage.run]
+source_pkgs = ["pinecone_haystack", "tests"]
+branch = true
+parallel = true
+omit = [
+  "example"
+]
+
+[tool.coverage.paths]
+pinecone_haystack = ["src/pinecone_haystack", "*/pinecone_haystack/src/pinecone_haystack"]
+tests = ["tests", "*/pinecone_haystack/tests"]
+
+[tool.coverage.report]
+exclude_lines = [
+  "no cov",
+  "if __name__ == .__main__.:",
+  "if TYPE_CHECKING:",
+]
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+markers = [
+  "unit: unit tests",
+  "integration: integration tests"
+]
+
+[[tool.mypy.overrides]]
+module = [
+  "pinecone.*",
+  "haystack.*",
+  "pytest.*"
+]
+ignore_missing_imports = true
diff --git a/integrations/pinecone/src/pinecone_haystack/__init__.py b/integrations/pinecone/src/pinecone_haystack/__init__.py
new file mode 100644
index 000000000..dbfb60832
--- /dev/null
+++ b/integrations/pinecone/src/pinecone_haystack/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+from pinecone_haystack.document_store import PineconeDocumentStore
+
+__all__ = ["PineconeDocumentStore"]
diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py
new file mode 100644
index 000000000..576993de6
--- /dev/null
+++ b/integrations/pinecone/src/pinecone_haystack/document_store.py
@@ -0,0 +1,268 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+import io
+import logging
+import os
+from copy import copy
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+import pinecone
+from haystack import default_to_dict
+from haystack.dataclasses import Document
+from haystack.document_stores import DuplicatePolicy
+
+logger = logging.getLogger(__name__)
+
+# Pinecone has a limit of 1000 documents that can be returned in a query
+# with include_metadata=True or include_data=True
+# https://docs.pinecone.io/docs/limits
+TOP_K_LIMIT = 1_000
+
+
+class PineconeDocumentStore:
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        environment: str = "us-west1-gcp",
+        index: str = "default",
+        namespace: str = "default",
+        batch_size: int = 100,
+        dimension: int = 768,
+        **index_creation_kwargs,
+    ):
+        """
+        Creates a new PineconeDocumentStore instance.
+        It is meant to be connected to a Pinecone index and namespace.
+
+        :param api_key: The Pinecone API key. It can be explicitly provided or automatically read from the
+            environment variable PINECONE_API_KEY (recommended).
+        :param environment: The Pinecone environment to connect to. Defaults to "us-west1-gcp".
+        :param index: The Pinecone index to connect to. If the index does not exist, it will be created.
+            Defaults to "default".
+        :param namespace: The Pinecone namespace to connect to. If the namespace does not exist, it will be created
+            at the first write. Defaults to "default".
+        :param batch_size: The number of documents to write in a single batch. Defaults to 100, as recommended by
+            Pinecone.
+        :param dimension: The dimension of the embeddings. This parameter is only used when creating a new index.
+            Defaults to 768.
+        :param index_creation_kwargs: Additional keyword arguments to pass to the index creation method.
+            For example, you can specify `metric`, `pods`, `replicas`...
+            You can find the full list of supported arguments in the
+            [API reference](https://docs.pinecone.io/reference/create_index-1).
+
+        """
+        api_key = api_key or os.environ.get("PINECONE_API_KEY")
+        if not api_key:
+            msg = (
+                "PineconeDocumentStore expects a Pinecone API key. "
+                "Set the PINECONE_API_KEY environment variable (recommended) or pass it explicitly."
+            )
+            raise ValueError(msg)
+
+        pinecone.init(api_key=api_key, environment=environment)
+
+        if index not in pinecone.list_indexes():
+            logger.info(f"Index {index} does not exist. Creating a new index.")
+            pinecone.create_index(name=index, dimension=dimension, **index_creation_kwargs)
+        else:
+            logger.info(f"Index {index} already exists. Connecting to it.")
+
+        self._index = pinecone.Index(index_name=index)
+
+        actual_dimension = self._index.describe_index_stats().get("dimension")
+        if actual_dimension and actual_dimension != dimension:
+            logger.warning(
+                f"Dimension of index {index} is {actual_dimension}, but {dimension} was specified. "
+                "The specified dimension will be ignored. "
+                "If you need an index with a different dimension, please create a new one."
+            )
+        self.dimension = actual_dimension or dimension
+
+        self._dummy_vector = [0.0] * self.dimension
+        self.environment = environment
+        self.index = index
+        self.namespace = namespace
+        self.batch_size = batch_size
+        self.index_creation_kwargs = index_creation_kwargs
+
+    def to_dict(self) -> Dict[str, Any]:
+        return default_to_dict(
+            self,
+            environment=self.environment,
+            index=self.index,
+            dimension=self.dimension,
+            namespace=self.namespace,
+            batch_size=self.batch_size,
+            **self.index_creation_kwargs,
+        )
+
+    def count_documents(self) -> int:
+        """
+        Returns how many documents are present in the document store.
+        """
+        try:
+            count = self._index.describe_index_stats()["namespaces"][self.namespace]["vector_count"]
+        except KeyError:
+            count = 0
+        return count
+
+    def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
+        """
+        Writes Documents to Pinecone.
+
+        :param documents: A list of Documents to write to the document store.
+        :param policy: The duplicate policy to use when writing documents.
+            PineconeDocumentStore only supports `DuplicatePolicy.OVERWRITE`.
+
+        :return: The number of documents written to the document store.
+        """
+        if len(documents) > 0 and not isinstance(documents[0], Document):
+            msg = "param 'documents' must contain a list of objects of type Document"
+            raise ValueError(msg)
+
+        if policy not in [DuplicatePolicy.NONE, DuplicatePolicy.OVERWRITE]:
+            logger.warning(
+                f"PineconeDocumentStore only supports `DuplicatePolicy.OVERWRITE`, "
+                f"but got {policy}. Overwriting duplicates is enabled by default."
+            )
+
+        documents_for_pinecone = self._convert_documents_to_pinecone_format(documents)
+
+        result = self._index.upsert(
+            vectors=documents_for_pinecone, namespace=self.namespace, batch_size=self.batch_size
+        )
+
+        written_docs = result["upserted_count"]
+        return written_docs
+
+    def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
+        """
+        Returns the documents that match the filters provided.
+
+        For a detailed specification of the filters,
+        refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering)
+
+        :param filters: The filters to apply to the document list.
+        :return: A list of Documents that match the given filters.
+        """
+
+        # Pinecone only performs vector similarity search
+        # here we are querying with a dummy vector and the max compatible top_k
+        documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT)
+
+        # when simply filtering, we don't want to return any scores
+        # furthermore, we are querying with a dummy vector, so the scores are meaningless
+        for doc in documents:
+            doc.score = None
+
+        if len(documents) == TOP_K_LIMIT:
+            logger.warning(
+                f"PineconeDocumentStore can return at most {TOP_K_LIMIT} documents and the query has hit this limit. "
+                f"It is likely that there are more matching documents in the document store. "
+            )
+        return documents
+
+    def delete_documents(self, document_ids: List[str]) -> None:
+        """
+        Deletes all documents with matching document_ids from the document store.
+
+        :param document_ids: the document ids to delete
+        """
+        self._index.delete(ids=document_ids, namespace=self.namespace)
+
+    def _embedding_retrieval(
+        self,
+        query_embedding: List[float],
+        *,
+        namespace: Optional[str] = None,
+        filters: Optional[Dict[str, Any]] = None,  # noqa: ARG002 (filters to be implemented)
+        top_k: int = 10,
+    ) -> List[Document]:
+        """
+        Retrieves documents that are most similar to the query embedding using a vector similarity metric.
+
+        This method is not meant to be part of the public interface of
+        `PineconeDocumentStore` nor called directly.
+        `PineconeDenseRetriever` uses this method directly and is the public interface for it.
+
+        :param query_embedding: Embedding of the query.
+        :param namespace: Pinecone namespace to query. Defaults to the namespace of the document store.
+        :param filters: Filters applied to the retrieved Documents. Defaults to None.
+        :param top_k: Maximum number of Documents to return. Defaults to 10.
+
+        :return: A list of Documents that are most similar to `query_embedding`.
+        """
+
+        if not query_embedding:
+            msg = "query_embedding must be a non-empty list of floats"
+            raise ValueError(msg)
+
+        result = self._index.query(
+            vector=query_embedding,
+            top_k=top_k,
+            namespace=namespace or self.namespace,
+            include_values=True,
+            include_metadata=True,
+        )
+
+        return self._convert_query_result_to_documents(result)
+
+    def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> List[Document]:
+        pinecone_docs = query_result["matches"]
+        documents = []
+        for pinecone_doc in pinecone_docs:
+            content = pinecone_doc["metadata"].pop("content", None)
+
+            dataframe = None
+            dataframe_string = pinecone_doc["metadata"].pop("dataframe", None)
+            if dataframe_string:
+                dataframe = pd.read_json(io.StringIO(dataframe_string))
+
+            # we always store vectors during writing
+            # but we don't want to return them if they are dummy vectors
+            embedding = None
+            if pinecone_doc["values"] != self._dummy_vector:
+                embedding = pinecone_doc["values"]
+
+            doc = Document(
+                id=pinecone_doc["id"],
+                content=content,
+                dataframe=dataframe,
+                meta=pinecone_doc["metadata"],
+                embedding=embedding,
+                score=pinecone_doc["score"],
+            )
+            documents.append(doc)
+
+        return documents
+
+    def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]:
+        documents_for_pinecone = []
+        for document in documents:
+            embedding = copy(document.embedding)
+            if embedding is None:
+                logger.warning(
+                    f"Document {document.id} has no embedding. Pinecone is purely a vector database. "
+                    "A dummy embedding will be used, but this can affect the search results. "
+                )
+                embedding = self._dummy_vector
+            doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": dict(document.meta)}
+
+            # we save content/dataframe as metadata
+            if document.content is not None:
+                doc_for_pinecone["metadata"]["content"] = document.content
+            if document.dataframe is not None:
+                doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json()
+            # currently, storing blob in Pinecone is not supported
+            if document.blob is not None:
+                logger.warning(
+                    f"Document {document.id} has the `blob` field set, but storing `ByteStream` "
+                    "objects in Pinecone is not supported. "
+                    "The content of the `blob` field will be ignored."
+                )
+
+            documents_for_pinecone.append(doc_for_pinecone)
+        return documents_for_pinecone
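Taken together, `_convert_documents_to_pinecone_format` and `_convert_query_result_to_documents` define the round trip: `content` and `dataframe` travel inside Pinecone metadata and are popped back out on read. A sketch of how retrieval would be driven through `_embedding_retrieval` (the `PineconeDenseRetriever` named in the docstring is not part of this patch, and the 768-dimension query vector below is illustrative):

```python
from pinecone_haystack import PineconeDocumentStore

document_store = PineconeDocumentStore(index="default", namespace="default", dimension=768)

# Internally this issues index.query(..., include_values=True, include_metadata=True)
# and converts each match back into a Haystack Document.
docs = document_store._embedding_retrieval(
    query_embedding=[0.1] * 768,  # illustrative query embedding
    top_k=5,
)

for doc in docs:
    # score comes from Pinecone; embedding is None when the dummy vector was stored
    print(doc.id, doc.score, doc.content)
```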
diff --git a/integrations/pinecone/src/pinecone_haystack/errors.py b/integrations/pinecone/src/pinecone_haystack/errors.py
new file mode 100644
index 000000000..994f34cf0
--- /dev/null
+++ b/integrations/pinecone/src/pinecone_haystack/errors.py
@@ -0,0 +1,10 @@
+from haystack.document_stores.errors import DocumentStoreError
+from haystack.errors import FilterError
+
+
+class PineconeDocumentStoreError(DocumentStoreError):
+    pass
+
+
+class PineconeDocumentStoreFilterError(FilterError):
+    pass
diff --git a/integrations/pinecone/tests/__init__.py b/integrations/pinecone/tests/__init__.py
new file mode 100644
index 000000000..e873bc332
--- /dev/null
+++ b/integrations/pinecone/tests/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/integrations/pinecone/tests/conftest.py b/integrations/pinecone/tests/conftest.py
new file mode 100644
index 000000000..ea0fc0167
--- /dev/null
+++ b/integrations/pinecone/tests/conftest.py
@@ -0,0 +1,54 @@
+import time
+
+import pytest
+from haystack.document_stores import DuplicatePolicy
+
+from pinecone_haystack.document_store import PineconeDocumentStore
+
+# This is the approximate time it takes for the documents to be available
+SLEEP_TIME = 20
+
+
+@pytest.fixture()
+def sleep_time():
+    return SLEEP_TIME
+
+
+@pytest.fixture
+def document_store(request):
+    """
+    This is the most basic requirement for the child class: provide
+    an instance of this document store so the base class can use it.
+    """
+    environment = "gcp-starter"
+    index = "default"
+    # Use a different namespace for each test so we can run them in parallel
+    namespace = f"{request.node.name}-{int(time.time())}"
+    dimension = 768
+
+    store = PineconeDocumentStore(
+        environment=environment,
+        index=index,
+        namespace=namespace,
+        dimension=dimension,
+    )
+
+    # Override some methods to wait for the documents to be available
+    original_write_documents = store.write_documents
+
+    def write_documents_and_wait(documents, policy=DuplicatePolicy.NONE):
+        written_docs = original_write_documents(documents, policy)
+        time.sleep(SLEEP_TIME)
+        return written_docs
+
+    original_delete_documents = store.delete_documents
+
+    def delete_documents_and_wait(document_ids):
+        original_delete_documents(document_ids)
+        time.sleep(SLEEP_TIME)
+
+    store.write_documents = write_documents_and_wait
+    store.delete_documents = delete_documents_and_wait
+
+    yield store
+    store._index.delete(delete_all=True, namespace=namespace)
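The fixture papers over Pinecone's eventual consistency with a fixed `time.sleep(SLEEP_TIME)`. An alternative, not part of this patch, would be to poll `count_documents` until the expected count appears, which bounds the wait without always paying the full 20 seconds. A sketch with a hypothetical `wait_for_count` helper:

```python
import time

def wait_for_count(store, expected: int, timeout: float = 60.0, interval: float = 2.0) -> None:
    # Poll the namespace's vector count (count_documents reads describe_index_stats())
    # until it reaches the expected value or the timeout expires.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if store.count_documents() >= expected:
            return
        time.sleep(interval)
    msg = f"Timed out waiting for {expected} documents"
    raise TimeoutError(msg)
```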
diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py
new file mode 100644
index 000000000..5c9b32698
--- /dev/null
+++ b/integrations/pinecone/tests/test_document_store.py
@@ -0,0 +1,113 @@
+from unittest.mock import patch
+
+import numpy as np
+import pytest
+from haystack import Document
+from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest
+
+from pinecone_haystack.document_store import PineconeDocumentStore
+
+
+class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest):
+    def test_write_documents(self, document_store: PineconeDocumentStore):
+        docs = [Document(id="1")]
+        assert document_store.write_documents(docs) == 1
+
+    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
+    def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore):
+        ...
+
+    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
+    def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore):
+        ...
+
+    @patch("pinecone_haystack.document_store.pinecone")
+    def test_init(self, mock_pinecone):
+        mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30}
+
+        document_store = PineconeDocumentStore(
+            api_key="fake-api-key",
+            environment="gcp-starter",
+            index="my_index",
+            namespace="test",
+            batch_size=50,
+            dimension=30,
+            metric="euclidean",
+        )
+
+        mock_pinecone.init.assert_called_with(api_key="fake-api-key", environment="gcp-starter")
+
+        assert document_store.environment == "gcp-starter"
+        assert document_store.index == "my_index"
+        assert document_store.namespace == "test"
+        assert document_store.batch_size == 50
+        assert document_store.dimension == 30
+        assert document_store.index_creation_kwargs == {"metric": "euclidean"}
+
+    @patch("pinecone_haystack.document_store.pinecone")
+    def test_init_api_key_in_environment_variable(self, mock_pinecone, monkeypatch):
+        monkeypatch.setenv("PINECONE_API_KEY", "fake-api-key")
+
+        PineconeDocumentStore(
+            environment="gcp-starter",
+            index="my_index",
+            namespace="test",
+            batch_size=50,
+            dimension=30,
+            metric="euclidean",
+        )
+
+        mock_pinecone.init.assert_called_with(api_key="fake-api-key", environment="gcp-starter")
+
+    def test_init_fails_wo_api_key(self, monkeypatch):
+        api_key = None
+        monkeypatch.delenv("PINECONE_API_KEY", raising=False)
+        with pytest.raises(ValueError):
+            PineconeDocumentStore(
+                api_key=api_key,
+                environment="gcp-starter",
+                index="my_index",
+            )
+
+    @patch("pinecone_haystack.document_store.pinecone")
+    def test_to_dict(self, mock_pinecone):
+        mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30}
+        document_store = PineconeDocumentStore(
+            api_key="fake-api-key",
+            environment="gcp-starter",
+            index="my_index",
+            namespace="test",
+            batch_size=50,
+            dimension=30,
+            metric="euclidean",
+        )
+        assert document_store.to_dict() == {
+            "type": "pinecone_haystack.document_store.PineconeDocumentStore",
+            "init_parameters": {
+                "environment": "gcp-starter",
+                "index": "my_index",
+                "dimension": 30,
+                "namespace": "test",
+                "batch_size": 50,
+                "metric": "euclidean",
+            },
+        }
+
+    def test_embedding_retrieval(self, document_store: PineconeDocumentStore):
+        query_embedding = [0.1] * 768
+        most_similar_embedding = [0.8] * 768
+        second_best_embedding = [0.8] * 700 + [0.1] * 3 + [0.2] * 65
+        another_embedding = np.random.rand(768).tolist()
+
+        docs = [
+            Document(content="Most similar document", embedding=most_similar_embedding),
+            Document(content="2nd best document", embedding=second_best_embedding),
+            Document(content="Not very similar document", embedding=another_embedding),
+        ]
+
+        document_store.write_documents(docs)
+
+        results = document_store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={})
+        assert len(results) == 2
+        assert results[0].content == "Most similar document"
+        assert results[1].content == "2nd best document"
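A note on why the expectations in `test_embedding_retrieval` hold, assuming the index uses Pinecone's default cosine metric (the test fixture passes no `metric`): cosine similarity ignores vector magnitude, so `[0.8] * 768` is perfectly parallel to the `[0.1] * 768` query (similarity 1.0), while the second embedding deviates in a few components and scores slightly lower. A quick check of that reasoning (NumPy is already a test dependency):

```python
import numpy as np

def cosine(a, b):
    # cosine similarity: dot product normalized by both vector norms
    a, b = np.asarray(a), np.asarray(b)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

query = [0.1] * 768
most_similar = [0.8] * 768
second_best = [0.8] * 700 + [0.1] * 3 + [0.2] * 65

print(cosine(query, most_similar))  # 1.0: parallel vectors, scale does not matter
print(cosine(query, second_best))   # ~0.97, slightly below 1.0, so it ranks second
```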