From fbdb9a05f9b9cef1210032abe6c36ba2fcb49e6f Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci
Date: Fri, 22 Dec 2023 15:52:22 +0100
Subject: [PATCH] Pinecone Document Store - minimal implementation (#81)

* Add PineconeDocumentStore
* adapt to Document refactoring
* start improving existing tests
* try to setup a testing workflow
* fix some format errors
* adapt to new structure
* adapt pyproject; rm about
* fix workflow
* add hatch-vcs
* simplification - first draft
* simplified tests
* make workflow read the api key
* rm score when filtering docs
* increase wait time
* improve api key reading; more tests
* improvements from PR review
* test simplification
* test simplification 2
* fix
* std ds tests want valueerror
* put tests together
* format
* add fallback for namespace in _embedding_retrieval
* try to parallelize tests
* better try
* labeler
* format fix
* Apply suggestions from code review

Co-authored-by: Massimiliano Pippi

* Revert "Apply suggestions from code review"

This reverts commit f42c54080d12d34b5f304c66d0c32aa726a073af.

* improve document conversion
* rm deepcopy
* missing return
* fix fmt
* copy metadata
* fmt
* mv comment
* improve tests
* readmes

---------

Co-authored-by: vrunm <97465624+vrunm@users.noreply.github.com>
Co-authored-by: Massimiliano Pippi
---
 .github/labeler.yml                           |   5 +
 .github/workflows/pinecone.yml                |  51 ++++
 README.md                                     |   3 +-
 integrations/pinecone/README.md               |  24 ++
 integrations/pinecone/pyproject.toml          | 186 ++++++++++++
 .../src/pinecone_haystack/__init__.py         |   6 +
 .../src/pinecone_haystack/document_store.py   | 268 ++++++++++++++++++
 .../pinecone/src/pinecone_haystack/errors.py  |  10 +
 integrations/pinecone/tests/__init__.py       |   3 +
 integrations/pinecone/tests/conftest.py       |  54 ++++
 .../pinecone/tests/test_document_store.py     | 113 ++++++++
 11 files changed, 722 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/pinecone.yml
 create mode 100644 integrations/pinecone/README.md
 create mode 100644 integrations/pinecone/pyproject.toml
 create mode 100644 integrations/pinecone/src/pinecone_haystack/__init__.py
 create mode 100644 integrations/pinecone/src/pinecone_haystack/document_store.py
 create mode 100644 integrations/pinecone/src/pinecone_haystack/errors.py
 create mode 100644 integrations/pinecone/tests/__init__.py
 create mode 100644 integrations/pinecone/tests/conftest.py
 create mode 100644 integrations/pinecone/tests/test_document_store.py

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 3c1a626c7..151deead6 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -44,6 +44,11 @@ integration:qdrant:
     - any-glob-to-any-file: "integrations/qdrant/**/*"
     - any-glob-to-any-file: ".github/workflows/qdrant.yml"
 
+integration:pinecone:
+  - changed-files:
+      - any-glob-to-any-file: "integrations/pinecone/**/*"
+      - any-glob-to-any-file: ".github/workflows/pinecone.yml"
+
 integration:unstructured-fileconverter:
   - changed-files:
     - any-glob-to-any-file: "integrations/unstructured/fileconverter/**/*"
diff --git a/.github/workflows/pinecone.yml b/.github/workflows/pinecone.yml
new file mode 100644
index 000000000..fe1b1d456
--- /dev/null
+++ b/.github/workflows/pinecone.yml
@@ -0,0 +1,51 @@
+# This workflow comes from https://github.com/ofek/hatch-mypyc
+# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
+name: Test / pinecone
+
+on:
+  schedule:
+    - cron: "0 0 * * *"
+  pull_request:
+    paths:
+      - "integrations/pinecone/**"
+      - ".github/workflows/pinecone.yml"
+
+concurrency:
+  group: pinecone-${{ github.head_ref }}
+  cancel-in-progress: true
+
+env:
+  PYTHONUNBUFFERED: "1"
+  FORCE_COLOR: "1"
+  PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
+
+jobs:
+  run:
+    name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        # Pinecone tests are time-consuming, so the matrix is limited to Python 3.9 and 3.10
+        os: [ubuntu-latest]
+        python-version: ["3.9", "3.10"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Hatch
+        run: pip install --upgrade hatch
+
+      - name: Lint
+        working-directory: integrations/pinecone
+        if: matrix.python-version == '3.9'
+        run: hatch run lint:all
+
+      - name: Run tests
+        working-directory: integrations/pinecone
+        run: hatch run cov
diff --git a/README.md b/README.md
index 4de6092c9..bc9d1b6d1 100644
--- a/README.md
+++ b/README.md
@@ -71,4 +71,5 @@ deepset-haystack
 | [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) |
 | [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) |
 | [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) |
-| [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml) |
+| [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml)
+| [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) |
diff --git a/integrations/pinecone/README.md b/integrations/pinecone/README.md
new file mode 100644
index 000000000..bf48e1e66
--- /dev/null
+++ b/integrations/pinecone/README.md
@@ -0,0 +1,24 @@
+[![test](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml)
+
+[![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg)](https://pypi.org/project/pinecone-haystack)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pinecone-haystack.svg)](https://pypi.org/project/pinecone-haystack)
+
+# Pinecone Document Store
+
+A Document Store for Haystack 2.x that uses Pinecone as the backend.
+
+## Installation
+
+```console
+pip install pinecone-haystack
+```
+
+## Testing
+
+```console
+hatch run test
+```
+
+## License
+
+`pinecone-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
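The README stops at installation, so here is a minimal usage sketch of the store this patch introduces. It assumes `PINECONE_API_KEY` is set in the environment (the store reads it automatically); the environment, index, and namespace names and the 768-dimension embedding are illustrative:

```python
from haystack import Document

from pinecone_haystack import PineconeDocumentStore

# Connects to (or creates) the "default" index;
# the API key is read from the PINECONE_API_KEY environment variable.
document_store = PineconeDocumentStore(
    environment="gcp-starter",  # illustrative Pinecone environment
    index="default",
    namespace="default",
    dimension=768,
)

# Documents without an embedding are stored with a dummy vector
# (see _convert_documents_to_pinecone_format in document_store.py).
document_store.write_documents(
    [
        Document(content="My first document", embedding=[0.1] * 768),
        Document(content="No embedding yet"),
    ]
)

print(document_store.count_documents())
```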
diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml
new file mode 100644
index 000000000..069dba1be
--- /dev/null
+++ b/integrations/pinecone/pyproject.toml
@@ -0,0 +1,186 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[project]
+name = "pinecone_haystack"
+dynamic = ["version"]
+description = ''
+readme = "README.md"
+requires-python = ">=3.8"
+license = "Apache-2.0"
+keywords = []
+authors = [
+  { name = "deepset GmbH", email = "info@deepset.ai" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
+]
+dependencies = [
+  "haystack-ai",
+  "pinecone-client",
+]
+
+[project.urls]
+Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/pinecone#readme"
+Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
+Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/pinecone"
+
+[tool.hatch.version]
+source = "vcs"
+tag-pattern = 'integrations\/pinecone-v(?P<version>.*)'
+
+[tool.hatch.version.raw-options]
+root = "../.."
+git_describe_command = 'git describe --tags --match="integrations/pinecone-v[0-9]*"'
+
+[tool.hatch.envs.default]
+dependencies = [
+  "coverage[toml]>=6.5",
+  "pytest",
+  "pytest-xdist",
+]
+[tool.hatch.envs.default.scripts]
+# Pinecone tests are slow (they make HTTP requests), so we run them in parallel
+# with pytest-xdist (https://pytest-xdist.readthedocs.io/en/stable/distribution.html)
+test = "pytest -n auto --maxprocesses=3 {args:tests}"
+test-cov = "coverage run -m pytest -n auto --maxprocesses=3 {args:tests}"
+cov-report = [
+  "- coverage combine",
+  "coverage report",
+]
+cov = [
+  "test-cov",
+  "cov-report",
+]
+
+[[tool.hatch.envs.all.matrix]]
+python = ["3.8", "3.9", "3.10", "3.11"]
+
+[tool.hatch.envs.lint]
+detached = true
+dependencies = [
+  "black>=23.1.0",
+  "mypy>=1.0.0",
+  "ruff>=0.0.243",
+  "numpy",
+]
+[tool.hatch.envs.lint.scripts]
+typing = "mypy --install-types --non-interactive {args:src/pinecone_haystack tests}"
+style = [
+  "ruff {args:.}",
+  "black --check --diff {args:.}",
+]
+fmt = [
+  "black {args:.}",
+  "ruff --fix {args:.}",
+  "style",
+]
+all = [
+  "style",
+  "typing",
+]
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.black]
+target-version = ["py37"]
+line-length = 120
+skip-string-normalization = true
+
+[tool.ruff]
+target-version = "py37"
+line-length = 120
+select = [
+  "A",
+  "ARG",
+  "B",
+  "C",
+  "DTZ",
+  "E",
+  "EM",
+  "F",
+  "FBT",
+  "I",
+  "ICN",
+  "ISC",
+  "N",
+  "PLC",
+  "PLE",
+  "PLR",
+  "PLW",
+  "Q",
+  "RUF",
+  "S",
+  "T",
+  "TID",
+  "UP",
+  "W",
+  "YTT",
+]
+ignore = [
+  # Allow non-abstract empty methods in abstract base classes
+  "B027",
+  # Allow boolean positional values in function calls, like `dict.get(... True)`
+  "FBT003",
+  # Ignore checks for possible passwords
+  "S105", "S106", "S107",
+  # Ignore complexity
+  "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
+]
+unfixable = [
+  # Don't touch unused imports
+  "F401",
+]
+
+[tool.ruff.isort]
+known-first-party = ["pinecone_haystack"]
+
+[tool.ruff.flake8-tidy-imports]
+ban-relative-imports = "all"
+
+[tool.ruff.per-file-ignores]
+# Tests can use magic values, assertions, and relative imports
+"tests/**/*" = ["PLR2004", "S101", "TID252"]
+
+[tool.coverage.run]
+source_pkgs = ["pinecone_haystack", "tests"]
+branch = true
+parallel = true
+omit = [
+  "example"
+]
+
+[tool.coverage.paths]
+pinecone_haystack = ["src/pinecone_haystack", "*/pinecone_haystack/src/pinecone_haystack"]
+tests = ["tests", "*/pinecone_haystack/tests"]
+
+[tool.coverage.report]
+exclude_lines = [
+  "no cov",
+  "if __name__ == .__main__.:",
+  "if TYPE_CHECKING:",
+]
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+markers = [
+  "unit: unit tests",
+  "integration: integration tests"
+]
+
+[[tool.mypy.overrides]]
+module = [
+  "pinecone.*",
+  "haystack.*",
+  "pytest.*"
+]
+ignore_missing_imports = true
diff --git a/integrations/pinecone/src/pinecone_haystack/__init__.py b/integrations/pinecone/src/pinecone_haystack/__init__.py
new file mode 100644
index 000000000..dbfb60832
--- /dev/null
+++ b/integrations/pinecone/src/pinecone_haystack/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+from pinecone_haystack.document_store import PineconeDocumentStore
+
+__all__ = ["PineconeDocumentStore"]
diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py
new file mode 100644
index 000000000..576993de6
--- /dev/null
+++ b/integrations/pinecone/src/pinecone_haystack/document_store.py
@@ -0,0 +1,268 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+import io
+import logging
+import os
+from copy import copy
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+import pinecone
+from haystack import default_to_dict
+from haystack.dataclasses import Document
+from haystack.document_stores import DuplicatePolicy
+
+logger = logging.getLogger(__name__)
+
+# Pinecone has a limit of 1000 documents that can be returned in a query
+# with include_metadata=True or include_data=True
+# https://docs.pinecone.io/docs/limits
+TOP_K_LIMIT = 1_000
+
+
+class PineconeDocumentStore:
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        environment: str = "us-west1-gcp",
+        index: str = "default",
+        namespace: str = "default",
+        batch_size: int = 100,
+        dimension: int = 768,
+        **index_creation_kwargs,
+    ):
+        """
+        Creates a new PineconeDocumentStore instance.
+        It is meant to be connected to a Pinecone index and namespace.
+
+        :param api_key: The Pinecone API key. It can be explicitly provided or automatically read from the
+            environment variable PINECONE_API_KEY (recommended).
+        :param environment: The Pinecone environment to connect to. Defaults to "us-west1-gcp".
+        :param index: The Pinecone index to connect to. If the index does not exist, it will be created.
+            Defaults to "default".
+        :param namespace: The Pinecone namespace to connect to. If the namespace does not exist, it will be created
+            at the first write. Defaults to "default".
+        :param batch_size: The number of documents to write in a single batch. Defaults to 100, as recommended by
+            Pinecone.
+        :param dimension: The dimension of the embeddings. This parameter is only used when creating a new index.
+            Defaults to 768.
+        :param index_creation_kwargs: Additional keyword arguments to pass to the index creation method.
+            For example, you can specify `metric`, `pods`, `replicas`...
+            You can find the full list of supported arguments in the
+            [API reference](https://docs.pinecone.io/reference/create_index-1).
+
+        """
+        api_key = api_key or os.environ.get("PINECONE_API_KEY")
+        if not api_key:
+            msg = (
+                "PineconeDocumentStore expects a Pinecone API key. "
+                "Set the PINECONE_API_KEY environment variable (recommended) or pass it explicitly."
+            )
+            raise ValueError(msg)
+
+        pinecone.init(api_key=api_key, environment=environment)
+
+        if index not in pinecone.list_indexes():
+            logger.info(f"Index {index} does not exist. Creating a new index.")
+            pinecone.create_index(name=index, dimension=dimension, **index_creation_kwargs)
+        else:
+            logger.info(f"Index {index} already exists. Connecting to it.")
+
+        self._index = pinecone.Index(index_name=index)
+
+        actual_dimension = self._index.describe_index_stats().get("dimension")
+        if actual_dimension and actual_dimension != dimension:
+            logger.warning(
+                f"Dimension of index {index} is {actual_dimension}, but {dimension} was specified. "
+                "The specified dimension will be ignored. "
+                "If you need an index with a different dimension, please create a new one."
+            )
+        self.dimension = actual_dimension or dimension
+
+        self._dummy_vector = [0.0] * self.dimension
+        self.environment = environment
+        self.index = index
+        self.namespace = namespace
+        self.batch_size = batch_size
+        self.index_creation_kwargs = index_creation_kwargs
+
+    def to_dict(self) -> Dict[str, Any]:
+        return default_to_dict(
+            self,
+            environment=self.environment,
+            index=self.index,
+            dimension=self.dimension,
+            namespace=self.namespace,
+            batch_size=self.batch_size,
+            **self.index_creation_kwargs,
+        )
+
+    def count_documents(self) -> int:
+        """
+        Returns how many documents are present in the document store.
+        """
+        try:
+            count = self._index.describe_index_stats()["namespaces"][self.namespace]["vector_count"]
+        except KeyError:
+            count = 0
+        return count
+
+    def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
+        """
+        Writes Documents to Pinecone.
+
+        :param documents: A list of Documents to write to the document store.
+        :param policy: The duplicate policy to use when writing documents.
+            PineconeDocumentStore only supports `DuplicatePolicy.OVERWRITE`.
+
+        :return: The number of documents written to the document store.
+        """
+        if len(documents) > 0 and not isinstance(documents[0], Document):
+            msg = "param 'documents' must contain a list of objects of type Document"
+            raise ValueError(msg)
+
+        if policy not in [DuplicatePolicy.NONE, DuplicatePolicy.OVERWRITE]:
+            logger.warning(
+                f"PineconeDocumentStore only supports `DuplicatePolicy.OVERWRITE`, "
+                f"but got {policy}. Overwriting duplicates is enabled by default."
+            )
+
+        documents_for_pinecone = self._convert_documents_to_pinecone_format(documents)
+
+        result = self._index.upsert(
+            vectors=documents_for_pinecone, namespace=self.namespace, batch_size=self.batch_size
+        )
+
+        written_docs = result["upserted_count"]
+        return written_docs
+
+    def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
+        """
+        Returns the documents that match the filters provided.
+
+        For a detailed specification of the filters,
+        refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering)
+
+        :param filters: The filters to apply to the document list.
+        :return: A list of Documents that match the given filters.
+        """
+
+        # Pinecone only performs vector similarity search
+        # here we are querying with a dummy vector and the max compatible top_k
+        documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT)
+
+        # when simply filtering, we don't want to return any scores
+        # furthermore, we are querying with a dummy vector, so the scores are meaningless
+        for doc in documents:
+            doc.score = None
+
+        if len(documents) == TOP_K_LIMIT:
+            logger.warning(
+                f"PineconeDocumentStore can return at most {TOP_K_LIMIT} documents and the query has hit this limit. "
+                f"It is likely that there are more matching documents in the document store. "
+            )
+        return documents
+
+    def delete_documents(self, document_ids: List[str]) -> None:
+        """
+        Deletes all documents with matching document_ids from the document store.
+
+        :param document_ids: the document ids to delete
+        """
+        self._index.delete(ids=document_ids, namespace=self.namespace)
+
+    def _embedding_retrieval(
+        self,
+        query_embedding: List[float],
+        *,
+        namespace: Optional[str] = None,
+        filters: Optional[Dict[str, Any]] = None,  # noqa: ARG002 (filters to be implemented)
+        top_k: int = 10,
+    ) -> List[Document]:
+        """
+        Retrieves documents that are most similar to the query embedding using a vector similarity metric.
+
+        This method is not meant to be part of the public interface of
+        `PineconeDocumentStore` nor called directly.
+        `PineconeDenseRetriever` uses this method directly and is the public interface for it.
+
+        :param query_embedding: Embedding of the query.
+        :param namespace: Pinecone namespace to query. Defaults to the namespace of the document store.
+        :param filters: Filters applied to the retrieved Documents. Defaults to None.
+        :param top_k: Maximum number of Documents to return. Defaults to 10.
+
+        :return: A list of Documents that are most similar to `query_embedding`.
+        """
+
+        if not query_embedding:
+            msg = "query_embedding must be a non-empty list of floats"
+            raise ValueError(msg)
+
+        result = self._index.query(
+            vector=query_embedding,
+            top_k=top_k,
+            namespace=namespace or self.namespace,
+            include_values=True,
+            include_metadata=True,
+        )
+
+        return self._convert_query_result_to_documents(result)
+
+    def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> List[Document]:
+        pinecone_docs = query_result["matches"]
+        documents = []
+        for pinecone_doc in pinecone_docs:
+            content = pinecone_doc["metadata"].pop("content", None)
+
+            dataframe = None
+            dataframe_string = pinecone_doc["metadata"].pop("dataframe", None)
+            if dataframe_string:
+                dataframe = pd.read_json(io.StringIO(dataframe_string))
+
+            # we always store vectors during writing
+            # but we don't want to return them if they are dummy vectors
+            embedding = None
+            if pinecone_doc["values"] != self._dummy_vector:
+                embedding = pinecone_doc["values"]
+
+            doc = Document(
+                id=pinecone_doc["id"],
+                content=content,
+                dataframe=dataframe,
+                meta=pinecone_doc["metadata"],
+                embedding=embedding,
+                score=pinecone_doc["score"],
+            )
+            documents.append(doc)
+
+        return documents
+
+    def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]:
+        documents_for_pinecone = []
+        for document in documents:
+            embedding = copy(document.embedding)
+            if embedding is None:
+                logger.warning(
+                    f"Document {document.id} has no embedding. Pinecone is purely a vector database. "
+                    "A dummy embedding will be used, but this can affect the search results. "
+                )
+                embedding = self._dummy_vector
+            doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": dict(document.meta)}
+
+            # we save content/dataframe as metadata
+            if document.content is not None:
+                doc_for_pinecone["metadata"]["content"] = document.content
+            if document.dataframe is not None:
+                doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json()
+            # currently, storing blob in Pinecone is not supported
+            if document.blob is not None:
+                logger.warning(
+                    f"Document {document.id} has the `blob` field set, but storing `ByteStream` "
+                    "objects in Pinecone is not supported. "
+                    "The content of the `blob` field will be ignored."
+                )
+
+            documents_for_pinecone.append(doc_for_pinecone)
+        return documents_for_pinecone
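Taken together, `_convert_documents_to_pinecone_format` and `_convert_query_result_to_documents` define the round trip: `content` and `dataframe` travel inside Pinecone metadata and are popped back out on read. A sketch of how retrieval would be driven through `_embedding_retrieval` (the `PineconeDenseRetriever` named in the docstring is not part of this patch, and the 768-dimension query vector below is illustrative):

```python
from pinecone_haystack import PineconeDocumentStore

document_store = PineconeDocumentStore(index="default", namespace="default", dimension=768)

# Internally this issues index.query(..., include_values=True, include_metadata=True)
# and converts each match back into a Haystack Document.
docs = document_store._embedding_retrieval(
    query_embedding=[0.1] * 768,  # illustrative query embedding
    top_k=5,
)

for doc in docs:
    # score comes from Pinecone; embedding is None when the dummy vector was stored
    print(doc.id, doc.score, doc.content)
```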
diff --git a/integrations/pinecone/src/pinecone_haystack/errors.py b/integrations/pinecone/src/pinecone_haystack/errors.py
new file mode 100644
index 000000000..994f34cf0
--- /dev/null
+++ b/integrations/pinecone/src/pinecone_haystack/errors.py
@@ -0,0 +1,10 @@
+from haystack.document_stores.errors import DocumentStoreError
+from haystack.errors import FilterError
+
+
+class PineconeDocumentStoreError(DocumentStoreError):
+    pass
+
+
+class PineconeDocumentStoreFilterError(FilterError):
+    pass
diff --git a/integrations/pinecone/tests/__init__.py b/integrations/pinecone/tests/__init__.py
new file mode 100644
index 000000000..e873bc332
--- /dev/null
+++ b/integrations/pinecone/tests/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/integrations/pinecone/tests/conftest.py b/integrations/pinecone/tests/conftest.py
new file mode 100644
index 000000000..ea0fc0167
--- /dev/null
+++ b/integrations/pinecone/tests/conftest.py
@@ -0,0 +1,54 @@
+import time
+
+import pytest
+from haystack.document_stores import DuplicatePolicy
+
+from pinecone_haystack.document_store import PineconeDocumentStore
+
+# This is the approximate time it takes for the documents to be available
+SLEEP_TIME = 20
+
+
+@pytest.fixture()
+def sleep_time():
+    return SLEEP_TIME
+
+
+@pytest.fixture
+def document_store(request):
+    """
+    This is the most basic requirement for the child class: provide
+    an instance of this document store so the base class can use it.
+    """
+    environment = "gcp-starter"
+    index = "default"
+    # Use a different namespace for each test so we can run them in parallel
+    namespace = f"{request.node.name}-{int(time.time())}"
+    dimension = 768
+
+    store = PineconeDocumentStore(
+        environment=environment,
+        index=index,
+        namespace=namespace,
+        dimension=dimension,
+    )
+
+    # Override some methods to wait for the documents to be available
+    original_write_documents = store.write_documents
+
+    def write_documents_and_wait(documents, policy=DuplicatePolicy.NONE):
+        written_docs = original_write_documents(documents, policy)
+        time.sleep(SLEEP_TIME)
+        return written_docs
+
+    original_delete_documents = store.delete_documents
+
+    def delete_documents_and_wait(document_ids):
+        original_delete_documents(document_ids)
+        time.sleep(SLEEP_TIME)
+
+    store.write_documents = write_documents_and_wait
+    store.delete_documents = delete_documents_and_wait
+
+    yield store
+    store._index.delete(delete_all=True, namespace=namespace)
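The fixture papers over Pinecone's eventual consistency with a fixed `time.sleep(SLEEP_TIME)`. An alternative, not part of this patch, would be to poll `count_documents` until the expected count appears, which bounds the wait without always paying the full 20 seconds. A sketch with a hypothetical `wait_for_count` helper:

```python
import time

def wait_for_count(store, expected: int, timeout: float = 60.0, interval: float = 2.0) -> None:
    # Poll the namespace's vector count (count_documents reads describe_index_stats())
    # until it reaches the expected value or the timeout expires.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if store.count_documents() >= expected:
            return
        time.sleep(interval)
    msg = f"Timed out waiting for {expected} documents"
    raise TimeoutError(msg)
```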
diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py
new file mode 100644
index 000000000..5c9b32698
--- /dev/null
+++ b/integrations/pinecone/tests/test_document_store.py
@@ -0,0 +1,113 @@
+from unittest.mock import patch
+
+import numpy as np
+import pytest
+from haystack import Document
+from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest
+
+from pinecone_haystack.document_store import PineconeDocumentStore
+
+
+class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest):
+    def test_write_documents(self, document_store: PineconeDocumentStore):
+        docs = [Document(id="1")]
+        assert document_store.write_documents(docs) == 1
+
+    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
+    def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore):
+        ...
+
+    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
+    def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore):
+        ...
+
+    @patch("pinecone_haystack.document_store.pinecone")
+    def test_init(self, mock_pinecone):
+        mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30}
+
+        document_store = PineconeDocumentStore(
+            api_key="fake-api-key",
+            environment="gcp-starter",
+            index="my_index",
+            namespace="test",
+            batch_size=50,
+            dimension=30,
+            metric="euclidean",
+        )
+
+        mock_pinecone.init.assert_called_with(api_key="fake-api-key", environment="gcp-starter")
+
+        assert document_store.environment == "gcp-starter"
+        assert document_store.index == "my_index"
+        assert document_store.namespace == "test"
+        assert document_store.batch_size == 50
+        assert document_store.dimension == 30
+        assert document_store.index_creation_kwargs == {"metric": "euclidean"}
+
+    @patch("pinecone_haystack.document_store.pinecone")
+    def test_init_api_key_in_environment_variable(self, mock_pinecone, monkeypatch):
+        monkeypatch.setenv("PINECONE_API_KEY", "fake-api-key")
+
+        PineconeDocumentStore(
+            environment="gcp-starter",
+            index="my_index",
+            namespace="test",
+            batch_size=50,
+            dimension=30,
+            metric="euclidean",
+        )
+
+        mock_pinecone.init.assert_called_with(api_key="fake-api-key", environment="gcp-starter")
+
+    def test_init_fails_wo_api_key(self, monkeypatch):
+        api_key = None
+        monkeypatch.delenv("PINECONE_API_KEY", raising=False)
+        with pytest.raises(ValueError):
+            PineconeDocumentStore(
+                api_key=api_key,
+                environment="gcp-starter",
+                index="my_index",
+            )
+
+    @patch("pinecone_haystack.document_store.pinecone")
+    def test_to_dict(self, mock_pinecone):
+        mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30}
+        document_store = PineconeDocumentStore(
+            api_key="fake-api-key",
+            environment="gcp-starter",
+            index="my_index",
+            namespace="test",
+            batch_size=50,
+            dimension=30,
+            metric="euclidean",
+        )
+        assert document_store.to_dict() == {
+            "type": "pinecone_haystack.document_store.PineconeDocumentStore",
+            "init_parameters": {
+                "environment": "gcp-starter",
+                "index": "my_index",
+                "dimension": 30,
+                "namespace": "test",
+                "batch_size": 50,
+                "metric": "euclidean",
+            },
+        }
+
+    def test_embedding_retrieval(self, document_store: PineconeDocumentStore):
+        query_embedding = [0.1] * 768
+        most_similar_embedding = [0.8] * 768
+        second_best_embedding = [0.8] * 700 + [0.1] * 3 + [0.2] * 65
+        another_embedding = np.random.rand(768).tolist()
+
+        docs = [
+            Document(content="Most similar document", embedding=most_similar_embedding),
+            Document(content="2nd best document", embedding=second_best_embedding),
+            Document(content="Not very similar document", embedding=another_embedding),
+        ]
+
+        document_store.write_documents(docs)
+
+        results = document_store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={})
+        assert len(results) == 2
+        assert results[0].content == "Most similar document"
+        assert results[1].content == "2nd best document"
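A note on why the expectations in `test_embedding_retrieval` hold, assuming the index uses Pinecone's default cosine metric (the test fixture passes no `metric`): cosine similarity ignores vector magnitude, so `[0.8] * 768` is perfectly parallel to the `[0.1] * 768` query (similarity 1.0), while the second embedding deviates in a few components and scores slightly lower. A quick check of that reasoning (NumPy is already a test dependency):

```python
import numpy as np

def cosine(a, b):
    # cosine similarity: dot product normalized by both vector norms
    a, b = np.asarray(a), np.asarray(b)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

query = [0.1] * 768
most_similar = [0.8] * 768
second_best = [0.8] * 700 + [0.1] * 3 + [0.2] * 65

print(cosine(query, most_similar))  # 1.0: parallel vectors, scale does not matter
print(cosine(query, second_best))   # ~0.97, slightly below 1.0, so it ranks second
```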