[ENH] Update and implement new unit test for limit/offset (#2796)
## Description of changes

 - Improvements & Bug fixes
- Updates `test_filtering.py`: the new implementation sorts results by the internal `offset_id` instead of the user-provided `id`, so the unit tests are updated accordingly.
- In addition, a simple unit test is added to cover `get()` with only `offset` and `limit` (see the sketch below).
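
As a quick illustration of the new contract (made-up collection name and data, assuming a local client with a working default embedding function), a `limit`/`offset` page now equals the corresponding slice of the full, internally ordered result:

```python
import chromadb

client = chromadb.Client()
coll = client.create_collection("limit-offset-demo")  # hypothetical name
coll.add(ids=["3", "2", "1", "0"], documents=["d3", "d2", "d1", "d0"])

# get() returns results in internal-id (insertion) order, so a page
# requested with offset/limit matches a slice of the full result.
page = coll.get(offset=1, limit=2)["ids"]
assert page == coll.get()["ids"][1:3]  # ["2", "1"] given the insertion order above
```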

## Test plan
Run the existing and new tests locally

- [ ] Tests pass locally with `pytest` for python, `yarn test` for js,
`cargo test` for rust

## Documentation Changes
Add documentation explaining the order of results returned by `get()`.

---------

Co-authored-by: Sicheng Pan <[email protected]>
Sicheng-Pan and Sicheng Pan authored Sep 18, 2024
1 parent 260c835 commit 427ae63
Showing 4 changed files with 84 additions and 9 deletions.
6 changes: 3 additions & 3 deletions chromadb/segment/impl/metadata/sqlite.py
```diff
@@ -154,7 +154,7 @@ def get_metadata(
                 .on(embeddings_t.id == metadata_t.id)
             )
             .select(*select_clause)
-            .orderby(embeddings_t.embedding_id)
+            .orderby(embeddings_t.id)
         )

         # If there is a query that touches the metadata table, it uses
@@ -166,7 +166,7 @@ def get_metadata(
                 .select(metadata_t.id)
                 .join(embeddings_t)
                 .on(embeddings_t.id == metadata_t.id)
-                .orderby(embeddings_t.embedding_id)
+                .orderby(embeddings_t.id)
                 .where(
                     embeddings_t.segment_id
                     == ParameterValue(self._db.uuid_to_db(self._id))
@@ -207,7 +207,7 @@ def get_metadata(
                     embeddings_t.segment_id
                     == ParameterValue(self._db.uuid_to_db(self._id))
                 )
-                .orderby(embeddings_t.embedding_id)
+                .orderby(embeddings_t.id)
                 .limit(limit)
                 .offset(offset)
             )
```
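
In SQL terms, the change above swaps the ORDER BY column from the user-provided string `embedding_id` to the internal integer `id`. Here is a minimal sketch of the difference using pypika, the query builder these snippets are written against; the table name mirrors the diff, but the snippet itself is illustrative, not Chroma code:

```python
from pypika import Query, Table

embeddings_t = Table("embeddings")

# Before: lexicographic order over the user-provided string ids.
old_q = Query.from_(embeddings_t).select(embeddings_t.id).orderby(embeddings_t.embedding_id)

# After: insertion order over the internal integer ids, which is what
# limit/offset pagination now pages over.
new_q = Query.from_(embeddings_t).select(embeddings_t.id).orderby(embeddings_t.id)

print(old_q)  # roughly: SELECT "id" FROM "embeddings" ORDER BY "embedding_id"
print(new_q)  # roughly: SELECT "id" FROM "embeddings" ORDER BY "id"
```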
66 changes: 66 additions & 0 deletions chromadb/test/api/test_limit_offset.py
@@ -0,0 +1,66 @@ (new file)
```python
import logging

import chromadb.test.property.strategies as strategies
import hypothesis.strategies as st
from chromadb.api import ClientAPI
from chromadb.test.conftest import NOT_CLUSTER_ONLY, reset
from chromadb.test.utils.wait_for_version_increase import wait_for_version_increase
from chromadb.test.property import invariants
from hypothesis import HealthCheck, given, settings

collection_st = st.shared(
    strategies.collections(add_filterable_data=True, with_hnsw_params=True),
    key="coll",
)
recordset_st = st.shared(
    strategies.recordsets(collection_st, max_size=1000), key="recordset"
)


@settings(
    deadline=90000,
    suppress_health_check=[
        HealthCheck.function_scoped_fixture,
        HealthCheck.large_base_example,
        HealthCheck.filter_too_much,
    ],
)  # type: ignore
@given(
    collection=collection_st,
    record_set=recordset_st,
    limit=st.integers(min_value=1, max_value=10),
    offset=st.integers(min_value=0, max_value=10),
    should_compact=st.booleans(),
)
def test_get_limit_offset(
    caplog,
    client: ClientAPI,
    collection: strategies.Collection,
    record_set: dict,
    limit: int,
    offset: int,
    should_compact: bool,
) -> None:
    caplog.set_level(logging.ERROR)

    reset(client)
    coll = client.create_collection(
        name=collection.name,
        metadata=collection.metadata,  # type: ignore
        embedding_function=collection.embedding_function,
    )

    initial_version = coll.get_model()["version"]

    coll.add(**record_set)

    if not NOT_CLUSTER_ONLY:
        # Only wait for compaction if the size of the collection is
        # some minimal size
        if should_compact and len(invariants.wrap(record_set["ids"])) > 10:
            # Wait for the model to be updated
            wait_for_version_increase(client, collection.name, initial_version)

    result_ids = coll.get(offset=offset, limit=limit)["ids"]
    all_offset_ids = coll.get()["ids"]
    assert result_ids == all_offset_ids[offset : offset + limit]
```
15 changes: 9 additions & 6 deletions chromadb/test/property/test_filtering.py
```diff
@@ -251,11 +251,6 @@ def test_filterable_metadata_get_limit_offset(
 ) -> None:
     caplog.set_level(logging.ERROR)
 
-    # The distributed system does not support limit/offset yet
-    # so we skip this test for now if the system is distributed
-    if not NOT_CLUSTER_ONLY:
-        pytest.skip("Distributed system does not support limit/offset yet")
-
     reset(client)
     coll = client.create_collection(
         name=collection.name,
@@ -280,7 +275,15 @@ def test_filterable_metadata_get_limit_offset(
         filter["offset"] = offset
     result_ids = coll.get(**filter)["ids"]
     expected_ids = _filter_embedding_set(record_set, filter)
-    assert sorted(result_ids) == sorted(expected_ids)[offset : offset + limit]
+    offset_id_order = {
+        id: index for index, id in enumerate(coll.get(ids=expected_ids)["ids"])
+    }
+    assert (
+        result_ids
+        == sorted(expected_ids, key=lambda id: offset_id_order[id])[
+            offset : offset + limit
+        ]
+    )
 
 
 @settings(
```
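
For readers skimming the diff above: the updated assertion derives the expected ordering from the store itself. `coll.get(ids=expected_ids)` returns those ids in internal-id order, the dict comprehension records each id's rank in that order, and the expected ids are sorted by that rank before slicing. A standalone sketch of the same idea, using hypothetical data:

```python
# Hypothetical internal ordering, standing in for coll.get(ids=...)["ids"]
ids_in_internal_order = ["b", "c", "a"]
expected_ids = ["a", "b", "c"]  # ids the filter is expected to match

# Rank each id by its position in the internal ordering ...
offset_id_order = {id_: idx for idx, id_ in enumerate(ids_in_internal_order)}

# ... then paginate the expected ids in that same order.
offset, limit = 1, 2
page = sorted(expected_ids, key=lambda id_: offset_id_order[id_])[offset : offset + limit]
assert page == ["c", "a"]
```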
6 changes: 6 additions & 0 deletions docs/docs.trychroma.com/pages/deployment/migration.md
@@ -20,6 +20,12 @@ We will aim to provide:

## Migration Log

### v0.5.8

The results returned by `collection.get()` are now ordered by internal ids. Previously, results were ordered by user-provided ids, although this behavior was never explicitly documented. Ordering by user-provided ids is not ideal for performance in hosted Chroma, so we are making this change and plan to propagate it to local Chroma for consistency of behavior. In general, newer documents in Chroma have larger internal ids.

A consequent behavior change affects `limit` and `offset`, which depend on the order of returned results. For example, if a collection named `coll` contains documents with ids `["3", "2", "1", "0"]`, inserted in that order, then `coll.get(limit=2, offset=2)["ids"]` previously returned `["2", "3"]` (a slice of the ids sorted lexicographically), whereas it now returns `["1", "0"]` (a slice of the ids in insertion order).
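
To make the example concrete, here is a small snippet reproducing it (assuming a local in-memory client and a working default embedding function):

```python
import chromadb

client = chromadb.Client()
coll = client.create_collection("coll")
coll.add(ids=["3", "2", "1", "0"], documents=["d3", "d2", "d1", "d0"])

# Since v0.5.8 results follow internal (insertion) order: ["3", "2", "1", "0"],
# so offset=2, limit=2 selects the third and fourth inserted documents.
print(coll.get(limit=2, offset=2)["ids"])  # ["1", "0"]; before v0.5.8: ["2", "3"]
```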

### v0.5.6

Chroma internally uses a write-ahead log. In all versions prior to v0.5.6, this log was never pruned. This resulted in the data directory being much larger than it needed to be, as well as the directory size not decreasing by the expected amount after deleting a collection.