[ENH] Update and implement new unit test for limit/offset (#2796)
## Description of changes

 - Improvements & Bug fixes
- Updates `test_filtering.py`: the new implementation sorts results by the internal `offset_id` instead of the user-provided `id`, so the unit tests are updated accordingly.
- In addition, a simple unit test is added to cover `get()` with only `offset` and `limit` (see the sketch below).
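
As a quick illustration of the new contract (made-up collection name and data, assuming a local client with a working default embedding function), a `limit`/`offset` page now equals the corresponding slice of the full, internally ordered result:

```python
import chromadb

client = chromadb.Client()
coll = client.create_collection("limit-offset-demo")  # hypothetical name
coll.add(ids=["3", "2", "1", "0"], documents=["d3", "d2", "d1", "d0"])

# get() returns results in internal-id (insertion) order, so a page
# requested with offset/limit matches a slice of the full result.
page = coll.get(offset=1, limit=2)["ids"]
assert page == coll.get()["ids"][1:3]  # ["2", "1"] given the insertion order above
```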

## Test plan
Run the existing and new tests locally

- [ ] Tests pass locally with `pytest` for python, `yarn test` for js,
`cargo test` for rust

## Documentation Changes
Add documentation explaining the order of results returned by `get()`.

---------

Co-authored-by: Sicheng Pan <[email protected]>
Sicheng-Pan and Sicheng Pan authored Sep 18, 2024
1 parent 260c835 commit 427ae63
Showing 4 changed files with 84 additions and 9 deletions.
6 changes: 3 additions & 3 deletions chromadb/segment/impl/metadata/sqlite.py
```diff
@@ -154,7 +154,7 @@ def get_metadata(
                 .on(embeddings_t.id == metadata_t.id)
             )
             .select(*select_clause)
-            .orderby(embeddings_t.embedding_id)
+            .orderby(embeddings_t.id)
         )

         # If there is a query that touches the metadata table, it uses
@@ -166,7 +166,7 @@ def get_metadata(
                 .select(metadata_t.id)
                 .join(embeddings_t)
                 .on(embeddings_t.id == metadata_t.id)
-                .orderby(embeddings_t.embedding_id)
+                .orderby(embeddings_t.id)
                 .where(
                     embeddings_t.segment_id
                     == ParameterValue(self._db.uuid_to_db(self._id))
@@ -207,7 +207,7 @@ def get_metadata(
                     embeddings_t.segment_id
                     == ParameterValue(self._db.uuid_to_db(self._id))
                 )
-                .orderby(embeddings_t.embedding_id)
+                .orderby(embeddings_t.id)
                 .limit(limit)
                 .offset(offset)
             )
```
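
In SQL terms, the change above swaps the ORDER BY column from the user-provided string `embedding_id` to the internal integer `id`. Here is a minimal sketch of the difference using pypika, the query builder these snippets are written against; the table name mirrors the diff, but the snippet itself is illustrative, not Chroma code:

```python
from pypika import Query, Table

embeddings_t = Table("embeddings")

# Before: lexicographic order over the user-provided string ids.
old_q = Query.from_(embeddings_t).select(embeddings_t.id).orderby(embeddings_t.embedding_id)

# After: insertion order over the internal integer ids, which is what
# limit/offset pagination now pages over.
new_q = Query.from_(embeddings_t).select(embeddings_t.id).orderby(embeddings_t.id)

print(old_q)  # roughly: SELECT "id" FROM "embeddings" ORDER BY "embedding_id"
print(new_q)  # roughly: SELECT "id" FROM "embeddings" ORDER BY "id"
```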
66 changes: 66 additions & 0 deletions chromadb/test/api/test_limit_offset.py
@@ -0,0 +1,66 @@ (new file)
```python
import logging

import chromadb.test.property.strategies as strategies
import hypothesis.strategies as st
from chromadb.api import ClientAPI
from chromadb.test.conftest import NOT_CLUSTER_ONLY, reset
from chromadb.test.utils.wait_for_version_increase import wait_for_version_increase
from chromadb.test.property import invariants
from hypothesis import HealthCheck, given, settings

collection_st = st.shared(
    strategies.collections(add_filterable_data=True, with_hnsw_params=True),
    key="coll",
)
recordset_st = st.shared(
    strategies.recordsets(collection_st, max_size=1000), key="recordset"
)


@settings(
    deadline=90000,
    suppress_health_check=[
        HealthCheck.function_scoped_fixture,
        HealthCheck.large_base_example,
        HealthCheck.filter_too_much,
    ],
)  # type: ignore
@given(
    collection=collection_st,
    record_set=recordset_st,
    limit=st.integers(min_value=1, max_value=10),
    offset=st.integers(min_value=0, max_value=10),
    should_compact=st.booleans(),
)
def test_get_limit_offset(
    caplog,
    client: ClientAPI,
    collection: strategies.Collection,
    record_set: dict,
    limit: int,
    offset: int,
    should_compact: bool,
) -> None:
    caplog.set_level(logging.ERROR)

    reset(client)
    coll = client.create_collection(
        name=collection.name,
        metadata=collection.metadata,  # type: ignore
        embedding_function=collection.embedding_function,
    )

    initial_version = coll.get_model()["version"]

    coll.add(**record_set)

    if not NOT_CLUSTER_ONLY:
        # Only wait for compaction if the size of the collection is
        # some minimal size
        if should_compact and len(invariants.wrap(record_set["ids"])) > 10:
            # Wait for the model to be updated
            wait_for_version_increase(client, collection.name, initial_version)

    result_ids = coll.get(offset=offset, limit=limit)["ids"]
    all_offset_ids = coll.get()["ids"]
    assert result_ids == all_offset_ids[offset : offset + limit]
```
15 changes: 9 additions & 6 deletions chromadb/test/property/test_filtering.py
```diff
@@ -251,11 +251,6 @@ def test_filterable_metadata_get_limit_offset(
 ) -> None:
     caplog.set_level(logging.ERROR)
 
-    # The distributed system does not support limit/offset yet
-    # so we skip this test for now if the system is distributed
-    if not NOT_CLUSTER_ONLY:
-        pytest.skip("Distributed system does not support limit/offset yet")
-
     reset(client)
     coll = client.create_collection(
         name=collection.name,
@@ -280,7 +275,15 @@ def test_filterable_metadata_get_limit_offset(
         filter["offset"] = offset
     result_ids = coll.get(**filter)["ids"]
     expected_ids = _filter_embedding_set(record_set, filter)
-    assert sorted(result_ids) == sorted(expected_ids)[offset : offset + limit]
+    offset_id_order = {
+        id: index for index, id in enumerate(coll.get(ids=expected_ids)["ids"])
+    }
+    assert (
+        result_ids
+        == sorted(expected_ids, key=lambda id: offset_id_order[id])[
+            offset : offset + limit
+        ]
+    )
 
 
 @settings(
```
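
For readers skimming the diff above: the updated assertion derives the expected ordering from the store itself. `coll.get(ids=expected_ids)` returns those ids in internal-id order, the dict comprehension records each id's rank in that order, and the expected ids are sorted by that rank before slicing. A standalone sketch of the same idea, using hypothetical data:

```python
# Hypothetical internal ordering, standing in for coll.get(ids=...)["ids"]
ids_in_internal_order = ["b", "c", "a"]
expected_ids = ["a", "b", "c"]  # ids the filter is expected to match

# Rank each id by its position in the internal ordering ...
offset_id_order = {id_: idx for idx, id_ in enumerate(ids_in_internal_order)}

# ... then paginate the expected ids in that same order.
offset, limit = 1, 2
page = sorted(expected_ids, key=lambda id_: offset_id_order[id_])[offset : offset + limit]
assert page == ["c", "a"]
```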
6 changes: 6 additions & 0 deletions docs/docs.trychroma.com/pages/deployment/migration.md
@@ -20,6 +20,12 @@ We will aim to provide:

## Migration Log

### v0.5.8

The results returned by `collection.get()` are now ordered by internal ids. Previously, results were ordered by user-provided ids, although this behavior was never explicitly documented. Ordering by user-provided ids is not ideal for performance in hosted Chroma, so we are making this change and plan to propagate it to local Chroma for consistency of behavior. In general, newer documents in Chroma have larger internal ids.

A consequent behavior change affects `limit` and `offset`, which depend on the order of returned results. For example, if a collection named `coll` contains documents with ids `["3", "2", "1", "0"]`, inserted in that order, then `coll.get(limit=2, offset=2)["ids"]` previously returned `["2", "3"]` (a slice of the ids sorted lexicographically), whereas it now returns `["1", "0"]` (a slice of the ids in insertion order).
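
To make the example concrete, here is a small snippet reproducing it (assuming a local in-memory client and a working default embedding function):

```python
import chromadb

client = chromadb.Client()
coll = client.create_collection("coll")
coll.add(ids=["3", "2", "1", "0"], documents=["d3", "d2", "d1", "d0"])

# Since v0.5.8 results follow internal (insertion) order: ["3", "2", "1", "0"],
# so offset=2, limit=2 selects the third and fourth inserted documents.
print(coll.get(limit=2, offset=2)["ids"])  # ["1", "0"]; before v0.5.8: ["2", "3"]
```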

### v0.5.6

Chroma internally uses a write-ahead log. In all versions prior to v0.5.6, this log was never pruned. This resulted in the data directory being much larger than it needed to be, as well as the directory size not decreasing by the expected amount after deleting a collection.