From f2ee7670abd92888455f3860b5e62b9edd873627 Mon Sep 17 00:00:00 2001 From: Aditi Ahuja <48997495+metonymic-smokey@users.noreply.github.com> Date: Thu, 19 Sep 2024 22:04:31 +0530 Subject: [PATCH] MB-62230 - Optimizing cluster selection in pre-filtered kNN (#261) Modifies the pre-filtering kNN path for IVF indexes to factor in the distribution of filtered hits in the index. 1. Filter the clusters which have at least one filter hit. 2. Rank the filtered clusters in decreasing order of proximity. 3. Pick either the closest 'nprobe' clusters or as many as needed to satisfy at least K hits, whichever comes later. 4. Search the selected clusters for the k nearest neighbours from among the filtered hits. There is no change in the pre-filtered search path for Flat indexes. --------- Co-authored-by: Abhinav Dangeti --- faiss_vector_posting.go | 85 ++++++++++++++++++++++++++++++++++++++++- go.mod | 2 +- go.sum | 4 +- 3 files changed, 86 insertions(+), 5 deletions(-) diff --git a/faiss_vector_posting.go b/faiss_vector_posting.go index b241b42c..0dd2a368 100644 --- a/faiss_vector_posting.go +++ b/faiss_vector_posting.go @@ -387,13 +387,94 @@ func (sb *SegmentBase) InterpretVectorIndex(field string, requiresFiltering bool vectorIDsToInclude = append(vectorIDsToInclude, docVecIDMap[uint32(id)]...) } - scores, ids, err := vecIndex.SearchWithIDs(qVector, k, - vectorIDsToInclude, params) + // Retrieve the mapping of centroid IDs to vectors within + // the cluster. + clusterAssignment, _ := vecIndex.ObtainClusterToVecIDsFromIVFIndex() + // Accounting for a flat index + if len(clusterAssignment) == 0 { + scores, ids, err := vecIndex.SearchWithIDs(qVector, k, + vectorIDsToInclude, params) + if err != nil { + return nil, err + } + + addIDsToPostingsList(rv, ids, scores) + return rv, nil + } + + // Converting to roaring bitmap for ease of intersect ops with + // the set of eligible doc IDs. 
+ centroidVecIDMap := make(map[int64]*roaring.Bitmap) + for centroidID, vecIDs := range clusterAssignment { + if _, exists := centroidVecIDMap[centroidID]; !exists { + centroidVecIDMap[centroidID] = roaring.NewBitmap() + } + for _, vecID := range vecIDs { + centroidVecIDMap[centroidID].Add(uint32(vecID)) + } + } + + // Getting the vector IDs corresponding to the eligible + // doc IDs. + eligibleVecIDsBitmap := roaring.NewBitmap() + for _, eligibleDocID := range eligibleDocIDs { + vecIDs := docVecIDMap[uint32(eligibleDocID)] + for _, vecID := range vecIDs { + eligibleVecIDsBitmap.Add(uint32(vecID)) + } + } + + // Determining which clusters, identified by centroid ID, + // have at least one eligible vector and hence, ought to be + // probed. + eligibleCentroidIDs := make([]int64, 0) + for centroidID, vecIDs := range centroidVecIDMap { + vecIDs.And(eligibleVecIDsBitmap) + if vecIDs.GetCardinality() > 0 { + // The mapping is now reduced to those vectors which + // are also eligible docs for the filter query. + centroidVecIDMap[centroidID] = vecIDs + eligibleCentroidIDs = append(eligibleCentroidIDs, centroidID) + } else { + // don't consider clusters with no eligible IDs. + delete(centroidVecIDMap, centroidID) + } + } + + // Ordering the retrieved centroid IDs by increasing order + // of distance i.e. decreasing order of proximity to query vector. + closestCentroidIDs, centroidDistances, _ := + vecIndex.ObtainClustersWithDistancesFromIVFIndex(qVector, + eligibleCentroidIDs) + + // Getting the nprobe value set at index time. + nprobe := vecIndex.GetNProbe() + + eligibleDocsTillNow := int64(0) + minEligibleCentroids := 0 + for i, centroidID := range closestCentroidIDs { + eligibleDocsTillNow += int64(centroidVecIDMap[centroidID].GetCardinality()) + if eligibleDocsTillNow >= k && i >= int(nprobe-1) { + // Continue till at least 'K' cumulative vectors are + // collected or 'nprobe' clusters are examined, whichever + // comes later. 
+ minEligibleCentroids = i + 1 + break + } + minEligibleCentroids = i + 1 + } + + // Search the clusters specified by 'closestCentroidIDs' for + // vectors whose IDs are present in 'vectorIDsToInclude' + scores, ids, err := vecIndex.SearchClustersFromIVFIndex( + vectorIDsToInclude, closestCentroidIDs, minEligibleCentroids, + k, qVector, centroidDistances, params) if err != nil { return nil, err } addIDsToPostingsList(rv, ids, scores) + return rv, nil } return rv, nil }, diff --git a/go.mod b/go.mod index bae00656..8036a9f6 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.21 require ( github.com/RoaringBitmap/roaring v1.9.3 github.com/blevesearch/bleve_index_api v1.1.12 - github.com/blevesearch/go-faiss v1.0.22-0.20240909180832-35a1ff78ead4 + github.com/blevesearch/go-faiss v1.0.22-0.20240919162919-05a9ee21155a github.com/blevesearch/mmap-go v1.0.4 github.com/blevesearch/scorch_segment_api/v2 v2.2.16 github.com/blevesearch/vellum v1.0.10 diff --git a/go.sum b/go.sum index 3b5bb897..e50648f2 100644 --- a/go.sum +++ b/go.sum @@ -4,8 +4,8 @@ github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZ github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+5kexNy1RXfegY= github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8= -github.com/blevesearch/go-faiss v1.0.22-0.20240909180832-35a1ff78ead4 h1:riy8XP3UIBeVjMhsq1r1aGfjvTf3aPp2PuXxdiw9P4s= -github.com/blevesearch/go-faiss v1.0.22-0.20240909180832-35a1ff78ead4/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= +github.com/blevesearch/go-faiss v1.0.22-0.20240919162919-05a9ee21155a h1:mSUfDoOPOLt0OABjiyQq/kQxOzAJmsgIjlAWUPfUDfc= +github.com/blevesearch/go-faiss v1.0.22-0.20240919162919-05a9ee21155a/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= github.com/blevesearch/mmap-go v1.0.4 
h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY=