Skip to content

Commit

Permalink
MB-62230 - Optimizing cluster selection in pre-filtered kNN (#261)
Browse files Browse the repository at this point in the history
Modifies the pre-filtering kNN path for IVF indexes to factor in the distribution of filtered hits in the index.
1. Keep only the clusters which have at least one filter hit.
2. Rank the filtered clusters in decreasing order of proximity.
3. Pick the closest clusters until at least 'nprobe' clusters have been selected and they cumulatively contain at least K filter hits, whichever comes later (or until the filtered clusters are exhausted).
4. Search the selected clusters for the k nearest neighbours from among the filtered hits.

There is no change in the pre-filtered search path for Flat indexes.

---------

Co-authored-by: Abhinav Dangeti <[email protected]>
  • Loading branch information
metonymic-smokey and abhinavdangeti authored Sep 19, 2024
1 parent e148470 commit f2ee767
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 5 deletions.
85 changes: 83 additions & 2 deletions faiss_vector_posting.go
Original file line number Diff line number Diff line change
Expand Up @@ -387,13 +387,94 @@ func (sb *SegmentBase) InterpretVectorIndex(field string, requiresFiltering bool
vectorIDsToInclude = append(vectorIDsToInclude, docVecIDMap[uint32(id)]...)
}

scores, ids, err := vecIndex.SearchWithIDs(qVector, k,
vectorIDsToInclude, params)
// Retrieve the mapping of centroid IDs to vectors within
// the cluster.
clusterAssignment, _ := vecIndex.ObtainClusterToVecIDsFromIVFIndex()
// Accounting for a flat index
if len(clusterAssignment) == 0 {
scores, ids, err := vecIndex.SearchWithIDs(qVector, k,
vectorIDsToInclude, params)
if err != nil {
return nil, err
}

addIDsToPostingsList(rv, ids, scores)
return rv, nil
}

// Converting to roaring bitmap for ease of intersect ops with
// the set of eligible doc IDs.
centroidVecIDMap := make(map[int64]*roaring.Bitmap)
for centroidID, vecIDs := range clusterAssignment {
if _, exists := centroidVecIDMap[centroidID]; !exists {
centroidVecIDMap[centroidID] = roaring.NewBitmap()
}
for _, vecID := range vecIDs {
centroidVecIDMap[centroidID].Add(uint32(vecID))
}
}

// Getting the vector IDs corresponding to the eligible
// doc IDs.
eligibleVecIDsBitmap := roaring.NewBitmap()
for _, eligibleDocID := range eligibleDocIDs {
vecIDs := docVecIDMap[uint32(eligibleDocID)]
for _, vecID := range vecIDs {
eligibleVecIDsBitmap.Add(uint32(vecID))
}
}

// Determining which clusters, identified by centroid ID,
// have at least one eligible vector and hence, ought to be
// probed.
eligibleCentroidIDs := make([]int64, 0)
for centroidID, vecIDs := range centroidVecIDMap {
vecIDs.And(eligibleVecIDsBitmap)
if vecIDs.GetCardinality() > 0 {
// The mapping is now reduced to those vectors which
// are also eligible docs for the filter query.
centroidVecIDMap[centroidID] = vecIDs
eligibleCentroidIDs = append(eligibleCentroidIDs, centroidID)
} else {
// don't consider clusters with no eligible IDs.
delete(centroidVecIDMap, centroidID)
}
}

// Ordering the retrieved centroid IDs by increasing order
// of distance i.e. decreasing order of proximity to query vector.
closestCentroidIDs, centroidDistances, _ :=
vecIndex.ObtainClustersWithDistancesFromIVFIndex(qVector,
eligibleCentroidIDs)

// Getting the nprobe value set at index time.
nprobe := vecIndex.GetNProbe()

eligibleDocsTillNow := int64(0)
minEligibleCentroids := 0
for i, centroidID := range closestCentroidIDs {
eligibleDocsTillNow += int64(centroidVecIDMap[centroidID].GetCardinality())
if eligibleDocsTillNow >= k && i >= int(nprobe-1) {
// Continue till at least 'K' cumulative vectors are
// collected or 'nprobe' clusters are examined, whichever
// comes later.
minEligibleCentroids = i + 1
break
}
minEligibleCentroids = i + 1
}

// Search the clusters specified by 'closestCentroidIDs' for
// vectors whose IDs are present in 'vectorIDsToInclude'
scores, ids, err := vecIndex.SearchClustersFromIVFIndex(
vectorIDsToInclude, closestCentroidIDs, minEligibleCentroids,
k, qVector, centroidDistances, params)
if err != nil {
return nil, err
}

addIDsToPostingsList(rv, ids, scores)
return rv, nil
}
return rv, nil
},
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.21
require (
github.com/RoaringBitmap/roaring v1.9.3
github.com/blevesearch/bleve_index_api v1.1.12
github.com/blevesearch/go-faiss v1.0.22-0.20240909180832-35a1ff78ead4
github.com/blevesearch/go-faiss v1.0.22-0.20240919162919-05a9ee21155a
github.com/blevesearch/mmap-go v1.0.4
github.com/blevesearch/scorch_segment_api/v2 v2.2.16
github.com/blevesearch/vellum v1.0.10
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZ
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+5kexNy1RXfegY=
github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/go-faiss v1.0.22-0.20240909180832-35a1ff78ead4 h1:riy8XP3UIBeVjMhsq1r1aGfjvTf3aPp2PuXxdiw9P4s=
github.com/blevesearch/go-faiss v1.0.22-0.20240909180832-35a1ff78ead4/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
github.com/blevesearch/go-faiss v1.0.22-0.20240919162919-05a9ee21155a h1:mSUfDoOPOLt0OABjiyQq/kQxOzAJmsgIjlAWUPfUDfc=
github.com/blevesearch/go-faiss v1.0.22-0.20240919162919-05a9ee21155a/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY=
Expand Down

0 comments on commit f2ee767

Please sign in to comment.