Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MB-62230 - Pre-filtering performance optimisations #269

Merged
merged 7 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 56 additions & 4 deletions faiss_vector_posting.go
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,19 @@ func (sb *SegmentBase) InterpretVectorIndex(field string, requiresFiltering bool
if len(eligibleDocIDs) > 0 {
// Non-zero documents eligible per the filter query.

// If every element in the index is eligible (e.g. high selectivity
// cases), then this can basically be considered unfiltered kNN.
if len(eligibleDocIDs) == int(sb.numDocs) {
scores, ids, err := vecIndex.SearchWithoutIDs(qVector, k,
vectorIDsToExclude, params)
if err != nil {
return nil, err
}

addIDsToPostingsList(rv, ids, scores)
return rv, nil
}

// vector IDs corresponding to the local doc numbers to be
// considered for the search
vectorIDsToInclude := make([]int64, 0, len(eligibleDocIDs))
Expand Down Expand Up @@ -416,12 +429,51 @@ func (sb *SegmentBase) InterpretVectorIndex(field string, requiresFiltering bool

// Getting the vector IDs corresponding to the eligible
// doc IDs.
// The docVecIDMap maps each docID to the vectorIDs corresponding
// to it.
// Usually, each docID has one vecID mapped to it, unless
// the vector is nested, in which case there can be multiple
// vectorIDs mapped to the same docID.
// E.g. docID d1 -> vecID v1 in the first case, and
// docID d1 -> vecIDs {v1, v2} in the second case.
eligibleVecIDsBitmap := roaring.NewBitmap()
vecIDsUint32 := make([]uint32, 0)
metonymic-smokey marked this conversation as resolved.
Show resolved Hide resolved
for _, eligibleDocID := range eligibleDocIDs {
vecIDs := docVecIDMap[uint32(eligibleDocID)]
for _, vecID := range vecIDs {
eligibleVecIDsBitmap.Add(uint32(vecID))
vecIDsUint32 = append(vecIDsUint32, uint32(vecID))
}
}
eligibleVecIDsBitmap.AddMany(vecIDsUint32)

var selector faiss.Selector
var err error
// If there are more elements to be included than excluded, it
// might be quicker to use an exclusion selector as a filter
// instead of an inclusion selector.
if float32(eligibleVecIDsBitmap.GetCardinality())/
float32(len(vecDocIDMap)) > 0.5 {
metonymic-smokey marked this conversation as resolved.
Show resolved Hide resolved
ineligibleVectorIDs := make([]int64, 0, len(vecDocIDMap)-
len(vectorIDsToInclude))
for docID, vecIDs := range docVecIDMap {
for _, vecID := range vecIDs {
if !eligibleVecIDsBitmap.Contains(uint32(vecID)) {
if except != nil && !except.Contains(docID) {
ineligibleVectorIDs = append(ineligibleVectorIDs,
int64(vecID))
} else {
ineligibleVectorIDs = append(ineligibleVectorIDs,
int64(vecID))
}
}
}
}
selector, err = faiss.NewIDSelectorNot(ineligibleVectorIDs)
} else {
selector, err = faiss.NewIDSelectorBatch(vectorIDsToInclude)
}
if err != nil {
return nil, err
}

// Determining which clusters, identified by centroid ID,
Expand All @@ -430,7 +482,7 @@ func (sb *SegmentBase) InterpretVectorIndex(field string, requiresFiltering bool
eligibleCentroidIDs := make([]int64, 0)
for centroidID, vecIDs := range centroidVecIDMap {
vecIDs.And(eligibleVecIDsBitmap)
if vecIDs.GetCardinality() > 0 {
if !vecIDs.IsEmpty() {
metonymic-smokey marked this conversation as resolved.
Show resolved Hide resolved
// The mapping is now reduced to those vectors which
// are also eligible docs for the filter query.
centroidVecIDMap[centroidID] = vecIDs
Expand Down Expand Up @@ -467,8 +519,8 @@ func (sb *SegmentBase) InterpretVectorIndex(field string, requiresFiltering bool
// Search the clusters specified by 'closestCentroidIDs' for
// vectors that are accepted by 'selector'.
scores, ids, err := vecIndex.SearchClustersFromIVFIndex(
vectorIDsToInclude, closestCentroidIDs, minEligibleCentroids,
k, qVector, centroidDistances, params)
selector, len(vectorIDsToInclude), closestCentroidIDs,
minEligibleCentroids, k, qVector, centroidDistances, params)
if err != nil {
return nil, err
}
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.21
require (
github.com/RoaringBitmap/roaring v1.9.3
github.com/blevesearch/bleve_index_api v1.1.12
github.com/blevesearch/go-faiss v1.0.22
github.com/blevesearch/go-faiss v1.0.23
github.com/blevesearch/mmap-go v1.0.4
github.com/blevesearch/scorch_segment_api/v2 v2.2.16
github.com/blevesearch/vellum v1.0.10
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZ
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+5kexNy1RXfegY=
github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/go-faiss v1.0.22 h1:j6jwgCOy2a2EQUTOYxjBA59rMn5KPA0jbfYyHNgc2Ls=
github.com/blevesearch/go-faiss v1.0.22/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
github.com/blevesearch/go-faiss v1.0.23 h1:Wmc5AFwDLKGl2L6mjLX1Da3vCL0EKa2uHHSorcIS1Uc=
github.com/blevesearch/go-faiss v1.0.23/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY=
Expand Down
Loading