Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add raw() to DenseSearchResult and PRFDenseSearchResult #1876

Closed
wants to merge 8 commits into from
16 changes: 16 additions & 0 deletions docs/usage-fetch.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Pyserini: Fetching Document Content

## Using a Sparse Representation
Another commonly used feature in Pyserini is to fetch a document (i.e., its text) given its `docid`.
A sparse (Lucene) index can be configured to include the raw document text, in which case the `doc()` method can be used to fetch the document:

Expand Down Expand Up @@ -60,3 +61,18 @@ Thus, a simple way to iterate through all documents in the collection (and for e
for i in range(searcher.num_docs):
print(searcher.doc(i).docid())
```

## Using a Dense Representation

A similar operation can be performed using a dense (Faiss) index **for prebuilt indexes only**.
Note that internally, the corresponding sparse (Lucene) index is used to fetch document content.

```python
from pyserini.search.faiss import FaissSearcher, AutoQueryEncoder

encoder = AutoQueryEncoder('BAAI/bge-base-en-v1.5', device='cpu', pooling='mean', l2_norm=True)
searcher = FaissSearcher.from_prebuilt_index('beir-v1.0.0-nfcorpus.bge-base-en-v1.5', encoder)
doc = searcher.doc('MED-14')
```

Since a sparse index is used internally, all of the methods available on a document returned by a `LuceneSearcher` (see the section above) apply to the `doc` returned here as well.
26 changes: 19 additions & 7 deletions pyserini/search/faiss/_searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,13 +419,24 @@ def encode(self, query: str):
class DenseSearchResult:
    """Single hit from a dense (Faiss) retrieval run.

    Attributes:
        docid: identifier of the retrieved document.
        score: similarity score assigned by the dense retriever.
        ssearcher: backing sparse (Lucene) searcher used to fetch the raw
            document text; only populated for prebuilt indexes, otherwise
            ``None`` (e.g., the hybrid searcher constructs results with
            ``None`` here).
    """
    docid: str
    score: float
    # Quoted forward reference: the field may legitimately be None, so the
    # annotation is Union[..., None] rather than a bare LuceneSearcher.
    ssearcher: "Union[LuceneSearcher, None]"

    def raw(self):
        """Return the raw text of this hit, or ``None`` when no sparse
        (Lucene) index is available to fetch it from."""
        if self.ssearcher is None:
            return None
        return self.ssearcher.doc(self.docid).raw()

@dataclass
class PRFDenseSearchResult:
    """Single hit from a dense retrieval run that also carries the document
    vector, for pseudo-relevance feedback (PRF).

    Attributes:
        docid: identifier of the retrieved document.
        score: similarity score assigned by the dense retriever.
        vectors: the document's dense vector (used for PRF).
        ssearcher: backing sparse (Lucene) searcher used to fetch the raw
            document text; only populated for prebuilt indexes, otherwise
            ``None``.
    """
    docid: str
    score: float
    # Was annotated as the literal `[float]` (a one-element list, not a
    # type); List[float] is the intended annotation.
    vectors: List[float]
    # Quoted forward reference: the field may legitimately be None.
    ssearcher: "Union[LuceneSearcher, None]"

    def raw(self):
        """Return the raw text of this hit, or ``None`` when no sparse
        (Lucene) index is available to fetch it from."""
        if self.ssearcher is None:
            return None
        return self.ssearcher.doc(self.docid).raw()


class FaissSearcher:
Expand All @@ -449,6 +460,7 @@ def __init__(self, index_dir: str, query_encoder: Union[QueryEncoder, str],
self.num_docs = self.index.ntotal

assert self.docids is None or self.num_docs == len(self.docids)
self.ssearcher = None
if prebuilt_index_name:
sparse_index = get_sparse_index(prebuilt_index_name)
self.ssearcher = LuceneSearcher.from_prebuilt_index(sparse_index)
Expand Down Expand Up @@ -525,7 +537,7 @@ def search(self, query: Union[str, np.ndarray], k: int = 10, threads: int = 1, r
vectors = vectors[0]
distances = distances.flat
indexes = indexes.flat
return emb_q, [PRFDenseSearchResult(self.docids[idx], score, vector)
return emb_q, [PRFDenseSearchResult(self.docids[idx], score, vector, self.ssearcher)
for score, idx, vector in zip(distances, indexes, vectors) if idx != -1]
else:
distances, indexes = self.index.search(emb_q, k)
Expand All @@ -537,9 +549,9 @@ def search(self, query: Union[str, np.ndarray], k: int = 10, threads: int = 1, r
for score, idx in zip(distances, indexes):
if idx not in unique_docs:
unique_docs.add(idx)
results.append(DenseSearchResult(self.docids[idx],score))
results.append(DenseSearchResult(self.docids[idx], score, self.sssearcher))
return results
return [DenseSearchResult(self.docids[idx], score)
return [DenseSearchResult(self.docids[idx], score, self.ssearcher)
for score, idx in zip(distances, indexes) if idx != -1]

def batch_search(self, queries: Union[List[str], np.ndarray], q_ids: List[str], k: int = 10,
Expand Down Expand Up @@ -576,12 +588,12 @@ def batch_search(self, queries: Union[List[str], np.ndarray], q_ids: List[str],
faiss.omp_set_num_threads(threads)
if return_vector:
D, I, V = self.index.search_and_reconstruct(q_embs, k)
return q_embs, {key: [PRFDenseSearchResult(self.docids[idx], score, vector)
return q_embs, {key: [PRFDenseSearchResult(self.docids[idx], score, vector, self.ssearcher)
for score, idx, vector in zip(distances, indexes, vectors) if idx != -1]
for key, distances, indexes, vectors in zip(q_ids, D, I, V)}
else:
D, I = self.index.search(q_embs, k)
return {key: [DenseSearchResult(self.docids[idx], score)
return {key: [DenseSearchResult(self.docids[idx], score, self.ssearcher)
for score, idx in zip(distances, indexes) if idx != -1]
for key, distances, indexes in zip(q_ids, D, I)}

Expand Down Expand Up @@ -681,7 +693,7 @@ def search(self, query: str, k: int = 10, binary_k: int = 100, rerank: bool = Tr
distances, indexes = self.binary_dense_search(k, binary_k, rerank, dense_emb_q, sparse_emb_q)
distances = distances.flat
indexes = indexes.flat
return [DenseSearchResult(str(idx), score)
return [DenseSearchResult(str(idx), score, self.ssearcher)
for score, idx in zip(distances, indexes) if idx != -1]

def batch_search(self, queries: List[str], q_ids: List[str], k: int = 10, binary_k: int = 100,
Expand Down Expand Up @@ -721,7 +733,7 @@ def batch_search(self, queries: List[str], q_ids: List[str], k: int = 10, binary
assert m == self.dimension
faiss.omp_set_num_threads(threads)
D, I = self.binary_dense_search(k, binary_k, rerank, dense_q_embs, sparse_q_embs)
return {key: [DenseSearchResult(str(idx), score)
return {key: [DenseSearchResult(str(idx), score, self.ssearcher)
for score, idx in zip(distances, indexes) if idx != -1]
for key, distances, indexes in zip(q_ids, D, I)}

Expand Down
2 changes: 1 addition & 1 deletion pyserini/search/hybrid/_searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,5 @@ def _hybrid_results(dense_results, sparse_results, alpha, k, normalization=False
dense_score = (dense_score - (min_dense_score + max_dense_score) / 2) \
/ (max_dense_score - min_dense_score)
score = alpha * sparse_score + dense_score if not weight_on_dense else sparse_score + alpha * dense_score
hybrid_result.append(DenseSearchResult(doc, score))
hybrid_result.append(DenseSearchResult(doc, score, None))
return sorted(hybrid_result, key=lambda x: x.score, reverse=True)[:k]
16 changes: 16 additions & 0 deletions tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
from urllib.request import urlretrieve

from pyserini.search.lucene import LuceneSearcher, JScoredDoc
from pyserini.search.faiss import FaissSearcher, AutoQueryEncoder
from pyserini.index.lucene import Document
from pyserini.util import get_sparse_index


class TestSearch(unittest.TestCase):
Expand Down Expand Up @@ -409,6 +411,20 @@ def test_doc_by_field(self):
# Should return None if we request a docid that doesn't exist
self.assertTrue(self.searcher.doc_by_field('foo', 'bar') is None)

def test_dense_search_result_raw(self):
    """raw() on dense hits should return the same text as fetching the
    document directly from the backing sparse (Lucene) index.

    Uses a prebuilt index because raw() only works when a corresponding
    sparse index is available.
    """
    dense_index = 'beir-v1.0.0-nfcorpus.bge-base-en-v1.5'

    encoder = AutoQueryEncoder('BAAI/bge-base-en-v1.5', device='cpu', pooling='mean', l2_norm=True)
    faiss_searcher = FaissSearcher.from_prebuilt_index(dense_index, encoder)
    hits = faiss_searcher.search('How to Help Prevent Abdominal Aortic Aneurysms')
    lucene_searcher = LuceneSearcher.from_prebuilt_index(get_sparse_index(dense_index))

    # Check the top hits via subTest so one mismatch doesn't hide the rest.
    for rank, hit in enumerate(hits[:4]):
        with self.subTest(rank=rank, docid=hit.docid):
            self.assertEqual(lucene_searcher.doc(hit.docid).raw(), hit.raw())

@classmethod
def tearDownClass(cls):
cls.searcher.close()
Expand Down