diff --git a/.gitignore b/.gitignore
index c013e51..e4ecd84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,4 +158,4 @@
 evaluation_datasets/
 *.tsv
 /test-model/
-
+colbert-training/
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7a5a5ab
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,9 @@
+test:
+	pytest pylate
+	pytest tests
+
+ruff:
+	ruff format pylate
+
+lint:
+	ruff check pylate
\ No newline at end of file
diff --git a/README.md b/README.md
index 631cb90..3baf043 100644
--- a/README.md
+++ b/README.md
@@ -1,37 +1,40 @@
-# giga-cherche
+<div align="center">
+  <h1>Pylate</h1>
+  <p>Efficient training and retrieval with ColBERT</p>
+</div>
+
-giga-cherche is a library based on [sentence-transformers](https://github.com/UKPLab/sentence-transformers) to train and use ColBERT models.
+
-# Installation
+<p align="center">
+  <a href="https://lighton.github.io/pylate/">documentation</a> · license
+</p>
-giga-cherche can be installed by running the setup.py file with the needed extras from the following list:
-- ```index``` if you want to use the proposed indexes
-- ```eval``` if you need to run BEIR evaluations
-- ```dev``` if you want to contribute to the repository
-
-For example, to run the BEIR evaluations using giga-cherche indexes:
-```python setup.py install --extras eval, index```
+Pylate is a library built on top of Sentence Transformers, designed to simplify and optimize training, inference, and retrieval with ColBERT models. With Pylate, you can efficiently train ColBERT models using triplet loss or knowledge distillation, and deploy them for document retrieval with ease.
+
-# Modeling
-The modeling of giga-cherche is based on sentence-transformers which allow to build a ColBERT model from any encoder available by appending a projection layer applied to the output of the encoders to reduce the embeddings dimension.
+## Installation
+
+Pylate can be installed with pip:
+
+```bash
+pip install pylate
 ```
-from pylate import models
-model_name = "bert-base-uncased"
-model = models.ColBERT(model_name_or_path=model_name)
+
+To install the evaluation dependencies as well:
+
+```bash
+pip install "pylate[eval]"
 ```
-The following parameters can be passed to the constructor to set different properties of the model:
-- ```embedding_size```, the output size of the projection layer and so the dimension of the embeddings
-- ```query_prefix```, the string version of the query marker to be prepended when encoding queries
-- ```document_prefix```, the string version of the document marker to be prepended when encoding documents
-- ```query_length```, the length of the query to truncate / pad to with mask tokens
-- ```document_length```, the length of the document to truncate
-- ```attend_to_expansion_tokens```, whether queries tokens should attend to MASK expansion tokens (original ColBERT did not)
-- ```skiplist_words```, a list of words to ignore in documents during scoring (default to punctuation)
-## Training
+## Documentation
+
+The complete documentation is available [here](https://lighton.github.io/pylate/); it includes in-depth guides, examples, and API references.
+
-Given that giga-cherche ColBERT models are sentence-transformers models, we can benefit from all the bells and whistles from the latest update, including multi-gpu and BF16 training.
-For now, you can train ColBERT models using triplets dataset (datasets containing a positive and a negative for each query). The syntax is the same as sentence-transformers, using the specific elements adapted to ColBERT from giga-cherche:
+## Training
+
+Here is a simple example of training a ColBERT model on the MS MARCO dataset with Pylate. The script trains with triplet loss and evaluates the model on a held-out test set.
 
 ```python
 from datasets import load_dataset
@@ -39,36 +42,50 @@ from sentence_transformers import (
     SentenceTransformerTrainer,
     SentenceTransformerTrainingArguments,
 )
+from sentence_transformers.training_args import BatchSamplers
 
-from pylate import losses, models, datasets, evaluation
-
-model_name = "bert-base-uncased"
-batch_size = 32
-num_train_epochs = 1
-output_dir = "colbert_base"
+from pylate import evaluation, losses, models, utils
 
-model = models.ColBERT(model_name_or_path=model_name)
+# Define the model
+model = models.ColBERT(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")
 
+# Load dataset
 dataset = load_dataset("sentence-transformers/msmarco-bm25", "triplet", split="train")
 
-splits = dataset.train_test_split(test_size=0.1)
-train_dataset = splits["train"]
-eval_dataset = splits["test"]
-train_loss = losses.ColBERT(model=model)
+# Split the dataset to create a test set
+splits = dataset.train_test_split(test_size=0.01)
+train_dataset, eval_dataset = splits["train"], splits["test"]
+
+# Shuffle and select a subset of the dataset for demonstration purposes
+MAX_TRAIN_SIZE, MAX_EVAL_SIZE = 100, 100
+train_dataset = train_dataset.shuffle(seed=21).select(range(MAX_TRAIN_SIZE))
+eval_dataset = eval_dataset.shuffle(seed=21).select(range(MAX_EVAL_SIZE))
+
+# Define the loss function
+train_loss = losses.Contrastive(model=model)
+
+args = SentenceTransformerTrainingArguments(
+    output_dir="colbert-training",
+    num_train_epochs=1,
+    per_device_train_batch_size=32,
+    per_device_eval_batch_size=32,
+    fp16=False,  # Set to True if your GPU supports FP16
+    bf16=False,  # Set to True if your GPU supports BF16
+    batch_sampler=BatchSamplers.NO_DUPLICATES,
+    # Tracking parameters:
+    eval_strategy="steps",
+    eval_steps=0.1,
+    save_strategy="steps",
+    save_steps=5000,
+    save_total_limit=2,
+    learning_rate=3e-6,
+)
+
+# Evaluation procedure
 dev_evaluator = evaluation.ColBERTTripletEvaluator(
     anchors=eval_dataset["query"],
     positives=eval_dataset["positive"],
    negatives=eval_dataset["negative"],
 )
 
-args = SentenceTransformerTrainingArguments(
-    output_dir=output_dir,
-    num_train_epochs=num_train_epochs,
-    per_device_train_batch_size=batch_size,
-    per_device_eval_batch_size=batch_size,
-    bf16=True,
-    learning_rate=3e-6,
-)
 
 trainer = SentenceTransformerTrainer(
     model=model,
@@ -77,174 +94,145 @@ trainer = SentenceTransformerTrainer(
     eval_dataset=eval_dataset,
     loss=train_loss,
     evaluator=dev_evaluator,
-    data_collator=utils.ColBERTCollator(model.tokenize),
+    data_collator=utils.ColBERTCollator(tokenize_fn=model.tokenize),
 )
 
 trainer.train()
-```
-
-## Tokenization
-```
-import ast
-
-def add_queries_and_documents(Examples dict) -> dict:
-    """Add queries and documents text to the examples."""
-    scores = ast.literal_eval(node_or_string=example["scores"])
-    processed_example = {"scores": scores, "query": queries[example["query_id"]]}
-
-    n_scores = len(scores)
-    for i in range(n_scores):
-        processed_example[f"document_{i}"] = documents[example[f"document_id_{i}"]]
-
-    return processed_example
+
+model.save_pretrained("custom-colbert-model")
 ```
 
-## Inference
-Once trained, the model can then be loaded to perform inference (you can also load the models directly from Hugging Face, for example using the provided ColBERTv2 model [NohTow/colbertv2_sentence_transformer](https://huggingface.co/NohTow/colbertv2_sentence_transformer)):
+After training, the model can be loaded like this:
 
 ```python
-model = ColBERT(
-    "NohTow/colbertv2_sentence_transformer",
-)
-```
-
-You can then call the ```encode``` function to get the embeddings corresponding to your queries:
+from pylate import models
 
-```python
-queries_embeddings = model.encode(
-    ["Who is the president of the USA?", "When was the last president of the USA elected?"],
-    )
+model = models.ColBERT(model_name_or_path="custom-colbert-model")
 ```
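+
+The loaded model can then be used like the freshly trained one, for example to embed queries and documents. A minimal sketch for illustration (the example strings are arbitrary; the same `encode` calls appear in the Retrieve section below):
+
+```python
+queries_embeddings = model.encode(
+    ["What is late interaction retrieval?"],
+    is_query=True,  # Queries and documents are encoded differently
+)
+
+documents_embeddings = model.encode(
+    ["Late interaction models such as ColBERT keep one embedding per token."],
+    is_query=False,
+)
+```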
 
-When encoding documents, simply set the ```is_query``` parameter to false:
+## Datasets
+
+Pylate supports Hugging Face [Datasets](https://huggingface.co/docs/datasets/en/index), enabling seamless triplet- or knowledge-distillation-based training. Below is an example of creating a custom dataset for training:
 
 ```python
-documents_embeddings = model.encode(
-    ["Joseph Robinette Biden Jr. is an American politician who is the 46th and current president of the United States since 2021. A member of the Democratic Party, he previously served as the 47th vice president from 2009 to 2017 under President Barack Obama and represented Delaware in the United States Senate from 1973 to 2009.", "Donald John Trump (born June 14, 1946) is an American politician, media personality, and businessman who served as the 45th president of the United States from 2017 to 2021."],
-    is_query=False,
-    )
+from datasets import Dataset
+
+dataset = [
+    {
+        "query": "example query 1",
+        "positive": "example positive document 1",
+        "negative": "example negative document 1",
+    },
+    {
+        "query": "example query 2",
+        "positive": "example positive document 2",
+        "negative": "example negative document 2",
+    },
+    {
+        "query": "example query 3",
+        "positive": "example positive document 3",
+        "negative": "example negative document 3",
+    },
+]
+
+dataset = Dataset.from_list(mapping=dataset)
+
+splits = dataset.train_test_split(test_size=0.3)
+train_dataset, test_dataset = splits["train"], splits["test"]
 ```
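+
+The resulting splits can be plugged into the same training pipeline shown in the Training section. A minimal sketch reusing the components introduced above (hyperparameters are omitted here; copy them from the Training example as needed):
+
+```python
+from sentence_transformers import (
+    SentenceTransformerTrainer,
+    SentenceTransformerTrainingArguments,
+)
+
+from pylate import losses, models, utils
+
+model = models.ColBERT(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")
+
+trainer = SentenceTransformerTrainer(
+    model=model,
+    args=SentenceTransformerTrainingArguments(output_dir="colbert-training"),
+    train_dataset=train_dataset,
+    eval_dataset=test_dataset,
+    loss=losses.Contrastive(model=model),
+    data_collator=utils.ColBERTCollator(tokenize_fn=model.tokenize),
+)
+
+trainer.train()
+```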
 
-By default, this will return a list of numpy arrays containing the different embeddings of each sequence in the batch. You can pass the argument ```convert_to_tensor=True``` to get a list of tensors.
-
-We also provide the option to pool the document embeddings using hierarchical clustering. Our recent study showed that we can pool the document embeddings by a factor of 2 to halve the memory consumption of the embeddings without degrading performance. This is done by feeding ```pool_factor=2```to the encode function. Bigger pooling values can be used to obtain different size/performance trade-offs.
-Note that query embeddings cannot be pooled.
+## Retrieve
 
-You can then compute the ColBERT max-sim scores like this:
+Pylate makes it easy to retrieve the top documents for a set of queries using a trained ColBERT model together with a Voyager index.
 
 ```python
-from pylate import scores
-similarity_scores = scores.colbert_scores(query_embeddings, document_embeddings)
-```
+from pylate import indexes, models, retrieve
 
-## Indexing
+model = models.ColBERT(
+    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
+)
 
-We provide a ColBERT index based on the [Weaviate vectordb](https://weaviate.io/). To speed-up the processing, the latest async client is used and the document candidates are generated using an HNSW index, which replace the IVF index from the original ColBERT.
+index = indexes.Voyager(override=True)
 
-Before being able to create and use an index, you need to need to launch the Weaviate server using Docker (```docker compose up```).
+retriever = retrieve.ColBERT(index=index)
+```
 
-To populate an index, simply create it and then add the computed embeddings with their corresponding ids:
+Once the model and index are set up, we can add documents to the index:
 
 ```python
-from pylate import indexes
+documents_ids = ["1", "2", "3"]
 
-index = indexes.Weaviate(name="test_index")
+documents = [
+    "document 1 text", "document 2 text", "document 3 text"
+]
 
+# Encode the documents
 documents_embeddings = model.encode(
-    ["Document text 1", "Document text 2"],
-    is_query=False,
+    documents,
+    batch_size=32,
+    is_query=False,  # Encoding documents
+    show_progress_bar=True,
 )
 
+# Add the documents ids and embeddings to the Voyager index
 index.add_documents(
-    doc_ids=["1", "2"],
-    doc_embeddings=documents_embeddings,
+    documents_ids=documents_ids,
+    documents_embeddings=documents_embeddings,
 )
 ```
 
-We can also remove documents from the index using their ids:
-
-```python
-index.remove_documents(["1"])
-```
-
-To retrieve documents from the index, you can use the following code snippet:
+Then we can retrieve the top-k documents for a given set of queries:
 
 ```python
-from pylate import retrieve
-
-retriever = retrieve.ColBERT(Weaviate)
-
 queries_embeddings = model.encode(
-    ["A query related to the documents", "Another query"],
+    ["query for document 3", "query for document 1"],
+    batch_size=32,
+    is_query=True,  # Encoding queries
+    show_progress_bar=True,
 )
 
-retrieved_chunks = retriever.retrieve(queries_embeddings, k=10)
-```
-
-You can also simply rerank a list of ids produced by an upstream retrieval module (such as BM25):
-
-```python
-from pylate import rerank
-
-reranker = rerank.ColBERT(Weaviate)
-
-reranked_chunks = reranker.rerank(
-    queries_embeddings, batch_doc_ids=[["7912", "4983"], ["8726", "7891"]]
+scores = retriever.retrieve(
+    queries_embeddings=queries_embeddings,
+    k=10,
 )
-```
-
-## Evaluation
+print(scores)
+```
 
-We can eavaluate the performance of the model using the BEIR evaluation framework. The following code snippet shows how to evaluate the model on the SciFact dataset:
+Sample output:
 
 ```python
-from pylate import evaluation, indexes, models, retrieve, utils
-
-model = models.ColBERT(
-    model_name_or_path="NohTow/colbertv2_sentence_transformer",
-)
-index = indexes.Weaviate(recreate=True, max_doc_length=model.document_length)
-
-retriever = retrieve.ColBERT(index=index)
-
-# Input dataset for evaluation
-documents, queries, qrels = evaluation.load_beir(
-    dataset_name="scifact",
-    split="test",
-)
+[
+    [
+        {"id": "3", "score": 11.266985893249512},
+        {"id": "1", "score": 10.303335189819336},
+        {"id": "2", "score": 9.502392768859863},
+    ],
+    [
+        {"id": "1", "score": 10.88800048828125},
+        {"id": "3", "score": 9.950843811035156},
+        {"id": "2", "score": 9.602447509765625},
+    ],
+]
+```
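+
+For illustration, the `scores` module (see `pylate/scores/__init__.py` below) also exposes the ColBERT MaxSim scoring functions directly, so query and document embeddings can be compared without building an index. A minimal sketch reusing the embeddings computed above (depending on your settings, you may need `convert_to_tensor=True` when calling `encode`):
+
+```python
+from pylate import scores
+
+# Late-interaction (MaxSim) scores between every query and every document
+similarity_scores = scores.colbert_scores(queries_embeddings, documents_embeddings)
+
+print(similarity_scores)
+```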
+
+## Contributing
 
-for batch in utils.iter_batch(documents, batch_size=500):
-    documents_embeddings = model.encode(
-        sentences=[document["text"] for document in batch],
-        convert_to_numpy=True,
-        is_query=False,
-    )
+We welcome contributions! To get started:
 
-    index.add_documents(
-        doc_ids=[document["id"] for document in batch],
-        doc_embeddings=documents_embeddings,
-    )
+1. Install the development dependencies:
+
+```bash
+pip install "pylate[dev]"
+```
 
-scores = []
-for batch in utils.iter_batch(queries, batch_size=5):
-    queries_embeddings = model.encode(
-        sentences=[query["text"] for query in batch],
-        convert_to_numpy=True,
-        is_query=True,
-    )
+2. Run tests:
+
-    scores.extend(retriever.retrieve(queries=queries_embeddings, k=10))
+```bash
+make test
+```
+
+3. Format code with Ruff:
+
-
-print(
-    evaluation.evaluate(
-        scores=scores,
-        qrels=qrels,
-        queries=queries,
-        metrics=["map", "ndcg@10", "ndcg@100", "recall@10", "recall@100"],
-    )
-)
+```bash
+make ruff
 ```
\ No newline at end of file
diff --git a/docs/img/logo.png b/docs/img/logo.png
new file mode 100644
index 0000000..a4449a3
Binary files /dev/null and b/docs/img/logo.png differ
diff --git a/docs/img/logo_2.png b/docs/img/logo_2.png
new file mode 100644
index 0000000..9b19354
Binary files /dev/null and b/docs/img/logo_2.png differ
diff --git a/pylate/scores/__init__.py b/pylate/scores/__init__.py
index f29e636..e80a640 100644
--- a/pylate/scores/__init__.py
+++ b/pylate/scores/__init__.py
@@ -1,3 +1,3 @@
-from .scores import colbert_scores, colbert_scores_pairwise, colbert_kd_scores
+from .scores import colbert_kd_scores, colbert_scores, colbert_scores_pairwise
 
-__all__ = ["colbert_scores", "colbert_scores_pairwise", "colbert_kd_scores"]
\ No newline at end of file
+__all__ = ["colbert_scores", "colbert_scores_pairwise", "colbert_kd_scores"]
diff --git a/setup.py b/setup.py
index 86f6c05..f8aedf7 100644
--- a/setup.py
+++ b/setup.py
@@ -11,6 +11,7 @@
     "accelerate >= 0.31.0",
     "voyager >= 2.0.9",
     "sqlitedict >= 2.1.0",
+    "pandas >= 2.2.1",
 ]