Merge pull request #43 from lightonai/documentation

Documentation
lightonai · Aug 27, 2024 · eb591f4 · eb591f4
2 parents 5a0c28f + 027692f
commit eb591f4
Show file tree

Hide file tree

Showing 23 changed files with 866 additions and 444 deletions.
diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@
 
 <div align="center">
   <!-- Documentation -->
-  <a href="https://github.com/lightonai/pylate"><img src="https://img.shields.io/badge/Documentation-purple.svg?style=flat-square" alt="documentation"></a>
+  <a href="https://lightonai.github.io/pylate/"><img src="https://img.shields.io/badge/Documentation-purple.svg?style=flat-square" alt="documentation"></a>
   <!-- License -->
   <a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-blue.svg?style=flat-square" alt="license"></a>
 </div>
@@ -32,123 +32,255 @@ pip install "pylate[eval]"
 
 The complete documentation is available [here](https://lightonai.github.io/pylate/), which includes in-depth guides, examples, and API references.
 
-## Datasets
-
-PyLate supports Hugging Face [Datasets](https://huggingface.co/docs/datasets/en/index), enabling seamless triplet / knowledge distillation based training. Below is an example of creating a custom dataset for training:
-
-```python
-from datasets import Dataset
-
-dataset = [
-    {
-        "query": "example query 1",
-        "positive": "example positive document 1",
-        "negative": "example negative document 1",
-    },
-    {
-        "query": "example query 2",
-        "positive": "example positive document 2",
-        "negative": "example negative document 2",
-    },
-    {
-        "query": "example query 3",
-        "positive": "example positive document 3",
-        "negative": "example negative document 3",
-    },
-]
-
-dataset = Dataset.from_list(mapping=dataset)
-
-train_dataset, test_dataset = dataset.train_test_split(test_size=0.3)
-```
-
 ## Training
+### Contrastive training
 
-Here’s a simple example of training a ColBERT model on the MSMARCO dataset using PyLate. This script demonstrates training with triplet loss and evaluating the model on a test set.
+Here’s a simple example of training a ColBERT model on the MS MARCO triplet dataset using PyLate. This script demonstrates training with contrastive loss and evaluating the model on a held-out eval set:
 
 ```python
+import torch
 from datasets import load_dataset
 from sentence_transformers import (
     SentenceTransformerTrainer,
     SentenceTransformerTrainingArguments,
 )
-from sentence_transformers.training_args import BatchSamplers
 
 from pylate import evaluation, losses, models, utils
 
-# Define the model
-model = models.ColBERT(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")
+# Define model parameters for contrastive training
+model_name = "bert-base-uncased"  # Choose the pre-trained model you want to use as base
+batch_size = 32  # Larger batch size often improves results, but requires more memory
 
-# Load dataset
-dataset = load_dataset("sentence-transformers/msmarco-bm25", "triplet", split="train")
+num_train_epochs = 1  # Adjust based on your requirements
+# Set the run name for logging and output directory
+run_name = "contrastive-bert-base-uncased"
+output_dir = f"output/{run_name}"
 
-# Split the dataset to create a test set
-train_dataset, eval_dataset = dataset.train_test_split(test_size=0.01)
+# Define the ColBERT model. If the base checkpoint is not already a ColBERT model, a linear layer will be added to the base encoder.
+model = models.ColBERT(model_name_or_path=model_name)
 
-# Shuffle and select a subset of the dataset for demonstration purposes
-MAX_TRAIN_SIZE, MAX_EVAL_SIZE = 100, 100
-train_dataset = train_dataset.shuffle(seed=21).select(range(MAX_TRAIN_SIZE))
-eval_dataset = eval_dataset.shuffle(seed=21).select(range(MAX_EVAL_SIZE))
+# Compiling the model makes the training faster
+model = torch.compile(model)
+
+# Load dataset
+dataset = load_dataset("sentence-transformers/msmarco-bm25", "triplet", split="train")
+# Split the dataset (this dataset does not have a validation set, so we split the training set)
+splits = dataset.train_test_split(test_size=0.01)
+train_dataset = splits["train"]
+eval_dataset = splits["test"]
 
 # Define the loss function
 train_loss = losses.Contrastive(model=model)
 
-args = SentenceTransformerTrainingArguments(
-    output_dir="colbert-training",
-    num_train_epochs=1,
-    per_device_train_batch_size=32,
-    per_device_eval_batch_size=32,
-    fp16=False,  # Some GPUs support FP16 which is faster than FP32
-    bf16=False,  # Some GPUs support BF16 which is a faster FP16
-    batch_sampler=BatchSamplers.NO_DUPLICATES,
-    # Tracking parameters:
-    eval_strategy="steps",
-    eval_steps=0.1,
-    save_strategy="steps",
-    save_steps=5000,
-    save_total_limit=2,
-    learning_rate=3e-6,
-)
-
-# Evaluation procedure
+# Initialize the evaluator
 dev_evaluator = evaluation.ColBERTTripletEvaluator(
     anchors=eval_dataset["query"],
     positives=eval_dataset["positive"],
     negatives=eval_dataset["negative"],
 )
 
+# Configure the training arguments (e.g., batch size, evaluation strategy, logging steps)
+args = SentenceTransformerTrainingArguments(
+    output_dir=output_dir,
+    num_train_epochs=num_train_epochs,
+    per_device_train_batch_size=batch_size,
+    per_device_eval_batch_size=batch_size,
+    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
+    bf16=False,  # Set to True if you have a GPU that supports BF16
+    run_name=run_name,  # Will be used in W&B if `wandb` is installed
+    learning_rate=3e-6,
+)
+
+# Initialize the trainer for the contrastive training
 trainer = SentenceTransformerTrainer(
     model=model,
     args=args,
     train_dataset=train_dataset,
     eval_dataset=eval_dataset,
     loss=train_loss,
     evaluator=dev_evaluator,
+    data_collator=utils.ColBERTCollator(model.tokenize),
+)
+# Start the training process
+trainer.train()
+```
+
+
+After training, the model can be loaded using the output directory path:
+
+```python
+from pylate import models
+
+model = models.ColBERT(model_name_or_path="contrastive-bert-base-uncased")
+```
+
+### Knowledge distillation
+To get the best performance when training a ColBERT model, you should use knowledge distillation to train the model using the scores of a strong teacher model.
+Here's a simple example of how to train a model using knowledge distillation in PyLate on MS MARCO:
+```python
+import torch
+from datasets import load_dataset
+from sentence_transformers import (
+    SentenceTransformerTrainer,
+    SentenceTransformerTrainingArguments,
+)
+
+from pylate import losses, models, utils
+
+# Load the datasets required for knowledge distillation (train, queries, documents)
+train = load_dataset(
+    path="lightonai/ms-marco-en-bge",
+    name="train",
+)
+
+queries = load_dataset(
+    path="lightonai/ms-marco-en-bge",
+    name="queries",
+)
+
+documents = load_dataset(
+    path="lightonai/ms-marco-en-bge",
+    name="documents",
+)
+
+# Set the transformation to load the documents/queries texts using the corresponding ids on the fly
+train.set_transform(
+    utils.KDProcessing(queries=queries, documents=documents).transform,
+)
+
+# Define the base model, training parameters, and output directory
+model_name = "bert-base-uncased"  # Choose the pre-trained model you want to use as base
+batch_size = 16
+num_train_epochs = 1
+# Set the run name for logging and output directory
+run_name = "knowledge-distillation-bert-base"
+output_dir = f"output/{run_name}"
+
+# Initialize the ColBERT model from the base model
+model = models.ColBERT(model_name_or_path=model_name)
+
+# Compiling the model to make the training faster
+model = torch.compile(model)
+
+# Configure the training arguments (e.g., epochs, batch size, learning rate)
+args = SentenceTransformerTrainingArguments(
+    output_dir=output_dir,
+    num_train_epochs=num_train_epochs,
+    per_device_train_batch_size=batch_size,
+    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
+    bf16=False,  # Set to True if you have a GPU that supports BF16
+    run_name=run_name,
+    learning_rate=1e-5,
+)
+
+# Use the Distillation loss function for training
+train_loss = losses.Distillation(model=model)
+
+# Initialize the trainer
+trainer = SentenceTransformerTrainer(
+    model=model,
+    args=args,
+    train_dataset=train,
+    loss=train_loss,
     data_collator=utils.ColBERTCollator(tokenize_fn=model.tokenize),
 )
 
+# Start the training process
 trainer.train()
-
-model.save_pretrained("custom-colbert-model")
 ```
 
-After training, the model can be loaded like this:
+
+
+## Datasets
+
+PyLate supports Hugging Face [Datasets](https://huggingface.co/docs/datasets/en/index), enabling seamless triplet-based and knowledge-distillation-based training. For contrastive training, you can use any of the existing Sentence Transformers triplet datasets. Below is an example of creating a custom triplet dataset for training:
 
 ```python
-from pylate import models
+from datasets import Dataset
+
+dataset = [
+    {
+        "query": "example query 1",
+        "positive": "example positive document 1",
+        "negative": "example negative document 1",
+    },
+    {
+        "query": "example query 2",
+        "positive": "example positive document 2",
+        "negative": "example negative document 2",
+    },
+    {
+        "query": "example query 3",
+        "positive": "example positive document 3",
+        "negative": "example negative document 3",
+    },
+]
+
+dataset = Dataset.from_list(mapping=dataset)
+
+train_dataset, test_dataset = dataset.train_test_split(test_size=0.3)
+```
+
+To create a knowledge distillation dataset, you can use the following snippet:
+```python
+from datasets import Dataset
+
+dataset = [
+    {
+        "query_id": 54528,
+        "document_ids": [
+            6862419,
+            335116,
+            339186,
+        ],
+        "scores": [
+            0.4546215673141326,
+            0.6575686537173476,
+            0.26825184192900203,
+        ],
+    },
+    {
+        "query_id": 749480,
+        "document_ids": [
+            6862419,
+            335116,
+            339186,
+        ],
+        "scores": [
+            0.2546215673141326,
+            0.7575686537173476,
+            0.96825184192900203,
+        ],
+    },
+]
+
+
+dataset = Dataset.from_list(mapping=dataset)
+
+documents = [
+    {"document_id": 6862419, "text": "example doc 1"},
+    {"document_id": 335116, "text": "example doc 2"},
+    {"document_id": 339186, "text": "example doc 3"},
+]
+
+queries = [
+    {"query_id": 749480, "text": "example query"},
+]
+
+documents = Dataset.from_list(mapping=documents)
 
-model = models.ColBERT(model_name_or_path="custom-colbert-model")
+queries = Dataset.from_list(mapping=queries)
 ```
 
 ##  Retrieve
 
-PyLate allows easy retrieval of top documents for a given query set using the trained ColBERT model and Voyager index.
+PyLate allows easy retrieval of top documents for a given query set using the trained ColBERT model and Voyager index. Simply load the model and initialize the index:
 
 ```python
 from pylate import indexes, models, retrieve
 
 model = models.ColBERT(
-    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
+    model_name_or_path="lightonai/colbertv2.0",
 )
 
 index = indexes.Voyager(
@@ -160,7 +292,7 @@ index = indexes.Voyager(
 retriever = retrieve.ColBERT(index=index)
 ```
 
-Once the model and index are set up, we can add documents to the index:
+Once the model and index are set up, we can add documents to the index using their embeddings and corresponding ids:
 
 ```python
 documents_ids = ["1", "2", "3"]
@@ -184,7 +316,7 @@ index.add_documents(
 )
 ```
 
-Then we can retrieve the top-k documents for a given query set:
+Then we can retrieve the top-k documents for a given set of queries:
 
 ```python
 queries_embeddings = model.encode(

diff --git a/docs/api/evaluation/evaluate.md b/docs/api/evaluation/evaluate.md
@@ -22,4 +22,28 @@ Evaluate candidates matches.
 
 
 
+## Examples
+
+```python
+>>> from pylate import evaluation
+
+>>> scores = [
+...     [{"id": "1", "score": 0.9}, {"id": "2", "score": 0.8}],
+...     [{"id": "3", "score": 0.7}, {"id": "4", "score": 0.6}],
+... ]
+
+>>> qrels = {
+...     "query1": {"1": True, "2": True},
+...     "query2": {"3": True, "4": True},
+... }
+
+>>> queries = ["query1", "query2"]
+
+>>> results = evaluation.evaluate(
+...     scores=scores,
+...     qrels=qrels,
+...     queries=queries,
+...     metrics=["ndcg@10", "hits@1"],
+... )
+```
 
diff --git a/docs/api/evaluation/load-custom-dataset.md b/docs/api/evaluation/load-custom-dataset.md
@@ -0,0 +1,19 @@
+# load_custom_dataset
+
+Load a custom dataset.
+
+
+
+## Parameters
+
+- **path** (*str*)
+
+    Path of the dataset.
+
+- **split** (*str*) – defaults to `test`
+
+    Split to load.
+
+
+
+