From 4d5175e62c01b3a1b64ba05a4f2f413a4438f824 Mon Sep 17 00:00:00 2001 From: Antoine Chaffin Date: Thu, 29 Aug 2024 08:12:57 +0000 Subject: [PATCH 1/4] Moving the BEIR import to the function directly to avoid import errors when just training --- pylate/evaluation/custom_dataset.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pylate/evaluation/custom_dataset.py b/pylate/evaluation/custom_dataset.py index f09d749..6090255 100644 --- a/pylate/evaluation/custom_dataset.py +++ b/pylate/evaluation/custom_dataset.py @@ -1,7 +1,6 @@ -from beir.datasets.data_loader import GenericDataLoader - - def load_custom_dataset(path: str, split: str = "test") -> tuple[list, list, dict]: + from beir.datasets.data_loader import GenericDataLoader + """Load a custom dataset. Parameters From 221d3a7b8d283f031d3ae67d669931628db8838b Mon Sep 17 00:00:00 2001 From: Antoine Chaffin Date: Thu, 29 Aug 2024 08:15:01 +0000 Subject: [PATCH 2/4] Moving examples to a common subfolder --- {evaluation => examples/evaluation}/beir_dataset.py | 0 {evaluation => examples/evaluation}/custom_dataset.py | 0 train/triplet.py => examples/train/contrastive.py | 0 {train => examples/train}/knowledge_distillation.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {evaluation => examples/evaluation}/beir_dataset.py (100%) rename {evaluation => examples/evaluation}/custom_dataset.py (100%) rename train/triplet.py => examples/train/contrastive.py (100%) rename {train => examples/train}/knowledge_distillation.py (100%) diff --git a/evaluation/beir_dataset.py b/examples/evaluation/beir_dataset.py similarity index 100% rename from evaluation/beir_dataset.py rename to examples/evaluation/beir_dataset.py diff --git a/evaluation/custom_dataset.py b/examples/evaluation/custom_dataset.py similarity index 100% rename from evaluation/custom_dataset.py rename to examples/evaluation/custom_dataset.py diff --git a/train/triplet.py b/examples/train/contrastive.py similarity index 100% rename from train/triplet.py rename to examples/train/contrastive.py diff --git a/train/knowledge_distillation.py b/examples/train/knowledge_distillation.py similarity index 100% rename from train/knowledge_distillation.py rename to examples/train/knowledge_distillation.py From a3bc9c9793c5096cf5c2731e921cc3c5c6a681c2 Mon Sep 17 00:00:00 2001 From: Antoine Chaffin Date: Thu, 29 Aug 2024 09:01:21 +0000 Subject: [PATCH 3/4] Adding reference to DDP for multi-gpu training --- docs/documentation/training.md | 7 +++++++ pylate/evaluation/custom_dataset.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/documentation/training.md b/docs/documentation/training.md index ba6e650..8ac95b3 100644 --- a/docs/documentation/training.md +++ b/docs/documentation/training.md @@ -83,6 +83,10 @@ trainer.train() ``` +Please note that for multi-GPU training, running ``python training.py`` **will use Data Parallel (DP) by default**. We strongly suggest using Distributed Data Parallel (DDP) with accelerate or torchrun: ``accelerate launch --num_processes num_gpu training.py``. + +Refer to this [documentation](https://sbert.net/docs/sentence_transformer/training/distributed.html) for more information. + ## Knowledge Distillation Training The training of late-interaction models have shown to benefit from knowledge distillation compared to a more simple contrastive learning. 
@@ -162,6 +166,9 @@ trainer = SentenceTransformerTrainer( trainer.train() ``` + +Once again, use [DDP](https://sbert.net/docs/sentence_transformer/training/distributed.html) if you want the best performance when training using multiple GPUs. + ## ColBERT parameters All the parameters of the ColBERT modeling can be found [here](https://lightonai.github.io/pylate/api/models/ColBERT/#parameters). Important parameters to consider are: diff --git a/pylate/evaluation/custom_dataset.py b/pylate/evaluation/custom_dataset.py index 6090255..e0f6b31 100644 --- a/pylate/evaluation/custom_dataset.py +++ b/pylate/evaluation/custom_dataset.py @@ -1,6 +1,4 @@ def load_custom_dataset(path: str, split: str = "test") -> tuple[list, list, dict]: - from beir.datasets.data_loader import GenericDataLoader - """Load a custom dataset. Parameters @@ -13,6 +11,8 @@ def load_custom_dataset(path: str, split: str = "test") -> tuple[list, list, dic Examples -------- """ + from beir.datasets.data_loader import GenericDataLoader + documents, queries, qrels = GenericDataLoader(path).load(split=split) documents = [ From 62a5b5bd7fa425a1c281ddd277ed19670a0dbb36 Mon Sep 17 00:00:00 2001 From: Antoine Chaffin Date: Thu, 29 Aug 2024 09:18:23 +0000 Subject: [PATCH 4/4] Adding citation, French dataset, renaming benchmarks and links to models --- README.md | 11 +++++++++++ docs/.pages | 2 +- docs/api/losses/Contrastive.md | 2 +- docs/api/losses/Distillation.md | 2 +- docs/documentation/datasets.md | 2 +- docs/index.md | 11 +++++++++++ docs/{benchmarks => models}/.pages | 0 docs/{benchmarks => models}/models.md | 5 +++-- 8 files changed, 29 insertions(+), 6 deletions(-) rename docs/{benchmarks => models}/.pages (100%) rename docs/{benchmarks => models}/models.md (74%) diff --git a/README.md b/README.md index 4f71864..89e4293 100644 --- a/README.md +++ b/README.md @@ -413,4 +413,15 @@ make ruff ```bash make livedoc +``` + +## Citation +If you use PyLate for research, you can refer to the library with this BibTeX: +```bibtex +@misc{PyLate, + title={PyLate: Flexible Training and Retrieval for Late Interaction Models}, + author={Chaffin, Antoine and Sourty, Raphaël}, + url={https://github.com/lightonai/pylate}, + year={2024} +} ``` \ No newline at end of file diff --git a/docs/.pages b/docs/.pages index d2d970d..5b54125 100644 --- a/docs/.pages +++ b/docs/.pages @@ -1,4 +1,4 @@ nav: - documentation - - benchmarks + - models - api \ No newline at end of file diff --git a/docs/api/losses/Contrastive.md b/docs/api/losses/Contrastive.md index ea2922f..6579cfa 100644 --- a/docs/api/losses/Contrastive.md +++ b/docs/api/losses/Contrastive.md @@ -10,7 +10,7 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th ColBERT model. -- **score_metric** – defaults to `` +- **score_metric** – defaults to `` ColBERT scoring function. Defaults to colbert_scores. diff --git a/docs/api/losses/Distillation.md b/docs/api/losses/Distillation.md index a172dc1..604f74b 100644 --- a/docs/api/losses/Distillation.md +++ b/docs/api/losses/Distillation.md @@ -10,7 +10,7 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo SentenceTransformer model. -- **score_metric** (*Callable*) – defaults to `` +- **score_metric** (*Callable*) – defaults to `` Function that returns a score between two sequences of embeddings. 
diff --git a/docs/documentation/datasets.md b/docs/documentation/datasets.md index abda0dd..09adbdd 100644 --- a/docs/documentation/datasets.md +++ b/docs/documentation/datasets.md @@ -136,7 +136,7 @@ Example entry: } ``` ### Loading a pre-built knowledge distillation dataset -You can directly download an existing knowledge distillation dataset from Hugging Face's hub, such as the [MS MARCO dataset with BGE M3 scores](https://huggingface.co/datasets/lightonai/ms-marco-en-bge). +You can directly download an existing knowledge distillation dataset from Hugging Face's hub, such as the English [MS MARCO dataset with BGE M3 scores](https://huggingface.co/datasets/lightonai/ms-marco-en-bge) or the [French version](https://huggingface.co/datasets/lightonai/ms-marco-fr-bge). Simply load the different files by giving the respective names to the ```load_dataset``` function: ```python diff --git a/docs/index.md b/docs/index.md index f8bdb74..9d3307e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -413,4 +413,15 @@ make ruff ```bash make livedoc +``` + +## Citation +If you use PyLate for research, you can refer to the library with this BibTeX: +```bibtex +@misc{PyLate, + title={PyLate: Flexible Training and Retrieval for Late Interaction Models}, + author={Chaffin, Antoine and Sourty, Raphaël}, + url={https://github.com/lightonai/pylate}, + year={2024} +} ``` \ No newline at end of file diff --git a/docs/benchmarks/.pages b/docs/models/.pages similarity index 100% rename from docs/benchmarks/.pages rename to docs/models/.pages diff --git a/docs/benchmarks/models.md b/docs/models/models.md similarity index 74% rename from docs/benchmarks/models.md rename to docs/models/models.md index 5a2cb23..0ea8766 100644 --- a/docs/benchmarks/models.md +++ b/docs/models/models.md @@ -7,8 +7,9 @@ Here is a list of the pre-trained ColBERT models available in PyLate along with | Model | BEIR AVG | NFCorpus | SciFact | SCIDOCS | FiQA2018 | TRECCOVID | HotpotQA | Touche2020 | ArguAna | ClimateFEVER | FEVER | QuoraRetrieval | NQ | DBPedia | |---------------------------------------|----------|----------|---------|---------|----------|-----------|----------|------------|---------|--------------|-------|----------------|------|---------| -| answerdotai/answerai-colbert-small-v1 | 53.79 | 37.3 | 74.77 | 18.42 | 41.15 | 84.59 | 76.11 | 25.69 | 50.09 | 33.07 | 90.96 | 87.72 | 59.1 | 45.58 | -| lightonai/colbertv2.0 | 50.02 | 33.8 | 69.3 | 15.4 | 35.6 | 73.3 | 66.7 | 26.3 | 46.3 | 17.6 | 78.5 | 85.2 | 56.2 | 44.6 | +| [lightonai/colbertv2.0](https://huggingface.co/lightonai/colbertv2.0) | 50.02 | 33.8 | 69.3 | 15.4 | 35.6 | 73.3 | 66.7 | 26.3 | 46.3 | 17.6 | 78.5 | 85.2 | 56.2 | 44.6 | +| [answerdotai/answerai-colbert-small-v1](https://huggingface.co/answerdotai/answerai-colbert-small-v1) | 53.79 | 37.3 | 74.77 | 18.42 | 41.15 | 84.59 | 76.11 | 25.69 | 50.09 | 33.07 | 90.96 | 87.72 | 59.1 | 45.58 | + Please note that the `lightonai/colbertv2.0` is simply a translation of the original [ColBERTv2 model](https://huggingface.co/colbert-ir/colbertv2.0/tree/main) to work with PyLate and we thank Omar Khattab for allowing us to share the model on PyLate.
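The deferred import introduced in PATCH 1/4 (and moved below the docstring in PATCH 3/4) is worth spelling out: `beir` is only needed for evaluation, so importing it inside `load_custom_dataset` keeps `import pylate` usable in training-only environments where `beir` is not installed. The sketch below shows the resulting pattern; the loader call matches the hunks above, but the final reshaping of `documents` and `queries` is illustrative only, not the exact library code.

```python
def load_custom_dataset(path: str, split: str = "test") -> tuple[list, list, dict]:
    """Load a BEIR-formatted dataset stored on disk at `path`."""
    # Deferred import: only evaluation needs `beir`, so a missing package no
    # longer breaks users who only train models.
    from beir.datasets.data_loader import GenericDataLoader

    documents, queries, qrels = GenericDataLoader(path).load(split=split)

    # Illustrative post-processing: flatten the BEIR dicts into lists.
    documents = [{"id": doc_id, **doc} for doc_id, doc in documents.items()]
    queries = [{"id": query_id, "text": text} for query_id, text in queries.items()]
    return documents, queries, qrels
```

A hypothetical call would look like `documents, queries, qrels = load_custom_dataset("datasets/scifact", split="test")`, where the path is a placeholder for any BEIR-formatted folder.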
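PATCH 4/4 also references the French knowledge-distillation dataset in `docs/documentation/datasets.md`. A minimal sketch of loading it is below, assuming `lightonai/ms-marco-fr-bge` exposes the same subsets as the English `lightonai/ms-marco-en-bge` release (`train`, `queries`, `documents`); check the dataset card for the exact configuration names.

```python
from datasets import load_dataset

# Assumed subset names, mirroring the English ms-marco-en-bge layout;
# verify them on the dataset card before use.
train = load_dataset("lightonai/ms-marco-fr-bge", "train", split="train")
queries = load_dataset("lightonai/ms-marco-fr-bge", "queries", split="train")
documents = load_dataset("lightonai/ms-marco-fr-bge", "documents", split="train")

print(train[0])  # inspect one training row
```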