update scoring function, cleaning
raphaelsty committed Jul 31, 2024
1 parent 8775b7a commit 8656842
Showing 27 changed files with 599 additions and 714 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/python-tests.yml
@@ -0,0 +1,28 @@
name: Python Tests

on:
  pull_request:
    branches:
      - '**'

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.x' # Specify the Python version you need

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install ".[dev]"
      - name: Run tests
        run: |
          pytest
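
The same suite can be run locally before opening a pull request; a minimal sketch using pytest's Python entry point, assuming the dev extras from `pip install ".[dev]"` are already installed (the helper file name is hypothetical):

```python
# run_tests.py -- hypothetical local helper, not part of the repository.
import sys

import pytest

# pytest.main([]) discovers and runs the tests in the current directory,
# mirroring the bare `pytest` call in the workflow above.
sys.exit(pytest.main([]))
```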
12 changes: 6 additions & 6 deletions README.md
@@ -15,9 +15,9 @@ For example, to run the BEIR evaluations using giga-cherche indexes:
# Modeling
The modeling of giga-cherche is based on sentence-transformers, which makes it possible to build a ColBERT model from any available encoder by appending a projection layer to the encoder output to reduce the embedding dimension.
```
from giga_cherche.models import ColBERT
from giga_cherche import models
model_name = "bert-base-uncased"
model = ColBERT(model_name_or_path=model_name)
model = models.ColBERT(model_name_or_path=model_name)
```
The following parameters can be passed to the constructor to set different properties of the model:
- ```embedding_size```, the output size of the projection layer and thus the dimension of the embeddings (see the sketch just below this list)
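
For instance, a constructor call that sets this parameter could look like the sketch below; the value 128 is illustrative rather than a documented default:

```python
from giga_cherche import models

model = models.ColBERT(
    model_name_or_path="bert-base-uncased",
    embedding_size=128,  # output size of the projection layer (illustrative value)
)
```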
@@ -40,7 +40,7 @@ from sentence_transformers import (
SentenceTransformerTrainingArguments,
)

from giga_cherche import losses, models, data_collator, evaluation
from giga_cherche import losses, models, datasets, evaluation

model_name = "bert-base-uncased"
batch_size = 32
@@ -77,7 +77,7 @@ trainer = SentenceTransformerTrainer(
eval_dataset=eval_dataset,
loss=train_loss,
evaluator=dev_evaluator,
data_collator=data_collator.ColBERT(model.tokenize),
data_collator=datasets.ColBERTCollator(model.tokenize),
)

trainer.train()
@@ -88,7 +88,7 @@ trainer.train()
```
import ast
def add_queries_and_documents(example: dict) -> dict:
def add_queries_and_documents(Examples dict) -> dict:
"""Add queries and documents text to the examples."""
scores = ast.literal_eval(node_or_string=example["scores"])
processed_example = {"scores": scores, "query": queries[example["query_id"]]}
@@ -135,7 +135,7 @@ You can then compute the ColBERT max-sim scores like this:

```python
from giga_cherche import scores
similarity_scores = scores.colbert_score(query_embeddings, document_embeddings)
similarity_scores = scores.colbert_scores(query_embeddings, document_embeddings)
```
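
For intuition, the max-sim score reduces token-level similarities to a single score per query-document pair; a self-contained sketch of that computation with random tensors (an illustration of the idea, not the library's implementation):

```python
import torch

# Illustrative shapes: 2 queries x 32 tokens, 4 documents x 180 tokens, 128-dim embeddings.
queries_embeddings = torch.randn(2, 32, 128)
documents_embeddings = torch.randn(4, 180, 128)

# Token-level dot products for every query/document pair:
# shape (num_queries, num_documents, query_tokens, document_tokens).
token_scores = torch.einsum("qtd,psd->qpts", queries_embeddings, documents_embeddings)

# Max over document tokens, then sum over query tokens: one max-sim score per pair.
similarity_scores = token_scores.max(dim=-1).values.sum(dim=-1)  # shape (2, 4)
```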

## Indexing
11 changes: 6 additions & 5 deletions giga_cherche/__init__.py
@@ -1,9 +1,10 @@
__all__ = [
"models",
"losses",
"scores",
"evaluation",
"indexes",
"reranker",
"data_collator",
"losses",
"models",
"rerank",
"retrieve",
"scores",
"utils",
]
3 changes: 0 additions & 3 deletions giga_cherche/data_collator/__init__.py

This file was deleted.

50 changes: 0 additions & 50 deletions giga_cherche/data_collator/colbert.py

This file was deleted.

76 changes: 16 additions & 60 deletions giga_cherche/evaluation/beir.py
@@ -3,11 +3,18 @@
import random
from collections import defaultdict

__all__ = ["evaluate", "load_beir", "get_beir_triples"]


def add_duplicates(queries: list[str], scores: list[list[dict]]) -> list:
"""Add back duplicates scores to the set of candidates."""
"""Add back duplicates scores to the set of candidates.
Parameters
----------
queries
List of queries.
scores
Scores of the retrieval model.
"""
query_counts = defaultdict(int)
for query in queries:
query_counts[query] += 1
@@ -31,7 +38,9 @@ def load_beir(dataset_name: str, split: str = "test") -> tuple[list, list, dict]
Parameters
----------
dataset_name
Dataset name: scifact.
Name of the beir dataset.
split
Split to load.
"""
from beir import util
@@ -85,14 +94,14 @@ def get_beir_triples(
Examples
--------
>>> from neural_cherche import utils
>>> from giga_cherche import evaluation
>>> documents, queries, qrels = utils.load_beir(
>>> documents, queries, qrels = evaluation.load_beir(
... "scifact",
... split="test",
... )
>>> triples = utils.get_beir_triples(
>>> triples = evaluation.get_beir_triples(
... key="id",
... on=["title", "text"],
... documents=documents,
@@ -146,59 +155,6 @@ def evaluate(
metrics
Metrics to compute.
Examples
--------
>>> from neural_cherche import models, retrieve, utils
>>> import torch
>>> _ = torch.manual_seed(42)
>>> model = models.Splade(
... model_name_or_path="raphaelsty/neural-cherche-sparse-embed",
... device="cpu",
... )
>>> documents, queries, qrels = utils.load_beir(
... "scifact",
... split="test",
... )
>>> documents = documents[:10]
>>> retriever = retrieve.Splade(
... key="id",
... on=["title", "text"],
... model=model
... )
>>> documents_embeddings = retriever.encode_documents(
... documents=documents,
... batch_size=1,
... )
>>> documents_embeddings = retriever.add(
... documents_embeddings=documents_embeddings,
... )
>>> queries_embeddings = retriever.encode_queries(
... queries=queries,
... batch_size=1,
... )
>>> scores = retriever(
... queries_embeddings=queries_embeddings,
... k=30,
... batch_size=1,
... )
>>> utils.evaluate(
... scores=scores,
... qrels=qrels,
... queries=queries,
... metrics=["map", "ndcg@10", "ndcg@100", "recall@10", "recall@100"]
... )
{'map': 0.0033333333333333335, 'ndcg@10': 0.0033333333333333335, 'ndcg@100': 0.0033333333333333335, 'recall@10': 0.0033333333333333335, 'recall@100': 0.0033333333333333335}
"""
from ranx import Qrels, Run, evaluate

19 changes: 11 additions & 8 deletions giga_cherche/evaluation/colbert_triplet_evaluator.py
@@ -10,19 +10,17 @@
from sentence_transformers.SentenceTransformer import SentenceTransformer
from sentence_transformers.similarity_functions import SimilarityFunction

from giga_cherche.scores.colbert_score import colbert_score
from ..scores import colbert_scores

logger = logging.getLogger(__name__)

__all__ = ["ColBERTTripletEvaluator"]


class ColBERTTripletEvaluator(SentenceEvaluator):
"""
Evaluate a model based on a triplet: (sentence, positive_example, negative_example).
Checks if colbert distance(sentence, positive_example) < distance(sentence, negative_example).
Example:
Examples
::
from sentence_transformers import SentenceTransformer
@@ -198,15 +196,20 @@ def __call__(
# Colbert distance
# pos_colbert_distances = colbert_pairwise_score(embeddings_anchors, embeddings_positives)
# neg_colbert_distances = colbert_pairwise_score(embeddings_anchors, embeddings_negatives)
pos_colbert_distances_full = colbert_score(
embeddings_anchors, embeddings_positives
pos_colbert_distances_full = colbert_scores(
queries_embeddings=embeddings_anchors,
documents_embeddings=embeddings_positives,
)
neg_colbert_distances_full = colbert_score(
embeddings_anchors, embeddings_negatives

neg_colbert_distances_full = colbert_scores(
queries_embeddings=embeddings_anchors,
documents_embeddings=embeddings_negatives,
)

distances_full = torch.cat(
[pos_colbert_distances_full, neg_colbert_distances_full], dim=1
)

# print(distances_full.shape)
labels = np.arange(0, len(embeddings_anchors))
indices = np.argsort(-distances_full.cpu().numpy(), axis=1)
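As the class docstring above describes, the reported accuracy comes down to whether each anchor scores its own positive above its own negative; a simplified, library-free sketch of that check, with random matrices standing in for the two `colbert_scores` outputs:

```python
import torch

# Stand-ins for colbert_scores(anchors, positives) and colbert_scores(anchors, negatives),
# both of shape (num_anchors, num_candidates), matching pairs on the diagonal.
pos_scores = torch.randn(8, 8)
neg_scores = torch.randn(8, 8)

# A triplet counts as correct when the anchor scores its positive above its negative.
accuracy = (pos_scores.diagonal() > neg_scores.diagonal()).float().mean().item()
print(f"triplet accuracy: {accuracy:.2f}")
```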
2 changes: 0 additions & 2 deletions giga_cherche/indexes/base.py
@@ -1,7 +1,5 @@
from abc import ABC, abstractmethod

__all__ = ["Base"]


class Base(ABC):
"""Base class for all indexes. Indexes are used to store and retrieve embeddings."""
2 changes: 0 additions & 2 deletions giga_cherche/indexes/weaviate.py
@@ -6,8 +6,6 @@

from .base import Base

__all__ = ["Weaviate"]


# TODO: define Index metaclass
# max_doc_length is used to set a limit in the fetch embeddings method as the speed is dependent on the number of embeddings fetched
5 changes: 3 additions & 2 deletions giga_cherche/losses/__init__.py
@@ -1,3 +1,4 @@
from .colbert import ColBERTLossv1, ColBERTLossv2
from .contrastive import Triples
from .distillation import Distillation

__all__ = ["ColBERTLossv1", "ColBERTLossv2"]
__all__ = ["Triples", "Distillation"]