Allow setting the prefixes for stanford-nlp models

lightonai · Sep 13, 2024 · 02bd268 · 02bd268
1 parent 0c74287
commit 02bd268
Showing 1 changed file with 11 additions and 4 deletions.
diff --git a/pylate/models/colbert.py b/pylate/models/colbert.py
@@ -206,8 +206,8 @@ def __init__(
         truncate_dim: int | None = None,
         embedding_size: int | None = None,
         bias: bool = False,
-        query_prefix: str | None = "[Q] ",
-        document_prefix: str | None = "[D] ",
+        query_prefix: str | None = None,
+        document_prefix: str | None = None,
         add_special_tokens: bool = True,
         truncation: bool = True,
         query_length: int | None = None,
@@ -262,8 +262,10 @@ def __init__(
                     )
                 )
                 # Setting the prefixes from stanford-nlp models
-                self.query_prefix = "[unused0]"
-                self.document_prefix = "[unused1]"
+                if self.query_prefix is None:
+                    self.query_prefix = "[unused0]"
+                if self.document_prefix is None:
+                    self.document_prefix = "[unused1]"
                 logger.warning("Loaded the ColBERT model from Stanford NLP.")
             else:
                 # Add a linear projection layer to the model in order to project the embeddings to the desired size
@@ -308,6 +310,11 @@ def __init__(
         self.to(device)
         self.is_hpu_graph_enabled = False
 
+        if self.query_prefix is None:
+            self.query_prefix = "[Q] "
+        if self.document_prefix is None:
+            self.document_prefix = "[D] "
+
         # Try adding the prefixes to the tokenizer. We call resize_token_embeddings twice to ensure the tokens are added only if resize_token_embeddings works. There should be a better way to do this.
         try:
             self._first_module().auto_model.resize_token_embeddings(len(self.tokenizer))