From 1123a1cfc158093e71991c2172b69bbfefaf657a Mon Sep 17 00:00:00 2001
From: Antoine Chaffin
Date: Mon, 19 Aug 2024 13:36:08 +0000
Subject: [PATCH] Adding back padding to query encoding

---
 giga_cherche/models/colbert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/giga_cherche/models/colbert.py b/giga_cherche/models/colbert.py
index 3f23a2a..cae08df 100644
--- a/giga_cherche/models/colbert.py
+++ b/giga_cherche/models/colbert.py
@@ -875,8 +875,8 @@ def tokenize(
         max_length = self.query_length if is_query else self.document_length
         self._first_module().max_seq_length = max_length
 
-        # Handle padding for documents if specified
-        tokenize_args = {"padding": "max_length"} if pad_document else {}
+        # Pad queries (query expansion) and handle padding for documents if specified
+        tokenize_args = {"padding": "max_length"} if pad_document or is_query else {}
 
         # Tokenize the texts
         tokenized_outputs = self._first_module().tokenize(texts, **tokenize_args)