UKPLab · kacperlukawski · Aug 8, 2024 · Aug 9, 2024 · Aug 9, 2024 · Aug 30, 2024
diff --git a/sentence_transformers/quantization.py b/sentence_transformers/quantization.py
@@ -394,17 +394,23 @@ def quantize_embeddings(
     Returns:
         Quantized embeddings with the specified precision
     """
+    outputs, lengths = None, None
     if isinstance(embeddings, Tensor):
         embeddings = embeddings.cpu().numpy()
+        embeddings = np.concatenate(embeddings)
     elif isinstance(embeddings, list):
+        if not isinstance(embeddings[0], list) and len(embeddings[0].shape) == 2:
+            # This happens when token_embeddings are requested (a list of 2D arrays)
+            lengths = [embedding.shape[0] for embedding in embeddings]
+            embeddings = np.concatenate(embeddings)
         if isinstance(embeddings[0], Tensor):
             embeddings = [embedding.cpu().numpy() for embedding in embeddings]
         embeddings = np.array(embeddings)
     if embeddings.dtype in (np.uint8, np.int8):
         raise Exception("Embeddings to quantize must be float rather than int8 or uint8.")
 
     if precision == "float32":
-        return embeddings.astype(np.float32)
+        outputs = embeddings.astype(np.float32)
 
     if precision.endswith("int8"):
         # Either use the 1. provided ranges, 2. the calibration dataset or 3. the provided embeddings
@@ -423,14 +429,20 @@ def quantize_embeddings(
         steps = (ranges[1, :] - ranges[0, :]) / 255
 
         if precision == "uint8":
-            return ((embeddings - starts) / steps).astype(np.uint8)
+            outputs = ((embeddings - starts) / steps).astype(np.uint8)
         elif precision == "int8":
-            return ((embeddings - starts) / steps - 128).astype(np.int8)
+            outputs = ((embeddings - starts) / steps - 128).astype(np.int8)
 
     if precision == "binary":
-        return (np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1) - 128).astype(np.int8)
+        outputs = (np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1) - 128).astype(np.int8)
 
     if precision == "ubinary":
-        return np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1)
+        outputs = np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1)
 
-    raise ValueError(f"Precision {precision} is not supported")
+    if outputs is None:
+        raise ValueError(f"Precision {precision} is not supported")
+
+    if lengths is not None:
+        outputs = np.split(outputs, np.cumsum(lengths)[:-1])
+
+    return outputs
diff --git a/tests/test_compute_embeddings.py b/tests/test_compute_embeddings.py
@@ -4,7 +4,10 @@
 
 from __future__ import annotations
 
+from typing import Literal
+
 import numpy as np
+import pytest
 
 from sentence_transformers import SentenceTransformer
 
@@ -84,3 +87,34 @@ def test_encode_tuple_sentences(paraphrase_distilroberta_base_v1_model: Sentence
     )
     assert emb.shape == (3, 768)
     assert abs(np.sum(emb) - 32.14627) < 0.002
+
+
+@pytest.mark.parametrize("precision", ("float32", "int8", "uint8"))
+def test_encode_token_embeddings_int_precision(
+    paraphrase_distilroberta_base_v1_model: SentenceTransformer,
+    precision: Literal["float32", "int8", "uint8", "binary", "ubinary"],
+) -> None:
+    """Check token embedding shapes for a string, a one-item list and a sentence list input."""
+    model = paraphrase_distilroberta_base_v1_model
+    # Every call uses the parametrized precision so each value covers all three input forms
+    # Single sentence
+    emb = model.encode("Hello Word, a test sentence", output_value="token_embeddings", precision=precision)
+    assert emb.shape == (8, 768)
+
+    # Single sentence as list
+    emb = model.encode(["Hello Word, a test sentence"], output_value="token_embeddings", precision=precision)
+    assert isinstance(emb, list)
+    assert emb[0].shape == (8, 768)
+
+    # Sentence list
+    emb = model.encode(
+        [
+            "Hello Word, a test sentence",
+            "Here comes another sentence",
+            "My final sentence",
+        ],
+        output_value="token_embeddings",
+        precision=precision,
+    )
+    assert isinstance(emb, list)
+    assert [tuple(item.shape) for item in emb] == [(8, 768), (6, 768), (5, 768)]