|
| 1 | +from dataclasses import asdict |
| 2 | +from typing import Union, Iterable, Optional, Any, Type |
| 3 | + |
| 4 | +from fastembed.common.model_description import DenseModelDescription, ModelSource |
| 5 | +from fastembed.common.onnx_model import OnnxOutputContext |
| 6 | +from fastembed.common.types import NumpyArray |
| 7 | +from fastembed.late_interaction.late_interaction_embedding_base import ( |
| 8 | + LateInteractionTextEmbeddingBase, |
| 9 | +) |
| 10 | +from fastembed.text.onnx_embedding import OnnxTextEmbedding |
| 11 | +from fastembed.text.onnx_text_model import TextEmbeddingWorker |
| 12 | +import numpy as np |
| 13 | + |
# Registry of token-level embedding models served by TokenEmbeddingsModel below.
# NOTE(review): the public model name is under the jinaai/ namespace while the
# ONNX weights are pulled from the xenova/ HF mirror — confirm the mirror is intended.
supported_token_embeddings_models = [
    DenseModelDescription(
        model="jinaai/jina-embeddings-v2-small-en-tokens",
        dim=512,
        description="Text embeddings, Unimodal (text), English, 8192 input tokens truncation,"
        " Prefixes for queries/documents: not necessary, 2023 year.",
        license="apache-2.0",
        size_in_GB=0.12,
        sources=ModelSource(hf="xenova/jina-embeddings-v2-small-en"),
        model_file="onnx/model.onnx",
    ),
]
| 26 | + |
| 27 | + |
class TokenEmbeddingsModel(OnnxTextEmbedding, LateInteractionTextEmbeddingBase):
    """Embedder that yields one vector per (non-padded) token of each document.

    Reuses the dense ONNX text pipeline from ``OnnxTextEmbedding`` but, instead
    of pooling, returns the full token-level embedding matrix per document.
    """

    @classmethod
    def _list_supported_models(cls) -> list[DenseModelDescription]:
        """Lists the supported models.

        Returns:
            list[DenseModelDescription]: A list of DenseModelDescription objects
                containing the model information.
        """
        return supported_token_embeddings_models

    @classmethod
    def list_supported_models(cls) -> list[dict[str, Any]]:
        """Lists the supported models.

        Returns:
            list[dict[str, Any]]: A list of dictionaries containing the model information.
        """
        return [asdict(model) for model in cls._list_supported_models()]

    @classmethod
    def _get_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]:
        return TokensEmbeddingWorker

    def _post_process_onnx_output(
        self, output: OnnxOutputContext, **kwargs: Any
    ) -> Iterable[NumpyArray]:
        """Strip padding tokens from the raw ONNX output.

        Args:
            output: Raw model output; ``model_output`` has shape
                (batch_size, sequence_length, hidden_size) and
                ``attention_mask`` has shape (batch_size, sequence_length).

        Yields:
            One (num_real_tokens, hidden_size) array per document, keeping only
            the embeddings whose attention-mask entry is 1.

        Raises:
            ValueError: If the attention mask is missing from the output.
        """
        # Size: (batch_size, sequence_length, hidden_size)
        embeddings = output.model_output
        # Explicit error instead of `assert`: asserts are stripped under -O,
        # and the mask is required below to drop the padding tokens.
        if output.attention_mask is None:
            raise ValueError("attention_mask is required to post-process token embeddings")
        # Size: (batch_size, sequence_length)
        masks = output.attention_mask

        # For each document we only select those embeddings that are not masked out
        for i in range(embeddings.shape[0]):
            yield embeddings[i, masks[i] == 1]

    def embed(
        self,
        documents: Union[str, Iterable[str]],
        batch_size: int = 256,
        parallel: Optional[int] = None,
        **kwargs: Any,
    ) -> Iterable[NumpyArray]:
        """Embed documents, yielding one token-embedding matrix per document.

        Args:
            documents: A single string or an iterable of strings to embed.
            batch_size: Number of documents per ONNX inference call.
            parallel: Number of data-parallel workers (None = single process).

        Yields:
            NumpyArray of shape (num_real_tokens, hidden_size) per document.
        """
        yield from super().embed(documents, batch_size=batch_size, parallel=parallel, **kwargs)

    def tokenize_docs(self, documents: list[str]) -> list[NumpyArray]:
        """Tokenize documents and return the raw token-id arrays.

        Args:
            documents: Texts to tokenize.

        Returns:
            One int32 array of token ids per document.

        Raises:
            ValueError: If the tokenizer has not been initialized.
        """
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized")
        encoded = self.tokenizer.encode_batch(documents)
        return [np.array(e.ids, dtype=np.int32) for e in encoded]
| 78 | + |
| 79 | + |
class TokensEmbeddingWorker(TextEmbeddingWorker[NumpyArray]):
    """Data-parallel worker that hosts a single-threaded TokenEmbeddingsModel."""

    def init_embedding(
        self, model_name: str, cache_dir: str, **kwargs: Any
    ) -> TokenEmbeddingsModel:
        # Each worker process pins its model to one thread; parallelism comes
        # from running several workers side by side.
        return TokenEmbeddingsModel(model_name=model_name, cache_dir=cache_dir, threads=1, **kwargs)
| 90 | + |
| 91 | + |
if __name__ == "__main__":
    # Smoke test: list the registry, embed a few documents, and inspect the
    # per-document token-embedding shapes plus the raw token ids.
    print(TokenEmbeddingsModel.list_supported_models())
    model = TokenEmbeddingsModel(model_name="jinaai/jina-embeddings-v2-small-en-tokens")
    docs = ["Hello, world!", "hello", "hello hello"]

    for token_matrix in model.embed(docs):
        print(token_matrix.shape)

    print(model.tokenize_docs(docs))