
Commit

docs: Add description for changes
hh-space-invader committed Mar 5, 2025
1 parent 1c016a2 commit 5ea3bbc
Showing 6 changed files with 18 additions and 1 deletion.
3 changes: 3 additions & 0 deletions fastembed/common/onnx_model.py
@@ -68,6 +68,9 @@ def _load_onnx_model(
if device_id is None:
onnx_providers = ["CUDAExecutionProvider"]
else:
# kSameAsRequested: Allocates only the requested memory, avoiding over-allocation.
# more precise than 'kNextPowerOfTwo', which grows memory aggressively.
# source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage:
onnx_providers = [
(
"CUDAExecutionProvider",
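For reference, a minimal sketch of what a CUDAExecutionProvider entry with the kSameAsRequested arena strategy can look like when building an ONNX Runtime session. The options dict, the CPU fallback, the model path, and the device id below are illustrative assumptions; the exact options fastembed passes are cut off in this hunk.

import onnxruntime as ort

# (name, options) tuples let an execution provider be configured per session;
# "arena_extend_strategy": "kSameAsRequested" allocates only what a request needs
# instead of extending the arena by the next power of two.
onnx_providers = [
    (
        "CUDAExecutionProvider",
        {
            "device_id": 0,  # hypothetical device id
            "arena_extend_strategy": "kSameAsRequested",
        },
    ),
    "CPUExecutionProvider",  # hypothetical CPU fallback
]
session = ort.InferenceSession("model.onnx", providers=onnx_providers)  # hypothetical model path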
3 changes: 3 additions & 0 deletions fastembed/image/onnx_image_model.py
@@ -82,6 +82,9 @@ def onnx_embed(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputConte
if is_cuda_enabled(cuda, providers):
device_id = kwargs.get("device_id", None)
device_id = str(device_id if isinstance(device_id, int) else 0)
# enables memory arena shrinkage, freeing unused memory after each Run() cycle.
# helps prevent excessive memory retention, especially for dynamic workloads.
# source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage:
run_options.add_run_config_entry(
"memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
)
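The same run-config entry is added in the remaining changed files below. For reference, a minimal sketch of how such a RunOptions entry is consumed at inference time; the session, model path, input feed, and device id are illustrative assumptions, while RunOptions.add_run_config_entry and the run_options argument of InferenceSession.run are standard ONNX Runtime API.

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])  # hypothetical model
run_options = ort.RunOptions()
# ask ONNX Runtime to shrink the GPU arena for device 0 when this Run() finishes,
# releasing unused arena chunks instead of keeping them allocated between calls
run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "gpu:0")
outputs = session.run(
    None,  # return all model outputs
    {"input_ids": np.zeros((1, 8), dtype=np.int64)},  # hypothetical input feed
    run_options=run_options,
)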
1 change: 0 additions & 1 deletion fastembed/late_interaction/colbert.py
@@ -201,7 +201,6 @@ def load_onnx_model(self) -> None:
current_max_length = self.tokenizer.truncation["max_length"]
# ensure not to overflow after adding document-marker
self.tokenizer.enable_truncation(max_length=current_max_length - 1)
print("ME VERSION")

def embed(
self,
@@ -111,6 +111,9 @@ def onnx_embed_text(
if is_cuda_enabled(cuda, providers):
device_id = kwargs.get("device_id", None)
device_id = str(device_id if isinstance(device_id, int) else 0)
# enables memory arena shrinkage, freeing unused memory after each Run() cycle.
# helps prevent excessive memory retention, especially for dynamic workloads.
# source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage:
run_options.add_run_config_entry(
"memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
)
@@ -188,6 +191,9 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu
if is_cuda_enabled(cuda, providers):
device_id = kwargs.get("device_id", None)
device_id = str(device_id if isinstance(device_id, int) else 0)
# enables memory arena shrinkage, freeing unused memory after each Run() cycle.
# helps prevent excessive memory retention, especially for dynamic workloads.
# source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage:
run_options.add_run_config_entry(
"memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
)
3 changes: 3 additions & 0 deletions fastembed/rerank/cross_encoder/onnx_text_model.py
@@ -79,6 +79,9 @@ def onnx_embed_pairs(self, pairs: list[tuple[str, str]], **kwargs: Any) -> OnnxO
if is_cuda_enabled(cuda, providers):
device_id = kwargs.get("device_id", None)
device_id = str(device_id if isinstance(device_id, int) else 0)
# Enables memory arena shrinkage, freeing unused memory after each Run() cycle.
# Helps prevent excessive memory retention, especially for dynamic workloads.
# Source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage:
run_options.add_run_config_entry(
"memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
)
3 changes: 3 additions & 0 deletions fastembed/text/onnx_text_model.py
@@ -89,6 +89,9 @@ def onnx_embed(
if is_cuda_enabled(cuda, providers):
device_id = kwargs.get("device_id", None)
device_id = str(device_id if isinstance(device_id, int) else 0)
# enables memory arena shrinkage, freeing unused memory after each Run() cycle.
# helps prevent excessive memory retention, especially for dynamic workloads.
# source: https://onnxruntime.ai/docs/get-started/with-c.html#features:~:text=Memory%20arena%20shrinkage:
run_options.add_run_config_entry(
"memory.enable_memory_arena_shrinkage", f"gpu:{device_id}"
)
