Speed up CI (#489)
* chore: Trigger CI test

* Trigger CI

* Trigger CI test

* new: Added workflow_dispatch trigger

* tests: Updated tests

* fix: Fix CI

* improve: Prevent StopIteration error raised by next()

* fix: Fix 'variable might be referenced before assignment' warning

* refactor: Revised the way of getting models to test

* fix: Fix test in image model

* refactor: Call one model

* fix: Fix CI

* fix: Fix splade model name

* tests: Updated tests

* chore: Remove cache

* tests: Update multi task tests

* tests: Updated tests

* refactor: Refactor utils function, add comments, refactor conditions

---------

Co-authored-by: George Panchuk <george.panchuk@qdrant.tech>
hh-space-invader and joein authored Mar 6, 2025
1 parent 1729aab commit 6acfb00
Showing 9 changed files with 223 additions and 191 deletions.
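The diffs below repeatedly import a new `should_test_model` helper from `tests.utils`, whose implementation is not shown in this commit view. Judging only from its call sites (a model description, the parametrized model name, and the `is_ci`/`is_manual` flags), a plausible sketch looks like the following; this is a hypothetical reconstruction, not the committed code:

```python
# Hypothetical reconstruction of tests/utils.should_test_model; the real
# helper is not visible in this diff.
def should_test_model(model_desc, autotest_model_name: str, is_ci, is_manual) -> bool:
    if not is_ci:
        # Local runs: skip models that are too large to download casually
        # (this mirrors the old inline `size_in_GB > 1` check).
        if model_desc.size_in_GB > 1:
            return False
    elif not is_manual and model_desc.model != autotest_model_name:
        # Regular CI runs exercise only the one designated model per suite;
        # manual workflow_dispatch runs test everything.
        return False
    return True
```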
7 changes: 4 additions & 3 deletions .github/workflows/python-tests.yml
@@ -1,9 +1,10 @@
 name: Tests
 
 on:
   push:
     branches: [ master, main, gpu ]
   pull_request:
     branches: [ master, main, gpu ]
+  workflow_dispatch:
 
 
 env:
   CARGO_TERM_COLOR: always
@@ -42,4 +43,4 @@ jobs:
       - name: Run pytest
         run: |
-          poetry run pytest
+          poetry run pytest
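The new `workflow_dispatch:` trigger lets maintainers launch the full suite by hand from the Actions tab. The tests distinguish such manual runs from ordinary CI runs through environment variables that GitHub Actions sets; a minimal sketch of the detection used throughout the diffs below:

```python
import os

# GitHub Actions sets CI=true on all runs and GITHUB_EVENT_NAME to the
# triggering event: "push", "pull_request", or "workflow_dispatch".
is_ci = bool(os.getenv("CI"))
is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"
```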
12 changes: 9 additions & 3 deletions tests/test_image_onnx_embeddings.py
@@ -8,7 +8,7 @@
 
 from fastembed import ImageEmbedding
 from tests.config import TEST_MISC_DIR
-from tests.utils import delete_model_cache
+from tests.utils import delete_model_cache, should_test_model
 
 CANONICAL_VECTOR_VALUES = {
     "Qdrant/clip-ViT-B-32-vision": np.array([-0.0098, 0.0128, -0.0274, 0.002, -0.0059]),
@@ -27,11 +27,13 @@
 }
 
 
-def test_embedding() -> None:
+@pytest.mark.parametrize("model_name", ["Qdrant/clip-ViT-B-32-vision"])
+def test_embedding(model_name: str) -> None:
     is_ci = os.getenv("CI")
+    is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"
 
     for model_desc in ImageEmbedding._list_supported_models():
-        if not is_ci and model_desc.size_in_GB > 1:
+        if not should_test_model(model_desc, model_name, is_ci, is_manual):
             continue
 
         dim = model_desc.dim
@@ -74,8 +76,12 @@ def test_batch_embedding(n_dims: int, model_name: str) -> None:
 
     embeddings = list(model.embed(images, batch_size=10))
     embeddings = np.stack(embeddings, axis=0)
-    assert np.allclose(embeddings[1], embeddings[2])
 
+    canonical_vector = CANONICAL_VECTOR_VALUES[model_name]
+
     assert embeddings.shape == (len(test_images) * n_images, n_dims)
+    assert np.allclose(embeddings[0, : canonical_vector.shape[0]], canonical_vector, atol=1e-3)
     if is_ci:
         delete_model_cache(model.model._model_dir)
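The batch test above now validates model output against stored canonical values rather than merely checking that two embeddings of the same image agree. Standalone, the pattern looks like this (the canonical values are from the table above; `embedding` is a stand-in for real model output):

```python
import numpy as np

# Canonical values keep only the first few dimensions of a known-good
# embedding; the test compares just that abridged prefix, within tolerance.
canonical = np.array([-0.0098, 0.0128, -0.0274, 0.002, -0.0059])
embedding = np.zeros(512)  # placeholder; a real test uses model.embed() output
matches = np.allclose(embedding[: canonical.shape[0]], canonical, atol=1e-3)
```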
54 changes: 32 additions & 22 deletions tests/test_late_interaction_embeddings.py
@@ -6,7 +6,7 @@
 from fastembed.late_interaction.late_interaction_text_embedding import (
     LateInteractionTextEmbedding,
 )
-from tests.utils import delete_model_cache
+from tests.utils import delete_model_cache, should_test_model
 
 # vectors are abridged and rounded for brevity
 CANONICAL_COLUMN_VALUES = {
@@ -153,57 +153,70 @@
 docs = ["Hello World"]
 
 
-def test_batch_embedding():
+@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
+def test_batch_embedding(model_name: str):
     is_ci = os.getenv("CI")
     docs_to_embed = docs * 10
 
-    for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
-        print("evaluating", model_name)
-        model = LateInteractionTextEmbedding(model_name=model_name)
-        result = list(model.embed(docs_to_embed, batch_size=6))
+    model = LateInteractionTextEmbedding(model_name=model_name)
+    result = list(model.embed(docs_to_embed, batch_size=6))
+    expected_result = CANONICAL_COLUMN_VALUES[model_name]
 
-        for value in result:
-            token_num, abridged_dim = expected_result.shape
-            assert np.allclose(value[:, :abridged_dim], expected_result, atol=2e-3)
+    for value in result:
+        token_num, abridged_dim = expected_result.shape
+        assert np.allclose(value[:, :abridged_dim], expected_result, atol=2e-3)
 
-        if is_ci:
-            delete_model_cache(model.model._model_dir)
+    if is_ci:
+        delete_model_cache(model.model._model_dir)
 
 
-def test_single_embedding():
+@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
+def test_single_embedding(model_name: str):
     is_ci = os.getenv("CI")
+    is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"
     docs_to_embed = docs
 
-    for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
+    for model_desc in LateInteractionTextEmbedding._list_supported_models():
+        if not should_test_model(model_desc, model_name, is_ci, is_manual):
+            continue
+
         print("evaluating", model_name)
         model = LateInteractionTextEmbedding(model_name=model_name)
         result = next(iter(model.embed(docs_to_embed, batch_size=6)))
+        expected_result = CANONICAL_COLUMN_VALUES[model_name]
         token_num, abridged_dim = expected_result.shape
         assert np.allclose(result[:, :abridged_dim], expected_result, atol=2e-3)
 
         if is_ci:
             delete_model_cache(model.model._model_dir)
 
 
-def test_single_embedding_query():
+@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
+def test_single_embedding_query(model_name: str):
     is_ci = os.getenv("CI")
+    is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"
     queries_to_embed = docs
 
-    for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
+    for model_desc in LateInteractionTextEmbedding._list_supported_models():
+        if not should_test_model(model_desc, model_name, is_ci, is_manual):
+            continue
+
         print("evaluating", model_name)
         model = LateInteractionTextEmbedding(model_name=model_name)
         result = next(iter(model.query_embed(queries_to_embed)))
+        expected_result = CANONICAL_QUERY_VALUES[model_name]
         token_num, abridged_dim = expected_result.shape
         assert np.allclose(result[:, :abridged_dim], expected_result, atol=2e-3)
 
         if is_ci:
             delete_model_cache(model.model._model_dir)
 
 
-def test_parallel_processing():
+@pytest.mark.parametrize("token_dim,model_name", [(96, "answerdotai/answerai-colbert-small-v1")])
+def test_parallel_processing(token_dim: int, model_name: str):
     is_ci = os.getenv("CI")
-    model = LateInteractionTextEmbedding(model_name="colbert-ir/colbertv2.0")
-    token_dim = 128
+    model = LateInteractionTextEmbedding(model_name=model_name)
 
     docs = ["hello world", "flag embedding"] * 100
     embeddings = list(model.embed(docs, batch_size=10, parallel=2))
     embeddings = np.stack(embeddings, axis=0)
@@ -222,10 +235,7 @@ def test_parallel_processing():
         delete_model_cache(model.model._model_dir)
 
 
-@pytest.mark.parametrize(
-    "model_name",
-    ["colbert-ir/colbertv2.0"],
-)
+@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
 def test_lazy_load(model_name: str):
     is_ci = os.getenv("CI")
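The tests above take the first embedding with `next(iter(...))`, which raises `StopIteration` if the generator is empty; the commit message mentions preventing exactly that. One defensive pattern, shown here as a hypothetical helper since the diff does not include the actual fix:

```python
from typing import Iterable, TypeVar

T = TypeVar("T")


def first_embedding(embeddings: Iterable[T]) -> T:
    # next() with a default avoids an unhandled StopIteration when the
    # iterator is empty; the caller sees a clear assertion instead.
    first = next(iter(embeddings), None)
    assert first is not None, "embedding generator yielded nothing"
    return first
```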
55 changes: 28 additions & 27 deletions tests/test_late_interaction_multimodal.py
@@ -1,5 +1,6 @@
 import os
 
+import pytest
 from PIL import Image
 import numpy as np
 
@@ -45,38 +46,38 @@
 
 
 def test_batch_embedding():
-    is_ci = os.getenv("CI")
+    if os.getenv("CI"):
+        pytest.skip("Colpali is too large to test in CI")
 
-    if not is_ci:
-        for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
-            print("evaluating", model_name)
-            model = LateInteractionMultimodalEmbedding(model_name=model_name)
-            result = list(model.embed_image(images, batch_size=2))
+    for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
+        print("evaluating", model_name)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name)
+        result = list(model.embed_image(images, batch_size=2))
 
-            for value in result:
-                token_num, abridged_dim = expected_result.shape
-                assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=2e-3)
+        for value in result:
+            token_num, abridged_dim = expected_result.shape
+            assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=2e-3)
 
 
 def test_single_embedding():
-    is_ci = os.getenv("CI")
-    if not is_ci:
-        for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
-            print("evaluating", model_name)
-            model = LateInteractionMultimodalEmbedding(model_name=model_name)
-            result = next(iter(model.embed_image(images, batch_size=6)))
-            token_num, abridged_dim = expected_result.shape
-            assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
+    if os.getenv("CI"):
+        pytest.skip("Colpali is too large to test in CI")
+
+    for model_name, expected_result in CANONICAL_IMAGE_VALUES.items():
+        print("evaluating", model_name)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name)
+        result = next(iter(model.embed_image(images, batch_size=6)))
+        token_num, abridged_dim = expected_result.shape
+        assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
 
 
 def test_single_embedding_query():
-    is_ci = os.getenv("CI")
-    if not is_ci:
-        queries_to_embed = queries
-
-        for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
-            print("evaluating", model_name)
-            model = LateInteractionMultimodalEmbedding(model_name=model_name)
-            result = next(iter(model.embed_text(queries_to_embed)))
-            token_num, abridged_dim = expected_result.shape
-            assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
+    if os.getenv("CI"):
+        pytest.skip("Colpali is too large to test in CI")
+
+    for model_name, expected_result in CANONICAL_QUERY_VALUES.items():
+        print("evaluating", model_name)
+        model = LateInteractionMultimodalEmbedding(model_name=model_name)
+        result = next(iter(model.embed_text(queries)))
+        token_num, abridged_dim = expected_result.shape
+        assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3)
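Replacing the old `if not is_ci:` wrapper with `pytest.skip` is a behavioral improvement: in CI these tests are now reported as skipped with a reason, instead of passing vacuously without executing a single assertion. The shape of the pattern:

```python
import os

import pytest


def test_heavy_model() -> None:
    # Reported as skipped (with the reason) in CI runs, rather than as a
    # pass that checked nothing.
    if os.getenv("CI"):
        pytest.skip("model too large to test in CI")
    # ... real assertions run only locally or in manual runs
```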
49 changes: 30 additions & 19 deletions tests/test_sparse_embeddings.py
@@ -5,10 +5,10 @@
 
 from fastembed.sparse.bm25 import Bm25
 from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
-from tests.utils import delete_model_cache
+from tests.utils import delete_model_cache, should_test_model
 
 CANONICAL_COLUMN_VALUES = {
-    "prithvida/Splade_PP_en_v1": {
+    "prithivida/Splade_PP_en_v1": {
         "indices": [
             2040,
             2047,
@@ -49,28 +49,41 @@
 docs = ["Hello World"]
 
 
-def test_batch_embedding() -> None:
+@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
+def test_batch_embedding(model_name: str) -> None:
     is_ci = os.getenv("CI")
     docs_to_embed = docs * 10
 
-    for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
-        model = SparseTextEmbedding(model_name=model_name)
-        result = next(iter(model.embed(docs_to_embed, batch_size=6)))
-        assert result.indices.tolist() == expected_result["indices"]
+    model = SparseTextEmbedding(model_name=model_name)
+    result = next(iter(model.embed(docs_to_embed, batch_size=6)))
+    expected_result = CANONICAL_COLUMN_VALUES[model_name]
+    assert result.indices.tolist() == expected_result["indices"]
 
-        for i, value in enumerate(result.values):
-            assert pytest.approx(value, abs=0.001) == expected_result["values"][i]
-        if is_ci:
-            delete_model_cache(model.model._model_dir)
+    for i, value in enumerate(result.values):
+        assert pytest.approx(value, abs=0.001) == expected_result["values"][i]
+    if is_ci:
+        delete_model_cache(model.model._model_dir)
 
 
-def test_single_embedding() -> None:
+@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
+def test_single_embedding(model_name: str) -> None:
     is_ci = os.getenv("CI")
-    for model_name, expected_result in CANONICAL_COLUMN_VALUES.items():
+    is_manual = os.getenv("GITHUB_EVENT_NAME") == "workflow_dispatch"
+
+    for model_desc in SparseTextEmbedding._list_supported_models():
+        # Attention models and BM25 are also part of SparseTextEmbedding,
+        # but they have their own tests.
+        if model_desc.model not in CANONICAL_COLUMN_VALUES:
+            continue
+        if not should_test_model(model_desc, model_name, is_ci, is_manual):
+            continue
+
         model = SparseTextEmbedding(model_name=model_name)
 
         passage_result = next(iter(model.embed(docs, batch_size=6)))
         query_result = next(iter(model.query_embed(docs)))
+        expected_result = CANONICAL_COLUMN_VALUES[model_name]
         for result in [passage_result, query_result]:
             assert result.indices.tolist() == expected_result["indices"]
@@ -80,9 +93,10 @@ def test_single_embedding() -> None:
             delete_model_cache(model.model._model_dir)
 
 
-def test_parallel_processing() -> None:
+@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
+def test_parallel_processing(model_name: str) -> None:
     is_ci = os.getenv("CI")
-    model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")
+    model = SparseTextEmbedding(model_name=model_name)
     docs = ["hello world", "flag embedding"] * 30
     sparse_embeddings_duo = list(model.embed(docs, batch_size=10, parallel=2))
     sparse_embeddings_all = list(model.embed(docs, batch_size=10, parallel=0))
@@ -172,10 +186,7 @@ def test_disable_stemmer_behavior(disable_stemmer: bool) -> None:
     assert result == expected, f"Expected {expected}, but got {result}"
 
 
-@pytest.mark.parametrize(
-    "model_name",
-    ["prithivida/Splade_PP_en_v1"],
-)
+@pytest.mark.parametrize("model_name", ["prithivida/Splade_PP_en_v1"])
 def test_lazy_load(model_name: str) -> None:
     is_ci = os.getenv("CI")
     model = SparseTextEmbedding(model_name=model_name, lazy_load=True)
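For reference, the sparse-embedding assertions in this file compare exact token indices and approximate weights. A standalone illustration with invented values (not the canonical ones above):

```python
import pytest

# A sparse embedding is an (indices, values) pair: indices must match
# exactly, values only within an absolute tolerance. Values are made up.
expected = {"indices": [2040, 2047], "values": [0.45, 0.31]}
result_indices = [2040, 2047]
result_values = [0.4503, 0.3099]

assert result_indices == expected["indices"]
for i, value in enumerate(result_values):
    assert pytest.approx(value, abs=0.001) == expected["values"][i]
```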
