Testing Vector Embedding Operations in LlamaIndex Core
This test suite validates core embedding functionality in LlamaIndex: batched text embedding generation, cosine and Euclidean similarity calculations, and mean aggregation of embeddings. It uses a patched MockEmbedding model so that vector representations and similarity comparisons can be verified deterministically.
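For orientation, the cosine check in the suite reduces to the standard formula: the dot product scaled by the product of the two vector norms. Below is a minimal standalone sketch, assuming NumPy is available; the helper name cosine_similarity is illustrative and is not part of LlamaIndex or the test file:

import numpy as np

def cosine_similarity(a, b):
    # Dot product scaled by both vector norms.
    a_arr = np.asarray(a, dtype=float)
    b_arr = np.asarray(b, dtype=float)
    return float(np.dot(a_arr, b_arr) / (np.linalg.norm(a_arr) * np.linalg.norm(b_arr)))

# Matches the expectation in test_embedding_similarity below:
# cosine_similarity([0.0, 1.0, 0.0], [3.0, 4.0, 0.0]) == 4 / (1 * 5) == 0.8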
Source: llama-index-core/tests/embeddings/test_base.py in run-llama/llama_index
"""Embeddings."""
from typing import Any, List
from unittest.mock import patch
from llama_index.core.base.embeddings.base import SimilarityMode, mean_agg
from llama_index.core.embeddings.mock_embed_model import MockEmbedding
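
# MockEmbedding stands in for a real embedding model. The batch test below
# patches its private text-embedding hooks with the mocks defined in this
# file so that every known input string maps to a fixed one-hot vector.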

def mock_get_text_embedding(text: str) -> List[float]:
    """Mock get text embedding."""
    # assume dimensions are 5
    if text == "Hello world.":
        return [1, 0, 0, 0, 0]
    elif text == "This is a test.":
        return [0, 1, 0, 0, 0]
    elif text == "This is another test.":
        return [0, 0, 1, 0, 0]
    elif text == "This is a test v2.":
        return [0, 0, 0, 1, 0]
    elif text == "This is a test v3.":
        return [0, 0, 0, 0, 1]
    elif text == "This is bar test.":
        return [0, 0, 1, 0, 0]
    elif text == "Hello world backup.":
        # this is used when "Hello world." is deleted.
        return [1, 0, 0, 0, 0]
    else:
        raise ValueError("Invalid text for `mock_get_text_embedding`.")

def mock_get_text_embeddings(texts: List[str]) -> List[List[float]]:
    """Mock get text embeddings."""
    return [mock_get_text_embedding(text) for text in texts]

@patch.object(MockEmbedding, "_get_text_embedding", side_effect=mock_get_text_embedding)
@patch.object(
    MockEmbedding, "_get_text_embeddings", side_effect=mock_get_text_embeddings
)
def test_get_text_embeddings(
    _mock_get_text_embeddings: Any, _mock_get_text_embedding: Any
) -> None:
"""Test get queued text embeddings."""
embed_model = MockEmbedding(embed_dim=8)
texts_to_embed = []
for i in range(8):
texts_to_embed.append("Hello world.")
for i in range(8):
texts_to_embed.append("This is a test.")
for i in range(4):
texts_to_embed.append("This is another test.")
for i in range(4):
texts_to_embed.append("This is a test v2.")
result_embeddings = embed_model.get_text_embedding_batch(texts_to_embed)
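    # All 24 queued texts should come back in the original input order,
    # regardless of how get_text_embedding_batch chunks them internally.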
    for i in range(8):
        assert result_embeddings[i] == [1, 0, 0, 0, 0]
    for i in range(8, 16):
        assert result_embeddings[i] == [0, 1, 0, 0, 0]
    for i in range(16, 20):
        assert result_embeddings[i] == [0, 0, 1, 0, 0]
    for i in range(20, 24):
        assert result_embeddings[i] == [0, 0, 0, 1, 0]

def test_embedding_similarity() -> None:
    """Test embedding similarity."""
    embed_model = MockEmbedding(embed_dim=3)
    text_embedding = [3.0, 4.0, 0.0]
    query_embedding = [0.0, 1.0, 0.0]
    cosine = embed_model.similarity(query_embedding, text_embedding)
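    # Expected value: dot([0,1,0], [3,4,0]) = 4, the norms are 1 and 5,
    # so cosine similarity = 4 / (1 * 5) = 0.8.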
    assert cosine == 0.8

def test_embedding_similarity_euclidean() -> None:
    embed_model = MockEmbedding(embed_dim=2)
    query_embedding = [1.0, 0.0]
    text1_embedding = [0.0, 1.0]  # farther from query_embedding, distance ≈ 1.414
    text2_embedding = [1.0, 1.0]  # closer to query_embedding, distance = 1.0
    euclidean_similarity1 = embed_model.similarity(
        query_embedding, text1_embedding, mode=SimilarityMode.EUCLIDEAN
    )
    euclidean_similarity2 = embed_model.similarity(
        query_embedding, text2_embedding, mode=SimilarityMode.EUCLIDEAN
    )
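    # In EUCLIDEAN mode a smaller distance yields a higher similarity score,
    # so the closer text2_embedding must outrank text1_embedding; the assert
    # relies only on this ordering, not on exact score values.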
    assert euclidean_similarity1 < euclidean_similarity2

def test_mean_agg() -> None:
    """Test mean aggregation for embeddings."""
    embedding_0 = [3.0, 4.0, 0.0]
    embedding_1 = [0.0, 1.0, 0.0]
    output = mean_agg([embedding_0, embedding_1])
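    # Element-wise mean: ([3.0, 4.0, 0.0] + [0.0, 1.0, 0.0]) / 2 == [1.5, 2.5, 0.0].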
    assert output == [1.5, 2.5, 0.0]