Testing Sentence Embedding Similarity Calculations in FastChat
This test suite evaluates sentence similarity using different embedding models, including Vicuna-7B and OpenAI's Ada models. It computes cosine similarity between embeddings of similar and dissimilar text phrases to compare the effectiveness of the different embedding approaches.
Test Coverage Overview
Implementation Analysis
Technical Details
Best Practices Demonstrated
lm-sys/fastchat
playground/test_embedding/test_sentence_similarity.py
import json
import os
import numpy as np
import openai
import requests
from scipy.spatial.distance import cosine
def get_embedding_from_api(word, model="vicuna-7b-v1.5"):
    """Return the embedding of `word` as a 1-D numpy array.

    Models with "ada" in the name are routed to the OpenAI API; any other
    model name is assumed to be served by a local FastChat
    OpenAI-compatible server on localhost:8000.

    Args:
        word: Text (string) to embed.
        model: Embedding model name.

    Returns:
        np.ndarray of the embedding, or None if the local server request
        fails (the error is printed rather than raised).
    """
    if "ada" in model:
        resp = openai.Embedding.create(
            model=model,
            input=word,
        )
        return np.array(resp["data"][0]["embedding"])

    url = "http://localhost:8000/v1/embeddings"
    # `json=` serializes the payload and sets the Content-Type header for us.
    response = requests.post(
        url,
        json={"model": model, "input": word},
        timeout=60,  # don't hang forever if the local server stalls
    )
    if response.status_code == 200:
        return np.array(response.json()["data"][0]["embedding"])
    print(f"Error: {response.status_code} - {response.text}")
    return None
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    scipy's `cosine` gives the cosine *distance*, so the similarity is
    one minus that value (1.0 = identical direction, 0.0 = orthogonal).
    """
    distance = cosine(vec1, vec2)
    return 1 - distance
def print_cosine_similarity(embeddings, texts):
    """Print the cosine similarity for every unordered pair of texts.

    Args:
        embeddings: Mapping from text to its embedding vector.
        texts: Sequence of texts; pairs are reported in input order.
    """
    for idx, first in enumerate(texts):
        # Only pair with later entries so each pair is printed once.
        for second in texts[idx + 1:]:
            sim = cosine_similarity(embeddings[first], embeddings[second])
            print(f"Cosine similarity between '{first}' and '{second}': {sim:.2f}")
texts = [
    "The quick brown fox",
    "The quick brown dog",
    "The fast brown fox",
    "A completely different sentence",
]

# (printed label, model name) pairs, evaluated in order.
# Labels match the original script's output exactly.
_MODELS = [
    ("Vicuna-7B", "vicuna-7b-v1.5"),
    ("text-similarity-ada-001", "text-similarity-ada-001"),
    ("text-embedding-ada-002", "text-embedding-ada-002"),
]


def main():
    """Embed the sample texts with each model and print pairwise similarities."""
    for label, model in _MODELS:
        # Re-embed all texts with the current model; a failed request
        # leaves a None entry (get_embedding_from_api prints the error).
        embeddings = {text: get_embedding_from_api(text, model=model) for text in texts}
        print(f"{label}:")
        print_cosine_similarity(embeddings, texts)


if __name__ == "__main__":
    main()