Coverage for langsmith/_internal/_embedding_distance.py: 0%
84 statements
« prev ^ index » next coverage.py v7.10.1, created at 2025-12-11 16:15 -0800
« prev ^ index » next coverage.py v7.10.1, created at 2025-12-11 16:15 -0800
1from __future__ import annotations
3import logging
4from collections.abc import Sequence
5from typing import (
6 TYPE_CHECKING,
7 Any,
8 Callable,
9 Literal,
10 Optional,
11 Union,
12)
14from typing_extensions import TypedDict
16if TYPE_CHECKING:
17 import numpy as np # type: ignore
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Loose alias for matrix-like inputs: nested lists of floats, a list of
# arbitrary row objects, or anything else. The trailing ``Any`` member makes
# the alias effectively unconstrained for type checkers.
Matrix = Union[list[list[float]], list[Any], Any]
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Compute the pairwise cosine similarity between the rows of X and Y.

    Every row of ``X`` is compared with every row of ``Y``. With the NumPy
    implementation the result has one row per row of ``X`` and one column
    per row of ``Y``.

    If ``simsimd`` is importable it is used (on float32 copies of the
    inputs); otherwise the computation falls back to plain NumPy.

    Args:
        X: Two-dimensional matrix-like input (e.g. a list of vectors).
        Y: Two-dimensional matrix-like input with the same number of
            columns as ``X``.

    Returns:
        The similarities as a NumPy array. An empty array is returned when
        either input is empty. When ``simsimd`` yields a single scalar, it
        is wrapped in a one-element array.

    Raises:
        ValueError: If ``X`` and ``Y`` have a different number of columns.
    """
    import numpy as np

    if len(X) == 0 or len(Y) == 0:
        return np.array([])

    X = np.array(X)
    Y = np.array(Y)
    if X.shape[1] != Y.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {X.shape} "
            f"and Y has shape {Y.shape}."
        )

    # Keep the try body limited to the import so that only a missing
    # simsimd installation triggers the NumPy fallback.
    try:
        import simsimd as simd  # type: ignore
    except ImportError:
        simd = None
        logger.debug(
            "Unable to import simsimd, defaulting to NumPy implementation. If you want "
            "to use simsimd please install with `pip install simsimd`."
        )

    if simd is not None:
        X32 = np.array(X, dtype=np.float32)
        Y32 = np.array(Y, dtype=np.float32)
        # Newer simsimd releases return a DistancesTensor, which does not
        # support arithmetic directly; materialise it as a NumPy array first.
        distances = np.asarray(simd.cdist(X32, Y32, metric="cosine"))
        Z = 1 - distances
        if np.ndim(Z) == 0:
            # A single pair comes back as a scalar; wrap it in an array.
            return np.array([Z])
        return np.array(Z)

    X_norm = np.linalg.norm(X, axis=1)
    Y_norm = np.linalg.norm(Y, axis=1)
    # Divide-by-zero / invalid warnings are silenced here because the
    # resulting NaN/inf entries are replaced with 0.0 just below.
    with np.errstate(divide="ignore", invalid="ignore"):
        similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
    similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
    return similarity
62def _get_openai_encoder() -> Callable[[Sequence[str]], Sequence[Sequence[float]]]:
63 """Get the OpenAI GPT-3 encoder."""
64 try:
65 from openai import Client as OpenAIClient
66 except ImportError:
67 raise ImportError(
68 "THe default encoder for the EmbeddingDistance class uses the OpenAI API. "
69 "Please either install the openai library with `pip install openai` or "
70 "provide a custom encoder function (Callable[[str], Sequence[float]])."
71 )
73 def encode_text(texts: Sequence[str]) -> Sequence[Sequence[float]]:
74 client = OpenAIClient()
75 response = client.embeddings.create(
76 input=list(texts), model="text-embedding-3-small"
77 )
78 return [d.embedding for d in response.data]
80 return encode_text
class EmbeddingConfig(TypedDict, total=False):
    """Optional settings accepted by ``EmbeddingDistance``.

    Declared with ``total=False``, so either key may be omitted.
    """

    # Callable that takes a list of texts and returns one embedding vector
    # per text.
    encoder: Callable[[list[str]], Sequence[Sequence[float]]]
    # Name of the distance metric used to compare two embedding vectors.
    metric: Literal["cosine", "euclidean", "manhattan", "chebyshev", "hamming"]
class EmbeddingDistance:
    """Measure the distance between the embeddings of two texts.

    The texts are embedded with ``encoder`` and the resulting vectors are
    compared using the configured ``metric``.
    """

    def __init__(
        self,
        config: Optional[EmbeddingConfig] = None,
    ):
        """Initialise the evaluator.

        Args:
            config: Optional settings. A missing or falsy ``metric`` falls
                back to ``"cosine"``; a missing or falsy ``encoder`` falls
                back to the OpenAI-based default encoder.
        """
        config = config or {}
        self.distance = config.get("metric") or "cosine"
        self.encoder = config.get("encoder") or _get_openai_encoder()

    def evaluate(
        self,
        prediction: str,
        reference: str,
    ) -> float:
        """Embed both texts and return the distance between them.

        Args:
            prediction: The first text.
            reference: The second text.

        Returns:
            The distance under the configured metric, as a Python scalar.

        Raises:
            ImportError: If NumPy is not installed.
            ValueError: If the configured metric is not recognised.
        """
        try:
            import numpy as np
        except ImportError as err:
            raise ImportError(
                "The EmbeddingDistance class requires NumPy. Please install it with "
                "`pip install numpy`."
            ) from err
        # Both texts are encoded in a single call; row 0 is the prediction
        # and row 1 is the reference.
        embeddings = self.encoder([prediction, reference])
        vector = np.array(embeddings)
        return self._compute_distance(vector[0], vector[1]).item()

    def _compute_distance(self, a: np.ndarray, b: np.ndarray) -> np.floating:
        """Dispatch to the helper matching ``self.distance``.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.floating: The distance (the cosine helper returns an array).

        Raises:
            ValueError: If ``self.distance`` is not a supported metric.
        """
        if self.distance == "cosine":
            return self._cosine_distance(a, b)  # type: ignore
        elif self.distance == "euclidean":
            return self._euclidean_distance(a, b)
        elif self.distance == "manhattan":
            return self._manhattan_distance(a, b)
        elif self.distance == "chebyshev":
            return self._chebyshev_distance(a, b)
        elif self.distance == "hamming":
            return self._hamming_distance(a, b)
        else:
            raise ValueError(f"Invalid distance metric: {self.distance}")

    @staticmethod
    def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
        """Compute the cosine distance between two vectors.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.ndarray: The cosine distance.
        """
        # Each vector is wrapped as a one-row matrix for cosine_similarity.
        return 1.0 - cosine_similarity([a], [b])

    @staticmethod
    def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        """Compute the Euclidean distance between two vectors.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.floating: The Euclidean distance.
        """
        # NumPy is only imported for type checking at module level, so it
        # must be imported here to be available at runtime.
        import numpy as np

        return np.linalg.norm(a - b)

    @staticmethod
    def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        """Compute the Manhattan distance between two vectors.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.floating: The Manhattan distance.
        """
        # Local import: the module-level NumPy import is type-checking only.
        import numpy as np

        return np.sum(np.abs(a - b))

    @staticmethod
    def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        """Compute the Chebyshev distance between two vectors.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.floating: The Chebyshev distance.
        """
        # Local import: the module-level NumPy import is type-checking only.
        import numpy as np

        return np.max(np.abs(a - b))

    @staticmethod
    def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        """Compute the Hamming distance between two vectors.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.floating: The fraction of positions at which the vectors differ.
        """
        # Local import: the module-level NumPy import is type-checking only.
        import numpy as np

        return np.mean(a != b)