Coverage for langsmith/_internal/_embedding_distance.py: 0%

84 statements  

« prev     ^ index     » next       coverage.py v7.10.1, created at 2025-12-11 16:15 -0800

1from __future__ import annotations 

2 

3import logging 

4from collections.abc import Sequence 

5from typing import ( 

6 TYPE_CHECKING, 

7 Any, 

8 Callable, 

9 Literal, 

10 Optional, 

11 Union, 

12) 

13 

14from typing_extensions import TypedDict 

15 

16if TYPE_CHECKING: 

17 import numpy as np # type: ignore 

18 

19 

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Loose alias for matrix-like inputs such as nested lists of floats. Because the
# union includes ``Any``, type checkers accept any value where ``Matrix`` is used.
Matrix = Union[list[list[float]], list[Any], Any]

23 

24 

def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Compute the pairwise cosine similarity between the rows of two matrices.

    Every row of ``X`` is compared against every row of ``Y``. The optional
    ``simsimd`` package is used when it can be imported; otherwise the
    computation falls back to plain NumPy.

    Args:
        X (Matrix): Matrix-like input (anything ``np.array`` accepts) whose
            rows are the vectors to compare.
        Y (Matrix): Matrix-like input with the same number of columns as ``X``.

    Returns:
        np.ndarray: An empty array if either input has length zero. Otherwise
        the similarities; in the NumPy fallback this has shape
        ``(len(X), len(Y))`` and any entry that would be NaN or infinite
        (for example when a row has zero norm) is set to ``0.0``.

    Raises:
        ValueError: If ``X`` and ``Y`` do not have the same number of columns.
    """
    import numpy as np

    if len(X) == 0 or len(Y) == 0:
        return np.array([])

    X = np.array(X)
    Y = np.array(Y)
    if X.shape[1] != Y.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {X.shape} "
            f"and Y has shape {Y.shape}."
        )
    try:
        import simsimd as simd  # type: ignore

        X32 = np.array(X, dtype=np.float32)
        Y32 = np.array(Y, dtype=np.float32)
        # NOTE(review): simsimd may return its own tensor type (or a bare float)
        # rather than an ndarray, so convert before doing arithmetic on the
        # result -- confirm against the installed simsimd version.
        distances = np.asarray(simd.cdist(X32, Y32, metric="cosine"))
        # A scalar result is promoted to a 1-element array, as before.
        return np.atleast_1d(1 - distances)
    except ImportError:
        logger.debug(
            "Unable to import simsimd, defaulting to NumPy implementation. If you want "
            "to use simsimd please install with `pip install simsimd`."
        )
    X_norm = np.linalg.norm(X, axis=1)
    Y_norm = np.linalg.norm(Y, axis=1)
    # Silence divide-by-zero / invalid-value warnings; the resulting NaN and
    # infinite entries are replaced with 0.0 right below.
    with np.errstate(divide="ignore", invalid="ignore"):
        similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
    similarity[~np.isfinite(similarity)] = 0.0
    return similarity

60 

61 

def _get_openai_encoder() -> Callable[[Sequence[str]], Sequence[Sequence[float]]]:
    """Build the default text encoder backed by the OpenAI embeddings API.

    Returns:
        Callable[[Sequence[str]], Sequence[Sequence[float]]]: A function that
        sends a batch of texts to the ``text-embedding-3-small`` model and
        returns the ``embedding`` of each item in the response's ``data``.

    Raises:
        ImportError: If the ``openai`` package cannot be imported.
    """
    try:
        from openai import Client as OpenAIClient
    except ImportError as e:
        raise ImportError(
            "The default encoder for the EmbeddingDistance class uses the OpenAI API. "
            "Please either install the openai library with `pip install openai` or "
            "provide a custom encoder function "
            "(Callable[[Sequence[str]], Sequence[Sequence[float]]])."
        ) from e

    def encode_text(texts: Sequence[str]) -> Sequence[Sequence[float]]:
        # The client is constructed on each call rather than once when the
        # encoder is built.
        client = OpenAIClient()
        response = client.embeddings.create(
            input=list(texts), model="text-embedding-3-small"
        )
        return [d.embedding for d in response.data]

    return encode_text

81 

82 

class EmbeddingConfig(TypedDict, total=False):
    """Optional settings for ``EmbeddingDistance``.

    The dictionary is declared with ``total=False``, so either key may be
    left out.
    """

    # Function that maps a list of texts to one embedding vector per text.
    encoder: Callable[[list[str]], Sequence[Sequence[float]]]
    # Name of the distance metric to apply to the two embedding vectors.
    metric: Literal["cosine", "euclidean", "manhattan", "chebyshev", "hamming"]

86 

87 

class EmbeddingDistance:
    """Measure the distance between the embeddings of two texts.

    Both texts are embedded with the configured encoder and the configured
    metric is then applied to the two resulting vectors.
    """

    def __init__(
        self,
        config: Optional[EmbeddingConfig] = None,
    ):
        """Store the metric and encoder, applying defaults for missing values.

        Args:
            config (Optional[EmbeddingConfig]): Optional settings. A missing
                or falsy ``metric`` becomes ``"cosine"``; a missing or falsy
                ``encoder`` becomes the result of ``_get_openai_encoder()``.
        """
        config = config or {}
        self.distance = config.get("metric") or "cosine"
        self.encoder = config.get("encoder") or _get_openai_encoder()

    def evaluate(
        self,
        prediction: str,
        reference: str,
    ) -> float:
        """Embed both texts and return the distance between their embeddings.

        Args:
            prediction (str): The first text.
            reference (str): The second text.

        Returns:
            float: The distance under the configured metric.

        Raises:
            ImportError: If NumPy cannot be imported.
            ValueError: If the configured metric is not a supported one.
        """
        try:
            import numpy as np
        except ImportError as e:
            raise ImportError(
                "The EmbeddingDistance class requires NumPy. Please install it with "
                "`pip install numpy`."
            ) from e
        # Encode both texts in a single call; row 0 is the prediction and
        # row 1 is the reference.
        embeddings = self.encoder([prediction, reference])
        vector = np.array(embeddings)
        # ``.item()`` converts the NumPy scalar (or single-element array) into
        # a plain Python number.
        return self._compute_distance(vector[0], vector[1]).item()

    def _compute_distance(self, a: np.ndarray, b: np.ndarray) -> np.floating:
        """Apply the configured metric to two vectors.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.floating: The distance. For the cosine metric the value is the
            array returned by ``_cosine_distance``.

        Raises:
            ValueError: If ``self.distance`` is not a supported metric name.
        """
        if self.distance == "cosine":
            return self._cosine_distance(a, b)  # type: ignore
        elif self.distance == "euclidean":
            return self._euclidean_distance(a, b)
        elif self.distance == "manhattan":
            return self._manhattan_distance(a, b)
        elif self.distance == "chebyshev":
            return self._chebyshev_distance(a, b)
        elif self.distance == "hamming":
            return self._hamming_distance(a, b)
        else:
            raise ValueError(f"Invalid distance metric: {self.distance}")

    @staticmethod
    def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
        """Compute the cosine distance between two vectors.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.ndarray: One minus the cosine similarity of the two vectors.
        """
        # Each vector is wrapped in a list so it is passed as a one-row matrix.
        return 1.0 - cosine_similarity([a], [b])

    @staticmethod
    def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        """Compute the Euclidean distance between two vectors.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.floating: The Euclidean distance.
        """
        # NumPy is only imported under TYPE_CHECKING at module level, so it
        # has to be imported here to exist at runtime.
        import numpy as np

        return np.linalg.norm(a - b)

    @staticmethod
    def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        """Compute the Manhattan distance between two vectors.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.floating: The sum of absolute element-wise differences.
        """
        # NumPy is only imported under TYPE_CHECKING at module level, so it
        # has to be imported here to exist at runtime.
        import numpy as np

        return np.sum(np.abs(a - b))

    @staticmethod
    def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        """Compute the Chebyshev distance between two vectors.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.floating: The largest absolute element-wise difference.
        """
        # NumPy is only imported under TYPE_CHECKING at module level, so it
        # has to be imported here to exist at runtime.
        import numpy as np

        return np.max(np.abs(a - b))

    @staticmethod
    def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        """Compute the Hamming distance between two vectors.

        Args:
            a (np.ndarray): The first vector.
            b (np.ndarray): The second vector.

        Returns:
            np.floating: The fraction of positions at which the vectors differ.
        """
        # NumPy is only imported under TYPE_CHECKING at module level, so it
        # has to be imported here to exist at runtime.
        import numpy as np

        return np.mean(a != b)