Coverage for langsmith/_expect.py: 39%

94 statements  

« prev     ^ index     » next       coverage.py v7.10.1, created at 2025-12-11 16:15 -0800

1"""Make approximate assertions as "expectations" on test results. 

2 

3This module is designed to be used within test cases decorated with the 

4`@pytest.mark.langsmith` decorator.

5 

6It allows you to log scores about a test case and optionally make assertions that log as 

7"expectation" feedback to LangSmith. 

8 

9Example: 

10 ```python 

11 import pytest 

12 from langsmith import expect 

13 

14 

15 @pytest.mark.langsmith 

16 def test_output_semantically_close(): 

17 response = oai_client.chat.completions.create( 

18 model="gpt-3.5-turbo", 

19 messages=[ 

20 {"role": "system", "content": "You are a helpful assistant."}, 

21 {"role": "user", "content": "Say hello!"}, 

22 ], 

23 ) 

24 response_txt = response.choices[0].message.content 

25 # Intended usage 

26 expect.embedding_distance( 

27 prediction=response_txt, 

28 reference="Hello!", 

29 ).to_be_less_than(0.9) 

30 

31 # Score the test case 

32 matcher = expect.edit_distance( 

33 prediction=response_txt, 

34 reference="Hello!", 

35 ) 

36 # Apply an assertion and log 'expectation' feedback to LangSmith 

37 matcher.to_be_less_than(1) 

38 

39 # You can also make assertions on values directly

40 expect.value(response_txt).to_contain("Hello!") 

41 # Or using a custom check 

42 expect.value(response_txt).against(lambda x: "Hello" in x) 

43 

44 # You can even use this for basic metric logging within tests 

45 

46 expect.score(0.8) 

47 expect.score(0.7, key="similarity").to_be_greater_than(0.7) 

48 ``` 

49""" # noqa: E501 

50 

51from __future__ import annotations 

52 

53import atexit 

54import inspect 

55from typing import ( 

56 TYPE_CHECKING, 

57 Any, 

58 Callable, 

59 Literal, 

60 Optional, 

61 Union, 

62 overload, 

63) 

64 

65from langsmith import client as ls_client 

66from langsmith import run_helpers as rh 

67from langsmith import run_trees as rt 

68from langsmith import utils as ls_utils 

69 

70if TYPE_CHECKING: 

71 from langsmith._internal._edit_distance import EditDistanceConfig 

72 from langsmith._internal._embedding_distance import EmbeddingConfig 

73 

74 

75# Sentinel class used until PEP 0661 is accepted 

76class _NULL_SENTRY: 

77 """A sentinel singleton class used to distinguish omitted keyword arguments 

78 from those passed in with the value None (which may have different behavior). 

79 """ # noqa: D205 

80 

81 def __bool__(self) -> Literal[False]: 

82 return False 

83 

84 def __repr__(self) -> str: 

85 return "NOT_GIVEN" 

86 

87 

88NOT_GIVEN = _NULL_SENTRY() 

89 

90 

91class _Matcher: 

92 """A class for making assertions on expectation values.""" 

93 

94 def __init__( 

95 self, 

96 client: Optional[ls_client.Client], 

97 key: str, 

98 value: Any, 

99 _executor: Optional[ls_utils.ContextThreadPoolExecutor] = None, 

100 run_id: Optional[str] = None, 

101 ): 

102 self._client = client 

103 self.key = key 

104 self.value = value 

105 self._executor = _executor or ls_utils.ContextThreadPoolExecutor(max_workers=3) 

106 rt = rh.get_current_run_tree() 

107 self._run_id = rt.trace_id if rt else run_id 

108 

109 def _submit_feedback(self, score: int, message: Optional[str] = None) -> None: 

110 if not ls_utils.test_tracking_is_disabled(): 

111 if not self._client: 

112 self._client = rt.get_cached_client() 

113 self._executor.submit( 

114 self._client.create_feedback, 

115 run_id=self._run_id, 

116 key="expectation", 

117 score=score, 

118 comment=message, 

119 ) 

120 

121 def _assert(self, condition: bool, message: str, method_name: str) -> None: 

122 try: 

123 assert condition, message 

124 self._submit_feedback(1, message=f"Success: {self.key}.{method_name}") 

125 except AssertionError as e: 

126 self._submit_feedback(0, repr(e)) 

127 raise e from None 

128 

129 def to_be_less_than(self, value: float) -> None: 

130 """Assert that the expectation value is less than the given value. 

131 

132 Args: 

133 value: The value to compare against. 

134 

135 Raises: 

136 AssertionError: If the expectation value is not less than the given value. 

137 """ 

138 self._assert( 

139 self.value < value, 

140 f"Expected {self.key} to be less than {value}, but got {self.value}", 

141 "to_be_less_than", 

142 ) 

143 

144 def to_be_greater_than(self, value: float) -> None: 

145 """Assert that the expectation value is greater than the given value. 

146 

147 Args: 

148 value: The value to compare against. 

149 

150 Raises: 

151 AssertionError: If the expectation value is not 

152 greater than the given value. 

153 """ 

154 self._assert( 

155 self.value > value, 

156 f"Expected {self.key} to be greater than {value}, but got {self.value}", 

157 "to_be_greater_than", 

158 ) 

159 

160 def to_be_between(self, min_value: float, max_value: float) -> None: 

161 """Assert that the expectation value is between the given min and max values. 

162 

163 Args: 

164 min_value: The minimum value (exclusive). 

165 max_value: The maximum value (exclusive). 

166 

167 Raises: 

168 AssertionError: If the expectation value is not between the min and max. 

169 """ 

170 self._assert( 

171 min_value < self.value < max_value, 

172 f"Expected {self.key} to be between {min_value} and {max_value}," 

173 f" but got {self.value}", 

174 "to_be_between", 

175 ) 

176 

177 def to_be_approximately(self, value: float, precision: int = 2) -> None: 

178 """Assert that the expectation value is approximately equal to the given value. 

179 

180 Args: 

181 value: The value to compare against. 

182 precision: The number of decimal places to round to for comparison. 

183 

184 Raises: 

185 AssertionError: If the rounded expectation value 

186 does not equal the rounded given value. 

187 """ 

188 self._assert( 

189 round(self.value, precision) == round(value, precision), 

190 f"Expected {self.key} to be approximately {value}, but got {self.value}", 

191 "to_be_approximately", 

192 ) 

193 

194 def to_equal(self, value: float) -> None: 

195 """Assert that the expectation value equals the given value. 

196 

197 Args: 

198 value: The value to compare against. 

199 

200 Raises: 

201 AssertionError: If the expectation value does 

202 not exactly equal the given value. 

203 """ 

204 self._assert( 

205 self.value == value, 

206 f"Expected {self.key} to be equal to {value}, but got {self.value}", 

207 "to_equal", 

208 ) 

209 

210 def to_be_none(self) -> None: 

211 """Assert that the expectation value is `None`. 

212 

213 Raises: 

214 AssertionError: If the expectation value is not `None`. 

215 """ 

216 self._assert( 

217 self.value is None, 

218 f"Expected {self.key} to be None, but got {self.value}", 

219 "to_be_none", 

220 ) 

221 

222 def to_contain(self, value: Any) -> None: 

223 """Assert that the expectation value contains the given value. 

224 

225 Args: 

226 value: The value to check for containment. 

227 

228 Raises: 

229 AssertionError: If the expectation value does not contain the given value. 

230 """ 

231 self._assert( 

232 value in self.value, 

233 f"Expected {self.key} to contain {value}, but it does not", 

234 "to_contain", 

235 ) 

236 

237 # Custom assertions 

238 def against(self, func: Callable, /) -> None: 

239 """Assert the expectation value against a custom function. 

240 

241 Args: 

242 func: A custom function that takes the expectation value as input. 

243 

244 Raises: 

245 AssertionError: If the custom function returns False. 

246 """ 

247 func_signature = inspect.signature(func) 

248 self._assert( 

249 func(self.value), 

250 f"Assertion {func_signature} failed for {self.key}", 

251 "against", 

252 ) 

253 

254 

class _Expect:
    """A class for setting expectations on test results.

    Scores are logged as feedback on the current run tree's trace (when one
    exists) and wrapped in `_Matcher` objects for follow-up assertions.
    """

    def __init__(
        self,
        *,
        client: Optional[ls_client.Client] = None,
        _executor: Optional[ls_utils.ContextThreadPoolExecutor] = None,
    ):
        """Create an expectation helper.

        Args:
            client: Client used to create feedback. When omitted, the cached
                client is looked up the first time feedback is submitted.
            _executor: Existing executor to share. When omitted, a new
                3-worker executor is created and its shutdown is registered
                to run at interpreter exit.
        """
        self._client = client
        if _executor is None:
            _executor = ls_utils.ContextThreadPoolExecutor(max_workers=3)
            # Only the instance that created the executor registers the
            # shutdown hook, so a shared executor is registered exactly once.
            atexit.register(_executor.shutdown, wait=True)
        self.executor = _executor

    def embedding_distance(
        self,
        prediction: str,
        reference: str,
        *,
        config: Optional[EmbeddingConfig] = None,
    ) -> _Matcher:
        """Compute the embedding distance between the prediction and reference.

        This logs the embedding distance to LangSmith and returns a `_Matcher` instance
        for making assertions on the distance value.

        By default, this uses the OpenAI API for computing embeddings.

        Args:
            prediction: The predicted string to compare.
            reference: The reference string to compare against.
            config: Optional configuration for the embedding distance evaluator.

                Supported options:

                - `encoder`: A custom encoder function to encode the list of input
                    strings to embeddings.

                    Defaults to the OpenAI API.
                - `metric`: The distance metric to use for comparison.

                    Supported values: `'cosine'`, `'euclidean'`, `'manhattan'`,
                    `'chebyshev'`, `'hamming'`.

        Returns:
            A `_Matcher` instance for the embedding distance value.


        Example:
            ```python
            expect.embedding_distance(
                prediction="hello",
                reference="hi",
            ).to_be_less_than(1.0)
            ```
        """  # noqa: E501
        # Imported lazily so the evaluator is only loaded when it is used.
        from langsmith._internal._embedding_distance import EmbeddingDistance

        config = config or {}
        encoder_func = "custom" if config.get("encoder") else "openai"
        evaluator = EmbeddingDistance(config=config)
        score = evaluator.evaluate(prediction=prediction, reference=reference)
        src_info = {"encoder": encoder_func, "metric": evaluator.distance}
        self._submit_feedback(
            "embedding_distance",
            {
                "score": score,
                "source_info": src_info,
                "comment": f"Using {encoder_func}, Metric: {evaluator.distance}",
            },
        )
        return _Matcher(
            self._client, "embedding_distance", score, _executor=self.executor
        )

    def edit_distance(
        self,
        prediction: str,
        reference: str,
        *,
        config: Optional[EditDistanceConfig] = None,
    ) -> _Matcher:
        """Compute the string distance between the prediction and reference.

        This logs the string distance (Damerau-Levenshtein) to LangSmith and returns
        a `_Matcher` instance for making assertions on the distance value.

        This depends on the `rapidfuzz` package for string distance computation.

        Args:
            prediction: The predicted string to compare.
            reference: The reference string to compare against.
            config: Optional configuration for the string distance evaluator.

                Supported options:

                - `metric`: The distance metric to use for comparison.

                    Supported values: `'damerau_levenshtein'`, `'levenshtein'`,
                    `'jaro'`, `'jaro_winkler'`, `'hamming'`, `'indel'`.
                - `normalize_score`: Whether to normalize the score between `0` and `1`.

        Returns:
            A `_Matcher` instance for the string distance value.

        Examples:
            ```python
            expect.edit_distance("hello", "helo").to_be_less_than(1)
            ```
        """
        # Imported lazily so the evaluator is only loaded when it is used.
        from langsmith._internal._edit_distance import EditDistance

        config = config or {}
        metric = config.get("metric") or "damerau_levenshtein"
        normalize = config.get("normalize_score", True)
        evaluator = EditDistance(config=config)
        score = evaluator.evaluate(prediction=prediction, reference=reference)
        src_info = {"metric": metric, "normalize": normalize}
        self._submit_feedback(
            "edit_distance",
            {
                "score": score,
                "source_info": src_info,
                "comment": f"Using {metric}, Normalize: {normalize}",
            },
        )
        return _Matcher(
            self._client,
            "edit_distance",
            score,
            _executor=self.executor,
        )

    def value(self, value: Any) -> _Matcher:
        """Create a `_Matcher` instance for making assertions on the given value.

        Args:
            value: The value to make assertions on.

        Returns:
            A `_Matcher` instance for the given value.

        Example:
            ```python
            expect.value(10).to_be_less_than(20)
            ```
        """
        return _Matcher(self._client, "value", value, _executor=self.executor)

    def score(
        self,
        score: Union[float, int, bool],
        *,
        key: str = "score",
        source_run_id: Optional[ls_client.ID_TYPE] = None,
        comment: Optional[str] = None,
    ) -> _Matcher:
        """Log a numeric score to LangSmith.

        Args:
            score: The score value to log.
            key: The key to use for logging the score. Defaults to `'score'`.
            source_run_id: Forwarded to `create_feedback` as `source_run_id`.
            comment: Optional comment stored with the feedback.

        Returns:
            A `_Matcher` instance for the logged score.

        Example:
            ```python
            expect.score(0.8)  # doctest: +ELLIPSIS
            <langsmith._expect._Matcher object at ...>

            expect.score(0.8, key="similarity").to_be_greater_than(0.7)
            ```
        """
        self._submit_feedback(
            key,
            {
                "score": score,
                "source_info": {"method": "expect.score"},
                "source_run_id": source_run_id,
                "comment": comment,
            },
        )
        return _Matcher(self._client, key, score, _executor=self.executor)

    ## Private Methods

    @overload
    def __call__(self, value: Any, /) -> _Matcher: ...

    @overload
    def __call__(self, /, *, client: ls_client.Client) -> _Expect: ...

    def __call__(
        self,
        value: Optional[Any] = NOT_GIVEN,
        /,
        client: Optional[ls_client.Client] = None,
    ) -> Union[_Expect, _Matcher]:
        """Wrap a value in a `_Matcher`, or derive an `_Expect` bound to a client.

        Args:
            value: Value to make assertions on. When omitted, an `_Expect`
                instance is returned instead of a `_Matcher`.
            client: Client for the derived instance. When omitted, this
                instance's client is inherited.

        Returns:
            A `_Matcher` for `value` when one is given, otherwise a new
            `_Expect`.
        """
        # Inherit the configured client so that `expect(client=c)(value)`
        # still reports through `c` instead of silently dropping it.
        if client is None:
            client = self._client
        # Share this instance's executor: creating a fresh one per call would
        # pile up executors and atexit registrations for every `expect(value)`.
        expected = _Expect(client=client, _executor=self.executor)
        if value is not NOT_GIVEN:
            return expected.value(value)
        return expected

    def _submit_feedback(self, key: str, results: dict):
        """Queue feedback for the current trace unless tracking is disabled.

        Args:
            key: Feedback key to log under.
            results: Extra keyword arguments forwarded to `create_feedback`.
        """
        current_run = rh.get_current_run_tree()
        # Without a current run tree the feedback is submitted with run_id=None.
        run_id = current_run.trace_id if current_run else None
        if not ls_utils.test_tracking_is_disabled():
            if not self._client:
                self._client = rt.get_cached_client()
            self.executor.submit(
                self._client.create_feedback, run_id=run_id, key=key, **results
            )

459 

460 

461expect = _Expect() 

462 

463__all__ = ["expect"]