Coverage for langsmith/_expect.py: 39%
94 statements
« prev ^ index » next coverage.py v7.10.1, created at 2025-12-11 16:15 -0800
« prev ^ index » next coverage.py v7.10.1, created at 2025-12-11 16:15 -0800
1"""Make approximate assertions as "expectations" on test results.
3This module is designed to be used within test cases decorated with the
4`@pytest.mark.langsmith` decorator.
6It allows you to log scores about a test case and optionally make assertions that log as
7"expectation" feedback to LangSmith.
9Example:
10 ```python
11 import pytest
12 from langsmith import expect
15 @pytest.mark.langsmith
16 def test_output_semantically_close():
17 response = oai_client.chat.completions.create(
18 model="gpt-3.5-turbo",
19 messages=[
20 {"role": "system", "content": "You are a helpful assistant."},
21 {"role": "user", "content": "Say hello!"},
22 ],
23 )
24 response_txt = response.choices[0].message.content
25 # Intended usage
26 expect.embedding_distance(
27 prediction=response_txt,
28 reference="Hello!",
29 ).to_be_less_than(0.9)
31 # Score the test case
32 matcher = expect.edit_distance(
33 prediction=response_txt,
34 reference="Hello!",
35 )
36 # Apply an assertion and log 'expectation' feedback to LangSmith
37 matcher.to_be_less_than(1)
39 # You can also make assertions on values directly
40 expect.value(response_txt).to_contain("Hello!")
41 # Or using a custom check
42 expect.value(response_txt).against(lambda x: "Hello" in x)
44 # You can even use this for basic metric logging within tests
46 expect.score(0.8)
47 expect.score(0.7, key="similarity").to_be_greater_than(0.7)
48 ```
49""" # noqa: E501
51from __future__ import annotations
53import atexit
54import inspect
55from typing import (
56 TYPE_CHECKING,
57 Any,
58 Callable,
59 Literal,
60 Optional,
61 Union,
62 overload,
63)
65from langsmith import client as ls_client
66from langsmith import run_helpers as rh
67from langsmith import run_trees as rt
68from langsmith import utils as ls_utils
70if TYPE_CHECKING:
71 from langsmith._internal._edit_distance import EditDistanceConfig
72 from langsmith._internal._embedding_distance import EmbeddingConfig
# Stand-in sentinel type, needed only until PEP 0661 is accepted.
class _NULL_SENTRY:
    """Sentinel type marking a keyword argument that was not supplied.

    It lets a callee tell "argument omitted" apart from an explicit `None`,
    which may have different behavior.
    """

    def __repr__(self) -> str:
        return "NOT_GIVEN"

    def __bool__(self) -> Literal[False]:
        # The sentinel is always falsy.
        return False


NOT_GIVEN = _NULL_SENTRY()
class _Matcher:
    """Make assertions on an expectation value and report each outcome.

    Every public assertion helper funnels through `_assert`, which submits
    "expectation" feedback (score `1` on success, `0` on failure) unless test
    tracking is disabled, and raises `AssertionError` when the check fails.
    """

    def __init__(
        self,
        client: Optional[ls_client.Client],
        key: str,
        value: Any,
        _executor: Optional[ls_utils.ContextThreadPoolExecutor] = None,
        run_id: Optional[str] = None,
    ):
        """Store the value under test and resolve the run to attach feedback to.

        Args:
            client: Client used to create feedback. When falsy, a cached client is
                looked up the first time feedback is submitted.
            key: Name of the expectation, used in assertion messages.
            value: The value assertions are made on.
            _executor: Executor used to submit feedback. A new one is created when
                omitted.
            run_id: Fallback run ID, used only when there is no current run tree.
        """
        self._client = client
        self.key = key
        self.value = value
        self._executor = _executor or ls_utils.ContextThreadPoolExecutor(max_workers=3)
        # Named `run_tree` (not `rt`) so the `run_trees` module alias, which
        # `_submit_feedback` relies on, is never shadowed.
        run_tree = rh.get_current_run_tree()
        self._run_id = run_tree.trace_id if run_tree else run_id

    def _submit_feedback(self, score: int, message: Optional[str] = None) -> None:
        """Queue "expectation" feedback unless test tracking is disabled.

        Args:
            score: Feedback score (`1` for a passed expectation, `0` for a failed one).
            message: Optional comment stored with the feedback.
        """
        if ls_utils.test_tracking_is_disabled():
            return
        if not self._client:
            self._client = rt.get_cached_client()
        self._executor.submit(
            self._client.create_feedback,
            run_id=self._run_id,
            key="expectation",
            score=score,
            comment=message,
        )

    def _assert(self, condition: bool, message: str, method_name: str) -> None:
        """Check `condition`, record the outcome as feedback, and raise on failure.

        Args:
            condition: Result of the check; evaluated for truthiness.
            message: Message carried by the raised `AssertionError`.
            method_name: Name of the calling assertion helper, used in the success
                comment.

        Raises:
            AssertionError: If `condition` is falsy.
        """
        if not condition:
            # Raise explicitly rather than via an `assert` statement: asserts are
            # stripped under `python -O`, which would turn every expectation into
            # a silent pass that is still logged as a success.
            error = AssertionError(message)
            self._submit_feedback(0, repr(error))
            raise error from None
        self._submit_feedback(1, message=f"Success: {self.key}.{method_name}")

    def to_be_less_than(self, value: float) -> None:
        """Assert that the expectation value is less than the given value.

        Args:
            value: The value to compare against.

        Raises:
            AssertionError: If the expectation value is not less than the given value.
        """
        self._assert(
            self.value < value,
            f"Expected {self.key} to be less than {value}, but got {self.value}",
            "to_be_less_than",
        )

    def to_be_greater_than(self, value: float) -> None:
        """Assert that the expectation value is greater than the given value.

        Args:
            value: The value to compare against.

        Raises:
            AssertionError: If the expectation value is not
                greater than the given value.
        """
        self._assert(
            self.value > value,
            f"Expected {self.key} to be greater than {value}, but got {self.value}",
            "to_be_greater_than",
        )

    def to_be_between(self, min_value: float, max_value: float) -> None:
        """Assert that the expectation value is between the given min and max values.

        Args:
            min_value: The minimum value (exclusive).
            max_value: The maximum value (exclusive).

        Raises:
            AssertionError: If the expectation value is not between the min and max.
        """
        self._assert(
            min_value < self.value < max_value,
            f"Expected {self.key} to be between {min_value} and {max_value},"
            f" but got {self.value}",
            "to_be_between",
        )

    def to_be_approximately(self, value: float, precision: int = 2) -> None:
        """Assert that the expectation value is approximately equal to the given value.

        Both values are rounded with `round(..., precision)` before comparison.

        Args:
            value: The value to compare against.
            precision: The number of decimal places to round to for comparison.

        Raises:
            AssertionError: If the rounded expectation value
                does not equal the rounded given value.
        """
        self._assert(
            round(self.value, precision) == round(value, precision),
            f"Expected {self.key} to be approximately {value}, but got {self.value}",
            "to_be_approximately",
        )

    def to_equal(self, value: float) -> None:
        """Assert that the expectation value equals the given value.

        Args:
            value: The value to compare against.

        Raises:
            AssertionError: If the expectation value does
                not exactly equal the given value.
        """
        self._assert(
            self.value == value,
            f"Expected {self.key} to be equal to {value}, but got {self.value}",
            "to_equal",
        )

    def to_be_none(self) -> None:
        """Assert that the expectation value is `None`.

        Raises:
            AssertionError: If the expectation value is not `None`.
        """
        self._assert(
            self.value is None,
            f"Expected {self.key} to be None, but got {self.value}",
            "to_be_none",
        )

    def to_contain(self, value: Any) -> None:
        """Assert that the expectation value contains the given value.

        Args:
            value: The value to check for containment.

        Raises:
            AssertionError: If the expectation value does not contain the given value.
        """
        self._assert(
            value in self.value,
            f"Expected {self.key} to contain {value}, but it does not",
            "to_contain",
        )

    # Custom assertions
    def against(self, func: Callable, /) -> None:
        """Assert the expectation value against a custom function.

        Args:
            func: A custom function that takes the expectation value as input.

        Raises:
            AssertionError: If the custom function returns a falsy result.
        """
        try:
            description = str(inspect.signature(func))
        except (TypeError, ValueError):
            # Some callables expose no introspectable signature. Describe them by
            # their repr instead of failing before the check itself has run.
            description = repr(func)
        self._assert(
            func(self.value),
            f"Assertion {description} failed for {self.key}",
            "against",
        )
class _Expect:
    """A class for setting expectations on test results.

    Each instance owns a `ContextThreadPoolExecutor` used to submit feedback
    and registers that executor's `shutdown` with `atexit`.
    """

    def __init__(self, *, client: Optional[ls_client.Client] = None):
        """Create the feedback executor and schedule its shutdown at exit.

        Args:
            client: Client used to create feedback. When omitted, a cached client
                is looked up the first time feedback is submitted.
        """
        self._client = client
        self.executor = ls_utils.ContextThreadPoolExecutor(max_workers=3)
        atexit.register(self.executor.shutdown, wait=True)

    def embedding_distance(
        self,
        prediction: str,
        reference: str,
        *,
        config: Optional[EmbeddingConfig] = None,
    ) -> _Matcher:
        """Compute the embedding distance between the prediction and reference.

        This logs the embedding distance to LangSmith and returns a `_Matcher` instance
        for making assertions on the distance value.

        By default, this uses the OpenAI API for computing embeddings.

        Args:
            prediction: The predicted string to compare.
            reference: The reference string to compare against.
            config: Optional configuration for the embedding distance evaluator.

                Supported options:

                - `encoder`: A custom encoder function to encode the list of input
                    strings to embeddings.

                    Defaults to the OpenAI API.
                - `metric`: The distance metric to use for comparison.

                    Supported values: `'cosine'`, `'euclidean'`, `'manhattan'`,
                    `'chebyshev'`, `'hamming'`.

        Returns:
            A `_Matcher` instance for the embedding distance value.

        Example:
            ```python
            expect.embedding_distance(
                prediction="hello",
                reference="hi",
            ).to_be_less_than(1.0)
            ```
        """  # noqa: E501
        from langsmith._internal._embedding_distance import EmbeddingDistance

        config = config or {}
        encoder_func = "custom" if config.get("encoder") else "openai"
        evaluator = EmbeddingDistance(config=config)
        score = evaluator.evaluate(prediction=prediction, reference=reference)
        src_info = {"encoder": encoder_func, "metric": evaluator.distance}
        self._submit_feedback(
            "embedding_distance",
            {
                "score": score,
                "source_info": src_info,
                "comment": f"Using {encoder_func}, Metric: {evaluator.distance}",
            },
        )
        return _Matcher(
            self._client, "embedding_distance", score, _executor=self.executor
        )

    def edit_distance(
        self,
        prediction: str,
        reference: str,
        *,
        config: Optional[EditDistanceConfig] = None,
    ) -> _Matcher:
        """Compute the string distance between the prediction and reference.

        This logs the string distance (Damerau-Levenshtein) to LangSmith and returns
        a `_Matcher` instance for making assertions on the distance value.

        This depends on the `rapidfuzz` package for string distance computation.

        Args:
            prediction: The predicted string to compare.
            reference: The reference string to compare against.
            config: Optional configuration for the string distance evaluator.

                Supported options:

                - `metric`: The distance metric to use for comparison.

                    Supported values: `'damerau_levenshtein'`, `'levenshtein'`,
                    `'jaro'`, `'jaro_winkler'`, `'hamming'`, `'indel'`.
                - `normalize_score`: Whether to normalize the score between `0` and `1`.

        Returns:
            A `_Matcher` instance for the string distance value.

        Example:
            ```python
            expect.edit_distance("hello", "helo").to_be_less_than(1)
            ```
        """
        from langsmith._internal._edit_distance import EditDistance

        config = config or {}
        metric = config.get("metric") or "damerau_levenshtein"
        normalize = config.get("normalize_score", True)
        evaluator = EditDistance(config=config)
        score = evaluator.evaluate(prediction=prediction, reference=reference)
        src_info = {"metric": metric, "normalize": normalize}
        self._submit_feedback(
            "edit_distance",
            {
                "score": score,
                "source_info": src_info,
                "comment": f"Using {metric}, Normalize: {normalize}",
            },
        )
        return _Matcher(
            self._client,
            "edit_distance",
            score,
            _executor=self.executor,
        )

    def value(self, value: Any) -> _Matcher:
        """Create a `_Matcher` instance for making assertions on the given value.

        Args:
            value: The value to make assertions on.

        Returns:
            A `_Matcher` instance for the given value.

        Example:
            ```python
            expect.value(10).to_be_less_than(20)
            ```
        """
        return _Matcher(self._client, "value", value, _executor=self.executor)

    def score(
        self,
        score: Union[float, int, bool],
        *,
        key: str = "score",
        source_run_id: Optional[ls_client.ID_TYPE] = None,
        comment: Optional[str] = None,
    ) -> _Matcher:
        """Log a numeric score to LangSmith.

        Args:
            score: The score value to log.
            key: The key to use for logging the score. Defaults to `'score'`.
            source_run_id: Optional run ID submitted with the feedback as
                `source_run_id`.
            comment: Optional comment submitted with the feedback.

        Returns:
            A `_Matcher` instance for the logged score.

        Example:
            ```python
            expect.score(0.8)  # doctest: +ELLIPSIS
            <langsmith._expect._Matcher object at ...>

            expect.score(0.8, key="similarity").to_be_greater_than(0.7)
            ```
        """
        self._submit_feedback(
            key,
            {
                "score": score,
                "source_info": {"method": "expect.score"},
                "source_run_id": source_run_id,
                "comment": comment,
            },
        )
        return _Matcher(self._client, key, score, _executor=self.executor)

    ## Private Methods

    @overload
    def __call__(self, value: Any, /) -> _Matcher: ...

    @overload
    def __call__(self, /, *, client: ls_client.Client) -> _Expect: ...

    def __call__(
        self,
        value: Optional[Any] = NOT_GIVEN,
        /,
        client: Optional[ls_client.Client] = None,
    ) -> Union[_Expect, _Matcher]:
        """Wrap a value in a `_Matcher`, or build an `_Expect` bound to `client`.

        Args:
            value: Value to make assertions on. When omitted, a new `_Expect` is
                returned instead of a `_Matcher`.
            client: Client for the new `_Expect`, or for the matcher when a value
                is also given.

        Returns:
            A `_Matcher` for `value` when one is given, otherwise a new `_Expect`.
        """
        if value is NOT_GIVEN:
            return _Expect(client=client)
        if client is None:
            # Reuse this instance: the matcher keeps the client this `_Expect` was
            # configured with, and no extra executor / atexit hook is created for
            # every `expect(value)` call.
            return self.value(value)
        return _Expect(client=client).value(value)

    def _submit_feedback(self, key: str, results: dict):
        """Queue feedback for the current run unless test tracking is disabled.

        Args:
            key: Feedback key.
            results: Extra keyword arguments passed through to `create_feedback`.
        """
        current_run = rh.get_current_run_tree()
        run_id = current_run.trace_id if current_run else None
        if not ls_utils.test_tracking_is_disabled():
            if not self._client:
                self._client = rt.get_cached_client()
            self.executor.submit(
                self._client.create_feedback, run_id=run_id, key=key, **results
            )


expect = _Expect()

__all__ = ["expect"]