Coverage for langsmith/evaluation/integrations/_langchain.py: 33%
46 statements
« prev ^ index » next coverage.py v7.10.1, created at 2025-12-11 16:15 -0800
« prev ^ index » next coverage.py v7.10.1, created at 2025-12-11 16:15 -0800
1from __future__ import annotations
3from typing import TYPE_CHECKING, Any, Callable, Optional, TypedDict, Union
5from langsmith.evaluation.evaluator import DynamicRunEvaluator
6from langsmith.run_helpers import traceable
7from langsmith.schemas import Example, Run
9if TYPE_CHECKING:
10 from langchain.evaluation.schema import ( # type: ignore[import-not-found]
11 StringEvaluator,
12 )
14 from langsmith.evaluation.evaluator import ( # type: ignore[import-not-found]
15 RunEvaluator,
16 )
class SingleEvaluatorInput(TypedDict):
    """Keyword arguments handed to a `StringEvaluator`."""

    prediction: str
    """Text produced by the run that is being scored."""
    reference: Optional[Any]
    """Expected value to compare the prediction against, if any."""
    input: Optional[str]
    """Input that the prediction was generated from, if any."""
class LangChainStringEvaluator:
    r"""A class for wrapping a LangChain `StringEvaluator`.

    Requires the `langchain` package to be installed.

    Attributes:
        evaluator (StringEvaluator): The underlying `StringEvaluator`. When the
            constructor receives an evaluator name instead of an instance, this
            is the evaluator loaded for that name.

    Methods:
        `as_run_evaluator() -> RunEvaluator`:
            Convert the `LangChainStringEvaluator` to a `RunEvaluator`.

    Examples:
        !!! example "Creating a simple `LangChainStringEvaluator`"

            ```python
            evaluator = LangChainStringEvaluator("exact_match")
            ```

        !!! example "Converting a `LangChainStringEvaluator` to a `RunEvaluator`"

            ```python
            from langsmith.evaluation import LangChainStringEvaluator
            from langchain_openai import ChatOpenAI
            evaluator = LangChainStringEvaluator(
                "criteria",
                config={
                    "criteria": {
                        "usefulness": "The prediction is useful if"
                        " it is correct and/or asks a useful followup question."
                    },
                    "llm": ChatOpenAI(model="gpt-4o"),
                },
            )
            run_evaluator = evaluator.as_run_evaluator()
            run_evaluator  # doctest: +ELLIPSIS
            <DynamicRunEvaluator ...>
            ```

        !!! example "Customizing the LLM model used by the evaluator"

            ```python
            from langsmith.evaluation import LangChainStringEvaluator
            from langchain_anthropic import ChatAnthropic
            evaluator = LangChainStringEvaluator(
                "criteria",
                config={
                    "criteria": {
                        "usefulness": "The prediction is useful if"
                        " it is correct and/or asks a useful followup question."
                    },
                    "llm": ChatAnthropic(model="claude-3-opus-20240229"),
                },
            )

            run_evaluator = evaluator.as_run_evaluator()
            run_evaluator  # doctest: +ELLIPSIS
            <DynamicRunEvaluator ...>
            ```

        !!! example "Using the `evaluate` API with different evaluators"

            ```python
            def prepare_data(run: Run, example: Example):
                # Convert the evaluation data into the format expected by the evaluator
                # Only required for datasets with multiple inputs/output keys
                return {
                    "prediction": run.outputs["prediction"],
                    "reference": example.outputs["answer"],
                    "input": str(example.inputs),
                }


            import re
            from langchain_anthropic import ChatAnthropic
            import langsmith
            from langsmith.evaluation import LangChainStringEvaluator, evaluate

            criteria_evaluator = LangChainStringEvaluator(
                "criteria",
                config={
                    "criteria": {
                        "usefulness": "The prediction is useful if it is correct"
                        " and/or asks a useful followup question."
                    },
                    "llm": ChatAnthropic(model="claude-3-opus-20240229"),
                },
                prepare_data=prepare_data,
            )

            embedding_evaluator = LangChainStringEvaluator("embedding_distance")
            exact_match_evaluator = LangChainStringEvaluator("exact_match")
            regex_match_evaluator = LangChainStringEvaluator(
                "regex_match", config={"flags": re.IGNORECASE}, prepare_data=prepare_data
            )

            scoring_evaluator = LangChainStringEvaluator(
                "labeled_score_string",
                config={
                    "criteria": {
                        "accuracy": "Score 1: Completely inaccurate\nScore 5: Somewhat accurate\nScore 10: Completely accurate"
                    },
                    "normalize_by": 10,
                    "llm": ChatAnthropic(model="claude-3-opus-20240229"),
                },
                prepare_data=prepare_data,
            )
            string_distance_evaluator = LangChainStringEvaluator(
                "string_distance",
                config={"distance_metric": "levenshtein"},
                prepare_data=prepare_data,
            )

            from langsmith import Client

            client = Client()
            results = evaluate(
                lambda inputs: {"prediction": "foo"},
                data=client.list_examples(dataset_name="Evaluate Examples", limit=1),
                evaluators=[
                    embedding_evaluator,
                    criteria_evaluator,
                    exact_match_evaluator,
                    regex_match_evaluator,
                    scoring_evaluator,
                    string_distance_evaluator,
                ],
            )  # doctest: +ELLIPSIS
            ```
    """  # noqa: E501

    def __init__(
        self,
        evaluator: Union[StringEvaluator, str],
        *,
        config: Optional[dict] = None,
        prepare_data: Optional[
            Callable[[Run, Optional[Example]], SingleEvaluatorInput]
        ] = None,
    ):
        """Initialize a `LangChainStringEvaluator`.

        Args:
            evaluator (Union[StringEvaluator, str]): The underlying
                `StringEvaluator`, or the name of an evaluator to load with
                LangChain's `load_evaluator`.
            config (Optional[dict]): Keyword arguments forwarded to
                `load_evaluator`. Only used when `evaluator` is a name.
            prepare_data (Optional[Callable]): Function that maps a run and an
                optional example to the evaluator's keyword arguments. When
                omitted, the single output/input values of the run and example
                are used.

        Raises:
            NotImplementedError: If `evaluator` is neither a `StringEvaluator`
                nor a string.
        """
        # Imported lazily so that `langchain` is only needed once this class is
        # actually instantiated.
        from langchain.evaluation.schema import StringEvaluator  # noqa: F811

        if isinstance(evaluator, StringEvaluator):
            self.evaluator = evaluator
        elif isinstance(evaluator, str):
            from langchain.evaluation import (  # type: ignore[import-not-found]
                load_evaluator,  # noqa: F811
            )

            self.evaluator = load_evaluator(evaluator, **(config or {}))  # type: ignore[assignment, arg-type]
        else:
            raise NotImplementedError(f"Unsupported evaluator type: {type(evaluator)}")

        self._prepare_data = prepare_data

    def as_run_evaluator(
        self,
    ) -> RunEvaluator:
        """Convert the `LangChainStringEvaluator` to a `RunEvaluator`.

        This is the object used in the LangSmith `evaluate` API.

        The returned evaluator builds the wrapped evaluator's inputs with the
        `prepare_data` callable given at construction time, or, when none was
        given, from the single output of the run and the single output/input
        of the example. In the latter case a `ValueError` is raised at
        evaluation time if the run has no outputs, or if the run or example
        has more than one candidate key.

        Returns:
            RunEvaluator: The converted `RunEvaluator`.
        """
        # Optional lines of the `prepare_data` snippet shown in error messages.
        # Each one ends with a comma so the snippet is valid Python whichever
        # combination of lines is included.
        input_str = (
            "\n        \"input\": example.inputs['input'],"
            if self.evaluator.requires_input
            else ""
        )
        reference_str = (
            "\n        \"reference\": example.outputs['expected'],"
            if self.evaluator.requires_reference
            else ""
        )
        customization_error_str = f"""
def prepare_data(run, example):
    return {{
        "prediction": run.outputs['my_output'],{reference_str}{input_str}
    }}
evaluator = LangChainStringEvaluator(..., prepare_data=prepare_data)
"""

        @traceable
        def prepare_evaluator_inputs(
            run: Run, example: Optional[Example] = None
        ) -> SingleEvaluatorInput:
            # Fail with an actionable message instead of the AttributeError /
            # StopIteration that taking the first output value would raise.
            if not run.outputs:
                raise ValueError(
                    f"Evaluator {self.evaluator} requires a prediction, but the "
                    "run has no outputs. Please ensure that the run has a single "
                    "output. Or initialize with a prepare_data:\n"
                    f"{customization_error_str}"
                )
            if len(run.outputs) > 1:
                raise ValueError(
                    f"Evaluator {self.evaluator} only supports a single prediction "
                    "key. Please ensure that the run has a single output."
                    " Or initialize with a prepare_data:\n"
                    f"{customization_error_str}"
                )
            if (
                self.evaluator.requires_reference
                and example
                and example.outputs
                and len(example.outputs) > 1
            ):
                raise ValueError(
                    f"Evaluator {self.evaluator} only supports a single reference key. "
                    "Please ensure that the example has a single output."
                    " Or create a custom evaluator yourself:\n"
                    f"{customization_error_str}"
                )
            if (
                self.evaluator.requires_input
                and example
                and example.inputs
                and len(example.inputs) > 1
            ):
                raise ValueError(
                    f"Evaluator {self.evaluator} only supports a single input key. "
                    "Please ensure that the example has a single input."
                    " Or initialize with a prepare_data:\n"
                    f"{customization_error_str}"
                )

            # Exactly one value per mapping remains at this point, so the first
            # value is the only value.
            return SingleEvaluatorInput(
                prediction=next(iter(run.outputs.values())),
                reference=(
                    next(iter(example.outputs.values()))
                    if (
                        self.evaluator.requires_reference
                        and example
                        and example.outputs
                    )
                    else None
                ),
                input=(
                    next(iter(example.inputs.values()))
                    if (self.evaluator.requires_input and example and example.inputs)
                    else None
                ),
            )

        def resolve_inputs(
            run: Run, example: Optional[Example]
        ) -> SingleEvaluatorInput:
            # Shared by the sync and async paths: a user-supplied `prepare_data`
            # takes precedence over the default single-key extraction.
            if self._prepare_data is None:
                return prepare_evaluator_inputs(run, example)
            return self._prepare_data(run, example)

        @traceable(name=self.evaluator.evaluation_name)
        def evaluate(run: Run, example: Optional[Example] = None) -> dict:
            eval_inputs = resolve_inputs(run, example)
            results = self.evaluator.evaluate_strings(**eval_inputs)
            return {"key": self.evaluator.evaluation_name, **results}

        @traceable(name=self.evaluator.evaluation_name)
        async def aevaluate(run: Run, example: Optional[Example] = None) -> dict:
            eval_inputs = resolve_inputs(run, example)
            results = await self.evaluator.aevaluate_strings(**eval_inputs)
            return {"key": self.evaluator.evaluation_name, **results}

        return DynamicRunEvaluator(evaluate, aevaluate)