Coverage for langsmith/evaluation/llm_evaluator.py: 28%
106 statements
« prev ^ index » next coverage.py v7.10.1, created at 2025-12-11 16:15 -0800
« prev ^ index » next coverage.py v7.10.1, created at 2025-12-11 16:15 -0800
1"""Contains the `LLMEvaluator` class for building LLM-as-a-judge evaluators."""
3from typing import Any, Callable, Optional, Union, cast
5from pydantic import BaseModel
7from langsmith._internal._beta_decorator import warn_beta
8from langsmith.evaluation import EvaluationResult, EvaluationResults, RunEvaluator
9from langsmith.schemas import Example, Run
class CategoricalScoreConfig(BaseModel):
    """Settings for a score chosen from a fixed list of string labels."""

    # Identifier of the score; used as the schema title and the result key.
    key: str
    # The labels the judge model is allowed to pick from.
    choices: list[str]
    # Human-readable description of what the score measures.
    description: str
    # When True, the judge must also return a free-text explanation.
    include_explanation: bool = False
    # Optional custom description for the explanation field; a generic
    # description is substituted when this is left as None.
    explanation_description: Optional[str] = None
class ContinuousScoreConfig(BaseModel):
    """Settings for a numeric score constrained to a closed interval."""

    # Identifier of the score; used as the schema title and the result key.
    key: str
    # Lower bound of the score range (inclusive).
    min: float = 0
    # Upper bound of the score range (inclusive).
    max: float = 1
    # Human-readable description of what the score measures.
    description: str
    # When True, the judge must also return a free-text explanation.
    include_explanation: bool = False
    # Optional custom description for the explanation field; a generic
    # description is substituted when this is left as None.
    explanation_description: Optional[str] = None
def _create_score_json_schema(
    score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
) -> dict:
    """Build the JSON schema the judge model must use for its answer.

    Args:
        score_config: The categorical or continuous score configuration.

    Returns:
        A JSON-schema ``object`` whose title is the score key, with a
        required ``score`` property and, when explanations are enabled,
        a required ``explanation`` property.

    Raises:
        ValueError: If ``score_config`` is neither supported config type.
    """
    score_property: dict[str, Any]
    if isinstance(score_config, CategoricalScoreConfig):
        allowed = ", ".join(score_config.choices)
        score_property = {
            "type": "string",
            "enum": score_config.choices,
            "description": f"The score for the evaluation, one of {allowed}.",
        }
    elif isinstance(score_config, ContinuousScoreConfig):
        low, high = score_config.min, score_config.max
        score_property = {
            "type": "number",
            "minimum": low,
            "maximum": high,
            "description": (
                f"The score for the evaluation, between {low} and {high}, "
                "inclusive."
            ),
        }
    else:
        raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

    schema_properties: dict[str, Any] = {"score": score_property}
    required_fields = ["score"]

    if score_config.include_explanation:
        explanation_text = score_config.explanation_description
        # Only a missing (None) description falls back to the generic text;
        # an explicitly provided string is used verbatim.
        if explanation_text is None:
            explanation_text = "The explanation for the score."
        schema_properties["explanation"] = {
            "type": "string",
            "description": explanation_text,
        }
        required_fields.append("explanation")

    return {
        "title": score_config.key,
        "description": score_config.description,
        "type": "object",
        "properties": schema_properties,
        "required": required_fields,
    }
class LLMEvaluator(RunEvaluator):
    """A class for building LLM-as-a-judge evaluators."""

    def __init__(
        self,
        *,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
        model_name: str = "gpt-4o",
        model_provider: str = "openai",
        **kwargs,
    ):
        """Initialize the `LLMEvaluator`.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                A function that maps the run and example to the variables in the
                prompt.

                If `None`, it is assumed that the prompt only requires 'input',
                'output', and 'expected'.
            model_name (str): The model to use for the evaluation.
            model_provider (str): The model provider to use for the evaluation.
            **kwargs: Extra keyword arguments forwarded to `init_chat_model`.

        Raises:
            ImportError: If `langchain` is not installed.
        """
        try:
            from langchain.chat_models import (  # type: ignore[import-not-found]
                init_chat_model,
            )
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain to be installed. "
                "Please install langchain by running `pip install langchain`."
            ) from e

        chat_model = init_chat_model(
            model=model_name, model_provider=model_provider, **kwargs
        )

        self._initialize(prompt_template, score_config, map_variables, chat_model)

    @classmethod
    def from_model(
        cls,
        model: Any,
        *,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
    ):
        """Create an `LLMEvaluator` instance from a `BaseChatModel` instance.

        Args:
            model (BaseChatModel): The chat model instance to use for the evaluation.
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                A function that maps the run and example to the variables in the
                prompt.

                If `None`, it is assumed that the prompt only requires 'input',
                'output', and 'expected'.

        Returns:
            LLMEvaluator: An instance of `LLMEvaluator`.
        """
        # Bypass __init__ on purpose: it would build a new chat model via
        # `init_chat_model`, whereas here the caller supplies one.
        instance = cls.__new__(cls)
        instance._initialize(prompt_template, score_config, map_variables, model)
        return instance

    def _initialize(
        self,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]],
        chat_model: Any,
    ):
        """Shared initialization code for `__init__` and `from_model`.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt template.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The score configuration.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                Function to map variables.
            chat_model (BaseChatModel): The chat model instance.

        Raises:
            ImportError: If `langchain-core` is not installed.
            ValueError: If `chat_model` is not a `BaseChatModel` supporting
                structured output, or if the prompt uses variables other than
                'input', 'output' and 'expected' without `map_variables`.
        """
        try:
            from langchain_core.language_models.chat_models import BaseChatModel
            from langchain_core.prompts import ChatPromptTemplate
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain-core to be installed. "
                "Please install langchain-core by running `pip install langchain-core`."
            ) from e

        if not (
            isinstance(chat_model, BaseChatModel)
            and hasattr(chat_model, "with_structured_output")
        ):
            raise ValueError(
                "chat_model must be an instance of "
                "BaseChatModel and support structured output."
            )

        if isinstance(prompt_template, str):
            # A bare string is treated as a single human / user message.
            self.prompt = ChatPromptTemplate.from_messages([("human", prompt_template)])
        else:
            self.prompt = ChatPromptTemplate.from_messages(prompt_template)

        # Variables beyond the three built-in ones cannot be filled in
        # automatically, so a mapping function becomes mandatory.
        if set(self.prompt.input_variables) - {"input", "output", "expected"}:
            if not map_variables:
                raise ValueError(
                    "map_variables must be provided if the prompt template contains "
                    "variables other than 'input', 'output', and 'expected'"
                )
        self.map_variables = map_variables

        self.score_config = score_config
        self.score_schema = _create_score_json_schema(self.score_config)

        chat_model = chat_model.with_structured_output(self.score_schema)
        self.runnable = self.prompt | chat_model

    @warn_beta
    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate a run.

        Args:
            run: The run to score.
            example: The reference example, if any.

        Returns:
            The evaluation result parsed from the model's structured output.
        """
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, self.runnable.invoke(variables))
        return self._parse_output(output)

    @warn_beta
    async def aevaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Asynchronously evaluate a run.

        Args:
            run: The run to score.
            example: The reference example, if any.

        Returns:
            The evaluation result parsed from the model's structured output.
        """
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, await self.runnable.ainvoke(variables))
        return self._parse_output(output)

    def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
        """Prepare variables for model invocation.

        When `map_variables` was supplied it is used as-is. Otherwise each
        of 'input', 'output' and 'expected' that the prompt references is
        filled from the single value in `run.inputs`, `run.outputs` and
        `example.outputs` respectively.

        Args:
            run: The run being evaluated.
            example: The reference example, if any.

        Returns:
            A dict mapping prompt variable names to their values.

        Raises:
            ValueError: If a required source is missing or empty, or holds
                more than one key (which is ambiguous without
                `map_variables`).
        """
        if self.map_variables:
            return self.map_variables(run, example)

        variables: dict[str, Any] = {}
        prompt_variables = self.prompt.input_variables

        if "input" in prompt_variables:
            # `not` covers both an empty dict and None.
            if not run.inputs:
                raise ValueError(
                    "No input keys are present in run.inputs but the prompt "
                    "requires 'input'."
                )
            if len(run.inputs) != 1:
                raise ValueError(
                    "Multiple input keys are present in run.inputs. Please provide "
                    "a map_variables function."
                )
            variables["input"] = next(iter(run.inputs.values()))

        if "output" in prompt_variables:
            if not run.outputs:
                raise ValueError(
                    "No output keys are present in run.outputs but the prompt "
                    "requires 'output'."
                )
            if len(run.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in run.outputs. Please "
                    "provide a map_variables function."
                )
            variables["output"] = next(iter(run.outputs.values()))

        if "expected" in prompt_variables:
            if not example or not example.outputs:
                raise ValueError(
                    "No example or example outputs is provided but the prompt "
                    "requires 'expected'."
                )
            if len(example.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in example.outputs. Please "
                    "provide a map_variables function."
                )
            variables["expected"] = next(iter(example.outputs.values()))

        return variables

    def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
        """Parse the model output into an evaluation result.

        Args:
            output: The structured model output; must contain 'score' and
                may contain 'explanation'.

        Returns:
            An `EvaluationResult` keyed by the score config's key. A
            categorical score is stored in `value`, a continuous score in
            `score`; the explanation (if any) is stored in `comment`.

        Raises:
            KeyError: If `output` has no 'score' entry.
            ValueError: If the score config is of an unsupported type.
        """
        if isinstance(self.score_config, CategoricalScoreConfig):
            return EvaluationResult(
                key=self.score_config.key,
                value=output["score"],
                comment=output.get("explanation"),
            )
        if isinstance(self.score_config, ContinuousScoreConfig):
            return EvaluationResult(
                key=self.score_config.key,
                score=output["score"],
                comment=output.get("explanation"),
            )
        # Fail loudly rather than implicitly returning None.
        raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")