Coverage for langsmith/evaluation/llm_evaluator.py: 28%

106 statements  

« prev     ^ index     » next       coverage.py v7.10.1, created at 2025-12-11 16:15 -0800

1"""Contains the `LLMEvaluator` class for building LLM-as-a-judge evaluators.""" 

2 

3from typing import Any, Callable, Optional, Union, cast 

4 

5from pydantic import BaseModel 

6 

7from langsmith._internal._beta_decorator import warn_beta 

8from langsmith.evaluation import EvaluationResult, EvaluationResults, RunEvaluator 

9from langsmith.schemas import Example, Run 

10 

11 

class CategoricalScoreConfig(BaseModel):
    """Settings for a score whose value is picked from a fixed list of labels.

    `_create_score_json_schema` turns this configuration into a JSON schema
    whose `score` property is a string restricted to `choices`.
    """

    # Used as the schema title and as the key of the resulting evaluation.
    key: str
    # The allowed labels; emitted as the `enum` of the `score` property.
    choices: list[str]
    # Top-level description placed on the generated schema.
    description: str
    # When true, a required string `explanation` property is added.
    include_explanation: bool = False
    # Optional override for the description of the `explanation` property.
    explanation_description: Optional[str] = None

20 

21 

class ContinuousScoreConfig(BaseModel):
    """Settings for a numeric score bounded by `min` and `max`.

    `_create_score_json_schema` turns this configuration into a JSON schema
    whose `score` property is a number with `minimum` / `maximum` bounds.
    """

    # Used as the schema title and as the key of the resulting evaluation.
    key: str
    # Lower bound, emitted as the `minimum` of the `score` property.
    min: float = 0
    # Upper bound, emitted as the `maximum` of the `score` property.
    max: float = 1
    # Top-level description placed on the generated schema.
    description: str
    # When true, a required string `explanation` property is added.
    include_explanation: bool = False
    # Optional override for the description of the `explanation` property.
    explanation_description: Optional[str] = None

31 

32 

def _create_score_json_schema(
    score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
) -> dict:
    """Build the JSON schema describing the judge's structured output.

    Args:
        score_config: The categorical or continuous score configuration.

    Returns:
        An object schema titled with `score_config.key`. It always has a
        required `score` property (string enum for categorical configs,
        bounded number for continuous configs) and, when
        `include_explanation` is set, a required string `explanation`
        property.

    Raises:
        ValueError: If `score_config` is neither a `CategoricalScoreConfig`
            nor a `ContinuousScoreConfig`.
    """
    score_property: dict[str, Any]
    if isinstance(score_config, CategoricalScoreConfig):
        joined_choices = ", ".join(score_config.choices)
        score_property = {
            "type": "string",
            "enum": score_config.choices,
            "description": (
                f"The score for the evaluation, one of {joined_choices}."
            ),
        }
    elif isinstance(score_config, ContinuousScoreConfig):
        lower, upper = score_config.min, score_config.max
        score_property = {
            "type": "number",
            "minimum": lower,
            "maximum": upper,
            "description": (
                f"The score for the evaluation, between {lower} and {upper}, "
                "inclusive."
            ),
        }
    else:
        raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

    properties: dict[str, Any] = {"score": score_property}
    required_fields = ["score"]

    if score_config.include_explanation:
        # Fall back to a generic description when none was configured.
        explanation_text = score_config.explanation_description
        if explanation_text is None:
            explanation_text = "The explanation for the score."
        properties["explanation"] = {
            "type": "string",
            "description": explanation_text,
        }
        required_fields.append("explanation")

    return {
        "title": score_config.key,
        "description": score_config.description,
        "type": "object",
        "properties": properties,
        "required": required_fields,
    }

74 

75 

class LLMEvaluator(RunEvaluator):
    """A class for building LLM-as-a-judge evaluators.

    The evaluator formats a chat prompt from a run (and optionally an
    example), sends it to a chat model constrained to a JSON schema derived
    from the score configuration, and converts the structured reply into an
    `EvaluationResult`.
    """

    def __init__(
        self,
        *,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
        model_name: str = "gpt-4o",
        model_provider: str = "openai",
        **kwargs,
    ):
        """Initialize the `LLMEvaluator`.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                A function that maps the run and example to the variables in the
                prompt.

                If `None`, it is assumed that the prompt only requires 'input',
                'output', and 'expected'.
            model_name (str, optional): The model to use for the evaluation.
            model_provider (str, optional): The model provider to use
                for the evaluation.
            **kwargs: Extra keyword arguments forwarded to `init_chat_model`.

        Raises:
            ImportError: If `langchain` (or `langchain-core`) is not installed.
            ValueError: If the model or the prompt configuration is invalid
                (see `_initialize`).
        """
        try:
            from langchain.chat_models import (  # type: ignore[import-not-found]
                init_chat_model,
            )
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain to be installed. "
                "Please install langchain by running `pip install langchain`."
            ) from e

        chat_model = init_chat_model(
            model=model_name, model_provider=model_provider, **kwargs
        )

        self._initialize(prompt_template, score_config, map_variables, chat_model)

    @classmethod
    def from_model(
        cls,
        model: Any,
        *,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
    ):
        """Create an `LLMEvaluator` instance from a `BaseChatModel` instance.

        Args:
            model (BaseChatModel): The chat model instance to use for the evaluation.
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                A function that maps the run and example to the variables in the
                prompt.

                If `None`, it is assumed that the prompt only requires 'input',
                'output', and 'expected'.

        Returns:
            LLMEvaluator: An instance of `LLMEvaluator`.
        """
        # Bypass __init__ so that no model is built through init_chat_model.
        instance = cls.__new__(cls)
        instance._initialize(prompt_template, score_config, map_variables, model)
        return instance

    def _initialize(
        self,
        prompt_template: Union[str, list[tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]],
        chat_model: Any,
    ):
        """Shared initialization code for `__init__` and `from_model`.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt template.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The score configuration.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                Function to map variables.
            chat_model (BaseChatModel): The chat model instance.

        Raises:
            ImportError: If `langchain-core` is not installed.
            ValueError: If `chat_model` is not a `BaseChatModel` supporting
                structured output, or if the prompt uses variables other than
                'input', 'output' and 'expected' without `map_variables`.
        """
        try:
            from langchain_core.language_models.chat_models import BaseChatModel
            from langchain_core.prompts import ChatPromptTemplate
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain-core to be installed. "
                "Please install langchain-core by running `pip install langchain-core`."
            ) from e

        if not (
            isinstance(chat_model, BaseChatModel)
            and hasattr(chat_model, "with_structured_output")
        ):
            # The message names the class that is actually checked above.
            raise ValueError(
                "chat_model must be an instance of "
                "BaseChatModel and support structured output."
            )

        # A bare string is treated as a single human message.
        messages = (
            [("human", prompt_template)]
            if isinstance(prompt_template, str)
            else prompt_template
        )
        self.prompt = ChatPromptTemplate.from_messages(messages)

        extra_variables = set(self.prompt.input_variables) - {
            "input",
            "output",
            "expected",
        }
        if extra_variables and not map_variables:
            raise ValueError(
                "map_variables must be provided if the prompt template contains "
                "variables other than 'input', 'output', and 'expected'"
            )
        self.map_variables = map_variables

        self.score_config = score_config
        self.score_schema = _create_score_json_schema(self.score_config)

        structured_model = chat_model.with_structured_output(self.score_schema)
        self.runnable = self.prompt | structured_model

    @warn_beta
    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate a run.

        Args:
            run: The run to evaluate.
            example: The optional reference example.

        Returns:
            The evaluation result parsed from the model's structured output.
        """
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, self.runnable.invoke(variables))
        return self._parse_output(output)

    @warn_beta
    async def aevaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Asynchronously evaluate a run.

        Args:
            run: The run to evaluate.
            example: The optional reference example.

        Returns:
            The evaluation result parsed from the model's structured output.
        """
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, await self.runnable.ainvoke(variables))
        return self._parse_output(output)

    @staticmethod
    def _single_value(
        mapping: Optional[dict], *, missing_message: str, multiple_message: str
    ) -> Any:
        """Return the only value of `mapping`, or raise `ValueError`.

        Raises `ValueError(missing_message)` when `mapping` is `None` or empty
        and `ValueError(multiple_message)` when it holds more than one key.
        """
        if not mapping:
            raise ValueError(missing_message)
        if len(mapping) != 1:
            raise ValueError(multiple_message)
        return next(iter(mapping.values()))

    def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
        """Prepare variables for model invocation.

        When `map_variables` was supplied it is the sole source of variables.
        Otherwise 'input', 'output' and 'expected' are filled (only if the
        prompt uses them) from the single value of `run.inputs`,
        `run.outputs` and `example.outputs` respectively.

        Raises:
            ValueError: If a required mapping is missing/empty or has more
                than one key.
        """
        if self.map_variables:
            return self.map_variables(run, example)

        prompt_variables = self.prompt.input_variables
        variables: dict[str, Any] = {}

        if "input" in prompt_variables:
            variables["input"] = self._single_value(
                run.inputs,
                missing_message=(
                    "No input keys are present in run.inputs but the prompt "
                    "requires 'input'."
                ),
                multiple_message=(
                    "Multiple input keys are present in run.inputs. Please provide "
                    "a map_variables function."
                ),
            )

        if "output" in prompt_variables:
            variables["output"] = self._single_value(
                run.outputs,
                missing_message=(
                    "No output keys are present in run.outputs but the prompt "
                    "requires 'output'."
                ),
                multiple_message=(
                    "Multiple output keys are present in run.outputs. Please "
                    "provide a map_variables function."
                ),
            )

        if "expected" in prompt_variables:
            variables["expected"] = self._single_value(
                # A missing example is reported the same way as empty outputs.
                example.outputs if example else None,
                missing_message=(
                    "No example or example outputs is provided but the prompt "
                    "requires 'expected'."
                ),
                multiple_message=(
                    "Multiple output keys are present in example.outputs. Please "
                    "provide a map_variables function."
                ),
            )

        return variables

    def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
        """Parse the model output into an evaluation result.

        Categorical scores are stored in `value`, continuous scores in
        `score`; the optional 'explanation' becomes the result's `comment`.

        Raises:
            ValueError: If `self.score_config` is neither a categorical nor a
                continuous configuration.
        """
        explanation = output.get("explanation", None)
        if isinstance(self.score_config, CategoricalScoreConfig):
            return EvaluationResult(
                key=self.score_config.key, value=output["score"], comment=explanation
            )
        if isinstance(self.score_config, ContinuousScoreConfig):
            return EvaluationResult(
                key=self.score_config.key, score=output["score"], comment=explanation
            )
        # Fail loudly instead of implicitly returning None.
        raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

297 )