Coverage for langsmith/evaluation/integrations/_langchain.py: 33%

46 statements  

« prev     ^ index     » next       coverage.py v7.10.1, created at 2025-12-11 16:15 -0800

1from __future__ import annotations 

2 

3from typing import TYPE_CHECKING, Any, Callable, Optional, TypedDict, Union 

4 

5from langsmith.evaluation.evaluator import DynamicRunEvaluator 

6from langsmith.run_helpers import traceable 

7from langsmith.schemas import Example, Run 

8 

9if TYPE_CHECKING: 

10 from langchain.evaluation.schema import ( # type: ignore[import-not-found] 

11 StringEvaluator, 

12 ) 

13 

14 from langsmith.evaluation.evaluator import ( # type: ignore[import-not-found] 

15 RunEvaluator, 

16 ) 

17 

18 

class SingleEvaluatorInput(TypedDict):
    """Keyword arguments handed to a `StringEvaluator` for a single run.

    A mapping of this shape is unpacked directly into the wrapped evaluator's
    `evaluate_strings` / `aevaluate_strings` call.
    """

    # The prediction string to be scored.
    prediction: str
    # The reference value to compare against; `None` when not supplied.
    reference: Optional[Any]
    # The input string that produced the prediction; `None` when not supplied.
    input: Optional[str]

28 

29 

class LangChainStringEvaluator:
    r"""A class for wrapping a LangChain `StringEvaluator`.

    Requires the `langchain` package to be installed.

    Attributes:
        evaluator (StringEvaluator): The underlying `StringEvaluator`. When the
            constructor is given the name of an evaluator instead, this is
            the evaluator loaded for that name.

    Methods:
        `as_run_evaluator() -> RunEvaluator`:
            Convert the `LangChainStringEvaluator` to a `RunEvaluator`.

    Examples:
        !!! example "Creating a simple `LangChainStringEvaluator`"

            ```python
            evaluator = LangChainStringEvaluator("exact_match")
            ```

        !!! example "Converting a `LangChainStringEvaluator` to a `RunEvaluator`"

            ```python
            from langsmith.evaluation import LangChainStringEvaluator
            from langchain_openai import ChatOpenAI
            evaluator = LangChainStringEvaluator(
                "criteria",
                config={
                    "criteria": {
                        "usefulness": "The prediction is useful if"
                        " it is correct and/or asks a useful followup question."
                    },
                    "llm": ChatOpenAI(model="gpt-4o"),
                },
            )
            run_evaluator = evaluator.as_run_evaluator()
            run_evaluator  # doctest: +ELLIPSIS
            <DynamicRunEvaluator ...>
            ```

        !!! example "Customizing the LLM model used by the evaluator"

            ```python
            from langsmith.evaluation import LangChainStringEvaluator
            from langchain_anthropic import ChatAnthropic
            evaluator = LangChainStringEvaluator(
                "criteria",
                config={
                    "criteria": {
                        "usefulness": "The prediction is useful if"
                        " it is correct and/or asks a useful followup question."
                    },
                    "llm": ChatAnthropic(model="claude-3-opus-20240229"),
                },
            )

            run_evaluator = evaluator.as_run_evaluator()
            run_evaluator  # doctest: +ELLIPSIS
            <DynamicRunEvaluator ...>
            ```

        !!! example "Using the `evaluate` API with different evaluators"

            ```python
            def prepare_data(run: Run, example: Example):
                # Convert the evaluation data into the format expected by the evaluator
                # Only required for datasets with multiple inputs/output keys
                return {
                    "prediction": run.outputs["prediction"],
                    "reference": example.outputs["answer"],
                    "input": str(example.inputs),
                }


            import re
            from langchain_anthropic import ChatAnthropic
            import langsmith
            from langsmith.evaluation import LangChainStringEvaluator, evaluate

            criteria_evaluator = LangChainStringEvaluator(
                "criteria",
                config={
                    "criteria": {
                        "usefulness": "The prediction is useful if it is correct"
                        " and/or asks a useful followup question."
                    },
                    "llm": ChatAnthropic(model="claude-3-opus-20240229"),
                },
                prepare_data=prepare_data,
            )

            embedding_evaluator = LangChainStringEvaluator("embedding_distance")
            exact_match_evaluator = LangChainStringEvaluator("exact_match")
            regex_match_evaluator = LangChainStringEvaluator(
                "regex_match", config={"flags": re.IGNORECASE}, prepare_data=prepare_data
            )

            scoring_evaluator = LangChainStringEvaluator(
                "labeled_score_string",
                config={
                    "criteria": {
                        "accuracy": "Score 1: Completely inaccurate\nScore 5: Somewhat accurate\nScore 10: Completely accurate"
                    },
                    "normalize_by": 10,
                    "llm": ChatAnthropic(model="claude-3-opus-20240229"),
                },
                prepare_data=prepare_data,
            )
            string_distance_evaluator = LangChainStringEvaluator(
                "string_distance",
                config={"distance_metric": "levenshtein"},
                prepare_data=prepare_data,
            )

            from langsmith import Client

            client = Client()
            results = evaluate(
                lambda inputs: {"prediction": "foo"},
                data=client.list_examples(dataset_name="Evaluate Examples", limit=1),
                evaluators=[
                    embedding_evaluator,
                    criteria_evaluator,
                    exact_match_evaluator,
                    regex_match_evaluator,
                    scoring_evaluator,
                    string_distance_evaluator,
                ],
            )  # doctest: +ELLIPSIS
            ```
    """  # noqa: E501

    def __init__(
        self,
        evaluator: Union[StringEvaluator, str],
        *,
        config: Optional[dict] = None,
        prepare_data: Optional[
            Callable[[Run, Optional[Example]], SingleEvaluatorInput]
        ] = None,
    ):
        """Initialize a `LangChainStringEvaluator`.

        Args:
            evaluator (Union[StringEvaluator, str]): Either a ready-made
                `StringEvaluator` instance, or the name of an evaluator to
                load via `langchain.evaluation.load_evaluator`.
            config (Optional[dict]): Keyword arguments forwarded to
                `load_evaluator`. Only used when `evaluator` is a string; it
                is ignored when an instance is passed.
            prepare_data (Optional[Callable]): Function mapping a run and its
                optional example to the evaluator's keyword arguments. When
                omitted, the single value in the run outputs (and, if the
                evaluator requires them, in the example outputs / inputs) is
                used.

        Raises:
            NotImplementedError: If `evaluator` is neither a `StringEvaluator`
                nor a string.
        """
        # Imported lazily so that `langchain` is only needed when this class
        # is actually instantiated.
        from langchain.evaluation.schema import StringEvaluator  # noqa: F811

        if isinstance(evaluator, StringEvaluator):
            self.evaluator = evaluator
        elif isinstance(evaluator, str):
            from langchain.evaluation import (  # type: ignore[import-not-found]
                load_evaluator,  # noqa: F811
            )

            self.evaluator = load_evaluator(evaluator, **(config or {}))  # type: ignore[assignment, arg-type]
        else:
            raise NotImplementedError(f"Unsupported evaluator type: {type(evaluator)}")

        self._prepare_data = prepare_data

    def as_run_evaluator(
        self,
    ) -> RunEvaluator:
        """Convert the `LangChainStringEvaluator` to a `RunEvaluator`.

        This is the object used in the LangSmith `evaluate` API.

        Returns:
            RunEvaluator: The converted `RunEvaluator`.
        """
        # Build the example `prepare_data` snippet shown in error messages.
        # Every entry ends with a comma so the snippet stays valid Python no
        # matter which optional fields the evaluator requires.
        snippet_fields = ["\"prediction\": run.outputs['my_output'],"]
        if self.evaluator.requires_reference:
            snippet_fields.append("\"reference\": example.outputs['expected'],")
        if self.evaluator.requires_input:
            snippet_fields.append("\"input\": example.inputs['input'],")
        snippet_body = "\n".join(f"        {field}" for field in snippet_fields)
        customization_error_str = (
            "\ndef prepare_data(run, example):\n"
            "    return {\n"
            f"{snippet_body}\n"
            "    }\n"
            "evaluator = LangChainStringEvaluator(..., prepare_data=prepare_data)\n"
        )

        @traceable
        def prepare_evaluator_inputs(
            run: Run, example: Optional[Example] = None
        ) -> SingleEvaluatorInput:
            # Without a custom `prepare_data`, exactly one value must be
            # available for each field the evaluator consumes.
            if not run.outputs:
                # Fail with an actionable message instead of an opaque
                # AttributeError / StopIteration from the lookup below.
                raise ValueError(
                    f"Evaluator {self.evaluator} requires a prediction, but the "
                    "run has no outputs. Please ensure that the run has a "
                    "single output."
                    " Or initialize with a prepare_data:\n"
                    f"{customization_error_str}"
                )
            if len(run.outputs) > 1:
                raise ValueError(
                    f"Evaluator {self.evaluator} only supports a single prediction "
                    "key. Please ensure that the run has a single output."
                    " Or initialize with a prepare_data:\n"
                    f"{customization_error_str}"
                )
            if (
                self.evaluator.requires_reference
                and example
                and example.outputs
                and len(example.outputs) > 1
            ):
                raise ValueError(
                    f"Evaluator {self.evaluator} only supports a single reference key. "
                    "Please ensure that the example has a single output."
                    " Or initialize with a prepare_data:\n"
                    f"{customization_error_str}"
                )
            if (
                self.evaluator.requires_input
                and example
                and example.inputs
                and len(example.inputs) > 1
            ):
                raise ValueError(
                    f"Evaluator {self.evaluator} only supports a single input key. "
                    "Please ensure that the example has a single input."
                    " Or initialize with a prepare_data:\n"
                    f"{customization_error_str}"
                )

            return SingleEvaluatorInput(
                prediction=next(iter(run.outputs.values())),
                reference=(
                    next(iter(example.outputs.values()))
                    if (
                        self.evaluator.requires_reference
                        and example
                        and example.outputs
                    )
                    else None
                ),
                input=(
                    next(iter(example.inputs.values()))
                    if (self.evaluator.requires_input and example and example.inputs)
                    else None
                ),
            )

        def resolve_inputs(
            run: Run, example: Optional[Example]
        ) -> SingleEvaluatorInput:
            # A user-supplied `prepare_data` takes precedence over the default
            # single-key extraction.
            if self._prepare_data is None:
                return prepare_evaluator_inputs(run, example)
            return self._prepare_data(run, example)

        @traceable(name=self.evaluator.evaluation_name)
        def evaluate(run: Run, example: Optional[Example] = None) -> dict:
            eval_inputs = resolve_inputs(run, example)
            results = self.evaluator.evaluate_strings(**eval_inputs)
            return {"key": self.evaluator.evaluation_name, **results}

        @traceable(name=self.evaluator.evaluation_name)
        async def aevaluate(run: Run, example: Optional[Example] = None) -> dict:
            eval_inputs = resolve_inputs(run, example)
            results = await self.evaluator.aevaluate_strings(**eval_inputs)
            return {"key": self.evaluator.evaluation_name, **results}

        return DynamicRunEvaluator(evaluate, aevaluate)