kiln_ai.adapters.eval.g_eval

  1import math
  2from typing import Dict, List, Tuple
  3
  4from litellm.types.utils import ChatCompletionTokenLogprob
  5
  6from kiln_ai.adapters.adapter_registry import adapter_for_task
  7from kiln_ai.adapters.eval.base_eval import BaseEval
  8from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
  9from kiln_ai.adapters.prompt_builders import PromptGenerators
 10from kiln_ai.datamodel import Project, Task, TaskRun
 11from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
 12from kiln_ai.datamodel.task import RunConfig
 13
 14# all the tokens we score for, and their float scores.
 15TOKEN_TO_SCORE_MAP: Dict[str, float] = {
 16    "1": 1.0,
 17    "2": 2.0,
 18    "3": 3.0,
 19    "4": 4.0,
 20    "5": 5.0,
 21    "pass": 1.0,
 22    "fail": 0.0,
 23    "critical": -1.0,
 24}
 25
 26
 27class GEvalTask(Task, parent_of={}):
 28    """
 29    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
 30
 31    Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
 32    """
 33
 34    def __init__(self, eval_config: EvalConfig):
 35        tmp_project = Project(name="GEval")
 36
 37        # Build a simple LLM as Judge system instruction
 38        system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
 39        # Optionally add a short task description
 40        task_description = eval_config.properties.get("task_description", None)
 41        if task_description:
 42            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n{task_description}\n</eval_data>\n"
 43
 44        # Build the COT eval instructions
 45        cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
 46        steps = eval_config.properties.get("eval_steps", None)
 47        if not steps or not isinstance(steps, list):
 48            raise ValueError("eval_steps must be a list")
 49        for i, step in enumerate(steps):
 50            cot_instructions += f"{i + 1}) {step}\n"
 51
 52        eval = eval_config.parent_eval()
 53        if not eval:
 54            raise ValueError("Eval config must have a parent eval")
 55
 56        # Build the output schema from the eval's target output scores.
 57        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
 58        # However, the final scores from the evaluator can be a float: the G-Eval logprob calculation below requires discrete token outputs, but produces a weighted (float) score.
 59        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)
 60
 61        super().__init__(
 62            name="GEval Task",
 63            parent=tmp_project,
 64            instruction=system_instruction,
 65            thinking_instruction=cot_instructions,
 66            output_json_schema=output_schema,
 67        )
 68
 69
 70class GEval(BaseEval):
 71    """
 72    An evaluator that implements G-Eval and LLM as Judge.
 73
 74    G-Eval scores the quality of a model's output as a weighted average over the candidate rating tokens, where the weights are the judge model's log probabilities for those tokens. https://arxiv.org/abs/2303.16634
 75
 76    LLM as Judge scores the quality of a model's output by simply asking the judge LLM for a score and using that returned output directly (no logprobs needed). Also called direct evaluation.
 77
 78    @misc{liu2023gevalnlgevaluationusing,
 79        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
 80        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
 81        year={2023},
 82        eprint={2303.16634},
 83        archivePrefix={arXiv},
 84        primaryClass={cs.CL},
 85        url={https://arxiv.org/abs/2303.16634},
 86    }
 87    """
 88
 89    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
 90        if (
 91            eval_config.config_type != EvalConfigType.g_eval
 92            and eval_config.config_type != EvalConfigType.llm_as_judge
 93        ):
 94            raise ValueError(
 95                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
 96            )
 97
 98        super().__init__(eval_config, run_config)
 99
100        self.geval_task = GEvalTask(eval_config)
101
102    async def run_eval(
103        self, task_run: TaskRun
104    ) -> tuple[EvalScores, Dict[str, str] | None]:
105        """
106        Run this eval on the given task run.
107        """
108
109        model_name, provider = self.model_and_provider()
110
111        # Only fetch logprobs for G-Eval
112        # There are at most 5 valid rating tokens per rating type (five_star being the largest), so 10 is more than enough to capture even very unlikely rating tokens
113        top_logprobs = (
114            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
115        )
116
117        adapter = adapter_for_task(
118            self.geval_task,
119            model_name,
120            provider,
121            # We always use Simple COT for G-Eval and LLM as Judge
122            prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
123            base_adapter_config=AdapterConfig(
124                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
125                allow_saving=False,
126                top_logprobs=top_logprobs,
127            ),
128        )
129
130        input = f"""The model was given the following input for the task: 
131<eval_data>
132{task_run.input}
133</eval_data>
134
135The model produced the following output for the task:
136<eval_data>
137{task_run.output}
138</eval_data>
139"""
140
141        # We don't need the run itself, but invoke_returning_run_output() runs validations for us (unlike calling _run() directly)
142        _, run_output = await adapter.invoke_returning_run_output(input)
143
144        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
145            return self.build_llm_as_judge_score(
146                run_output
147            ), run_output.intermediate_outputs
148        else:
149            return self.build_g_eval_score(run_output), run_output.intermediate_outputs
150
151    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
152        """
153        Build the LLM as Judge score for the given run and run output.
154        """
155        # Convert the output format we asked for (discrete values) to our float scores
156        scores: EvalScores = {}
157        if not isinstance(run_output.output, dict):
158            raise ValueError("LLM as Judge output must be a dictionary")
159
160        for metric, score in run_output.output.items():
161            token_score = self.score_from_token_string(f"{score}")
162            if token_score is None:
163                raise ValueError(
164                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
165                )
166            scores[metric] = token_score
167        return scores
168
169    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
170        """
171        Build the G-Eval score for the given run and run output.
172
173        We create a weighted average of each rating using the logprobs.
174
175        @misc{liu2023gevalnlgevaluationusing,
176            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
177            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
178            year={2023},
179            eprint={2303.16634},
180            archivePrefix={arXiv},
181            primaryClass={cs.CL},
182            url={https://arxiv.org/abs/2303.16634},
183        }
184        """
185        # We use structured output
186        outputs = run_output.output
187        assert isinstance(outputs, dict)
188
189        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
190        raw_output = self.raw_output_from_logprobs(run_output)
191
192        # find the offset to the start of each metric in the raw output json
193        metrics: List[str] = list(outputs.keys())
194        metric_offsets = self.metric_offsets(raw_output, metrics)
195
196        final_scores: EvalScores = {}
197        for metric in metrics:
198            score = self.g_eval_single_metric(
199                run_output, metric, metric_offsets, raw_output
200            )
201            if score is None:
202                raise ValueError(
203                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
204                )
205            final_scores[metric] = score
206
207        return final_scores
208
209    def g_eval_single_metric(
210        self,
211        run_output: RunOutput,
212        metric: str,
213        metric_offsets: Dict[str, int],
214        raw_output: str,
215    ) -> float | None:
216        """
217        Run the G-Eval for a single metric.
218
219        Scan the logprobs for the metric and return the weighted score of the rating token.
220        """
221
222        start_offset, end_offset = self.token_search_range(
223            raw_output, metric, metric_offsets
224        )
225
226        offset = 0
227
228        if (
229            run_output.output_logprobs is None
230            or run_output.output_logprobs.content is None
231        ):
232            raise RuntimeError(
233                "No logprobs found for output - can not calculate g-eval"
234            )
235
236        # scan the tokens in the range, looking for the rating token
237        for _, chat_logprob in enumerate(run_output.output_logprobs.content):
238            if offset >= end_offset:
239                break
240            if offset >= start_offset:
241                score = self.rating_token_to_score(chat_logprob)
242                if score is not None:
243                    return score
244            offset += len(chat_logprob.token)
245
246        return None
247
248    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
249        """
250        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
251        """
252        if (
253            run_output.output_logprobs is None
254            or run_output.output_logprobs.content is None
255        ):
256            raise RuntimeError(
257                "No logprobs found for output - can not calculate g-eval"
258            )
259
260        raw = ""
261        for chat_logprob in run_output.output_logprobs.content:
262            raw += chat_logprob.token
263        return raw
264
265    def token_search_range(
266        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
267    ) -> Tuple[int, int]:
268        """
269        Find the start and end offsets of the metric in the raw output.
270
271        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
272        """
273        start_offset = metric_offsets[metric] + len(metric)
274
275        # Find the lowest end offset that is greater than the start offset
276        end_offset = len(raw_output)
277        for v in list(metric_offsets.values()):
278            if v < end_offset and v > start_offset:
279                end_offset = v
280
281        return start_offset, end_offset
282
283    def rating_token_to_score(
284        self, token_logprob: ChatCompletionTokenLogprob
285    ) -> float | None:
286        """
287        Convert a rating token to a score using weighted average of top logprobs.
288
289        Only includes tokens that have valid scores.
290
291        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
292        """
293        primary_token_score = self.score_from_token_string(token_logprob.token)
294        # check this is a real rating token, it could just be the ": ", "," or whitespace
295        if primary_token_score is None:
296            return None
297
298        total_score = 0.0
299        total_probability = 0.0
300
301        # Process all valid scoring tokens
302        for top_logprob in token_logprob.top_logprobs:
303            token_score = self.score_from_token_string(top_logprob.token)
304            if token_score is not None:
305                # Convert logprob to probability
306                probability = math.exp(top_logprob.logprob)
307                total_score += token_score * probability
308                total_probability += probability
309
310        if total_probability <= 0.0:
311            raise RuntimeError(
312                f"No valid scoring tokens found for {token_logprob.token}. This should never happen. Please file a bug if you see this."
313            )
314
315        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
316        weighted_score = total_score / total_probability
317
318        return weighted_score
319
320    def score_from_token_string(self, token: str) -> float | None:
321        if token in TOKEN_TO_SCORE_MAP:
322            return TOKEN_TO_SCORE_MAP[token]
323
324        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
325        unquoted_token = token.strip().strip('"').lower()
326        if unquoted_token in TOKEN_TO_SCORE_MAP:
327            return TOKEN_TO_SCORE_MAP[unquoted_token]
328
329        # handle numeric tokens like "1.0"
330        try:
331            float_value = float(token)
332            if float_value.is_integer():
333                str_token = str(int(float_value))
334                if str_token in TOKEN_TO_SCORE_MAP:
335                    return TOKEN_TO_SCORE_MAP[str_token]
336        except ValueError:
337            pass
338
339        return None
340
341    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
342        """
343        Find the offset to the start of each metric in the raw output json
344
345        For the example raw output: `{"overall_rating": 1}`
346
347        should return:
348        {
349            "overall_rating": 1 # the quoted metric name starts 1 character into the json string
350        }
351        """
352        metric_offsets: Dict[str, int] = {}
353        for metric in metrics:
354            # the quoted metric name is expected in the json, e.g. `"overall_rating"` in `{"overall_rating": 1}`
355            metric_name = f'"{metric}"'
356
357            # we expect it exactly once
358            count = raw_output.count(metric_name)
359            if count != 1:
360                raise ValueError(
361                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
362                )
363
364            offset = raw_output.find(metric_name)
365            if offset == -1:
366                raise ValueError(f"Metric {metric} not found in raw output")
367            metric_offsets[metric] = offset
368        return metric_offsets
TOKEN_TO_SCORE_MAP: Dict[str, float] = {'1': 1.0, '2': 2.0, '3': 3.0, '4': 4.0, '5': 5.0, 'pass': 1.0, 'fail': 0.0, 'critical': -1.0}
class GEvalTask(kiln_ai.datamodel.task.Task):

Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
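A minimal sketch of the EvalConfig properties this task reads, and how the eval steps are numbered into the chain-of-thought instructions (the property values below are illustrative, not library fixtures), mirroring the loop in __init__ in the module source above:

    # Illustrative values only: GEvalTask reads "eval_steps" (required, a list)
    # and "task_description" (optional) from eval_config.properties.
    properties = {
        "task_description": "Summarize a news article in one paragraph.",
        "eval_steps": [
            "Check that the summary covers the main points of the article.",
            "Check that the summary does not invent facts that are not in the article.",
        ],
    }

    # The steps are numbered into the task's thinking instruction:
    cot = "First, think step by step about the model's performance following these evaluation steps:\n\n"
    for i, step in enumerate(properties["eval_steps"]):
        cot += f"{i + 1}) {step}\n"
    print(cot)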

GEvalTask(eval_config: kiln_ai.datamodel.eval.EvalConfig)


class GEval(kiln_ai.adapters.eval.base_eval.BaseEval):

An evaluator that implements G-Eval and LLM as Judge.

G-Eval scores the quality of a model's output as a weighted average over the candidate rating tokens, where the weights are the judge model's log probabilities for those tokens. https://arxiv.org/abs/2303.16634

LLM as Judge scores the quality of a model's output by simply asking the judge LLM for a score and using that returned output directly (no logprobs needed). Also called direct evaluation.

@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }
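The core G-Eval idea in isolation: the judge's top logprobs for the rating token are converted to probabilities and used to weight the candidate scores. A minimal sketch with made-up logprobs (not the library API):

    import math

    # Hypothetical top logprobs for the rating-token position: token -> logprob
    top_logprobs = {"4": math.log(0.6), "5": math.log(0.3), "3": math.log(0.1)}

    # Convert logprobs to probabilities and take the probability-weighted average score
    probs = {token: math.exp(lp) for token, lp in top_logprobs.items()}
    weighted = sum(float(token) * p for token, p in probs.items()) / sum(probs.values())
    print(round(weighted, 2))  # 4.2, instead of the single sampled rating of 4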

GEval(eval_config: kiln_ai.datamodel.eval.EvalConfig, run_config: kiln_ai.datamodel.task.RunConfig | None)
geval_task
async def run_eval(self, task_run: kiln_ai.datamodel.TaskRun) -> tuple[typing.Dict[str, float], typing.Optional[typing.Dict[str, str]]]:

Run this eval on the given task run.
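A minimal usage sketch, assuming you already have a persisted EvalConfig (with a parent Eval) and a TaskRun to score; my_eval_config and my_task_run are placeholders, not library fixtures:

    import asyncio

    from kiln_ai.adapters.eval.g_eval import GEval

    async def score_run(eval_config, task_run):
        evaluator = GEval(eval_config, run_config=None)
        scores, intermediate_outputs = await evaluator.run_eval(task_run)
        print(scores)                # e.g. {"overall_rating": 4.25}
        print(intermediate_outputs)  # judge's chain of thought, if any

    # asyncio.run(score_run(my_eval_config, my_task_run))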

def build_llm_as_judge_score(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:

Build the LLM as Judge score for the given run and run output.
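For example, a structured judge output of discrete ratings maps onto float scores via TOKEN_TO_SCORE_MAP. A standalone sketch of that conversion (the judge output shown is invented):

    TOKEN_TO_SCORE_MAP = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0, "pass": 1.0, "fail": 0.0, "critical": -1.0}

    judge_output = {"accuracy": 4, "overall_rating": "pass"}  # hypothetical structured output
    scores = {
        metric: TOKEN_TO_SCORE_MAP[f"{value}".strip().strip('"').lower()]
        for metric, value in judge_output.items()
    }
    print(scores)  # {'accuracy': 4.0, 'overall_rating': 1.0}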

def build_g_eval_score(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> Dict[str, float]:

Build the G-Eval score for the given run and run output.

We create a weighted average of each rating using the logprobs.

@misc{liu2023gevalnlgevaluationusing, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, year={2023}, eprint={2303.16634}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2303.16634}, }

def g_eval_single_metric(self, run_output: kiln_ai.adapters.run_output.RunOutput, metric: str, metric_offsets: Dict[str, int], raw_output: str) -> float | None:

Run the G-Eval for a single metric.

Scan the logprobs for the metric and return the weighted score of the rating token.
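The scan works on character offsets accumulated across the judge's tokens. A small illustration with an invented tokenization of a one-metric JSON output:

    # Hypothetical tokenization of '{"overall_rating": 4}'
    tokens = ['{"', 'overall', '_rating', '":', ' ', '4', '}']

    offset = 0
    for token in tokens:
        print(offset, repr(token))  # the rating token '4' starts at offset 19
        offset += len(token)

Only tokens whose offset falls inside the metric's search range (see token_search_range below) are tested as rating tokens.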

def raw_output_from_logprobs(self, run_output: kiln_ai.adapters.run_output.RunOutput) -> str:

Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets

def token_search_range(self, raw_output: str, metric: str, metric_offsets: Dict[str, int]) -> Tuple[int, int]:

Find the start and end offsets of the metric in the raw output.

Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
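A worked example of the range computed for one metric, assuming the raw judge output and offsets below (both invented for illustration):

    raw_output = '{"accuracy": 4, "overall_rating": 5}'
    metric_offsets = {"accuracy": 1, "overall_rating": 16}  # as metric_offsets() would return

    start_offset = metric_offsets["accuracy"] + len("accuracy")  # 9
    end_offset = metric_offsets["overall_rating"]                # 16, the next metric's offset
    print(start_offset, end_offset)  # 9 16

So the rating token for "accuracy" is searched for between offsets 9 and 16, while the range for "overall_rating" runs from 30 to the end of the string.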

def rating_token_to_score(self, token_logprob: litellm.types.utils.ChatCompletionTokenLogprob) -> float | None:

Convert a rating token to a score using weighted average of top logprobs.

Only includes tokens that have valid scores.

Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
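A numeric sketch of the weighting and renormalization, with invented probabilities (already exp'd from logprobs): probability mass on non-rating tokens is dropped, and the remaining mass is renormalized so it does not drag the score down:

    # token -> probability for one rating-token position (hypothetical values)
    top = {"4": 0.6, "5": 0.2, '",': 0.2}

    valid = {token: p for token, p in top.items() if token in {"1", "2", "3", "4", "5"}}
    score = sum(float(token) * p for token, p in valid.items()) / sum(valid.values())
    print(round(score, 2))  # 4.25 = (4*0.6 + 5*0.2) / 0.8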

def score_from_token_string(self, token: str) -> float | None:
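A standalone sketch of the token cleanup this method performs, just to show which raw tokens it accepts (this mirrors the logic in the module source above; it is not the library function itself):

    TOKEN_TO_SCORE_MAP = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0, "pass": 1.0, "fail": 0.0, "critical": -1.0}

    def score_from_token_string(token: str) -> float | None:
        cleaned = token.strip().strip('"').lower()
        if cleaned in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[cleaned]
        try:
            # handle numeric tokens like "1.0"
            value = float(token)
            if value.is_integer():
                return TOKEN_TO_SCORE_MAP.get(str(int(value)))
        except ValueError:
            pass
        return None

    for token in ['4', '"PASS"', ' fail ', '5.0', '": ']:
        print(repr(token), "->", score_from_token_string(token))
    # '4' -> 4.0, '"PASS"' -> 1.0, ' fail ' -> 0.0, '5.0' -> 5.0, '": ' -> None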
def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:

Find the offset to the start of each metric in the raw output json

For the example raw output {"overall_rating": 1}

should return: { "overall_rating": 1 }, since the quoted metric name starts 1 character into the JSON string.
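A quick check of those offsets on a small, invented judge output:

    raw_output = '{"overall_rating": 4, "accuracy": 5}'
    for metric in ["overall_rating", "accuracy"]:
        print(metric, raw_output.find(f'"{metric}"'))
    # overall_rating 1
    # accuracy 22

Each offset points at the opening quote of the metric name in the JSON string.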