kiln_ai.datamodel

See our docs for details about our datamodel: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html

  1"""
  2See our docs for details about our datamodel: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html
  3"""
  4
  5from __future__ import annotations
  6
  7import json
  8import math
  9import random
 10from enum import Enum, IntEnum
 11from typing import TYPE_CHECKING, Callable, Dict, List, Type, Union
 12
 13import jsonschema
 14import jsonschema.exceptions
 15from pydantic import (
 16    BaseModel,
 17    Field,
 18    ValidationInfo,
 19    model_validator,
 20)
 21from typing_extensions import Self
 22
 23from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
 24
 25from .basemodel import (
 26    ID_FIELD,
 27    ID_TYPE,
 28    NAME_FIELD,
 29    SHORT_NAME_FIELD,
 30    KilnBaseModel,
 31    KilnParentedModel,
 32    KilnParentModel,
 33)
 34from .json_schema import validate_schema
 35
 36if TYPE_CHECKING:
 37    from . import Task
 38
 39
 40__all__ = [
 41    "basemodel",
 42    "json_schema",
 43    "Task",
 44    "Project",
 45    "TaskRun",
 46    "TaskOutput",
 47    "TaskOutputRating",
 48    "Priority",
 49    "DataSource",
 50    "DataSourceType",
 51    "DataSourceProperty",
 52    "TaskOutputRatingType",
 53    "TaskRequirement",
 54    "TaskDeterminism",
 55    "strict_mode",
 56    "set_strict_mode",
 57]
 58
 59
  60# We want to be strict about the completeness of data generated by the Kiln App, without making the datamodel/library hard to use.
  61# Strict mode enables extra validations that we want to enforce in the Kiln App (and in any other client that wants best practices), but not in the library unless users opt in.
 62_strict_mode: bool = False
 63
 64
 65def strict_mode() -> bool:
 66    return _strict_mode
 67
 68
 69def set_strict_mode(value: bool) -> None:
 70    global _strict_mode
 71    _strict_mode = value
 72
 73
 74class Priority(IntEnum):
 75    """Defines priority levels for tasks and requirements, where P0 is highest priority."""
 76
 77    p0 = 0
 78    p1 = 1
 79    p2 = 2
 80    p3 = 3
 81
 82
 83# Only one rating type for now, but this allows for extensibility if we want to add more in the future
 84class TaskOutputRatingType(str, Enum):
 85    """Defines the types of rating systems available for task outputs."""
 86
 87    five_star = "five_star"
 88    custom = "custom"
 89
 90
 91class TaskOutputRating(KilnBaseModel):
 92    """
 93    A rating for a task output, including an overall rating and ratings for each requirement.
 94
 95    Only supports five star ratings for now, but extensible for custom values.
 96    """
 97
 98    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
 99    value: float | None = Field(
100        description="The overall rating value (typically 1-5 stars).",
101        default=None,
102    )
103    requirement_ratings: Dict[ID_TYPE, float] = Field(
104        default={},
105        description="The ratings of the requirements of the task. The keys are the ids of the requirements. The values are the ratings (typically 1-5 stars).",
106    )
107
108    # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc)
109    def is_high_quality(self) -> bool:
110        if self.type == TaskOutputRatingType.five_star:
111            return self.value is not None and self.value >= 4
112        return False
113
114    @model_validator(mode="after")
115    def validate_rating(self) -> Self:
116        if self.type not in TaskOutputRatingType:
117            raise ValueError(f"Invalid rating type: {self.type}")
118
119        if self.type == TaskOutputRatingType.five_star:
120            if self.value is not None:
121                self._validate_five_star(self.value, "overall rating")
122            for req_id, req_rating in self.requirement_ratings.items():
123                self._validate_five_star(req_rating, f"requirement rating for {req_id}")
124
125        return self
126
127    def _validate_five_star(self, rating: float, rating_name: str) -> None:
128        if not isinstance(rating, float) or not rating.is_integer():
129            raise ValueError(
130                f"{rating_name.capitalize()} of type five_star must be an integer value (1.0, 2.0, 3.0, 4.0, or 5.0)"
131            )
132        if rating < 1 or rating > 5:
133            raise ValueError(
134                f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
135            )
136
137
138class TaskOutput(KilnBaseModel):
139    """
140    An output for a specific task run.
141
142    Contains the actual output content, its source (human or synthetic),
143    and optional rating information.
144    """
145
146    output: str = Field(
147        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
148    )
149    source: DataSource | None = Field(
150        description="The source of the output: human or synthetic.",
151        default=None,
152    )
153    rating: TaskOutputRating | None = Field(
154        default=None, description="The rating of the output"
155    )
156
157    def validate_output_format(self, task: Task) -> Self:
158        # validate output
159        if task.output_json_schema is not None:
160            try:
161                validate_schema(json.loads(self.output), task.output_json_schema)
162            except json.JSONDecodeError:
163                raise ValueError("Output is not a valid JSON object")
164            except jsonschema.exceptions.ValidationError as e:
165                raise ValueError(f"Output does not match task output schema: {e}")
166        return self
167
168    @model_validator(mode="after")
169    def validate_output_source(self, info: ValidationInfo) -> Self:
 170        # In strict mode, when not loaded from file, we validate that output_source is not None.
171        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
172        if not strict_mode():
173            return self
174        if self.loaded_from_file(info):
175            return self
176        if self.source is None:
177            raise ValueError("Output source is required when strict mode is enabled")
178        return self
179
180
181class FineTuneStatusType(str, Enum):
182    """
183    The status type of a fine-tune (running, completed, failed, etc).
184    """
185
 186    unknown = "unknown"  # status could not be determined (e.g. server error)
187    pending = "pending"
188    running = "running"
189    completed = "completed"
190    failed = "failed"
191
192
193class Finetune(KilnParentedModel):
194    name: str = NAME_FIELD
195    description: str | None = Field(
196        default=None,
197        description="A description of the fine-tune for you and your team. Not used in training.",
198    )
199    provider: str = Field(
200        description="The provider to use for the fine-tune (e.g. 'openai')."
201    )
202    base_model_id: str = Field(
203        description="The id of the base model to use for the fine-tune. This string relates to the provider's IDs for their own models, not Kiln IDs."
204    )
205    provider_id: str | None = Field(
206        default=None,
207        description="The ID of the fine-tune job on the provider's side. May not be the same as the fine_tune_model_id.",
208    )
209    fine_tune_model_id: str | None = Field(
210        default=None,
211        description="The ID of the fine-tuned model on the provider's side. May not be the same as the provider_id.",
212    )
213    dataset_split_id: str = Field(
214        description="The ID of the dataset split to use for this fine-tune.",
215    )
216    train_split_name: str = Field(
217        default="train",
218        description="The name of the training split to use for this fine-tune.",
219    )
220    validation_split_name: str | None = Field(
221        default=None,
222        description="The name of the validation split to use for this fine-tune. Optional.",
223    )
224    parameters: dict[str, str | int | float | bool] = Field(
225        default={},
226        description="The parameters to use for this fine-tune. These are provider-specific.",
227    )
228    system_message: str = Field(
229        description="The system message to use for this fine-tune.",
230    )
231    latest_status: FineTuneStatusType = Field(
232        default=FineTuneStatusType.unknown,
233        description="The latest known status of this fine-tune. Not updated in real time.",
234    )
235    properties: Dict[str, str | int | float] = Field(
236        default={},
237        description="Properties of the fine-tune. Different providers may use different properties.",
238    )
239
240    def parent_task(self) -> Task | None:
241        if not isinstance(self.parent, Task):
242            return None
243        return self.parent
244
245
246class DataSourceType(str, Enum):
247    """
248    The source type of a piece of data.
249
250    Human: a human created the data
251    Synthetic: a model created the data
252    """
253
254    human = "human"
255    synthetic = "synthetic"
256
257
258class DataSourceProperty(BaseModel):
259    """
260    Defines a property that can be associated with a data source.
261
262    Includes validation rules for when properties are required or not allowed
263    based on the data source type.
264    """
265
266    name: str
267    type: Type[Union[str, int, float]]
268    required_for: List[DataSourceType] = []
269    not_allowed_for: List[DataSourceType] = []
270
271
272class DataSource(BaseModel):
273    """
274    Represents the origin of data, either human or synthetic, with associated properties.
275
276    Properties vary based on the source type - for synthetic sources this includes
277    model information, for human sources this includes creator information.
278    """
279
280    type: DataSourceType
281    properties: Dict[str, str | int | float] = Field(
282        default={},
283        description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
284    )
285
286    _data_source_properties = [
287        DataSourceProperty(
288            name="created_by",
289            type=str,
290            required_for=[DataSourceType.human],
291            not_allowed_for=[DataSourceType.synthetic],
292        ),
293        DataSourceProperty(
294            name="model_name",
295            type=str,
296            required_for=[DataSourceType.synthetic],
297            not_allowed_for=[DataSourceType.human],
298        ),
299        DataSourceProperty(
300            name="model_provider",
301            type=str,
302            required_for=[DataSourceType.synthetic],
303            not_allowed_for=[DataSourceType.human],
304        ),
305        DataSourceProperty(
306            name="adapter_name",
307            type=str,
308            required_for=[DataSourceType.synthetic],
309            not_allowed_for=[DataSourceType.human],
310        ),
311        DataSourceProperty(
312            name="prompt_builder_name",
313            type=str,
314            not_allowed_for=[DataSourceType.human],
315        ),
316    ]
317
318    @model_validator(mode="after")
319    def validate_type(self) -> "DataSource":
320        if self.type not in DataSourceType:
321            raise ValueError(f"Invalid data source type: {self.type}")
322        return self
323
324    @model_validator(mode="after")
325    def validate_properties(self) -> "DataSource":
326        for prop in self._data_source_properties:
327            # Check the property type is correct
328            if prop.name in self.properties:
329                if not isinstance(self.properties[prop.name], prop.type):
330                    raise ValueError(
331                        f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
332                    )
333            # Check the property is required for the data source type
334            if self.type in prop.required_for:
335                if prop.name not in self.properties:
336                    raise ValueError(
337                        f"'{prop.name}' is required for {self.type} data source"
338                    )
339            # Check the property is not allowed for the data source type
340            elif self.type in prop.not_allowed_for and prop.name in self.properties:
341                raise ValueError(
342                    f"'{prop.name}' is not allowed for {self.type} data source"
343                )
344        return self
345
346    @model_validator(mode="after")
347    def validate_no_empty_properties(self) -> Self:
348        for prop, value in self.properties.items():
349            if isinstance(value, str) and value == "":
350                raise ValueError(
351                    f"Property '{prop}' must be a non-empty string for {self.type} data source"
352                )
353        return self
354
355
356class TaskRun(KilnParentedModel):
357    """
358    Represents a single execution of a Task.
359
360    Contains the input used, its source, the output produced, and optional
361    repair information if the output needed correction.
362    """
363
364    input: str = Field(
365        description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
366    )
367    input_source: DataSource | None = Field(
368        default=None, description="The source of the input: human or synthetic."
369    )
370
371    output: TaskOutput = Field(description="The output of the task run.")
372    repair_instructions: str | None = Field(
373        default=None,
374        description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
375    )
376    repaired_output: TaskOutput | None = Field(
377        default=None,
378        description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
379    )
380    intermediate_outputs: Dict[str, str] | None = Field(
381        default=None,
382        description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.",
383    )
384
385    def parent_task(self) -> Task | None:
386        if not isinstance(self.parent, Task):
387            return None
388        return self.parent
389
390    @model_validator(mode="after")
391    def validate_input_format(self) -> Self:
392        task = self.parent_task()
393        if task is None:
 394            # don't validate this relationship until we have a path or parent. Give the caller time to build it (it will be caught before saving).
395            return self
396
 397        # validate input
398        if task.input_json_schema is not None:
399            try:
400                validate_schema(json.loads(self.input), task.input_json_schema)
401            except json.JSONDecodeError:
402                raise ValueError("Input is not a valid JSON object")
403            except jsonschema.exceptions.ValidationError as e:
404                raise ValueError(f"Input does not match task input schema: {e}")
405        return self
406
407    @model_validator(mode="after")
408    def validate_output_format(self) -> Self:
409        task = self.parent_task()
410        if task is None:
411            return self
412
413        self.output.validate_output_format(task)
414        return self
415
416    @model_validator(mode="after")
417    def validate_repaired_output(self) -> Self:
418        if self.repaired_output is not None:
419            if self.repaired_output.rating is not None:
420                raise ValueError(
421                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
422                )
423        if self.repair_instructions is None and self.repaired_output is not None:
424            raise ValueError(
425                "Repair instructions are required if providing a repaired output."
426            )
427        if self.repair_instructions is not None and self.repaired_output is None:
428            raise ValueError(
429                "A repaired output is required if providing repair instructions."
430            )
431        return self
432
433    @model_validator(mode="after")
434    def validate_input_source(self, info: ValidationInfo) -> Self:
 435        # In strict mode, when not loaded from file, we validate that input_source is not None.
436        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
437        if not strict_mode():
438            return self
439        if self.loaded_from_file(info):
440            return self
441        if self.input_source is None:
442            raise ValueError("input_source is required when strict mode is enabled")
443        return self
444
445
446# Define the type alias for clarity
447DatasetFilter = Callable[[TaskRun], bool]
448
449
450def AllDatasetFilter(_: TaskRun) -> bool:
451    return True
452
453
454def HighRatingDatasetFilter(task_run: TaskRun) -> bool:
455    if task_run.output is None or task_run.output.rating is None:
456        return False
457    return task_run.output.rating.is_high_quality()
458
459
460class DatasetSplitDefinition(BaseModel):
461    """
462    A definition of a split in a dataset.
463
464    Example: name="train", description="The training set", percentage=0.8 (80% of the dataset)
465    """
466
467    name: str = NAME_FIELD
468    description: str | None = Field(
469        default=None,
470        description="A description of the dataset for you and your team. Not used in training.",
471    )
472    percentage: float = Field(
473        ge=0.0,
474        le=1.0,
475        description="The percentage of the dataset that this split represents (between 0 and 1).",
476    )
477
478
479AllSplitDefinition: list[DatasetSplitDefinition] = [
480    DatasetSplitDefinition(name="all", percentage=1.0)
481]
482Train80Test20SplitDefinition: list[DatasetSplitDefinition] = [
483    DatasetSplitDefinition(name="train", percentage=0.8),
484    DatasetSplitDefinition(name="test", percentage=0.2),
485]
486Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [
487    DatasetSplitDefinition(name="train", percentage=0.6),
488    DatasetSplitDefinition(name="test", percentage=0.2),
489    DatasetSplitDefinition(name="val", percentage=0.2),
490]
491
492
493class DatasetSplit(KilnParentedModel):
494    """
495    A collection of task runs, with optional splits (train, test, validation).
496
497    Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks.
498
499    Maintains a list of IDs for each split, to avoid data duplication.
500    """
501
502    name: str = NAME_FIELD
503    description: str | None = Field(
504        default=None,
505        description="A description of the dataset for you and your team. Not used in training.",
506    )
507    splits: list[DatasetSplitDefinition] = Field(
508        default_factory=list,
509        description="The splits in the dataset.",
510    )
511    split_contents: dict[str, list[str]] = Field(
512        description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.",
513    )
514
515    @model_validator(mode="after")
516    def validate_split_percentages(self) -> "DatasetSplit":
517        total = sum(split.percentage for split in self.splits)
518        if not math.isclose(total, 1.0, rel_tol=1e-9):
519            raise ValueError(f"The sum of split percentages must be 1.0 (got {total})")
520        return self
521
522    @classmethod
523    def from_task(
524        cls,
525        name: str,
526        task: "Task",
527        splits: list[DatasetSplitDefinition],
528        filter: DatasetFilter = AllDatasetFilter,
529        description: str | None = None,
530    ):
531        """
532        Build a dataset split from a task.
533        """
534        split_contents = cls.build_split_contents(task, splits, filter)
535        return cls(
536            parent=task,
537            name=name,
538            description=description,
539            splits=splits,
540            split_contents=split_contents,
541        )
542
543    @classmethod
544    def build_split_contents(
545        cls,
546        task: "Task",
547        splits: list[DatasetSplitDefinition],
548        filter: DatasetFilter,
549    ) -> dict[str, list[str]]:
550        valid_ids = []
551        for task_run in task.runs():
552            if filter(task_run):
553                valid_ids.append(task_run.id)
554
555        # Shuffle and split by split percentage
556        random.shuffle(valid_ids)
557        split_contents = {}
558        start_idx = 0
559        remaining_items = len(valid_ids)
560
561        # Handle all splits except the last one
562        for split in splits[:-1]:
563            split_size = round(len(valid_ids) * split.percentage)
564            split_contents[split.name] = valid_ids[start_idx : start_idx + split_size]
565            start_idx += split_size
566            remaining_items -= split_size
567
568        # Last split gets all remaining items (for rounding)
569        if splits:
570            split_contents[splits[-1].name] = valid_ids[start_idx:]
571
572        return split_contents
573
574    def parent_task(self) -> "Task | None":
575        # inline import to avoid circular import
576        from kiln_ai.datamodel import Task
577
578        if not isinstance(self.parent, Task):
579            return None
580        return self.parent
581
582    def missing_count(self) -> int:
583        """
584        Returns:
585            int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset
586        """
587        parent = self.parent_task()
588        if parent is None:
589            raise ValueError("DatasetSplit has no parent task")
590
591        runs = parent.runs()
592        all_ids = set(run.id for run in runs)
593        all_ids_in_splits = set()
594        for ids in self.split_contents.values():
595            all_ids_in_splits.update(ids)
596        missing = all_ids_in_splits - all_ids
597        return len(missing)
598
599
600class TaskRequirement(BaseModel):
601    """
602    Defines a specific requirement that should be met by task outputs.
603
604    Includes an identifier, name, description, instruction for meeting the requirement,
605    and priority level.
606    """
607
608    id: ID_TYPE = ID_FIELD
609    name: str = SHORT_NAME_FIELD
610    description: str | None = Field(default=None)
611    instruction: str = Field(min_length=1)
612    priority: Priority = Field(default=Priority.p2)
613
614
615class TaskDeterminism(str, Enum):
616    """
617    Defines how strictly task outputs should match expected results.
618
619    - deterministic: Requires exact matches
620    - semantic_match: Allows different wording with same meaning
621    - flexible: Allows variation in both wording and meaning within requirements
622    """
623
624    deterministic = "deterministic"  # Expect exact match
625    semantic_match = "semantic_match"  # Expect same meaning, but flexible on expression of the meaning
626    flexible = "flexible"  # Flexible on semantic output. Eval should be custom based on parsing requirements.
627
628
629class Task(
630    KilnParentedModel,
631    KilnParentModel,
632    parent_of={
633        "runs": TaskRun,
634        "dataset_splits": DatasetSplit,
635        "finetunes": Finetune,
636    },
637):
638    """
639    Represents a specific task to be performed, with associated requirements and validation rules.
640
641    Contains the task definition, requirements, input/output schemas, and maintains
642    a collection of task runs.
643    """
644
645    name: str = NAME_FIELD
646    description: str | None = Field(
647        default=None,
648        description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
649    )
650    instruction: str = Field(
651        min_length=1,
652        description="The instructions for the task. Will be used in prompts/training/validation.",
653    )
654    requirements: List[TaskRequirement] = Field(default=[])
655    output_json_schema: JsonObjectSchema | None = None
656    input_json_schema: JsonObjectSchema | None = None
657    thinking_instruction: str | None = Field(
658        default=None,
659        description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.",
660    )
661
662    def output_schema(self) -> Dict | None:
663        if self.output_json_schema is None:
664            return None
665        return schema_from_json_str(self.output_json_schema)
666
667    def input_schema(self) -> Dict | None:
668        if self.input_json_schema is None:
669            return None
670        return schema_from_json_str(self.input_json_schema)
671
672    # Needed for typechecking. TODO P2: fix this in KilnParentModel
673    def runs(self) -> list[TaskRun]:
674        return super().runs()  # type: ignore
675
676    def dataset_splits(self) -> list[DatasetSplit]:
677        return super().dataset_splits()  # type: ignore
678
679    def finetunes(self) -> list[Finetune]:
680        return super().finetunes()  # type: ignore
681
682
683class Project(KilnParentModel, parent_of={"tasks": Task}):
684    """
685    A collection of related tasks.
686
687    Projects organize tasks into logical groups and provide high-level descriptions
688    of the overall goals.
689    """
690
691    name: str = NAME_FIELD
692    description: str | None = Field(
693        default=None,
694        description="A description of the project for you and your team. Will not be used in prompts/training/validation.",
695    )
696
697    # Needed for typechecking. TODO P2: fix this in KilnParentModel
698    def tasks(self) -> list[Task]:
699        return super().tasks()  # type: ignore

class Task(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):
630class Task(
631    KilnParentedModel,
632    KilnParentModel,
633    parent_of={
634        "runs": TaskRun,
635        "dataset_splits": DatasetSplit,
636        "finetunes": Finetune,
637    },
638):
639    """
640    Represents a specific task to be performed, with associated requirements and validation rules.
641
642    Contains the task definition, requirements, input/output schemas, and maintains
643    a collection of task runs.
644    """
645
646    name: str = NAME_FIELD
647    description: str | None = Field(
648        default=None,
649        description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
650    )
651    instruction: str = Field(
652        min_length=1,
653        description="The instructions for the task. Will be used in prompts/training/validation.",
654    )
655    requirements: List[TaskRequirement] = Field(default=[])
656    output_json_schema: JsonObjectSchema | None = None
657    input_json_schema: JsonObjectSchema | None = None
658    thinking_instruction: str | None = Field(
659        default=None,
660        description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.",
661    )
662
663    def output_schema(self) -> Dict | None:
664        if self.output_json_schema is None:
665            return None
666        return schema_from_json_str(self.output_json_schema)
667
668    def input_schema(self) -> Dict | None:
669        if self.input_json_schema is None:
670            return None
671        return schema_from_json_str(self.input_json_schema)
672
673    # Needed for typechecking. TODO P2: fix this in KilnParentModel
674    def runs(self) -> list[TaskRun]:
675        return super().runs()  # type: ignore
676
677    def dataset_splits(self) -> list[DatasetSplit]:
678        return super().dataset_splits()  # type: ignore
679
680    def finetunes(self) -> list[Finetune]:
681        return super().finetunes()  # type: ignore

Represents a specific task to be performed, with associated requirements and validation rules.

Contains the task definition, requirements, input/output schemas, and maintains a collection of task runs.

name: str
description: str | None
instruction: str
requirements: List[TaskRequirement]
output_json_schema: JsonObjectSchema | None
input_json_schema: JsonObjectSchema | None
thinking_instruction: str | None
def output_schema(self) -> Optional[Dict]:
663    def output_schema(self) -> Dict | None:
664        if self.output_json_schema is None:
665            return None
666        return schema_from_json_str(self.output_json_schema)
def input_schema(self) -> Optional[Dict]:
668    def input_schema(self) -> Dict | None:
669        if self.input_json_schema is None:
670            return None
671        return schema_from_json_str(self.input_json_schema)
def runs(self) -> List[TaskRun]:
404        def child_method(self) -> list[child_class]:
405            return child_class.all_children_of_parent_path(self.path)
def dataset_splits(self) -> List[kiln_ai.datamodel.DatasetSplit]:
404        def child_method(self) -> list[child_class]:
405            return child_class.all_children_of_parent_path(self.path)
def finetunes(self) -> List[kiln_ai.datamodel.Finetune]:
404        def child_method(self) -> list[child_class]:
405            return child_class.all_children_of_parent_path(self.path)
def relationship_name() -> str:
422        def relationship_name_method() -> str:
423            return relationship_name
def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
415        def parent_class_method() -> Type[KilnParentModel]:
416            return cls
model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.
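
A minimal sketch of defining a Task and freezing its runs into a DatasetSplit. The names and instruction text are invented, `project` is assumed to be an existing saved Project, and `save_to_file` is the persistence helper inherited from KilnBaseModel:

from kiln_ai.datamodel import (
    DatasetSplit,
    HighRatingDatasetFilter,
    Task,
    Train80Test20SplitDefinition,
)

task = Task(
    name="summarizer",  # invented example name
    instruction="Summarize the input text in one sentence.",
    parent=project,  # assumes `project` is an existing, saved Project
)
task.save_to_file()

# Freeze only the highly rated runs into repeatable train/test splits.
split = DatasetSplit.from_task(
    name="v1",
    task=task,
    splits=Train80Test20SplitDefinition,
    filter=HighRatingDatasetFilter,
)
split.save_to_file()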

class Project(kiln_ai.datamodel.basemodel.KilnParentModel):
684class Project(KilnParentModel, parent_of={"tasks": Task}):
685    """
686    A collection of related tasks.
687
688    Projects organize tasks into logical groups and provide high-level descriptions
689    of the overall goals.
690    """
691
692    name: str = NAME_FIELD
693    description: str | None = Field(
694        default=None,
695        description="A description of the project for you and your team. Will not be used in prompts/training/validation.",
696    )
697
698    # Needed for typechecking. TODO P2: fix this in KilnParentModel
699    def tasks(self) -> list[Task]:
700        return super().tasks()  # type: ignore

A collection of related tasks.

Projects organize tasks into logical groups and provide high-level descriptions of the overall goals.

name: str
description: str | None
def tasks(self) -> List[Task]:
404        def child_method(self) -> list[child_class]:
405            return child_class.all_children_of_parent_path(self.path)
model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.
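
A short sketch of creating a Project and reading back its tasks. The path value is an assumption about where you want the backing file; `tasks()` is the typed accessor generated by KilnParentModel:

from kiln_ai.datamodel import Project

project = Project(
    name="demo_project",
    description="Example project for the docs.",
    path="./demo_project/project.kiln",  # assumed location; any writable path works
)
project.save_to_file()

print([task.name for task in project.tasks()])  # [] until tasks are saved under it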

class TaskRun(kiln_ai.datamodel.basemodel.KilnParentedModel):
357class TaskRun(KilnParentedModel):
358    """
359    Represents a single execution of a Task.
360
361    Contains the input used, its source, the output produced, and optional
362    repair information if the output needed correction.
363    """
364
365    input: str = Field(
366        description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
367    )
368    input_source: DataSource | None = Field(
369        default=None, description="The source of the input: human or synthetic."
370    )
371
372    output: TaskOutput = Field(description="The output of the task run.")
373    repair_instructions: str | None = Field(
374        default=None,
375        description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
376    )
377    repaired_output: TaskOutput | None = Field(
378        default=None,
379        description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
380    )
381    intermediate_outputs: Dict[str, str] | None = Field(
382        default=None,
383        description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.",
384    )
385
386    def parent_task(self) -> Task | None:
387        if not isinstance(self.parent, Task):
388            return None
389        return self.parent
390
391    @model_validator(mode="after")
392    def validate_input_format(self) -> Self:
393        task = self.parent_task()
394        if task is None:
395            # don't validate this relationship until we have a path or parent. Give the caller time to build it (it will be caught before saving).
396            return self
397
398        # validate input
399        if task.input_json_schema is not None:
400            try:
401                validate_schema(json.loads(self.input), task.input_json_schema)
402            except json.JSONDecodeError:
403                raise ValueError("Input is not a valid JSON object")
404            except jsonschema.exceptions.ValidationError as e:
405                raise ValueError(f"Input does not match task input schema: {e}")
406        return self
407
408    @model_validator(mode="after")
409    def validate_output_format(self) -> Self:
410        task = self.parent_task()
411        if task is None:
412            return self
413
414        self.output.validate_output_format(task)
415        return self
416
417    @model_validator(mode="after")
418    def validate_repaired_output(self) -> Self:
419        if self.repaired_output is not None:
420            if self.repaired_output.rating is not None:
421                raise ValueError(
422                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
423                )
424        if self.repair_instructions is None and self.repaired_output is not None:
425            raise ValueError(
426                "Repair instructions are required if providing a repaired output."
427            )
428        if self.repair_instructions is not None and self.repaired_output is None:
429            raise ValueError(
430                "A repaired output is required if providing repair instructions."
431            )
432        return self
433
434    @model_validator(mode="after")
435    def validate_input_source(self, info: ValidationInfo) -> Self:
436        # In strict mode, when not loaded from file, we validate that input_source is not None.
437        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
438        if not strict_mode():
439            return self
440        if self.loaded_from_file(info):
441            return self
442        if self.input_source is None:
443            raise ValueError("input_source is required when strict mode is enabled")
444        return self

Represents a single execution of a Task.

Contains the input used, its source, the output produced, and optional repair information if the output needed correction.

input: str
input_source: DataSource | None
output: TaskOutput
repair_instructions: str | None
repaired_output: TaskOutput | None
intermediate_outputs: Optional[Dict[str, str]]
def parent_task(self) -> Task | None:
386    def parent_task(self) -> Task | None:
387        if not isinstance(self.parent, Task):
388            return None
389        return self.parent
@model_validator(mode='after')
def validate_input_format(self) -> Self:
391    @model_validator(mode="after")
392    def validate_input_format(self) -> Self:
393        task = self.parent_task()
394        if task is None:
395            # don't validate this relationship until we have a path or parent. Give the caller time to build it (it will be caught before saving).
396            return self
397
398        # validate input
399        if task.input_json_schema is not None:
400            try:
401                validate_schema(json.loads(self.input), task.input_json_schema)
402            except json.JSONDecodeError:
403                raise ValueError("Input is not a valid JSON object")
404            except jsonschema.exceptions.ValidationError as e:
405                raise ValueError(f"Input does not match task input schema: {e}")
406        return self
@model_validator(mode='after')
def validate_output_format(self) -> Self:
408    @model_validator(mode="after")
409    def validate_output_format(self) -> Self:
410        task = self.parent_task()
411        if task is None:
412            return self
413
414        self.output.validate_output_format(task)
415        return self
@model_validator(mode='after')
def validate_repaired_output(self) -> Self:
417    @model_validator(mode="after")
418    def validate_repaired_output(self) -> Self:
419        if self.repaired_output is not None:
420            if self.repaired_output.rating is not None:
421                raise ValueError(
422                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
423                )
424        if self.repair_instructions is None and self.repaired_output is not None:
425            raise ValueError(
426                "Repair instructions are required if providing a repaired output."
427            )
428        if self.repair_instructions is not None and self.repaired_output is None:
429            raise ValueError(
430                "A repaired output is required if providing repair instructions."
431            )
432        return self
@model_validator(mode='after')
def validate_input_source(self, info: pydantic_core.core_schema.ValidationInfo) -> Self:
434    @model_validator(mode="after")
435    def validate_input_source(self, info: ValidationInfo) -> Self:
436        # In strict mode, when not loaded from file, we validate that input_source is not None.
437        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
438        if not strict_mode():
439            return self
440        if self.loaded_from_file(info):
441            return self
442        if self.input_source is None:
443            raise ValueError("input_source is required when strict mode is enabled")
444        return self
def relationship_name() -> str:
422        def relationship_name_method() -> str:
423            return relationship_name
def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
415        def parent_class_method() -> Type[KilnParentModel]:
416            return cls
model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.
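
The validators above tie repairs together: `repair_instructions` and `repaired_output` must be provided as a pair, and the repaired output may not carry a rating. A sketch with invented values (no parent Task is attached, so schema validation is skipped):

from kiln_ai.datamodel import TaskOutput, TaskRun

run = TaskRun(
    input="What is 2 + 2?",
    output=TaskOutput(output="5"),
    repair_instructions="The arithmetic is wrong; the correct answer is 4.",
    repaired_output=TaskOutput(output="4"),  # rating must remain None
)

# Providing only one half of the pair fails validate_repaired_output:
# TaskRun(input="q", output=TaskOutput(output="a"), repair_instructions="fix")  # ValueError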

class TaskOutput(kiln_ai.datamodel.basemodel.KilnBaseModel):
139class TaskOutput(KilnBaseModel):
140    """
141    An output for a specific task run.
142
143    Contains the actual output content, its source (human or synthetic),
144    and optional rating information.
145    """
146
147    output: str = Field(
148        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
149    )
150    source: DataSource | None = Field(
151        description="The source of the output: human or synthetic.",
152        default=None,
153    )
154    rating: TaskOutputRating | None = Field(
155        default=None, description="The rating of the output"
156    )
157
158    def validate_output_format(self, task: Task) -> Self:
159        # validate output
160        if task.output_json_schema is not None:
161            try:
162                validate_schema(json.loads(self.output), task.output_json_schema)
163            except json.JSONDecodeError:
164                raise ValueError("Output is not a valid JSON object")
165            except jsonschema.exceptions.ValidationError as e:
166                raise ValueError(f"Output does not match task output schema: {e}")
167        return self
168
169    @model_validator(mode="after")
170    def validate_output_source(self, info: ValidationInfo) -> Self:
171        # In strict mode, when not loaded from file, we validate that output_source is not None.
172        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
173        if not strict_mode():
174            return self
175        if self.loaded_from_file(info):
176            return self
177        if self.source is None:
178            raise ValueError("Output source is required when strict mode is enabled")
179        return self

An output for a specific task run.

Contains the actual output content, its source (human or synthetic), and optional rating information.

output: str
source: DataSource | None
rating: TaskOutputRating | None
def validate_output_format(self, task: Task) -> Self:
158    def validate_output_format(self, task: Task) -> Self:
159        # validate output
160        if task.output_json_schema is not None:
161            try:
162                validate_schema(json.loads(self.output), task.output_json_schema)
163            except json.JSONDecodeError:
164                raise ValueError("Output is not a valid JSON object")
165            except jsonschema.exceptions.ValidationError as e:
166                raise ValueError(f"Output does not match task output schema: {e}")
167        return self
@model_validator(mode='after')
def validate_output_source(self, info: pydantic_core.core_schema.ValidationInfo) -> Self:
169    @model_validator(mode="after")
170    def validate_output_source(self, info: ValidationInfo) -> Self:
171        # In strict mode, when not loaded from file, we validate that output_source is not None.
172        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
173        if not strict_mode():
174            return self
175        if self.loaded_from_file(info):
176            return self
177        if self.source is None:
178            raise ValueError("Output source is required when strict mode is enabled")
179        return self
model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.
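
`validate_output_format` checks structured outputs against the parent task's schema. A sketch with a hypothetical schema (the task and its field values are invented):

import json

from kiln_ai.datamodel import Task, TaskOutput

# Hypothetical task whose output must be {"answer": <string>}.
task = Task(
    name="qa",
    instruction="Answer the question.",
    output_json_schema=json.dumps({
        "type": "object",
        "properties": {"answer": {"type": "string"}},
        "required": ["answer"],
    }),
)

TaskOutput(output='{"answer": "42"}').validate_output_format(task)  # passes

# Non-JSON output, or JSON that misses the schema, raises a ValueError:
# TaskOutput(output="not json").validate_output_format(task)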

class TaskOutputRating(kiln_ai.datamodel.basemodel.KilnBaseModel):
 92class TaskOutputRating(KilnBaseModel):
 93    """
 94    A rating for a task output, including an overall rating and ratings for each requirement.
 95
 96    Only supports five star ratings for now, but extensible for custom values.
 97    """
 98
 99    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
100    value: float | None = Field(
101        description="The overall rating value (typically 1-5 stars).",
102        default=None,
103    )
104    requirement_ratings: Dict[ID_TYPE, float] = Field(
105        default={},
106        description="The ratings of the requirements of the task. The keys are the ids of the requirements. The values are the ratings (typically 1-5 stars).",
107    )
108
109    # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc)
110    def is_high_quality(self) -> bool:
111        if self.type == TaskOutputRatingType.five_star:
112            return self.value is not None and self.value >= 4
113        return False
114
115    @model_validator(mode="after")
116    def validate_rating(self) -> Self:
117        if self.type not in TaskOutputRatingType:
118            raise ValueError(f"Invalid rating type: {self.type}")
119
120        if self.type == TaskOutputRatingType.five_star:
121            if self.value is not None:
122                self._validate_five_star(self.value, "overall rating")
123            for req_id, req_rating in self.requirement_ratings.items():
124                self._validate_five_star(req_rating, f"requirement rating for {req_id}")
125
126        return self
127
128    def _validate_five_star(self, rating: float, rating_name: str) -> None:
129        if not isinstance(rating, float) or not rating.is_integer():
130            raise ValueError(
131                f"{rating_name.capitalize()} of type five_star must be an integer value (1.0, 2.0, 3.0, 4.0, or 5.0)"
132            )
133        if rating < 1 or rating > 5:
134            raise ValueError(
135                f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
136            )

A rating for a task output, including an overall rating and ratings for each requirement.

Only supports five star ratings for now, but extensible for custom values.

type: TaskOutputRatingType
value: float | None
requirement_ratings: Dict[Optional[str], float]
def is_high_quality(self) -> bool:
110    def is_high_quality(self) -> bool:
111        if self.type == TaskOutputRatingType.five_star:
112            return self.value is not None and self.value >= 4
113        return False
@model_validator(mode='after')
def validate_rating(self) -> Self:
115    @model_validator(mode="after")
116    def validate_rating(self) -> Self:
117        if self.type not in TaskOutputRatingType:
118            raise ValueError(f"Invalid rating type: {self.type}")
119
120        if self.type == TaskOutputRatingType.five_star:
121            if self.value is not None:
122                self._validate_five_star(self.value, "overall rating")
123            for req_id, req_rating in self.requirement_ratings.items():
124                self._validate_five_star(req_rating, f"requirement rating for {req_id}")
125
126        return self
model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.
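
As `_validate_five_star` enforces, five-star ratings must be whole numbers in [1, 5], expressed as floats. A quick sketch:

from kiln_ai.datamodel import TaskOutputRating

rating = TaskOutputRating(value=5.0)  # valid: an integer-valued float
print(rating.is_high_quality())       # True, since 5.0 >= 4

# Each of these would raise a ValueError from validate_rating:
# TaskOutputRating(value=4.5)  # not a whole number
# TaskOutputRating(value=6.0)  # outside 1-5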

class Priority(enum.IntEnum):
75class Priority(IntEnum):
76    """Defines priority levels for tasks and requirements, where P0 is highest priority."""
77
78    p0 = 0
79    p1 = 1
80    p2 = 2
81    p3 = 3

Defines priority levels for tasks and requirements, where P0 is highest priority.

p0 = <Priority.p0: 0>
p1 = <Priority.p1: 1>
p2 = <Priority.p2: 2>
p3 = <Priority.p3: 3>
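
Because Priority is an IntEnum, members compare and sort numerically, so the lowest value is the most urgent:

from kiln_ai.datamodel import Priority

assert Priority.p0 < Priority.p2  # lower value = higher priority
print(min(Priority.p3, Priority.p1))  # Priority.p1
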
class DataSource(pydantic.main.BaseModel):
273class DataSource(BaseModel):
274    """
275    Represents the origin of data, either human or synthetic, with associated properties.
276
277    Properties vary based on the source type - for synthetic sources this includes
278    model information, for human sources this includes creator information.
279    """
280
281    type: DataSourceType
282    properties: Dict[str, str | int | float] = Field(
283        default={},
284        description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
285    )
286
287    _data_source_properties = [
288        DataSourceProperty(
289            name="created_by",
290            type=str,
291            required_for=[DataSourceType.human],
292            not_allowed_for=[DataSourceType.synthetic],
293        ),
294        DataSourceProperty(
295            name="model_name",
296            type=str,
297            required_for=[DataSourceType.synthetic],
298            not_allowed_for=[DataSourceType.human],
299        ),
300        DataSourceProperty(
301            name="model_provider",
302            type=str,
303            required_for=[DataSourceType.synthetic],
304            not_allowed_for=[DataSourceType.human],
305        ),
306        DataSourceProperty(
307            name="adapter_name",
308            type=str,
309            required_for=[DataSourceType.synthetic],
310            not_allowed_for=[DataSourceType.human],
311        ),
312        DataSourceProperty(
313            name="prompt_builder_name",
314            type=str,
315            not_allowed_for=[DataSourceType.human],
316        ),
317    ]
318
319    @model_validator(mode="after")
320    def validate_type(self) -> "DataSource":
321        if self.type not in DataSourceType:
322            raise ValueError(f"Invalid data source type: {self.type}")
323        return self
324
325    @model_validator(mode="after")
326    def validate_properties(self) -> "DataSource":
327        for prop in self._data_source_properties:
328            # Check the property type is correct
329            if prop.name in self.properties:
330                if not isinstance(self.properties[prop.name], prop.type):
331                    raise ValueError(
332                        f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
333                    )
334            # Check the property is required for the data source type
335            if self.type in prop.required_for:
336                if prop.name not in self.properties:
337                    raise ValueError(
338                        f"'{prop.name}' is required for {self.type} data source"
339                    )
340            # Check the property is not allowed for the data source type
341            elif self.type in prop.not_allowed_for and prop.name in self.properties:
342                raise ValueError(
343                    f"'{prop.name}' is not allowed for {self.type} data source"
344                )
345        return self
346
347    @model_validator(mode="after")
348    def validate_no_empty_properties(self) -> Self:
349        for prop, value in self.properties.items():
350            if isinstance(value, str) and value == "":
351                raise ValueError(
352                    f"Property '{prop}' must be a non-empty string for {self.type} data source"
353                )
354        return self

Represents the origin of data, either human or synthetic, with associated properties.

Properties vary based on the source type - for synthetic sources this includes model information, for human sources this includes creator information.

type: DataSourceType
properties: Dict[str, str | int | float]
@model_validator(mode='after')
def validate_type(self) -> DataSource:
319    @model_validator(mode="after")
320    def validate_type(self) -> "DataSource":
321        if self.type not in DataSourceType:
322            raise ValueError(f"Invalid data source type: {self.type}")
323        return self
@model_validator(mode='after')
def validate_properties(self) -> DataSource:
325    @model_validator(mode="after")
326    def validate_properties(self) -> "DataSource":
327        for prop in self._data_source_properties:
328            # Check the property type is correct
329            if prop.name in self.properties:
330                if not isinstance(self.properties[prop.name], prop.type):
331                    raise ValueError(
332                        f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
333                    )
334            # Check the property is required for the data source type
335            if self.type in prop.required_for:
336                if prop.name not in self.properties:
337                    raise ValueError(
338                        f"'{prop.name}' is required for {self.type} data source"
339                    )
340            # Check the property is not allowed for the data source type
341            elif self.type in prop.not_allowed_for and prop.name in self.properties:
342                raise ValueError(
343                    f"'{prop.name}' is not allowed for {self.type} data source"
344                )
345        return self
@model_validator(mode='after')
def validate_no_empty_properties(self) -> Self:
347    @model_validator(mode="after")
348    def validate_no_empty_properties(self) -> Self:
349        for prop, value in self.properties.items():
350            if isinstance(value, str) and value == "":
351                raise ValueError(
352                    f"Property '{prop}' must be a non-empty string for {self.type} data source"
353                )
354        return self
model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
384def init_private_attributes(self: BaseModel, context: Any, /) -> None:
385    """This function is meant to behave like a BaseModel method to initialise private attributes.
386
387    It takes context as an argument since that's what pydantic-core passes when calling it.
388
389    Args:
390        self: The BaseModel instance.
391        context: The context.
392    """
393    if getattr(self, '__pydantic_private__', None) is None:
394        pydantic_private = {}
395        for name, private_attr in self.__private_attributes__.items():
396            default = private_attr.get_default()
397            if default is not PydanticUndefined:
398                pydantic_private[name] = default
399        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args: self: The BaseModel instance. context: The context.
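
The property rules above make human and synthetic sources accept disjoint keys. A sketch with invented property values:

from kiln_ai.datamodel import DataSource, DataSourceType

human = DataSource(
    type=DataSourceType.human,
    properties={"created_by": "jane"},  # required for human sources
)

synthetic = DataSource(
    type=DataSourceType.synthetic,
    properties={  # model_name, model_provider and adapter_name are all required here
        "model_name": "example-model",
        "model_provider": "example-provider",
        "adapter_name": "example-adapter",
    },
)

# 'created_by' on a synthetic source, or a missing required key, raises a ValueError.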

class DataSourceType(builtins.str, enum.Enum):
247class DataSourceType(str, Enum):
248    """
249    The source type of a piece of data.
250
251    Human: a human created the data
252    Synthetic: a model created the data
253    """
254
255    human = "human"
256    synthetic = "synthetic"

The source type of a piece of data.

Human: a human created the data Synthetic: a model created the data

human = <DataSourceType.human: 'human'>
synthetic = <DataSourceType.synthetic: 'synthetic'>
class DataSourceProperty(pydantic.main.BaseModel):
259class DataSourceProperty(BaseModel):
260    """
261    Defines a property that can be associated with a data source.
262
263    Includes validation rules for when properties are required or not allowed
264    based on the data source type.
265    """
266
267    name: str
268    type: Type[Union[str, int, float]]
269    required_for: List[DataSourceType] = []
270    not_allowed_for: List[DataSourceType] = []

Defines a property that can be associated with a data source.

Includes validation rules for when properties are required or not allowed based on the data source type.

name: str
type: Type[Union[str, int, float]]
required_for: List[DataSourceType]
not_allowed_for: List[DataSourceType]
model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class TaskOutputRatingType(builtins.str, enum.Enum):
85class TaskOutputRatingType(str, Enum):
86    """Defines the types of rating systems available for task outputs."""
87
88    five_star = "five_star"
89    custom = "custom"

Defines the types of rating systems available for task outputs.

five_star = <TaskOutputRatingType.five_star: 'five_star'>
custom = <TaskOutputRatingType.custom: 'custom'>
class TaskRequirement(pydantic.main.BaseModel):
601class TaskRequirement(BaseModel):
602    """
603    Defines a specific requirement that should be met by task outputs.
604
605    Includes an identifier, name, description, instruction for meeting the requirement,
606    and priority level.
607    """
608
609    id: ID_TYPE = ID_FIELD
610    name: str = SHORT_NAME_FIELD
611    description: str | None = Field(default=None)
612    instruction: str = Field(min_length=1)
613    priority: Priority = Field(default=Priority.p2)

Defines a specific requirement that should be met by task outputs.

Includes an identifier, name, description, instruction for meeting the requirement, and priority level.

id: Optional[str]
name: str
description: str | None
instruction: str
priority: Priority
model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
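
A sketch of attaching a requirement to a task; the names are invented, and the final comment shows how requirement IDs key into TaskOutputRating.requirement_ratings:

from kiln_ai.datamodel import Priority, Task, TaskRequirement

requirement = TaskRequirement(
    name="concise",
    instruction="Keep the answer under 50 words.",
    priority=Priority.p1,
)

task = Task(
    name="qa",
    instruction="Answer the question.",
    requirements=[requirement],
)

# Requirement ratings are keyed by the requirement's ID, e.g.:
# TaskOutputRating(value=5.0, requirement_ratings={requirement.id: 4.0})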

class TaskDeterminism(builtins.str, enum.Enum):
616class TaskDeterminism(str, Enum):
617    """
618    Defines how strictly task outputs should match expected results.
619
620    - deterministic: Requires exact matches
621    - semantic_match: Allows different wording with same meaning
622    - flexible: Allows variation in both wording and meaning within requirements
623    """
624
625    deterministic = "deterministic"  # Expect exact match
626    semantic_match = "semantic_match"  # Expect same meaning, but flexible on expression of the meaning
627    flexible = "flexible"  # Flexible on semantic output. Eval should be custom based on parsing requirements.

Defines how strictly task outputs should match expected results.

  • deterministic: Requires exact matches
  • semantic_match: Allows different wording with same meaning
  • flexible: Allows variation in both wording and meaning within requirements

deterministic = <TaskDeterminism.deterministic: 'deterministic'>
semantic_match = <TaskDeterminism.semantic_match: 'semantic_match'>
flexible = <TaskDeterminism.flexible: 'flexible'>
def strict_mode() -> bool:
66def strict_mode() -> bool:
67    return _strict_mode
def set_strict_mode(value: bool) -> None:
70def set_strict_mode(value: bool) -> None:
71    global _strict_mode
72    _strict_mode = value
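
Strict mode is a process-wide toggle: flipping it changes what the validators accept for newly created objects, while data loaded from file is always accepted. A sketch (pydantic surfaces the validator's ValueError as a ValidationError, which subclasses ValueError):

from kiln_ai.datamodel import TaskOutput, set_strict_mode, strict_mode

assert strict_mode() is False  # the library default is permissive

TaskOutput(output="no source needed")  # allowed when strict mode is off

set_strict_mode(True)
try:
    TaskOutput(output="a source is now required")  # source is None -> error
except ValueError as error:
    print(error)
finally:
    set_strict_mode(False)  # restore the permissive default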