kiln_ai.datamodel
See our docs for details about our datamodel: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html
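The snippet below is a minimal, illustrative sketch of how the core types compose in memory: a Project contains Tasks, and each TaskRun pairs an input with a TaskOutput. The model names and property values are placeholders, not defaults from the library.

from kiln_ai.datamodel import (
    DataSource,
    DataSourceType,
    Project,
    Task,
    TaskOutput,
    TaskRun,
)

# A project groups related tasks; a task defines the instruction.
project = Project(name="Summarizer", description="Summarization experiments")
task = Task(
    name="Summarize",
    instruction="Summarize the input text in one sentence.",
    parent=project,
)

# A run records one execution: the input, its origin, and the output produced.
run = TaskRun(
    input="A long article about kilns...",
    input_source=DataSource(
        type=DataSourceType.human,
        properties={"created_by": "alice"},  # required for human sources
    ),
    output=TaskOutput(
        output="Kilns are ovens for firing ceramics.",
        source=DataSource(
            type=DataSourceType.synthetic,
            properties={
                # placeholder values; all three are required for synthetic sources
                "model_name": "example-model",
                "model_provider": "example-provider",
                "adapter_name": "example-adapter",
            },
        ),
    ),
    parent=task,
)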
1""" 2See our docs for details about our datamodel: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html 3""" 4 5from __future__ import annotations 6 7import json 8import math 9import random 10from enum import Enum, IntEnum 11from typing import TYPE_CHECKING, Callable, Dict, List, Type, Union 12 13import jsonschema 14import jsonschema.exceptions 15from pydantic import ( 16 BaseModel, 17 Field, 18 ValidationInfo, 19 model_validator, 20) 21from typing_extensions import Self 22 23from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str 24 25from .basemodel import ( 26 ID_FIELD, 27 ID_TYPE, 28 NAME_FIELD, 29 SHORT_NAME_FIELD, 30 KilnBaseModel, 31 KilnParentedModel, 32 KilnParentModel, 33) 34from .json_schema import validate_schema 35 36if TYPE_CHECKING: 37 from . import Task 38 39 40__all__ = [ 41 "basemodel", 42 "json_schema", 43 "Task", 44 "Project", 45 "TaskRun", 46 "TaskOutput", 47 "TaskOutputRating", 48 "Priority", 49 "DataSource", 50 "DataSourceType", 51 "DataSourceProperty", 52 "TaskOutputRatingType", 53 "TaskRequirement", 54 "TaskDeterminism", 55 "strict_mode", 56 "set_strict_mode", 57] 58 59 60# We want to be hard on ourselves for data completeness generated by the Kiln App, but don't want to make it hard for users to use the datamodel/library. 61# Strict mode enables extra validations that we want to enforce in Kiln App (and any other client that wants best practices), but not in the library (unless they opt in) 62_strict_mode: bool = False 63 64 65def strict_mode() -> bool: 66 return _strict_mode 67 68 69def set_strict_mode(value: bool) -> None: 70 global _strict_mode 71 _strict_mode = value 72 73 74class Priority(IntEnum): 75 """Defines priority levels for tasks and requirements, where P0 is highest priority.""" 76 77 p0 = 0 78 p1 = 1 79 p2 = 2 80 p3 = 3 81 82 83# Only one rating type for now, but this allows for extensibility if we want to add more in the future 84class TaskOutputRatingType(str, Enum): 85 """Defines the types of rating systems available for task outputs.""" 86 87 five_star = "five_star" 88 custom = "custom" 89 90 91class TaskOutputRating(KilnBaseModel): 92 """ 93 A rating for a task output, including an overall rating and ratings for each requirement. 94 95 Only supports five star ratings for now, but extensible for custom values. 96 """ 97 98 type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star) 99 value: float | None = Field( 100 description="The overall rating value (typically 1-5 stars).", 101 default=None, 102 ) 103 requirement_ratings: Dict[ID_TYPE, float] = Field( 104 default={}, 105 description="The ratings of the requirements of the task. The keys are the ids of the requirements. 
The values are the ratings (typically 1-5 stars).", 106 ) 107 108 # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc) 109 def is_high_quality(self) -> bool: 110 if self.type == TaskOutputRatingType.five_star: 111 return self.value is not None and self.value >= 4 112 return False 113 114 @model_validator(mode="after") 115 def validate_rating(self) -> Self: 116 if self.type not in TaskOutputRatingType: 117 raise ValueError(f"Invalid rating type: {self.type}") 118 119 if self.type == TaskOutputRatingType.five_star: 120 if self.value is not None: 121 self._validate_five_star(self.value, "overall rating") 122 for req_id, req_rating in self.requirement_ratings.items(): 123 self._validate_five_star(req_rating, f"requirement rating for {req_id}") 124 125 return self 126 127 def _validate_five_star(self, rating: float, rating_name: str) -> None: 128 if not isinstance(rating, float) or not rating.is_integer(): 129 raise ValueError( 130 f"{rating_name.capitalize()} of type five_star must be an integer value (1.0, 2.0, 3.0, 4.0, or 5.0)" 131 ) 132 if rating < 1 or rating > 5: 133 raise ValueError( 134 f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars" 135 ) 136 137 138class TaskOutput(KilnBaseModel): 139 """ 140 An output for a specific task run. 141 142 Contains the actual output content, its source (human or synthetic), 143 and optional rating information. 144 """ 145 146 output: str = Field( 147 description="The output of the task. JSON formatted for structured output, plaintext for unstructured output." 148 ) 149 source: DataSource | None = Field( 150 description="The source of the output: human or synthetic.", 151 default=None, 152 ) 153 rating: TaskOutputRating | None = Field( 154 default=None, description="The rating of the output" 155 ) 156 157 def validate_output_format(self, task: Task) -> Self: 158 # validate output 159 if task.output_json_schema is not None: 160 try: 161 validate_schema(json.loads(self.output), task.output_json_schema) 162 except json.JSONDecodeError: 163 raise ValueError("Output is not a valid JSON object") 164 except jsonschema.exceptions.ValidationError as e: 165 raise ValueError(f"Output does not match task output schema: {e}") 166 return self 167 168 @model_validator(mode="after") 169 def validate_output_source(self, info: ValidationInfo) -> Self: 170 # On strict mode and not loaded from file, we validate output_source is not None. 171 # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data. 172 if not strict_mode(): 173 return self 174 if self.loaded_from_file(info): 175 return self 176 if self.source is None: 177 raise ValueError("Output source is required when strict mode is enabled") 178 return self 179 180 181class FineTuneStatusType(str, Enum): 182 """ 183 The status type of a fine-tune (running, completed, failed, etc). 184 """ 185 186 unknown = "unknown" # server error 187 pending = "pending" 188 running = "running" 189 completed = "completed" 190 failed = "failed" 191 192 193class Finetune(KilnParentedModel): 194 name: str = NAME_FIELD 195 description: str | None = Field( 196 default=None, 197 description="A description of the fine-tune for you and your team. Not used in training.", 198 ) 199 provider: str = Field( 200 description="The provider to use for the fine-tune (e.g. 'openai')." 201 ) 202 base_model_id: str = Field( 203 description="The id of the base model to use for the fine-tune. 
This string relates to the provider's IDs for their own models, not Kiln IDs." 204 ) 205 provider_id: str | None = Field( 206 default=None, 207 description="The ID of the fine-tune job on the provider's side. May not be the same as the fine_tune_model_id.", 208 ) 209 fine_tune_model_id: str | None = Field( 210 default=None, 211 description="The ID of the fine-tuned model on the provider's side. May not be the same as the provider_id.", 212 ) 213 dataset_split_id: str = Field( 214 description="The ID of the dataset split to use for this fine-tune.", 215 ) 216 train_split_name: str = Field( 217 default="train", 218 description="The name of the training split to use for this fine-tune.", 219 ) 220 validation_split_name: str | None = Field( 221 default=None, 222 description="The name of the validation split to use for this fine-tune. Optional.", 223 ) 224 parameters: dict[str, str | int | float | bool] = Field( 225 default={}, 226 description="The parameters to use for this fine-tune. These are provider-specific.", 227 ) 228 system_message: str = Field( 229 description="The system message to use for this fine-tune.", 230 ) 231 latest_status: FineTuneStatusType = Field( 232 default=FineTuneStatusType.unknown, 233 description="The latest known status of this fine-tune. Not updated in real time.", 234 ) 235 properties: Dict[str, str | int | float] = Field( 236 default={}, 237 description="Properties of the fine-tune. Different providers may use different properties.", 238 ) 239 240 def parent_task(self) -> Task | None: 241 if not isinstance(self.parent, Task): 242 return None 243 return self.parent 244 245 246class DataSourceType(str, Enum): 247 """ 248 The source type of a piece of data. 249 250 Human: a human created the data 251 Synthetic: a model created the data 252 """ 253 254 human = "human" 255 synthetic = "synthetic" 256 257 258class DataSourceProperty(BaseModel): 259 """ 260 Defines a property that can be associated with a data source. 261 262 Includes validation rules for when properties are required or not allowed 263 based on the data source type. 264 """ 265 266 name: str 267 type: Type[Union[str, int, float]] 268 required_for: List[DataSourceType] = [] 269 not_allowed_for: List[DataSourceType] = [] 270 271 272class DataSource(BaseModel): 273 """ 274 Represents the origin of data, either human or synthetic, with associated properties. 275 276 Properties vary based on the source type - for synthetic sources this includes 277 model information, for human sources this includes creator information. 278 """ 279 280 type: DataSourceType 281 properties: Dict[str, str | int | float] = Field( 282 default={}, 283 description="Properties describing the data source. For synthetic things like model. 
For human, the human's name.", 284 ) 285 286 _data_source_properties = [ 287 DataSourceProperty( 288 name="created_by", 289 type=str, 290 required_for=[DataSourceType.human], 291 not_allowed_for=[DataSourceType.synthetic], 292 ), 293 DataSourceProperty( 294 name="model_name", 295 type=str, 296 required_for=[DataSourceType.synthetic], 297 not_allowed_for=[DataSourceType.human], 298 ), 299 DataSourceProperty( 300 name="model_provider", 301 type=str, 302 required_for=[DataSourceType.synthetic], 303 not_allowed_for=[DataSourceType.human], 304 ), 305 DataSourceProperty( 306 name="adapter_name", 307 type=str, 308 required_for=[DataSourceType.synthetic], 309 not_allowed_for=[DataSourceType.human], 310 ), 311 DataSourceProperty( 312 name="prompt_builder_name", 313 type=str, 314 not_allowed_for=[DataSourceType.human], 315 ), 316 ] 317 318 @model_validator(mode="after") 319 def validate_type(self) -> "DataSource": 320 if self.type not in DataSourceType: 321 raise ValueError(f"Invalid data source type: {self.type}") 322 return self 323 324 @model_validator(mode="after") 325 def validate_properties(self) -> "DataSource": 326 for prop in self._data_source_properties: 327 # Check the property type is correct 328 if prop.name in self.properties: 329 if not isinstance(self.properties[prop.name], prop.type): 330 raise ValueError( 331 f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source" 332 ) 333 # Check the property is required for the data source type 334 if self.type in prop.required_for: 335 if prop.name not in self.properties: 336 raise ValueError( 337 f"'{prop.name}' is required for {self.type} data source" 338 ) 339 # Check the property is not allowed for the data source type 340 elif self.type in prop.not_allowed_for and prop.name in self.properties: 341 raise ValueError( 342 f"'{prop.name}' is not allowed for {self.type} data source" 343 ) 344 return self 345 346 @model_validator(mode="after") 347 def validate_no_empty_properties(self) -> Self: 348 for prop, value in self.properties.items(): 349 if isinstance(value, str) and value == "": 350 raise ValueError( 351 f"Property '{prop}' must be a non-empty string for {self.type} data source" 352 ) 353 return self 354 355 356class TaskRun(KilnParentedModel): 357 """ 358 Represents a single execution of a Task. 359 360 Contains the input used, its source, the output produced, and optional 361 repair information if the output needed correction. 362 """ 363 364 input: str = Field( 365 description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input." 366 ) 367 input_source: DataSource | None = Field( 368 default=None, description="The source of the input: human or synthetic." 369 ) 370 371 output: TaskOutput = Field(description="The output of the task run.") 372 repair_instructions: str | None = Field( 373 default=None, 374 description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.", 375 ) 376 repaired_output: TaskOutput | None = Field( 377 default=None, 378 description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. 
If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.", 379 ) 380 intermediate_outputs: Dict[str, str] | None = Field( 381 default=None, 382 description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.", 383 ) 384 385 def parent_task(self) -> Task | None: 386 if not isinstance(self.parent, Task): 387 return None 388 return self.parent 389 390 @model_validator(mode="after") 391 def validate_input_format(self) -> Self: 392 task = self.parent_task() 393 if task is None: 394 # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving) 395 return self 396 397 # validate output 398 if task.input_json_schema is not None: 399 try: 400 validate_schema(json.loads(self.input), task.input_json_schema) 401 except json.JSONDecodeError: 402 raise ValueError("Input is not a valid JSON object") 403 except jsonschema.exceptions.ValidationError as e: 404 raise ValueError(f"Input does not match task input schema: {e}") 405 return self 406 407 @model_validator(mode="after") 408 def validate_output_format(self) -> Self: 409 task = self.parent_task() 410 if task is None: 411 return self 412 413 self.output.validate_output_format(task) 414 return self 415 416 @model_validator(mode="after") 417 def validate_repaired_output(self) -> Self: 418 if self.repaired_output is not None: 419 if self.repaired_output.rating is not None: 420 raise ValueError( 421 "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed." 422 ) 423 if self.repair_instructions is None and self.repaired_output is not None: 424 raise ValueError( 425 "Repair instructions are required if providing a repaired output." 426 ) 427 if self.repair_instructions is not None and self.repaired_output is None: 428 raise ValueError( 429 "A repaired output is required if providing repair instructions." 430 ) 431 return self 432 433 @model_validator(mode="after") 434 def validate_input_source(self, info: ValidationInfo) -> Self: 435 # On strict mode and not loaded from file, we validate input_source is not None. 436 # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data. 437 if not strict_mode(): 438 return self 439 if self.loaded_from_file(info): 440 return self 441 if self.input_source is None: 442 raise ValueError("input_source is required when strict mode is enabled") 443 return self 444 445 446# Define the type alias for clarity 447DatasetFilter = Callable[[TaskRun], bool] 448 449 450def AllDatasetFilter(_: TaskRun) -> bool: 451 return True 452 453 454def HighRatingDatasetFilter(task_run: TaskRun) -> bool: 455 if task_run.output is None or task_run.output.rating is None: 456 return False 457 return task_run.output.rating.is_high_quality() 458 459 460class DatasetSplitDefinition(BaseModel): 461 """ 462 A definition of a split in a dataset. 463 464 Example: name="train", description="The training set", percentage=0.8 (80% of the dataset) 465 """ 466 467 name: str = NAME_FIELD 468 description: str | None = Field( 469 default=None, 470 description="A description of the dataset for you and your team. 
Not used in training.", 471 ) 472 percentage: float = Field( 473 ge=0.0, 474 le=1.0, 475 description="The percentage of the dataset that this split represents (between 0 and 1).", 476 ) 477 478 479AllSplitDefinition: list[DatasetSplitDefinition] = [ 480 DatasetSplitDefinition(name="all", percentage=1.0) 481] 482Train80Test20SplitDefinition: list[DatasetSplitDefinition] = [ 483 DatasetSplitDefinition(name="train", percentage=0.8), 484 DatasetSplitDefinition(name="test", percentage=0.2), 485] 486Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [ 487 DatasetSplitDefinition(name="train", percentage=0.6), 488 DatasetSplitDefinition(name="test", percentage=0.2), 489 DatasetSplitDefinition(name="val", percentage=0.2), 490] 491 492 493class DatasetSplit(KilnParentedModel): 494 """ 495 A collection of task runs, with optional splits (train, test, validation). 496 497 Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks. 498 499 Maintains a list of IDs for each split, to avoid data duplication. 500 """ 501 502 name: str = NAME_FIELD 503 description: str | None = Field( 504 default=None, 505 description="A description of the dataset for you and your team. Not used in training.", 506 ) 507 splits: list[DatasetSplitDefinition] = Field( 508 default_factory=list, 509 description="The splits in the dataset.", 510 ) 511 split_contents: dict[str, list[str]] = Field( 512 description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.", 513 ) 514 515 @model_validator(mode="after") 516 def validate_split_percentages(self) -> "DatasetSplit": 517 total = sum(split.percentage for split in self.splits) 518 if not math.isclose(total, 1.0, rel_tol=1e-9): 519 raise ValueError(f"The sum of split percentages must be 1.0 (got {total})") 520 return self 521 522 @classmethod 523 def from_task( 524 cls, 525 name: str, 526 task: "Task", 527 splits: list[DatasetSplitDefinition], 528 filter: DatasetFilter = AllDatasetFilter, 529 description: str | None = None, 530 ): 531 """ 532 Build a dataset split from a task. 
533 """ 534 split_contents = cls.build_split_contents(task, splits, filter) 535 return cls( 536 parent=task, 537 name=name, 538 description=description, 539 splits=splits, 540 split_contents=split_contents, 541 ) 542 543 @classmethod 544 def build_split_contents( 545 cls, 546 task: "Task", 547 splits: list[DatasetSplitDefinition], 548 filter: DatasetFilter, 549 ) -> dict[str, list[str]]: 550 valid_ids = [] 551 for task_run in task.runs(): 552 if filter(task_run): 553 valid_ids.append(task_run.id) 554 555 # Shuffle and split by split percentage 556 random.shuffle(valid_ids) 557 split_contents = {} 558 start_idx = 0 559 remaining_items = len(valid_ids) 560 561 # Handle all splits except the last one 562 for split in splits[:-1]: 563 split_size = round(len(valid_ids) * split.percentage) 564 split_contents[split.name] = valid_ids[start_idx : start_idx + split_size] 565 start_idx += split_size 566 remaining_items -= split_size 567 568 # Last split gets all remaining items (for rounding) 569 if splits: 570 split_contents[splits[-1].name] = valid_ids[start_idx:] 571 572 return split_contents 573 574 def parent_task(self) -> "Task | None": 575 # inline import to avoid circular import 576 from kiln_ai.datamodel import Task 577 578 if not isinstance(self.parent, Task): 579 return None 580 return self.parent 581 582 def missing_count(self) -> int: 583 """ 584 Returns: 585 int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset 586 """ 587 parent = self.parent_task() 588 if parent is None: 589 raise ValueError("DatasetSplit has no parent task") 590 591 runs = parent.runs() 592 all_ids = set(run.id for run in runs) 593 all_ids_in_splits = set() 594 for ids in self.split_contents.values(): 595 all_ids_in_splits.update(ids) 596 missing = all_ids_in_splits - all_ids 597 return len(missing) 598 599 600class TaskRequirement(BaseModel): 601 """ 602 Defines a specific requirement that should be met by task outputs. 603 604 Includes an identifier, name, description, instruction for meeting the requirement, 605 and priority level. 606 """ 607 608 id: ID_TYPE = ID_FIELD 609 name: str = SHORT_NAME_FIELD 610 description: str | None = Field(default=None) 611 instruction: str = Field(min_length=1) 612 priority: Priority = Field(default=Priority.p2) 613 614 615class TaskDeterminism(str, Enum): 616 """ 617 Defines how strictly task outputs should match expected results. 618 619 - deterministic: Requires exact matches 620 - semantic_match: Allows different wording with same meaning 621 - flexible: Allows variation in both wording and meaning within requirements 622 """ 623 624 deterministic = "deterministic" # Expect exact match 625 semantic_match = "semantic_match" # Expect same meaning, but flexible on expression of the meaning 626 flexible = "flexible" # Flexible on semantic output. Eval should be custom based on parsing requirements. 627 628 629class Task( 630 KilnParentedModel, 631 KilnParentModel, 632 parent_of={ 633 "runs": TaskRun, 634 "dataset_splits": DatasetSplit, 635 "finetunes": Finetune, 636 }, 637): 638 """ 639 Represents a specific task to be performed, with associated requirements and validation rules. 640 641 Contains the task definition, requirements, input/output schemas, and maintains 642 a collection of task runs. 643 """ 644 645 name: str = NAME_FIELD 646 description: str | None = Field( 647 default=None, 648 description="A description of the task for you and your team. 
Will not be used in prompts/training/validation.", 649 ) 650 instruction: str = Field( 651 min_length=1, 652 description="The instructions for the task. Will be used in prompts/training/validation.", 653 ) 654 requirements: List[TaskRequirement] = Field(default=[]) 655 output_json_schema: JsonObjectSchema | None = None 656 input_json_schema: JsonObjectSchema | None = None 657 thinking_instruction: str | None = Field( 658 default=None, 659 description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.", 660 ) 661 662 def output_schema(self) -> Dict | None: 663 if self.output_json_schema is None: 664 return None 665 return schema_from_json_str(self.output_json_schema) 666 667 def input_schema(self) -> Dict | None: 668 if self.input_json_schema is None: 669 return None 670 return schema_from_json_str(self.input_json_schema) 671 672 # Needed for typechecking. TODO P2: fix this in KilnParentModel 673 def runs(self) -> list[TaskRun]: 674 return super().runs() # type: ignore 675 676 def dataset_splits(self) -> list[DatasetSplit]: 677 return super().dataset_splits() # type: ignore 678 679 def finetunes(self) -> list[Finetune]: 680 return super().finetunes() # type: ignore 681 682 683class Project(KilnParentModel, parent_of={"tasks": Task}): 684 """ 685 A collection of related tasks. 686 687 Projects organize tasks into logical groups and provide high-level descriptions 688 of the overall goals. 689 """ 690 691 name: str = NAME_FIELD 692 description: str | None = Field( 693 default=None, 694 description="A description of the project for you and your team. Will not be used in prompts/training/validation.", 695 ) 696 697 # Needed for typechecking. TODO P2: fix this in KilnParentModel 698 def tasks(self) -> list[Task]: 699 return super().tasks() # type: ignore
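For the dataset types defined above, the following hedged sketch freezes a task's highly rated runs into train/test splits. It assumes task is an existing Task whose runs are already persisted, since build_split_contents iterates task.runs().

from kiln_ai.datamodel import (
    DatasetSplit,
    HighRatingDatasetFilter,
    Train80Test20SplitDefinition,
)

split = DatasetSplit.from_task(
    name="high_quality_v1",
    task=task,  # an existing Task with saved runs
    splits=Train80Test20SplitDefinition,
    filter=HighRatingDatasetFilter,  # keep only runs rated 4+ stars
)
train_run_ids = split.split_contents["train"]  # ~80% of the filtered run IDs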
class Task(
    KilnParentedModel,
    KilnParentModel,
    parent_of={
        "runs": TaskRun,
        "dataset_splits": DatasetSplit,
        "finetunes": Finetune,
    },
):
    """
    Represents a specific task to be performed, with associated requirements and validation rules.

    Contains the task definition, requirements, input/output schemas, and maintains
    a collection of task runs.
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
    )
    instruction: str = Field(
        min_length=1,
        description="The instructions for the task. Will be used in prompts/training/validation.",
    )
    requirements: List[TaskRequirement] = Field(default=[])
    output_json_schema: JsonObjectSchema | None = None
    input_json_schema: JsonObjectSchema | None = None
    thinking_instruction: str | None = Field(
        default=None,
        description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.",
    )

    def output_schema(self) -> Dict | None:
        if self.output_json_schema is None:
            return None
        return schema_from_json_str(self.output_json_schema)

    def input_schema(self) -> Dict | None:
        if self.input_json_schema is None:
            return None
        return schema_from_json_str(self.input_json_schema)

    # Needed for typechecking. TODO P2: fix this in KilnParentModel
    def runs(self) -> list[TaskRun]:
        return super().runs()  # type: ignore

    def dataset_splits(self) -> list[DatasetSplit]:
        return super().dataset_splits()  # type: ignore

    def finetunes(self) -> list[Finetune]:
        return super().finetunes()  # type: ignore
Represents a specific task to be performed, with associated requirements and validation rules.
Contains the task definition, requirements, input/output schemas, and maintains a collection of task runs.
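An illustrative construction (names and schema are placeholders): schemas are stored as JSON strings and parsed back on demand via output_schema()/input_schema().

import json

from kiln_ai.datamodel import Priority, Task, TaskRequirement

schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}},
    "required": ["name"],
}
task = Task(
    name="extract_name",
    instruction="Extract the person's name from the input text.",
    requirements=[
        TaskRequirement(
            name="grounded",
            instruction="Only return a name that appears in the input.",
            priority=Priority.p1,
        )
    ],
    output_json_schema=json.dumps(schema),  # stored as a JSON string
)
assert task.output_schema() == schema  # parsed back to a dict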
class Project(KilnParentModel, parent_of={"tasks": Task}):
    """
    A collection of related tasks.

    Projects organize tasks into logical groups and provide high-level descriptions
    of the overall goals.
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the project for you and your team. Will not be used in prompts/training/validation.",
    )

    # Needed for typechecking. TODO P2: fix this in KilnParentModel
    def tasks(self) -> list[Task]:
        return super().tasks()  # type: ignore
A collection of related tasks.
Projects organize tasks into logical groups and provide high-level descriptions of the overall goals.
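A short sketch (names illustrative): tasks attach to a project via the parent relationship inherited from KilnParentedModel.

from kiln_ai.datamodel import Project, Task

project = Project(name="Research", description="Internal research tasks")
task = Task(
    name="classify",
    instruction="Classify the input as spam or not spam.",
    parent=project,
)
# Once both models are saved to disk, project.tasks() enumerates the child tasks.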
class TaskRun(KilnParentedModel):
    """
    Represents a single execution of a Task.

    Contains the input used, its source, the output produced, and optional
    repair information if the output needed correction.
    """

    input: str = Field(
        description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
    )
    input_source: DataSource | None = Field(
        default=None, description="The source of the input: human or synthetic."
    )

    output: TaskOutput = Field(description="The output of the task run.")
    repair_instructions: str | None = Field(
        default=None,
        description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
    )
    repaired_output: TaskOutput | None = Field(
        default=None,
        description="A version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        default=None,
        description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.",
    )

    def parent_task(self) -> Task | None:
        if not isinstance(self.parent, Task):
            return None
        return self.parent

    @model_validator(mode="after")
    def validate_input_format(self) -> Self:
        task = self.parent_task()
        if task is None:
            # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
            return self

        # validate input
        if task.input_json_schema is not None:
            try:
                validate_schema(json.loads(self.input), task.input_json_schema)
            except json.JSONDecodeError:
                raise ValueError("Input is not a valid JSON object")
            except jsonschema.exceptions.ValidationError as e:
                raise ValueError(f"Input does not match task input schema: {e}")
        return self

    @model_validator(mode="after")
    def validate_output_format(self) -> Self:
        task = self.parent_task()
        if task is None:
            return self

        self.output.validate_output_format(task)
        return self

    @model_validator(mode="after")
    def validate_repaired_output(self) -> Self:
        if self.repaired_output is not None:
            if self.repaired_output.rating is not None:
                raise ValueError(
                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
                )
        if self.repair_instructions is None and self.repaired_output is not None:
            raise ValueError(
                "Repair instructions are required if providing a repaired output."
            )
        if self.repair_instructions is not None and self.repaired_output is None:
            raise ValueError(
                "A repaired output is required if providing repair instructions."
            )
        return self

    @model_validator(mode="after")
    def validate_input_source(self, info: ValidationInfo) -> Self:
        # On strict mode and not loaded from file, we validate input_source is not None.
        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
        if not strict_mode():
            return self
        if self.loaded_from_file(info):
            return self
        if self.input_source is None:
            raise ValueError("input_source is required when strict mode is enabled")
        return self
Represents a single execution of a Task.
Contains the input used, its source, the output produced, and optional repair information if the output needed correction.
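The repair fields travel together, as the validators below enforce; an illustrative sketch:

from kiln_ai.datamodel import TaskOutput, TaskRun

run = TaskRun(
    input="What is 2 + 2?",
    output=TaskOutput(output="5"),
    # Both repair fields must be provided together, and the repaired output
    # must not carry a rating (it is assumed perfect once fixed).
    repair_instructions="The sum is wrong: 2 + 2 = 4.",
    repaired_output=TaskOutput(output="4"),
)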
    @model_validator(mode="after")
    def validate_input_format(self) -> Self:
        task = self.parent_task()
        if task is None:
            # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
            return self

        # validate input
        if task.input_json_schema is not None:
            try:
                validate_schema(json.loads(self.input), task.input_json_schema)
            except json.JSONDecodeError:
                raise ValueError("Input is not a valid JSON object")
            except jsonschema.exceptions.ValidationError as e:
                raise ValueError(f"Input does not match task input schema: {e}")
        return self
    @model_validator(mode="after")
    def validate_repaired_output(self) -> Self:
        if self.repaired_output is not None:
            if self.repaired_output.rating is not None:
                raise ValueError(
                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
                )
        if self.repair_instructions is None and self.repaired_output is not None:
            raise ValueError(
                "Repair instructions are required if providing a repaired output."
            )
        if self.repair_instructions is not None and self.repaired_output is None:
            raise ValueError(
                "A repaired output is required if providing repair instructions."
            )
        return self
    @model_validator(mode="after")
    def validate_input_source(self, info: ValidationInfo) -> Self:
        # On strict mode and not loaded from file, we validate input_source is not None.
        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
        if not strict_mode():
            return self
        if self.loaded_from_file(info):
            return self
        if self.input_source is None:
            raise ValueError("input_source is required when strict mode is enabled")
        return self
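A sketch of the strict-mode behavior this validator implements (shown here with TaskOutput's equivalent source check): newly created records must carry a source, while data loaded from file is accepted as-is.

from pydantic import ValidationError

from kiln_ai.datamodel import TaskOutput, set_strict_mode

set_strict_mode(True)
try:
    TaskOutput(output="result")  # no source set
except ValidationError as e:
    print(e)  # "Output source is required when strict mode is enabled"
set_strict_mode(False)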
class TaskOutput(KilnBaseModel):
    """
    An output for a specific task run.

    Contains the actual output content, its source (human or synthetic),
    and optional rating information.
    """

    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
    )
    source: DataSource | None = Field(
        description="The source of the output: human or synthetic.",
        default=None,
    )
    rating: TaskOutputRating | None = Field(
        default=None, description="The rating of the output"
    )

    def validate_output_format(self, task: Task) -> Self:
        # validate output
        if task.output_json_schema is not None:
            try:
                validate_schema(json.loads(self.output), task.output_json_schema)
            except json.JSONDecodeError:
                raise ValueError("Output is not a valid JSON object")
            except jsonschema.exceptions.ValidationError as e:
                raise ValueError(f"Output does not match task output schema: {e}")
        return self

    @model_validator(mode="after")
    def validate_output_source(self, info: ValidationInfo) -> Self:
        # On strict mode and not loaded from file, we validate output_source is not None.
        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
        if not strict_mode():
            return self
        if self.loaded_from_file(info):
            return self
        if self.source is None:
            raise ValueError("Output source is required when strict mode is enabled")
        return self
An output for a specific task run.
Contains the actual output content, its source (human or synthetic), and optional rating information.
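Illustrative usage (schema and values are placeholders): when the parent task declares an output schema, validate_output_format() parses the output as JSON and checks it against that schema.

import json

from kiln_ai.datamodel import Task, TaskOutput

task = Task(
    name="extract_age",
    instruction="Return the age as JSON.",
    output_json_schema=json.dumps(
        {
            "type": "object",
            "properties": {"age": {"type": "integer"}},
            "required": ["age"],
        }
    ),
)
output = TaskOutput(output=json.dumps({"age": 31}))
output.validate_output_format(task)  # raises ValueError on schema mismatch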
    def validate_output_format(self, task: Task) -> Self:
        # validate output
        if task.output_json_schema is not None:
            try:
                validate_schema(json.loads(self.output), task.output_json_schema)
            except json.JSONDecodeError:
                raise ValueError("Output is not a valid JSON object")
            except jsonschema.exceptions.ValidationError as e:
                raise ValueError(f"Output does not match task output schema: {e}")
        return self
    @model_validator(mode="after")
    def validate_output_source(self, info: ValidationInfo) -> Self:
        # On strict mode and not loaded from file, we validate output_source is not None.
        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
        if not strict_mode():
            return self
        if self.loaded_from_file(info):
            return self
        if self.source is None:
            raise ValueError("Output source is required when strict mode is enabled")
        return self
class TaskOutputRating(KilnBaseModel):
    """
    A rating for a task output, including an overall rating and ratings for each requirement.

    Only supports five star ratings for now, but extensible for custom values.
    """

    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
    value: float | None = Field(
        description="The overall rating value (typically 1-5 stars).",
        default=None,
    )
    requirement_ratings: Dict[ID_TYPE, float] = Field(
        default={},
        description="The ratings of the requirements of the task. The keys are the ids of the requirements. The values are the ratings (typically 1-5 stars).",
    )

    # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc)
    def is_high_quality(self) -> bool:
        if self.type == TaskOutputRatingType.five_star:
            return self.value is not None and self.value >= 4
        return False

    @model_validator(mode="after")
    def validate_rating(self) -> Self:
        if self.type not in TaskOutputRatingType:
            raise ValueError(f"Invalid rating type: {self.type}")

        if self.type == TaskOutputRatingType.five_star:
            if self.value is not None:
                self._validate_five_star(self.value, "overall rating")
            for req_id, req_rating in self.requirement_ratings.items():
                self._validate_five_star(req_rating, f"requirement rating for {req_id}")

        return self

    def _validate_five_star(self, rating: float, rating_name: str) -> None:
        if not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type five_star must be an integer value (1.0, 2.0, 3.0, 4.0, or 5.0)"
            )
        if rating < 1 or rating > 5:
            raise ValueError(
                f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
            )
A rating for a task output, including an overall rating and ratings for each requirement.
Only supports five star ratings for now, but extensible for custom values.
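Illustrative usage: five-star values must be whole numbers from 1 to 5, and 4+ counts as high quality.

from pydantic import ValidationError

from kiln_ai.datamodel import TaskOutputRating

rating = TaskOutputRating(value=5.0)
assert rating.is_high_quality()

try:
    TaskOutputRating(value=4.5)  # fractional stars are rejected
except ValidationError as e:
    print(e)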
    @model_validator(mode="after")
    def validate_rating(self) -> Self:
        if self.type not in TaskOutputRatingType:
            raise ValueError(f"Invalid rating type: {self.type}")

        if self.type == TaskOutputRatingType.five_star:
            if self.value is not None:
                self._validate_five_star(self.value, "overall rating")
            for req_id, req_rating in self.requirement_ratings.items():
                self._validate_five_star(req_rating, f"requirement rating for {req_id}")

        return self
class Priority(IntEnum):
    """Defines priority levels for tasks and requirements, where P0 is highest priority."""

    p0 = 0
    p1 = 1
    p2 = 2
    p3 = 3
Defines priority levels for tasks and requirements, where P0 is highest priority.
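Because Priority is an IntEnum, the numeric ordering is inverted relative to importance: p0 compares lowest but ranks highest.

from kiln_ai.datamodel import Priority

assert Priority.p0 < Priority.p3  # numerically lower...
most_important_first = sorted([Priority.p2, Priority.p0])  # ...so sorting puts p0 first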
class DataSource(BaseModel):
    """
    Represents the origin of data, either human or synthetic, with associated properties.

    Properties vary based on the source type - for synthetic sources this includes
    model information, for human sources this includes creator information.
    """

    type: DataSourceType
    properties: Dict[str, str | int | float] = Field(
        default={},
        description="Properties describing the data source. For synthetic sources: the model details. For human sources: the human's name.",
    )

    _data_source_properties = [
        DataSourceProperty(
            name="created_by",
            type=str,
            required_for=[DataSourceType.human],
            not_allowed_for=[DataSourceType.synthetic],
        ),
        DataSourceProperty(
            name="model_name",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            name="model_provider",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            name="adapter_name",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            name="prompt_builder_name",
            type=str,
            not_allowed_for=[DataSourceType.human],
        ),
    ]

    @model_validator(mode="after")
    def validate_type(self) -> "DataSource":
        if self.type not in DataSourceType:
            raise ValueError(f"Invalid data source type: {self.type}")
        return self

    @model_validator(mode="after")
    def validate_properties(self) -> "DataSource":
        for prop in self._data_source_properties:
            # Check the property type is correct
            if prop.name in self.properties:
                if not isinstance(self.properties[prop.name], prop.type):
                    raise ValueError(
                        f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
                    )
            # Check the property is required for the data source type
            if self.type in prop.required_for:
                if prop.name not in self.properties:
                    raise ValueError(
                        f"'{prop.name}' is required for {self.type} data source"
                    )
            # Check the property is not allowed for the data source type
            elif self.type in prop.not_allowed_for and prop.name in self.properties:
                raise ValueError(
                    f"'{prop.name}' is not allowed for {self.type} data source"
                )
        return self

    @model_validator(mode="after")
    def validate_no_empty_properties(self) -> Self:
        for prop, value in self.properties.items():
            if isinstance(value, str) and value == "":
                raise ValueError(
                    f"Property '{prop}' must be a non-empty string for {self.type} data source"
                )
        return self
Represents the origin of data, either human or synthetic, with associated properties.
Properties vary based on the source type - for synthetic sources this includes model information, for human sources this includes creator information.
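Illustrative usage: the property rules below differ per source type, so a human source needs created_by while a synthetic source needs the model properties.

from pydantic import ValidationError

from kiln_ai.datamodel import DataSource, DataSourceType

human = DataSource(
    type=DataSourceType.human,
    properties={"created_by": "alice"},
)

try:
    # 'created_by' is not allowed for synthetic sources (and 'model_name',
    # 'model_provider', and 'adapter_name' are missing).
    DataSource(type=DataSourceType.synthetic, properties={"created_by": "alice"})
except ValidationError as e:
    print(e)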
    @model_validator(mode="after")
    def validate_properties(self) -> "DataSource":
        for prop in self._data_source_properties:
            # Check the property type is correct
            if prop.name in self.properties:
                if not isinstance(self.properties[prop.name], prop.type):
                    raise ValueError(
                        f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
                    )
            # Check the property is required for the data source type
            if self.type in prop.required_for:
                if prop.name not in self.properties:
                    raise ValueError(
                        f"'{prop.name}' is required for {self.type} data source"
                    )
            # Check the property is not allowed for the data source type
            elif self.type in prop.not_allowed_for and prop.name in self.properties:
                raise ValueError(
                    f"'{prop.name}' is not allowed for {self.type} data source"
                )
        return self
    @model_validator(mode="after")
    def validate_no_empty_properties(self) -> Self:
        for prop, value in self.properties.items():
            if isinstance(value, str) and value == "":
                raise ValueError(
                    f"Property '{prop}' must be a non-empty string for {self.type} data source"
                )
        return self
class DataSourceType(str, Enum):
    """
    The source type of a piece of data.

    Human: a human created the data
    Synthetic: a model created the data
    """

    human = "human"
    synthetic = "synthetic"
The source type of a piece of data.
Human: a human created the data
Synthetic: a model created the data
class DataSourceProperty(BaseModel):
    """
    Defines a property that can be associated with a data source.

    Includes validation rules for when properties are required or not allowed
    based on the data source type.
    """

    name: str
    type: Type[Union[str, int, float]]
    required_for: List[DataSourceType] = []
    not_allowed_for: List[DataSourceType] = []
Defines a property that can be associated with a data source.
Includes validation rules for when properties are required or not allowed based on the data source type.
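A hedged sketch of declaring such a rule; 'reviewed_by' is a hypothetical property for illustration, not one the library defines.

from kiln_ai.datamodel import DataSourceProperty, DataSourceType

reviewed_by = DataSourceProperty(
    name="reviewed_by",  # hypothetical property, not built in
    type=str,
    required_for=[DataSourceType.human],
    not_allowed_for=[DataSourceType.synthetic],
)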
class TaskOutputRatingType(str, Enum):
    """Defines the types of rating systems available for task outputs."""

    five_star = "five_star"
    custom = "custom"
Defines the types of rating systems available for task outputs.
class TaskRequirement(BaseModel):
    """
    Defines a specific requirement that should be met by task outputs.

    Includes an identifier, name, description, instruction for meeting the requirement,
    and priority level.
    """

    id: ID_TYPE = ID_FIELD
    name: str = SHORT_NAME_FIELD
    description: str | None = Field(default=None)
    instruction: str = Field(min_length=1)
    priority: Priority = Field(default=Priority.p2)
Defines a specific requirement that should be met by task outputs.
Includes an identifier, name, description, instruction for meeting the requirement, and priority level.
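Illustrative usage: requirements attach to a Task, and per-run scores for them are keyed by requirement ID in TaskOutputRating.requirement_ratings.

from kiln_ai.datamodel import Priority, TaskOutputRating, TaskRequirement

req = TaskRequirement(
    name="concise",
    instruction="Keep the answer under two sentences.",
    priority=Priority.p1,
)
rating = TaskOutputRating(
    value=4.0,
    requirement_ratings={req.id: 5.0},  # keyed by the requirement's ID
)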
class TaskDeterminism(str, Enum):
    """
    Defines how strictly task outputs should match expected results.

    - deterministic: Requires exact matches
    - semantic_match: Allows different wording with same meaning
    - flexible: Allows variation in both wording and meaning within requirements
    """

    deterministic = "deterministic"  # Expect exact match
    semantic_match = "semantic_match"  # Expect same meaning, but flexible on expression of the meaning
    flexible = "flexible"  # Flexible on semantic output. Eval should be custom based on parsing requirements.
Defines how strictly task outputs should match expected results.
- deterministic: Requires exact matches
- semantic_match: Allows different wording with same meaning
- flexible: Allows variation in both wording and meaning within requirements