kiln_ai.adapters.data_gen.data_gen_task

  1import json
  2
  3from pydantic import BaseModel
  4
  5from kiln_ai.adapters.prompt_builders import SimplePromptBuilder
  6from kiln_ai.datamodel import Project, Task
  7
  8from .data_gen_prompts import (
  9    SAMPLE_GENERATION_PROMPT,
 10    TREE_GENERATION_PROMPT,
 11)
 12
 13
 14class DataGenCategoriesTaskInput(BaseModel):
 15    """Input model for generating categories/subtopics.
 16
 17    Attributes:
 18        node_path: List of strings representing the hierarchical path to current node
 19        system_prompt: System prompt to guide the AI generation
 20        num_subtopics: Number of subtopics to generate
 21        human_guidance: Optional human guidance to influence generation
 22        existing_topics: Optional list of existing topics to avoid duplication
 23    """
 24
 25    node_path: list[str]
 26    system_prompt: str
 27    num_subtopics: int
 28    human_guidance: str | None = None
 29    existing_topics: list[str] | None = None
 30
 31    @classmethod
 32    def from_task(
 33        cls,
 34        task: Task,
 35        node_path: list[str] = [],
 36        num_subtopics: int = 6,
 37        human_guidance: str | None = None,
 38        existing_topics: list[str] | None = None,
 39    ) -> "DataGenCategoriesTaskInput":
 40        """Create a DataGenCategoriesTaskInput instance from a Task.
 41
 42        Args:
 43            task: The source Task object
 44            node_path: Path to current node in topic hierarchy
 45            num_subtopics: Number of subtopics to generate
 46            human_guidance: Optional guidance for generation
 47            existing_topics: Optional list of existing topics
 48
 49        Returns:
 50            A new DataGenCategoriesTaskInput instance
 51        """
 52        prompt_builder = SimplePromptBuilder(task=task)
 53        return cls(
 54            node_path=node_path,
 55            num_subtopics=num_subtopics,
 56            human_guidance=human_guidance,
 57            existing_topics=existing_topics,
 58            system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
 59        )
 60
 61
 62class DataGenCategoriesTaskOutput(BaseModel):
 63    """Output model for generated categories/subtopics.
 64
 65    Attributes:
 66        subtopics: List of generated subtopic strings
 67    """
 68
 69    subtopics: list[str]
 70
 71
 72class DataGenCategoriesTask(Task, parent_of={}):
 73    """Task for generating hierarchical categories/subtopics.
 74
 75    Generates synthetic data categories which can be used to generate
 76    training data for model learning.
 77    """
 78
 79    def __init__(self):
 80        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
 81        tmp_project = Project(name="DataGen")
 82        super().__init__(
 83            name="DataGen",
 84            parent=tmp_project,
 85            description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
 86            instruction=TREE_GENERATION_PROMPT,
 87            input_json_schema=json.dumps(
 88                DataGenCategoriesTaskInput.model_json_schema()
 89            ),
 90            output_json_schema=json.dumps(
 91                DataGenCategoriesTaskOutput.model_json_schema()
 92            ),
 93        )
 94
 95
 96class DataGenSampleTaskInput(BaseModel):
 97    """Input model for generating data samples for a kiln task.
 98
 99    Attributes:
100        topic: List of strings representing the topic path
101        system_prompt: System prompt to guide the AI generation
102        num_samples: Number of samples to generate
103        human_guidance: Optional human guidance to influence generation
104    """
105
106    topic: list[str]
107    system_prompt: str
108    num_samples: int
109    human_guidance: str | None = None
110
111    @classmethod
112    def from_task(
113        cls,
114        task: Task,
115        topic: list[str] = [],
116        num_samples: int = 8,
117        human_guidance: str | None = None,
118    ) -> "DataGenSampleTaskInput":
119        """Create a DataGenSampleTaskInput instance from a Task.
120
121        Args:
122            task: The source Task object
123            topic: Topic path for sample generation
124            num_samples: Number of samples to generate
125            human_guidance: Optional guidance for generation
126
127        Returns:
128            A new DataGenSampleTaskInput instance
129        """
130        prompt_builder = SimplePromptBuilder(task=task)
131        return cls(
132            topic=topic,
133            num_samples=num_samples,
134            human_guidance=human_guidance,
135            system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
136        )
137
138
def list_json_schema_for_task(task: Task) -> str:
    """Build a JSON schema describing a list of inputs for the given task.

    The schema is an object with one required property, ``generated_samples``,
    which is an array. Each array item follows the task's own input schema;
    if the task has no input schema, each item is a plain string.

    Args:
        task: Task object whose ``input_json_schema`` describes each list item

    Returns:
        JSON string representing the schema for a list of task inputs
    """
    raw_schema = task.input_json_schema
    # A missing or empty input schema means the task takes plain-text input.
    per_item = json.loads(raw_schema) if raw_schema else {"type": "string"}

    wrapper = {
        "type": "object",
        "properties": {
            "generated_samples": {"type": "array", "items": per_item},
        },
        "required": ["generated_samples"],
    }
    return json.dumps(wrapper, ensure_ascii=False)
167
168
class DataGenSampleTask(Task, parent_of={}):
    """Task that generates data samples for a given topic.

    Samples are synthetic and are produced from the supplied topics and
    subtopics. The output schema is a list of inputs for the target task.
    """

    def __init__(self, target_task: Task, num_samples: int = 8):
        # NOTE: num_samples is accepted but is not referenced in this constructor.
        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
        placeholder_project = Project(name="DataGenSample")

        # Input follows this module's sample-input model; output is a list of
        # whatever the target task accepts as input.
        input_schema = json.dumps(DataGenSampleTaskInput.model_json_schema())
        output_schema = list_json_schema_for_task(target_task)

        super().__init__(
            name="DataGenSample",
            parent=placeholder_project,
            description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
            instruction=SAMPLE_GENERATION_PROMPT,
            input_json_schema=input_schema,
            output_json_schema=output_schema,
        )
186
187
def wrap_task_with_guidance(original_instruction: str, guidance: str) -> str:
    """Append human guidance to an instruction as a "Special Instructions" section.

    Args:
        original_instruction: The original instruction to wrap
        guidance: The human guidance to wrap the instruction with

    Returns:
        The original instruction, followed by a "# Special Instructions"
        heading, an explanatory sentence, and the guidance enclosed in
        <additional_instructions> tags, ending with a newline
    """
    heading = f"{original_instruction}\n\n# Special Instructions\n\n"
    explanation = (
        "The above instructions are the original instructions for this task. "
        "For this execution, we've been given additional instructions. "
        "Follow both, but prioritize the additional instructions when they conflict. "
        "The additional instructions are:"
    )
    guidance_block = f"\n<additional_instructions>\n{guidance}\n</additional_instructions>\n"
    return heading + explanation + guidance_block
class DataGenCategoriesTaskInput(pydantic.main.BaseModel):
class DataGenCategoriesTaskInput(BaseModel):
    """Input model for generating categories/subtopics.

    Attributes:
        node_path: List of strings representing the hierarchical path to current node
        system_prompt: System prompt to guide the AI generation
        num_subtopics: Number of subtopics to generate
        human_guidance: Optional human guidance to influence generation
        existing_topics: Optional list of existing topics to avoid duplication
    """

    node_path: list[str]
    system_prompt: str
    num_subtopics: int
    human_guidance: str | None = None
    existing_topics: list[str] | None = None

    @classmethod
    def from_task(
        cls,
        task: Task,
        node_path: list[str] | None = None,
        num_subtopics: int = 6,
        human_guidance: str | None = None,
        existing_topics: list[str] | None = None,
    ) -> "DataGenCategoriesTaskInput":
        """Create a DataGenCategoriesTaskInput instance from a Task.

        The system prompt is built from the task with SimplePromptBuilder,
        with JSON instructions excluded.

        Args:
            task: The source Task object
            node_path: Path to current node in topic hierarchy. Omitted or
                None means an empty path.
            num_subtopics: Number of subtopics to generate
            human_guidance: Optional guidance for generation
            existing_topics: Optional list of existing topics

        Returns:
            A new DataGenCategoriesTaskInput instance
        """
        # Create the empty default per call rather than sharing one mutable
        # list object as the parameter default across every invocation.
        resolved_path = [] if node_path is None else node_path
        prompt_builder = SimplePromptBuilder(task=task)
        return cls(
            node_path=resolved_path,
            num_subtopics=num_subtopics,
            human_guidance=human_guidance,
            existing_topics=existing_topics,
            system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
        )

Input model for generating categories/subtopics.

Attributes:
- node_path: List of strings representing the hierarchical path to the current node
- system_prompt: System prompt to guide the AI generation
- num_subtopics: Number of subtopics to generate
- human_guidance: Optional human guidance to influence generation
- existing_topics: Optional list of existing topics to avoid duplication

node_path: list[str]
system_prompt: str
num_subtopics: int
human_guidance: str | None
existing_topics: list[str] | None
@classmethod
def from_task( cls, task: kiln_ai.datamodel.Task, node_path: list[str] = [], num_subtopics: int = 6, human_guidance: str | None = None, existing_topics: list[str] | None = None) -> DataGenCategoriesTaskInput:
32    @classmethod
33    def from_task(
34        cls,
35        task: Task,
36        node_path: list[str] = [],
37        num_subtopics: int = 6,
38        human_guidance: str | None = None,
39        existing_topics: list[str] | None = None,
40    ) -> "DataGenCategoriesTaskInput":
41        """Create a DataGenCategoriesTaskInput instance from a Task.
42
43        Args:
44            task: The source Task object
45            node_path: Path to current node in topic hierarchy
46            num_subtopics: Number of subtopics to generate
47            human_guidance: Optional guidance for generation
48            existing_topics: Optional list of existing topics
49
50        Returns:
51            A new DataGenCategoriesTaskInput instance
52        """
53        prompt_builder = SimplePromptBuilder(task=task)
54        return cls(
55            node_path=node_path,
56            num_subtopics=num_subtopics,
57            human_guidance=human_guidance,
58            existing_topics=existing_topics,
59            system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
60        )

Create a DataGenCategoriesTaskInput instance from a Task.

Args:
- task: The source Task object
- node_path: Path to current node in topic hierarchy
- num_subtopics: Number of subtopics to generate
- human_guidance: Optional guidance for generation
- existing_topics: Optional list of existing topics

Returns: A new DataGenCategoriesTaskInput instance

model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to `pydantic.config.ConfigDict`.

class DataGenCategoriesTaskOutput(pydantic.main.BaseModel):
class DataGenCategoriesTaskOutput(BaseModel):
    """Structured result of a category/subtopic generation run.

    Attributes:
        subtopics: The generated subtopics, one string per subtopic
    """

    subtopics: list[str]

Output model for generated categories/subtopics.

Attributes:
- subtopics: List of generated subtopic strings

subtopics: list[str]
model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to `pydantic.config.ConfigDict`.

class DataGenCategoriesTask(kiln_ai.datamodel.task.Task):
class DataGenCategoriesTask(Task, parent_of={}):
    """Task that generates hierarchical categories/subtopics.

    The categories it produces are synthetic and are meant to seed the
    generation of training data for a model.
    """

    def __init__(self):
        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
        placeholder_project = Project(name="DataGen")

        # The Task constructor takes its schemas as JSON strings, so the
        # pydantic models are serialised here.
        input_schema = json.dumps(DataGenCategoriesTaskInput.model_json_schema())
        output_schema = json.dumps(DataGenCategoriesTaskOutput.model_json_schema())

        super().__init__(
            name="DataGen",
            parent=placeholder_project,
            description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
            instruction=TREE_GENERATION_PROMPT,
            input_json_schema=input_schema,
            output_json_schema=output_schema,
        )

Task for generating hierarchical categories/subtopics.

Generates synthetic data categories which can be used to generate training data for model learning.

DataGenCategoriesTask()
80    def __init__(self):
81        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
82        tmp_project = Project(name="DataGen")
83        super().__init__(
84            name="DataGen",
85            parent=tmp_project,
86            description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
87            instruction=TREE_GENERATION_PROMPT,
88            input_json_schema=json.dumps(
89                DataGenCategoriesTaskInput.model_json_schema()
90            ),
91            output_json_schema=json.dumps(
92                DataGenCategoriesTaskOutput.model_json_schema()
93            ),
94        )

Create a new model by parsing and validating input data from keyword arguments.

Raises `pydantic_core.ValidationError` if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to `pydantic.config.ConfigDict`.

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.

class DataGenSampleTaskInput(pydantic.main.BaseModel):
 97class DataGenSampleTaskInput(BaseModel):
 98    """Input model for generating data samples for a kiln task.
 99
100    Attributes:
101        topic: List of strings representing the topic path
102        system_prompt: System prompt to guide the AI generation
103        num_samples: Number of samples to generate
104        human_guidance: Optional human guidance to influence generation
105    """
106
107    topic: list[str]
108    system_prompt: str
109    num_samples: int
110    human_guidance: str | None = None
111
112    @classmethod
113    def from_task(
114        cls,
115        task: Task,
116        topic: list[str] = [],
117        num_samples: int = 8,
118        human_guidance: str | None = None,
119    ) -> "DataGenSampleTaskInput":
120        """Create a DataGenSampleTaskInput instance from a Task.
121
122        Args:
123            task: The source Task object
124            topic: Topic path for sample generation
125            num_samples: Number of samples to generate
126            human_guidance: Optional guidance for generation
127
128        Returns:
129            A new DataGenSampleTaskInput instance
130        """
131        prompt_builder = SimplePromptBuilder(task=task)
132        return cls(
133            topic=topic,
134            num_samples=num_samples,
135            human_guidance=human_guidance,
136            system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
137        )

Input model for generating data samples for a kiln task.

Attributes:
- topic: List of strings representing the topic path
- system_prompt: System prompt to guide the AI generation
- num_samples: Number of samples to generate
- human_guidance: Optional human guidance to influence generation

topic: list[str]
system_prompt: str
num_samples: int
human_guidance: str | None
@classmethod
def from_task( cls, task: kiln_ai.datamodel.Task, topic: list[str] = [], num_samples: int = 8, human_guidance: str | None = None) -> DataGenSampleTaskInput:
112    @classmethod
113    def from_task(
114        cls,
115        task: Task,
116        topic: list[str] = [],
117        num_samples: int = 8,
118        human_guidance: str | None = None,
119    ) -> "DataGenSampleTaskInput":
120        """Create a DataGenSampleTaskInput instance from a Task.
121
122        Args:
123            task: The source Task object
124            topic: Topic path for sample generation
125            num_samples: Number of samples to generate
126            human_guidance: Optional guidance for generation
127
128        Returns:
129            A new DataGenSampleTaskInput instance
130        """
131        prompt_builder = SimplePromptBuilder(task=task)
132        return cls(
133            topic=topic,
134            num_samples=num_samples,
135            human_guidance=human_guidance,
136            system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
137        )

Create a DataGenSampleTaskInput instance from a Task.

Args:
- task: The source Task object
- topic: Topic path for sample generation
- num_samples: Number of samples to generate
- human_guidance: Optional guidance for generation

Returns: A new DataGenSampleTaskInput instance

model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to `pydantic.config.ConfigDict`.

def list_json_schema_for_task(task: kiln_ai.datamodel.Task) -> str:
def list_json_schema_for_task(task: Task) -> str:
    """Build a JSON schema describing a list of inputs for the given task.

    The schema is an object with one required property, ``generated_samples``,
    which is an array. Each array item follows the task's own input schema;
    if the task has no input schema, each item is a plain string.

    Args:
        task: Task object whose ``input_json_schema`` describes each list item

    Returns:
        JSON string representing the schema for a list of task inputs
    """
    raw_schema = task.input_json_schema
    # A missing or empty input schema means the task takes plain-text input.
    per_item = json.loads(raw_schema) if raw_schema else {"type": "string"}

    wrapper = {
        "type": "object",
        "properties": {
            "generated_samples": {"type": "array", "items": per_item},
        },
        "required": ["generated_samples"],
    }
    return json.dumps(wrapper, ensure_ascii=False)

Generate a JSON schema (serialized as a JSON string) describing a list of task inputs.

Args:
- task: Task object whose input schema will be used

Returns: JSON string representing the schema for a list of task inputs

class DataGenSampleTask(kiln_ai.datamodel.task.Task):
class DataGenSampleTask(Task, parent_of={}):
    """Task that generates data samples for a given topic.

    Samples are synthetic and are produced from the supplied topics and
    subtopics. The output schema is a list of inputs for the target task.
    """

    def __init__(self, target_task: Task, num_samples: int = 8):
        # NOTE: num_samples is accepted but is not referenced in this constructor.
        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
        placeholder_project = Project(name="DataGenSample")

        # Input follows this module's sample-input model; output is a list of
        # whatever the target task accepts as input.
        input_schema = json.dumps(DataGenSampleTaskInput.model_json_schema())
        output_schema = list_json_schema_for_task(target_task)

        super().__init__(
            name="DataGenSample",
            parent=placeholder_project,
            description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
            instruction=SAMPLE_GENERATION_PROMPT,
            input_json_schema=input_schema,
            output_json_schema=output_schema,
        )

Task for generating data samples for a given topic.

Generates synthetic data samples based on provided topics and subtopics.

DataGenSampleTask(target_task: kiln_ai.datamodel.Task, num_samples: int = 8)
176    def __init__(self, target_task: Task, num_samples: int = 8):
177        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
178        tmp_project = Project(name="DataGenSample")
179        super().__init__(
180            name="DataGenSample",
181            parent=tmp_project,
182            description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
183            instruction=SAMPLE_GENERATION_PROMPT,
184            input_json_schema=json.dumps(DataGenSampleTaskInput.model_json_schema()),
185            output_json_schema=list_json_schema_for_task(target_task),
186        )

Create a new model by parsing and validating input data from keyword arguments.

Raises `pydantic_core.ValidationError` if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to `pydantic.config.ConfigDict`.

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.

def wrap_task_with_guidance(original_instruction: str, guidance: str) -> str:
def wrap_task_with_guidance(original_instruction: str, guidance: str) -> str:
    """Append human guidance to an instruction as a "Special Instructions" section.

    Args:
        original_instruction: The original instruction to wrap
        guidance: The human guidance to wrap the instruction with

    Returns:
        The original instruction, followed by a "# Special Instructions"
        heading, an explanatory sentence, and the guidance enclosed in
        <additional_instructions> tags, ending with a newline
    """
    heading = f"{original_instruction}\n\n# Special Instructions\n\n"
    explanation = (
        "The above instructions are the original instructions for this task. "
        "For this execution, we've been given additional instructions. "
        "Follow both, but prioritize the additional instructions when they conflict. "
        "The additional instructions are:"
    )
    guidance_block = f"\n<additional_instructions>\n{guidance}\n</additional_instructions>\n"
    return heading + explanation + guidance_block

Wrap the original instruction with human guidance.

Args:
- original_instruction: The original instruction to wrap
- guidance: The human guidance to wrap the instruction with