kiln_ai.adapters.data_gen.data_gen_task
import json

from pydantic import BaseModel

from kiln_ai.adapters.prompt_builders import SimplePromptBuilder
from kiln_ai.datamodel import Project, Task

from .data_gen_prompts import (
    SAMPLE_GENERATION_PROMPT,
    TREE_GENERATION_PROMPT,
)


class DataGenCategoriesTaskInput(BaseModel):
    """Input model for generating categories/subtopics.

    Attributes:
        node_path: List of strings representing the hierarchical path to current node
        system_prompt: System prompt to guide the AI generation
        num_subtopics: Number of subtopics to generate
        human_guidance: Optional human guidance to influence generation
        existing_topics: Optional list of existing topics to avoid duplication
    """

    node_path: list[str]
    system_prompt: str
    num_subtopics: int
    human_guidance: str | None = None
    existing_topics: list[str] | None = None

    @classmethod
    def from_task(
        cls,
        task: Task,
        node_path: list[str] | None = None,
        num_subtopics: int = 6,
        human_guidance: str | None = None,
        existing_topics: list[str] | None = None,
    ) -> "DataGenCategoriesTaskInput":
        """Create a DataGenCategoriesTaskInput instance from a Task.

        Args:
            task: The source Task object
            node_path: Path to current node in topic hierarchy. ``None``
                (the default) is treated as an empty path.
            num_subtopics: Number of subtopics to generate
            human_guidance: Optional guidance for generation
            existing_topics: Optional list of existing topics

        Returns:
            A new DataGenCategoriesTaskInput instance
        """
        prompt_builder = SimplePromptBuilder(task=task)
        return cls(
            # A None sentinel replaces the former `[]` default so no list
            # object is shared between calls.
            node_path=[] if node_path is None else node_path,
            num_subtopics=num_subtopics,
            human_guidance=human_guidance,
            existing_topics=existing_topics,
            system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
        )


class DataGenCategoriesTaskOutput(BaseModel):
    """Output model for generated categories/subtopics.

    Attributes:
        subtopics: List of generated subtopic strings
    """

    subtopics: list[str]


class DataGenCategoriesTask(Task, parent_of={}):
    """Task for generating hierarchical categories/subtopics.

    Generates synthetic data categories which can be used to generate
    training data for model learning.
    """

    def __init__(self):
        """Build the task with TREE_GENERATION_PROMPT and the category I/O schemas."""
        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
        tmp_project = Project(name="DataGen")
        super().__init__(
            name="DataGen",
            parent=tmp_project,
            description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
            instruction=TREE_GENERATION_PROMPT,
            input_json_schema=json.dumps(
                DataGenCategoriesTaskInput.model_json_schema()
            ),
            output_json_schema=json.dumps(
                DataGenCategoriesTaskOutput.model_json_schema()
            ),
        )


class DataGenSampleTaskInput(BaseModel):
    """Input model for generating data samples for a kiln task.

    Attributes:
        topic: List of strings representing the topic path
        system_prompt: System prompt to guide the AI generation
        num_samples: Number of samples to generate
        human_guidance: Optional human guidance to influence generation
    """

    topic: list[str]
    system_prompt: str
    num_samples: int
    human_guidance: str | None = None

    @classmethod
    def from_task(
        cls,
        task: Task,
        topic: list[str] | None = None,
        num_samples: int = 8,
        human_guidance: str | None = None,
    ) -> "DataGenSampleTaskInput":
        """Create a DataGenSampleTaskInput instance from a Task.

        Args:
            task: The source Task object
            topic: Topic path for sample generation. ``None`` (the default)
                is treated as an empty path.
            num_samples: Number of samples to generate
            human_guidance: Optional guidance for generation

        Returns:
            A new DataGenSampleTaskInput instance
        """
        prompt_builder = SimplePromptBuilder(task=task)
        return cls(
            # A None sentinel replaces the former `[]` default so no list
            # object is shared between calls.
            topic=[] if topic is None else topic,
            num_samples=num_samples,
            human_guidance=human_guidance,
            system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
        )


def list_json_schema_for_task(task: Task) -> str:
    """Generate a JSON schema for a list of task inputs (json schema)

    The result describes an object with a single required property,
    ``generated_samples``, which is an array of items matching the task's
    input schema.

    Args:
        task: Task object whose input schema will be used

    Returns:
        JSON string representing the schema for a list of task inputs
    """
    if task.input_json_schema:
        items_schema = json.loads(task.input_json_schema)
    else:
        # No (or empty) input schema: each generated sample is a plain string.
        items_schema = {"type": "string"}

    list_schema = {
        "type": "array",
        "items": items_schema,
    }

    top_level_schema = {
        "type": "object",
        "properties": {
            "generated_samples": list_schema,
        },
        "required": ["generated_samples"],
    }

    return json.dumps(top_level_schema, ensure_ascii=False)


class DataGenSampleTask(Task, parent_of={}):
    """Task for generating data samples for a given topic.

    Generates synthetic data samples based on provided topics and subtopics.
    """

    def __init__(self, target_task: Task, num_samples: int = 8):
        """Build the sample-generation task for ``target_task``.

        Args:
            target_task: Task whose input schema defines the shape of each
                generated sample (via list_json_schema_for_task).
            num_samples: Not referenced by this constructor; retained so
                existing callers that pass it keep working.
        """
        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
        tmp_project = Project(name="DataGenSample")
        super().__init__(
            name="DataGenSample",
            parent=tmp_project,
            description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
            instruction=SAMPLE_GENERATION_PROMPT,
            input_json_schema=json.dumps(DataGenSampleTaskInput.model_json_schema()),
            output_json_schema=list_json_schema_for_task(target_task),
        )


def wrap_task_with_guidance(original_instruction: str, guidance: str) -> str:
    """Wrap the original instruction with human guidance.

    Args:
        original_instruction: The original instruction to wrap
        guidance: The human guidance to wrap the instruction with

    Returns:
        The original instruction followed by a "Special Instructions" section
        that embeds ``guidance`` inside ``<additional_instructions>`` tags.
    """
    return f"""{original_instruction}

# Special Instructions

The above instructions are the original instructions for this task. For this execution, we've been given additional instructions. Follow both, but prioritize the additional instructions when they conflict. The additional instructions are:
<additional_instructions>
{guidance}
</additional_instructions>
"""
class DataGenCategoriesTaskInput(BaseModel):
    """Input model for generating categories/subtopics.

    Attributes:
        node_path: List of strings representing the hierarchical path to current node
        system_prompt: System prompt to guide the AI generation
        num_subtopics: Number of subtopics to generate
        human_guidance: Optional human guidance to influence generation
        existing_topics: Optional list of existing topics to avoid duplication
    """

    node_path: list[str]
    system_prompt: str
    num_subtopics: int
    human_guidance: str | None = None
    existing_topics: list[str] | None = None

    @classmethod
    def from_task(
        cls,
        task: Task,
        node_path: list[str] | None = None,
        num_subtopics: int = 6,
        human_guidance: str | None = None,
        existing_topics: list[str] | None = None,
    ) -> "DataGenCategoriesTaskInput":
        """Create a DataGenCategoriesTaskInput instance from a Task.

        Args:
            task: The source Task object
            node_path: Path to current node in topic hierarchy. ``None``
                (the default) is treated as an empty path.
            num_subtopics: Number of subtopics to generate
            human_guidance: Optional guidance for generation
            existing_topics: Optional list of existing topics

        Returns:
            A new DataGenCategoriesTaskInput instance
        """
        prompt_builder = SimplePromptBuilder(task=task)
        return cls(
            # A None sentinel replaces the former `[]` default so no list
            # object is shared between calls.
            node_path=[] if node_path is None else node_path,
            num_subtopics=num_subtopics,
            human_guidance=human_guidance,
            existing_topics=existing_topics,
            system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
        )
Input model for generating categories/subtopics.
Attributes: node_path: List of strings representing the hierarchical path to current node system_prompt: System prompt to guide the AI generation num_subtopics: Number of subtopics to generate human_guidance: Optional human guidance to influence generation existing_topics: Optional list of existing topics to avoid duplication
@classmethod
def from_task(
    cls,
    task: Task,
    node_path: list[str] | None = None,
    num_subtopics: int = 6,
    human_guidance: str | None = None,
    existing_topics: list[str] | None = None,
) -> "DataGenCategoriesTaskInput":
    """Create a DataGenCategoriesTaskInput instance from a Task.

    The system prompt is built from ``task`` with SimplePromptBuilder,
    without JSON formatting instructions.

    Args:
        task: The source Task object
        node_path: Path to current node in topic hierarchy. ``None`` (the
            default) is treated as an empty path.
        num_subtopics: Number of subtopics to generate
        human_guidance: Optional guidance for generation
        existing_topics: Optional list of existing topics

    Returns:
        A new DataGenCategoriesTaskInput instance
    """
    prompt_builder = SimplePromptBuilder(task=task)
    return cls(
        # A None sentinel replaces the former `[]` default so no list object
        # is shared between calls.
        node_path=[] if node_path is None else node_path,
        num_subtopics=num_subtopics,
        human_guidance=human_guidance,
        existing_topics=existing_topics,
        system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
    )
Create a DataGenCategoriesTaskInput instance from a Task.
Args: task: The source Task object node_path: Path to current node in topic hierarchy num_subtopics: Number of subtopics to generate human_guidance: Optional guidance for generation existing_topics: Optional list of existing topics
Returns: A new DataGenCategoriesTaskInput instance
class DataGenCategoriesTaskOutput(BaseModel):
    """Result shape for a category/subtopic generation run.

    Attributes:
        subtopics: The generated subtopics, one string per subtopic
    """

    subtopics: list[str]
Output model for generated categories/subtopics.
Attributes: subtopics: List of generated subtopic strings
class DataGenCategoriesTask(Task, parent_of={}):
    """Task that produces categories/subtopics for synthetic data generation.

    The instruction is TREE_GENERATION_PROMPT; the input and output JSON
    schemas are derived from DataGenCategoriesTaskInput and
    DataGenCategoriesTaskOutput respectively.
    """

    def __init__(self):
        """Configure the task with a placeholder parent project and I/O schemas."""
        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
        placeholder_project = Project(name="DataGen")

        # Serialize the pydantic-generated schemas up front for readability.
        input_schema = json.dumps(DataGenCategoriesTaskInput.model_json_schema())
        output_schema = json.dumps(DataGenCategoriesTaskOutput.model_json_schema())

        super().__init__(
            name="DataGen",
            parent=placeholder_project,
            description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
            instruction=TREE_GENERATION_PROMPT,
            input_json_schema=input_schema,
            output_json_schema=output_schema,
        )
Task for generating hierarchical categories/subtopics.
Generates synthetic data categories which can be used to generate training data for model learning.
def __init__(self):
    """Configure the task with a placeholder parent project and I/O schemas.

    The input schema comes from DataGenCategoriesTaskInput and the output
    schema from DataGenCategoriesTaskOutput, both serialized to JSON strings.
    """
    # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
    placeholder_project = Project(name="DataGen")

    # Serialize the pydantic-generated schemas up front for readability.
    input_schema = json.dumps(DataGenCategoriesTaskInput.model_json_schema())
    output_schema = json.dumps(DataGenCategoriesTaskOutput.model_json_schema())

    super().__init__(
        name="DataGen",
        parent=placeholder_project,
        description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
        instruction=TREE_GENERATION_PROMPT,
        input_json_schema=input_schema,
        output_json_schema=output_schema,
    )
Create a new model by parsing and validating input data from keyword arguments.
Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.
`self` is explicitly positional-only to allow `self` as a field name.
Configuration for the model, should be a dictionary conforming to [`ConfigDict`][pydantic.config.ConfigDict].
def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
    """We need to both initialize private attributes and call the user-defined model_post_init
    method.

    Both steps receive the same ``self`` and ``context``; private attributes
    are initialized first, then the original ``model_post_init`` runs.
    """
    # NOTE(review): init_private_attributes and original_model_post_init are
    # not defined in this snippet; presumably they come from the enclosing
    # scope. Confirm against the full source.
    init_private_attributes(self, context)
    original_model_post_init(self, context)
We need to both initialize private attributes and call the user-defined model_post_init method.
class DataGenSampleTaskInput(BaseModel):
    """Input model for generating data samples for a kiln task.

    Attributes:
        topic: List of strings representing the topic path
        system_prompt: System prompt to guide the AI generation
        num_samples: Number of samples to generate
        human_guidance: Optional human guidance to influence generation
    """

    topic: list[str]
    system_prompt: str
    num_samples: int
    human_guidance: str | None = None

    @classmethod
    def from_task(
        cls,
        task: Task,
        topic: list[str] | None = None,
        num_samples: int = 8,
        human_guidance: str | None = None,
    ) -> "DataGenSampleTaskInput":
        """Create a DataGenSampleTaskInput instance from a Task.

        Args:
            task: The source Task object
            topic: Topic path for sample generation. ``None`` (the default)
                is treated as an empty path.
            num_samples: Number of samples to generate
            human_guidance: Optional guidance for generation

        Returns:
            A new DataGenSampleTaskInput instance
        """
        prompt_builder = SimplePromptBuilder(task=task)
        return cls(
            # A None sentinel replaces the former `[]` default so no list
            # object is shared between calls.
            topic=[] if topic is None else topic,
            num_samples=num_samples,
            human_guidance=human_guidance,
            system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
        )
Input model for generating data samples for a kiln task.
Attributes: topic: List of strings representing the topic path system_prompt: System prompt to guide the AI generation num_samples: Number of samples to generate human_guidance: Optional human guidance to influence generation
@classmethod
def from_task(
    cls,
    task: Task,
    topic: list[str] | None = None,
    num_samples: int = 8,
    human_guidance: str | None = None,
) -> "DataGenSampleTaskInput":
    """Create a DataGenSampleTaskInput instance from a Task.

    The system prompt is built from ``task`` with SimplePromptBuilder,
    without JSON formatting instructions.

    Args:
        task: The source Task object
        topic: Topic path for sample generation. ``None`` (the default) is
            treated as an empty path.
        num_samples: Number of samples to generate
        human_guidance: Optional guidance for generation

    Returns:
        A new DataGenSampleTaskInput instance
    """
    prompt_builder = SimplePromptBuilder(task=task)
    return cls(
        # A None sentinel replaces the former `[]` default so no list object
        # is shared between calls.
        topic=[] if topic is None else topic,
        num_samples=num_samples,
        human_guidance=human_guidance,
        system_prompt=prompt_builder.build_prompt(include_json_instructions=False),
    )
Create a DataGenSampleTaskInput instance from a Task.
Args: task: The source Task object topic: Topic path for sample generation num_samples: Number of samples to generate human_guidance: Optional guidance for generation
Returns: A new DataGenSampleTaskInput instance
def list_json_schema_for_task(task: Task) -> str:
    """Build the JSON schema describing a batch of inputs for ``task``.

    The schema is an object with one required property, ``generated_samples``,
    holding an array whose items follow the task's own input schema.

    Args:
        task: Task whose ``input_json_schema`` defines each array item

    Returns:
        The wrapping schema serialized as a JSON string (non-ASCII characters
        are kept as-is rather than escaped)
    """
    raw_item_schema = task.input_json_schema
    # With no (or an empty) input schema, fall back to plain string items.
    element_schema = (
        json.loads(raw_item_schema) if raw_item_schema else {"type": "string"}
    )

    wrapper_schema = {
        "type": "object",
        "properties": {
            "generated_samples": {"type": "array", "items": element_schema},
        },
        "required": ["generated_samples"],
    }
    return json.dumps(wrapper_schema, ensure_ascii=False)
Generate a JSON schema for a list of task inputs (json schema)
Args: task: Task object whose input schema will be used
Returns: JSON string representing the schema for a list of task inputs
class DataGenSampleTask(Task, parent_of={}):
    """Task that produces synthetic data samples for a topic.

    The instruction is SAMPLE_GENERATION_PROMPT. The input schema is derived
    from DataGenSampleTaskInput; the output schema wraps the target task's
    input schema in a list via list_json_schema_for_task.
    """

    def __init__(self, target_task: Task, num_samples: int = 8):
        """Configure the task for generating inputs of ``target_task``.

        Args:
            target_task: Task whose input schema shapes each generated sample
            num_samples: Accepted but not referenced in this constructor
        """
        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
        placeholder_project = Project(name="DataGenSample")

        # Serialize both schemas up front for readability.
        input_schema = json.dumps(DataGenSampleTaskInput.model_json_schema())
        output_schema = list_json_schema_for_task(target_task)

        super().__init__(
            name="DataGenSample",
            parent=placeholder_project,
            description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
            instruction=SAMPLE_GENERATION_PROMPT,
            input_json_schema=input_schema,
            output_json_schema=output_schema,
        )
Task for generating data samples for a given topic.
Generates synthetic data samples based on provided topics and subtopics.
def __init__(self, target_task: Task, num_samples: int = 8):
    """Configure the task for generating inputs of ``target_task``.

    Args:
        target_task: Task whose input schema shapes each generated sample
            (wrapped in a list by list_json_schema_for_task)
        num_samples: Accepted but not referenced in this constructor
    """
    # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
    placeholder_project = Project(name="DataGenSample")

    # Serialize both schemas up front for readability.
    input_schema = json.dumps(DataGenSampleTaskInput.model_json_schema())
    output_schema = list_json_schema_for_task(target_task)

    super().__init__(
        name="DataGenSample",
        parent=placeholder_project,
        description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
        instruction=SAMPLE_GENERATION_PROMPT,
        input_json_schema=input_schema,
        output_json_schema=output_schema,
    )
Create a new model by parsing and validating input data from keyword arguments.
Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.
`self` is explicitly positional-only to allow `self` as a field name.
Configuration for the model, should be a dictionary conforming to [`ConfigDict`][pydantic.config.ConfigDict].
def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
    """We need to both initialize private attributes and call the user-defined model_post_init
    method.

    Both steps receive the same ``self`` and ``context``; private attributes
    are initialized first, then the original ``model_post_init`` runs.
    """
    # NOTE(review): init_private_attributes and original_model_post_init are
    # not defined in this snippet; presumably they come from the enclosing
    # scope. Confirm against the full source.
    init_private_attributes(self, context)
    original_model_post_init(self, context)
We need to both initialize private attributes and call the user-defined model_post_init method.
def wrap_task_with_guidance(original_instruction: str, guidance: str) -> str:
    """Append a human-guidance section to a task instruction.

    Args:
        original_instruction: The instruction text to extend
        guidance: Human guidance placed inside ``<additional_instructions>`` tags

    Returns:
        The original instruction, a "# Special Instructions" heading, an
        explanatory paragraph, and the tagged guidance, ending with a newline.
    """
    explanation = (
        "The above instructions are the original instructions for this task. "
        "For this execution, we've been given additional instructions. "
        "Follow both, but prioritize the additional instructions when they conflict. "
        "The additional instructions are:"
    )
    # Empty entries become the blank lines (and trailing newline) of the result.
    pieces = [
        f"{original_instruction}",
        "",
        "# Special Instructions",
        "",
        explanation,
        "<additional_instructions>",
        f"{guidance}",
        "</additional_instructions>",
        "",
    ]
    return "\n".join(pieces)
Wrap the original instruction with human guidance.
Args: original_instruction: The original instruction to wrap guidance: The human guidance to wrap the instruction with