kiln_ai.adapters.data_gen.data_gen_task
import json

from pydantic import BaseModel

from kiln_ai.adapters.prompt_builders import SimplePromptBuilder
from kiln_ai.datamodel import Project, Task

from .data_gen_prompts import (
    SAMPLE_GENERATION_PROMPT,
    TREE_GENERATION_PROMPT,
)


class DataGenCategoriesTaskInput(BaseModel):
    """Input model for generating categories/subtopics.

    Attributes:
        node_path: List of strings representing the hierarchical path to current node
        system_prompt: System prompt to guide the AI generation
        num_subtopics: Number of subtopics to generate
        human_guidance: Optional human guidance to influence generation
        existing_topics: Optional list of existing topics to avoid duplication
    """

    # NOTE: the class docstring above is kept verbatim on purpose; pydantic
    # includes it as the description in model_json_schema(), which
    # DataGenCategoriesTask serialises into its input_json_schema.

    node_path: list[str]
    system_prompt: str
    num_subtopics: int
    human_guidance: str | None = None
    existing_topics: list[str] | None = None

    @classmethod
    def from_task(
        cls,
        task: Task,
        node_path: list[str] | None = None,
        num_subtopics: int = 6,
        human_guidance: str | None = None,
        existing_topics: list[str] | None = None,
    ) -> "DataGenCategoriesTaskInput":
        """Create a DataGenCategoriesTaskInput instance from a Task.

        Args:
            task: The source Task object; its prompt (built with
                SimplePromptBuilder) becomes ``system_prompt``.
            node_path: Path to current node in topic hierarchy. ``None``
                (the default) is treated as an empty path.
            num_subtopics: Number of subtopics to generate
            human_guidance: Optional guidance for generation
            existing_topics: Optional list of existing topics

        Returns:
            A new DataGenCategoriesTaskInput instance
        """
        prompt_builder = SimplePromptBuilder(task=task)
        return cls(
            # A None sentinel replaces the former mutable ``[]`` default so no
            # list object is shared between calls.
            node_path=[] if node_path is None else node_path,
            num_subtopics=num_subtopics,
            human_guidance=human_guidance,
            existing_topics=existing_topics,
            system_prompt=prompt_builder.build_prompt(),
        )


class DataGenCategoriesTaskOutput(BaseModel):
    """Output model for generated categories/subtopics.

    Attributes:
        subtopics: List of generated subtopic strings
    """

    subtopics: list[str]


class DataGenCategoriesTask(Task, parent_of={}):
    """Task for generating hierarchical categories/subtopics.

    Generates synthetic data categories which can be used to generate
    training data for model learning.
    """

    def __init__(self):
        """Configure the task with the tree-generation prompt and I/O schemas."""
        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
        tmp_project = Project(name="DataGen")
        super().__init__(
            name="DataGen",
            parent=tmp_project,
            description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
            instruction=TREE_GENERATION_PROMPT,
            input_json_schema=json.dumps(
                DataGenCategoriesTaskInput.model_json_schema()
            ),
            output_json_schema=json.dumps(
                DataGenCategoriesTaskOutput.model_json_schema()
            ),
        )


class DataGenSampleTaskInput(BaseModel):
    """Input model for generating data samples for a kiln task.

    Attributes:
        topic: List of strings representing the topic path
        system_prompt: System prompt to guide the AI generation
        num_samples: Number of samples to generate
        human_guidance: Optional human guidance to influence generation
    """

    # NOTE: the class docstring above is kept verbatim on purpose; pydantic
    # includes it as the description in model_json_schema(), which
    # DataGenSampleTask serialises into its input_json_schema.

    topic: list[str]
    system_prompt: str
    num_samples: int
    human_guidance: str | None = None

    @classmethod
    def from_task(
        cls,
        task: Task,
        topic: list[str] | None = None,
        num_samples: int = 8,
        human_guidance: str | None = None,
    ) -> "DataGenSampleTaskInput":
        """Create a DataGenSampleTaskInput instance from a Task.

        Args:
            task: The source Task object; its prompt (built with
                SimplePromptBuilder) becomes ``system_prompt``.
            topic: Topic path for sample generation. ``None`` (the default)
                is treated as an empty path.
            num_samples: Number of samples to generate
            human_guidance: Optional guidance for generation

        Returns:
            A new DataGenSampleTaskInput instance
        """
        prompt_builder = SimplePromptBuilder(task=task)
        return cls(
            # A None sentinel replaces the former mutable ``[]`` default so no
            # list object is shared between calls.
            topic=[] if topic is None else topic,
            num_samples=num_samples,
            human_guidance=human_guidance,
            system_prompt=prompt_builder.build_prompt(),
        )


def list_json_schema_for_task(task: Task) -> str:
    """Generate a JSON schema for a list of task inputs (json schema)

    Args:
        task: Task object whose input schema will be used

    Returns:
        JSON string representing the schema for a list of task inputs
    """
    if task.input_json_schema:
        items_schema = json.loads(task.input_json_schema)
    else:
        # No input schema set on the task: list items are plain strings.
        items_schema = {"type": "string"}

    list_schema = {
        "type": "array",
        "items": items_schema,
    }

    # The list is wrapped in an object under the "generated_samples" key.
    top_level_schema = {
        "type": "object",
        "properties": {
            "generated_samples": list_schema,
        },
        "required": ["generated_samples"],
    }

    return json.dumps(top_level_schema, ensure_ascii=False)


class DataGenSampleTask(Task, parent_of={}):
    """Task for generating data samples for a given topic.

    Generates synthetic data samples based on provided topics and subtopics.
    """

    def __init__(self, target_task: Task, num_samples: int = 8):
        """Configure the task with the sample-generation prompt and I/O schemas.

        Args:
            target_task: Task whose input schema defines the shape of each
                generated sample (via list_json_schema_for_task).
            num_samples: Accepted for interface compatibility; not referenced
                by this constructor.
        """
        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
        tmp_project = Project(name="DataGenSample")
        super().__init__(
            name="DataGenSample",
            parent=tmp_project,
            description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
            instruction=SAMPLE_GENERATION_PROMPT,
            input_json_schema=json.dumps(DataGenSampleTaskInput.model_json_schema()),
            output_json_schema=list_json_schema_for_task(target_task),
        )
class DataGenCategoriesTaskInput(BaseModel):
    """Input model for generating categories/subtopics.

    Attributes:
        node_path: List of strings representing the hierarchical path to current node
        system_prompt: System prompt to guide the AI generation
        num_subtopics: Number of subtopics to generate
        human_guidance: Optional human guidance to influence generation
        existing_topics: Optional list of existing topics to avoid duplication
    """

    # NOTE: the class docstring above is kept verbatim on purpose; pydantic
    # includes it as the description in model_json_schema(), so editing it
    # would change the generated schema.

    node_path: list[str]
    system_prompt: str
    num_subtopics: int
    human_guidance: str | None = None
    existing_topics: list[str] | None = None

    @classmethod
    def from_task(
        cls,
        task: Task,
        node_path: list[str] | None = None,
        num_subtopics: int = 6,
        human_guidance: str | None = None,
        existing_topics: list[str] | None = None,
    ) -> "DataGenCategoriesTaskInput":
        """Create a DataGenCategoriesTaskInput instance from a Task.

        Args:
            task: The source Task object; its prompt (built with
                SimplePromptBuilder) becomes ``system_prompt``.
            node_path: Path to current node in topic hierarchy. ``None``
                (the default) is treated as an empty path.
            num_subtopics: Number of subtopics to generate
            human_guidance: Optional guidance for generation
            existing_topics: Optional list of existing topics

        Returns:
            A new DataGenCategoriesTaskInput instance
        """
        prompt_builder = SimplePromptBuilder(task=task)
        return cls(
            # A None sentinel replaces the former mutable ``[]`` default so no
            # list object is shared between calls.
            node_path=[] if node_path is None else node_path,
            num_subtopics=num_subtopics,
            human_guidance=human_guidance,
            existing_topics=existing_topics,
            system_prompt=prompt_builder.build_prompt(),
        )
Input model for generating categories/subtopics.
Attributes: node_path: List of strings representing the hierarchical path to current node system_prompt: System prompt to guide the AI generation num_subtopics: Number of subtopics to generate human_guidance: Optional human guidance to influence generation existing_topics: Optional list of existing topics to avoid duplication
@classmethod
def from_task(
    cls,
    task: Task,
    node_path: list[str] | None = None,
    num_subtopics: int = 6,
    human_guidance: str | None = None,
    existing_topics: list[str] | None = None,
) -> "DataGenCategoriesTaskInput":
    """Create a DataGenCategoriesTaskInput instance from a Task.

    Args:
        task: The source Task object; its prompt (built with
            SimplePromptBuilder) becomes ``system_prompt``.
        node_path: Path to current node in topic hierarchy. ``None``
            (the default) is treated as an empty path.
        num_subtopics: Number of subtopics to generate
        human_guidance: Optional guidance for generation
        existing_topics: Optional list of existing topics

    Returns:
        A new DataGenCategoriesTaskInput instance
    """
    prompt_builder = SimplePromptBuilder(task=task)
    return cls(
        # A None sentinel replaces the former mutable ``[]`` default so no
        # list object is shared between calls.
        node_path=[] if node_path is None else node_path,
        num_subtopics=num_subtopics,
        human_guidance=human_guidance,
        existing_topics=existing_topics,
        system_prompt=prompt_builder.build_prompt(),
    )
Create a DataGenCategoriesTaskInput instance from a Task.
Args: task: The source Task object node_path: Path to current node in topic hierarchy num_subtopics: Number of subtopics to generate human_guidance: Optional guidance for generation existing_topics: Optional list of existing topics
Returns: A new DataGenCategoriesTaskInput instance
class DataGenCategoriesTaskOutput(BaseModel):
    """Output model for generated categories/subtopics.

    Attributes:
        subtopics: List of generated subtopic strings
    """

    # NOTE: pydantic includes the class docstring above as the description in
    # model_json_schema(), so rewording it changes the generated schema.

    # The generated subtopic names, one string per subtopic.
    subtopics: list[str]
Output model for generated categories/subtopics.
Attributes: subtopics: List of generated subtopic strings
class DataGenCategoriesTask(Task, parent_of={}):
    """Task for generating hierarchical categories/subtopics.

    Generates synthetic data categories which can be used to generate
    training data for model learning.
    """

    def __init__(self):
        """Configure the task with the tree-generation prompt and I/O schemas.

        The input and output JSON schemas are serialised from
        DataGenCategoriesTaskInput and DataGenCategoriesTaskOutput.
        """
        # A throwaway Project is passed as parent only to keep the typechecker
        # happy. TODO: neither it nor parent_of={} on the class should be needed.
        placeholder_project = Project(name="DataGen")
        input_schema = json.dumps(DataGenCategoriesTaskInput.model_json_schema())
        output_schema = json.dumps(DataGenCategoriesTaskOutput.model_json_schema())
        super().__init__(
            name="DataGen",
            parent=placeholder_project,
            description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
            instruction=TREE_GENERATION_PROMPT,
            input_json_schema=input_schema,
            output_json_schema=output_schema,
        )
Task for generating hierarchical categories/subtopics.
Generates synthetic data categories which can be used to generate training data for model learning.
def __init__(self):
    """Configure the task with the tree-generation prompt and I/O schemas.

    The input and output JSON schemas are serialised from
    DataGenCategoriesTaskInput and DataGenCategoriesTaskOutput.
    """
    # A throwaway Project is passed as parent only to keep the typechecker
    # happy. TODO: neither it nor parent_of={} on the class should be needed.
    placeholder_project = Project(name="DataGen")
    input_schema = json.dumps(DataGenCategoriesTaskInput.model_json_schema())
    output_schema = json.dumps(DataGenCategoriesTaskOutput.model_json_schema())
    super().__init__(
        name="DataGen",
        parent=placeholder_project,
        description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
        instruction=TREE_GENERATION_PROMPT,
        input_json_schema=input_schema,
        output_json_schema=output_schema,
    )
Create a new model by parsing and validating input data from keyword arguments.
Raises `pydantic_core.ValidationError` if the input data cannot be validated to form a valid model.
`self` is explicitly positional-only to allow `self` as a field name.
Configuration for the model; should be a dictionary conforming to `pydantic.config.ConfigDict`.
def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
    """We need to both initialize private attributes and call the user-defined model_post_init
    method.
    """
    # Private attributes are initialised first, before the user hook runs.
    init_private_attributes(self, context)
    # Presumably the model's own model_post_init, captured from an enclosing
    # scope that is not visible here.
    original_model_post_init(self, context)
We need to both initialize private attributes and call the user-defined model_post_init method.
Inherited Members
class DataGenSampleTaskInput(BaseModel):
    """Input model for generating data samples for a kiln task.

    Attributes:
        topic: List of strings representing the topic path
        system_prompt: System prompt to guide the AI generation
        num_samples: Number of samples to generate
        human_guidance: Optional human guidance to influence generation
    """

    # NOTE: the class docstring above is kept verbatim on purpose; pydantic
    # includes it as the description in model_json_schema(), so editing it
    # would change the generated schema.

    topic: list[str]
    system_prompt: str
    num_samples: int
    human_guidance: str | None = None

    @classmethod
    def from_task(
        cls,
        task: Task,
        topic: list[str] | None = None,
        num_samples: int = 8,
        human_guidance: str | None = None,
    ) -> "DataGenSampleTaskInput":
        """Create a DataGenSampleTaskInput instance from a Task.

        Args:
            task: The source Task object; its prompt (built with
                SimplePromptBuilder) becomes ``system_prompt``.
            topic: Topic path for sample generation. ``None`` (the default)
                is treated as an empty path.
            num_samples: Number of samples to generate
            human_guidance: Optional guidance for generation

        Returns:
            A new DataGenSampleTaskInput instance
        """
        prompt_builder = SimplePromptBuilder(task=task)
        return cls(
            # A None sentinel replaces the former mutable ``[]`` default so no
            # list object is shared between calls.
            topic=[] if topic is None else topic,
            num_samples=num_samples,
            human_guidance=human_guidance,
            system_prompt=prompt_builder.build_prompt(),
        )
Input model for generating data samples for a kiln task.
Attributes: topic: List of strings representing the topic path system_prompt: System prompt to guide the AI generation num_samples: Number of samples to generate human_guidance: Optional human guidance to influence generation
@classmethod
def from_task(
    cls,
    task: Task,
    topic: list[str] | None = None,
    num_samples: int = 8,
    human_guidance: str | None = None,
) -> "DataGenSampleTaskInput":
    """Create a DataGenSampleTaskInput instance from a Task.

    Args:
        task: The source Task object; its prompt (built with
            SimplePromptBuilder) becomes ``system_prompt``.
        topic: Topic path for sample generation. ``None`` (the default)
            is treated as an empty path.
        num_samples: Number of samples to generate
        human_guidance: Optional guidance for generation

    Returns:
        A new DataGenSampleTaskInput instance
    """
    prompt_builder = SimplePromptBuilder(task=task)
    return cls(
        # A None sentinel replaces the former mutable ``[]`` default so no
        # list object is shared between calls.
        topic=[] if topic is None else topic,
        num_samples=num_samples,
        human_guidance=human_guidance,
        system_prompt=prompt_builder.build_prompt(),
    )
Create a DataGenSampleTaskInput instance from a Task.
Args: task: The source Task object topic: Topic path for sample generation num_samples: Number of samples to generate human_guidance: Optional guidance for generation
Returns: A new DataGenSampleTaskInput instance
def list_json_schema_for_task(task: Task) -> str:
    """Build the JSON schema for an object wrapping a list of task inputs.

    Args:
        task: Task whose ``input_json_schema`` describes a single list item.
            When that attribute is empty or unset, items are plain strings.

    Returns:
        A JSON-encoded schema for an object with one required property,
        ``generated_samples``, holding an array of task inputs.
    """
    raw_item_schema = task.input_json_schema
    # Fall back to plain string items when the task has no input schema.
    item_schema = json.loads(raw_item_schema) if raw_item_schema else {"type": "string"}

    return json.dumps(
        {
            "type": "object",
            "properties": {
                "generated_samples": {"type": "array", "items": item_schema},
            },
            "required": ["generated_samples"],
        },
        ensure_ascii=False,
    )
Generate a JSON schema for a list of task inputs (json schema)
Args: task: Task object whose input schema will be used
Returns: JSON string representing the schema for a list of task inputs
class DataGenSampleTask(Task, parent_of={}):
    """Task for generating data samples for a given topic.

    Generates synthetic data samples based on provided topics and subtopics.
    """

    def __init__(self, target_task: Task, num_samples: int = 8):
        """Configure the task with the sample-generation prompt and I/O schemas.

        Args:
            target_task: Task whose input schema defines the shape of each
                generated sample (via list_json_schema_for_task).
            num_samples: Accepted for interface compatibility; not referenced
                by this constructor.
        """
        # A throwaway Project is passed as parent only to keep the typechecker
        # happy. TODO: neither it nor parent_of={} on the class should be needed.
        placeholder_project = Project(name="DataGenSample")
        input_schema = json.dumps(DataGenSampleTaskInput.model_json_schema())
        output_schema = list_json_schema_for_task(target_task)
        super().__init__(
            name="DataGenSample",
            parent=placeholder_project,
            description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
            instruction=SAMPLE_GENERATION_PROMPT,
            input_json_schema=input_schema,
            output_json_schema=output_schema,
        )
Task for generating data samples for a given topic.
Generates synthetic data samples based on provided topics and subtopics.
def __init__(self, target_task: Task, num_samples: int = 8):
    """Configure the task with the sample-generation prompt and I/O schemas.

    Args:
        target_task: Task whose input schema defines the shape of each
            generated sample (via list_json_schema_for_task).
        num_samples: Accepted for interface compatibility; not referenced
            by this constructor.
    """
    # A throwaway Project is passed as parent only to keep the typechecker
    # happy. TODO: neither it nor parent_of={} on the class should be needed.
    placeholder_project = Project(name="DataGenSample")
    input_schema = json.dumps(DataGenSampleTaskInput.model_json_schema())
    output_schema = list_json_schema_for_task(target_task)
    super().__init__(
        name="DataGenSample",
        parent=placeholder_project,
        description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
        instruction=SAMPLE_GENERATION_PROMPT,
        input_json_schema=input_schema,
        output_json_schema=output_schema,
    )
Create a new model by parsing and validating input data from keyword arguments.
Raises `pydantic_core.ValidationError` if the input data cannot be validated to form a valid model.
`self` is explicitly positional-only to allow `self` as a field name.
Configuration for the model; should be a dictionary conforming to `pydantic.config.ConfigDict`.
def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
    """We need to both initialize private attributes and call the user-defined model_post_init
    method.
    """
    # Private attributes are initialised first, before the user hook runs.
    init_private_attributes(self, context)
    # Presumably the model's own model_post_init, captured from an enclosing
    # scope that is not visible here.
    original_model_post_init(self, context)
We need to both initialize private attributes and call the user-defined model_post_init method.