kiln_ai.adapters.fine_tune.dataset_formatter
import json
import tempfile
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Any, Dict, Protocol
from uuid import uuid4

from kiln_ai.adapters.model_adapters.base_adapter import COT_FINAL_ANSWER_PROMPT
from kiln_ai.datamodel import DatasetSplit, FinetuneDataStrategy, TaskRun


class DatasetFormat(str, Enum):
    """Formats for dataset generation: both the file format (like JSONL) and the internal structure (like chat/toolcall)."""

    """OpenAI chat format with plaintext response"""
    OPENAI_CHAT_JSONL = "openai_chat_jsonl"

    """OpenAI chat format with json response_format"""
    OPENAI_CHAT_JSON_SCHEMA_JSONL = "openai_chat_json_schema_jsonl"

    """OpenAI chat format with tool call response"""
    OPENAI_CHAT_TOOLCALL_JSONL = "openai_chat_toolcall_jsonl"

    """HuggingFace chat template in JSONL"""
    HUGGINGFACE_CHAT_TEMPLATE_JSONL = "huggingface_chat_template_jsonl"

    """HuggingFace chat template with tool calls in JSONL"""
    HUGGINGFACE_CHAT_TEMPLATE_TOOLCALL_JSONL = (
        "huggingface_chat_template_toolcall_jsonl"
    )

    """Vertex Gemini 1.5 format (flash and pro)"""
    VERTEX_GEMINI_1_5 = "vertex_gemini_1_5"


@dataclass
class ModelTrainingData:
    input: str
    system_message: str
    final_output: str
    # These 3 are optional, and used for COT/thinking style multi-message responses
    thinking_instructions: str | None = None
    thinking: str | None = None
    thinking_final_answer_prompt: str | None = None

    def supports_cot(self) -> bool:
        return (
            self.thinking_instructions is not None
            and self.thinking is not None
            and self.thinking_final_answer_prompt is not None
        )


class FormatGenerator(Protocol):
    """Protocol for format generators"""

    def __call__(
        self,
        training_data: ModelTrainingData,
    ) -> Dict[str, Any]: ...


def build_training_data(
    task_run: TaskRun,
    system_message: str,
    include_cot: bool,
    thinking_instructions: str | None = None,
) -> ModelTrainingData:
    """
    Generate data for training.

    For the final output, use the best task output from the task run, preferring the repaired output if available.

    For thinking, use the intermediate output if it exists; otherwise leave it as None.
    """
    final_output = task_run.output.output
    if task_run.repaired_output is not None:
        final_output = task_run.repaired_output.output

    thinking = None
    thinking_final_answer_prompt = None
    parent_task = task_run.parent_task()

    if include_cot and task_run.has_thinking_training_data():
        if not parent_task:
            raise ValueError(
                "TaskRuns used for training require a parent Task to build chain-of-thought prompts. Train without COT, or save this TaskRun to a parent Task."
            )

        # Prefer reasoning to chain_of_thought if both are present
        intermediate_outputs = task_run.intermediate_outputs or {}
        thinking = intermediate_outputs.get("reasoning") or intermediate_outputs.get(
            "chain_of_thought"
        )

        thinking_final_answer_prompt = COT_FINAL_ANSWER_PROMPT

        # Always use the passed thinking instructions, but check they are present for COT
        if not thinking_instructions:
            raise ValueError(
                "Thinking instructions are required when data_strategy is final_and_intermediate"
            )

    return ModelTrainingData(
        input=task_run.input,
        system_message=system_message,
        final_output=final_output,
        thinking=thinking,
        thinking_instructions=thinking_instructions,
        thinking_final_answer_prompt=thinking_final_answer_prompt,
    )


def generate_chat_message_response(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate OpenAI chat format with plaintext response"""

    messages: list[dict[str, str | None]] = [
        {"role": "system", "content": training_data.system_message},
        {"role": "user", "content": training_data.input},
    ]

    if training_data.supports_cot():
        messages.extend(
            [
                {"role": "user", "content": training_data.thinking_instructions},
                {"role": "assistant", "content": training_data.thinking},
                {
                    "role": "user",
                    "content": training_data.thinking_final_answer_prompt,
                },
            ]
        )

    messages.append({"role": "assistant", "content": training_data.final_output})

    return {"messages": messages}


def generate_json_schema_message(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate OpenAI chat format with validated JSON response"""
    # Load and dump to ensure it's valid JSON and collapses to one line
    try:
        json_data = json.loads(training_data.final_output)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Invalid JSON in JSON Schema training set: {e}\nOutput Data: {training_data.final_output}"
        ) from e
    json_string = json.dumps(json_data, ensure_ascii=False)

    messages: list[dict[str, str | None]] = [
        {"role": "system", "content": training_data.system_message},
        {"role": "user", "content": training_data.input},
    ]

    if training_data.supports_cot():
        messages.extend(
            [
                {"role": "user", "content": training_data.thinking_instructions},
                {"role": "assistant", "content": training_data.thinking},
                {
                    "role": "user",
                    "content": training_data.thinking_final_answer_prompt,
                },
            ]
        )

    messages.append({"role": "assistant", "content": json_string})

    return {"messages": messages}


def generate_chat_message_toolcall(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate OpenAI chat format with tool call response"""
    try:
        arguments = json.loads(training_data.final_output)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON for tool call: {e}") from e

    messages: list[dict[str, Any]] = [
        {"role": "system", "content": training_data.system_message},
        {"role": "user", "content": training_data.input},
    ]

    if training_data.supports_cot():
        messages.extend(
            [
                {"role": "user", "content": training_data.thinking_instructions},
                {"role": "assistant", "content": training_data.thinking},
                {
                    "role": "user",
                    "content": training_data.thinking_final_answer_prompt,
                },
            ]
        )

    messages.append(
        {
            "role": "assistant",
            "content": None,
            "tool_calls": [
                {
                    "id": "call_1",
                    "type": "function",
                    "function": {
                        "name": "task_response",
                        # Yes, we parse then dump again. This ensures it's valid JSON, and collapses it to one line
                        "arguments": json.dumps(arguments, ensure_ascii=False),
                    },
                }
            ],
        },
    )

    return {"messages": messages}


def generate_huggingface_chat_template(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate HuggingFace chat template"""

    conversations: list[dict[str, Any]] = [
        {"role": "system", "content": training_data.system_message},
        {"role": "user", "content": training_data.input},
    ]

    if training_data.supports_cot():
        conversations.extend(
            [
                {"role": "user", "content": training_data.thinking_instructions},
                {"role": "assistant", "content": training_data.thinking},
                {"role": "user", "content": training_data.thinking_final_answer_prompt},
            ]
        )

    conversations.append({"role": "assistant", "content": training_data.final_output})

    return {"conversations": conversations}


def generate_huggingface_chat_template_toolcall(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate HuggingFace chat template with tool calls"""
    try:
        arguments = json.loads(training_data.final_output)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON for tool call: {e}") from e

    # See https://huggingface.co/docs/transformers/en/chat_templating
    conversations: list[dict[str, Any]] = [
        {"role": "system", "content": training_data.system_message},
        {"role": "user", "content": training_data.input},
    ]

    if training_data.supports_cot():
        conversations.extend(
            [
                {"role": "user", "content": training_data.thinking_instructions},
                {"role": "assistant", "content": training_data.thinking},
                {"role": "user", "content": training_data.thinking_final_answer_prompt},
            ]
        )

    conversations.append(
        {
            "role": "assistant",
            "tool_calls": [
                {
                    "type": "function",
                    "function": {
                        "name": "task_response",
                        "id": str(uuid4()).replace("-", "")[:9],
                        "arguments": arguments,
                    },
                }
            ],
        },
    )

    return {"conversations": conversations}


def generate_vertex_gemini_1_5(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate Vertex Gemini 1.5 format (flash and pro)"""
    # See https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-prepare

    contents = [
        {
            "role": "user",
            "parts": [
                {
                    "text": training_data.input,
                }
            ],
        }
    ]

    if training_data.supports_cot():
        contents.extend(
            [
                {
                    "role": "user",
                    "parts": [{"text": training_data.thinking_instructions}],
                },
                {"role": "model", "parts": [{"text": training_data.thinking}]},
                {
                    "role": "user",
                    "parts": [{"text": training_data.thinking_final_answer_prompt}],
                },
            ]
        )

    contents.append(
        {
            "role": "model",
            "parts": [{"text": training_data.final_output}],
        }
    )

    return {
        "systemInstruction": {
            "role": "system",
            "parts": [
                {
                    "text": training_data.system_message,
                }
            ],
        },
        "contents": contents,
    }


FORMAT_GENERATORS: Dict[DatasetFormat, FormatGenerator] = {
    DatasetFormat.OPENAI_CHAT_JSONL: generate_chat_message_response,
    DatasetFormat.OPENAI_CHAT_JSON_SCHEMA_JSONL: generate_json_schema_message,
    DatasetFormat.OPENAI_CHAT_TOOLCALL_JSONL: generate_chat_message_toolcall,
    DatasetFormat.HUGGINGFACE_CHAT_TEMPLATE_JSONL: generate_huggingface_chat_template,
    DatasetFormat.HUGGINGFACE_CHAT_TEMPLATE_TOOLCALL_JSONL: generate_huggingface_chat_template_toolcall,
    DatasetFormat.VERTEX_GEMINI_1_5: generate_vertex_gemini_1_5,
}


class DatasetFormatter:
    """Handles formatting of datasets into various output formats"""

    def __init__(
        self,
        dataset: DatasetSplit,
        system_message: str,
        thinking_instructions: str | None = None,
    ):
        self.dataset = dataset
        self.system_message = system_message
        self.thinking_instructions = thinking_instructions

        task = dataset.parent_task()
        if task is None:
            raise ValueError("Dataset has no parent task")
        self.task = task

    def dump_to_file(
        self,
        split_name: str,
        format_type: DatasetFormat,
        data_strategy: FinetuneDataStrategy,
        path: Path | None = None,
    ) -> Path:
        """
        Format the dataset into the specified format.

        Args:
            split_name: Name of the split to dump
            format_type: Format to generate the dataset in
            data_strategy: Whether to include chain-of-thought data (final_and_intermediate) or only final outputs
            path: Optional path to write to. If None, writes to temp directory

        Returns:
            Path to the generated file

        Note:
            The output is written in UTF-8 encoding with ensure_ascii=False to properly
            support international text content while maintaining readability.
        """
        if format_type not in FORMAT_GENERATORS:
            raise ValueError(f"Unsupported format: {format_type}")
        if split_name not in self.dataset.split_contents:
            raise ValueError(f"Split {split_name} not found in dataset")

        generator = FORMAT_GENERATORS[format_type]

        include_cot = data_strategy == FinetuneDataStrategy.final_and_intermediate

        # Write to a temp file if no path is provided
        output_path = (
            path
            or Path(tempfile.gettempdir())
            / f"{self.dataset.name} -- split-{split_name} -- format-{format_type.value} -- {'cot' if include_cot else 'no-cot'}.jsonl"
        )

        runs = self.task.runs()
        runs_by_id = {run.id: run for run in runs}

        # Generate formatted output with UTF-8 encoding
        with open(output_path, "w", encoding="utf-8") as f:
            for run_id in self.dataset.split_contents[split_name]:
                # Use .get() so a missing run raises our descriptive error below, not a KeyError
                task_run = runs_by_id.get(run_id)
                if task_run is None:
                    raise ValueError(
                        f"Task run {run_id} not found. This is required by this dataset."
                    )

                training_data = build_training_data(
                    task_run=task_run,
                    system_message=self.system_message,
                    include_cot=include_cot,
                    thinking_instructions=self.thinking_instructions,
                )
                example = generator(training_data)
                # Allow non-ascii characters in the dataset.
                # Better readability for non-English users. If you don't support UTF-8... you should.
                f.write(json.dumps(example, ensure_ascii=False) + "\n")

        return output_path
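The examples below are illustrative sketches, not part of the module. Since ModelTrainingData is a plain dataclass, it can be constructed directly, and each generator returns a plain dict ready to be serialized as one JSONL line. A minimal example of the plaintext chat format, with made-up values and no chain-of-thought fields:

import json

from kiln_ai.adapters.fine_tune.dataset_formatter import (
    ModelTrainingData,
    generate_chat_message_response,
)

# Hand-built training example (illustrative values); no COT fields set.
data = ModelTrainingData(
    input="What is the capital of France?",
    system_message="You are a helpful geography assistant.",
    final_output="Paris",
)

assert not data.supports_cot()  # no thinking fields were provided

line = generate_chat_message_response(data)
# {"messages": [system, user, assistant]} -- three messages, no COT turns
print(json.dumps(line, ensure_ascii=False))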
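When all three thinking fields are set, supports_cot() returns True and every generator inserts three extra turns (thinking instructions, assistant thinking, final-answer prompt) before the closing assistant message. In normal use build_training_data fills thinking_final_answer_prompt with COT_FINAL_ANSWER_PROMPT; the value below is hand-set for illustration:

from kiln_ai.adapters.fine_tune.dataset_formatter import (
    ModelTrainingData,
    generate_chat_message_response,
)

cot_data = ModelTrainingData(
    input="What is 17 * 23?",
    system_message="You are a careful math assistant.",
    final_output="391",
    # All three fields must be set for supports_cot() to return True
    thinking_instructions="Think step by step before answering.",
    thinking="17 * 23 = 17 * 20 + 17 * 3 = 340 + 51 = 391.",
    thinking_final_answer_prompt="Now give your final answer.",
)

assert cot_data.supports_cot()

messages = generate_chat_message_response(cot_data)["messages"]
# system, user, user (thinking instructions), assistant (thinking),
# user (final answer prompt), assistant (final output): six messages
assert len(messages) == 6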
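Because every generator satisfies the FormatGenerator protocol, a format can be chosen at runtime through the FORMAT_GENERATORS mapping, which is exactly what DatasetFormatter.dump_to_file does. A sketch that runs one example through every registered format (the final_output is a JSON string so the toolcall and JSON-schema formats can parse it):

from kiln_ai.adapters.fine_tune.dataset_formatter import (
    FORMAT_GENERATORS,
    ModelTrainingData,
)

# Use a JSON string as final_output so the toolcall/JSON formats can parse it.
data = ModelTrainingData(
    input="Extract the city from: 'I live in Oslo.'",
    system_message="Extract structured data from text.",
    final_output='{"city": "Oslo"}',
)

for fmt, generator in FORMAT_GENERATORS.items():
    example = generator(data)
    # OpenAI-style formats return {"messages": ...}; HuggingFace formats
    # return {"conversations": ...}; Vertex Gemini returns
    # {"systemInstruction": ..., "contents": ...}.
    print(fmt.value, "->", sorted(example.keys()))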
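generate_json_schema_message round-trips final_output through json.loads/json.dumps, so malformed JSON fails fast at dataset-generation time rather than surfacing during training, and valid JSON is collapsed onto one line. A small demonstration (make is a local helper defined here, not part of the module):

from kiln_ai.adapters.fine_tune.dataset_formatter import (
    ModelTrainingData,
    generate_json_schema_message,
)

def make(output: str) -> ModelTrainingData:
    return ModelTrainingData(input="q", system_message="s", final_output=output)

# Valid JSON is re-serialized onto a single line.
msg = generate_json_schema_message(make('{ "a": 1 }'))
assert msg["messages"][-1]["content"] == '{"a": 1}'

# Invalid JSON raises immediately, with the offending output in the message.
try:
    generate_json_schema_message(make("not json"))
except ValueError as e:
    print(e)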
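In the OpenAI tool-call format, the assistant's content is None and the output is carried by a fixed task_response function call with id "call_1"; arguments is serialized back to a JSON string, matching OpenAI's string-typed arguments field. A sketch with illustrative values:

from kiln_ai.adapters.fine_tune.dataset_formatter import (
    ModelTrainingData,
    generate_chat_message_toolcall,
)

data = ModelTrainingData(
    input="Summarize: ...",
    system_message="Respond via the task_response tool.",
    final_output='{"summary": "short text"}',
)

msg = generate_chat_message_toolcall(data)["messages"][-1]
call = msg["tool_calls"][0]
assert msg["content"] is None  # the output lives in the tool call
assert call["function"]["name"] == "task_response"
# arguments is a JSON *string*, re-serialized onto one line
assert isinstance(call["function"]["arguments"], str)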
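The HuggingFace tool-call format differs from the OpenAI one in two ways visible in the source: arguments stays a parsed dict rather than a JSON string, and each call gets a fresh 9-character id derived from a UUID. A sketch highlighting the contrast:

from kiln_ai.adapters.fine_tune.dataset_formatter import (
    ModelTrainingData,
    generate_huggingface_chat_template_toolcall,
)

data = ModelTrainingData(
    input="Summarize: ...",
    system_message="Respond via the task_response tool.",
    final_output='{"summary": "short text"}',
)

turn = generate_huggingface_chat_template_toolcall(data)["conversations"][-1]
fn = turn["tool_calls"][0]["function"]
# Unlike the OpenAI format, arguments stays a parsed dict here,
# and the id is a fresh 9-character value derived from a UUID.
assert isinstance(fn["arguments"], dict)
assert len(fn["id"]) == 9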
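The Vertex Gemini 1.5 format splits the system message out into a top-level systemInstruction and uses "user"/"model" roles with text parts instead of a flat messages list. A quick shape check with illustrative values:

from kiln_ai.adapters.fine_tune.dataset_formatter import (
    ModelTrainingData,
    generate_vertex_gemini_1_5,
)

data = ModelTrainingData(
    input="Translate 'hello' to French.",
    system_message="You are a translator.",
    final_output="bonjour",
)

example = generate_vertex_gemini_1_5(data)
# The system message is split into systemInstruction; the dialogue uses
# "user"/"model" roles with text parts instead of chat "messages".
assert example["systemInstruction"]["parts"][0]["text"] == "You are a translator."
assert example["contents"][-1]["role"] == "model"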
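Finally, a sketch of end-to-end use of DatasetFormatter. It assumes you already have a DatasetSplit saved under a Kiln Task; load_dataset_split is a hypothetical stand-in for however you load it, and the "train" split name is assumed to exist in the split's split_contents:

from kiln_ai.adapters.fine_tune.dataset_formatter import (
    DatasetFormat,
    DatasetFormatter,
)
from kiln_ai.datamodel import FinetuneDataStrategy

# Hypothetical helper: load a saved DatasetSplit that has a parent Task.
dataset_split = load_dataset_split()

formatter = DatasetFormatter(
    dataset=dataset_split,
    system_message="You are a helpful assistant.",
    # Required when using final_and_intermediate below
    thinking_instructions="Think step by step.",
)

# Writes one JSON object per line; with path=None the file lands in the
# temp directory, named after the dataset, split, format, and COT setting.
path = formatter.dump_to_file(
    split_name="train",  # assumed to exist in dataset_split.split_contents
    format_type=DatasetFormat.OPENAI_CHAT_JSONL,
    data_strategy=FinetuneDataStrategy.final_and_intermediate,
)
print(path)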