kiln_ai.adapters.fine_tune.dataset_formatter

import json
import tempfile
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Any, Dict, Protocol
from uuid import uuid4

from kiln_ai.adapters.model_adapters.base_adapter import COT_FINAL_ANSWER_PROMPT
from kiln_ai.datamodel import DatasetSplit, FinetuneDataStrategy, TaskRun


class DatasetFormat(str, Enum):
    """Formats for dataset generation, covering both the file format (e.g. JSONL) and the internal structure (e.g. chat/toolcall)."""

    """OpenAI chat format with plaintext response"""
    OPENAI_CHAT_JSONL = "openai_chat_jsonl"

    """OpenAI chat format with JSON response_format"""
    OPENAI_CHAT_JSON_SCHEMA_JSONL = "openai_chat_json_schema_jsonl"

    """OpenAI chat format with tool call response"""
    OPENAI_CHAT_TOOLCALL_JSONL = "openai_chat_toolcall_jsonl"

    """HuggingFace chat template in JSONL"""
    HUGGINGFACE_CHAT_TEMPLATE_JSONL = "huggingface_chat_template_jsonl"

    """HuggingFace chat template with tool calls in JSONL"""
    HUGGINGFACE_CHAT_TEMPLATE_TOOLCALL_JSONL = (
        "huggingface_chat_template_toolcall_jsonl"
    )

    """Vertex Gemini 1.5 format (flash and pro)"""
    VERTEX_GEMINI_1_5 = "vertex_gemini_1_5"

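# Example (illustrative, not part of the module): because DatasetFormat
# subclasses str, values round-trip cleanly from plain strings such as
# config file entries or CLI arguments.
assert DatasetFormat("openai_chat_jsonl") is DatasetFormat.OPENAI_CHAT_JSONL
assert DatasetFormat.OPENAI_CHAT_JSONL == "openai_chat_jsonl"
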
@dataclass
class ModelTrainingData:
    input: str
    system_message: str
    final_output: str
    # These three are optional; they are used for COT/thinking-style multi-message responses
    thinking_instructions: str | None = None
    thinking: str | None = None
    thinking_final_answer_prompt: str | None = None

    def supports_cot(self) -> bool:
        return (
            self.thinking_instructions is not None
            and self.thinking is not None
            and self.thinking_final_answer_prompt is not None
        )

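# Example (illustrative, not part of the module): supports_cot() is True only
# when all three optional thinking fields are set; partially populated COT
# data is treated as absent.
_example_td = ModelTrainingData(
    input="What is 2 + 2?",
    system_message="You are a calculator.",
    final_output="4",
    thinking_instructions="Think step by step.",
    thinking="2 plus 2 is 4.",
    thinking_final_answer_prompt="Now give your final answer.",
)
assert _example_td.supports_cot()
assert not ModelTrainingData(
    input="What is 2 + 2?", system_message="You are a calculator.", final_output="4"
).supports_cot()
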
class FormatGenerator(Protocol):
    """Protocol for format generators"""

    def __call__(
        self,
        training_data: ModelTrainingData,
    ) -> Dict[str, Any]: ...

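# Example (illustrative, not part of the module): any callable with this
# signature satisfies the protocol. This hypothetical generator emits a bare
# prompt/completion pair rather than one of the formats defined below.
def _example_prompt_completion_generator(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    return {
        "prompt": f"{training_data.system_message}\n\n{training_data.input}",
        "completion": training_data.final_output,
    }

_example_generator: FormatGenerator = _example_prompt_completion_generator
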
def build_training_data(
    task_run: TaskRun,
    system_message: str,
    include_cot: bool,
    thinking_instructions: str | None = None,
) -> ModelTrainingData:
    """
    Generate data for training.

    For final output, use the best task output from the task run, preferring the repaired output if available.

    For thinking, use the intermediate output if it exists; otherwise leave it None.
    """
    final_output = task_run.output.output
    if task_run.repaired_output is not None:
        final_output = task_run.repaired_output.output

    thinking = None
    thinking_final_answer_prompt = None
    parent_task = task_run.parent_task()

    if include_cot and task_run.has_thinking_training_data():
        if not parent_task:
            raise ValueError(
                "TaskRuns used for training require a parent Task to build chain-of-thought prompts. Train without COT, or save this TaskRun to a parent Task."
            )

        # Prefer reasoning to chain_of_thought if both are present
        intermediate_outputs = task_run.intermediate_outputs or {}
        thinking = intermediate_outputs.get("reasoning") or intermediate_outputs.get(
            "chain_of_thought"
        )

        thinking_final_answer_prompt = COT_FINAL_ANSWER_PROMPT

        # Always use the passed thinking instructions, but check they are present for COT
        if not thinking_instructions:
            raise ValueError(
                "Thinking instructions are required when data_strategy is final_and_intermediate"
            )

    return ModelTrainingData(
        input=task_run.input,
        system_message=system_message,
        final_output=final_output,
        thinking=thinking,
        thinking_instructions=thinking_instructions,
        thinking_final_answer_prompt=thinking_final_answer_prompt,
    )

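# Example (illustrative sketch, not part of the module): building training
# rows for saved task runs. Assumes `task_runs` holds TaskRun objects that
# were saved under a parent Task.
def _example_build_rows(task_runs: list[TaskRun]) -> list[ModelTrainingData]:
    return [
        build_training_data(
            task_run=run,
            system_message="You are a helpful assistant.",
            include_cot=False,  # thinking_instructions only required when True
        )
        for run in task_runs
    ]
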
def generate_chat_message_response(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate OpenAI chat format with plaintext response"""

    messages: list[dict[str, str | None]] = [
        {"role": "system", "content": training_data.system_message},
        {"role": "user", "content": training_data.input},
    ]

    if training_data.supports_cot():
        messages.extend(
            [
                {"role": "user", "content": training_data.thinking_instructions},
                {"role": "assistant", "content": training_data.thinking},
                {
                    "role": "user",
                    "content": training_data.thinking_final_answer_prompt,
                },
            ]
        )

    messages.append({"role": "assistant", "content": training_data.final_output})

    return {"messages": messages}

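# Example (illustrative, not part of the module): a run without COT data
# produces a three-message conversation; with COT data, three extra messages
# (thinking instructions, thinking, final-answer prompt) are spliced in
# before the final assistant message.
_example_chat_row = generate_chat_message_response(
    ModelTrainingData(
        input="Translate 'hello' to French.",
        system_message="You are a translator.",
        final_output="bonjour",
    )
)
assert [m["role"] for m in _example_chat_row["messages"]] == [
    "system",
    "user",
    "assistant",
]
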
def generate_json_schema_message(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate OpenAI chat format with validated JSON response"""
    # Load and dump to ensure it's valid JSON and goes to 1 line
    try:
        json_data = json.loads(training_data.final_output)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Invalid JSON in JSON Schema training set: {e}\nOutput Data: {training_data.final_output}"
        ) from e
    json_string = json.dumps(json_data, ensure_ascii=False)

    messages: list[dict[str, str | None]] = [
        {"role": "system", "content": training_data.system_message},
        {"role": "user", "content": training_data.input},
    ]

    if training_data.supports_cot():
        messages.extend(
            [
                {"role": "user", "content": training_data.thinking_instructions},
                {"role": "assistant", "content": training_data.thinking},
                {
                    "role": "user",
                    "content": training_data.thinking_final_answer_prompt,
                },
            ]
        )

    messages.append({"role": "assistant", "content": json_string})

    return {"messages": messages}

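# Example (illustrative, not part of the module): output that is not valid
# JSON is rejected up front rather than written to the training file.
try:
    generate_json_schema_message(
        ModelTrainingData(input="x", system_message="s", final_output="not json")
    )
except ValueError:
    pass  # expected: final_output must parse as JSON for this format
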
def generate_chat_message_toolcall(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate OpenAI chat format with tool call response"""
    try:
        arguments = json.loads(training_data.final_output)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON for tool call: {e}") from e

    messages: list[dict[str, Any]] = [
        {"role": "system", "content": training_data.system_message},
        {"role": "user", "content": training_data.input},
    ]

    if training_data.supports_cot():
        messages.extend(
            [
                {"role": "user", "content": training_data.thinking_instructions},
                {"role": "assistant", "content": training_data.thinking},
                {
                    "role": "user",
                    "content": training_data.thinking_final_answer_prompt,
                },
            ]
        )

    messages.append(
        {
            "role": "assistant",
            "content": None,
            "tool_calls": [
                {
                    "id": "call_1",
                    "type": "function",
                    "function": {
                        "name": "task_response",
                        # Yes, we parse then dump again. This ensures it's valid JSON and collapses it to one line
                        "arguments": json.dumps(arguments, ensure_ascii=False),
                    },
                }
            ],
        },
    )

    return {"messages": messages}

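# Example (illustrative, not part of the module): structured output becomes a
# single `task_response` function call whose arguments are the final output,
# re-serialized as a one-line JSON string.
_example_tool_row = generate_chat_message_toolcall(
    ModelTrainingData(input="x", system_message="s", final_output='{"answer": 42}')
)
_example_call = _example_tool_row["messages"][-1]["tool_calls"][0]
assert _example_call["function"]["name"] == "task_response"
assert json.loads(_example_call["function"]["arguments"]) == {"answer": 42}
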
def generate_huggingface_chat_template(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate HuggingFace chat template"""

    conversations: list[dict[str, Any]] = [
        {"role": "system", "content": training_data.system_message},
        {"role": "user", "content": training_data.input},
    ]

    if training_data.supports_cot():
        conversations.extend(
            [
                {"role": "user", "content": training_data.thinking_instructions},
                {"role": "assistant", "content": training_data.thinking},
                {"role": "user", "content": training_data.thinking_final_answer_prompt},
            ]
        )

    conversations.append({"role": "assistant", "content": training_data.final_output})

    return {"conversations": conversations}

def generate_huggingface_chat_template_toolcall(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate HuggingFace chat template with tool calls"""
    try:
        arguments = json.loads(training_data.final_output)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON for tool call: {e}") from e

    # See https://huggingface.co/docs/transformers/en/chat_templating
    conversations: list[dict[str, Any]] = [
        {"role": "system", "content": training_data.system_message},
        {"role": "user", "content": training_data.input},
    ]

    if training_data.supports_cot():
        conversations.extend(
            [
                {"role": "user", "content": training_data.thinking_instructions},
                {"role": "assistant", "content": training_data.thinking},
                {"role": "user", "content": training_data.thinking_final_answer_prompt},
            ]
        )

    conversations.append(
        {
            "role": "assistant",
            "tool_calls": [
                {
                    "type": "function",
                    "function": {
                        "name": "task_response",
                        "id": str(uuid4()).replace("-", "")[:9],
                        "arguments": arguments,
                    },
                }
            ],
        },
    )

    return {"conversations": conversations}

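# Example (illustrative, not part of the module): both HuggingFace generators
# use a "conversations" key, and the tool-call variant keeps `arguments` as a
# dict rather than a JSON string (unlike the OpenAI tool-call format above).
_example_hf_row = generate_huggingface_chat_template_toolcall(
    ModelTrainingData(input="x", system_message="s", final_output='{"answer": 42}')
)
_example_hf_call = _example_hf_row["conversations"][-1]["tool_calls"][0]
assert _example_hf_call["function"]["arguments"] == {"answer": 42}
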
def generate_vertex_gemini_1_5(
    training_data: ModelTrainingData,
) -> Dict[str, Any]:
    """Generate Vertex Gemini 1.5 format (flash and pro)"""
    # See https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-prepare

    contents = [
        {
            "role": "user",
            "parts": [
                {
                    "text": training_data.input,
                }
            ],
        }
    ]

    if training_data.supports_cot():
        contents.extend(
            [
                {
                    "role": "user",
                    "parts": [{"text": training_data.thinking_instructions}],
                },
                {"role": "model", "parts": [{"text": training_data.thinking}]},
                {
                    "role": "user",
                    "parts": [{"text": training_data.thinking_final_answer_prompt}],
                },
            ]
        )

    contents.append(
        {
            "role": "model",
            "parts": [{"text": training_data.final_output}],
        }
    )

    return {
        "systemInstruction": {
            "role": "system",
            "parts": [
                {
                    "text": training_data.system_message,
                }
            ],
        },
        "contents": contents,
    }

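# Example (illustrative, not part of the module): Vertex rows carry the system
# prompt in a separate `systemInstruction` field, and the response role is
# "model" rather than "assistant".
_example_vertex_row = generate_vertex_gemini_1_5(
    ModelTrainingData(input="x", system_message="s", final_output="y")
)
assert _example_vertex_row["systemInstruction"]["parts"][0]["text"] == "s"
assert _example_vertex_row["contents"][-1]["role"] == "model"
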
FORMAT_GENERATORS: Dict[DatasetFormat, FormatGenerator] = {
    DatasetFormat.OPENAI_CHAT_JSONL: generate_chat_message_response,
    DatasetFormat.OPENAI_CHAT_JSON_SCHEMA_JSONL: generate_json_schema_message,
    DatasetFormat.OPENAI_CHAT_TOOLCALL_JSONL: generate_chat_message_toolcall,
    DatasetFormat.HUGGINGFACE_CHAT_TEMPLATE_JSONL: generate_huggingface_chat_template,
    DatasetFormat.HUGGINGFACE_CHAT_TEMPLATE_TOOLCALL_JSONL: generate_huggingface_chat_template_toolcall,
    DatasetFormat.VERTEX_GEMINI_1_5: generate_vertex_gemini_1_5,
}

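# Example (illustrative, not part of the module): format dispatch is a plain
# dict lookup, so unsupported formats can be detected with a membership test.
assert FORMAT_GENERATORS[DatasetFormat.OPENAI_CHAT_JSONL] is generate_chat_message_response
assert DatasetFormat.VERTEX_GEMINI_1_5 in FORMAT_GENERATORS
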
class DatasetFormatter:
    """Handles formatting of datasets into various output formats"""

    def __init__(
        self,
        dataset: DatasetSplit,
        system_message: str,
        thinking_instructions: str | None = None,
    ):
        self.dataset = dataset
        self.system_message = system_message
        self.thinking_instructions = thinking_instructions

        task = dataset.parent_task()
        if task is None:
            raise ValueError("Dataset has no parent task")
        self.task = task

    def dump_to_file(
        self,
        split_name: str,
        format_type: DatasetFormat,
        data_strategy: FinetuneDataStrategy,
        path: Path | None = None,
    ) -> Path:
        """
        Format the dataset into the specified format.

        Args:
            split_name: Name of the split to dump
            format_type: Format to generate the dataset in
            data_strategy: Whether to include intermediate COT/thinking messages in addition to the final output
            path: Optional path to write to. If None, writes to temp directory

        Returns:
            Path to the generated file

        Note:
            The output is written in UTF-8 encoding with ensure_ascii=False to properly
            support international text content while maintaining readability.
        """
        if format_type not in FORMAT_GENERATORS:
            raise ValueError(f"Unsupported format: {format_type}")
        if split_name not in self.dataset.split_contents:
            raise ValueError(f"Split {split_name} not found in dataset")

        generator = FORMAT_GENERATORS[format_type]

        include_cot = data_strategy == FinetuneDataStrategy.final_and_intermediate

        # Write to a temp file if no path is provided
        output_path = (
            path
            or Path(tempfile.gettempdir())
            / f"{self.dataset.name} -- split-{split_name} -- format-{format_type.value} -- {'cot' if include_cot else 'no-cot'}.jsonl"
        )

        runs = self.task.runs()
        runs_by_id = {run.id: run for run in runs}

        # Generate formatted output with UTF-8 encoding
        with open(output_path, "w", encoding="utf-8") as f:
            for run_id in self.dataset.split_contents[split_name]:
                # Use .get() so a missing run raises our descriptive error below, not a KeyError
                task_run = runs_by_id.get(run_id)
                if task_run is None:
                    raise ValueError(
                        f"Task run {run_id} not found. This is required by this dataset."
                    )

                training_data = build_training_data(
                    task_run=task_run,
                    system_message=self.system_message,
                    include_cot=include_cot,
                    thinking_instructions=self.thinking_instructions,
                )
                example = generator(training_data)
                # Allow non-ascii characters in the dataset.
                # Better readability for non-English users. If you don't support UTF-8... you should.
                f.write(json.dumps(example, ensure_ascii=False) + "\n")

        return output_path
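# Example (illustrative sketch, not part of the module): end-to-end export of
# one split. Assumes `dataset` is a DatasetSplit saved under a parent Task,
# with a split named "train", and that FinetuneDataStrategy has a
# `final_only` member (the non-COT counterpart of final_and_intermediate).
def _example_export(dataset: DatasetSplit) -> Path:
    formatter = DatasetFormatter(
        dataset=dataset,
        system_message="You are a helpful assistant.",
    )
    return formatter.dump_to_file(
        split_name="train",
        format_type=DatasetFormat.OPENAI_CHAT_JSONL,
        data_strategy=FinetuneDataStrategy.final_only,
    )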