Skip to content

Commit 81583ca

Browse files
committed
[datasets] Add Hermes trace SFT integration
Normalize Hermes tool responses into Marin's chat format while preserving raw think/tool-call assistant turns. Register the glm-5.1 and kimi subsets, add focused fixtures and regression tests, and introduce a trace-focused pilot SFT experiment.
1 parent ec2abf6 commit 81583ca

File tree

9 files changed

+597
-0
lines changed

9 files changed

+597
-0
lines changed
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Trace-focused Hermes SFT pilot built on the SmolTalk2 + Nemotron recipe."""
5+
6+
import dataclasses
7+
import math
8+
import re
9+
10+
from levanter.data.text import ChatLmDatasetFormat
11+
12+
from experiments.defaults import default_sft, default_tokenize
13+
from experiments.evals.evals import default_sft_eval
14+
from experiments.llama import llama_8b
15+
from experiments.marin_models import marin_tokenizer
16+
from experiments.posttrain.instruction_datasets import INSTRUCTION_DATASET_NAME_TO_CONFIG, get_instruction_dataset
17+
from experiments.simple_sft_config import SimpleSFTConfig
18+
from fray.cluster import ResourceConfig
19+
from marin.execution.executor import ExecutorStep, executor_main
20+
from marin.processing.tokenize import lm_mixture_data_config
21+
22+
SLUGIFY_PATTERN = re.compile(r"[^a-z0-9]+")
# Aim for three passes over the full mixture at the global batch size below.
TARGET_EPOCHS = 3
TRAIN_BATCH_SIZE = 2048

# Short name -> (instruction-dataset identifier, row count).  Row counts were
# captured on 2026-04-16 from the Hugging Face dataset page / datasets-server
# and double as the example-proportional mixture weights.
TRACE_PILOT_DATASETS = {
    "smoltalk2_smolagents_toolcalling_traces_think": (
        "HuggingFaceTB/smoltalk2/smolagents_toolcalling_traces_think",
        9079,
    ),
    "smoltalk2_hermes_function_calling_v1_no_think": (
        "HuggingFaceTB/smoltalk2/hermes_function_calling_v1_no_think",
        8961,
    ),
    "smoltalk2_xlam_traces_no_think": (
        "HuggingFaceTB/smoltalk2/xlam_traces_no_think",
        59962,
    ),
    "nemotron_v2_chat": ("nvidia/Nemotron-Post-Training-Dataset-v2/chat", 627720),
    "nemotron_v2_code": ("nvidia/Nemotron-Post-Training-Dataset-v2/code", 175000),
    "hermes_glm_5_1": ("lambda/hermes-agent-reasoning-traces/glm-5.1", 7055),
    "hermes_kimi": ("lambda/hermes-agent-reasoning-traces/kimi", 7646),
}
45+
46+
47+
def _slugify(value: str) -> str:
48+
slug = SLUGIFY_PATTERN.sub("_", value.lower()).strip("_")
49+
return slug or "dataset"
50+
51+
52+
def create_tokenization_step(dataset_identifier: str, short_name: str) -> ExecutorStep:
    """Build the tokenization step for one registered instruction dataset.

    Looks up the dataset's registered config, downloads/prepares the raw
    instruction dataset for its configured splits, then tokenizes every
    ``*.jsonl.gz`` shard with the Marin tokenizer in chat format.
    """
    registered_config = INSTRUCTION_DATASET_NAME_TO_CONFIG[dataset_identifier]
    raw_dataset = get_instruction_dataset(dataset_identifier, splits=registered_config.splits)
    shard_glob = raw_dataset / "**/*.jsonl.gz"
    return default_tokenize(
        name=f"{short_name}_marin_tokenizer",
        dataset=shard_glob,
        tokenizer=marin_tokenizer,
        format=ChatLmDatasetFormat(),
    )
61+
62+
63+
# Slugified short name -> dataset identifier, and the matching row-count
# mixture weight, built in a single pass over the registry.
dataset_ids: dict[str, str] = {}
mixture_weights: dict[str, int] = {}
for _short_name, (_identifier, _row_count) in TRACE_PILOT_DATASETS.items():
    _slug = _slugify(_short_name)
    dataset_ids[_slug] = _identifier
    mixture_weights[_slug] = _row_count

tokenized_datasets = {
    slug: create_tokenization_step(identifier, slug) for slug, identifier in dataset_ids.items()
}

# Every tokenized dataset must carry a mixture weight (and vice versa).
assert set(tokenized_datasets.keys()) == set(mixture_weights.keys())

# Enough optimizer steps for TARGET_EPOCHS passes over the whole mixture.
total_examples = sum(mixture_weights.values())
num_train_steps = math.ceil(TARGET_EPOCHS * total_examples / TRAIN_BATCH_SIZE)

pilot_sft_config = SimpleSFTConfig(
    train_batch_size=TRAIN_BATCH_SIZE,
    num_train_steps=num_train_steps,
    learning_rate=1e-5,
    resources=ResourceConfig.with_tpu("v4-128"),
    tokenizer=marin_tokenizer,
    initialize_from_hf="marin-community/marin-8b-base",
    max_seq_len=8192,
    seed=0,
)

# Example-count-weighted mixture; datasets without weights become validation.
pilot_mixture = lm_mixture_data_config(
    tokenized_datasets,
    mixture_weights,
    shuffle=True,
    missing_weights_are_validation=True,
)

# Stretch the base Llama-8B config to the 8k SFT sequence length.
llama_8b_8k = dataclasses.replace(llama_8b, max_seq_len=8192)

marin_8b_sft_hermes_trace_pilot = default_sft(
    name="marin_8b_sft_hermes_trace_pilot",
    tokenized=pilot_mixture,
    model_config=llama_8b_8k,
    sft_config=pilot_sft_config,
    tags=["llama", "smoltalk2", "nemotron_v2", "hermes_trace", "sft"],
)

marin_8b_sft_hermes_trace_pilot_evals = default_sft_eval(
    marin_8b_sft_hermes_trace_pilot,
    use_levanter_inference=True,
    resource_config=ResourceConfig.with_tpu("v4-8"),
)


if __name__ == "__main__":
    executor_main(steps=[marin_8b_sft_hermes_trace_pilot, *marin_8b_sft_hermes_trace_pilot_evals])

experiments/posttrain/instruction_datasets.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
20. nvidia/Nemotron-Post-Training-Dataset-v2
3636
21. HuggingFaceH4/no_robots
3737
22. open-thoughts/OpenThoughts3-1.2M # Original OT3 dataset; smoltalk2 uses a slightly different version
38+
23. lambda/hermes-agent-reasoning-traces
3839
"""
3940

4041
import hashlib
@@ -67,6 +68,10 @@
6768
TransformSFTDatasetConfig,
6869
transform_hf_dataset,
6970
)
71+
from marin.transform.conversation.trace_normalization import (
72+
hermes_trace_row_id,
73+
normalize_hermes_trace_messages,
74+
)
7075

7176
SMOLTALK2_SPLITS = [
7277
"LongAlign_64k_Qwen3_32B_yarn_131k_think",
@@ -109,6 +114,7 @@
109114
]
110115

111116
NEMOTRON_V1_SPLITS = ["chat", "code", "math", "stem", "tool_calling"]
117+
HERMES_TRACE_REVISION = "aa7c93605c71578869938359075b1765cf1b26e1"
112118

113119

114120
@dataclass(frozen=True)
@@ -309,6 +315,42 @@ def __call__(self, row: dict[str, Any]) -> dict[str, Any]:
309315
metadata_columns=["id", "category", "source"],
310316
name="teknium/OpenHermes-2.5",
311317
),
318+
"lambda/hermes-agent-reasoning-traces/glm-5.1": InstructionDatasetConfig(
319+
hf_dataset_id="lambda/hermes-agent-reasoning-traces",
320+
revision=HERMES_TRACE_REVISION,
321+
adapter=multi_turn_adapter(
322+
conversation_column="conversations",
323+
role_key="from",
324+
user_value="human",
325+
assistant_value="gpt",
326+
system_value="system",
327+
content_key="value",
328+
message_postprocess_fn=normalize_hermes_trace_messages,
329+
row_id_fn=hermes_trace_row_id,
330+
),
331+
metadata_columns=["category", "subcategory", "task"],
332+
name="lambda/hermes-agent-reasoning-traces/glm-5.1",
333+
subsets=["glm-5.1"],
334+
splits=["train"],
335+
),
336+
"lambda/hermes-agent-reasoning-traces/kimi": InstructionDatasetConfig(
337+
hf_dataset_id="lambda/hermes-agent-reasoning-traces",
338+
revision=HERMES_TRACE_REVISION,
339+
adapter=multi_turn_adapter(
340+
conversation_column="conversations",
341+
role_key="from",
342+
user_value="human",
343+
assistant_value="gpt",
344+
system_value="system",
345+
content_key="value",
346+
message_postprocess_fn=normalize_hermes_trace_messages,
347+
row_id_fn=hermes_trace_row_id,
348+
),
349+
metadata_columns=["category", "subcategory", "task"],
350+
name="lambda/hermes-agent-reasoning-traces/kimi",
351+
subsets=["kimi"],
352+
splits=["train"],
353+
),
312354
"allenai/tulu-v2-sft-mixture-olmo-4096": InstructionDatasetConfig(
313355
hf_dataset_id="allenai/tulu-v2-sft-mixture-olmo-4096",
314356
revision="7a7c388",
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Normalization helpers for trace-like conversation datasets."""
5+
6+
import hashlib
7+
import json
8+
import re
9+
from typing import Any
10+
11+
from marin.core.conversation import OpenAIChatMessage
12+
13+
_HERMES_TOOL_RESPONSE_RE = re.compile(
14+
r"^\s*<tool_response(?P<attrs>[^>]*)>\s*(?P<body>.*?)\s*</tool_response>\s*$",
15+
re.DOTALL,
16+
)
17+
_HERMES_TOOL_RESPONSE_ATTR_RE = re.compile(r"""(?P<key>name|id)\s*=\s*(?P<quote>["'])(?P<value>.*?)(?P=quote)""")
18+
19+
20+
def _hash_messages(messages: list[dict[str, Any]]) -> str:
21+
return hashlib.sha256(str(messages).encode()).hexdigest()
22+
23+
24+
def _string_or_none(value: Any) -> str | None:
25+
return value if isinstance(value, str) and value else None
26+
27+
28+
def _parse_tool_response_attrs(attrs: str) -> tuple[str | None, str | None]:
    """Pull ``name`` and ``id`` attributes out of a `<tool_response>` tag.

    Returns ``(name, tool_call_id)``; either slot is ``None`` when the
    corresponding attribute is absent.  When an attribute appears more than
    once, the last occurrence wins (matching the left-to-right scan).
    """
    found: dict[str, str] = {}
    for attr_match in _HERMES_TOOL_RESPONSE_ATTR_RE.finditer(attrs):
        found[attr_match.group("key")] = attr_match.group("value")
    return found.get("name"), found.get("id")
41+
42+
43+
def _parse_tool_response_body(
44+
body: str,
45+
*,
46+
name: str | None,
47+
tool_call_id: str | None,
48+
) -> tuple[str | None, str | None, Any] | None:
49+
try:
50+
payload = json.loads(body)
51+
except json.JSONDecodeError:
52+
return None
53+
54+
if not isinstance(payload, dict):
55+
return None
56+
57+
normalized_name = name or _string_or_none(payload.get("name"))
58+
normalized_tool_call_id = tool_call_id or _string_or_none(payload.get("tool_call_id"))
59+
if "content" in payload:
60+
return normalized_name, normalized_tool_call_id, payload["content"]
61+
62+
return normalized_name, normalized_tool_call_id, payload
63+
64+
65+
def _normalize_hermes_tool_response_message(message: OpenAIChatMessage) -> OpenAIChatMessage:
    """Strip the `<tool_response>` wrapper from a tool turn when it parses cleanly.

    Non-tool turns, non-string contents, contents without a full wrapper, and
    bodies that are not JSON objects are all returned unchanged so the raw
    source text is preserved.
    """
    if message.role != "tool":
        return message
    if not isinstance(message.content, str):
        return message

    wrapper = _HERMES_TOOL_RESPONSE_RE.fullmatch(message.content)
    if wrapper is None:
        return message

    attr_name, attr_id = _parse_tool_response_attrs(wrapper.group("attrs"))
    parsed = _parse_tool_response_body(
        wrapper.group("body").strip(),
        name=attr_name,
        tool_call_id=attr_id,
    )
    if parsed is None:
        return message

    final_name, final_id, final_content = parsed
    updates = {
        "content": final_content,
        "name": final_name,
        "tool_call_id": final_id,
    }
    return message.model_copy(update=updates)
90+
91+
92+
def normalize_hermes_trace_messages(
    messages: list[OpenAIChatMessage],
    row: dict[str, Any],
) -> list[OpenAIChatMessage]:
    """Normalize Hermes trace messages for Marin's conversation pipeline.

    Assistant turns already carry the desired `<think>` and `<tool_call>`
    blocks, so they pass through untouched.  Tool turns arrive wrapped in
    `<tool_response>` tags; because Marin's chat template would wrap them
    again, only the outer wrapper is stripped when the payload parses
    cleanly.  When wrapper parsing fails, the raw source content is kept
    unchanged.  *row* is accepted to satisfy the post-process hook signature
    but is not consulted.
    """
    normalized: list[OpenAIChatMessage] = []
    for message in messages:
        normalized.append(_normalize_hermes_tool_response_message(message))
    return normalized
105+
106+
107+
def hermes_trace_row_id(row: dict[str, Any], messages: list[dict[str, Any]]) -> str:
    """Return the source trace ID when available, else a hash of the messages."""
    source_id = row.get("id")
    if not isinstance(source_id, str) or not source_id:
        return _hash_messages(messages)
    return source_id

tests/test_marin_chat_template.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
# Copyright The Marin Authors
22
# SPDX-License-Identifier: Apache-2.0
33

4+
import json
45
import tempfile
56
from collections.abc import Sequence
7+
from pathlib import Path
68

79
import pytest
810

@@ -11,8 +13,16 @@
1113
load_llama3_tokenizer,
1214
run_all_tests,
1315
)
16+
from experiments.posttrain.instruction_datasets import INSTRUCTION_DATASET_NAME_TO_CONFIG
1417
from levanter.data.text import ChatProcessor
1518
from levanter.tokenizers import load_tokenizer
19+
from marin.transform.conversation.transform_conversation import TransformSFTDatasetConfig, transform_row
20+
21+
FIXTURE_DIR = Path(__file__).parent / "transform" / "fixtures" / "agent_traces"


def _load_agent_trace_fixture(name: str) -> dict:
    """Read and decode a JSON fixture row from the agent-trace fixture dir."""
    fixture_path = FIXTURE_DIR / name
    return json.loads(fixture_path.read_text(encoding="utf-8"))
1626

1727

1828
@pytest.fixture()
@@ -113,3 +123,32 @@ def test_marin_chat_template_ipython_output(fresh_marin_tokenizer):
113123
assert "<|start_header_id|>ipython<|end_header_id|>" in rendered
114124
assert '{"output": "4\\n"}' in rendered
115125
assert result["assistant_masks"].sum() > 0
126+
127+
128+
def test_marin_chat_template_normalizes_hermes_tool_responses(fresh_marin_tokenizer):
    """Hermes tool turns render with name/id on the tag, not double-wrapped."""
    processor = ChatProcessor(fresh_marin_tokenizer, mask_user_turns=True)

    dataset_cfg = INSTRUCTION_DATASET_NAME_TO_CONFIG["lambda/hermes-agent-reasoning-traces/glm-5.1"]
    fixture_row = _load_agent_trace_fixture("hermes_glm_sample.json")
    transform_cfg = TransformSFTDatasetConfig(
        source=dataset_cfg.hf_dataset_id,
        revision=dataset_cfg.revision,
        output_path="/tmp/output",
        metadata_columns=dataset_cfg.metadata_columns,
        adapter=dataset_cfg.adapter,
        subsets=dataset_cfg.subsets,
        splits=dataset_cfg.splits,
    )

    transformed = transform_row(fixture_row, transform_cfg, dataset_cfg.adapter)
    assert transformed is not None

    batch = [{"messages": [msg.model_dump() for msg in transformed.messages]}]
    processed = processor(batch)[0]
    rendered = decode_sequence(fresh_marin_tokenizer, processed["input_ids"])

    # Think block survives; tool response is unwrapped with attrs on the tag.
    assert "<|start_think|>" in rendered
    assert '<tool_response name="write_file" id="glm-tool-call-001">' in rendered
    assert '"bytes_written": 15' in rendered
    assert '<tool_response>\n{"tool_call_id": "glm-tool-call-001"' not in rendered
    assert processed["assistant_masks"].sum() > 0
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"id": "glm-trace-001",
3+
"conversations": [
4+
{
5+
"from": "system",
6+
"value": "You are a function calling AI model. You are provided with function signatures within <tools> </tools> XML tags.\n<tools>\n[{\"name\": \"write_file\", \"description\": \"Write content to a file\", \"parameters\": {\"type\": \"object\", \"properties\": {\"path\": {\"type\": \"string\"}, \"content\": {\"type\": \"string\"}}, \"required\": [\"path\", \"content\"]}}]\n</tools>"
7+
},
8+
{
9+
"from": "human",
10+
"value": "Write a tiny Python script that prints hello."
11+
},
12+
{
13+
"from": "gpt",
14+
"value": "<think>\nI should write the file first.\n</think>\n<tool_call>\n{\"name\": \"write_file\", \"arguments\": {\"path\": \"hello.py\", \"content\": \"print('hello')\\n\"}}\n</tool_call>"
15+
},
16+
{
17+
"from": "tool",
18+
"value": "<tool_response>\n{\"tool_call_id\": \"glm-tool-call-001\", \"name\": \"write_file\", \"content\": {\"bytes_written\": 15, \"dirs_created\": false}}\n</tool_response>"
19+
},
20+
{
21+
"from": "gpt",
22+
"value": "<think>\nThe file was written successfully.\n</think>\nThe script is ready."
23+
}
24+
],
25+
"tools": "[{\"name\": \"write_file\", \"description\": \"Write content to a file\", \"parameters\": {\"type\": \"object\", \"properties\": {\"path\": {\"type\": \"string\"}, \"content\": {\"type\": \"string\"}}, \"required\": [\"path\", \"content\"]}}]",
26+
"category": "Terminal & Coding",
27+
"subcategory": "Terminal Tasks",
28+
"task": "Write a tiny Python script that prints hello."
29+
}

0 commit comments

Comments
 (0)