From d439faf25b95b6dca727e789af340c9e04b0af29 Mon Sep 17 00:00:00 2001 From: taivu1998 <46636857+taivu1998@users.noreply.github.com> Date: Sun, 7 Jun 2026 15:48:48 -0700 Subject: [PATCH 1/2] [posttrain] Add Nemotron agentic SFT datasets Register the SWE, Cascade, OpenCode, and CUDA Nemotron SFT datasets through the instruction dataset path. Add a rich multi-turn adapter so tool calls, reasoning content, and tool context survive transformation for agentic SFT rows. --- experiments/posttrain/instruction_datasets.py | 104 +++++++++++ .../marin/transform/conversation/adapters.py | 67 +++++++ tests/test_instruction_datasets.py | 175 ++++++++++++++++++ tests/transform/test_conversation.py | 112 +++++++++++ 4 files changed, 458 insertions(+) diff --git a/experiments/posttrain/instruction_datasets.py b/experiments/posttrain/instruction_datasets.py index b56ad12bba..e7b6954b7f 100644 --- a/experiments/posttrain/instruction_datasets.py +++ b/experiments/posttrain/instruction_datasets.py @@ -38,6 +38,10 @@ 23. open-thoughts/OpenThoughts3-1.2M # Original OT3 dataset; smoltalk2 uses a slightly different version 24. lm-provers/FineProofs-SFT 25. lm-provers/FineProofs-SFT/proof-only +26. nvidia/Nemotron-SFT-SWE-v3 +27. nvidia/Nemotron-Cascade-SFT-SWE +28. nvidia/Nemotron-SFT-OpenCode-v1 +29. nvidia/Nemotron-SFT-CUDA-v1 """ import dataclasses @@ -103,6 +107,14 @@ ] NEMOTRON_V1_SPLITS = ["chat", "code", "math", "stem", "tool_calling"] +NEMOTRON_OPENCODE_V1_SPLITS = [ + "general", + "bash_only_tool", + "bash_only_tool_skills", + "question_tool", + "agent_skills", + "agent_skills_question_tool", +] @dataclass(frozen=True) @@ -156,6 +168,33 @@ def multi_turn_adapter( ) +def rich_multi_turn_adapter( + conversation_column: str = "messages", + role_key: str = "role", + user_value: str = "user", + assistant_value: str = "assistant", + system_value: str = "system", + tool_value: str = "tool", + content_key: str = "content", + metadata_remap: dict[str, str] | None = None, + replacements: dict[str, str] | None = None, + extra_metadata_fn=None, +) -> TransformAdapter: + return TransformAdapter( + dataset_format=InputDatasetFormat.SINGLE_COLUMN_MULTI_TURN_RICH, + conversation_column=conversation_column, + role_key=role_key, + user_value=user_value, + assistant_value=assistant_value, + system_value=system_value, + tool_value=tool_value, + content_key=content_key, + metadata_remap=metadata_remap or {}, + replacements=replacements, + extra_metadata_fn=extra_metadata_fn, + ) + + def instruction_response_adapter( *, instruction_column: str, @@ -248,6 +287,22 @@ def __call__(self, row: dict[str, Any]) -> dict[str, Any]: SYNTHETIC2_SFT_VERIFIED_REVISION = "fce247fe48af8ff9624fb51d1de63aa1b2332cef" SYNTHETIC2_SFT_VERIFIED_METADATA_COLUMNS = ["problem_id", "task_type", "reward"] +NEMOTRON_SFT_SWE_V3_REVISION = "3f73de64c1fe928a8f538fe45ccc10c228cc4c6a" +NEMOTRON_SFT_SWE_V3_METADATA_COLUMNS = ["uuid", "license"] +NEMOTRON_CASCADE_SFT_SWE_REVISION = "b4ef73ff9bbcfc33d1ec7a48e53017f9ce7af7a3" +NEMOTRON_CASCADE_SFT_SWE_METADATA_COLUMNS = ["category", "source", "generator", "thinking"] +NEMOTRON_SFT_OPENCODE_V1_REVISION = "556d5237acff203f3e1a0be49428634c3606cda2" +NEMOTRON_SFT_OPENCODE_V1_METADATA_COLUMNS = [ + "question_category", + "complexity_level", + "uuid", + "enabled_tools", + "skills_path", + "hf_split", +] +NEMOTRON_SFT_CUDA_V1_REVISION = "1a06167a6e1e90d928094184173898cbb9bf42de" +NEMOTRON_SFT_CUDA_V1_METADATA_COLUMNS = ["uuid", "license", "used_in"] + FINEPROOFS_SFT_REVISION = "73661e6" FINEPROOFS_SFT_METADATA_COLUMNS = [ "category", @@ -430,6 +485,36 @@ def __call__(self, row: dict[str, Any]) -> dict[str, Any]: subsets=["default"], splits=["train"], ), + "nvidia/Nemotron-SFT-SWE-v3": InstructionDatasetConfig( + hf_dataset_id="nvidia/Nemotron-SFT-SWE-v3", + revision=NEMOTRON_SFT_SWE_V3_REVISION, + adapter=rich_multi_turn_adapter(), + metadata_columns=NEMOTRON_SFT_SWE_V3_METADATA_COLUMNS, + name="nvidia/Nemotron-SFT-SWE-v3", + subsets=["default"], + splits=["train"], + max_parallelism=16, + ), + "nvidia/Nemotron-Cascade-SFT-SWE": InstructionDatasetConfig( + hf_dataset_id="nvidia/Nemotron-Cascade-SFT-SWE", + revision=NEMOTRON_CASCADE_SFT_SWE_REVISION, + adapter=multi_turn_adapter(), + metadata_columns=NEMOTRON_CASCADE_SFT_SWE_METADATA_COLUMNS, + name="nvidia/Nemotron-Cascade-SFT-SWE", + subsets=["default"], + splits=["train"], + max_parallelism=16, + ), + "nvidia/Nemotron-SFT-CUDA-v1": InstructionDatasetConfig( + hf_dataset_id="nvidia/Nemotron-SFT-CUDA-v1", + revision=NEMOTRON_SFT_CUDA_V1_REVISION, + adapter=rich_multi_turn_adapter(metadata_remap={"tools": "tools"}), + metadata_columns=NEMOTRON_SFT_CUDA_V1_METADATA_COLUMNS, + name="nvidia/Nemotron-SFT-CUDA-v1", + subsets=["default"], + splits=["train"], + max_parallelism=8, + ), "sherryy/tulu-3-sft-personas-instruction-following-expanded": InstructionDatasetConfig( hf_dataset_id="sherryy/tulu-3-sft-personas-instruction-following-expanded", revision="79ab2c4", @@ -595,6 +680,25 @@ def __call__(self, row: dict[str, Any]) -> dict[str, Any]: splits=[split_name], ) +for split_name in NEMOTRON_OPENCODE_V1_SPLITS: + dataset_key = f"nvidia/Nemotron-SFT-OpenCode-v1/{split_name}" + INSTRUCTION_DATASET_NAME_TO_CONFIG[dataset_key] = InstructionDatasetConfig( + name=dataset_key, + hf_dataset_id="nvidia/Nemotron-SFT-OpenCode-v1", + revision=NEMOTRON_SFT_OPENCODE_V1_REVISION, + adapter=rich_multi_turn_adapter( + metadata_remap={ + "agent_prompt": "agent_prompt", + "metadata": "source_metadata", + "tools": "tools", + }, + ), + metadata_columns=NEMOTRON_SFT_OPENCODE_V1_METADATA_COLUMNS, + subsets=["default"], + splits=[split_name], + max_parallelism=4, + ) + def get_directory_friendly_dataset_name(hf_dataset_id: str) -> str: dataset_name = hf_dataset_id.replace("/", "--") diff --git a/lib/marin/src/marin/transform/conversation/adapters.py b/lib/marin/src/marin/transform/conversation/adapters.py index c611737bd1..a121160198 100644 --- a/lib/marin/src/marin/transform/conversation/adapters.py +++ b/lib/marin/src/marin/transform/conversation/adapters.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses +import json from collections.abc import Callable from dataclasses import dataclass, field from enum import Enum @@ -50,11 +51,52 @@ class InputDatasetFormat(str, Enum): """ SINGLE_COLUMN_MULTI_TURN = "messages" + SINGLE_COLUMN_MULTI_TURN_RICH = "messages_rich" INSTRUCTION_RESPONSE = "instruction_response" INSTRUCT_COLUMN_RESPONSE = "instruct_column_response" INSTRUCT_MSG_RESPONSE = "instruct_msg_response" +RICH_MESSAGE_EXTRA_KEYS = ( + "name", + "tool_call_id", + "reasoning_content", +) + + +def _parse_tool_calls(value: Any) -> list[dict[str, Any]] | None: + if value is None: + return None + + parsed = value + if isinstance(value, str): + stripped = value.strip() + if not stripped: + return None + try: + parsed = json.loads(stripped) + except json.JSONDecodeError: + return None + + if isinstance(parsed, dict): + parsed = [parsed] + if not isinstance(parsed, list) or not parsed: + return None + if not all(isinstance(item, dict) for item in parsed): + return None + return parsed + + +def _raw_tool_calls(value: Any) -> str | None: + if not isinstance(value, str): + return None + + stripped = value.strip() + if not stripped: + return None + return stripped + + @dataclass class TransformAdapter: dataset_format: InputDatasetFormat = InputDatasetFormat.INSTRUCTION_RESPONSE @@ -130,6 +172,31 @@ def transform_conversation_to_openai_format( role = role_to_openai_role[conv[self.role_key]] messages.append(OpenAIChatMessage(role=role, content=conv[self.content_key])) return messages + elif self.dataset_format == InputDatasetFormat.SINGLE_COLUMN_MULTI_TURN_RICH: + messages = [] + role_to_openai_role = { + self.user_value: "user", + self.assistant_value: "assistant", + self.system_value: "system", + self.tool_value: "tool", + } + conversation = row[self.conversation_column] + for conv in conversation: + role = role_to_openai_role[conv[self.role_key]] + message: dict[str, Any] = {"role": role, "content": conv[self.content_key]} + for key in RICH_MESSAGE_EXTRA_KEYS: + value = conv.get(key) + if value: + message[key] = value + + tool_calls = _parse_tool_calls(conv.get("tool_calls")) + if tool_calls is not None: + message["tool_calls"] = tool_calls + elif raw_tool_calls := _raw_tool_calls(conv.get("tool_calls")): + message["raw_tool_calls"] = raw_tool_calls + + messages.append(OpenAIChatMessage(**message)) + return messages elif self.dataset_format == InputDatasetFormat.INSTRUCT_COLUMN_RESPONSE: messages = [] instruction = row[self.instruction_column] diff --git a/tests/test_instruction_datasets.py b/tests/test_instruction_datasets.py index 997ab422dd..35a2a912f9 100644 --- a/tests/test_instruction_datasets.py +++ b/tests/test_instruction_datasets.py @@ -1,6 +1,8 @@ # Copyright The Marin Authors # SPDX-License-Identifier: Apache-2.0 +import json + from marin.execution.executor import unwrap_versioned_value from marin.transform.conversation.adapters import InputDatasetFormat from marin.transform.conversation.transform_conversation import transform_row @@ -9,6 +11,7 @@ FINEPROOFS_SFT_METADATA_COLUMNS, FINEPROOFS_SFT_REVISION, INSTRUCTION_DATASET_NAME_TO_CONFIG, + NEMOTRON_OPENCODE_V1_SPLITS, SYNTHETIC2_SFT_VERIFIED_HF_ID, SYNTHETIC2_SFT_VERIFIED_METADATA_COLUMNS, SYNTHETIC2_SFT_VERIFIED_REVISION, @@ -25,6 +28,94 @@ ], } +NEMOTRON_CUDA_SAMPLE = { + "uuid": "cuda-row-1", + "license": "cc-by-4.0", + "used_in": ["cuda_sft"], + "metadata": {"topic": "cuda"}, + "tools": [{"type": "function", "function": {"name": "bash"}}], + "messages": [ + {"role": "user", "content": "Compile this CUDA kernel."}, + { + "role": "assistant", + "content": "I'll compile it.", + "reasoning_content": "Compilation output is needed.", + "tool_calls": json.dumps( + [ + { + "id": "call_cuda", + "type": "function", + "function": {"name": "bash", "arguments": json.dumps({"cmd": "nvcc kernel.cu"})}, + } + ] + ), + }, + ], +} + +NEMOTRON_SWE_V3_SAMPLE = { + "uuid": "swe-v3-row-1", + "license": "cc-by-4.0", + "messages": [ + {"role": "system", "content": "You are working inside a repository."}, + {"role": "user", "content": "Fix the failing issue."}, + { + "role": "assistant", + "content": "I'll inspect the tests.", + "reasoning_content": "Need to inspect the failing test output.", + "tool_calls": json.dumps( + [ + { + "id": "call_swe", + "type": "function", + "function": {"name": "bash", "arguments": json.dumps({"cmd": "pytest -q"})}, + } + ] + ), + }, + {"role": "tool", "content": "AssertionError", "tool_call_id": "call_swe"}, + ], +} + +NEMOTRON_CASCADE_SAMPLE = { + "category": "swe", + "source": "cascade", + "generator": "nemotron", + "thinking": True, + "messages": [ + {"role": "user", "content": "Fix the failing test."}, + {"role": "assistant", "content": "The issue is an off-by-one error."}, + ], +} + +NEMOTRON_OPENCODE_SAMPLE = { + "question_category": "repo_debug", + "complexity_level": "hard", + "uuid": "opencode-row-1", + "enabled_tools": ["bash"], + "skills_path": "/skills/bash.md", + "hf_split": "bash_only_tool", + "question": "Why does the test fail?", + "agent_prompt": "You are a coding assistant.", + "metadata": {"source": "opencode"}, + "tools": [{"type": "function", "function": {"name": "bash"}}], + "messages": [ + {"role": "user", "content": "Why does the test fail?"}, + { + "role": "assistant", + "content": "I'll run the test.", + "tool_calls": [ + { + "id": "call_test", + "type": "function", + "function": {"name": "bash", "arguments": json.dumps({"cmd": "pytest"})}, + } + ], + }, + {"role": "tool", "content": "AssertionError", "tool_call_id": "call_test"}, + ], +} + def test_fineproofs_sft_datasets_are_registered(): raw_dataset = INSTRUCTION_DATASET_NAME_TO_CONFIG["lm-provers/FineProofs-SFT"] @@ -105,3 +196,87 @@ def test_synthetic2_sft_verified_step_transforms_chat_rows(): "task_type": "prime_rl_code", "reward": 1.0, } + + +def test_nemotron_cuda_step_transforms_rich_tool_rows(): + step = get_instruction_dataset("nvidia/Nemotron-SFT-CUDA-v1") + cfg = step.config + adapter = unwrap_versioned_value(cfg.adapter) + + result = transform_row(NEMOTRON_CUDA_SAMPLE, cfg, adapter) + + assert result is not None + output = result.model_dump() + assert result.source == "nvidia/Nemotron-SFT-CUDA-v1" + assert output["metadata"] == { + "uuid": "cuda-row-1", + "license": "cc-by-4.0", + "used_in": ["cuda_sft"], + } + assert output["tools"] == NEMOTRON_CUDA_SAMPLE["tools"] + assert output["messages"][1]["reasoning_content"] == "Compilation output is needed." + assert output["messages"][1]["tool_calls"][0]["function"]["arguments"] == {"cmd": "nvcc kernel.cu"} + + +def test_nemotron_swe_v3_step_transforms_rich_tool_rows(): + step = get_instruction_dataset("nvidia/Nemotron-SFT-SWE-v3") + cfg = step.config + adapter = unwrap_versioned_value(cfg.adapter) + + result = transform_row(NEMOTRON_SWE_V3_SAMPLE, cfg, adapter) + + assert result is not None + output = result.model_dump() + assert result.source == "nvidia/Nemotron-SFT-SWE-v3" + assert output["metadata"] == { + "uuid": "swe-v3-row-1", + "license": "cc-by-4.0", + } + assert [message["role"] for message in output["messages"]] == ["system", "user", "assistant", "tool"] + assert output["messages"][2]["reasoning_content"] == "Need to inspect the failing test output." + assert output["messages"][2]["tool_calls"][0]["function"]["arguments"] == {"cmd": "pytest -q"} + assert output["messages"][3]["tool_call_id"] == "call_swe" + + +def test_nemotron_cascade_step_transforms_plain_chat_rows(): + step = get_instruction_dataset("nvidia/Nemotron-Cascade-SFT-SWE") + cfg = step.config + adapter = unwrap_versioned_value(cfg.adapter) + + result = transform_row(NEMOTRON_CASCADE_SAMPLE, cfg, adapter) + + assert result is not None + assert [message.role for message in result.messages] == ["user", "assistant"] + assert result.messages[1].content == "The issue is an off-by-one error." + assert result.metadata == { + "category": "swe", + "source": "cascade", + "generator": "nemotron", + "thinking": True, + } + + +def test_nemotron_opencode_split_steps_transform_rows_with_top_level_context(): + for split_name in NEMOTRON_OPENCODE_V1_SPLITS: + step = get_instruction_dataset(f"nvidia/Nemotron-SFT-OpenCode-v1/{split_name}") + cfg = step.config + adapter = unwrap_versioned_value(cfg.adapter) + row = {**NEMOTRON_OPENCODE_SAMPLE, "hf_split": split_name} + + result = transform_row(row, cfg, adapter) + + assert result is not None + output = result.model_dump() + assert output["metadata"] == { + "question_category": "repo_debug", + "complexity_level": "hard", + "uuid": "opencode-row-1", + "enabled_tools": ["bash"], + "skills_path": "/skills/bash.md", + "hf_split": split_name, + } + assert output["agent_prompt"] == "You are a coding assistant." + assert output["source_metadata"] == {"source": "opencode"} + assert output["tools"] == NEMOTRON_OPENCODE_SAMPLE["tools"] + assert output["messages"][1]["tool_calls"][0]["function"]["arguments"] == {"cmd": "pytest"} + assert output["messages"][2]["tool_call_id"] == "call_test" diff --git a/tests/transform/test_conversation.py b/tests/transform/test_conversation.py index 7856f1e2d8..984fafad8e 100644 --- a/tests/transform/test_conversation.py +++ b/tests/transform/test_conversation.py @@ -3,6 +3,7 @@ """Tests for conversation data transformation scripts.""" +import json from pathlib import Path from marin.transform.conversation.adapters import InputDatasetFormat, TransformAdapter @@ -78,6 +79,36 @@ "source", ] +RICH_MESSAGES_SAMPLE = { + "messages": [ + {"role": "system", "content": "Use tools when helpful."}, + { + "role": "assistant", + "content": "I'll inspect the workspace.", + "reasoning_content": "Need a directory listing.", + "tool_calls": json.dumps( + [ + { + "id": "call_1", + "type": "function", + "function": {"name": "bash", "arguments": json.dumps({"cmd": "ls"})}, + } + ] + ), + }, + {"role": "tool", "content": "README.md", "tool_call_id": "call_1"}, + ], + "tools": [{"type": "function", "function": {"name": "bash"}}], + "uuid": "rich-row-1", +} + +RICH_RAW_TOOL_CALLS_SAMPLE = { + "messages": [ + {"role": "user", "content": "Inspect the workspace."}, + {"role": "assistant", "content": "I tried the tool.", "tool_calls": "{not json"}, + ], +} + class TestTransformAdapters: """Test the different adapter formats.""" @@ -102,6 +133,33 @@ def test_openai_format_adapter(self): assert messages[1].role == "assistant" assert messages[1].content == "The capital of France is Paris." + def test_rich_openai_format_adapter_preserves_tool_and_reasoning_fields(self): + adapter = TransformAdapter( + dataset_format=InputDatasetFormat.SINGLE_COLUMN_MULTI_TURN_RICH, + conversation_column="messages", + role_key="role", + content_key="content", + user_value="user", + assistant_value="assistant", + system_value="system", + ) + + messages = adapter.transform_conversation_to_openai_format(RICH_MESSAGES_SAMPLE) + + assert messages is not None + assistant_message = messages[1].model_dump() + assert assistant_message["role"] == "assistant" + assert assistant_message["reasoning_content"] == "Need a directory listing." + assert assistant_message["tool_calls"] == [ + { + "id": "call_1", + "type": "function", + "function": {"name": "bash", "arguments": '{"cmd": "ls"}'}, + } + ] + assert messages[2].role == "tool" + assert messages[2].tool_call_id == "call_1" + def test_sharegpt_format_adapter(self): """Test ShareGPT format adapter.""" adapter = TransformAdapter( @@ -219,6 +277,60 @@ def test_fineproofs_proof_only_row_builds_instruction_response_chat(self): "source": "aops", } + def test_rich_multi_turn_row_normalizes_tool_calls_and_remaps_top_level_tools(self): + adapter = TransformAdapter( + dataset_format=InputDatasetFormat.SINGLE_COLUMN_MULTI_TURN_RICH, + conversation_column="messages", + role_key="role", + content_key="content", + user_value="user", + assistant_value="assistant", + system_value="system", + metadata_remap={"tools": "tools"}, + ) + cfg = TransformSFTDatasetConfig( + source="nvidia/Nemotron-SFT-CUDA-v1", + revision="1a06167a6e1e90d928094184173898cbb9bf42de", + output_path="/tmp/output", + metadata_columns=["uuid"], + adapter=adapter, + ) + + result = transform_row(RICH_MESSAGES_SAMPLE, cfg, adapter) + + assert result is not None + output = result.model_dump() + assert output["metadata"] == {"uuid": "rich-row-1"} + assert output["tools"] == RICH_MESSAGES_SAMPLE["tools"] + assert output["messages"][1]["reasoning_content"] == "Need a directory listing." + assert output["messages"][1]["tool_calls"][0]["function"]["arguments"] == {"cmd": "ls"} + assert output["messages"][2]["tool_call_id"] == "call_1" + + def test_rich_multi_turn_row_keeps_unparseable_tool_calls_as_raw_value(self): + adapter = TransformAdapter( + dataset_format=InputDatasetFormat.SINGLE_COLUMN_MULTI_TURN_RICH, + conversation_column="messages", + role_key="role", + content_key="content", + user_value="user", + assistant_value="assistant", + system_value="system", + ) + cfg = TransformSFTDatasetConfig( + source="nvidia/Nemotron-SFT-SWE-v3", + revision="3f73de64c1fe928a8f538fe45ccc10c228cc4c6a", + output_path="/tmp/output", + metadata_columns=[], + adapter=adapter, + ) + + result = transform_row(RICH_RAW_TOOL_CALLS_SAMPLE, cfg, adapter) + + assert result is not None + assistant_message = result.messages[1].model_dump() + assert assistant_message["tool_calls"] is None + assert assistant_message["raw_tool_calls"] == "{not json" + def test_instruct_msg_response_skips_misaligned_row(self): """A multi-message instruction is dropped (returns None), not emitted as an empty conversation.""" adapter = TransformAdapter( From fe8a079ef86550bfb327da167b981a826fef2ce6 Mon Sep 17 00:00:00 2001 From: taivu1998 <46636857+taivu1998@users.noreply.github.com> Date: Sun, 7 Jun 2026 16:22:03 -0700 Subject: [PATCH 2/2] [lint] Remove trailing whitespace in lint review skill Fix the all-files trailing whitespace failure reported by the PR lint job so the branch can pass repository-wide pre-commit checks. --- .agents/skills/lint-review/SKILL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.agents/skills/lint-review/SKILL.md b/.agents/skills/lint-review/SKILL.md index 090363efce..88d51720d0 100644 --- a/.agents/skills/lint-review/SKILL.md +++ b/.agents/skills/lint-review/SKILL.md @@ -7,8 +7,8 @@ allowed-tools: Bash(./infra/pre-commit.py:*), Bash(gh pr comment:*), Bash(gh pr # Skill: Lint-catalog review on a PR Run the `infra/lint/` catalog review (`./infra/pre-commit.py --review`) -over a pull request's branch diff and surface every finding — as `file:line` -inline review comments where the finding's line is available, and as a +over a pull request's branch diff and surface every finding — as `file:line` +inline review comments where the finding's line is available, and as a single fallback comment for the rest. ## Your contract