Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .agents/skills/lint-review/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ allowed-tools: Bash(./infra/pre-commit.py:*), Bash(gh pr comment:*), Bash(gh pr
# Skill: Lint-catalog review on a PR

Run the `infra/lint/` catalog review (`./infra/pre-commit.py --review`)
over a pull request's branch diff and surface every finding — as `file:line`
inline review comments where the finding's line is available, and as a
single fallback comment for the rest.

## Your contract
Expand Down
104 changes: 104 additions & 0 deletions experiments/posttrain/instruction_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@
23. open-thoughts/OpenThoughts3-1.2M # Original OT3 dataset; smoltalk2 uses a slightly different version
24. lm-provers/FineProofs-SFT
25. lm-provers/FineProofs-SFT/proof-only
26. nvidia/Nemotron-SFT-SWE-v3
27. nvidia/Nemotron-Cascade-SFT-SWE
28. nvidia/Nemotron-SFT-OpenCode-v1
29. nvidia/Nemotron-SFT-CUDA-v1
"""

import dataclasses
Expand Down Expand Up @@ -103,6 +107,14 @@
]

NEMOTRON_V1_SPLITS = ["chat", "code", "math", "stem", "tool_calling"]
# Split names of nvidia/Nemotron-SFT-OpenCode-v1. The registration loop further
# down in this module creates one dataset config per entry, keyed as
# "nvidia/Nemotron-SFT-OpenCode-v1/<split>".
NEMOTRON_OPENCODE_V1_SPLITS = [
    "general",
    "bash_only_tool",
    "bash_only_tool_skills",
    "question_tool",
    "agent_skills",
    "agent_skills_question_tool",
]


@dataclass(frozen=True)
Expand Down Expand Up @@ -156,6 +168,33 @@ def multi_turn_adapter(
)


def rich_multi_turn_adapter(
    conversation_column: str = "messages",
    role_key: str = "role",
    user_value: str = "user",
    assistant_value: str = "assistant",
    system_value: str = "system",
    tool_value: str = "tool",
    content_key: str = "content",
    metadata_remap: dict[str, str] | None = None,
    replacements: dict[str, str] | None = None,
    extra_metadata_fn=None,
) -> TransformAdapter:
    """Create a ``TransformAdapter`` for the rich single-column multi-turn format.

    The adapter is tagged with
    ``InputDatasetFormat.SINGLE_COLUMN_MULTI_TURN_RICH``; every argument is
    forwarded to ``TransformAdapter`` under the same keyword.

    Args:
        conversation_column: Row column that holds the list of messages.
        role_key: Key inside each message that carries the speaker role.
        user_value: Source role value that denotes the user.
        assistant_value: Source role value that denotes the assistant.
        system_value: Source role value that denotes the system.
        tool_value: Source role value that denotes a tool message.
        content_key: Key inside each message that carries the content.
        metadata_remap: Forwarded as ``metadata_remap``; ``None`` (or any
            falsy value) is replaced by a fresh empty dict.
        replacements: Forwarded unchanged (``None`` stays ``None``).
        extra_metadata_fn: Forwarded unchanged.

    Returns:
        The configured ``TransformAdapter``.
    """
    # A new dict per call keeps adapters from sharing one mutable default.
    remap = metadata_remap if metadata_remap else {}

    role_settings = {
        "role_key": role_key,
        "user_value": user_value,
        "assistant_value": assistant_value,
        "system_value": system_value,
        "tool_value": tool_value,
    }

    return TransformAdapter(
        dataset_format=InputDatasetFormat.SINGLE_COLUMN_MULTI_TURN_RICH,
        conversation_column=conversation_column,
        content_key=content_key,
        metadata_remap=remap,
        replacements=replacements,
        extra_metadata_fn=extra_metadata_fn,
        **role_settings,
    )


def instruction_response_adapter(
*,
instruction_column: str,
Expand Down Expand Up @@ -248,6 +287,22 @@ def __call__(self, row: dict[str, Any]) -> dict[str, Any]:
SYNTHETIC2_SFT_VERIFIED_REVISION = "fce247fe48af8ff9624fb51d1de63aa1b2332cef"
SYNTHETIC2_SFT_VERIFIED_METADATA_COLUMNS = ["problem_id", "task_type", "reward"]

# Revision pins and metadata column lists for the nvidia Nemotron SFT datasets.
# Each pair is passed as `revision=` / `metadata_columns=` to the matching
# InstructionDatasetConfig registered in this module.
NEMOTRON_SFT_SWE_V3_REVISION = "3f73de64c1fe928a8f538fe45ccc10c228cc4c6a"
NEMOTRON_SFT_SWE_V3_METADATA_COLUMNS = ["uuid", "license"]
NEMOTRON_CASCADE_SFT_SWE_REVISION = "b4ef73ff9bbcfc33d1ec7a48e53017f9ce7af7a3"
NEMOTRON_CASCADE_SFT_SWE_METADATA_COLUMNS = ["category", "source", "generator", "thinking"]
NEMOTRON_SFT_OPENCODE_V1_REVISION = "556d5237acff203f3e1a0be49428634c3606cda2"
NEMOTRON_SFT_OPENCODE_V1_METADATA_COLUMNS = [
    "question_category",
    "complexity_level",
    "uuid",
    "enabled_tools",
    "skills_path",
    "hf_split",
]
NEMOTRON_SFT_CUDA_V1_REVISION = "1a06167a6e1e90d928094184173898cbb9bf42de"
NEMOTRON_SFT_CUDA_V1_METADATA_COLUMNS = ["uuid", "license", "used_in"]

FINEPROOFS_SFT_REVISION = "73661e6"
FINEPROOFS_SFT_METADATA_COLUMNS = [
"category",
Expand Down Expand Up @@ -430,6 +485,36 @@ def __call__(self, row: dict[str, Any]) -> dict[str, Any]:
subsets=["default"],
splits=["train"],
),
"nvidia/Nemotron-SFT-SWE-v3": InstructionDatasetConfig(
hf_dataset_id="nvidia/Nemotron-SFT-SWE-v3",
revision=NEMOTRON_SFT_SWE_V3_REVISION,
adapter=rich_multi_turn_adapter(),
metadata_columns=NEMOTRON_SFT_SWE_V3_METADATA_COLUMNS,
name="nvidia/Nemotron-SFT-SWE-v3",
subsets=["default"],
splits=["train"],
max_parallelism=16,
),
"nvidia/Nemotron-Cascade-SFT-SWE": InstructionDatasetConfig(
hf_dataset_id="nvidia/Nemotron-Cascade-SFT-SWE",
revision=NEMOTRON_CASCADE_SFT_SWE_REVISION,
adapter=multi_turn_adapter(),
metadata_columns=NEMOTRON_CASCADE_SFT_SWE_METADATA_COLUMNS,
name="nvidia/Nemotron-Cascade-SFT-SWE",
subsets=["default"],
splits=["train"],
max_parallelism=16,
),
"nvidia/Nemotron-SFT-CUDA-v1": InstructionDatasetConfig(
hf_dataset_id="nvidia/Nemotron-SFT-CUDA-v1",
revision=NEMOTRON_SFT_CUDA_V1_REVISION,
adapter=rich_multi_turn_adapter(metadata_remap={"tools": "tools"}),
metadata_columns=NEMOTRON_SFT_CUDA_V1_METADATA_COLUMNS,
name="nvidia/Nemotron-SFT-CUDA-v1",
subsets=["default"],
splits=["train"],
max_parallelism=8,
),
"sherryy/tulu-3-sft-personas-instruction-following-expanded": InstructionDatasetConfig(
hf_dataset_id="sherryy/tulu-3-sft-personas-instruction-following-expanded",
revision="79ab2c4",
Expand Down Expand Up @@ -595,6 +680,25 @@ def __call__(self, row: dict[str, Any]) -> dict[str, Any]:
splits=[split_name],
)

# Register every nvidia/Nemotron-SFT-OpenCode-v1 split as its own dataset
# entry, keyed "<hf dataset id>/<split>".
for split_name in NEMOTRON_OPENCODE_V1_SPLITS:
    dataset_key = f"nvidia/Nemotron-SFT-OpenCode-v1/{split_name}"
    INSTRUCTION_DATASET_NAME_TO_CONFIG[dataset_key] = InstructionDatasetConfig(
        # Where the data comes from.
        hf_dataset_id="nvidia/Nemotron-SFT-OpenCode-v1",
        revision=NEMOTRON_SFT_OPENCODE_V1_REVISION,
        subsets=["default"],
        splits=[split_name],
        # How it is registered and processed.
        name=dataset_key,
        max_parallelism=4,
        metadata_columns=NEMOTRON_SFT_OPENCODE_V1_METADATA_COLUMNS,
        # A fresh adapter per split; the source "metadata" column is
        # remapped to "source_metadata", the other two keep their names.
        adapter=rich_multi_turn_adapter(
            metadata_remap={
                "agent_prompt": "agent_prompt",
                "metadata": "source_metadata",
                "tools": "tools",
            },
        ),
    )


def get_directory_friendly_dataset_name(hf_dataset_id: str) -> str:
dataset_name = hf_dataset_id.replace("/", "--")
Expand Down
67 changes: 67 additions & 0 deletions lib/marin/src/marin/transform/conversation/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import dataclasses
import json
from collections.abc import Callable
from dataclasses import dataclass, field
from enum import Enum
Expand Down Expand Up @@ -50,11 +51,52 @@ class InputDatasetFormat(str, Enum):
"""

SINGLE_COLUMN_MULTI_TURN = "messages"
SINGLE_COLUMN_MULTI_TURN_RICH = "messages_rich"
INSTRUCTION_RESPONSE = "instruction_response"
INSTRUCT_COLUMN_RESPONSE = "instruct_column_response"
INSTRUCT_MSG_RESPONSE = "instruct_msg_response"


# Optional per-message keys that the SINGLE_COLUMN_MULTI_TURN_RICH conversion
# copies from a source message onto the output message, but only when the
# source value is truthy.
RICH_MESSAGE_EXTRA_KEYS = (
    "name",
    "tool_call_id",
    "reasoning_content",
)


def _parse_tool_calls(value: Any) -> list[dict[str, Any]] | None:
if value is None:
return None

parsed = value
if isinstance(value, str):
stripped = value.strip()
if not stripped:
return None
try:
parsed = json.loads(stripped)
except json.JSONDecodeError:
return None

if isinstance(parsed, dict):
parsed = [parsed]
if not isinstance(parsed, list) or not parsed:
return None
if not all(isinstance(item, dict) for item in parsed):
return None
return parsed


def _raw_tool_calls(value: Any) -> str | None:
if not isinstance(value, str):
return None

stripped = value.strip()
if not stripped:
return None
return stripped


@dataclass
class TransformAdapter:
dataset_format: InputDatasetFormat = InputDatasetFormat.INSTRUCTION_RESPONSE
Expand Down Expand Up @@ -130,6 +172,31 @@ def transform_conversation_to_openai_format(
role = role_to_openai_role[conv[self.role_key]]
messages.append(OpenAIChatMessage(role=role, content=conv[self.content_key]))
return messages
elif self.dataset_format == InputDatasetFormat.SINGLE_COLUMN_MULTI_TURN_RICH:
messages = []
role_to_openai_role = {
self.user_value: "user",
self.assistant_value: "assistant",
self.system_value: "system",
self.tool_value: "tool",
}
conversation = row[self.conversation_column]
for conv in conversation:
role = role_to_openai_role[conv[self.role_key]]
message: dict[str, Any] = {"role": role, "content": conv[self.content_key]}
for key in RICH_MESSAGE_EXTRA_KEYS:
value = conv.get(key)
if value:
message[key] = value

tool_calls = _parse_tool_calls(conv.get("tool_calls"))
if tool_calls is not None:
message["tool_calls"] = tool_calls
elif raw_tool_calls := _raw_tool_calls(conv.get("tool_calls")):
message["raw_tool_calls"] = raw_tool_calls

messages.append(OpenAIChatMessage(**message))
return messages
elif self.dataset_format == InputDatasetFormat.INSTRUCT_COLUMN_RESPONSE:
messages = []
instruction = row[self.instruction_column]
Expand Down
Loading
Loading