use recipe import system

h-guo18 · h-guo18 · commit 47e13f83d316 · 2026-05-21T19:52:20.000Z
Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
diff --git a/modelopt_recipes/configs/speculative_decoding/dflash/default.yaml b/modelopt_recipes/configs/speculative_decoding/dflash/default.yaml
@@ -0,0 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Default DFlashConfig values for DFlash training. Imported into the `dflash:`
+# section of recipes. ``dflash_mask_token_id`` is intentionally omitted: per-model
+# recipes should set it explicitly; otherwise main.py falls back to
+# ``tokenizer.mask_token_id`` (DFlashConfig raises if neither provides one).
+
+# modelopt-schema: modelopt.torch.speculative.config.DFlashConfig
+dflash_block_size: 8
+dflash_num_anchors: 512
+dflash_use_torch_compile: false
+dflash_self_logit_distillation: true
+dflash_loss_decay_factor: 4.0
+dflash_architecture_config:
+  num_hidden_layers: 5
+  # mask_token_id: auto-detected from model vocab (override for specific models)
+  # sliding_window and layer_types are inherited from base model config automatically
diff --git a/modelopt_recipes/configs/speculative_decoding/dflash/training_default.yaml b/modelopt_recipes/configs/speculative_decoding/dflash/training_default.yaml
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Default `training:` section values for DFlash training. Imported into the
+# `training:` section of recipes. HF trainer fields flow through SpecTrainingArgs
+# via ``extra='allow'`` and are re-validated by transformers.TrainingArguments
+# in main.py.
+
+# modelopt-schema: modelopt.torch.speculative.plugins.hf_training_args.TrainingArguments
+
+# --- commonly modified ---
+output_dir:
+num_train_epochs: 10
+per_device_train_batch_size: 1
+learning_rate: 6.0e-4
+warmup_steps: 100
+training_seq_len: 4096
+logging_steps: 100
+save_steps: 5000
+cp_size: 1
+dp_shard_size: 1
+disable_tqdm: true
+estimate_ar: false
+ar_validate_steps: 0
+answer_only_loss: true
+
+# --- rarely modified ---
+do_eval: false
+lr_scheduler_type: linear
+save_strategy: steps
+weight_decay: 0.0
+dataloader_drop_last: true
+bf16: true
+tf32: true
+remove_unused_columns: false
+ddp_find_unused_parameters: true
+ddp_timeout: 1800
+report_to: tensorboard
diff --git a/modelopt_recipes/configs/speculative_decoding/eagle/default.yaml b/modelopt_recipes/configs/speculative_decoding/eagle/default.yaml
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Default EagleConfig values for EAGLE3 training. Imported into the `eagle:` section of recipes.
+# eagle_offline is derived from data.offline_data_path; do not set here.
+
+# modelopt-schema: modelopt.torch.speculative.config.EagleConfig
+eagle_decoder_type: llama
+eagle_ttt_steps: 3
+eagle_mix_hidden_states: false
+eagle_use_torch_compile: true
+eagle_self_logit_distillation: true
+eagle_freeze_base_model: true
+eagle_loss_decay_factor: 0.9
+eagle_hidden_state_distillation: false
+eagle_reuse_base_decoder: false
+eagle_report_acc: true
+eagle_enable_nvtx: false
+# RoPE scaling: disabled during training (default_config.py uses rope_type=default);
+# YaRN is injected at export time for long-context inference.
+eagle_export_rope_scaling:
+  rope_type: yarn
+  factor: 32.0
+  original_max_position_embeddings: 2048
+# Overrides for the defaults in modelopt/torch/speculative/eagle/default_config.py.
+eagle_architecture_config: {}
diff --git a/modelopt_recipes/configs/speculative_decoding/eagle/training_default.yaml b/modelopt_recipes/configs/speculative_decoding/eagle/training_default.yaml
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Default `training:` section values for EAGLE3 training. Imported into the
+# `training:` section of recipes. HF trainer fields flow through SpecTrainingArgs
+# via ``extra='allow'`` and are re-validated by transformers.TrainingArguments
+# in main.py.
+
+# modelopt-schema: modelopt.torch.speculative.plugins.hf_training_args.TrainingArguments
+
+# --- commonly modified ---
+output_dir:
+num_train_epochs: 1
+per_device_train_batch_size: 1
+learning_rate: 1.0e-4
+warmup_steps: 1000
+training_seq_len: 2048
+logging_steps: 100
+save_steps: 8192
+cp_size: 1
+disable_tqdm: false
+estimate_ar: false
+ar_validate_steps: -1
+answer_only_loss: false
+
+# --- rarely modified ---
+do_eval: false
+lr_scheduler_type: linear
+save_strategy: steps
+weight_decay: 0.0
+dataloader_drop_last: true
+bf16: true
+tf32: true
+remove_unused_columns: false
diff --git a/modelopt_recipes/general/speculative_decoding/dflash.yaml b/modelopt_recipes/general/speculative_decoding/dflash.yaml
@@ -1,16 +1,21 @@
-# DFlash speculative-decoding training recipe. Override fields via OmegaConf dotlist on the CLI.
+# DFlash speculative-decoding training recipe. Override fields via OmegaConf dotlist on the CLI,
+# or write a per-model recipe in modelopt_recipes/models/ that imports the same config defaults.
 
 metadata:
   recipe_type: speculative_dflash
   description: DFlash training recipe (model/data/training/dflash bundled).
 
-# maps to ModelArguments (main.py)
+imports:
+  dflash_default: configs/speculative_decoding/dflash/default
+  dflash_training_default: configs/speculative_decoding/dflash/training_default
+
+# maps to ModelArguments
 model:
   model_name_or_path:
   trust_remote_code: false
   use_fake_base_for_offline: false
 
-# maps to DataArguments (main.py)
+# maps to DataArguments
 data:
   data_path:
   offline_data_path:
@@ -19,45 +24,13 @@ data:
   # Templates are in modelopt_recipes/general/speculative_decoding/chat_templates/
   chat_template:
 
-# maps to TrainingArguments (main.py)
+# maps to TrainingArguments
 training:
-  # --- commonly modified ---
-  output_dir:
-  num_train_epochs: 10
-  per_device_train_batch_size: 1
-  learning_rate: 6.0e-4
-  warmup_steps: 100
-  training_seq_len: 4096
-  logging_steps: 100
-  save_steps: 5000
-  cp_size: 1
-  dp_shard_size: 1
-  disable_tqdm: true
-  estimate_ar: false
-  ar_validate_steps: 0
-  answer_only_loss: true
-
-  # --- rarely modified ---
-  do_eval: false
-  lr_scheduler_type: linear
-  save_strategy: steps
-  weight_decay: 0.0
-  dataloader_drop_last: true
-  bf16: true
-  tf32: true
-  remove_unused_columns: false
-  ddp_find_unused_parameters: true
-  ddp_timeout: 1800
-  report_to: tensorboard
+  $import: dflash_training_default
 
 # maps to DFlashConfig (modelopt/torch/speculative/config.py).
+# Per-model recipes should also set ``dflash_mask_token_id``; otherwise main.py
+# falls back to ``tokenizer.mask_token_id``, and DFlashConfig raises if neither
+# source provides one.
 dflash:
-  dflash_block_size: 8
-  dflash_num_anchors: 512
-  dflash_use_torch_compile: false
-  dflash_self_logit_distillation: true
-  dflash_loss_decay_factor: 4.0
-  dflash_architecture_config:
-    num_hidden_layers: 5
-    # mask_token_id: auto-detected from model vocab (override for specific models)
-    # sliding_window and layer_types are inherited from base model config automatically
+  $import: dflash_default
diff --git a/modelopt_recipes/general/speculative_decoding/eagle3.yaml b/modelopt_recipes/general/speculative_decoding/eagle3.yaml
@@ -1,69 +1,32 @@
-# EAGLE3 speculative-decoding training recipe. Override fields via OmegaConf dotlist on the CLI.
+# EAGLE3 speculative-decoding training recipe. Override fields via OmegaConf dotlist on the CLI,
+# or write a per-model recipe in modelopt_recipes/models/ that imports the same config defaults.
 
 metadata:
   recipe_type: speculative_eagle
   description: EAGLE3 training recipe (model/data/training/eagle bundled).
 
-# maps to ModelArguments (main.py)
+imports:
+  eagle_default: configs/speculative_decoding/eagle/default
+  eagle_training_default: configs/speculative_decoding/eagle/training_default
+
+# maps to ModelArguments
 model:
   model_name_or_path:
   trust_remote_code: false
   use_fake_base_for_offline: false
 
-# maps to DataArguments (main.py)
+# maps to DataArguments
 data:
   data_path: input_conversations/train.jsonl
   offline_data_path:
   draft_vocab_cache:
   vlm_img_dir:
   vlm_processor:
 
-# maps to TrainingArguments (main.py)
+# maps to TrainingArguments
 training:
-  # --- commonly modified ---
-  output_dir:
-  num_train_epochs: 1
-  per_device_train_batch_size: 1
-  learning_rate: 1.0e-4
-  warmup_steps: 1000
-  training_seq_len: 2048
-  logging_steps: 100
-  save_steps: 8192
-  cp_size: 1
-  disable_tqdm: false
-  estimate_ar: false
-  ar_validate_steps: -1
-  answer_only_loss: false
-
-  # --- rarely modified ---
-  do_eval: false
-  lr_scheduler_type: linear
-  save_strategy: steps
-  weight_decay: 0.0
-  dataloader_drop_last: true
-  bf16: true
-  tf32: true
-  remove_unused_columns: false
+  $import: eagle_training_default
 
 # maps to EagleConfig (modelopt/torch/speculative/config.py).
 eagle:
-  # eagle_offline is derived from data.offline_data_path; do not set here.
-  eagle_decoder_type: llama
-  eagle_ttt_steps: 3
-  eagle_mix_hidden_states: false
-  eagle_use_torch_compile: true
-  eagle_self_logit_distillation: true
-  eagle_freeze_base_model: true
-  eagle_loss_decay_factor: 0.9
-  eagle_hidden_state_distillation: false
-  eagle_reuse_base_decoder: false
-  eagle_report_acc: true
-  eagle_enable_nvtx: false
-  # Rope scaling: disable during training (default_config.py uses rope_type=default),
-  # inject YaRN during export for long-context inference.
-  eagle_export_rope_scaling:
-    rope_type: yarn
-    factor: 32.0
-    original_max_position_embeddings: 2048
-  # overwrite to modelopt/torch/speculative/eagle/default_config.py
-  eagle_architecture_config: {}
+  $import: eagle_default
diff --git a/modelopt_recipes/models/Kimi-K2.5/dflash.yaml b/modelopt_recipes/models/Kimi-K2.5/dflash.yaml
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Per-model DFlash offline training recipe for Kimi-K2.5.
+
+metadata:
+  recipe_type: speculative_dflash
+  description: DFlash offline training recipe for Kimi-K2.5.
+
+imports:
+  dflash_default: configs/speculative_decoding/dflash/default
+  dflash_training_default: configs/speculative_decoding/dflash/training_default
+
+model:
+  model_name_or_path: moonshotai/Kimi-K2.5
+  trust_remote_code: true
+  use_fake_base_for_offline: true
+
+data:
+  offline_data_path: <path to offline data>
+
+training:
+  $import: dflash_training_default
+  output_dir: ckpts/kimi-k25-dflash
+
+dflash:
+  $import: dflash_default
+  # If unset, main.py falls back to tokenizer.mask_token_id; DFlashConfig
+  # raises if neither this field nor the tokenizer provides one.
+  # dflash_mask_token_id:
diff --git a/modelopt_recipes/models/Kimi-K2.5/eagle3.yaml b/modelopt_recipes/models/Kimi-K2.5/eagle3.yaml
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Per-model EAGLE3 offline training recipe for Kimi-K2.5.
+# Mirrors examples/speculative_decoding/scripts/train_kimi_k25_offline.sh.
+
+metadata:
+  recipe_type: speculative_eagle
+  description: EAGLE3 offline training recipe for Kimi-K2.5.
+
+imports:
+  eagle_default: configs/speculative_decoding/eagle/default
+  eagle_training_default: configs/speculative_decoding/eagle/training_default
+
+model:
+  model_name_or_path: moonshotai/Kimi-K2.5
+  trust_remote_code: true
+  use_fake_base_for_offline: true
+
+data:
+  offline_data_path: <path to offline data>
+
+training:
+  $import: eagle_training_default
+  output_dir: ckpts/kimi-k25-eagle3
+  training_seq_len: 4096
+
+eagle:
+  $import: eagle_default
+  eagle_decoder_type: kimik2