Skip to content

Commit 11571c0

Browse files
committed
multinode streaming; k2.5 example
Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
1 parent 902d369 commit 11571c0

11 files changed

Lines changed: 530 additions & 37 deletions

File tree

examples/specdec_bench/specdec_bench/utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,10 @@ def _checkpoint_provenance(model_dir):
196196

197197

198198
def _is_sensitive_key(key):
199+
# Engine configs can carry non-string dict keys (e.g. int layer ids in a
200+
# serving_config); those are never sensitive field *names*, so skip them.
201+
if not isinstance(key, str):
202+
return False
199203
klow = key.lower()
200204
if klow in _SENSITIVE_KEY_ALLOWLIST:
201205
return False

examples/speculative_decoding/launch_train.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,14 @@ SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
3030
CONFIG_FILE=""
3131
NUM_NODES=1
3232
HEAD_NODE_IP=""
33+
MACHINE_RANK=""
3334
EXTRA_ARGS=()
3435
while [ $# -gt 0 ]; do
3536
case "$1" in
3637
--config*) if [[ "$1" != *=* ]]; then shift; fi; CONFIG_FILE="${1#*=}" ;;
3738
--num_nodes*) if [[ "$1" != *=* ]]; then shift; fi; NUM_NODES="${1#*=}" ;;
3839
--head_node_ip*) if [[ "$1" != *=* ]]; then shift; fi; HEAD_NODE_IP="${1#*=}" ;;
40+
--machine_rank*) if [[ "$1" != *=* ]]; then shift; fi; MACHINE_RANK="${1#*=}" ;;
3941
*) EXTRA_ARGS+=("$1") ;;
4042
esac
4143
shift
@@ -59,9 +61,13 @@ fi
5961
# Multi-node routing args (accelerate only; training config comes from the YAML)
6062
MULTI_NODE_ARGS=""
6163
if [[ "$NUM_NODES" != "1" ]]; then
64+
# machine_rank: caller may pass --machine_rank explicitly (needed when the
65+
# SLURM allocation reserves node 0 for something else, e.g. the streaming
66+
# vllm serve, so SLURM_PROCID is offset from accelerate's 0-based rank).
67+
# Default to $SLURM_PROCID for the all-nodes-are-trainers case.
6268
MULTI_NODE_ARGS="--num_processes $TOTAL_GPU \
6369
--num_machines $NUM_NODES \
64-
--machine_rank $SLURM_PROCID \
70+
--machine_rank ${MACHINE_RANK:-$SLURM_PROCID} \
6571
--rdzv_backend c10d \
6672
--main_process_ip $HEAD_NODE_IP \
6773
--main_process_port 29500"

modelopt/recipe/config.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,11 @@ class ModelOptDFlashRecipe(ModelOptSpeculativeRecipeBase):
178178

179179
@model_validator(mode="after")
180180
def _derive_dflash_offline(self) -> ModelOptDFlashRecipe:
181-
self.dflash.dflash_offline = self.data.offline_data_path is not None
181+
# offline (dumped .pt) and streaming (hidden states over HTTP from a vLLM
182+
# serve) both feed pre-computed base hidden states to the DFlash module, so
183+
# both set dflash_offline. Only fully-online training runs the base model.
184+
# Mirrors ModelOptEagleRecipe._derive_eagle_offline.
185+
self.dflash.dflash_offline = self.data.mode != "online"
182186
return self
183187

184188

modelopt/torch/speculative/config.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,10 @@ class DFlashConfig(ModeloptBaseConfig):
6868
dflash_offline: bool = ModeloptField(
6969
default=False,
7070
description=(
71-
"Whether to use detached DFlash (offline training from pre-computed hidden states). "
72-
"Derived by ModelOptDFlashRecipe from data.offline_data_path; not user-configurable."
71+
"Whether the DFlash module consumes pre-computed hidden states (offline from "
72+
"dumped .pt files, or streaming over HTTP from a vLLM serve) instead of running "
73+
"the base model. Derived by ModelOptDFlashRecipe from data.mode (True unless "
74+
"online); not user-configurable."
7375
),
7476
)
7577

tools/launcher/common/eagle3/train_eagle_streaming.sh

Lines changed: 91 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,19 @@
2424
# $SLURM_NODEID:
2525
# nodes == 1 -> co-located: vllm serve on $SERVE_GPU, trainer on the rest of
2626
# the local GPUs (original single-node behavior).
27-
# nodes >= 2 -> split across nodes: node 0 runs vllm serve on all its GPUs,
28-
# node 1 runs the trainer on all its GPUs. The two roles
29-
# rendezvous through the shared /scratchspace mount (node 0
30-
# publishes its address; node 1 signals completion). For large
31-
# models whose serve needs a whole node (e.g. Kimi-K2.5 TP=8),
32-
# allocate exactly 2 nodes.
27+
# nodes == 2 -> split: node 0 runs vllm serve on all its GPUs, node 1 runs
28+
# the trainer on all its GPUs. Roles rendezvous through the
29+
# shared /scratchspace mount (node 0 publishes its serve
30+
# address; the trainer signals completion).
31+
# nodes >= 3 -> 1 serve node (node 0) + N trainer nodes (nodes 1..NNODES-1)
32+
# doing multi-node DDP. The head trainer (node 1, accelerate
33+
# machine_rank 0) publishes its IP for accelerate's c10d
34+
# rendezvous; all trainer nodes read both the serve address and
35+
# the head-trainer address from /scratchspace. NOTE: only global
36+
# rank 0 fetches hidden states from the single serve and
37+
# broadcasts to the rest (DataLoaderDispatcher), so the single
38+
# serve is the throughput ceiling — adding trainer nodes scales
39+
# effective batch / compute, not data-production throughput.
3340
#
3441
# Env vars (required):
3542
# HF_MODEL_CKPT Target model path. Used by both vllm serve (as the
@@ -56,7 +63,8 @@
5663
# TRAIN_GPUS single-node only: CUDA_VISIBLE_DEVICES for the trainer.
5764
# default = all local GPUs except SERVE_GPU.
5865
# SERVE_ADVERTISE_IP multi-node only: address node 1 should dial. default is
59-
# node 0's first `hostname -I` IP.
66+
# node 0's routable IP (its resolved Slurm node name, else
67+
# its first non-loopback / non-link-local IP).
6068
#
6169
# All script args are forwarded to launch_train.sh (typically: --config <yaml>
6270
# plus OmegaConf dotlist overrides).
@@ -112,7 +120,7 @@ export PATH=$PATH:/workspace/.local/bin
112120

113121
###################################################################################################
114122

115-
trap 'error_handler $0 $LINENO' ERR # ERROR HANDLER
123+
trap 'error_handler $0 $LINENO' ERR
116124

117125
if [ -z "$HF_MODEL_CKPT" ]; then
118126
echo "ERROR: HF_MODEL_CKPT must be set." >&2; exit 1
@@ -154,11 +162,9 @@ launch_vllm() {
154162
# would expose *zero* GPUs (not all), so leave it unset to use the whole node.
155163
local -a gpu_env=()
156164
[ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd")
157-
# Optional single-value memory knobs (each a space-free env value, so they
158-
# survive nemo_run's unquoted `export FOO=value`; assembled into --flag value
159-
# pairs here). --cpu-offload-gb spills N GB of weights/GPU to host RAM, the
160-
# key lever for fitting a large model on too-few GPUs (slower, prefill-only
161-
# use tolerates it). --max-model-len / --max-num-seqs trim KV/activation.
165+
# Optional single-value memory knobs (see header), assembled into --flag
166+
# value pairs. Each is a space-free env value so it survives nemo_run's
167+
# unquoted `export FOO=value`.
162168
local -a opt_args=()
163169
[ -n "${SERVE_CPU_OFFLOAD_GB:-}" ] && opt_args+=(--cpu-offload-gb "$SERVE_CPU_OFFLOAD_GB")
164170
[ -n "${SERVE_MAX_MODEL_LEN:-}" ] && opt_args+=(--max-model-len "$SERVE_MAX_MODEL_LEN")
@@ -222,28 +228,52 @@ wait_vllm_ready() {
222228
# per process; multiple workers would duplicate requests against the server.
223229
run_trainer_and_export() {
224230
local url="$1" cvd="$2"
225-
echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all})..."
231+
# Optional multi-node trainer routing (see dispatch section). Defaults keep
232+
# the original single-trainer-node behavior: no --num_nodes, export on rank 0.
233+
local num_tnodes="${3:-1}" head_ip="${4:-}" mrank="${5:-0}"
234+
echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all}, trainer_nodes=${num_tnodes}, machine_rank=${mrank})..."
226235
# Empty cvd -> use all GPUs on the node (don't set the var; "" would hide all).
227236
local -a gpu_env=()
228237
[ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd")
238+
# Engage accelerate multi-node routing only when >1 trainer node; a single
239+
# trainer node keeps the original invocation (no --num_nodes) verbatim.
240+
local -a mn_args=()
241+
if [ "${num_tnodes}" -gt 1 ]; then
242+
mn_args=(--num_nodes "$num_tnodes" --head_node_ip "$head_ip" --machine_rank "$mrank")
243+
fi
229244
"${gpu_env[@]}" bash modules/Model-Optimizer/examples/speculative_decoding/launch_train.sh \
230245
"${SCRIPT_ARGS[@]}" \
246+
"${mn_args[@]}" \
231247
data.streaming_server_url="$url" \
232248
data.streaming_model_name="$HF_MODEL_CKPT" \
233249
data.streaming_shared_storage_path="$SERVE_SCRATCH" \
234250
training.dataloader_num_workers=0 || { echo "ERROR: trainer failed." >&2; return 1; }
235251

252+
# Export only on the head trainer (machine_rank 0); non-head trainer nodes
253+
# would race writing the same export dir. The export reads the saved
254+
# checkpoint (training.output_dir), not the serve, so it is serve-independent.
255+
if [ "${mrank}" -ne 0 ]; then
256+
echo "machine_rank=${mrank}: training done, skipping export (head trainer handles it)."
257+
return 0
258+
fi
259+
260+
# Export the trained draft to HF format. Derive the checkpoint dir from the
261+
# forwarded `training.output_dir=` dotlist (defaulting to the EAGLE
262+
# convention) so EAGLE and DFlash runs each export their own output_dir.
263+
# EXPORT_EXTRA_ARGS lets DFlash on a custom-modeling base (e.g. Kimi) pass
264+
# --trust_remote_code; empty by default so EAGLE behavior is unchanged.
265+
local out_dir
266+
out_dir=$(printf '%s\n' "${SCRIPT_ARGS[@]}" | sed -n 's/^training\.output_dir=//p' | tail -1)
267+
out_dir="${out_dir:-/scratchspace/eagle3}"
236268
python3 modules/Model-Optimizer/examples/speculative_decoding/scripts/export_hf_checkpoint.py \
237-
--model_path /scratchspace/eagle3 \
238-
--export_path /scratchspace/export
269+
--model_path "$out_dir" \
270+
--export_path "${EXPORT_PATH:-/scratchspace/export}" \
271+
${EXPORT_EXTRA_ARGS:-}
239272
}
240273

241274
# ---------------------------------------------------------------------------
242-
# Topology dispatch (driven by the Slurm allocation, i.e. the yaml `nodes:`):
243-
# SLURM_NNODES == 1 -> co-located: vllm on $SERVE_GPU, trainer on the rest.
244-
# SLURM_NNODES >= 2 -> split: node 0 serves on all its GPUs, node 1 trains on
245-
# all its GPUs; they rendezvous via /scratchspace.
246-
# nemo_run runs this script once per node, so we branch on $SLURM_NODEID.
275+
# Topology dispatch (see header): nemo_run runs this script once per node, so
276+
# branch on $SLURM_NNODES / $SLURM_NODEID. Per-branch detail in section heads.
247277
# ---------------------------------------------------------------------------
248278
NNODES="${SLURM_NNODES:-1}"
249279
NODEID="${SLURM_NODEID:-0}"
@@ -299,27 +329,55 @@ elif [ "$NODEID" -eq 0 ]; then
299329
while [ ! -f "$DONE_FILE" ]; do sleep 10; done
300330
echo "Training-done sentinel seen; serve node exiting (EXIT trap stops vllm)."
301331

302-
elif [ "$NODEID" -eq 1 ]; then
303-
# ---------------------- multi-node: trainer node -----------------------
304-
# Release the serve node on any exit (success or failure) so it doesn't hang.
305-
trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT
332+
elif [ "$NODEID" -ge 1 ]; then
333+
# -------------------- multi-node: trainer node(s) ----------------------
334+
# Node 0 is the vllm serve; trainer nodes are SLURM nodes 1..NNODES-1, which
335+
# map to 0-based accelerate machine ranks (head trainer = SLURM node 1).
336+
NUM_TRAINER_NODES=$(( NNODES - 1 ))
337+
TRAINER_RANK=$(( NODEID - 1 ))
338+
TRAINER_ADDR_FILE="/scratchspace/.trainer_addr"
339+
340+
# Only the head trainer (rank 0) signals the serve node to release on exit;
341+
# a non-head node exiting first must NOT tear the serve down early.
342+
if [ "$TRAINER_RANK" -eq 0 ]; then
343+
trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT
344+
rm -f "$TRAINER_ADDR_FILE" # clear stale rendezvous state
345+
fi
306346

307-
echo "Trainer node waiting (up to ${SERVE_READY_TIMEOUT}s) for the serve address..."
347+
echo "Trainer node (rank ${TRAINER_RANK}/${NUM_TRAINER_NODES}) waiting for the serve address..."
308348
for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do
309349
[ -f "$SERVE_ADDR_FILE" ] && break
310350
sleep 1
311351
done
312352
[ -f "$SERVE_ADDR_FILE" ] || { echo "ERROR: serve node never published its address." >&2; exit 1; }
313353
URL="http://$(cat "$SERVE_ADDR_FILE"):${SERVE_PORT}"
314-
315354
wait_vllm_ready "$URL" || exit 1
316-
run_trainer_and_export "$URL" "" || exit 1
317355

318-
else
319-
# ------------- multi-node: extra nodes (unused by default) -------------
320-
echo "Node rank ${NODEID} idle: the default split uses node 0 = vllm serve, node 1 = trainer."
321-
echo "Multi-node *training* (>1 trainer node) is not wired up yet; allocate exactly 2 nodes."
322-
while [ ! -f "$DONE_FILE" ]; do sleep 10; done
356+
if [ "$NUM_TRAINER_NODES" -le 1 ]; then
357+
# Original 1-serve + 1-trainer topology: single-node DDP, unchanged.
358+
run_trainer_and_export "$URL" "" || exit 1
359+
else
360+
# >1 trainer node: head (rank 0) publishes its routable IP for accelerate's
361+
# c10d rendezvous (port 29500); all trainer nodes read it and join. Reuse
362+
# the serve node's IP-resolution logic (avoid link-local / loopback).
363+
if [ "$TRAINER_RANK" -eq 0 ]; then
364+
head_addr="${TRAINER_ADVERTISE_IP:-}"
365+
[ -z "$head_addr" ] && head_addr=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1)
366+
[ -z "$head_addr" ] && head_addr=$(hostname -I | tr ' ' '\n' | grep -vE '^(127\.|169\.254\.|fe80:|::1)' | head -1)
367+
[ -z "$head_addr" ] && head_addr=$(hostname -I | awk '{print $1}')
368+
echo "$head_addr" > "$TRAINER_ADDR_FILE"
369+
echo "Head trainer (rank 0) published ${head_addr} for c10d rendezvous."
370+
else
371+
echo "Trainer rank ${TRAINER_RANK} waiting for head-trainer address..."
372+
for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do
373+
[ -f "$TRAINER_ADDR_FILE" ] && break
374+
sleep 1
375+
done
376+
[ -f "$TRAINER_ADDR_FILE" ] || { echo "ERROR: head trainer never published its address." >&2; exit 1; }
377+
fi
378+
HEAD_IP=$(cat "$TRAINER_ADDR_FILE")
379+
run_trainer_and_export "$URL" "" "$NUM_TRAINER_NODES" "$HEAD_IP" "$TRAINER_RANK" || exit 1
380+
fi
323381
fi
324382

325383
###################################################################################################

tools/launcher/core.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,9 @@ def build_slurm_executor(
286286
retries=0,
287287
packager=packager,
288288
srun_args=slurm_config.srun_args,
289+
# --segment=<N>: pin all nodes into one topology block (one NVL72 / NVLink
290+
# domain). None -> omitted, scheduler places freely (default behavior).
291+
segment=slurm_config.segment,
289292
)
290293
return executor
291294

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# DFlash dry-run smoke test for Kimi-K2.5 (NVFP4).
2+
#
3+
# Single-task pipeline that exercises the full convert→save→export path WITHOUT
4+
# actually training. Uses the same `common/specdec/dflash_online_training.sh`
5+
# entrypoint as a real DFlash run; all dry-run behaviour is expressed as dotlist
6+
# overrides on `main.py` (shared with EAGLE3 — `--dry_run` is mode-agnostic):
7+
#
8+
# --dry_run → main.py skips trainer.train(), saves
9+
# the (untrained) ModelOpt checkpoint
10+
# to training.output_dir right after
11+
# mtsp.convert(model, [("dflash", ...)])
12+
# data.offline_data_path=<placeholder> → DataArguments derives data.mode from
13+
# the data-source fields, so setting an
14+
# offline path makes mode='offline' →
15+
# use_offline_training=True. Combined
16+
# with use_fake_base_for_offline=true
17+
# this loads a FakeBaseModel (only
18+
# embed_tokens + lm_head), so the ~1T
19+
# MoE base fits on a single GPU. The
20+
# file is never read in --dry_run mode.
21+
# model.trust_remote_code=true → Kimi-K2.5 (deepseek_v3 arch) ships a
22+
# custom modeling file
23+
# dflash.dflash_mask_token_id=163838 → Kimi-K2.5 has no dedicated mask token
24+
# ([EOS]=163585, [PAD]=163839); 163838 is
25+
# a reserved slot used as the DFlash mask
26+
# (matches the real Kimi-K2.5 DFlash run)
27+
#
28+
# The dflash_online_training.sh export block then writes an HF-format DFlash draft
29+
# to /scratchspace/dflash/exported-checkpoint-final with the correct architecture
30+
# (5-layer draft block, block_size=8) but untrained weights — acceptance ~0%, by
31+
# design. Useful for smoke-testing the launcher / convert / export plumbing and
32+
# validating downstream loaders without paying for a real training run.
33+
#
34+
# Usage:
35+
# uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml --yes
36+
37+
job_name: Kimi-K2.5_DFlash_dryrun
38+
pipeline:
39+
allow_to_fail: false
40+
skip: false
41+
note:
42+
43+
global_vars:
44+
hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4/
45+
46+
# Convert → save → export (no training).
47+
task_0:
48+
script: common/specdec/dflash_online_training.sh
49+
args:
50+
- --dry_run
51+
- --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
52+
- model.model_name_or_path=<<global_vars.hf_model>>
53+
- model.use_fake_base_for_offline=true
54+
- model.trust_remote_code=true
55+
- data.offline_data_path=/tmp/dryrun-placeholder
56+
- training.output_dir=/scratchspace/dflash
57+
- training.disable_tqdm=true
58+
- dflash.dflash_mask_token_id=163838
59+
slurm_config:
60+
_factory_: "slurm_factory"
61+
nodes: 1
62+
ntasks_per_node: 1
63+
gpus_per_node: 1
64+
container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10

0 commit comments

Comments
 (0)