examples(streaming-rdma): add Qwen3-8B DFlash streaming examples (single + multi-node)

h-guo18 · h-guo18 · commit d69614430c98 · 2026-06-08T05:14:57.000Z
Mirror the EAGLE3 streaming examples for DFlash: same NIXL-RDMA hidden-states
transport and dispatch (train_eagle_streaming.sh), but the DFlash recipe and a
different capture set. DFlash extracts build_target_layer_ids(36,5)=[1,9,17,25,33]
as the draft fc input plus the final layer for self-logit distillation, so vLLM
captures [2,10,18,26,34,36] (each target id +1, plus final layer 36).

No code change needed: the streaming dataset already emits the same dict shape
DFlash's offline path consumes. The streaming corpus is prompt-only (the serve
generates the response and we capture its hidden states), so answer_only_loss is
false (train over the full sequence, as in the EAGLE3 streaming example), and
report_to is set to none since the dflash recipe defaults to tensorboard, which
is absent in the serve container.

Validated single-node on oci-nrt H100: the serve captures 6 layers, the trainer
RDMA-fetches them, and DFlash converges (loss 11.0 -> 6.7 in 400 steps) and exports.

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_dflash.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_dflash.yaml
@@ -0,0 +1,96 @@
+# DFlash streaming speculative decoding pipeline for Qwen3-8B.
+#
+# Same streaming transport as the EAGLE3 example (hf_streaming_eagle3.yaml): a live
+# `vllm serve` captures the target model's hidden states and moves them to the trainer
+# over NIXL RDMA (no disk round-trip). DFlash just consumes a different set of captured
+# layers and trains a block-diffusion draft instead of an autoregressive one.
+#
+# 3-step pipeline:
+#   task_0: Build input conversations (jsonl)
+#   task_1: Streaming train — vllm serve + DFlash trainer; hidden states over NIXL RDMA
+#   task_2: vLLM smoke test with DFlash speculative decoding
+#
+# task_1 uses nodes=2: node 0 runs vllm serve, node 1 the trainer. Tasks share
+# /scratchspace to pass artifacts.
+#
+# Usage:
+#   uv run launch.py --yaml examples/Qwen/Qwen3-8B/hf_streaming_dflash.yaml --yes
+
+job_name: Qwen3-8B_DFlash_streaming
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3-8B
+
+  # Step 1: Build input conversations
+  task_0:
+    script: common/eagle3/make_dataset.sh
+    args:
+      - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml
+      - --full-conversations
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 2: Streaming DFlash training — node 0 vllm serve, node 1 trainer.
+  # DFlash extracts 5 target layers (build_target_layer_ids(36,5)=[1,9,17,25,33], the
+  # draft's fc input) plus the final layer for self-logit distillation. vLLM's capture
+  # ids are those +1 -> [2,10,18,26,34], plus final layer 36 (= EAGLE_CAPTURE_IDS below).
+  task_1:
+    script: common/eagle3/train_eagle_streaming.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.mode=streaming
+      - data.data_path=/scratchspace/data/train.jsonl
+      - training.output_dir=/scratchspace/dflash
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      # Streaming corpus is prompt-only (the serve generates the response and we
+      # capture its hidden states), so there is no assistant span to mask -> train
+      # over the full sequence, same as the EAGLE3 streaming example.
+      - training.answer_only_loss=false
+      - training.num_train_epochs=1
+      # dflash.yaml sets report_to=tensorboard, which hard-fails if tensorboard
+      # isn't in the serve container; the streaming trainer doesn't need it.
+      - training.report_to=none
+      - dflash.dflash_block_size=16
+      - dflash.dflash_num_anchors=512
+      - dflash.dflash_loss_decay_factor=7
+      - dflash.dflash_mask_token_id=151669
+      - dflash.dflash_architecture_config.num_hidden_layers=5
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      # No spaces: nemo_run emits unquoted `export FOO=value`, so spaces would split.
+      - EAGLE_CAPTURE_IDS: "[2,10,18,26,34,36]"
+      - SERVE_TP: "1"
+      # DFlash uses a custom modeling file; export must trust remote code.
+      - EXPORT_EXTRA_ARGS: "--trust_remote_code"
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: vllm/vllm-openai:latest
+
+  # Step 3: vLLM smoke test (DFlash, uses exported checkpoint from training)
+  task_2:
+    script: common/specdec/vllm_smoke_test.sh
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - DRAFT_MODEL: /scratchspace/export
+      - SPEC_METHOD: "dflash"
+      - NUM_SPEC_TOKENS: "7"
+      - MIN_ACCEPTANCE_LENGTH: "1.2"
+    slurm_config:
+      _factory_: "slurm_factory"
+      container: "vllm/vllm-openai:nightly"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_dflash_multi_node.yaml
@@ -0,0 +1,98 @@
+# DFlash streaming speculative decoding pipeline for Qwen3-8B — MULTI-NODE.
+#
+# Same streaming transport / dispatch as hf_streaming_eagle3_multi_node.yaml: task_1
+# splits N nodes into K serve replicas + (N-K) DDP trainers via SERVE_NODES; hidden
+# states move serve -> trainer over NIXL RDMA. DFlash just consumes a different set of
+# captured layers and trains a block-diffusion draft instead of an autoregressive one.
+# See common/eagle3/train_eagle_streaming.sh for dispatch, rendezvous, and sharding.
+#
+# 3-step pipeline:
+#   task_0: Build input conversations (jsonl)
+#   task_1: Streaming train — 2 serve nodes (2 GPU, TP=2) + 2 trainer nodes (2 GPU)
+#   task_2: vLLM smoke test with DFlash speculative decoding
+#
+# Usage:
+#   uv run launch.py --yaml examples/Qwen/Qwen3-8B/hf_streaming_dflash_multi_node.yaml --yes
+
+job_name: Qwen3-8B_DFlash_streaming_multi_node
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3-8B
+
+  # Step 1: Build input conversations
+  task_0:
+    script: common/eagle3/make_dataset.sh
+    args:
+      - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml
+      - --full-conversations
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 2: Streaming DFlash training — 2 serve replicas (TP=2) + 2 trainer nodes (2 GPU each).
+  # DFlash extracts 5 target layers (build_target_layer_ids(36,5)=[1,9,17,25,33], the
+  # draft's fc input) plus the final layer for self-logit distillation. vLLM's capture
+  # ids are those +1 -> [2,10,18,26,34], plus final layer 36 (= EAGLE_CAPTURE_IDS below).
+  task_1:
+    script: common/eagle3/train_eagle_streaming.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.mode=streaming
+      - data.data_path=/scratchspace/data/train.jsonl
+      - training.output_dir=/scratchspace/dflash
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      # Streaming corpus is prompt-only (the serve generates the response and we
+      # capture its hidden states), so there is no assistant span to mask -> train
+      # over the full sequence, same as the EAGLE3 streaming example.
+      - training.answer_only_loss=false
+      - training.num_train_epochs=1
+      # dflash.yaml sets report_to=tensorboard, which hard-fails if tensorboard
+      # isn't in the serve container; the streaming trainer doesn't need it.
+      - training.report_to=none
+      - dflash.dflash_block_size=16
+      - dflash.dflash_num_anchors=512
+      - dflash.dflash_loss_decay_factor=7
+      - dflash.dflash_mask_token_id=151669
+      - dflash.dflash_architecture_config.num_hidden_layers=5
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      # No spaces: nemo_run emits unquoted `export FOO=value`, so spaces would split.
+      - EAGLE_CAPTURE_IDS: "[2,10,18,26,34,36]"
+      - SERVE_TP: "2"
+      # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
+      - SERVE_NODES: "2"
+      # Per-rank in-flight fetches; keep low so the cold serve isn't flooded past its execute-model timeout.
+      - STREAMING_NUM_WORKERS: "4"
+      # DFlash uses a custom modeling file; export must trust remote code.
+      - EXPORT_EXTRA_ARGS: "--trust_remote_code"
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 4
+      ntasks_per_node: 1
+      gpus_per_node: 2
+      container: vllm/vllm-openai:latest
+
+  # Step 3: vLLM smoke test (DFlash, uses exported checkpoint from training)
+  task_2:
+    script: common/specdec/vllm_smoke_test.sh
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - DRAFT_MODEL: /scratchspace/export
+      - SPEC_METHOD: "dflash"
+      - NUM_SPEC_TOKENS: "7"
+      - MIN_ACCEPTANCE_LENGTH: "1.2"
+    slurm_config:
+      _factory_: "slurm_factory"
+      container: "vllm/vllm-openai:nightly"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1