NVIDIA
diff --git a/‎examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py‎
Lines changed: 242 additions & 0 deletions b/‎examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py‎
Lines changed: 242 additions & 0 deletions
diff --git a/‎examples/speculative_decoding/pipeline/eagle3/eagle3_new_model_triage_guide.md‎
Lines changed: 131 additions & 0 deletions b/‎examples/speculative_decoding/pipeline/eagle3/eagle3_new_model_triage_guide.md‎
Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,242 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Extract hidden states from an LLM using vLLM's native hidden-state extractor.
+
+This uses vLLM's built-in ``extract_hidden_states`` speculative method together with
+the ``ExampleHiddenStatesConnector`` KV connector, so no third-party data-generation
+dependency (e.g. ``speculators``) is required. Because the same ``eagle_aux_hidden_state_layer_ids``
+convention is used at EAGLE3 deployment time in vLLM, the captured aux layers match
+deployment by construction.
+
+See https://docs.vllm.ai/en/stable/features/speculative_decoding/extract_hidden_states/
+"""
+
+import argparse
+from pathlib import Path
+
+import torch
+from common import add_aux_layers_args, resolve_aux_layers
+from datasets import load_dataset
+from tqdm import tqdm
+from transformers import AutoConfig, AutoTokenizer
+
+REMOVE_THINK_CHAT_TEMPLATE = (
+    "{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}"
+)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="""Collect hidden states from conversations using vLLM's native extractor."""
+    )
+
+    parser.add_argument("--model", type=str, required=True, help="HF model path.")
+    parser.add_argument(
+        "--max-seq-len", type=int, default=3072, help="Max tokens per conversation."
+    )
+    parser.add_argument(
+        "--input-data", type=Path, required=True, help="Path to jsonl file or directory."
+    )
+    parser.add_argument(
+        "--output-dir", type=Path, required=True, help="Directory to save hidden states."
+    )
+    parser.add_argument("--dp-rank", type=int, default=0, help="Data parallel rank.")
+    parser.add_argument("--dp-world-size", type=int, default=1, help="Data parallel world size.")
+    parser.add_argument(
+        "--trust_remote_code", action="store_true", help="Trust remote code for HF models."
+    )
+    parser.add_argument("--tp", type=int, default=None, help="Tensor parallel size.")
+    parser.add_argument(
+        "--debug-max-num-conversations", type=int, default=None, help="Limit conversations."
+    )
+    add_aux_layers_args(parser)
+
+    return parser.parse_args()
+
+
+def main(args: argparse.Namespace) -> None:
+    # Import lazily so --help and arg parsing work without vLLM installed.
+    from vllm import LLM, SamplingParams
+    from vllm.config.kv_transfer import KVTransferConfig
+    from vllm.distributed.kv_transfer.kv_connector.v1 import example_hidden_states_connector
+    from vllm.inputs import TokensPrompt
+
+    # Load conversations
+    if args.input_data.is_file() and str(args.input_data).endswith(".jsonl"):
+        dataset = load_dataset("json", data_files=str(args.input_data), split="train")
+    elif args.input_data.is_dir():
+        dataset = load_dataset(
+            "json", data_files={"train": f"{args.input_data}/*.jsonl"}, split="train"
+        )
+    else:
+        raise ValueError(f"input_data must be a .jsonl file or directory, got: {args.input_data}")
+    print(f"Loaded {len(dataset)} conversations from {args.input_data}")
+
+    # Shard data
+    if args.dp_world_size > 1:
+        dataset = dataset.shard(num_shards=args.dp_world_size, index=args.dp_rank)
+    print(f"Sharded to {len(dataset)} conversations for DP#{args.dp_rank}/{args.dp_world_size}")
+
+    # Remove already dumped conversations
+    output_dir = args.output_dir
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    def keep_conversation(entry):
+        conversation_id = entry.get("conversation_id", entry.get("uuid", None))
+        assert conversation_id is not None, "conversation_id is required"
+        return not (output_dir / f"{conversation_id}.pt").exists()
+
+    original_num = len(dataset)
+    dataset = dataset.filter(keep_conversation)
+    print(f"Removed {original_num - len(dataset)} conversations due to existing output files")
+
+    if args.debug_max_num_conversations is not None:
+        dataset = dataset.select(range(args.debug_max_num_conversations))
+
+    # Resolve the aux-layer indices and append the final-layer output. vLLM saves the
+    # final (un-normed) hidden state when ``num_hidden_layers`` is passed as a layer id.
+    config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+    num_hidden_layers = getattr(config, "num_hidden_layers", None)
+    if num_hidden_layers is None:
+        raise ValueError(f"model config has no 'num_hidden_layers' attribute: {config}")
+    aux_layer_ids = resolve_aux_layers(args, num_hidden_layers)
+    # The trailing entry is the final output hidden state; the rest are aux layers.
+    extract_layer_ids = [*aux_layer_ids, num_hidden_layers]
+    print(f"Extracting hidden states from layers {extract_layer_ids} (last = final output)")
+
+    # Tokenize conversations
+    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    if tokenizer.chat_template is not None:
+        tokenizer.chat_template = tokenizer.chat_template.replace(REMOVE_THINK_CHAT_TEMPLATE, "")
+
+    # Prepare prompts for vLLM
+    prompts = []
+    conversation_ids = []
+    num_skipped_too_long = 0
+    num_invalid = 0
+
+    for entry in dataset:
+        conversation_id = entry.get("conversation_id", entry.get("uuid"))
+        conversations = entry["conversations"]
+        if not conversations or not isinstance(conversations, list):
+            num_invalid += 1
+            continue
+
+        tokenized = tokenizer.apply_chat_template(
+            conversations, return_tensors="pt", add_generation_prompt=False
+        )
+        # transformers 5.x: BatchEncoding may not inherit from dict; use .input_ids
+        if hasattr(tokenized, "input_ids"):
+            input_ids = tokenized.input_ids
+        elif hasattr(tokenized, "__getitem__") and "input_ids" in tokenized:
+            input_ids = tokenized["input_ids"]
+        else:
+            input_ids = tokenized
+        if not hasattr(input_ids, "shape"):
+            input_ids = torch.tensor(input_ids)
+        input_ids = input_ids.squeeze(0)
+        num_tokens = input_ids.shape[0]
+        if num_tokens <= 10 or num_tokens > args.max_seq_len:
+            num_skipped_too_long += 1
+            continue
+
+        prompts.append(TokensPrompt(prompt_token_ids=input_ids.tolist()))
+        conversation_ids.append(conversation_id)
+
+    print(
+        f"Prepared {len(prompts)} prompts ({num_skipped_too_long} skipped too long, {num_invalid} invalid)"
+    )
+
+    if len(prompts) == 0:
+        print("No prompts to process.")
+        return
+
+    # Initialize vLLM with the native hidden-state extractor.
+    tp = args.tp if args.tp is not None else torch.cuda.device_count()
+    storage_path = output_dir / ".vllm_hidden_states"
+    storage_path.mkdir(parents=True, exist_ok=True)
+
+    llm = LLM(
+        model=args.model,
+        tensor_parallel_size=tp,
+        max_model_len=args.max_seq_len,
+        trust_remote_code=args.trust_remote_code,
+        enable_chunked_prefill=False,  # required by extract_hidden_states
+        speculative_config={
+            "method": "extract_hidden_states",
+            "num_speculative_tokens": 1,
+            "draft_model_config": {
+                "hf_config": {"eagle_aux_hidden_state_layer_ids": extract_layer_ids},
+            },
+        },
+        kv_transfer_config=KVTransferConfig(
+            kv_connector="ExampleHiddenStatesConnector",
+            kv_role="kv_producer",
+            kv_connector_extra_config={
+                "shared_storage_path": str(storage_path),
+                "use_synchronization_lock": False,  # batch generation, no concurrent readers
+            },
+        ),
+    )
+
+    # max_tokens=1: we only need a single forward pass over the prompt tokens.
+    outputs = llm.generate(prompts, SamplingParams(max_tokens=1))
+
+    # Save in the same format as compute_hidden_states_hf.py (sans loss_mask, which the
+    # vLLM path does not compute).
+    num_success = 0
+    for conv_id, output in tqdm(zip(conversation_ids, outputs), total=len(outputs), desc="Saving"):
+        hidden_states_path = output.kv_transfer_params.get("hidden_states_path")
+        if hidden_states_path is None:
+            print(f"WARNING: no hidden_states_path for conversation {conv_id}; skipping")
+            continue
+
+        obj = example_hidden_states_connector.load_hidden_states(hidden_states_path)
+        token_ids = obj["token_ids"]
+        # hidden_states: [num_tokens, num_extracted_layers, hidden_size], ordered to match
+        # extract_layer_ids. Last layer = final output; the rest = aux layers.
+        hidden_states = obj["hidden_states"]
+
+        output_hidden_states = hidden_states[:, -1, :].cpu()
+        if hidden_states.shape[1] > 1:
+            # Concatenate aux layers along the hidden dim, matching the HF dump format.
+            aux = hidden_states[:, :-1, :].cpu()
+            aux_hidden_states = aux.reshape(aux.shape[0], -1)
+        else:
+            aux_hidden_states = torch.empty(0)
+
+        output_file = output_dir / f"{conv_id}.pt"
+        with open(output_file, "wb") as f:
+            torch.save(
+                {
+                    "input_ids": token_ids.cpu(),
+                    "hidden_states": output_hidden_states,
+                    "aux_hidden_states": aux_hidden_states,
+                    "conversation_id": conv_id,
+                },
+                f,
+            )
+        example_hidden_states_connector.cleanup_hidden_states(hidden_states_path)
+        num_success += 1
+
+    print(f"Successfully processed {num_success} out of {len(prompts)} conversations.")
+
+
+if __name__ == "__main__":
+    cli_args = parse_args()
+    main(cli_args)
@@ -0,0 +1,131 @@
+# EAGLE3 New Model Support — Triage Guide for Claude Code
+
+This document describes how to triage EAGLE3 pipeline failures when adding a new model.
+Follow these steps in order. Stop at the first failure, diagnose, and document findings.
+
+## Pipeline Overview
+
+The EAGLE3 pipeline has 4 stages (mapped to task_0 through task_3 in the YAML):
+
+| Task | Stage | Container | Script | What it does |
+|------|-------|-----------|--------|-------------|
+| task_0 | Data synthesis | vllm/vllm-openai | `common/vllm/query.sh` | Serve model with vLLM, generate synthetic conversations |
+| task_1 | Hidden state dump | vllm/vllm-openai | `common/eagle3/dump_offline_data*.sh` | Dump hidden states from generated conversations |
+| task_2 | Training + Export | tensorrt-llm/release | `common/eagle3/train_eagle.sh` | Train EAGLE3 draft model, export HF checkpoint |
+| task_3 | Benchmark | vllm/vllm-openai | `common/specdec_bench/quick_check.sh` | Run speculative decoding benchmark |
+
+Some configs combine task_0+task_1 into a single vLLM dump step, or skip task_0 if data already exists.
+
+## Step 1: Locate the pipeline config
+
+```text
+tools/launcher/examples/<Org>/<Model>/eagle3_quick_check.yaml
+```
+
+If it doesn't exist, create one by copying an existing `eagle3_quick_check.yaml` and adjusting:
+- `HF_MODEL_CKPT` — the HF model path on `/hf-local/`
+- GPU/node counts based on model size
+- `--trust_remote_code` / `--trust-remote-code` if needed
+- Container images
+
+## Step 2: Submit the pipeline
+
+```bash
+cd tools/launcher
+uv run launch.py --yaml examples/<Org>/<Model>/eagle3_quick_check.yaml --yes -v
+```
+
+The rsync can take several minutes. Experiment ID is printed as `cicd_<timestamp>`.
+
+## Step 3: Check experiment output
+
+Experiment directory:
+
+```text
+experiments/cicd/cicd_<id>/
+```
+
+Each task has a directory `<JobName>_<N>/` containing:
+- `sbatch_<JobName>_<N>_<SlurmJobID>.out` — the main log
+- `code/` — snapshot of the code at submission time
+
+Check logs:
+
+```bash
+tail -100 experiments/cicd/cicd_<id>/<JobName>_<N>/sbatch_*.out
+```
+
+## Step 4: Diagnose failures by stage
+
+### task_0/task_1 failures (vLLM data generation / hidden state dump)
+
+Common issues:
+- **Server never starts** → Check for OOM, unsupported architecture, or missing `--trust_remote_code`
+- **`HarmonyError: vocab file`** → gated model, tokenizer not available offline
+- **`TypeError: 'NoneType' object is not iterable`** → vLLM doesn't support this model architecture yet
+- **`CANCELLED DUE TO TIME LIMIT`** → Model too slow for the time limit; increase wall time or reduce data
+- **Server starts but queries fail** → Check prompt format, connection errors
+
+### task_2 failures (training + export)
+
+Common issues:
+- **`No such file or directory: service_utils.sh`** → pipeline infra issue (older experiment)
+- **`ValueError: Unrecognized configuration class ... for AutoModelForCausalLM`** → VLM model not detected as VLM. Check if `load_vlm_or_llm` in `modelopt/torch/speculative/utils.py` handles this model type. Look for `text_config`/`llm_config` attributes.
+- **`FileNotFoundError` on shard files** → Checkpoint has unusual format (e.g., missing HF shards, has consolidated.safetensors instead). Check `FakeBaseModel._load_weights`.
+- **OOM during training** → Reduce `--train_bs` or `--training_seq_len`
+- **NaN loss** → Reduce `--lr`, check data quality
+
+### task_3 failures (benchmark)
+
+Common issues:
+- **`/scratchspace/export` doesn't exist** → task_2 failed; fix training first
+- **`StrictDataclassFieldValidationError`** → exported `config.json` has `null` where a typed field is expected (e.g., `use_cache`). Fix the export template in `modelopt/torch/export/plugins/hf_spec_configs.py`.
+- **`KeyError: '<model_type>'`** → transformers version in container doesn't recognize the model type
+- **`trust_remote_code=True` required** → add to benchmark config
+- **vLLM resolves model as wrong architecture** → VLM wrapper model needs special handling
+
+## Step 5: Applying fixes
+
+### Repo fixes (for merged modelopt)
+
+Edit files in `/home/yeyu/Documents/TensorRT-Model-Optimizer/modelopt/torch/speculative/`.
+The key files:
+- `utils.py` — `load_vlm_or_llm()` for model loading
+- `plugins/modeling_fakebase.py` — `FakeBaseModel` for offline training weight loading
+- `plugins/hf_eagle.py` — EAGLE model definition
+- `../export/plugins/hf_spec_configs.py` — export config templates
+- `../export/plugins/hf_spec_export.py` — export logic
+
+### Container patches (for pipeline)
+
+A container may ship a pre-installed modelopt that can't be easily upgraded (CUDA build issues).
+If a fix is needed against such an installed library, apply a runtime patch in the relevant
+task script (e.g. the training script `common/eagle3/train_eagle.sh`) using a Python heredoc
+that find-and-replaces the exact code pattern in the installed file.
+
+> Note: the vLLM dump path previously relied on source-patching the `speculators` library.
+> That dependency was removed in favor of vLLM's native `extract_hidden_states` extractor, so
+> no speculators patches are applied anymore.
+
+When adding a new patch:
+1. Find the exact `old` string in the installed file (must be unique)
+2. Write the `new` replacement string
+3. Add a `python3 << 'PYEOF' || true` block in the task script before `set -eo pipefail`
+
+## Step 6: Document results
+
+Update `examples/speculative_decoding/pipeline/eagle3/eagle3_triage_chart.md`:
+1. Update the model row in the **Model Test Matrix** (status + per-task results)
+2. Add a **Per-Model Test Results** entry with experiment IDs, errors, and fixes
+3. Add new failure patterns to the **Observed Failure Catalog**
+
+## Known Model-Specific Issues
+
+| Model Type | Issue | Where | Fix |
+|-----------|-------|-------|-----|
+| `mistral3` (Ministral-3-*) | Not detected as VLM by `"vl"` check | `utils.py` | Check `text_config`/`llm_config` attrs |
+| `mistral3` (Ministral-3-8B) | Missing HF shard 1, has `consolidated.safetensors` | `modeling_fakebase.py` | Fallback to consolidated with key aliases |
+| All models via FakeBaseModel | `use_cache=null` in exported config | `hf_spec_configs.py` | Set `use_cache: True` in templates |
+| `gpt-oss-20b` | Tokenizer requires `openai_harmony` | task_0 | Gated/special tokenizer setup |
+| `MiniMax-M2.5` | Custom model code | task_3 | `--trust_remote_code` |
+| `ministral3` | `KeyError: 'ministral3'` in older transformers | task_3 | Needs transformers >= 5.3.0 |