Skip to content

Commit bfcb034

Browse files
committed
Addressed review comments
Signed-off-by: Rohan Joshi <rohjoshi@nvidia.com>
1 parent 3514a2e commit bfcb034

3 files changed

Lines changed: 26 additions & 61 deletions

File tree

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Changelog
2424
**New Features**
2525

2626
- Extend Claude Code agent skills for PTQ, deployment, evaluation, monitoring, and baseline-vs-quantized result comparison. Adds evaluation task references for additional benchmarks, stronger PTQ checkpoint validation gates, and session-scoped workspace/job tracking.
27+
- Add ``examples/alpamayo`` showing FP8, NVFP4, and AutoQuantize (mixed-precision) quantization of the Alpamayo (formerly Alpamayo-R1) ~10B vision-language-action model, with a joint VLM + diffusion calibration loop and both fake-quant and ``--real-quant`` packed-checkpoint export. See `examples/alpamayo/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/alpamayo>`_ for details.
2728
- Add SLURM Quality of Service (QoS) support to the ModelOpt launcher. Users can set QoS via ``slurm_config.qos`` or ``SLURM_QOS`` and the value is forwarded to ``nemo_run.SlurmExecutor``.
2829
- Add composable ``$import`` system for recipe YAML configs, enabling reusable config snippets referenced via ``{$import: name}`` markers. All built-in PTQ recipes converted to use imports with shared snippets under ``modelopt_recipes/configs/`` (numeric formats, quant_cfg building blocks, presets). See :ref:`composable-imports`.
2930
- Add offline DFlash speculative decoding training. Train the draft module from pre-computed base-model hidden states dumped by ``examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py``; base-model transformer layers are deleted after conversion to save memory. Controlled by the auto-derived ``dflash_offline`` flag on ``DFlashConfig`` (derived from ``data_args.offline_data_path``). The dump scripts now share ``collect_hidden_states/common.py`` for aux-layer selection (``--aux-layers eagle|dflash|<list>``) and optional assistant-token ``loss_mask`` for answer-only-loss training.

examples/alpamayo/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Clone Alpamayo and install it into the current environment so `alpamayo_r1` is
1919
importable:
2020

2121
```bash
22-
git clone https://github.com/nvlabs/alpamayo
22+
git clone https://github.com/nvlabs/alpamayo # tested @ 4cda35d
2323
pip install ./alpamayo
2424
```
2525

examples/alpamayo/quantize.py

Lines changed: 24 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,6 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
18-
# SPDX-License-Identifier: Apache-2.0
19-
2017
"""Quantize AlpamayoR1 and export as an HF-style checkpoint.
2118
2219
Usage:
@@ -28,7 +25,6 @@
2825
import collections.abc
2926
import copy
3027
import json
31-
import logging
3228
import os
3329
from pathlib import Path
3430
from typing import Any
@@ -42,23 +38,11 @@
4238
from tqdm import tqdm
4339
from transformers import AutoProcessor, AutoTokenizer
4440

41+
import modelopt.torch.opt as mto
4542
import modelopt.torch.quantization as mtq
46-
from modelopt.torch.export import export_hf_checkpoint
4743
from modelopt.torch.export.quant_utils import get_quant_config
48-
from modelopt.torch.opt.plugins.huggingface import (
49-
_LIBRARY_CLASSES_FOR_PATCHING,
50-
_PATCHED_CLASSES,
51-
patch_pretrained_methods,
52-
)
5344
from modelopt.torch.utils.dataset_utils import create_forward_loop, get_dataset_dataloader
5445

55-
logger = logging.getLogger(__name__)
56-
57-
try:
58-
assert torch.ops.tensorrt.quantize_op.default
59-
except Exception:
60-
logger.warning("Unable to import quantization op. Please install modelopt library")
61-
6246
MIN_PIXELS = 163840
6347
MAX_PIXELS = 196608
6448
BASE_PROCESSOR_NAME = "Qwen/Qwen3-VL-2B-Instruct"
@@ -139,25 +123,6 @@ def to_device(
139123
return data
140124

141125

142-
def enable_huggingface_checkpointing_patch() -> None:
143-
"""Patch PreTrainedModel.from_pretrained / save_pretrained to save/restore ModelOpt state.
144-
145-
Must be called before AlpamayoR1.from_pretrained() when loading a quantized (FP8/NVFP4)
146-
checkpoint so that modelopt_state.pth is restored and _amax scaling factors are applied.
147-
"""
148-
for name, (classes, methods_list) in _LIBRARY_CLASSES_FOR_PATCHING.items():
149-
for cls, patch_methods in zip(classes, methods_list):
150-
if cls in _PATCHED_CLASSES:
151-
continue
152-
patch_methods = [m for m in patch_methods if m[0] != "_from_config"]
153-
patch_pretrained_methods(cls, patch_methods)
154-
_PATCHED_CLASSES.add(cls)
155-
print(f"ModelOpt save/restore enabled for `{name}` library.")
156-
157-
158-
enable_huggingface_checkpointing_patch()
159-
160-
161126
def _teacher_forced_flow_loss_forward(
162127
self,
163128
data: dict[str, Any],
@@ -345,7 +310,7 @@ def _calibration_loop(runtime_model):
345310

346311
def read_clip_ids_from_parquet(parquet_path: str) -> list[str]:
347312
"""
348-
Reads clip_ids from parquet. Tries common column names; falls back to index if needed.
313+
Reads clip_ids from the parquet's "key" column.
349314
Returns clip_ids as a list of strings (unique, preserving first occurrence order).
350315
"""
351316
parquet_path = str(parquet_path)
@@ -375,7 +340,7 @@ def quantize_model(model, args, tokenizer=None, calibration_forward_loop=None):
375340
- nvfp4: 4-bit NVIDIA floating point quantization
376341
Args:
377342
model: PyTorch model to quantize. Must be in evaluation mode.
378-
args: Command line arguments containing quant_format and debug.
343+
args: Command line arguments containing quant_format.
379344
tokenizer: Hugging Face tokenizer for creating calibration data.
380345
Required only when `calibration_forward_loop` is not provided.
381346
calibration_forward_loop: Optional callable taking `model` and running
@@ -424,9 +389,9 @@ def quantize_model(model, args, tokenizer=None, calibration_forward_loop=None):
424389
quant_cfg["quant_cfg"].append({"quantizer_name": f"{_name}.*", "enable": False})
425390

426391
model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
427-
if args.debug:
428-
print("================== quantize_model summary ==================")
429-
mtq.print_quant_summary(model)
392+
393+
print("================== quantize_model summary ==================")
394+
mtq.print_quant_summary(model)
430395

431396
return model
432397

@@ -438,27 +403,23 @@ def auto_quantize_model(
438403
clip_ids,
439404
processor,
440405
t0_us: int,
441-
top_p: float,
442-
temperature: float,
443-
max_generation_length: int,
444-
calibration_traj_samples: int,
445406
device: str,
446407
):
447408
"""
448409
Quantize a PyTorch model using ModelOpt's AutoQuantize API.
449410
450411
Searches per-layer across [NVFP4_DEFAULT_CFG, FP8_DEFAULT_CFG] under the
451-
effective-bits budget in args.auto_quantize_bits. Calibration data is built
452-
from the same joint VLM + diffusion rollout used by
453-
alpamayo_r1.eval.make_joint_calibration_forward_loop.
412+
effective-bits budget in args.auto_quantize_bits. Calibration runs the
413+
teacher-forced flow-matching forward (teacher_forced_flow_loss_forward) on
414+
the calibration clips; the MSE between v_pred and v_target is the search loss.
454415
455416
Args:
456417
model: PyTorch model to quantize. Must be in eval mode.
457-
args: Namespace with `auto_quantize_bits` (float) and `debug` (bool).
418+
args: Namespace with `auto_quantize_bits` (float).
458419
clip_ids: Iterable of clip_ids for calibration.
459420
processor: HF processor used for chat-template tokenization.
460-
t0_us, top_p, temperature, max_generation_length, calibration_traj_samples,
461-
device: Same semantics as make_joint_calibration_forward_loop.
421+
t0_us: Trajectory anchor timestamp passed to load_physical_aiavdataset.
422+
device: Device to place calibration tensors on.
462423
463424
Returns:
464425
Quantized model (the search_state from mtq.auto_quantize is discarded).
@@ -535,9 +496,8 @@ def loss_func(output, batch):
535496
print("================== auto_quantize search_state ==================")
536497
print(search_state)
537498

538-
if args.debug:
539-
print("================== auto_quantize_model summary ==================")
540-
mtq.print_quant_summary(model)
499+
print("================== auto_quantize_model summary ==================")
500+
mtq.print_quant_summary(model)
541501

542502
return model
543503

@@ -575,7 +535,12 @@ def main():
575535
default="0417_16rows_train_set_for_calibration_25.10.parquet",
576536
help="Parquet file with clip_ids for calibration",
577537
)
578-
ap.add_argument("--t0_us", type=int, default=5_100_000)
538+
ap.add_argument(
539+
"--t0_us",
540+
type=int,
541+
default=5_100_000,
542+
help="Trajectory anchor timestamp passed to load_physical_aiavdataset",
543+
)
579544
ap.add_argument("--top_p", type=float, default=0.98)
580545
ap.add_argument("--temperature", type=float, default=0.6)
581546
ap.add_argument("--max_generation_length", type=int, default=256)
@@ -600,6 +565,10 @@ def main():
600565
clip_ids = clip_ids[: args.limit]
601566
print(f"Loaded {len(clip_ids)} clip_ids from: {parquet_path}")
602567

568+
# Patch PreTrainedModel.from_pretrained / save_pretrained so ModelOpt state is saved with the
569+
# checkpoint (and restored when AlpamayoR1.from_pretrained later loads the quantized weights).
570+
mto.enable_huggingface_checkpointing()
571+
603572
device = "cuda"
604573
print(f"Loading model from {args.ckpt!r} ...")
605574
model = AlpamayoR1.from_pretrained(args.ckpt, dtype=torch.float16).to(
@@ -615,7 +584,6 @@ def main():
615584
quant_format=args.quantize,
616585
quant_algo="max",
617586
weight_only=False,
618-
debug=True,
619587
auto_quantize_bits=args.auto_quantize_bits,
620588
real_quant=args.real_quant,
621589
)
@@ -626,10 +594,6 @@ def main():
626594
clip_ids=clip_ids,
627595
processor=processor,
628596
t0_us=args.t0_us,
629-
top_p=args.top_p,
630-
temperature=args.temperature,
631-
max_generation_length=args.max_generation_length,
632-
calibration_traj_samples=args.num_traj_samples,
633597
device=device,
634598
)
635599
else:

0 commit comments

Comments
 (0)