Skip to content

Commit 0dc4b4e

Browse files
Fridah-nv and h-guo18 authored
[NVIDIA#4403][autodeploy] Refactor: Move more transformations to new inf optimizer, Add quantization_source to factory interface (NVIDIA#6760)
Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Signed-off-by: Frida Hou <201670829+Fridah-nv@users.noreply.github.com>
Co-authored-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
1 parent 7c686ba commit 0dc4b4e

30 files changed

+2189
-1211
lines changed

tensorrt_llm/_torch/auto_deploy/config/default.yaml

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,6 @@ transforms:
1919
stage: post_export
2020
cleanup_input_constraints:
2121
stage: post_export
22-
quantize:
23-
stage: pattern_matcher
24-
quantize_moe:
25-
stage: pattern_matcher
2622
match_repeat_kv:
2723
stage: pattern_matcher
2824
match_eager_attention:
@@ -31,3 +27,35 @@ transforms:
3127
stage: pattern_matcher
3228
match_attention_layout:
3329
stage: pattern_matcher
30+
match_moe_pattern:
31+
stage: pattern_matcher
32+
match_rope_pattern:
33+
stage: pattern_matcher
34+
match_rope_layout:
35+
stage: pattern_matcher
36+
eliminate_redundant_transposes:
37+
stage: pattern_matcher
38+
# TODO (lucaslie): let's move this to perf optimization once TP sharding is improved
39+
# see https://github.com/NVIDIA/TensorRT-LLM/pull/3668#discussion_r2052714528
40+
optimize_rope:
41+
stage: pattern_matcher
42+
quantize_from_config:
43+
stage: pattern_matcher
44+
quantize_from_graph:
45+
stage: pattern_matcher
46+
quantize_moe:
47+
stage: pattern_matcher
48+
# TODO: Infer sharding parameters (tp_size, row/column sharding) from the model config.
49+
detect_column_row_shard:
50+
stage: sharding
51+
simple_shard_only: false
52+
detect_ep_shard:
53+
stage: sharding
54+
detect_dp_bmm_shard:
55+
stage: sharding
56+
# TODO: (hg) need to ensure run_shape_prop after sharding.
57+
sharding_transform_executor:
58+
stage: sharding
59+
run_shape_prop: true
60+
load_weights:
61+
stage: weight_load

tensorrt_llm/_torch/auto_deploy/models/hf.py

Lines changed: 28 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Interface to initialize and load HF models."""
22

3-
import json
43
import os
54
import types
65
from contextlib import contextmanager, nullcontext
@@ -31,6 +30,7 @@
3130
from ..utils._config import deep_merge_dicts
3231
from ..utils.logger import ad_logger
3332
from .factory import ModelFactory, ModelFactoryRegistry
33+
from .quant_config_reader import QuantConfigReader, QuantConfigReaderRegistry
3434

3535

3636
@contextmanager
@@ -84,9 +84,7 @@ def _get_max_position_embeddings_config(self) -> Dict[str, Any]:
8484

8585
def __init__(self, *args, **kwargs):
8686
super().__init__(*args, **kwargs)
87-
88-
self._quant_config: Optional[Dict] = None
89-
87+
self._quant_config_reader: QuantConfigReader | None = None
9088
# Ingest defaults for tokenizer and model kwargs
9189
self.tokenizer_kwargs = deep_merge_dicts(self._tokenizer_defaults, self.tokenizer_kwargs)
9290
self.model_kwargs = deep_merge_dicts(
@@ -156,9 +154,6 @@ def _recursive_update_config(self, config: PretrainedConfig, update_dict: Dict[s
156154

157155
def _build_model(self, device: DeviceLikeType) -> nn.Module:
158156
"""Build the model on the desired device."""
159-
# We only support fp16 to fp4 conversion.
160-
if self._quant_config and self._quant_config.get("quant_algo", None) == "NVFP4":
161-
self.model_kwargs["torch_dtype"] = torch.half
162157

163158
# NOTE (lucaslie): HF doesn't recursively update nested PreTrainedConfig objects. Instead,
164159
# the entire subconfig will be overwritten.
@@ -178,23 +173,27 @@ def _build_model(self, device: DeviceLikeType) -> nn.Module:
178173
model.forward = types.MethodType(self._simple_forward, model)
179174

180175
model.eval()
176+
181177
return model
182178

183179
def get_quant_config(self) -> Dict:
    """Returns the quantization config for this model or an empty dict if not quantized."""
    reader = self._quant_config_reader
    # No reader means no quantized checkpoint was detected.
    return reader.get_config() if reader is not None else {}
185184

186185
def get_cache_config(self):
    """Return the kv-cache dtype configuration derived from the quantization config.

    Returns:
        CacheConfig: dtype=torch.float8_e4m3fn for FP8-quantized kv caches, or
        dtype=None when the checkpoint is not quantized or has no kv-cache quant.
    """
    if not self._quant_config_reader:
        return CacheConfig(dtype=None)

    kv_cache_dtype = self._quant_config_reader.get_config().get("kv_cache_dtype")
    if kv_cache_dtype is None:
        return CacheConfig(dtype=None)
    # NOTE(review): the previous assert compared the *converted* torch dtype against
    # (torch.float8_e4m3fn, None) and so could never fail; validate the raw string
    # instead so an unexpected kv-cache format fails loudly rather than silently
    # mapping to an unquantized cache. (The modelopt reader only ever emits
    # "float8_e4m3fn", so this is behavior-preserving for supported checkpoints.)
    assert kv_cache_dtype == "float8_e4m3fn", (
        f"Unsupported dtype: {kv_cache_dtype}. Only torch.float8_e4m3fn is supported."
    )
    return CacheConfig(dtype=torch.float8_e4m3fn)
198197

199198
def init_tokenizer(self) -> Optional[Any]:
200199
"""Initialize the tokenizer—either a custom name or the model's default."""
@@ -325,22 +324,18 @@ def _load_checkpoint(self, model: nn.Module, device: DeviceLikeType):
325324

326325
def _load_quantization_config(self, fetched_dir: str):
    """Load the quantization config from the model directory if not done already.

    Args:
        fetched_dir: Local directory containing the fetched checkpoint.

    Side effects: sets ``self._quant_config_reader`` and merges any
    reader-provided kwargs into ``self.model_kwargs``.
    """
    if self._quant_config_reader is not None:
        return
    # TODO: specified by user or auto-detect
    reader_cls = QuantConfigReaderRegistry.get("modelopt")
    result = reader_cls.from_file(fetched_dir)
    if result is None:
        # Not a (modelopt-)quantized checkpoint; nothing to do.
        return

    # from_file returned a tuple, so the reader is always non-None here; the
    # previous `if reader is not None:` guard was dead code and has been removed.
    reader, extra_model_kwargs = result
    self._quant_config_reader = reader
    self.model_kwargs = deep_merge_dicts(self.model_kwargs, extra_model_kwargs)
344339

345340

346341
@ModelFactoryRegistry.register("AutoModelForImageTextToText")
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
"""
2+
Quantization Config Reader Registry.
3+
4+
This module defines a registry system for parsing quantization configurations
5+
from various sources (e.g., 'modelopt'). It enables extensible support for different
6+
quantization producers by delegating parsing logic to dedicated subclasses.
7+
"""
8+
9+
import json
10+
import os
11+
from abc import ABC, abstractmethod
12+
from typing import Any, Callable, Dict, Optional, Tuple, Type
13+
14+
15+
class QuantConfigReader(ABC):
    """Base class for reading and parsing quantization config."""

    def __init__(self):
        # Normalized quantization config; populated by read_config().
        self._quant_config: Optional[Dict] = {}

    def get_config(self) -> Dict:
        """Return the parsed quantization config."""
        return self._quant_config

    @abstractmethod
    def read_config(self, config: Dict) -> Dict:
        """Parse and normalize a quantization config dictionary.

        Args:
            config: The raw parsed JSON object.

        Returns:
            A dictionary of extra model kwargs derived from the quantization config.
            Implementations must also populate self._quant_config with the
            normalized quantization config.
        """

    @classmethod
    @abstractmethod
    def from_file(cls, file_path: str) -> Optional[Tuple["QuantConfigReader", Dict[str, Any]]]:
        """Load and parse a quantization config file from disk.

        This method is implemented by each reader to handle loading and parsing logic.

        Args:
            file_path: Path to the quant config JSON file.

        Returns:
            A (reader, extra_model_kwargs) tuple, or None if the file doesn't exist.
        """
55+
56+
57+
class QuantConfigReaderRegistry:
    """Registry mapping producer names to QuantConfigReader subclasses."""

    _registry: Dict[str, Type[QuantConfigReader]] = {}

    @classmethod
    def register(cls, name: str) -> Callable[[Type[QuantConfigReader]], Type[QuantConfigReader]]:
        """Return a class decorator registering a reader class under `name`."""

        def inner(reader_cls: Type[QuantConfigReader]) -> Type[QuantConfigReader]:
            cls._registry[name] = reader_cls
            return reader_cls

        return inner

    @classmethod
    def get(cls, name: str) -> Type[QuantConfigReader]:
        """Look up a registered reader class; raise ValueError if unknown."""
        try:
            return cls._registry[name]
        except KeyError:
            raise ValueError(f"QuantConfigReader for '{name}' not registered.") from None

    @classmethod
    def has(cls, reader_cls: str) -> bool:
        """Return True if a reader is registered under the given name."""
        return reader_cls in cls._registry
77+
78+
79+
@QuantConfigReaderRegistry.register("modelopt")
class ModelOPTQuantConfigReader(QuantConfigReader):
    """Reader for modelopt-produced ``hf_quant_config.json`` checkpoints."""

    def read_config(self, config: Dict) -> Dict:
        """Normalize a raw modelopt quantization config.

        Args:
            config: The raw parsed JSON object.

        Returns:
            Extra model kwargs derived from the config (e.g. a torch_dtype
            override for NVFP4 checkpoints).

        Raises:
            ValueError: if the producer is not 'modelopt' or the kv-cache
                quantization format is unsupported.
        """
        producer = config.get("producer", {}).get("name")
        # sanity check
        if producer != "modelopt":
            raise ValueError(f"Expected producer 'modelopt', got '{producer}'")

        quant_config = config.get("quantization", {})
        # Inject default exclusion, add "model.embed_tokens" for "tie_word_embedding:true" case
        quant_config.setdefault("exclude_modules", ["lm_head", "model.embed_tokens"])

        extra_model_kwargs: Dict[str, Any] = {}
        # We only support fp16 for NVFP4 checkpoints. Record the dtype override in
        # both the normalized config and the model kwargs in a single branch; the
        # previous code checked `quant_algo == "NVFP4"` in two separate places,
        # which could silently drift apart.
        if quant_config.get("quant_algo") == "NVFP4":
            quant_config["torch_dtype"] = "float16"
            extra_model_kwargs["torch_dtype"] = "float16"

        # Handle kv cache
        kv_algo = quant_config.get("kv_cache_quant_algo")
        if kv_algo:
            if kv_algo != "FP8":
                raise ValueError(f"KV cache quantization format {kv_algo} not supported.")
            quant_config["kv_cache_dtype"] = "float8_e4m3fn"

        self._quant_config = quant_config
        return extra_model_kwargs

    @classmethod
    def from_file(
        cls, ckpt_dir: str
    ) -> Optional[Tuple["ModelOPTQuantConfigReader", Dict[str, Any]]]:
        """
        Load and parse a modelopt-style quantization config from a checkpoint directory.

        Args:
            ckpt_dir: Path to the root directory containing the checkpoint.

        Returns:
            An initialized ModelOPTQuantConfigReader instance plus extra model
            kwargs, or None if hf_quant_config.json doesn't exist.
        """
        quant_file = os.path.join(ckpt_dir, "hf_quant_config.json")
        if not os.path.exists(quant_file):
            return None

        with open(quant_file, "r") as f:
            raw = json.load(f)
        reader = cls()
        extra_model_kwargs = reader.read_config(raw)
        return reader, extra_model_kwargs

tensorrt_llm/_torch/auto_deploy/transform/interface.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from ..shim.interface import CachedSequenceInterface
1616
from ..transformations._graph import canonicalize_graph, lift_to_meta
1717
from ..utils.logger import ad_logger
18+
from ..utils.sharding_utils import ShardingConfig
1819

1920

2021
class TransformError(Exception):
@@ -47,6 +48,14 @@ def __lt__(self, other):
4748
return NotImplemented
4849

4950

51+
class SharedConfig(BaseModel):
    """Global config shared between multiple transforms in the inference optimizer."""

    # Mutable sharding state; presumably filled in by the sharding-detection
    # transforms and consumed by the executor — TODO confirm against callers.
    sharding_config: ShardingConfig = Field(default_factory=ShardingConfig)
    # Distributed rank/world-size info available to all transforms.
    local_rank: int = 0
    world_size: int = 1
57+
58+
5059
class TransformConfig(BaseModel):
5160
"""A simple configuration class that can be extended by a transform for configurability."""
5261

@@ -190,14 +199,19 @@ def from_kwargs(cls, **kwargs) -> "BaseTransform":
190199

191200
@final
192201
def __call__(
193-
self, gm: GraphModule, cm: CachedSequenceInterface, factory: ModelFactory
202+
self,
203+
gm: GraphModule,
204+
cm: CachedSequenceInterface,
205+
factory: ModelFactory,
206+
shared_config: SharedConfig,
194207
) -> GraphModule:
195208
"""Apply the transform to the graph.
196209
197210
Args:
198211
gm: The graph module to apply the transform to.
199212
cm: The cached sequence interface defining the sequence interface.
200213
factory: The model factory used to build the model.
214+
shared_config: Global info shared between multiple transforms.
201215
202216
Returns:
203217
GraphModule: The transformed graph module.
@@ -232,14 +246,14 @@ def __call__(
232246
# run the transform in a error-handling wrapper if desired
233247
if self.config.skip_on_error:
234248
try:
235-
gm, info = self._apply(gm, cm, factory)
249+
gm, info = self._apply(gm, cm, factory, shared_config)
236250
except Exception as e:
237251
error_msg = f"Transform {t_name} failed"
238252
ad_logger.warning(f"{error_msg}: {e}")
239253
info = TransformInfo(skipped=True, num_matches=0)
240254
else:
241255
# handle this here normally to improve debugging and error message
242-
gm, info = self._apply(gm, cm, factory)
256+
gm, info = self._apply(gm, cm, factory, shared_config)
243257

244258
# we cannot say it's clean if the previous wasn't clean even if this one is
245259
# create new info object with updated cleanup status
@@ -346,7 +360,11 @@ def _run_post_cleanup(self, gm: GraphModule, info: TransformInfo) -> TransformIn
346360

347361
@abstractmethod
348362
def _apply(
349-
self, gm: GraphModule, cm: CachedSequenceInterface, factory: ModelFactory
363+
self,
364+
gm: GraphModule,
365+
cm: CachedSequenceInterface,
366+
factory: ModelFactory,
367+
shared_config: SharedConfig,
350368
) -> Tuple[GraphModule, TransformInfo]:
351369
"""Apply the transform to the graph.
352370

0 commit comments

Comments
 (0)