svdq: add fused gelu mlp/proj pass (#1047)

DefTruth · web-flow · commit c8491c657fa1 · 2026-06-08T19:57:38.000+08:00
* svdq: add fused gelu mlp pass

* svdq: add fused gelu mlp/proj pass

* svdq: add fused gelu mlp/proj pass

* svdq: add fused gelu mlp/proj pass
diff --git a/csrc/kernels/svdq/gemm_w4a4_launch_impl.cuh b/csrc/kernels/svdq/gemm_w4a4_launch_impl.cuh
@@ -251,34 +251,38 @@ void GEMM_W4A4_Launch<GEMMConfig_W4A4_FP16, false>::gemm_w4a4(
   };
 
   if (qout.valid() && oscales.valid()) {
-    // dispatchBool(qout_unsigned, [&]<bool USE_UNSIGNED>() {
-
-    static constexpr float SHIFT_GELU = 0.171875f;
-
-    constexpr bool USE_UNSIGNED = !USE_FP4;
-    using EpilogueQuantize = typename GEMM::EpilogueQuantize<false, USE_UNSIGNED, USE_FP4>;
+    // Use signed INT4 (matching fc2.quantize output).
+    // No unsigned shift — the non-fused path never applies +0.171875
+    // before fc2.quantize, so we must match that behaviour exactly.
+    constexpr bool USE_UNSIGNED = false;
+
+    // GELU is always applied in-place via EpilogueGelu (MidEpilogue)
+    // so that *every* downstream epilogue — including EpilogueDefault
+    // (fp16 `out`) and EpilogueLoraDown (`lora_act_out`) — sees
+    // post-GELU accumulator values.  EpilogueQuantize only quantizes.
+    constexpr bool FUSE_GELU = false;
+    using EpilogueQuantize = typename GEMM::EpilogueQuantize<FUSE_GELU, USE_UNSIGNED, USE_FP4>;
     auto argsQuantize = typename EpilogueQuantize::Arguments{
       .qout = qout.data_ptr<packed_act_t>(),
       .oscales = oscales.data_ptr<typename EpilogueQuantize::oscales_t>(),
-      .shift_value = USE_FP4 ? 0.0f : SHIFT_GELU,
+      .shift_value = 0.0f,
       .smooth_factor = smooth_factor.data_ptr<packed_wscale_t>()};
 
-    // TODO: check if gelu is needed
     if (out.valid()) {
       launch_lora.template operator()<
         typename GEMM::EpilogueCombination<typename GEMM::EpilogueDefault, EpilogueQuantize>,
-        typename Epilogues::EpilogueGelu>({typename GEMM::EpilogueDefault::Arguments{
-                                             .out = out.data_ptr<half_t>(),
-                                             .actualM = actualM,
-                                             .actualN = actualN,
-                                           },
-                                           argsQuantize},
-                                          {});
+        typename Epilogues::EpilogueGelu>(
+        {typename GEMM::EpilogueDefault::Arguments{
+           .out = out.data_ptr<half_t>(),
+           .actualM = actualM,
+           .actualN = actualN,
+         },
+         argsQuantize},
+        {});
     } else {
       launch_lora.template operator()<EpilogueQuantize, typename Epilogues::EpilogueGelu>(
         argsQuantize, {});
     }
-
   } else if (out_linearattn.valid()) {
     assert(out_vk.valid());
 
diff --git a/docs/user_guide/QUANTIZATION.md b/docs/user_guide/QUANTIZATION.md
@@ -896,3 +896,66 @@ Quick examples for enabling SVDQ runtime kernel from the generate CLI:
 python3 -m cache_dit.generate flux --svdq-int4-r64-dq --compile # v1, baseline
 python3 -m cache_dit.generate flux --svdq-int4-r64-dq --svdq-runtime v2 --compile # v2
 ```
+
+## SVDQ with Fused MLP
+
+When SVDQ quantizes a diffusers transformer whose ``FeedForward`` blocks use plain GELU activation, the default execution path runs three GPU kernels per MLP block: the first quantized GEMM, a separate GELU activation kernel, and the second quantized GEMM.  Enabling fused MLP folds the GELU into the first GEMM via ``svdq_gemm_w4a4_ext``, eliminating one kernel launch per block.  In standard ``FeedForward`` blocks the first GEMM additionally emits the 4-bit quantized activation consumed by the second GEMM, so the intermediate fp16 activation is never written to HBM.
+
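+The feature is controlled by ``svdq_kwargs["fused_mlp"]`` (default ``False``) and works automatically with most diffusers transformer families — no per-model configuration is needed.  It has no effect on models whose feed-forward blocks use GEGLU, SwiGLU, or a custom structure.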
+
+**Quick start (CLI)**
+
+```bash
+# Add --svdq-fused-mlp to any SVDQ generate command:
+python3 -m cache_dit.generate flux --svdq-int4-r32-dq --svdq-fused-mlp
+```
+
+**Quick start (Python API — dynamic quantization)**
+
+```python
+import torch
+import cache_dit
+from diffusers import FluxPipeline
+from cache_dit.quantization import QuantizeConfig
+
+pipe = FluxPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    torch_dtype=torch.bfloat16,
+).to("cuda")
+
+quant_config = QuantizeConfig(
+    quant_type="svdq_int4_r32_dq",
+    svdq_kwargs={"fused_mlp": True},
+)
+pipe.transformer = cache_dit.load(pipe.transformer, quant_config)
+
+image = pipe("A cat holding a sign that says hello world").images[0]
+image.save("flux_fused_mlp.png")
+```
+
+**Quick start (Python API — PTQ with serialized checkpoint)**
+
+```python
+quant_config = QuantizeConfig(
+    quant_type="svdq_int4_r32",
+    serialize_to="./flux-svdq/",
+    svdq_kwargs={"fused_mlp": True},
+    calibrate_fn=my_calibrate_fn,
+)
+cache_dit.quantize(pipe.transformer, quant_config)
+
+# Later, at inference time:
+pipe.transformer = cache_dit.load(
+    pipe.transformer,
+    "./flux-svdq/svdq_int4_r32.safetensors",
+)
+```
+
+When ``fused_mlp`` is enabled, cache-dit applies two complementary passes:
+
+| Pass | Targets | Fusion |
+|---|---|---|
+| ``fused_gelu_mlp`` | Standard ``FeedForward`` double blocks | fc1 + GELU + fc2 (qout path, no fp16 HBM write) |
+| ``fused_gelu_proj`` | Single-stream blocks with concat MLP | fc1 + GELU only (fp16 output, concat unchanged) |
+
+Both passes use generic structural detection — they work with most diffusers transformers (FLUX, SD3, PixArt, HunyuanVideo, Wan, Cosmos, Bria, QwenImage, Chroma, Motif Video, and many more) without per-model code changes.
diff --git a/src/cache_dit/_utils/utils.py b/src/cache_dit/_utils/utils.py
@@ -674,6 +674,13 @@ def get_args(parse: bool = True, ) -> argparse.ArgumentParser | argparse.Namespa
     default=False,
     help="Compile the transformer only after SVDQ few-shot runtime quantization completes.",
   )
+  parser.add_argument(
+    "--svdq-fused-mlp",
+    action="store_true",
+    default=False,
+    help=
+    "Fuse FeedForward GELU MLP blocks into a single fused kernel chain after SVDQ quantization.",
+  )
   # Parallelism settings
   parser.add_argument(
     "--parallel-type",
@@ -1749,6 +1756,7 @@ def _resolve_cli_svdq_kwargs() -> Optional[Dict[str, Any]]:
       "few_shot_relax_top_ratio": args.svdq_few_shot_relax_top_ratio,
       "few_shot_relax_strategy": args.svdq_few_shot_relax_strategy,
       "few_shot_auto_compile": few_shot_auto_compile,
+      "fused_mlp": bool(args.svdq_fused_mlp),
     }
 
   # Quantize transformer by default if quantization is enabled
diff --git a/src/cache_dit/logger.py b/src/cache_dit/logger.py
@@ -47,6 +47,7 @@ def filter(self, record):
 _default_file_handler = None
 _inference_log_file_handler = {}
 _warning_once_messages: set[tuple[str, str]] = set()
+_info_once_messages: set[tuple[str, str]] = set()
 
 
 def _warning_once(self: logging.Logger, msg, *args, **kwargs) -> None:
@@ -66,7 +67,25 @@ def _warning_once(self: logging.Logger, msg, *args, **kwargs) -> None:
   self.warning(msg, *args, **kwargs)
 
 
+def _info_once(self: logging.Logger, msg, *args, **kwargs) -> None:
+  message = logging.LogRecord(
+    name=self.name,
+    level=logging.INFO,
+    pathname="",
+    lineno=0,
+    msg=msg,
+    args=args,
+    exc_info=None,
+  ).getMessage()
+  key = (self.name, message)
+  if key in _info_once_messages:
+    return
+  _info_once_messages.add(key)
+  self.info(msg, *args, **kwargs)
+
+
 logging.Logger.warning_once = _warning_once  # type: ignore[attr-defined]
+logging.Logger.info_once = _info_once  # type: ignore[attr-defined]
 
 
 def _setup_logger():
diff --git a/src/cache_dit/quantization/config.py b/src/cache_dit/quantization/config.py
@@ -121,6 +121,15 @@
   # is needed, set this to ``False`` and compile manually after moving the
   # pipeline to CUDA.
   "few_shot_auto_compile": False,
+  # When enabled, SVDQ fuses the GELU activation of diffusers GELU MLP blocks
+  # into the first quantized GEMM via ``svdq_gemm_w4a4_ext``.  The fusion is
+  # carried out by two passes: ``fused_gelu_mlp`` chains fc1 + GELU + fc2 in
+  # standard ``FeedForward`` blocks, where the first GEMM directly produces
+  # the 4-bit quantized input of the second GEMM and the intermediate fp16
+  # activation is never written to HBM; ``fused_gelu_proj`` fuses fc1 + GELU
+  # only and keeps the fp16 output.  Has no effect on models that use GEGLU,
+  # SwiGLU, or custom FeedForward structures.
+  "fused_mlp": False,
 }
 
 
@@ -299,6 +308,7 @@ def _resolve_svdq_kwargs(svdq_kwargs: Optional[Dict[str, Any]]) -> Dict[str, Any
     "few_shot_relax_top_ratio": _resolve_svdq_ratio,
     "few_shot_relax_strategy": _resolve_svdq_few_shot_relax_strategy,
     "few_shot_auto_compile": _resolve_svdq_bool_kwarg,
+    "fused_mlp": _resolve_svdq_bool_kwarg,
   }
   for key, value in svdq_kwargs.items():
     resolved[key] = validators[key](key, value)
@@ -589,15 +599,17 @@ def strify(self) -> str:
 
     def _stringify_quant_type(quant_type: str) -> str:
       quant_type = quant_type.lower()
-      if quant_type.startswith("svdq") and quant_type.endswith("_dq"):
+      if quant_type.startswith("svdq"):
         svdq_kwargs = self.get_svdq_kwargs()
-        smooth_strategy = svdq_kwargs.get("smooth_strategy", "identity")
-        if smooth_strategy != "identity":
-          quant_type = f"{quant_type}_{smooth_strategy}"
-          if smooth_strategy == "few_shot":
-            relax_strategy = svdq_kwargs.get("few_shot_relax_strategy", "auto")
-            quant_type = f"{quant_type}_{relax_strategy}"
-          return quant_type
+        if quant_type.endswith("_dq"):
+          smooth_strategy = svdq_kwargs.get("smooth_strategy", "identity")
+          if smooth_strategy != "identity":
+            quant_type = f"{quant_type}_{smooth_strategy}"
+            if smooth_strategy == "few_shot":
+              relax_strategy = svdq_kwargs.get("few_shot_relax_strategy", "auto")
+              quant_type = f"{quant_type}_{relax_strategy}"
+        if svdq_kwargs.get("fused_mlp", False):
+          quant_type = f"{quant_type}_fused_mlp"
       return quant_type
 
     if self.components_to_quantize is None or isinstance(self.components_to_quantize, list):
diff --git a/src/cache_dit/quantization/svdquant/__init__.py b/src/cache_dit/quantization/svdquant/__init__.py
@@ -40,14 +40,32 @@
 
 from ...kernels import svdq_extension_is_available as svdq_is_available
 from ...kernels import svdq_get_load_error
+from .fused import fused_gelu_mlp
+from .fused import fused_gelu_proj
 from .linear import SVDQW4A4Linear
+from .passes import apply_passes
+from .passes import BasePass
+from .passes import DEFAULT_FUSED_MLP_PASSES
+from .passes import FusedGeluMlpPass
+from .passes import FusedGeluProjPass
+from .passes import get_pass
+from .passes import register_pass
 from .quantizer import CalibrationInputs
 from .quantizer import compute_smooth_scale
 from .quantizer import quantize_linear_svdq_w4a4
 from .quantizer import standardize_calibration_activations
 from .quantizer import validate_svdq_linear_geometry
 
 __all__ = [
+  "apply_passes",
+  "BasePass",
+  "DEFAULT_FUSED_MLP_PASSES",
+  "FusedGeluMlpPass",
+  "fused_gelu_mlp",
+  "fused_gelu_proj",
+  "FusedGeluProjPass",
+  "get_pass",
+  "register_pass",
   "CalibrationInputs",
   "SVDQW4A4Linear",
   "compute_smooth_scale",
diff --git a/src/cache_dit/quantization/svdquant/fused.py b/src/cache_dit/quantization/svdquant/fused.py
diff --git a/src/cache_dit/quantization/svdquant/passes.py b/src/cache_dit/quantization/svdquant/passes.py
diff --git a/src/cache_dit/quantization/svdquant/ptq.py b/src/cache_dit/quantization/svdquant/ptq.py