 
 import torch
 import torch.nn as nn
+from diffusers.utils import is_torch_npu_available
 
 from vllm_omni.diffusion.forward_context import get_forward_context
 
@@ -566,6 +567,144 @@ def postprocess(h):
     )
 
 
+def extract_flux_context(
+    module: nn.Module,
+    hidden_states: torch.Tensor,
+    encoder_hidden_states: torch.Tensor = None,
+    pooled_projections: torch.Tensor = None,
+    timestep: torch.LongTensor = None,
+    img_ids: torch.Tensor = None,
+    txt_ids: torch.Tensor = None,
+    guidance: torch.Tensor | None = None,
+    joint_attention_kwargs: dict[str, Any] | None = None,
+    **kwargs: Any,
+) -> CacheContext:
+    """
+    Extract cache context for the Flux.1-dev model.
+
+    Only caches transformer_blocks output; single_transformer_blocks is always executed.
+
+    Args:
+        module: FluxTransformer2DModel instance
+        hidden_states: Input image hidden states tensor
+        encoder_hidden_states: Input text hidden states tensor
+        pooled_projections: Pooled text embeddings
+        timestep: Current diffusion timestep
+        img_ids: Image position IDs for RoPE
+        txt_ids: Text position IDs for RoPE
+        guidance: Optional guidance scale for CFG
+        joint_attention_kwargs: Additional attention kwargs
+
+    Returns:
+        CacheContext with all information needed for generic caching
+    """
+    from diffusers.models.modeling_outputs import Transformer2DModelOutput
+
+    if not hasattr(module, "transformer_blocks") or len(module.transformer_blocks) == 0:
+        raise ValueError("Module must have transformer_blocks")
+
+    # ============================================================================
+    # PREPROCESSING (Flux-specific)
+    # ============================================================================
+    dtype = hidden_states.dtype
+    device = hidden_states.device
+    timestep = timestep.to(device=device, dtype=dtype) * 1000
+    if guidance is not None:
+        guidance = guidance.to(device=device, dtype=dtype) * 1000
+
+    temb = (
+        module.time_text_embed(timestep, pooled_projections)
+        if guidance is None
+        else module.time_text_embed(timestep, guidance, pooled_projections)
+    )
+
+    hidden_states = module.x_embedder(hidden_states)
+    encoder_hidden_states = module.context_embedder(encoder_hidden_states)
+
+    if txt_ids.ndim == 3:
+        txt_ids = txt_ids[0]
+    if img_ids.ndim == 3:
+        img_ids = img_ids[0]
+
+    ids = torch.cat((txt_ids, img_ids), dim=0)
+    if is_torch_npu_available():
+        freqs_cos, freqs_sin = module.pos_embed(ids.cpu())
+        image_rotary_emb = (freqs_cos.npu(), freqs_sin.npu())
+    else:
+        image_rotary_emb = module.pos_embed(ids)
+
+    # ============================================================================
+    # EXTRACT MODULATED INPUT (for cache decision)
+    # ============================================================================
+    block = module.transformer_blocks[0]
+    norm_output = block.norm1(hidden_states, emb=temb)
+    if isinstance(norm_output, tuple):
+        norm_hidden_states = norm_output[0]
+    else:
+        norm_hidden_states = norm_output
+    modulated_input = norm_hidden_states
+
+    # ============================================================================
+    # DEFINE TRANSFORMER EXECUTION (Flux-specific)
+    # ============================================================================
+    def run_flux_transformer_blocks():
+        h = hidden_states
+        c = encoder_hidden_states
+        for block in module.transformer_blocks:
+            c, h = block(
+                hidden_states=h,
+                encoder_hidden_states=c,
+                temb=temb,
+                image_rotary_emb=image_rotary_emb,
+                joint_attention_kwargs=joint_attention_kwargs,
+            )
+        return (h, c)
+
+    def run_flux_full_transformer_with_single(ori_h, ori_c):
+        h = ori_h
+        c = ori_c
+        for block in module.transformer_blocks:
+            c, h = block(
+                hidden_states=h,
+                encoder_hidden_states=c,
+                temb=temb,
+                image_rotary_emb=image_rotary_emb,
+                joint_attention_kwargs=joint_attention_kwargs,
+            )
+        for block in module.single_transformer_blocks:
+            c, h = block(
+                hidden_states=h,
+                encoder_hidden_states=c,
+                temb=temb,
+                image_rotary_emb=image_rotary_emb,
+                joint_attention_kwargs=joint_attention_kwargs,
+            )
+        return h, c
+
+    # ============================================================================
+    # DEFINE POSTPROCESSING (Flux-specific)
+    # ============================================================================
+    def postprocess(h):
+        h = module.norm_out(h, temb)
+        h = module.proj_out(h)
+        return Transformer2DModelOutput(sample=h)
+
+    # ============================================================================
+    # RETURN CONTEXT
+    # ============================================================================
+    return CacheContext(
+        modulated_input=modulated_input,
+        hidden_states=hidden_states,
+        encoder_hidden_states=encoder_hidden_states,
+        temb=temb,
+        run_transformer_blocks=run_flux_transformer_blocks,
+        postprocess=postprocess,
+        extra_states={
+            "run_flux_full_transformer_with_single": run_flux_full_transformer_with_single,
+        },
+    )
+
+
 # Registry for model-specific extractors
 # Key: Transformer class name
 # Value: extractor function with signature (module, *args, **kwargs) -> CacheContext
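For orientation, a rough editorial sketch (not part of this diff) of how a first-block-cache style consumer could use the CacheContext produced by extract_flux_context above. The state dict, the 0.1 threshold, the reuse policy, and attribute-style access to the CacheContext fields are illustrative assumptions; the generic cache in vllm_omni may work differently, and per the docstring it still executes single_transformer_blocks on a cache hit rather than reusing the final output wholesale as this sketch does.

def flux_forward_with_cache(ctx, state, threshold=0.1):
    # Hypothetical consumer sketch, not part of this diff.
    # Compare the modulated input against the previous step to decide reuse.
    prev = state.get("modulated_input")
    state["modulated_input"] = ctx.modulated_input

    can_reuse = (
        prev is not None
        and prev.shape == ctx.modulated_input.shape
        and ((ctx.modulated_input - prev).abs().mean() / prev.abs().mean()).item() < threshold
    )

    if can_reuse and "transformer_out" in state:
        # Reuse the transformer output computed at the previous step.
        h = state["transformer_out"]
    else:
        # Recompute the full path: dual-stream blocks, then single-stream blocks.
        run_full = ctx.extra_states["run_flux_full_transformer_with_single"]
        h, _c = run_full(ctx.hidden_states, ctx.encoder_hidden_states)
        state["transformer_out"] = h

    # Project back to the output space and wrap in Transformer2DModelOutput.
    return ctx.postprocess(h)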
@@ -576,6 +715,7 @@ def postprocess(h):
     "QwenImageTransformer2DModel": extract_qwen_context,
     "Bagel": extract_bagel_context,
     "ZImageTransformer2DModel": extract_zimage_context,
+    "FluxTransformer2DModel": extract_flux_context,
     # Future models:
     # "FluxTransformer2DModel": extract_flux_context,
     # "CogVideoXTransformer3DModel": extract_cogvideox_context,
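Also for orientation, a hedged sketch of how the registry above might be consumed: look up the extractor by the transformer's class name and call it with the forward arguments. Both the registry name _CONTEXT_EXTRACTORS and the build_cache_context helper are assumed names for illustration only; the dict's definition line sits outside this hunk.

def build_cache_context(transformer, *args, **kwargs):
    # Illustration only, not part of this diff: dispatch on the class name,
    # following the key convention documented in the registry comments above.
    extractor = _CONTEXT_EXTRACTORS.get(type(transformer).__name__)
    if extractor is None:
        raise NotImplementedError(
            f"No cache-context extractor registered for {type(transformer).__name__}"
        )
    return extractor(transformer, *args, **kwargs)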