
Commit 1bfef3f

[chore] Align with diffusers

1 parent e4f6da0
3 files changed: +35 −24 lines changed


src/cogkit/finetune/diffusion/models/cogview/cogview4/lora_trainer.py

Lines changed: 14 additions & 10 deletions
@@ -11,14 +11,19 @@
 from cogkit.finetune import register
 from cogkit.finetune.diffusion.schemas import DiffusionComponents
 from cogkit.finetune.diffusion.trainer import DiffusionTrainer
-from cogkit.finetune.utils import process_prompt_attention_mask, unwrap_model
+from cogkit.finetune.utils import (
+    process_prompt_attention_mask,
+    unwrap_model,
+    replace_attn_processor,
+)
 from cogkit.utils import load_lora_checkpoint, unload_lora_checkpoint
 from diffusers import (
     AutoencoderKL,
     CogView4Pipeline,
     CogView4Transformer2DModel,
     FlowMatchEulerDiscreteScheduler,
 )
+from diffusers.models.transformers.transformer_cogview4 import CogView4TrainingAttnProcessor


 class Cogview4Trainer(DiffusionTrainer):

@@ -68,6 +73,7 @@ def load_components(self) -> DiffusionComponents:
             quantization_config=nf4_config,
             device=self.accelerator.device,
         )
+        replace_attn_processor(components.transformer, CogView4TrainingAttnProcessor())

         ### vae
         components.vae = AutoencoderKL.from_pretrained(

@@ -98,6 +104,7 @@ def initialize_pipeline(self, ckpt_path: str | None = None) -> CogView4Pipeline:
             subfolder="transformer",
             torch_dtype=self.state.weight_dtype,
         )
+        replace_attn_processor(transformer, CogView4TrainingAttnProcessor())
         pipe = CogView4Pipeline(
             tokenizer=self.components.tokenizer,
             text_encoder=self.components.text_encoder,

@@ -170,7 +177,7 @@ def collate_fn(self, samples: list[dict[str, Any]]) -> dict[str, Any]:
            - 'prompt_embedding': Tensor of shape [batch_size, sequence_length, embedding_dim]
            - 'image': List of image tensors (will be empty during validation)
            - 'encoded_image': Tensor of shape [batch_size, channels, height, width] (None during validation)
-           - 'attention_mask': Dictionary with 'text_embedding_attn_mask' for transformer attention
+           - 'text_attn_mask': Tensor of shape [batch_size, sequence_length] for transformer attention

         Note:
             This function assumes that all images in the batch have the same resolution.

@@ -180,7 +187,7 @@ def collate_fn(self, samples: list[dict[str, Any]]) -> dict[str, Any]:
             "prompt_embedding": [],
             "image": [],
             "encoded_image": [],
-            "attention_mask": {"text_embedding_attn_mask": None},
+            "text_attn_mask": None,
         }

         for sample in samples:

@@ -206,15 +213,12 @@ def collate_fn(self, samples: list[dict[str, Any]]) -> dict[str, Any]:
         )

         ret["prompt_embedding"] = prompt_embedding
-        ret["attention_mask"]["text_embedding_attn_mask"] = prompt_attention_mask
+        ret["text_attn_mask"] = prompt_attention_mask

         ret["encoded_image"] = torch.stack(ret["encoded_image"]) if ret["encoded_image"] else None

         # shape of prompt_embedding: [batch_size, sequence_length, embedding_dim(4096)]
-        assert (
-            ret["attention_mask"]["text_embedding_attn_mask"].shape
-            == ret["prompt_embedding"].shape[:2]
-        )
+        assert ret["text_attn_mask"].shape == ret["prompt_embedding"].shape[:2]

         return ret

@@ -232,7 +236,7 @@ def compute_loss(self, batch: dict[str, Any]) -> torch.Tensor:
         ) // (self.state.transformer_config.patch_size**2)
         image_seq_len = torch.tensor([image_seq_len], device=self.accelerator.device)

-        attention_mask = batch["attention_mask"]
+        text_attn_mask = batch["text_attn_mask"]

         num_train_timesteps = self.components.scheduler.config.num_train_timesteps
         sigmas = self.get_sigmas(batch_size, image_seq_len)

@@ -263,7 +267,7 @@ def compute_loss(self, batch: dict[str, Any]) -> torch.Tensor:
             target_size=target_size,
             crop_coords=crop_coords,
             return_dict=False,
-            attention_mask=attention_mask,
+            attention_kwargs={"text_attn_mask": text_attn_mask},
         )[0]

         loss = torch.mean((noise_pred_cond - model_label) ** 2, dim=(1, 2, 3))
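Note: after this change, collate_fn carries the text mask as a flat text_attn_mask tensor instead of the old nested attention_mask dict, and compute_loss forwards it to the transformer through attention_kwargs. A minimal sketch of the new batch layout (tensor sizes here are illustrative, not taken from the commit):

import torch

# Illustrative sizes; the 4096 embedding dim matches the comment in collate_fn.
batch_size, seq_len, embedding_dim = 2, 224, 4096
batch = {
    "prompt_embedding": torch.randn(batch_size, seq_len, embedding_dim),
    "text_attn_mask": torch.ones(batch_size, seq_len, dtype=torch.long),
}
# The invariant the simplified assert enforces: mask shape == [batch_size, sequence_length].
assert batch["text_attn_mask"].shape == batch["prompt_embedding"].shape[:2]
# compute_loss now wraps the mask for the transformer call:
attention_kwargs = {"text_attn_mask": batch["text_attn_mask"]}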

src/cogkit/finetune/diffusion/models/cogview/cogview4/lora_trainer_packing.py

Lines changed: 13 additions & 13 deletions
@@ -99,10 +99,10 @@ def collate_fn_packing(self, samples: list[dict[str, list[Any]]]) -> dict[str, A
            - prompt_embedding: Batched prompt embeddings
            - encoded_image: Batched encoded image latents
            - image_rotary_emb: Rotary embeddings for images
-           - attention_mask: Dictionary containing:
+           - attention_kwargs: Dictionary containing:
                - batch_flag: Indices indicating which sample each item belongs to
-               - text_embedding_attn_mask: Attention mask for text embeddings
-               - latent_embedding_attn_mask: Attention mask for latent embeddings
+               - text_attn_mask: Attention mask for text embeddings
+               - latent_attn_mask: Attention mask for latent embeddings
            - pixel_mask: Mask for valid pixel regions
            - original_size: Original dimensions of the images

@@ -114,10 +114,10 @@ def collate_fn_packing(self, samples: list[dict[str, list[Any]]]) -> dict[str, A
             "prompt_embedding": None,
             "encoded_image": None,
             "image_rotary_emb": None,
-            "attention_mask": {
+            "attention_kwargs": {
                 "batch_flag": None,
-                "text_embedding_attn_mask": None,
-                "latent_embedding_attn_mask": None,
+                "text_attn_mask": None,
+                "latent_attn_mask": None,
             },
             "pixel_mask": None,
             "original_size": None,

@@ -144,15 +144,15 @@ def collate_fn_packing(self, samples: list[dict[str, list[Any]]]) -> dict[str, A

         # Store in batched_data
         batched_data["prompt_embedding"] = prompt_embedding
-        batched_data["attention_mask"]["text_embedding_attn_mask"] = prompt_attention_mask
+        batched_data["attention_kwargs"]["text_attn_mask"] = prompt_attention_mask
         batched_data["encoded_image"] = padded_latent
         batched_data["image_rotary_emb"] = image_rotary_emb
-        batched_data["attention_mask"]["latent_embedding_attn_mask"] = (
-            vtoken_attention_mask.reshape(len(batch_flag), -1)
+        batched_data["attention_kwargs"]["latent_attn_mask"] = vtoken_attention_mask.reshape(
+            len(batch_flag), -1
         )
         batched_data["pixel_mask"] = pixel_mask

-        batched_data["attention_mask"]["batch_flag"] = batch_flag
+        batched_data["attention_kwargs"]["batch_flag"] = batch_flag
         batched_data["original_size"] = torch.tensor(
             [(img.height, img.width) for img in samples["image"]]
         )

@@ -168,8 +168,8 @@ def compute_loss(self, batch: dict[str, Any]) -> torch.Tensor:
         batch_size, text_seqlen, text_embedding_dim = prompt_embeds.shape
         batch_size, num_channels, height, width = latent.shape

-        attn_mask = batch["attention_mask"]
-        latent_attention_mask = attn_mask["latent_embedding_attn_mask"].float()
+        attention_kwargs = batch["attention_kwargs"]
+        latent_attention_mask = attention_kwargs["latent_attn_mask"].float()
         assert latent_attention_mask.dim() == 2
         vtoken_seq_len = torch.sum(latent_attention_mask != 0, dim=1)

@@ -196,8 +196,8 @@ def compute_loss(self, batch: dict[str, Any]) -> torch.Tensor:
             target_size=target_size,
             crop_coords=crop_coords,
             return_dict=False,
-            attention_mask=attn_mask,
             image_rotary_emb=image_rotary_emb,
+            attention_kwargs=attention_kwargs,
         )[0]

         pixel_mask = batch["pixel_mask"]
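In the packing trainer the nested dict is renamed from attention_mask to attention_kwargs and its keys are shortened, so it can be handed to the transformer unchanged as its attention_kwargs argument. A rough sketch of the structure collate_fn_packing now builds (all values are placeholders, not real packed data):

import torch

# Placeholder sizes; real values depend on the packed batch.
num_images, text_seqlen, latent_seqlen = 3, 224, 1024
attention_kwargs = {
    # index of the originating sample for each packed item (placeholder values)
    "batch_flag": torch.tensor([0, 0, 1]),
    # 1 = real token, 0 = padding
    "text_attn_mask": torch.ones(num_images, text_seqlen, dtype=torch.long),
    "latent_attn_mask": torch.ones(num_images, latent_seqlen, dtype=torch.long),
}
# compute_loss then passes this dict straight through, e.g.
#   transformer(..., image_rotary_emb=image_rotary_emb,
#               attention_kwargs=attention_kwargs, return_dict=False)[0]
# instead of the old attention_mask=attn_mask argument.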

src/cogkit/finetune/utils/attn_mask.py

Lines changed: 8 additions & 1 deletion
@@ -1,8 +1,9 @@
 import math
-from typing import List, Tuple
+from typing import Any, List, Tuple

 import torch
 from transformers import AutoTokenizer
+from diffusers.models.attention_processor import Attention

 from .filters import MeanFilter

@@ -124,3 +125,9 @@ def process_latent_attention_mask(
     mask_assert(vtoken_attention_mask)

     return padded_latent, vtoken_attention_mask, pixel_mask
+
+
+def replace_attn_processor(model: torch.nn.Module, attn_processor_obj: Any) -> None:
+    for name, submodule in model.named_modules():
+        if isinstance(submodule, Attention):
+            submodule.processor = attn_processor_obj
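The new replace_attn_processor helper walks the module tree and swaps the processor on every diffusers Attention block; the two trainers above use it to install CogView4TrainingAttnProcessor on the loaded transformer. A hedged usage sketch (the checkpoint id and subfolder are illustrative; the import paths are the ones introduced in this diff):

from diffusers import CogView4Transformer2DModel
from diffusers.models.transformers.transformer_cogview4 import CogView4TrainingAttnProcessor

from cogkit.finetune.utils import replace_attn_processor

# Illustrative checkpoint id/subfolder; the trainers load these from their own config.
transformer = CogView4Transformer2DModel.from_pretrained(
    "THUDM/CogView4-6B", subfolder="transformer"
)
# Install the training-time processor on every Attention module so the masks
# forwarded via attention_kwargs are honored during the forward pass.
replace_attn_processor(transformer, CogView4TrainingAttnProcessor())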
