
Commit 39047da

Add multi-card inference support for Wan pipelines (#2325)
Signed-off-by: Daniel Socek <daniel.socek@intel.com>
1 parent bfbf0d2 commit 39047da

File tree

7 files changed: +331 −83 lines changed

README.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -300,13 +300,13 @@ The following model architectures, tasks and device distributions have been vali
 | Stable Diffusion | :heavy_check_mark: | :heavy_check_mark: | <ul><li>[text-to-image generation](/examples/stable-diffusion#text-to-image-generation)</li><li>[image-to-image generation](/examples/stable-diffusion#image-to-image-generation)</li></ul> |
 | Stable Diffusion XL | :heavy_check_mark: | :heavy_check_mark: | <ul><li>[text-to-image generation](/examples/stable-diffusion#stable-diffusion-xl-sdxl)</li><li>[image-to-image generation](/examples/stable-diffusion#stable-diffusion-xl-refiner)</li></ul> |
 | Stable Diffusion Depth2img | | <ul><li>Single card</li></ul> | <ul><li>[depth-to-image generation](/examples/stable-diffusion)</li></ul> |
-| Stable Diffusion 3 | :heavy_check_mark: | <ul><li>Single card</li></ul> | <ul><li>[text-to-image generation](/examples/stable-diffusion#stable-diffusion-3-and-35-sd3)</li></ul> |
+| Stable Diffusion 3 | :heavy_check_mark: | :heavy_check_mark: | <ul><li>[text-to-image generation](/examples/stable-diffusion#stable-diffusion-3-and-35-sd3)</li></ul> |
 | LDM3D | | <ul><li>Single card</li></ul> | <ul><li>[text-to-image generation](/examples/stable-diffusion#text-to-image-generation)</li></ul> |
 | FLUX.1 | <ul><li>LoRA</li></ul> | <ul><li>Single card</li></ul> | <ul><li>[text-to-image generation](/examples/stable-diffusion#flux1)</li><li>[image-to-image generation](/examples/stable-diffusion#flux1-image-to-image)</li></ul> |
 | Text to Video | | <ul><li>Single card</li></ul> | <ul><li>[text-to-video generation](/examples/stable-diffusion#text-to-video-generation)</li></ul> |
 | Image to Video | | <ul><li>Single card</li></ul> | <ul><li>[image-to-video generation](/examples/stable-diffusion#image-to-video-generation)</li></ul> |
 | i2vgen-xl | | <ul><li>Single card</li></ul> | <ul><li>[image-to-video generation](/examples/stable-diffusion#I2vgen-xl)</li></ul> |
-| Wan | | <ul><li>Single card</li></ul> | <ul><li>[text-to-video generation](/examples/stable-diffusion#text-to-video-with-wan-22)</li><li>[image-to-video generation](/examples/stable-diffusion#image-to-video-with-wan-22)</li></ul> |
+| Wan | | :heavy_check_mark: | <ul><li>[text-to-video generation](/examples/stable-diffusion#text-to-video-with-wan-22)</li><li>[image-to-video generation](/examples/stable-diffusion#image-to-video-with-wan-22)</li></ul> |
 
 ### PyTorch Image Models/TIMM:
```

docs/source/index.mdx

Lines changed: 5 additions & 5 deletions

```diff
@@ -122,16 +122,16 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
 
 | Architecture | Training. | Inference | Tasks |
 |----------------------------|:----------------------:|:-----------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Stable Diffusion | | | <ul><li>[text-to-image generation](/examples/stable-diffusion)</li></ul> |
-| Stable Diffusion XL | | | <ul><li>[text-to-image generation](/examples/stable-diffusion)</li></ul> |
+| Stable Diffusion ||| <ul><li>[text-to-image generation](/examples/stable-diffusion)</li></ul> |
+| Stable Diffusion XL ||| <ul><li>[text-to-image generation](/examples/stable-diffusion)</li></ul> |
 | Stable Diffusion Depth2img | | <ul><li>Single card</li></ul> | <ul><li>[depth-to-image generation](/examples/stable-diffusion)</li></ul> |
-| Stable Diffusion 3 | | <ul><li>Single card</li></ul> | <ul><li>[text-to-image generation](/examples/stable-diffusion#stable-diffusion-3-and-35-sd3)</li></ul> |
+| Stable Diffusion 3 || <ul><li>Single card</li></ul> | <ul><li>[text-to-image generation](/examples/stable-diffusion#stable-diffusion-3-and-35-sd3)</li></ul> |
 | LDM3D | | <ul><li>Single card</li></ul> | <ul><li>[text-to-image generation](/examples/stable-diffusion)</li></ul> |
-| FLUX.1 | <ul><li>LoRA</li></ul> | <ul><li>Single card</li></ul> | <ul><li>[text-to-image generation](/examples/stable-diffusion)</li></ul> |
+| FLUX.1 | <ul><li>LoRA</li></ul> | | <ul><li>[text-to-image generation](/examples/stable-diffusion)</li></ul> |
 | Text to Video | | <ul><li>Single card</li></ul> | <ul><li>[text-to-video generation](/examples/stable-diffusion#text-to-video-generation)</li></ul> |
 | Image to Video | | <ul><li>Single card</li></ul> | <ul><li>[image-to-video generation](/examples/stable-diffusion#image-to-video-generation)</li></ul> |
 | i2vgen-xl | | <ul><li>Single card</li></ul> | <ul><li>[image-to-video generation](/examples/stable-diffusion#I2vgen-xl)</li></ul> |
-| Wan | | <ul><li>Single card</li></ul> | <ul><li>[text-to-video generation](/examples/stable-diffusion#text-to-video-with-wan-22)</li><li>[image-to-video generation](/examples/stable-diffusion#image-to-video-with-wan-22)</li></ul> |
+| Wan | | | <ul><li>[text-to-video generation](/examples/stable-diffusion#text-to-video-with-wan-22)</li><li>[image-to-video generation](/examples/stable-diffusion#image-to-video-with-wan-22)</li></ul> |
 
 - PyTorch Image Models/TIMM:
```

examples/stable-diffusion/README.md

Lines changed: 62 additions & 1 deletion

````diff
@@ -476,9 +476,39 @@ python image_to_video_generation.py \
     --fps 24 \
     --num_frames 121 \
     --sdp_on_bf16 \
-    --bf16
+    --bf16
 ```
 
+#### Distributed Image-to-Video Wan 2.2 Inference
+
+Wan models use classifier-free guidance (CFG), which processes both conditional and unconditional latents during denoising.
+With the `--use_distributed_cfg` option, we parallelize these two steps across a pair of HPU devices and then synchronize to apply guidance.
+While this mode uses 2 HPUs per unique generated video, it achieves almost 2x faster inference.
+
+Here is an example of running the Wan2.2 image-to-video model on 2 HPU devices in distributed CFG mode:
+
+```bash
+PT_HPU_LAZY_MODE=1 \
+python ../gaudi_spawn.py --world_size 2 image_to_video_generation.py \
+    --model_name_or_path "Wan-AI/Wan2.2-TI2V-5B-Diffusers" \
+    --image_path "https://raw.githubusercontent.com/Wan-Video/Wan2.2/main/examples/i2v_input.JPG" \
+    --video_save_dir ./wan2.2-output \
+    --prompts "The cat removes the glasses from its eyes." \
+    --use_habana \
+    --use_hpu_graphs \
+    --use_distributed_cfg \
+    --height 1088 \
+    --width 800 \
+    --fps 24 \
+    --num_frames 121 \
+    --sdp_on_bf16 \
+    --bf16
+```
+
+> [!NOTE]
+> Distributed CFG mode requires an even number of devices in the `world_size`.
+
 ### Text-to-Video with Wan 2.2
 Wan2.2 is a comprehensive and open suite of video foundation models. Please refer to [Huggingface Wan2.2 doc](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B)
 
@@ -502,6 +532,37 @@ python text_to_video_generation.py \
     --dtype bf16
 ```
 
+#### Distributed Text-to-Video Wan 2.2 Inference
+
+Wan models use classifier-free guidance (CFG), which processes both conditional and unconditional latents during denoising.
+With the `--use_distributed_cfg` option, we parallelize these two steps across a pair of HPU devices and then synchronize to apply guidance.
+While this mode uses 2 HPUs per unique generated video, it achieves almost 2x faster inference.
+
+Here is an example of running the Wan2.2 text-to-video model on 2 HPU devices in distributed CFG mode:
+
+```bash
+PT_HPU_LAZY_MODE=1 \
+python ../gaudi_spawn.py --world_size 2 text_to_video_generation.py \
+    --model_name_or_path "Wan-AI/Wan2.2-TI2V-5B-Diffusers" \
+    --prompts "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
+    --pipeline_type wan \
+    --num_videos_per_prompt 1 \
+    --use_habana \
+    --use_hpu_graphs \
+    --use_distributed_cfg \
+    --height 704 \
+    --width 1280 \
+    --num_frames 121 \
+    --num_inference_steps 50 \
+    --guidance_scale 5.0 \
+    --output_type mp4 \
+    --dtype bf16
+```
+
+> [!NOTE]
+> Distributed CFG mode requires an even number of devices in the `world_size`.
+
 ### Text-to-Video with CogvideoX
 
 CogVideoX is an open-source version of the video generation model originating from QingYing, unveiled in https://huggingface.co/THUDM/CogVideoX-5b.
````
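The distributed CFG scheme described above can be illustrated with a minimal sketch in plain Python (no HPUs). Here `denoise_branch`, `apply_cfg`, and `GUIDANCE_SCALE` are invented stand-ins, and which rank of a pair takes which branch is an assumption; the actual pipeline runs the two branches on separate HPU devices and exchanges results with a collective op before applying guidance.

```python
GUIDANCE_SCALE = 5.0  # guidance scale the Wan examples above pass


def denoise_branch(latent: float, conditional: bool) -> float:
    # Toy noise prediction; the real pipeline runs a full denoiser forward pass here.
    return latent * (0.9 if conditional else 0.5)


def apply_cfg(uncond: float, cond: float, scale: float) -> float:
    # Classifier-free guidance: push the prediction toward the conditional branch.
    return uncond + scale * (cond - uncond)


latent = 1.0

# Single-card mode: one device runs both branches one after the other.
sequential = apply_cfg(
    denoise_branch(latent, conditional=False),
    denoise_branch(latent, conditional=True),
    GUIDANCE_SCALE,
)

# Distributed CFG mode: each rank of a pair runs one branch in parallel,
# then the pair synchronizes and combines the two halves.
branch = {rank: denoise_branch(latent, conditional=(rank % 2 == 0)) for rank in (0, 1)}
parallel = apply_cfg(branch[1], branch[0], GUIDANCE_SCALE)

assert parallel == sequential  # same guided output either way
```

Because the two branches are independent until the combine step, splitting them across a device pair roughly halves the per-step denoising time, which is where the "almost 2x faster" figure comes from.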

examples/stable-diffusion/image_to_video_generation.py

Lines changed: 37 additions & 19 deletions

```diff
@@ -15,6 +15,7 @@
 
 import argparse
 import logging
+import os
 import sys
 from pathlib import Path
 
@@ -206,6 +207,11 @@ def main():
         help="Allow pyTorch to use reduced precision in the SDPA math backend",
     )
     parser.add_argument("--num_frames", type=int, default=25, help="The number of video frames to generate.")
+    parser.add_argument(
+        "--use_distributed_cfg",
+        action="store_true",
+        help="Use distributed CFG (classifier-free guidance) across 2 devices for Wan pipeline. Requires even world size.",
+    )
     parser.add_argument(
         "--profiling_warmup_steps",
         default=0,
@@ -291,7 +297,13 @@ def main():
         "sdp_on_bf16": args.sdp_on_bf16,
     }
 
-    set_seed(args.seed)
+    # Set RNG seed
+    seed_dist_offset = int(os.getenv("RANK", "0"))
+    if args.use_distributed_cfg:
+        # The same seed is needed for each pair of workers in distributed CFG mode
+        seed_dist_offset = seed_dist_offset // 2
+    set_seed(args.seed + seed_dist_offset)
+
     if args.bf16:
         kwargs["torch_dtype"] = torch.bfloat16
 
@@ -377,6 +389,7 @@ def main():
             num_frames=args.num_frames,
             num_inference_steps=args.num_inference_steps,
             guidance_scale=5.0,  # WAN I2V recommended guidance scale
+            use_distributed_cfg=args.use_distributed_cfg,
             output_type=args.output_type,
             profiling_warmup_steps=args.profiling_warmup_steps,
             profiling_steps=args.profiling_steps,
@@ -419,25 +432,30 @@ def main():
     if args.output_type == "pil":
         video_save_dir = Path(args.video_save_dir)
         video_save_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Saving video frames in {video_save_dir.resolve()}...")
-        for i, frames in enumerate(outputs.frames):
-            if args.gif:
-                export_to_gif(frames, args.video_save_dir + "/gen_video_" + str(i).zfill(2) + ".gif")
-            else:
-                export_to_video(
-                    frames, args.video_save_dir + "/gen_video_" + str(i).zfill(2) + ".mp4", fps=args.fps
-                )
-
-            if args.save_frames_as_images:
-                for j, frame in enumerate(frames):
-                    frame.save(
-                        args.video_save_dir
-                        + "/gen_video_"
-                        + str(i).zfill(2)
-                        + "_frame_"
-                        + str(j).zfill(2)
-                        + ".png"
-                    )
+
+        rank = int(os.getenv("RANK", "0"))
+        world_size = int(os.getenv("WORLD_SIZE", "1"))
+        rank_ext = f"_rank{rank}" if world_size > 1 else ""
+        skip_rank = False
+        if args.use_distributed_cfg and world_size > 1:
+            rank_ext += f"and{rank + 1}"
+            skip_rank = rank % 2 == 1
+
+        if not skip_rank:
+            logger.info(f"Saving video frames in {video_save_dir.resolve()}...")
+            for i, frames in enumerate(outputs.frames):
+                if args.gif:
+                    export_to_gif(frames, f"{args.video_save_dir}/gen_video_{str(i).zfill(2)}{rank_ext}.gif")
+                else:
+                    export_to_video(
+                        frames, f"{args.video_save_dir}/gen_video_{str(i).zfill(2)}{rank_ext}.mp4", fps=args.fps
+                    )
+
+                if args.save_frames_as_images:
+                    for j, frame in enumerate(frames):
+                        frame.save(
+                            f"{args.video_save_dir}/gen_video_{str(i).zfill(2)}_frame_{str(j).zfill(2)}{rank_ext}.png"
+                        )
     else:
         logger.warning("--output_type should be equal to 'pil' to save frames in --video_save_dir.")
```

examples/stable-diffusion/text_to_video_generation.py

Lines changed: 27 additions & 6 deletions

```diff
@@ -17,6 +17,7 @@
 
 import argparse
 import logging
+import os
 import sys
 from pathlib import Path
 
@@ -149,6 +150,11 @@ def main():
         choices=["bf16", "fp32", "autocast_bf16"],
         help="Which runtime dtype to perform generation in.",
     )
+    parser.add_argument(
+        "--use_distributed_cfg",
+        action="store_true",
+        help="Use distributed CFG (classifier-free guidance) across 2 devices for Wan pipeline. Requires even world size.",
+    )
     args = parser.parse_args()
     # Setup logging
     logging.basicConfig(
@@ -183,6 +189,13 @@ def main():
     elif args.dtype == "fp32":
         kwargs["torch_dtype"] = torch.float32
 
+    # Set RNG seed
+    seed_dist_offset = int(os.getenv("RANK", "0"))
+    if args.use_distributed_cfg:
+        # The same seed is needed for each pair of workers in distributed CFG mode
+        seed_dist_offset = seed_dist_offset // 2
+    set_seed(args.seed + seed_dist_offset)
+
     # Generate images
     if args.pipeline_type == "stable_diffusion":
         pipeline: GaudiTextToVideoSDPipeline = GaudiTextToVideoSDPipeline.from_pretrained(
@@ -199,7 +212,6 @@ def main():
         return None
 
     if args.pipeline_type == "stable_diffusion":
-        set_seed(args.seed)
         outputs = pipeline(
             prompt=args.prompts,
             num_videos_per_prompt=args.num_videos_per_prompt,
@@ -242,13 +254,13 @@ def main():
         filename = video_save_dir / "cogvideoX_out.mp4"
         export_to_video(video, str(filename.resolve()), fps=8)
     elif args.pipeline_type == "wan":
-        set_seed(args.seed)
         outputs = pipeline(
             prompt=args.prompts,
             num_videos_per_prompt=args.num_videos_per_prompt,
             num_inference_steps=args.num_inference_steps,
             guidance_scale=args.guidance_scale,
             negative_prompt=args.negative_prompts,
+            use_distributed_cfg=args.use_distributed_cfg,
             output_type="np" if args.output_type == "mp4" else args.output_type,
             **kwargs_call,
         )
@@ -262,11 +274,20 @@ def main():
     if args.output_type == "mp4":
         video_save_dir = Path(args.video_save_dir)
         video_save_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Saving videos in {video_save_dir.resolve()}...")
 
-        for i, video in enumerate(outputs.frames):
-            filename = video_save_dir / f"wan_video_{i + 1}.mp4"
-            export_to_video(video, str(filename.resolve()), fps=16)
+        rank = int(os.getenv("RANK", "0"))
+        world_size = int(os.getenv("WORLD_SIZE", "1"))
+        rank_ext = f"_rank{rank}" if world_size > 1 else ""
+        skip_rank = False
+        if args.use_distributed_cfg and world_size > 1:
+            rank_ext += f"and{rank + 1}"
+            skip_rank = rank % 2 == 1
+
+        if not skip_rank:
+            logger.info(f"Saving videos in {video_save_dir.resolve()}...")
+            for i, video in enumerate(outputs.frames):
+                filename = video_save_dir / f"wan_video_{i + 1}{rank_ext}.mp4"
+                export_to_video(video, str(filename.resolve()), fps=16)
     else:
         logger.warning("--output_type should be equal to 'mp4' to save videos in --video_save_dir.")
```
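Both scripts dedupe output files the same way: in a plain multi-card run every rank saves its own video with a `_rank{r}` suffix, while in distributed CFG mode both ranks of a pair hold the same guided video, so the even rank writes it (tagged `_rank{r}and{r+1}`) and the odd rank skips saving. A standalone sketch of that naming logic (`save_plan` is a hypothetical helper extracted from the diff, not a function in the scripts):

```python
def save_plan(rank: int, world_size: int, use_distributed_cfg: bool) -> tuple[str, bool]:
    # Returns (filename suffix, whether this rank skips saving),
    # mirroring the rank_ext / skip_rank logic in the diff above.
    rank_ext = f"_rank{rank}" if world_size > 1 else ""
    skip_rank = False
    if use_distributed_cfg and world_size > 1:
        rank_ext += f"and{rank + 1}"
        skip_rank = rank % 2 == 1
    return rank_ext, skip_rank


assert save_plan(0, 1, False) == ("", False)           # single card: plain filename
assert save_plan(1, 4, False) == ("_rank1", False)     # multi-card: every rank saves its own video
assert save_plan(0, 2, True) == ("_rank0and1", False)  # CFG pair: even rank saves for the pair
assert save_plan(1, 2, True) == ("_rank1and2", True)   # CFG pair: odd rank skips (duplicate video)
```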
