Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions .buildkite/performance-benchmarks/tests/wan-t2v-1.3b.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"benchmark_id": "wan-t2v-1.3b-2gpu",
"description": "Wan2.1 T2V 1.3B inference performance",
"model": {
"model_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
"model_short_name": "Wan2.1-T2V-1.3B"
},
"init_kwargs": {
"num_gpus": 2,
"flow_shift": 7.0,
"sp_size": 2,
"tp_size": 1,
"vae_sp": true,
"vae_tiling": true,
"text_encoder_precisions": ["fp32"]
},
"generation_kwargs": {
"height": 480,
"width": 832,
"num_frames": 45,
"num_inference_steps": 4,
"guidance_scale": 3,
"embedded_cfg_scale": 6,
"seed": 1024,
"fps": 24,
"neg_prompt": "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
},
"test_prompts": [
"Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of a bustling street food market. The scene captures a mix of humor and authenticity. Mid-shot framing, vibrant lighting."
],
"run_config": {
"num_warmup_runs": 1,
"num_measurement_runs": 3,
"required_gpus": 2
},
"thresholds": {
"L40S": {
"max_generation_time_s": 34.0,
"max_peak_memory_mb": 11000.0
},
"default": {
"max_generation_time_s": 120.0,
"max_peak_memory_mb": 30000.0
}
}
}
18 changes: 18 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,24 @@ steps:
- TEST_TYPE=unit_test
agents:
queue: "default"
- path:
- "fastvideo/models/dits/**"
- "fastvideo/pipelines/**"
- "fastvideo/attention/**"
- "fastvideo/layers/**"
- "fastvideo/worker/**"
- "fastvideo/entrypoints/**"
- "fastvideo/tests/performance/**"
- ".buildkite/performance-benchmarks/**"
- "pyproject.toml"
- "docker/Dockerfile.python3.12"
config:
command: "timeout 30m .buildkite/scripts/pr_test.sh"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The command for the "Performance Tests" step includes timeout 30m. A similar timeout (timeout=1800) is also specified in the run_performance_tests function within fastvideo/tests/modal/pr_test.py. It's generally better to have a single source of truth for timeouts to avoid confusion and potential conflicts. Consider removing one of these timeouts or clarifying their intended roles (e.g., Buildkite timeout as a failsafe, Modal timeout as the primary control).

label: "Performance Tests"
env:
- TEST_TYPE=performance
agents:
queue: "default"
- path:
- "fastvideo/entrypoints/openai/**"
- "fastvideo/entrypoints/cli/serve.py"
Expand Down
4 changes: 4 additions & 0 deletions .buildkite/scripts/pr_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ case "$TEST_TYPE" in
log "Running LoRA extraction tests..."
MODAL_COMMAND="$MODAL_ENV HF_API_KEY=$HF_API_KEY python3 -m modal run $MODAL_TEST_FILE::run_lora_extraction_tests"
;;
"performance")
log "Running performance tests..."
MODAL_COMMAND="$MODAL_ENV HF_API_KEY=$HF_API_KEY python3 -m modal run $MODAL_TEST_FILE::run_performance_tests"
;;
"api_server")
log "Running API server integration tests..."
MODAL_COMMAND="$MODAL_ENV HF_API_KEY=$HF_API_KEY python3 -m modal run $MODAL_TEST_FILE::run_api_server_tests"
Expand Down
1 change: 1 addition & 0 deletions docs/inference/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ Specialized variants: `CausalDenoisingStage`, `LTX2DenoisingStage`,
`positive_int_divisible(divisor)`, etc.

`VerificationResult` collects check results:

```python
result = VerificationResult()
result.add_check("height", batch.height, V.positive_int_divisible(8))
Expand Down
14 changes: 14 additions & 0 deletions fastvideo/tests/modal/pr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,20 @@ def run_lora_extraction_tests():
)


# Runs the performance pytest suite on a 2-GPU L40S Modal worker.
# The HF token is injected from the caller's environment so gated model
# weights can be downloaded inside the container.
# NOTE(review): timeout=1800 here duplicates the `timeout 30m` wrapper in
# .buildkite/pipeline.yml — keep the two in sync (Buildkite acts as the
# outer failsafe, this is the primary control).
@app.function(gpu="L40S:2",
              image=image,
              timeout=1800,
              secrets=[
                  modal.Secret.from_dict(
                      {"HF_API_KEY": os.environ.get("HF_API_KEY", "")})
              ],
              volumes={"/root/data": model_vol})
def run_performance_tests():
    """Run the config-driven inference performance tests on Modal."""
    # HF_HOME points at the mounted volume so model downloads are cached
    # across runs; auth is required for gated repos.
    run_test(
        "export HF_HOME='/root/data/.cache' && hf auth login --token $HF_API_KEY && pytest ./fastvideo/tests/performance -vs"
    )


@app.function(gpu="L40S:1",
image=image,
timeout=1800,
Expand Down
Empty file.
199 changes: 199 additions & 0 deletions fastvideo/tests/performance/test_inference_performance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
"""Config-driven inference performance tests.

Benchmark configs live in .buildkite/performance-benchmarks/tests/*.json.
Each JSON file defines model params, generation kwargs, run config, and
per-device thresholds. This test module auto-discovers all configs and
parametrizes a single test function over them.
"""
import glob
import json
import os
import time
from datetime import datetime, timezone

import torch
import pytest

from fastvideo import VideoGenerator
from fastvideo.logger import init_logger
from fastvideo.worker.multiproc_executor import MultiprocExecutor

logger = init_logger(__name__)

# -- Config discovery -------------------------------------------------------

# Benchmark configs live under <repo_root>/.buildkite/performance-benchmarks/tests.
# Walk three levels up from this file (performance -> tests -> fastvideo ->
# repo root) to locate them.
_BENCHMARKS_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "..",
    "..",
    "..",
    ".buildkite",
    "performance-benchmarks",
    "tests",
)


def _discover_benchmarks():
"""Glob benchmark JSON configs and return list of (id, config) tuples."""
pattern = os.path.join(_BENCHMARKS_DIR, "*.json")
configs = []
for path in sorted(glob.glob(pattern)):
with open(path) as f:
cfg = json.load(f)
configs.append(cfg)
return configs


# Discovered once at import time so pytest.mark.parametrize can enumerate
# the configs (and their ids) during collection.
_BENCHMARK_CONFIGS = _discover_benchmarks()

# -- Helpers ----------------------------------------------------------------


def _get_thresholds(cfg, device_name=None):
    """Return the thresholds dict matching the current GPU.

    Args:
        cfg: Benchmark config dict whose ``"thresholds"`` entry maps a GPU
            name substring (e.g. ``"L40S"``) to a thresholds dict, with an
            optional ``"default"`` fallback entry.
        device_name: GPU name to match against; defaults to
            ``torch.cuda.get_device_name()``.

    Returns:
        The first thresholds dict whose key is a substring of the device
        name, otherwise the config's ``"default"`` entry, or built-in
        defaults when that is missing too.
    """
    if device_name is None:
        device_name = torch.cuda.get_device_name()
    thresholds = cfg.get("thresholds", {})
    for gpu_key, thresh in thresholds.items():
        # "default" is the fallback entry, never a device-name match; the
        # original loop would have matched it had a device name ever
        # contained the substring "default".
        if gpu_key != "default" and gpu_key in device_name:
            logger.info("Using thresholds for %s: %s", gpu_key, thresh)
            return thresh
    default = thresholds.get("default", {
        "max_generation_time_s": 120.0,
        "max_peak_memory_mb": 30000.0,
    })
    logger.warning("No thresholds for device '%s', using defaults", device_name)
    return default


def _shutdown_executor(generator):
if generator is None:
return
if isinstance(generator.executor, MultiprocExecutor):
generator.executor.shutdown()


def _run_generation(generator, prompt, generation_kwargs):
    """Time one video generation; return (elapsed_s, peak_memory_mb).

    CUDA work is synchronized on both sides of the call so the wall-clock
    measurement covers all queued GPU kernels.
    """
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    result = generator.generate_video(prompt, **generation_kwargs)
    torch.cuda.synchronize()
    wall_time = time.perf_counter() - t0
    # `peak_memory_mb` may be absent or None in the result; normalize
    # both cases to 0.0.
    peak_mb = result.get("peak_memory_mb", 0.0) or 0.0
    return wall_time, peak_mb


def _write_results(results):
    """Persist *results* as a timestamped JSON file under ./results."""
    here = os.path.dirname(os.path.abspath(__file__))
    out_dir = os.path.join(here, "results")
    os.makedirs(out_dir, exist_ok=True)

    # Timestamped filename so repeated runs of the same benchmark never
    # overwrite each other.
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    benchmark_id = results.get("benchmark_id", "unknown")
    out_path = os.path.join(out_dir, f"perf_{benchmark_id}_{stamp}.json")

    with open(out_path, "w") as f:
        json.dump(results, f, indent=2)
    logger.info("Performance results written to %s", out_path)


# -- Test -------------------------------------------------------------------


@pytest.mark.parametrize(
    "cfg",
    _BENCHMARK_CONFIGS,
    ids=[c["benchmark_id"] for c in _BENCHMARK_CONFIGS],
)
def test_inference_performance(cfg):
    """Measure generation latency and peak GPU memory,
    assert against device-aware thresholds."""
    # Skip rather than fail when the host has fewer GPUs than the config
    # requires (e.g. a 2-GPU benchmark on a 1-GPU runner).
    run_config = cfg.get("run_config", {})
    required_gpus = run_config.get("required_gpus", 1)
    available = torch.cuda.device_count()
    if available < required_gpus:
        pytest.skip(f"Need {required_gpus} GPUs, only {available} available")

    model_info = cfg["model"]
    # Shallow copies so the pop/insert below don't mutate the shared
    # module-level config dict across parametrized runs.
    init_kwargs = dict(cfg.get("init_kwargs", {}))
    gen_kwargs = dict(cfg.get("generation_kwargs", {}))
    prompts = cfg.get("test_prompts", ["A cinematic video."])
    prompt = prompts[0]

    num_warmup = run_config.get("num_warmup_runs", 1)
    num_measure = run_config.get("num_measurement_runs", 3)
    thresholds = _get_thresholds(cfg)

    # Remap JSON keys to VideoGenerator kwargs
    # (JSON has no tuple type, so the precision list is converted here).
    text_enc_prec = init_kwargs.pop("text_encoder_precisions", None)
    if text_enc_prec is not None:
        init_kwargs["text_encoder_precisions"] = tuple(text_enc_prec)

    # Output directory for generated videos
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "generated_videos",
                              cfg["benchmark_id"])
    os.makedirs(output_dir, exist_ok=True)
    gen_kwargs["output_path"] = output_dir

    generator = None
    try:
        generator = VideoGenerator.from_pretrained(
            model_path=model_info["model_path"],
            **init_kwargs,
        )

        # Warmup runs are excluded from measurement: the first generation
        # pays one-time costs (weight loading side effects, caches).
        for i in range(num_warmup):
            logger.info("Warmup run %d/%d", i + 1, num_warmup)
            _run_generation(generator, prompt, gen_kwargs)

        times = []
        peak_memories = []
        for i in range(num_measure):
            logger.info("Measurement run %d/%d", i + 1, num_measure)
            elapsed, peak_mb = _run_generation(generator, prompt, gen_kwargs)
            logger.info(" Time: %.2fs, Peak memory: %.0fMB", elapsed, peak_mb)
            times.append(elapsed)
            peak_memories.append(peak_mb)
    finally:
        # Always release worker processes, even when a run raised.
        _shutdown_executor(generator)

    avg_time = sum(times) / len(times)
    max_peak_memory = max(peak_memories)
    device_name = torch.cuda.get_device_name()

    # Full result record is written to disk for CI artifact collection
    # regardless of whether the assertions below pass.
    results = {
        "benchmark_id": cfg["benchmark_id"],
        "model_short_name": model_info.get("model_short_name", ""),
        "device": device_name,
        "num_gpus": init_kwargs.get("num_gpus", 1),
        "num_warmup_runs": num_warmup,
        "num_measurement_runs": num_measure,
        "avg_generation_time_s": round(avg_time, 3),
        "individual_times_s": [round(t, 3) for t in times],
        "max_peak_memory_mb": round(max_peak_memory, 1),
        "individual_peak_memories_mb": [round(m, 1) for m in peak_memories],
        "thresholds": thresholds,
        "commit": os.environ.get("BUILDKITE_COMMIT", ""),
        "pr_number": os.environ.get("BUILDKITE_PULL_REQUEST", ""),
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

    logger.info(
        "Performance results: avg_time=%.2fs, "
        "max_peak_memory=%.0fMB", avg_time, max_peak_memory)
    _write_results(results)

    max_time = thresholds["max_generation_time_s"]
    max_mem = thresholds["max_peak_memory_mb"]

    # Latency is compared as the mean over measurement runs; memory as the
    # worst (max) peak across runs.
    assert avg_time <= max_time, (
        f"Average generation time {avg_time:.2f}s exceeds "
        f"threshold {max_time:.1f}s for {device_name}")

    assert max_peak_memory <= max_mem, (
        f"Peak memory {max_peak_memory:.0f}MB exceeds "
        f"threshold {max_mem:.0f}MB for {device_name}")
13 changes: 5 additions & 8 deletions fastvideo/worker/multiproc_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,9 @@ def execute_forward(self, forward_batch: ForwardBatch,
else:
logging_info = None

# Get extra dict (contains audio, etc.)
# Get extra dict (contains audio, peak_memory_mb, etc.)
extra = responses[0].get("extra", {})

peak_memory_mb = responses[0].get("peak_memory_mb", 0.0)
extra["peak_memory_mb"] = peak_memory_mb

result_batch = ForwardBatch(data_type=forward_batch.data_type,
output=output,
logging_info=logging_info,
Expand Down Expand Up @@ -650,18 +647,18 @@ def worker_busy_loop(self) -> None:
fastvideo_args = kwargs['fastvideo_args']
output_batch = self.worker.execute_forward(
forward_batch, fastvideo_args)
peak_memory_mb = torch.cuda.max_memory_allocated() / (
1024 * 1024)
logging_info = None
if envs.FASTVIDEO_STAGE_LOGGING:
logging_info = output_batch.logging_info
# result tensor shared by CUDA IPC to avoid serialization overhead
result = output_batch.output
extra = output_batch.extra or {}
extra["peak_memory_mb"] = (
torch.cuda.max_memory_allocated() / (1024 * 1024))
self.pipe.send({
"output_batch": result,
"logging_info": logging_info,
"extra": output_batch.extra,
"peak_memory_mb": peak_memory_mb,
"extra": extra,
})
else:
result = self.worker.execute_method(
Expand Down