Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions .buildkite/performance-benchmarks/tests/wan-t2v-1.3b.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"benchmark_id": "wan-t2v-1.3b-2gpu",
"description": "Wan2.1 T2V 1.3B inference performance",
"model": {
"model_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
"model_short_name": "Wan2.1-T2V-1.3B"
},
"init_kwargs": {
"num_gpus": 2,
"flow_shift": 7.0,
"sp_size": 2,
"tp_size": 1,
"vae_sp": true,
"vae_tiling": true,
"text_encoder_precisions": ["fp32"]
},
"generation_kwargs": {
"height": 480,
"width": 832,
"num_frames": 45,
"num_inference_steps": 4,
"guidance_scale": 3,
"embedded_cfg_scale": 6,
"seed": 1024,
"fps": 24,
"neg_prompt": "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
},
"test_prompts": [
"Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of a bustling street food market. The scene captures a mix of humor and authenticity. Mid-shot framing, vibrant lighting."
],
"run_config": {
"num_warmup_runs": 1,
"num_measurement_runs": 3,
"required_gpus": 2
},
"thresholds": {
"L40S": {
"max_generation_time_s": 34.0,
"max_peak_memory_mb": 11000.0
},
"default": {
"max_generation_time_s": 120.0,
"max_peak_memory_mb": 30000.0
}
}
}
18 changes: 18 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,24 @@ steps:
- TEST_TYPE=unit_test
agents:
queue: "default"
- path:
- "fastvideo/models/dits/**"
- "fastvideo/pipelines/**"
- "fastvideo/attention/**"
- "fastvideo/layers/**"
- "fastvideo/worker/**"
- "fastvideo/entrypoints/**"
- "fastvideo/tests/performance/**"
- ".buildkite/performance-benchmarks/**"
- "pyproject.toml"
- "docker/Dockerfile.python3.12"
config:
command: "timeout 30m .buildkite/scripts/pr_test.sh"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The command for the "Performance Tests" step includes timeout 30m. A similar timeout (timeout=1800) is also specified in the run_performance_tests function within fastvideo/tests/modal/pr_test.py. It's generally better to have a single source of truth for timeouts to avoid confusion and potential conflicts. Consider removing one of these timeouts or clarifying their intended roles (e.g., Buildkite timeout as a failsafe, Modal timeout as the primary control).

label: "Performance Tests"
env:
- TEST_TYPE=performance
agents:
queue: "default"
- path:
- "fastvideo/entrypoints/openai/**"
- "fastvideo/entrypoints/cli/serve.py"
Expand Down
4 changes: 4 additions & 0 deletions .buildkite/scripts/pr_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ case "$TEST_TYPE" in
log "Running LoRA extraction tests..."
MODAL_COMMAND="$MODAL_ENV HF_API_KEY=$HF_API_KEY python3 -m modal run $MODAL_TEST_FILE::run_lora_extraction_tests"
;;
"performance")
log "Running performance tests..."
MODAL_COMMAND="$MODAL_ENV HF_API_KEY=$HF_API_KEY python3 -m modal run $MODAL_TEST_FILE::run_performance_tests"
;;
"api_server")
log "Running API server integration tests..."
MODAL_COMMAND="$MODAL_ENV HF_API_KEY=$HF_API_KEY python3 -m modal run $MODAL_TEST_FILE::run_api_server_tests"
Expand Down
1 change: 1 addition & 0 deletions docs/inference/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ Specialized variants: `CausalDenoisingStage`, `LTX2DenoisingStage`,
`positive_int_divisible(divisor)`, etc.

`VerificationResult` collects check results:

```python
result = VerificationResult()
result.add_check("height", batch.height, V.positive_int_divisible(8))
Expand Down
14 changes: 14 additions & 0 deletions fastvideo/tests/modal/pr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,20 @@ def run_lora_extraction_tests():
)


# Runs the performance pytest suite on a 2-GPU L40S Modal worker.
# The HF token is injected from the caller's environment so gated model
# weights can be downloaded inside the container.
# NOTE(review): timeout=1800 here duplicates the `timeout 30m` wrapper in
# .buildkite/pipeline.yml — keep the two in sync (Buildkite acts as the
# outer failsafe, this is the primary control).
@app.function(gpu="L40S:2",
              image=image,
              timeout=1800,
              secrets=[
                  modal.Secret.from_dict(
                      {"HF_API_KEY": os.environ.get("HF_API_KEY", "")})
              ],
              volumes={"/root/data": model_vol})
def run_performance_tests():
    """Run the config-driven inference performance tests on Modal."""
    # HF_HOME points at the mounted volume so model downloads are cached
    # across runs; auth is required for gated repos.
    run_test(
        "export HF_HOME='/root/data/.cache' && hf auth login --token $HF_API_KEY && pytest ./fastvideo/tests/performance -vs"
    )


@app.function(gpu="L40S:1",
image=image,
timeout=1800,
Expand Down
Empty file.
199 changes: 199 additions & 0 deletions fastvideo/tests/performance/test_inference_performance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
"""Config-driven inference performance tests.

Benchmark configs live in .buildkite/performance-benchmarks/tests/*.json.
Each JSON file defines model params, generation kwargs, run config, and
per-device thresholds. This test module auto-discovers all configs and
parametrizes a single test function over them.
"""
import glob
import json
import os
import time
from datetime import datetime, timezone

import torch
import pytest

from fastvideo import VideoGenerator
from fastvideo.logger import init_logger
from fastvideo.worker.multiproc_executor import MultiprocExecutor

logger = init_logger(__name__)

# -- Config discovery -------------------------------------------------------

# Benchmark configs live under <repo_root>/.buildkite/performance-benchmarks/tests.
# Walk three levels up from this file (performance -> tests -> fastvideo ->
# repo root) to locate them.
_BENCHMARKS_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "..",
    "..",
    "..",
    ".buildkite",
    "performance-benchmarks",
    "tests",
)


def _discover_benchmarks():
"""Glob benchmark JSON configs and return list of (id, config) tuples."""
pattern = os.path.join(_BENCHMARKS_DIR, "*.json")
configs = []
for path in sorted(glob.glob(pattern)):
with open(path) as f:
cfg = json.load(f)
configs.append(cfg)
return configs


# Discovered once at import time so pytest.mark.parametrize can enumerate
# the configs (and their ids) during collection.
_BENCHMARK_CONFIGS = _discover_benchmarks()

# -- Helpers ----------------------------------------------------------------


def _get_thresholds(cfg, device_name=None):
    """Return the thresholds dict matching the current GPU.

    Args:
        cfg: Benchmark config dict whose ``"thresholds"`` entry maps a GPU
            name substring (e.g. ``"L40S"``) to a thresholds dict, with an
            optional ``"default"`` fallback entry.
        device_name: GPU name to match against; defaults to
            ``torch.cuda.get_device_name()``.

    Returns:
        The first thresholds dict whose key is a substring of the device
        name, otherwise the config's ``"default"`` entry, or built-in
        defaults when that is missing too.
    """
    if device_name is None:
        device_name = torch.cuda.get_device_name()
    thresholds = cfg.get("thresholds", {})
    for gpu_key, thresh in thresholds.items():
        # "default" is the fallback entry, never a device-name match; the
        # original loop would have matched it had a device name ever
        # contained the substring "default".
        if gpu_key != "default" and gpu_key in device_name:
            logger.info("Using thresholds for %s: %s", gpu_key, thresh)
            return thresh
    default = thresholds.get("default", {
        "max_generation_time_s": 120.0,
        "max_peak_memory_mb": 30000.0,
    })
    logger.warning("No thresholds for device '%s', using defaults", device_name)
    return default


def _shutdown_executor(generator):
if generator is None:
return
if isinstance(generator.executor, MultiprocExecutor):
generator.executor.shutdown()


def _run_generation(generator, prompt, generation_kwargs):
    """Time one video generation; return (elapsed_s, peak_memory_mb).

    CUDA work is synchronized on both sides of the call so the wall-clock
    measurement covers all queued GPU kernels.
    """
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    result = generator.generate_video(prompt, **generation_kwargs)
    torch.cuda.synchronize()
    wall_time = time.perf_counter() - t0
    # `peak_memory_mb` may be absent or None in the result; normalize
    # both cases to 0.0.
    peak_mb = result.get("peak_memory_mb", 0.0) or 0.0
    return wall_time, peak_mb


def _write_results(results):
    """Persist *results* as a timestamped JSON file under ./results."""
    here = os.path.dirname(os.path.abspath(__file__))
    out_dir = os.path.join(here, "results")
    os.makedirs(out_dir, exist_ok=True)

    # Timestamped filename so repeated runs of the same benchmark never
    # overwrite each other.
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    benchmark_id = results.get("benchmark_id", "unknown")
    out_path = os.path.join(out_dir, f"perf_{benchmark_id}_{stamp}.json")

    with open(out_path, "w") as f:
        json.dump(results, f, indent=2)
    logger.info("Performance results written to %s", out_path)


# -- Test -------------------------------------------------------------------


@pytest.mark.parametrize(
    "cfg",
    _BENCHMARK_CONFIGS,
    ids=[c["benchmark_id"] for c in _BENCHMARK_CONFIGS],
)
def test_inference_performance(cfg):
    """Measure generation latency and peak GPU memory,
    assert against device-aware thresholds."""
    # Skip rather than fail when the host has fewer GPUs than the config
    # requires (e.g. a 2-GPU benchmark on a 1-GPU runner).
    run_config = cfg.get("run_config", {})
    required_gpus = run_config.get("required_gpus", 1)
    available = torch.cuda.device_count()
    if available < required_gpus:
        pytest.skip(f"Need {required_gpus} GPUs, only {available} available")

    model_info = cfg["model"]
    # Shallow copies so the pop/insert below don't mutate the shared
    # module-level config dict across parametrized runs.
    init_kwargs = dict(cfg.get("init_kwargs", {}))
    gen_kwargs = dict(cfg.get("generation_kwargs", {}))
    prompts = cfg.get("test_prompts", ["A cinematic video."])
    prompt = prompts[0]

    num_warmup = run_config.get("num_warmup_runs", 1)
    num_measure = run_config.get("num_measurement_runs", 3)
    thresholds = _get_thresholds(cfg)

    # Remap JSON keys to VideoGenerator kwargs
    # (JSON has no tuple type, so the precision list is converted here).
    text_enc_prec = init_kwargs.pop("text_encoder_precisions", None)
    if text_enc_prec is not None:
        init_kwargs["text_encoder_precisions"] = tuple(text_enc_prec)

    # Output directory for generated videos
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "generated_videos",
                              cfg["benchmark_id"])
    os.makedirs(output_dir, exist_ok=True)
    gen_kwargs["output_path"] = output_dir

    generator = None
    try:
        generator = VideoGenerator.from_pretrained(
            model_path=model_info["model_path"],
            **init_kwargs,
        )

        # Warmup runs are excluded from measurement: the first generation
        # pays one-time costs (weight loading side effects, caches).
        for i in range(num_warmup):
            logger.info("Warmup run %d/%d", i + 1, num_warmup)
            _run_generation(generator, prompt, gen_kwargs)

        times = []
        peak_memories = []
        for i in range(num_measure):
            logger.info("Measurement run %d/%d", i + 1, num_measure)
            elapsed, peak_mb = _run_generation(generator, prompt, gen_kwargs)
            logger.info(" Time: %.2fs, Peak memory: %.0fMB", elapsed, peak_mb)
            times.append(elapsed)
            peak_memories.append(peak_mb)
    finally:
        # Always release worker processes, even when a run raised.
        _shutdown_executor(generator)

    avg_time = sum(times) / len(times)
    max_peak_memory = max(peak_memories)
    device_name = torch.cuda.get_device_name()

    # Full result record is written to disk for CI artifact collection
    # regardless of whether the assertions below pass.
    results = {
        "benchmark_id": cfg["benchmark_id"],
        "model_short_name": model_info.get("model_short_name", ""),
        "device": device_name,
        "num_gpus": init_kwargs.get("num_gpus", 1),
        "num_warmup_runs": num_warmup,
        "num_measurement_runs": num_measure,
        "avg_generation_time_s": round(avg_time, 3),
        "individual_times_s": [round(t, 3) for t in times],
        "max_peak_memory_mb": round(max_peak_memory, 1),
        "individual_peak_memories_mb": [round(m, 1) for m in peak_memories],
        "thresholds": thresholds,
        "commit": os.environ.get("BUILDKITE_COMMIT", ""),
        "pr_number": os.environ.get("BUILDKITE_PULL_REQUEST", ""),
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

    logger.info(
        "Performance results: avg_time=%.2fs, "
        "max_peak_memory=%.0fMB", avg_time, max_peak_memory)
    _write_results(results)

    max_time = thresholds["max_generation_time_s"]
    max_mem = thresholds["max_peak_memory_mb"]

    # Latency is compared as the mean over measurement runs; memory as the
    # worst (max) peak across runs.
    assert avg_time <= max_time, (
        f"Average generation time {avg_time:.2f}s exceeds "
        f"threshold {max_time:.1f}s for {device_name}")

    assert max_peak_memory <= max_mem, (
        f"Peak memory {max_peak_memory:.0f}MB exceeds "
        f"threshold {max_mem:.0f}MB for {device_name}")
13 changes: 5 additions & 8 deletions fastvideo/worker/multiproc_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,9 @@ def execute_forward(self, forward_batch: ForwardBatch,
else:
logging_info = None

# Get extra dict (contains audio, etc.)
# Get extra dict (contains audio, peak_memory_mb, etc.)
extra = responses[0].get("extra", {})

peak_memory_mb = responses[0].get("peak_memory_mb", 0.0)
extra["peak_memory_mb"] = peak_memory_mb

result_batch = ForwardBatch(data_type=forward_batch.data_type,
output=output,
logging_info=logging_info,
Expand Down Expand Up @@ -650,18 +647,18 @@ def worker_busy_loop(self) -> None:
fastvideo_args = kwargs['fastvideo_args']
output_batch = self.worker.execute_forward(
forward_batch, fastvideo_args)
peak_memory_mb = torch.cuda.max_memory_allocated() / (
1024 * 1024)
logging_info = None
if envs.FASTVIDEO_STAGE_LOGGING:
logging_info = output_batch.logging_info
# result tensor shared by CUDA IPC to avoid serialization overhead
result = output_batch.output
extra = output_batch.extra or {}
extra["peak_memory_mb"] = (
torch.cuda.max_memory_allocated() / (1024 * 1024))
self.pipe.send({
"output_batch": result,
"logging_info": logging_info,
"extra": output_batch.extra,
"peak_memory_mb": peak_memory_mb,
"extra": extra,
})
else:
result = self.worker.execute_method(
Expand Down