FastVideo/examples/inference/cli/v1_inference_longcat_i2v.sh at 64f822034426ce5422e60e788d436c9441139968 · hao-ai-lab/FastVideo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/bin/bash

# LongCat Image-to-Video (I2V) Inference Script
#
# This script runs LongCat I2V inference using the fastvideo CLI.
# LongCat I2V takes an input image and generates a video from it.
#
# Usage:
#   bash examples/inference/cli/v1_inference_longcat_i2v.sh
#
# Prerequisites:
#   - Install fastvideo: pip install -e .
#   - The model weights will be auto-downloaded from HuggingFace
#   - Or use local weights if you have them

# Fail fast: abort on command errors, on use of unset variables (catches
# typos in the variable names below), and on failures inside pipelines.
set -euo pipefail

# Number of GPUs to use; passed to fastvideo as both --num-gpus and --sp-size.
num_gpus=1

# Exported as an empty string on purpose.
# NOTE(review): presumably an empty value lets fastvideo choose its default
# attention backend -- confirm against the fastvideo documentation.
export FASTVIDEO_ATTENTION_BACKEND=

# Model path options:
# Option 1: HuggingFace model (auto-downloaded)
export MODEL_BASE=FastVideo/LongCat-Video-I2V-Diffusers

# Option 2: Local weights (uncomment if you have local weights)
# export MODEL_BASE=weights/longcat-for-i2v

# Input image path (must be square for LongCat I2V)
IMAGE_PATH="assets/girl.png"

# Stop before launching inference if the input image is missing.
# Diagnostics go to stderr so they are not mixed into captured stdout.
if [[ ! -f "$IMAGE_PATH" ]]; then
    echo "Error: Image not found at $IMAGE_PATH" >&2
    echo "Please provide a valid image path" >&2
    exit 1
fi

# Run LongCat I2V generation through the fastvideo CLI.
# Reads MODEL_BASE, num_gpus and IMAGE_PATH set earlier in this script.
# Every expansion is quoted so that a model path containing spaces or glob
# characters is passed to fastvideo as a single argument.
fastvideo generate \
    --model-path "$MODEL_BASE" \
    --sp-size "$num_gpus" \
    --tp-size 1 \
    --num-gpus "$num_gpus" \
    --dit-cpu-offload False \
    --vae-cpu-offload True \
    --text-encoder-cpu-offload True \
    --pin-cpu-memory False \
    --enable-bsa False \
    --image-path "$IMAGE_PATH" \
    --height 480 \
    --width 480 \
    --num-frames 93 \
    --num-inference-steps 50 \
    --fps 15 \
    --guidance-scale 4.0 \
    --prompt "A woman sits at a wooden table by the window in a cozy café. She reaches out with her right hand, picks up the white coffee cup from the saucer, and gently brings it to her lips to take a sip. After drinking, she places the cup back on the table and looks out the window, enjoying the peaceful atmosphere." \
    --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
    --seed 42 \
    --output-path outputs_video/longcat_i2v