# Source: FastVideo/examples/train/distill_wan2.1_t2v_1.3B_dmd2.yaml
# at commit 9da01fa56fb30a23eaaf8f639e5da34c3a477190 (FoundationResearch/FastVideo, GitHub)
# DMD2 distillation: Wan 2.1 T2V 1.3B (teacher 50-step -> student 4-step).
#
# - Teacher: frozen pretrained Wan 2.1 T2V 1.3B
# - Student: trainable, initialized from the same pretrained weights
# - Critic:  trainable, initialized from the same pretrained weights
# - Validation: 4-step SDE sampling

# Model roles for this run. All three roles use the same `_target_` and the
# same `init_from` checkpoint; they differ only in `trainable` and in whether
# `disable_custom_init_weights` is set.
# NOTE(review): `_target_` looks like a dotted import path resolved by the
# config loader — confirm.
models:
  # Student: trainable; the only role that does not set
  # `disable_custom_init_weights`.
  student:
    _target_: fastvideo.train.models.wan.WanModel
    init_from: Wan-AI/Wan2.1-T2V-1.3B-Diffusers
    trainable: true
  # Teacher: frozen (`trainable: false`).
  teacher:
    _target_: fastvideo.train.models.wan.WanModel
    init_from: Wan-AI/Wan2.1-T2V-1.3B-Diffusers
    trainable: false
    # NOTE(review): presumably skips a custom weight-initialization pass so
    # the loaded weights are kept as-is — confirm against WanModel.
    disable_custom_init_weights: true
  # Critic: trainable, starting from the same checkpoint as the other roles.
  critic:
    _target_: fastvideo.train.models.wan.WanModel
    init_from: Wan-AI/Wan2.1-T2V-1.3B-Diffusers
    trainable: true
    disable_custom_init_weights: true

# Training method selection and its method-specific hyperparameters.
method:
  _target_: fastvideo.train.methods.distribution_matching.dmd2.DMD2Method
  # NOTE(review): meaning of `simulate` is defined by DMD2Method — confirm.
  rollout_mode: simulate
  # NOTE(review): presumably the student (generator) is updated once every
  # 5 iterations while the critic is updated every iteration — confirm.
  generator_update_interval: 5
  # NOTE(review): presumably the guidance scale applied to the teacher
  # ("real score") prediction — confirm.
  real_score_guidance_scale: 4.5
  # Four timesteps; the same list is used for
  # `callbacks.validation.sampling_timesteps` in this file.
  dmd_denoising_steps: [1000, 750, 500, 250]

  # Critic optimizer (required — no fallback to training.optimizer)
  # The learning rate here (8.0e-6) is 4x `training.optimizer.learning_rate`
  # (2.0e-6); the betas are identical to `training.optimizer.betas`.
  fake_score_learning_rate: 8.0e-6
  fake_score_betas: [0.0, 0.999]
  fake_score_lr_scheduler: constant

# Run-level training settings: parallelism layout, data, optimizer, loop
# length, checkpointing, experiment tracking, and memory options.
training:
  # Parallelism layout. hsdp_replicate_dim (1) x hsdp_shard_dim (8) equals
  # num_gpus (8).
  # NOTE(review): sp_size/tp_size of 1 presumably mean sequence and tensor
  # parallelism are not used — confirm.
  distributed:
    num_gpus: 8
    sp_size: 1
    tp_size: 1
    hsdp_replicate_dim: 1
    hsdp_shard_dim: 8

  # Dataset location and sample geometry. The directory name encodes
  # 77x448x832, which matches num_frames / num_height / num_width below.
  data:
    data_path: data/Wan-Syn_77x448x832_600k
    dataloader_num_workers: 4
    # NOTE(review): unclear from this file whether this is per GPU or
    # global — confirm.
    train_batch_size: 1
    # NOTE(review): presumably the probability of dropping the text condition
    # during training; 0.0 would mean it is never dropped — confirm.
    training_cfg_rate: 0.0
    seed: 1000
    # NOTE(review): presumably the latent temporal length for 77 frames
    # ((77 - 1) / 4 + 1 = 20 under 4x temporal compression) — confirm against
    # the VAE.
    num_latent_t: 20
    num_height: 448
    num_width: 832
    num_frames: 77

  # Optimizer settings. The critic has its own settings under
  # `method.fake_score_*`.
  # NOTE(review): presumably these apply to the student only — confirm.
  optimizer:
    learning_rate: 2.0e-6
    betas: [0.0, 0.999]
    weight_decay: 0.01
    # Constant schedule with zero warmup steps.
    lr_scheduler: constant
    lr_warmup_steps: 0

  # Loop length; no gradient accumulation (1 step per optimizer update).
  loop:
    max_train_steps: 4000
    gradient_accumulation_steps: 1

  # Checkpointing. A save every 1000 steps over 4000 training steps yields up
  # to 4 save points, one more than checkpoints_total_limit.
  # NOTE(review): presumably the oldest checkpoint is pruned once the limit
  # is exceeded — confirm.
  checkpoint:
    output_dir: outputs/wan2.1_dmd2_4steps
    training_state_checkpointing_steps: 1000
    checkpoints_total_limit: 3

  # Experiment tracker naming.
  tracker:
    project_name: distillation_wan
    run_name: wan2.1_dmd2_4steps

  # NOTE(review): `full` presumably enables activation checkpointing on all
  # eligible layers to reduce memory — confirm accepted values.
  model:
    enable_gradient_checkpointing_type: full

# Training-loop callbacks: gradient clipping and periodic validation.
callbacks:
  grad_clip:
    # NOTE(review): presumably a global gradient-norm clip threshold — confirm.
    max_grad_norm: 1.0
  # Validation runs every 50 steps using the pipeline class named below.
  validation:
    pipeline_target: fastvideo.pipelines.basic.wan.wan_pipeline.WanPipeline
    # NOTE(review): relative path; presumably resolved from the repository
    # root — confirm.
    dataset_file: examples/training/finetune/Wan2.1-VSA/Wan-Syn-Data/validation_4.json
    every_steps: 50
    # One validation setting: 4 sampling steps with the SDE sampler.
    sampling_steps: [4]
    sampler_kind: sde
    # Same four timesteps as `method.dmd_denoising_steps`.
    sampling_timesteps: [1000, 750, 500, 250]
    # NOTE(review): differs from `method.real_score_guidance_scale` (4.5);
    # confirm whether guidance is actually applied to the student during
    # validation.
    guidance_scale: 6.0

# Pipeline-level settings.
pipeline:
  # NOTE(review): presumably the timestep shift used by the flow-matching
  # scheduler — confirm against the Wan pipeline config.
  flow_shift: 8