auto-model-optim/goals.json at main · heygen-com/auto-model-optim · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
{
  "task": "Minimize wall-clock latency of decoding one fixed 3s 1080p latent through the Wan 2.2 VAE decoder on a single NVIDIA H100 80GB, WITHOUT degrading output quality beyond the quality_gate defined below.",
  "device": {
    "name": "NVIDIA H100 80GB HBM3",
    "count": 1,
    "note": "Inference only. No retraining, no weight edits, no architecture changes to model/."
  },
  "benchmark": {
    "input": "assets/latent_3s.pt",
    "input_shape": [48, 20, 68, 120],
    "output_shape_pixels": "[3, 77, 1088, 1920] (frozen by the reference; must match exactly)",
    "warmup_runs": 5,
    "timed_runs": 10,
    "metric": "mean wall-clock seconds over timed_runs (lower is better); report both mean and std",
    "protocol": "call torch.cuda.synchronize() immediately before and immediately after each run; gc disabled during the timed region; the GPU must be free (no other workloads running) before measuring"
  },
  "quality_gate": {
    "reference": "assets/reference_fp32.pt",
    "reference_def": "Output of the UNOPTIMIZED fp32 Wan 2.2 VAE decode of assets/latent_3s.pt. Frozen. Generated once by harness/make_reference.py.",
    "must_match_shape": true,
    "max_abs_diff_tol": 0.10,
    "psnr_min_db": 30.0,
    "rule": "PASS only if the output shape matches AND max_abs_diff <= max_abs_diff_tol AND psnr >= psnr_min_db. A faster result that fails the gate is DISCARDED."
  },
  "baseline": {
    "config": "fp32, eager (no torch.compile), no patches",
    "expected_latency_s": "~14-15 (measure it; this is exp_000)"
  },
  "target": {
    "primary": "Reproduce or beat the prior result of ~4.785s (i.e. a speedup of about 3x or more over the fp32 baseline) while passing the quality_gate.",
    "stretch": "Find any new, gate-passing win below 4.785s.",
    "floor_note": "The decoder is conv-bound; do not expect to beat the cuDNN conv floor. Document the floor."
  },
  "stop_conditions": [
    "Quality-gate PASS with latency at or under the target.primary latency (~4.785s) AND a fresh-context grader confirms the win -> done.",
    "No gate-passing improvement for 8 consecutive experiments -> summarize in PROGRESS.md and escalate.",
    "Hard cap: 60 experiments OR wall-clock budget exhausted."
  ],
  "_warning": "FROZEN. The agent MUST NOT edit goals.json, anything in harness/, or anything in model/. Editing the evaluator to pass is reward hacking and is strictly prohibited."
}