
How to quickly get each model's layer shapes #306

@TangZhexiang

Description

Hi, your work is amazing and really useful. I want to try the OPT-1.3B model here, so I wrote a script to generate the same GPT-2 layer_shapes you provided. The code I wrote is:

#!/usr/bin/env python3
"""
generate_gpt2_yaml_flex.py
--------------------------
  • Read the actual shape of GPT‑2 / OPT / LLaMA model weights
  • --p      : Set P (token‑tile), default = cfg.n_positions
  • --mode   : whole (6 GEMMs per layer) or per_head (split to match Timeloop benchmark style)

Example usage:
  python3 generate_gpt2_yaml_flex.py gpt2-medium
  python3 generate_gpt2_yaml_flex.py gpt2-medium --p 256 --mode per_head
"""
import os, sys, argparse, torch
from transformers import AutoModelForCausalLM

def yaml_snippet(C, M, P):
    return (f"{{{{include_text('../problem_base.yaml')}}}}\n"
            f"problem:\n"
            f"  <<<: *problem_base\n"
            f"  instance: {{C: {C}, M: {M}, P: {P}}}\n")

def parse():
    ap = argparse.ArgumentParser()
    ap.add_argument("model")
    ap.add_argument("--p", type=int, help="P dimension (token‑tile); default = n_positions")
    ap.add_argument("--mode", choices=["whole", "per_head"], default="whole")
    return ap.parse_args()

def main():
    args   = parse()
    model  = AutoModelForCausalLM.from_pretrained(
                 args.model, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    cfg    = model.config
    blocks = getattr(model, "transformer", model).h  # GPT-2 layout; OPT/LLaMA nest their blocks under model.model.*
    hidden = cfg.n_embd
    heads  = cfg.n_head
    head_d = hidden // heads
    inter  = cfg.n_inner or hidden * 4
    P_val  = args.p or cfg.n_positions

    out_dir = os.path.join("layer_shapes", args.model.replace("/", "_"))
    os.makedirs(out_dir, exist_ok=True)
    idx = 0
    def dump(C, M, P=P_val):
        nonlocal idx
        with open(os.path.join(out_dir, f"{idx:03d}.yaml"), "w") as f:
            f.write(yaml_snippet(C, M, P))
        idx += 1

    # ---------- Per-layer processing ----------
    for blk in blocks:
        if args.mode == "whole":
            # Q/K/V   (hidden , hidden)  ×3
            for _ in range(3): dump(hidden, hidden)
            # Wo
            dump(hidden, hidden)
        else:          # Split per-head
            # Q/K/V  :  C=seq_tile , M=head_dim , P=heads
            for _ in range(3): dump(P_val, head_d, heads)
            # Wo per‑head : C=seq_tile , M=head_dim , P=heads
            dump(P_val, head_d, heads)

        # FFN up / down (shared for both modes)
        dump(hidden, inter)   # up projection
        dump(inter, hidden)   # down projection

    # ---------- Final output projection (LM head) ----------
    lm_w = model.lm_head.weight if hasattr(model, "lm_head") else model.transformer.wte.weight
    dump(lm_w.shape[1], lm_w.shape[0])

    print(f"[OK] Generated {idx} YAML files → {out_dir}")

if __name__ == "__main__":
    main()

But the results are not exactly the same as the ones you provided, and I am wondering why. Could you share the generation code? Or is there more than one valid set of layer_shapes?
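
For completeness, here is a sketch of how I would look up the equivalent dimensions for OPT and LLaMA, since the script above uses GPT-2-only config attribute names (n_embd, n_head, n_positions) and module layout (model.transformer.h). The OPT and LLaMA attribute names below are the standard Hugging Face config fields, but this helper is only an illustration, not the generation code behind the provided layer_shapes:

def model_dims(model):
    """Return (blocks, hidden, heads, inter, n_pos) for GPT-2 / OPT / LLaMA checkpoints."""
    cfg = model.config
    if hasattr(model, "transformer"):            # GPT-2: blocks live in model.transformer.h
        return (model.transformer.h, cfg.n_embd, cfg.n_head,
                cfg.n_inner or 4 * cfg.n_embd, cfg.n_positions)
    if hasattr(model.model, "decoder"):          # OPT: blocks live in model.model.decoder.layers
        return (model.model.decoder.layers, cfg.hidden_size, cfg.num_attention_heads,
                cfg.ffn_dim, cfg.max_position_embeddings)
    # LLaMA-style models: blocks live in model.model.layers
    return (model.model.layers, cfg.hidden_size, cfg.num_attention_heads,
            cfg.intermediate_size, cfg.max_position_embeddings)

With this, main() could call blocks, hidden, heads, inter, n_pos = model_dims(model) instead of reading cfg.n_embd and friends directly, which is the first place facebook/opt-1.3b would break, since OPTForCausalLM has no .transformer attribute.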
