
How to quickly get each model's layer shapes #306

@TangZhexiang

Description

Hi, your work is amazing and really useful. I want to try the OPT-1.3B model here, so I wrote a script to generate the same GPT-2 layer_shapes you provided. The code I wrote is:

#!/usr/bin/env python3
"""
generate_gpt2_yaml_flex.py
--------------------------
  • Read the actual shape of GPT‑2 / OPT / LLaMA model weights
  • --p      : Set P (token‑tile), default = cfg.n_positions
  • --mode   : whole (6 GEMMs per layer) or per_head (split to match Timeloop benchmark style)

Example usage:
  python3 generate_gpt2_yaml_flex.py gpt2-medium
  python3 generate_gpt2_yaml_flex.py gpt2-medium --p 256 --mode per_head
"""
import os, sys, argparse, torch
from transformers import AutoModelForCausalLM

def yaml_snippet(C, M, P):
    return (f"{{{{include_text('../problem_base.yaml')}}}}\n"
            f"problem:\n"
            f"  <<<: *problem_base\n"
            f"  instance: {{C: {C}, M: {M}, P: {P}}}\n")

def parse():
    ap = argparse.ArgumentParser()
    ap.add_argument("model")
    ap.add_argument("--p", type=int, help="P dimension (token‑tile); default = n_positions")
    ap.add_argument("--mode", choices=["whole", "per_head"], default="whole")
    return ap.parse_args()

def main():
    args   = parse()
    model  = AutoModelForCausalLM.from_pretrained(
                 args.model, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    cfg    = model.config
    blocks = getattr(model, "transformer", model).h  # GPT-2 layout; OPT/LLaMA nest their blocks under model.model.*
    hidden = cfg.n_embd
    heads  = cfg.n_head
    head_d = hidden // heads
    inter  = cfg.n_inner or hidden * 4
    P_val  = args.p or cfg.n_positions

    out_dir = os.path.join("layer_shapes", args.model.replace("/", "_"))
    os.makedirs(out_dir, exist_ok=True)
    idx = 0
    def dump(C, M, P=P_val):
        nonlocal idx
        with open(os.path.join(out_dir, f"{idx:03d}.yaml"), "w") as f:
            f.write(yaml_snippet(C, M, P))
        idx += 1

    # ---------- Per-layer processing ----------
    for blk in blocks:
        if args.mode == "whole":
            # Q/K/V   (hidden , hidden)  ×3
            for _ in range(3): dump(hidden, hidden)
            # Wo
            dump(hidden, hidden)
        else:          # Split per-head
            # Q/K/V  :  C=seq_tile , M=head_dim , P=heads
            for _ in range(3): dump(P_val, head_d, heads)
            # Wo per‑head : C=seq_tile , M=head_dim , P=heads
            dump(P_val, head_d, heads)

        # FFN up / down (shared for both modes)
        dump(hidden, inter)   # up projection
        dump(inter, hidden)   # down projection

    # ---------- Final output projection (LM head) ----------
    lm_w = model.lm_head.weight if hasattr(model, "lm_head") else model.transformer.wte.weight
    dump(lm_w.shape[1], lm_w.shape[0])

    print(f"[OK] Generated {idx} YAML files → {out_dir}")

if __name__ == "__main__":
    main()

But the results are not exactly the same as the ones you provided, and I am wondering why. Could you share the generation code? Or is there more than one valid set of layer_shapes?
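
For completeness, here is a sketch of how I would look up the equivalent dimensions for OPT and LLaMA, since the script above uses GPT-2-only config attribute names (n_embd, n_head, n_positions) and module layout (model.transformer.h). The OPT and LLaMA attribute names below are the standard Hugging Face config fields, but this helper is only an illustration, not the generation code behind the provided layer_shapes:

def model_dims(model):
    """Return (blocks, hidden, heads, inter, n_pos) for GPT-2 / OPT / LLaMA checkpoints."""
    cfg = model.config
    if hasattr(model, "transformer"):            # GPT-2: blocks live in model.transformer.h
        return (model.transformer.h, cfg.n_embd, cfg.n_head,
                cfg.n_inner or 4 * cfg.n_embd, cfg.n_positions)
    if hasattr(model.model, "decoder"):          # OPT: blocks live in model.model.decoder.layers
        return (model.model.decoder.layers, cfg.hidden_size, cfg.num_attention_heads,
                cfg.ffn_dim, cfg.max_position_embeddings)
    # LLaMA-style models: blocks live in model.model.layers
    return (model.model.layers, cfg.hidden_size, cfg.num_attention_heads,
            cfg.intermediate_size, cfg.max_position_embeddings)

With this, main() could call blocks, hidden, heads, inter, n_pos = model_dims(model) instead of reading cfg.n_embd and friends directly, which is the first place facebook/opt-1.3b would break, since OPTForCausalLM has no .transformer attribute.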
