Hi, your work is amazing and really useful. I want to try the OPT-1.3B model here, so I wrote a script that should generate the same GPT-2 layer_shapes you provide. The code I wrote is:
#!/usr/bin/env python3
"""
generate_gpt2_yaml_flex.py
--------------------------
• Read the actual shape of GPT‑2 / OPT / LLaMA model weights
• --p    : set P (token-tile), default = cfg.n_positions
• --mode : whole (6 GEMMs per layer) or per_head (split to match the Timeloop benchmark style)

Example usage:
    python3 generate_gpt2_yaml_flex.py gpt2-medium
    python3 generate_gpt2_yaml_flex.py gpt2-medium --p 256 --mode per_head
"""
import os, sys, argparse, torch
from transformers import AutoModelForCausalLM


def yaml_snippet(C, M, P):
    return (f"{{{{include_text('../problem_base.yaml')}}}}\n"
            f"problem:\n"
            f"  <<<: *problem_base\n"
            f"  instance: {{C: {C}, M: {M}, P: {P}}}\n")


def parse():
    ap = argparse.ArgumentParser()
    ap.add_argument("model")
    ap.add_argument("--p", type=int, help="P dimension (token-tile); default = n_positions")
    ap.add_argument("--mode", choices=["whole", "per_head"], default="whole")
    return ap.parse_args()


def main():
    args = parse()
    model = AutoModelForCausalLM.from_pretrained(
        args.model, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    cfg = model.config

    blocks = getattr(model, "transformer", model).h  # shared across GPT-2/OPT/LLaMA
    hidden = cfg.n_embd
    heads  = cfg.n_head
    head_d = hidden // heads
    inter  = cfg.n_inner or hidden * 4
    P_val  = args.p or cfg.n_positions

    out_dir = os.path.join("layer_shapes", args.model.replace("/", "_"))
    os.makedirs(out_dir, exist_ok=True)

    idx = 0
    def dump(C, M, P=P_val):
        nonlocal idx
        with open(os.path.join(out_dir, f"{idx:03d}.yaml"), "w") as f:
            f.write(yaml_snippet(C, M, P))
        idx += 1

    # ---------- Per-layer processing ----------
    for blk in blocks:
        if args.mode == "whole":
            # Q/K/V (hidden, hidden) × 3
            for _ in range(3): dump(hidden, hidden)
            # Wo
            dump(hidden, hidden)
        else:  # split per head
            # Q/K/V: C = seq_tile, M = head_dim, P = heads
            for _ in range(3): dump(P_val, head_d, heads)
            # Wo per head: C = seq_tile, M = head_dim, P = heads
            dump(P_val, head_d, heads)
        # FFN up / down (shared by both modes)
        dump(hidden, inter)  # up projection
        dump(inter, hidden)  # down projection

    # ---------- Final output projection (LM head) ----------
    lm_w = model.lm_head.weight if hasattr(model, "lm_head") else model.transformer.wte.weight
    dump(lm_w.shape[1], lm_w.shape[0])

    print(f"[OK] Generated {idx} YAML files → {out_dir}")


if __name__ == "__main__":
    main()
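For reference, each file the script writes is just the snippet produced by yaml_snippet. The values below are only illustrative (C = M = P = 1024 would correspond to one attention projection in gpt2-medium), not numbers taken from your layer_shapes:

# Hypothetical quick preview (not part of the script above): print one snippet
# to see the exact file contents; 1024/1024/1024 are illustrative values only.
print(yaml_snippet(1024, 1024, 1024))
# Expected output:
# {{include_text('../problem_base.yaml')}}
# problem:
#   <<<: *problem_base
#   instance: {C: 1024, M: 1024, P: 1024}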
But the results are not exactly the same as the layer_shapes you provided, and I wonder why. Could you share the generation code you used? Or is the provided layer_shapes not the only valid set?
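In case it helps to explain the mismatch, this is the kind of rough sanity check I can run to list the raw 2-D weight shapes in the checkpoint and compare them against the generated YAML files (just a sketch; facebook/opt-1.3b is the model I am targeting, any model name works):

# Rough sanity check (sketch, not part of the script above): list every 2-D
# weight in the checkpoint so its shape can be compared with the YAML files.
import torch
from transformers import AutoModelForCausalLM

m = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-1.3b", torch_dtype=torch.float16, low_cpu_mem_usage=True)
for name, p in m.named_parameters():
    if p.dim() == 2:  # only the GEMM-style weights
        print(name, tuple(p.shape))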