Commit 854ca34

[perf]feat: GPT-OSS mfu compute support (#4750)
1 parent 4017d63 commit 854ca34

File tree: tests/utils/test_flops_counter.py, verl/utils/flops_counter.py

2 files changed: +129 -1 lines changed


tests/utils/test_flops_counter.py

Lines changed: 59 additions & 1 deletion
@@ -206,6 +206,64 @@ def __init__(self, config_dict):
         # total: 986195089686528 / 1e12 = 986.195089686528
         "expected_flops_tuple": (283517065887744 / 1e12, 986195089686528 / 1e12),
     },
+    "gpt_oss": {
+        "config": {
+            "model_type": "gpt_oss",
+            "vocab_size": 201088,
+            "hidden_size": 2880,
+            "num_hidden_layers": 24,
+            "num_attention_heads": 64,
+            "num_key_value_heads": 8,
+            "head_dim": 64,
+            "intermediate_size": 2880,
+            "num_local_experts": 32,
+            "num_experts_per_tok": 4,
+            "sliding_window": 128,
+            "layer_types": [
+                "sliding_attention", "full_attention", "sliding_attention", "full_attention",
+                "sliding_attention", "full_attention", "sliding_attention", "full_attention",
+                "sliding_attention", "full_attention", "sliding_attention", "full_attention",
+                "sliding_attention", "full_attention", "sliding_attention", "full_attention",
+                "sliding_attention", "full_attention", "sliding_attention", "full_attention",
+                "sliding_attention", "full_attention", "sliding_attention", "full_attention"
+            ],
+        },
+        "batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
+        # GPT-OSS has alternating sliding / full attention
+        #   Even layers (12 layers) use sliding window attention with window_size = 128
+        #   Odd layers (12 layers) use full attention
+        #
+        # Non-attention FLOPs:
+        #   vocab part: 201088 * 2880 * 2 = 1158266880
+        #   attn linear part per layer:
+        #     Q: 2880 * (64 * 64) = 11796480
+        #     K: 2880 * (8 * 64) = 1474560
+        #     V: 2880 * (8 * 64) = 1474560
+        #     O: (64 * 64) * 2880 = 11796480
+        #     attn linear total = 26542080
+        #   mlp (MoE, SwiGLU) part per layer:
+        #     gate: 2880 * 32 = 92160
+        #     active experts: 3 * 2880 * 2880 * 4 = 99532800
+        #     mlp total = 99624960
+        #   total per layer: 26542080 + 99624960 = 126167040
+        #   all layers:
+        #     126167040 * 24 = 3028008960
+        #   total dense params:
+        #     3028008960 + 1158266880 = 4186275840
+        #
+        # For batch [512, 1024, 2048], tokens_sum = 3584:
+        #   dense flops: 6 * 4186275840 * 3584 = 90021675663360
+        #   seqlen_square_sum: 71565312 (calculated with sliding window logic)
+        #   attn flops: 12 * 71565312 * 64 * 64 = 3517578215424
+        #   total: 93539253878784 / 1e12 = 93.539253878784
+        #
+        # For batch [4096, 4096, 4096], tokens_sum = 12288:
+        #   dense flops: 6 * 4186275840 * 12288 = 308645745131520
+        #   seqlen_square_sum: 622854144 (calculated with sliding window logic)
+        #   attn flops: 12 * 622854144 * 64 * 64 = 30614526885888
+        #   total: 339260272017408 / 1e12 = 339.260272017408
+        "expected_flops_tuple": (93539253878784 / 1e12, 339260272017408 / 1e12),
+    },
     "apertus": {
         "config": { # swiss-ai/Apertus-8B
             "model_type": "apertus",
@@ -229,7 +287,7 @@ def __init__(self, config_dict):
 
 @pytest.mark.parametrize(
     "config_type",
-    ["llama", "qwen2", "qwen3", "qwen3_moe", "deepseek_v3", "mistral", "gemma3_text", "apertus"],
+    ["llama", "qwen2", "qwen3", "qwen3_moe", "deepseek_v3", "mistral", "gemma3_text", "apertus", "gpt_oss"],
 )
 def test_flops_counter(config_type: str):
     test_config = CONFIG[config_type]
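
As a sanity check on the comment arithmetic above, here is a minimal standalone sketch that re-derives both expected totals from nothing but the config values shown in the test. It is not part of the commit, and all names are local to the snippet.

HIDDEN, VOCAB, LAYERS = 2880, 201088, 24
HEADS, KV_HEADS, HEAD_DIM = 64, 8, 64
INTERMEDIATE, EXPERTS, TOP_K, WINDOW = 2880, 32, 4, 128

# Linear (GEMM) parameters counted once per token: Q + K + V + O, router + active experts,
# and the tied embedding / LM head pair.
attn_linear = HIDDEN * (HEADS * HEAD_DIM + 2 * KV_HEADS * HEAD_DIM + HEADS * HEAD_DIM)
moe_mlp = HIDDEN * EXPERTS + 3 * HIDDEN * INTERMEDIATE * TOP_K
dense_params = (attn_linear + moe_mlp) * LAYERS + 2 * VOCAB * HIDDEN  # 4186275840

def expected_tflops(batch_seqlens):
    tokens_sum = sum(batch_seqlens)
    dense_flops = 6 * dense_params * tokens_sum
    # 12 sliding layers cap the attended context at WINDOW tokens; the other 12 attend fully.
    seqlen_square_sum = sum(12 * s * min(s, WINDOW) + 12 * s * s for s in batch_seqlens)
    attn_flops = 12 * seqlen_square_sum * HEAD_DIM * HEADS
    return (dense_flops + attn_flops) / 1e12

print(expected_tflops([512, 1024, 2048]))    # ≈ 93.539253878784
print(expected_tflops([4096, 4096, 4096]))   # ≈ 339.260272017408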

verl/utils/flops_counter.py

Lines changed: 70 additions & 0 deletions
@@ -313,6 +313,75 @@ def _estimate_apertus_flops(config, tokens_sum, batch_seqlens, delta_time):
     flops_achieved = flops_all_token * (1.0 / delta_time) / 1e12
     return flops_achieved
 
+def _estimate_gpt_oss_flops(config, tokens_sum, batch_seqlens, delta_time):
+    hidden_size = config.hidden_size
+    vocab_size = config.vocab_size
+    num_hidden_layers = config.num_hidden_layers
+    num_key_value_heads = config.num_key_value_heads
+    num_attention_heads = config.num_attention_heads
+
+    # MoE params
+    moe_intermediate_size = config.intermediate_size
+    num_experts = config.num_local_experts
+    num_experts_per_tok = config.num_experts_per_tok
+    mlp_matrices = 3
+
+    # Head dim
+    head_dim = getattr(config, "head_dim", hidden_size // num_attention_heads)
+    q_size = num_attention_heads * head_dim
+    k_size = num_key_value_heads * head_dim
+    v_size = num_key_value_heads * head_dim
+
+    # 1. Attention Block (GQA)
+    attn_linear_N = hidden_size * (q_size + k_size + v_size + num_attention_heads * head_dim)
+    # 2. MLP / MoE Block
+    # Gate network
+    moe_gate_N = hidden_size * num_experts
+    # Expert forward calculation, Active parameters: mlp_matrices * H * I * num_experts_per_tok
+    moe_expert_N = hidden_size * moe_intermediate_size * mlp_matrices * num_experts_per_tok
+
+    moe_mlp_N = moe_gate_N + moe_expert_N
+
+    emd_and_lm_head_N = vocab_size * hidden_size * 2
+
+    # Total non-attn params per layer * layers + embeddings
+    # (moe_mlp_N + attn_linear_N) * layers
+    dense_N = (moe_mlp_N + attn_linear_N) * num_hidden_layers + emd_and_lm_head_N
+
+    # FLOPs for dense part (fwd + bwd = 6 * N)
+    dense_N_flops = 6 * dense_N * tokens_sum
+
+    # 3. Attention Matrix FLOPs
+    seqlen_square_sum = 0
+
+    # Handle sliding window attention
+    layer_types = getattr(config, "layer_types", None)
+    sliding_window = getattr(config, "sliding_window", 128)
+
+    if layer_types:
+        for layer_type in layer_types:
+            is_sliding = layer_type == "sliding_attention"
+
+            for seqlen in batch_seqlens:
+                if is_sliding and sliding_window:
+                    # Sliding window limits each token to attend to at most window_size tokens
+                    effective_seqlen = min(seqlen, sliding_window)
+                    seqlen_square_sum += seqlen * effective_seqlen
+                else:
+                    # Full attention
+                    seqlen_square_sum += seqlen * seqlen
+    else:
+        # Default to full attention for all layers
+        for seqlen in batch_seqlens:
+            seqlen_square_sum += seqlen * seqlen
+        seqlen_square_sum *= num_hidden_layers
+
+    attn_qkv_flops = 12 * seqlen_square_sum * head_dim * num_attention_heads
+
+    # Total FLOPs
+    flops_all_token = dense_N_flops + attn_qkv_flops
+    flops_achieved = flops_all_token * (1.0 / delta_time) / 1e12
+    return flops_achieved
 
 def _estimate_unknown_flops(config, tokens_sum, batch_seqlens, delta_time):
     return 0
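
The estimator only reads plain attributes off `config`, so it can be exercised in isolation by passing any attribute container in place of the Hugging Face config object. The `SimpleNamespace` stand-in below is illustrative and not part of the commit; with `delta_time=1.0` the return value is simply the total TFLOPs for the batch, which should match the first expected value in the test above.

from types import SimpleNamespace

gpt_oss_cfg = SimpleNamespace(
    hidden_size=2880, vocab_size=201088, num_hidden_layers=24,
    num_attention_heads=64, num_key_value_heads=8, head_dim=64,
    intermediate_size=2880, num_local_experts=32, num_experts_per_tok=4,
    sliding_window=128,
    layer_types=["sliding_attention", "full_attention"] * 12,  # 24 alternating layers
)

batch_seqlens = [512, 1024, 2048]
total_tflops = _estimate_gpt_oss_flops(gpt_oss_cfg, sum(batch_seqlens), batch_seqlens, delta_time=1.0)
print(total_tflops)  # ≈ 93.539253878784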
@@ -336,6 +405,7 @@ def _estimate_unknown_flops(config, tokens_sum, batch_seqlens, delta_time):
     "seed_oss": _estimate_qwen2_flops,
     "apertus": _estimate_apertus_flops,
     "glm4v": _estimate_qwen2_flops,
+    "gpt_oss": _estimate_gpt_oss_flops,
 }
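
With the `gpt_oss` entry dispatching to the new estimator, the per-second value it returns can be turned into an MFU figure by dividing by the hardware's peak throughput. The sketch below reuses `gpt_oss_cfg` from the previous snippet; the timing and the peak-TFLOPS constant are illustrative assumptions, not values from this commit.

batch_seqlens = [4096, 4096, 4096]
delta_time = 2.5  # assumed: seconds this rank spent on the batch
achieved_tflops = _estimate_gpt_oss_flops(gpt_oss_cfg, sum(batch_seqlens), batch_seqlens, delta_time)
# ≈ 339.26 total TFLOPs / 2.5 s ≈ 135.7 achieved TFLOPS on this rank

PEAK_TFLOPS = 989.0  # assumed BF16 dense peak of the accelerator; substitute your hardware's number
mfu = achieved_tflops / PEAK_TFLOPS
print(f"MFU ≈ {mfu:.1%}")  # ≈ 13.7%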
