"""GSM8K benchmark for sglang-jax.

Sends concurrent requests to a running sglang-jax server's /generate endpoint
and measures accuracy and throughput on the GSM8K (or GSM8K Platinum) dataset.

Usage:
    # Start server first:
    # python3 -m sgl_jax.launch_server --model-path <model> --port 30000 ...

    # Run benchmark:
    python bench_sglang_jax.py --base-url http://localhost:30000 --num-questions 200
"""

import argparse
import ast
import asyncio
import json
import os
import re
import tempfile
import time
import urllib.request

import aiohttp
import numpy as np
from datasets import load_dataset
from tqdm import tqdm

# Sentinel returned when no numeric answer can be extracted from a string.
INVALID = -9999999


def read_jsonl(path):
    """Yield one parsed JSON object per non-empty line of a JSONL file."""
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)


def download_and_cache_file(url):
    """Download a file into a temp-dir cache (unless already present) and return its path."""
    cache_dir = os.path.join(tempfile.gettempdir(), "sgl_jax_bench_cache")
    os.makedirs(cache_dir, exist_ok=True)
    filename = url.split("/")[-1]
    cache_path = os.path.join(cache_dir, filename)
    if not os.path.isfile(cache_path):
        print(f"Downloading {url} to {cache_path}...")
        urllib.request.urlretrieve(url, cache_path)
    return cache_path


def get_one_example(lines, i, include_answer):
    """Format example i as a Question/Answer pair, optionally including the gold answer."""
    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
    if include_answer:
        ret += " " + lines[i]["answer"]
    return ret


def get_few_shot_examples(lines, k):
    """Concatenate the first k examples (with answers) into a few-shot prompt prefix."""
    ret = ""
    for i in range(k):
        ret += get_one_example(lines, i, True) + "\n\n"
    return ret


def get_answer_value(answer_str):
    """Extract the last number in answer_str, or INVALID if none can be parsed."""
    answer_str = answer_str.replace(",", "")
    numbers = re.findall(r"\d+", answer_str)
    if len(numbers) < 1:
        return INVALID
    try:
        # literal_eval rejects some digit runs, e.g. those with leading zeros.
        return ast.literal_eval(numbers[-1])
    except (SyntaxError, ValueError):
        return INVALID

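# Worked examples of the extraction above (illustrative inputs):
#   get_answer_value("... so she pays $1,234 in total.")  -> 1234
#   get_answer_value("The answer is #### 72")             -> 72
#   get_answer_value("No digits here.")                   -> INVALID
# Only the last run of digits is kept, so signs and decimal points are dropped.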

async def send_request(session, base_url, text, sampling_params, semaphore, pbar):
    """POST one prompt to /generate, gated by the shared concurrency semaphore."""
    payload = {
        "text": text,
        "sampling_params": sampling_params,
        "stream": False,
    }
    async with semaphore:
        timeout = aiohttp.ClientTimeout(total=300)
        async with session.post(f"{base_url}/generate", json=payload, timeout=timeout) as response:
            if response.status != 200:
                error_text = await response.text()
                raise RuntimeError(f"Request failed with status {response.status}: {error_text}")
            result = await response.json()
            pbar.update(1)
            return result


async def run_batch(base_url, questions, sampling_params, parallel):
    """Fan out all prompts concurrently, capped at `parallel` in-flight requests."""
    semaphore = asyncio.Semaphore(parallel)
    pbar = tqdm(total=len(questions), desc="Generating")

    async with aiohttp.ClientSession() as session:
        tasks = [
            send_request(session, base_url, q, sampling_params, semaphore, pbar) for q in questions
        ]
        results = await asyncio.gather(*tasks)

    pbar.close()
    return results

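# run_batch() can also be exercised on its own as a quick smoke test
# (hypothetical prompt; assumes a server is already running at the URL
# from the module docstring):
#
#   results = asyncio.run(
#       run_batch(
#           "http://localhost:30000",
#           ["Question: What is 2+2?\nAnswer:"],
#           {"temperature": 0, "max_new_tokens": 16},
#           parallel=1,
#       )
#   )
#   print(results[0]["text"])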

def main(args):
    # Load tokenizer if enable_thinking is set
    tokenizer = None
    if args.enable_thinking:
        from transformers import AutoTokenizer

        assert (
            args.tokenizer_path is not None
        ), "--tokenizer-path is required when --enable-thinking is set"
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, trust_remote_code=True)

    # Read data
    if args.platinum:
        print("Loading GSM8K Platinum dataset from HuggingFace...")
        dataset = load_dataset("madrylab/gsm8k-platinum", "main", split="test")
        lines = [{"question": item["question"], "answer": item["answer"]} for item in dataset]
    else:
        data_path = args.data_path
        url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
        if not os.path.isfile(data_path):
            data_path = download_and_cache_file(url)
        lines = list(read_jsonl(data_path))

    # Construct prompts
    num_questions = args.num_questions
    num_shots = args.num_shots
    few_shot_examples = get_few_shot_examples(lines, num_shots)

    questions = []
    labels = []
    for i in range(min(num_questions, len(lines))):
        raw_question = few_shot_examples + get_one_example(lines, i, False)
        if tokenizer is not None:
            messages = [{"role": "user", "content": raw_question}]
            # enable_thinking is honored by chat templates that support it (e.g. Qwen).
            raw_question = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=True,
            )
        questions.append(raw_question)
        labels.append(get_answer_value(lines[i]["answer"]))
    assert all(label != INVALID for label in labels)

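    # With the defaults, each prompt looks roughly like this (illustrative;
    # GSM8K gold answers end in "#### <number>", which is what
    # get_answer_value() extracts for the labels above):
    #
    #   Question: <few-shot question 1>
    #   Answer: <worked solution> #### 72
    #
    #   ...
    #
    #   Question: <question being graded>
    #   Answer:
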
    # Sampling parameters
    sampling_params = {
        "temperature": args.temperature,
        "top_p": args.top_p,
        "max_new_tokens": args.max_new_tokens,
        # Stop once the model starts drafting a new question of its own.
        "stop": ["Question", "Assistant:", "<|separator|>"],
    }

    # Run requests
    print(
        f"Running {len(questions)} requests against {args.base_url} "
        f"(parallelism={args.parallel})..."
    )
    tic = time.perf_counter()
    results = asyncio.run(run_batch(args.base_url, questions, sampling_params, args.parallel))
    latency = time.perf_counter() - tic

    # Extract predictions
    preds = []
    for r in results:
        preds.append(get_answer_value(r["text"]))

    # Compute accuracy
    acc = np.mean(np.array(preds) == np.array(labels))
    invalid = np.mean(np.array(preds) == INVALID)

    # Compute speed
    num_output_tokens = sum(r["meta_info"]["completion_tokens"] for r in results)
    output_throughput = num_output_tokens / latency

    # Print results
    print(f"Accuracy: {acc:.3f}")
    print(f"Invalid: {invalid:.3f}")
    print(f"Latency: {latency:.3f} s")
    print(f"Output throughput: {output_throughput:.3f} token/s")

    # Dump raw outputs
    if args.output_file:
        with open(args.output_file, "w") as f:
            for i, r in enumerate(results):
                f.write(f"=== Question {i} ===\n")
                f.write(questions[i] + "\n")
                f.write("=== Answer ===\n")
                f.write(r["text"] + "\n")
                f.write(f"=== Prediction: {preds[i]}, Label: {labels[i]} ===\n\n")
        print(f"Raw outputs saved to {args.output_file}")

    # Dump results
    with open(args.result_file, "a") as fout:
        value = {
            "task": "gsm8k-platinum" if args.platinum else "gsm8k",
            "backend": "sgl-jax",
            "latency": round(latency, 3),
            "accuracy": round(acc, 3),
            # Record the number of requests actually sent (the dataset may be
            # smaller than --num-questions).
            "num_requests": len(questions),
            "other": {
                "num_questions": args.num_questions,
                "parallel": args.parallel,
            },
        }
        fout.write(json.dumps(value) + "\n")
    print(f"Results appended to {args.result_file}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="GSM8K benchmark for sglang-jax")
    parser.add_argument(
        "--base-url",
        type=str,
        default="http://localhost:30000",
        help="Base URL of the sglang-jax server",
    )
    parser.add_argument("--num-shots", type=int, default=5)
    parser.add_argument("--data-path", type=str, default="test.jsonl")
    parser.add_argument("--num-questions", type=int, default=200)
    parser.add_argument("--max-new-tokens", type=int, default=512)
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument("--top-p", type=float, default=1.0)
    parser.add_argument("--parallel", type=int, default=64, help="Max concurrent requests")
    parser.add_argument(
        "--result-file",
        type=str,
        default="bench_results.jsonl",
        help="Path to append JSON result summary",
    )
    parser.add_argument(
        "--output-file",
        type=str,
        default=None,
        help="Path to write detailed per-question outputs",
    )
    parser.add_argument(
        "--enable-thinking",
        action="store_true",
        help="Enable thinking mode by wrapping prompts with chat template",
    )
    parser.add_argument(
        "--tokenizer-path",
        type=str,
        default=None,
        help="Path to tokenizer (required when --enable-thinking is set)",
    )
    parser.add_argument(
        "--platinum",
        action="store_true",
        help="Use GSM8K Platinum dataset (drop-in replacement with corrected labels)",
    )
    args = parser.parse_args()
    main(args)