"""
SGLang Scoring Benchmark Script

This script benchmarks the performance of SGLang's scoring API over HTTP.

Current Features:
- HTTP-only implementation (open-source compatible)
- Uses the /v1/score API endpoint directly
- Single-item scoring with batching support
- Configurable RPS, duration, and batch sizes
- Progress tracking and detailed metrics
- Poisson and constant request distributions

Usage:
- Update the configuration variables at the top of the file
- Ensure an SGLang server is running at the configured HTTP_URL
- Run: python bench_score.py
- Each request carries one of the batch sizes listed in ITEM_COUNT_VALUES
"""

import asyncio

from transformers import AutoTokenizer
from util import (
    BenchmarkConfig,
    generate_text_with_token_count,
    run_benchmark_main,
    run_generic_benchmark,
)

###############################################################################
# CONFIG
###############################################################################
# Create benchmark configuration
config = BenchmarkConfig()
config.rps_values = [160]
config.duration_secs_values = [60]
config.num_unique_requests = 100
config.distribution = "POISSON"
config.profile = False
config.freeze_gc = True  # Enable GC freeze functionality
# Profiler output directory - by default uses the present working directory (pwd).
# Uncomment and customize the line below to override the default location:
# config.profiler_dir = "/sglang-oss-trace"
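
# Illustrative only: for "POISSON" pacing, inter-request gaps are typically
# drawn from an exponential distribution so that arrivals average `rps` per
# second. The real scheduling lives in util.run_generic_benchmark; the helper
# below is a minimal sketch under that assumption and is never called here.
def example_poisson_gap_secs(rps: float) -> float:
    """Sample one inter-request gap so arrivals average `rps` per second."""
    import random  # local import keeps this illustrative helper self-contained

    return random.expovariate(rps)  # mean gap of 1/rps seconds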

# HTTP Configuration
HTTP_URL = "http://localhost:30000/v1/score"  # Use the score API directly

# Score API Config
# ITEM_COUNT_VALUES determines the number of items per score request (batch size)
SCORE_QUERY_TOKENS = 120
SCORE_ITEM_TOKENS = 180
SCORE_MODEL_PATH = "Qwen/Qwen3-0.6B"
SCORE_LABEL_TOKEN_IDS = [9454, 2753]  # Yes/No token IDs
ITEM_COUNT_VALUES = [10]  # Number of items per request

# Special token to replicate for precise token counting
SPECIAL_REPLICATED_TOKEN = "<|im_start|>"
# Wire the constant into the shared config so the code below that reads
# config.special_replicated_token picks up the token configured here.
config.special_replicated_token = SPECIAL_REPLICATED_TOKEN
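
# For reference, a request built by build_score_request (below) looks roughly
# like the following (token text shortened for readability):
# {
#     "query": "<|im_start|><|im_start|>...",   # SCORE_QUERY_TOKENS tokens
#     "items": ["<|im_start|>...", ...],        # item_count entries
#     "label_token_ids": [9454, 2753],
#     "model": "Qwen/Qwen3-0.6B",
# }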


###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def create_score_request_builder():
    """Create a score request builder function with a shared tokenizer."""
    # Load the tokenizer once here to verify the special token and get precise counts
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)

    # Verify that our special token produces exactly 1 token
    special_token_count = len(
        tokenizer.encode(config.special_replicated_token, add_special_tokens=False)
    )
    print(
        f"Special token '{config.special_replicated_token}' produces "
        f"{special_token_count} token(s)"
    )

    def generate_text_with_token_count_local(num_toks):
        """Generate text with a precise token count using the replicated token."""
        return generate_text_with_token_count(
            SCORE_MODEL_PATH,
            num_toks,
            config.special_replicated_token,
            tokenizer=tokenizer,
        )

    def build_score_request(index: int, item_count: int) -> tuple:
        """Build a single score request."""
        try:
            # Generate the query and items for the score API
            query = generate_text_with_token_count_local(SCORE_QUERY_TOKENS)
            items = [
                generate_text_with_token_count_local(SCORE_ITEM_TOKENS)
                for _ in range(item_count)
            ]

            # Return as a dict in the score API format
            score_data = {
                "query": query,
                "items": items,
                "label_token_ids": SCORE_LABEL_TOKEN_IDS,
                "model": SCORE_MODEL_PATH,
            }
            return (index, score_data)

        except Exception as e:
            print(f"Error building request {index}: {e}")
            return (index, None)

    return build_score_request
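

# Illustrative only: generate_text_with_token_count is imported from util; it
# is assumed to build its text roughly like the sketch below, by repeating a
# string that encodes to exactly one token. This helper is never called here.
def _example_repeat_token(special_token: str, num_toks: int) -> str:
    return special_token * num_toks  # num_toks repetitions -> num_toks tokens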


def validate_score_response(response_data: dict) -> bool:
    """Validate a score API response."""
    return "scores" in response_data or "logprobs" in response_data
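
# Note: the exact /v1/score response schema is an assumption here; the
# validator above only requires that one of these keys is present, e.g.
# (illustrative shape): {"scores": [[0.73, 0.27], ...]}, one distribution
# over label_token_ids per item.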


def build_warmup_score_request() -> dict:
    """Build a warmup request for the score API."""
    # Load the tokenizer once for warmup generation
    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)

    warmup_query = generate_text_with_token_count(
        SCORE_MODEL_PATH,
        SCORE_QUERY_TOKENS,
        config.special_replicated_token,
        tokenizer=tokenizer,
    )
    warmup_items = [
        generate_text_with_token_count(
            SCORE_MODEL_PATH,
            SCORE_ITEM_TOKENS,
            config.special_replicated_token,
            tokenizer=tokenizer,
        )
        for _ in range(3)
    ]

    return {
        "query": warmup_query,
        "items": warmup_items,
        "label_token_ids": SCORE_LABEL_TOKEN_IDS,
        "model": SCORE_MODEL_PATH,
        # Extra parameters kept for consistency with the original warmup request
        "apply_softmax": True,
        "item_first": False,
    }


###############################################################################
# MAIN
###############################################################################
async def run_benchmark(rps, duration_secs, item_count):
    """Run a single benchmark with the given RPS value."""
    # Create the request builder function with shared tokenizer
    build_request_func = create_score_request_builder()

    return await run_generic_benchmark(
        rps=rps,
        duration_secs=duration_secs,
        item_count=item_count,
        config=config,
        http_url=HTTP_URL,
        build_request_func=build_request_func,
        response_validator=validate_score_response,
        api_name="SINGLE_ITEM_SCORING",
        request_description="score requests",
    )


async def main():
    """Main function that runs benchmarks for all RPS values."""
    additional_info = {
        "Query tokens per request": SCORE_QUERY_TOKENS,
        "Item tokens per item": SCORE_ITEM_TOKENS,
    }

    await run_benchmark_main(
        config,
        run_benchmark,
        "SINGLE_ITEM_SCORING",
        HTTP_URL,
        ITEM_COUNT_VALUES,
        additional_info,
        build_warmup_score_request,
    )


if __name__ == "__main__":
    asyncio.run(main())