Skip to content

Commit a1d0389

Browse files
authored
[Benchmark] Prefill-only benchmark scripts (sgl-project#10240)
1 parent dccf52f commit a1d0389

4 files changed

Lines changed: 1153 additions & 603 deletions

File tree

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
"""
2+
SGLang Embeddings Benchmark Script
3+
4+
This script benchmarks SGLang's /v1/embeddings API performance using HTTP requests.
5+
6+
Features:
7+
- HTTP-only implementation
8+
- Uses /v1/embeddings API endpoint directly
9+
- Configurable RPS, duration, and batch sizes
10+
- Progress tracking and detailed metrics
11+
- Poisson and constant request distributions
12+
13+
Usage:
14+
- Update configuration variables at the top of the file
15+
- Ensure SGLang server is running on the configured HTTP_URL
16+
- Run: python bench_embeddings.py
17+
"""
18+
19+
import asyncio
20+
import logging
21+
22+
from transformers import AutoTokenizer
23+
from util import (
24+
BenchmarkConfig,
25+
generate_text_with_token_count,
26+
run_benchmark_main,
27+
run_generic_benchmark,
28+
)
29+
30+
# Configure logging
31+
logging.basicConfig(
32+
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
33+
)
34+
logger = logging.getLogger(__name__)
35+
36+
###############################################################################
37+
# CONFIG
38+
###############################################################################
39+
# Create benchmark configuration
40+
config = BenchmarkConfig()
41+
config.rps_values = [500]
42+
config.duration_secs_values = [60]
43+
config.num_unique_requests = 100
44+
config.distribution = "POISSON"
45+
config.profile = False
46+
config.freeze_gc = True # Enable GC freeze functionality
47+
# Profiler output directory - by default uses present working directory (pwd)
48+
# Uncomment and customize the line below to override the default location:
49+
# config.profiler_dir = "/sglang-oss-trace"
50+
51+
# HTTP Configuration
52+
HTTP_URL = "http://localhost:30000/v1/embeddings"
53+
54+
# Embeddings API Config
55+
EMBEDDINGS_MODEL_PATH = "/Qwen/Qwen3-Embedding-0.6B"
56+
BATCH_SIZE = [1] # Number of items per request (batch size)
57+
58+
# Configurable input token length
59+
EMBEDDINGS_INPUT_TOKENS = 500 # Default token length
60+
61+
# Load tokenizer once for embeddings text generation
62+
print("Loading tokenizer for embeddings input generation...")
63+
embeddings_tokenizer = AutoTokenizer.from_pretrained(EMBEDDINGS_MODEL_PATH)
64+
65+
# Generate input text with the specified token length using pre-loaded tokenizer
66+
EMBEDDINGS_INPUT_TEXT = generate_text_with_token_count(
67+
EMBEDDINGS_MODEL_PATH,
68+
EMBEDDINGS_INPUT_TOKENS,
69+
config.special_replicated_token,
70+
tokenizer=embeddings_tokenizer,
71+
)
72+
73+
74+
###############################################################################
75+
# REQUEST GENERATION (in parallel)
76+
###############################################################################
77+
def build_embeddings_request(index: int, item_count: int) -> tuple:
    """Construct one /v1/embeddings request payload.

    Returns (index, payload); payload is None when building fails so the
    caller can skip the slot without crashing the generation pass.
    """
    try:
        # The embeddings API accepts either a bare string (single item) or a
        # list of strings (batch).
        texts = (
            EMBEDDINGS_INPUT_TEXT
            if item_count == 1
            else [EMBEDDINGS_INPUT_TEXT] * item_count
        )
        return (index, {"input": texts, "model": EMBEDDINGS_MODEL_PATH})
    except Exception as e:
        logger.error(f"Error building request {index}: {e}")
        return (index, None)
93+
94+
95+
def validate_embeddings_response(response_data: dict) -> bool:
    """A response is considered valid when it carries the "data" field."""
    return "data" in response_data
98+
99+
100+
def build_warmup_embeddings_request() -> dict:
    """Return a single-input payload used to warm the embeddings endpoint."""
    warmup_req = dict(
        input=EMBEDDINGS_INPUT_TEXT,
        model=EMBEDDINGS_MODEL_PATH,
    )
    return warmup_req
106+
107+
108+
###############################################################################
109+
# MAIN
110+
###############################################################################
111+
async def run_benchmark(rps, duration_secs, item_count):
    """Run one embeddings benchmark pass at the given RPS/duration/batch size.

    All request scheduling, sending, and metric collection is delegated to the
    shared generic benchmark driver in util.
    """
    result = await run_generic_benchmark(
        rps=rps,
        duration_secs=duration_secs,
        item_count=item_count,
        config=config,
        http_url=HTTP_URL,
        build_request_func=build_embeddings_request,
        response_validator=validate_embeddings_response,
        api_name="EMBEDDINGS",
        request_description="embeddings requests",
    )
    return result
124+
125+
126+
async def main():
    """Entry point: assemble run metadata, then execute all benchmarks."""
    # Truncate the (possibly long) generated input for display purposes.
    preview = EMBEDDINGS_INPUT_TEXT
    if len(preview) > 100:
        preview = preview[:100] + "..."

    additional_info = {
        "Input text length": f"{EMBEDDINGS_INPUT_TOKENS} tokens",
        "Input text preview": preview,
    }

    await run_benchmark_main(
        config,
        run_benchmark,
        "EMBEDDINGS",
        HTTP_URL,
        BATCH_SIZE,
        additional_info,
        build_warmup_embeddings_request,
    )


if __name__ == "__main__":
    asyncio.run(main())
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
"""
2+
SGLang Scoring Benchmark Script
3+
4+
This script benchmarks SGLang's scoring API performance using HTTP requests.
5+
6+
Current Features:
7+
- HTTP-only implementation (open source compatible)
8+
- Uses /v1/score API endpoint directly
9+
- Single item scoring with batching support
10+
- Configurable RPS, duration, and batch sizes
11+
- Progress tracking and detailed metrics
12+
- Poisson and constant request distributions
13+
14+
Usage:
15+
- Update configuration variables at the top of the file
16+
- Ensure SGLang server is running on the configured HTTP_URL
17+
- Run: python bench_score.py
18+
- Each request will contain ITEM_COUNT_VALUES items for batch scoring
19+
20+
"""
21+
22+
import asyncio
23+
24+
from transformers import AutoTokenizer
25+
from util import (
26+
BenchmarkConfig,
27+
generate_text_with_token_count,
28+
run_benchmark_main,
29+
run_generic_benchmark,
30+
)
31+
32+
###############################################################################
33+
# CONFIG
34+
###############################################################################
35+
# Create benchmark configuration
36+
config = BenchmarkConfig()
37+
config.rps_values = [160]
38+
config.duration_secs_values = [60]
39+
config.num_unique_requests = 100
40+
config.distribution = "POISSON"
41+
config.profile = False
42+
config.freeze_gc = True # Enable GC freeze functionality
43+
# Profiler output directory - by default uses present working directory (pwd)
44+
# Uncomment and customize the line below to override the default location:
45+
# config.profiler_dir = "/sglang-oss-trace"
46+
47+
# HTTP Configuration
48+
HTTP_URL = "http://localhost:30000/v1/score" # Use score API directly
49+
50+
# Score API Config
51+
# ITEM_COUNT_VALUES determines number of items per score request (batch size)
52+
SCORE_QUERY_TOKENS = 120
53+
SCORE_ITEM_TOKENS = 180
54+
SCORE_MODEL_PATH = "Qwen/Qwen3-0.6B"
55+
SCORE_LABEL_TOKEN_IDS = [9454, 2753] # Yes/No token IDs
56+
ITEM_COUNT_VALUES = [10] # Number of items per request
57+
58+
# Special token to replicate for precise token counting
59+
SPECIAL_REPLICATED_TOKEN = "<|im_start|>"
60+
61+
62+
###############################################################################
63+
# REQUEST GENERATION (in parallel)
64+
###############################################################################
65+
def create_score_request_builder():
    """Create and return a score-request builder closure.

    The tokenizer is loaded a single time here so every request built by the
    returned closure reuses it instead of reloading per request.
    """
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)

    # Sanity-check how many tokens the replicated special token expands to.
    encoded = tokenizer.encode(
        config.special_replicated_token, add_special_tokens=False
    )
    print(
        f"Special token '{config.special_replicated_token}' produces "
        f"{len(encoded)} token(s)"
    )

    def _text_of_length(num_toks):
        """Generate text with a precise token count using the shared tokenizer."""
        return generate_text_with_token_count(
            SCORE_MODEL_PATH,
            num_toks,
            config.special_replicated_token,
            tokenizer=tokenizer,
        )

    def build_score_request(index: int, item_count: int) -> tuple:
        """Build one score request; returns (index, payload-or-None)."""
        try:
            # Score API payload: one query scored against item_count items.
            payload = {
                "query": _text_of_length(SCORE_QUERY_TOKENS),
                "items": [
                    _text_of_length(SCORE_ITEM_TOKENS) for _ in range(item_count)
                ],
                "label_token_ids": SCORE_LABEL_TOKEN_IDS,
                "model": SCORE_MODEL_PATH,
            }
            return (index, payload)
        except Exception as e:
            print(f"Error building request {index}: {e}")
            return (index, None)

    return build_score_request
113+
114+
115+
def validate_score_response(response_data: dict) -> bool:
    """A score response is valid when it has a "scores" or "logprobs" field."""
    if "scores" in response_data:
        return True
    return "logprobs" in response_data
118+
119+
120+
def build_warmup_score_request() -> dict:
    """Build a warmup request for the score API.

    Loads its own tokenizer; warmup runs once, so the extra load is acceptable.
    """
    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)

    def _make_text(num_tokens):
        # Generate text with a precise token count via the shared helper.
        return generate_text_with_token_count(
            SCORE_MODEL_PATH,
            num_tokens,
            config.special_replicated_token,
            tokenizer=tokenizer,
        )

    warmup = {
        "query": _make_text(SCORE_QUERY_TOKENS),
        "items": [_make_text(SCORE_ITEM_TOKENS) for _ in range(3)],
        "label_token_ids": SCORE_LABEL_TOKEN_IDS,
        "model": SCORE_MODEL_PATH,
        # Add missing parameters for consistency with the original warmup
        "apply_softmax": True,
        "item_first": False,
    }
    return warmup
150+
151+
152+
###############################################################################
153+
# MAIN
154+
###############################################################################
155+
async def run_benchmark(rps, duration_secs, item_count):
    """Run a single score benchmark with the given RPS value.

    The request builder (and the tokenizer it loads) is created once and
    cached on the function itself, so sweeping multiple RPS/duration values
    does not reload the tokenizer for every benchmark pass.
    """
    build_request_func = getattr(run_benchmark, "_cached_builder", None)
    if build_request_func is None:
        build_request_func = create_score_request_builder()
        run_benchmark._cached_builder = build_request_func

    return await run_generic_benchmark(
        rps=rps,
        duration_secs=duration_secs,
        item_count=item_count,
        config=config,
        http_url=HTTP_URL,
        build_request_func=build_request_func,
        response_validator=validate_score_response,
        api_name="SINGLE_ITEM_SCORING",
        request_description="score requests",
    )
171+
172+
173+
async def main():
    """Main function that runs benchmarks for all RPS values."""
    additional_info = {
        "Query tokens per request": SCORE_QUERY_TOKENS,
        "Item tokens per item": SCORE_ITEM_TOKENS,
    }

    await run_benchmark_main(
        config,
        run_benchmark,
        "SINGLE_ITEM_SCORING",
        HTTP_URL,
        ITEM_COUNT_VALUES,
        additional_info,
        build_warmup_score_request,
    )


if __name__ == "__main__":
    asyncio.run(main())

0 commit comments

Comments
 (0)