-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathbenchmark.py
More file actions
400 lines (334 loc) · 15.4 KB
/
Copy pathbenchmark.py
File metadata and controls
400 lines (334 loc) · 15.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
#!/usr/bin/env python3
"""
Ollama Model Benchmark Tool
A lightweight tool for measuring LLM performance metrics via Ollama:
- Token processing speed (t/s)
- Model load time
- Prompt evaluation time
- Response generation time
Usage:
python benchmark.py [-v] [-t] [-m MODEL_NAMES...] [-p PROMPTS...]
Example:
python benchmark.py --verbose --models llama2:13b codellama:34b
"""
import argparse
from typing import List, Dict, Optional
from datetime import datetime
import ollama
from pydantic import BaseModel, Field
from tabulate import tabulate
class Message(BaseModel):
    """
    One chat message exchanged with a model.

    Attributes:
        role: Who authored the message; this file uses "user", "assistant"
            and "system".
        content: The text of the message.
    """

    role: str
    content: str
class OllamaResponse(BaseModel):
    """
    Structured view of one Ollama chat response and its performance metrics.

    The ``*_duration`` fields are treated as nanoseconds by the rest of this
    file (they are converted with ``nanosec_to_sec``); the ``*_count`` fields
    are reported as token counts. Every metric defaults to 0.
    """

    model: str
    created_at: datetime | None = None
    message: Message
    done: bool
    total_duration: int = Field(default=0)
    load_duration: int = Field(default=0)
    prompt_eval_count: int = Field(default=0)
    prompt_eval_duration: int = Field(default=0)
    eval_count: int = Field(default=0)
    eval_duration: int = Field(default=0)

    @classmethod
    def from_chat_response(cls, response) -> 'OllamaResponse':
        """
        Converts an Ollama API response into an OllamaResponse instance.

        Metrics that are missing from the response, or present but ``None``,
        are recorded as 0; a ``None`` message content is recorded as "".

        Args:
            response: Raw response from Ollama API
        Returns:
            OllamaResponse: Structured response object
        """

        def metric(name: str) -> int:
            # getattr's default only covers a *missing* attribute. An attribute
            # that exists with value None would be rejected by the int fields,
            # so coalesce it to 0 as well.
            return getattr(response, name, 0) or 0

        return cls(
            model=response.model,
            message=Message(
                role=response.message.role,
                # Message.content is a required str; guard against None.
                content=response.message.content or "",
            ),
            done=bool(response.done),
            total_duration=metric('total_duration'),
            load_duration=metric('load_duration'),
            prompt_eval_count=metric('prompt_eval_count'),
            prompt_eval_duration=metric('prompt_eval_duration'),
            eval_count=metric('eval_count'),
            eval_duration=metric('eval_duration'),
        )
def _report_empty_response(model_name: str) -> None:
    """Print the troubleshooting hints shown when a model returns no text."""
    print(f"\nError: Ollama model {model_name} returned empty response. Please check if:")
    print("1. The model is properly loaded")
    print("2. The Ollama server is functioning correctly")
    # Must be an f-string: a plain literal printed "{model_name}" verbatim.
    print(f"3. Try running 'ollama run {model_name}' in terminal to verify model output")


def _metrics_from(response) -> Dict[str, int]:
    """Collect the timing/count metrics of a response, mapping missing or None to 0."""
    metric_names = (
        "total_duration",
        "load_duration",
        "prompt_eval_count",
        "prompt_eval_duration",
        "eval_count",
        "eval_duration",
    )
    return {name: getattr(response, name, 0) or 0 for name in metric_names}


def run_benchmark(
    model_name: str,
    prompt: str,
    verbose: bool
) -> Optional[OllamaResponse]:
    """
    Executes a benchmark run for a specific model and prompt.

    In verbose mode the reply is streamed to stdout as it arrives and the
    metrics are read from the last streamed chunk, so the reported numbers
    describe the generation that was actually displayed. Only if that chunk
    carries no generation metrics is a second, non-streaming request made.

    Args:
        model_name: Name of the Ollama model to benchmark
        prompt: Input text to send to the model
        verbose: If True, prints streaming output
    Returns:
        OllamaResponse object containing benchmark results, or None if the
        model returned an empty response or any error occurred (the error is
        printed, not raised)
    """
    messages = [{"role": "user", "content": prompt}]
    try:
        if verbose:
            pieces: List[str] = []
            final_chunk = None
            stream = ollama.chat(
                model=model_name,
                messages=messages,
                stream=True,
            )
            for chunk in stream:
                # Content may be absent or None on a chunk; treat both as "".
                text = getattr(chunk.message, 'content', None) or ""
                pieces.append(text)
                print(text, end="", flush=True)
                final_chunk = chunk
            content = "".join(pieces)
            if not content.strip():
                _report_empty_response(model_name)
                return None

            metrics = _metrics_from(final_chunk)
            if not metrics["eval_count"]:
                # The stream did not report generation metrics; fall back to a
                # non-streaming call. NOTE: these numbers then come from a
                # second generation with the model already loaded.
                response = ollama.chat(
                    model=model_name,
                    messages=messages,
                )
                if not (getattr(response.message, 'content', None) or "").strip():
                    print(f"\nError: Ollama model {model_name} returned empty response in non-streaming mode")
                    return None
                metrics = _metrics_from(response)

            # Pair the streamed text with the collected metrics.
            return OllamaResponse(
                model=model_name,
                message=Message(
                    role="assistant",
                    content=content
                ),
                done=True,
                **metrics,
            )

        # Non-verbose mode: a single non-streaming call provides text and metrics.
        response = ollama.chat(
            model=model_name,
            messages=messages,
        )
        if not (getattr(response.message, 'content', None) or "").strip():
            _report_empty_response(model_name)
            return None
        return OllamaResponse.from_chat_response(response)
    except Exception as e:
        # Boundary of a single benchmark run: report and let the caller move
        # on to the next prompt/model instead of aborting the whole session.
        print(f"Error benchmarking {model_name}: {str(e)}")
        return None
def nanosec_to_sec(nanosec: int) -> float:
    """Return the given nanosecond count expressed in seconds."""
    nanos_per_second = 1_000_000_000
    return nanosec / nanos_per_second
def inference_stats(model_response: OllamaResponse) -> None:
    """
    Print a throughput and timing report for one model response.

    Three rates are derived: prompt tokens per second, generated tokens per
    second, and a combined rate over both phases. A rate is reported as 0.0
    when its duration is not positive.

    Args:
        model_response: OllamaResponse containing benchmark metrics
    """

    def tokens_per_sec(token_count: int, duration_ns: int) -> float:
        # A zero duration would divide by zero; report 0.0 in that case.
        seconds = nanosec_to_sec(duration_ns)
        if seconds > 0:
            return token_count / seconds
        return 0.0

    prompt_ts = tokens_per_sec(
        model_response.prompt_eval_count,
        model_response.prompt_eval_duration,
    )
    response_ts = tokens_per_sec(
        model_response.eval_count,
        model_response.eval_duration,
    )
    # The combined rate covers prompt evaluation plus generation only.
    total_ts = tokens_per_sec(
        model_response.prompt_eval_count + model_response.eval_count,
        model_response.prompt_eval_duration + model_response.eval_duration,
    )

    # The report text stays flush-left: whitespace inside the triple-quoted
    # string is part of the printed output.
    print(
        f"""
----------------------------------------------------
Model: {model_response.model}
Performance Metrics:
Prompt Processing: {prompt_ts:.2f} tokens/sec
Generation Speed: {response_ts:.2f} tokens/sec
Combined Speed: {total_ts:.2f} tokens/sec
Workload Stats:
Input Tokens: {model_response.prompt_eval_count}
Generated Tokens: {model_response.eval_count}
Model Load Time: {nanosec_to_sec(model_response.load_duration):.2f}s
Processing Time: {nanosec_to_sec(model_response.prompt_eval_duration):.2f}s
Generation Time: {nanosec_to_sec(model_response.eval_duration):.2f}s
Total Time: {nanosec_to_sec(model_response.total_duration):.2f}s
----------------------------------------------------
"""
    )
def average_stats(responses: List[OllamaResponse]) -> None:
    """
    Print one aggregate report for a list of benchmark runs.

    Every duration and token count is summed over the runs, and the sums are
    passed to ``inference_stats`` as a single synthetic response. The printed
    tokens/sec rates are therefore averages over all runs, while the token
    counts and times shown are totals. The model name is taken from the first
    run.

    Args:
        responses: List of OllamaResponse objects from multiple runs
    """
    if not responses:
        print("No stats to average")
        return

    def summed(field_name: str) -> int:
        # Total of one metric over every run.
        return sum(getattr(run, field_name) for run in responses)

    run_count = len(responses)
    aggregate = OllamaResponse(
        model=responses[0].model,
        created_at=datetime.now(),
        message=Message(
            role="system",
            content=f"Average stats across {run_count} runs",
        ),
        done=True,
        total_duration=summed("total_duration"),
        load_duration=summed("load_duration"),
        prompt_eval_count=summed("prompt_eval_count"),
        prompt_eval_duration=summed("prompt_eval_duration"),
        eval_count=summed("eval_count"),
        eval_duration=summed("eval_duration"),
    )
    print("Average stats:")
    inference_stats(aggregate)
def table_stats(benchmarks: Dict[str, List[OllamaResponse]]) -> None:
    """
    Print one table row per model, aggregated over all of that model's runs.

    For each model the durations and token counts are summed over its runs.
    The three tokens/sec rates are computed from those sums, and the
    remaining columns show the summed counts and the summed times in seconds.

    Args:
        benchmarks: Dict of modelNames and List of OllamaResponse objects from multiple runs
    """
    if not benchmarks:
        print("No results to output")
        return
    print("Table stats:")

    def tokens_per_sec(token_count: int, duration_ns: int) -> float:
        # A zero duration would divide by zero; report 0.0 in that case.
        seconds = nanosec_to_sec(duration_ns)
        return token_count / seconds if seconds > 0 else 0.0

    metric_fields = (
        "total_duration",
        "load_duration",
        "prompt_eval_count",
        "prompt_eval_duration",
        "eval_count",
        "eval_duration",
    )
    column_headers = [
        "Model\nName",
        "Prompt\nEvaluation Rate\n(T/s)",
        "Evaluation\nRate\n(T/s)",
        "Total\nRate\n(T/s)",
        "Load Time\n(s)",
        "Prompt\nEvaluation Count",
        "Prompt\nEvaluation Time\n(s)",
        "Evaluation\nCount",
        "Evaluation\nTime\n(s)",
        "Total Time\n(s)",
    ]

    rows: List[List] = []
    for model_name, responses in benchmarks.items():
        # Sum every metric over this model's runs.
        sums = {
            field: sum(getattr(run, field) for run in responses)
            for field in metric_fields
        }
        rows.append([
            model_name,
            tokens_per_sec(sums["prompt_eval_count"], sums["prompt_eval_duration"]),
            tokens_per_sec(sums["eval_count"], sums["eval_duration"]),
            tokens_per_sec(
                sums["prompt_eval_count"] + sums["eval_count"],
                sums["prompt_eval_duration"] + sums["eval_duration"],
            ),
            nanosec_to_sec(sums["load_duration"]),
            sums["prompt_eval_count"],
            nanosec_to_sec(sums["prompt_eval_duration"]),
            sums["eval_count"],
            nanosec_to_sec(sums["eval_duration"]),
            nanosec_to_sec(sums["total_duration"]),
        ])

    print(tabulate(rows, headers=column_headers, tablefmt="orgtbl", floatfmt=".2f"))
def get_benchmark_models(test_models: Optional[List[str]] = None) -> List[str]:
    """
    Retrieves and validates the list of models to benchmark.

    With no requested models, the installed models whose name contains one of
    a few common family names are used, sorted alphabetically; if none match,
    the first three installed models are used. With requested models, only
    those that are installed are kept (in the requested order) and a warning
    lists the ones that are not.

    Args:
        test_models: List of specific models to test; None or empty selects
            the default subset
    Returns:
        List of validated model names available for benchmarking
    Raises:
        RuntimeError: If no model is left to benchmark
    """
    response = ollama.list()
    # Skip entries without a name so the substring and membership tests below
    # never operate on None.
    available_models = [
        name
        for name in (model.get("model") for model in response.get("models", []))
        if name
    ]
    if not test_models:
        # Use a default subset of models if none specified
        default_models = ["llama3", "mistral", "codellama", "deepseek", "gpt-oss", "gemma"]  # Common default models
        model_names = [m for m in available_models if any(d in m for d in default_models)]
        if not model_names:
            model_names = available_models[:3]  # Take first 3 available models if no defaults found
        # sort default subset alphabetically
        model_names.sort()
    else:
        # Filter requested models against available ones
        installed = set(available_models)
        model_names = [model for model in test_models if model in installed]
        missing_models = set(test_models) - installed
        if missing_models:
            print(f"Warning: Some requested models are not available: {missing_models}")
    if not model_names:
        raise RuntimeError("No valid models found for benchmarking")
    print(f"Evaluating models: {model_names}\n")
    return model_names
def main() -> None:
    """
    Main execution function for the benchmark tool.

    Parses the command line, resolves the models to benchmark, runs every
    prompt against every model, and prints either one combined table (-t) or
    one aggregate report per model. Runs that fail are skipped. In verbose
    mode each successful run's response and statistics are printed as well.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark performance metrics for Ollama models."
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose output including streaming responses",
        default=False,
    )
    parser.add_argument(
        "-m",
        "--models",
        nargs="*",
        default=[],
        # The help text describes what get_benchmark_models really does when no
        # model is given: it does not test every installed model.
        help=(
            "Specific models to benchmark. If not specified, a default subset "
            "of the installed models is used (common families such as llama3 "
            "or mistral, otherwise the first three available)."
        ),
    )
    parser.add_argument(
        "-p",
        "--prompts",
        nargs="*",
        default=[
            # Short analytical question to test basic reasoning
            "Explain the process of photosynthesis in plants, including the key chemical reactions and energy transformations involved.",
            # Medium-length creative task
            "Write a detailed story about a time traveler who visits three different historical periods. Include specific details about each era and the protagonist's interactions.",
            # Long complex analysis
            "Analyze the potential impact of artificial intelligence on global employment over the next decade. Consider various industries, economic factors, and potential mitigation strategies. Provide specific examples and data-driven reasoning.",
            # Technical task with specific requirements
            "Write a Python function that implements a binary search tree with methods for insertion, deletion, and traversal. Include comments explaining the time complexity of each operation.",
            # Structured output task
            "Create a detailed business plan for a renewable energy startup. Include sections on market analysis, financial projections, competitive advantages, and risk assessment. Format the response with clear headings and bullet points.",
        ],
        help="Prompts to use for benchmarking. Multiple prompts can be specified. Default prompts test various capabilities including analysis, creativity, technical knowledge, and structured output.",
    )
    parser.add_argument(
        "-t",
        "--table_output",
        action="store_true",
        help="Output as table instead of separate results per model",
        default=False,
    )
    args = parser.parse_args()
    print(
        f"\nVerbose: {args.verbose}\nTest models: {args.models}\nPrompts: {args.prompts}\nTable Output: {args.table_output}"
    )
    model_names = get_benchmark_models(args.models)
    benchmarks: Dict[str, List[OllamaResponse]] = {}
    # Execute benchmarks for each model and prompt
    for model_name in model_names:
        responses: List[OllamaResponse] = []
        for prompt in args.prompts:
            if args.verbose:
                print(f"\n\nBenchmarking: {model_name}\nPrompt: {prompt}")
            # run_benchmark returns None on failure; such runs are left out.
            if response := run_benchmark(model_name, prompt, verbose=args.verbose):
                responses.append(response)
                if args.verbose:
                    print(f"Response: {response.message.content}")
                    inference_stats(response)
        benchmarks[model_name] = responses
    if args.table_output:
        table_stats(benchmarks)
    else:
        # Calculate and display average statistics
        for responses in benchmarks.values():
            average_stats(responses)
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()