{
"scenarios": [
{
"id": "scenario-1",
"name": "Customer Service Chatbot - High Volume",
"description": "Large-scale customer service chatbot with strict latency requirements",
"user_description": "I need a customer service chatbot for about 5000 concurrent users. Response time is critical - we need sub-500ms latency. Users will be asking common questions about orders, returns, and product information.",
"expected_extraction": {
"use_case": "customer_service",
"user_count": 5000,
"latency_requirement": "very_high",
"expected_qps": 350,
"slo_targets": {
"ttft_p90_target_ms": 150,
"tpot_p90_target_ms": 40,
"e2e_p90_target_ms": 500
},
"traffic_profile": {
"prompt_tokens_mean": 200,
"generation_tokens_mean": 250
}
},
"expected_recommendation": {
"model_id": "mistralai/Mistral-7B-Instruct-v0.3",
"reasoning": "Fast 7B model meeting strict customer service latency requirements",
"gpu_config": {
"gpu_type": "NVIDIA-A100-80GB",
"gpu_count": 1,
"tensor_parallel": 1,
"replicas": 1
},
"estimated_cost_per_month": 3285,
"meets_slo": true
}
},
{
"id": "scenario-2",
"name": "Code Generation Assistant - Developer Team",
"description": "Internal code copilot for software development team",
"user_description": "We have a team of 500 developers who need a code generation assistant. It should help with code completion, bug fixes, and documentation. We can tolerate slightly higher latency if it means better code quality. Budget is flexible.",
"expected_extraction": {
"use_case": "code_generation",
"user_count": 500,
"latency_requirement": "medium",
"expected_qps": 140,
"slo_targets": {
"ttft_p90_target_ms": 250,
"tpot_p90_target_ms": 60,
"e2e_p90_target_ms": 3000
},
"traffic_profile": {
"prompt_tokens_mean": 500,
"generation_tokens_mean": 300
}
},
"expected_recommendation": {
"model_id": "mistralai/Mistral-7B-Instruct-v0.3",
"reasoning": "Strong code generation capabilities with cost-effective GPU choice",
"gpu_config": {
"gpu_type": "NVIDIA-A10G",
"gpu_count": 1,
"tensor_parallel": 1,
"replicas": 1
},
"estimated_cost_per_month": 730,
"meets_slo": true
}
},
{
"id": "scenario-3",
"name": "Document Summarization - Batch Processing",
"description": "High-throughput document summarization for content analysis pipeline",
      "user_description": "We need to summarize thousands of documents daily - news articles, research papers, customer feedback. Speed isn't as critical as throughput. We need to process around 2000 users' worth of documents per day, with each document being 3-5 pages. Cost efficiency is important.",
"expected_extraction": {
"use_case": "summarization",
"user_count": 2000,
"latency_requirement": "low",
"expected_qps": 100,
"slo_targets": {
"ttft_p90_target_ms": 500,
"tpot_p90_target_ms": 80,
"e2e_p90_target_ms": 8000
},
"traffic_profile": {
"prompt_tokens_mean": 2000,
"generation_tokens_mean": 400
}
},
"expected_recommendation": {
"model_id": "ibm-granite/granite-3.0-8b-instruct",
"reasoning": "Most cost-effective option with excellent summarization and RAG capabilities",
"gpu_config": {
"gpu_type": "NVIDIA-L4",
"gpu_count": 1,
"tensor_parallel": 1,
"replicas": 1
},
"estimated_cost_per_month": 365,
"meets_slo": true
}
}
]
}