{
"scenarios": [
{
"id": "scenario-1",
"name": "Customer Service Chatbot - High Volume",
"description": "Large-scale customer service chatbot with strict latency requirements",
"user_description": "I need a customer service chatbot for about 5000 concurrent users. Response time is critical - we need sub-500ms latency. Users will be asking common questions about orders, returns, and product information.",
"expected_extraction": {
"use_case": "customer_service",
"user_count": 5000,
"latency_requirement": "very_high",
"expected_qps": 350,
"slo_targets": {
"ttft_p90_target_ms": 150,
"tpot_p90_target_ms": 40,
"e2e_p90_target_ms": 500
},
"traffic_profile": {
"prompt_tokens_mean": 200,
"generation_tokens_mean": 250
}
},
"expected_recommendation": {
"model_id": "mistralai/Mistral-7B-Instruct-v0.3",
"reasoning": "Fast 7B model meeting strict customer service latency requirements",
"gpu_config": {
"gpu_type": "NVIDIA-A100-80GB",
"gpu_count": 1,
"tensor_parallel": 1,
"replicas": 1
},
"estimated_cost_per_month": 3285,
"meets_slo": true
}
},
{
"id": "scenario-2",
"name": "Code Generation Assistant - Developer Team",
"description": "Internal code copilot for software development team",
"user_description": "We have a team of 500 developers who need a code generation assistant. It should help with code completion, bug fixes, and documentation. We can tolerate slightly higher latency if it means better code quality. Budget is flexible.",
"expected_extraction": {
"use_case": "code_generation",
"user_count": 500,
"latency_requirement": "medium",
"expected_qps": 140,
"slo_targets": {
"ttft_p90_target_ms": 250,
"tpot_p90_target_ms": 60,
"e2e_p90_target_ms": 3000
},
"traffic_profile": {
"prompt_tokens_mean": 500,
"generation_tokens_mean": 300
}
},
"expected_recommendation": {
"model_id": "mistralai/Mistral-7B-Instruct-v0.3",
"reasoning": "Strong code generation capabilities with cost-effective GPU choice",
"gpu_config": {
"gpu_type": "NVIDIA-A10G",
"gpu_count": 1,
"tensor_parallel": 1,
"replicas": 1
},
"estimated_cost_per_month": 730,
"meets_slo": true
}
},
{
"id": "scenario-3",
"name": "Document Summarization - Batch Processing",
"description": "High-throughput document summarization for content analysis pipeline",
      "user_description": "We need to summarize thousands of documents daily - news articles, research papers, customer feedback. Speed isn't as critical as throughput. We need to process around 2000 users' worth of documents per day, with each document being 3-5 pages. Cost efficiency is important.",
"expected_extraction": {
"use_case": "summarization",
"user_count": 2000,
"latency_requirement": "low",
"expected_qps": 100,
"slo_targets": {
"ttft_p90_target_ms": 500,
"tpot_p90_target_ms": 80,
"e2e_p90_target_ms": 8000
},
"traffic_profile": {
"prompt_tokens_mean": 2000,
"generation_tokens_mean": 400
}
},
"expected_recommendation": {
"model_id": "ibm-granite/granite-3.0-8b-instruct",
"reasoning": "Most cost-effective option with excellent summarization and RAG capabilities",
"gpu_config": {
"gpu_type": "NVIDIA-L4",
"gpu_count": 1,
"tensor_parallel": 1,
"replicas": 1
},
"estimated_cost_per_month": 365,
"meets_slo": true
}
}
]
}