Skip to content

Commit fc8d5ee

Browse files
committed
Add LLM controller deployment and simulator resources
1 parent 8efc1bf commit fc8d5ee

5 files changed

Lines changed: 248 additions & 5 deletions

File tree

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
# Deployment that runs one replica of a small Python reconciliation script in the kserve namespace.
# The script is not baked into an image: it is passed inline to the container through sh -c (see args).
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
name: llm-controller
5+
namespace: kserve
6+
labels:
7+
app: llm-controller
8+
spec:
9+
replicas: 1
10+
selector:
11+
matchLabels:
12+
app: llm-controller
13+
template:
14+
metadata:
15+
labels:
16+
app: llm-controller
17+
spec:
18+
# The pod runs as the kserve-controller-manager ServiceAccount; the ClusterRoleBinding later in this
# manifest binds the llm-controller ClusterRole to that same account.
serviceAccountName: kserve-controller-manager
19+
containers:
20+
- name: controller
21+
# NOTE(review): stock interpreter image. The kubernetes client library is installed by pip on every
# container start (see args), which presumably requires network access to a package index, and the
# version is not pinned - confirm this is acceptable for the target clusters.
image: python:3.11-slim
22+
command: ["sh", "-c"]
23+
# The single argument is a shell script: install the client, then run the Python source that follows
# as one double-quoted python -c string (so the Python code itself must not contain bare double quotes).
args:
24+
- |
25+
pip install kubernetes && python -c "
26+
import asyncio
27+
import json
28+
import logging
29+
import os
30+
from datetime import datetime
31+
from kubernetes import client, config
32+
from kubernetes.client.rest import ApiException
33+
34+
# Root logging is set to INFO, so the logger.debug calls in this script are not emitted.
logging.basicConfig(level=logging.INFO)
35+
logger = logging.getLogger(__name__)
36+
37+
# Load Kubernetes config from inside the pod (in-cluster configuration).
38+
config.load_incluster_config()
39+
# NOTE(review): v1 is not referenced anywhere else in this script.
v1 = client.CoreV1Api()
40+
# Shared client for every custom-resource call below (LLMInferenceServices and OpenShift Routes).
custom_api = client.CustomObjectsApi()
41+
42+
def get_llm_services():
    '''Return the cluster-wide list of LLMInferenceService objects.

    Queries serving.kserve.io/v1alpha1 llminferenceservices across all
    namespaces through the module-level custom_api client.

    Returns:
        The object returned by list_cluster_custom_object on success, or
        None when the call raises ApiException (the error is logged).
    '''
    # Single-quoted docstring on purpose: this script is handed to python -c
    # inside a double-quoted shell string.
    resource = {
        'group': 'serving.kserve.io',
        'version': 'v1alpha1',
        'plural': 'llminferenceservices',
    }
    try:
        return custom_api.list_cluster_custom_object(**resource)
    except ApiException as err:
        logger.error(f'Failed to get LLMInferenceServices: {err}')
    return None
54+
55+
def _route_url(route):
    '''Return https://<spec.host> for a Route dict, or None when it has no host.'''
    host = route.get('spec', {}).get('host')
    return f'https://{host}' if host else None


def get_route_for_service(name, namespace):
    '''Find the external URL of the OpenShift Route that fronts a service.

    Lookup order:
      1. The Route named <name>-llm in the given namespace.
      2. The first Route in the namespace whose name contains name.

    Args:
        name: Name of the LLMInferenceService.
        namespace: Namespace in which to look for Routes.

    Returns:
        https://<host> built from the Route spec.host, or None when no Route
        with a host is found or when an error occurs (errors are logged).
    '''
    # Single-quoted docstrings on purpose: this script is handed to python -c
    # inside a double-quoted shell string.
    try:
        # First try the conventional route name.
        expected_name = f'{name}-llm'
        try:
            route = custom_api.get_namespaced_custom_object(
                group='route.openshift.io',
                version='v1',
                namespace=namespace,
                plural='routes',
                name=expected_name,
            )
        except ApiException as e:
            # A missing route is expected here, so this stays at debug level;
            # the failure is recorded instead of being silently dropped.
            logger.debug(
                f'Route {expected_name} not readable in {namespace}: '
                f'{e.status} {e.reason}'
            )
        else:
            url = _route_url(route)
            if url:
                return url

        # Fall back to scanning every route in the namespace.
        try:
            routes = custom_api.list_namespaced_custom_object(
                group='route.openshift.io',
                version='v1',
                namespace=namespace,
                plural='routes',
            )
        except ApiException as e:
            # Listing should normally succeed; surface RBAC or missing-API
            # problems instead of hiding them.
            logger.warning(
                f'Failed to list routes in {namespace}: {e.status} {e.reason}'
            )
            return None

        # NOTE(review): substring match - a short service name can match an
        # unrelated route whose name merely contains it. Confirm this loose
        # match is intended.
        for candidate in routes.get('items', []):
            candidate_name = candidate.get('metadata', {}).get('name', '')
            if name in candidate_name:
                url = _route_url(candidate)
                if url:
                    return url

        return None
    except Exception as e:
        logger.error(f'Error finding route for {name}: {e}')
        return None
95+
96+
def update_llm_status(name, namespace, url):
    '''Patch the status subresource of an LLMInferenceService with its URL.

    Sets status.url, status.ready = True and a single Ready=True condition
    stamped with the current UTC time.

    Args:
        name: Name of the LLMInferenceService.
        namespace: Namespace of the LLMInferenceService.
        url: URL to publish in status.url.

    Returns:
        True when the patch call succeeds, False when it raises ApiException
        (the error is logged).
    '''
    # Single-quoted docstring on purpose: this script is handed to python -c
    # inside a double-quoted shell string.

    # Imported locally because the script header imports only the datetime
    # class from the datetime module.
    from datetime import timezone

    # Timezone-aware replacement for datetime.utcnow(), which is deprecated
    # since Python 3.12 and returns a naive value. The rendered string keeps
    # the previous shape: ISO 8601 with a trailing Z.
    transition_time = (
        datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    )

    # NOTE(review): conditions is sent as a complete list; if the client
    # submits this body as a merge patch, conditions already present on the
    # object would presumably be replaced - confirm that is acceptable.
    status_patch = {
        'status': {
            'url': url,
            'ready': True,
            'conditions': [
                {
                    'type': 'Ready',
                    'status': 'True',
                    'lastTransitionTime': transition_time,
                }
            ],
        }
    }

    try:
        custom_api.patch_namespaced_custom_object_status(
            group='serving.kserve.io',
            version='v1alpha1',
            namespace=namespace,
            plural='llminferenceservices',
            name=name,
            body=status_patch,
        )
    except ApiException as e:
        logger.error(f'Failed to update {name} in {namespace}: {e}')
        return False

    logger.info(f'Updated {name} in {namespace} with URL: {url}')
    return True
126+
127+
async def reconcile_loop():
    '''Run the reconciliation loop forever.

    Each pass lists every LLMInferenceService, skips those whose status.url
    is already set, looks up a Route URL for the rest and writes it into the
    status. A pass is followed by a 30 second pause; if a pass raises, the
    error is logged with its traceback and the pause is 60 seconds.

    The Kubernetes calls made here are synchronous, so they block the event
    loop while they run.
    '''
    # Single-quoted docstring on purpose: this script is handed to python -c
    # inside a double-quoted shell string.
    logger.info('Starting LLM controller...')

    while True:
        try:
            llm_data = get_llm_services()
            if llm_data and llm_data.get('items'):
                for item in llm_data['items']:
                    name = item['metadata']['name']
                    namespace = item['metadata']['namespace']

                    # Skip services that already publish a URL. The or-guard
                    # tolerates an explicit null status, which would otherwise
                    # raise AttributeError and abort the whole pass.
                    current_url = (item.get('status') or {}).get('url')
                    if current_url:
                        logger.debug(f'LLMInferenceService {name} already has URL: {current_url}')
                        continue

                    # Find the route for this service and publish its URL.
                    url = get_route_for_service(name, namespace)
                    if url:
                        logger.info(f'Found URL for {name}: {url}')
                        update_llm_status(name, namespace, url)
                    else:
                        logger.warning(f'No route found for {name} in {namespace}')

            # Wait before the next reconciliation pass.
            await asyncio.sleep(30)

        except Exception as e:
            # logger.exception logs at ERROR level like before, but also
            # records the traceback so the failing line can be found.
            logger.exception(f'Reconciliation error: {e}')
            await asyncio.sleep(60)
159+
160+
# Run the controller: start an event loop and drive reconcile_loop(), whose
# while-True loop has no normal exit, so this call is not expected to return.
161+
asyncio.run(reconcile_loop())
162+
"
163+
env:
164+
# PYTHONUNBUFFERED=1 turns off Python's stdout/stderr buffering so log lines reach the container log promptly.
- name: PYTHONUNBUFFERED
165+
value: "1"
166+
# NOTE(review): these limits also cover the pip install that the container runs at start-up,
# not just the reconcile loop - confirm 128Mi / 100m is enough for that step.
resources:
167+
limits:
168+
cpu: 100m
169+
memory: 128Mi
170+
requests:
171+
cpu: 50m
172+
memory: 64Mi
173+
---
174+
# Cluster-wide permissions for the inline controller script of the llm-controller Deployment.
apiVersion: rbac.authorization.k8s.io/v1
175+
kind: ClusterRole
176+
metadata:
177+
name: llm-controller
178+
rules:
179+
# The script lists LLMInferenceServices across all namespaces (list_cluster_custom_object).
# NOTE(review): the script shown in this manifest never watches or patches the main resource - confirm
# whether "watch" and "patch" are needed here.
- apiGroups: ["serving.kserve.io"]
180+
resources: ["llminferenceservices"]
181+
verbs: ["get", "list", "watch", "patch"]
182+
# The script writes status through patch_namespaced_custom_object_status.
- apiGroups: ["serving.kserve.io"]
183+
resources: ["llminferenceservices/status"]
184+
verbs: ["get", "patch", "update"]
185+
# The script reads Routes with a namespaced get and a namespaced list.
- apiGroups: ["route.openshift.io"]
186+
resources: ["routes"]
187+
verbs: ["get", "list", "watch"]
188+
# NOTE(review): the script shown in this manifest makes no core Service calls (its CoreV1Api client is
# created but never used) - confirm whether this rule is needed.
- apiGroups: [""]
189+
resources: ["services"]
190+
verbs: ["get", "list", "watch"]
191+
---
192+
# Grants the llm-controller ClusterRole to the kserve-controller-manager ServiceAccount in the kserve
# namespace, which is the account named in the llm-controller Deployment's serviceAccountName.
apiVersion: rbac.authorization.k8s.io/v1
193+
kind: ClusterRoleBinding
194+
metadata:
195+
name: llm-controller
196+
roleRef:
197+
apiGroup: rbac.authorization.k8s.io
198+
kind: ClusterRole
199+
name: llm-controller
200+
subjects:
201+
- kind: ServiceAccount
202+
name: kserve-controller-manager
203+
namespace: kserve

maas-api/deploy/models/simulator/kustomization.yaml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,18 @@ apiVersion: kustomize.config.k8s.io/v1beta1
22
kind: Kustomization
33

44
metadata:
5-
name: vllm-simulator
5+
name: facebook-opt-simulator
66

77
namespace: llm
88

99
resources:
1010
- simulated-model.yaml
11+
- service.yaml
12+
- route.yaml
1113
- rbac.yaml
1214

15+
labels:
16+
- pairs:
17+
app: facebook-simulator
18+
component: simulated-model
19+
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# OpenShift Route that exposes the facebook-opt-125m-single-simulated-llm Service with passthrough TLS.
apiVersion: route.openshift.io/v1
2+
kind: Route
3+
metadata:
4+
# NOTE(review): the llm-controller script looks up a Route named <service name>-llm first; this name
# presumably follows that convention for the facebook-opt-125m-single-simulated model - confirm.
name: facebook-opt-125m-single-simulated-llm
5+
namespace: llm
6+
labels:
7+
app: facebook-simulator
8+
spec:
9+
# NOTE(review): the host is pinned to one specific cluster domain, so this manifest presumably has to
# be edited before it can be applied to any other cluster - confirm.
host: facebook-opt-125m-single-simulated-llm.apps.summit-gpu.octo-emerging.redhataicoe.com
10+
port:
11+
# Named port; refers to the port called https on the target Service.
targetPort: https
12+
to:
13+
kind: Service
14+
name: facebook-opt-125m-single-simulated-llm
15+
weight: 100
16+
tls:
17+
# passthrough: the router does not terminate TLS; the encrypted connection is handed to the backend.
termination: passthrough
18+
wildcardPolicy: None
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# ClusterIP Service exposing port 443 for the simulator pods; it is the backend of the Route of the same name.
apiVersion: v1
2+
kind: Service
3+
metadata:
4+
name: facebook-opt-125m-single-simulated-llm
5+
namespace: llm
6+
labels:
7+
app: facebook-simulator
8+
spec:
9+
type: ClusterIP
10+
ports:
11+
- name: https
12+
port: 443
13+
# Named target port; it resolves against a container port called https on the selected pods
# (the pod spec is not part of this manifest).
targetPort: https
14+
protocol: TCP
15+
# Only pods carrying all three labels below are selected.
selector:
16+
app.kubernetes.io/name: facebook-opt-125m-single-simulated
17+
app.kubernetes.io/part-of: llminferenceservice
18+
kserve.io/component: workload

maas-api/deploy/models/simulator/simulated-model.yaml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,7 @@ spec:
2222
- facebook-opt-125m-single-simulated
2323
- --mode
2424
- random
25-
- --ssl-certfile
26-
- /etc/ssl/certs/tls.crt
27-
- --ssl-keyfile
28-
- /etc/ssl/certs/tls.key
25+
- --self-signed-certs
2926
env:
3027
- name: POD_NAME
3128
valueFrom:

0 commit comments

Comments
 (0)