forked from red-hat-data-services/agentic-starter-kits
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathllminferenceservice.yaml
More file actions
86 lines (85 loc) · 3.07 KB
/
llminferenceservice.yaml
File metadata and controls
86 lines (85 loc) · 3.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# LLMInferenceService for llm-d deployment on OpenShift AI
#
# Before applying, replace the following placeholders with your values:
#
# <SERVICE_NAME> - Name for this deployment (e.g., my-model-llmd)
# <MODEL_URI> - HuggingFace model URI (e.g., hf://openai/gpt-oss-20b)
# <MODEL_NAME> - Model identifier for API requests (e.g., openai/gpt-oss-20b)
# <REPLICAS> - Number of vLLM replicas, one per GPU node (e.g., 6)
# <NODE_POOL_NAME> - Label on GPU nodes for scheduling (e.g., gpu-llmd-nodes)
# <VLLM_IMAGE> - vLLM image from Red Hat Ecosystem Catalog
#                    (e.g., registry.redhat.io/rhaiis/vllm-cuda-rhel9@sha256:...)
#                    Find the latest at: https://catalog.redhat.com/en/software/containers/rhaiis/vllm-cuda-rhel9
#
# Deploy in the redhat-ods-applications namespace (required by the gateway routing).
#
# Usage:
#   oc apply -f llminferenceservice.yaml
---
# LLMInferenceService manifest: one vLLM container per replica, exposed
# through the maas-default-gateway.
#
# String placeholders are single-quoted so that a substituted value which
# happens to look like a number or boolean is still parsed as a string.
apiVersion: serving.kserve.io/v1alpha1
kind: LLMInferenceService
metadata:
  name: '<SERVICE_NAME>'
  namespace: redhat-ods-applications
  annotations:
    opendatahub.io/model-type: generative
    openshift.io/display-name: '<SERVICE_NAME>'
    # Auth is disabled because maas-default-gateway has no OAuth proxy.
    # For production, consider enabling auth and using a ServiceAccount token,
    # or restricting access via NetworkPolicies.
    security.opendatahub.io/enable-auth: 'false'
    # Annotation values must be strings, so the port number is quoted.
    prometheus.io/path: /metrics
    prometheus.io/port: '8000'
spec:
  # Left unquoted on purpose: this must be an integer once substituted.
  replicas: <REPLICAS>
  model:
    uri: '<MODEL_URI>'
    name: '<MODEL_NAME>'
  router:
    # route: {} creates the HTTPRoute for external access.
    route: {}
    # scheduler: {} is REQUIRED — without it, the controller skips
    # creating the scheduler/router deployment and InferencePool.
    scheduler: {}
    # Use maas-default-gateway (no OAuth proxy) instead of
    # data-science-gateway (which has OAuth and blocks API clients).
    gateway:
      refs:
        - name: maas-default-gateway
          namespace: openshift-ingress
  template:
    # Schedule only onto nodes carrying this node-pool label.
    nodeSelector:
      node-pool: '<NODE_POOL_NAME>'
    containers:
      - name: main
        image: '<VLLM_IMAGE>'
        env:
          - name: VLLM_ADDITIONAL_ARGS
            value: '--max-model-len=16000 --tool-call-parser=openai --enable-auto-tool-choice'
          # HF_HOME, HOME and XDG_CACHE_HOME all point under /tmp.
          # NOTE(review): presumably because the image's default home is not
          # writable for the runtime user — confirm.
          - name: HF_HOME
            value: /tmp/huggingface
          - name: HOME
            value: /tmp
          - name: XDG_CACHE_HOME
            value: /tmp/.cache
          # NOTE(review): DEBUG logging is verbose; consider INFO once the
          # deployment is known to be healthy.
          - name: VLLM_LOGGING_LEVEL
            value: DEBUG
        resources:
          limits:
            cpu: '4'
            memory: 16Gi
            nvidia.com/gpu: '1'
          requests:
            cpu: '2'
            memory: 8Gi
            nvidia.com/gpu: '1'
        # KServe automatically injects TLS certs (--ssl-certfile, --ssl-keyfile)
        # into the vLLM container at startup, so vLLM serves HTTPS on port 8000.
        # The probe therefore has to use the HTTPS scheme.
        #
        # NOTE(review): with these settings the first probe fires after 120s
        # and the container is restarted after 5 consecutive failures 30s
        # apart. If model download/load can take longer than that, confirm
        # whether a startupProbe or a larger initial delay is needed.
        livenessProbe:
          httpGet:
            path: /health
            port: 8000
            scheme: HTTPS
          initialDelaySeconds: 120
          periodSeconds: 30
          timeoutSeconds: 30
          failureThreshold: 5