Skip to content

Commit 3d6ce35

Browse files
scarlet25151 and chenyu.jiang authored
chore: add s3 example (#1988)
Signed-off-by: chenyu.jiang <chenyu.jiang@bytedance.com> Co-authored-by: chenyu.jiang <chenyu.jiang@bytedance.com>
1 parent 2aeaba5 commit 3d6ce35

File tree

1 file changed

+177
-0
lines changed

1 file changed

+177
-0
lines changed
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: StormService
metadata:
  name: qwen2-5-7b-stormservice
  labels:
    model.aibrix.ai/name: qwen2.5-7b
    model.aibrix.ai/port: "8000"
    model.aibrix.ai/metrics-port: "8000"
    model.aibrix.ai/engine: vllm
    adapter.model.aibrix.ai/enabled: "true"
  annotations:
    model.aibrix.ai/sidecar-injection: "true"
    # Could be replaced with your runtime image
    model.aibrix.ai/sidecar-runtime-image: "aibrix-cn-beijing.cr.volces.com/aibrix/runtime:v0.6.0"
spec:
  replicas: 1
  updateStrategy:
    type: InPlaceUpdate
  stateful: true
  selector:
    matchLabels:
      model.aibrix.ai/name: qwen2.5-7b
  template:
    metadata:
      labels:
        model.aibrix.ai/name: qwen2.5-7b
        model.aibrix.ai/port: "8000"
        model.aibrix.ai/metrics-port: "8000"
        model.aibrix.ai/engine: vllm
        adapter.model.aibrix.ai/enabled: "true"
    spec:
      roles:
        - name: worker
          replicas: 1
          stateful: true
          template:
            metadata:
              labels:
                model.aibrix.ai/name: qwen2.5-7b
                model.aibrix.ai/port: "8000"
                model.aibrix.ai/metrics-port: "8000"
                model.aibrix.ai/engine: vllm
                adapter.model.aibrix.ai/enabled: "true"
              annotations:
                prometheus.io/path: "/metrics"
                prometheus.io/port: "8000"
                prometheus.io/scrape: "true"
            spec:
              # NOTE(review): the original manifest declared `volumes:` twice in
              # this pod spec (once before initContainers, once after the
              # containers). Duplicate mapping keys are invalid YAML and are
              # rejected by kubectl; under last-wins parsers the first list
              # (`model-cache`, which nothing mounted) was silently dropped.
              # The lists are merged here into a single `volumes:` key.
              volumes:
                - name: model-vol
                  emptyDir: {}
                # Memory-backed emptyDir mounted as /dev/shm below — vLLM's
                # multiprocessing executor (--distributed-executor-backend mp)
                # uses shared memory for tensor-parallel IPC.
                - name: shared-mem
                  emptyDir:
                    medium: Memory
              initContainers:
                - name: init-model-and-loras
                  image: aibrix-cn-beijing.cr.volces.com/aibrix/runtime:v0.6.0
                  command: ["sh", "-c"]
                  args:
                    - |
                      set -eu
                      # Optionally download LoRA artifacts here (example). Uncomment to enable:
                      echo "[init] download lora..."
                      # aibrix_download --model-uri s3://<YOUR_MODEL> --local-dir "/models/loras/"
                      echo "[init] done."
                  env:
                    - name: DOWNLOADER_MODEL_NAME
                      value: qwen2.5-7b
                    - name: DOWNLOADER_NUM_THREADS
                      value: "16"
                    - name: DOWNLOADER_ALLOW_FILE_SUFFIX
                      value: json, safetensors, bin, py
                    - name: TOS_ACCESS_KEY
                      valueFrom:
                        secretKeyRef:
                          name: tos-credential
                          key: TOS_ACCESS_KEY
                    - name: TOS_SECRET_KEY
                      valueFrom:
                        secretKeyRef:
                          name: tos-credential
                          key: TOS_SECRET_KEY
                    - name: TOS_ENDPOINT
                      value: https://tos-s3-cn-beijing.volces.com
                    - name: TOS_REGION
                      value: cn-beijing
                  volumeMounts:
                    - mountPath: /models
                      name: model-vol
              containers:
                - name: vllm
                  image: aibrix-public-release-cn-beijing.cr.volces.com/vllm/vllm-openai:0.11.0
                  command: ["sh", "-c"]
                  args:
                    - |
                      python3 -m vllm.entrypoints.openai.api_server \
                        --host 0.0.0.0 \
                        --port 8000 \
                        --model /models/Qwen2.5-7B-Instruct/ \
                        --served-model-name qwen2.5-7b \
                        --tensor-parallel-size 2 \
                        --distributed-executor-backend mp \
                        --gpu-memory-utilization 0.75 \
                        --max-model-len 4096 \
                        --max-num-seqs 16 \
                        --max-num-batched-tokens 4096 \
                        --enable-lora \
                        --max-loras 2 \
                        --max-lora-rank 64 \
                        --max-cpu-loras 4 \
                        --fully-sharded-loras
                  env:
                    - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
                      value: "True"
                    - name: VLLM_LORA_MODULES_LOADING_TIMEOUT
                      value: "300"
                  ports:
                    - containerPort: 8000
                      protocol: TCP
                  resources:
                    limits:
                      nvidia.com/gpu: "2"
                      cpu: "12"
                      memory: "48G"
                    requests:
                      nvidia.com/gpu: "2"
                      cpu: "12"
                      memory: "48G"
                  volumeMounts:
                    - name: model-vol
                      mountPath: /models
                    # NOTE(review): `shared-mem` was defined but never mounted in
                    # the original; mount it as /dev/shm so the mp executor with
                    # --tensor-parallel-size 2 is not limited to the container
                    # runtime's default 64Mi shm.
                    - name: shared-mem
                      mountPath: /dev/shm
                  livenessProbe:
                    httpGet:
                      path: /health
                      port: 8000
                    initialDelaySeconds: 600
                    periodSeconds: 30
                  readinessProbe:
                    httpGet:
                      path: /health
                      port: 8000
                    initialDelaySeconds: 300
                    periodSeconds: 30
---
apiVersion: model.aibrix.ai/v1alpha1
kind: ModelAdapter
metadata:
  name: qwen2-5-7b-lora
  namespace: default
  labels:
    # same as the name
    model.aibrix.ai/name: "qwen2-5-7b-lora"
    model.aibrix.ai/port: "8000"
spec:
  baseModel: qwen2.5-7b
  # Selects the StormService worker pods serving the base model.
  podSelector:
    matchLabels:
      model.aibrix.ai/name: qwen2.5-7b
      adapter.model.aibrix.ai/enabled: "true"
  artifactURL: "s3://aibrix-example-model-artifacts/Qwen2.5-7B-LoRA/"
  # Create a Secret with the same name in the cluster (same namespace as this ModelAdapter) to access S3:
  # apiVersion: v1
  # kind: Secret
  # metadata:
  #   name: aws-credential
  # type: Opaque
  # stringData:
  #   aws_access_key_id: "YOUR_AWS_ACCESS_KEY_ID"
  #   aws_secret_access_key: "YOUR_AWS_SECRET_ACCESS_KEY"
  #   aws_session_token: "YOUR_AWS_SESSION_TOKEN" # Optional; required for temporary STS credentials (e.g., assumed IAM role)
  #   aws_region: "us-east-1"
  credentialsSecretRef:
    name: aws-credential

0 commit comments

Comments
 (0)