-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathQwen3.5-122B.yaml
More file actions
356 lines (328 loc) · 12.5 KB
/
Qwen3.5-122B.yaml
File metadata and controls
356 lines (328 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
# Logging settings shared by every service through the `*logging-conf` alias.
x-logging-conf: &logging-conf
  driver: json-file
  options:
    # Per-file size cap and number of rotated files kept.
    max-size: 100m
    max-file: "10"
    # Label key handed to the driver's `labels` option.
    labels: com.datadoghq.ad.logs
# Runtime settings merged into GPU-attached services via `<<: *nvidia`.
x-nvidia: &nvidia
  runtime: nvidia
  ipc: host
  ulimits:
    # -1 removes the locked-memory limit.
    memlock: -1
    nofile: {soft: 65535, hard: 65535}
# Base definition for the proxy service; pulled in with `<<: *vllm-proxy-common`.
x-vllm-proxy-common: &vllm-proxy-common
  <<: *nvidia
  image: nearaidev/vllm-proxy-rs@sha256:59e42dd68faa15eb0c23521029a2fc3d80d86a4143f9f766542357918be33a8c
  user: root
  privileged: true
  # Makes `compose-manager` resolve to the Docker host gateway.
  extra_hosts:
    - "compose-manager:host-gateway"
  volumes:
    - /var/run/dstack.sock:/var/run/dstack.sock
    # Certificates are mounted read-only.
    - certs:/etc/letsencrypt:ro
  restart: unless-stopped
  logging: *logging-conf
# Shared definition for the SGLang backend instances. Each instance adds its
# own container name, GPU reservation and labels on top of this.
x-sg-qwen35-122b-common: &sg-qwen35-122b-common
  <<: *nvidia
  init: true
  # SGLang v0.5.12 (cu129); migrated off vLLM on 2026-05-22.
  # Context went from 1.01M (yarn rope override) down to the native 262144.
  # EAGLE speculative decoding is on (Spec V2 is the default since 0.5.11).
  image: lmsysorg/sglang:v0.5.12-cu129@sha256:9e02c8e1fe2790a1c445bd5f6814305fe43639a4adb01c8ad1e8e21e750bf581
  # Folded scalar: the lines below are joined with spaces into one command line.
  command: >
    sglang serve
    --model-path Qwen/Qwen3.5-122B-A10B
    --revision dc4d348443bc740c68e2d77492492c11606384d5
    --tp 4
    --reasoning-parser qwen3
    --tool-call-parser qwen3_coder
    --speculative-algorithm EAGLE
    --speculative-num-steps 3
    --speculative-eagle-topk 1
    --speculative-num-draft-tokens 4
    --mamba-scheduler-strategy extra_buffer
    --kv-cache-dtype fp8_e4m3
    --mem-fraction-static 0.88
    --context-length 262144
    --num-continuous-decode-steps 5
    --model-loader-extra-config '{"enable_multithread_load": "true", "num_threads": 64}'
    --enable-mixed-chunk
    --chunked-prefill-size 16384
    --port 8000
    --host 0.0.0.0
    --enable-cache-report
    --enable-metrics
    --trust-remote-code
    --log-requests-level 0
    --served-model-name Qwen/Qwen3.5-122B-A10B
  volumes:
    # Model weights cache, shared with the model-downloader service.
    - huggingface_cache:/root/.cache/huggingface
    - kernel_cache:/root/.cache/deep_gemm
  environment:
    - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    # Defaults to 0 when HF_HUB_OFFLINE is not set in the deploy environment.
    - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
    - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    - OPENBLAS_L2_SIZE=2097152
    - NCCL_DEBUG=WARN
    - SGLANG_ENABLE_SPEC_V2=1
  restart: unless-stopped
  # Allow up to five minutes for shutdown before the container is killed.
  stop_grace_period: 5m
  logging: *logging-conf
services:
  # One-shot job: downloads the pinned model revision into the shared
  # Hugging Face cache volume, then exits (restart is disabled).
  model-downloader:
    container_name: model-downloader
    image: ghcr.io/astral-sh/uv:python3.11-bookworm-slim@sha256:4f5d923c9dcea037f57bda425dd209f3ec643da2f0b74227f68d09dab0b3bb36
    restart: "no"
    entrypoint:
      - "sh"
      - "-c"
    command:
      - |
        set -e
        echo "Downloading Qwen/Qwen3.5-122B-A10B..."
        uvx --from 'huggingface_hub[hf_xet]' hf download Qwen/Qwen3.5-122B-A10B --revision dc4d348443bc740c68e2d77492492c11606384d5
        echo "Download complete."
    volumes:
      - huggingface_cache:/root/.cache/huggingface
    environment:
      - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    logging: *logging-conf

  # Public entry point: nginx on host ports 8000 (HTTP) and 8444 (TLS).
  proxy-nginx:
    container_name: proxy-nginx
    image: nginx@sha256:1d13701a5f9f3fb01aaa88cef2344d65b6b5bf6b7d9fa4cf0dca557a8d7702ba
    # Runs nginx in the foreground and reloads it every 6 hours in the
    # background (presumably to pick up renewed certificates; confirm).
    command: /bin/sh -c 'while :; do sleep 6h; nginx -s reload; done & nginx -g "daemon off;"'
    ports:
      - "8000:80"
      - "8444:443"
    volumes:
      - certs:/etc/letsencrypt:ro
    configs:
      - source: nginx_conf
        target: /etc/nginx/conf.d/default.conf
        mode: 0644
    restart: unless-stopped
    logging: *logging-conf

  # Runs the registrar script from the `registrar_script` config.
  model-proxy-registrar:
    container_name: model-proxy-registrar
    image: curlimages/curl@sha256:d94d07ba9e7d6de898b6d96c1a072f6f8266c687af78a74f380087a0addf5d17
    entrypoint:
      - "sh"
      - "/register.sh"
    restart: unless-stopped
    environment:
      - HOST_IP=${HOST_IP}
      # The port defaults match the host ports published by proxy-nginx.
      - HTTP_PORT=${HTTP_PORT:-8000}
      - TLS_PORT=${TLS_PORT:-8444}
      - MODEL_PROXY_TOKEN=${MODEL_PROXY_TOKEN}
    configs:
      - source: registrar_script
        target: /register.sh
        mode: 0755
    logging: *logging-conf

  # A single proxy in front of both backend instances.
  proxy-qwen35:
    <<: *vllm-proxy-common
    container_name: proxy-qwen35
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - CLOUD_API_URL=https://cloud-api.near.ai
      - CLOUD_API_USAGE_TOKEN=${CLOUD_API_USAGE_TOKEN}
      - COMPOSE_MANAGER_URL=http://compose-manager:8080
      - LOG_FORMAT=json
      - MODEL_NAME=Qwen/Qwen3.5-122B-A10B
      - OHTTP_ENABLED=true
      - TOKEN=${PROXY_TOKEN}
      # Both SGLang instances, each listening on container port 8000.
      - VLLM_BACKEND_URLS=http://model-sg-qwen35-122b-1:8000,http://model-sg-qwen35-122b-2:8000
      # 104857600 bytes = 100 MiB.
      - VLLM_PROXY_MAX_REQUEST_SIZE=104857600
      - TLS_CERT_PATH=/etc/letsencrypt/live/completions.near.ai/fullchain.pem
      - USE_NV_ATTESTATION_SDK=true
    labels:
      com.datadoghq.ad.logs: '[{"source": "vllm-proxy", "service": "vllm-proxy", "tags": ["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000"]}]'

  # Qwen3.5-122B-A10B instance 1, pinned to GPUs 0-3.
  model-sg-qwen35-122b-1:
    <<: *sg-qwen35-122b-common
    container_name: model-sg-qwen35-122b-1
    # Start only after the model download has finished successfully.
    depends_on:
      model-downloader:
        condition: service_completed_successfully
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0", "1", "2", "3"]
              capabilities:
                - gpu
    labels:
      com.datadoghq.ad.check_names: '["openmetrics"]'
      com.datadoghq.ad.init_configs: "[{}]"
      com.datadoghq.ad.logs: '[{"source": "sglang", "service": "sglang", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000","instance:1"]}]'
      com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://model-sg-qwen35-122b-1:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-1", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000"]}]'

  # Qwen3.5-122B-A10B instance 2, pinned to GPUs 4-7.
  model-sg-qwen35-122b-2:
    <<: *sg-qwen35-122b-common
    container_name: model-sg-qwen35-122b-2
    # Start only after the model download has finished successfully.
    depends_on:
      model-downloader:
        condition: service_completed_successfully
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["4", "5", "6", "7"]
              capabilities:
                - gpu
    # NOTE(review): the tags below say port:8001, but this container serves and
    # is scraped on 8000 (see the shared command and the openmetrics endpoint).
    # Confirm the tag is an intentional instance discriminator and not stale.
    labels:
      com.datadoghq.ad.check_names: '["openmetrics"]'
      com.datadoghq.ad.init_configs: "[{}]"
      com.datadoghq.ad.logs: '[{"source": "sglang", "service": "sglang", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8001","instance:2"]}]'
      com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://model-sg-qwen35-122b-2:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-2", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8001"]}]'

  # GPU metrics exporter, published on host port 9400 and given all GPUs.
  dcgm-exporter:
    container_name: dcgm-exporter
    image: nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless
    runtime: nvidia
    cap_add:
      - SYS_ADMIN
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    ports:
      - "9400:9400"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    restart: unless-stopped
    logging: *logging-conf
    labels:
      com.datadoghq.ad.check_names: '["dcgm"]'
      com.datadoghq.ad.init_configs: "[{}]"
      com.datadoghq.ad.instances: '[{"openmetrics_endpoint": "http://%%host%%:9400/metrics", "tags":["model:Qwen/Qwen3.5-122B-A10B","deployment:Qwen3.5-122B","ip:${HOST_IP}"]}]'
      com.datadoghq.ad.logs: '[{"source": "dcgm-exporter", "service": "dcgm-exporter", "tags":["model:Qwen/Qwen3.5-122B-A10B","deployment:Qwen3.5-122B","ip:${HOST_IP}"]}]'
# The default network is the pre-existing `dstack_default` network; Compose
# attaches to it and does not create it.
networks:
  default:
    name: dstack_default
    external: true
volumes:
  # Managed by this project: model weights cache and kernel cache.
  huggingface_cache:
  kernel_cache:
  # Pre-existing volume holding the certificates; not created by Compose.
  certs:
    name: certs
    external: true
# Inline config files mounted into services. Compose interpolates `content`,
# so every literal dollar sign in the embedded files is written as `$$`.
configs:
  # Script run by model-proxy-registrar: while the backend answers a test
  # completion it keeps this host registered with the model proxy, and it
  # unregisters after repeated failures or on shutdown.
  registrar_script:
    content: |
      #!/bin/sh
      PROXY_URL="https://completions.near.ai"
      TOKEN="$${MODEL_PROXY_TOKEN}"
      ENDPOINT="$${HOST_IP}:$${HTTP_PORT}"
      REGISTERED=false
      FAILURE_COUNT=0
      MAX_RETRIES=3

      # Sleep in the background and wait on it. A trapped TERM/INT interrupts
      # "wait" immediately, whereas a foreground sleep would hold the trap
      # back until the sleep ended, which is usually past the stop timeout.
      pause() {
        sleep "$$1" &
        wait "$$!"
      }

      # NOTE(review): the register/unregister calls do not pass -f to curl, so
      # an HTTP error response from the proxy still counts as success here.
      register_endpoint() {
        echo "Registering endpoint $$1 with routing port $$2"
        curl -sS --max-time 10 -X POST "$$PROXY_URL/register/endpoint" \
          -H "Authorization: Bearer $$TOKEN" \
          -H "Content-Type: application/json" \
          -d "{\"endpoint\":\"$$1\",\"routing_port\":$$2}"
      }

      unregister_endpoint() {
        echo "Unregistering endpoint $$1"
        curl -sS --max-time 10 -X POST "$$PROXY_URL/unregister/endpoint" \
          -H "Authorization: Bearer $$TOKEN" \
          -H "Content-Type: application/json" \
          -d "{\"endpoint\":\"$$1\"}"
      }

      register_model() {
        curl -sS --max-time 10 -X POST "$$PROXY_URL/register/model" \
          -H "Authorization: Bearer $$TOKEN" \
          -H "Content-Type: application/json" \
          -d "{\"model\":\"$$1\",\"domain\":\"$$2\"}"
      }

      # Signal handler: unregister this endpoint, then exit cleanly.
      cleanup() {
        echo "SIGTERM received, unregistering $$ENDPOINT"
        unregister_endpoint "$$ENDPOINT"
        REGISTERED=false
        exit 0
      }
      trap cleanup TERM INT

      # Health check directly on backend (no auth needed on raw sglang container).
      # Only instance 1 is probed; instance 2 is not checked here.
      check_inference() {
        echo "Performing health check on backend..."
        curl -sSf --max-time 45 -X POST "http://model-sg-qwen35-122b-1:8000/v1/chat/completions" \
          -H "Content-Type: application/json" \
          -d '{"model":"Qwen/Qwen3.5-122B-A10B","messages":[{"role":"user","content":"hi"}],"max_tokens":1}'
      }

      echo "Waiting for model to be ready..."
      until curl -sf http://proxy-nginx:80/v1/models > /dev/null 2>&1; do pause 30; done
      echo "Model ready, starting registration loop"

      while true; do
        if check_inference; then
          FAILURE_COUNT=0
          # Re-register on every healthy pass; log only on the first one.
          register_endpoint "$$ENDPOINT" "$${TLS_PORT}"
          register_model "Qwen/Qwen3.5-122B-A10B" "qwen35-122b.completions.near.ai"
          if [ "$$REGISTERED" = false ]; then
            echo "[Qwen3.5-122B] Registered Qwen/Qwen3.5-122B-A10B at $$ENDPOINT"
          fi
          REGISTERED=true
        else
          FAILURE_COUNT=$$((FAILURE_COUNT + 1))
          echo "[Qwen3.5-122B] Health check failed ($$FAILURE_COUNT/$$MAX_RETRIES)"
          if [ "$$REGISTERED" = true ] && [ "$$FAILURE_COUNT" -ge "$$MAX_RETRIES" ]; then
            echo "[Qwen3.5-122B] Health check failed and Retry limit reached, unregistering $$ENDPOINT"
            unregister_endpoint "$$ENDPOINT"
            REGISTERED=false
            echo "[Qwen3.5-122B] Exiting to refresh DNS (container will auto-restart)"
            exit 1
          fi
        fi
        pause 60
      done

  # nginx site config mounted at /etc/nginx/conf.d/default.conf in proxy-nginx.
  nginx_conf:
    content: |
      log_format json_combined escape=json
        '{' '"time":"$$time_iso8601"'
        ',"request_id":"$$http_x_request_id"'
        ',"org_id":"$$http_x_org_id"'
        ',"workspace_id":"$$http_x_workspace_id"'
        ',"host":"$$host"'
        ',"method":"$$request_method"'
        ',"uri":"$$uri"'
        ',"status":$$status'
        ',"request_length":$$request_length'
        ',"bytes_sent":$$bytes_sent'
        ',"request_time":$$request_time'
        ',"upstream_addr":"$$upstream_addr"'
        '}';
      access_log /var/log/nginx/access.log json_combined;

      proxy_http_version 1.1;
      proxy_set_header Host $$host;
      proxy_set_header X-Real-IP $$remote_addr;
      proxy_set_header X-Forwarded-For $$proxy_add_x_forwarded_for;
      proxy_set_header X-Forwarded-Proto $$scheme;
      proxy_set_header Connection '';
      # Responses are passed through unbuffered and uncached; upstream reads
      # may take up to an hour.
      proxy_buffering off;
      proxy_cache off;
      proxy_read_timeout 3600s;
      client_max_body_size 100m;
      client_body_buffer_size 1m;

      # :80 — single proxy handles both backends
      server {
        listen 80 default_server;
        location / { proxy_pass http://proxy-qwen35:8000; }
      }

      ssl_certificate /etc/letsencrypt/live/completions.near.ai/fullchain.pem;
      ssl_certificate_key /etc/letsencrypt/live/completions.near.ai/privkey.pem;
      ssl_protocols TLSv1.2 TLSv1.3;

      server {
        listen 443 ssl http2;
        server_name qwen35-122b.completions.near.ai;
        # Keep H2 connections from cloud-api alive across long idle gaps so the
        # bucket-pinned TCP connection survives between chats. Without this,
        # nginx defaults (75s idle, 1000 req/conn) close the connection and the
        # next request opens a new TCP via model-proxy's L4 LB → may land on a
        # different backend → signature 404. Pairs with cloud-api H2 keepalive
        # PINGs (http2_keep_alive_while_idle).
        keepalive_timeout 1h;
        keepalive_requests 1000000;
        location / { proxy_pass http://proxy-qwen35:8000; }
      }