-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGLM-5.1.yaml
More file actions
346 lines (318 loc) · 11.7 KB
/
GLM-5.1.yaml
File metadata and controls
346 lines (318 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
# Shared logging policy; services opt in with `logging: *logging-conf`.
x-logging-conf: &logging-conf
  driver: json-file
  options:
    # Label key handed to the json-file driver's `labels` option.
    labels: com.datadoghq.ad.logs
    # Rotation settings: file count and per-file size (kept as strings).
    max-file: '10'
    max-size: 100m
# Runtime settings shared by GPU services; pulled in with `<<: *nvidia`.
x-nvidia: &nvidia
  ipc: host
  runtime: nvidia
  ulimits:
    # Open-file limit, soft and hard set to the same value.
    nofile: {soft: 65535, hard: 65535}
    # -1 means no limit on locked memory.
    memlock: -1
# Base definition for the vllm-proxy-rs service; merged via
# `<<: *vllm-proxy-common`.
x-vllm-proxy-common: &vllm-proxy-common
  # Inherit the GPU runtime settings (runtime, ipc, ulimits).
  <<: *nvidia
  image: nearaidev/vllm-proxy-rs@sha256:59e42dd68faa15eb0c23521029a2fc3d80d86a4143f9f766542357918be33a8c
  privileged: true
  user: root
  restart: unless-stopped
  logging: *logging-conf
  # Resolve `compose-manager` to the Docker host gateway address.
  extra_hosts:
    - 'compose-manager:host-gateway'
  volumes:
    # Host dstack socket, exposed at the same path inside the container.
    - '/var/run/dstack.sock:/var/run/dstack.sock'
    # Certificate volume, mounted read-only.
    - 'certs:/etc/letsencrypt:ro'
# Base definition for one-shot download jobs; merged via
# `<<: *downloader-common`.
x-downloader-common: &downloader-common
  image: ghcr.io/astral-sh/uv:python3.11-bookworm-slim@sha256:4f5d923c9dcea037f57bda425dd209f3ec643da2f0b74227f68d09dab0b3bb36
  # The service's `command` is passed to `sh -c` as the script to run.
  entrypoint:
    - 'sh'
    - '-c'
  # Quoted so YAML keeps it a string rather than a boolean.
  restart: 'no'
  logging: *logging-conf
services:
# One-shot job that downloads the pinned zai-org/GLM-5.1-FP8 revision into the
# shared Hugging Face cache volume.
model-downloader:
  <<: *downloader-common
  container_name: model-downloader
  environment:
    HF_TOKEN: "${HUGGING_FACE_HUB_TOKEN}"
  volumes:
    - 'huggingface_cache:/root/.cache/huggingface'
  # Single script argument for the `sh -c` entrypoint; `set -e` aborts on the
  # first failing command so a failed download yields a non-zero exit.
  command:
    - |
      set -e
      echo "Downloading zai-org/GLM-5.1-FP8..."
      uvx --from 'huggingface_hub[hf_xet]' hf download zai-org/GLM-5.1-FP8 --revision f396cf805182f4ca10fa675e1a99815b3ca384db
      echo "Download complete."
# Front-end nginx: publishes container ports 80/443 on host ports 8000/8444 and
# serves the config rendered from `nginx_conf`.
proxy-nginx:
  container_name: proxy-nginx
  image: nginx@sha256:1d13701a5f9f3fb01aaa88cef2344d65b6b5bf6b7d9fa4cf0dca557a8d7702ba
  restart: unless-stopped
  logging: *logging-conf
  # Background loop runs `nginx -s reload` every 6h while nginx itself runs in
  # the foreground. NOTE(review): presumably this is to pick up renewed
  # certificates from the read-only certs volume - confirm.
  command: >-
    /bin/sh -c 'while :; do sleep 6h; nginx -s reload; done & nginx -g "daemon off;"'
  ports:
    - '8000:80'
    - '8444:443'
  volumes:
    - 'certs:/etc/letsencrypt:ro'
  configs:
    - source: nginx_conf
      target: /etc/nginx/conf.d/default.conf
      # Octal file mode.
      mode: 0644
# Runs /register.sh (rendered from the `registrar_script` config), which keeps
# this host registered with the model proxy and unregisters it on TERM/INT.
model-proxy-registrar:
  image: curlimages/curl@sha256:d94d07ba9e7d6de898b6d96c1a072f6f8266c687af78a74f380087a0addf5d17
  container_name: model-proxy-registrar
  entrypoint: ["sh", "/register.sh"]
  restart: unless-stopped
  # The script's TERM handler only runs once the command in progress returns
  # (a `sleep 60`, or a health-check curl bounded by --max-time 45) and then
  # issues an unregister curl bounded by --max-time 10. Compose's default 10s
  # stop timeout would SIGKILL the container before that finishes, leaving a
  # stale endpoint registered, so allow enough time for the full path.
  stop_grace_period: 90s
  environment:
    - HOST_IP=${HOST_IP}
    # Ports default to the host ports published by proxy-nginx.
    - HTTP_PORT=${HTTP_PORT:-8000}
    - TLS_PORT=${TLS_PORT:-8444}
    - MODEL_PROXY_TOKEN=${MODEL_PROXY_TOKEN}
  configs:
    - source: registrar_script
      target: /register.sh
      # Octal file mode.
      mode: 0755
  logging: *logging-conf
# vllm-proxy-rs instance in front of the model server; proxy-nginx forwards
# requests to this service on port 8000.
proxy-glm51:
  <<: *vllm-proxy-common
  container_name: proxy-glm51
  # All values are quoted so YAML keeps them as strings (notably the "true"
  # flags).
  environment:
    NVIDIA_VISIBLE_DEVICES: 'all'
    CLOUD_API_URL: 'https://cloud-api.near.ai'
    CLOUD_API_USAGE_TOKEN: "${CLOUD_API_USAGE_TOKEN}"
    COMPOSE_MANAGER_URL: 'http://compose-manager:8080'
    LOG_FORMAT: 'json'
    MODEL_NAME: 'zai-org/GLM-5.1-FP8'
    OHTTP_ENABLED: 'true'
    TOKEN: "${PROXY_TOKEN}"
    # Upstream model server (the model-sg-glm51 service).
    VLLM_BASE_URL: 'http://model-sg-glm51:8000'
    # Path inside the read-only certs volume mounted at /etc/letsencrypt.
    TLS_CERT_PATH: '/etc/letsencrypt/live/completions.near.ai/fullchain.pem'
    USE_NV_ATTESTATION_SDK: 'true'
    WEB_CONTEXT_SEARCH_URL: "${WEB_CONTEXT_SEARCH_URL}"
    WEB_CONTEXT_SEARCH_API_KEY: "${WEB_CONTEXT_SEARCH_API_KEY}"
# SGLang server for zai-org/GLM-5.1-FP8, listening on port 8000 with tensor
# parallelism across eight GPUs.
model-sg-glm51:
  <<: *nvidia
  container_name: model-sg-glm51
  image: lmsysorg/sglang:dev-cu12@sha256:aac6b242680daeb74d2ab1d85f70575357552d7d165d2e5d30eb362797db54a1
  init: true
  restart: unless-stopped
  stop_grace_period: 5m
  logging: *logging-conf
  # Start only after the download job has exited successfully.
  depends_on:
    model-downloader:
      condition: service_completed_successfully
  # Folded scalar: the lines below are joined with spaces into one command
  # line. The model path and revision match the model-downloader job.
  command: >
    sglang serve
    --model-path zai-org/GLM-5.1-FP8
    --revision f396cf805182f4ca10fa675e1a99815b3ca384db
    --tp 8
    --reasoning-parser glm45
    --log-requests-level 0
    --tool-call-parser glm47
    --mem-fraction-static 0.87
    --max-queued-requests 8
    --num-continuous-decode-steps 5
    --model-loader-extra-config '{"enable_multithread_load": "true", "num_threads": 64}'
    --enable-mixed-chunk
    --chunked-prefill-size 8192
    --detokenizer-worker-num 4
    --watchdog-timeout 600
    --port 8000
    --host 0.0.0.0
    --enable-cache-report
    --enable-metrics
    --trust-remote-code
    --speculative-algorithm EAGLE
    --speculative-num-steps 3
    --speculative-eagle-topk 1
    --speculative-num-draft-tokens 4
    --disable-custom-all-reduce
  volumes:
    # Same cache volume the model-downloader job writes to.
    - 'huggingface_cache:/root/.cache/huggingface'
    - 'kernel_cache:/root/.cache/deep_gemm'
  environment:
    HF_TOKEN: "${HUGGING_FACE_HUB_TOKEN}"
    # Defaults to 0 when HF_HUB_OFFLINE is unset or empty.
    HF_HUB_OFFLINE: "${HF_HUB_OFFLINE:-0}"
    NVIDIA_DRIVER_CAPABILITIES: 'compute,utility'
    OPENBLAS_L2_SIZE: '2097152'
    NCCL_DEBUG: 'WARN'
    SGLANG_ENABLE_SPEC_V2: '1'
  deploy:
    resources:
      reservations:
        devices:
          # Reserve GPUs 0-7, matching `--tp 8` above.
          - driver: nvidia
            capabilities: [gpu]
            device_ids:
              - '0'
              - '1'
              - '2'
              - '3'
              - '4'
              - '5'
              - '6'
              - '7'
  # Datadog autodiscovery: an openmetrics check against this server's /metrics
  # endpoint plus a log source definition.
  labels:
    com.datadoghq.ad.check_names: '["openmetrics"]'
    com.datadoghq.ad.init_configs: '[{}]'
    com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://model-sg-glm51:8000/metrics", "histogram_buckets_as_distributions": true, "metrics":["sglang:*"], "service": "glm-5.1", "tags":["model:zai-org/GLM-5.1-FP8","ip:${HOST_IP}","port:8000"]}]'
    com.datadoghq.ad.logs: '[{"source": "sglang", "service": "sglang", "tags":["model:zai-org/GLM-5.1-FP8","ip:${HOST_IP}","port:8000"]}]'
# NVIDIA DCGM exporter: publishes GPU metrics on port 9400 for the Datadog
# `dcgm` check configured in the labels below.
dcgm-exporter:
  container_name: dcgm-exporter
  # NOTE(review): this is the only image in the file referenced by tag alone,
  # without an @sha256 digest - confirm whether it should be pinned too.
  image: nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless
  runtime: nvidia
  restart: unless-stopped
  logging: *logging-conf
  cap_add: [SYS_ADMIN]
  environment:
    NVIDIA_VISIBLE_DEVICES: 'all'
  ports:
    - '9400:9400'
  deploy:
    resources:
      reservations:
        devices:
          # Every GPU on the host.
          - driver: nvidia
            capabilities: [gpu]
            count: all
  labels:
    com.datadoghq.ad.check_names: '["dcgm"]'
    com.datadoghq.ad.init_configs: '[{}]'
    com.datadoghq.ad.instances: '[{"openmetrics_endpoint": "http://%%host%%:9400/metrics", "tags":["model:zai-org/GLM-5.1-FP8","deployment:GLM-5.1","ip:${HOST_IP}"]}]'
    com.datadoghq.ad.logs: '[{"source": "dcgm-exporter", "service": "dcgm-exporter", "tags":["model:zai-org/GLM-5.1-FP8","deployment:GLM-5.1","ip:${HOST_IP}"]}]'
# The default network is the pre-existing `dstack_default` network; Compose
# attaches to it instead of creating one.
networks:
  default:
    name: dstack_default
    external: true
volumes:
  # Pre-existing certificate volume, created outside this project.
  certs:
    name: certs
    external: true
  # Project-managed volumes with default settings.
  huggingface_cache: {}
  kernel_cache: {}
configs:
registrar_script:
  # Mounted at /register.sh in model-proxy-registrar. Compose interpolates this
  # text, so every shell dollar sign below is written doubled.
  content: |
    #!/bin/sh
    # Keeps this host registered with the model proxy for as long as the local
    # model answers a 1-token completion. Unregisters on TERM/INT, or after
    # MAX_RETRIES consecutive failed checks once registered.
    PROXY_URL="https://completions.near.ai"
    TOKEN="$${MODEL_PROXY_TOKEN}"
    ENDPOINT="$${HOST_IP}:$${HTTP_PORT}"
    REGISTERED=false
    FAILURE_COUNT=0
    MAX_RETRIES=3

    # Args: endpoint (host:port), routing port.
    register_endpoint() {
      echo "Registering endpoint $$1 with routing port $$2"
      curl -sS --max-time 10 -X POST "$$PROXY_URL/register/endpoint" \
        -H "Authorization: Bearer $$TOKEN" \
        -H "Content-Type: application/json" \
        -d "{\"endpoint\":\"$$1\",\"routing_port\":$$2}"
    }

    # Args: endpoint (host:port).
    unregister_endpoint() {
      echo "Unregistering endpoint $$1"
      curl -sS --max-time 10 -X POST "$$PROXY_URL/unregister/endpoint" \
        -H "Authorization: Bearer $$TOKEN" \
        -H "Content-Type: application/json" \
        -d "{\"endpoint\":\"$$1\"}"
    }

    # Args: model name, domain.
    register_model() {
      curl -sS --max-time 10 -X POST "$$PROXY_URL/register/model" \
        -H "Authorization: Bearer $$TOKEN" \
        -H "Content-Type: application/json" \
        -d "{\"model\":\"$$1\",\"domain\":\"$$2\"}"
    }

    # Signal handler: unregister this endpoint, then exit cleanly.
    cleanup() {
      echo "SIGTERM received, unregistering $$ENDPOINT"
      unregister_endpoint "$$ENDPOINT"
      REGISTERED=false
      exit 0
    }
    trap cleanup TERM INT

    # Interruptible sleep. The shell defers a trap until the foreground
    # command returns, so a plain sleep would hold up cleanup for the whole
    # interval; waiting on a background sleep lets the trap run immediately.
    # Args: duration accepted by sleep.
    pause() {
      sleep "$$1" &
      wait $$!
    }

    # 1-token completion health check directly to model container (no auth needed)
    check_inference() {
      echo "Performing health check on model endpoint..."
      curl -sSf --max-time 45 -X POST "http://model-sg-glm51:8000/v1/chat/completions" \
        -H "Content-Type: application/json" \
        -d '{"model":"zai-org/GLM-5.1-FP8","messages":[{"role":"user","content":"hi"}],"max_tokens":1}'
    }

    echo "Waiting for model to be ready..."
    # Bounded like every other request here so a stalled connection cannot
    # block the loop indefinitely.
    until curl -sf --max-time 10 http://proxy-nginx:80/v1/models > /dev/null 2>&1; do pause 30; done
    echo "Model ready, starting registration loop"

    while true; do
      if check_inference; then
        FAILURE_COUNT=0
        # Re-sent on every healthy cycle; the log line below is printed only
        # on the transition from unregistered to registered.
        register_endpoint "$$ENDPOINT" "$${TLS_PORT}"
        register_model "zai-org/GLM-5.1-FP8" "glm-5-1.completions.near.ai"
        if [ "$$REGISTERED" = false ]; then
          echo "[GLM-5.1] Registered zai-org/GLM-5.1-FP8 at $$ENDPOINT"
        fi
        REGISTERED=true
      else
        FAILURE_COUNT=$$((FAILURE_COUNT + 1))
        echo "[GLM-5.1] Health check failed ($$FAILURE_COUNT/$$MAX_RETRIES)"
        if [ "$$REGISTERED" = true ] && [ "$$FAILURE_COUNT" -ge "$$MAX_RETRIES" ]; then
          echo "[GLM-5.1] Health check failed and Retry limit reached, unregistering $$ENDPOINT"
          unregister_endpoint "$$ENDPOINT"
          REGISTERED=false
          # Non-zero exit so the restart policy starts a fresh container.
          echo "[GLM-5.1] Exiting to refresh DNS (container will auto-restart)"
          exit 1
        fi
      fi
      pause 60
    done
nginx_conf:
  # Mounted at /etc/nginx/conf.d/default.conf in proxy-nginx. Compose
  # interpolates this text, so every nginx variable is written with a doubled
  # dollar sign.
  content: |
    # One JSON object per request; string fields are JSON-escaped.
    log_format json_combined escape=json
      '{"time":"$$time_iso8601"'
      ',"request_id":"$$http_x_request_id"'
      ',"org_id":"$$http_x_org_id"'
      ',"workspace_id":"$$http_x_workspace_id"'
      ',"host":"$$host"'
      ',"method":"$$request_method"'
      ',"uri":"$$uri"'
      ',"status":$$status'
      ',"request_length":$$request_length'
      ',"bytes_sent":$$bytes_sent'
      ',"request_time":$$request_time'
      ',"upstream_addr":"$$upstream_addr"'
      '}';

    # Plain-HTTP listener: forwards everything to proxy-glm51.
    server {
      listen 80 default_server;

      client_body_buffer_size 1m;
      client_max_body_size 100m;
      access_log /var/log/nginx/access.log json_combined;

      location / {
        proxy_pass http://proxy-glm51:8000;
        proxy_http_version 1.1;
        proxy_set_header Connection '';

        proxy_set_header Host $$host;
        proxy_set_header X-Real-IP $$remote_addr;
        proxy_set_header X-Forwarded-For $$proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $$scheme;

        # Pass responses through unbuffered and uncached; allow reads of up
        # to an hour.
        proxy_buffering off;
        proxy_cache off;
        proxy_read_timeout 3600s;
      }
    }

    # TLS + HTTP/2 listener for glm-5-1.completions.near.ai; same upstream.
    server {
      listen 443 ssl http2;
      server_name glm-5-1.completions.near.ai;

      ssl_protocols TLSv1.2 TLSv1.3;
      ssl_certificate /etc/letsencrypt/live/completions.near.ai/fullchain.pem;
      ssl_certificate_key /etc/letsencrypt/live/completions.near.ai/privkey.pem;

      # Hold cloud-api's H2 connections open through long idle periods so the
      # bucket-pinned TCP connection is still there for the next chat. With
      # the nginx defaults (75s idle, 1000 req/conn) the connection is closed,
      # the next request opens a new TCP connection through model-proxy's
      # L4 LB → may land on a different backend → signature 404. Works
      # together with cloud-api's H2 keepalive PINGs
      # (http2_keep_alive_while_idle).
      keepalive_requests 1000000;
      keepalive_timeout 1h;

      client_body_buffer_size 1m;
      client_max_body_size 100m;
      access_log /var/log/nginx/access.log json_combined;

      location / {
        proxy_pass http://proxy-glm51:8000;
        proxy_http_version 1.1;
        proxy_set_header Connection '';

        proxy_set_header Host $$host;
        proxy_set_header X-Real-IP $$remote_addr;
        proxy_set_header X-Forwarded-For $$proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $$scheme;

        # Pass responses through unbuffered and uncached; allow reads of up
        # to an hour.
        proxy_buffering off;
        proxy_cache off;
        proxy_read_timeout 3600s;
      }
    }