# cvm-compose-files/GLM-5.1.yaml (nearai/cvm-compose-files, branch main)

# Shared logging settings, referenced by services through the *logging-conf
# alias: json-file driver with the max-size / max-file options set below.
# The `labels` option names the com.datadoghq.ad.logs container label.
x-logging-conf: &logging-conf
  driver: json-file
  options:
    labels: com.datadoghq.ad.logs
    max-size: 100m
    # Kept quoted so the value stays a string rather than an integer.
    max-file: '10'

# GPU-related container settings, merged into services via `<<: *nvidia`:
# nvidia runtime, host IPC namespace, and raised ulimits.
x-nvidia: &nvidia
  ipc: host
  runtime: nvidia
  ulimits:
    nofile: {soft: 65535, hard: 65535}
    # -1 here means no limit on locked memory.
    memlock: -1

# Base definition for the vllm-proxy-rs container; services pull it in with
# `<<: *vllm-proxy-common`. It layers the shared *nvidia settings under the
# keys declared here (explicit keys win over merged ones).
x-vllm-proxy-common: &vllm-proxy-common
  <<: *nvidia
  image: nearaidev/vllm-proxy-rs@sha256:59e42dd68faa15eb0c23521029a2fc3d80d86a4143f9f766542357918be33a8c
  restart: unless-stopped
  logging: *logging-conf
  # NOTE(review): runs as root in privileged mode; presumably needed for the
  # dstack socket / attestation access mounted below. Confirm it is required.
  user: root
  privileged: true
  # Makes the name `compose-manager` resolve to the Docker host gateway.
  extra_hosts: ['compose-manager:host-gateway']
  volumes:
    - '/var/run/dstack.sock:/var/run/dstack.sock'
    - 'certs:/etc/letsencrypt:ro'

# Base definition for one-shot download jobs: a digest-pinned uv image whose
# entrypoint is `sh -c`, so a service's `command` is run as a shell script.
x-downloader-common: &downloader-common
  image: ghcr.io/astral-sh/uv:python3.11-bookworm-slim@sha256:4f5d923c9dcea037f57bda425dd209f3ec643da2f0b74227f68d09dab0b3bb36
  entrypoint:
    - sh
    - '-c'
  logging: *logging-conf
  # Must stay quoted: a bare `no` can be parsed as boolean false.
  restart: 'no'

services:
  # One-shot job that downloads zai-org/GLM-5.1-FP8 at a pinned revision into
  # the huggingface_cache volume. It inherits image, `sh -c` entrypoint and
  # `restart: "no"` from *downloader-common.
  model-downloader:
    <<: *downloader-common
    container_name: model-downloader
    environment:
      - 'HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}'
    volumes:
      - 'huggingface_cache:/root/.cache/huggingface'
    # Single script argument handed to `sh -c`; `set -e` aborts on the first
    # failing command, so a failed download gives a non-zero exit.
    command:
      - |
        set -e
        echo "Downloading zai-org/GLM-5.1-FP8..."
        uvx --from 'huggingface_hub[hf_xet]' hf download zai-org/GLM-5.1-FP8 --revision f396cf805182f4ca10fa675e1a99815b3ca384db
        echo "Download complete."

  # nginx front end. Publishes container ports 80 and 443 on host ports 8000
  # and 8444 and serves the inline `nginx_conf` config as its default.conf.
  proxy-nginx:
    container_name: proxy-nginx
    image: nginx@sha256:1d13701a5f9f3fb01aaa88cef2344d65b6b5bf6b7d9fa4cf0dca557a8d7702ba
    restart: unless-stopped
    logging: *logging-conf
    # A background loop runs `nginx -s reload` every 6h while nginx itself
    # stays in the foreground (presumably so renewed certificates from the
    # read-only certs volume get picked up).
    command: >-
      /bin/sh -c 'while :; do sleep 6h; nginx -s reload; done & nginx -g "daemon off;"'
    ports: ['8000:80', '8444:443']
    volumes:
      - 'certs:/etc/letsencrypt:ro'
    configs:
      - target: /etc/nginx/conf.d/default.conf
        source: nginx_conf
        mode: 0644

  # Sidecar that runs the inline `registrar_script` (mounted at /register.sh)
  # with `sh`. The script reads HOST_IP, HTTP_PORT, TLS_PORT and
  # MODEL_PROXY_TOKEN from the environment set here.
  model-proxy-registrar:
    container_name: model-proxy-registrar
    image: curlimages/curl@sha256:d94d07ba9e7d6de898b6d96c1a072f6f8266c687af78a74f380087a0addf5d17
    restart: unless-stopped
    logging: *logging-conf
    entrypoint:
      - sh
      - /register.sh
    configs:
      - target: /register.sh
        source: registrar_script
        mode: 0755
    # The port defaults (8000 / 8444) equal the host ports published by
    # proxy-nginx.
    environment:
      - 'HOST_IP=${HOST_IP}'
      - 'HTTP_PORT=${HTTP_PORT:-8000}'
      - 'TLS_PORT=${TLS_PORT:-8444}'
      - 'MODEL_PROXY_TOKEN=${MODEL_PROXY_TOKEN}'

  # vllm-proxy-rs instance in front of the GLM-5.1 model server. Image,
  # privileges, mounts, restart policy and logging come from
  # *vllm-proxy-common; only the name and environment are set here.
  proxy-glm51:
    <<: *vllm-proxy-common
    container_name: proxy-glm51
    environment:
      - 'NVIDIA_VISIBLE_DEVICES=all'
      # Cloud API and compose-manager endpoints (compose-manager resolves
      # through the extra_hosts entry in the shared base).
      - 'CLOUD_API_URL=https://cloud-api.near.ai'
      - 'CLOUD_API_USAGE_TOKEN=${CLOUD_API_USAGE_TOKEN}'
      - 'COMPOSE_MANAGER_URL=http://compose-manager:8080'
      - 'LOG_FORMAT=json'
      - 'MODEL_NAME=zai-org/GLM-5.1-FP8'
      - 'OHTTP_ENABLED=true'
      - 'TOKEN=${PROXY_TOKEN}'
      # Upstream is the model-sg-glm51 service on its port 8000.
      - 'VLLM_BASE_URL=http://model-sg-glm51:8000'
      # Certificate path inside the read-only certs volume mount.
      - 'TLS_CERT_PATH=/etc/letsencrypt/live/completions.near.ai/fullchain.pem'
      - 'USE_NV_ATTESTATION_SDK=true'
      - 'WEB_CONTEXT_SEARCH_URL=${WEB_CONTEXT_SEARCH_URL}'
      - 'WEB_CONTEXT_SEARCH_API_KEY=${WEB_CONTEXT_SEARCH_API_KEY}'

  # SGLang server for zai-org/GLM-5.1-FP8. It listens on 0.0.0.0:8000 inside
  # the compose network; no host ports are published by this service.
  model-sg-glm51:
    <<: *nvidia
    init: true
    # Start only after the model-downloader job has exited successfully.
    depends_on:
      model-downloader:
        condition: service_completed_successfully
    image: lmsysorg/sglang:dev-cu12@sha256:aac6b242680daeb74d2ab1d85f70575357552d7d165d2e5d30eb362797db54a1
    container_name: model-sg-glm51
    # Folded scalar: the line breaks below fold into spaces, giving one
    # command line. Do not put comments inside it; they would become part of
    # the command. --revision is the same commit model-downloader fetches,
    # and --tp 8 lines up with the eight device_ids reserved under `deploy`.
    command: >
      sglang serve
      --model-path zai-org/GLM-5.1-FP8
      --revision f396cf805182f4ca10fa675e1a99815b3ca384db
      --tp 8
      --reasoning-parser glm45
      --log-requests-level 0
      --tool-call-parser glm47
      --mem-fraction-static 0.87
      --max-queued-requests 8
      --num-continuous-decode-steps 5
      --model-loader-extra-config '{"enable_multithread_load": "true", "num_threads": 64}'
      --enable-mixed-chunk
      --chunked-prefill-size 8192
      --detokenizer-worker-num 4
      --watchdog-timeout 600
      --port 8000
      --host 0.0.0.0
      --enable-cache-report
      --enable-metrics
      --trust-remote-code
      --speculative-algorithm EAGLE
      --speculative-num-steps 3
      --speculative-eagle-topk 1
      --speculative-num-draft-tokens 4
      --disable-custom-all-reduce
    # huggingface_cache is the volume model-downloader writes into;
    # kernel_cache persists /root/.cache/deep_gemm across restarts.
    volumes:
      - huggingface_cache:/root/.cache/huggingface
      - kernel_cache:/root/.cache/deep_gemm
    environment:
      - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
      # Falls back to 0 when HF_HUB_OFFLINE is unset or empty.
      - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - OPENBLAS_L2_SIZE=2097152
      - NCCL_DEBUG=WARN
      - SGLANG_ENABLE_SPEC_V2=1
    # Reserve GPUs 0-7 explicitly. The ids are quoted so they stay strings.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0","1","2","3","4","5","6","7"]
              capabilities: [gpu]
    restart: unless-stopped
    # Allow up to five minutes for a graceful stop.
    stop_grace_period: 5m
    logging: *logging-conf
    # Datadog autodiscovery labels: an openmetrics check against this
    # container's /metrics endpoint on port 8000, plus log collection tagged
    # with model, host IP and port. Values are JSON kept as quoted strings.
    labels:
      com.datadoghq.ad.check_names: '["openmetrics"]'
      com.datadoghq.ad.init_configs: "[{}]"
      com.datadoghq.ad.logs: '[{"source": "sglang", "service": "sglang", "tags":["model:zai-org/GLM-5.1-FP8","ip:${HOST_IP}","port:8000"]}]'
      com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://model-sg-glm51:8000/metrics", "histogram_buckets_as_distributions": true, "metrics":["sglang:*"], "service": "glm-5.1", "tags":["model:zai-org/GLM-5.1-FP8","ip:${HOST_IP}","port:8000"]}]'

  # NVIDIA DCGM exporter: GPU metrics on port 9400 (published on the host),
  # scraped by the Datadog `dcgm` check configured in the labels.
  dcgm-exporter:
    container_name: dcgm-exporter
    # NOTE(review): referenced by tag only; the other images in this file
    # are pinned by sha256 digest. Consider pinning this one as well.
    image: nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless
    runtime: nvidia
    restart: unless-stopped
    logging: *logging-conf
    cap_add: [SYS_ADMIN]
    environment:
      - 'NVIDIA_VISIBLE_DEVICES=all'
    ports: ['9400:9400']
    # Reserve every GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
              count: all
              driver: nvidia
    # Datadog autodiscovery labels. Values are JSON kept as quoted strings;
    # %%host%% is a Datadog template variable.
    labels:
      com.datadoghq.ad.check_names: '["dcgm"]'
      com.datadoghq.ad.init_configs: '[{}]'
      com.datadoghq.ad.instances: '[{"openmetrics_endpoint": "http://%%host%%:9400/metrics", "tags":["model:zai-org/GLM-5.1-FP8","deployment:GLM-5.1","ip:${HOST_IP}"]}]'
      com.datadoghq.ad.logs: '[{"source": "dcgm-exporter", "service": "dcgm-exporter", "tags":["model:zai-org/GLM-5.1-FP8","deployment:GLM-5.1","ip:${HOST_IP}"]}]'

# All services join the pre-existing `dstack_default` network. It is declared
# external, so Compose does not create it.
networks:
  default:
    name: dstack_default
    external: true

# Named volumes. `certs` is external and must already exist; the two caches
# use default settings and are created by Compose.
volumes:
  certs:
    name: certs
    external: true
  # Model weights: written by model-downloader, read by model-sg-glm51.
  huggingface_cache:
  # Mounted at /root/.cache/deep_gemm in model-sg-glm51.
  kernel_cache:


configs:
  # Shell script mounted into model-proxy-registrar as /register.sh.
  #
  # It waits until proxy-nginx answers /v1/models, then once a minute sends a
  # 1-token completion to the model container. On success it (re-)registers
  # this host's endpoint and the model domain with the model proxy. After
  # MAX_RETRIES consecutive failures while registered it unregisters the
  # endpoint and exits 1, so the container's restart policy starts it afresh.
  # On SIGTERM/SIGINT it unregisters the endpoint and exits 0.
  #
  # `$$` is Compose's escape for a literal `$`. Every shell variable in the
  # script is written with a doubled dollar sign so Compose does not
  # interpolate it.
  registrar_script:
    content: |
      #!/bin/sh
      PROXY_URL="https://completions.near.ai"
      TOKEN="$${MODEL_PROXY_TOKEN}"
      ENDPOINT="$${HOST_IP}:$${HTTP_PORT}"
      REGISTERED=false
      FAILURE_COUNT=0
      MAX_RETRIES=3

      # Interruptible sleep. The shell only runs a trap handler once the
      # current foreground command has finished, so a plain sleep would delay
      # the cleanup trap by up to a minute, well beyond a typical container
      # stop timeout. Sleeping in the background and blocking in the wait
      # builtin lets a trapped signal be handled immediately.
      nap() {
        sleep "$$1" &
        wait "$$!"
      }

      register_endpoint() {
        echo "Registering endpoint $$1 with routing port $$2"
        curl -sS --max-time 10 -X POST "$$PROXY_URL/register/endpoint" \
          -H "Authorization: Bearer $$TOKEN" \
          -H "Content-Type: application/json" \
          -d "{\"endpoint\":\"$$1\",\"routing_port\":$$2}"
      }

      unregister_endpoint() {
        echo "Unregistering endpoint $$1"
        curl -sS --max-time 10 -X POST "$$PROXY_URL/unregister/endpoint" \
          -H "Authorization: Bearer $$TOKEN" \
          -H "Content-Type: application/json" \
          -d "{\"endpoint\":\"$$1\"}"
      }

      register_model() {
        curl -sS --max-time 10 -X POST "$$PROXY_URL/register/model" \
          -H "Authorization: Bearer $$TOKEN" \
          -H "Content-Type: application/json" \
          -d "{\"model\":\"$$1\",\"domain\":\"$$2\"}"
      }

      # Signal handler: take this endpoint out of rotation before exiting.
      # A signal that arrives while a curl call is in flight is still only
      # handled after that call returns.
      cleanup() {
        echo "SIGTERM received, unregistering $$ENDPOINT"
        unregister_endpoint "$$ENDPOINT"
        REGISTERED=false
        exit 0
      }
      trap cleanup TERM INT

      # 1-token completion health check directly to model container (no auth needed)
      check_inference() {
        echo "Performing health check on model endpoint..."
        curl -sSf --max-time 45 -X POST "http://model-sg-glm51:8000/v1/chat/completions" \
          -H "Content-Type: application/json" \
          -d '{"model":"zai-org/GLM-5.1-FP8","messages":[{"role":"user","content":"hi"}],"max_tokens":1}'
      }

      echo "Waiting for model to be ready..."
      until curl -sf http://proxy-nginx:80/v1/models > /dev/null 2>&1; do nap 30; done
      echo "Model ready, starting registration loop"

      while true; do
        if check_inference; then
          FAILURE_COUNT=0
          # Re-register on every healthy pass; only the first one is logged.
          register_endpoint "$$ENDPOINT" "$${TLS_PORT}"
          register_model "zai-org/GLM-5.1-FP8" "glm-5-1.completions.near.ai"
          if [ "$$REGISTERED" = false ]; then
            echo "[GLM-5.1] Registered zai-org/GLM-5.1-FP8 at $$ENDPOINT"
          fi
          REGISTERED=true
        else
          FAILURE_COUNT=$$((FAILURE_COUNT + 1))
          echo "[GLM-5.1] Health check failed ($$FAILURE_COUNT/$$MAX_RETRIES)"
          if [ "$$REGISTERED" = true ] && [ "$$FAILURE_COUNT" -ge "$$MAX_RETRIES" ]; then
            echo "[GLM-5.1] Health check failed and Retry limit reached, unregistering $$ENDPOINT"
            unregister_endpoint "$$ENDPOINT"
            REGISTERED=false
            echo "[GLM-5.1] Exiting to refresh DNS (container will auto-restart)"
            exit 1
          fi
        fi
        nap 60
      done
  # nginx configuration mounted into proxy-nginx as
  # /etc/nginx/conf.d/default.conf.
  #
  # It defines a JSON access-log format and two servers that both forward to
  # proxy-glm51:8000 with response buffering off and a 3600s read timeout:
  #   - port 80:  plain HTTP, default server
  #   - port 443: TLS + HTTP/2 for glm-5-1.completions.near.ai, using the
  #               completions.near.ai certificate from /etc/letsencrypt
  #
  # `$$` is Compose's escape for a literal `$`, which is why every nginx
  # variable below is written with a doubled dollar sign.
  # NOTE(review): newer nginx releases deprecate the `http2` parameter of
  # `listen` in favour of a separate `http2 on;` directive. Confirm against
  # the nginx version in the pinned image before changing it.
  nginx_conf:
    content: |
      log_format json_combined escape=json
        '{'  '"time":"$$time_iso8601"'
        ',"request_id":"$$http_x_request_id"'
        ',"org_id":"$$http_x_org_id"'
        ',"workspace_id":"$$http_x_workspace_id"'
        ',"host":"$$host"'
        ',"method":"$$request_method"'
        ',"uri":"$$uri"'
        ',"status":$$status'
        ',"request_length":$$request_length'
        ',"bytes_sent":$$bytes_sent'
        ',"request_time":$$request_time'
        ',"upstream_addr":"$$upstream_addr"'
        '}';

      server {
          listen 80 default_server;

          access_log /var/log/nginx/access.log json_combined;
          client_max_body_size 100m;
          client_body_buffer_size 1m;

          location / {
              proxy_pass http://proxy-glm51:8000;
              proxy_http_version 1.1;
              proxy_set_header Host $$host;
              proxy_set_header X-Real-IP $$remote_addr;
              proxy_set_header X-Forwarded-For $$proxy_add_x_forwarded_for;
              proxy_set_header X-Forwarded-Proto $$scheme;
              proxy_set_header Connection '';
              proxy_buffering off;
              proxy_cache off;
              proxy_read_timeout 3600s;
          }
      }

      server {
          listen 443 ssl http2;
          server_name glm-5-1.completions.near.ai;

          ssl_certificate /etc/letsencrypt/live/completions.near.ai/fullchain.pem;
          ssl_certificate_key /etc/letsencrypt/live/completions.near.ai/privkey.pem;
          ssl_protocols TLSv1.2 TLSv1.3;

          # Keep H2 connections from cloud-api alive across long idle gaps so the
          # bucket-pinned TCP connection survives between chats. Without this,
          # nginx defaults (75s idle, 1000 req/conn) close the connection and the
          # next request opens a new TCP via model-proxy's L4 LB → may land on a
          # different backend → signature 404. Pairs with cloud-api H2 keepalive
          # PINGs (http2_keep_alive_while_idle).
          keepalive_timeout 1h;
          keepalive_requests 1000000;

          access_log /var/log/nginx/access.log json_combined;
          client_max_body_size 100m;
          client_body_buffer_size 1m;

          location / {
              proxy_pass http://proxy-glm51:8000;
              proxy_http_version 1.1;
              proxy_set_header Host $$host;
              proxy_set_header X-Real-IP $$remote_addr;
              proxy_set_header X-Forwarded-For $$proxy_add_x_forwarded_for;
              proxy_set_header X-Forwarded-Proto $$scheme;
              proxy_set_header Connection '';
              proxy_buffering off;
              proxy_cache off;
              proxy_read_timeout 3600s;
          }
      }