cvm-compose-files/Qwen3.5-122B.yaml at main · nearai/cvm-compose-files · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
# Shared logging settings, aliased by every service via *logging-conf.
# json-file driver with rotation: ten files of at most 100 MB each, and the
# com.datadoghq.ad.logs container label copied into each log record.
x-logging-conf: &logging-conf
  driver: json-file
  options:
    labels: com.datadoghq.ad.logs
    # Quoted so the value stays a string rather than an integer.
    max-file: '10'
    max-size: 100m

# GPU container baseline merged (via `<<: *nvidia`) into the proxy and model
# service templates: NVIDIA runtime, host IPC namespace, unlimited locked
# memory and a raised open-file limit.
x-nvidia: &nvidia
  ulimits:
    nofile: {soft: 65535, hard: 65535}
    memlock: -1
  ipc: host
  runtime: nvidia

# Template for the proxy container that sits in front of the model backends.
# Consumed by proxy-qwen35 through `<<: *vllm-proxy-common`.
x-vllm-proxy-common: &vllm-proxy-common
  # Merge the NVIDIA baseline first; keys written explicitly below take
  # precedence over merged ones regardless of position.
  <<: *nvidia
  image: nearaidev/vllm-proxy-rs@sha256:59e42dd68faa15eb0c23521029a2fc3d80d86a4143f9f766542357918be33a8c
  restart: unless-stopped
  logging: *logging-conf
  privileged: true
  user: root
  # Lets the container resolve `compose-manager` to the Docker host gateway.
  extra_hosts: ['compose-manager:host-gateway']
  volumes:
    - '/var/run/dstack.sock:/var/run/dstack.sock'
    # Certificates are mounted read-only.
    - 'certs:/etc/letsencrypt:ro'

# Shared definition of an SGLang server for Qwen/Qwen3.5-122B-A10B. Both
# model-sg-qwen35-122b-1 and model-sg-qwen35-122b-2 merge this template and add
# only their container name, GPU reservation and Datadog labels.
x-sg-qwen35-122b-common: &sg-qwen35-122b-common
  <<: *nvidia
  init: true
  # SGLang v0.5.12 (cu129) — migrated off vLLM 2026-05-22.
  # Context dropped from 1.01M (yarn rope override) to native 262144.
  # EAGLE spec decoding enabled (Spec V2 default since 0.5.11).
  image: lmsysorg/sglang:v0.5.12-cu129@sha256:9e02c8e1fe2790a1c445bd5f6814305fe43639a4adb01c8ad1e8e21e750bf581
  # Folded scalar: the lines below are joined with spaces into one command
  # line. Do not put `#` comments inside it; they would become arguments.
  # --revision matches the revision fetched by model-downloader, and --tp 4
  # matches the four device_ids each instance reserves.
  command: >
      sglang serve
      --model-path Qwen/Qwen3.5-122B-A10B
      --revision dc4d348443bc740c68e2d77492492c11606384d5
      --tp 4
      --reasoning-parser qwen3
      --tool-call-parser qwen3_coder
      --speculative-algorithm EAGLE
      --speculative-num-steps 3
      --speculative-eagle-topk 1
      --speculative-num-draft-tokens 4
      --mamba-scheduler-strategy extra_buffer
      --kv-cache-dtype fp8_e4m3
      --mem-fraction-static 0.88
      --context-length 262144
      --num-continuous-decode-steps 5
      --model-loader-extra-config '{"enable_multithread_load": "true", "num_threads": 64}'
      --enable-mixed-chunk
      --chunked-prefill-size 16384
      --port 8000
      --host 0.0.0.0
      --enable-cache-report
      --enable-metrics
      --trust-remote-code
      --log-requests-level 0
      --served-model-name Qwen/Qwen3.5-122B-A10B
  # huggingface_cache is the same named volume model-downloader writes to.
  # kernel_cache presumably persists compiled DeepGEMM kernels across
  # restarts — TODO confirm.
  volumes:
    - huggingface_cache:/root/.cache/huggingface
    - kernel_cache:/root/.cache/deep_gemm
  # HF_HUB_OFFLINE falls back to 0 when the variable is unset or empty.
  environment:
    - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
    - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    - OPENBLAS_L2_SIZE=2097152
    - NCCL_DEBUG=WARN
    - SGLANG_ENABLE_SPEC_V2=1
  restart: unless-stopped
  # Give the server up to five minutes to exit after the stop signal before
  # Compose force-kills it.
  stop_grace_period: 5m
  logging: *logging-conf

services:
  # One-shot job that pre-fetches the pinned model revision into the shared
  # huggingface_cache volume. The model services wait for it to exit
  # successfully before starting.
  model-downloader:
    container_name: model-downloader
    image: ghcr.io/astral-sh/uv:python3.11-bookworm-slim@sha256:4f5d923c9dcea037f57bda425dd209f3ec643da2f0b74227f68d09dab0b3bb36
    # Quoted: a bare `no` would be read as a boolean by YAML 1.1 parsers.
    restart: "no"
    entrypoint:
      - sh
      - "-c"
    command:
      - |
        set -e
        echo "Downloading Qwen/Qwen3.5-122B-A10B..."
        uvx --from 'huggingface_hub[hf_xet]' hf download Qwen/Qwen3.5-122B-A10B --revision dc4d348443bc740c68e2d77492492c11606384d5
        echo "Download complete."
    environment:
      - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - huggingface_cache:/root/.cache/huggingface
    logging: *logging-conf

  # Public entry point: publishes host ports 8000 (HTTP) and 8444 (TLS) and
  # forwards both to proxy-qwen35 using the inline nginx_conf config.
  proxy-nginx:
    image: nginx@sha256:1d13701a5f9f3fb01aaa88cef2344d65b6b5bf6b7d9fa4cf0dca557a8d7702ba
    container_name: proxy-nginx
    # A background loop runs `nginx -s reload` every 6h (presumably to pick up
    # renewed certificates from the read-only certs volume). The foreground
    # nginx is started with `exec` so it replaces the wrapper shell as PID 1
    # and receives the container's stop signal directly. Without `exec` the
    # signal is delivered to the shell, which does not forward it, so nginx
    # never shuts down cleanly and the container is force-killed once the stop
    # timeout expires.
    command: /bin/sh -c 'while :; do sleep 6h; nginx -s reload; done & exec nginx -g "daemon off;"'
    ports:
      - "8000:80"
      - "8444:443"
    volumes:
      - certs:/etc/letsencrypt:ro
    configs:
      - source: nginx_conf
        target: /etc/nginx/conf.d/default.conf
        mode: 0644
    restart: unless-stopped
    logging: *logging-conf

  # Sidecar that runs the inline registrar_script: it health-checks the model
  # backend and registers/unregisters this host with the upstream model proxy.
  model-proxy-registrar:
    container_name: model-proxy-registrar
    image: curlimages/curl@sha256:d94d07ba9e7d6de898b6d96c1a072f6f8266c687af78a74f380087a0addf5d17
    # The script deliberately exits non-zero after repeated health-check
    # failures and relies on this policy to be started again.
    restart: unless-stopped
    logging: *logging-conf
    entrypoint:
      - sh
      - /register.sh
    configs:
      - target: /register.sh
        source: registrar_script
        mode: 0755
    # HTTP_PORT and TLS_PORT default to the host ports published by proxy-nginx.
    environment:
      - HOST_IP=${HOST_IP}
      - HTTP_PORT=${HTTP_PORT:-8000}
      - TLS_PORT=${TLS_PORT:-8444}
      - MODEL_PROXY_TOKEN=${MODEL_PROXY_TOKEN}

  # --- Single proxy for both backends ---

  # Built from the vllm-proxy-common template. nginx forwards all traffic
  # here on port 8000, and VLLM_BACKEND_URLS lists both SGLang instances by
  # their Compose service names.
  proxy-qwen35:
    <<: *vllm-proxy-common
    container_name: proxy-qwen35
    # TLS_CERT_PATH points into the certs volume that the common template
    # mounts read-only at /etc/letsencrypt.
    # VLLM_PROXY_MAX_REQUEST_SIZE is 100 MiB (104857600 bytes), the same limit
    # as client_max_body_size in nginx_conf.
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - CLOUD_API_URL=https://cloud-api.near.ai
      - CLOUD_API_USAGE_TOKEN=${CLOUD_API_USAGE_TOKEN}
      - COMPOSE_MANAGER_URL=http://compose-manager:8080
      - LOG_FORMAT=json
      - MODEL_NAME=Qwen/Qwen3.5-122B-A10B
      - OHTTP_ENABLED=true
      - TOKEN=${PROXY_TOKEN}
      - VLLM_BACKEND_URLS=http://model-sg-qwen35-122b-1:8000,http://model-sg-qwen35-122b-2:8000
      - VLLM_PROXY_MAX_REQUEST_SIZE=104857600
      - TLS_CERT_PATH=/etc/letsencrypt/live/completions.near.ai/fullchain.pem
      - USE_NV_ATTESTATION_SDK=true
    # Datadog autodiscovery: the label value is a JSON document, and Compose
    # substitutes ${HOST_IP} before the label is applied.
    labels:
      com.datadoghq.ad.logs: '[{"source": "vllm-proxy", "service": "vllm-proxy", "tags": ["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000"]}]'

  # --- Qwen3.5-122B-A10B instance 1 (GPUs 0-3) ---

  # First SGLang replica; everything except the name, GPU reservation and
  # Datadog labels comes from the sg-qwen35-122b-common template.
  model-sg-qwen35-122b-1:
    <<: *sg-qwen35-122b-common
    container_name: model-sg-qwen35-122b-1
    # Start only after the one-shot downloader has exited with status 0.
    depends_on:
      model-downloader:
        condition: service_completed_successfully
    # Reserve four specific GPUs by id; instance 2 reserves the other four.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0","1","2","3"]
              capabilities: [gpu]
    # Datadog autodiscovery labels (JSON in strings): log tagging plus an
    # openmetrics check that scrapes this container's /metrics endpoint.
    labels:
      com.datadoghq.ad.check_names: '["openmetrics"]'
      com.datadoghq.ad.init_configs: "[{}]"
      com.datadoghq.ad.logs: '[{"source": "sglang", "service": "sglang", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000","instance:1"]}]'
      com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://model-sg-qwen35-122b-1:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-1", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000"]}]'

  # --- Qwen3.5-122B-A10B instance 2 (GPUs 4-7) ---

  # Second SGLang replica; identical to instance 1 apart from its name, GPU
  # ids and Datadog tags.
  model-sg-qwen35-122b-2:
    <<: *sg-qwen35-122b-common
    container_name: model-sg-qwen35-122b-2
    # Start only after the one-shot downloader has exited with status 0.
    depends_on:
      model-downloader:
        condition: service_completed_successfully
    # Reserve the second set of four GPUs by id.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["4","5","6","7"]
              capabilities: [gpu]
    # NOTE(review): the tags below say port:8001, but this container listens on
    # 8000 (see the openmetrics_endpoint) and no service publishes 8001. It
    # looks like a tag kept to tell the two replicas apart — confirm before
    # changing, since dashboards may filter on it.
    labels:
      com.datadoghq.ad.check_names: '["openmetrics"]'
      com.datadoghq.ad.init_configs: "[{}]"
      com.datadoghq.ad.logs: '[{"source": "sglang", "service": "sglang", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8001","instance:2"]}]'
      com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://model-sg-qwen35-122b-2:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-2", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8001"]}]'

  # GPU metrics exporter, published on host port 9400 and scraped by the
  # Datadog dcgm check configured in the labels below.
  # NOTE(review): this is the only image in the file pinned by tag rather than
  # by sha256 digest — consider pinning it for consistency.
  dcgm-exporter:
    image: nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless
    container_name: dcgm-exporter
    runtime: nvidia
    cap_add:
      - SYS_ADMIN
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    ports:
      - "9400:9400"
    # Unlike the model services, this container reserves every GPU.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped
    logging: *logging-conf
    # %%host%% is a Datadog autodiscovery template variable and is passed
    # through untouched; only ${HOST_IP} is substituted by Compose.
    labels:
      com.datadoghq.ad.check_names: '["dcgm"]'
      com.datadoghq.ad.init_configs: "[{}]"
      com.datadoghq.ad.instances: '[{"openmetrics_endpoint": "http://%%host%%:9400/metrics", "tags":["model:Qwen/Qwen3.5-122B-A10B","deployment:Qwen3.5-122B","ip:${HOST_IP}"]}]'
      com.datadoghq.ad.logs: '[{"source": "dcgm-exporter", "service": "dcgm-exporter", "tags":["model:Qwen/Qwen3.5-122B-A10B","deployment:Qwen3.5-122B","ip:${HOST_IP}"]}]'

# Attach every service to the pre-existing dstack_default network rather than
# letting Compose create a project-scoped one.
networks:
  default:
    name: dstack_default
    external: true

# Named volumes. `certs` must already exist outside this project; the two
# caches are created and managed by Compose with default settings.
volumes:
  certs:
    name: certs
    external: true
  kernel_cache:
  huggingface_cache:

configs:
  # Shell script mounted at /register.sh in model-proxy-registrar. It waits
  # for the stack to answer, then every 60s health-checks the backend and
  # (re-)registers this host with the upstream proxy. After MAX_RETRIES
  # consecutive failures while registered, it unregisters and exits so the
  # container restarts.
  # Compose interpolates this text before writing it, so every shell dollar
  # sign is doubled to reach the script as a single one.
  registrar_script:
    content: |
      #!/bin/sh
      PROXY_URL="https://completions.near.ai"
      TOKEN="$${MODEL_PROXY_TOKEN}"
      ENDPOINT="$${HOST_IP}:$${HTTP_PORT}"
      REGISTERED=false
      FAILURE_COUNT=0
      MAX_RETRIES=3

      # Sleep for the given number of seconds without blocking signal
      # handling. A trap cannot run while a foreground command is executing,
      # so a plain "sleep 60" would postpone the TERM handler for up to a
      # minute, longer than the default container stop timeout, and the
      # endpoint would never be unregistered. Waiting on a background sleep
      # is interrupted as soon as a trapped signal arrives.
      interruptible_sleep() {
        sleep "$$1" &
        wait $$!
      }

      register_endpoint() {
        echo "Registering endpoint $$1 with routing port $$2"
        curl -sS --max-time 10 -X POST "$$PROXY_URL/register/endpoint" \
          -H "Authorization: Bearer $$TOKEN" \
          -H "Content-Type: application/json" \
          -d "{\"endpoint\":\"$$1\",\"routing_port\":$$2}"
      }

      unregister_endpoint() {
        echo "Unregistering endpoint $$1"
        curl -sS --max-time 10 -X POST "$$PROXY_URL/unregister/endpoint" \
          -H "Authorization: Bearer $$TOKEN" \
          -H "Content-Type: application/json" \
          -d "{\"endpoint\":\"$$1\"}"
      }

      register_model() {
        curl -sS --max-time 10 -X POST "$$PROXY_URL/register/model" \
          -H "Authorization: Bearer $$TOKEN" \
          -H "Content-Type: application/json" \
          -d "{\"model\":\"$$1\",\"domain\":\"$$2\"}"
      }

      # Signal handler: unregister this host and exit cleanly.
      # NOTE: a signal that arrives while a curl call is in flight is still
      # handled only after that call returns (bounded by its --max-time).
      cleanup() {
        echo "SIGTERM received, unregistering $$ENDPOINT"
        unregister_endpoint "$$ENDPOINT"
        REGISTERED=false
        exit 0
      }
      trap cleanup TERM INT

      # Health check directly on backend (no auth needed on raw sglang container)
      # NOTE(review): only instance 1 is probed; instance 2 failing on its own
      # does not affect registration. Confirm this is intended.
      check_inference() {
        echo "Performing health check on backend..."
        curl -sSf --max-time 45 -X POST "http://model-sg-qwen35-122b-1:8000/v1/chat/completions" \
          -H "Content-Type: application/json" \
          -d '{"model":"Qwen/Qwen3.5-122B-A10B","messages":[{"role":"user","content":"hi"}],"max_tokens":1}'
      }

      echo "Waiting for model to be ready..."
      until curl -sf http://proxy-nginx:80/v1/models > /dev/null 2>&1; do interruptible_sleep 30; done
      echo "Model ready, starting registration loop"

      while true; do
        if check_inference; then
          FAILURE_COUNT=0
          # Re-register on every healthy cycle; only log the first success.
          register_endpoint "$$ENDPOINT" "$${TLS_PORT}"
          register_model "Qwen/Qwen3.5-122B-A10B" "qwen35-122b.completions.near.ai"
          if [ "$$REGISTERED" = false ]; then
            echo "[Qwen3.5-122B] Registered Qwen/Qwen3.5-122B-A10B at $$ENDPOINT"
          fi
          REGISTERED=true
        else
          FAILURE_COUNT=$$((FAILURE_COUNT + 1))
          echo "[Qwen3.5-122B] Health check failed ($$FAILURE_COUNT/$$MAX_RETRIES)"
          if [ "$$REGISTERED" = true ] && [ "$$FAILURE_COUNT" -ge "$$MAX_RETRIES" ]; then
            echo "[Qwen3.5-122B] Health check failed and Retry limit reached, unregistering $$ENDPOINT"
            unregister_endpoint "$$ENDPOINT"
            REGISTERED=false
            echo "[Qwen3.5-122B] Exiting to refresh DNS (container will auto-restart)"
            exit 1
          fi
        fi
        interruptible_sleep 60
      done
  # nginx configuration mounted by proxy-nginx at
  # /etc/nginx/conf.d/default.conf. It defines a JSON access-log format and
  # two servers (plain HTTP on 80, TLS + HTTP/2 on 443), both forwarding to
  # proxy-qwen35:8000 with response buffering disabled and a one-hour read
  # timeout.
  # Compose interpolates this text, so each nginx variable is written with a
  # doubled dollar sign.
  nginx_conf:
    content: |
      log_format json_combined escape=json
        '{'  '"time":"$$time_iso8601"'
        ',"request_id":"$$http_x_request_id"'
        ',"org_id":"$$http_x_org_id"'
        ',"workspace_id":"$$http_x_workspace_id"'
        ',"host":"$$host"'
        ',"method":"$$request_method"'
        ',"uri":"$$uri"'
        ',"status":$$status'
        ',"request_length":$$request_length'
        ',"bytes_sent":$$bytes_sent'
        ',"request_time":$$request_time'
        ',"upstream_addr":"$$upstream_addr"'
        '}';

      access_log /var/log/nginx/access.log json_combined;

      proxy_http_version 1.1;
      proxy_set_header Host $$host;
      proxy_set_header X-Real-IP $$remote_addr;
      proxy_set_header X-Forwarded-For $$proxy_add_x_forwarded_for;
      proxy_set_header X-Forwarded-Proto $$scheme;
      proxy_set_header Connection '';
      proxy_buffering off;
      proxy_cache off;
      proxy_read_timeout 3600s;
      client_max_body_size 100m;
      client_body_buffer_size 1m;

      # :80 — single proxy handles both backends
      server {
        listen 80 default_server;
        location / { proxy_pass http://proxy-qwen35:8000; }
      }

      ssl_certificate /etc/letsencrypt/live/completions.near.ai/fullchain.pem;
      ssl_certificate_key /etc/letsencrypt/live/completions.near.ai/privkey.pem;
      ssl_protocols TLSv1.2 TLSv1.3;

      server {
        listen 443 ssl http2;
        server_name qwen35-122b.completions.near.ai;
        # Keep H2 connections from cloud-api alive across long idle gaps so the
        # bucket-pinned TCP connection survives between chats. Without this,
        # nginx defaults (75s idle, 1000 req/conn) close the connection and the
        # next request opens a new TCP via model-proxy's L4 LB → may land on a
        # different backend → signature 404. Pairs with cloud-api H2 keepalive
        # PINGs (http2_keep_alive_while_idle).
        keepalive_timeout 1h;
        keepalive_requests 1000000;
        location / { proxy_pass http://proxy-qwen35:8000; }
      }