@@ -27,7 +27,7 @@ x-vllm-proxy-common: &vllm-proxy-common
2727 restart : unless-stopped
2828 logging : *logging-conf
2929
30- x-qwen35-122b-common : &qwen35-122b-common
30+ x-sg-qwen35-122b-common : &sg-qwen35-122b-common
3131 << : *nvidia
3232 init : true
3333 # SGLang v0.5.12 (cu129) — migrated off vLLM 2026-05-22.
@@ -92,9 +92,9 @@ services:
9292 - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
9393 logging : *logging-conf
9494
95- nginx :
95+ proxy-nginx :
9696 image : nginx@sha256:1d13701a5f9f3fb01aaa88cef2344d65b6b5bf6b7d9fa4cf0dca557a8d7702ba
97- container_name : nginx
97+ container_name : proxy-nginx
9898 command : /bin/sh -c 'while :; do sleep 6h; nginx -s reload; done & nginx -g "daemon off;"'
9999 ports :
100100 - " 8000:80"
@@ -126,9 +126,9 @@ services:
126126
127127 # --- Single proxy for both backends ---
128128
129- vllm-proxy-qwen35 :
129+ proxy-qwen35 :
130130 << : *vllm-proxy-common
131- container_name : vllm-proxy-qwen35
131+ container_name : proxy-qwen35
132132 environment :
133133 - NVIDIA_VISIBLE_DEVICES=all
134134 - CLOUD_API_URL=https://cloud-api.near.ai
@@ -138,7 +138,7 @@ services:
138138 - MODEL_NAME=Qwen/Qwen3.5-122B-A10B
139139 - OHTTP_ENABLED=true
140140 - TOKEN=${PROXY_TOKEN}
141- - VLLM_BACKEND_URLS=http://qwen35-1:8000,http://qwen35-2:8000
141+ - VLLM_BACKEND_URLS=http://model-sg-qwen35-122b-1:8000,http://model-sg-qwen35-122b-2:8000
142142 - VLLM_PROXY_MAX_REQUEST_SIZE=104857600
143143 - TLS_CERT_PATH=/etc/letsencrypt/live/completions.near.ai/fullchain.pem
144144 - USE_NV_ATTESTATION_SDK=true
@@ -147,9 +147,9 @@ services:
147147
148148 # --- Qwen3.5-122B-A10B instance 1 (GPUs 0-3) ---
149149
150- qwen35-1 :
151- << : *qwen35-122b-common
152- container_name : qwen35-1
150+ model-sg-qwen35-122b-1 :
151+ << : *sg-qwen35-122b-common
152+ container_name : model-sg-qwen35-122b-1
153153 depends_on :
154154 model-downloader :
155155 condition : service_completed_successfully
@@ -164,13 +164,13 @@ services:
164164 com.datadoghq.ad.check_names : ' ["openmetrics"]'
165165 com.datadoghq.ad.init_configs : " [{}]"
166166 com.datadoghq.ad.logs : ' [{"source": "sglang", "service": "sglang", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000","instance:1"]}]'
167- com.datadoghq.ad.instances : ' [{"openmetrics_endpoint":"http://qwen35-1:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-1", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000"]}]'
167+ com.datadoghq.ad.instances : ' [{"openmetrics_endpoint":"http://model-sg-qwen35-122b-1:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-1", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000"]}]'
168168
169169 # --- Qwen3.5-122B-A10B instance 2 (GPUs 4-7) ---
170170
171- qwen35-2 :
172- << : *qwen35-122b-common
173- container_name : qwen35-2
171+ model-sg-qwen35-122b-2 :
172+ << : *sg-qwen35-122b-common
173+ container_name : model-sg-qwen35-122b-2
174174 depends_on :
175175 model-downloader :
176176 condition : service_completed_successfully
@@ -185,7 +185,7 @@ services:
185185 com.datadoghq.ad.check_names : ' ["openmetrics"]'
186186 com.datadoghq.ad.init_configs : " [{}]"
187187 com.datadoghq.ad.logs : ' [{"source": "sglang", "service": "sglang", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8001","instance:2"]}]'
188- com.datadoghq.ad.instances : ' [{"openmetrics_endpoint":"http://qwen35-2:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-2", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8001"]}]'
188+ com.datadoghq.ad.instances : ' [{"openmetrics_endpoint":"http://model-sg-qwen35-122b-2:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-2", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8001"]}]'
189189
190190 dcgm-exporter :
191191 image : nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless
@@ -269,13 +269,13 @@ configs:
269269 # Health check directly on backend (no auth needed on raw sglang container)
270270 check_inference() {
271271 echo "Performing health check on backend..."
272- curl -sSf --max-time 45 -X POST "http://qwen35-1:8000/v1/chat/completions" \
272+ curl -sSf --max-time 45 -X POST "http://model-sg-qwen35-122b-1:8000/v1/chat/completions" \
273273 -H "Content-Type: application/json" \
274274 -d '{"model":"Qwen/Qwen3.5-122B-A10B","messages":[{"role":"user","content":"hi"}],"max_tokens":1}'
275275 }
276276
277277 echo "Waiting for model to be ready..."
278- until curl -sf http://nginx:80/v1/models > /dev/null 2>&1; do sleep 30; done
278+ until curl -sf http://proxy-nginx:80/v1/models > /dev/null 2>&1; do sleep 30; done
279279 echo "Model ready, starting registration loop"
280280
281281 while true; do
@@ -334,7 +334,7 @@ configs:
334334 # :80 — single proxy handles both backends
335335 server {
336336 listen 80 default_server;
337- location / { proxy_pass http://vllm-proxy-qwen35:8000; }
337+ location / { proxy_pass http://proxy-qwen35:8000; }
338338 }
339339
340340 ssl_certificate /etc/letsencrypt/live/completions.near.ai/fullchain.pem;
@@ -352,5 +352,5 @@ configs:
352352 # PINGs (http2_keep_alive_while_idle).
353353 keepalive_timeout 1h;
354354 keepalive_requests 1000000;
355- location / { proxy_pass http://vllm-proxy-qwen35:8000; }
355+ location / { proxy_pass http://proxy-qwen35:8000; }
356356 }
0 commit comments