Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions GLM-5.1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ services:
environment:
- HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}

nginx:
proxy-nginx:
image: nginx@sha256:1d13701a5f9f3fb01aaa88cef2344d65b6b5bf6b7d9fa4cf0dca557a8d7702ba
container_name: nginx
container_name: proxy-nginx
command: /bin/sh -c 'while :; do sleep 6h; nginx -s reload; done & nginx -g "daemon off;"'
ports:
- "8000:80"
Expand Down Expand Up @@ -91,20 +91,20 @@ services:
- MODEL_NAME=zai-org/GLM-5.1-FP8
- OHTTP_ENABLED=true
- TOKEN=${PROXY_TOKEN}
- VLLM_BASE_URL=http://glm51:8000
- VLLM_BASE_URL=http://model-sg-glm51:8000
- TLS_CERT_PATH=/etc/letsencrypt/live/completions.near.ai/fullchain.pem
- USE_NV_ATTESTATION_SDK=true
- WEB_CONTEXT_SEARCH_URL=${WEB_CONTEXT_SEARCH_URL}
- WEB_CONTEXT_SEARCH_API_KEY=${WEB_CONTEXT_SEARCH_API_KEY}

glm51:
model-sg-glm51:
<<: *nvidia
init: true
depends_on:
model-downloader:
condition: service_completed_successfully
image: lmsysorg/sglang:dev-cu12@sha256:aac6b242680daeb74d2ab1d85f70575357552d7d165d2e5d30eb362797db54a1
container_name: glm51
container_name: model-sg-glm51
command: >
sglang serve
--model-path zai-org/GLM-5.1-FP8
Expand Down Expand Up @@ -155,7 +155,7 @@ services:
com.datadoghq.ad.check_names: '["openmetrics"]'
com.datadoghq.ad.init_configs: "[{}]"
com.datadoghq.ad.logs: '[{"source": "sglang", "service": "sglang", "tags":["model:zai-org/GLM-5.1-FP8","ip:${HOST_IP}","port:8000"]}]'
com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://glm51:8000/metrics", "histogram_buckets_as_distributions": true, "metrics":["sglang:*"], "service": "glm-5.1", "tags":["model:zai-org/GLM-5.1-FP8","ip:${HOST_IP}","port:8000"]}]'
com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://model-sg-glm51:8000/metrics", "histogram_buckets_as_distributions": true, "metrics":["sglang:*"], "service": "glm-5.1", "tags":["model:zai-org/GLM-5.1-FP8","ip:${HOST_IP}","port:8000"]}]'

dcgm-exporter:
image: nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless
Expand Down Expand Up @@ -240,13 +240,13 @@ configs:
# 1-token completion health check directly to model container (no auth needed)
check_inference() {
echo "Performing health check on model endpoint..."
curl -sSf --max-time 45 -X POST "http://glm51:8000/v1/chat/completions" \
curl -sSf --max-time 45 -X POST "http://model-sg-glm51:8000/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{"model":"zai-org/GLM-5.1-FP8","messages":[{"role":"user","content":"hi"}],"max_tokens":1}'
}

echo "Waiting for model to be ready..."
until curl -sf http://nginx:80/v1/models > /dev/null 2>&1; do sleep 30; done
until curl -sf http://proxy-nginx:80/v1/models > /dev/null 2>&1; do sleep 30; done
echo "Model ready, starting registration loop"

while true; do
Expand Down
36 changes: 18 additions & 18 deletions Qwen3.5-122B.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ x-vllm-proxy-common: &vllm-proxy-common
restart: unless-stopped
logging: *logging-conf

x-qwen35-122b-common: &qwen35-122b-common
x-sg-qwen35-122b-common: &sg-qwen35-122b-common
<<: *nvidia
init: true
# SGLang v0.5.12 (cu129) — migrated off vLLM 2026-05-22.
Expand Down Expand Up @@ -92,9 +92,9 @@ services:
- HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
logging: *logging-conf

nginx:
proxy-nginx:
image: nginx@sha256:1d13701a5f9f3fb01aaa88cef2344d65b6b5bf6b7d9fa4cf0dca557a8d7702ba
container_name: nginx
container_name: proxy-nginx
command: /bin/sh -c 'while :; do sleep 6h; nginx -s reload; done & nginx -g "daemon off;"'
ports:
- "8000:80"
Expand Down Expand Up @@ -126,9 +126,9 @@ services:

# --- Single proxy for both backends ---

vllm-proxy-qwen35:
proxy-qwen35:
<<: *vllm-proxy-common
container_name: vllm-proxy-qwen35
container_name: proxy-qwen35
environment:
- NVIDIA_VISIBLE_DEVICES=all
- CLOUD_API_URL=https://cloud-api.near.ai
Expand All @@ -137,7 +137,7 @@ services:
- MODEL_NAME=Qwen/Qwen3.5-122B-A10B
- OHTTP_ENABLED=true
- TOKEN=${PROXY_TOKEN}
- VLLM_BACKEND_URLS=http://qwen35-1:8000,http://qwen35-2:8000
- VLLM_BACKEND_URLS=http://model-sg-qwen35-122b-1:8000,http://model-sg-qwen35-122b-2:8000
- VLLM_PROXY_MAX_REQUEST_SIZE=104857600
- TLS_CERT_PATH=/etc/letsencrypt/live/completions.near.ai/fullchain.pem
- USE_NV_ATTESTATION_SDK=true
Expand All @@ -146,9 +146,9 @@ services:

# --- Qwen3.5-122B-A10B instance 1 (GPUs 0-3) ---

qwen35-1:
<<: *qwen35-122b-common
container_name: qwen35-1
model-sg-qwen35-122b-1:
<<: *sg-qwen35-122b-common
container_name: model-sg-qwen35-122b-1
depends_on:
model-downloader:
condition: service_completed_successfully
Expand All @@ -163,13 +163,13 @@ services:
com.datadoghq.ad.check_names: '["openmetrics"]'
com.datadoghq.ad.init_configs: "[{}]"
com.datadoghq.ad.logs: '[{"source": "sglang", "service": "sglang", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000","instance:1"]}]'
com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://qwen35-1:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-1", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000"]}]'
com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://model-sg-qwen35-122b-1:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-1", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8000"]}]'

# --- Qwen3.5-122B-A10B instance 2 (GPUs 4-7) ---

qwen35-2:
<<: *qwen35-122b-common
container_name: qwen35-2
model-sg-qwen35-122b-2:
<<: *sg-qwen35-122b-common
container_name: model-sg-qwen35-122b-2
depends_on:
model-downloader:
condition: service_completed_successfully
Expand All @@ -184,7 +184,7 @@ services:
com.datadoghq.ad.check_names: '["openmetrics"]'
com.datadoghq.ad.init_configs: "[{}]"
com.datadoghq.ad.logs: '[{"source": "sglang", "service": "sglang", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8001","instance:2"]}]'
com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://qwen35-2:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-2", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8001"]}]'
com.datadoghq.ad.instances: '[{"openmetrics_endpoint":"http://model-sg-qwen35-122b-2:8000/metrics", "metrics":["sglang:*"], "histogram_buckets_as_distributions": true, "service": "qwen35-2", "tags":["model:Qwen/Qwen3.5-122B-A10B","ip:${HOST_IP}","port:8001"]}]'

dcgm-exporter:
image: nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless
Expand Down Expand Up @@ -268,13 +268,13 @@ configs:
# Health check directly on backend (no auth needed on raw sglang container)
check_inference() {
echo "Performing health check on backend..."
curl -sSf --max-time 45 -X POST "http://qwen35-1:8000/v1/chat/completions" \
curl -sSf --max-time 45 -X POST "http://model-sg-qwen35-122b-1:8000/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{"model":"Qwen/Qwen3.5-122B-A10B","messages":[{"role":"user","content":"hi"}],"max_tokens":1}'
}

echo "Waiting for model to be ready..."
until curl -sf http://nginx:80/v1/models > /dev/null 2>&1; do sleep 30; done
until curl -sf http://proxy-nginx:80/v1/models > /dev/null 2>&1; do sleep 30; done
echo "Model ready, starting registration loop"

while true; do
Expand Down Expand Up @@ -333,7 +333,7 @@ configs:
# :80 — single proxy handles both backends
server {
listen 80 default_server;
location / { proxy_pass http://vllm-proxy-qwen35:8000; }
location / { proxy_pass http://proxy-qwen35:8000; }
}

ssl_certificate /etc/letsencrypt/live/completions.near.ai/fullchain.pem;
Expand All @@ -351,5 +351,5 @@ configs:
# PINGs (http2_keep_alive_while_idle).
keepalive_timeout 1h;
keepalive_requests 1000000;
location / { proxy_pass http://vllm-proxy-qwen35:8000; }
location / { proxy_pass http://proxy-qwen35:8000; }
}
Loading
Loading