Skip to content

Commit d92a32e

Browse files
committed
feat: add model-proxy registrar, fix vLLM 0.16 speculative config
Add model-proxy-registrar sidecar to all model configs for automatic endpoint/model registration with the proxy fleet. Remove prefill_token_shift and num_draft_tokens from Qwen3-30B speculative config — these params were removed in vLLM v0.16.0.
1 parent cde9be1 commit d92a32e

4 files changed

Lines changed: 200 additions & 1 deletion

File tree

DeepSeek-V3.1.yaml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,22 @@ services:
5959
restart: unless-stopped
6060
logging: *logging-conf
6161

62+
model-proxy-registrar:
  # Sidecar that runs the registrar_script config (mounted at /register.sh)
  # from a digest-pinned curl image.
  container_name: model-proxy-registrar
  image: curlimages/curl@sha256:d94d07ba9e7d6de898b6d96c1a072f6f8266c687af78a74f380087a0addf5d17
  restart: unless-stopped
  logging: *logging-conf
  entrypoint:
    - sh
    - /register.sh
  configs:
    - source: registrar_script
      target: /register.sh
      mode: 0755
  # Values read by the registration script; the two ports fall back to
  # 8000 and 8444 when the host environment does not define them.
  environment:
    - "HOST_IP=${HOST_IP}"
    - "HTTP_PORT=${HTTP_PORT:-8000}"
    - "TLS_PORT=${TLS_PORT:-8444}"
    - "MODEL_PROXY_TOKEN=${MODEL_PROXY_TOKEN}"
77+
6278
vllm-proxy-deepseek:
6379
<<: *vllm-proxy-common
6480
container_name: vllm-proxy-deepseek
@@ -183,6 +199,36 @@ configs:
183199
proxy_read_timeout 300s;
184200
}
185201
}
202+
registrar_script:
  content: |
    #!/bin/sh
    # Announce this host to the model proxy: once the local model answers,
    # register the endpoint and the model-to-domain mapping, then repeat
    # every 300 seconds.
    # NOTE: Compose interpolates this text, so every dollar sign is doubled to
    # reach the shell as a single one. Keep bare dollar signs out of comments.
    PROXY_URL="https://completions.near.ai"
    TOKEN="$${MODEL_PROXY_TOKEN}"

    # Without a host address the endpoint would be registered as ":<port>",
    # so stop here with a clear message instead.
    : "$${HOST_IP:?HOST_IP must be set}"

    # Register an inference endpoint with the proxy.
    # Arguments: 1 - endpoint as "host:port", 2 - routing port (sent unquoted).
    # Returns:   curl's exit status. --fail turns HTTP 4xx/5xx into a failure
    #            and the timeouts keep a stalled connection from blocking
    #            the loop.
    register_endpoint() {
      curl -sS --fail --connect-timeout 10 --max-time 30 \
        -X POST "$$PROXY_URL/register/endpoint" \
        -H "Authorization: Bearer $$TOKEN" \
        -H "Content-Type: application/json" \
        -d "{\"endpoint\":\"$$1\",\"routing_port\":$$2}"
    }

    # Register a model-to-domain mapping with the proxy.
    # Arguments: 1 - model name, 2 - domain serving that model.
    # Returns:   curl's exit status (non-zero on network error or HTTP >= 400).
    register_model() {
      curl -sS --fail --connect-timeout 10 --max-time 30 \
        -X POST "$$PROXY_URL/register/model" \
        -H "Authorization: Bearer $$TOKEN" \
        -H "Content-Type: application/json" \
        -d "{\"model\":\"$$1\",\"domain\":\"$$2\"}"
    }

    echo "Waiting for model to be ready..."
    until curl -sf --max-time 10 http://nginx:80/v1/models > /dev/null 2>&1; do sleep 30; done
    echo "Model ready, starting registration loop"

    while true; do
      # Best effort: both requests are always attempted; a failed cycle is
      # reported and retried on the next pass rather than ending the script.
      ok=1
      register_endpoint "$${HOST_IP}:$${HTTP_PORT}" "$${TLS_PORT}" || ok=0
      register_model "deepseek-ai/DeepSeek-V3.1" "deepseek-v31.completions.near.ai" || ok=0
      if [ "$$ok" -eq 1 ]; then
        echo "Registered deepseek-ai/DeepSeek-V3.1 at $${HOST_IP}:$${HTTP_PORT}"
      else
        echo "Registration failed for deepseek-ai/DeepSeek-V3.1 at $${HOST_IP}:$${HTTP_PORT}; will retry" >&2
      fi
      sleep 300
    done
186232
chat_template_deepseek:
187233
content: |
188234
{% if not add_generation_prompt is defined %}

GLM-5.yaml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,22 @@ services:
5151
restart: unless-stopped
5252
logging: *logging-conf
5353

54+
model-proxy-registrar:
  # Sidecar that runs the registrar_script config (mounted at /register.sh)
  # from a digest-pinned curl image.
  container_name: model-proxy-registrar
  image: curlimages/curl@sha256:d94d07ba9e7d6de898b6d96c1a072f6f8266c687af78a74f380087a0addf5d17
  restart: unless-stopped
  logging: *logging-conf
  entrypoint:
    - sh
    - /register.sh
  configs:
    - source: registrar_script
      target: /register.sh
      mode: 0755
  # Values read by the registration script; the two ports fall back to
  # 8000 and 8444 when the host environment does not define them.
  environment:
    - "HOST_IP=${HOST_IP}"
    - "HTTP_PORT=${HTTP_PORT:-8000}"
    - "TLS_PORT=${TLS_PORT:-8444}"
    - "MODEL_PROXY_TOKEN=${MODEL_PROXY_TOKEN}"
69+
5470
proxy-glm:
5571
<<: *vllm-proxy-common
5672
container_name: proxy-glm
@@ -122,6 +138,36 @@ volumes:
122138

123139

124140
configs:
141+
registrar_script:
  content: |
    #!/bin/sh
    # Announce this host to the model proxy: once the local model answers,
    # register the endpoint and the model-to-domain mapping, then repeat
    # every 300 seconds.
    # NOTE: Compose interpolates this text, so every dollar sign is doubled to
    # reach the shell as a single one. Keep bare dollar signs out of comments.
    PROXY_URL="https://completions.near.ai"
    TOKEN="$${MODEL_PROXY_TOKEN}"

    # Without a host address the endpoint would be registered as ":<port>",
    # so stop here with a clear message instead.
    : "$${HOST_IP:?HOST_IP must be set}"

    # Register an inference endpoint with the proxy.
    # Arguments: 1 - endpoint as "host:port", 2 - routing port (sent unquoted).
    # Returns:   curl's exit status. --fail turns HTTP 4xx/5xx into a failure
    #            and the timeouts keep a stalled connection from blocking
    #            the loop.
    register_endpoint() {
      curl -sS --fail --connect-timeout 10 --max-time 30 \
        -X POST "$$PROXY_URL/register/endpoint" \
        -H "Authorization: Bearer $$TOKEN" \
        -H "Content-Type: application/json" \
        -d "{\"endpoint\":\"$$1\",\"routing_port\":$$2}"
    }

    # Register a model-to-domain mapping with the proxy.
    # Arguments: 1 - model name, 2 - domain serving that model.
    # Returns:   curl's exit status (non-zero on network error or HTTP >= 400).
    register_model() {
      curl -sS --fail --connect-timeout 10 --max-time 30 \
        -X POST "$$PROXY_URL/register/model" \
        -H "Authorization: Bearer $$TOKEN" \
        -H "Content-Type: application/json" \
        -d "{\"model\":\"$$1\",\"domain\":\"$$2\"}"
    }

    echo "Waiting for model to be ready..."
    until curl -sf --max-time 10 http://nginx:80/v1/models > /dev/null 2>&1; do sleep 30; done
    echo "Model ready, starting registration loop"

    while true; do
      # Best effort: both requests are always attempted; a failed cycle is
      # reported and retried on the next pass rather than ending the script.
      ok=1
      register_endpoint "$${HOST_IP}:$${HTTP_PORT}" "$${TLS_PORT}" || ok=0
      register_model "zai-org/GLM-5-FP8" "glm-5.completions.near.ai" || ok=0
      if [ "$$ok" -eq 1 ]; then
        echo "Registered zai-org/GLM-5-FP8 at $${HOST_IP}:$${HTTP_PORT}"
      else
        echo "Registration failed for zai-org/GLM-5-FP8 at $${HOST_IP}:$${HTTP_PORT}; will retry" >&2
      fi
      sleep 300
    done
125171
nginx_conf:
126172
content: |
127173
server {

Qwen3.5-122B.yaml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,21 @@ services:
8282
restart: unless-stopped
8383
logging: *logging-conf
8484

85+
model-proxy-registrar:
  # Sidecar that runs the registrar_script config (mounted at /register.sh)
  # from a digest-pinned curl image.
  container_name: model-proxy-registrar
  image: curlimages/curl@sha256:d94d07ba9e7d6de898b6d96c1a072f6f8266c687af78a74f380087a0addf5d17
  restart: unless-stopped
  logging: *logging-conf
  entrypoint:
    - sh
    - /register.sh
  configs:
    - source: registrar_script
      target: /register.sh
      mode: 0755
  # Values read by the registration script; TLS_PORT falls back to 8444
  # when the host environment does not define it.
  environment:
    - "HOST_IP=${HOST_IP}"
    - "TLS_PORT=${TLS_PORT:-8444}"
    - "MODEL_PROXY_TOKEN=${MODEL_PROXY_TOKEN}"
99+
85100
# --- Qwen3.5-122B-A10B instance 1 (GPUs 0-3) ---
86101

87102
sglang-qwen35-122b-1:
@@ -152,6 +167,39 @@ volumes:
152167
name: certs
153168

154169
configs:
170+
registrar_script:
  content: |
    #!/bin/sh
    # Announce this host to the model proxy: once instance 1 answers on port
    # 8000, register the endpoint(s) and the model-to-domain mapping, then
    # repeat every 300 seconds. Port 8001 is registered only while it answers.
    # NOTE: Compose interpolates this text, so every dollar sign is doubled to
    # reach the shell as a single one. Keep bare dollar signs out of comments.
    PROXY_URL="https://completions.near.ai"
    TOKEN="$${MODEL_PROXY_TOKEN}"

    # Without a host address the endpoints would be registered as ":<port>",
    # so stop here with a clear message instead.
    : "$${HOST_IP:?HOST_IP must be set}"

    # Register an inference endpoint with the proxy.
    # Arguments: 1 - endpoint as "host:port", 2 - routing port (sent unquoted).
    # Returns:   curl's exit status. --fail turns HTTP 4xx/5xx into a failure
    #            and the timeouts keep a stalled connection from blocking
    #            the loop.
    register_endpoint() {
      curl -sS --fail --connect-timeout 10 --max-time 30 \
        -X POST "$$PROXY_URL/register/endpoint" \
        -H "Authorization: Bearer $$TOKEN" \
        -H "Content-Type: application/json" \
        -d "{\"endpoint\":\"$$1\",\"routing_port\":$$2}"
    }

    # Register a model-to-domain mapping with the proxy.
    # Arguments: 1 - model name, 2 - domain serving that model.
    # Returns:   curl's exit status (non-zero on network error or HTTP >= 400).
    register_model() {
      curl -sS --fail --connect-timeout 10 --max-time 30 \
        -X POST "$$PROXY_URL/register/model" \
        -H "Authorization: Bearer $$TOKEN" \
        -H "Content-Type: application/json" \
        -d "{\"model\":\"$$1\",\"domain\":\"$$2\"}"
    }

    echo "Waiting for models to be ready..."
    until curl -sf --max-time 10 http://nginx:8000/v1/models > /dev/null 2>&1; do sleep 30; done
    echo "Instance 1 ready, starting registration loop"

    while true; do
      # Best effort: every request is attempted; failures are reported and
      # retried on the next pass rather than ending the script.
      ok=1
      ports=""
      if register_endpoint "$${HOST_IP}:8000" "$${TLS_PORT}"; then
        ports="8000"
      else
        ok=0
      fi
      if curl -sf --max-time 10 http://nginx:8001/v1/models > /dev/null 2>&1; then
        if register_endpoint "$${HOST_IP}:8001" "$${TLS_PORT}"; then
          # Append with a comma only when a port is already listed.
          ports="$${ports:+$$ports,}8001"
        else
          ok=0
        fi
      fi
      register_model "Qwen/Qwen3.5-122B-A10B" "qwen35-122b.completions.near.ai" || ok=0
      if [ "$$ok" -eq 1 ]; then
        # Report only the ports that were actually registered this cycle.
        echo "Registered Qwen/Qwen3.5-122B-A10B at $${HOST_IP}:$$ports"
      else
        echo "Registration failed for Qwen/Qwen3.5-122B-A10B at $${HOST_IP} (registered ports: $${ports:-none}); will retry" >&2
      fi
      sleep 300
    done
155203
nginx_conf:
156204
content: |
157205
proxy_http_version 1.1;

small-models.yaml

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,20 @@ services:
148148
restart: unless-stopped
149149
logging: *logging-conf
150150

151+
model-proxy-registrar:
  # Sidecar that runs the registrar_script config (mounted at /register.sh)
  # from a digest-pinned curl image.
  container_name: model-proxy-registrar
  image: curlimages/curl@sha256:d94d07ba9e7d6de898b6d96c1a072f6f8266c687af78a74f380087a0addf5d17
  restart: unless-stopped
  logging: *logging-conf
  entrypoint:
    - sh
    - /register.sh
  configs:
    - source: registrar_script
      target: /register.sh
      mode: 0755
  # Values read by the registration script.
  environment:
    - "HOST_IP=${HOST_IP}"
    - "MODEL_PROXY_TOKEN=${MODEL_PROXY_TOKEN}"
164+
151165
# --- Qwen3-30B (GPUs 0-1) ---
152166

153167
vllm-proxy-qwen3-30b:
@@ -179,7 +193,7 @@ services:
179193
--load-format runai_streamer
180194
--dtype float16
181195
--model-loader-extra-config '{"distributed":true, "concurrency":48}'
182-
--speculative-config '{"method":"eagle3","model":"lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge-Nex","prefill_token_shift":false,"num_speculative_tokens":3,"draft_tensor_parallel_size":1, "num_draft_tokens":4}'
196+
--speculative-config '{"method":"eagle3","model":"lmsys/SGLang-EAGLE3-Qwen3-30B-A3B-Instruct-2507-SpecForge-Nex","num_speculative_tokens":3,"draft_tensor_parallel_size":1}'
183197
volumes:
184198
- hugginface_cache:/root/.cache/huggingface
185199
- vllm_cache:/root/.cache/vllm
@@ -506,6 +520,51 @@ volumes:
506520
name: certs
507521

508522
configs:
523+
registrar_script:
  content: |
    #!/bin/sh
    # Announce this host to the model proxy: once port 8000 answers, register
    # every endpoint port that currently answers plus all model-to-domain
    # mappings, then repeat every 300 seconds.
    # NOTE: Compose interpolates this text, so every dollar sign is doubled to
    # reach the shell as a single one. Keep bare dollar signs out of comments.
    PROXY_URL="https://completions.near.ai"
    TOKEN="$${MODEL_PROXY_TOKEN}"
    # Routing port: taken from the environment when provided, else 8444.
    TLS_PORT="$${TLS_PORT:-8444}"

    # Without a host address the endpoints would be registered as ":<port>",
    # so stop here with a clear message instead.
    : "$${HOST_IP:?HOST_IP must be set}"

    # Register an inference endpoint with the proxy.
    # Arguments: 1 - endpoint as "host:port",
    #            2 - routing port (optional; defaults to TLS_PORT).
    # Returns:   curl's exit status. --fail turns HTTP 4xx/5xx into a failure
    #            and the timeouts keep a stalled connection from blocking
    #            the loop.
    register_endpoint() {
      curl -sS --fail --connect-timeout 10 --max-time 30 \
        -X POST "$$PROXY_URL/register/endpoint" \
        -H "Authorization: Bearer $$TOKEN" \
        -H "Content-Type: application/json" \
        -d "{\"endpoint\":\"$$1\",\"routing_port\":$${2:-$$TLS_PORT}}"
    }

    # Register a model-to-domain mapping with the proxy.
    # Arguments: 1 - model name, 2 - domain serving that model.
    # Returns:   curl's exit status (non-zero on network error or HTTP >= 400).
    register_model() {
      curl -sS --fail --connect-timeout 10 --max-time 30 \
        -X POST "$$PROXY_URL/register/model" \
        -H "Authorization: Bearer $$TOKEN" \
        -H "Content-Type: application/json" \
        -d "{\"model\":\"$$1\",\"domain\":\"$$2\"}"
    }

    echo "Waiting for first model to be ready..."
    until curl -sf --max-time 10 http://nginx:8000/v1/models > /dev/null 2>&1; do sleep 30; done
    echo "First model ready, starting registration loop"

    while true; do
      # Best effort: every request is attempted; failures are counted and
      # retried on the next pass rather than ending the script.
      failures=0

      # Register each endpoint if healthy
      for port in 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010; do
        if curl -sf --max-time 10 "http://nginx:$$port/v1/models" > /dev/null 2>&1; then
          register_endpoint "$${HOST_IP}:$$port" "$$TLS_PORT" || failures=$$((failures + 1))
        fi
      done

      # Model-to-domain mappings
      # NOTE(review): these are sent regardless of which ports answered above.
      register_model "Qwen/Qwen3-30B-A3B-Instruct-2507" "qwen3-30b.completions.near.ai" || failures=$$((failures + 1))
      register_model "openai/gpt-oss-120b" "gpt-oss-120b.completions.near.ai" || failures=$$((failures + 1))
      register_model "black-forest-labs/FLUX.2-klein-4B" "flux2-klein.completions.near.ai" || failures=$$((failures + 1))
      register_model "Qwen/Qwen3-VL-30B-A3B-Instruct" "qwen3-vl-30b.completions.near.ai" || failures=$$((failures + 1))
      register_model "Qwen/Qwen3-Embedding-0.6B" "qwen3-embedding.completions.near.ai" || failures=$$((failures + 1))
      register_model "Qwen/Qwen3-Reranker-0.6B" "qwen3-reranker.completions.near.ai" || failures=$$((failures + 1))
      register_model "openai/whisper-large-v3" "whisper-large-v3.completions.near.ai" || failures=$$((failures + 1))

      if [ "$$failures" -eq 0 ]; then
        echo "Registration cycle complete for $${HOST_IP}"
      else
        echo "Registration cycle for $${HOST_IP} had $$failures failed request(s); will retry" >&2
      fi
      sleep 300
    done
509568
nginx_conf:
510569
content: |
511570
# Common proxy settings

0 commit comments

Comments
 (0)