GLM-5.1: bump inference-proxy + wire web_context_search env

Evrard-Nil · Evrard-Nil · commit bf6bd8d4bf8b · 2026-05-26T18:31:45.000+02:00
Image bump: sha256:05ad3e830dfca99e499705deab54616db483cf0b222be83b591b6037b7fd2abc → sha256:59e42dd68faa15eb0c23521029a2fc3d80d86a4143f9f766542357918be33a8c (nearai/inference-proxy#144 — server-side `web_context_search` agent loop for /v1/chat/completions; PR merged on main as 177c58c). Env wiring: WEB_CONTEXT_SEARCH_URL=${WEB_CONTEXT_SEARCH_URL} and WEB_CONTEXT_SEARCH_API_KEY=${WEB_CONTEXT_SEARCH_API_KEY}. Both are gated by compose interpolation, so the feature stays dormant until the host's .env defines them. (The inference-proxy startup validation requires both to be set or both to be unset, so leaving one of them hardcoded while the other comes from .env would block startup on hosts that haven't been provisioned with a Brave key.) To enable on a deployed host, populate .env with WEB_CONTEXT_SEARCH_URL=https://api.search.brave.com/res/v1/llm/context and WEB_CONTEXT_SEARCH_API_KEY=<brave-subscription-token>. Until then, this is a pure image bump — no behavior change.
diff --git a/GLM-5.1.yaml b/GLM-5.1.yaml
@@ -15,7 +15,7 @@ x-nvidia: &nvidia
       hard: 65535
 
 x-vllm-proxy-common: &vllm-proxy-common
-  image: nearaidev/vllm-proxy-rs@sha256:05ad3e830dfca99e499705deab54616db483cf0b222be83b591b6037b7fd2abc
+  image: nearaidev/vllm-proxy-rs@sha256:59e42dd68faa15eb0c23521029a2fc3d80d86a4143f9f766542357918be33a8c
   user: root
   privileged: true
   <<: *nvidia
@@ -94,6 +94,21 @@ services:
       - VLLM_BASE_URL=http://glm51:8000
       - TLS_CERT_PATH=/etc/letsencrypt/live/completions.near.ai/fullchain.pem
       - USE_NV_ATTESTATION_SDK=true
+      # Server-side `web_context_search` tool loop (inference-proxy#144).
+      # Activated only when a client sends
+      # `tools: [{"type":"web_context_search"}]` on /v1/chat/completions
+      # with stream:true. The tool runs inside this CVM, hitting Brave's
+      # LLM Context endpoint directly; tool args + results stay within the
+      # E2EE perimeter. Both URL and key come from the host's `.env`; the
+      # `:-` default makes compose substitute an empty string (without an
+      # "unset variable" warning) when they are absent, so the proxy treats
+      # the feature as unconfigured and chat-completion behavior is unchanged.
+      # Set both or neither. To enable on a host, set
+      # `WEB_CONTEXT_SEARCH_URL=https://api.search.brave.com/res/v1/llm/context`
+      # and `WEB_CONTEXT_SEARCH_API_KEY=<brave-subscription-token>` in
+      # the host .env before compose up.
+      - WEB_CONTEXT_SEARCH_URL=${WEB_CONTEXT_SEARCH_URL:-}
+      - WEB_CONTEXT_SEARCH_API_KEY=${WEB_CONTEXT_SEARCH_API_KEY:-}
 
   glm51:
     <<: *nvidia