Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 41 additions & 21 deletions small-models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -507,27 +507,47 @@ services:
# https://lmsysorg.mintlify.app/cookbook/autoregressive/Google/Gemma4
image: lmsysorg/sglang:gemma4@sha256:87cecd3c9f4d17632c44b2d7cd1a20c50377c42b461d9ca39b153b4bb2b6e6ae
container_name: model-sg-gemma-4-31b
# Start the SGLang server for google/gemma-4-31B-it at a pinned model revision,
# listening on 0.0.0.0:8000 with metrics and cache reporting enabled.
# NOTE: `>` is a folded block scalar, so the lines below are joined with spaces
# into one command string. A `#` line placed between the flags would become
# part of that string rather than a comment, so all notes live up here.
command: >
sglang serve
--model-path google/gemma-4-31B-it
--revision ba74f5b6c647c0911554e50278d6f6f4477f9010
--tp 2
--reasoning-parser gemma4
--tool-call-parser gemma4
--mem-fraction-static 0.85
--max-running-requests 64
--chunked-prefill-size 8192
--num-continuous-decode-steps 5
--enable-mixed-chunk
--disable-fast-image-processor
--model-loader-extra-config '{"enable_multithread_load": "true", "num_threads": 64}'
--port 8000
--host 0.0.0.0
--enable-cache-report
--enable-metrics
--trust-remote-code
--log-requests-level 0
--served-model-name google/gemma-4-31B-it
# Startup wrapper: hot-patch the transformers gemma4 image processor that is
# baked into the image, verify the patch landed, then hand off to `sglang serve`.
#
# Why: the processor calls a bare `image.numpy()`. For inputs SGLang decodes to
# a GPU tensor (video data-URLs, broken image URLs, etc.) that crashes
# multimodal (image) requests with
#   TypeError: can't convert cuda:0 device type tensor to numpy
# `--disable-fast-image-processor` does NOT cover this path, because the tensor
# is already on the GPU upstream of that flag. `.cpu()` is a no-op on CPU
# tensors, so the rewrite is safe and idempotent. See nearai/infra#156.
#
# `$$` is the Compose escape for a literal `$`; the shell sees `$PROCESSOR_SRC`.
command:
  - /bin/sh
  - -c
  - |
    PROCESSOR_SRC=/usr/local/lib/python3.12/dist-packages/transformers/image_processing_backends.py

    # Rewrite the bare CUDA->numpy conversion in place.
    sed -i 's/image = image\.numpy()/image = image.cpu().numpy()/' "$$PROCESSOR_SRC"

    # Guard: refuse to serve the crashing build if the fixed form is absent
    # (path moved, transformers bumped, or the pattern changed). An image that
    # already ships the fixed form passes this check. See nearai/infra#156.
    grep -q 'image = image\.cpu()\.numpy()' "$$PROCESSOR_SRC" || {
      echo "FATAL: gemma4 CUDA->numpy patch not present in $$PROCESSOR_SRC; refusing to start" >&2
      exit 1
    }

    # `exec` replaces the wrapper shell with the server process.
    exec sglang serve \
      --model-path google/gemma-4-31B-it \
      --revision ba74f5b6c647c0911554e50278d6f6f4477f9010 \
      --tp 2 \
      --reasoning-parser gemma4 \
      --tool-call-parser gemma4 \
      --mem-fraction-static 0.85 \
      --max-running-requests 64 \
      --chunked-prefill-size 8192 \
      --num-continuous-decode-steps 5 \
      --enable-mixed-chunk \
      --disable-fast-image-processor \
      --model-loader-extra-config '{"enable_multithread_load": "true", "num_threads": 64}' \
      --port 8000 \
      --host 0.0.0.0 \
      --enable-cache-report \
      --enable-metrics \
      --trust-remote-code \
      --log-requests-level 0 \
      --served-model-name google/gemma-4-31B-it
volumes:
- hugginface_cache:/root/.cache/huggingface
- kernel_cache:/root/.cache/deep_gemm
Expand Down
Loading