@@ -507,27 +507,47 @@ services:
507507 # https://lmsysorg.mintlify.app/cookbook/autoregressive/Google/Gemma4
508508 image : lmsysorg/sglang:gemma4@sha256:87cecd3c9f4d17632c44b2d7cd1a20c50377c42b461d9ca39b153b4bb2b6e6ae
509509 container_name : model-sg-gemma-4-31b
# Launch the SGLang server directly, in exec (list) form: one argv entry per
# item, with no shell in between. Numeric values are quoted so every item
# stays a YAML string.
command:
  - sglang
  - serve
  - --model-path
  - google/gemma-4-31B-it
  - --revision
  - ba74f5b6c647c0911554e50278d6f6f4477f9010
  - --tp
  - "2"
  - --reasoning-parser
  - gemma4
  - --tool-call-parser
  - gemma4
  - --mem-fraction-static
  - "0.85"
  - --max-running-requests
  - "64"
  - --chunked-prefill-size
  - "8192"
  - --num-continuous-decode-steps
  - "5"
  - --enable-mixed-chunk
  - --disable-fast-image-processor
  - --model-loader-extra-config
  - '{"enable_multithread_load": "true", "num_threads": 64}'
  - --port
  - "8000"
  - --host
  - 0.0.0.0
  - --enable-cache-report
  - --enable-metrics
  - --trust-remote-code
  - --log-requests-level
  - "0"
  - --served-model-name
  - google/gemma-4-31B-it
# The command wraps `sglang serve` in a shell that first hot-patches the
# transformers gemma4 image processor baked into the image. A bare
# `image.numpy()` on a CUDA tensor crashes multimodal (image) requests with
#   TypeError: can't convert cuda:0 device type tensor to numpy
# for inputs SGLang decodes to a GPU tensor (video data-URLs, broken image
# URLs, etc.). `--disable-fast-image-processor` does NOT cover this path — the
# tensor is already on the GPU upstream of that flag. `.cpu()` is a no-op on
# CPU tensors, so the patched call is safe for every input; the sed pattern no
# longer matches once the patch is applied, so re-running it on restart is
# idempotent. See nearai/infra#156.
# NOTE: every `$$` below is Compose's escape for a literal `$`; the shell
# inside the container sees `$BACKENDS`, `$1`, etc.
command:
  - /bin/sh
  - -c
  - |
    # Startup wrapper: patch the transformers image-processing backend so a
    # CUDA tensor is moved to the CPU before `.numpy()`, verify the patch,
    # then replace this shell with the SGLang server. See nearai/infra#156.

    # Abort on any unchecked failure or unset variable before reaching exec.
    set -eu

    BACKENDS=/usr/local/lib/python3.12/dist-packages/transformers/image_processing_backends.py

    # Print a fatal message to stderr and stop the container from starting.
    die() {
      echo "FATAL: $$1; refusing to start" >&2
      exit 1
    }

    # Rewrite only when the bare call is actually present, so restarts of an
    # already-patched container (or a future image that ships the fixed form)
    # never touch the file. A failed rewrite is reported immediately instead
    # of being masked by some other, already-fixed occurrence in the file.
    if grep -q 'image = image\.numpy()' "$$BACKENDS"; then
      sed -i 's/image = image\.numpy()/image = image.cpu().numpy()/' "$$BACKENDS" \
        || die "could not apply gemma4 CUDA->numpy patch to $$BACKENDS"
    fi

    # Fail loud if the CUDA->numpy hotfix did not land (path moved, transformers
    # bumped, or pattern changed) instead of silently serving the crashing build.
    grep -q 'image = image\.cpu()\.numpy()' "$$BACKENDS" \
      || die "gemma4 CUDA->numpy patch not present in $$BACKENDS"

    # exec so the server replaces this shell as the container's main process.
    exec sglang serve \
      --model-path google/gemma-4-31B-it \
      --revision ba74f5b6c647c0911554e50278d6f6f4477f9010 \
      --tp 2 \
      --reasoning-parser gemma4 \
      --tool-call-parser gemma4 \
      --mem-fraction-static 0.85 \
      --max-running-requests 64 \
      --chunked-prefill-size 8192 \
      --num-continuous-decode-steps 5 \
      --enable-mixed-chunk \
      --disable-fast-image-processor \
      --model-loader-extra-config '{"enable_multithread_load": "true", "num_threads": 64}' \
      --port 8000 \
      --host 0.0.0.0 \
      --enable-cache-report \
      --enable-metrics \
      --trust-remote-code \
      --log-requests-level 0 \
      --served-model-name google/gemma-4-31B-it
531551 volumes :
532552 - hugginface_cache:/root/.cache/huggingface
533553 - kernel_cache:/root/.cache/deep_gemm
0 commit comments