Skip to content

Commit b19e44a

Browse files
committed
small-models/privacy-filter: add CPU-only sibling instance
Adds a privacy-filter-cpu service built from the same build definition with PRIVACY_FILTER_DEVICE=cpu (float32, no nvidia runtime, OMP/MKL threads=4). Switches vllm-proxy-privacy-filter from VLLM_BASE_URL to VLLM_BACKEND_URLS so requests are balanced by least-connections across the GPU and CPU backends, dodging GPU-7 contention tails (which today push some calls past the cloud-api 15s timeout).
1 parent 4f36273 commit b19e44a

1 file changed

Lines changed: 52 additions & 12 deletions

File tree

small-models.yaml

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -171,13 +171,9 @@ x-flux-common: &flux-common
171171
device_ids: ["7"]
172172
capabilities: [gpu]
173173

174-
x-privacy-filter-common: &privacy-filter-common
175-
<<: *nvidia
176-
init: true
177-
image: privacy-filter-hf
178-
build:
179-
context: .
180-
dockerfile_inline: |
174+
x-privacy-filter-build: &privacy-filter-build
175+
context: .
176+
dockerfile_inline: |
181177
# syntax=docker/dockerfile:1.4
182178
FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime
183179
RUN pip install --no-cache-dir \
@@ -187,6 +183,7 @@ x-privacy-filter-common: &privacy-filter-common
187183
"uvicorn[standard]"
188184
WORKDIR /app
189185
COPY <<'PYEOF' /app/server.py
186+
import os
190187
import torch
191188
from fastapi import FastAPI, HTTPException
192189
from pydantic import BaseModel, Field
@@ -195,15 +192,21 @@ x-privacy-filter-common: &privacy-filter-common
195192
MODEL_ID = "openai/privacy-filter"
196193
MODEL_REVISION = "7ffa9a043d54d1be65afb281eddf0ffbe629385b"
197194
198-
clf = pipeline(
199-
"token-classification",
195+
# PRIVACY_FILTER_DEVICE: "cpu" forces CPU + float32; anything else (default
196+
# "auto") loads with device_map="auto" + bfloat16. Two service variants
197+
# share this build so vllm-proxy-rs can balance least-connections across them.
198+
_device = os.environ.get("PRIVACY_FILTER_DEVICE", "auto").lower()
199+
_kwargs = dict(
200200
model=MODEL_ID,
201201
revision=MODEL_REVISION,
202202
aggregation_strategy="simple",
203-
device_map="auto",
204-
torch_dtype=torch.bfloat16,
205203
trust_remote_code=True,
206204
)
205+
if _device == "cpu":
206+
_kwargs.update(device=-1, torch_dtype=torch.float32)
207+
else:
208+
_kwargs.update(device_map="auto", torch_dtype=torch.bfloat16)
209+
clf = pipeline("token-classification", **_kwargs)
207210
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=MODEL_REVISION, trust_remote_code=True)
208211
209212
app = FastAPI()
@@ -251,6 +254,12 @@ x-privacy-filter-common: &privacy-filter-common
251254
PYEOF
252255
EXPOSE 8000
253256
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
257+
258+
x-privacy-filter-common: &privacy-filter-common
259+
<<: *nvidia
260+
init: true
261+
image: privacy-filter-hf
262+
build: *privacy-filter-build
254263
volumes:
255264
- hugginface_cache:/root/.cache/huggingface
256265
environment:
@@ -267,6 +276,31 @@ x-privacy-filter-common: &privacy-filter-common
267276
device_ids: ["7"]
268277
capabilities: [gpu]
269278

279+
# CPU-only sibling of x-privacy-filter-common. Same build, separate tag (so
280+
# dockerfile_inline stays in one place), but no nvidia runtime,
281+
# no GPU reservation, and PRIVACY_FILTER_DEVICE=cpu so the pipeline loads
282+
# on CPU with float32. Routed alongside the GPU instance via vllm-proxy-rs
283+
# least-connections so requests dodge GPU-7 contention spikes.
284+
x-privacy-filter-cpu-common: &privacy-filter-cpu-common
285+
init: true
286+
# Distinct image tag (vs privacy-filter-hf) so compose-manager's
287+
# `docker compose pull --ignore-buildable` doesn't see two services sharing
288+
# one image name and try to pull this one as a registry image. The build
289+
# content is identical, so docker reuses cached layers — it's only a tag
290+
# alias, not a second build pass.
291+
image: privacy-filter-hf-cpu
292+
build: *privacy-filter-build
293+
volumes:
294+
- hugginface_cache:/root/.cache/huggingface
295+
environment:
296+
- HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
297+
- HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
298+
- PRIVACY_FILTER_DEVICE=cpu
299+
- OMP_NUM_THREADS=4
300+
- MKL_NUM_THREADS=4
301+
restart: unless-stopped
302+
logging: *logging-conf
303+
270304
services:
271305
nginx:
272306
image: nginx@sha256:1d13701a5f9f3fb01aaa88cef2344d65b6b5bf6b7d9fa4cf0dca557a8d7702ba
@@ -748,7 +782,7 @@ services:
748782
- MODEL_NAME=openai/privacy-filter
749783
- OHTTP_ENABLED=true
750784
- TOKEN=${PROXY_TOKEN}
751-
- VLLM_BASE_URL=http://privacy-filter:8000
785+
- VLLM_BACKEND_URLS=http://privacy-filter:8000,http://privacy-filter-cpu:8000
752786
- TLS_CERT_PATH=/etc/letsencrypt/live/completions.near.ai/fullchain.pem
753787
- USE_NV_ATTESTATION_SDK=true
754788
labels:
@@ -760,6 +794,12 @@ services:
760794
labels:
761795
com.datadoghq.ad.logs: '[{"source": "privacy-filter", "service": "privacy-filter", "tags":["model:openai/privacy-filter","ip:${HOST_IP}", "port:8007"]}]'
762796

797+
privacy-filter-cpu:
798+
<<: *privacy-filter-cpu-common
799+
container_name: privacy-filter-cpu
800+
labels:
801+
com.datadoghq.ad.logs: '[{"source": "privacy-filter", "service": "privacy-filter-cpu", "tags":["model:openai/privacy-filter","ip:${HOST_IP}", "port:8007", "device:cpu"]}]'
802+
763803
dcgm-exporter:
764804
image: nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless
765805
container_name: dcgm-exporter

0 commit comments

Comments
 (0)