@@ -171,13 +171,9 @@ x-flux-common: &flux-common
171171 device_ids : ["7"]
172172 capabilities : [gpu]
173173
174- x-privacy-filter-common : &privacy-filter-common
175- << : *nvidia
176- init : true
177- image : privacy-filter-hf
178- build :
179- context : .
180- dockerfile_inline : |
174+ x-privacy-filter-build : &privacy-filter-build
175+ context : .
176+ dockerfile_inline : |
181177 # syntax=docker/dockerfile:1.4
182178 FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime
183179 RUN pip install --no-cache-dir \
@@ -187,6 +183,7 @@ x-privacy-filter-common: &privacy-filter-common
187183 "uvicorn[standard]"
188184 WORKDIR /app
189185 COPY <<'PYEOF' /app/server.py
186+ import os
190187 import torch
191188 from fastapi import FastAPI, HTTPException
192189 from pydantic import BaseModel, Field
@@ -195,15 +192,21 @@ x-privacy-filter-common: &privacy-filter-common
195192 MODEL_ID = "openai/privacy-filter"
196193 MODEL_REVISION = "7ffa9a043d54d1be65afb281eddf0ffbe629385b"
197194
198- clf = pipeline(
199- "token-classification",
195+ # PRIVACY_FILTER_DEVICE: "cpu" forces CPU + float32; anything else (default
196+ # "auto") loads with device_map="auto" + bfloat16. Two service variants share
197+ # this build so vllm-proxy-rs can least-connections balance across them.
198+ _device = os.environ.get("PRIVACY_FILTER_DEVICE", "auto").lower()
199+ _kwargs = dict(
200200 model=MODEL_ID,
201201 revision=MODEL_REVISION,
202202 aggregation_strategy="simple",
203- device_map="auto",
204- torch_dtype=torch.bfloat16,
205203 trust_remote_code=True,
206204 )
205+ if _device == "cpu":
206+ _kwargs.update(device=-1, torch_dtype=torch.float32)
207+ else:
208+ _kwargs.update(device_map="auto", torch_dtype=torch.bfloat16)
209+ clf = pipeline("token-classification", **_kwargs)
207210 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=MODEL_REVISION, trust_remote_code=True)
208211
209212 app = FastAPI()
@@ -251,6 +254,12 @@ x-privacy-filter-common: &privacy-filter-common
251254 PYEOF
252255 EXPOSE 8000
253256 CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
257+
258+ x-privacy-filter-common : &privacy-filter-common
259+ << : *nvidia
260+ init : true
261+ image : privacy-filter-hf
262+ build : *privacy-filter-build
254263 volumes :
255264 - hugginface_cache:/root/.cache/huggingface
256265 environment :
@@ -267,6 +276,31 @@ x-privacy-filter-common: &privacy-filter-common
267276 device_ids : ["7"]
268277 capabilities : [gpu]
269278
279+ # CPU-only sibling of x-privacy-filter-common. Same build definition (shared
280+ # via *privacy-filter-build, so dockerfile_inline stays in one place), but no
281+ # nvidia runtime, no GPU reservation, and PRIVACY_FILTER_DEVICE=cpu so the
282+ # pipeline loads on CPU with float32. Routed alongside the GPU instance via
283+ # vllm-proxy-rs least-connections so requests dodge GPU-7 contention spikes.
284+ x-privacy-filter-cpu-common : &privacy-filter-cpu-common
285+ init : true
286+ # Distinct image tag (vs privacy-filter-hf) so compose-manager's
287+ # `docker compose pull --ignore-buildable` doesn't see two services sharing
288+ # one image name and try to pull this one as a registry image. The build
289+ # content is identical, so docker reuses cached layers — it's only a tag
290+ # alias, not a second build pass.
291+ image : privacy-filter-hf-cpu
292+ build : *privacy-filter-build
293+ volumes :
294+ - hugginface_cache:/root/.cache/huggingface
295+ environment :
296+ - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
297+ - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
298+ - PRIVACY_FILTER_DEVICE=cpu
299+ - OMP_NUM_THREADS=4
300+ - MKL_NUM_THREADS=4
301+ restart : unless-stopped
302+ logging : *logging-conf
303+
270304services :
271305 nginx :
272306 image : nginx@sha256:1d13701a5f9f3fb01aaa88cef2344d65b6b5bf6b7d9fa4cf0dca557a8d7702ba
@@ -748,7 +782,7 @@ services:
748782 - MODEL_NAME=openai/privacy-filter
749783 - OHTTP_ENABLED=true
750784 - TOKEN=${PROXY_TOKEN}
751- - VLLM_BASE_URL =http://privacy-filter:8000
785+ - VLLM_BACKEND_URLS =http://privacy-filter:8000,http://privacy-filter-cpu :8000
752786 - TLS_CERT_PATH=/etc/letsencrypt/live/completions.near.ai/fullchain.pem
753787 - USE_NV_ATTESTATION_SDK=true
754788 labels :
@@ -760,6 +794,12 @@ services:
760794 labels :
761795 com.datadoghq.ad.logs : ' [{"source": "privacy-filter", "service": "privacy-filter", "tags":["model:openai/privacy-filter","ip:${HOST_IP}", "port:8007"]}]'
762796
797+ privacy-filter-cpu :
798+ << : *privacy-filter-cpu-common
799+ container_name : privacy-filter-cpu
800+ labels :
801+ com.datadoghq.ad.logs : ' [{"source": "privacy-filter", "service": "privacy-filter-cpu", "tags":["model:openai/privacy-filter","ip:${HOST_IP}", "port:8007", "device:cpu"]}]'
802+
763803 dcgm-exporter :
764804 image : nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless
765805 container_name : dcgm-exporter
0 commit comments