Skip to content

Commit 7295a65

Browse files
LloydLloyd
authored and committed
privacy-filter: revise GPU-leak fix per review (watchdog + expandable_segments)
Addresses the code review of the first cut:
- Root cause now fixed at the source: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True lets the CUDA allocator shrink reserved segments instead of ratcheting up.
- Drop per-request torch.cuda.empty_cache(): a synchronizing cudaFree on the hot path stalled the shared GPU and the co-located models it was meant to protect. A 30s watchdog thread now releases idle blocks off the request path.
- Real fail-safe instead of a silent 500-storm: the watchdog hard-restarts the container (os._exit -> restart:unless-stopped) if this process's reserved VRAM exceeds GPU_MEM_LIMIT_GB, and an acute CUDA-OOM in a request also exits. The prior "self-OOMs and restarts" comment was false — a caught OOM returned 500 while the process stayed up behind a still-healthy /v1/models probe.
- Drop set_per_process_memory_fraction: the 0.10 (~14GB) guess could OOM legit batch_size=32 requests, and device_map="auto" planned against the full card and ignored the cap anyway. Bound the work via PRIVACY_BATCH_SIZE instead; inputs are NOT truncated (a privacy filter must see the whole text).
- device=0 instead of device_map="auto" (no accelerate planner mismatch).
- Drop torch.inference_mode(): redundant with the pipeline's internal no_grad and stricter (risked raising under trust_remote_code custom models).
- Tolerant env parsing + clamps so a malformed knob can't crash-loop boot.
Validated: small-models.yaml parses and the embedded server.py compiles.
1 parent a0b614b commit 7295a65

1 file changed

Lines changed: 85 additions & 45 deletions

File tree

small-models.yaml

Lines changed: 85 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,9 @@ x-privacy-filter-common: &privacy-filter-common
194194
WORKDIR /app
195195
COPY <<'PYEOF' /app/server.py
196196
import os
197+
import sys
198+
import threading
199+
import time
197200
import torch
198201
from fastapi import FastAPI, HTTPException
199202
from pydantic import BaseModel, Field
@@ -202,27 +205,65 @@ x-privacy-filter-common: &privacy-filter-common
202205
MODEL_ID = "openai/privacy-filter"
203206
MODEL_REVISION = "7ffa9a043d54d1be65afb281eddf0ffbe629385b"
204207
208+
209+
def _env(name, default, cast):
    """Return environment variable *name* converted by *cast*, else *default*.

    Tolerates absent, blank, or garbage values so a typo in a tuning knob
    cannot crash-loop the boot.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or unusable.
        cast: Callable applied to the raw string (e.g. ``int`` or ``float``).

    Returns:
        The converted value, or *default* when the variable is missing, when
        *cast* raises ``ValueError``, or when the result is a non-finite float.
    """
    import math  # local import: the file's import block is left untouched

    try:
        value = cast(os.environ[name])
    except (KeyError, ValueError):
        return default
    # float() accepts "nan" and "inf" without raising, so they would slip past
    # the ValueError guard above; treat them as garbage like any other typo.
    if isinstance(value, float) and not math.isfinite(value):
        return default
    return value
215+
216+
205217
# GPU 7 is shared with Qwen3-VL / FLUX / embeddings / reranker / whisper.
206-
# The HF pipeline's CUDA caching allocator ratchets its reserved memory up
207-
# under traffic and never releases it, slowly hoarding the whole card and
208-
# starving the co-located models until they OOM (Qwen3-VL crash-loops).
209-
# Cap this process to a small fraction of the device so it is fail-safe:
210-
# it self-OOMs and restarts instead of stealing VRAM from its neighbours.
211-
GPU_MEM_FRACTION = float(os.environ.get("GPU_MEM_FRACTION", "0.10"))
212-
if torch.cuda.is_available():
213-
torch.cuda.set_per_process_memory_fraction(GPU_MEM_FRACTION, 0)
218+
# Left unbounded, the HF pipeline's CUDA caching allocator ratchets its
219+
# reserved memory up under traffic and never releases it, slowly hoarding the
220+
# card and starving the co-located models until they OOM (Qwen3-VL crash-loops).
221+
# Defence in depth (all tunable via env, no rebuild):
222+
# * PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True (set in compose) is the
223+
# root-cause fix — reserved segments can shrink again instead of ratcheting.
224+
# * A watchdog returns idle cached blocks to the driver on an interval (never
225+
# per request: empty_cache() is a synchronizing cudaFree that would stall
226+
# the shared GPU) and hard-restarts the container if this process's VRAM
227+
# ever climbs toward starving its neighbours — a real restart (clean reclaim)
228+
# rather than silently serving 500s behind a still-healthy /v1/models probe.
229+
# * An acute CUDA-OOM in a request also exits, for the same clean recycle.
230+
# Inputs are deliberately NOT truncated — a privacy filter must see the whole
231+
# text or it would miss PII; per-request peak is bounded by PRIVACY_BATCH_SIZE.
232+
BATCH_SIZE = max(1, _env("PRIVACY_BATCH_SIZE", 32, int))
233+
GPU_MEM_LIMIT_GB = max(1.0, _env("GPU_MEM_LIMIT_GB", 32.0, float))
234+
WATCHDOG_INTERVAL_S = max(1.0, _env("WATCHDOG_INTERVAL_S", 30.0, float))
235+
236+
USE_CUDA = torch.cuda.is_available()
214237
215238
clf = pipeline(
216239
"token-classification",
217240
model=MODEL_ID,
218241
revision=MODEL_REVISION,
219242
aggregation_strategy="simple",
220-
device_map="auto",
243+
device=0 if USE_CUDA else -1,
221244
torch_dtype=torch.bfloat16,
222245
trust_remote_code=True,
223246
)
224247
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=MODEL_REVISION, trust_remote_code=True)
225248
249+
250+
def _gpu_watchdog():
    """Periodically release idle CUDA cache and enforce the VRAM ceiling.

    Runs forever: every WATCHDOG_INTERVAL_S seconds it calls
    torch.cuda.empty_cache() and then reads the memory still reserved on
    device 0. If that exceeds GPU_MEM_LIMIT_GB, or if the probe itself
    fails, it writes a message to stderr and terminates the process with
    os._exit(1).

    Never returns.
    """
    while True:
        time.sleep(WATCHDOG_INTERVAL_S)
        try:
            torch.cuda.empty_cache()
            reserved_gb = torch.cuda.memory_reserved(0) / 1024 ** 3
        except Exception as exc:
            # An uncaught exception here would end this thread silently and
            # leave the process serving with no memory fail-safe at all, so
            # treat a failed probe like a breached limit.
            sys.stderr.write(
                "privacy-filter: GPU watchdog probe failed (%r); "
                "exiting for a clean restart\n" % (exc,)
            )
            sys.stderr.flush()
            os._exit(1)
        if reserved_gb > GPU_MEM_LIMIT_GB:
            sys.stderr.write(
                "privacy-filter: reserved %.1f GB > %.1f GB limit; "
                "exiting for a clean restart\n" % (reserved_gb, GPU_MEM_LIMIT_GB)
            )
            sys.stderr.flush()
            # os._exit rather than sys.exit: sys.exit from a non-main thread
            # would only end this thread, not the server process.
            os._exit(1)
262+
263+
264+
if USE_CUDA:
265+
threading.Thread(target=_gpu_watchdog, daemon=True).start()
266+
226267
app = FastAPI()
227268
228269
@@ -242,39 +283,34 @@ x-privacy-filter-common: &privacy-filter-common
242283
if not texts or any(not isinstance(t, str) for t in texts):
243284
raise HTTPException(400, "input must be a non-empty string or list of strings")
244285
286+
# The pipeline runs the forward under torch.no_grad() internally.
245287
try:
246-
# HF pipeline defaults batch_size=1 — pass 32 so the GPU is actually
247-
# fed in parallel for list inputs. inference_mode avoids retaining
248-
# any autograd state across requests.
249-
with torch.inference_mode():
250-
raw = clf(texts, batch_size=32)
251-
252-
# Single batched tokenize for usage counts instead of N sequential calls.
253-
tok_lens = [len(ids) for ids in tokenizer(texts).input_ids]
254-
255-
data = []
256-
for i, spans in enumerate(raw):
257-
kept = [
258-
{
259-
"category": s["entity_group"],
260-
"score": float(s["score"]),
261-
"text": s["word"],
262-
"start": int(s["start"]),
263-
"end": int(s["end"]),
264-
}
265-
for s in spans
266-
if float(s["score"]) >= req.threshold
267-
]
268-
data.append({"index": i, "spans": kept, "usage": {"input_tokens": tok_lens[i]}})
269-
270-
return {"model": MODEL_ID, "data": data}
271-
finally:
272-
# Return cached-but-unused CUDA blocks to the driver after every
273-
# request so reserved memory does not ratchet up over time on the
274-
# shared GPU. This is the core leak fix; the fraction cap above is
275-
# the fail-safe.
276-
if torch.cuda.is_available():
277-
torch.cuda.empty_cache()
288+
raw = clf(texts, batch_size=BATCH_SIZE)
289+
except torch.cuda.OutOfMemoryError:
290+
# Don't 500-storm behind a healthy /v1/models probe: recycle the
291+
# container so restart:unless-stopped reclaims the VRAM cleanly.
292+
torch.cuda.empty_cache()
293+
os._exit(1)
294+
295+
# Single batched tokenize for usage counts instead of N sequential calls.
296+
tok_lens = [len(ids) for ids in tokenizer(texts).input_ids]
297+
298+
data = []
299+
for i, spans in enumerate(raw):
300+
kept = [
301+
{
302+
"category": s["entity_group"],
303+
"score": float(s["score"]),
304+
"text": s["word"],
305+
"start": int(s["start"]),
306+
"end": int(s["end"]),
307+
}
308+
for s in spans
309+
if float(s["score"]) >= req.threshold
310+
]
311+
data.append({"index": i, "spans": kept, "usage": {"input_tokens": tok_lens[i]}})
312+
313+
return {"model": MODEL_ID, "data": data}
278314
PYEOF
279315
EXPOSE 8000
280316
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
@@ -284,10 +320,14 @@ x-privacy-filter-common: &privacy-filter-common
284320
- HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
285321
- HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
286322
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
287-
# Hard ceiling on this process's share of the shared GPU (see server.py).
288-
# Tune without rebuilding the image. ~0.10 of a 140 GB H200 ≈ 14 GB, ample
289-
# for the classifier and leaves the card for Qwen3-VL/FLUX/etc.
290-
- GPU_MEM_FRACTION=0.10
323+
# Root-cause fix for the VRAM leak: let the CUDA caching allocator shrink
324+
# reserved segments instead of ratcheting them up and hoarding the shared GPU.
325+
- PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
326+
# Watchdog/batch knobs consumed by server.py — tunable per host, no rebuild.
327+
# GPU_MEM_LIMIT_GB caps this process's blast radius on the 140 GB H200 (~10x
328+
# the classifier's footprint), leaving the rest for Qwen3-VL/FLUX/etc.
329+
- GPU_MEM_LIMIT_GB=32
330+
- PRIVACY_BATCH_SIZE=32
291331
restart: unless-stopped
292332
stop_grace_period: 5m
293333
logging: *logging-conf

0 commit comments

Comments
 (0)