From a0b614be721d3a93c6093eed2761d677471f6a63 Mon Sep 17 00:00:00 2001 From: Lloyd Date: Fri, 29 May 2026 16:09:47 -0700 Subject: [PATCH 1/2] privacy-filter: cap GPU memory + release cache to stop VRAM leak privacy-filter is an inline HF Transformers token-classification server (`pipeline(..., device_map="auto")`) with no memory bound. Under steady traffic the CUDA caching allocator's reserved memory ratchets up and is never released, so the process slowly hoards the GPU it shares with Qwen3-VL, FLUX, embeddings, reranker and whisper (GPU 7). Observed ~93 GB held on an H200 for a model that needs ~1-2 GB. As privacy-filter fills the card (free ~50 GB -> ~0 over 1-2 days) the largest co-tenant, Qwen3-VL (~49 GB at --gpu-memory-utilization 0.35), can no longer load and crash-loops with `torch.AcceleratorError: CUDA error: out of memory`. The same leak OOM'd embeddings/whisper on 2026-05-25. Hits both small-models hosts (gpu11, gpu02) since they run identical config. Fix (inline server + container env): - empty_cache() after every request (core fix): returns cached-but-unused CUDA blocks to the driver so reserved memory stops ratcheting. - set_per_process_memory_fraction(GPU_MEM_FRACTION, 0) (fail-safe): hard ceiling so the process self-OOMs/restarts instead of starving neighbours. Default 0.10 (~14 GB on a 140 GB H200), env-tunable. - torch.inference_mode() around inference: no autograd state retained. Interim mitigation already applied by recreating the container, which frees the leaked VRAM but recurs in ~1-2 days; this makes it permanent. Ship via the normal tag + compose/up redeploy of small-models.yaml. 
--- small-models.yaml | 71 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/small-models.yaml b/small-models.yaml index 1512985..7f1d979 100644 --- a/small-models.yaml +++ b/small-models.yaml @@ -193,6 +193,7 @@ x-privacy-filter-common: &privacy-filter-common "uvicorn[standard]" WORKDIR /app COPY <<'PYEOF' /app/server.py + import os import torch from fastapi import FastAPI, HTTPException from pydantic import BaseModel, Field @@ -201,6 +202,16 @@ x-privacy-filter-common: &privacy-filter-common MODEL_ID = "openai/privacy-filter" MODEL_REVISION = "7ffa9a043d54d1be65afb281eddf0ffbe629385b" + # GPU 7 is shared with Qwen3-VL / FLUX / embeddings / reranker / whisper. + # The HF pipeline's CUDA caching allocator ratchets its reserved memory up + # under traffic and never releases it, slowly hoarding the whole card and + # starving the co-located models until they OOM (Qwen3-VL crash-loops). + # Cap this process to a small fraction of the device so it is fail-safe: + # it self-OOMs and restarts instead of stealing VRAM from its neighbours. + GPU_MEM_FRACTION = float(os.environ.get("GPU_MEM_FRACTION", "0.10")) + if torch.cuda.is_available(): + torch.cuda.set_per_process_memory_fraction(GPU_MEM_FRACTION, 0) + clf = pipeline( "token-classification", model=MODEL_ID, @@ -231,29 +242,39 @@ x-privacy-filter-common: &privacy-filter-common if not texts or any(not isinstance(t, str) for t in texts): raise HTTPException(400, "input must be a non-empty string or list of strings") - # HF pipeline defaults batch_size=1 — pass 32 so the GPU is actually - # fed in parallel for list inputs. - raw = clf(texts, batch_size=32) - - # Single batched tokenize for usage counts instead of N sequential calls. 
- tok_lens = [len(ids) for ids in tokenizer(texts).input_ids] - - data = [] - for i, spans in enumerate(raw): - kept = [ - { - "category": s["entity_group"], - "score": float(s["score"]), - "text": s["word"], - "start": int(s["start"]), - "end": int(s["end"]), - } - for s in spans - if float(s["score"]) >= req.threshold - ] - data.append({"index": i, "spans": kept, "usage": {"input_tokens": tok_lens[i]}}) - - return {"model": MODEL_ID, "data": data} + try: + # HF pipeline defaults batch_size=1 — pass 32 so the GPU is actually + # fed in parallel for list inputs. inference_mode avoids retaining + # any autograd state across requests. + with torch.inference_mode(): + raw = clf(texts, batch_size=32) + + # Single batched tokenize for usage counts instead of N sequential calls. + tok_lens = [len(ids) for ids in tokenizer(texts).input_ids] + + data = [] + for i, spans in enumerate(raw): + kept = [ + { + "category": s["entity_group"], + "score": float(s["score"]), + "text": s["word"], + "start": int(s["start"]), + "end": int(s["end"]), + } + for s in spans + if float(s["score"]) >= req.threshold + ] + data.append({"index": i, "spans": kept, "usage": {"input_tokens": tok_lens[i]}}) + + return {"model": MODEL_ID, "data": data} + finally: + # Return cached-but-unused CUDA blocks to the driver after every + # request so reserved memory does not ratchet up over time on the + # shared GPU. This is the core leak fix; the fraction cap above is + # the fail-safe. + if torch.cuda.is_available(): + torch.cuda.empty_cache() PYEOF EXPOSE 8000 CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"] @@ -263,6 +284,10 @@ x-privacy-filter-common: &privacy-filter-common - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN} - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0} - NVIDIA_DRIVER_CAPABILITIES=compute,utility + # Hard ceiling on this process's share of the shared GPU (see server.py). + # Tune without rebuilding the image. 
~0.10 of a 140 GB H200 ≈ 14 GB, ample + for the classifier and leaves the card for Qwen3-VL/FLUX/etc. + - GPU_MEM_FRACTION=0.10 restart: unless-stopped stop_grace_period: 5m logging: *logging-conf From 7295a659edb10787159358b3de42d6aa3d47e42b Mon Sep 17 00:00:00 2001 From: Lloyd Date: Fri, 29 May 2026 16:47:00 -0700 Subject: [PATCH 2/2] privacy-filter: revise GPU-leak fix per review (watchdog + expandable_segments) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the code review of the first cut: - Root cause now fixed at the source: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True lets the CUDA allocator shrink reserved segments instead of ratcheting up. - Drop per-request torch.cuda.empty_cache(): a synchronizing cudaFree on the hot path stalled the shared GPU and the co-located models it was meant to protect. A 30s watchdog thread now releases idle blocks off the request path. - Real fail-safe instead of a silent 500-storm: the watchdog hard-restarts the container (os._exit -> restart:unless-stopped) if this process's reserved VRAM exceeds GPU_MEM_LIMIT_GB, and an acute CUDA-OOM in a request also exits. The prior "self-OOMs and restarts" comment was false — a caught OOM returned 500 while the process stayed up behind a still-healthy /v1/models probe. - Drop set_per_process_memory_fraction: the 0.10 (~14GB) guess could OOM legit batch_size=32 requests, and device_map="auto" planned against the full card and ignored the cap anyway. Bound the work via PRIVACY_BATCH_SIZE instead; inputs are NOT truncated (a privacy filter must see the whole text). - device=0 instead of device_map="auto" (no accelerate planner mismatch). - Drop torch.inference_mode(): redundant with the pipeline's internal no_grad and stricter (risked raising under trust_remote_code custom models). - Tolerant env parsing + clamps so a malformed knob can't crash-loop boot. Validated: small-models.yaml parses and the embedded server.py compiles. 
--- small-models.yaml | 130 ++++++++++++++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 45 deletions(-) diff --git a/small-models.yaml b/small-models.yaml index 7f1d979..e19b02c 100644 --- a/small-models.yaml +++ b/small-models.yaml @@ -194,6 +194,9 @@ x-privacy-filter-common: &privacy-filter-common WORKDIR /app COPY <<'PYEOF' /app/server.py import os + import sys + import threading + import time import torch from fastapi import FastAPI, HTTPException from pydantic import BaseModel, Field @@ -202,27 +205,65 @@ x-privacy-filter-common: &privacy-filter-common MODEL_ID = "openai/privacy-filter" MODEL_REVISION = "7ffa9a043d54d1be65afb281eddf0ffbe629385b" + + def _env(name, default, cast): + # Tolerate absent/blank/garbage env so a typo can't crash-loop the boot. + try: + return cast(os.environ[name]) + except (KeyError, ValueError): + return default + + # GPU 7 is shared with Qwen3-VL / FLUX / embeddings / reranker / whisper. - # The HF pipeline's CUDA caching allocator ratchets its reserved memory up - # under traffic and never releases it, slowly hoarding the whole card and - # starving the co-located models until they OOM (Qwen3-VL crash-loops). - # Cap this process to a small fraction of the device so it is fail-safe: - # it self-OOMs and restarts instead of stealing VRAM from its neighbours. - GPU_MEM_FRACTION = float(os.environ.get("GPU_MEM_FRACTION", "0.10")) - if torch.cuda.is_available(): - torch.cuda.set_per_process_memory_fraction(GPU_MEM_FRACTION, 0) + # Left unbounded, the HF pipeline's CUDA caching allocator ratchets its + # reserved memory up under traffic and never releases it, slowly hoarding the + # card and starving the co-located models until they OOM (Qwen3-VL crash-loops). + # Defence in depth (all tunable via env, no rebuild): + # * PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True (set in compose) is the + # root-cause fix — reserved segments can shrink again instead of ratcheting. 
+ # * A watchdog returns idle cached blocks to the driver on an interval (never + # per request: empty_cache() is a synchronizing cudaFree that would stall + # the shared GPU) and hard-restarts the container if this process's VRAM + # ever climbs toward starving its neighbours — a real restart (clean reclaim) + # rather than silently serving 500s behind a still-healthy /v1/models probe. + # * An acute CUDA-OOM in a request also exits, for the same clean recycle. + # Inputs are deliberately NOT truncated — a privacy filter must see the whole + # text or it would miss PII; per-request peak is bounded by PRIVACY_BATCH_SIZE. + BATCH_SIZE = max(1, _env("PRIVACY_BATCH_SIZE", 32, int)) + GPU_MEM_LIMIT_GB = max(1.0, _env("GPU_MEM_LIMIT_GB", 32.0, float)) + WATCHDOG_INTERVAL_S = max(1.0, _env("WATCHDOG_INTERVAL_S", 30.0, float)) + + USE_CUDA = torch.cuda.is_available() clf = pipeline( "token-classification", model=MODEL_ID, revision=MODEL_REVISION, aggregation_strategy="simple", - device_map="auto", + device=0 if USE_CUDA else -1, torch_dtype=torch.bfloat16, trust_remote_code=True, ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=MODEL_REVISION, trust_remote_code=True) + + def _gpu_watchdog(): + while True: + time.sleep(WATCHDOG_INTERVAL_S) + torch.cuda.empty_cache() + reserved_gb = torch.cuda.memory_reserved(0) / 1024 ** 3 + if reserved_gb > GPU_MEM_LIMIT_GB: + sys.stderr.write( + "privacy-filter: reserved %.1f GB > %.1f GB limit; " + "exiting for a clean restart\n" % (reserved_gb, GPU_MEM_LIMIT_GB) + ) + sys.stderr.flush() + os._exit(1) + + + if USE_CUDA: + threading.Thread(target=_gpu_watchdog, daemon=True).start() + app = FastAPI() @@ -242,39 +283,34 @@ x-privacy-filter-common: &privacy-filter-common if not texts or any(not isinstance(t, str) for t in texts): raise HTTPException(400, "input must be a non-empty string or list of strings") + # The pipeline runs the forward under torch.no_grad() internally. 
try: - # HF pipeline defaults batch_size=1 — pass 32 so the GPU is actually - # fed in parallel for list inputs. inference_mode avoids retaining - # any autograd state across requests. - with torch.inference_mode(): - raw = clf(texts, batch_size=32) - - # Single batched tokenize for usage counts instead of N sequential calls. - tok_lens = [len(ids) for ids in tokenizer(texts).input_ids] - - data = [] - for i, spans in enumerate(raw): - kept = [ - { - "category": s["entity_group"], - "score": float(s["score"]), - "text": s["word"], - "start": int(s["start"]), - "end": int(s["end"]), - } - for s in spans - if float(s["score"]) >= req.threshold - ] - data.append({"index": i, "spans": kept, "usage": {"input_tokens": tok_lens[i]}}) - - return {"model": MODEL_ID, "data": data} - finally: - # Return cached-but-unused CUDA blocks to the driver after every - # request so reserved memory does not ratchet up over time on the - # shared GPU. This is the core leak fix; the fraction cap above is - # the fail-safe. - if torch.cuda.is_available(): - torch.cuda.empty_cache() + raw = clf(texts, batch_size=BATCH_SIZE) + except torch.cuda.OutOfMemoryError: + # Don't 500-storm behind a healthy /v1/models probe: recycle the + # container so restart:unless-stopped reclaims the VRAM cleanly. + torch.cuda.empty_cache() + os._exit(1) + + # Single batched tokenize for usage counts instead of N sequential calls. 
+ tok_lens = [len(ids) for ids in tokenizer(texts).input_ids] + + data = [] + for i, spans in enumerate(raw): + kept = [ + { + "category": s["entity_group"], + "score": float(s["score"]), + "text": s["word"], + "start": int(s["start"]), + "end": int(s["end"]), + } + for s in spans + if float(s["score"]) >= req.threshold + ] + data.append({"index": i, "spans": kept, "usage": {"input_tokens": tok_lens[i]}}) + + return {"model": MODEL_ID, "data": data} PYEOF EXPOSE 8000 CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"] @@ -284,10 +320,14 @@ x-privacy-filter-common: &privacy-filter-common - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN} - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0} - NVIDIA_DRIVER_CAPABILITIES=compute,utility - # Hard ceiling on this process's share of the shared GPU (see server.py). - # Tune without rebuilding the image. ~0.10 of a 140 GB H200 ≈ 14 GB, ample - # for the classifier and leaves the card for Qwen3-VL/FLUX/etc. - - GPU_MEM_FRACTION=0.10 + # Root-cause fix for the VRAM leak: let the CUDA caching allocator shrink + # reserved segments instead of ratcheting them up and hoarding the shared GPU. + - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + # Watchdog/batch knobs consumed by server.py — tunable per host, no rebuild. + # GPU_MEM_LIMIT_GB caps this process's blast radius on the 140 GB H200 (~10x + # the classifier's footprint), leaving the rest for Qwen3-VL/FLUX/etc. + - GPU_MEM_LIMIT_GB=32 + - PRIVACY_BATCH_SIZE=32 restart: unless-stopped stop_grace_period: 5m logging: *logging-conf