From a0b614be721d3a93c6093eed2761d677471f6a63 Mon Sep 17 00:00:00 2001
From: Lloyd <lloyd@Lloyds-MacBook-Pro.local>
Date: Fri, 29 May 2026 16:09:47 -0700
Subject: [PATCH 1/2] privacy-filter: cap GPU memory + release cache to stop
 VRAM leak

privacy-filter is an inline HF Transformers token-classification server
(`pipeline(..., device_map="auto")`) with no memory bound. Under steady
traffic the CUDA caching allocator's reserved memory ratchets up and is
never released, so the process slowly hoards the GPU it shares with
Qwen3-VL, FLUX, embeddings, reranker and whisper (GPU 7). Observed ~93 GB
held on an H200 for a model that needs ~1-2 GB.

As privacy-filter fills the card (free ~50 GB -> ~0 over 1-2 days) the
largest co-tenant, Qwen3-VL (~49 GB at --gpu-memory-utilization 0.35),
can no longer load and crash-loops with
`torch.AcceleratorError: CUDA error: out of memory`. The same leak OOM'd
embeddings/whisper on 2026-05-25. Hits both small-models hosts (gpu11,
gpu02) since they run identical config.

Fix (inline server + container env):
- empty_cache() after every request (core fix): returns cached-but-unused
  CUDA blocks to the driver so reserved memory stops ratcheting.
- set_per_process_memory_fraction(GPU_MEM_FRACTION, 0) (fail-safe): hard
  ceiling so the process self-OOMs/restarts instead of starving neighbours.
  Default 0.10 (~14 GB on a 140 GB H200), env-tunable.
- torch.inference_mode() around inference: no autograd state retained.

Interim mitigation already applied: recreating the container frees the
leaked VRAM, but the leak recurs in ~1-2 days; this patch makes the fix
permanent. Ship via the normal tag + compose/up redeploy of small-models.yaml.
---
 small-models.yaml | 71 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/small-models.yaml b/small-models.yaml
index 1512985..7f1d979 100644
--- a/small-models.yaml
+++ b/small-models.yaml
@@ -193,6 +193,7 @@ x-privacy-filter-common: &privacy-filter-common
           "uvicorn[standard]"
       WORKDIR /app
       COPY <<'PYEOF' /app/server.py
+      import os
       import torch
       from fastapi import FastAPI, HTTPException
       from pydantic import BaseModel, Field
@@ -201,6 +202,16 @@ x-privacy-filter-common: &privacy-filter-common
       MODEL_ID = "openai/privacy-filter"
       MODEL_REVISION = "7ffa9a043d54d1be65afb281eddf0ffbe629385b"
 
+      # GPU 7 is shared with Qwen3-VL / FLUX / embeddings / reranker / whisper.
+      # The HF pipeline's CUDA caching allocator ratchets its reserved memory up
+      # under traffic and never releases it, slowly hoarding the whole card and
+      # starving the co-located models until they OOM (Qwen3-VL crash-loops).
+      # Cap this process to a small fraction of the device so it is fail-safe:
+      # it self-OOMs and restarts instead of stealing VRAM from its neighbours.
+      GPU_MEM_FRACTION = float(os.environ.get("GPU_MEM_FRACTION", "0.10"))
+      if torch.cuda.is_available():
+          torch.cuda.set_per_process_memory_fraction(GPU_MEM_FRACTION, 0)
+
       clf = pipeline(
           "token-classification",
           model=MODEL_ID,
@@ -231,29 +242,39 @@ x-privacy-filter-common: &privacy-filter-common
           if not texts or any(not isinstance(t, str) for t in texts):
               raise HTTPException(400, "input must be a non-empty string or list of strings")
 
-          # HF pipeline defaults batch_size=1 — pass 32 so the GPU is actually
-          # fed in parallel for list inputs.
-          raw = clf(texts, batch_size=32)
-
-          # Single batched tokenize for usage counts instead of N sequential calls.
-          tok_lens = [len(ids) for ids in tokenizer(texts).input_ids]
-
-          data = []
-          for i, spans in enumerate(raw):
-              kept = [
-                  {
-                      "category": s["entity_group"],
-                      "score": float(s["score"]),
-                      "text": s["word"],
-                      "start": int(s["start"]),
-                      "end": int(s["end"]),
-                  }
-                  for s in spans
-                  if float(s["score"]) >= req.threshold
-              ]
-              data.append({"index": i, "spans": kept, "usage": {"input_tokens": tok_lens[i]}})
-
-          return {"model": MODEL_ID, "data": data}
+          try:
+              # HF pipeline defaults batch_size=1 — pass 32 so the GPU is actually
+              # fed in parallel for list inputs. inference_mode avoids retaining
+              # any autograd state across requests.
+              with torch.inference_mode():
+                  raw = clf(texts, batch_size=32)
+
+              # Single batched tokenize for usage counts instead of N sequential calls.
+              tok_lens = [len(ids) for ids in tokenizer(texts).input_ids]
+
+              data = []
+              for i, spans in enumerate(raw):
+                  kept = [
+                      {
+                          "category": s["entity_group"],
+                          "score": float(s["score"]),
+                          "text": s["word"],
+                          "start": int(s["start"]),
+                          "end": int(s["end"]),
+                      }
+                      for s in spans
+                      if float(s["score"]) >= req.threshold
+                  ]
+                  data.append({"index": i, "spans": kept, "usage": {"input_tokens": tok_lens[i]}})
+
+              return {"model": MODEL_ID, "data": data}
+          finally:
+              # Return cached-but-unused CUDA blocks to the driver after every
+              # request so reserved memory does not ratchet up over time on the
+              # shared GPU. This is the core leak fix; the fraction cap above is
+              # the fail-safe.
+              if torch.cuda.is_available():
+                  torch.cuda.empty_cache()
       PYEOF
       EXPOSE 8000
       CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
@@ -263,6 +284,10 @@ x-privacy-filter-common: &privacy-filter-common
     - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
     - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
     - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+    # Hard ceiling on this process's share of the shared GPU (see server.py).
+    # Tune without rebuilding the image. ~0.10 of a 140 GB H200 ≈ 14 GB, ample
+    # for the classifier and leaves the card for Qwen3-VL/FLUX/etc.
+    - GPU_MEM_FRACTION=0.10
   restart: unless-stopped
   stop_grace_period: 5m
   logging: *logging-conf

From 7295a659edb10787159358b3de42d6aa3d47e42b Mon Sep 17 00:00:00 2001
From: Lloyd <lloyd@Lloyds-MacBook-Pro.local>
Date: Fri, 29 May 2026 16:47:00 -0700
Subject: [PATCH 2/2] privacy-filter: revise GPU-leak fix per review (watchdog
 + expandable_segments)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses the code review of the first cut:

- Root cause now fixed at the source:
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True lets the CUDA allocator
  shrink reserved segments instead of ratcheting up.
- Drop per-request torch.cuda.empty_cache(): a synchronizing cudaFree on the hot
  path stalled the shared GPU and the co-located models it was meant to protect.
  A 30s watchdog thread now releases idle blocks off the request path.
- Real fail-safe instead of a silent 500-storm: the watchdog hard-restarts the
  container (os._exit -> restart:unless-stopped) if this process's reserved VRAM
  exceeds GPU_MEM_LIMIT_GB, and an acute CUDA-OOM in a request also exits. The
  prior "self-OOMs and restarts" comment was false — a caught OOM returned 500
  while the process stayed up behind a still-healthy /v1/models probe.
- Drop set_per_process_memory_fraction: the 0.10 (~14GB) guess could OOM legit
  batch_size=32 requests, and device_map="auto" planned against the full card
  and ignored the cap anyway. Bound the work via PRIVACY_BATCH_SIZE instead;
  inputs are NOT truncated (a privacy filter must see the whole text).
- device=0 instead of device_map="auto" (no accelerate planner mismatch).
- Drop torch.inference_mode(): redundant with the pipeline's internal no_grad
  and stricter (risked raising under trust_remote_code custom models).
- Tolerant env parsing + clamps so a malformed knob can't crash-loop boot.

Validated: small-models.yaml parses and the embedded server.py compiles.
---
 small-models.yaml | 130 ++++++++++++++++++++++++++++++----------------
 1 file changed, 85 insertions(+), 45 deletions(-)

diff --git a/small-models.yaml b/small-models.yaml
index 7f1d979..e19b02c 100644
--- a/small-models.yaml
+++ b/small-models.yaml
@@ -194,6 +194,9 @@ x-privacy-filter-common: &privacy-filter-common
       WORKDIR /app
       COPY <<'PYEOF' /app/server.py
       import os
+      import sys
+      import threading
+      import time
       import torch
       from fastapi import FastAPI, HTTPException
       from pydantic import BaseModel, Field
@@ -202,27 +205,93 @@ x-privacy-filter-common: &privacy-filter-common
       MODEL_ID = "openai/privacy-filter"
       MODEL_REVISION = "7ffa9a043d54d1be65afb281eddf0ffbe629385b"
 
+
+      def _env(name, default, cast):
+          """Return env var *name* converted by *cast*, falling back to *default*.
+
+          Absent, blank, garbage and non-finite (nan/inf) values all yield
+          *default*, so a typo can neither crash-loop the boot nor disarm the
+          watchdog.
+          """
+          import math
+          try:
+              value = cast(os.environ[name])
+          except (KeyError, ValueError):
+              return default
+          # "nan" parses as a float but slips past the max() clamps below
+          # (max(1.0, nan) == 1.0, i.e. a 1 GB limit that restart-loops), and
+          # "inf" switches the limit off or makes time.sleep() raise.
+          if isinstance(value, float) and not math.isfinite(value):
+              return default
+          return value
+
+
       # GPU 7 is shared with Qwen3-VL / FLUX / embeddings / reranker / whisper.
-      # The HF pipeline's CUDA caching allocator ratchets its reserved memory up
-      # under traffic and never releases it, slowly hoarding the whole card and
-      # starving the co-located models until they OOM (Qwen3-VL crash-loops).
-      # Cap this process to a small fraction of the device so it is fail-safe:
-      # it self-OOMs and restarts instead of stealing VRAM from its neighbours.
-      GPU_MEM_FRACTION = float(os.environ.get("GPU_MEM_FRACTION", "0.10"))
-      if torch.cuda.is_available():
-          torch.cuda.set_per_process_memory_fraction(GPU_MEM_FRACTION, 0)
+      # Left unbounded, the HF pipeline's CUDA caching allocator ratchets its
+      # reserved memory up under traffic and never releases it, slowly hoarding the
+      # card and starving the co-located models until they OOM (Qwen3-VL crash-loops).
+      # Defence in depth (all tunable via env, no rebuild):
+      #   * PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True (set in compose) is the
+      #     root-cause fix — reserved segments can shrink again instead of ratcheting.
+      #   * A watchdog returns idle cached blocks to the driver on an interval (never
+      #     per request: empty_cache() is a synchronizing cudaFree that would stall
+      #     the shared GPU) and hard-restarts the container if this process's VRAM
+      #     ever climbs toward starving its neighbours — a real restart (clean reclaim)
+      #     rather than silently serving 500s behind a still-healthy /v1/models probe.
+      #   * An acute CUDA-OOM in a request also exits, for the same clean recycle.
+      # Inputs are deliberately NOT truncated — a privacy filter must see the whole
+      # text or it would miss PII; per-request peak is bounded by PRIVACY_BATCH_SIZE.
+      BATCH_SIZE = max(1, _env("PRIVACY_BATCH_SIZE", 32, int))
+      GPU_MEM_LIMIT_GB = max(1.0, _env("GPU_MEM_LIMIT_GB", 32.0, float))
+      WATCHDOG_INTERVAL_S = max(1.0, _env("WATCHDOG_INTERVAL_S", 30.0, float))
+
+      USE_CUDA = torch.cuda.is_available()
 
       clf = pipeline(
           "token-classification",
           model=MODEL_ID,
           revision=MODEL_REVISION,
           aggregation_strategy="simple",
-          device_map="auto",
+          device=0 if USE_CUDA else -1,
           torch_dtype=torch.bfloat16,
           trust_remote_code=True,
       )
       tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=MODEL_REVISION, trust_remote_code=True)
 
+
+      def _gpu_watchdog():
+          """Release idle CUDA cache on an interval; exit if VRAM stays over limit.
+
+          Runs forever (started as a daemon thread). Every WATCHDOG_INTERVAL_S
+          seconds it calls torch.cuda.empty_cache(); if device 0's reserved memory
+          is still above GPU_MEM_LIMIT_GB it logs to stderr and calls os._exit(1)
+          so the container's restart policy recycles the process.
+          """
+          while True:
+              time.sleep(WATCHDOG_INTERVAL_S)
+              try:
+                  torch.cuda.empty_cache()
+                  reserved_gb = torch.cuda.memory_reserved(0) / 1024 ** 3
+              except Exception as exc:
+                  # An uncaught error would end this thread silently and take the
+                  # fail-safe with it; report it and try again next tick.
+                  sys.stderr.write(
+                      "privacy-filter: watchdog probe failed: %r\n" % (exc,)
+                  )
+                  sys.stderr.flush()
+                  continue
+              if reserved_gb > GPU_MEM_LIMIT_GB:
+                  sys.stderr.write(
+                      "privacy-filter: reserved %.1f GB > %.1f GB limit; "
+                      "exiting for a clean restart\n" % (reserved_gb, GPU_MEM_LIMIT_GB)
+                  )
+                  sys.stderr.flush()
+                  os._exit(1)
+
+
+      if USE_CUDA:
+          threading.Thread(target=_gpu_watchdog, daemon=True).start()
+
       app = FastAPI()
 
 
@@ -242,39 +283,34 @@ x-privacy-filter-common: &privacy-filter-common
           if not texts or any(not isinstance(t, str) for t in texts):
               raise HTTPException(400, "input must be a non-empty string or list of strings")
 
+          # The pipeline runs the forward under torch.no_grad() internally.
           try:
-              # HF pipeline defaults batch_size=1 — pass 32 so the GPU is actually
-              # fed in parallel for list inputs. inference_mode avoids retaining
-              # any autograd state across requests.
-              with torch.inference_mode():
-                  raw = clf(texts, batch_size=32)
-
-              # Single batched tokenize for usage counts instead of N sequential calls.
-              tok_lens = [len(ids) for ids in tokenizer(texts).input_ids]
-
-              data = []
-              for i, spans in enumerate(raw):
-                  kept = [
-                      {
-                          "category": s["entity_group"],
-                          "score": float(s["score"]),
-                          "text": s["word"],
-                          "start": int(s["start"]),
-                          "end": int(s["end"]),
-                      }
-                      for s in spans
-                      if float(s["score"]) >= req.threshold
-                  ]
-                  data.append({"index": i, "spans": kept, "usage": {"input_tokens": tok_lens[i]}})
-
-              return {"model": MODEL_ID, "data": data}
-          finally:
-              # Return cached-but-unused CUDA blocks to the driver after every
-              # request so reserved memory does not ratchet up over time on the
-              # shared GPU. This is the core leak fix; the fraction cap above is
-              # the fail-safe.
-              if torch.cuda.is_available():
-                  torch.cuda.empty_cache()
+              raw = clf(texts, batch_size=BATCH_SIZE)
+          except torch.cuda.OutOfMemoryError:
+              # Don't 500-storm behind a healthy /v1/models probe: recycle the
+              # container so restart:unless-stopped reclaims the VRAM cleanly.
+              torch.cuda.empty_cache()
+              os._exit(1)
+
+          # Single batched tokenize for usage counts instead of N sequential calls.
+          tok_lens = [len(ids) for ids in tokenizer(texts).input_ids]
+
+          data = []
+          for i, spans in enumerate(raw):
+              kept = [
+                  {
+                      "category": s["entity_group"],
+                      "score": float(s["score"]),
+                      "text": s["word"],
+                      "start": int(s["start"]),
+                      "end": int(s["end"]),
+                  }
+                  for s in spans
+                  if float(s["score"]) >= req.threshold
+              ]
+              data.append({"index": i, "spans": kept, "usage": {"input_tokens": tok_lens[i]}})
+
+          return {"model": MODEL_ID, "data": data}
       PYEOF
       EXPOSE 8000
       CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
@@ -284,10 +320,14 @@ x-privacy-filter-common: &privacy-filter-common
     - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
     - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
     - NVIDIA_DRIVER_CAPABILITIES=compute,utility
-    # Hard ceiling on this process's share of the shared GPU (see server.py).
-    # Tune without rebuilding the image. ~0.10 of a 140 GB H200 ≈ 14 GB, ample
-    # for the classifier and leaves the card for Qwen3-VL/FLUX/etc.
-    - GPU_MEM_FRACTION=0.10
+    # Root-cause fix for the VRAM leak: let the CUDA caching allocator shrink
+    # reserved segments instead of ratcheting them up and hoarding the shared GPU.
+    - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+    # Watchdog/batch knobs consumed by server.py — tunable per host, no rebuild.
+    # GPU_MEM_LIMIT_GB caps this process's blast radius on the 140 GB H200 (~10x
+    # the classifier's footprint), leaving the rest for Qwen3-VL/FLUX/etc.
+    - GPU_MEM_LIMIT_GB=32
+    - PRIVACY_BATCH_SIZE=32
   restart: unless-stopped
   stop_grace_period: 5m
   logging: *logging-conf