nearai · lloydmak99 · May 29, 2026 · May 29, 2026
diff --git a/small-models.yaml b/small-models.yaml
@@ -193,6 +193,10 @@ x-privacy-filter-common: &privacy-filter-common
           "uvicorn[standard]"
       WORKDIR /app
       COPY <<'PYEOF' /app/server.py
+      import os
+      import sys
+      import threading
+      import time
       import torch
       from fastapi import FastAPI, HTTPException
       from pydantic import BaseModel, Field
@@ -201,17 +205,65 @@ x-privacy-filter-common: &privacy-filter-common
       MODEL_ID = "openai/privacy-filter"
       MODEL_REVISION = "7ffa9a043d54d1be65afb281eddf0ffbe629385b"
 
+
+      def _env(name, default, cast):
+          """Return env var *name* converted by *cast*, or *default* if unusable.
+
+          Absent variables, values *cast* rejects with ValueError (blank,
+          garbage) and non-finite floats ("nan", "inf") all fall back to
+          *default*, so a typo can't crash-loop the boot.
+          """
+          import math  # local import keeps this helper self-contained
+
+          try:
+              value = cast(os.environ[name])
+          except (KeyError, ValueError):
+              return default
+          # float() accepts "nan"/"inf": nan collapses to the floor of a later
+          # max(floor, value) clamp, and inf is not a usable interval or limit.
+          if isinstance(value, float) and not math.isfinite(value):
+              return default
+          return value
+
+
+      # GPU 7 is shared with Qwen3-VL / FLUX / embeddings / reranker / whisper.
+      # Left unbounded, the HF pipeline's CUDA caching allocator ratchets its
+      # reserved memory up under traffic and never releases it, slowly hoarding the
+      # card and starving the co-located models until they OOM (Qwen3-VL crash-loops).
+      # Defence in depth (all tunable via env, no rebuild):
+      #   * PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True (set in compose) is the
+      #     root-cause fix — reserved segments can shrink again instead of ratcheting.
+      #   * A watchdog returns idle cached blocks to the driver on an interval (never
+      #     per request: empty_cache() is a synchronizing cudaFree that would stall
+      #     the shared GPU) and hard-restarts the container if this process's VRAM
+      #     ever climbs toward starving its neighbours — a real restart (clean reclaim)
+      #     rather than silently serving 500s behind a still-healthy /v1/models probe.
+      #   * An acute CUDA-OOM in a request also exits, for the same clean recycle.
+      # Inputs are deliberately NOT truncated — a privacy filter must see the whole
+      # text or it would miss PII. PRIVACY_BATCH_SIZE caps how many texts share one
+      # forward pass, so per-request peak scales with it and with the longest input.
+      # Each knob is floored with max(...) so a zero or negative value cannot
+      # produce an empty batch, a sub-1 GB limit, or a sub-second watchdog interval.
+      BATCH_SIZE = max(1, _env("PRIVACY_BATCH_SIZE", 32, int))
+      GPU_MEM_LIMIT_GB = max(1.0, _env("GPU_MEM_LIMIT_GB", 32.0, float))
+      WATCHDOG_INTERVAL_S = max(1.0, _env("WATCHDOG_INTERVAL_S", 30.0, float))
+
+      # Evaluated once at import; selects the pipeline device and gates the watchdog.
+      USE_CUDA = torch.cuda.is_available()
+
       clf = pipeline(
           "token-classification",
           model=MODEL_ID,
           revision=MODEL_REVISION,
           aggregation_strategy="simple",
-          device_map="auto",
+          # Pin to one device: GPU 0 when CUDA is available, otherwise CPU (-1).
+          device=0 if USE_CUDA else -1,
           torch_dtype=torch.bfloat16,
           trust_remote_code=True,
       )
       tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=MODEL_REVISION, trust_remote_code=True)
 
+
+      def _gpu_watchdog():
+          """Periodically trim the CUDA cache and exit if too much stays reserved.
+
+          Loops forever: every WATCHDOG_INTERVAL_S seconds it returns idle cached
+          blocks to the driver, then reads what is still reserved on device 0.
+          Above GPU_MEM_LIMIT_GB it writes a message to stderr and hard-exits
+          with status 1 so the container's restart policy can reclaim the memory.
+          """
+          while True:
+              # Kept outside the try: a failing sleep must not become a hot loop.
+              time.sleep(WATCHDOG_INTERVAL_S)
+              try:
+                  torch.cuda.empty_cache()
+                  # Read after the trim, so idle cache just released is not counted.
+                  reserved_gb = torch.cuda.memory_reserved(0) / 1024 ** 3
+              except Exception as exc:
+                  # An uncaught error would end this thread silently and leave the
+                  # process unguarded; report it and try again on the next tick.
+                  sys.stderr.write(
+                      "privacy-filter: watchdog check failed: %r\n" % (exc,)
+                  )
+                  sys.stderr.flush()
+                  continue
+              if reserved_gb > GPU_MEM_LIMIT_GB:
+                  sys.stderr.write(
+                      "privacy-filter: reserved %.1f GB > %.1f GB limit; "
+                      "exiting for a clean restart\n" % (reserved_gb, GPU_MEM_LIMIT_GB)
+                  )
+                  sys.stderr.flush()
+                  # os._exit skips interpreter cleanup, so the process dies
+                  # immediately even though this runs on a non-main thread.
+                  os._exit(1)
+
+
+      # Start the watchdog only when a GPU is present; daemon=True so the thread
+      # never keeps the process alive at shutdown.
+      if USE_CUDA:
+          threading.Thread(target=_gpu_watchdog, daemon=True).start()
+
       app = FastAPI()
 
 
@@ -231,9 +283,14 @@ x-privacy-filter-common: &privacy-filter-common
           if not texts or any(not isinstance(t, str) for t in texts):
               raise HTTPException(400, "input must be a non-empty string or list of strings")
 
-          # HF pipeline defaults batch_size=1 — pass 32 so the GPU is actually
-          # fed in parallel for list inputs.
-          raw = clf(texts, batch_size=32)
+          # The pipeline runs the forward under torch.no_grad() internally.
+          try:
+              raw = clf(texts, batch_size=BATCH_SIZE)
+          except torch.cuda.OutOfMemoryError:
+              # Don't 500-storm behind a healthy /v1/models probe: recycle the
+              # container so restart:unless-stopped reclaims the VRAM cleanly.
+              torch.cuda.empty_cache()
+              os._exit(1)
 
           # Single batched tokenize for usage counts instead of N sequential calls.
           tok_lens = [len(ids) for ids in tokenizer(texts).input_ids]
@@ -263,6 +320,14 @@ x-privacy-filter-common: &privacy-filter-common
     - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
     - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
     - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+    # Root-cause fix for the VRAM leak: let the CUDA caching allocator shrink
+    # reserved segments instead of ratcheting them up and hoarding the shared GPU.
+    - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+    # Watchdog/batch knobs consumed by server.py — tunable per host, no rebuild.
+    # GPU_MEM_LIMIT_GB caps this process's blast radius on the 140 GB H200 (~10x
+    # the classifier's footprint), leaving the rest for Qwen3-VL/FLUX/etc.
+    # WATCHDOG_INTERVAL_S is also honoured; it is unset here, so server.py's
+    # 30 s default applies.
+    - GPU_MEM_LIMIT_GB=32
+    - PRIVACY_BATCH_SIZE=32
   restart: unless-stopped
   stop_grace_period: 5m
   logging: *logging-conf