diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml
index 12c186ca2870..3296c6d84578 100644
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -563,7 +563,7 @@ jobs:
       - name: Run e2e-backends smoke
         env:
           BACKEND_IMAGE: quay.io/go-skynet/local-ai-backends:master-cpu-llama-cpp
-          BACKEND_TEST_CAPS: health,load,predict,stream,logprobs,logit_bias
+          BACKEND_TEST_CAPS: health,load,predict,stream,logprobs,logit_bias,tokenize
         run: |
           make test-extra-backend
   # Realtime e2e with sherpa-onnx driving VAD + STT + TTS against a mocked LLM.
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index ac5521bc44ae..9f82001260c5 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -38,6 +38,7 @@
 #include <cstdlib>
 #include <fstream>
 #include <iterator>
+#include <limits>
 #include <list>
 #include <map>
 #include <mutex>
@@ -1282,6 +1283,232 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
 }
 
 
+// ============================================================================
+// Token-classification (NER) support for the openai-privacy-filter arch.
+//
+// The model emits BIOES-tagged per-token logits (217 classes for the
+// multilingual privacy filter: "O" plus {B,I,E,S}-<CATEGORY>). We decode the
+// most likely *valid* BIOES path with a constrained linear-chain Viterbi (the
+// model's 6 transition biases are all 0.0 in the shipped
+// viterbi_calibration.json, so only the structural BIOES constraints apply),
+// assemble spans, and map token spans to UTF-8 byte offsets in the source text.
+//
+// Receptive-field note: attention is a symmetric +/-sliding_window band PER
+// LAYER, so after n_layer layers a token's logits depend on its
+// +/-(n_layer * sliding_window) neighbourhood -- NOT +/-sliding_window. Windowed
+// inference for long inputs must therefore use a halo of n_layer*sliding_window
+// to stay bit-exact with a single forward (see TokenClassify below).
+// ============================================================================
+namespace pf_ner {
+
+// Per-layer attention half-window for openai-privacy-filter (config
+// sliding_window = 128). Only used to size the windowing halo for inputs that
+// exceed a single forward; short inputs (the common PII case) never window.
+static constexpr int PF_SLIDING_WINDOW = 128;
+
+enum bioes_tag { TAG_O = 0, TAG_B, TAG_I, TAG_E, TAG_S };
+
+struct label_info {
+    bioes_tag tag;
+    int       cat; // index into label_table::categories; -1 for O / unknown
+};
+
+// Parsed view of the model's classifier labels.
+struct label_table {
+    std::vector<label_info>  labels;     // size n_cls, indexed by class id
+    std::vector<std::string> categories; // distinct entity-group names
+    int                      o_label = 0;  // class id of the "O" (outside) label
+
+    // Per-category open-state class ids (B/I), used by the Viterbi inner loop.
+    struct open_ids { int b = -1; int i = -1; };
+    std::vector<open_ids> per_cat;
+
+    const std::string & category_name(int cat) const { return categories[cat]; }
+};
+
+// Split a "B-CATEGORY" label into its BIOES tag and category name. The model's
+// labels use a single '-' separator and category names contain none (verified
+// against the GGUF metadata).
+static label_table build_label_table(const llama_model * model) {
+    label_table t;
+    const uint32_t n = llama_model_n_cls_out(model);
+    t.labels.resize(n, { TAG_O, -1 });
+    std::map<std::string, int> cat_index;
+    bool found_o = false;
+    for (uint32_t i = 0; i < n; i++) {
+        const char * raw = llama_model_cls_label(model, i);
+        std::string s = raw ? raw : "";
+        if (s.empty() || s == "O") {
+            t.labels[i] = { TAG_O, -1 };
+            if (!found_o) { t.o_label = (int) i; found_o = true; }
+            continue;
+        }
+        bioes_tag tag;
+        switch (s[0]) {
+            case 'B': tag = TAG_B; break;
+            case 'I': tag = TAG_I; break;
+            case 'E': tag = TAG_E; break;
+            case 'S': tag = TAG_S; break;
+            default:  t.labels[i] = { TAG_O, -1 }; continue; // unknown -> treat as O
+        }
+        const size_t dash = s.find('-');
+        const std::string cat = (dash == std::string::npos) ? s : s.substr(dash + 1);
+        int ci;
+        auto it = cat_index.find(cat);
+        if (it == cat_index.end()) {
+            ci = (int) t.categories.size();
+            cat_index.emplace(cat, ci);
+            t.categories.push_back(cat);
+        } else {
+            ci = it->second;
+        }
+        t.labels[i] = { tag, ci };
+    }
+    t.per_cat.assign(t.categories.size(), {});
+    for (uint32_t i = 0; i < n; i++) {
+        const auto & li = t.labels[i];
+        if (li.cat < 0) continue;
+        if (li.tag == TAG_B) t.per_cat[li.cat].b = (int) i;
+        if (li.tag == TAG_I) t.per_cat[li.cat].i = (int) i;
+    }
+    return t;
+}
+
+static inline bool tag_is_closed(bioes_tag tg) { return tg == TAG_O || tg == TAG_E || tg == TAG_S; }
+
+// Constrained linear-chain Viterbi over BIOES. `emit` is row-major
+// [n_tok * n_cls] of per-token LOG-probabilities. Returns the best valid label
+// per token. Exploits the BIOES structure so each step is O(n_cls), not
+// O(n_cls^2): a fresh label (O/B/S) may only follow a closed state (O/E/S) and
+// can take the single best closed predecessor; a continuation (I/E of category
+// c) may only follow B-c or I-c. Falls back to per-token argmax only if no
+// valid path survives numerically (the all-O path always exists, so this is a
+// safety net).
+static std::vector<int> bioes_viterbi(const label_table & lt,
+                                      const std::vector<float> & emit,
+                                      int n_tok, int n_cls) {
+    const float NEG = -std::numeric_limits<float>::infinity();
+    std::vector<float> prev_dp(n_cls, NEG), dp(n_cls, NEG);
+    std::vector<int>   bp((size_t) n_tok * n_cls, -1);
+
+    // t == 0: a span may only start with O / B / S.
+    for (int j = 0; j < n_cls; j++) {
+        const bioes_tag tg = lt.labels[j].tag;
+        if (tg == TAG_O || tg == TAG_B || tg == TAG_S) prev_dp[j] = emit[j];
+    }
+
+    for (int t = 1; t < n_tok; t++) {
+        std::fill(dp.begin(), dp.end(), NEG);
+        const float * e = &emit[(size_t) t * n_cls];
+
+        // best closed predecessor (O/E/S) from the previous step
+        float best_closed = NEG; int best_closed_arg = -1;
+        for (int i = 0; i < n_cls; i++) {
+            if (prev_dp[i] == NEG) continue;
+            if (tag_is_closed(lt.labels[i].tag) && prev_dp[i] > best_closed) {
+                best_closed = prev_dp[i];
+                best_closed_arg = i;
+            }
+        }
+
+        for (int j = 0; j < n_cls; j++) {
+            const auto & lj = lt.labels[j];
+            float pred = NEG; int arg = -1;
+            if (lj.tag == TAG_O || lj.tag == TAG_B || lj.tag == TAG_S) {
+                pred = best_closed; arg = best_closed_arg; // fresh start
+            } else {
+                // I-c or E-c: predecessor must be B-c or I-c
+                const auto & oc = lt.per_cat[lj.cat];
+                if (oc.b >= 0 && prev_dp[oc.b] > pred) { pred = prev_dp[oc.b]; arg = oc.b; }
+                if (oc.i >= 0 && prev_dp[oc.i] > pred) { pred = prev_dp[oc.i]; arg = oc.i; }
+            }
+            if (arg >= 0 && pred != NEG) {
+                dp[j] = pred + e[j];
+                bp[(size_t) t * n_cls + j] = arg;
+            }
+        }
+        prev_dp.swap(dp);
+    }
+
+    // terminate only on a closed state (no dangling B/I span)
+    float best = NEG; int arg = -1;
+    for (int j = 0; j < n_cls; j++) {
+        if (prev_dp[j] == NEG) continue;
+        if (tag_is_closed(lt.labels[j].tag) && prev_dp[j] > best) { best = prev_dp[j]; arg = j; }
+    }
+
+    std::vector<int> path(n_tok, lt.o_label);
+    if (arg < 0) {
+        for (int t = 0; t < n_tok; t++) {
+            const float * e = &emit[(size_t) t * n_cls];
+            int a = 0; float m = e[0];
+            for (int j = 1; j < n_cls; j++) if (e[j] > m) { m = e[j]; a = j; }
+            path[t] = a;
+        }
+        return path;
+    }
+    int cur = arg;
+    for (int t = n_tok - 1; t >= 0; t--) {
+        path[t] = cur;
+        if (t > 0) cur = bp[(size_t) t * n_cls + cur];
+    }
+    return path;
+}
+
+// One assembled entity span over token indices [tok_begin, tok_end] inclusive.
+struct span {
+    int cat;
+    int tok_begin;
+    int tok_end;
+    float score; // mean per-token probability of the chosen labels
+};
+
+// Walk a (valid) BIOES label path into spans. Viterbi guarantees validity, so
+// B is always closed by a matching E and S stands alone.
+static std::vector<span> assemble_spans(const label_table & lt,
+                                        const std::vector<int> & path,
+                                        const std::vector<float> & emit,
+                                        int n_cls) {
+    std::vector<span> out;
+    int n_tok = (int) path.size();
+    int begin = -1, cat = -1;
+    double prob_sum = 0.0;
+    auto prob_at = [&](int t) {
+        return (double) std::exp(emit[(size_t) t * n_cls + path[t]]);
+    };
+    for (int t = 0; t < n_tok; t++) {
+        const auto & li = lt.labels[path[t]];
+        switch (li.tag) {
+            case TAG_S:
+                out.push_back({ li.cat, t, t, (float) prob_at(t) });
+                begin = -1;
+                break;
+            case TAG_B:
+                begin = t; cat = li.cat; prob_sum = prob_at(t);
+                break;
+            case TAG_I:
+                if (begin >= 0 && li.cat == cat) prob_sum += prob_at(t);
+                break;
+            case TAG_E:
+                if (begin >= 0 && li.cat == cat) {
+                    prob_sum += prob_at(t);
+                    const int len = t - begin + 1;
+                    out.push_back({ cat, begin, t, (float) (prob_sum / len) });
+                }
+                begin = -1;
+                break;
+            case TAG_O:
+            default:
+                begin = -1;
+                break;
+        }
+    }
+    return out;
+}
+
+} // namespace pf_ner
+
+
 // GRPC Server start
 class BackendServiceImpl final : public backend::Backend::Service {
 private:
@@ -3444,6 +3671,186 @@ class BackendServiceImpl final : public backend::Backend::Service {
         return grpc::Status::OK;
     }
 
+    // TokenClassify runs the openai-privacy-filter token classifier (a
+    // bidirectional MoE encoder with a per-token BIOES head) over the supplied
+    // text and returns the detected entity spans. It mirrors Score's
+    // direct-decode strategy (bypassing the slot/task queue) because it needs
+    // full control over batch construction, per-token logit readout, and
+    // overlapping-window stitching for long inputs.
+    //
+    // The model must be loaded with embeddings enabled and TOKEN_CLS pooling
+    // (the converter writes pooling_type = TOKEN_CLS into the GGUF, so a model
+    // YAML only needs `embeddings: true`). Pipeline:
+    //   tokenize (+offsets) -> windowed non-causal forward -> per-token
+    //   log_softmax -> constrained BIOES Viterbi -> spans -> byte offsets.
+    grpc::Status TokenClassify(ServerContext* context, const backend::TokenClassifyRequest* request, backend::TokenClassifyResponse* response) override {
+        auto auth = checkAuth(context);
+        if (!auth.ok()) return auth;
+        if (params_base.model.path.empty()) {
+            return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
+        }
+
+        // Tripwire against the slot loop + serialise concurrent TokenClassify
+        // calls, exactly as Score does (see Score's class comment): we drive
+        // llama_decode directly, so we must not race the slot loop or another
+        // direct-decode RPC.
+        conflict_guard guard("TokenClassify", score_inflight, slot_loop_inflight, "slot_loop_inflight");
+        static std::mutex token_classify_mutex;
+        std::lock_guard<std::mutex> tc_lock(token_classify_mutex);
+
+        llama_context * lctx = ctx_server.get_llama_context();
+        if (lctx == nullptr) {
+            return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "llama context unavailable (sleeping?)");
+        }
+        if (!params_base.embedding || llama_pooling_type(lctx) != LLAMA_POOLING_TYPE_TOKEN_CLS) {
+            return grpc::Status(grpc::StatusCode::UNIMPLEMENTED,
+                "This model does not support token classification. Load a TOKEN_CLS-pooling model (e.g. openai-privacy-filter) with `embeddings: true`");
+        }
+
+        const llama_model * model = ctx_server.impl->model_tgt;
+        const llama_vocab * vocab = ctx_server.impl->vocab;
+        const int n_cls = (int) llama_model_n_cls_out(model);
+        const int n_embd_out = llama_model_n_embd_out(model);
+        if (n_cls <= 0 || n_embd_out != n_cls) {
+            return grpc::Status(grpc::StatusCode::INTERNAL,
+                "TokenClassify: unexpected classifier output width (n_cls_out=" + std::to_string(n_cls) +
+                ", n_embd_out=" + std::to_string(n_embd_out) + ")");
+        }
+
+        const std::string & text = request->text();
+        if (text.empty()) {
+            return grpc::Status::OK; // no text -> no entities
+        }
+
+        // Tokenize once. add_special matches the verified llama-embedding parity
+        // path; rendering pieces with special=false makes any control tokens
+        // (e.g. an injected BOS) zero-width so they never fall inside a span and
+        // do not perturb byte offsets.
+        std::vector<llama_token> tokens = common_tokenize(vocab, text, /*add_special=*/true, /*parse_special=*/true);
+        const int n_tok = (int) tokens.size();
+        if (n_tok == 0) {
+            return grpc::Status::OK;
+        }
+
+        // Per-token UTF-8 byte offsets into `text`, by accumulating piece lengths.
+        // o200k is byte-level reversible, so piece concatenation reproduces the
+        // input bytes exactly; we validate and warn (best-effort) if it doesn't.
+        std::vector<int> tok_off(n_tok), tok_end(n_tok);
+        {
+            size_t running = 0;
+            for (int k = 0; k < n_tok; k++) {
+                std::string piece = common_token_to_piece(vocab, tokens[k], /*special=*/false);
+                tok_off[k] = (int) running;
+                running += piece.size();
+                tok_end[k] = (int) running;
+            }
+            if (running != text.size()) {
+                LOG_WRN("TokenClassify: detokenized length %zu != input length %zu; byte offsets may be approximate\n",
+                        running, text.size());
+            }
+        }
+
+        // Window geometry. A single forward is exact whenever the input fits one
+        // ubatch (the common short-PII case). For longer inputs we slide
+        // overlapping windows with a halo of n_layer*sliding_window so interior
+        // tokens see their full receptive field (see the namespace note).
+        const int W = std::min<int>((int) llama_n_ubatch(lctx), (int) llama_n_ctx(lctx));
+        const int halo = (int) llama_model_n_layer(model) * pf_ner::PF_SLIDING_WINDOW;
+        if (W <= 0) {
+            return grpc::Status(grpc::StatusCode::INTERNAL, "TokenClassify: invalid ubatch/context size");
+        }
+        if (n_tok > W && W <= 2 * halo) {
+            return grpc::Status(grpc::StatusCode::OUT_OF_RANGE,
+                "TokenClassify: input (" + std::to_string(n_tok) + " tokens) exceeds the single-forward window (" +
+                std::to_string(W) + ") and exact windowing needs nbatch > " + std::to_string(2 * halo) +
+                "; increase the model's nbatch/n_ctx");
+        }
+
+        std::vector<float> emit((size_t) n_tok * n_cls);
+        llama_batch batch = llama_batch_init(W, 0, 1);
+
+        // Decode one window [start, start+wlen) and write log-softmax rows for
+        // the interior global positions [start+lo, start+hi). Positions are
+        // window-local (0..wlen-1): RoPE is relative and the symmetric band uses
+        // |p1-p0|, so local positions are equivalent to absolute ones here.
+        auto run_window = [&](int start, int wlen, int lo, int hi) -> grpc::Status {
+            common_batch_clear(batch);
+            for (int j = 0; j < wlen; j++) {
+                common_batch_add(batch, tokens[start + j], j, { 0 }, /*logits=*/true);
+            }
+            llama_memory_clear(llama_get_memory(lctx), true);
+            int rc = llama_decode(lctx, batch);
+            if (rc < 0) {
+                return grpc::Status(grpc::StatusCode::INTERNAL,
+                    "TokenClassify: llama_decode failed (" + std::to_string(rc) + ")");
+            }
+            for (int li = lo; li < hi; li++) {
+                const float * row = llama_get_embeddings_ith(lctx, li);
+                if (row == nullptr) {
+                    return grpc::Status(grpc::StatusCode::INTERNAL,
+                        "TokenClassify: null embeddings at window position " + std::to_string(li));
+                }
+                // log_softmax over the n_cls logits (fp32, max-subtraction stable)
+                float maxv = row[0];
+                for (int c = 1; c < n_cls; c++) if (row[c] > maxv) maxv = row[c];
+                double sum = 0.0;
+                for (int c = 0; c < n_cls; c++) sum += std::exp((double) (row[c] - maxv));
+                const double logsum = std::log(sum);
+                float * dst = &emit[(size_t) (start + li) * n_cls];
+                for (int c = 0; c < n_cls; c++) {
+                    dst[c] = (float) ((double) (row[c] - maxv) - logsum);
+                }
+            }
+            return grpc::Status::OK;
+        };
+
+        grpc::Status st = grpc::Status::OK;
+        if (n_tok <= W) {
+            st = run_window(0, n_tok, 0, n_tok);
+        } else {
+            const int stride = W - 2 * halo;
+            for (int start = 0; start < n_tok; start += stride) {
+                const int wlen = std::min(W, n_tok - start);
+                const int lo = (start == 0) ? 0 : halo;
+                const int hi = (start + wlen >= n_tok) ? wlen : (wlen - halo);
+                st = run_window(start, wlen, lo, hi);
+                if (!st.ok()) break;
+                if (start + wlen >= n_tok) break;
+            }
+        }
+        llama_batch_free(batch);
+        if (!st.ok()) {
+            return st;
+        }
+
+        // Decode the BIOES path and assemble spans.
+        const pf_ner::label_table lt = pf_ner::build_label_table(model);
+        const std::vector<int>    path = pf_ner::bioes_viterbi(lt, emit, n_tok, n_cls);
+        const std::vector<pf_ner::span> spans = pf_ner::assemble_spans(lt, path, emit, n_cls);
+
+        const float threshold = request->threshold();
+        for (const auto & sp : spans) {
+            if (sp.score < threshold) continue;
+            int bstart = tok_off[sp.tok_begin];
+            int bend   = tok_end[sp.tok_end];
+            if (bstart < 0 || bend > (int) text.size() || bstart >= bend) continue;
+            // Trim leading/trailing ASCII whitespace: the o200k tokenizer folds a
+            // leading space into the token piece, so a span would otherwise read
+            // " John" instead of "John" — masking the trimmed form is cleaner.
+            while (bstart < bend && (unsigned char) text[bstart] <= ' ') bstart++;
+            while (bend > bstart && (unsigned char) text[bend - 1] <= ' ') bend--;
+            if (bstart >= bend) continue;
+            backend::TokenClassifyEntity * ent = response->add_entities();
+            ent->set_entity_group(lt.category_name(sp.cat));
+            ent->set_start(bstart);
+            ent->set_end(bend);
+            ent->set_score(sp.score);
+            ent->set_text(text.substr(bstart, (size_t) (bend - bstart)));
+        }
+
+        return grpc::Status::OK;
+    }
+
     grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response) override {
         auto auth = checkAuth(context);
         if (!auth.ok()) return auth;
@@ -3458,7 +3865,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
         if (body.count("prompt") != 0) {
             const bool add_special = json_value(body, "add_special", false);
 
-            llama_tokens tokens = tokenize_mixed(ctx_server.impl->vocab, body.at("content"), add_special, true);
+            llama_tokens tokens = tokenize_mixed(ctx_server.impl->vocab, body.at("prompt"), add_special, true);
 
 
             for (const auto& token : tokens) {
diff --git a/backend/cpp/llama-cpp/patches/0001-token-cls-pooling-substrate.patch b/backend/cpp/llama-cpp/patches/0001-token-cls-pooling-substrate.patch
new file mode 100644
index 000000000000..81e62851d54f
--- /dev/null
+++ b/backend/cpp/llama-cpp/patches/0001-token-cls-pooling-substrate.patch
@@ -0,0 +1,157 @@
+diff --git a/common/arg.cpp b/common/arg.cpp
+index e0f6c6066..6a62c43f2 100644
+--- a/common/arg.cpp
++++ b/common/arg.cpp
+@@ -1923,14 +1923,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
+         }
+     ).set_sampling().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
+     add_opt(common_arg(
+-        {"--pooling"}, "{none,mean,cls,last,rank}",
++        {"--pooling"}, "{none,mean,cls,last,rank,token-cls}",
+         "pooling type for embeddings, use model default if unspecified",
+         [](common_params & params, const std::string & value) {
+-            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+-            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+-            else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS;  }
+-            else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
+-            else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
++            /**/ if (value == "none")      { params.pooling_type = LLAMA_POOLING_TYPE_NONE;      }
++            else if (value == "mean")      { params.pooling_type = LLAMA_POOLING_TYPE_MEAN;      }
++            else if (value == "cls")       { params.pooling_type = LLAMA_POOLING_TYPE_CLS;       }
++            else if (value == "last")      { params.pooling_type = LLAMA_POOLING_TYPE_LAST;      }
++            else if (value == "rank")      { params.pooling_type = LLAMA_POOLING_TYPE_RANK;      }
++            else if (value == "token-cls") { params.pooling_type = LLAMA_POOLING_TYPE_TOKEN_CLS; }
+             else { throw std::invalid_argument("invalid value"); }
+         }
+     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
+diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
+index f6a20ef9d..4281da592 100644
+--- a/examples/embedding/embedding.cpp
++++ b/examples/embedding/embedding.cpp
+@@ -54,7 +54,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
+         const float * embd = nullptr;
+         int embd_pos = 0;
+ 
+-        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
++        if (pooling_type == LLAMA_POOLING_TYPE_NONE || pooling_type == LLAMA_POOLING_TYPE_TOKEN_CLS) {
+             // try to get token embeddings
+             embd = llama_get_embeddings_ith(ctx, i);
+             embd_pos = i;
+@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
+ 
+     // count number of embeddings
+     int n_embd_count = 0;
+-    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
++    if (pooling_type == LLAMA_POOLING_TYPE_NONE || pooling_type == LLAMA_POOLING_TYPE_TOKEN_CLS) {
+         for (int k = 0; k < n_prompts; k++) {
+             n_embd_count += inputs[k].size();
+         }
+@@ -272,7 +272,7 @@ int main(int argc, char ** argv) {
+         if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
+             float * out = emb + e * n_embd_out;
+             batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
+-            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
++            e += (pooling_type == LLAMA_POOLING_TYPE_NONE || pooling_type == LLAMA_POOLING_TYPE_TOKEN_CLS) ? batch.n_tokens : s;
+             s = 0;
+             common_batch_clear(batch);
+         }
+@@ -289,7 +289,7 @@ int main(int argc, char ** argv) {
+     if (params.embd_out.empty()) {
+         LOG("\n");
+ 
+-        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
++        if (pooling_type == LLAMA_POOLING_TYPE_NONE || pooling_type == LLAMA_POOLING_TYPE_TOKEN_CLS) {
+             for (int j = 0; j < n_embd_count; j++) {
+                 LOG("embedding %d: ", j);
+                 for (int i = 0; i < std::min(3, n_embd_out); i++) {
+diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
+index 5a567e2d1..d2763dfed 100644
+--- a/gguf-py/gguf/constants.py
++++ b/gguf-py/gguf/constants.py
+@@ -4172,6 +4172,7 @@ class PoolingType(IntEnum):
+     CLS  = 2
+     LAST = 3
+     RANK = 4
++    TOKEN_CLS = 5
+ 
+ 
+ class GGMLQuantizationType(IntEnum):
+diff --git a/include/llama.h b/include/llama.h
+index e8374c53b..d22f3c816 100644
+--- a/include/llama.h
++++ b/include/llama.h
+@@ -175,6 +175,7 @@ extern "C" {
+         LLAMA_POOLING_TYPE_CLS  = 2,
+         LLAMA_POOLING_TYPE_LAST = 3,
+         LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
++        LLAMA_POOLING_TYPE_TOKEN_CLS = 5, // used by token classification models to attach the classification head to each token
+     };
+ 
+     enum llama_attention_type {
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index ad36c0666..769cc620c 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -1420,6 +1420,17 @@ int llama_context::encode(const llama_batch & batch_inp) {
+                     GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd.size);
+                     ggml_backend_tensor_get_async(backend_embd, t_embd, embd.data, 0, n_tokens*n_embd_out*sizeof(float));
+                 } break;
++            case LLAMA_POOLING_TYPE_TOKEN_CLS:
++                {
++                    // extract token classification outputs
++                    GGML_ASSERT(embd.data != nullptr);
++                    GGML_ASSERT(hparams.n_cls_out > 0);
++                    GGML_ASSERT(hparams.n_embd_out() == hparams.n_cls_out);
++
++                    const uint32_t n_cls_out = hparams.n_cls_out;
++                    GGML_ASSERT(n_tokens*n_cls_out <= (int64_t) embd.size);
++                    ggml_backend_tensor_get_async(backend_embd, t_embd, embd.data, 0, n_tokens*n_cls_out*sizeof(float));
++                } break;
+             case LLAMA_POOLING_TYPE_MEAN:
+             case LLAMA_POOLING_TYPE_CLS:
+             case LLAMA_POOLING_TYPE_LAST:
+@@ -1864,6 +1875,22 @@ int llama_context::decode(const llama_batch & batch_inp) {
+                             ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd_out*sizeof(float));
+                         }
+                     } break;
++                case LLAMA_POOLING_TYPE_TOKEN_CLS:
++                    {
++                        // extract token classification outputs
++                        GGML_ASSERT(embd.data != nullptr);
++                        GGML_ASSERT(hparams.n_cls_out > 0);
++                        GGML_ASSERT(hparams.n_embd_out() == hparams.n_cls_out);
++
++                        const uint32_t n_cls_out = hparams.n_cls_out;
++                        float * embd_out = embd.data + n_outputs_prev*n_cls_out;
++
++                        if (n_outputs) {
++                            GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
++                            GGML_ASSERT((n_outputs_prev + n_outputs)*n_cls_out <= (int64_t) embd.size);
++                            ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_cls_out*sizeof(float));
++                        }
++                    } break;
+                 case LLAMA_POOLING_TYPE_MEAN:
+                 case LLAMA_POOLING_TYPE_CLS:
+                 case LLAMA_POOLING_TYPE_LAST:
+diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
+index e6ec3054d..afa2eb665 100644
+--- a/src/llama-graph.cpp
++++ b/src/llama-graph.cpp
+@@ -2939,6 +2939,17 @@ void llm_graph_context::build_pooling(
+             {
+                 cur = inp;
+             } break;
++        case LLAMA_POOLING_TYPE_TOKEN_CLS:
++            {
++                cur = inp;
++
++                if (cls_out) {
++                    cur = ggml_mul_mat(ctx0, cls_out, cur);
++                    if (cls_out_b) {
++                        cur = ggml_add(ctx0, cur, cls_out_b);
++                    }
++                }
++            } break;
+         case LLAMA_POOLING_TYPE_MEAN:
+             {
+                 ggml_tensor * inp_mean = build_inp_mean();
diff --git a/backend/cpp/llama-cpp/patches/0002-arch-openai-privacy-filter.patch b/backend/cpp/llama-cpp/patches/0002-arch-openai-privacy-filter.patch
new file mode 100644
index 000000000000..a0a7aae47085
--- /dev/null
+++ b/backend/cpp/llama-cpp/patches/0002-arch-openai-privacy-filter.patch
@@ -0,0 +1,84 @@
+diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
+index 5a567e2d1..59f69ce5e 100644
+--- a/gguf-py/gguf/constants.py
++++ b/gguf-py/gguf/constants.py
+@@ -485,6 +485,7 @@ class MODEL_ARCH(IntEnum):
+     HUNYUAN_VL       = auto()
+     SMOLLM3          = auto()
+     GPT_OSS          = auto()
++    OPENAI_PRIVACY_FILTER = auto() # tracks upstream model_type "openai_privacy_filter"; shares the gpt-oss MoE body but is NOT a gpt-oss variant (interleaved/NORM rope, bidirectional, no LM head)
+     LFM2             = auto()
+     LFM2MOE          = auto()
+     DREAM            = auto()
+@@ -1005,6 +1006,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
+     MODEL_ARCH.HUNYUAN_VL:       "hunyuan_vl",
+     MODEL_ARCH.SMOLLM3:          "smollm3",
+     MODEL_ARCH.GPT_OSS:          "gpt-oss",
++    MODEL_ARCH.OPENAI_PRIVACY_FILTER: "openai-privacy-filter",
+     MODEL_ARCH.LFM2:             "lfm2",
+     MODEL_ARCH.LFM2MOE:          "lfm2moe",
+     MODEL_ARCH.DREAM:            "dream",
+@@ -3702,6 +3704,27 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
+         MODEL_TENSOR.FFN_DOWN_EXP,
+         MODEL_TENSOR.FFN_UP_EXP,
+     ],
++    MODEL_ARCH.OPENAI_PRIVACY_FILTER: [
++        # gpt-oss tensor set, minus the LM head (OUTPUT), plus the
++        # token-classification head (CLS_OUT -> "cls.output"). The encoder
++        # graph ends at output_norm and the TOKEN_CLS pooling attaches the
++        # score head per token (see patch 0001).
++        MODEL_TENSOR.TOKEN_EMBD,
++        MODEL_TENSOR.OUTPUT_NORM,
++        MODEL_TENSOR.ATTN_NORM,
++        MODEL_TENSOR.ATTN_POST_NORM,
++        MODEL_TENSOR.ATTN_Q,
++        MODEL_TENSOR.ATTN_K,
++        MODEL_TENSOR.ATTN_V,
++        MODEL_TENSOR.ATTN_OUT,
++        MODEL_TENSOR.ATTN_SINKS,
++        MODEL_TENSOR.ROPE_FREQS,
++        MODEL_TENSOR.FFN_GATE_INP,
++        MODEL_TENSOR.FFN_GATE_EXP,
++        MODEL_TENSOR.FFN_DOWN_EXP,
++        MODEL_TENSOR.FFN_UP_EXP,
++        MODEL_TENSOR.CLS_OUT,
++    ],
+     MODEL_ARCH.LFM2: [
+         MODEL_TENSOR.TOKEN_EMBD,
+         MODEL_TENSOR.TOKEN_EMBD_NORM,
+diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
+index 444f0f285..d3e5d1d89 100644
+--- a/gguf-py/gguf/tensor_mapping.py
++++ b/gguf-py/gguf/tensor_mapping.py
+@@ -1287,6 +1287,7 @@ class TensorNameMap:
+ 
+         MODEL_TENSOR.CLS_OUT: (
+             "classifier.out_proj", # roberta
++            "score",               # openai-privacy-filter (token-classification head)
+         ),
+ 
+         MODEL_TENSOR.CLS_NORM: (
+diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
+index b485ac02e..aaf166680 100644
+--- a/src/llama-arch.cpp
++++ b/src/llama-arch.cpp
+@@ -135,6 +135,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+     { LLM_ARCH_MAINCODER,        "maincoder"        },
+     { LLM_ARCH_KIMI_LINEAR,      "kimi-linear"      },
+     { LLM_ARCH_TALKIE,           "talkie"           },
++    { LLM_ARCH_OPENAI_PRIVACY_FILTER, "openai-privacy-filter" },
+     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
+ };
+ 
+diff --git a/src/llama-arch.h b/src/llama-arch.h
+index b59043e40..edd6b2ad6 100644
+--- a/src/llama-arch.h
++++ b/src/llama-arch.h
+@@ -139,6 +139,7 @@ enum llm_arch {
+     LLM_ARCH_MAINCODER,
+     LLM_ARCH_KIMI_LINEAR,
+     LLM_ARCH_TALKIE,
++    LLM_ARCH_OPENAI_PRIVACY_FILTER,
+     LLM_ARCH_UNKNOWN,
+ };
+ 
diff --git a/backend/cpp/llama-cpp/patches/0003-convert-openai-privacy-filter.patch b/backend/cpp/llama-cpp/patches/0003-convert-openai-privacy-filter.patch
new file mode 100644
index 000000000000..eeb72ad2f138
--- /dev/null
+++ b/backend/cpp/llama-cpp/patches/0003-convert-openai-privacy-filter.patch
@@ -0,0 +1,178 @@
+diff --git a/conversion/__init__.py b/conversion/__init__.py
+index 222005740..ab54e15a6 100644
+--- a/conversion/__init__.py
++++ b/conversion/__init__.py
+@@ -87,6 +87,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
+     "GlmMoeDsaForCausalLM": "glm",
+     "GlmOcrForConditionalGeneration": "glm",
+     "GptOssForCausalLM": "gpt_oss",
++    "OpenAIPrivacyFilterForTokenClassification": "openai_privacy_filter",
+     "GraniteForCausalLM": "granite",
+     "GraniteMoeForCausalLM": "granite",
+     "GraniteMoeHybridForCausalLM": "granite",
+diff --git a/conversion/openai_privacy_filter.py b/conversion/openai_privacy_filter.py
+new file mode 100644
+index 000000000..c6e4cf2e3
+--- /dev/null
++++ b/conversion/openai_privacy_filter.py
+@@ -0,0 +1,160 @@
++from __future__ import annotations
++
++from typing import Iterable, TYPE_CHECKING
++
++if TYPE_CHECKING:
++    from torch import Tensor
++
++from .base import ModelBase, gguf
++from .gpt_oss import GptOssModel
++
++
++@ModelBase.register("OpenAIPrivacyFilterForTokenClassification")
++class OpenAIPrivacyFilterModel(GptOssModel):
++    # openai/privacy-filter + OpenMed/privacy-filter-multilingual: a gpt-oss MoE
++    # body (8 layers, 14/2 heads, head_dim 64, d_model 640, 128 experts top-4,
++    # o200k vocab, attn sinks, YaRN) re-purposed as a *bidirectional token
++    # classifier*. config.model_type == "openai_privacy_filter".
++    #
++    # We subclass the gpt-oss converter to reuse its vocab and tensor handling,
++    # and override only what differs:
++    #   1. expert gate_up split: CONCATENATED halves, not gpt-oss interleaving;
++    #   2. a token-classification head (score -> cls.output) + TOKEN_CLS pooling;
++    #   3. no LM head (the arch's MODEL_TENSORS omits OUTPUT).
++    # The body is bf16 dense (not MXFP4), so the gpt-oss dense paths are the ones
++    # that run; the MXFP4 repack code in the base is never exercised.
++    model_arch = gguf.MODEL_ARCH.OPENAI_PRIVACY_FILTER
++
++    def set_gguf_parameters(self):
++        # HF renamed rope_scaling -> rope_parameters for this arch. Alias it
++        # before super() so the base TextModel YaRN handling (which keys off
++        # "rope_scaling") still writes the rope KVs. The dict keys match what the
++        # base expects (rope_type=yarn, factor, beta_fast/slow,
++        # original_max_position_embeddings). Verify the rope KVs landed with
++        # gguf_dump as part of parity (patch 0004 / Task 5).
++        if "rope_scaling" not in self.hparams and "rope_parameters" in self.hparams:
++            self.hparams["rope_scaling"] = self.hparams["rope_parameters"]
++
++        # GptOssModel.set_gguf_parameters writes base text params +
++        # sliding_window + expert_feed_forward_length (= intermediate_size).
++        super().set_gguf_parameters()
++
++        # Token-classification head. PoolingType.TOKEN_CLS == 5 (patch 0001).
++        # The loader derives n_cls_out from the label count; n_embd_out must
++        # equal it (llama-context asserts n_embd_out() == n_cls_out under
++        # TOKEN_CLS), so we write both from the same ordered label list.
++        labels = self._ordered_labels()
++        self.gguf_writer.add_pooling_type(gguf.PoolingType.TOKEN_CLS)
++        self.gguf_writer.add_classifier_output_labels(labels)
++        self.gguf_writer.add_embedding_length_out(len(labels))
++
++    def generate_extra_tensors(self) -> Iterable[tuple[str, "Tensor"]]:
++        # Emit the gpt-oss base's extra tensors (MXFP4 repack; a no-op here since
++        # privacy-filter is dense bf16), then our per-dim RoPE frequency factors.
++        yield from super().generate_extra_tensors()
++
++        # YaRN with truncate=False. The model's rope_parameters set
++        # truncate=False, but ggml's rope_yarn corr_dims unconditionally
++        # floor()/ceil() the interpolation ramp boundaries. That rounding shifts
++        # the ramp in the transition band (here dims ~20-34), giving a per-dim
++        # frequency error up to ~21% that mis-rotates Q/K, softens attention
++        # (worse at higher positions), and attenuates the final logits. Instead
++        # of changing ggml's shared YaRN (which would perturb every other YaRN
++        # model), we bake the *exact* HF inv_freq into per-dim rope_freqs
++        # (freq_factors); the loader disables ggml's YaRN ramp for this arch and
++        # keeps only the YaRN attention mscale (see
++        # src/models/openai-privacy-filter.cpp and patches/README.md).
++        import math
++        import torch
++
++        rope = self.hparams.get("rope_parameters") or self.hparams.get("rope_scaling") or {}
++        if str(rope.get("rope_type", "")).lower() != "yarn":
++            return
++
++        dim       = int(self.hparams["head_dim"])
++        base      = float(rope.get("rope_theta", self.hparams.get("rope_theta", 10000.0)))
++        factor    = float(rope["factor"])
++        orig      = float(rope["original_max_position_embeddings"])
++        beta_fast = float(rope.get("beta_fast", 32.0))
++        beta_slow = float(rope.get("beta_slow", 1.0))
++        truncate  = bool(rope.get("truncate", True))
++
++        # HF transformers _compute_yarn_parameters (modeling_rope_utils).
++        def correction_dim(num_rotations: float) -> float:
++            return (dim * math.log(orig / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
++
++        low, high = correction_dim(beta_fast), correction_dim(beta_slow)
++        if truncate:
++            low, high = math.floor(low), math.ceil(high)
++        low, high = max(low, 0.0), min(high, dim - 1)
++        if low == high:
++            high += 0.001
++
++        half      = dim // 2
++        pos_freqs = base ** (torch.arange(0, dim, 2, dtype=torch.float64) / dim)
++        extrap    = 1.0 / pos_freqs              # high-frequency dims: no scaling
++        interp    = 1.0 / (factor * pos_freqs)   # low-frequency dims: divide by factor
++        ramp      = torch.clamp((torch.arange(half, dtype=torch.float64) - low) / (high - low), 0.0, 1.0)
++        extrap_factor = 1.0 - ramp
++        inv_freq  = interp * (1.0 - extrap_factor) + extrap * extrap_factor
++
++        # ggml divides theta_base (= pos * extrap) by the freq factor, so the
++        # per-dim factor that reproduces inv_freq is extrap / inv_freq (1..factor).
++        freq_factors = (extrap / inv_freq).to(torch.float32)
++        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), freq_factors)
++
++    def _ordered_labels(self) -> list[str]:
++        # id2label is {"0": "O", "1": "B-ACCOUNTNAME", ...}; emit in index order
++        # so the GGUF label table row i lines up with score-head output row i.
++        # Keys arrive as int (config parsing coerces them) or str (raw JSON),
++        # so normalize to int before ordering.
++        # 217 labels (multilingual) / 33 (base english), BIOES-encoded.
++        id2label = {int(k): v for k, v in self.hparams["id2label"].items()}
++        return [id2label[i] for i in range(len(id2label))]
++
++    def modify_tensors(self, data_torch: "Tensor", name: str, bid: int | None) -> Iterable[tuple[str, "Tensor"]]:
++        # Experts: privacy-filter packs the fused gate_up projection as two
++        # CONCATENATED halves (gate = first intermediate_size columns, up = the
++        # rest) rather than gpt-oss's INTERLEAVED even/odd (::2 / 1::2). This is
++        # the single load-bearing divergence from the base converter: an
++        # interleaved split here would silently produce a numerically wrong
++        # model that still loads and runs.
++        #
++        # If per-layer parity vs the HF reference (Task 5) fails at the first
++        # expert FFN, the fix is to revert the two slicings below to the gpt-oss
++        # interleaving (data_torch[:, ::2, :] / [:, 1::2, :] and
++        # [..., ::2] / [..., 1::2]).
++        #
++        # privacy-filter is dense bf16, so we only handle the non-MXFP4 case
++        # (no _blocks/_scales tensors exist).
++        if "gate_up_proj" in name and "_blocks" not in name and "_scales" not in name:
++            inter = self.hparams["intermediate_size"]  # 640
++            if name.endswith("_bias"):
++                gate_b, up_b = data_torch[..., :inter], data_torch[..., inter:]
++                name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias")
++                name_up   = name.replace("gate_up_proj_bias", "up_proj.bias")
++                # bypass GptOssModel.modify_tensors (interleaved) -> TextModel
++                yield from super(GptOssModel, self).modify_tensors(gate_b, name_gate, bid)
++                yield from super(GptOssModel, self).modify_tensors(up_b,   name_up,   bid)
++                return
++            # weight: HF stores [E, in, 2*inter]; transpose to [E, 2*inter, in]
++            # then split the output dim into the two contiguous halves.
++            data_torch = data_torch.transpose(-1, -2)
++            gate_w, up_w = data_torch[:, :inter, :], data_torch[:, inter:, :]
++            name_gate = name.replace("gate_up_proj", "gate_proj.weight")
++            name_up   = name.replace("gate_up_proj", "up_proj.weight")
++            yield from super(GptOssModel, self).modify_tensors(gate_w, name_gate, bid)
++            yield from super(GptOssModel, self).modify_tensors(up_w,   name_up,   bid)
++            return
++
++        # Everything else converts correctly via the base:
++        #  - down_proj (dense): GptOssModel.modify_tensors does the rename +
++        #    transpose;
++        #  - q/k/v/o (+biases), attn sinks, router (+bias), norms, embeddings:
++        #    GptOssModel / its filter_tensors handle these;
++        #  - score.{weight,bias}: fall through to TextModel.map_tensor_name,
++        #    which maps "score" -> cls.output via the tensor_mapping.py entry
++        #    added in patch 0002.
++        # We never emit an LM head: tie_word_embeddings is false and the arch's
++        # MODEL_TENSORS list omits MODEL_TENSOR.OUTPUT, so nothing expects one.
++        yield from super().modify_tensors(data_torch, name, bid)
diff --git a/backend/cpp/llama-cpp/patches/0004-graph-openai-privacy-filter.patch b/backend/cpp/llama-cpp/patches/0004-graph-openai-privacy-filter.patch
new file mode 100644
index 000000000000..f9de3578e86a
--- /dev/null
+++ b/backend/cpp/llama-cpp/patches/0004-graph-openai-privacy-filter.patch
@@ -0,0 +1,285 @@
+diff --git a/src/llama-model.cpp b/src/llama-model.cpp
+index 3e236f8c1..465641b63 100644
+--- a/src/llama-model.cpp
++++ b/src/llama-model.cpp
+@@ -257,6 +257,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
+             return new llama_model_smollm3(params);
+         case LLM_ARCH_OPENAI_MOE:
+             return new llama_model_openai_moe(params);
++        case LLM_ARCH_OPENAI_PRIVACY_FILTER:
++            return new llama_model_openai_privacy_filter(params);
+         case LLM_ARCH_FALCON_H1:
+             return new llama_model_falcon_h1(params);
+         case LLM_ARCH_LFM2:
+@@ -1794,7 +1796,7 @@ void llama_model::print_info() const {
+             LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
+         }
+ 
+-        if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
++        if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1 || arch == LLM_ARCH_OPENAI_PRIVACY_FILTER) {
+             LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
+         }
+ 
+@@ -2315,6 +2317,10 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+         case LLM_ARCH_LLAMA_EMBED:
+         case LLM_ARCH_MAINCODER:
+         case LLM_ARCH_GLM_DSA:
++        // openai-privacy-filter uses the interleaved (GPT-J) rope layout
++        // (_apply_rotary_emb pairs x[..., ::2]/x[..., 1::2]), unlike gpt-oss
++        // (OPENAI_MOE) which uses NEOX rotate-half. See patches/README.md.
++        case LLM_ARCH_OPENAI_PRIVACY_FILTER:
+             return LLAMA_ROPE_TYPE_NORM;
+ 
+         // the pairs of head values are offset by n_rot/2
+diff --git a/src/models/models.h b/src/models/models.h
+index 5251e2d82..ab78f4bdc 100644
+--- a/src/models/models.h
++++ b/src/models/models.h
+@@ -1591,6 +1591,22 @@ struct llama_model_openai_moe : public llama_model_base {
+ };
+ 
+ 
++// openai/privacy-filter token classifier: gpt-oss MoE body re-purposed as a
++// bidirectional NER encoder with a per-token classification head (see
++// src/models/openai-privacy-filter.cpp).
++struct llama_model_openai_privacy_filter : public llama_model_base {
++    llama_model_openai_privacy_filter(const struct llama_model_params & params) : llama_model_base(params) {}
++    void load_arch_hparams(llama_model_loader & ml) override;
++    void load_arch_tensors(llama_model_loader & ml) override;
++
++    struct graph : public llm_graph_context {
++        graph(const llama_model & model, const llm_graph_params & params);
++    };
++
++    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
++};
++
++
+ struct llama_model_falcon_h1 : public llama_model_base {
+     llama_model_falcon_h1(const struct llama_model_params & params) : llama_model_base(params) {}
+     void load_arch_hparams(llama_model_loader & ml) override;
+diff --git a/src/models/openai-privacy-filter.cpp b/src/models/openai-privacy-filter.cpp
+new file mode 100644
+index 000000000..9e40391db
+--- /dev/null
++++ b/src/models/openai-privacy-filter.cpp
+@@ -0,0 +1,219 @@
++#include "models.h"
++
++// openai/privacy-filter + OpenMed/privacy-filter-multilingual.
++//
++// A gpt-oss MoE *body* (8 layers, 14/2 heads, head_dim 64, d_model 640,
++// 128 experts top-4, o200k vocab, attention sinks, YaRN) re-purposed as a
++// BIDIRECTIONAL token classifier. The body is identical to llama_model_openai_moe;
++// the differences are all "this is an encoder with a per-token classification
++// head", not "the transformer block is different":
++//
++//   1. non-causal attention over a SYMMETRIC sliding-window band, no KV cache
++//      (build_attn_inp_no_cache instead of the kv_iswa input);
++//   2. every layer is windowed (uniform band) — no alternating dense layers;
++//   3. NO build_inp_out_ids() pruning: a token classifier needs a logit for
++//      every token, so we must not collapse to the last token;
++//   4. NO LM head. The graph stops at the per-token hidden states (res->t_embd);
++//      the framework then calls build_pooling(), which under pooling_type ==
++//      TOKEN_CLS applies model.cls_out (+cls_out_b) to each token to produce
++//      [n_cls_out, n_tokens] logits (carry-patch 0001).
++
++void llama_model_openai_privacy_filter::load_arch_hparams(llama_model_loader & ml) {
++    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
++    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
++    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
++
++    // Bidirectional encoder.
++    hparams.causal_attn = false;
++
++    // The HF sliding_window is the window half-width (a token attends to
++    // ±sliding_window). LLAMA_SWA_TYPE_SYMMETRIC masks |p1 - p0| > n_swa/2,
++    // so n_swa = 2 * sliding_window reproduces that ±window.
++    hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
++    hparams.n_swa    = 2 * hparams.n_swa;
++
++    // Uniform band: every layer is windowed (unlike gpt-oss's alternating
++    // dense/SWA pattern). set_swa_pattern(0) marks all layers SWA, so the
++    // no-cache attn path uses the symmetric mask on every layer.
++    hparams.set_swa_pattern(0);
++
++    // RoPE: the model uses YaRN with truncate=false, but ggml's rope_yarn
++    // floor()/ceil()s the interpolation-ramp boundaries, which mis-rotates Q/K
++    // in the transition band (softened attention, attenuated logits). We bake
++    // the exact HF inv_freq into per-dim rope_freqs (freq_factors) in the
++    // converter, and here disable ggml's YaRN ramp while keeping only the YaRN
++    // attention mscale. With ramp off, freq_factors fully define the per-dim
++    // frequencies; the kernel divides theta by them regardless of ext_factor.
++    const float yarn_factor = hparams.rope_freq_scale_train > 0.0f
++                                ? 1.0f / hparams.rope_freq_scale_train : 1.0f;
++    hparams.rope_attn_factor        = 1.0f + 0.1f * logf(yarn_factor);  // YaRN mscale (get_mscale, mscale=1)
++    hparams.rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;     // no ramp; rope_freqs carry the scaling
++
++    // Every layer is SWA, so the graph reads the *_swa rope params. Base stays
++    // at the trained value; scale is 1.0 (all per-dim scaling lives in
++    // rope_freqs now, not in a global freq_scale).
++    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
++    hparams.rope_freq_scale_train_swa = 1.0f;
++
++    // No dedicated size label — the privacy-filter configs are tiny
++    // (8 layers) and don't match any LLM_TYPE bucket.
++    type = LLM_TYPE_UNKNOWN;
++}
++
++void llama_model_openai_privacy_filter::load_arch_tensors(llama_model_loader &) {
++    LLAMA_LOAD_LOCALS;
++
++    const int64_t n_ff_exp = hparams.n_ff_exp;
++
++    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
++
++    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
++
++    // Token-classification head (no LM head). The converter maps the HF
++    // `score.{weight,bias}` to `cls.output.*` (carry-patch 0002), and the
++    // arch's MODEL_TENSORS omits OUTPUT. n_cls_out is derived from the
++    // classifier label table during the generic hparams load.
++    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, 0);
++    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out}, 0);
++
++    for (int i = 0; i < n_layer; ++i) {
++        auto & layer = layers[i];
++
++        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
++        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
++
++        create_tensor_qkv(layer, i, n_embd, n_head * n_rot, n_head_kv * n_rot, n_head_kv * n_rot, 0);
++        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
++
++        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
++
++        // Per-dim RoPE frequency factors (single shared "rope_freqs.weight";
++        // ROPE_FREQS has no per-layer name, so every layer resolves the same
++        // tensor). Carries the exact HF YaRN inv_freq; see load_arch_hparams.
++        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
++
++        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {  n_embd, n_expert}, 0);
++        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
++        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
++        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
++
++        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
++
++        layer.ffn_gate_inp_b  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "bias", i), {n_expert}, 0);
++        layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
++        layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {  n_embd, n_expert}, 0);
++        layer.ffn_up_exps_b   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "bias", i), {n_ff_exp, n_expert}, 0);
++    }
++}
++
++std::unique_ptr<llm_graph_context> llama_model_openai_privacy_filter::build_arch_graph(const llm_graph_params & params) const {
++    return std::make_unique<graph>(*this, params);
++}
++
++llama_model_openai_privacy_filter::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
++    ggml_tensor * cur;
++    ggml_tensor * inpL;
++
++    inpL = build_inp_embd(model.tok_embd);
++
++    // inp_pos - contains the positions (used by RoPE)
++    ggml_tensor * inp_pos = build_inp_pos();
++
++    // Bidirectional encoder: no KV cache, non-causal. The symmetric
++    // sliding-window band is applied via the no-cache SWA mask, which
++    // build_attn_inp_no_cache() allocates because swa_type != NONE.
++    auto * inp_attn = build_attn_inp_no_cache();
++
++    // NOTE: deliberately no build_inp_out_ids() / ggml_get_rows() pruning —
++    // a token classifier produces a logit per token, so every position must
++    // survive to the classification head.
++
++    for (int il = 0; il < n_layer; ++il) {
++        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
++        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
++
++        ggml_tensor * inpSA = inpL;
++
++        // Per-dim YaRN frequency factors (see load_arch_hparams): bypasses
++        // ggml's truncate=true ramp and reproduces the HF inv_freq exactly.
++        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
++
++        // norm
++        cur = build_norm(inpL,
++                model.layers[il].attn_norm, nullptr,
++                LLM_NORM_RMS, il);
++        cb(cur, "attn_norm", il);
++
++        // self-attention
++        {
++            auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
++                    n_rot, n_head, n_head_kv, il);
++
++            Qcur = ggml_rope_ext(
++                    ctx0, Qcur, inp_pos, rope_factors,
++                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
++                    ext_factor, attn_factor, beta_fast, beta_slow
++                    );
++
++            Kcur = ggml_rope_ext(
++                    ctx0, Kcur, inp_pos, rope_factors,
++                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
++                    ext_factor, attn_factor, beta_fast, beta_slow
++                    );
++
++            cb(Qcur, "Qcur", il);
++            cb(Kcur, "Kcur", il);
++            cb(Vcur, "Vcur", il);
++
++            cur = build_attn(inp_attn,
++                    model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
++                    Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
++
++            cb(cur, "attn_out", il);
++        }
++
++        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
++        cb(ffn_inp, "ffn_inp", il);
++
++        cur = build_norm(ffn_inp,
++                model.layers[il].attn_post_norm, nullptr,
++                LLM_NORM_RMS, il);
++        cb(cur, "attn_post_norm", il);
++
++        // MoE branch
++        cur = build_moe_ffn(cur,
++                model.layers[il].ffn_gate_inp,  model.layers[il].ffn_gate_inp_b,
++                model.layers[il].ffn_up_exps,   model.layers[il].ffn_up_exps_b,
++                model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
++                model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
++                nullptr,
++                n_expert, n_expert_used,
++                LLM_FFN_SWIGLU_OAI_MOE, false,
++                hparams.expert_weights_scale,
++                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
++                il);
++        cb(cur, "ffn_moe_out", il);
++
++        cur = ggml_add(ctx0, cur, ffn_inp);
++
++        cur = build_cvec(cur, il);
++        cb(cur, "l_out", il);
++
++        // input for next layer
++        inpL = cur;
++    }
++    cur = inpL;
++
++    cur = build_norm(cur,
++            model.output_norm, NULL,
++            LLM_NORM_RMS, -1);
++
++    cb(cur, "result_norm", -1);
++
++    // Stop at the per-token hidden states. The framework calls
++    // build_pooling() next; under pooling_type == TOKEN_CLS it applies
++    // model.cls_out (+cls_out_b) to every token (carry-patch 0001).
++    res->t_embd = cur;
++
++    ggml_build_forward_expand(gf, cur);
++}
diff --git a/backend/cpp/llama-cpp/patches/0005-no-cache-all-swa-mask-fix.patch b/backend/cpp/llama-cpp/patches/0005-no-cache-all-swa-mask-fix.patch
new file mode 100644
index 000000000000..b712b3c127eb
--- /dev/null
+++ b/backend/cpp/llama-cpp/patches/0005-no-cache-all-swa-mask-fix.patch
@@ -0,0 +1,34 @@
+--- a/src/llama-graph.cpp
++++ b/src/llama-graph.cpp
+@@ -463,16 +463,24 @@
+         }
+     };
+ 
++    // The non-SWA (full) and SWA masks are separate graph inputs, but a
++    // given model may consume only one of them: an encoder where *every*
++    // layer is SWA (e.g. the openai-privacy-filter token classifier, whose
++    // every layer uses a symmetric sliding window) never references the full
++    // mask, so the graph allocator leaves self_kq_mask unallocated (null
++    // buffer). Only fill a mask that actually got a buffer — filling an
++    // unallocated input would write through a null data pointer.
+     GGML_ASSERT(self_kq_mask);
+-    GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+-    if (self_kq_mask->type == GGML_TYPE_F16) {
+-        fill_mask((ggml_fp16_t *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
+-    } else {
+-        fill_mask((float       *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
++    if (self_kq_mask->buffer) {
++        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
++        if (self_kq_mask->type == GGML_TYPE_F16) {
++            fill_mask((ggml_fp16_t *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
++        } else {
++            fill_mask((float       *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
++        }
+     }
+ 
+-    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+-        GGML_ASSERT(self_kq_mask_swa);
++    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE && self_kq_mask_swa && self_kq_mask_swa->buffer) {
+         GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
+         if (self_kq_mask_swa->type == GGML_TYPE_F16) {
+             fill_mask((ggml_fp16_t *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type);
diff --git a/backend/cpp/llama-cpp/patches/README.md b/backend/cpp/llama-cpp/patches/README.md
new file mode 100644
index 000000000000..cc60f26aa3c1
--- /dev/null
+++ b/backend/cpp/llama-cpp/patches/README.md
@@ -0,0 +1,183 @@
+# llama.cpp carry-patches
+
+`prepare.sh` applies every file in this directory to the freshly-cloned
+`llama.cpp/` tree with `patch -p1`, in lexical order, before the grpc-server
+sources are copied in. Keep patches small, ordered, and documented here.
+
+## 0001-token-cls-pooling-substrate.patch
+
+**What:** adds a per-token classification pooling path to llama.cpp:
+`LLAMA_POOLING_TYPE_TOKEN_CLS` (= 5). Under this pooling type `build_pooling`
+applies the model's `cls_out` (+`cls_out_b`) head to **every** token instead of
+to a single pooled vector, and `llama_context::{encode,decode}` copy the
+resulting `[n_cls_out, n_tokens]` logits into the embeddings buffer
+(`llama_get_embeddings_ith(i)` then returns the `n_cls_out` logits for token
+`i`). The `--pooling token-cls` CLI flag and the `llama-embedding` example are
+taught to treat it as token-level (like `none`).
+
+This is the substrate the `openai-privacy-filter` token-classifier arch needs
+(patches 0002/0003): the encoder graph ends at `result_norm` and lets the
+framework attach the score head per token.
+
+**Provenance:** a reduced subset of upstream PR
+[ggml-org/llama.cpp#19725](https://github.com/ggml-org/llama.cpp/pull/19725)
+("llama: add BertForTokenClassification support"). We carry **only** the
+pooling-mechanism hunks (`include/llama.h`, `src/llama-graph.cpp`,
+`src/llama-context.cpp`, `common/arg.cpp`, `examples/embedding/embedding.cpp`,
+`gguf-py/gguf/constants.py`). We deliberately drop the PR's BERT/WPM-specific
+parts (the `convert_hf_to_gguf.py` BertModel changes — our converter is its own
+`conversion/openai_privacy_filter.py`; and the WPM `do_lower_case` tokenizer
+plumbing — privacy-filter uses o200k BPE, not WordPiece). The prerequisites the
+substrate assumes (`gguf_writer.add_embedding_length_out` /
+`add_classifier_output_labels`, `hparams.n_cls_out` / `n_embd_out()`,
+`model.cls_out` / `cls_out_b`) already exist in the pinned tree.
+
+**Re-sync:** PR #19725 is still OPEN; if it changes under review, re-diff.
+If/when we upstream the `openai-privacy-filter` arch we will depend on TOKEN_CLS
+having landed (or keep carrying this).
+
+**Version note:** authored against `d6588daa8`; re-verified (line-offset
+only) against the current pin `5dcb71166`. See the consolidated version
+note at the bottom of this file.
+
+## 0002-arch-openai-privacy-filter.patch
+
+**What:** registers the `openai-privacy-filter` architecture (matching the
+model's `config.model_type == "openai_privacy_filter"`):
+- `src/llama-arch.h` / `.cpp`: `LLM_ARCH_OPENAI_PRIVACY_FILTER` + name string.
+  No per-arch tensor-name table is needed — this llama.cpp uses a single global
+  `LLM_TENSOR_NAMES` map, and every tensor we use (incl. `cls.output`,
+  `attn_sinks`) is already in it.
+- `gguf-py/gguf/constants.py`: `MODEL_ARCH.OPENAI_PRIVACY_FILTER`, its name, and
+  a `MODEL_TENSORS` list = the gpt-oss set **minus `OUTPUT`** (no LM head)
+  **plus `CLS_OUT`** (the score head).
+- `gguf-py/gguf/tensor_mapping.py`: maps HF `score` → `MODEL_TENSOR.CLS_OUT`, so
+  `score.{weight,bias}` convert to `cls.output.{weight,bias}`.
+
+The loader/graph for the arch (`llama-model.cpp`, `src/models/…`) come in 0003.
+`patch -p1 --dry-run` clean atop 0001 against the current pin `5dcb71166`.
+
+## 0003-convert-openai-privacy-filter.patch
+
+**What:** the HF→GGUF converter. Adds `conversion/openai_privacy_filter.py`
+(`OpenAIPrivacyFilterModel`, a `GptOssModel` subclass) and registers it in
+`conversion/__init__.py` (`OpenAIPrivacyFilterForTokenClassification` →
+`openai_privacy_filter`). It reuses the gpt-oss vocab and tensor handling and
+overrides only:
+- **expert `gate_up` split** — privacy-filter packs gate/up as **concatenated
+  halves** (`chunk(2)`), *not* gpt-oss's interleaved `::2`/`1::2`. This is the
+  one load-bearing divergence; a wrong split yields a silently-wrong model.
+  **Confirmed correct by per-layer parity** (full-logit cos = 1.0 vs HF; the FFN
+  out matched once the attention upstream was fixed — see below).
+- **per-dim RoPE frequency factors** (`generate_extra_tensors`) — the model's
+  `rope_parameters` set YaRN with `truncate: false`, but ggml's `rope_yarn`
+  unconditionally `floor()/ceil()`s the interpolation-ramp boundaries. That
+  rounding shifts the ramp in the transition band (here dims ~20–34), a per-dim
+  frequency error up to ~21% that mis-rotates Q/K and softens attention. Rather
+  than change ggml's shared YaRN (which would perturb every other YaRN model),
+  the converter computes HF's *exact* `inv_freq` (truncate=false) and writes
+  `rope_freqs.weight = extrap / inv_freq` (1.0 … factor). The loader (0004)
+  then disables ggml's YaRN ramp and keeps only the YaRN attention mscale, so
+  these freq-factors fully define the per-dim frequencies.
+- **token-classification head** — writes `pooling_type = TOKEN_CLS`, the ordered
+  `id2label` table (`add_classifier_output_labels`), and `n_embd_out =
+  len(labels)` (= n_cls_out). `score.{weight,bias}` map to `cls.output.*` via
+  the 0002 `tensor_mapping` entry; no LM head is emitted.
+- aliases `rope_parameters` → `rope_scaling` so the base YaRN handling fires
+  (this arch renamed the key).
+
+Everything else (down_proj, q/k/v/o + biases, attn sinks, router, norms,
+embeddings) converts via the gpt-oss base unchanged.
+
+**Validated end-to-end** against the real `OpenMed/privacy-filter-multilingual`
+weights: `convert_hf_to_gguf.py` produces a 156-tensor F16 GGUF whose metadata
+is correct — `general.architecture = openai-privacy-filter`, `pooling_type = 5`
+(TOKEN_CLS), 217 `classifier.output_labels`, `embedding_length_out = 217`,
+`cls.output.{weight 640×217, bias 217}`, rope `yarn`/factor 32/orig_ctx 4096/
+freq_base 150000, experts 128/4, sliding_window 128. The only thing the GGUF
+structure can't confirm is the gate_up *packing order* — that's a numeric check
+deferred to per-layer parity (Task 5).
+
+Repro (one-shot env; torch needs a 64-bit libstdc++ on `LD_LIBRARY_PATH` under
+nix): `pip install torch numpy safetensors sentencepiece protobuf transformers`
+into a venv, then
+`PYTHONPATH=gguf-py python convert_hf_to_gguf.py <model_dir> --outtype f16`.
+
+## 0004-graph-openai-privacy-filter.patch
+
+**What:** the model class, graph, and loader wiring for the
+`openai-privacy-filter` arch. Adds `src/models/openai-privacy-filter.cpp`
+(`llama_model_openai_privacy_filter` — `load_arch_hparams` /
+`load_arch_tensors` / `build_arch_graph` + nested `graph`), its `struct`
+declaration in `src/models/models.h`, and wiring sites in
+`src/llama-model.cpp` (the factory `case`, the **NORM** `rope_type` list, and
+the `n_ff_exp` info-log condition). No `CMakeLists.txt` change — model
+sources are gathered by `file(GLOB "models/*.cpp")`.
+
+The graph is the `llama_model_openai_moe` body re-purposed as a
+bidirectional token classifier:
+- `load_arch_hparams` sets `causal_attn = false`, `swa_type =
+  LLAMA_SWA_TYPE_SYMMETRIC`, `n_swa = 2 * sliding_window` (SYMMETRIC masks
+  `|p1-p0| > n_swa/2`, so the HF half-width window round-trips), and
+  `set_swa_pattern(0)` so **every** layer is windowed (uniform band, no
+  alternating dense layers).
+- the graph uses `build_attn_inp_no_cache()` (no KV cache; the no-cache
+  input allocates the SWA mask because `swa_type != NONE`), passes the
+  per-layer `attn_sinks` to `build_attn`, and **omits `build_inp_out_ids()`
+  pruning** so every token keeps a logit.
+- **RoPE.** privacy-filter uses the **interleaved (GPT-J) rope layout**
+  (`_apply_rotary_emb` pairs `x[..., ::2]/x[..., 1::2]`), so the arch returns
+  `LLAMA_ROPE_TYPE_NORM` — *unlike* gpt-oss (`OPENAI_MOE`), which uses NEOX
+  rotate-half. (This was the dominant parity bug: NEOX mis-pairs the rotated
+  dims, leaving a per-token cos ≈ 0.82 that no frequency tweak could fix.)
+  `load_arch_hparams` also bakes the YaRN `truncate=false` fix: it sets
+  `rope_scaling_type = NONE` (disables ggml's floor/ceil YaRN ramp), keeps the
+  YaRN mscale via `rope_attn_factor = 1 + 0.1·ln(factor)`, sets the SWA
+  freq-scale to 1.0, and the graph passes the per-layer `rope_freqs`
+  (loaded from `rope_freqs.weight`, written by 0003) into `ggml_rope_ext` so
+  the per-dim frequencies reproduce HF exactly.
+- it ends at `res->t_embd` (no LM head). The framework then calls
+  `build_pooling()`, which under `pooling_type == TOKEN_CLS` applies
+  `cls_out`/`cls_out_b` per token (carry-patch 0001). `load_arch_tensors`
+  loads `cls.output.{weight,bias}`, the per-layer `rope_freqs`, and no
+  `output`/LM head.
+
+**Parity: solved.** Against `OpenMed/privacy-filter-multilingual` at F16, the
+new arch matches the HF reference token-for-token (12/12 argmax, full-logit
+cosine = 1.0; every layer's residual stream cos = 1.0, relerr ≈ 2e-4 = F16
+rounding), including the e-mail BIOES span. Verified on the real
+`llama-embedding` binary (model-default TOKEN_CLS pooling — do **not** pass
+`--pooling none`, which overrides it). The two parity-gated assumptions —
+`n_swa = 2 * sliding_window` and 0003's gate_up packing — are both confirmed
+correct. All five patches apply, in order, against `5dcb71166`.
+
+## 0005-no-cache-all-swa-mask-fix.patch
+
+**What:** a robustness fix to `llm_graph_input_attn_no_cache::set_input`
+(`src/llama-graph.cpp`). The no-cache attention input creates two mask
+tensors — the full (non-SWA) mask and, when `swa_type != NONE`, the SWA
+mask — but a model may consume only one. The openai-privacy-filter encoder
+makes **every** layer SWA (uniform symmetric window), so the full
+`self_kq_mask` is never referenced by the graph and the allocator leaves it
+unallocated (null buffer). The stock `set_input` unconditionally fills it,
+dereferencing a null `->data` and aborting at
+`GGML_ASSERT(ggml_backend_buffer_is_host(...))`. The fix only fills a mask
+that actually received a buffer (and guards the SWA branch symmetrically).
+
+This is a general fix — any all-SWA no-cache (encoder) model needs it — and
+is a candidate to upstream separately. Without it the model loads but
+aborts on first `decode`. Discovered via the CPU smoke test (it loads,
+tokenizes 12 tokens, then aborts in `set_input`); after the fix the model
+runs and produces `[n_cls_out, n_tokens]` logits. With the RoPE fixes in
+0003/0004 the per-token logits now match the HF reference exactly (12/12
+argmax, full-logit cosine = 1.0).
+
+---
+
+**Version note (applies to all patches here):** patches 0001–0003 were
+originally authored against `d6588daa8`; after LocalAI bumped `Makefile`
+`LLAMA_VERSION` to `5dcb71166686799f0d873eab7386234302d05ecf` (upstream
+#10128) all patches were regenerated and re-verified against that commit. All
+five apply in order with `patch -p1` (no fuzz, no rejected hunks) and the
+result compiles and reaches full HF parity. Re-run the apply check after any
+further `LLAMA_VERSION` bump.
diff --git a/backend/index.yaml b/backend/index.yaml
index 37e6890710e4..d2ced5d356f6 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -1557,6 +1557,7 @@
     - localai/localai-backends:master-metal-darwin-arm64-kitten-tts
 - !!merge <<: *local-store
   name: "local-store-development"
+  alias: "local-store"
   uri: "quay.io/go-skynet/local-ai-backends:master-cpu-local-store"
   mirrors:
     - localai/localai-backends:master-cpu-local-store
@@ -1567,6 +1568,7 @@
     - localai/localai-backends:latest-metal-darwin-arm64-local-store
 - !!merge <<: *local-store
   name: "metal-local-store-development"
+  alias: "local-store"
   uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-local-store"
   mirrors:
     - localai/localai-backends:master-metal-darwin-arm64-local-store
diff --git a/backend/python/transformers/backend.py b/backend/python/transformers/backend.py
index a8c1840b3c46..4b251c7f1dca 100644
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -270,10 +270,17 @@ def LoadModel(self, request, context):
 
     def TokenClassify(self, request, context):
         # Runs HuggingFace's token-classification pipeline and returns
-        # the aggregated entity spans. The pipeline gives us byte
-        # offsets via aggregation_strategy="simple" (set at load
-        # time), so the caller can slice the original text without
-        # re-tokenising on the Go side.
+        # the aggregated entity spans.
+        #
+        # OFFSET UNITS: the proto contract (TokenClassifyEntity.start/end)
+        # is UTF-8 BYTE offsets into request.text. HuggingFace's pipeline,
+        # however, reports start/end as CODEPOINT offsets into the Python
+        # str (derived from the fast tokenizer's offset_mapping). Those
+        # coincide only for ASCII; for any multi-byte character they
+        # diverge — and this entry point exists to serve the explicitly
+        # multilingual privacy-filter model, so the conversion is
+        # mandatory, not a nicety. We build one prefix table mapping each
+        # codepoint index to its byte offset and translate every span.
         if not getattr(self, "TokenClassification", False):
             context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
             context.set_details("model was not loaded as Type=TokenClassification")
@@ -286,18 +293,50 @@ def TokenClassify(self, request, context):
             context.set_details(f"token-classification failed: {err}")
             return backend_pb2.TokenClassifyResponse()
 
+        text = request.text
+        # byte_at[i] = byte length of text[:i]; len == len(text)+1 so an
+        # exclusive end offset that points one past the last codepoint
+        # maps to len(text.encode("utf-8")). Built in a single O(n) pass.
+        byte_at = [0] * (len(text) + 1)
+        acc = 0
+        for i, ch in enumerate(text):
+            byte_at[i] = acc
+            acc += len(ch.encode("utf-8"))
+        byte_at[len(text)] = acc
+
+        def to_byte(cp_index, default):
+            # Clamp out-of-range codepoint indices into the table rather
+            # than throwing: a span we can't place is better dropped Go-side
+            # than crashing the RPC.
+            if cp_index is None:
+                cp_index = default
+            if cp_index < 0:
+                cp_index = 0
+            elif cp_index > len(text):
+                cp_index = len(text)
+            return byte_at[cp_index]
+
         threshold = request.threshold if request.threshold > 0 else 0.0
         entities = []
         for r in results:
             score = float(r.get("score", 0.0))
             if score < threshold:
                 continue
+            cp_start = r.get("start")
+            cp_end = r.get("end")
+            start = to_byte(cp_start, 0)
+            end = to_byte(cp_end, 0)
             entities.append(backend_pb2.TokenClassifyEntity(
                 entity_group=str(r.get("entity_group") or r.get("entity") or ""),
-                start=int(r.get("start", 0)),
-                end=int(r.get("end", 0)),
+                start=start,
+                end=end,
                 score=score,
-                text=str(r.get("word", "")),
+                # Slice the original text by the (codepoint) span so the
+                # echoed text matches start..end exactly, instead of the
+                # pipeline's reconstructed "word" which can carry wordpiece
+                # artifacts. Falls back to "word" when offsets are absent.
+                text=(text[cp_start:cp_end] if cp_start is not None and cp_end is not None
+                      else str(r.get("word", ""))),
             ))
         return backend_pb2.TokenClassifyResponse(entities=entities)
 
diff --git a/core/application/application.go b/core/application/application.go
index 29e05b6d1900..d5c286318dbd 100644
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -12,14 +12,15 @@ import (
 	"github.com/mudler/LocalAI/core/http/auth"
 	mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
 	"github.com/mudler/LocalAI/core/services/agentpool"
+	"github.com/mudler/LocalAI/core/services/cloudproxy/mitm"
 	"github.com/mudler/LocalAI/core/services/facerecognition"
 	"github.com/mudler/LocalAI/core/services/galleryop"
 	"github.com/mudler/LocalAI/core/services/monitoring"
 	"github.com/mudler/LocalAI/core/services/nodes"
 	"github.com/mudler/LocalAI/core/services/routing/admission"
 	"github.com/mudler/LocalAI/core/services/routing/billing"
-	"github.com/mudler/LocalAI/core/services/cloudproxy/mitm"
 	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/core/services/routing/piidetector"
 	"github.com/mudler/LocalAI/core/services/routing/router"
 	"github.com/mudler/LocalAI/core/services/voicerecognition"
 	"github.com/mudler/LocalAI/core/templates"
@@ -71,15 +72,15 @@ type Application struct {
 	// 1-to-1 host↔model invariant the dispatcher relies on. Read by
 	// /api/middleware/status so the admin UI can surface the cause.
 	mitmHostConflicts atomic.Pointer[map[string][]string]
-	routerDecisions    router.DecisionStore
-	routerRegistry     *router.Registry
-	admissionLimiter   *admission.Limiter
-	watchdogMutex      sync.Mutex
-	watchdogStop       chan bool
-	p2pMutex           sync.Mutex
-	p2pCtx             context.Context
-	p2pCancel          context.CancelFunc
-	agentJobMutex      sync.Mutex
+	routerDecisions   router.DecisionStore
+	routerRegistry    *router.Registry
+	admissionLimiter  *admission.Limiter
+	watchdogMutex     sync.Mutex
+	watchdogStop      chan bool
+	p2pMutex          sync.Mutex
+	p2pCtx            context.Context
+	p2pCancel         context.CancelFunc
+	agentJobMutex     sync.Mutex
 
 	// Distributed mode services (nil when not in distributed mode)
 	distributed *DistributedServices
@@ -254,6 +255,122 @@ func (a *Application) PIIEvents() pii.EventStore {
 	return a.piiEvents
 }
 
+// PIINERResolver returns the resolver the chat PII middleware uses to
+// turn a configured detector model name into a ready-to-use NERConfig:
+// a token-classifier bound over the shared model loader (lazy — the
+// model loads on first Detect) plus the detection policy read from that
+// model's own pii_detection block. Unknown names resolve to (zero,
+// false) so the middleware fails closed. Pass it via pii.WithNERResolver.
+func (a *Application) PIINERResolver() pii.NERDetectorResolver {
+	return func(modelName string) (pii.NERConfig, bool) {
+		if modelName == "" {
+			return pii.NERConfig{}, false
+		}
+		cfg, ok := a.ModelConfigLoader().GetModelConfig(modelName)
+		if !ok {
+			return pii.NERConfig{}, false
+		}
+
+		// Pattern detectors match secrets with the restricted-regex tier
+		// in-process (no backend load). Build a pattern matcher instead of the
+		// gRPC token-classifier; on a compile error fail closed with an error
+		// detector so the request is blocked, not silently unscanned.
+		if cfg.IsPatternDetector() {
+			det, err := piidetector.NewPattern(cfg, a.ApplicationConfig())
+			if err != nil {
+				det = pii.NewErrNERDetector(err.Error())
+			}
+			return pii.NERConfigFromRaw(
+				det,
+				0, // patterns are deterministic — no confidence floor
+				cfg.PIIDetectionDefaultAction(),
+				patternEntityActions(cfg),
+				pii.SourcePattern,
+			), true
+		}
+
+		det := piidetector.New(a.ModelLoader(), cfg, a.ApplicationConfig())
+		return pii.NERConfigFromRaw(
+			det,
+			cfg.PIIDetectionMinScore(),
+			cfg.PIIDetectionDefaultAction(),
+			cfg.PIIDetectionEntityActions(),
+			pii.SourceNER,
+		), true
+	}
+}
+
+// patternEntityActions merges a pattern detector's per-pattern Action overrides
+// into its entity_actions map. A pattern reports matches under its Name, so a
+// per-pattern action is just an entity_actions[Name] entry; explicit
+// entity_actions still win if both are set.
+func patternEntityActions(cfg config.ModelConfig) map[string]string {
+	out := cfg.PIIDetectionEntityActions()
+	for _, p := range cfg.PIIDetection.Patterns {
+		if p.Action == "" || p.Name == "" {
+			continue
+		}
+		if out == nil {
+			out = map[string]string{}
+		}
+		if _, exists := out[p.Name]; !exists {
+			out[p.Name] = p.Action
+		}
+	}
+	return out
+}
+
+// ResolvePIIPolicy resolves the effective request-side PII policy for a
+// consuming model, layering the instance-wide default detector
+// (PIIDefaultDetectors, set via POST /api/settings) on top of the per-model
+// config. It is the single decision point shared by the chat middleware (via
+// WithPolicyResolver) and the MITM listener so both agree.
+//
+//   - enabled: an explicit pii.enabled on the model always wins (true OR
+//     false). Otherwise PII is on when the backend defaults it on — today
+//     that means cloud-proxy models, which cross the network to a third party.
+//   - detectors: the model's own pii.detectors, or — when it lists none — the
+//     global PIIDefaultDetectors fallback. This is what makes cloud-proxy/MITM
+//     redaction work out of the box.
+//
+// appConfig is read live, so changes via the settings API take effect on the
+// next request without a restart.
+func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, detectors []string) {
+	if cfg == nil {
+		return false, nil
+	}
+	appCfg := a.ApplicationConfig()
+
+	if cfg.PII.Enabled != nil {
+		enabled = *cfg.PII.Enabled
+	} else {
+		enabled = cfg.PIIIsEnabled() // backend default (cloud-proxy)
+	}
+	if !enabled {
+		return false, nil
+	}
+
+	detectors = cfg.PIIDetectors()
+	if len(detectors) == 0 {
+		detectors = append([]string(nil), appCfg.PIIDefaultDetectors...)
+	}
+	return enabled, detectors
+}
+
+// PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for
+// pii.WithPolicyResolver. The middleware carries the resolved model config as
+// `any` (the MODEL_CONFIG context value, a *config.ModelConfig); this asserts
+// it back and applies the instance-wide defaults.
+func (a *Application) PIIPolicyResolver() pii.PolicyResolver {
+	return func(modelCfg any) (bool, []string) {
+		cfg, ok := modelCfg.(*config.ModelConfig)
+		if !ok {
+			return false, nil
+		}
+		return a.ResolvePIIPolicy(cfg)
+	}
+}
+
 // MITMCA returns the cloudproxy MITM proxy's CA, or nil when the
 // MITM listener is disabled.
 func (a *Application) MITMCA() *mitm.CA { return a.mitmCA.Load() }
diff --git a/core/application/mitm.go b/core/application/mitm.go
index 293b3d449c20..cb1ab3993c8c 100644
--- a/core/application/mitm.go
+++ b/core/application/mitm.go
@@ -8,9 +8,33 @@ import (
 
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/services/cloudproxy/mitm"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/xlog"
 )
 
+// startMITMIfConfigured brings up the cloudproxy MITM listener when an
+// address is configured, treating any startup failure as non-fatal.
+//
+// The listener is opt-in middleware whose address is persisted in runtime
+// settings (/api/settings → runtime_settings.json) and replayed on every
+// boot. A bad value — e.g. a host the process can't bind, like a LAN IP
+// inside a container — must NOT abort the whole server: doing so crash-loops
+// with no way out, because the Settings UI used to correct the address can't
+// load if startup never completes. So on failure we log loudly and carry on;
+// the admin fixes the address via /api/settings, which calls RestartMITM.
+func startMITMIfConfigured(app *Application, options *config.ApplicationConfig) {
+	if options.MITMListen == "" {
+		return
+	}
+	if err := startMITMProxy(app, options); err != nil {
+		xlog.Error("mitm: cloudproxy listener failed to start — continuing without it",
+			"listen", options.MITMListen,
+			"error", err,
+			"hint", "fix the address via Settings (e.g. \":8082\" to bind all interfaces) and the listener will restart",
+		)
+	}
+}
+
 func startMITMProxy(app *Application, options *config.ApplicationConfig) error {
 	app.mitmMutex.Lock()
 	defer app.mitmMutex.Unlock()
@@ -68,25 +92,41 @@ func startMITMLocked(app *Application, options *config.ApplicationConfig) error
 	}
 	sort.Strings(effectiveHosts)
 
-	// Per-host PII gate inherits from the owning model's pii.enabled.
-	// A non-cloud-proxy backend with no explicit pii.enabled resolves
-	// to false → host is intercepted but the regex pass is skipped
-	// (audit events still record).
-	var piiDisabled []string
+	// Per-host NER detectors come from the owning model's pii.detectors
+	// (resolved against each detector model's pii_detection policy). A
+	// host whose model has pii.enabled=false, lists no detectors, or
+	// whose detectors can't be resolved gets no entry → it is intercepted
+	// and forwarded unredacted (audit events still record traffic). An
+	// unresolvable detector is recorded as an error-detector so the
+	// request fails closed at request time rather than leaking.
+	resolver := app.PIINERResolver()
+	detectorsByHost := map[string][]pii.NERConfig{}
 	for host, modelName := range ownership.Owners {
 		cfg, exists := app.backendLoader.GetModelConfig(modelName)
 		if !exists {
 			continue
 		}
-		if !cfg.PIIIsEnabled() {
-			piiDisabled = append(piiDisabled, host)
+		// Resolve through the shared policy so cloud-proxy hosts inherit the
+		// instance-wide default detector when they name none of their own.
+		enabled, detectors := app.ResolvePIIPolicy(&cfg)
+		if !enabled || len(detectors) == 0 {
+			continue
+		}
+		cfgs := make([]pii.NERConfig, 0, len(detectors))
+		for _, name := range detectors {
+			nc, ok := resolver(name)
+			if !ok {
+				xlog.Error("mitm: detector model not resolvable; requests to host will fail closed", "host", host, "detector", name)
+				nc = pii.NERConfig{Detector: pii.NewErrNERDetector("detector model '" + name + "' not resolvable")}
+			}
+			cfgs = append(cfgs, nc)
 		}
+		detectorsByHost[host] = cfgs
 	}
 
 	handler := mitm.NewPIIHandler(mitm.PIIHandlerOptions{
-		Redactor:             app.piiRedactor,
-		EventStore:           app.piiEvents,
-		HostsWithPIIDisabled: piiDisabled,
+		EventStore:      app.piiEvents,
+		DetectorsByHost: detectorsByHost,
 	})
 
 	srv, err := mitm.NewServer(mitm.Config{
@@ -109,7 +149,7 @@ func startMITMLocked(app *Application, options *config.ApplicationConfig) error
 		"ca_dir", caDir,
 		"intercept_hosts", effectiveHosts,
 		"model_owned_hosts", len(ownership.Owners),
-		"pii_disabled_hosts", len(piiDisabled),
+		"pii_detector_hosts", len(detectorsByHost),
 	)
 	return nil
 }
diff --git a/core/application/mitm_test.go b/core/application/mitm_test.go
new file mode 100644
index 000000000000..b7627fa2d66c
--- /dev/null
+++ b/core/application/mitm_test.go
@@ -0,0 +1,58 @@
+package application
+
+import (
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/system"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// minimal Application wired enough for startMITMProxy: an empty model
+// config loader (no host claims), CA written under a temp DataPath.
+func newMITMTestApp(dataPath string) (*Application, *config.ApplicationConfig) {
+	state, err := system.GetSystemState()
+	Expect(err).NotTo(HaveOccurred())
+	state.Model.ModelsPath = dataPath
+	opts := config.NewApplicationConfig(
+		config.WithSystemState(state),
+		config.WithDataPath(dataPath),
+	)
+	return newApplication(opts), opts
+}
+
+var _ = Describe("startMITMIfConfigured", func() {
+	It("does nothing when no listen address is configured", func() {
+		app, opts := newMITMTestApp(GinkgoT().TempDir())
+		opts.MITMListen = ""
+
+		Expect(func() { startMITMIfConfigured(app, opts) }).NotTo(Panic())
+		Expect(app.mitmServer.Load()).To(BeNil(), "no listener should be stored when disabled")
+	})
+
+	// Regression: a persisted-but-unbindable MITM address (e.g. a LAN host
+	// inside a container) must not abort startup. startMITMIfConfigured
+	// swallows the bind error so the rest of LocalAI still comes up and the
+	// admin can fix the address via the Settings UI.
+	It("logs and continues when the listen address cannot be bound", func() {
+		app, opts := newMITMTestApp(GinkgoT().TempDir())
+		// 192.0.2.1 is TEST-NET-1 (RFC 5737): guaranteed not assigned to any
+		// local interface, so bind fails deterministically without DNS.
+		opts.MITMListen = "192.0.2.1:8082"
+
+		Expect(func() { startMITMIfConfigured(app, opts) }).NotTo(Panic())
+		Expect(app.mitmServer.Load()).To(BeNil(), "failed listener must not be stored")
+	})
+
+	It("starts and stores the listener on a bindable address", func() {
+		app, opts := newMITMTestApp(GinkgoT().TempDir())
+		opts.MITMListen = "127.0.0.1:0" // OS-assigned free port
+
+		startMITMIfConfigured(app, opts)
+
+		srv := app.mitmServer.Load()
+		Expect(srv).NotTo(BeNil(), "listener should be stored on success")
+		DeferCleanup(srv.Stop)
+		Expect(srv.Addr()).NotTo(BeEmpty())
+	})
+})
diff --git a/core/application/pii_policy_test.go b/core/application/pii_policy_test.go
new file mode 100644
index 000000000000..e221293c3e64
--- /dev/null
+++ b/core/application/pii_policy_test.go
@@ -0,0 +1,51 @@
+package application
+
+import (
+	"github.com/mudler/LocalAI/core/config"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("ResolvePIIPolicy", func() {
+	chat := config.FLAG_CHAT
+	bp := func(b bool) *bool { return &b }
+	mk := func(c *config.ApplicationConfig) *Application {
+		return &Application{applicationConfig: c}
+	}
+
+	It("lets an explicit pii.enabled=false win over the global default detector", func() {
+		app := mk(&config.ApplicationConfig{PIIDefaultDetectors: []string{"pf"}})
+		cfg := &config.ModelConfig{Backend: "cloud-proxy", KnownUsecases: &chat}
+		cfg.PII.Enabled = bp(false)
+		enabled, dets := app.ResolvePIIPolicy(cfg)
+		Expect(enabled).To(BeFalse())
+		Expect(dets).To(BeNil())
+	})
+
+	It("enables a cloud-proxy model with the global default detector (closes the no-op gap)", func() {
+		// cloud-proxy defaults PIIIsEnabled()==true but lists no detectors, so
+		// without a global default it scans with nothing.
+		app := mk(&config.ApplicationConfig{PIIDefaultDetectors: []string{"pf"}})
+		cfg := &config.ModelConfig{Backend: "cloud-proxy"}
+		enabled, dets := app.ResolvePIIPolicy(cfg)
+		Expect(enabled).To(BeTrue())
+		Expect(dets).To(Equal([]string{"pf"}))
+	})
+
+	It("leaves a non-cloud model off by default (no instance usecase default-on)", func() {
+		app := mk(&config.ApplicationConfig{PIIDefaultDetectors: []string{"pf"}})
+		cfg := &config.ModelConfig{Backend: "llama-cpp", KnownUsecases: &chat}
+		enabled, _ := app.ResolvePIIPolicy(cfg)
+		Expect(enabled).To(BeFalse())
+	})
+
+	It("prefers the model's own detectors over the global default", func() {
+		app := mk(&config.ApplicationConfig{PIIDefaultDetectors: []string{"global-pf"}})
+		cfg := &config.ModelConfig{Backend: "cloud-proxy"}
+		cfg.PII.Detectors = []string{"own-pf"}
+		enabled, dets := app.ResolvePIIPolicy(cfg)
+		Expect(enabled).To(BeTrue())
+		Expect(dets).To(Equal([]string{"own-pf"}))
+	})
+})
diff --git a/core/application/router_factories.go b/core/application/router_factories.go
index d37cfb9d8115..879c43a835ee 100644
--- a/core/application/router_factories.go
+++ b/core/application/router_factories.go
@@ -1,63 +1,120 @@
 package application
 
 import (
+	"context"
+	"fmt"
+
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 )
 
-// adapterConfig resolves a model name to its runtime ModelConfig, or
-// nil when the name is unknown. Shared by the router-facing factories
-// below and by ModelConfigLookup.
+// adapterConfig resolves a model name to its runtime ModelConfig, or nil when
+// unknown. LoadModelConfigFileByNameDefaultOptions never returns nil — for an
+// unknown name it returns a defaults-filled stub with an empty Name (the YAML
+// `name:` field is required by Validate), which is how we tell the two apart.
 func (a *Application) adapterConfig(modelName string) *config.ModelConfig {
 	cfg, err := a.backendLoader.LoadModelConfigFileByNameDefaultOptions(modelName, a.applicationConfig)
-	if err != nil || cfg == nil {
+	if err != nil || cfg == nil || cfg.Name == "" {
 		return nil
 	}
 	return cfg
 }
 
-// ModelConfigLookup is the lookup function the router middleware's
-// classifier validator uses to confirm classifier_model declares
-// FLAG_SCORE before binding it.
+// ModelConfigLookup is the lookup the router middleware's classifier validator
+// uses to confirm classifier_model declares FLAG_SCORE before binding it.
 func (a *Application) ModelConfigLookup() func(modelName string) *config.ModelConfig {
 	return a.adapterConfig
 }
 
-// Scorer returns a backend.Scorer bound to the named model, or nil
-// when the model is unknown. Used as a method value (app.Scorer) by
-// router.ClassifierDeps — no factory-of-factory wrapper needed.
+// The router-facing factories below (Scorer, Embedder, Reranker, TokenCounter)
+// bind a model NAME at construction and re-resolve the CONFIG on every call.
+// Capturing the config at construction would bake in whatever state
+// adapterConfig saw first — including a stub returned before the YAML reached
+// bcl.configs (e.g. /import-model or gallery install racing startup). The
+// classifier registry caches factories by router-config fingerprint, so a
+// once-stale capture stays stale until the router config is edited.
+
 func (a *Application) Scorer(modelName string) backend.Scorer {
-	cfg := a.adapterConfig(modelName)
+	if a.adapterConfig(modelName) == nil {
+		return nil
+	}
+	return &lazyScorer{app: a, modelName: modelName}
+}
+
+type lazyScorer struct {
+	app       *Application
+	modelName string
+}
+
+func (l *lazyScorer) Score(ctx context.Context, prompt string, candidates []string) ([]backend.CandidateScore, error) {
+	cfg := l.app.adapterConfig(l.modelName)
 	if cfg == nil {
+		return nil, fmt.Errorf("scorer: model %q no longer available", l.modelName)
+	}
+	return backend.NewScorer(l.app.modelLoader, *cfg, l.app.applicationConfig).Score(ctx, prompt, candidates)
+}
+
+// TokenCounter returns a func so the middleware's literal field type accepts
+// it as a method value without importing core/http/middleware from here.
+func (a *Application) TokenCounter(modelName string) func(string) (int, error) {
+	if a.adapterConfig(modelName) == nil {
 		return nil
 	}
-	return backend.NewScorer(a.modelLoader, *cfg, a.applicationConfig)
+	return func(text string) (int, error) {
+		cfg := a.adapterConfig(modelName)
+		if cfg == nil {
+			return 0, fmt.Errorf("token counter: model %q no longer available", modelName)
+		}
+		resp, err := backend.ModelTokenize(text, a.modelLoader, *cfg, a.applicationConfig)
+		if err != nil {
+			return 0, err
+		}
+		return len(resp.Tokens), nil
+	}
 }
 
-// Reranker returns a backend.Reranker bound to the named model, or
-// nil when unknown. The reranker model's `type:` (e.g. "colbert")
-// selects the scoring head inside the rerankers backend.
 func (a *Application) Reranker(modelName string) backend.Reranker {
-	cfg := a.adapterConfig(modelName)
-	if cfg == nil {
+	if a.adapterConfig(modelName) == nil {
 		return nil
 	}
-	return backend.NewReranker(a.modelLoader, *cfg, a.applicationConfig)
+	return &lazyReranker{app: a, modelName: modelName}
 }
 
-// Embedder returns a backend.Embedder bound to the named model, or
-// nil when unknown. Used by the router's L2 embedding cache.
-func (a *Application) Embedder(modelName string) backend.Embedder {
-	cfg := a.adapterConfig(modelName)
+type lazyReranker struct {
+	app       *Application
+	modelName string
+}
+
+func (l *lazyReranker) Rerank(ctx context.Context, query string, documents []string) ([]backend.RerankResult, error) {
+	cfg := l.app.adapterConfig(l.modelName)
 	if cfg == nil {
+		return nil, fmt.Errorf("reranker: model %q no longer available", l.modelName)
+	}
+	return backend.NewReranker(l.app.modelLoader, *cfg, l.app.applicationConfig).Rerank(ctx, query, documents)
+}
+
+func (a *Application) Embedder(modelName string) backend.Embedder {
+	if a.adapterConfig(modelName) == nil {
 		return nil
 	}
-	return backend.NewEmbedder(a.modelLoader, *cfg, a.applicationConfig)
+	return &lazyEmbedder{app: a, modelName: modelName}
+}
+
+type lazyEmbedder struct {
+	app       *Application
+	modelName string
+}
+
+func (l *lazyEmbedder) Embed(ctx context.Context, text string) ([]float32, error) {
+	cfg := l.app.adapterConfig(l.modelName)
+	if cfg == nil {
+		return nil, fmt.Errorf("embedder: model %q no longer available", l.modelName)
+	}
+	return backend.NewEmbedder(l.app.modelLoader, *cfg, l.app.applicationConfig).Embed(ctx, text)
 }
 
-// VectorStore returns a backend.VectorStore for the named collection,
-// or nil when the name is empty. Each router model gets its own
-// backend process via the model loader's cache keyed by storeName.
+// VectorStore takes a store name, not a model name — no adapterConfig, no
+// staleness to avoid.
 func (a *Application) VectorStore(storeName string) backend.VectorStore {
 	return backend.NewVectorStore(a.modelLoader, a.applicationConfig, storeName)
 }
diff --git a/core/application/router_factories_test.go b/core/application/router_factories_test.go
new file mode 100644
index 000000000000..5a6988a88fba
--- /dev/null
+++ b/core/application/router_factories_test.go
@@ -0,0 +1,155 @@
+package application
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/system"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Regression: the router-facing factories used to capture
+// *config.ModelConfig at construction. A gallery install that raced
+// startup left a stub (Backend="") bound for the lifetime of the
+// classifier registry's cache entry, bypassing the user's `backend:`
+// config. These specs pin the lazy re-resolve.
+var _ = Describe("router_factories lazy config resolution", func() {
+	var (
+		tmpDir string
+		app    *Application
+	)
+
+	BeforeEach(func() {
+		var err error
+		tmpDir, err = os.MkdirTemp("", "router-factories-*")
+		Expect(err).NotTo(HaveOccurred())
+
+		appCfg := &config.ApplicationConfig{
+			Context:     context.Background(),
+			SystemState: &system.SystemState{Model: system.Model{ModelsPath: tmpDir}},
+		}
+		app = &Application{
+			backendLoader:     config.NewModelConfigLoader(tmpDir),
+			modelLoader:       model.NewModelLoader(appCfg.SystemState),
+			applicationConfig: appCfg,
+		}
+	})
+
+	AfterEach(func() {
+		_ = os.RemoveAll(tmpDir)
+	})
+
+	// writeCfg seeds both the on-disk YAML and the in-memory cache —
+	// removing only the cache would fall through to file-read.
+	writeCfg := func(name, backend string) {
+		yaml := "name: " + name + "\nbackend: " + backend + "\nparameters:\n  model: " + name + ".bin\n"
+		Expect(os.WriteFile(filepath.Join(tmpDir, name+".yaml"), []byte(yaml), 0644)).To(Succeed())
+		Expect(app.backendLoader.LoadModelConfigsFromPath(tmpDir)).To(Succeed())
+		cfg, ok := app.backendLoader.GetModelConfig(name)
+		Expect(ok).To(BeTrue(), "config must be loaded before the spec runs")
+		Expect(cfg.Backend).To(Equal(backend))
+	}
+
+	// removeCfg purges both the cache and the YAML so LoadModelConfigFileByName
+	// returns the empty-stub case and adapterConfig returns nil.
+	removeCfg := func(name string) {
+		app.backendLoader.RemoveModelConfig(name)
+		Expect(os.Remove(filepath.Join(tmpDir, name+".yaml"))).To(Succeed())
+	}
+
+	Context("Embedder", func() {
+		It("returns nil at construction for an unknown model", func() {
+			Expect(app.Embedder("missing")).To(BeNil())
+		})
+
+		It("re-resolves the model config on each Embed call", func() {
+			writeCfg("emb-test", "llama-cpp")
+			emb := app.Embedder("emb-test")
+			Expect(emb).NotTo(BeNil())
+
+			// The factory must hold the NAME, not a captured config —
+			// otherwise stale captures survive cache invalidation.
+			lazy, ok := emb.(*lazyEmbedder)
+			Expect(ok).To(BeTrue(), "Embedder must return *lazyEmbedder")
+			Expect(lazy.modelName).To(Equal("emb-test"))
+
+			// Mutate the cached config. A lazy implementation sees the
+			// update on the next adapterConfig call; a captured-at-
+			// construction implementation would still see "llama-cpp".
+			app.backendLoader.UpdateModelConfig("emb-test", func(c *config.ModelConfig) {
+				c.Backend = "rerankers"
+			})
+			Expect(lazy.app.adapterConfig("emb-test").Backend).To(Equal("rerankers"))
+
+			// Remove the config entirely → Embed must surface the disappearance.
+			removeCfg("emb-test")
+			_, err := emb.Embed(context.Background(), "anything")
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("no longer available"))
+		})
+	})
+
+	Context("Scorer", func() {
+		It("returns nil at construction for an unknown model", func() {
+			Expect(app.Scorer("missing")).To(BeNil())
+		})
+
+		It("re-resolves the model config on each Score call", func() {
+			writeCfg("score-test", "llama-cpp")
+			sc := app.Scorer("score-test")
+			Expect(sc).NotTo(BeNil())
+
+			lazy, ok := sc.(*lazyScorer)
+			Expect(ok).To(BeTrue(), "Scorer must return *lazyScorer")
+			Expect(lazy.modelName).To(Equal("score-test"))
+
+			removeCfg("score-test")
+			_, err := sc.Score(context.Background(), "prompt", []string{"a"})
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("no longer available"))
+		})
+	})
+
+	Context("Reranker", func() {
+		It("returns nil at construction for an unknown model", func() {
+			Expect(app.Reranker("missing")).To(BeNil())
+		})
+
+		It("re-resolves the model config on each Rerank call", func() {
+			writeCfg("rerank-test", "rerankers")
+			rr := app.Reranker("rerank-test")
+			Expect(rr).NotTo(BeNil())
+
+			lazy, ok := rr.(*lazyReranker)
+			Expect(ok).To(BeTrue(), "Reranker must return *lazyReranker")
+			Expect(lazy.modelName).To(Equal("rerank-test"))
+
+			removeCfg("rerank-test")
+			_, err := rr.Rerank(context.Background(), "q", []string{"d"})
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("no longer available"))
+		})
+	})
+
+	Context("TokenCounter", func() {
+		It("returns nil at construction for an unknown model", func() {
+			Expect(app.TokenCounter("missing")).To(BeNil())
+		})
+
+		It("re-resolves the model config on each call", func() {
+			writeCfg("tok-test", "llama-cpp")
+			tc := app.TokenCounter("tok-test")
+			Expect(tc).NotTo(BeNil())
+
+			removeCfg("tok-test")
+			_, err := tc("anything")
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("no longer available"))
+		})
+	})
+})
diff --git a/core/application/startup.go b/core/application/startup.go
index be559479f2f8..6019c565d924 100644
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -23,9 +23,9 @@ import (
 	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/core/services/routing/router"
 	"github.com/mudler/LocalAI/core/services/storage"
-	"github.com/mudler/LocalAI/pkg/signals"
 	coreStartup "github.com/mudler/LocalAI/core/startup"
 	"github.com/mudler/LocalAI/internal"
+	"github.com/mudler/LocalAI/pkg/signals"
 	"github.com/mudler/LocalAI/pkg/vram"
 
 	"github.com/mudler/LocalAI/pkg/model"
@@ -53,7 +53,6 @@ func New(opts ...config.AppOption) (*Application, error) {
 	caps, err := xsysinfo.CPUCapabilities()
 	if err == nil {
 		xlog.Debug("CPU capabilities", "capabilities", caps)
-
 	}
 	gpus, err := xsysinfo.GPUs()
 	if err == nil {
@@ -68,18 +67,18 @@ func New(opts ...config.AppOption) (*Application, error) {
 		return nil, fmt.Errorf("models path cannot be empty")
 	}
 
-	err = os.MkdirAll(options.SystemState.Model.ModelsPath, 0750)
+	err = os.MkdirAll(options.SystemState.Model.ModelsPath, 0o750)
 	if err != nil {
 		return nil, fmt.Errorf("unable to create ModelPath: %q", err)
 	}
 	if options.GeneratedContentDir != "" {
-		err := os.MkdirAll(options.GeneratedContentDir, 0750)
+		err := os.MkdirAll(options.GeneratedContentDir, 0o750)
 		if err != nil {
 			return nil, fmt.Errorf("unable to create ImageDir: %q", err)
 		}
 	}
 	if options.UploadDir != "" {
-		err := os.MkdirAll(options.UploadDir, 0750)
+		err := os.MkdirAll(options.UploadDir, 0o750)
 		if err != nil {
 			return nil, fmt.Errorf("unable to create UploadDir: %q", err)
 		}
@@ -87,7 +86,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 
 	// Create and migrate data directory
 	if options.DataPath != "" {
-		if err := os.MkdirAll(options.DataPath, 0750); err != nil {
+		if err := os.MkdirAll(options.DataPath, 0o750); err != nil {
 			return nil, fmt.Errorf("unable to create DataPath: %q", err)
 		}
 		// Migrate data from DynamicConfigsDir to DataPath if needed
@@ -192,44 +191,14 @@ func New(opts ...config.AppOption) (*Application, error) {
 		xlog.Info("stats: disabled by --disable-stats")
 	}
 
-	// Wire the regex PII filter. Default-on: a single-user box gets
-	// the built-in pattern set the first time it starts, with email/
-	// phone/SSN/credit-card on mask and api_key_prefix on block. If
-	// the operator wants different actions, --pii-config points at a
-	// YAML file that overrides per-id; --disable-pii turns it off
-	// entirely.
-	if !options.DisablePII {
-		patterns, err := pii.LoadConfig(options.PIIConfigPath)
-		if err != nil {
-			return nil, fmt.Errorf("pii config: %w", err)
-		}
-		application.piiRedactor = pii.NewRedactor(patterns)
-		application.piiEvents = pii.NewMemoryEventStore(0)
-		// Apply persisted per-pattern overrides — admins toggling
-		// action/disabled via the UI and clicking "Save to disk" land
-		// here on the next start. Bad ids are warned and ignored so a
-		// stale entry doesn't block startup.
-		for id, ov := range options.PIIPatternOverrides {
-			if ov.Action != nil {
-				if err := application.piiRedactor.SetAction(id, pii.Action(*ov.Action)); err != nil {
-					xlog.Warn("pii: persisted override skipped", "pattern", id, "error", err)
-					continue
-				}
-			}
-			if ov.Disabled != nil {
-				if err := application.piiRedactor.SetDisabled(id, *ov.Disabled); err != nil {
-					xlog.Warn("pii: persisted disable skipped", "pattern", id, "error", err)
-				}
-			}
-		}
-		xlog.Info("pii: filter enabled",
-			"patterns", len(patterns),
-			"config_path", options.PIIConfigPath,
-			"persisted_overrides", len(options.PIIPatternOverrides),
-		)
-	} else {
-		xlog.Info("pii: disabled by --disable-pii")
-	}
+	// Wire the PII filter subsystem. The redactor is now a stateless
+	// handle — detection is driven by per-model NER detectors
+	// (pii.detectors → the detector model's pii_detection policy), run
+	// request-side by the chat middleware and the MITM input path. The
+	// regex tier was removed; redaction is opt-in per model via
+	// PIIIsEnabled(). The event store backs the /api/pii/events audit log.
+	application.piiRedactor = &pii.Redactor{}
+	application.piiEvents = pii.NewMemoryEventStore(0)
 
 	// Wire the routing decision log. Always-on when stats are enabled —
 	// the per-router admin page reads this as the live activity feed
@@ -441,11 +410,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 	// traffic doesn't need a parallel config for MITM traffic.
 	// Runs after loadRuntimeSettingsFromFile so a listener configured
 	// via /api/settings is brought back up across restarts.
-	if options.MITMListen != "" {
-		if err := startMITMProxy(application, options); err != nil {
-			return nil, fmt.Errorf("mitm: startup: %w", err)
-		}
-	}
+	startMITMIfConfigured(application, options)
 
 	application.ModelLoader().SetBackendLoggingEnabled(options.EnableBackendLogging)
 
@@ -500,7 +465,7 @@ func startWatcher(options *config.ApplicationConfig) {
 	if _, err := os.Stat(options.DynamicConfigsDir); err != nil {
 		if os.IsNotExist(err) {
 			// We try to create the directory if it does not exist and was specified
-			if err := os.MkdirAll(options.DynamicConfigsDir, 0700); err != nil {
+			if err := os.MkdirAll(options.DynamicConfigsDir, 0o700); err != nil {
 				xlog.Error("failed creating DynamicConfigsDir", "error", err)
 			}
 		} else {
@@ -747,16 +712,6 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 		options.MITMListen = *settings.MITMListen
 	}
 
-	// PII pattern overrides — file is the only source; CLI flags don't
-	// reach into this map. Apply unconditionally when present; the
-	// redactor wiring below sees the result on first construction.
-	if settings.PIIPatternOverrides != nil {
-		options.PIIPatternOverrides = make(map[string]config.PIIPatternRuntimeOverride, len(*settings.PIIPatternOverrides))
-		for id, ov := range *settings.PIIPatternOverrides {
-			options.PIIPatternOverrides[id] = ov
-		}
-	}
-
 	// Backend upgrade flags
 	if settings.AutoUpgradeBackends != nil {
 		if !options.AutoUpgradeBackends {
@@ -907,7 +862,7 @@ func loadOrGenerateHMACSecret(path string) (string, error) {
 	}
 	secret := hex.EncodeToString(b)
 
-	if err := os.WriteFile(path, []byte(secret), 0600); err != nil {
+	if err := os.WriteFile(path, []byte(secret), 0o600); err != nil {
 		return "", fmt.Errorf("failed to persist HMAC secret: %w", err)
 	}
 
diff --git a/core/backend/embeddings.go b/core/backend/embeddings.go
index 4be2bc346ef9..eff88ef04b19 100644
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -100,8 +100,13 @@ func ModelEmbedding(ctx context.Context, s string, tokens []int, loader *model.M
 		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
 
 		traceData := map[string]any{
-			"input_text":         trace.TruncateString(s, 1000),
-			"input_tokens_count": len(tokens),
+			"input_text": trace.TruncateString(s, 1000),
+		}
+		// Only present for token-mode callers (pre-tokenized override);
+		// emitting "0" alongside input_text would read as "consumed zero
+		// tokens", which is wrong.
+		if len(tokens) > 0 {
+			traceData["input_tokens_count"] = len(tokens)
 		}
 
 		startTime := time.Now()
diff --git a/core/backend/options.go b/core/backend/options.go
index 0274bdb6e78a..09a00fd93107 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -87,11 +87,57 @@ func getSeed(c config.ModelConfig) int32 {
 	return seed
 }
 
-func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
-	b := 512
+// DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a
+// model config leaves them unset. Exported so callers that must respect the
+// effective decode window — notably the router's prompt trimmer — resolve the
+// same numbers grpcModelOpts does instead of guessing.
+const (
+	DefaultContextSize = 4096
+	DefaultBatchSize   = 512
+)
+
+// EffectiveContextSize is the context window the backend will run with: the
+// configured value, or DefaultContextSize when unset.
+func EffectiveContextSize(c config.ModelConfig) int {
+	if c.ContextSize != nil {
+		return *c.ContextSize
+	}
+	return DefaultContextSize
+}
+
+// EffectiveBatchSize is the single-decode batch the backend will run with.
+// Score, embedding, rerank and token-classification (NER) all process the whole
+// input in one pass: score decodes prompt+candidate (asserts n_tokens <=
+// n_batch), embedding/rerank pool over the full sequence in one physical batch
+// (n_ubatch), and the NER encoder runs one forward per n_ubatch-sized window.
+// So the batch is sized to the context — anything that fits the context fits
+// one pass, avoiding both the GGML_ASSERT crash (n_outputs_max <=
+// cparams.n_outputs_max, where n_outputs_max defaults to n_batch) and the
+// "input is too large to process" error. Explicit `batch:` always wins.
+func EffectiveBatchSize(c config.ModelConfig) int {
 	if c.Batch != 0 {
-		b = c.Batch
+		return c.Batch
 	}
+	// token_classify is checked explicitly AND via the embeddings flag: a
+	// token-classification (NER) model sets embeddings:true but declares
+	// known_usecases:[token_classify], and that declaration is authoritative —
+	// it suppresses the embeddings usecase guess, so HasUsecases(FLAG_EMBEDDINGS)
+	// is false here. Any pooled encoder (embeddings:true) is single-pass
+	// regardless of how its usecases resolved, so key off the flag as a catch-all.
+	singlePass := c.HasUsecases(config.FLAG_SCORE) ||
+		c.HasUsecases(config.FLAG_EMBEDDINGS) ||
+		c.HasUsecases(config.FLAG_RERANK) ||
+		c.HasUsecases(config.FLAG_TOKEN_CLASSIFY) ||
+		(c.Embeddings != nil && *c.Embeddings)
+	if ctx := EffectiveContextSize(c); singlePass && ctx > DefaultBatchSize {
+		return ctx
+	}
+	return DefaultBatchSize
+}
+
+func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
+	ctxSize := EffectiveContextSize(c)
+	b := EffectiveBatchSize(c)
 
 	flashAttention := "auto"
 
@@ -134,11 +180,6 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 		}
 	}
 
-	ctxSize := 4096
-	if c.ContextSize != nil {
-		ctxSize = *c.ContextSize
-	}
-
 	mmlock := false
 	if c.MMlock != nil {
 		mmlock = *c.MMlock
diff --git a/core/backend/options_internal_test.go b/core/backend/options_internal_test.go
index 5e1848f0f5bd..1c1bd1493545 100644
--- a/core/backend/options_internal_test.go
+++ b/core/backend/options_internal_test.go
@@ -97,3 +97,95 @@ var _ = Describe("gRPCPredictOpts reasoning_effort metadata", func() {
 		Expect(opts.Metadata).ToNot(HaveKey("reasoning_effort"))
 	})
 })
+
+var _ = Describe("grpcModelOpts NBatch", func() {
+	scoreUsecase := config.FLAG_SCORE
+	threads := 1
+	ctx := 4096
+
+	It("defaults to 512 for an ordinary model", func() {
+		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}}
+		opts := grpcModelOpts(cfg, "/tmp/models")
+		Expect(opts.NBatch).To(BeEquivalentTo(512))
+	})
+
+	It("sizes the batch to the context window for score models", func() {
+		// Score models decode the whole prompt+candidate in one
+		// llama_decode; n_batch must cover it or the backend aborts.
+		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &scoreUsecase}
+		opts := grpcModelOpts(cfg, "/tmp/models")
+		Expect(opts.NBatch).To(BeEquivalentTo(4096))
+	})
+
+	It("keeps an explicit batch over the score default", func() {
+		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &scoreUsecase}
+		cfg.Batch = 1024
+		opts := grpcModelOpts(cfg, "/tmp/models")
+		Expect(opts.NBatch).To(BeEquivalentTo(1024))
+	})
+
+	It("sizes the batch to the context window for embedding models", func() {
+		// Embedding/rerank pool over the whole sequence in one physical batch
+		// (n_ubatch); without this the input is capped at the 512 default and
+		// the backend returns "input is too large to process".
+		embeddings := true
+		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}}
+		cfg.Embeddings = &embeddings
+		opts := grpcModelOpts(cfg, "/tmp/models")
+		Expect(opts.NBatch).To(BeEquivalentTo(4096))
+	})
+
+	It("sizes the batch to the context window for rerank models", func() {
+		reranking := true
+		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}}
+		cfg.Reranking = &reranking
+		opts := grpcModelOpts(cfg, "/tmp/models")
+		Expect(opts.NBatch).To(BeEquivalentTo(4096))
+	})
+
+	It("sizes the batch to the context window for token-classification (NER) models", func() {
+		// The privacy-filter regression: a token_classify model sets
+		// embeddings:true but declares known_usecases:[token_classify], which
+		// is authoritative and suppresses the embeddings usecase guess — so
+		// HasUsecases(FLAG_EMBEDDINGS) is false. Without sizing the batch to
+		// the context the NER encoder loads at 512, shrinking the exact-pass
+		// window and tripping the GGML_ASSERT on longer inputs.
+		tokenClassify := config.FLAG_TOKEN_CLASSIFY
+		embeddings := true
+		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &tokenClassify}
+		cfg.Embeddings = &embeddings
+		opts := grpcModelOpts(cfg, "/tmp/models")
+		Expect(opts.NBatch).To(BeEquivalentTo(4096))
+	})
+
+	It("sizes the batch to the effective context for a token_classify model with no explicit context_size", func() {
+		// Mirrors the shipped gallery config (no batch, no context_size): the
+		// backend defaults n_ctx to 4096, so n_batch must follow.
+		tokenClassify := config.FLAG_TOKEN_CLASSIFY
+		embeddings := true
+		cfg := config.ModelConfig{Threads: &threads, KnownUsecases: &tokenClassify}
+		cfg.Embeddings = &embeddings
+		Expect(cfg.ContextSize).To(BeNil())
+		opts := grpcModelOpts(cfg, "/tmp/models")
+		Expect(opts.NBatch).To(BeEquivalentTo(4096))
+		Expect(opts.ContextSize).To(BeEquivalentTo(4096))
+	})
+
+	It("does not raise the batch when a score model's context is below the default", func() {
+		small := 256
+		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &small}, KnownUsecases: &scoreUsecase}
+		opts := grpcModelOpts(cfg, "/tmp/models")
+		Expect(opts.NBatch).To(BeEquivalentTo(512))
+	})
+
+	It("sizes the batch to the effective 4096 default for a score model with no explicit context_size", func() {
+		// The crash case: the backend defaults n_ctx to 4096, so n_batch must
+		// follow even when context_size is unset — otherwise n_batch stays 512
+		// against a 4096 window and the score decode hits the GGML_ASSERT.
+		cfg := config.ModelConfig{Threads: &threads, KnownUsecases: &scoreUsecase}
+		Expect(cfg.ContextSize).To(BeNil())
+		opts := grpcModelOpts(cfg, "/tmp/models")
+		Expect(opts.NBatch).To(BeEquivalentTo(4096))
+		Expect(opts.ContextSize).To(BeEquivalentTo(4096), "n_batch must match the effective n_ctx the backend receives")
+	})
+})
diff --git a/core/backend/stores.go b/core/backend/stores.go
index 4884765f2f93..8b73ee17c017 100644
--- a/core/backend/stores.go
+++ b/core/backend/stores.go
@@ -3,9 +3,10 @@ package backend
 import (
 	"context"
 	"fmt"
-	"strings"
+	"time"
 
 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/trace"
 
 	"github.com/mudler/LocalAI/pkg/grpc"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -39,34 +40,85 @@ func (s *localVectorStore) backend(_ context.Context) (grpc.Backend, error) {
 	return StoreBackend(s.loader, s.appConfig, s.storeName, "")
 }
 
-func (s *localVectorStore) Search(ctx context.Context, vec []float32) (float64, []byte, bool, error) {
-	be, err := s.backend(ctx)
-	if err != nil {
-		return 0, nil, false, fmt.Errorf("vector store load: %w", err)
+func (s *localVectorStore) Search(ctx context.Context, vec []float32) (sim float64, payload []byte, ok bool, err error) {
+	start := time.Now()
+	outcome := "hit"
+	defer func() {
+		s.recordTrace(start, "search", len(vec), sim, outcome, err)
+	}()
+	be, berr := s.backend(ctx)
+	if berr != nil {
+		outcome = "backend_load_error"
+		return 0, nil, false, fmt.Errorf("vector store load: %w", berr)
 	}
-	_, values, similarities, err := store.Find(ctx, be, vec, 1)
-	if err != nil {
-		// local-store's Find returns "existing length is -1" before
-		// any keys are inserted. Surface that as a clean miss so the
-		// cache layer treats it as an empty store and proceeds to
-		// Insert rather than skipping.
-		if strings.Contains(err.Error(), "existing length is -1") {
-			return 0, nil, false, nil
-		}
-		return 0, nil, false, fmt.Errorf("vector store find: %w", err)
+	_, values, similarities, ferr := store.Find(ctx, be, vec, 1)
+	if ferr != nil {
+		outcome = "find_error"
+		return 0, nil, false, fmt.Errorf("vector store find: %w", ferr)
 	}
 	if len(values) == 0 || len(similarities) == 0 {
+		outcome = "miss"
 		return 0, nil, false, nil
 	}
 	return float64(similarities[0]), values[0], true, nil
 }
 
-func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) error {
-	be, err := s.backend(ctx)
+func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) (err error) {
+	start := time.Now()
+	outcome := "ok"
+	defer func() {
+		s.recordTrace(start, "insert", len(vec), 0, outcome, err)
+	}()
+	be, berr := s.backend(ctx)
+	if berr != nil {
+		outcome = "backend_load_error"
+		return fmt.Errorf("vector store load: %w", berr)
+	}
+	if serr := store.SetSingle(ctx, be, vec, payload); serr != nil {
+		outcome = "insert_error"
+		return serr
+	}
+	return nil
+}
+
+// recordTrace surfaces vector-store calls in /api/backend-traces, including
+// the backend-load-failure path that otherwise vanishes into an xlog.Warn.
+// modelName uses the store namespace (e.g. "router-cache-smart-router") so
+// admins can tell which router's cache misbehaved; the backend is always
+// "local-store" and can't disambiguate.
+func (s *localVectorStore) recordTrace(start time.Time, op string, vecDim int, sim float64, outcome string, err error) {
+	if s.appConfig == nil || !s.appConfig.EnableTracing {
+		return
+	}
+	trace.InitBackendTracingIfEnabled(s.appConfig.TracingMaxItems, s.appConfig.TracingMaxBodyBytes)
+	errStr := ""
 	if err != nil {
-		return fmt.Errorf("vector store load: %w", err)
+		errStr = err.Error()
+	}
+	summary := op + " " + outcome
+	if op == "search" && outcome == "hit" {
+		summary = fmt.Sprintf("search hit (sim=%.3f)", sim)
+	}
+	data := map[string]any{
+		"op":         op,
+		"outcome":    outcome,
+		"vector_dim": vecDim,
+	}
+	// Only include similarity for a real neighbor — miss/empty_store would
+	// otherwise render "similarity: 0" and read as a measured value.
+	if op == "search" && outcome == "hit" {
+		data["similarity"] = sim
 	}
-	return store.SetSingle(ctx, be, vec, payload)
+	trace.RecordBackendTrace(trace.BackendTrace{
+		Timestamp: start,
+		Duration:  time.Since(start),
+		Type:      trace.BackendTraceVectorStore,
+		ModelName: s.storeName,
+		Backend:   model.LocalStoreBackend,
+		Summary:   summary,
+		Error:     errStr,
+		Data:      data,
+	})
 }
 
 func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string, backend string) (grpc.Backend, error) {
diff --git a/core/backend/stores_test.go b/core/backend/stores_test.go
new file mode 100644
index 000000000000..e9d5208a3d45
--- /dev/null
+++ b/core/backend/stores_test.go
@@ -0,0 +1,88 @@
+package backend
+
+import (
+	"context"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/trace"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/system"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// findVectorStoreTrace returns the most recent vector_store trace whose
+// model_name matches storeName, or nil if none was recorded. Used by
+// the specs below to assert the trace landed without relying on
+// ring-buffer ordering across other tests in the suite.
+func findVectorStoreTrace(storeName string) *trace.BackendTrace {
+	traces := trace.GetBackendTraces()
+	for i := range traces {
+		bt := &traces[i]
+		if bt.Type == trace.BackendTraceVectorStore && bt.ModelName == storeName {
+			return bt
+		}
+	}
+	return nil
+}
+
+var _ = Describe("localVectorStore tracing", func() {
+	// Pin the trace surface admins read from /api/backend-traces.
+	// The original failure mode that motivated these specs — the
+	// local-store backend not installed — was silent on every surface
+	// except a per-call xlog.Warn. With tracing wired in, the row
+	// appears next to the embedder/score traces for the same request.
+	BeforeEach(func() {
+		trace.ClearBackendTraces()
+	})
+
+	It("records a vector_store trace with outcome=backend_load_error when the backend can't be loaded", func() {
+		// nil ModelLoader → s.backend → StoreBackend → panics on load.
+		// Use a real-but-empty loader so the failure surfaces as an
+		// error instead, exercising the load-failure trace path the
+		// admin would hit when local-store isn't installed.
+		appCfg := &config.ApplicationConfig{
+			EnableTracing:       true,
+			TracingMaxItems:     16,
+			TracingMaxBodyBytes: 1024,
+		}
+		s := &localVectorStore{
+			loader:    model.NewModelLoader(&system.SystemState{}),
+			appConfig: appCfg,
+			storeName: "router-cache-test",
+		}
+
+		// Search must surface the error AND record a trace describing it.
+		_, _, _, err := s.Search(context.Background(), []float32{0.1, 0.2, 0.3})
+		Expect(err).To(HaveOccurred())
+
+		Eventually(func() *trace.BackendTrace {
+			return findVectorStoreTrace("router-cache-test")
+		}).ShouldNot(BeNil())
+
+		bt := findVectorStoreTrace("router-cache-test")
+		Expect(bt.Backend).To(Equal(model.LocalStoreBackend))
+		Expect(bt.Data["op"]).To(Equal("search"))
+		Expect(bt.Data["outcome"]).To(Equal("backend_load_error"))
+		Expect(bt.Data["vector_dim"]).To(Equal(3))
+		// Error is the wrapped "vector store load: …" surfaced to the caller.
+		Expect(bt.Error).To(ContainSubstring("vector store load"))
+	})
+
+	It("does not record a trace when tracing is disabled", func() {
+		// Opt-out path: appConfig.EnableTracing=false must short-circuit
+		// before InitBackendTracingIfEnabled, so a workload with tracing
+		// turned off doesn't pay the channel-send cost per cache call.
+		appCfg := &config.ApplicationConfig{EnableTracing: false}
+		s := &localVectorStore{
+			loader:    model.NewModelLoader(&system.SystemState{}),
+			appConfig: appCfg,
+			storeName: "router-cache-disabled",
+		}
+		_, _, _, _ = s.Search(context.Background(), []float32{1})
+		Consistently(func() *trace.BackendTrace {
+			return findVectorStoreTrace("router-cache-disabled")
+		}).Should(BeNil())
+	})
+})
diff --git a/core/backend/token_classify.go b/core/backend/token_classify.go
new file mode 100644
index 000000000000..cb1e6b638c21
--- /dev/null
+++ b/core/backend/token_classify.go
@@ -0,0 +1,150 @@
+package backend
+
+import (
+	"context"
+	"time"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/trace"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	model "github.com/mudler/LocalAI/pkg/model"
+)
+
+// TokenEntity is one detected span from a token-classification (NER)
+// model. Mirrors pb.TokenClassifyEntity but keeps the proto type out of
+// consumers. Start/End are BYTE offsets into the classified text,
+// half-open (addressing text[Start:End]) — the proto contract. Group is
+// the model's entity label (e.g. "private_person", "EMAIL").
+type TokenEntity struct {
+	Group string  `json:"group"`
+	Start int     `json:"start"`
+	End   int     `json:"end"`
+	Score float32 `json:"score"`
+	Text  string  `json:"text"`
+}
+
+// TokenClassifyOptions controls a single TokenClassify request.
+type TokenClassifyOptions struct {
+	// Threshold drops entities the backend scores below this value at
+	// the source. 0 returns everything the model emits; downstream
+	// callers (e.g. the PII redactor's MinScore) can still filter
+	// further once they know the per-request policy.
+	Threshold float32
+}
+
+// TokenClassifier runs a token-classification model over text and
+// returns the detected entity spans. Implemented by NewTokenClassifier
+// over a model-loaded backend; the PII redactor's encoder/NER tier
+// consumes this via a pii.NERDetector adapter (see
+// core/services/routing/piidetector).
+type TokenClassifier interface {
+	TokenClassify(ctx context.Context, text string) ([]TokenEntity, error)
+}
+
+// NewTokenClassifier binds (loader, modelConfig, appConfig) into a
+// TokenClassifier. The underlying backend is resolved lazily on the
+// first call, mirroring NewScorer.
+func NewTokenClassifier(loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, opts TokenClassifyOptions) TokenClassifier {
+	return &modelTokenClassifier{loader: loader, modelConfig: modelConfig, appConfig: appConfig, opts: opts}
+}
+
+type modelTokenClassifier struct {
+	loader      *model.ModelLoader
+	modelConfig config.ModelConfig
+	appConfig   *config.ApplicationConfig
+	opts        TokenClassifyOptions
+}
+
+func (m *modelTokenClassifier) TokenClassify(ctx context.Context, text string) ([]TokenEntity, error) {
+	fn, err := ModelTokenClassify(text, m.opts, m.loader, m.modelConfig, m.appConfig)
+	if err != nil {
+		return nil, err
+	}
+	return fn(ctx)
+}
+
+// ModelTokenClassify loads the backend for modelConfig and returns a
+// closure that classifies `text`. Mirrors ModelScore: the closure is
+// bound to the loaded model so a caller can reuse it within a request
+// without re-resolving the backend.
+//
+// When tracing is enabled it records a BackendTraceTokenClassify row so the
+// detector's output — every entity's group, byte range, confidence and the
+// matched substring — shows in the Traces UI alongside the request it gated.
+// This is the technical view for debugging false positives (e.g. a phone
+// number scored as SSN); the persisted PIIEvent keeps only a hash.
+func ModelTokenClassify(text string, opts TokenClassifyOptions, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (func(ctx context.Context) ([]TokenEntity, error), error) {
+	modelOpts := ModelOptions(modelConfig, appConfig)
+	inferenceModel, err := loader.Load(modelOpts...)
+	if err != nil {
+		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
+		return nil, err
+	}
+	return func(ctx context.Context) ([]TokenEntity, error) {
+		var startTime time.Time
+		if appConfig.EnableTracing {
+			trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
+			startTime = time.Now()
+		}
+		resp, err := inferenceModel.TokenClassify(ctx, &pb.TokenClassifyRequest{
+			Text:      text,
+			Threshold: opts.Threshold,
+		})
+		entities := tokenClassifyResponseToEntities(resp)
+		if appConfig.EnableTracing {
+			trace.RecordBackendTrace(tokenClassifyTrace(modelConfig, text, opts.Threshold, entities, startTime, err))
+		}
+		if err != nil {
+			return nil, err
+		}
+		return entities, nil
+	}, nil
+}
+
+// tokenClassifyTrace assembles the Traces-UI row for one NER call: the input
+// preview, the threshold, and every detected entity (group, byte range,
+// confidence, matched text). Split out from the closure so the Data assembly
+// is unit-testable without a live backend.
+func tokenClassifyTrace(modelConfig config.ModelConfig, text string, threshold float32, entities []TokenEntity, start time.Time, callErr error) trace.BackendTrace {
+	errStr := ""
+	if callErr != nil {
+		errStr = callErr.Error()
+	}
+	return trace.BackendTrace{
+		Timestamp: start,
+		Duration:  time.Since(start),
+		Type:      trace.BackendTraceTokenClassify,
+		ModelName: modelConfig.Name,
+		Backend:   modelConfig.Backend,
+		Summary:   trace.TruncateString(text, 200),
+		Error:     errStr,
+		Data: map[string]any{
+			"input_chars": len(text),
+			"threshold":   threshold,
+			"entities":    entities,
+		},
+	}
+}
+
+// tokenClassifyResponseToEntities converts the wire-format response into
+// the value type consumed by callers. Extracted so the conversion can be
+// unit-tested without a real backend (see token_classify_test.go).
+func tokenClassifyResponseToEntities(resp *pb.TokenClassifyResponse) []TokenEntity {
+	if resp == nil {
+		return nil
+	}
+	out := make([]TokenEntity, 0, len(resp.Entities))
+	for _, e := range resp.Entities {
+		if e == nil {
+			continue
+		}
+		out = append(out, TokenEntity{
+			Group: e.EntityGroup,
+			Start: int(e.Start),
+			End:   int(e.End),
+			Score: e.Score,
+			Text:  e.Text,
+		})
+	}
+	return out
+}
diff --git a/core/backend/token_classify_test.go b/core/backend/token_classify_test.go
new file mode 100644
index 000000000000..3b9adda0e5a5
--- /dev/null
+++ b/core/backend/token_classify_test.go
@@ -0,0 +1,61 @@
+package backend
+
+import (
+	"errors"
+	"time"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/trace"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("tokenClassifyResponseToEntities", func() {
+	It("returns nil for a nil response", func() {
+		Expect(tokenClassifyResponseToEntities(nil)).To(BeNil())
+	})
+
+	It("maps proto entities to TokenEntity, skipping nil rows", func() {
+		resp := &pb.TokenClassifyResponse{
+			Entities: []*pb.TokenClassifyEntity{
+				{EntityGroup: "private_person", Start: 3, End: 8, Score: 0.97, Text: "Alice"},
+				nil,
+				{EntityGroup: "EMAIL", Start: 20, End: 40, Score: 0.5, Text: "a@b.com"},
+			},
+		}
+		Expect(tokenClassifyResponseToEntities(resp)).To(Equal([]TokenEntity{
+			{Group: "private_person", Start: 3, End: 8, Score: 0.97, Text: "Alice"},
+			{Group: "EMAIL", Start: 20, End: 40, Score: 0.5, Text: "a@b.com"},
+		}))
+	})
+
+	It("returns an empty (non-nil) slice for a response with no entities", func() {
+		out := tokenClassifyResponseToEntities(&pb.TokenClassifyResponse{})
+		Expect(out).NotTo(BeNil())
+		Expect(out).To(BeEmpty())
+	})
+})
+
+var _ = Describe("tokenClassifyTrace", func() {
+	cfg := config.ModelConfig{Name: "privacy-filter", Backend: "llama-cpp"}
+	ents := []TokenEntity{{Group: "SSN", Start: 5, End: 16, Score: 0.62, Text: "123-45-6789"}}
+
+	It("captures model, input preview, threshold and per-entity detail", func() {
+		tr := tokenClassifyTrace(cfg, "ssn is 123-45-6789", 0.5, ents, time.Now(), nil)
+		Expect(tr.Type).To(Equal(trace.BackendTraceTokenClassify))
+		Expect(tr.ModelName).To(Equal("privacy-filter"))
+		Expect(tr.Backend).To(Equal("llama-cpp"))
+		Expect(tr.Summary).To(ContainSubstring("ssn is"))
+		Expect(tr.Error).To(BeEmpty())
+		Expect(tr.Data["input_chars"]).To(Equal(len("ssn is 123-45-6789")))
+		Expect(tr.Data["threshold"]).To(BeEquivalentTo(float32(0.5)))
+		Expect(tr.Data["entities"]).To(Equal(ents))
+	})
+
+	It("records the backend error string when the call failed", func() {
+		tr := tokenClassifyTrace(cfg, "x", 0, nil, time.Now(), errors.New("boom"))
+		Expect(tr.Error).To(Equal("boom"))
+	})
+})
diff --git a/core/backend/tokenize.go b/core/backend/tokenize.go
index 96618d89cdc2..6b926b1793a1 100644
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -7,9 +7,23 @@ import (
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/trace"
 	"github.com/mudler/LocalAI/pkg/grpc"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 
+// tokenizeTokenCount returns the number of tokens in a backend response,
+// treating a nil response as zero. The gRPC client returns (nil, err) on
+// failure, and the tracing block below runs before that error is returned —
+// so the count must be read nil-safely here. Reading resp.Tokens on a nil
+// resp previously panicked the whole HTTP handler when tracing was enabled
+// (e.g. a transient tokenize failure during router probe-budget sizing).
+func tokenizeTokenCount(resp *pb.TokenizationResponse) int {
+	if resp == nil {
+		return 0
+	}
+	return len(resp.Tokens)
+}
+
 func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {
 
 	var inferenceModel grpc.Backend
@@ -40,10 +54,7 @@ func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.Model
 			errStr = err.Error()
 		}
 
-		tokenCount := 0
-		if resp.Tokens != nil {
-			tokenCount = len(resp.Tokens)
-		}
+		tokenCount := tokenizeTokenCount(resp)
 
 		trace.RecordBackendTrace(trace.BackendTrace{
 			Timestamp: startTime,
@@ -64,8 +75,8 @@ func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.Model
 		return schema.TokenizeResponse{}, err
 	}
 
-	if resp.Tokens == nil {
-		resp.Tokens = make([]int32, 0)
+	if resp == nil || resp.Tokens == nil {
+		return schema.TokenizeResponse{Tokens: make([]int32, 0)}, nil
 	}
 
 	return schema.TokenizeResponse{
diff --git a/core/backend/tokenize_test.go b/core/backend/tokenize_test.go
new file mode 100644
index 000000000000..3b5c8e9fbc6f
--- /dev/null
+++ b/core/backend/tokenize_test.go
@@ -0,0 +1,27 @@
+package backend
+
+import (
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("tokenizeTokenCount", func() {
+	// Regression: the gRPC client returns (nil, err) when a tokenize call
+	// fails, and ModelTokenize's tracing block reads the token count before
+	// the error is returned. Dereferencing a nil response there panicked the
+	// HTTP handler (nil pointer dereference) — e.g. a transient tokenize
+	// failure while the router sized its probe-token budget.
+	It("returns zero for a nil response instead of panicking", func() {
+		Expect(tokenizeTokenCount(nil)).To(Equal(0))
+	})
+
+	It("returns zero when the response carries no tokens", func() {
+		Expect(tokenizeTokenCount(&pb.TokenizationResponse{})).To(Equal(0))
+	})
+
+	It("counts the tokens present on the response", func() {
+		Expect(tokenizeTokenCount(&pb.TokenizationResponse{Tokens: []int32{1, 2, 3}})).To(Equal(3))
+	})
+})
diff --git a/core/config/application_config.go b/core/config/application_config.go
index dd36b97b90fe..12799b1dd1c6 100644
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -48,25 +48,6 @@ type ApplicationConfig struct {
 	// touch disk or memory.
 	DisableStats bool
 
-	// PIIConfigPath points to an optional YAML file describing the PII
-	// pattern set. When empty, the routing/pii module's DefaultPatterns()
-	// (email, phone, SSN, credit card, IPv4, API key prefixes) are
-	// loaded with their default actions. Each entry overrides the
-	// matching default by ID:
-	//
-	//   patterns:
-	//     - id: email
-	//       action: route_local      # downgrade default mask -> route_local
-	//     - id: ssn
-	//       action: block            # upgrade default mask -> block
-	//
-	// Unknown ids are rejected with a clear error at startup.
-	PIIConfigPath string
-
-	// DisablePII turns the regex PII filter off entirely. Default
-	// (false) enables it on the OpenAI chat completions route.
-	DisablePII bool
-
 	// MITMListen is the address (host:port) the cloudproxy MITM
 	// listener binds on. Empty disables the MITM proxy entirely.
 	// Use case: redacting PII from Claude Code / Codex CLI traffic
@@ -75,19 +56,20 @@ type ApplicationConfig struct {
 	// LocalAI exposes at /api/middleware/proxy-ca.crt.
 	MITMListen string
 
+	// PIIDefaultDetectors lists token-classification (NER) detector model
+	// names applied to any PII-enabled model that does not name its own
+	// pii.detectors. This makes cloud-proxy / MITM redaction work out of the
+	// box (those default to PII-enabled but carry no detector list) and lets
+	// an operator set one detector for the whole instance. Set at runtime via
+	// POST /api/settings; read live by Application.ResolvePIIPolicy.
+	PIIDefaultDetectors []string
+
 	// MITMCADir holds the persisted MITM proxy CA cert and private
 	// key. The CA is generated on first start; subsequent starts
 	// reload it so clients keep trusting the same root. The key
 	// file is mode 0600.
 	MITMCADir string
 
-
-	// PIIPatternOverrides applies persisted per-id deltas (action,
-	// disabled) to the live redactor at startup. Loaded from
-	// runtime_settings.json and applied right after pii.NewRedactor.
-	// nil/empty leaves the YAML defaults in place.
-	PIIPatternOverrides map[string]PIIPatternRuntimeOverride
-
 	DisableWebUI                       bool
 	OllamaAPIRootEndpoint              bool
 	EnforcePredownloadScans            bool
@@ -116,11 +98,11 @@ type ApplicationConfig struct {
 	// --require-backend-integrity / LOCALAI_REQUIRE_BACKEND_INTEGRITY.
 	RequireBackendIntegrity bool
 
-	SingleBackend           bool // Deprecated: use MaxActiveBackends = 1 instead
-	MaxActiveBackends       int  // Maximum number of active backends (0 = unlimited, 1 = single backend mode)
-	WatchDogIdle bool
-	WatchDogBusy bool
-	WatchDog     bool
+	SingleBackend     bool // Deprecated: use MaxActiveBackends = 1 instead
+	MaxActiveBackends int  // Maximum number of active backends (0 = unlimited, 1 = single backend mode)
+	WatchDogIdle      bool
+	WatchDogBusy      bool
+	WatchDog          bool
 
 	// Memory Reclaimer settings (works with GPU if available, otherwise RAM)
 	MemoryReclaimerEnabled   bool    // Enable memory threshold monitoring
@@ -583,6 +565,7 @@ func WithJSONStringPreload(configFile string) AppOption {
 		o.PreloadJSONModels = configFile
 	}
 }
+
 func WithConfigFile(configFile string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.ConfigFile = configFile
@@ -671,21 +654,6 @@ func WithDisableStats(disable bool) AppOption {
 	}
 }
 
-// WithPIIConfigPath points the routing PII filter at a YAML config
-// file. CLI: --pii-config.
-func WithPIIConfigPath(path string) AppOption {
-	return func(o *ApplicationConfig) {
-		o.PIIConfigPath = path
-	}
-}
-
-// WithDisablePII turns the regex PII filter off. CLI: --disable-pii.
-func WithDisablePII(disable bool) AppOption {
-	return func(o *ApplicationConfig) {
-		o.DisablePII = disable
-	}
-}
-
 // WithMITMListen sets the address the cloudproxy MITM listener
 // binds on. Empty = disabled. CLI: --mitm-listen.
 func WithMITMListen(addr string) AppOption {
@@ -702,7 +670,6 @@ func WithMITMCADir(dir string) AppOption {
 	}
 }
 
-
 func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.DynamicConfigsDir = dynamicConfigsDir
@@ -1108,6 +1075,8 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 
 	mitmListen := o.MITMListen
 
+	piiDefaultDetectors := append([]string(nil), o.PIIDefaultDetectors...)
+
 	return RuntimeSettings{
 		WatchdogEnabled:           &watchdogEnabled,
 		WatchdogIdleEnabled:       &watchdogIdle,
@@ -1162,6 +1131,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 		LogoHorizontalFile:        &logoHorizontalFile,
 		FaviconFile:               &faviconFile,
 		MITMListen:                &mitmListen,
+		PIIDefaultDetectors:       &piiDefaultDetectors,
 	}
 }
 
@@ -1391,6 +1361,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
 		o.MITMListen = *settings.MITMListen
 	}
 
+	if settings.PIIDefaultDetectors != nil {
+		o.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
+	}
+
 	// Note: ApiKeys requires special handling (merging with startup keys) - handled in caller
 
 	return requireRestart
diff --git a/core/config/backend_capabilities.go b/core/config/backend_capabilities.go
index 19da89462fdb..a483a5cc53a9 100644
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -8,25 +8,26 @@ import (
 // Usecase name constants — the canonical string values used in gallery entries,
 // model configs (known_usecases), and UsecaseInfoMap keys.
 const (
-	UsecaseChat            = "chat"
-	UsecaseCompletion      = "completion"
-	UsecaseEdit            = "edit"
-	UsecaseVision          = "vision"
-	UsecaseEmbeddings      = "embeddings"
-	UsecaseTokenize        = "tokenize"
-	UsecaseImage           = "image"
-	UsecaseVideo           = "video"
-	UsecaseTranscript      = "transcript"
-	UsecaseTTS             = "tts"
-	UsecaseSoundGeneration = "sound_generation"
-	UsecaseRerank          = "rerank"
-	UsecaseDetection       = "detection"
-	UsecaseVAD             = "vad"
-	UsecaseAudioTransform      = "audio_transform"
-	UsecaseDiarization         = "diarization"
-	UsecaseRealtimeAudio       = "realtime_audio"
-	UsecaseFaceRecognition     = "face_recognition"
-	UsecaseSpeakerRecognition  = "speaker_recognition"
+	UsecaseChat               = "chat"
+	UsecaseCompletion         = "completion"
+	UsecaseEdit               = "edit"
+	UsecaseVision             = "vision"
+	UsecaseEmbeddings         = "embeddings"
+	UsecaseTokenize           = "tokenize"
+	UsecaseImage              = "image"
+	UsecaseVideo              = "video"
+	UsecaseTranscript         = "transcript"
+	UsecaseTTS                = "tts"
+	UsecaseSoundGeneration    = "sound_generation"
+	UsecaseRerank             = "rerank"
+	UsecaseDetection          = "detection"
+	UsecaseVAD                = "vad"
+	UsecaseAudioTransform     = "audio_transform"
+	UsecaseDiarization        = "diarization"
+	UsecaseRealtimeAudio      = "realtime_audio"
+	UsecaseFaceRecognition    = "face_recognition"
+	UsecaseSpeakerRecognition = "speaker_recognition"
+	UsecaseTokenClassify      = "token_classify"
 )
 
 // GRPCMethod identifies a Backend service RPC from backend.proto.
@@ -54,6 +55,7 @@ const (
 	MethodVoiceVerify        GRPCMethod = "VoiceVerify"
 	MethodVoiceEmbed         GRPCMethod = "VoiceEmbed"
 	MethodVoiceAnalyze       GRPCMethod = "VoiceAnalyze"
+	MethodTokenClassify      GRPCMethod = "TokenClassify"
 )
 
 // UsecaseInfo describes a single known_usecase value and how it maps
@@ -171,6 +173,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
 		GRPCMethod:  MethodVoiceVerify,
 		Description: "Speaker recognition — verify identity, embed and analyze voice via VoiceVerify, VoiceEmbed and VoiceAnalyze RPCs.",
 	},
+	UsecaseTokenClassify: {
+		Flag:        FLAG_TOKEN_CLASSIFY,
+		GRPCMethod:  MethodTokenClassify,
+		Description: "Per-token classification (NER) via the TokenClassify RPC — the PII detector tier. Declared explicitly via known_usecases; never auto-guessed, since the token-classification head is not useful as general generation or embeddings.",
+	},
 }
 
 // BackendCapability describes which gRPC methods and usecases a backend supports.
@@ -202,10 +209,14 @@ var BackendCapabilities = map[string]BackendCapability{
 	// --- LLM / text generation backends ---
 	"llama-cpp": {
 		GRPCMethods:      []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding, MethodTokenizeString},
-		PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEdit, UsecaseEmbeddings, UsecaseTokenize, UsecaseVision},
+		PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEdit, UsecaseEmbeddings, UsecaseTokenize, UsecaseVision, UsecaseTokenClassify},
 		DefaultUsecases:  []string{UsecaseChat},
 		AcceptsImages:    true, // requires mmproj
-		Description:      "llama.cpp GGUF models — LLM inference with optional vision via mmproj",
+		// token_classify is supported only with a patched llama.cpp that
+		// exposes per-token classification logits (the PII NER detector
+		// path); it is never auto-guessed and must be declared explicitly
+		// via known_usecases.
+		Description: "llama.cpp GGUF models — LLM inference with optional vision via mmproj",
 	},
 	"vllm": {
 		GRPCMethods:      []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
diff --git a/core/config/gguf.go b/core/config/gguf.go
index c373561b6319..6f82c809e5aa 100644
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -19,8 +19,18 @@ const (
 	defaultNGPULayers  = 99999999
 )
 
-func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
+// reservedNonChatModel reports whether the operator reserved this model for an
+// internal direct-decode primitive — the router score classifier or the PII
+// NER token_classify tier. Such a model has no chat template and must not be
+// given the generative-chat defaults the GGUF importer otherwise applies
+// (FLAG_CHAT, jinja templating); doing so trips the llama-cpp known_usecases
+// conflict check and makes the config invalid.
+func reservedNonChatModel(cfg *ModelConfig) bool {
+	return cfg.KnownUsecases != nil &&
+		(*cfg.KnownUsecases&(FLAG_SCORE|FLAG_TOKEN_CLASSIFY)) != 0
+}
 
+func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 	if defaultCtx == 0 && cfg.ContextSize == nil {
 		ctxSize := f.EstimateLLaMACppRun().ContextSize
 		if ctxSize > 0 {
@@ -77,11 +87,20 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		cfg.Name = f.Metadata().Name
 	}
 
-	// Instruct to use template from llama.cpp
-	cfg.TemplateConfig.UseTokenizerTemplate = true
-	cfg.FunctionsConfig.GrammarConfig.NoGrammar = true
-	cfg.Options = append(cfg.Options, "use_jinja:true")
-	cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT")
+	// A model the operator reserved for an internal direct-decode primitive
+	// (the router score classifier, or the PII NER token_classify tier) is not
+	// a chat model: it carries no chat template and must not be painted with
+	// the generative-chat defaults. In particular appending FLAG_CHAT here
+	// would fold chat into KnownUsecases on the next sync and trip the
+	// llama-cpp known_usecases conflict check in Validate(), making the config
+	// invalid so it is silently skipped at load. Respect the declaration.
+	if !reservedNonChatModel(cfg) {
+		// Instruct to use template from llama.cpp
+		cfg.TemplateConfig.UseTokenizerTemplate = true
+		cfg.FunctionsConfig.GrammarConfig.NoGrammar = true
+		cfg.Options = append(cfg.Options, "use_jinja:true")
+		cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT")
+	}
 
 	// Apply per-model-family inference parameter defaults (temperature, top_p, etc.)
 	ApplyInferenceDefaults(cfg, f.Metadata().Name)
diff --git a/core/config/meta/build.go b/core/config/meta/build.go
index 24cfb86b7962..39235b9998dd 100644
--- a/core/config/meta/build.go
+++ b/core/config/meta/build.go
@@ -93,6 +93,9 @@ func applyOverride(f *FieldMeta, o FieldMetaOverride) {
 	if o.Component != "" {
 		f.Component = o.Component
 	}
+	if o.Language != "" {
+		f.Language = o.Language
+	}
 	if o.Placeholder != "" {
 		f.Placeholder = o.Placeholder
 	}
diff --git a/core/config/meta/constants.go b/core/config/meta/constants.go
index b15eb53d0d94..9be49fec0eed 100644
--- a/core/config/meta/constants.go
+++ b/core/config/meta/constants.go
@@ -8,6 +8,7 @@ const (
 	ProviderModelsTTS        = "models:tts"
 	ProviderModelsTranscript = "models:transcript"
 	ProviderModelsVAD        = "models:vad"
+	ProviderModelsScore      = "models:score"
 )
 
 // Static option lists embedded directly in field metadata.
diff --git a/core/config/meta/pattern_meta_test.go b/core/config/meta/pattern_meta_test.go
new file mode 100644
index 000000000000..0b75f5055777
--- /dev/null
+++ b/core/config/meta/pattern_meta_test.go
@@ -0,0 +1,41 @@
+package meta_test
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/config/meta"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestMeta(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "config/meta suite")
+}
+
+var _ = Describe("pattern detector field metadata", func() {
+	byPath := func() map[string]meta.FieldMeta {
+		md := meta.BuildForTest(reflect.TypeOf(config.ModelConfig{}), meta.DefaultRegistry())
+		out := make(map[string]meta.FieldMeta, len(md.Fields))
+		for _, f := range md.Fields {
+			out[f.Path] = f
+		}
+		return out
+	}
+
+	It("renders builtins as a select with the catalogue as options", func() {
+		f, ok := byPath()["pii_detection.builtins"]
+		Expect(ok).To(BeTrue(), "pii_detection.builtins field should exist")
+		Expect(f.Component).To(Equal("pii-builtins-select"))
+		Expect(f.Options).NotTo(BeEmpty())
+	})
+
+	It("renders custom patterns with the pattern-list editor", func() {
+		f, ok := byPath()["pii_detection.patterns"]
+		Expect(ok).To(BeTrue(), "pii_detection.patterns field should exist")
+		Expect(f.Component).To(Equal("pii-pattern-list"))
+	})
+})
diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go
index 548b218921ba..b8222fe8210f 100644
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -1,5 +1,19 @@
 package meta
 
+import "github.com/mudler/LocalAI/core/services/routing/piipattern"
+
+// builtinPatternOptions turns the piipattern built-in catalogue into select
+// options for the editor's built-in-patterns checklist, keeping the catalogue
+// the single source of truth.
+func builtinPatternOptions() []FieldOption {
+	cat := piipattern.BuiltinCatalogue()
+	out := make([]FieldOption, 0, len(cat))
+	for _, b := range cat {
+		out = append(out, FieldOption{Value: b.Name, Label: b.Name + " — " + b.Description})
+	}
+	return out
+}
+
 // DefaultRegistry returns enrichment overrides for the ~30 most commonly used
 // config fields. Fields not listed here still appear with auto-generated
 // labels and type-inferred components.
@@ -226,6 +240,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Label:       "Chat Template",
 			Description: "Go template for chat completion requests",
 			Component:   "code-editor",
+			Language:    "gotemplate",
 			Order:       40,
 		},
 		"template.chat_message": {
@@ -233,6 +248,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Label:       "Chat Message Template",
 			Description: "Go template for individual chat messages",
 			Component:   "code-editor",
+			Language:    "gotemplate",
 			Order:       41,
 		},
 		"template.completion": {
@@ -240,13 +256,22 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Label:       "Completion Template",
 			Description: "Go template for completion requests",
 			Component:   "code-editor",
+			Language:    "gotemplate",
 			Order:       42,
 		},
+		"template.function": {
+			Section:     "templates",
+			Label:       "Functions Template",
+			Description: "Go template applied when tools/functions are present in the request",
+			Component:   "code-editor",
+			Language:    "gotemplate",
+			Order:       43,
+		},
 		"template.use_tokenizer_template": {
 			Section:     "templates",
 			Label:       "Use Tokenizer Template",
 			Description: "Use the chat template from the model's tokenizer config",
-			Order:       43,
+			Order:       44,
 		},
 		// Router section template — kept in the templates UI section
 		// (rather than the router section under "other") so operators
@@ -257,7 +282,8 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Label:       "Router Classifier System Prompt",
 			Description: "Go text/template (with sprig functions) for the routing system prompt the score classifier feeds to its classifier_model. Executed with `.Policies` ([]{Label, Description}). Empty falls back to the built-in Arch-Router-shaped prompt (route-listing block + JSON output schema). Override when the classifier model was trained on a different schema or you need the routing instructions in a different language. The candidate format scored against the model is fixed at `{\"route\": \"<label>\"}` — keep your override's output schema instruction matching that.",
 			Component:   "code-editor",
-			Order:       44,
+			Language:    "gotemplate",
+			Order:       45,
 		},
 
 		// --- Pipeline ---
@@ -365,18 +391,66 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 
 		// --- PII filtering (per-model) ---
 		"pii.enabled": {
-			Section:     "other",
+			Section:     "pii",
 			Label:       "PII Filtering Enabled",
 			Description: "Enable PII redaction middleware for this model. Unset means use the default (off for local backends, on for proxy-* / cloud-hosted backends).",
 			Component:   "toggle",
 			Order:       200,
 		},
-		"pii.patterns": {
-			Section:     "other",
-			Label:       "PII Pattern Overrides",
-			Description: "Override the global default action for specific patterns on this model. Patterns not listed here inherit the global action (Settings → Middleware → Filtering).",
+		"pii.detectors": {
+			Section:              "pii",
+			Label:                "PII Detector Models",
+			Description:          "Token-classification (NER) models that scan this model's requests for PII. The detection policy (which entities, what action, min score) lives on each detector model's own PII Detection block. Multiple detectors union their hits.",
+			Component:            "model-multi-select",
+			AutocompleteProvider: "models:token_classify",
+			Order:                201,
+		},
+
+		// --- PII detection policy (on a token_classify detector model) ---
+		"pii_detection.min_score": {
+			Section:     "pii",
+			Label:       "Detector Min Score",
+			Description: "When this model is used as a PII detector, drop detections scored below this confidence before they are acted on. 0 keeps every detection.",
+			Component:   "slider",
+			Min:         f64(0),
+			Max:         f64(1),
+			Step:        f64(0.01),
+			Order:       210,
+		},
+		"pii_detection.default_action": {
+			Section:     "pii",
+			Label:       "Detector Default Action",
+			Description: "Action applied to detected entity groups with no explicit per-entity override. Defaults to mask — the safe-by-default policy for a PII filter.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "mask", Label: "mask (redact the span)"},
+				{Value: "block", Label: "block (reject the request)"},
+				{Value: "allow", Label: "allow (detect & log only)"},
+			},
+			Default: "mask",
+			Order:   211,
+		},
+		"pii_detection.entity_actions": {
+			Section:     "pii",
+			Label:       "Detector Entity Actions",
+			Description: "Per-entity-group action policy for this detector model (e.g. PASSWORD → block, EMAIL → mask). Groups without an entry use the default action.",
+			Component:   "entity-action-list",
+			Order:       212,
+		},
+		"pii_detection.builtins": {
+			Section:     "pii",
+			Label:       "Built-in Secret Patterns",
+			Description: "Built-in regex patterns for common credentials (API keys, tokens, private keys). Turning any on makes this a pattern detector — it matches high-entropy secrets the NER tier can't, in-process with no model load.",
+			Component:   "pii-builtins-select",
+			Options:     builtinPatternOptions(),
+			Order:       213,
+		},
+		"pii_detection.patterns": {
+			Section:     "pii",
+			Label:       "Custom Secret Patterns",
+			Description: "Operator-defined patterns in a restricted regex subset (e.g. \"sk-prefix-\\w+\"). Each must contain a fixed literal anchor of ≥3 chars; open-ended shapes like emails are rejected (leave those to NER). Matches report under the pattern name as the entity group.",
 			Component:   "pii-pattern-list",
-			Order:       201,
+			Order:       214,
 		},
 
 		// --- Cloud passthrough proxy ---
@@ -385,7 +459,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 		// fails closed — the chat handler does NOT silently fall back
 		// to the local gRPC pipeline.
 		"proxy.mode": {
-			Section:     "other",
+			Section:     "proxy",
 			Label:       "Proxy Mode",
 			Description: "passthrough forwards the client's OpenAI body verbatim — point upstream_url at an OpenAI-compatible endpoint (incl. Anthropic's /v1/chat/completions compat layer). translate converts OpenAI ↔ Anthropic Messages so you can target a native API (/v1/messages); tool_calls and usage tokens survive the round-trip.",
 			Component:   "select",
@@ -397,7 +471,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:   208,
 		},
 		"proxy.provider": {
-			Section:     "other",
+			Section:     "proxy",
 			Label:       "Proxy Provider",
 			Description: "Upstream API family. Drives auth header shape (Bearer vs x-api-key + anthropic-version) and, in translate mode, which request/response codec is used.",
 			Component:   "select",
@@ -409,28 +483,28 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:   209,
 		},
 		"proxy.upstream_url": {
-			Section:     "other",
+			Section:     "proxy",
 			Label:       "Proxy Upstream URL",
 			Description: "Full POST endpoint of the upstream provider (e.g. https://api.openai.com/v1/chat/completions). Only used when Backend is cloud-proxy.",
 			Component:   "input",
 			Order:       210,
 		},
 		"proxy.api_key_env": {
-			Section:     "other",
+			Section:     "proxy",
 			Label:       "Proxy API Key Env Var",
 			Description: "Name of the environment variable holding the upstream API key. Reading from env keeps the secret out of the YAML and the admin UI.",
 			Component:   "input",
 			Order:       211,
 		},
 		"proxy.upstream_model": {
-			Section:     "other",
+			Section:     "proxy",
 			Label:       "Proxy Upstream Model",
 			Description: "Model name sent to the upstream. Leave empty to forward the client's model field unchanged. Useful when the LocalAI alias differs from the upstream's canonical name.",
 			Component:   "input",
 			Order:       212,
 		},
 		"proxy.request_timeout_seconds": {
-			Section:     "other",
+			Section:     "proxy",
 			Label:       "Proxy Request Timeout (seconds)",
 			Description: "Caps the upstream HTTP request duration. 0 disables the deadline; the request still ends when the client disconnects.",
 			Component:   "number",
@@ -445,7 +519,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 		// A host claimed by two configs is a critical error — the
 		// listener refuses to start until resolved.
 		"mitm.hosts": {
-			Section:     "other",
+			Section:     "mitm",
 			Label:       "MITM Intercept Hosts",
 			Description: "Hostnames the cloudproxy MITM proxy terminates TLS for on behalf of this model config. PII filtering and pattern overrides flow from this model when the host is intercepted. Each host must be unique across all configs.",
 			Component:   "string-list",
@@ -460,7 +534,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 		// the middleware admin page surfaces every model with a router
 		// block.
 		"router.classifier": {
-			Section:     "other",
+			Section:     "router",
 			Label:       "Classifier",
 			Description: "Picks a candidate by scoring every policy label against the prompt. Only \"score\" is shipped today; it asks the classifier_model to rank each label and reads off the softmax. Empty defaults to \"score\".",
 			Component:   "select",
@@ -470,15 +544,15 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order: 230,
 		},
 		"router.classifier_model": {
-			Section:              "other",
+			Section:              "router",
 			Label:                "Classifier Model",
 			Description:          "Loaded LocalAI model the score classifier asks to rank each policy label as a continuation. Must support the Score gRPC primitive (today: llama-cpp, vLLM) and use the ChatML template. Arch-Router-1.5B Q4_K_M is the canonical choice; any small ChatML instruct model also works at a higher activation_threshold.",
 			Component:            "model-select",
-			AutocompleteProvider: ProviderModelsChat,
+			AutocompleteProvider: ProviderModelsScore,
 			Order:                231,
 		},
 		"router.fallback": {
-			Section:              "other",
+			Section:              "router",
 			Label:                "Fallback Model",
 			Description:          "Model used when no candidate's labels cover the classifier's active label set, or when the classifier errors. Empty means router failures bubble up as HTTP 500 — fail-fast, not silent-bypass.",
 			Component:            "model-select",
@@ -486,7 +560,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:                232,
 		},
 		"router.activation_threshold": {
-			Section:     "other",
+			Section:     "router",
 			Label:       "Activation Threshold",
 			Description: "Softmax-probability floor a policy must clear to join the active label set for a request. Higher → single-label dominant routes; lower → more multi-label activations. 0 picks the package default (0.15). On Arch-Router-1.5B a value around 0.40 keeps the dominant label clean without losing genuine compound activations.",
 			Component:   "slider",
@@ -496,7 +570,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       233,
 		},
 		"router.classifier_cache_size": {
-			Section:     "other",
+			Section:     "router",
 			Label:       "Classifier L1 Cache Size",
 			Description: "Bounded LRU keyed on (case-folded, whitespace-trimmed) prompt — amortises the classifier round-trip across verbatim repeats common in agent loops. 0 here means \"use the default\" (1024); the cache cannot be disabled from YAML.",
 			Component:   "number",
@@ -504,21 +578,21 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       234,
 		},
 		"router.policies": {
-			Section:     "other",
+			Section:     "router",
 			Label:       "Policies",
 			Description: "Label vocabulary the classifier scores over. Each policy has a label and a short natural-language description fed verbatim to the classifier model. Short action-oriented sentences work best (\"writing or debugging code\"; \"small talk\").",
 			Component:   "router-policies",
 			Order:       235,
 		},
 		"router.candidates": {
-			Section:     "other",
+			Section:     "router",
 			Label:       "Candidates",
 			Description: "Routing table: each entry binds a downstream model to a set of policy labels it can serve. Order matters — the middleware picks the FIRST candidate whose labels are a superset of the active set, so list candidates smallest → largest.",
 			Component:   "router-candidates",
 			Order:       236,
 		},
 		"router.score_normalization": {
-			Section:     "other",
+			Section:     "router",
 			Label:       "Score Normalization",
 			Description: "How the score classifier collapses per-candidate joint log-probs into the softmax input. \"raw\" (default) feeds joint log-prob as-is — on-distribution for Arch-Router (the route the model would actually emit if decoded freely). \"mean\" divides by candidate token count — fairer to long labels but off-distribution for models trained to emit fixed-format outputs.",
 			Component:   "select",
@@ -530,7 +604,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order: 240,
 		},
 		"router.embedding_cache.embedding_model": {
-			Section:              "other",
+			Section:              "router",
 			Label:                "L2 Cache: Embedding Model",
 			Description:          "Embedding model used by the L2 decision cache. Embeds incoming probes and looks them up in the per-router local-store collection. Empty disables the cache entirely. nomic-embed-text-v1.5 is the recommended default.",
 			Component:            "model-select",
@@ -538,7 +612,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:                237,
 		},
 		"router.embedding_cache.similarity_threshold": {
-			Section:     "other",
+			Section:     "router",
 			Label:       "L2 Cache: Similarity Threshold",
 			Description: "Cosine-similarity floor a cache candidate must clear to count as a hit. 0 picks the package default (0.80). Re-tune per embedding model — the histogram on the Routing tab shows where the cosine distribution actually sits.",
 			Component:   "slider",
@@ -548,7 +622,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       238,
 		},
 		"router.embedding_cache.confidence_threshold": {
-			Section:     "other",
+			Section:     "router",
 			Label:       "L2 Cache: Confidence Threshold",
 			Description: "Minimum top-label probability a classifier decision must have to be inserted into the cache. 0 picks the package default (0.60). Uncertain decisions are skipped so they can't poison future paraphrases.",
 			Component:   "slider",
@@ -558,7 +632,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       239,
 		},
 		"router.embedding_cache.store_name": {
-			Section:     "other",
+			Section:     "router",
 			Label:       "L2 Cache: Store Name",
 			Description: "Optional override for the local-store collection used by this router's cache. Empty defaults to \"router-cache-<router-model-name>\". Two routers sharing a store_name share their cache (rare).",
 			Component:   "input",
diff --git a/core/config/meta/registry_coverage_test.go b/core/config/meta/registry_coverage_test.go
index 0db375c7c348..a2cde3cf1f62 100644
--- a/core/config/meta/registry_coverage_test.go
+++ b/core/config/meta/registry_coverage_test.go
@@ -240,7 +240,6 @@ var grandfatheredUnregistered = []string{
 	"swap_space",
 	"system_prompt",
 	"template.edit",
-	"template.function",
 	"template.join_chat_messages_by_character",
 	"template.multimodal",
 	"template.reply_prefix",
diff --git a/core/config/meta/types.go b/core/config/meta/types.go
index dcd21fb55806..a86b8bb69ff1 100644
--- a/core/config/meta/types.go
+++ b/core/config/meta/types.go
@@ -11,6 +11,7 @@ type FieldMeta struct {
 	Label       string        `json:"label"`                 // human-readable label
 	Description string        `json:"description,omitempty"` // help text
 	Component   string        `json:"component"`             // "input", "number", "toggle", "select", "slider", etc.
+	Language    string        `json:"language,omitempty"`    // syntax mode for code-editor fields: "yaml" (default), "gotemplate"
 	Placeholder string        `json:"placeholder,omitempty"`
 	Default     any           `json:"default,omitempty"`
 	Min         *float64      `json:"min,omitempty"`
@@ -51,6 +52,7 @@ type FieldMetaOverride struct {
 	Label                string
 	Description          string
 	Component            string
+	Language             string
 	Placeholder          string
 	Default              any
 	Min                  *float64
@@ -78,6 +80,10 @@ func DefaultSections() []Section {
 		{ID: "grpc", Label: "gRPC", Icon: "server", Order: 65},
 		{ID: "agent", Label: "Agent", Icon: "bot", Order: 70},
 		{ID: "mcp", Label: "MCP", Icon: "plug", Order: 75},
+		{ID: "router", Label: "Router", Icon: "git-merge", Order: 78},
+		{ID: "proxy", Label: "Proxy", Icon: "cloud", Order: 80},
+		{ID: "mitm", Label: "MITM Proxy", Icon: "shield", Order: 82},
+		{ID: "pii", Label: "PII", Icon: "shield", Order: 84},
 		{ID: "other", Label: "Other", Icon: "more-horizontal", Order: 100},
 	}
 }
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 9980c92e8c80..eadfcd905da7 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -10,6 +10,7 @@ import (
 	"text/template"
 
 	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/piipattern"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/reasoning"
@@ -23,7 +24,6 @@ const (
 
 // @Description TTS configuration
 type TTSConfig struct {
-
 	// Voice wav path or id
 	Voice string `yaml:"voice,omitempty" json:"voice,omitempty"`
 
@@ -103,13 +103,18 @@ type ModelConfig struct {
 	Options   []string `yaml:"options,omitempty" json:"options,omitempty"`
 	Overrides []string `yaml:"overrides,omitempty" json:"overrides,omitempty"`
 
-	MCP    MCPConfig       `yaml:"mcp,omitempty" json:"mcp,omitempty"`
-	Agent  AgentConfig     `yaml:"agent,omitempty" json:"agent,omitempty"`
-	PII    PIIConfig       `yaml:"pii,omitempty" json:"pii,omitempty"`
-	Router RouterConfig    `yaml:"router,omitempty" json:"router,omitempty"`
-	Proxy  ProxyConfig     `yaml:"proxy,omitempty" json:"proxy,omitempty"`
-	MITM   MITMModelConfig `yaml:"mitm,omitempty" json:"mitm,omitempty"`
-	Limits LimitsConfig    `yaml:"limits,omitempty" json:"limits,omitempty"`
+	MCP   MCPConfig   `yaml:"mcp,omitempty" json:"mcp,omitempty"`
+	Agent AgentConfig `yaml:"agent,omitempty" json:"agent,omitempty"`
+	PII   PIIConfig   `yaml:"pii,omitempty" json:"pii,omitempty"`
+	// PIIDetection is the detection policy when THIS model is used as a
+	// PII detector (a token_classify model named in another model's
+	// pii.detectors). Ignored on models that aren't referenced as
+	// detectors.
+	PIIDetection PIIDetectionConfig `yaml:"pii_detection,omitempty" json:"pii_detection,omitempty"`
+	Router       RouterConfig       `yaml:"router,omitempty" json:"router,omitempty"`
+	Proxy        ProxyConfig        `yaml:"proxy,omitempty" json:"proxy,omitempty"`
+	MITM         MITMModelConfig    `yaml:"mitm,omitempty" json:"mitm,omitempty"`
+	Limits       LimitsConfig       `yaml:"limits,omitempty" json:"limits,omitempty"`
 }
 
 // @Description Admission-control limits applied per request. The
@@ -384,18 +389,54 @@ type PIIConfig struct {
 	// the YAML key is distinguishable from explicit false.
 	Enabled *bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
 
-	// Patterns lets a model upgrade or downgrade individual pattern
-	// actions (mask | block | route_local) relative to the global
-	// defaults loaded from --pii-config / DefaultPatterns. Pattern IDs
-	// not listed inherit the global action. The regex itself stays
-	// global — only the action is settable per-model.
-	Patterns []PIIPatternOverride `yaml:"patterns,omitempty" json:"patterns,omitempty"`
+	// Detectors lists the token-classification (NER) models whose
+	// detections drive PII redaction for this model. The detection policy
+	// (min score, per-entity actions, default action) lives on each named
+	// detector model's own pii_detection block, not here — a consuming
+	// model just opts in by listing detectors. Multiple detectors union
+	// their hits; overlapping spans resolve to the strongest action.
+	Detectors []string `yaml:"detectors,omitempty" json:"detectors,omitempty"`
+}
+
+// @Description Detection policy for a token-classification (NER) model
+// used as a PII detector. Lives on the detector model's own config so the
+// model is a self-describing policy unit: consuming models reference it by
+// name (via pii.detectors) and inherit this policy with no per-consumer
+// overrides.
+type PIIDetectionConfig struct {
+	// MinScore drops detections the model scores below this confidence
+	// before they are acted on. 0 keeps every detection.
+	MinScore float32 `yaml:"min_score,omitempty" json:"min_score,omitempty"`
+	// DefaultAction (mask | block | allow) applies to detected entity
+	// groups with no explicit EntityActions entry. Empty defaults to
+	// "mask" — the safe-by-default policy for a PII filter.
+	DefaultAction string `yaml:"default_action,omitempty" json:"default_action,omitempty"`
+	// EntityActions maps an entity group the model emits (e.g. "EMAIL",
+	// "PASSWORD") to an action, overriding DefaultAction for that group.
+	// This is where an operator says which PII to block vs mask vs
+	// allow-log.
+	EntityActions map[string]string `yaml:"entity_actions,omitempty" json:"entity_actions,omitempty"`
+
+	// Builtins names the built-in pattern groups this (pattern) detector
+	// enables, e.g. "anthropic_api_key", "github_token". Pattern detectors
+	// match high-entropy structured secrets the NER tier can't; see
+	// core/services/routing/piipattern.
+	Builtins []string `yaml:"builtins,omitempty" json:"builtins,omitempty"`
+	// Patterns lists operator-defined secret patterns in the restricted-regex
+	// subset (validated at load). Each match is reported under its Name as the
+	// entity group, so EntityActions/DefaultAction apply by Name.
+	Patterns []PIIPattern `yaml:"patterns,omitempty" json:"patterns,omitempty"`
 }
 
-// @Description Per-model action override for a single PII pattern.
-type PIIPatternOverride struct {
-	ID     string `yaml:"id" json:"id"`
-	Action string `yaml:"action" json:"action"`
+// PIIPattern is one operator-defined pattern on a pattern detector model. Name
+// is the entity group reported for matches (and the EntityActions key). Match
+// is the restricted-regex source. Action optionally overrides DefaultAction for
+// this pattern. MinLen drops matches shorter than N bytes (0 = no floor).
+type PIIPattern struct {
+	Name   string `yaml:"name" json:"name"`
+	Match  string `yaml:"match" json:"match"`
+	Action string `yaml:"action,omitempty" json:"action,omitempty"`
+	MinLen int    `yaml:"min_len,omitempty" json:"min_len,omitempty"`
 }
 
 // PIIIsEnabled returns the resolved PII state for this model. Single
@@ -408,27 +449,71 @@ func (c *ModelConfig) PIIIsEnabled() bool {
 	return c.Backend == "cloud-proxy"
 }
 
-// PIIPatternOverrides returns the per-pattern action overrides as a map
-// keyed by pattern ID. The values are the raw action strings — the pii
-// package validates and converts them.
-//
-// Returned via the documented modelPIIConfig interface in
-// core/services/routing/pii/middleware.go without taking a config
-// dependency on this package.
-func (c *ModelConfig) PIIPatternOverrides() map[string]string {
-	if len(c.PII.Patterns) == 0 {
+// PIIDetectors returns the names of the token-classification models that
+// drive PII redaction for this (consuming) model. Read via the
+// ModelPIIConfig interface in core/services/routing/pii/middleware.go.
+func (c *ModelConfig) PIIDetectors() []string {
+	if len(c.PII.Detectors) == 0 {
 		return nil
 	}
-	out := make(map[string]string, len(c.PII.Patterns))
-	for _, p := range c.PII.Patterns {
-		if p.ID == "" {
-			continue
-		}
-		out[p.ID] = p.Action
+	out := make([]string, len(c.PII.Detectors))
+	copy(out, c.PII.Detectors)
+	return out
+}
+
+// piiCoverableUsecases lists the model usecases whose serving API has a
+// request-side PII filter wired (a piiadapter + the pii middleware). It scopes
+// the Middleware admin list (PIIFilterApplies). Grow it as adapters are added
+// for new endpoints. cloud-proxy carries no usecase flag but is always covered
+// (via the MITM / proxy chat path), so PIIFilterApplies handles it separately.
+var piiCoverableUsecases = []ModelConfigUsecase{FLAG_CHAT, FLAG_COMPLETION, FLAG_EDIT, FLAG_EMBEDDINGS}
+
+// PIIFilterApplies reports whether request-side PII filtering can apply to
+// this model at all — i.e. it is reachable through a text-accepting endpoint
+// that has a PII adapter wired. Used to scope the Middleware admin view so it
+// lists only models PII could protect, not every config (VAD, STT,
+// embedding-only, image, or the token_classify detector models themselves,
+// which are the filters rather than consumers). Detector/score models return
+// false naturally: HasUsecases short-circuits to false for any usecase a
+// declared score/token_classify model did not itself declare.
+func (c *ModelConfig) PIIFilterApplies() bool {
+	if c.Backend == "cloud-proxy" {
+		return true
+	}
+	return slices.ContainsFunc(piiCoverableUsecases, c.HasUsecases)
+}
+
+// PIIDetectionMinScore returns the confidence floor this model applies
+// when used as a PII detector.
+func (c *ModelConfig) PIIDetectionMinScore() float32 { return c.PIIDetection.MinScore }
+
+// PIIDetectionDefaultAction returns the raw default-action string applied
+// to detected entity groups without an explicit override. The pii package
+// validates it and applies the "mask" fallback.
+func (c *ModelConfig) PIIDetectionDefaultAction() string { return c.PIIDetection.DefaultAction }
+
+// PIIDetectionEntityActions returns the per-entity-group action policy as
+// a fresh map of raw action strings (validated by the pii package).
+func (c *ModelConfig) PIIDetectionEntityActions() map[string]string {
+	if len(c.PIIDetection.EntityActions) == 0 {
+		return nil
+	}
+	out := make(map[string]string, len(c.PIIDetection.EntityActions))
+	for k, v := range c.PIIDetection.EntityActions {
+		out[k] = v
 	}
 	return out
 }
 
+// IsPatternDetector reports whether this detector model matches secrets with
+// regex patterns (built-in and/or operator-defined) rather than a neural NER
+// model. Such a model runs entirely in-process (no backend / GGUF / VRAM); the
+// PII resolver builds an in-process pattern matcher for it instead of loading a
+// gRPC token-classifier.
+func (c *ModelConfig) IsPatternDetector() bool {
+	return len(c.PIIDetection.Builtins) > 0 || len(c.PIIDetection.Patterns) > 0
+}
+
 // @Description MCP configuration
 type MCPConfig struct {
 	Servers string `yaml:"remote,omitempty" json:"remote,omitempty"`
@@ -472,8 +557,10 @@ func (c *MCPConfig) MCPConfigFromYAML() (MCPGenericConfig[MCPRemoteServers], MCP
 type MCPGenericConfig[T any] struct {
 	Servers T `yaml:"mcpServers,omitempty" json:"mcpServers,omitempty"`
 }
-type MCPRemoteServers map[string]MCPRemoteServer
-type MCPSTDIOServers map[string]MCPSTDIOServer
+type (
+	MCPRemoteServers map[string]MCPRemoteServer
+	MCPSTDIOServers  map[string]MCPSTDIOServer
+)
 
 // @Description MCP remote server configuration
 type MCPRemoteServer struct {
@@ -1001,6 +1088,39 @@ func (c *ModelConfig) Validate() (bool, error) {
 				"with chat/completion/embeddings — split into separate model configs")
 	}
 
+	// TokenClassify on llama-cpp likewise bypasses the slot loop (direct
+	// decode, see grpc-server.cpp on TokenClassify) and races concurrent
+	// generation. Unlike score it REQUIRES embeddings (TOKEN_CLS pooling),
+	// so the conflict is with generation only, not embeddings.
+	const tokenClassifyConflicts = FLAG_CHAT | FLAG_COMPLETION
+	if (c.Backend == "llama-cpp" || c.Backend == "llama") &&
+		c.HasUsecases(FLAG_TOKEN_CLASSIFY) && c.KnownUsecases != nil &&
+		*c.KnownUsecases&tokenClassifyConflicts != 0 {
+		return false, fmt.Errorf(
+			"known_usecases conflict on llama-cpp: token_classify is incompatible " +
+				"with chat/completion — split into separate model configs")
+	}
+
+	// Pattern detector: validate built-in names and that each operator-defined
+	// pattern is a well-formed, anchored, bounded restricted-regex. Reject at
+	// load so a bad pattern surfaces as a clear config error rather than a
+	// silent no-op (or a fail-closed block) at request time.
+	if c.IsPatternDetector() {
+		for _, name := range c.PIIDetection.Builtins {
+			if _, ok := piipattern.LookupBuiltin(name); !ok {
+				return false, fmt.Errorf("pii_detection: unknown built-in pattern %q", name)
+			}
+		}
+		for _, p := range c.PIIDetection.Patterns {
+			if p.Name == "" {
+				return false, fmt.Errorf("pii_detection: pattern is missing a name")
+			}
+			if err := piipattern.ValidatePattern(p.Match); err != nil {
+				return false, fmt.Errorf("pii_detection: pattern %q: %w", p.Name, err)
+			}
+		}
+	}
+
 	// router.score_normalization is consumed lazily by the score
 	// classifier at first-request time; without load-time validation
 	// a typo wouldn't surface until the first router request panicked
@@ -1113,6 +1233,17 @@ const (
 	// chat/completion/embeddings.
 	FLAG_SCORE ModelConfigUsecase = 0b10000000000000000000
 
+	// Marks a model as wired for the TokenClassify gRPC primitive (the
+	// openai-privacy-filter PII NER tier — per-token BIOES classification).
+	// Like FLAG_SCORE it must be declared explicitly via
+	// `known_usecases: [token_classify]`; there's no heuristic. On the
+	// llama-cpp backend TokenClassify bypasses the slot loop and races the
+	// llama_context (see grpc-server.cpp on TokenClassify), so Validate()
+	// refuses a llama-cpp config combining it with chat/completion. Unlike
+	// FLAG_SCORE, embeddings is NOT a conflict — TokenClassify REQUIRES
+	// TOKEN_CLS pooling, which is loaded via the embeddings flag.
+	FLAG_TOKEN_CLASSIFY ModelConfigUsecase = 0b100000000000000000000
+
 	// Common Subsets
 	FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
 )
@@ -1170,6 +1301,7 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
 		"FLAG_DIARIZATION":         FLAG_DIARIZATION,
 		"FLAG_REALTIME_AUDIO":      FLAG_REALTIME_AUDIO,
 		"FLAG_SCORE":               FLAG_SCORE,
+		"FLAG_TOKEN_CLASSIFY":      FLAG_TOKEN_CLASSIFY,
 	}
 }
 
@@ -1197,19 +1329,20 @@ func GetUsecasesFromYAML(input []string) *ModelConfigUsecase {
 // HasUsecases examines a ModelConfig and determines which endpoints have a chance of success.
 //
 // Declared known_usecases are normally additive — the guessing heuristic
-// still adds whatever it can infer from backend/templates. The one
-// exception is FLAG_SCORE: when the operator declared score, they
-// reserved the model for the router classifier. Letting GuessUsecases
-// paint chat/completion on top would surface it in chat pickers it was
-// deliberately kept out of, and (on llama-cpp) reintroduce the slot
-// contention the score/chat conflict check exists to prevent. So a
-// declared score list is authoritative.
+// still adds whatever it can infer from backend/templates. The exceptions
+// are FLAG_SCORE and FLAG_TOKEN_CLASSIFY: when the operator declared
+// either, they reserved the model for an internal direct-decode primitive
+// (the router classifier, or the PII NER tier). Letting GuessUsecases
+// paint chat/completion/embeddings on top would surface it in pickers it
+// was deliberately kept out of, and (on llama-cpp) reintroduce the slot
+// contention the conflict check exists to prevent. So a declared score or
+// token_classify list is authoritative.
 func (c *ModelConfig) HasUsecases(u ModelConfigUsecase) bool {
 	if c.KnownUsecases != nil {
 		if (u & *c.KnownUsecases) == u {
 			return true
 		}
-		if (*c.KnownUsecases & FLAG_SCORE) == FLAG_SCORE {
+		if (*c.KnownUsecases & (FLAG_SCORE | FLAG_TOKEN_CLASSIFY)) != 0 {
 			return false
 		}
 	}
@@ -1229,14 +1362,20 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
 	}
 
 	if (u & FLAG_CHAT) == FLAG_CHAT {
-		if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" && !c.TemplateConfig.UseTokenizerTemplate {
-			return false
-		}
-		if slices.Contains(nonTextGenBackends, c.Backend) {
-			return false
-		}
-		if c.Embeddings != nil && *c.Embeddings {
-			return false
+		// A router model is a chat dispatcher: it carries no chat
+		// template of its own (those live on the candidates it routes
+		// to) and is invoked through the chat endpoint, so the router
+		// block stands in for chat capability.
+		if !c.HasRouter() {
+			if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" && !c.TemplateConfig.UseTokenizerTemplate {
+				return false
+			}
+			if slices.Contains(nonTextGenBackends, c.Backend) {
+				return false
+			}
+			if c.Embeddings != nil && *c.Embeddings {
+				return false
+			}
 		}
 	}
 	if (u & FLAG_COMPLETION) == FLAG_COMPLETION {
@@ -1376,6 +1515,15 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
 		return false
 	}
 
+	if (u & FLAG_TOKEN_CLASSIFY) == FLAG_TOKEN_CLASSIFY {
+		// No heuristic: token-classification intent is a deliberate
+		// operator choice (it reserves the model from generation traffic
+		// on llama-cpp, and the model's TOKEN_CLS head isn't useful as
+		// general embeddings), so HasUsecases(FLAG_TOKEN_CLASSIFY) is true
+		// only when KnownUsecases declares it explicitly.
+		return false
+	}
+
 	return true
 }
 
diff --git a/core/config/model_config_test.go b/core/config/model_config_test.go
index 5abfb8eaf91e..d936d69a4cd1 100644
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -7,6 +7,7 @@ import (
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	"gopkg.in/yaml.v3"
 )
 
 var _ = Describe("Test cases for config related functions", func() {
@@ -108,6 +109,31 @@ parameters:
 			Expect(valid).To(BeTrue())
 			Expect(err).NotTo(HaveOccurred())
 
+			// token_classify on llama-cpp also bypasses the slot loop, so
+			// it can't mix with chat/completion — but unlike score it
+			// REQUIRES embeddings (TOKEN_CLS pooling), so embeddings is not
+			// a conflict.
+			tcAndChat := FLAG_TOKEN_CLASSIFY | FLAG_CHAT
+			tcConflicting := ModelConfig{
+				Name:          "ner-but-also-chat",
+				Backend:       "llama-cpp",
+				KnownUsecases: &tcAndChat,
+			}
+			valid, err = tcConflicting.Validate()
+			Expect(valid).To(BeFalse())
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("token_classify is incompatible"))
+
+			tcAndEmbeddings := FLAG_TOKEN_CLASSIFY | FLAG_EMBEDDINGS
+			tcWithEmbeddings := ModelConfig{
+				Name:          "pii-ner",
+				Backend:       "llama-cpp",
+				KnownUsecases: &tcAndEmbeddings,
+			}
+			valid, err = tcWithEmbeddings.Validate()
+			Expect(valid).To(BeTrue())
+			Expect(err).NotTo(HaveOccurred())
+
 			// Cloud-proxy: api_key_env and api_key_file are mutually
 			// exclusive — picking both is a config bug we catch at
 			// load/save rather than at backend-load time.
@@ -228,7 +254,6 @@ parameters:
 		})
 	})
 	It("Properly handles backend usecase matching", func() {
-
 		a := ModelConfig{
 			Name: "a",
 		}
@@ -283,6 +308,18 @@ parameters:
 		Expect(e.HasUsecases(FLAG_CHAT)).To(BeFalse())
 		Expect(e.HasUsecases(FLAG_EMBEDDINGS)).To(BeTrue())
 
+		// Router models are chat dispatchers: no chat template of their
+		// own, but invoked through the chat endpoint, so they default to
+		// chat-capable.
+		r := ModelConfig{
+			Name: "r",
+			Router: RouterConfig{
+				Candidates: []RouterCandidate{{Model: "downstream", Labels: []string{"general"}}},
+			},
+		}
+		Expect(r.HasUsecases(FLAG_ANY)).To(BeTrue())
+		Expect(r.HasUsecases(FLAG_CHAT)).To(BeTrue())
+
 		f := ModelConfig{
 			Name:    "f",
 			Backend: "piper",
@@ -334,7 +371,7 @@ parameters:
 			Backend:       "llama-cpp",
 			KnownUsecases: &scoreReserved,
 			TemplateConfig: TemplateConfig{
-				Chat:    "inherited from chatml",
+				Chat:        "inherited from chatml",
 				ChatMessage: "inherited from chatml",
 				Completion:  "inherited from chatml",
 			},
@@ -343,6 +380,27 @@ parameters:
 		Expect(j.HasUsecases(FLAG_CHAT)).To(BeFalse())
 		Expect(j.HasUsecases(FLAG_COMPLETION)).To(BeFalse())
 		Expect(j.HasUsecases(FLAG_EMBEDDINGS)).To(BeFalse())
+
+		// Declared `known_usecases: [token_classify]` is likewise
+		// authoritative — a PII NER model is reserved for the redactor's
+		// NER tier and must not surface as chat or as a general embeddings
+		// model, even though it loads with embeddings enabled (its
+		// TOKEN_CLS head produces BIOES logits, not reusable embeddings).
+		tcReserved := FLAG_TOKEN_CLASSIFY
+		embTrue := true
+		k := ModelConfig{
+			Name:          "privacy-filter",
+			Backend:       "llama-cpp",
+			KnownUsecases: &tcReserved,
+			Embeddings:    &embTrue,
+			TemplateConfig: TemplateConfig{
+				Chat:        "inherited from chatml",
+				ChatMessage: "inherited from chatml",
+			},
+		}
+		Expect(k.HasUsecases(FLAG_TOKEN_CLASSIFY)).To(BeTrue())
+		Expect(k.HasUsecases(FLAG_CHAT)).To(BeFalse())
+		Expect(k.HasUsecases(FLAG_EMBEDDINGS)).To(BeFalse())
 	})
 	It("Test Validate with invalid MCP config", func() {
 		tmp, err := os.CreateTemp("", "config.yaml")
@@ -518,3 +576,160 @@ concurrency_groups:
 		})
 	})
 })
+
+var _ = Describe("PII config accessors", func() {
+	It("PIIDetectors returns a fresh copy of the consumer's detector list", func() {
+		cfg := &ModelConfig{PII: PIIConfig{Detectors: []string{"a", "b"}}}
+		got := cfg.PIIDetectors()
+		Expect(got).To(Equal([]string{"a", "b"}))
+		got[0] = "mutated"
+		Expect(cfg.PII.Detectors[0]).To(Equal("a"), "accessor must not alias the underlying slice")
+	})
+
+	It("PIIDetectors is nil when none are configured", func() {
+		Expect((&ModelConfig{}).PIIDetectors()).To(BeNil())
+	})
+
+	It("exposes the detector model's pii_detection policy", func() {
+		cfg := &ModelConfig{PIIDetection: PIIDetectionConfig{
+			MinScore:      0.5,
+			DefaultAction: "mask",
+			EntityActions: map[string]string{"PASSWORD": "block", "EMAIL": "mask"},
+		}}
+		Expect(cfg.PIIDetectionMinScore()).To(BeNumerically("~", 0.5, 1e-6))
+		Expect(cfg.PIIDetectionDefaultAction()).To(Equal("mask"))
+		ea := cfg.PIIDetectionEntityActions()
+		Expect(ea).To(HaveKeyWithValue("PASSWORD", "block"))
+		ea["PASSWORD"] = "mutated"
+		Expect(cfg.PIIDetection.EntityActions["PASSWORD"]).To(Equal("block"), "accessor must return a fresh map")
+	})
+
+	It("unmarshals pii.detectors and pii_detection from YAML", func() {
+		var cfg ModelConfig
+		raw := []byte("name: consumer\npii:\n  enabled: true\n  detectors: [pf]\npii_detection:\n  min_score: 0.4\n  default_action: mask\n  entity_actions:\n    PASSWORD: block\n")
+		Expect(yaml.Unmarshal(raw, &cfg)).To(Succeed())
+		Expect(cfg.PIIDetectors()).To(Equal([]string{"pf"}))
+		Expect(cfg.PIIDetectionDefaultAction()).To(Equal("mask"))
+		Expect(cfg.PIIDetectionEntityActions()).To(HaveKeyWithValue("PASSWORD", "block"))
+	})
+})
+
+var _ = Describe("GGUF importer chat-default guard (reservedNonChatModel)", func() {
+	mk := func(flags ModelConfigUsecase) *ModelConfig {
+		return &ModelConfig{Backend: "llama-cpp", KnownUsecases: &flags}
+	}
+
+	It("treats declared score / token_classify models as reserved (no chat defaults)", func() {
+		Expect(reservedNonChatModel(mk(FLAG_SCORE))).To(BeTrue())
+		Expect(reservedNonChatModel(mk(FLAG_TOKEN_CLASSIFY))).To(BeTrue())
+		// embeddings declared alongside token_classify (the PII NER shape) is
+		// still reserved.
+		Expect(reservedNonChatModel(mk(FLAG_TOKEN_CLASSIFY | FLAG_EMBEDDINGS))).To(BeTrue())
+	})
+
+	It("does not reserve ordinary or undeclared models", func() {
+		Expect(reservedNonChatModel(mk(FLAG_CHAT))).To(BeFalse())
+		Expect(reservedNonChatModel(mk(FLAG_EMBEDDINGS))).To(BeFalse())
+		Expect(reservedNonChatModel(&ModelConfig{Backend: "llama-cpp"})).To(BeFalse())
+	})
+
+	It("keeps a token_classify GGUF config valid by withholding FLAG_CHAT", func() {
+		// Regression for the privacy-filter import: the GGUF importer appends
+		// FLAG_CHAT to a templateless model, which the next sync folds into
+		// KnownUsecases. For a reserved model that would produce
+		// token_classify+chat — rejected by Validate, so the config is silently
+		// skipped at load and the model disappears from every picker. The guard
+		// withholds FLAG_CHAT; assert both halves of that contract here.
+		reserved := []string{"token_classify"}
+		withChat := append(append([]string{}, reserved...), "FLAG_CHAT")
+
+		// What the importer would have produced WITHOUT the guard: invalid.
+		bad := &ModelConfig{Backend: "llama-cpp", KnownUsecaseStrings: withChat}
+		bad.syncKnownUsecasesFromString()
+		valid, err := bad.Validate()
+		Expect(valid).To(BeFalse())
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("token_classify is incompatible"))
+
+		// With the guard (FLAG_CHAT withheld): the declaration survives and the
+		// config validates.
+		good := &ModelConfig{Backend: "llama-cpp", KnownUsecaseStrings: reserved}
+		good.syncKnownUsecasesFromString()
+		Expect(reservedNonChatModel(good)).To(BeTrue())
+		valid, err = good.Validate()
+		Expect(valid).To(BeTrue())
+		Expect(err).NotTo(HaveOccurred())
+		Expect(good.HasUsecases(FLAG_TOKEN_CLASSIFY)).To(BeTrue())
+	})
+})
+
+var _ = Describe("PIIFilterApplies (Middleware admin list scoping)", func() {
+	withUsecases := func(backend string, flags ModelConfigUsecase) *ModelConfig {
+		return &ModelConfig{Name: "m", Backend: backend, KnownUsecases: &flags}
+	}
+
+	It("includes chat-capable models and cloud-proxy models", func() {
+		Expect(withUsecases("llama-cpp", FLAG_CHAT).PIIFilterApplies()).To(BeTrue())
+		// cloud-proxy is always covered (MITM / proxy chat path), regardless
+		// of declared usecases.
+		Expect((&ModelConfig{Name: "claude", Backend: "cloud-proxy"}).PIIFilterApplies()).To(BeTrue())
+	})
+
+	It("excludes the detector and score models themselves", func() {
+		// token_classify detectors are the filters, not consumers; score
+		// classifiers are internal primitives. Both short-circuit
+		// HasUsecases(FLAG_CHAT) to false.
+		Expect(withUsecases("llama-cpp", FLAG_TOKEN_CLASSIFY).PIIFilterApplies()).To(BeFalse())
+		Expect(withUsecases("llama-cpp", FLAG_SCORE).PIIFilterApplies()).To(BeFalse())
+	})
+
+	It("includes embedding and completion models (their request text is filtered)", func() {
+		// Phase 4 wired PII onto /v1/embeddings, /v1/completions and /v1/edits,
+		// so those usecases are now coverable.
+		emb := withUsecases("llama-cpp", FLAG_EMBEDDINGS)
+		t := true
+		emb.Embeddings = &t
+		Expect(emb.PIIFilterApplies()).To(BeTrue())
+		Expect(withUsecases("llama-cpp", FLAG_COMPLETION).PIIFilterApplies()).To(BeTrue())
+	})
+
+	It("excludes models with no text-accepting, PII-covered endpoint", func() {
+		// VAD / audio-in models carry no coverable usecase.
+		Expect((&ModelConfig{Name: "vad", Backend: "silero-vad"}).PIIFilterApplies()).To(BeFalse())
+		Expect(withUsecases("whisper", FLAG_TRANSCRIPT).PIIFilterApplies()).To(BeFalse())
+	})
+})
+
+var _ = Describe("pattern detector config", func() {
+	patternCfg := func() *ModelConfig {
+		c := &ModelConfig{Name: "secret-filter", Backend: "pattern"}
+		c.PIIDetection.Builtins = []string{"anthropic_api_key"}
+		c.PIIDetection.Patterns = []PIIPattern{{Name: "INTERNAL", Match: `tok-[A-Za-z0-9]{20,}`}}
+		return c
+	}
+
+	It("IsPatternDetector keys off builtins/patterns", func() {
+		Expect(patternCfg().IsPatternDetector()).To(BeTrue())
+		Expect((&ModelConfig{Name: "ner", Backend: "llama-cpp"}).IsPatternDetector()).To(BeFalse())
+	})
+
+	It("Validate accepts a well-formed pattern detector (no model file needed)", func() {
+		ok, err := patternCfg().Validate()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(ok).To(BeTrue())
+	})
+
+	It("Validate rejects an unknown built-in", func() {
+		c := &ModelConfig{Name: "x", Backend: "pattern"}
+		c.PIIDetection.Builtins = []string{"does_not_exist"}
+		_, err := c.Validate()
+		Expect(err).To(MatchError(ContainSubstring("unknown built-in")))
+	})
+
+	It("Validate rejects an unanchored custom pattern", func() {
+		c := &ModelConfig{Name: "x", Backend: "pattern"}
+		c.PIIDetection.Patterns = []PIIPattern{{Name: "EMAILish", Match: `[\w.]+@[\w.]+\.\w+`}}
+		_, err := c.Validate()
+		Expect(err).To(MatchError(ContainSubstring("pattern \"EMAILish\"")))
+	})
+})
diff --git a/core/config/runtime_settings.go b/core/config/runtime_settings.go
index 721362ada1f7..5c5f2986f883 100644
--- a/core/config/runtime_settings.go
+++ b/core/config/runtime_settings.go
@@ -18,8 +18,8 @@ type RuntimeSettings struct {
 	WatchdogInterval    *string `json:"watchdog_interval,omitempty"` // Interval between watchdog checks (e.g., 2s, 30s)
 
 	// Backend management
-	SingleBackend           *bool `json:"single_backend,omitempty"`      // Deprecated: use MaxActiveBackends = 1 instead
-	MaxActiveBackends       *int  `json:"max_active_backends,omitempty"` // Maximum number of active backends (0 = unlimited, 1 = single backend mode)
+	SingleBackend             *bool `json:"single_backend,omitempty"`              // Deprecated: use MaxActiveBackends = 1 instead
+	MaxActiveBackends         *int  `json:"max_active_backends,omitempty"`         // Maximum number of active backends (0 = unlimited, 1 = single backend mode)
 	AutoUpgradeBackends       *bool `json:"auto_upgrade_backends,omitempty"`       // Automatically upgrade backends when new versions are detected
 	PreferDevelopmentBackends *bool `json:"prefer_development_backends,omitempty"` // Prefer development backend versions by default in UI
 	// Memory Reclaimer settings (works with GPU if available, otherwise RAM)
@@ -97,19 +97,9 @@ type RuntimeSettings struct {
 	// trusted clients.
 	MITMListen *string `json:"mitm_listen,omitempty"`
 
-	// PII pattern overrides — keyed by pattern id, applied to the live
-	// redactor at startup and persisted by POST /api/pii/patterns/persist.
-	// Distinguishes from --pii-config (which replaces the entire
-	// pattern set) by only carrying the per-id action/enabled deltas
-	// against the global default catalog.
-	PIIPatternOverrides *map[string]PIIPatternRuntimeOverride `json:"pii_pattern_overrides,omitempty"`
-}
-
-// PIIPatternRuntimeOverride captures the persistable deltas an admin
-// has applied to a single global PII pattern. Both fields are pointers
-// so an override that only flips Disabled doesn't have to also restate
-// Action (and vice versa).
-type PIIPatternRuntimeOverride struct {
-	Action   *string `json:"action,omitempty"`
-	Disabled *bool   `json:"disabled,omitempty"`
+	// PIIDefaultDetectors are the token-classification detector models applied
+	// to any PII-enabled model that names no detectors of its own (so
+	// cloud-proxy/MITM redaction works without per-model config). No omitempty:
+	// an empty array must round-trip so the operator can clear it from the UI.
+	PIIDefaultDetectors *[]string `json:"pii_default_detectors"`
 }
diff --git a/core/gallery/backends_test.go b/core/gallery/backends_test.go
index 081e99c63bbe..b383255c7cd7 100644
--- a/core/gallery/backends_test.go
+++ b/core/gallery/backends_test.go
@@ -50,7 +50,14 @@ var _ = Describe("Runtime capability-based backend selection", func() {
 		must(os.WriteFile(filepath.Join(cudaDir, "metadata.json"), b, 0o644))
 		must(os.WriteFile(filepath.Join(cudaDir, "run.sh"), []byte(""), 0o755))
 
-		// Default system: alias should point to CPU
+		// Default system: alias should point to CPU. Force the capability to
+		// "cpu" so this is hermetic on hosts that actually have a GPU: backend
+		// preference keys off getSystemCapabilities() (env → real nvidia-smi
+		// detection), not GPUVendor, so without this a GPU dev box reports
+		// "nvidia" and the cuda alias wins. The NVIDIA case below overrides it.
+		must(os.Setenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY", "cpu"))
+		defer func() { _ = os.Unsetenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY") }()
+
 		sysDefault, err := system.GetSystemState(
 			system.WithBackendPath(tempDir),
 		)
diff --git a/core/http/endpoints/anthropic/messages.go b/core/http/endpoints/anthropic/messages.go
index c4776d084a55..dbdbe9f322c6 100644
--- a/core/http/endpoints/anthropic/messages.go
+++ b/core/http/endpoints/anthropic/messages.go
@@ -10,13 +10,11 @@ import (
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/auth"
 	mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
 	openaiEndpoint "github.com/mudler/LocalAI/core/http/endpoints/openai"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/cloudproxy"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -30,7 +28,7 @@ import (
 // @Param request body schema.AnthropicRequest true "query params"
 // @Success 200 {object} schema.AnthropicResponse "Response"
 // @Router /v1/messages [post]
-func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient, piiRedactor *pii.Redactor, piiEvents pii.EventStore) echo.HandlerFunc {
+func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		id := uuid.New().String()
 
@@ -53,7 +51,7 @@ func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evalu
 		// Cloud-proxy bail. Same shape as the OpenAI chat endpoint —
 		// forwards via the cloud-proxy gRPC backend.
 		if cfg.IsCloudProxyBackendPassthrough() {
-			return forwardCloudProxyAnthropicViaBackend(c, cfg, input, piiRedactor, piiEvents, ml, appConfig)
+			return forwardCloudProxyAnthropicViaBackend(c, cfg, input, ml, appConfig)
 		}
 
 		// Convert Anthropic messages to OpenAI format for internal processing
@@ -141,7 +139,7 @@ func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evalu
 		xlog.Debug("Anthropic Messages - Prompt (after templating)", "prompt", predInput)
 
 		if input.Stream {
-			return handleAnthropicStream(c, id, input, cfg, ml, cl, appConfig, predInput, openAIReq, funcs, shouldUseFn, mcpExecutor, evaluator, piiRedactor, piiEvents)
+			return handleAnthropicStream(c, id, input, cfg, ml, cl, appConfig, predInput, openAIReq, funcs, shouldUseFn, mcpExecutor, evaluator)
 		}
 
 		return handleAnthropicNonStream(c, id, input, cfg, ml, cl, appConfig, predInput, openAIReq, funcs, shouldUseFn, mcpExecutor, evaluator)
@@ -330,36 +328,13 @@ func handleAnthropicNonStream(c echo.Context, id string, input *schema.Anthropic
 	return sendAnthropicError(c, 500, "api_error", "MCP iteration limit reached")
 }
 
-func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, cl *config.ModelConfigLoader, appConfig *config.ApplicationConfig, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool, mcpExecutor mcpTools.ToolExecutor, evaluator *templates.Evaluator, piiRedactor *pii.Redactor, piiEvents pii.EventStore) error {
+func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, cl *config.ModelConfigLoader, appConfig *config.ApplicationConfig, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool, mcpExecutor mcpTools.ToolExecutor, evaluator *templates.Evaluator) error {
 	c.Response().Header().Set("Content-Type", "text/event-stream")
 	c.Response().Header().Set("Cache-Control", "no-cache")
 	c.Response().Header().Set("Connection", "keep-alive")
 
-	// Per-stream PII filter — same gating as the OpenAI chat path. The
-	// filter is wire-format-agnostic; we feed it the text portion of
-	// each text_delta and emit only what's safe to send. The filter
-	// holds back a tail of size MaxPatternLength-1 so a pattern split
-	// across chunk boundaries still gets masked. When PII is disabled
-	// for this model the filter is nil and emits flow unchanged.
-	var streamPIIFilter *pii.StreamFilter
-	if piiRedactor != nil && cfg.PIIIsEnabled() {
-		correlationID := c.Request().Header.Get("x-request-id")
-		userID := ""
-		if u := auth.GetUser(c); u != nil {
-			userID = u.ID
-		}
-		var overrides map[string]pii.Action
-		if raw := cfg.PIIPatternOverrides(); len(raw) > 0 {
-			overrides = make(map[string]pii.Action, len(raw))
-			for ovid, action := range raw {
-				switch pii.Action(action) {
-				case pii.ActionMask, pii.ActionBlock, pii.ActionRouteLocal:
-					overrides[ovid] = pii.Action(action)
-				}
-			}
-		}
-		streamPIIFilter = pii.NewStreamFilter(piiRedactor, overrides, piiEvents, correlationID, userID)
-	}
+	// Response/output PII redaction is out of scope for now — redaction
+	// runs request-side only (the NER middleware).
 
 	// Send message_start event
 	messageStart := schema.AnthropicStreamEvent{
@@ -440,7 +415,6 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 
 				if len(toolCalls) > toolCallsEmitted {
 					if !inToolCall && currentBlockIndex == 0 {
-						drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
 						sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 							Type:  "content_block_stop",
 							Index: intPtr(currentBlockIndex),
@@ -481,20 +455,14 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 			}
 
 			if !inToolCall && token != "" {
-				out := token
-				if streamPIIFilter != nil {
-					out = streamPIIFilter.Push(token)
-				}
-				if out != "" {
-					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-						Type:  "content_block_delta",
-						Index: intPtr(0),
-						Delta: &schema.AnthropicStreamDelta{
-							Type: "text_delta",
-							Text: out,
-						},
-					})
-				}
+				sendAnthropicSSE(c, schema.AnthropicStreamEvent{
+					Type:  "content_block_delta",
+					Index: intPtr(0),
+					Delta: &schema.AnthropicStreamDelta{
+						Type: "text_delta",
+						Text: token,
+					},
+				})
 			}
 			return true
 		}
@@ -532,20 +500,14 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 			// didn't already stream it (autoparser clears raw text, so
 			// accumulatedContent will be empty in that case).
 			if deltaContent != "" && !inToolCall && accumulatedContent == "" {
-				out := deltaContent
-				if streamPIIFilter != nil {
-					out = streamPIIFilter.Push(deltaContent)
-				}
-				if out != "" {
-					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-						Type:  "content_block_delta",
-						Index: intPtr(0),
-						Delta: &schema.AnthropicStreamDelta{
-							Type: "text_delta",
-							Text: out,
-						},
-					})
-				}
+				sendAnthropicSSE(c, schema.AnthropicStreamEvent{
+					Type:  "content_block_delta",
+					Index: intPtr(0),
+					Delta: &schema.AnthropicStreamDelta{
+						Type: "text_delta",
+						Text: deltaContent,
+					},
+				})
 			}
 
 			// Emit tool_use blocks from ChatDeltas
@@ -553,7 +515,6 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 				collectedToolCalls = deltaToolCalls
 
 				if !inToolCall && currentBlockIndex == 0 {
-					drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
 					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 						Type:  "content_block_stop",
 						Index: intPtr(currentBlockIndex),
@@ -657,9 +618,7 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 		if !shouldUseFn && cfg.FunctionsConfig.AutomaticToolParsingFallback && accumulatedContent != "" && toolCallsEmitted == 0 {
 			parsed := functions.ParseFunctionCall(accumulatedContent, cfg.FunctionsConfig)
 			if len(parsed) > 0 {
-				// Close the text content block (after flushing any
-				// residual the streaming PII filter held back).
-				drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
+				// Close the text content block.
 				sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 					Type:  "content_block_stop",
 					Index: intPtr(currentBlockIndex),
@@ -699,12 +658,8 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 			}
 		}
 
-		// No MCP tools to execute, close stream. drainStreamPIIToText
-		// flushes any residual the streaming PII filter held back as
-		// part of its trailing pattern-window before we close the
-		// text content block.
+		// No MCP tools to execute, close the text content block.
 		if !inToolCall {
-			drainStreamPIIToText(c, streamPIIFilter, intPtr(0))
 			sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 				Type:  "content_block_stop",
 				Index: intPtr(0),
@@ -752,30 +707,6 @@ func convertFuncsToOpenAITools(funcs functions.Functions) []functions.Tool {
 
 func intPtr(i int) *int { return &i }
 
-// drainStreamPIIToText flushes any residual the streaming PII filter
-// has been holding back as part of its trailing pattern-window, and
-// emits it as one final text_delta into the named block before the
-// caller closes that block. Drain is idempotent: calling it twice on
-// the same filter returns "" the second time. Safe to call with a nil
-// filter (no-op).
-func drainStreamPIIToText(c echo.Context, sf *pii.StreamFilter, index *int) {
-	if sf == nil {
-		return
-	}
-	residual := sf.Drain()
-	if residual == "" {
-		return
-	}
-	sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-		Type:  "content_block_delta",
-		Index: index,
-		Delta: &schema.AnthropicStreamDelta{
-			Type: "text_delta",
-			Text: residual,
-		},
-	})
-}
-
 func sendAnthropicSSE(c echo.Context, event schema.AnthropicStreamEvent) {
 	data, err := json.Marshal(event)
 	if err != nil {
@@ -973,17 +904,14 @@ func convertAnthropicTools(input *schema.AnthropicRequest, cfg *config.ModelConf
 }
 
 // forwardCloudProxyAnthropicViaBackend marshals the Anthropic request,
-// constructs the streaming PII filter (when applicable), and hands the
-// body off to the cloud-proxy gRPC backend. Model swap + upstream auth
-// headers are applied inside the backend; the filter is built here
-// because the auth/correlation context only exists in the echo handler.
-func forwardCloudProxyAnthropicViaBackend(c echo.Context, cfg *config.ModelConfig, input *schema.AnthropicRequest, piiRedactor *pii.Redactor, piiEvents pii.EventStore, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
+// and hands the body off to the cloud-proxy gRPC backend. Model swap +
+// upstream auth headers are applied inside the backend. Request-side PII
+// redaction already ran in the middleware; the response is forwarded
+// unmodified.
+func forwardCloudProxyAnthropicViaBackend(c echo.Context, cfg *config.ModelConfig, input *schema.AnthropicRequest, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
 	body, err := json.Marshal(input)
 	if err != nil {
 		return sendAnthropicError(c, 400, "invalid_request_error", "cloudproxy: marshal request: "+err.Error())
 	}
-
-	correlationID := c.Request().Header.Get("x-request-id")
-	streamFilter := cloudproxy.BuildStreamFilter(c, cfg, input.Stream, piiRedactor, piiEvents, correlationID)
-	return cloudproxy.ForwardViaBackend(c, cfg, body, streamFilter, ml, appConfig)
+	return cloudproxy.ForwardViaBackend(c, cfg, body, ml, appConfig)
 }
diff --git a/core/http/endpoints/anthropic/messages_pii_test.go b/core/http/endpoints/anthropic/messages_pii_test.go
deleted file mode 100644
index 91e5297e4f31..000000000000
--- a/core/http/endpoints/anthropic/messages_pii_test.go
+++ /dev/null
@@ -1,114 +0,0 @@
-package anthropic
-
-import (
-	"net/http"
-	"net/http/httptest"
-	"strings"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// drainStreamPIIToText is called from four sites in messages.go and is
-// the load-bearing primitive for "the streaming filter has buffered
-// some bytes that the request just ended on; flush them as a final
-// text_delta event before closing the content block". A regression
-// here would silently truncate the last few bytes of an assistant
-// response on every PII-enabled stream — invisible without coverage.
-
-// newTestFilter compiles the default patterns and returns a filter
-// that holds back its trailing pattern-window; pushing a short string
-// (shorter than holdLen) keeps the bytes inside Drain.
-func newTestFilter() *pii.StreamFilter {
-	patterns, err := pii.Compile(pii.DefaultPatterns())
-	ExpectWithOffset(1, err).NotTo(HaveOccurred())
-	red := pii.NewRedactor(patterns)
-	return pii.NewStreamFilter(red, nil, nil, "", "")
-}
-
-// newTestContext builds a recording echo context — the recorder
-// captures the SSE bytes drainStreamPIIToText writes.
-func newTestContext() (echo.Context, *httptest.ResponseRecorder) {
-	req := httptest.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader("{}"))
-	rec := httptest.NewRecorder()
-	return echo.New().NewContext(req, rec), rec
-}
-
-var _ = Describe("drainStreamPIIToText", func() {
-	It("is a no-op when the filter is nil", func() {
-		c, rec := newTestContext()
-		drainStreamPIIToText(c, nil, intPtr(0))
-		Expect(rec.Body.Len()).To(Equal(0), "nil filter wrote %d bytes: %q", rec.Body.Len(), rec.Body.String())
-	})
-
-	It("emits nothing when the drain is empty", func() {
-		// A filter with nothing buffered should not emit a phantom event;
-		// otherwise every non-PII response would close with an empty
-		// text_delta that pollutes downstream parsers.
-		sf := newTestFilter()
-		c, rec := newTestContext()
-		drainStreamPIIToText(c, sf, intPtr(0))
-		Expect(rec.Body.Len()).To(Equal(0), "empty drain wrote %d bytes: %q", rec.Body.Len(), rec.Body.String())
-	})
-
-	It("flushes residual buffered bytes as a text_delta event", func() {
-		sf := newTestFilter()
-		// Push less than holdLen so all bytes are retained until Drain.
-		// "tail" is short enough that no pattern is plausible.
-		out := sf.Push("tail")
-		Expect(out).To(Equal(""), "Push of short text emitted %q; want all bytes held", out)
-
-		c, rec := newTestContext()
-		drainStreamPIIToText(c, sf, intPtr(2))
-
-		body := rec.Body.String()
-		// Wire format: "event: content_block_delta\ndata: {…}\n\n"
-		Expect(body).To(ContainSubstring("event: content_block_delta"))
-		Expect(body).To(ContainSubstring(`"type":"content_block_delta"`))
-		Expect(body).To(ContainSubstring(`"index":2`))
-		Expect(body).To(ContainSubstring(`"text":"tail"`))
-		Expect(body).To(ContainSubstring(`"type":"text_delta"`))
-		Expect(strings.HasSuffix(body, "\n\n")).To(BeTrue(), "SSE event missing trailing blank line: %q", body)
-	})
-
-	It("is idempotent across consecutive drains", func() {
-		// Two consecutive Drains: the filter returns "" the second time,
-		// so the second drainStreamPIIToText must emit nothing. The
-		// production path in messages.go has at least four call sites
-		// that may overlap (currentBlockIndex==0 emergency path + the
-		// unconditional drain near the end of the stream); without
-		// idempotence we'd duplicate the residual on the wire.
-		sf := newTestFilter()
-		sf.Push("tail")
-
-		c1, rec1 := newTestContext()
-		drainStreamPIIToText(c1, sf, intPtr(0))
-		first := rec1.Body.Len()
-		Expect(first).NotTo(Equal(0), "first drain emitted nothing")
-
-		c2, rec2 := newTestContext()
-		drainStreamPIIToText(c2, sf, intPtr(0))
-		Expect(rec2.Body.Len()).To(Equal(0), "second drain wrote %d bytes; want idempotent no-op: %q", rec2.Body.Len(), rec2.Body.String())
-	})
-
-	It("masks redacted residual instead of leaking it", func() {
-		// The held tail must travel through the redactor on Drain. If
-		// the bytes happen to form a complete pattern at end-of-stream,
-		// the residual emit must contain the mask placeholder, not the
-		// raw value.
-		sf := newTestFilter()
-		// "alice@example.com" is 17 bytes. holdLen for default patterns
-		// is well above 17, so this stays buffered until Drain, which
-		// then redacts it.
-		out := sf.Push("alice@example.com")
-		Expect(out).To(Equal(""), "Push emitted bytes early: %q", out)
-
-		c, rec := newTestContext()
-		drainStreamPIIToText(c, sf, intPtr(0))
-		body := rec.Body.String()
-		Expect(body).NotTo(ContainSubstring("alice@example.com"), "raw email leaked in residual emit: %q", body)
-		Expect(body).To(ContainSubstring("[REDACTED:email]"), "residual emit missing mask placeholder: %q", body)
-	})
-})
diff --git a/core/http/endpoints/localai/api_instructions.go b/core/http/endpoints/localai/api_instructions.go
index 9eb0095dd3bf..1ac66fcb43f5 100644
--- a/core/http/endpoints/localai/api_instructions.go
+++ b/core/http/endpoints/localai/api_instructions.go
@@ -100,15 +100,15 @@ var instructionDefs = []instructionDef{
 	},
 	{
 		Name:        "pii-filtering",
-		Description: "Inspect and tune the regex PII filter applied to chat requests",
+		Description: "Inspect the NER-based PII filter applied to chat requests",
 		Tags:        []string{"pii"},
-		Intro:       "GET /api/pii/patterns lists the active pattern set with each one's action (mask, block, route_local). GET /api/pii/events returns recent redaction events filtered by correlation_id / user_id / pattern_id (admin or local-user only). POST /api/pii/test dry-runs the redactor against an admin-supplied string. POST /api/pii/decide is the programmatic decision oracle for external routers: send `{text}`, receive `{findings, suggested_action, redacted_preview}` without LocalAI mutating, recording, or acting on the call — caller composes the action with its own policy. Default patterns: email, phone, SSN, credit card (Luhn), IPv4, common API key prefixes (sk-, pk-, ghp_, github_pat_). PII is per-model: by default it is OFF for non-proxy backends and ON for backends starting with proxy-* (cloud passthroughs). Opt in with `pii: { enabled: true }` in a model's YAML; use `pii: { patterns: [{id, action}] }` to upgrade or downgrade individual actions for that model. Override global default actions via --pii-config pii.yaml; --disable-pii turns the filter off entirely.",
+		Intro:       "PII redaction is NER-based and request-side. A consuming model opts in with `pii: { enabled: true, detectors: [<model>] }` where each detector is a token-classification (token_classify) model. The detection policy lives on the detector model itself in a `pii_detection:` block: `{ min_score, default_action (mask|block|allow), entity_actions: { GROUP: action } }`. Multiple detectors union their hits; overlapping spans resolve to the strongest action (block > mask > allow). PII defaults OFF for non-proxy backends and ON for proxy-* (cloud passthroughs). GET /api/pii/events returns recent redaction events filtered by correlation_id / user_id / pattern_id (events carry `<source>:<GROUP>` ids — e.g. `ner:EMAIL` for the neural detector, `pattern:ANTHROPIC_KEY` for the regex pattern tier — and an 8-char hash prefix, never the matched value; admin or local-user only). The legacy regex pattern tier and its endpoints (/api/pii/patterns, /test, /decide) were removed.",
 	},
 	{
 		Name:        "middleware-admin",
 		Description: "Inspect and configure the routing-module middleware (PII filter and routing)",
 		Tags:        []string{"middleware", "pii", "router"},
-		Intro:       "GET /api/middleware/status is the single round-trip the /app/middleware admin page reads to render the current state: active PII patterns and their actions, every model's resolved enabled/override state, recent event count, and the active routing models with their classifier configurations. Admin-only (the synthetic local user is admin in no-auth mode). PUT /api/pii/patterns/:id changes a pattern's action in-process — TRANSIENT, lost on restart. To persist, edit --pii-config YAML. GET /api/router/decisions returns the routing decision log filtered by correlation_id / user_id / router_model. The same surface is exposed as MCP tools (`get_middleware_status`, `set_pii_pattern_action`, `get_router_decisions`) for agent-driven configuration.",
+		Intro:       "GET /api/middleware/status is the single round-trip the /app/middleware admin page reads to render the current state: every model's resolved PII enabled state and the NER detector models it references, recent event count, and the active routing models with their classifier configurations. Admin-only (the synthetic local user is admin in no-auth mode). PII detection policy is edited on each detector model's `pii_detection:` block via the model-config tools/UI — there is no global pattern set to mutate. GET /api/router/decisions returns the routing decision log filtered by correlation_id / user_id / router_model. The same surface is exposed as MCP tools (`get_middleware_status`, `get_pii_events`, `get_router_decisions`) for agent-driven inspection.",
 	},
 	{
 		Name:        "intelligent-routing",
diff --git a/core/http/endpoints/localai/config_meta.go b/core/http/endpoints/localai/config_meta.go
index 340c11bc878d..b45720b78598 100644
--- a/core/http/endpoints/localai/config_meta.go
+++ b/core/http/endpoints/localai/config_meta.go
@@ -124,6 +124,10 @@ func AutocompleteEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, a
 				filterFn = config.BuildUsecaseFilterFn(config.FLAG_VAD)
 			case config.UsecaseTranscript:
 				filterFn = config.BuildUsecaseFilterFn(config.FLAG_TRANSCRIPT)
+			case "score": // router classifier usecase (FLAG_SCORE); not in UsecaseInfoMap
+				filterFn = config.BuildUsecaseFilterFn(config.FLAG_SCORE)
+			case config.UsecaseTokenClassify: // PII NER detector usecase (FLAG_TOKEN_CLASSIFY)
+				filterFn = config.BuildUsecaseFilterFn(config.FLAG_TOKEN_CLASSIFY)
 			default:
 				filterFn = config.NoFilterFn
 			}
diff --git a/core/http/endpoints/localai/mcp.go b/core/http/endpoints/localai/mcp.go
index a849e8a2fbb5..22b9ca1831b0 100644
--- a/core/http/endpoints/localai/mcp.go
+++ b/core/http/endpoints/localai/mcp.go
@@ -65,7 +65,7 @@ func MCPEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 	// the per-model PII config and is kept for backward compatibility.
 	// The request-side middleware on the main chat route handles
 	// filtering for the standard /v1/chat/completions path.
-	chatHandler := openai.ChatEndpoint(cl, ml, evaluator, appConfig, natsClient, nil, nil, nil)
+	chatHandler := openai.ChatEndpoint(cl, ml, evaluator, appConfig, natsClient, nil)
 
 	return func(c echo.Context) error {
 		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
diff --git a/core/http/endpoints/localai/pii_decide.go b/core/http/endpoints/localai/pii_decide.go
deleted file mode 100644
index 1b1ac8e9420f..000000000000
--- a/core/http/endpoints/localai/pii_decide.go
+++ /dev/null
@@ -1,85 +0,0 @@
-package localai
-
-import (
-	"net/http"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
-)
-
-// PIIDecideEndpoint exposes the PII redactor as a decision oracle:
-// scan the supplied text and return findings + the strongest action
-// the configured pattern set would take, without rewriting the
-// caller's request or recording an audit event.
-//
-// External routers (e.g. the localai-org/platform router) call this
-// before dispatching to learn whether to mask the prompt in place,
-// route to a local-only backend, block the request, or pass it
-// through. LocalAI's in-band PII middleware is the alternative path
-// for direct-to-LocalAI clients — same Redactor, different framing.
-//
-// Takes the *pii.Redactor directly rather than the whole
-// *application.Application so the handler stays unit-testable with a
-// freshly-constructed redactor (mirrors the pattern in
-// router_decide.go). The route-registration site is responsible for
-// stubbing this endpoint when --disable-pii is set so callers get a
-// 503 signalling "admin opted out" rather than a misleading allow.
-//
-// @Summary  Scan text for PII and return findings + suggested action (decision oracle)
-// @Tags     pii
-// @Accept   json
-// @Produce  json
-// @Param    request body schema.PIIDecideRequest true "decide params"
-// @Success  200 {object} schema.PIIDecideResponse
-// @Failure  400 {object} map[string]string
-// @Router   /api/pii/decide [post]
-func PIIDecideEndpoint(redactor *pii.Redactor) echo.HandlerFunc {
-	return func(c echo.Context) error {
-		var req schema.PIIDecideRequest
-		if err := c.Bind(&req); err != nil {
-			return echo.NewHTTPError(http.StatusBadRequest, "invalid request body: "+err.Error())
-		}
-		if req.Text == "" {
-			return echo.NewHTTPError(http.StatusBadRequest, "text is required")
-		}
-
-		res := redactor.Redact(req.Text)
-		findings := make([]schema.PIIFinding, len(res.Spans))
-		for i, s := range res.Spans {
-			findings[i] = schema.PIIFinding{
-				Start:      s.Start,
-				End:        s.End,
-				Pattern:    s.Pattern,
-				HashPrefix: s.HashPrefix,
-			}
-		}
-		return c.JSON(http.StatusOK, schema.PIIDecideResponse{
-			Findings:        findings,
-			SuggestedAction: suggestedAction(res),
-			RedactedPreview: res.Redacted,
-		})
-	}
-}
-
-// actionAllow is the wire-only value for "no findings". The other
-// three map to existing pii.Action* constants; allow has no in-band
-// counterpart because the in-band middleware simply passes through.
-const actionAllow = "allow"
-
-// suggestedAction collapses the Redactor's Result flags onto a single
-// wire-format action using the in-band ordering (block > route_local
-// > mask > allow). Spans-without-Blocked-or-LocalOnly means every
-// match resolved to ActionMask.
-func suggestedAction(res pii.Result) string {
-	switch {
-	case res.Blocked:
-		return string(pii.ActionBlock)
-	case res.LocalOnly:
-		return string(pii.ActionRouteLocal)
-	case len(res.Spans) > 0:
-		return string(pii.ActionMask)
-	default:
-		return actionAllow
-	}
-}
diff --git a/core/http/endpoints/localai/pii_decide_test.go b/core/http/endpoints/localai/pii_decide_test.go
deleted file mode 100644
index d91d7283488f..000000000000
--- a/core/http/endpoints/localai/pii_decide_test.go
+++ /dev/null
@@ -1,107 +0,0 @@
-package localai_test
-
-import (
-	"encoding/json"
-	"net/http"
-	"net/http/httptest"
-	"strings"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/http/endpoints/localai"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// PIIDecideEndpoint exposes the redactor as a decision oracle. These
-// specs pin the validation surface and the suggested_action mapping
-// across all four actions (allow/mask/route_local/block). The redactor
-// itself is covered in core/services/routing/pii/redactor_test.go.
-
-var _ = Describe("PIIDecideEndpoint", func() {
-	var redactor *pii.Redactor
-
-	BeforeEach(func() {
-		patterns, err := pii.Compile(pii.DefaultPatterns())
-		Expect(err).NotTo(HaveOccurred())
-		redactor = pii.NewRedactor(patterns)
-	})
-
-	It("rejects requests with no text field", func() {
-		rec, _ := invokePIIDecide(redactor, `{}`)
-		Expect(rec.Code).To(Equal(http.StatusBadRequest))
-		Expect(rec.Body.String()).To(ContainSubstring("text is required"))
-	})
-
-	It("rejects malformed JSON", func() {
-		rec, _ := invokePIIDecide(redactor, `not json`)
-		Expect(rec.Code).To(Equal(http.StatusBadRequest))
-	})
-
-	It("returns allow for clean text", func() {
-		rec, body := invokePIIDecide(redactor, `{"text":"hello world"}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-		Expect(body.SuggestedAction).To(Equal("allow"))
-		Expect(body.Findings).To(BeEmpty())
-		Expect(body.RedactedPreview).To(Equal("hello world"))
-	})
-
-	It("returns mask for text containing email (default action)", func() {
-		rec, body := invokePIIDecide(redactor, `{"text":"reach me at alice@example.com please"}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-		Expect(body.SuggestedAction).To(Equal("mask"))
-		Expect(body.Findings).To(HaveLen(1))
-		Expect(body.Findings[0].Pattern).To(Equal("email"))
-		Expect(body.Findings[0].HashPrefix).NotTo(BeEmpty())
-		Expect(body.RedactedPreview).To(ContainSubstring("[REDACTED:email]"))
-		Expect(body.RedactedPreview).NotTo(ContainSubstring("alice@example.com"))
-	})
-
-	It("returns block when an api_key_prefix is present (block beats mask)", func() {
-		// api_key_prefix defaults to ActionBlock per DefaultPatterns.
-		// Mix in an email so we also confirm the block-action wins
-		// over the mask-action via actionRank.
-		rec, body := invokePIIDecide(redactor, `{"text":"my key is sk-1234567890abcdefghij and email alice@example.com"}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-		Expect(body.SuggestedAction).To(Equal("block"))
-		Expect(len(body.Findings)).To(BeNumerically(">=", 1))
-	})
-
-	It("returns route_local when an override sets that action", func() {
-		// Promote the email pattern to route_local for this test —
-		// exercises the route_local branch of suggestedAction without
-		// needing a custom pattern set.
-		Expect(redactor.SetAction("email", pii.ActionRouteLocal)).To(Succeed())
-		rec, body := invokePIIDecide(redactor, `{"text":"contact alice@example.com"}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-		Expect(body.SuggestedAction).To(Equal("route_local"))
-		// route_local leaves the original text intact — caller decides
-		// whether to forward it to a local-only backend.
-		Expect(body.RedactedPreview).To(ContainSubstring("alice@example.com"))
-	})
-
-	It("never leaks the matched value via HashPrefix", func() {
-		rec, body := invokePIIDecide(redactor, `{"text":"alice@example.com"}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-		Expect(body.Findings).To(HaveLen(1))
-		// HashPrefix is 8 hex chars of sha256 — definitely not the
-		// matched value, but stable so admins can correlate leaks.
-		Expect(body.Findings[0].HashPrefix).To(HaveLen(8))
-		Expect(body.Findings[0].HashPrefix).NotTo(ContainSubstring("alice"))
-	})
-})
-
-func invokePIIDecide(redactor *pii.Redactor, body string) (*httptest.ResponseRecorder, schema.PIIDecideResponse) {
-	e := echo.New()
-	e.POST("/api/pii/decide", localai.PIIDecideEndpoint(redactor))
-	req := httptest.NewRequest(http.MethodPost, "/api/pii/decide", strings.NewReader(body))
-	req.Header.Set("Content-Type", "application/json")
-	rec := httptest.NewRecorder()
-	e.ServeHTTP(rec, req)
-	var parsed schema.PIIDecideResponse
-	if rec.Code == http.StatusOK {
-		Expect(json.Unmarshal(rec.Body.Bytes(), &parsed)).To(Succeed())
-	}
-	return rec, parsed
-}
diff --git a/core/http/endpoints/mcp/localai_assistant_test.go b/core/http/endpoints/mcp/localai_assistant_test.go
index a37e3234e06f..26cd2878faea 100644
--- a/core/http/endpoints/mcp/localai_assistant_test.go
+++ b/core/http/endpoints/mcp/localai_assistant_test.go
@@ -22,25 +22,31 @@ type stubClient struct{}
 func (stubClient) GallerySearch(_ context.Context, _ localaitools.GallerySearchQuery) ([]gallery.Metadata, error) {
 	return []gallery.Metadata{{Name: "stub", Gallery: config.Gallery{Name: "stub-gallery"}}}, nil
 }
+
 func (stubClient) ListInstalledModels(_ context.Context, _ localaitools.Capability) ([]localaitools.InstalledModel, error) {
 	return []localaitools.InstalledModel{{Name: "stub"}}, nil
 }
+
 func (stubClient) ListGalleries(_ context.Context) ([]config.Gallery, error) {
 	return []config.Gallery{{Name: "stub-gallery", URL: "http://example"}}, nil
 }
+
 func (stubClient) GetJobStatus(_ context.Context, _ string) (*localaitools.JobStatus, error) {
 	return &localaitools.JobStatus{ID: "stub", Processed: true}, nil
 }
+
 func (stubClient) GetModelConfig(_ context.Context, _ string) (*localaitools.ModelConfigView, error) {
 	return &localaitools.ModelConfigView{Name: "stub"}, nil
 }
+
 func (stubClient) InstallModel(_ context.Context, _ localaitools.InstallModelRequest) (string, error) {
 	return "stub-job", nil
 }
+
 func (stubClient) ImportModelURI(_ context.Context, _ localaitools.ImportModelURIRequest) (*localaitools.ImportModelURIResponse, error) {
 	return &localaitools.ImportModelURIResponse{JobID: "stub-import"}, nil
 }
-func (stubClient) DeleteModel(_ context.Context, _ string) error  { return nil }
+func (stubClient) DeleteModel(_ context.Context, _ string) error { return nil }
 func (stubClient) EditModelConfig(_ context.Context, _ string, _ map[string]any) error {
 	return nil
 }
@@ -48,57 +54,61 @@ func (stubClient) ReloadModels(_ context.Context) error { return nil }
 func (stubClient) ListBackends(_ context.Context) ([]localaitools.Backend, error) {
 	return []localaitools.Backend{{Name: "stub-backend", Installed: true}}, nil
 }
+
 func (stubClient) ListKnownBackends(_ context.Context) ([]schema.KnownBackend, error) {
 	return []schema.KnownBackend{}, nil
 }
+
 func (stubClient) InstallBackend(_ context.Context, _ localaitools.InstallBackendRequest) (string, error) {
 	return "stub-backend-job", nil
 }
+
 func (stubClient) UpgradeBackend(_ context.Context, _ string) (string, error) {
 	return "stub-upgrade-job", nil
 }
+
 func (stubClient) SystemInfo(_ context.Context) (*localaitools.SystemInfo, error) {
 	return &localaitools.SystemInfo{Version: "stub"}, nil
 }
+
 func (stubClient) ListNodes(_ context.Context) ([]localaitools.Node, error) {
 	return []localaitools.Node{}, nil
 }
+
 func (stubClient) VRAMEstimate(_ context.Context, _ localaitools.VRAMEstimateRequest) (*vram.EstimateResult, error) {
 	return &vram.EstimateResult{SizeDisplay: "stub"}, nil
 }
-func (stubClient) ToggleModelState(_ context.Context, _ string, _ modeladmin.Action) error  { return nil }
-func (stubClient) ToggleModelPinned(_ context.Context, _ string, _ modeladmin.Action) error { return nil }
+func (stubClient) ToggleModelState(_ context.Context, _ string, _ modeladmin.Action) error {
+	return nil
+}
+func (stubClient) ToggleModelPinned(_ context.Context, _ string, _ modeladmin.Action) error {
+	return nil
+}
 func (stubClient) GetBranding(_ context.Context) (*localaitools.Branding, error) {
 	return &localaitools.Branding{InstanceName: "LocalAI"}, nil
 }
+
 func (stubClient) SetBranding(_ context.Context, _ localaitools.SetBrandingRequest) (*localaitools.Branding, error) {
 	return &localaitools.Branding{InstanceName: "LocalAI"}, nil
 }
+
 func (stubClient) GetUsageStats(_ context.Context, _ localaitools.UsageStatsQuery) (*localaitools.UsageStats, error) {
 	return &localaitools.UsageStats{Viewer: localaitools.UsageViewer{ID: "stub", Name: "stub"}, Period: "month"}, nil
 }
-func (stubClient) ListPIIPatterns(_ context.Context) ([]localaitools.PIIPattern, error) {
-	return nil, nil
-}
+
 func (stubClient) GetPIIEvents(_ context.Context, _ localaitools.PIIEventsQuery) ([]localaitools.PIIEvent, error) {
 	return nil, nil
 }
-func (stubClient) TestPIIRedaction(_ context.Context, req localaitools.PIIRedactTestRequest) (*localaitools.PIIRedactTestResult, error) {
-	return &localaitools.PIIRedactTestResult{Redacted: req.Text}, nil
-}
-func (stubClient) SetPIIPatternAction(_ context.Context, _ localaitools.PIIPatternActionUpdate) error {
-	return nil
-}
-func (stubClient) PersistPIIPatterns(_ context.Context) error { return nil }
+
 func (stubClient) GetMiddlewareStatus(_ context.Context) (*localaitools.MiddlewareStatus, error) {
 	return &localaitools.MiddlewareStatus{
 		PII: localaitools.MiddlewarePIIStatus{
 			EnabledGlobally: true,
-			Patterns:        []localaitools.PIIPattern{},
 			Models:          []localaitools.MiddlewarePIIModel{},
 		},
 	}, nil
 }
+
 func (stubClient) GetRouterDecisions(_ context.Context, _ localaitools.RouterDecisionsQuery) ([]localaitools.RouterDecision, error) {
 	return []localaitools.RouterDecision{}, nil
 }
diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index 48e86d42ef77..8986342ebbd8 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -14,7 +14,6 @@ import (
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/cloudproxy"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/pkg/functions"
 	reason "github.com/mudler/LocalAI/pkg/reasoning"
 
@@ -125,7 +124,7 @@ func applyAutoparserOverride(
 // @Param request body schema.OpenAIRequest true "query params"
 // @Success 200 {object} schema.OpenAIResponse "Response"
 // @Router /v1/chat/completions [post]
-func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, startupOptions *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient, assistantHolder *mcpTools.LocalAIAssistantHolder, piiRedactor *pii.Redactor, piiEvents pii.EventStore) echo.HandlerFunc {
+func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, startupOptions *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient, assistantHolder *mcpTools.LocalAIAssistantHolder) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		var textContentToReturn string
 		id := uuid.New().String()
@@ -147,11 +146,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 
 		// Cloud-proxy bail. Bypasses the local pipeline (templating,
 		// MCP injection, gRPC backend) and forwards via the cloud-
-		// proxy backend, which does the outbound HTTP. The streaming
-		// PII filter still runs because its input is per-token text
-		// extracted from the wire envelope, not the envelope itself.
+		// proxy backend, which does the outbound HTTP. Request-side PII
+		// redaction already ran in the middleware; the response is
+		// forwarded unmodified.
 		if config.IsCloudProxyBackendPassthrough() {
-			return forwardCloudProxyOpenAIViaBackend(c, config, input, piiRedactor, piiEvents, ml, startupOptions)
+			return forwardCloudProxyOpenAIViaBackend(c, config, input, ml, startupOptions)
 		}
 
 		funcs := input.Functions
@@ -322,7 +321,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 						"message": map[string]any{
 							"type":        "string",
 							"description": "The message to reply the user with",
-						}},
+						},
+					},
 				},
 			}
 
@@ -388,14 +388,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			c.Response().Header().Set("Connection", "keep-alive")
 			c.Response().Header().Set("X-Correlation-ID", id)
 
-			// Per-stream PII filter: when the resolved model has PII
-			// enabled, wrap the response content so values spanning
-			// chunk boundaries still get masked. Shared with the
-			// cloud-proxy bail below via cloudproxy.BuildStreamFilter
-			// so both paths apply the same per-model gate and override
-			// rules.
-			streamPIIFilter := cloudproxy.BuildStreamFilter(c, config, true, piiRedactor, piiEvents, id)
-
 			mcpStreamMaxIterations := 10
 			if config.Agent.MaxIterations > 0 {
 				mcpStreamMaxIterations = config.Agent.MaxIterations
@@ -471,30 +463,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 						if (hasMCPToolsStream || config.FunctionsConfig.AutomaticToolParsingFallback) && haveContent {
 							collectedContent += rawContent
 						}
-						// Stream-side PII filter: feed the content delta
-						// through the buffered-emit filter. The filter
-						// holds back a tail to handle pattern boundaries
-						// across chunks, so a Push may legitimately
-						// return "" — drop the chunk in that case rather
-						// than emitting an empty Delta to the wire.
-						if streamPIIFilter != nil && haveContent {
-							filtered := streamPIIFilter.Push(rawContent)
-							if filtered == "" {
-								// Fully buffered — skip this chunk's
-								// content. Still emit non-content chunks
-								// (role, tool_calls). When this delta is
-								// content-only and we buffer it, drop the
-								// whole event to avoid a vestigial
-								// {"delta":{}} on the wire.
-								if ev.Choices[0].Delta.Role == "" && len(ev.Choices[0].Delta.ToolCalls) == 0 && ev.Choices[0].Delta.Reasoning == nil {
-									continue
-								}
-								// Mixed delta — strip content, keep the rest.
-								ev.Choices[0].Delta.Content = nil
-							} else {
-								ev.Choices[0].Delta.Content = filtered
-							}
-						}
 						respData, err := json.Marshal(ev)
 						if err != nil {
 							xlog.Debug("Failed to marshal response", "error", err)
@@ -639,31 +607,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					}
 				}
 
-				// Drain the per-stream PII filter before the stop chunk
-				// so any text held back by the buffered-emit invariant
-				// reaches the client as a regular content delta. We
-				// emit it as a chunk WITHOUT a finish_reason so the
-				// next "stop" chunk still terminates the stream.
-				if streamPIIFilter != nil {
-					residual := streamPIIFilter.Drain()
-					if residual != "" {
-						drainResp := &schema.OpenAIResponse{
-							ID:      id,
-							Created: created,
-							Model:   input.Model,
-							Choices: []schema.Choice{{
-								Delta: &schema.Message{Content: residual},
-								Index: 0,
-							}},
-							Object: "chat.completion.chunk",
-						}
-						if drainBytes, err := json.Marshal(drainResp); err == nil {
-							_, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", drainBytes)
-							c.Response().Flush()
-						}
-					}
-				}
-
 				// No MCP tools to execute, send final stop message
 				finishReason := FinishReasonStop
 				if toolsCalled && len(input.Tools) > 0 {
@@ -684,7 +627,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 							FinishReason: &finishReason,
 							Index:        0,
 							Delta:        &schema.Message{},
-						}},
+						},
+					},
 					Object: "chat.completion.chunk",
 				}
 				respData, _ := json.Marshal(resp)
@@ -1070,7 +1014,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 }
 
 func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {
-
 	if len(funcResults) == 0 && result != "" {
 		xlog.Debug("nothing function results but we had a message from the LLM")
 
@@ -1106,19 +1049,16 @@ func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCall
 	return "", nil
 }
 
-// forwardCloudProxyOpenAIViaBackend marshals the OpenAI request,
-// constructs the streaming PII filter (when this model has PII
-// enabled), and hands off to the cloud-proxy gRPC backend which does
-// the outbound HTTP. The chat endpoint owns the body+filter
-// construction because it's the only place the request lands as a
-// parsed *schema.OpenAIRequest.
-func forwardCloudProxyOpenAIViaBackend(c echo.Context, cfg *config.ModelConfig, input *schema.OpenAIRequest, piiRedactor *pii.Redactor, piiEvents pii.EventStore, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
+// forwardCloudProxyOpenAIViaBackend marshals the OpenAI request and
+// hands off to the cloud-proxy gRPC backend which does the outbound
+// HTTP. The chat endpoint owns the body construction because it's the
+// only place the request lands as a parsed *schema.OpenAIRequest.
+// Request-side PII redaction already ran in the middleware; the
+// response is forwarded unmodified.
+func forwardCloudProxyOpenAIViaBackend(c echo.Context, cfg *config.ModelConfig, input *schema.OpenAIRequest, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
 	body, err := json.Marshal(input)
 	if err != nil {
 		return echo.NewHTTPError(http.StatusBadRequest, "cloudproxy: marshal request: "+err.Error())
 	}
-
-	correlationID := c.Response().Header().Get("X-Correlation-ID")
-	streamFilter := cloudproxy.BuildStreamFilter(c, cfg, input.Stream, piiRedactor, piiEvents, correlationID)
-	return cloudproxy.ForwardViaBackend(c, cfg, body, streamFilter, ml, appConfig)
+	return cloudproxy.ForwardViaBackend(c, cfg, body, ml, appConfig)
 }
diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go
index fdcd310cfee6..e771fed46a09 100644
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -9,12 +9,10 @@ import (
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/auth"
 	"github.com/mudler/LocalAI/core/http/middleware"
 
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -27,7 +25,7 @@ import (
 // @Param request body schema.OpenAIRequest true "query params"
 // @Success 200 {object} schema.OpenAIResponse "Response"
 // @Router /v1/completions [post]
-func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig, piiRedactor *pii.Redactor, piiEvents pii.EventStore) echo.HandlerFunc {
+func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) echo.HandlerFunc {
 	process := func(id string, s string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) error {
 		tokenCallback := func(s string, tokenUsage backend.TokenUsage) bool {
 			created := int(time.Now().Unix())
@@ -70,7 +68,6 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 	}
 
 	return func(c echo.Context) error {
-
 		created := int(time.Now().Unix())
 
 		// Handle Correlation
@@ -113,31 +110,8 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 				return errors.New("cannot handle more than 1 `PromptStrings` when Streaming")
 			}
 
-			// Per-stream PII filter — same gating as chat. /v1/completions
-			// has no chat-message structure, so request-side PII isn't
-			// wired here, but the response-side filter still catches PII
-			// trained into the model. Filter is nil when this model has
-			// PII disabled.
-			var streamPIIFilter *pii.StreamFilter
-			if piiRedactor != nil && config.PIIIsEnabled() {
-				correlationID := id
-				userID := ""
-				if u := auth.GetUser(c); u != nil {
-					userID = u.ID
-				}
-				var overrides map[string]pii.Action
-				if raw := config.PIIPatternOverrides(); len(raw) > 0 {
-					overrides = make(map[string]pii.Action, len(raw))
-					for ovid, action := range raw {
-						switch pii.Action(action) {
-						case pii.ActionMask, pii.ActionBlock, pii.ActionRouteLocal:
-							overrides[ovid] = pii.Action(action)
-						}
-					}
-				}
-				streamPIIFilter = pii.NewStreamFilter(piiRedactor, overrides, piiEvents, correlationID, userID)
-			}
-
+			// Response/output PII redaction is out of scope for now —
+			// redaction runs request-side via the NER middleware only.
 			predInput := config.PromptStrings[0]
 
 			templatedInput, err := evaluator.EvaluateTemplateForPrompt(templates.CompletionPromptTemplate, *config, templates.PromptTemplateData{
@@ -179,19 +153,6 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 					// OpenAI streaming spec: intermediate chunks must NOT
 					// carry a `usage` field. Strip the tracking copy now.
 					ev.Usage = nil
-					// Run the per-chunk text through the streaming PII
-					// filter. The filter holds back a tail to handle
-					// pattern boundaries, so a Push may legitimately
-					// return "" — drop the chunk's text rather than
-					// emitting a 0-token delta. Choice.Text is the only
-					// content surface in /v1/completions chunks.
-					if streamPIIFilter != nil && ev.Choices[0].Text != "" {
-						filtered := streamPIIFilter.Push(ev.Choices[0].Text)
-						if filtered == "" {
-							continue
-						}
-						ev.Choices[0].Text = filtered
-					}
 					respData, err := json.Marshal(ev)
 					if err != nil {
 						xlog.Debug("Failed to marshal response", "error", err)
@@ -237,25 +198,6 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 				}
 			}
 
-			// Flush any residual the streaming PII filter held back as
-			// part of its trailing pattern-window. Emit it as one final
-			// text-bearing chunk before the synthetic stop chunk so the
-			// completion body remains a contiguous text stream.
-			if streamPIIFilter != nil {
-				if residual := streamPIIFilter.Drain(); residual != "" {
-					residualResp := schema.OpenAIResponse{
-						ID:      id,
-						Created: created,
-						Model:   input.Model,
-						Choices: []schema.Choice{{Index: 0, Text: residual}},
-						Object:  "text_completion",
-					}
-					if data, err := json.Marshal(residualResp); err == nil {
-						_, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", string(data))
-					}
-				}
-			}
-
 			stopReason := FinishReasonStop
 			resp := &schema.OpenAIResponse{
 				ID:      id,
diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go
index b9a3adda92ec..16f4a23c1ccf 100644
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -377,13 +377,14 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) *
 		return nil
 	}
 	deps := &middleware.ClassifierDeps{
-		Scorer:      a.Scorer,
-		Embedder:    a.Embedder,
-		VectorStore: a.VectorStore,
-		Reranker:    a.Reranker,
-		ModelLookup: a.ModelConfigLookup(),
-		Registry:    a.RouterClassifierRegistry(),
-		Evaluator:   a.TemplatesEvaluator(),
+		Scorer:       a.Scorer,
+		TokenCounter: a.TokenCounter,
+		Embedder:     a.Embedder,
+		VectorStore:  a.VectorStore,
+		Reranker:     a.Reranker,
+		ModelLookup:  a.ModelConfigLookup(),
+		Registry:     a.RouterClassifierRegistry(),
+		Evaluator:    a.TemplatesEvaluator(),
 	}
 	userID := ""
 	if u := a.FallbackUser(); u != nil {
diff --git a/core/http/middleware/probe_trim_test.go b/core/http/middleware/probe_trim_test.go
new file mode 100644
index 000000000000..978c050c64bf
--- /dev/null
+++ b/core/http/middleware/probe_trim_test.go
@@ -0,0 +1,139 @@
+package middleware
+
+import (
+	"strings"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("routerConfigFingerprint", func() {
+	rc := config.RouterConfig{Classifier: "score", ClassifierModel: "arch-router"}
+	ctx4096 := 4096
+	ctx8192 := 8192
+
+	// Regression: the score classifier bakes context_size into its token
+	// budget at build time, and the built classifier is cached by this
+	// fingerprint. If context_size weren't hashed, editing it and reloading
+	// would return a classifier carrying the stale budget.
+	It("changes when the classifier model's context_size changes", func() {
+		cfgA := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
+		cfgB := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx8192}}
+		Expect(routerConfigFingerprint(rc, cfgA)).NotTo(Equal(routerConfigFingerprint(rc, cfgB)))
+	})
+
+	It("is stable for identical classifier configs", func() {
+		cfgA := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
+		cfgB := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
+		Expect(routerConfigFingerprint(rc, cfgA)).To(Equal(routerConfigFingerprint(rc, cfgB)))
+	})
+})
+
+var _ = Describe("routing probe extraction and trimming", func() {
+	Describe("OpenAIProbeFromRequest", func() {
+		It("keeps a short conversation intact, newline-terminated per message", func() {
+			req := &schema.OpenAIRequest{Messages: []schema.Message{
+				{Role: "user", Content: "first"},
+				{Role: "assistant", Content: "second"},
+				{Role: "user", Content: "third"},
+			}}
+			Expect(OpenAIProbeFromRequest(req).Prompt).To(Equal("first\nsecond\nthird\n"))
+		})
+
+		It("flattens text blocks and skips image-only messages", func() {
+			req := &schema.OpenAIRequest{Messages: []schema.Message{
+				{Role: "user", Content: []any{
+					map[string]any{"type": "text", "text": "describe this"},
+					map[string]any{"type": "image_url", "image_url": map[string]any{"url": "data:..."}},
+				}},
+				{Role: "user", Content: []any{
+					map[string]any{"type": "image_url", "image_url": map[string]any{"url": "data:..."}},
+				}},
+			}}
+			// Second message contributes no text, so it neither adds a blank
+			// line nor a stray newline.
+			Expect(OpenAIProbeFromRequest(req).Prompt).To(Equal("describe this\n"))
+		})
+
+		It("carries the full conversation untrimmed — trimming is each classifier's job", func() {
+			// The middleware no longer caps the probe by a fixed rune budget;
+			// every turn reaches the Probe and each classifier trims to its own
+			// model's context (see modelTokenTrim / promptTrimmer).
+			block := strings.Repeat("x", 999)
+			msgs := make([]schema.Message, 0, 20)
+			msgs = append(msgs, schema.Message{Role: "user", Content: "OLDEST" + strings.Repeat("o", 994)})
+			for range 18 {
+				msgs = append(msgs, schema.Message{Role: "user", Content: block})
+			}
+			msgs = append(msgs, schema.Message{Role: "user", Content: "NEWEST" + strings.Repeat("n", 994)})
+
+			probe := OpenAIProbeFromRequest(&schema.OpenAIRequest{Messages: msgs})
+			Expect(probe.Prompt).To(ContainSubstring("OLDEST"), "no turn is dropped at probe-build time")
+			Expect(probe.Prompt).To(ContainSubstring("NEWEST"))
+			// Messages preserves the per-turn split the classifier trims from.
+			Expect(probe.Messages).To(HaveLen(20))
+			Expect(probe.Messages[0]).To(ContainSubstring("OLDEST"))
+			Expect(probe.Messages[19]).To(ContainSubstring("NEWEST"))
+		})
+	})
+
+	Describe("AnthropicProbe", func() {
+		It("extracts and trims the same way as the OpenAI path", func() {
+			req := &schema.AnthropicRequest{Messages: []schema.AnthropicMessage{
+				{Role: "user", Content: "alpha"},
+				{Role: "assistant", Content: []any{
+					map[string]any{"type": "text", "text": "beta"},
+				}},
+			}}
+			probe, ok := AnthropicProbe(req)
+			Expect(ok).To(BeTrue())
+			Expect(probe.Prompt).To(Equal("alpha\nbeta\n"))
+		})
+
+		It("returns ok=false for a non-Anthropic payload", func() {
+			_, ok := AnthropicProbe(&schema.OpenAIRequest{})
+			Expect(ok).To(BeFalse())
+		})
+	})
+
+	Describe("modelTokenTrim", func() {
+		tok := func(string) (int, error) { return 1, nil }
+		depsFor := func(cfg *config.ModelConfig) ClassifierDeps {
+			return ClassifierDeps{
+				ModelLookup:  func(string) *config.ModelConfig { return cfg },
+				TokenCounter: func(string) func(string) (int, error) { return tok },
+			}
+		}
+
+		It("still trims to the backend default when context_size is unset", func() {
+			// Regression: with the fixed middleware rune cap gone, an unset
+			// context_size must NOT disable trimming — otherwise a non-trivial
+			// prompt overflows the default 4096 window and every score fails.
+			score := config.FLAG_SCORE
+			cfg := &config.ModelConfig{KnownUsecases: &score} // FLAG_SCORE → batch follows context
+			count, ceiling := modelTokenTrim("classifier", depsFor(cfg))
+			Expect(count).NotTo(BeNil())
+			Expect(ceiling).To(Equal(4096), "unset context_size falls back to the backend default, not 0")
+		})
+
+		It("is bounded by the batch when the batch is smaller than the context", func() {
+			// The probe is one decode (n_tokens <= n_batch). A model with a
+			// large context but a small batch can only process the batch — the
+			// ceiling must follow it, not the context.
+			ctx8k := 8192
+			cfg := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx8k}}
+			cfg.Batch = 512
+			_, ceiling := modelTokenTrim("embedder", depsFor(cfg))
+			Expect(ceiling).To(Equal(512), "batch is the binding single-decode limit")
+		})
+
+		It("disables trimming only when no tokenizer is available", func() {
+			count, ceiling := modelTokenTrim("x", ClassifierDeps{ModelLookup: func(string) *config.ModelConfig { return &config.ModelConfig{} }})
+			Expect(count).To(BeNil())
+			Expect(ceiling).To(Equal(0))
+		})
+	})
+})
diff --git a/core/http/middleware/route_model.go b/core/http/middleware/route_model.go
index 2cdbca41cdb1..2e6b949c2bdc 100644
--- a/core/http/middleware/route_model.go
+++ b/core/http/middleware/route_model.go
@@ -6,6 +6,7 @@ import (
 	"encoding/hex"
 	"fmt"
 	"hash/fnv"
+	"strconv"
 	"strings"
 	"time"
 
@@ -86,6 +87,12 @@ type ClassifierDeps struct {
 	// templates.Evaluator so any model the operator points at gets
 	// its own chat template applied.
 	Evaluator *templates.Evaluator
+
+	// TokenCounter binds the classifier model's tokenizer for the score
+	// classifier's token-trim path. Optional; nil falls back to the
+	// backend's n_ctx guard. Plain func type so core/application supplies
+	// it as a method value without importing this package.
+	TokenCounter func(modelName string) func(text string) (int, error)
 }
 
 // ProbeExtractor pulls the prompt content out of a parsed request so
@@ -212,7 +219,6 @@ func recordHTTPDecision(c echo.Context, store router.DecisionStore, result *rout
 	_ = store.Record(context.Background(), result.ToDecisionRecord(newDecisionID(), correlationID, userID, source))
 }
 
-
 // GetOrBuildClassifier looks up a built Classifier for the named router
 // model in the registry and builds it on miss. Exported so the
 // /api/router/decide decision-oracle endpoint can share the same
@@ -262,9 +268,10 @@ func routerConfigFingerprint(rc config.RouterConfig, classifierCfg *config.Model
 	h := fnv.New64a()
 	h.Write(bytes)
 	if classifierCfg != nil {
-		// Narrow projection: only the fields newTemplateRenderer and
-		// firstStopWord actually read. Hashing the whole ModelConfig
-		// would invalidate the cache on irrelevant parameter changes.
+		// Narrow projection: only the fields buildClassifier reads (renderer,
+		// stop tokens, context_size → MaxContextTokens). Hashing the whole
+		// ModelConfig would invalidate the cache on irrelevant changes;
+		// omitting context_size would let a reload leave a stale token budget.
 		h.Write([]byte{0}) // separator so empty fields don't collide
 		h.Write([]byte(classifierCfg.TemplateConfig.Chat))
 		h.Write([]byte{0})
@@ -274,6 +281,10 @@ func routerConfigFingerprint(rc config.RouterConfig, classifierCfg *config.Model
 			h.Write([]byte(sw))
 			h.Write([]byte{0})
 		}
+		h.Write([]byte{0})
+		if classifierCfg.ContextSize != nil {
+			h.Write([]byte(strconv.Itoa(*classifierCfg.ContextSize)))
+		}
 	}
 	return h.Sum64()
 }
@@ -319,11 +330,30 @@ func buildClassifier(cfg *config.ModelConfig, deps ClassifierDeps) (router.Class
 		if deps.ModelLookup != nil {
 			if classifierCfg := deps.ModelLookup(rc.ClassifierModel); classifierCfg != nil {
 				if deps.Evaluator != nil {
-					opts.PromptRenderer = newTemplateRenderer(deps.Evaluator, classifierCfg)
+					// The router renders the scoring prompt client-side, so the
+					// classifier model MUST carry a chat template — refusing
+					// here beats silently falling back to a generic ChatML
+					// envelope the model may not have been trained on.
+					renderer := newTemplateRenderer(deps.Evaluator, classifierCfg)
+					if renderer == nil {
+						return nil, fmt.Errorf(
+							"router classifier score: classifier_model %q has no chat template "+
+								"(set template.chat and template.chat_message in its config). The router "+
+								"renders the scoring prompt with the classifier model's own template; "+
+								"without it the prompt format would not match the model",
+							rc.ClassifierModel)
+					}
+					opts.PromptRenderer = renderer
 				}
 				if st := pickAssistantTurnEnd(classifierCfg.StopWords, classifierCfg.TemplateConfig.ChatMessage); st != "" {
 					opts.StopToken = st
 				}
+				// Token-exact conversation trim — score classifier drops the
+				// oldest turns using the model's own tokenizer.
+				if count, ctxTokens := modelTokenTrim(rc.ClassifierModel, deps); count != nil {
+					opts.TokenCounter = count
+					opts.MaxContextTokens = ctxTokens
+				}
 			}
 		}
 		inner = router.NewScoreClassifier(policies, scorer, opts)
@@ -335,7 +365,11 @@ func buildClassifier(cfg *config.ModelConfig, deps ClassifierDeps) (router.Class
 		if reranker == nil {
 			return nil, fmt.Errorf("router classifier colbert: classifier_model %q not loadable", rc.ClassifierModel)
 		}
-		inner = router.NewRerankClassifier(policies, reranker, cacheCap, rc.ActivationThreshold)
+		rerankClassifier := router.NewRerankClassifier(policies, reranker, cacheCap, rc.ActivationThreshold)
+		if count, ctxTokens := modelTokenTrim(rc.ClassifierModel, deps); count != nil {
+			rerankClassifier = rerankClassifier.WithTokenTrim(count, ctxTokens)
+		}
+		inner = rerankClassifier
 	default:
 		return nil, fmt.Errorf("router: unknown classifier %q (supported: %s)", name, strings.Join([]string{router.ClassifierScore, router.ClassifierColbert}, ", "))
 	}
@@ -523,7 +557,41 @@ func wrapWithEmbeddingCache(cfg *config.ModelConfig, inner router.Classifier, de
 	if vstore == nil {
 		return nil, fmt.Errorf("vector store %q not loadable", storeName)
 	}
-	return router.NewEmbeddingCacheClassifier(inner, embedder, vstore, ec.SimilarityThreshold, ec.ConfidenceThreshold), nil
+	cache := router.NewEmbeddingCacheClassifier(inner, embedder, vstore, ec.SimilarityThreshold, ec.ConfidenceThreshold)
+	// Trim the probe to the embedder model's own context (e.g. nomic-embed at
+	// 8k) rather than a fixed guess — otherwise the cache key is an embedding
+	// of a silently-truncated conversation.
+	if count, ctxTokens := modelTokenTrim(ec.EmbeddingModel, deps); count != nil {
+		cache = cache.WithTokenTrim(count, ctxTokens)
+	}
+	return cache, nil
+}
+
+// modelTokenTrim returns a model's own tokenizer and the token ceiling its
+// probe must fit, or (nil, 0) when no tokenizer is available (only then can we
+// not trim exactly). The ceiling is min(effective context, effective batch):
+// score/embed/rerank all decode the whole prompt in one pass, so it must fit
+// both the context window and a single batch. Using the backend's *effective*
+// values — not the raw config fields — means trimming still works when
+// context_size and batch are unset; otherwise a non-trivial prompt overflows
+// the default window and every classification fails.
+func modelTokenTrim(modelName string, deps ClassifierDeps) (func(string) (int, error), int) {
+	if deps.TokenCounter == nil || deps.ModelLookup == nil {
+		return nil, 0
+	}
+	cfg := deps.ModelLookup(modelName)
+	if cfg == nil {
+		return nil, 0
+	}
+	count := deps.TokenCounter(modelName)
+	if count == nil {
+		return nil, 0
+	}
+	ceiling := backend.EffectiveContextSize(*cfg)
+	if b := backend.EffectiveBatchSize(*cfg); b < ceiling {
+		ceiling = b
+	}
+	return count, ceiling
 }
 
 func newDecisionID() string {
@@ -545,6 +613,41 @@ func OpenAIProbe(parsed any) (router.Probe, bool) {
 	return OpenAIProbeFromRequest(req), true
 }
 
+// messageText flattens a chat message's Content to plain text: string content
+// verbatim; []any structured content contributes only its "text" blocks.
+func messageText(content any) string {
+	switch ct := content.(type) {
+	case string:
+		return ct
+	case []any:
+		var b strings.Builder
+		for _, block := range ct {
+			if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
+				if t, ok := bm["text"].(string); ok {
+					if b.Len() > 0 {
+						b.WriteByte('\n')
+					}
+					b.WriteString(t)
+				}
+			}
+		}
+		return b.String()
+	}
+	return ""
+}
+
+// messageProbeParts drops empty (e.g. image-only) messages so they don't
+// consume budget or emit blank lines.
+func messageProbeParts(texts []string) []string {
+	parts := make([]string, 0, len(texts))
+	for _, t := range texts {
+		if t != "" {
+			parts = append(parts, t)
+		}
+	}
+	return parts
+}
+
 // OpenAIProbeFromRequest is the typed counterpart of OpenAIProbe — same
 // extraction logic, but takes the request struct directly. Realtime and
 // other non-HTTP callers use it to feed a probe to router.Resolve
@@ -553,24 +656,15 @@ func OpenAIProbeFromRequest(req *schema.OpenAIRequest) router.Probe {
 	if req == nil {
 		return router.Probe{}
 	}
-	var b strings.Builder
+	texts := make([]string, len(req.Messages))
 	for i := range req.Messages {
-		switch ct := req.Messages[i].Content.(type) {
-		case string:
-			b.WriteString(ct)
-			b.WriteByte('\n')
-		case []any:
-			for _, block := range ct {
-				if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
-					if t, ok := bm["text"].(string); ok {
-						b.WriteString(t)
-						b.WriteByte('\n')
-					}
-				}
-			}
-		}
+		texts[i] = messageText(req.Messages[i].Content)
 	}
-	return router.Probe{Prompt: b.String()}
+	parts := messageProbeParts(texts)
+	// Prompt carries the full conversation; each classifier trims it to its own
+	// model's context (see modelTokenTrim). Messages preserves the per-turn
+	// split the trimmer drops oldest-first.
+	return router.Probe{Prompt: router.JoinTurns(parts), Messages: parts}
 }
 
 // AnthropicProbe is the AnthropicRequest analogue of OpenAIProbe.
@@ -579,25 +673,10 @@ func AnthropicProbe(parsed any) (router.Probe, bool) {
 	if !ok || req == nil {
 		return router.Probe{}, false
 	}
-	var b strings.Builder
+	texts := make([]string, len(req.Messages))
 	for i := range req.Messages {
-		switch ct := req.Messages[i].Content.(type) {
-		case string:
-			b.WriteString(ct)
-			b.WriteByte('\n')
-		case []any:
-			for _, block := range ct {
-				if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
-					if t, ok := bm["text"].(string); ok {
-						b.WriteString(t)
-						b.WriteByte('\n')
-					}
-				}
-			}
-		}
+		texts[i] = messageText(req.Messages[i].Content)
 	}
-	return router.Probe{
-		Prompt: b.String(),
-	}, true
+	parts := messageProbeParts(texts)
+	return router.Probe{Prompt: router.JoinTurns(parts), Messages: parts}, true
 }
-
diff --git a/core/http/middleware/route_model_test.go b/core/http/middleware/route_model_test.go
index 32cb8afb90b5..4a9be2b12fb3 100644
--- a/core/http/middleware/route_model_test.go
+++ b/core/http/middleware/route_model_test.go
@@ -246,11 +246,12 @@ var _ = Describe("RouteModel rendered classifier prompt", func() {
 			"rendered prompt must end at assistant-open marker. got: %q", s.lastPrompt)
 	})
 
-	It("falls back to chatMLRenderer when the classifier model has no chat_message template", func() {
-		// Partial template config: only outer Chat, no per-role
-		// piece. The renderer must refuse rather than emit a prompt
-		// that drops the system turn, so the score classifier's
-		// built-in ChatML default takes over.
+	It("refuses to build the router when the classifier model has no chat_message template", func() {
+		// Partial template config: only the outer Chat, no per-role piece.
+		// The router renders the scoring prompt client-side from the
+		// classifier model's own template, so a missing template is a hard
+		// error rather than a silent fall back to a generic ChatML envelope
+		// the model may not have been trained on.
 		writePartialClassifierModel(modelDir, "arch-router")
 		routerCfg := newScoreRouterModel(modelDir, "smart-router")
 
@@ -266,19 +267,9 @@ var _ = Describe("RouteModel rendered classifier prompt", func() {
 				ModelLookup: loaderLookup(loader, appConfig),
 				Evaluator:   eval,
 			})
-		Expect(err).NotTo(HaveOccurred())
-
-		// chatMLRenderer fallback emits its own envelope and still
-		// embeds the routing system prompt. OpenAIProbeFromRequest
-		// appends "\n" after each message body, so the user content
-		// reaches the renderer as "hello world\n" — the substring
-		// match accounts for that.
-		Expect(s.lastPrompt).To(ContainSubstring("<routes>"),
-			"fallback renderer also dropped the system prompt")
-		Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>system\n"))
-		Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>user\nhello world\n<|im_end|>"))
-		Expect(strings.HasSuffix(s.lastPrompt, "<|im_start|>assistant\n")).To(BeTrue(),
-			"chatMLRenderer fallback must end at assistant-open marker. got: %q", s.lastPrompt)
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("no chat template"),
+			"missing classifier template must surface as a clear config error. got: %v", err)
 	})
 
 	It("uses the classifier model's first stopword as the candidate suffix", func() {
@@ -533,8 +524,8 @@ template:
 
 // writePartialClassifierModel writes a classifier model that has the
 // outer Chat template but no ChatMessage — exercises the
-// newTemplateRenderer "refuse partial templating" branch that hands
-// off to chatMLRenderer.
+// newTemplateRenderer "refuse partial templating" branch, which makes
+// buildClassifier reject the router with a missing-template error.
 func writePartialClassifierModel(modelDir, name string) {
 	body := `name: ` + name + `
 backend: llama-cpp
diff --git a/core/http/react-ui/e2e/middleware-page.spec.js b/core/http/react-ui/e2e/middleware-page.spec.js
index fe890db77fd5..98e011c1e1d4 100644
--- a/core/http/react-ui/e2e/middleware-page.spec.js
+++ b/core/http/react-ui/e2e/middleware-page.spec.js
@@ -1,24 +1,29 @@
 import { test, expect } from '@playwright/test'
 
-// Mocked fixture covering the three things the page renders:
-//   - PII pattern catalogue (action badges, action-change buttons)
-//   - Per-model resolved PII state (one with default off, one with proxy default on, one with explicit YAML)
+// Mocked fixture covering the things the page renders:
+//   - Per-model resolved PII state + the NER detectors each references
+//     (one with default off, one with proxy default on, one explicit YAML)
 //   - Recent events feed (the page must NEVER show the redacted content)
 const MOCK_STATUS = {
   pii: {
     enabled_globally: true,
     default_enabled_for_backends: ['cloud-proxy'],
-    patterns: [
-      { id: 'email', description: 'Email addresses', action: 'mask', max_match_length: 254 },
-      { id: 'ssn', description: 'US Social Security Numbers', action: 'mask', max_match_length: 11 },
-      { id: 'api_key_prefix', description: 'API key prefixes', action: 'block', max_match_length: 200 },
-    ],
     models: [
-      { name: 'qwen-7b', backend: 'llama-cpp', enabled: false, explicit: false, default_for_backend: false, overrides: null },
-      { name: 'claude-sonnet', backend: 'cloud-proxy', enabled: true, explicit: false, default_for_backend: true, overrides: null },
-      { name: 'claude-strict', backend: 'cloud-proxy', enabled: true, explicit: true, default_for_backend: true, overrides: { ssn: 'block' } },
+      { name: 'qwen-7b', backend: 'llama-cpp', enabled: false, explicit: false, default_for_backend: false, detectors: null },
+      { name: 'claude-sonnet', backend: 'cloud-proxy', enabled: true, explicit: false, default_for_backend: true, detectors: null },
+      { name: 'claude-strict', backend: 'cloud-proxy', enabled: true, explicit: true, default_for_backend: true, detectors: ['privacy-filter-multilingual'] },
     ],
     recent_event_count: 2,
+    // Instance-wide default detector set (managed by the Detector models
+    // table's per-row Default toggle).
+    default_detectors: ['global-ner-default'],
+    // The token_classify "filter" models themselves: one NER, one in-process
+    // pattern matcher, plus an orphan default that names a model not loaded.
+    detector_models: [
+      { name: 'privacy-filter-multilingual', backend: 'llama-cpp', type: 'ner', default: false },
+      { name: 'secret-filter', backend: 'pattern', type: 'pattern', default: false },
+      { name: 'global-ner-default', backend: '', type: 'unknown', default: true, missing: true },
+    ],
   },
   router: {
     configured: true,
@@ -114,23 +119,104 @@ test.describe('Middleware page — admin in no-auth mode', () => {
     await page.route('**/api/router/decisions?**', (route) =>
       route.fulfill({ contentType: 'application/json', body: JSON.stringify(MOCK_DECISIONS) })
     )
+    // The Default PII policy detector picker is capability-filtered to
+    // token_classify via /api/models/capabilities.
+    await page.route('**/api/models/capabilities', (route) =>
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({ models: [{ id: 'privacy-filter-multilingual', capabilities: ['FLAG_TOKEN_CLASSIFY'], backend: 'llama-cpp' }] }),
+      })
+    )
+    await page.route('**/api/settings', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ success: true }) })
+    )
+    // The per-model PII toggle PATCHes the model config (pii.enabled).
+    await page.route('**/api/models/config-json/**', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ success: true }) })
+    )
   })
 
-  test('Filtering tab renders pattern catalogue and per-model state', async ({ page }) => {
+  test('Filtering tab renders per-model state and referenced detectors', async ({ page }) => {
     await page.goto('/app/middleware')
 
-    // Pattern table — at least one pattern id visible.
-    await expect(page.getByText('email').first()).toBeVisible()
-    await expect(page.getByText('api_key_prefix').first()).toBeVisible()
-
     // Per-model state — each model's name is visible.
     await expect(page.getByText('qwen-7b').first()).toBeVisible()
     await expect(page.getByText('claude-strict').first()).toBeVisible()
 
+    // The detector a model references is shown in its row.
+    await expect(page.getByText('privacy-filter-multilingual').first()).toBeVisible()
+
     // Default-policy banner names the backends with PII on by default.
     await expect(page.getByText(/cloud-proxy/).first()).toBeVisible()
   })
 
+  test('Filtering tab lists detector models with type badges and a default toggle', async ({ page }) => {
+    await page.goto('/app/middleware')
+
+    // The Detector models card renders every token_classify filter model.
+    await expect(page.getByText('Detector models')).toBeVisible()
+    const nerRow = page.locator('tr').filter({ hasText: 'privacy-filter-multilingual' }).first()
+    await expect(nerRow).toContainText(/NER/i)
+    const patternRow = page.locator('tr').filter({ hasText: 'secret-filter' }).first()
+    await expect(patternRow).toContainText(/pattern/i)
+
+    // The NER detector is not (yet) a default — its toggle is unchecked.
+    // (The underlying checkbox is 0×0 by design, so we click the label wrapper.)
+    const nerToggle = nerRow.locator('label.toggle')
+    await expect(nerToggle.locator('input[type="checkbox"]')).not.toBeChecked()
+
+    // Toggling it on persists the new default set via POST /api/settings.
+    const saved = page.waitForRequest(req =>
+      req.url().includes('/api/settings') && req.method() === 'POST')
+    await nerToggle.click()
+    const req = await saved
+    const body = JSON.parse(req.postData() || '{}')
+    expect(body.pii_default_detectors).toContain('privacy-filter-multilingual')
+  })
+
+  test('Filtering tab surfaces an orphan default detector that is not loaded', async ({ page }) => {
+    await page.goto('/app/middleware')
+
+    // global-ner-default names a model that is not loaded, but it is in the
+    // default set — it must still appear (toggled on) so admins can remove it.
+    const orphanRow = page.locator('tr').filter({ hasText: 'global-ner-default' }).first()
+    await expect(orphanRow).toContainText(/not loaded/i)
+    await expect(orphanRow.locator('label.toggle input[type="checkbox"]')).toBeChecked()
+  })
+
+  test('Filtering tab flags an enabled model with no detector as a no-op', async ({ page }) => {
+    await page.goto('/app/middleware')
+
+    // claude-sonnet is enabled by the cloud-proxy backend default but lists
+    // no detectors and there is no instance default detector — it scans
+    // nothing, so the row must warn rather than read as protected.
+    const noopRow = page.locator('tr').filter({ hasText: 'claude-sonnet' }).first()
+    await expect(noopRow).toContainText(/no-op/i)
+
+    // claude-strict has an explicit detector — it must NOT be flagged.
+    const okRow = page.locator('tr').filter({ hasText: 'claude-strict' }).first()
+    await expect(okRow).not.toContainText(/no-op/i)
+  })
+
+  test('Filtering tab PII column toggles a model\'s pii.enabled via PATCH', async ({ page }) => {
+    await page.goto('/app/middleware')
+
+    // qwen-7b is OFF (enabled:false) — its PII toggle reads unchecked.
+    const row = page.locator('tr').filter({ hasText: 'qwen-7b' }).first()
+    const toggle = row.locator('label.toggle')
+    await expect(toggle.locator('input[type="checkbox"]')).not.toBeChecked()
+
+    // Toggling on PATCHes the model config with an explicit pii.enabled:true,
+    // scoped to that model (no other field is sent — the server deep-merges).
+    const patched = page.waitForRequest(req =>
+      req.url().includes('/api/models/config-json/') && req.method() === 'PATCH')
+    await toggle.click()
+    const req = await patched
+    expect(decodeURIComponent(req.url())).toContain('qwen-7b')
+    const body = JSON.parse(req.postData() || '{}')
+    expect(body.pii.enabled).toBe(true)
+  })
+
   test('Routing tab renders configured routers and recent decisions', async ({ page }) => {
     await page.goto('/app/middleware')
     await page.getByRole('button', { name: /Routing/i }).click()
@@ -265,25 +351,6 @@ test.describe('Middleware page — admin in no-auth mode', () => {
     await expect(page.getByText(/^proxy traffic$/i).first()).toBeVisible()
   })
 
-  test('PUT /api/pii/patterns/:id fires when an action button is clicked', async ({ page }) => {
-    let putHit = null
-    await page.route('**/api/pii/patterns/email', (route) => {
-      if (route.request().method() === 'PUT') {
-        putHit = JSON.parse(route.request().postData() || '{}')
-        route.fulfill({ contentType: 'application/json', body: JSON.stringify({ id: 'email', action: putHit.action, persisted: false }) })
-      } else {
-        route.continue()
-      }
-    })
-
-    await page.goto('/app/middleware')
-    // Click the email row's "block" button (currently mask, so block is
-    // enabled). Use a precise locator that matches the inner button.
-    const emailRow = page.locator('tr').filter({ hasText: 'email' }).first()
-    await emailRow.getByRole('button', { name: 'block' }).click()
-
-    await expect.poll(() => putHit).toEqual({ action: 'block' })
-  })
 })
 
 test.describe('Middleware page — non-admin under auth-on', () => {
diff --git a/core/http/react-ui/e2e/model-config.spec.js b/core/http/react-ui/e2e/model-config.spec.js
index 9dbbe859f9fe..2d7f0f8bdcd3 100644
--- a/core/http/react-ui/e2e/model-config.spec.js
+++ b/core/http/react-ui/e2e/model-config.spec.js
@@ -12,6 +12,9 @@ const MOCK_METADATA = {
     { path: 'cuda', yaml_key: 'cuda', go_type: 'bool', ui_type: 'bool', section: 'general', label: 'CUDA', description: 'Enable CUDA GPU acceleration', component: 'toggle', order: 30 },
     { path: 'parameters.temperature', yaml_key: 'temperature', go_type: '*float64', ui_type: 'float', section: 'parameters', label: 'Temperature', description: 'Sampling temperature', component: 'slider', min: 0, max: 2, step: 0.1, order: 0 },
     { path: 'parameters.top_p', yaml_key: 'top_p', go_type: '*float64', ui_type: 'float', section: 'parameters', label: 'Top P', description: 'Nucleus sampling threshold', component: 'slider', min: 0, max: 1, step: 0.05, order: 10 },
+    { path: 'pii_detection.builtins', yaml_key: 'builtins', go_type: '[]string', ui_type: '[]string', section: 'general', label: 'Built-in Secret Patterns', description: 'Built-in credential patterns', component: 'pii-builtins-select', options: [{ value: 'anthropic_api_key', label: 'anthropic_api_key — Anthropic API key' }, { value: 'github_token', label: 'github_token — GitHub token' }], order: 213 },
+    { path: 'pii_detection.patterns', yaml_key: 'patterns', go_type: '[]config.PIIPattern', ui_type: 'object', section: 'general', label: 'Custom Secret Patterns', description: 'Operator-defined restricted-regex patterns', component: 'pii-pattern-list', order: 214 },
+    { path: 'pii_detection.entity_actions', yaml_key: 'entity_actions', go_type: 'map[string]string', ui_type: 'map', section: 'general', label: 'Detector Entity Actions', description: 'Per-entity-group action policy', component: 'entity-action-list', order: 212 },
   ],
 }
 
@@ -224,4 +227,106 @@ test.describe('Model Editor - Interactive Tab', () => {
     expect(estimateCalled).toBe(true)
   })
 
+  test('interactive tab scrolls at body height (no inner overflow pane) and tracks the active section', async ({ page }) => {
+    // Regression: the form sections used to live inside an overflow:auto pane
+    // with maxHeight: calc(100vh - 340px), which kept the global footer in
+    // view on every screen and ate ~50px of editing room on short windows.
+    // Pin two pieces of the fix:
+    //  1. The two-column container (sticky nav + content) has no scrollable
+    //     inner element on its content side — body-scroll handles overflow.
+    //  2. The active-section tracker now listens to window scroll. Scrolling
+    //     the window should run the tracker without throwing, and the
+    //     `<nav>` sidebar must still render.
+    const contentOverflowY = await page.evaluate(() => {
+      const sidebar = document.querySelector('nav')
+      // The content column is the next sibling of the sticky sidebar.
+      const content = sidebar?.nextElementSibling
+      return content ? getComputedStyle(content).overflowY : 'no-content'
+    })
+    expect(['visible', 'normal', 'auto', 'scroll', 'no-content']).toContain(contentOverflowY)
+    expect(contentOverflowY).not.toBe('scroll')
+    // 'auto' could exist on some browsers but should NOT — the fix removes it.
+    // We assert the strong invariant separately.
+    expect(['auto']).not.toContain(contentOverflowY)
+
+    // Add a couple of fields to give the page a touch more height, then
+    // force a window scroll. The tracker should run; the sidebar should
+    // remain visible.
+    const searchInput = page.locator('input[placeholder="Search fields to add..."]')
+    await searchInput.fill('Temperature')
+    const dropdown = searchInput.locator('..').locator('..')
+    await dropdown.locator('div', { hasText: 'Temperature' }).first().click()
+    await page.evaluate(() => window.scrollTo(0, 200))
+    await page.waitForTimeout(50)
+    await expect(page.locator('nav').first()).toBeVisible()
+  })
+
+  test('built-in secret patterns render as a checklist from field options', async ({ page }) => {
+    const searchInput = page.locator('input[placeholder="Search fields to add..."]')
+    await searchInput.fill('Built-in Secret Patterns')
+    const dropdown = searchInput.locator('..').locator('..')
+    await dropdown.locator('div', { hasText: 'Built-in Secret Patterns' }).first().click()
+
+    // One checkbox per catalogue option; toggling one enables Save.
+    const anthropic = page.locator('label', { hasText: 'Anthropic API key' }).locator('input[type="checkbox"]')
+    await expect(anthropic).toHaveCount(1)
+    await anthropic.check()
+    await expect(anthropic).toBeChecked()
+  })
+
+  test('custom secret patterns render the pattern-list editor', async ({ page }) => {
+    const searchInput = page.locator('input[placeholder="Search fields to add..."]')
+    await searchInput.fill('Custom Secret Patterns')
+    const dropdown = searchInput.locator('..').locator('..')
+    await dropdown.locator('div', { hasText: 'Custom Secret Patterns' }).first().click()
+
+    // Empty state + an Add button; adding a row shows the name + match inputs.
+    const addBtn = page.locator('button', { hasText: 'Add pattern' })
+    await expect(addBtn).toBeVisible()
+    await addBtn.click()
+    await expect(page.locator('input[placeholder^="Name (group)"]')).toBeVisible()
+    await expect(page.locator('input[placeholder^="match,"]')).toBeVisible()
+  })
+
+  // Regression: a map-typed field (entity_actions) present in the loaded YAML
+  // must render WITH its values. flattenConfig used to recurse into the map,
+  // scattering it across pii_detection.entity_actions.<GROUP> paths that match
+  // no registered field, so the editor showed neither the field nor the
+  // per-entity policy (e.g. SSN -> block) the operator had configured.
+  test('entity_actions map field present in YAML renders with its values', async ({ page }) => {
+    // Override the edit endpoint for this test: YAML that carries a populated
+    // entity_actions map alongside a scalar sibling (default_action).
+    await page.route('**/api/models/edit/ner-model', (route) => {
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({
+          name: 'ner-model',
+          config: [
+            'name: ner-model',
+            'backend: llama-cpp',
+            'pii_detection:',
+            '    default_action: mask',
+            '    entity_actions:',
+            '        SSN: block',
+            '        EMAIL: mask',
+            '',
+          ].join('\n'),
+        }),
+      })
+    })
+
+    await page.goto('/app/model-editor/ner-model')
+
+    // The entity-action-list editor is rendered (field label visible)…
+    await expect(page.getByText('Detector Entity Actions').first()).toBeVisible()
+    // …and bound to the existing map: one row per configured group, in order.
+    const groupInputs = page.locator('input[aria-label="Entity group"]')
+    await expect(groupInputs).toHaveCount(2)
+    await expect(groupInputs.nth(0)).toHaveValue('SSN')
+    await expect(groupInputs.nth(1)).toHaveValue('EMAIL')
+    // The action select shows the bound action label (block), proving the map
+    // values bound, not just an empty editor.
+    await expect(page.getByText(/block —/i).first()).toBeVisible()
+  })
+
 })
diff --git a/core/http/react-ui/e2e/model-editor-back-nav.spec.js b/core/http/react-ui/e2e/model-editor-back-nav.spec.js
new file mode 100644
index 000000000000..5ad085aeafb7
--- /dev/null
+++ b/core/http/react-ui/e2e/model-editor-back-nav.spec.js
@@ -0,0 +1,94 @@
+import { test, expect } from './coverage-fixtures.js'
+
+// Exercises the "Back to <page>" navigation convention: whichever page links
+// into the Model Editor stamps its origin as react-router location state, and
+// the editor's Back button returns there (captioned with the origin) instead
+// of a hardcoded route. Also covers the Middleware page's ?tab= persistence,
+// which is what lets the editor return you to the exact tab you came from.
+
+const MOCK_METADATA = {
+  sections: [{ id: 'general', label: 'General', icon: 'settings', order: 0 }],
+  fields: [
+    { path: 'name', yaml_key: 'name', go_type: 'string', ui_type: 'string', section: 'general', label: 'Model Name', description: 'id', component: 'input', order: 0 },
+  ],
+}
+const MOCK_YAML = 'name: mock-model\nbackend: mock-backend\n'
+
+// Router config with one model, so the Routing tab renders an editable model
+// link we can click through to the editor.
+const MOCK_MIDDLEWARE_STATUS = {
+  pii: { enabled_globally: false, default_enabled_for_backends: [], patterns: [], models: [], recent_event_count: 0 },
+  router: {
+    configured: true,
+    models: [{ name: 'smart-router', classifier: 'score', fallback: 'qwen-7b', policies: [], candidates: [] }],
+    recent_decision_count: 0,
+    available_classifiers: ['score'],
+  },
+}
+
+// Make the editor render for any model name (the header — and thus the Back
+// button — only appears once metadata + config have loaded).
+async function mockEditorEndpoints(page) {
+  await page.route('**/api/models/config-metadata*', (route) =>
+    route.fulfill({ contentType: 'application/json', body: JSON.stringify(MOCK_METADATA) }))
+  await page.route('**/api/models/edit/**', (route) =>
+    route.fulfill({ contentType: 'application/json', body: JSON.stringify({ config: MOCK_YAML, name: 'mock-model' }) }))
+  await page.route('**/api/models/config-json/**', (route) =>
+    route.fulfill({ contentType: 'application/json', body: '{}' }))
+}
+
+test.describe('Model Editor — Back navigation', () => {
+  test.beforeEach(async ({ page }) => {
+    await page.route('**/api/auth/status', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ authEnabled: false, staticApiKeyRequired: false, providers: [] }) }))
+    await mockEditorEndpoints(page)
+  })
+
+  test('Back returns to Manage with a "Back to Manage" caption', async ({ page }) => {
+    await page.goto('/app/manage')
+    await expect(page.locator('.table')).toBeVisible({ timeout: 10_000 })
+
+    // Open the first row's action menu and pick "Edit configuration".
+    const trigger = page.locator('button.action-menu__trigger').first()
+    await expect(trigger).toBeVisible()
+    await trigger.click()
+    await page.getByRole('menuitem', { name: 'Edit configuration' }).click()
+
+    await expect(page).toHaveURL(/\/app\/model-editor\//)
+    const back = page.getByRole('button', { name: /Back to Manage/ })
+    await expect(back).toBeVisible({ timeout: 10_000 })
+
+    await back.click()
+    await expect(page).toHaveURL(/\/app\/manage/)
+  })
+
+  test('returns to the originating Middleware tab (?tab=routing) it was opened from', async ({ page }) => {
+    await page.route('**/api/middleware/status', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify(MOCK_MIDDLEWARE_STATUS) }))
+    await page.route('**/api/pii/events?**', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ events: [] }) }))
+    await page.route('**/api/router/decisions?**', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ decisions: [] }) }))
+
+    await page.goto('/app/middleware')
+    // Switching to Routing must push the tab into the URL.
+    await page.getByRole('button', { name: /Routing/i }).click()
+    await expect(page).toHaveURL(/[?&]tab=routing/)
+
+    // Click through to the router model's config, then back.
+    await page.getByRole('link', { name: 'smart-router' }).click()
+    await expect(page).toHaveURL(/\/app\/model-editor\/smart-router/)
+    const back = page.getByRole('button', { name: /Back to Middleware/ })
+    await expect(back).toBeVisible({ timeout: 10_000 })
+
+    await back.click()
+    // Returns to the exact tab, not the default Filtering tab.
+    await expect(page).toHaveURL(/\/app\/middleware\?tab=routing/)
+    await expect(page.getByText('smart-router').first()).toBeVisible()
+  })
+
+  test('falls back to "Back to Manage" on a direct visit with no origin state', async ({ page }) => {
+    await page.goto('/app/model-editor/mock-model')
+    await expect(page.getByRole('button', { name: /Back to Manage/ })).toBeVisible({ timeout: 10_000 })
+  })
+})
diff --git a/core/http/react-ui/e2e/models-gallery.spec.js b/core/http/react-ui/e2e/models-gallery.spec.js
index cb1467b391de..d45b5eedac99 100644
--- a/core/http/react-ui/e2e/models-gallery.spec.js
+++ b/core/http/react-ui/e2e/models-gallery.spec.js
@@ -178,7 +178,7 @@ test.describe("Models Gallery - Backend Features", () => {
 });
 
 const BACKEND_USECASES_MOCK = {
-  "llama-cpp": ["chat", "embeddings", "vision"],
+  "llama-cpp": ["chat", "embeddings", "vision", "token_classify"],
   whisper: ["transcript"],
   stablediffusion: ["image"],
 };
@@ -285,13 +285,15 @@ test.describe("Models Gallery - Multi-select Filters", () => {
     await expect(sttBtn).toBeDisabled();
     await expect(imageBtn).toBeDisabled();
 
-    // Chat, Embeddings, Vision should remain enabled
+    // Chat, Embeddings, Vision, NER should remain enabled
     const chatBtn = page.locator(".filter-btn", { hasText: "Chat" });
     const embBtn = page.locator(".filter-btn", { hasText: "Embeddings" });
     const visBtn = page.locator(".filter-btn", { hasText: "Vision" });
+    const nerBtn = page.locator(".filter-btn", { hasText: "NER" });
     await expect(chatBtn).toBeEnabled();
     await expect(embBtn).toBeEnabled();
     await expect(visBtn).toBeEnabled();
+    await expect(nerBtn).toBeEnabled();
   });
 
   test("backend clears incompatible filters", async ({ page }) => {
diff --git a/core/http/react-ui/e2e/traces-errors.spec.js b/core/http/react-ui/e2e/traces-errors.spec.js
index ebf17ad8c06f..18c82edf33a6 100644
--- a/core/http/react-ui/e2e/traces-errors.spec.js
+++ b/core/http/react-ui/e2e/traces-errors.spec.js
@@ -48,3 +48,77 @@ test.describe('Traces - Error Display', () => {
     await expect(page.locator('th', { hasText: 'Type' })).toBeVisible()
   })
 })
+
+// Pin the BackendTraceDetail expansion path for a vector_store trace —
+// the type that surfaces the router's embedding-cache plumbing. The
+// row click triggers the detail render, which exercises typeBadgeStyle
+// (with the new vector_store badge color), the DataFields component
+// (op / outcome / vector_dim / similarity), and the "View backend
+// logs" link that resolves to the store namespace. Without this spec
+// the new color entry plus the data-field render branches stay
+// uncovered, dragging UI line coverage below the regression gate.
+test.describe('Traces - vector_store backend trace detail', () => {
+  test.beforeEach(async ({ page }) => {
+    await page.route('**/api/traces', (route) => {
+      route.fulfill({ contentType: 'application/json', body: '[]' })
+    })
+    await page.route('**/api/backend-traces', (route) => {
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify([
+          {
+            type: 'vector_store',
+            timestamp: '2026-05-28T13:56:25.558Z',
+            model_name: 'router-cache-smart-router',
+            backend: 'local-store',
+            summary: 'search hit (sim=0.989)',
+            duration: 160_000_000,
+            error: '',
+            data: {
+              op: 'search',
+              outcome: 'hit',
+              vector_dim: 768,
+              similarity: 0.9899752140045166,
+            },
+          },
+          {
+            type: 'vector_store',
+            timestamp: '2026-05-28T13:49:07.545Z',
+            model_name: 'router-cache-smart-router',
+            backend: 'local-store',
+            summary: 'search miss',
+            duration: 100_000_000,
+            error: '',
+            data: {
+              op: 'search',
+              outcome: 'miss',
+              vector_dim: 768,
+            },
+          },
+        ]),
+      })
+    })
+    await page.goto('/app/traces')
+    await expect(page.locator('text=Tracing is')).toBeVisible({ timeout: 10_000 })
+    await page.locator('button', { hasText: 'Backend Traces' }).click()
+  })
+
+  test('renders type badge and expands data fields on row click', async ({ page }) => {
+    // The vector_store badge appears in the type column.
+    await expect(page.locator('td span', { hasText: 'vector_store' }).first()).toBeVisible()
+
+    // Clicking the first row expands BackendTraceDetail, which renders
+    // the four data fields. Use the first row's "search hit" summary
+    // as the anchor to disambiguate from the miss row below.
+    await page.locator('tr', { hasText: 'search hit' }).first().click()
+
+    // DataFields renders op/outcome/vector_dim/similarity as label/value pairs.
+    // 'hit' appears as the rendered outcome value.
+    await expect(page.locator('text=outcome').first()).toBeVisible()
+    await expect(page.locator('text=hit').first()).toBeVisible()
+
+    // The model_name → /app/backend-logs link is the BackendTraceDetail
+    // affordance for jumping to logs for the store namespace.
+    await expect(page.locator('a', { hasText: 'View backend logs' })).toBeVisible()
+  })
+})
diff --git a/core/http/react-ui/playwright.config.js b/core/http/react-ui/playwright.config.js
index 3d4718dae942..40f7e56e457a 100644
--- a/core/http/react-ui/playwright.config.js
+++ b/core/http/react-ui/playwright.config.js
@@ -4,6 +4,12 @@ export default defineConfig({
   testDir: './e2e',
   timeout: 30_000,
   retries: process.env.CI ? 2 : 0,
+  // TEMPORARY: cap parallelism. Playwright's default (cores/2) oversubscribes
+  // high-core dev machines and intermittently starves the page-teardown
+  // coverage harvest past the 30s test timeout (flaky "Tearing down page"
+  // failures, different specs each run). Capped at 8 pending a proper
+  // root-cause fix; override with PW_WORKERS.
+  workers: process.env.PW_WORKERS ? Number(process.env.PW_WORKERS) : 8,
   reporter: process.env.CI ? 'html' : 'list',
   use: {
     baseURL: 'http://127.0.0.1:8089',
diff --git a/core/http/react-ui/public/locales/en/models.json b/core/http/react-ui/public/locales/en/models.json
index 603bd809cc8a..7be30fc51e76 100644
--- a/core/http/react-ui/public/locales/en/models.json
+++ b/core/http/react-ui/public/locales/en/models.json
@@ -29,6 +29,7 @@
     "rerank": "Rerank",
     "detection": "Detection",
     "vad": "VAD",
+    "ner": "NER",
     "fitsGpu": "Fits in GPU",
     "allBackends": "All Backends",
     "searchBackends": "Search backends..."
diff --git a/core/http/react-ui/src/components/CodeEditor.jsx b/core/http/react-ui/src/components/CodeEditor.jsx
index ba0244f5bba7..5e732274510c 100644
--- a/core/http/react-ui/src/components/CodeEditor.jsx
+++ b/core/http/react-ui/src/components/CodeEditor.jsx
@@ -13,6 +13,7 @@ import { useCodeMirror } from '../hooks/useCodeMirror'
 import { useTheme } from '../contexts/ThemeContext'
 import { getThemeExtension } from '../utils/cmTheme'
 import { createYamlCompletionSource } from '../utils/cmYamlComplete'
+import { goTemplate } from '../utils/cmGoTemplate'
 
 function yamlIssueToDiagnostic(issue, cmDoc, severity) {
   const len = cmDoc.length
@@ -43,14 +44,17 @@ const yamlLinter = linter(view => {
   return diagnostics
 })
 
-export default function CodeEditor({ value, onChange, disabled, minHeight = '500px', fields }) {
+export default function CodeEditor({ value, onChange, disabled, minHeight = '500px', fields, language = 'yaml' }) {
   const containerRef = useRef(null)
   const { theme } = useTheme()
+  const isGoTemplate = language === 'gotemplate'
 
-  // Static extensions — only recreate when fields change
+  // Static extensions — only recreate when fields/language change
   const extensions = useMemo(() => {
     const exts = [
-      yaml(),
+      // Go templates aren't YAML — skip the YAML mode/linter so valid
+      // `{{ ... }}` syntax isn't flagged as a YAML parse error.
+      isGoTemplate ? goTemplate : yaml(),
       lineNumbers(),
       highlightActiveLineGutter(),
       highlightActiveLine(),
@@ -59,8 +63,6 @@ export default function CodeEditor({ value, onChange, disabled, minHeight = '500
       indentOnInput(),
       bracketMatching(),
       highlightSelectionMatches(),
-      yamlLinter,
-      lintGutter(),
       history(),
       indentUnit.of('  '),
       EditorState.tabSize.of(2),
@@ -77,15 +79,18 @@ export default function CodeEditor({ value, onChange, disabled, minHeight = '500
       }),
     ]
 
-    if (fields && fields.length > 0) {
-      exts.push(autocompletion({
-        override: [createYamlCompletionSource(fields)],
-        activateOnTyping: true,
-      }))
+    if (!isGoTemplate) {
+      exts.push(yamlLinter, lintGutter())
+      if (fields && fields.length > 0) {
+        exts.push(autocompletion({
+          override: [createYamlCompletionSource(fields)],
+          activateOnTyping: true,
+        }))
+      }
     }
 
     return exts
-  }, [minHeight, fields])
+  }, [minHeight, fields, isGoTemplate])
 
   // Dynamic extensions — reconfigured via Compartments (preserves undo/cursor/scroll)
   const dynamicExtensions = useMemo(() => ({
diff --git a/core/http/react-ui/src/components/ConfigFieldRenderer.jsx b/core/http/react-ui/src/components/ConfigFieldRenderer.jsx
index ccf5bf05c155..13e5680a4733 100644
--- a/core/http/react-ui/src/components/ConfigFieldRenderer.jsx
+++ b/core/http/react-ui/src/components/ConfigFieldRenderer.jsx
@@ -6,7 +6,9 @@ import SearchableModelSelect from './SearchableModelSelect'
 import AutocompleteInput from './AutocompleteInput'
 import CodeEditor from './CodeEditor'
 import StructuredCodeEditor from './StructuredCodeEditor'
-import PIIPatternListEditor from './PIIPatternListEditor'
+import EntityActionListEditor from './EntityActionListEditor'
+import PatternListEditor from './PatternListEditor'
+import ModelMultiSelect from './ModelMultiSelect'
 import RouterCandidatesEditor from './RouterCandidatesEditor'
 import RouterPoliciesEditor from './RouterPoliciesEditor'
 
@@ -16,6 +18,8 @@ const PROVIDER_TO_CAPABILITY = {
   'models:tts': 'FLAG_TTS',
   'models:transcript': 'FLAG_TRANSCRIPT',
   'models:vad': 'FLAG_VAD',
+  'models:score': 'FLAG_SCORE',
+  'models:token_classify': 'FLAG_TOKEN_CLASSIFY',
 }
 
 function coerceValue(raw, uiType) {
@@ -325,7 +329,7 @@ export default function ConfigFieldRenderer({ field, value, onChange, onRemove,
         </div>
         {isStructured
           ? <StructuredCodeEditor value={value} onChange={handleChange} minHeight="80px" />
-          : <CodeEditor value={value || ''} onChange={handleChange} minHeight="80px" />}
+          : <CodeEditor value={value || ''} onChange={handleChange} minHeight="80px" language={field.language} />}
       </div>
     )
   }
@@ -394,10 +398,26 @@ export default function ConfigFieldRenderer({ field, value, onChange, onRemove,
     )
   }
 
-  // PII pattern list — per-model action overrides for named patterns.
-  // The pattern catalog is loaded from /api/pii/patterns at render time
-  // so new built-in patterns surface automatically.
-  if (component === 'pii-pattern-list') {
+  // PII detectors — a capability-filtered multi-select of token_classify
+  // models (the consuming model's pii.detectors list).
+  if (component === 'model-multi-select') {
+    const cap = PROVIDER_TO_CAPABILITY[field.autocomplete_provider] || undefined
+    return (
+      <div style={{ padding: 'var(--spacing-sm) 0', borderBottom: '1px solid var(--color-border-subtle)' }}>
+        <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: 4 }}>
+          <div>
+            <div style={{ fontSize: '0.875rem', fontWeight: 500 }}><FieldLabel field={field} /></div>
+            <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', marginTop: 2 }}>{description}</div>
+          </div>
+        </div>
+        <ModelMultiSelect value={value} onChange={handleChange} capability={cap} placeholder={field.placeholder} />
+      </div>
+    )
+  }
+
+  // PII detection entity-action map — a detector model's
+  // pii_detection.entity_actions (entity group -> mask|block|allow).
+  if (component === 'entity-action-list') {
     return (
       <div style={{ padding: 'var(--spacing-sm) 0', borderBottom: '1px solid var(--color-border-subtle)' }}>
         <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: 4 }}>
@@ -406,7 +426,46 @@ export default function ConfigFieldRenderer({ field, value, onChange, onRemove,
             <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', marginTop: 2 }}>{description}</div>
           </div>
         </div>
-        <PIIPatternListEditor value={value} onChange={handleChange} />
+        <EntityActionListEditor value={value} onChange={handleChange} />
+      </div>
+    )
+  }
+
+  // PII built-in secret patterns — a checklist of named built-in patterns
+  // (pii_detection.builtins). value is an array of selected names.
+  if (component === 'pii-builtins-select') {
+    const selected = Array.isArray(value) ? value : []
+    const toggle = (name) => {
+      handleChange(selected.includes(name) ? selected.filter(n => n !== name) : [...selected, name])
+    }
+    return (
+      <div style={{ padding: 'var(--spacing-sm) 0', borderBottom: '1px solid var(--color-border-subtle)' }}>
+        <div style={{ marginBottom: 4 }}>
+          <div style={{ fontSize: '0.875rem', fontWeight: 500 }}><FieldLabel field={field} /></div>
+          <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', marginTop: 2 }}>{description}</div>
+        </div>
+        <div style={{ display: 'flex', flexDirection: 'column', gap: 4 }}>
+          {(field.options || []).map(opt => (
+            <label key={opt.value} style={{ display: 'flex', alignItems: 'center', gap: 8, fontSize: '0.8125rem', cursor: 'pointer' }}>
+              <input type="checkbox" checked={selected.includes(opt.value)} onChange={() => toggle(opt.value)} />
+              {opt.label || opt.value}
+            </label>
+          ))}
+        </div>
+      </div>
+    )
+  }
+
+  // PII custom secret patterns — operator-defined restricted-regex rules
+  // (pii_detection.patterns). value is an array of {name, match, action, min_len}.
+  if (component === 'pii-pattern-list') {
+    return (
+      <div style={{ padding: 'var(--spacing-sm) 0', borderBottom: '1px solid var(--color-border-subtle)' }}>
+        <div style={{ marginBottom: 4 }}>
+          <div style={{ fontSize: '0.875rem', fontWeight: 500 }}><FieldLabel field={field} /></div>
+          <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', marginTop: 2 }}>{description}</div>
+        </div>
+        <PatternListEditor value={value} onChange={handleChange} />
       </div>
     )
   }
diff --git a/core/http/react-ui/src/components/EntityActionListEditor.jsx b/core/http/react-ui/src/components/EntityActionListEditor.jsx
new file mode 100644
index 000000000000..796e4f0590cc
--- /dev/null
+++ b/core/http/react-ui/src/components/EntityActionListEditor.jsx
@@ -0,0 +1,98 @@
+import { useMemo } from 'react'
+import SearchableSelect from './SearchableSelect'
+
+// Editor for a detector model's pii_detection.entity_actions map:
+// entity-group name -> action. The value is an object {GROUP: action};
+// this component renders one row per entry and emits a fresh object on
+// every change. Entity-group names are model-defined (the privacy-filter
+// family emits uppercase names with no separators), so the group field is
+// free text with a datalist of common high-value categories for
+// convenience — any string the model emits is valid.
+
+const ACTION_OPTIONS = [
+  { value: 'mask', label: 'mask — replace with [REDACTED:ner:GROUP]' },
+  { value: 'block', label: 'block — reject the request (HTTP 400)' },
+  { value: 'allow', label: 'allow — detect & log, leave text unchanged' },
+]
+
+// Common categories surfaced as datalist hints. Not exhaustive and not
+// authoritative — the model's own label set is the source of truth.
+const COMMON_GROUPS = [
+  'PASSWORD', 'PIN', 'CVV', 'CREDITCARD', 'IBAN', 'BIC', 'BANKACCOUNT', 'SSN',
+  'BITCOINADDRESS', 'ETHEREUMADDRESS', 'LITECOINADDRESS',
+  'EMAIL', 'PHONE', 'URL', 'IPADDRESS', 'MACADDRESS',
+  'FIRSTNAME', 'LASTNAME', 'MIDDLENAME', 'USERNAME', 'DATEOFBIRTH',
+  'STREET', 'CITY', 'STATE', 'ZIPCODE', 'GPSCOORDINATES',
+]
+
+export default function EntityActionListEditor({ value, onChange }) {
+  // value is an object map; preserve insertion order via Object.entries.
+  const entries = useMemo(
+    () => (value && typeof value === 'object' && !Array.isArray(value) ? Object.entries(value) : []),
+    [value]
+  )
+
+  const datalistId = 'pii-entity-groups'
+
+  const update = (index, key, action) => {
+    const next = entries.map((e, i) => (i === index ? [key, action] : e))
+    onChange(Object.fromEntries(next.filter(([k]) => k !== '')))
+  }
+
+  const remove = (index) => {
+    onChange(Object.fromEntries(entries.filter((_, i) => i !== index)))
+  }
+
+  const add = () => {
+    // New rows default to mask; an empty key is tolerated transiently and
+    // filtered out on the next edit / when serialised.
+    onChange(Object.fromEntries([...entries, ['', 'mask']]))
+  }
+
+  return (
+    <div style={{ display: 'flex', flexDirection: 'column', gap: 6, width: '100%' }}>
+      <datalist id={datalistId}>
+        {COMMON_GROUPS.map(g => <option key={g} value={g} />)}
+      </datalist>
+
+      {entries.length === 0 && (
+        <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+          No per-entity actions — every detected group uses the default action. Add a row to
+          block or allow-log a specific entity group (e.g. <code>PASSWORD</code> → block).
+        </div>
+      )}
+
+      {entries.map(([group, action], i) => (
+        <div key={i} style={{ display: 'flex', gap: 6, alignItems: 'center', flexWrap: 'wrap' }}>
+          <input
+            className="input"
+            list={datalistId}
+            value={group}
+            placeholder="Entity group (e.g. PASSWORD)"
+            onChange={e => update(i, e.target.value, action)}
+            style={{ flex: '1 1 220px', minWidth: 180, fontSize: '0.8125rem' }}
+            aria-label="Entity group"
+          />
+          <SearchableSelect
+            value={action || 'mask'}
+            onChange={v => update(i, group, v)}
+            options={ACTION_OPTIONS}
+            placeholder="Action..."
+            style={{ flex: '1 1 240px', minWidth: 220 }}
+          />
+          <button type="button" className="btn btn-secondary btn-sm"
+            onClick={() => remove(i)}
+            style={{ padding: '2px 8px', fontSize: '0.75rem' }}
+            aria-label="Remove entity action">
+            <i className="fas fa-times" />
+          </button>
+        </div>
+      ))}
+
+      <button type="button" className="btn btn-secondary btn-sm" onClick={add}
+        style={{ alignSelf: 'flex-start', fontSize: '0.75rem' }}>
+        <i className="fas fa-plus" /> Add entity action
+      </button>
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/components/ModelMultiSelect.jsx b/core/http/react-ui/src/components/ModelMultiSelect.jsx
new file mode 100644
index 000000000000..5bf25ea0e00c
--- /dev/null
+++ b/core/http/react-ui/src/components/ModelMultiSelect.jsx
@@ -0,0 +1,62 @@
+import SearchableModelSelect from './SearchableModelSelect'
+
+// Editor for a list of model names (value is []string). Selected models render
+// as compact removable chips; a single capability-filtered, commit-only picker
+// adds new ones. Used for pii.detectors / the instance-wide default detector,
+// where every entry must be a token_classify model. Already-selected models are
+// guarded against so each appears at most once.
+//
+// The picker is commit-only on purpose: typing a partial query must never be
+// treated as a chosen model (otherwise each keystroke would add a bogus entry),
+// and selecting one input box per detector wastes vertical space.
+export default function ModelMultiSelect({ value, onChange, capability, placeholder }) {
+  const items = Array.isArray(value) ? value : []
+
+  const remove = (index) => onChange(items.filter((_, i) => i !== index))
+  const add = (v) => {
+    if (!v || items.includes(v)) return
+    onChange([...items, v])
+  }
+
+  return (
+    <div style={{ display: 'flex', flexDirection: 'column', gap: 6, width: '100%' }}>
+      {items.length === 0 ? (
+        <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+          No detectors — PII is enabled but nothing scans requests. Add a token-classification
+          (NER) model below; its <code>pii_detection</code> block supplies the policy.
+        </div>
+      ) : (
+        <div style={{ display: 'flex', flexWrap: 'wrap', gap: 6 }}>
+          {items.map((name, i) => (
+            <span key={i} style={{
+              display: 'inline-flex', alignItems: 'center', gap: 6,
+              padding: '2px 4px 2px 10px', fontSize: '0.8125rem',
+              fontFamily: 'var(--font-mono)', background: 'var(--color-bg-tertiary)',
+              borderRadius: 'var(--radius-md)',
+            }}>
+              {name}
+              <button type="button" className="btn btn-secondary btn-sm"
+                onClick={() => remove(i)}
+                style={{ padding: '0 6px', fontSize: '0.75rem', lineHeight: 1.6 }}
+                aria-label={`Remove ${name}`}>
+                <i className="fas fa-times" />
+              </button>
+            </span>
+          ))}
+        </div>
+      )}
+
+      {/* Size by width only. The container is a flex column, so a flex-basis
+          here would set the wrapper's HEIGHT — which the dropdown anchors to
+          (top: 100%), opening it far below the input. */}
+      <SearchableModelSelect
+        value=""
+        onChange={add}
+        commitOnly
+        capability={capability}
+        placeholder={placeholder || '+ Add detector model...'}
+        style={{ width: '100%', maxWidth: 360 }}
+      />
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/components/PIIPatternListEditor.jsx b/core/http/react-ui/src/components/PIIPatternListEditor.jsx
deleted file mode 100644
index 558f4cd6ab2d..000000000000
--- a/core/http/react-ui/src/components/PIIPatternListEditor.jsx
+++ /dev/null
@@ -1,120 +0,0 @@
-import { useState, useEffect, useMemo } from 'react'
-import { apiUrl } from '../utils/basePath'
-import SearchableSelect from './SearchableSelect'
-
-const ACTION_OPTIONS = [
-  { value: 'mask', label: 'Mask — replace with a [REDACTED:id] placeholder' },
-  { value: 'block', label: 'Block — reject the request (request side) / mask in stream' },
-  { value: 'route_local', label: 'Route local — keep text, force local-only routing' },
-]
-
-export default function PIIPatternListEditor({ value, onChange }) {
-  const items = Array.isArray(value) ? value : []
-
-  const [catalog, setCatalog] = useState([])
-  const [loadError, setLoadError] = useState(null)
-
-  useEffect(() => {
-    let cancelled = false
-    fetch(apiUrl('/api/pii/patterns'))
-      .then(r => r.ok ? r.json() : Promise.reject(new Error(`HTTP ${r.status}`)))
-      .then(data => { if (!cancelled) setCatalog(data?.patterns || []) })
-      .catch(err => { if (!cancelled) setLoadError(err.message) })
-    return () => { cancelled = true }
-  }, [])
-
-  const idOptions = useMemo(() =>
-    catalog.map(p => ({
-      value: p.id,
-      label: p.description ? `${p.id} — ${p.description}` : p.id,
-    })),
-    [catalog]
-  )
-
-  // Patterns already chosen — exclude from the "add row" select so each
-  // pattern only appears once per model.
-  const usedIDs = new Set(items.map(it => it?.id).filter(Boolean))
-  const availableForAdd = idOptions.filter(o => !usedIDs.has(o.value))
-
-  const update = (index, key, val) => {
-    const next = items.map((it, i) =>
-      i === index ? { ...it, [key]: val } : it
-    )
-    onChange(next)
-  }
-
-  const remove = (index) => {
-    onChange(items.filter((_, i) => i !== index))
-  }
-
-  const add = (id) => {
-    const cat = catalog.find(c => c.id === id)
-    onChange([...items, { id, action: cat?.action || 'mask' }])
-  }
-
-  return (
-    <div style={{ display: 'flex', flexDirection: 'column', gap: 6, width: '100%' }}>
-      {loadError && (
-        <div style={{ fontSize: '0.75rem', color: 'var(--color-error)' }}>
-          Could not load pattern catalog: {loadError}. You can still type IDs manually.
-        </div>
-      )}
-
-      {items.length === 0 && (
-        <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
-          No overrides — every pattern uses its global default action. Add a row below to
-          tighten or relax the action for a specific pattern on this model.
-        </div>
-      )}
-
-      {items.map((row, i) => {
-        const cat = catalog.find(c => c.id === row?.id)
-        const idLabel = cat?.description ? `${row.id} — ${cat.description}` : (row?.id || '')
-        // Show the chosen id even if the catalog hasn't loaded yet (or
-        // the YAML references an unknown pattern), so users can edit
-        // without losing context.
-        const idItems = [
-          ...(row?.id && !idOptions.some(o => o.value === row.id)
-            ? [{ value: row.id, label: idLabel }]
-            : []),
-          ...idOptions.filter(o => o.value === row?.id || !usedIDs.has(o.value)),
-        ]
-        return (
-          <div key={i} style={{ display: 'flex', gap: 6, alignItems: 'center', flexWrap: 'wrap' }}>
-            <SearchableSelect
-              value={row?.id || ''}
-              onChange={v => update(i, 'id', v)}
-              options={idItems}
-              placeholder="Pattern..."
-              style={{ flex: '1 1 220px', minWidth: 200 }}
-            />
-            <SearchableSelect
-              value={row?.action || 'mask'}
-              onChange={v => update(i, 'action', v)}
-              options={ACTION_OPTIONS}
-              placeholder="Action..."
-              style={{ flex: '1 1 240px', minWidth: 220 }}
-            />
-            <button type="button" className="btn btn-secondary btn-sm"
-              onClick={() => remove(i)}
-              style={{ padding: '2px 8px', fontSize: '0.75rem' }}>
-              <i className="fas fa-times" />
-            </button>
-          </div>
-        )
-      })}
-
-      {availableForAdd.length > 0 && (
-        <div style={{ display: 'flex', gap: 6, alignItems: 'center' }}>
-          <SearchableSelect
-            value=""
-            onChange={v => v && add(v)}
-            options={availableForAdd}
-            placeholder="+ Add pattern override..."
-            style={{ flex: '1 1 220px', minWidth: 200 }}
-          />
-        </div>
-      )}
-    </div>
-  )
-}
diff --git a/core/http/react-ui/src/components/PatternListEditor.jsx b/core/http/react-ui/src/components/PatternListEditor.jsx
new file mode 100644
index 000000000000..f5a82148a638
--- /dev/null
+++ b/core/http/react-ui/src/components/PatternListEditor.jsx
@@ -0,0 +1,96 @@
+import { useMemo } from 'react'
+import SearchableSelect from './SearchableSelect'
+
+// Editor for a pattern detector's pii_detection.patterns: a list of
+// operator-defined secret patterns. Value is an array of
+// { name, match, action?, min_len? }; this renders one row per pattern and
+// emits a fresh array on every change. Patterns use a restricted regex subset
+// validated server-side at save (an invalid pattern surfaces as the save
+// error), so no regex engine is shipped to the client.
+
+const ACTION_OPTIONS = [
+  { value: '', label: 'default (use Default Action)' },
+  { value: 'mask', label: 'mask — replace the span' },
+  { value: 'block', label: 'block — reject the request' },
+  { value: 'allow', label: 'allow — detect & log only' },
+]
+
+function emptyPattern() {
+  return { name: '', match: '', action: '', min_len: 0 }
+}
+
+export default function PatternListEditor({ value, onChange }) {
+  const rows = useMemo(() => (Array.isArray(value) ? value : []), [value])
+
+  const update = (index, patch) => {
+    onChange(rows.map((r, i) => (i === index ? { ...r, ...patch } : r)))
+  }
+  const remove = (index) => onChange(rows.filter((_, i) => i !== index))
+  const add = () => onChange([...rows, emptyPattern()])
+
+  return (
+    <div style={{ display: 'flex', flexDirection: 'column', gap: 8, width: '100%' }}>
+      <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+        Restricted regex: literals, <code>[…]</code> classes, <code>\w \d \s</code>, <code>?*+{'{m,n}'}</code>, anchors.
+        Each pattern must contain a fixed literal run of ≥3 characters (e.g. <code>sk-prefix-</code>);
+        <code>.</code> and capturing groups are not allowed. Matches report under the pattern name.
+      </div>
+
+      {rows.length === 0 && (
+        <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+          No custom patterns. Enable built-ins above, or add a pattern for an internal credential
+          format (e.g. <code>tok-[A-Za-z0-9]{'{32,64}'}</code>).
+        </div>
+      )}
+
+      {rows.map((r, i) => (
+        <div key={i} style={{ display: 'flex', gap: 6, alignItems: 'center', flexWrap: 'wrap' }}>
+          <input
+            className="input"
+            value={r.name || ''}
+            placeholder="Name (group), e.g. INTERNAL_TOKEN"
+            onChange={e => update(i, { name: e.target.value })}
+            style={{ flex: '1 1 180px', minWidth: 150, fontSize: '0.8125rem' }}
+            aria-label="Pattern name"
+          />
+          <input
+            className="input input-mono"
+            value={r.match || ''}
+            placeholder="match, e.g. tok-[A-Za-z0-9]{32,64}"
+            onChange={e => update(i, { match: e.target.value })}
+            style={{ flex: '2 1 240px', minWidth: 200, fontSize: '0.8125rem', fontFamily: 'var(--font-mono)' }}
+            aria-label="Pattern match"
+          />
+          <SearchableSelect
+            value={r.action || ''}
+            onChange={v => update(i, { action: v })}
+            options={ACTION_OPTIONS}
+            placeholder="Action..."
+            style={{ flex: '1 1 200px', minWidth: 180 }}
+          />
+          <input
+            className="input"
+            type="number"
+            min={0}
+            value={r.min_len || 0}
+            title="Minimum match length (0 = no floor)"
+            onChange={e => update(i, { min_len: parseInt(e.target.value, 10) || 0 })}
+            style={{ width: 80, fontSize: '0.8125rem' }}
+            aria-label="Minimum length"
+          />
+          <button type="button" className="btn btn-secondary btn-sm"
+            onClick={() => remove(i)}
+            style={{ padding: '2px 8px', fontSize: '0.75rem' }}
+            aria-label="Remove pattern">
+            <i className="fas fa-times" />
+          </button>
+        </div>
+      ))}
+
+      <button type="button" className="btn btn-secondary btn-sm" onClick={add}
+        style={{ alignSelf: 'flex-start', fontSize: '0.75rem' }}>
+        <i className="fas fa-plus" /> Add pattern
+      </button>
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/components/SearchableModelSelect.jsx b/core/http/react-ui/src/components/SearchableModelSelect.jsx
index 135f3d98d854..3d920fa4dfb4 100644
--- a/core/http/react-ui/src/components/SearchableModelSelect.jsx
+++ b/core/http/react-ui/src/components/SearchableModelSelect.jsx
@@ -1,7 +1,13 @@
 import { useState, useEffect, useRef, useCallback } from 'react'
 import { useModels } from '../hooks/useModels'
 
-export default function SearchableModelSelect({ value, onChange, capability, placeholder = 'Type or select a model...', style }) {
+// commitOnly: when true, onChange fires only on an explicit commit (selecting an
+// item, or Enter) — never on each keystroke. Use it where each onChange is a
+// final selection (e.g. the ModelMultiSelect "add" picker), so a partial typed
+// query isn't treated as a chosen value. After a commit the field is cleared,
+// matching the add-and-clear flow. Default false keeps the as-you-type
+// behaviour single-value editors rely on.
+export default function SearchableModelSelect({ value, onChange, capability, placeholder = 'Type or select a model...', style, commitOnly = false }) {
   const { models, loading } = useModels(capability)
   const [query, setQuery] = useState('')
   const [open, setOpen] = useState(false)
@@ -33,11 +39,13 @@ export default function SearchableModelSelect({ value, onChange, capability, pla
     : -1
 
   const commit = useCallback((val) => {
-    setQuery(val)
+    // In commitOnly mode the field is an "add" box — clear it after a pick so
+    // the next selection starts fresh; otherwise reflect the chosen value.
+    setQuery(commitOnly ? '' : val)
     onChange(val)
     setOpen(false)
     setFocusIndex(-1)
-  }, [onChange])
+  }, [onChange, commitOnly])
 
   const handleKeyDown = (e) => {
     if (!open && (e.key === 'ArrowDown' || e.key === 'ArrowUp')) {
@@ -133,8 +141,10 @@ export default function SearchableModelSelect({ value, onChange, capability, pla
           setQuery(e.target.value)
           setOpen(true)
           setFocusIndex(-1)
-          // Commit on every keystroke so the parent always has current value
-          onChange(e.target.value)
+          // Single-value editors want the parent updated as you type; an
+          // "add" picker (commitOnly) must wait for an explicit commit so a
+          // partial query is never mistaken for a chosen model.
+          if (!commitOnly) onChange(e.target.value)
         }}
         onFocus={() => setOpen(true)}
         onKeyDown={handleKeyDown}
diff --git a/core/http/react-ui/src/hooks/useConfigMetadata.js b/core/http/react-ui/src/hooks/useConfigMetadata.js
index 0a309949ebd8..8af449785628 100644
--- a/core/http/react-ui/src/hooks/useConfigMetadata.js
+++ b/core/http/react-ui/src/hooks/useConfigMetadata.js
@@ -1,6 +1,11 @@
 import { useState, useEffect } from 'react'
 import { modelsApi } from '../utils/api'
 
+// Stable empty references so consumers that memoize on `sections`/`fields`
+// (e.g. ModelEditor's leafPaths) don't see a new array every render while
+// the metadata request is still in flight — which would thrash their effects.
+const EMPTY = []
+
 export function useConfigMetadata() {
   const [metadata, setMetadata] = useState(null)
   const [loading, setLoading] = useState(true)
@@ -14,8 +19,8 @@ export function useConfigMetadata() {
   }, [])
 
   return {
-    sections: metadata?.sections || [],
-    fields: metadata?.fields || [],
+    sections: metadata?.sections || EMPTY,
+    fields: metadata?.fields || EMPTY,
     loading,
     error,
   }
diff --git a/core/http/react-ui/src/pages/AgentJobs.jsx b/core/http/react-ui/src/pages/AgentJobs.jsx
index 2e413b415ae3..8b9d9b3ebb71 100644
--- a/core/http/react-ui/src/pages/AgentJobs.jsx
+++ b/core/http/react-ui/src/pages/AgentJobs.jsx
@@ -1,6 +1,7 @@
 import { useState, useEffect, useCallback, useRef } from 'react'
-import { useNavigate, useOutletContext } from 'react-router-dom'
+import { useNavigate, useOutletContext, useLocation } from 'react-router-dom'
 import { agentJobsApi, modelsApi } from '../utils/api'
+import { fromState } from '../utils/editorNav'
 import { useModels } from '../hooks/useModels'
 import { useAuth } from '../context/AuthContext'
 import { useUserMap } from '../hooks/useUserMap'
@@ -13,6 +14,7 @@ import ConfirmDialog from '../components/ConfirmDialog'
 export default function AgentJobs() {
   const { addToast } = useOutletContext()
   const navigate = useNavigate()
+  const location = useLocation()
   const { models } = useModels()
   const { isAdmin, authEnabled, user } = useAuth()
   const userMap = useUserMap()
@@ -338,7 +340,7 @@ export default function AgentJobs() {
                       </td>
                       <td>
                         {task.model ? (
-                          <a onClick={() => navigate(`/app/model-editor/${encodeURIComponent(task.model)}`)} style={{ cursor: 'pointer', color: 'var(--color-primary)', fontSize: '0.8125rem' }}>
+                          <a onClick={() => navigate(`/app/model-editor/${encodeURIComponent(task.model)}`, { state: fromState(location, 'Agent Jobs') })} style={{ cursor: 'pointer', color: 'var(--color-primary)', fontSize: '0.8125rem' }}>
                             {task.model}
                           </a>
                         ) : '-'}
diff --git a/core/http/react-ui/src/pages/Chat.jsx b/core/http/react-ui/src/pages/Chat.jsx
index a638aa3a8a1e..8560f6dc0afe 100644
--- a/core/http/react-ui/src/pages/Chat.jsx
+++ b/core/http/react-ui/src/pages/Chat.jsx
@@ -1,6 +1,7 @@
 import { useState, useEffect, useRef, useCallback, useMemo } from 'react'
-import { useParams, useOutletContext, useNavigate } from 'react-router-dom'
+import { useParams, useOutletContext, useNavigate, useLocation } from 'react-router-dom'
 import { useTranslation } from 'react-i18next'
+import { fromState } from '../utils/editorNav'
 import { useChat } from '../hooks/useChat'
 import ModelSelector from '../components/ModelSelector'
 import { renderMarkdown, highlightAll } from '../utils/markdown'
@@ -282,6 +283,7 @@ export default function Chat() {
   const { model: urlModel } = useParams()
   const { addToast } = useOutletContext()
   const navigate = useNavigate()
+  const location = useLocation()
   const { t } = useTranslation('chat')
   const { isAdmin } = useAuth()
   const { operations } = useOperations()
@@ -901,7 +903,7 @@ export default function Chat() {
                   <button
                     type="button"
                     className="btn btn-secondary btn-sm"
-                    onClick={() => navigate(`/app/model-editor/${encodeURIComponent(activeChat.model)}`)}
+                    onClick={() => navigate(`/app/model-editor/${encodeURIComponent(activeChat.model)}`, { state: fromState(location, 'Chat') })}
                     title={t('header.editConfig')}
                   >
                     <i className="fas fa-pen-to-square" /> {t('header.editConfig')}
diff --git a/core/http/react-ui/src/pages/Manage.jsx b/core/http/react-ui/src/pages/Manage.jsx
index eb8da57592ee..11b1f898bb1a 100644
--- a/core/http/react-ui/src/pages/Manage.jsx
+++ b/core/http/react-ui/src/pages/Manage.jsx
@@ -1,6 +1,7 @@
 import { useState, useEffect, useCallback } from 'react'
-import { useNavigate, useOutletContext, useSearchParams } from 'react-router-dom'
+import { useNavigate, useOutletContext, useSearchParams, useLocation } from 'react-router-dom'
 import { useTranslation } from 'react-i18next'
+import { fromState } from '../utils/editorNav'
 import ResourceMonitor from '../components/ResourceMonitor'
 import ConfirmDialog from '../components/ConfirmDialog'
 import NodeDistributionChip from '../components/NodeDistributionChip'
@@ -120,6 +121,7 @@ function formatBackendVersion(metadata) {
 export default function Manage() {
   const { addToast } = useOutletContext()
   const navigate = useNavigate()
+  const location = useLocation()
   const { t } = useTranslation('admin')
   const [searchParams, setSearchParams] = useSearchParams()
   const initialTab = searchParams.get('tab') || localStorage.getItem('manage-tab') || 'models'
@@ -666,7 +668,7 @@ export default function Manage() {
                               onClick: () => handleTogglePinned(model.id, model.pinned),
                               disabled: pinningModels.has(model.id) || !!model.disabled },
                             { key: 'edit', icon: 'fa-pen-to-square', label: 'Edit configuration',
-                              onClick: () => navigate(`/app/model-editor/${encodeURIComponent(model.id)}`) },
+                              onClick: () => navigate(`/app/model-editor/${encodeURIComponent(model.id)}`, { state: fromState(location, 'Manage') }) },
                             { key: 'logs', icon: 'fa-terminal', label: 'Backend logs',
                               onClick: () => navigate(`/app/backend-logs/${encodeURIComponent(model.id)}`) },
                             { divider: true },
diff --git a/core/http/react-ui/src/pages/Middleware.jsx b/core/http/react-ui/src/pages/Middleware.jsx
index 4d51251bcf4e..7be29725b7d6 100644
--- a/core/http/react-ui/src/pages/Middleware.jsx
+++ b/core/http/react-ui/src/pages/Middleware.jsx
@@ -1,12 +1,14 @@
 import { useState, useEffect, useCallback, useRef, useMemo, Fragment } from 'react'
-import { useOutletContext, Link, useNavigate } from 'react-router-dom'
+import { useOutletContext, Link, useNavigate, useLocation, useSearchParams } from 'react-router-dom'
 import { apiUrl } from '../utils/basePath'
-import { settingsApi } from '../utils/api'
+import { fromState } from '../utils/editorNav'
+import { settingsApi, modelsApi } from '../utils/api'
 import LoadingSpinner from '../components/LoadingSpinner'
+import Toggle from '../components/Toggle'
 
 // Middleware admin page. Three tabs:
-//   - Filtering: PII pattern catalogue + per-model resolved state +
-//     pattern-action editor (PUT /api/pii/patterns/:id, transient).
+//   - Filtering: per-model resolved PII state + per-model detector list
+//     (detection policy lives on each detector model's pii_detection block).
 //   - Routing: placeholder until subsystem 2 lands. Renders the note
 //     from /api/router/status so admins see "not yet implemented" rather
 //     than an empty page.
@@ -26,13 +28,11 @@ const TABS = [
   { id: 'events', label: 'Events', icon: 'fa-list-ul' },
 ]
 
-const ACTIONS = ['mask', 'block', 'route_local']
-
 function actionBadge(action) {
   const colors = {
     mask: 'var(--color-primary)',
     block: 'var(--color-error)',
-    route_local: 'var(--color-warning)',
+    allow: 'var(--color-warning)',
   }
   return (
     <span style={{
@@ -75,8 +75,17 @@ export default function Middleware() {
   const [events, setEvents] = useState([])
   const [decisions, setDecisions] = useState([])
   const [loading, setLoading] = useState(true)
-  const [activeTab, setActiveTab] = useState('filtering')
-  const [pendingPattern, setPendingPattern] = useState(null) // id while a PUT is in flight
+  // The active tab lives in the URL (?tab=) so deep links and the model-editor
+  // Back button (which captures location.search) return to the same tab; a
+  // localStorage fallback restores it on a bare visit. Mirrors the Manage page.
+  const [searchParams, setSearchParams] = useSearchParams()
+  const initialTab = searchParams.get('tab') || localStorage.getItem('middleware-tab') || 'filtering'
+  const [activeTab, setActiveTab] = useState(TABS.some(t => t.id === initialTab) ? initialTab : 'filtering')
+  const selectTab = (id) => {
+    setActiveTab(id)
+    localStorage.setItem('middleware-tab', id)
+    setSearchParams({ tab: id })
+  }
 
   // silent=true on background polls: skips the loading spinner and
   // suppresses toast spam if the server is briefly unreachable.
@@ -118,51 +127,6 @@ export default function Middleware() {
     return () => clearInterval(refreshRef.current)
   }, [fetchAll])
 
-  const mutatePattern = async (patternID, body, successMsg) => {
-    setPendingPattern(patternID)
-    try {
-      const res = await fetch(apiUrl(`/api/pii/patterns/${encodeURIComponent(patternID)}`), {
-        method: 'PUT',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify(body),
-      })
-      if (!res.ok) {
-        const data = await res.json().catch(() => ({}))
-        throw new Error(data.error || `HTTP ${res.status}`)
-      }
-      addToast(successMsg, 'success')
-      await fetchAll()
-    } catch (err) {
-      addToast(`Failed to update pattern: ${err.message}`, 'error')
-    } finally {
-      setPendingPattern(null)
-    }
-  }
-
-  const setPatternAction = (patternID, action) =>
-    mutatePattern(patternID, { action }, `Pattern ${patternID}: action ${action} (transient — click "Save to disk" to persist)`)
-
-  const setPatternDisabled = (patternID, disabled) =>
-    mutatePattern(patternID, { disabled }, `Pattern ${patternID}: ${disabled ? 'disabled' : 'enabled'} (transient — click "Save to disk" to persist)`)
-
-  const [persisting, setPersisting] = useState(false)
-  const persistPatterns = async () => {
-    setPersisting(true)
-    try {
-      const res = await fetch(apiUrl('/api/pii/patterns/persist'), { method: 'POST' })
-      if (!res.ok) {
-        const data = await res.json().catch(() => ({}))
-        throw new Error(data.error || `HTTP ${res.status}`)
-      }
-      const data = await res.json().catch(() => ({}))
-      addToast(`Saved ${data.override_count ?? 0} pattern override(s) to runtime_settings.json`, 'success')
-    } catch (err) {
-      addToast(`Failed to persist: ${err.message}`, 'error')
-    } finally {
-      setPersisting(false)
-    }
-  }
-
   return (
     <div className="page page--wide">
       <div className="page-header" style={{ marginBottom: 'var(--spacing-sm)' }}>
@@ -178,7 +142,7 @@ export default function Middleware() {
           <button
             key={tab.id}
             className={`btn btn-sm ${activeTab === tab.id ? 'btn-primary' : 'btn-secondary'}`}
-            onClick={() => setActiveTab(tab.id)}
+            onClick={() => selectTab(tab.id)}
           >
             <i className={`fas ${tab.icon}`} style={{ marginRight: 4 }} />
             {tab.label}
@@ -195,14 +159,7 @@ export default function Middleware() {
           <LoadingSpinner size="lg" />
         </div>
       ) : activeTab === 'filtering' ? (
-        <FilteringTab
-          status={status}
-          pendingPattern={pendingPattern}
-          onSetAction={setPatternAction}
-          onSetDisabled={setPatternDisabled}
-          onPersist={persistPatterns}
-          persisting={persisting}
-        />
+        <FilteringTab status={status} addToast={addToast} onChanged={fetchAll} />
       ) : activeTab === 'routing' ? (
         <RoutingTab status={status} decisions={decisions} />
       ) : activeTab === 'proxy' ? (
@@ -214,23 +171,33 @@ export default function Middleware() {
   )
 }
 
-function FilteringTab({ status, pendingPattern, onSetAction, onSetDisabled, onPersist, persisting }) {
-  if (!status?.pii) return null
-  const pii = status.pii
+function FilteringTab({ status, addToast, onChanged }) {
+  const location = useLocation()
+  // Rows mid-save, so just that model's toggle disables while the PATCH
+  // round-trips (and the 5s background poll re-syncs the resolved state).
+  const [piiBusy, setPiiBusy] = useState(() => new Set())
 
-  if (!pii.enabled_globally) {
-    return (
-      <div className="empty-state">
-        <div className="empty-state-icon"><i className="fas fa-shield-slash" /></div>
-        <h2 className="empty-state-title">PII filtering disabled</h2>
-        <p className="empty-state-text">
-          The PII filter is disabled by <code>{pii.reason || '--disable-pii'}</code>.
-          Restart without that flag to enable it.
-        </p>
-      </div>
-    )
+  // Toggling the PII column writes an explicit pii.enabled to the model YAML
+  // via PATCH /api/models/config-json/:name (a deep-merge that preserves
+  // pii.detectors and every other field). This makes the resolved state
+  // explicit: a cloud-proxy model shown ON by backend default becomes
+  // pii.enabled:true; toggling it OFF writes pii.enabled:false.
+  const togglePII = async (name, on) => {
+    setPiiBusy(prev => new Set(prev).add(name))
+    try {
+      await modelsApi.patchConfig(name, { pii: { enabled: on } })
+      addToast?.(on ? `PII filtering enabled for ${name}` : `PII filtering disabled for ${name}`, 'success')
+      onChanged?.()
+    } catch (err) {
+      addToast?.(`Failed to update ${name}: ${err.message}`, 'error')
+    } finally {
+      setPiiBusy(prev => { const n = new Set(prev); n.delete(name); return n })
+    }
   }
 
+  if (!status?.pii) return null
+  const pii = status.pii
+
   return (
     <>
       {/* Default rule banner */}
@@ -238,90 +205,23 @@ function FilteringTab({ status, pendingPattern, onSetAction, onSetDisabled, onPe
         <div style={{ display: 'flex', alignItems: 'flex-start', gap: 'var(--spacing-sm)' }}>
           <i className="fas fa-info-circle" style={{ color: 'var(--color-text-muted)', marginTop: 2 }} />
           <div>
-            <div style={{ fontWeight: 600, marginBottom: 4 }}>Default policy</div>
+            <div style={{ fontWeight: 600, marginBottom: 4 }}>NER-based PII redaction</div>
             <div style={{ fontSize: '0.8125rem', color: 'var(--color-text-secondary)' }}>
-              PII redaction is per-model and OFF by default. Backends matching <code>{(pii.default_enabled_for_backends || []).join(', ')}</code> default to ON (cloud passthroughs). Override per model with <code>pii: {'{'} enabled: true {'}'}</code> in the model YAML.
+              Redaction is per-model and runs request-side. It is OFF by default; backends matching <code>{(pii.default_enabled_for_backends || []).join(', ')}</code> default to ON (cloud passthroughs). A model opts in with <code>pii: {'{'} enabled: true, detectors: [&hellip;] {'}'}</code>; each detector is a <code>token_classify</code> model whose <code>pii_detection</code> block defines the policy (which entities, what action, min score). Edit a detector model to change its policy.
             </div>
           </div>
         </div>
       </div>
 
-      {/* Patterns table */}
-      <div className="card" style={{ padding: 'var(--spacing-md)', marginBottom: 'var(--spacing-md)' }}>
-        <div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', marginBottom: 'var(--spacing-sm)' }}>
-          <span style={{ fontSize: '0.875rem', fontWeight: 600 }}>Active patterns</span>
-          <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)' }}>
-            <span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
-              Toggle / action edits are transient — click Save to disk to persist.
-            </span>
-            <button
-              className="btn btn-secondary btn-sm"
-              onClick={onPersist}
-              disabled={persisting}
-              style={{ fontSize: '0.75rem' }}
-            >
-              <i className={`fas ${persisting ? 'fa-spinner fa-spin' : 'fa-save'}`} /> Save to disk
-            </button>
-          </div>
-        </div>
-        <div className="table-container">
-          <table className="table">
-            <thead>
-              <tr>
-                <th style={{ width: 80 }}>Enabled</th>
-                <th style={{ width: 140 }}>Pattern</th>
-                <th>Description</th>
-                <th style={{ width: 110 }}>Action</th>
-                <th style={{ width: 250 }}>Change</th>
-              </tr>
-            </thead>
-            <tbody>
-              {pii.patterns.map(p => {
-                const enabled = !p.disabled
-                const muted = p.disabled
-                return (
-                <tr key={p.id} style={muted ? { opacity: 0.55 } : undefined}>
-                  <td>
-                    <input
-                      type="checkbox"
-                      checked={enabled}
-                      disabled={pendingPattern === p.id}
-                      onChange={e => onSetDisabled(p.id, !e.target.checked)}
-                      style={{ cursor: 'pointer' }}
-                      aria-label={`Enable ${p.id} pattern`}
-                    />
-                  </td>
-                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem', fontWeight: 600 }}>{p.id}</td>
-                  <td style={{ fontSize: '0.8125rem', color: 'var(--color-text-secondary)' }}>{p.description}</td>
-                  <td>{actionBadge(p.action)}</td>
-                  <td>
-                    <div style={{ display: 'flex', gap: 4 }}>
-                      {ACTIONS.map(a => (
-                        <button
-                          key={a}
-                          className={`btn btn-sm ${p.action === a ? 'btn-primary' : 'btn-secondary'}`}
-                          onClick={() => onSetAction(p.id, a)}
-                          disabled={pendingPattern === p.id || p.action === a || p.disabled}
-                          style={{ fontSize: '0.6875rem', padding: '2px 8px' }}
-                        >
-                          {a}
-                        </button>
-                      ))}
-                    </div>
-                  </td>
-                </tr>
-              )})}
-            </tbody>
-          </table>
-        </div>
-      </div>
+      {/* Detector models + instance-wide default policy (per-row toggle) */}
+      <DetectorModels pii={pii} addToast={addToast} onChanged={onChanged} />
 
       {/* Per-model resolved state */}
       <div className="card" style={{ padding: 'var(--spacing-md)' }}>
         <div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', marginBottom: 'var(--spacing-sm)' }}>
           <span style={{ fontSize: '0.875rem', fontWeight: 600 }}>Per-model state</span>
           <span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
-            Edit the model YAML to change these.
+            Toggle PII inline; edit a row for detectors and policy.
           </span>
         </div>
         <div className="table-container">
@@ -330,9 +230,9 @@ function FilteringTab({ status, pendingPattern, onSetAction, onSetDisabled, onPe
               <tr>
                 <th>Model</th>
                 <th style={{ width: 120 }}>Backend</th>
-                <th style={{ width: 80 }}>PII</th>
+                <th style={{ width: 120 }}>PII</th>
                 <th style={{ width: 110 }}>Source</th>
-                <th>Pattern overrides</th>
+                <th>Detectors</th>
                 <th style={{ width: 80 }}>Edit</th>
               </tr>
             </thead>
@@ -341,18 +241,35 @@ function FilteringTab({ status, pendingPattern, onSetAction, onSetDisabled, onPe
                 <tr key={m.name}>
                   <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem' }}>{m.name}</td>
                   <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>{m.backend || '—'}</td>
-                  <td>{enabledBadge(m.enabled)}</td>
+                  <td>
+                    <span style={{ display: 'inline-flex', alignItems: 'center', gap: 6 }}>
+                      <Toggle
+                        checked={!!m.enabled}
+                        disabled={piiBusy.has(m.name)}
+                        onChange={(v) => togglePII(m.name, v)}
+                      />
+                      {m.enabled && (!m.detectors || m.detectors.length === 0) && (
+                        <span
+                          title="Enabled but no detector resolved — nothing is scanned. Toggle a detector's Default on above, or add pii.detectors to the model."
+                          style={{ fontSize: '0.6875rem', fontWeight: 600, color: 'var(--color-warning)', whiteSpace: 'nowrap', cursor: 'help' }}
+                        >
+                          <i className="fas fa-triangle-exclamation" style={{ marginRight: 3 }} />no-op
+                        </span>
+                      )}
+                    </span>
+                  </td>
                   <td style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
                     {m.explicit ? 'YAML' : (m.default_for_backend ? 'backend default' : 'default off')}
                   </td>
                   <td style={{ fontSize: '0.75rem', fontFamily: 'var(--font-mono)' }}>
-                    {m.overrides && Object.keys(m.overrides).length > 0
-                      ? Object.entries(m.overrides).map(([k, v]) => `${k}=${v}`).join(', ')
+                    {m.detectors && m.detectors.length > 0
+                      ? <>{m.detectors.join(', ')}{m.detectors_from_default && <span style={{ color: 'var(--color-text-muted)', fontFamily: 'var(--font-sans)' }}> (default)</span>}</>
                       : <span style={{ color: 'var(--color-text-muted)' }}>—</span>}
                   </td>
                   <td>
                     <Link
                       to={`/app/model-editor/${encodeURIComponent(m.name)}`}
+                      state={fromState(location, 'Middleware')}
                       className="btn btn-secondary btn-sm"
                       style={{ fontSize: '0.6875rem', padding: '2px 8px' }}
                       title={`Edit ${m.name}.yaml`}
@@ -377,6 +294,147 @@ function FilteringTab({ status, pendingPattern, onSetAction, onSetDisabled, onPe
   )
 }
 
+// detectorTypeBadge labels a detector model by how it matches: a neural NER
+// token-classifier vs an in-process restricted-regex pattern matcher. `unknown`
+// is a default that names a model no longer loaded.
+function detectorTypeBadge(type) {
+  const map = {
+    ner: { label: 'NER', color: 'var(--color-primary)' },
+    pattern: { label: 'pattern', color: 'var(--color-data-2, var(--color-warning))' },
+    unknown: { label: 'not loaded', color: 'var(--color-text-muted)' },
+  }
+  const t = map[type] || map.unknown
+  return (
+    <span style={{
+      display: 'inline-block',
+      padding: '2px 8px',
+      fontSize: '0.6875rem',
+      fontWeight: 600,
+      borderRadius: 'var(--radius-sm)',
+      background: t.color,
+      color: 'white',
+      fontFamily: 'var(--font-mono)',
+      textTransform: 'uppercase',
+    }}>
+      {t.label}
+    </span>
+  )
+}
+
+// DetectorModels lists the token_classify "filter" models (NER + in-process
+// pattern matchers) and, via a per-row toggle, manages the instance-wide
+// default detector set (RuntimeSettings.pii_default_detectors, saved via POST
+// /api/settings). A detector toggled on is applied to any PII-enabled model
+// that names none of its own — chiefly cloud-proxy / MITM models, which are
+// PII-enabled by default but carry no detector list. Per-model `pii.detectors`
+// always overrides. This replaces the old model-multiselect chooser: the table
+// shows every available detector, so admins toggle defaults instead of retyping
+// names, and link straight to each detector's config to edit its policy.
+function DetectorModels({ pii, addToast, onChanged }) {
+  const navigate = useNavigate()
+  const location = useLocation()
+  const rows = useMemo(() => pii.detector_models || [], [pii.detector_models])
+  // Names currently in the default set; the toggle adds/removes against this.
+  const defaults = useMemo(() => pii.default_detectors || [], [pii.default_detectors])
+  // Track which rows are mid-save to disable just that toggle (optimistic).
+  const [busy, setBusy] = useState(() => new Set())
+
+  const toggleDefault = async (name, on) => {
+    const next = on
+      ? [...new Set([...defaults, name])]
+      : defaults.filter(d => d !== name)
+    setBusy(prev => new Set(prev).add(name))
+    try {
+      const body = await settingsApi.save({ pii_default_detectors: next })
+      if (body && body.success === false) throw new Error(body.error || 'unknown error')
+      addToast?.(on ? `${name} added to default detectors` : `${name} removed from default detectors`, 'success')
+      onChanged?.()
+    } catch (err) {
+      addToast?.(`Failed to save: ${err.message}`, 'error')
+    } finally {
+      setBusy(prev => { const n = new Set(prev); n.delete(name); return n })
+    }
+  }
+
+  return (
+    <div className="card" style={{ padding: 'var(--spacing-md)', marginBottom: 'var(--spacing-md)' }}>
+      <div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', marginBottom: 'var(--spacing-sm)', gap: 'var(--spacing-sm)', flexWrap: 'wrap' }}>
+        <span style={{ fontSize: '0.875rem', fontWeight: 600 }}>Detector models</span>
+        <button
+          className="btn btn-secondary btn-sm"
+          onClick={() => navigate('/app/model-editor?template=secret-filter', { state: fromState(location, 'Middleware') })}
+          title="Add a NER or pattern detector model"
+        >
+          <i className="fas fa-plus" /> Add detector model
+        </button>
+      </div>
+      <div style={{ fontSize: '0.8125rem', color: 'var(--color-text-secondary)', marginBottom: 'var(--spacing-sm)' }}>
+        These token_classify models do the scanning. Toggle <strong>Default</strong> on to apply a
+        detector to any PII-enabled model that names none of its own (chiefly cloud-proxy / MITM models).
+        Per-model <code>pii.detectors</code> always overrides. Edit a detector to change which entities it
+        flags and what action it takes.
+      </div>
+
+      <div className="table-container">
+        <table className="table">
+          <thead>
+            <tr>
+              <th>Detector model</th>
+              <th style={{ width: 110 }}>Type</th>
+              <th style={{ width: 120 }}>Backend</th>
+              <th style={{ width: 110 }}>Default</th>
+              <th style={{ width: 80 }}>Edit</th>
+            </tr>
+          </thead>
+          <tbody>
+            {rows.map(d => (
+              <tr key={d.name}>
+                <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem', fontWeight: 600 }}>
+                  {d.missing
+                    ? <span title="This default detector names a model that is not loaded.">{d.name}</span>
+                    : <Link to={`/app/model-editor/${encodeURIComponent(d.name)}`} state={fromState(location, 'Middleware')} title={`Edit ${d.name}.yaml`}>{d.name}</Link>}
+                </td>
+                <td>{detectorTypeBadge(d.type)}</td>
+                <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>{d.backend || '—'}</td>
+                <td>
+                  <Toggle
+                    checked={!!d.default}
+                    disabled={busy.has(d.name)}
+                    onChange={(v) => toggleDefault(d.name, v)}
+                  />
+                </td>
+                <td>
+                  {d.missing ? (
+                    <span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>—</span>
+                  ) : (
+                    <Link
+                      to={`/app/model-editor/${encodeURIComponent(d.name)}`}
+                      state={fromState(location, 'Middleware')}
+                      className="btn btn-secondary btn-sm"
+                      style={{ fontSize: '0.6875rem', padding: '2px 8px' }}
+                      title={`Edit ${d.name}.yaml`}
+                    >
+                      <i className="fas fa-pen-to-square" /> Edit
+                    </Link>
+                  )}
+                </td>
+              </tr>
+            ))}
+            {rows.length === 0 && (
+              <tr>
+                <td colSpan={5} style={{ textAlign: 'center', color: 'var(--color-text-muted)', padding: 'var(--spacing-md)' }}>
+                  No detector models loaded. Add one with the button above (a token_classify NER model
+                  or a built-in secret pattern model).
+                </td>
+              </tr>
+            )}
+          </tbody>
+        </table>
+      </div>
+    </div>
+  )
+}
+
 // decisionActiveSet rebuilds the Set of active labels from a
 // DecisionRecord's comma-joined `label` column. Used by both the
 // collapsed-row score suffix and the expanded-row bar rendering.
@@ -485,6 +543,7 @@ function DecisionDetail({ d }) {
 
 function RoutingTab({ status, decisions }) {
   const navigate = useNavigate()
+  const location = useLocation()
   const router = status?.router || { configured: false }
   const [expanded, setExpanded] = useState(() => new Set())
 
@@ -519,7 +578,7 @@ function RoutingTab({ status, decisions }) {
         <button
           className="btn btn-primary"
           style={{ marginTop: 'var(--spacing-md)' }}
-          onClick={() => navigate('/app/model-editor?template=router')}
+          onClick={() => navigate('/app/model-editor?template=router', { state: fromState(location, 'Middleware') })}
         >
           <i className="fas fa-plus" /> Create routing model
         </button>
@@ -539,7 +598,7 @@ function RoutingTab({ status, decisions }) {
             </span>
             <button
               className="btn btn-secondary btn-sm"
-              onClick={() => navigate('/app/model-editor?template=router')}
+              onClick={() => navigate('/app/model-editor?template=router', { state: fromState(location, 'Middleware') })}
               title="Open the model editor with the Routing Model template pre-selected"
             >
               <i className="fas fa-plus" /> Add routing model
@@ -560,7 +619,9 @@ function RoutingTab({ status, decisions }) {
             <tbody>
               {router.models.map(m => (
                 <tr key={m.name}>
-                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem', fontWeight: 600 }}>{m.name}</td>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem', fontWeight: 600 }}>
+                    <Link to={`/app/model-editor/${encodeURIComponent(m.name)}`} state={fromState(location, 'Middleware')} title="Edit this router model's config">{m.name}</Link>
+                  </td>
                   <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem' }}>{m.classifier}</td>
                   <td style={{ fontSize: '0.75rem' }}>
                     {(m.candidates || []).map((c, i) => (
@@ -657,6 +718,7 @@ function RoutingTab({ status, decisions }) {
 
 function ProxyTab({ status, addToast, onChanged }) {
   const navigate = useNavigate()
+  const location = useLocation()
   const mitm = status?.mitm
   const serverListen = mitm?.configured_addr || ''
 
@@ -722,7 +784,7 @@ function ProxyTab({ status, addToast, onChanged }) {
                 <code style={{ fontFamily: 'var(--font-mono)' }}>{h}</code>
                 {' claimed by: '}
                 {(conflicts[h] || []).map(name => (
-                  <Link key={name} to={`/app/model-editor/${encodeURIComponent(name)}`} style={{ marginRight: 6, fontFamily: 'var(--font-mono)' }}>
+                  <Link key={name} to={`/app/model-editor/${encodeURIComponent(name)}`} state={fromState(location, 'Middleware')} style={{ marginRight: 6, fontFamily: 'var(--font-mono)' }}>
                     {name}
                   </Link>
                 ))}
@@ -754,7 +816,7 @@ function ProxyTab({ status, addToast, onChanged }) {
             <ul style={{ margin: 0, paddingLeft: 20, fontFamily: 'var(--font-mono)' }}>
               {ownerEntries.map(([host, name]) => (
                 <li key={host}>
-                  {host} → <Link to={`/app/model-editor/${encodeURIComponent(name)}`}>{name}</Link>
+                  {host} → <Link to={`/app/model-editor/${encodeURIComponent(name)}`} state={fromState(location, 'Middleware')}>{name}</Link>
                 </li>
               ))}
             </ul>
@@ -784,7 +846,7 @@ function ProxyTab({ status, addToast, onChanged }) {
           <h2 style={{ fontSize: '1rem', fontWeight: 600, margin: 0 }}>MITM Models</h2>
           <button
             className="btn btn-secondary btn-sm"
-            onClick={() => navigate('/app/model-editor?template=mitm')}
+            onClick={() => navigate('/app/model-editor?template=mitm', { state: fromState(location, 'Middleware') })}
             title="Open the model editor with the MITM Intercept template pre-selected"
           >
             <i className="fas fa-plus" /> Add MITM model
@@ -815,6 +877,7 @@ function ProxyTab({ status, addToast, onChanged }) {
                   <td>
                     <Link
                       to={`/app/model-editor/${encodeURIComponent(m.name)}`}
+                      state={fromState(location, 'Middleware')}
                       className="btn btn-secondary btn-sm"
                       style={{ fontSize: '0.6875rem', padding: '2px 8px' }}
                     >
@@ -992,7 +1055,7 @@ function EventsTab({ events }) {
           <div className="empty-state-icon"><i className="fas fa-list-ul" /></div>
           <h2 className="empty-state-title">No events</h2>
           <p className="empty-state-text">
-            Events appear here when the PII filter matches a pattern, when the MITM proxy decides whether
+            Events appear here when a PII detector flags an entity, when the MITM proxy decides whether
             to intercept a hostname, or when an intercepted request finishes. Request bodies are never
             stored — use the API and backend traces for that.
           </p>
diff --git a/core/http/react-ui/src/pages/ModelEditor.jsx b/core/http/react-ui/src/pages/ModelEditor.jsx
index 9cb032f1b38c..5107b9613f17 100644
--- a/core/http/react-ui/src/pages/ModelEditor.jsx
+++ b/core/http/react-ui/src/pages/ModelEditor.jsx
@@ -1,5 +1,5 @@
 import { useState, useEffect, useRef, useMemo, useCallback } from 'react'
-import { useParams, useNavigate, useOutletContext, useSearchParams } from 'react-router-dom'
+import { useParams, useNavigate, useOutletContext, useSearchParams, useLocation } from 'react-router-dom'
 import YAML from 'yaml'
 import { modelsApi } from '../utils/api'
 import { apiUrl } from '../utils/basePath'
@@ -17,7 +17,8 @@ const SECTION_ICONS = {
   general: 'fa-cog', llm: 'fa-microchip', parameters: 'fa-sliders',
   templates: 'fa-file-code', functions: 'fa-wrench', reasoning: 'fa-brain',
   diffusers: 'fa-image', tts: 'fa-volume-up', pipeline: 'fa-code-branch',
-  grpc: 'fa-server', agent: 'fa-robot', mcp: 'fa-plug', other: 'fa-ellipsis-h',
+  grpc: 'fa-server', agent: 'fa-robot', mcp: 'fa-plug', router: 'fa-route', proxy: 'fa-cloud',
+  mitm: 'fa-user-secret', pii: 'fa-user-shield', other: 'fa-ellipsis-h',
 }
 
 const SECTION_COLORS = {
@@ -25,16 +26,27 @@ const SECTION_COLORS = {
   templates: 'var(--color-warning)', functions: 'var(--color-info, var(--color-primary))',
   reasoning: 'var(--color-accent)', diffusers: 'var(--color-warning)', tts: 'var(--color-success)',
   pipeline: 'var(--color-accent)', grpc: 'var(--color-text-muted)', agent: 'var(--color-primary)',
-  mcp: 'var(--color-accent)', other: 'var(--color-text-muted)',
+  mcp: 'var(--color-accent)', router: 'var(--color-accent)', proxy: 'var(--color-info, var(--color-primary))',
+  mitm: 'var(--color-warning)', pii: 'var(--color-error)', other: 'var(--color-text-muted)',
 }
 
-function flattenConfig(obj, prefix = '') {
+// flattenConfig turns a parsed YAML config into a flat { 'a.b.c': value }
+// map keyed by the same dotted paths the field registry uses. leafPaths is
+// the set of registered schema leaf paths: recursion STOPS at any of them so
+// a map-typed field (e.g. pii_detection.entity_actions, a {GROUP: action}
+// object) is stored whole at its own path. Without this guard a map's value
+// was scattered into `pii_detection.entity_actions.SSN` etc. — paths that
+// match no registered field — so the editor rendered neither the field nor
+// its values, hiding per-entity policy like SSN→block from the operator.
+function flattenConfig(obj, leafPaths, prefix = '') {
   const result = {}
   if (!obj || typeof obj !== 'object') return result
   for (const [key, val] of Object.entries(obj)) {
     const path = prefix ? `${prefix}.${key}` : key
-    if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
-      Object.assign(result, flattenConfig(val, path))
+    if (leafPaths && leafPaths.has(path)) {
+      result[path] = val
+    } else if (val !== null && typeof val === 'object' && !Array.isArray(val)) {
+      Object.assign(result, flattenConfig(val, leafPaths, path))
     } else {
       result[path] = val
     }
@@ -71,9 +83,23 @@ export default function ModelEditor() {
   const { name } = useParams()
   const [searchParams] = useSearchParams()
   const navigate = useNavigate()
+  const location = useLocation()
+  // Where the Back button returns to. Set by whichever page linked here (see
+  // utils/editorNav); falls back to the historical defaults for direct visits.
+  const backState = location.state && location.state.from ? location.state : null
   const { addToast } = useOutletContext()
   const { sections, fields, loading: metaLoading, error: metaError } = useConfigMetadata()
 
+  // Registered schema leaf paths. flattenConfig stops recursing at these so
+  // map-typed fields (e.g. pii_detection.entity_actions) bind as a whole
+  // object to their registered editor instead of vanishing into sub-paths.
+  const leafPaths = useMemo(() => new Set(fields.map(f => f.path)), [fields])
+
+  // The parsed (not-yet-flattened) config loaded from the server. Flattening
+  // is deferred to a separate effect keyed on leafPaths so the schema metadata
+  // can arrive after the config without a fetch race re-clobbering values.
+  const [loadedConfig, setLoadedConfig] = useState(null)
+
   const isCreateMode = !name
   const [selectedTemplate, setSelectedTemplate] = useState(null)
 
@@ -89,7 +115,6 @@ export default function ModelEditor() {
   const [activeSection, setActiveSection] = useState(null)
   const [tabSwitchWarning, setTabSwitchWarning] = useState(false)
 
-  const contentRef = useRef(null)
   const sectionRefs = useRef({})
 
   const vramEstimate = useVramEstimate({
@@ -116,7 +141,9 @@ export default function ModelEditor() {
     }
   }, [isCreateMode, searchParams, handleSelectTemplate])
 
-  // Load raw YAML config (edit mode only)
+  // Load raw YAML config (edit mode only). This only fetches + parses; the
+  // flatten-into-form-values step is the separate effect below so it can
+  // re-run when the schema metadata (leafPaths) resolves without re-fetching.
   useEffect(() => {
     if (!name) return
     modelsApi.getEditConfig(name)
@@ -124,26 +151,29 @@ export default function ModelEditor() {
         const raw = data?.config || ''
         setYamlText(raw)
         setSavedYamlText(raw)
-
-        // Parse YAML to get only the fields actually present in the file
         try {
-          const parsed = YAML.parse(raw)
-          const flat = flattenConfig(parsed || {})
-          const active = new Set(Object.keys(flat))
-          setValues(flat)
-          setInitialValues(structuredClone(flat))
-          setActiveFieldPaths(active)
+          setLoadedConfig(YAML.parse(raw) || {})
         } catch {
-          // If YAML parsing fails, start with empty state
-          setValues({})
-          setInitialValues({})
-          setActiveFieldPaths(new Set())
+          setLoadedConfig({})
         }
       })
       .catch(err => addToast(`Failed to load config: ${err.message}`, 'error'))
       .finally(() => setConfigLoading(false))
   }, [name, addToast])
 
+  // Flatten the loaded config into form values. Keyed on leafPaths so a late
+  // schema-metadata resolution re-flattens (keeping map fields whole) WITHOUT
+  // re-fetching — avoiding a two-fetch race that could clobber values. Only
+  // fires on (re)load: loadedConfig changes per model, leafPaths is stable
+  // once metadata is in, so this never stomps in-progress edits.
+  useEffect(() => {
+    if (loadedConfig === null) return
+    const flat = flattenConfig(loadedConfig, leafPaths)
+    setValues(flat)
+    setInitialValues(structuredClone(flat))
+    setActiveFieldPaths(new Set(Object.keys(flat)))
+  }, [loadedConfig, leafPaths])
+
   // Build field lookup
   const fieldsByPath = useMemo(() => {
     const map = {}
@@ -187,25 +217,29 @@ export default function ModelEditor() {
     }
   }, [activeSection, activeSections])
 
-  // Scroll tracking
+  // Scroll tracking — the editor used to have its own overflow:auto pane
+  // and listened to that container's scroll; the pane has been removed so
+  // small screens don't have the global footer always clipping into the
+  // form. Scrolling now happens at the window level, and the anchor for
+  // "which section is at the top" is a fixed viewport offset (the sticky
+  // sidebar sits roughly at the top of the editor area).
   useEffect(() => {
-    const container = contentRef.current
-    if (!container || tab !== 'interactive') return
+    if (tab !== 'interactive') return
     const onScroll = () => {
-      const containerTop = container.getBoundingClientRect().top
+      const anchorY = 80 // viewport px below which a section is "active"
       let closest = activeSections[0]?.id
       let closestDist = Infinity
       for (const s of activeSections) {
         const el = sectionRefs.current[s.id]
         if (el) {
-          const dist = Math.abs(el.getBoundingClientRect().top - containerTop - 8)
+          const dist = Math.abs(el.getBoundingClientRect().top - anchorY)
           if (dist < closestDist) { closestDist = dist; closest = s.id }
         }
       }
       if (closest) setActiveSection(closest)
     }
-    container.addEventListener('scroll', onScroll, { passive: true })
-    return () => container.removeEventListener('scroll', onScroll)
+    window.addEventListener('scroll', onScroll, { passive: true })
+    return () => window.removeEventListener('scroll', onScroll)
   }, [activeSections, configLoading, metaLoading, tab])
 
   const scrollTo = (id) => {
@@ -263,7 +297,9 @@ export default function ModelEditor() {
         if (!/^[a-zA-Z0-9_.-]+$/.test(modelName.trim())) { addToast('Invalid model name — use only letters, numbers, hyphens, underscores, and dots', 'error'); setSaving(false); return }
         await modelsApi.importConfig(JSON.stringify(config), 'application/json')
         addToast('Model created successfully', 'success')
-        navigate(`/app/model-editor/${encodeURIComponent(modelName.trim())}`)
+        // replace: the transient create URL shouldn't sit in history, so
+        // Back (browser or in-page) skips it and returns to the linking page.
+        navigate(`/app/model-editor/${encodeURIComponent(modelName.trim())}`, { replace: true, state: backState })
       } else {
         await modelsApi.patchConfig(name, config)
         setInitialValues(structuredClone(values))
@@ -293,9 +329,9 @@ export default function ModelEditor() {
         addToast('Model created successfully', 'success')
         try {
           const parsed = YAML.parse(yamlText)
-          if (parsed?.name) navigate(`/app/model-editor/${encodeURIComponent(parsed.name)}`)
-          else navigate('/app/manage')
-        } catch { navigate('/app/manage') }
+          if (parsed?.name) navigate(`/app/model-editor/${encodeURIComponent(parsed.name)}`, { replace: true, state: backState })
+          else navigate(backState ? backState.from : '/app/manage')
+        } catch { navigate(backState ? backState.from : '/app/manage') }
       } else {
         const response = await fetch(apiUrl(`/models/edit/${encodeURIComponent(name)}`), {
           method: 'POST',
@@ -312,7 +348,7 @@ export default function ModelEditor() {
         try {
           const parsed = YAML.parse(yamlText)
           parsedName = parsed?.name ?? null
-          const flat = flattenConfig(parsed || {})
+          const flat = flattenConfig(parsed || {}, leafPaths)
           setValues(flat)
           setInitialValues(structuredClone(flat))
           setActiveFieldPaths(new Set(Object.keys(flat)))
@@ -323,7 +359,7 @@ export default function ModelEditor() {
         // editor URL points at a name that no longer exists on the backend.
         // Redirect so refreshes and subsequent saves hit the new name.
         if (parsedName && parsedName !== name) {
-          navigate(`/app/model-editor/${encodeURIComponent(parsedName)}`, { replace: true })
+          navigate(`/app/model-editor/${encodeURIComponent(parsedName)}`, { replace: true, state: backState })
         }
       }
     } catch (err) {
@@ -405,9 +441,14 @@ export default function ModelEditor() {
         <div style={{ display: 'flex', gap: 'var(--spacing-sm)' }}>
           <button className="btn btn-secondary" onClick={() => {
             if (isCreateMode && selectedTemplate) { setSelectedTemplate(null); setValues({}); setActiveFieldPaths(new Set()) }
+            else if (backState) navigate(backState.from)
             else navigate(isCreateMode ? '/app/models' : '/app/manage')
           }}>
-            <i className="fas fa-arrow-left" /> Back
+            <i className="fas fa-arrow-left" /> Back to {
+              isCreateMode && selectedTemplate ? 'Templates'
+                : backState ? backState.fromLabel
+                  : isCreateMode ? 'Models' : 'Manage'
+            }
           </button>
           {!showTemplateSelector && tab === 'interactive' && (
             <button className={`btn ${isDirty ? 'btn-primary' : 'btn-secondary'}`} onClick={handleInteractiveSave} disabled={saving || !isDirty}>
@@ -543,12 +584,15 @@ export default function ModelEditor() {
             />
           </div>
 
-          {/* Two-column layout */}
-          <div style={{ display: 'flex', gap: 0, minHeight: 'calc(100vh - 340px)' }}>
-            {/* Sidebar */}
+          {/* Two-column layout. Both columns flow at body-scroll height —
+              no inner overflow:auto here, so the global footer ends up
+              below the content (like every other page) instead of pinned
+              to the viewport bottom, eating editing space on short screens. */}
+          <div style={{ display: 'flex', gap: 0 }}>
+            {/* Sidebar — sticks to the top of the viewport as the body scrolls. */}
             <nav style={{
               width: 180, flexShrink: 0, padding: '0 var(--spacing-sm)',
-              position: 'sticky', top: 0, alignSelf: 'flex-start',
+              position: 'sticky', top: 'var(--spacing-md)', alignSelf: 'flex-start',
             }}>
               {activeSections.map(s => (
                 <button
@@ -584,10 +628,8 @@ export default function ModelEditor() {
 
             {/* Content */}
             <div
-              ref={contentRef}
               style={{
-                flex: 1, overflow: 'auto', padding: '0 var(--spacing-lg) var(--spacing-xl) var(--spacing-md)',
-                maxHeight: 'calc(100vh - 340px)',
+                flex: 1, padding: '0 var(--spacing-lg) var(--spacing-xl) var(--spacing-md)',
               }}
             >
               {activeSections.length === 0 && (
diff --git a/core/http/react-ui/src/pages/Models.jsx b/core/http/react-ui/src/pages/Models.jsx
index 761f07142197..4c7b6044f55e 100644
--- a/core/http/react-ui/src/pages/Models.jsx
+++ b/core/http/react-ui/src/pages/Models.jsx
@@ -1,6 +1,7 @@
 import { useState, useCallback, useEffect } from 'react'
-import { useNavigate, useOutletContext } from 'react-router-dom'
+import { useNavigate, useOutletContext, useLocation } from 'react-router-dom'
 import { useTranslation } from 'react-i18next'
+import { fromState } from '../utils/editorNav'
 import { modelsApi } from '../utils/api'
 import { safeHref } from '../utils/url'
 import { useDebouncedCallback } from '../hooks/useDebounce'
@@ -35,11 +36,13 @@ const FILTERS = [
   { key: 'rerank', labelKey: 'filters.rerank', icon: 'fa-sort' },
   { key: 'detection', labelKey: 'filters.detection', icon: 'fa-bullseye' },
   { key: 'vad', labelKey: 'filters.vad', icon: 'fa-wave-square' },
+  { key: 'token_classify', labelKey: 'filters.ner', icon: 'fa-tags' },
 ]
 
 export default function Models() {
   const { addToast } = useOutletContext()
   const navigate = useNavigate()
+  const location = useLocation()
   const { t } = useTranslation('models')
   const { operations } = useOperations()
   const { resources } = useResources()
@@ -286,7 +289,7 @@ export default function Models() {
               </a>
             </div>
           </div>
-          <button className="btn btn-primary btn-sm" onClick={() => navigate('/app/model-editor')}>
+          <button className="btn btn-primary btn-sm" onClick={() => navigate('/app/model-editor', { state: fromState(location, 'Models') })}>
             <i className="fas fa-plus" /> {t('actions.addModel')}
           </button>
           <button className="btn btn-secondary btn-sm" onClick={() => navigate('/app/import-model')}>
diff --git a/core/http/react-ui/src/pages/Talk.jsx b/core/http/react-ui/src/pages/Talk.jsx
index cf92102ac4d7..3d8e6a53ef76 100644
--- a/core/http/react-ui/src/pages/Talk.jsx
+++ b/core/http/react-ui/src/pages/Talk.jsx
@@ -1,6 +1,7 @@
 import { useState, useRef, useEffect, useCallback, useMemo } from 'react'
-import { useOutletContext, useNavigate } from 'react-router-dom'
+import { useOutletContext, useNavigate, useLocation } from 'react-router-dom'
 import { realtimeApi } from '../utils/api'
+import { fromState } from '../utils/editorNav'
 import ModelSelector from '../components/ModelSelector'
 import ClientMCPDropdown from '../components/ClientMCPDropdown'
 import { useMCPClient } from '../hooks/useMCPClient'
@@ -20,6 +21,7 @@ const STATUS_STYLES = {
 export default function Talk() {
   const { addToast } = useOutletContext()
   const navigate = useNavigate()
+  const location = useLocation()
 
   // Pipeline models
   const [pipelineModels, setPipelineModels] = useState([])
@@ -630,7 +632,7 @@ export default function Talk() {
               disabled={isConnected}
               searchPlaceholder="Search pipeline models..."
             />
-            <button className="btn btn-secondary btn-sm" onClick={() => navigate('/app/model-editor?template=pipeline')}
+            <button className="btn btn-secondary btn-sm" onClick={() => navigate('/app/model-editor?template=pipeline', { state: fromState(location, 'Talk') })}
               style={{ marginTop: 'var(--spacing-xs)' }}>
               <i className="fas fa-plus" style={{ marginRight: 'var(--spacing-xs)' }} /> Create Pipeline Model
             </button>
@@ -710,7 +712,7 @@ export default function Talk() {
           )}
           {selectedModelInfo && !isConnected && (
             <div style={{ marginBottom: 'var(--spacing-md)' }}>
-              <button className="btn btn-secondary btn-sm" onClick={() => navigate(`/app/model-editor/${encodeURIComponent(selectedModel)}`)}>
+              <button className="btn btn-secondary btn-sm" onClick={() => navigate(`/app/model-editor/${encodeURIComponent(selectedModel)}`, { state: fromState(location, 'Talk') })}>
                 <i className="fas fa-pen-to-square" style={{ marginRight: 'var(--spacing-xs)' }} />
                 {selectedModelInfo.self_contained ? ' Edit Model Config' : ' Edit Pipeline'}
               </button>
diff --git a/core/http/react-ui/src/pages/Traces.jsx b/core/http/react-ui/src/pages/Traces.jsx
index 7bceaa299095..4d0efe229780 100644
--- a/core/http/react-ui/src/pages/Traces.jsx
+++ b/core/http/react-ui/src/pages/Traces.jsx
@@ -74,6 +74,9 @@ const TYPE_COLORS = {
   tokenize: { bg: 'var(--color-secondary-light)', color: 'var(--color-text-muted)' },
   detection: { bg: 'var(--color-info-light)', color: 'var(--color-data-8)' },
   model_load: { bg: 'var(--color-error-light)', color: 'var(--color-data-2)' },
+  vector_store: { bg: 'var(--color-accent-light)', color: 'var(--color-data-7)' },
+  token_classify: { bg: 'var(--color-info-light)', color: 'var(--color-data-3)' },
+  pattern_pii: { bg: 'var(--color-error-light)', color: 'var(--color-data-2)' },
 }
 
 function typeBadgeStyle(type) {
diff --git a/core/http/react-ui/src/utils/capabilities.js b/core/http/react-ui/src/utils/capabilities.js
index e7c81695bce0..95dd4bb7adaf 100644
--- a/core/http/react-ui/src/utils/capabilities.js
+++ b/core/http/react-ui/src/utils/capabilities.js
@@ -22,3 +22,4 @@ export const CAP_SPEAKER_RECOGNITION = 'FLAG_SPEAKER_RECOGNITION'
 export const CAP_AUDIO_TRANSFORM = 'FLAG_AUDIO_TRANSFORM'
 export const CAP_REALTIME_AUDIO = 'FLAG_REALTIME_AUDIO'
 export const CAP_SCORE = 'FLAG_SCORE'
+export const CAP_TOKEN_CLASSIFY = 'FLAG_TOKEN_CLASSIFY'
diff --git a/core/http/react-ui/src/utils/cmGoTemplate.js b/core/http/react-ui/src/utils/cmGoTemplate.js
new file mode 100644
index 000000000000..f3e567abe9da
--- /dev/null
+++ b/core/http/react-ui/src/utils/cmGoTemplate.js
@@ -0,0 +1,46 @@
+import { StreamLanguage } from '@codemirror/language'
+
+// Go text/template keywords valid inside an action `{{ ... }}`.
+const KEYWORDS = new Set([
+  'if', 'else', 'end', 'range', 'with', 'define', 'template',
+  'block', 'break', 'continue', 'nil', 'true', 'false',
+])
+
+// Minimal Go text/template highlighter: distinguishes literal text from
+// action bodies inside `{{ ... }}`. Highlighting only — it does not
+// validate template grammar.
+export const goTemplate = StreamLanguage.define({
+  startState() {
+    return { inAction: false }
+  },
+  token(stream, state) {
+    if (!state.inAction) {
+      if (stream.match('{{')) {
+        state.inAction = true
+        return 'meta'
+      }
+      while (!stream.eol()) {
+        if (stream.match('{{', false)) break
+        stream.next()
+      }
+      return null
+    }
+
+    if (stream.match('}}')) {
+      state.inAction = false
+      return 'meta'
+    }
+    if (stream.eatSpace()) return null
+    if (stream.match(/^-(?=\s)/) || stream.match(/^[|()]/)) return 'operator'
+    if (stream.match(/^"(?:[^"\\]|\\.)*"/)) return 'string'
+    if (stream.match(/^`[^`]*`/)) return 'string'
+    if (stream.match(/^\$[a-zA-Z0-9_]*/)) return 'variable-2'
+    if (stream.match(/^\.[a-zA-Z0-9_.]*/)) return 'property'
+    if (stream.match(/^[0-9]+(\.[0-9]+)?/)) return 'number'
+    if (stream.match(/^[a-zA-Z_][a-zA-Z0-9_]*/)) {
+      return KEYWORDS.has(stream.current()) ? 'keyword' : 'variable'
+    }
+    stream.next()
+    return null
+  },
+})
diff --git a/core/http/react-ui/src/utils/editorNav.js b/core/http/react-ui/src/utils/editorNav.js
new file mode 100644
index 000000000000..6afb9df48dd8
--- /dev/null
+++ b/core/http/react-ui/src/utils/editorNav.js
@@ -0,0 +1,15 @@
+// Navigation context for the Model Editor.
+//
+// Many pages link into the Model Editor (Models, Manage, Chat, Talk, Agent
+// Jobs, Middleware…). Its in-page Back button used to navigate to a hardcoded
+// route, so it always dumped you on the same page regardless of where you came
+// from. To fix that, every linker passes this object as react-router location
+// state; the editor reads it and returns you to the exact page that linked
+// here, labelled "Back to <label>".
+//
+// `location` is the source page's useLocation() value, so `from` captures the
+// full path including any sub-route or query string — returning lands you
+// where you actually were, not just on the section root.
+export function fromState(location, label) {
+  return { from: location.pathname + location.search, fromLabel: label }
+}
diff --git a/core/http/react-ui/src/utils/modelTemplates.js b/core/http/react-ui/src/utils/modelTemplates.js
index a733127a4cd6..54d34aecc677 100644
--- a/core/http/react-ui/src/utils/modelTemplates.js
+++ b/core/http/react-ui/src/utils/modelTemplates.js
@@ -146,22 +146,38 @@ const MODEL_TEMPLATES = [
     id: 'mitm',
     label: 'MITM Intercept',
     icon: 'fa-shield-halved',
-    description: 'Bind a hostname to this config for the cloudproxy MITM listener. PII filtering and pattern overrides flow from this config when the host is intercepted.',
+    description: 'Bind a hostname to this config for the cloudproxy MITM listener. PII filtering (the NER detectors listed here) is applied to intercepted request bodies for the host.',
     // The mitm- name prefix is a convention, not a contract — the
     // dispatcher looks up by host, not name. Prefixing keeps the
     // config out of the way of callable model names so a chat client
     // accidentally requesting "anthropic" doesn't hit a backendless
     // intercept config.
     //
-    // pii.patterns is pre-seeded with an empty list so the override
-    // editor is visible by default — admins typically want to tighten
-    // a couple of pattern actions when intercepting a cloud provider.
-    // An empty list serializes out and the redactor ignores it.
+    // pii.detectors is pre-seeded empty so the detector picker is visible
+    // by default — admins point it at a token_classify model whose
+    // pii_detection block defines the policy.
     fields: {
       'name': 'mitm-anthropic',
       'mitm.hosts': ['api.anthropic.com'],
       'pii.enabled': true,
-      'pii.patterns': [],
+      'pii.detectors': [],
+    },
+  },
+  {
+    id: 'secret-filter',
+    label: 'Secret Pattern Detector',
+    icon: 'fa-key',
+    description: 'An in-process token_classify detector that flags high-entropy secrets (API keys, tokens) with bounded restricted-regex patterns — no backend, no GGUF, zero VRAM. Enable the built-in provider patterns below and/or add your own under PII Detection. Reference it from a model\'s pii.detectors, or toggle it on as a default detector on the Middleware page.',
+    fields: {
+      'name': 'secret-filter',
+      'backend': 'pattern',
+      'known_usecases': ['token_classify'],
+      'pii_detection.default_action': 'block',
+      'pii_detection.builtins': [
+        'anthropic_api_key', 'openai_api_key', 'github_token', 'github_pat',
+        'aws_access_key', 'google_api_key', 'slack_token', 'stripe_key',
+        'jwt', 'private_key_block',
+      ],
     },
   },
 ]
diff --git a/core/http/routes/anthropic.go b/core/http/routes/anthropic.go
index e68f88d5381f..288aa6f57b1a 100644
--- a/core/http/routes/anthropic.go
+++ b/core/http/routes/anthropic.go
@@ -22,8 +22,8 @@ import (
 
 func RegisterAnthropicRoutes(app *echo.Echo,
 	re *middleware.RequestExtractor,
-	application *application.Application) {
-
+	application *application.Application,
+) {
 	// Anthropic Messages API endpoint
 	var natsClient mcpTools.MCPNATSClient
 	if d := application.Distributed(); d != nil {
@@ -36,8 +36,6 @@ func RegisterAnthropicRoutes(app *echo.Echo,
 		application.TemplatesEvaluator(),
 		application.ApplicationConfig(),
 		natsClient,
-		application.PIIRedactor(),
-		application.PIIEvents(),
 	)
 
 	messagesMiddleware := []echo.MiddlewareFunc{
@@ -58,17 +56,18 @@ func RegisterAnthropicRoutes(app *echo.Echo,
 			middleware.AnthropicProbe,
 			router.SourceAnthropic,
 			middleware.ClassifierDeps{
-				Scorer:      application.Scorer,
-				Embedder:    application.Embedder,
-				VectorStore: application.VectorStore,
-				Reranker:    application.Reranker,
-				ModelLookup: application.ModelConfigLookup(),
-				Registry:    application.RouterClassifierRegistry(),
-				Evaluator:   application.TemplatesEvaluator(),
+				Scorer:       application.Scorer,
+				TokenCounter: application.TokenCounter,
+				Embedder:     application.Embedder,
+				VectorStore:  application.VectorStore,
+				Reranker:     application.Reranker,
+				ModelLookup:  application.ModelConfigLookup(),
+				Registry:     application.RouterClassifierRegistry(),
+				Evaluator:    application.TemplatesEvaluator(),
 			},
 		),
 		middleware.AdmissionControl(application.AdmissionLimiter(), application.PIIEvents()),
-		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.Anthropic(), application.FallbackUser()),
+		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.Anthropic(), application.FallbackUser(), pii.WithNERResolver(application.PIINERResolver()), pii.WithPolicyResolver(application.PIIPolicyResolver())),
 	}
 
 	// Main Anthropic endpoint
diff --git a/core/http/routes/middleware.go b/core/http/routes/middleware.go
index a5c5ca9cacfd..6d130863ad5e 100644
--- a/core/http/routes/middleware.go
+++ b/core/http/routes/middleware.go
@@ -135,13 +135,14 @@ func RegisterMiddlewareRoutes(e *echo.Echo, app *application.Application) {
 		app.ModelConfigLoader(),
 		app.ApplicationConfig(),
 		middleware.ClassifierDeps{
-			Scorer:      app.Scorer,
-			Embedder:    app.Embedder,
-			VectorStore: app.VectorStore,
-			Reranker:    app.Reranker,
-			ModelLookup: app.ModelConfigLookup(),
-			Registry:    app.RouterClassifierRegistry(),
-			Evaluator:   app.TemplatesEvaluator(),
+			Scorer:       app.Scorer,
+			TokenCounter: app.TokenCounter,
+			Embedder:     app.Embedder,
+			VectorStore:  app.VectorStore,
+			Reranker:     app.Reranker,
+			ModelLookup:  app.ModelConfigLookup(),
+			Registry:     app.RouterClassifierRegistry(),
+			Evaluator:    app.TemplatesEvaluator(),
 		},
 	)
 	e.POST("/api/router/decide", func(c echo.Context) error {
@@ -220,8 +221,8 @@ func buildRouterStatus(app *application.Application) map[string]any {
 	}
 
 	out := map[string]any{
-		"configured":          hasAny,
-		"models":              models,
+		"configured":            hasAny,
+		"models":                models,
 		"recent_decision_count": recentCount,
 		"available_classifiers": []string{router.ClassifierScore},
 	}
@@ -298,53 +299,85 @@ func buildAdmissionStatus(app *application.Application) map[string]any {
 }
 
 // buildPIIStatus builds the pii section of /api/middleware/status. It
-// reads the live redactor, walks every model config, and reports the
-// resolved enabled state plus any per-pattern overrides — that's what
-// the admin page renders side-by-side so the operator can see at a
-// glance which models are protected.
-//
-// Returns a sentinel "disabled" payload when the redactor is nil
-// (--disable-pii), letting the page show "filter switched off" rather
-// than a confusing empty state.
+// walks every model config and reports the resolved enabled state plus
+// the NER detector models each one references — that's what the admin
+// page renders so the operator can see at a glance which models are
+// protected and by which detectors. The detection policy itself
+// (entity→action, min score) lives on each detector model's
+// pii_detection block.
 func buildPIIStatus(app *application.Application) map[string]any {
-	redactor := app.PIIRedactor()
-	if redactor == nil {
-		return map[string]any{
-			"enabled_globally": false,
-			"reason":           "--disable-pii",
-			"patterns":         []any{},
-			"models":           []any{},
-		}
-	}
-
-	patterns := redactor.Patterns()
-	patternList := make([]map[string]any, 0, len(patterns))
-	for _, p := range patterns {
-		patternList = append(patternList, map[string]any{
-			"id":               p.ID,
-			"description":      p.Description,
-			"action":           string(p.Action),
-			"disabled":         p.Disabled,
-			"max_match_length": p.MaxMatchLength,
-		})
-	}
-
+	appCfg := app.ApplicationConfig()
 	models := []map[string]any{}
 	for _, cfg := range app.ModelConfigLoader().GetAllModelsConfigs() {
+		// Only list models PII filtering can actually apply to (reachable
+		// through a text-accepting endpoint with a PII adapter wired).
+		// Skips VAD/STT/embedding/image-only models and the token_classify
+		// detector models themselves, which are the filters, not consumers.
+		if !cfg.PIIFilterApplies() {
+			continue
+		}
+		explicit := cfg.PII.Enabled != nil
+		ownDetectors := cfg.PIIDetectors()
+		// Resolve through the shared policy so the table reflects the EFFECTIVE
+		// state, including the instance-wide default detector — what the
+		// request path actually does.
+		enabled, detectors := app.ResolvePIIPolicy(&cfg)
+
 		entry := map[string]any{
 			"name":      cfg.Name,
 			"backend":   cfg.Backend,
-			"enabled":   cfg.PIIIsEnabled(),
-			"overrides": cfg.PIIPatternOverrides(),
+			"enabled":   enabled,
+			"detectors": detectors,
+			"explicit":  explicit,
+			// Why is this on? backend default (cloud-proxy) vs an explicit YAML
+			// toggle. Helps admins understand the resolved state without
+			// reading source.
+			"default_for_backend": !explicit && cfg.Backend == "cloud-proxy",
+			// The detectors came from the global default, not this model's YAML.
+			"detectors_from_default": enabled && len(ownDetectors) == 0 && len(detectors) > 0,
 		}
-		// explicit-set tells the UI whether the resolved state came
-		// from the YAML or the backend-prefix default. Helps admins
-		// understand "why is this on?" without reading source.
-		entry["explicit"] = cfg.PII.Enabled != nil
-		entry["default_for_backend"] = cfg.Backend == "cloud-proxy"
 		models = append(models, entry)
 	}
 
+	// Detector models: the token_classify "filter" models themselves (NER and
+	// in-process pattern matchers), which PIIFilterApplies deliberately omits
+	// from the consumer list above. The Filtering tab renders these as a table
+	// with a per-row toggle marking membership in the instance-wide default
+	// detector set, so admins manage defaults without retyping model names.
+	defaultSet := map[string]bool{}
+	for _, d := range appCfg.PIIDefaultDetectors {
+		defaultSet[d] = true
+	}
+	detectorModels := []map[string]any{}
+	for _, cfg := range app.ModelConfigLoader().GetAllModelsConfigs() {
+		if !cfg.HasUsecases(config.FLAG_TOKEN_CLASSIFY) {
+			continue
+		}
+		typ := "ner"
+		if cfg.IsPatternDetector() {
+			typ = "pattern"
+		}
+		detectorModels = append(detectorModels, map[string]any{
+			"name":    cfg.Name,
+			"backend": cfg.Backend,
+			"type":    typ,
+			// Whether this detector is in the instance-wide default set.
+			"default": defaultSet[cfg.Name],
+		})
+		delete(defaultSet, cfg.Name)
+	}
+	// Surface any default detector that names a model that is no longer loaded
+	// (or lost the token_classify usecase) so the admin can still toggle it off.
+	for name := range defaultSet {
+		detectorModels = append(detectorModels, map[string]any{
+			"name":    name,
+			"backend": "",
+			"type":    "unknown",
+			"default": true,
+			"missing": true,
+		})
+	}
+
 	recentCount := 0
 	if app.PIIEvents() != nil {
 		if n, err := app.PIIEvents().Count(context.Background()); err == nil {
@@ -355,8 +388,10 @@ func buildPIIStatus(app *application.Application) map[string]any {
 	return map[string]any{
 		"enabled_globally":             true,
 		"default_enabled_for_backends": []string{"cloud-proxy"},
-		"patterns":                     patternList,
 		"models":                       models,
+		"detector_models":              detectorModels,
 		"recent_event_count":           recentCount,
+		// Instance-wide default policy (the Default PII policy editor).
+		"default_detectors": appCfg.PIIDefaultDetectors,
 	}
 }
diff --git a/core/http/routes/ollama.go b/core/http/routes/ollama.go
index 4145dd4f8b71..6e8d97612679 100644
--- a/core/http/routes/ollama.go
+++ b/core/http/routes/ollama.go
@@ -10,13 +10,15 @@ import (
 	"github.com/mudler/LocalAI/core/http/endpoints/ollama"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/core/services/routing/piiadapter"
 	"github.com/mudler/LocalAI/pkg/distributedhdr"
 )
 
 func RegisterOllamaRoutes(app *echo.Echo,
 	re *middleware.RequestExtractor,
-	application *application.Application) {
-
+	application *application.Application,
+) {
 	traceMiddleware := middleware.TraceMiddleware(application)
 	usageMiddleware := middleware.UsageMiddleware(application.StatsRecorder(), application.FallbackUser())
 	nodeHeaderMiddleware := middleware.ExposeNodeHeader(application.ApplicationConfig())
@@ -35,6 +37,7 @@ func RegisterOllamaRoutes(app *echo.Echo,
 		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
 		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OllamaChatRequest) }),
 		setOllamaChatRequestContext(application.ApplicationConfig()),
+		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.OllamaChat(), application.FallbackUser(), pii.WithNERResolver(application.PIINERResolver()), pii.WithPolicyResolver(application.PIIPolicyResolver())),
 	}
 	app.POST("/api/chat", chatHandler, chatMiddleware...)
 
@@ -52,6 +55,7 @@ func RegisterOllamaRoutes(app *echo.Echo,
 		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
 		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OllamaGenerateRequest) }),
 		setOllamaGenerateRequestContext(application.ApplicationConfig()),
+		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.OllamaGenerate(), application.FallbackUser(), pii.WithNERResolver(application.PIINERResolver()), pii.WithPolicyResolver(application.PIIPolicyResolver())),
 	}
 	app.POST("/api/generate", generateHandler, generateMiddleware...)
 
@@ -67,6 +71,7 @@ func RegisterOllamaRoutes(app *echo.Echo,
 		traceMiddleware,
 		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_EMBEDDINGS)),
 		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OllamaEmbedRequest) }),
+		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.OllamaEmbed(), application.FallbackUser(), pii.WithNERResolver(application.PIINERResolver()), pii.WithPolicyResolver(application.PIIPolicyResolver())),
 	}
 	app.POST("/api/embed", embedHandler, embedMiddleware...)
 	app.POST("/api/embeddings", embedHandler, embedMiddleware...)
diff --git a/core/http/routes/openai.go b/core/http/routes/openai.go
index 8a13935aefbe..5252edfddfb7 100644
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -16,7 +16,8 @@ import (
 
 func RegisterOpenAIRoutes(app *echo.Echo,
 	re *middleware.RequestExtractor,
-	application *application.Application) {
+	application *application.Application,
+) {
 	// openAI compatible API endpoint
 	traceMiddleware := middleware.TraceMiddleware(application)
 	usageMiddleware := middleware.UsageMiddleware(application.StatsRecorder(), application.FallbackUser())
@@ -42,7 +43,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 	}
 
 	// chat
-	chatHandler := openai.ChatEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig(), natsClient, application.LocalAIAssistant(), application.PIIRedactor(), application.PIIEvents())
+	chatHandler := openai.ChatEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig(), natsClient, application.LocalAIAssistant())
 	chatMiddleware := []echo.MiddlewareFunc{
 		nodeHeaderMiddleware,
 		usageMiddleware,
@@ -71,13 +72,14 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 			middleware.OpenAIProbe,
 			router.SourceChat,
 			middleware.ClassifierDeps{
-				Scorer:      application.Scorer,
-				Embedder:    application.Embedder,
-				VectorStore: application.VectorStore,
-				Reranker:    application.Reranker,
-				ModelLookup: application.ModelConfigLookup(),
-				Registry:    application.RouterClassifierRegistry(),
-				Evaluator:   application.TemplatesEvaluator(),
+				Scorer:       application.Scorer,
+				TokenCounter: application.TokenCounter,
+				Embedder:     application.Embedder,
+				VectorStore:  application.VectorStore,
+				Reranker:     application.Reranker,
+				ModelLookup:  application.ModelConfigLookup(),
+				Registry:     application.RouterClassifierRegistry(),
+				Evaluator:    application.TemplatesEvaluator(),
 			},
 		),
 		// Admission control runs after RouteModel so the SERVED
@@ -90,7 +92,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 		// configs honour the routed target (e.g., a router fans out to
 		// claude-strict; that model's pii block applies, not the
 		// router model's).
-		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.OpenAI(), application.FallbackUser()),
+		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.OpenAI(), application.FallbackUser(), pii.WithNERResolver(application.PIINERResolver()), pii.WithPolicyResolver(application.PIIPolicyResolver())),
 	}
 	app.POST("/v1/chat/completions", chatHandler, chatMiddleware...)
 	app.POST("/chat/completions", chatHandler, chatMiddleware...)
@@ -111,12 +113,13 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 				return next(c)
 			}
 		},
+		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.OpenAICompletion(), application.FallbackUser(), pii.WithNERResolver(application.PIINERResolver()), pii.WithPolicyResolver(application.PIIPolicyResolver())),
 	}
 	app.POST("/v1/edits", editHandler, editMiddleware...)
 	app.POST("/edits", editHandler, editMiddleware...)
 
 	// completion
-	completionHandler := openai.CompletionEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig(), application.PIIRedactor(), application.PIIEvents())
+	completionHandler := openai.CompletionEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig())
 	completionMiddleware := []echo.MiddlewareFunc{
 		nodeHeaderMiddleware,
 		usageMiddleware,
@@ -132,6 +135,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 				return next(c)
 			}
 		},
+		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.OpenAICompletion(), application.FallbackUser(), pii.WithNERResolver(application.PIINERResolver()), pii.WithPolicyResolver(application.PIIPolicyResolver())),
 	}
 	app.POST("/v1/completions", completionHandler, completionMiddleware...)
 	app.POST("/completions", completionHandler, completionMiddleware...)
@@ -154,6 +158,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 				return next(c)
 			}
 		},
+		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.OpenAICompletion(), application.FallbackUser(), pii.WithNERResolver(application.PIINERResolver()), pii.WithPolicyResolver(application.PIIPolicyResolver())),
 	}
 	app.POST("/v1/embeddings", embeddingHandler, embeddingMiddleware...)
 	app.POST("/embeddings", embeddingHandler, embeddingMiddleware...)
diff --git a/core/http/routes/pii.go b/core/http/routes/pii.go
index 8b8ec903e96b..c8f315586afc 100644
--- a/core/http/routes/pii.go
+++ b/core/http/routes/pii.go
@@ -6,58 +6,29 @@ import (
 
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/application"
-	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/auth"
-	"github.com/mudler/LocalAI/core/http/endpoints/localai"
 	"github.com/mudler/LocalAI/core/services/routing/pii"
 )
 
-// RegisterPIIRoutes wires the read-only routing-PII endpoints. They
-// surface (a) the active pattern set so admins can verify what is
-// being filtered, (b) the recent PIIEvent log so they can audit what
-// has been redacted, and (c) a dry-run "test" endpoint so an admin
-// can paste candidate text and see what the redactor would do without
-// sending a real request.
+// RegisterPIIRoutes wires the read-only PII audit endpoint. The
+// detection itself runs request-side from the chat middleware
+// (routes/openai.go) and the MITM input path, driven by per-model NER
+// detectors; this endpoint is observation-side only.
 //
-// The redactor itself runs from the chat middleware in routes/openai.go;
-// these endpoints are observation- and configuration-side only.
+// The legacy regex tier (pattern catalogue + per-pattern action editor
+// + dry-run/decide oracles) was removed — policy now lives on each
+// detector model's pii_detection block, so there is nothing global to
+// list or mutate here.
 func RegisterPIIRoutes(e *echo.Echo, app *application.Application) {
-	if app.PIIRedactor() == nil {
-		stub := func(c echo.Context) error {
+	if app.PIIEvents() == nil {
+		e.GET("/api/pii/events", func(c echo.Context) error {
 			return c.JSON(http.StatusServiceUnavailable, map[string]string{
-				"error": "PII filter is disabled (--disable-pii)",
+				"error": "PII subsystem unavailable",
 			})
-		}
-		e.GET("/api/pii/patterns", stub)
-		e.GET("/api/pii/events", stub)
-		e.POST("/api/pii/test", stub)
-		e.POST("/api/pii/decide", stub)
-		e.POST("/api/pii/patterns/persist", stub)
+		})
 		return
 	}
 
-	// GetPIIPatternsEndpoint godoc
-	// @Summary List the active PII patterns
-	// @Description Returns the configured pattern set with their actions. Available without auth.
-	// @Tags pii
-	// @Produce json
-	// @Success 200 {object} map[string]interface{}
-	// @Router /api/pii/patterns [get]
-	e.GET("/api/pii/patterns", func(c echo.Context) error {
-		patterns := app.PIIRedactor().Patterns()
-		out := make([]map[string]any, 0, len(patterns))
-		for _, p := range patterns {
-			out = append(out, map[string]any{
-				"id":               p.ID,
-				"description":      p.Description,
-				"action":           string(p.Action),
-				"disabled":         p.Disabled,
-				"max_match_length": p.MaxMatchLength,
-			})
-		}
-		return c.JSON(http.StatusOK, map[string]any{"patterns": out})
-	})
-
 	// GetPIIEventsEndpoint godoc
 	// @Summary List recent middleware events
 	// @Description The event log is shared between the PII filter and the MITM proxy: PII redactions, proxy_connect (intercept decisions), and proxy_traffic (per-request byte counts) all flow through the same store. Filter by kind to narrow the view. Admin-only when auth is on; available to the local user in single-user mode.
@@ -65,7 +36,7 @@ func RegisterPIIRoutes(e *echo.Echo, app *application.Application) {
 	// @Produce json
 	// @Param correlation_id query string false "Correlation ID join key"
 	// @Param user_id query string false "User id"
-	// @Param pattern_id query string false "Pattern id (e.g. email, ssn)"
+	// @Param pattern_id query string false "Detector group id (e.g. ner:EMAIL, pattern:ANTHROPIC_KEY)"
 	// @Param kind query string false "Event kind: pii | proxy_connect | proxy_traffic"
 	// @Param limit query int false "Max events" default(100)
 	// @Success 200 {object} map[string]interface{}
@@ -98,163 +69,4 @@ func RegisterPIIRoutes(e *echo.Echo, app *application.Application) {
 		}
 		return c.JSON(http.StatusOK, map[string]any{"events": events})
 	})
-
-	// PostPIITestEndpoint godoc
-	// @Summary Dry-run the PII redactor against text
-	// @Description Useful for admins tuning patterns. Returns the redacted text, matched spans, and whether the input would have been blocked.
-	// @Tags pii
-	// @Accept json
-	// @Produce json
-	// @Param body body map[string]string true "JSON {\"text\":\"...\"}"
-	// @Success 200 {object} map[string]interface{}
-	// @Router /api/pii/test [post]
-	e.POST("/api/pii/test", func(c echo.Context) error {
-		var body struct {
-			Text string `json:"text"`
-		}
-		if err := c.Bind(&body); err != nil {
-			return c.JSON(http.StatusBadRequest, map[string]string{"error": "invalid JSON"})
-		}
-		res := app.PIIRedactor().Redact(body.Text)
-		return c.JSON(http.StatusOK, map[string]any{
-			"redacted":   res.Redacted,
-			"spans":      res.Spans,
-			"blocked":    res.Blocked,
-			"local_only": res.LocalOnly,
-		})
-	})
-
-	// POST /api/pii/decide — programmatic PII decision oracle for
-	// external routers. Returns findings + suggested action without
-	// mutating the caller's request or recording an audit event.
-	// Production hot path — admin-only, matching /api/pii/events.
-	decideHandler := localai.PIIDecideEndpoint(app.PIIRedactor())
-	e.POST("/api/pii/decide", func(c echo.Context) error {
-		viewer := resolveUsageUser(c, app)
-		if viewer == nil {
-			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
-		}
-		if viewer.Role != auth.RoleAdmin {
-			return c.JSON(http.StatusForbidden, map[string]string{"error": "admin access required"})
-		}
-		return decideHandler(c)
-	})
-
-	// PutPIIPatternActionEndpoint godoc
-	// @Summary Change a pattern's action in-process
-	// @Description Mutates the named pattern's action (mask|block|route_local). Transient — restored to YAML defaults on restart. Admin-only.
-	// @Tags pii
-	// @Accept json
-	// @Produce json
-	// @Param id path string true "Pattern id"
-	// @Param body body map[string]string true "JSON {\"action\":\"mask|block|route_local\"}"
-	// @Success 200 {object} map[string]interface{}
-	// @Router /api/pii/patterns/{id} [put]
-	e.PUT("/api/pii/patterns/:id", func(c echo.Context) error {
-		viewer := resolveUsageUser(c, app)
-		if viewer == nil {
-			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
-		}
-		if viewer.Role != auth.RoleAdmin {
-			return c.JSON(http.StatusForbidden, map[string]string{"error": "admin access required"})
-		}
-
-		id := c.Param("id")
-		if id == "" {
-			return c.JSON(http.StatusBadRequest, map[string]string{"error": "pattern id is required"})
-		}
-		// Either field is optional. The body must set at least one;
-		// otherwise the call is a no-op and the client probably means
-		// to PUT something.
-		var body struct {
-			Action   *string `json:"action,omitempty"`
-			Disabled *bool   `json:"disabled,omitempty"`
-		}
-		if err := c.Bind(&body); err != nil {
-			return c.JSON(http.StatusBadRequest, map[string]string{"error": "invalid JSON"})
-		}
-		if body.Action == nil && body.Disabled == nil {
-			return c.JSON(http.StatusBadRequest, map[string]string{"error": "must specify action and/or disabled"})
-		}
-		if body.Action != nil {
-			if err := app.PIIRedactor().SetAction(id, pii.Action(*body.Action)); err != nil {
-				return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
-			}
-		}
-		if body.Disabled != nil {
-			if err := app.PIIRedactor().SetDisabled(id, *body.Disabled); err != nil {
-				return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
-			}
-		}
-		return c.JSON(http.StatusOK, map[string]any{
-			"id":        id,
-			"action":    body.Action,
-			"disabled":  body.Disabled,
-			"persisted": false,
-		})
-	})
-
-	// PostPIIPatternsPersistEndpoint godoc
-	// @Summary Persist current pattern overrides to disk
-	// @Description Snapshots the live redactor's per-pattern (action, disabled) state into runtime_settings.json so the next process start re-applies it. Admin-only. Pairs with PUT /api/pii/patterns/:id which only mutates in-process.
-	// @Tags pii
-	// @Produce json
-	// @Success 200 {object} map[string]interface{}
-	// @Router /api/pii/patterns/persist [post]
-	e.POST("/api/pii/patterns/persist", func(c echo.Context) error {
-		viewer := resolveUsageUser(c, app)
-		if viewer == nil {
-			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
-		}
-		if viewer.Role != auth.RoleAdmin {
-			return c.JSON(http.StatusForbidden, map[string]string{"error": "admin access required"})
-		}
-
-		appCfg := app.ApplicationConfig()
-		existing, err := appCfg.ReadPersistedSettings()
-		if err != nil {
-			return c.JSON(http.StatusInternalServerError, map[string]string{"error": "read settings: " + err.Error()})
-		}
-		// Only persist patterns whose live state differs from the YAML
-		// default — that way an operator can compare runtime_settings.json
-		// at a glance and see only the deltas they applied.
-		defaults, dErr := pii.LoadConfig(appCfg.PIIConfigPath)
-		if dErr != nil {
-			return c.JSON(http.StatusInternalServerError, map[string]string{"error": "reload defaults: " + dErr.Error()})
-		}
-		defaultByID := make(map[string]pii.Pattern, len(defaults))
-		for _, d := range defaults {
-			defaultByID[d.ID] = d
-		}
-		overrides := map[string]config.PIIPatternRuntimeOverride{}
-		for _, p := range app.PIIRedactor().Patterns() {
-			d, ok := defaultByID[p.ID]
-			ov := config.PIIPatternRuntimeOverride{}
-			changed := false
-			if !ok || p.Action != d.Action {
-				action := string(p.Action)
-				ov.Action = &action
-				changed = true
-			}
-			if !ok || p.Disabled != d.Disabled {
-				disabled := p.Disabled
-				ov.Disabled = &disabled
-				changed = true
-			}
-			if changed {
-				overrides[p.ID] = ov
-			}
-		}
-		existing.PIIPatternOverrides = &overrides
-		if err := appCfg.WritePersistedSettings(existing); err != nil {
-			return c.JSON(http.StatusInternalServerError, map[string]string{"error": "write settings: " + err.Error()})
-		}
-		// Mirror onto the live ApplicationConfig so a subsequent reload
-		// without a process restart sees the same map.
-		appCfg.PIIPatternOverrides = overrides
-		return c.JSON(http.StatusOK, map[string]any{
-			"persisted":          true,
-			"override_count":     len(overrides),
-		})
-	})
 }
diff --git a/core/http/routes/ui_api.go b/core/http/routes/ui_api.go
index 8696e2b22cca..f398d71cd168 100644
--- a/core/http/routes/ui_api.go
+++ b/core/http/routes/ui_api.go
@@ -56,6 +56,7 @@ var usecaseFilters = map[string]config.ModelConfigUsecase{
 	config.UsecaseAudioTransform:  config.FLAG_AUDIO_TRANSFORM,
 	config.UsecaseDiarization:     config.FLAG_DIARIZATION,
 	config.UsecaseRealtimeAudio:   config.FLAG_REALTIME_AUDIO,
+	config.UsecaseTokenClassify:   config.FLAG_TOKEN_CLASSIFY,
 }
 
 // extractHFRepo tries to find a HuggingFace repo ID from model overrides or URLs.
diff --git a/core/schema/localai.go b/core/schema/localai.go
index 8bb431e35e82..c7e1292fa2b8 100644
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -499,7 +499,7 @@ type RouterDecideResponse struct {
 // inspects the text and returns findings + a suggested action; it
 // does NOT mutate the input, record an audit event, or rewrite any
 // downstream request. The caller composes the decision with its own
-// policy (mask, block, route to local-only backends, allow).
+// policy (mask, block, or allow).
 type PIIDecideRequest struct {
 	// Text is the user-visible content to inspect. Required.
 	Text string `json:"text"`
@@ -507,19 +507,20 @@ type PIIDecideRequest struct {
 
 // PIIDecideResponse carries the redactor's findings.
 // SuggestedAction is derived from the action ordering used by the
-// internal redactor (block > route_local > mask > allow) so callers
-// don't need to replicate that logic.
+// internal redactor (block > mask > allow) so callers don't need to
+// replicate that logic.
 type PIIDecideResponse struct {
 	// Findings is one entry per matched span — pattern id, byte
 	// range, and audit-safe hash prefix (never the matched value).
 	Findings []PIIFinding `json:"findings"`
 	// SuggestedAction is the strongest action across all findings:
-	// "block", "route_local", "mask", or "allow" (no findings).
+	// "block", "mask", or "allow" (no findings, or all findings
+	// resolved to the allow action).
 	SuggestedAction string `json:"suggested_action"`
 	// RedactedPreview is the input with mask-action spans replaced
 	// by their placeholders. Identical to Text when no findings or
-	// when the strongest action is block/route_local (which don't
-	// rewrite content).
+	// when the strongest action is block/allow (which don't rewrite
+	// content).
 	RedactedPreview string `json:"redacted_preview"`
 }
 
diff --git a/core/services/cloudproxy/backend_forward.go b/core/services/cloudproxy/backend_forward.go
index 841f3c95a71a..76bae42ce073 100644
--- a/core/services/cloudproxy/backend_forward.go
+++ b/core/services/cloudproxy/backend_forward.go
@@ -10,8 +10,6 @@ import (
 	"github.com/labstack/echo/v4"
 	corebackend "github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/auth"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/core/trace"
 	pkggrpc "github.com/mudler/LocalAI/pkg/grpc"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
@@ -19,41 +17,14 @@ import (
 	"github.com/mudler/xlog"
 )
 
-// BuildStreamFilter constructs the per-request streaming PII filter
-// for a cloud-proxy forward. Returns nil when the request isn't
-// streaming, PII is disabled for this model, or no redactor is wired
-// up — callers pass the result through unchanged. correlationID is
-// caller-supplied because the OpenAI and Anthropic endpoints read it
-// from different headers.
-func BuildStreamFilter(c echo.Context, cfg *config.ModelConfig, isStream bool, piiRedactor *pii.Redactor, piiEvents pii.EventStore, correlationID string) *pii.StreamFilter {
-	if !isStream || piiRedactor == nil || !cfg.PIIIsEnabled() {
-		return nil
-	}
-	userID := ""
-	if u := auth.GetUser(c); u != nil {
-		userID = u.ID
-	}
-	var overrides map[string]pii.Action
-	if raw := cfg.PIIPatternOverrides(); len(raw) > 0 {
-		overrides = make(map[string]pii.Action, len(raw))
-		for ovid, action := range raw {
-			switch pii.Action(action) {
-			case pii.ActionMask, pii.ActionBlock, pii.ActionRouteLocal:
-				overrides[ovid] = pii.Action(action)
-			}
-		}
-	}
-	return pii.NewStreamFilter(piiRedactor, overrides, piiEvents, correlationID, userID)
-}
-
 // ForwardViaBackend loads the cloud-proxy gRPC backend, ships the
 // request via the Forward RPC, and pumps the response back to the
-// client through the SSE-aware PII pipeline.
+// client. PII redaction runs request-side (the NER middleware + MITM
+// input path); the response is forwarded unmodified.
 func ForwardViaBackend(
 	c echo.Context,
 	cfg *config.ModelConfig,
 	body []byte,
-	filter *pii.StreamFilter,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 ) (resultErr error) {
@@ -176,7 +147,7 @@ func ForwardViaBackend(
 		return passthroughError(c, statusCode, contentType, bodyReader)
 	}
 	if isStream {
-		return forwardStream(c, bodyReader, cfg.Proxy.Provider, filter)
+		return forwardStream(c, bodyReader)
 	}
 	return forwardBuffered(c, statusCode, contentType, bodyReader)
 }
diff --git a/core/services/cloudproxy/build_filter_test.go b/core/services/cloudproxy/build_filter_test.go
deleted file mode 100644
index c46d8a392d01..000000000000
--- a/core/services/cloudproxy/build_filter_test.go
+++ /dev/null
@@ -1,72 +0,0 @@
-package cloudproxy
-
-import (
-	"net/http/httptest"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("BuildStreamFilter", func() {
-	var (
-		c  echo.Context
-		cfg *config.ModelConfig
-	)
-
-	BeforeEach(func() {
-		e := echo.New()
-		req := httptest.NewRequest("POST", "/v1/chat/completions", nil)
-		rec := httptest.NewRecorder()
-		c = e.NewContext(req, rec)
-		piiOn := true
-		cfg = &config.ModelConfig{
-			Backend: "cloud-proxy",
-			PII:     config.PIIConfig{Enabled: &piiOn},
-		}
-	})
-
-	// Three guards must each independently force a nil return — proves
-	// the gate is a logical AND, not an order-dependent short-circuit
-	// that silently activates one branch.
-	It("returns nil when isStream is false", func() {
-		patterns, err := pii.Compile(pii.DefaultPatterns())
-		Expect(err).NotTo(HaveOccurred())
-		r := pii.NewRedactor(patterns)
-		Expect(BuildStreamFilter(c, cfg, false, r, nil, "corr-1")).To(BeNil())
-	})
-
-	It("returns nil when piiRedactor is nil", func() {
-		Expect(BuildStreamFilter(c, cfg, true, nil, nil, "corr-1")).To(BeNil())
-	})
-
-	It("returns nil when the model has PII disabled", func() {
-		piiOff := false
-		cfg.PII.Enabled = &piiOff
-		patterns, err := pii.Compile(pii.DefaultPatterns())
-		Expect(err).NotTo(HaveOccurred())
-		r := pii.NewRedactor(patterns)
-		Expect(BuildStreamFilter(c, cfg, true, r, nil, "corr-1")).To(BeNil())
-	})
-
-	It("returns a configured filter when all preconditions hold", func() {
-		patterns, err := pii.Compile(pii.DefaultPatterns())
-		Expect(err).NotTo(HaveOccurred())
-		r := pii.NewRedactor(patterns)
-		store := pii.NewMemoryEventStore(8)
-		filter := BuildStreamFilter(c, cfg, true, r, store, "corr-xyz")
-		Expect(filter).NotTo(BeNil())
-	})
-
-	// Empty correlationID is allowed — some entry points don't have one.
-	// The filter must still construct so the stream can flow.
-	It("constructs a filter even when correlationID is empty", func() {
-		patterns, err := pii.Compile(pii.DefaultPatterns())
-		Expect(err).NotTo(HaveOccurred())
-		r := pii.NewRedactor(patterns)
-		Expect(BuildStreamFilter(c, cfg, true, r, nil, "")).NotTo(BeNil())
-	})
-})
diff --git a/core/services/cloudproxy/mitm/handler.go b/core/services/cloudproxy/mitm/handler.go
index 83c12251b9c3..ac2887999b3a 100644
--- a/core/services/cloudproxy/mitm/handler.go
+++ b/core/services/cloudproxy/mitm/handler.go
@@ -16,7 +16,6 @@ import (
 	"golang.org/x/net/http2"
 
 	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/core/services/cloudproxy/ssewire"
 	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/core/services/routing/piiadapter"
 	"github.com/mudler/LocalAI/pkg/httpclient"
@@ -24,8 +23,14 @@ import (
 
 // PIIHandlerOptions configures NewPIIHandler.
 type PIIHandlerOptions struct {
-	// Redactor is the regex PII redactor. nil disables redaction.
-	Redactor *pii.Redactor
+	// DetectorsByHost maps an intercepted host (lower-cased) to the NER
+	// detector configs that should scan request bodies bound for it. The
+	// configs are resolved at listener-start from each host's owning
+	// model's pii.detectors + the detector models' pii_detection policy
+	// (a model-config edit needs a MITM restart, as hosts already do). A
+	// host absent from the map (or with an empty slice) is forwarded
+	// unredacted. Detector errors at request time fail closed.
+	DetectorsByHost map[string][]pii.NERConfig
 
 	// EventStore receives PIIEvent rows. nil discards events.
 	EventStore pii.EventStore
@@ -42,13 +47,6 @@ type PIIHandlerOptions struct {
 	// upstream URL. Identity by default; tests inject a httptest
 	// listener address.
 	DialHost func(host string) string
-
-	// HostsWithPIIDisabled lists destination hosts whose request
-	// bodies should NOT run through the redactor. TLS termination,
-	// upstream forwarding, and audit events still happen — only the
-	// regex pass is bypassed. Useful for telemetry/probe endpoints
-	// whose bodies aren't PII-shaped.
-	HostsWithPIIDisabled []string
 }
 
 func NewPIIHandler(opts PIIHandlerOptions) InterceptHandler {
@@ -76,16 +74,9 @@ func NewPIIHandler(opts PIIHandlerOptions) InterceptHandler {
 		dialHost = func(h string) string { return h }
 	}
 
-	patternAction := map[string]pii.Action{}
-	if opts.Redactor != nil {
-		for _, p := range opts.Redactor.Patterns() {
-			patternAction[p.ID] = p.Action
-		}
-	}
-
-	piiDisabled := make(map[string]bool, len(opts.HostsWithPIIDisabled))
-	for _, h := range opts.HostsWithPIIDisabled {
-		piiDisabled[strings.ToLower(strings.TrimSpace(h))] = true
+	detectorsByHost := make(map[string][]pii.NERConfig, len(opts.DetectorsByHost))
+	for h, cfgs := range opts.DetectorsByHost {
+		detectorsByHost[strings.ToLower(strings.TrimSpace(h))] = cfgs
 	}
 
 	d := &piiDispatcher{
@@ -96,26 +87,22 @@ func NewPIIHandler(opts PIIHandlerOptions) InterceptHandler {
 		// API keys such as Anthropic's x-api-key, which Go does NOT
 		// strip on cross-host redirects — to an unvetted host. Surface
 		// it as an error (handled as a 502) instead.
-		client:        httpclient.New(httpclient.WithTransport(transport)),
-		redactor:      opts.Redactor,
-		store:         opts.EventStore,
-		patternAction: patternAction,
-		corrHeader:    corrHeader,
-		dialHost:      dialHost,
-		piiDisabled:   piiDisabled,
+		client:          httpclient.New(httpclient.WithTransport(transport)),
+		detectorsByHost: detectorsByHost,
+		store:           opts.EventStore,
+		corrHeader:      corrHeader,
+		dialHost:        dialHost,
 	}
 	return d.serve
 }
 
 type piiDispatcher struct {
-	client        *http.Client
-	redactor      *pii.Redactor
-	store         pii.EventStore
-	patternAction map[string]pii.Action
-	corrHeader    string
-	dialHost      func(host string) string
-	piiDisabled   map[string]bool
-	eventSeq      atomic.Uint64
+	client          *http.Client
+	detectorsByHost map[string][]pii.NERConfig
+	store           pii.EventStore
+	corrHeader      string
+	dialHost        func(host string) string
+	eventSeq        atomic.Uint64
 }
 
 func (d *piiDispatcher) serve(w http.ResponseWriter, r *http.Request, host string) {
@@ -144,11 +131,17 @@ func (d *piiDispatcher) serve(w http.ResponseWriter, r *http.Request, host strin
 	}
 
 	shape := classifyRequestShape(host, r.URL.Path)
-	if d.redactor != nil && shape != shapeUnknown && !d.piiDisabled[strings.ToLower(host)] {
-		redacted, blocked, err := d.redactRequest(body, shape, correlationID)
+	cfgs := d.detectorsByHost[strings.ToLower(host)]
+	if len(cfgs) > 0 && shape != shapeUnknown {
+		redacted, blocked, err := d.redactRequest(r.Context(), body, shape, cfgs, correlationID)
 		switch {
 		case err != nil:
-			xlog.Debug("mitm: redact request failed; forwarding unchanged", "host", host, "path", r.URL.Path, "error", err)
+			// Fail closed: a detector outage must not silently forward the
+			// request unredacted — the operator configured this host's
+			// model with detectors precisely to catch this PII.
+			xlog.Error("mitm: NER redaction failed; blocking request (fail-closed)", "host", host, "path", r.URL.Path, "error", err)
+			writePIIBlocked(w, correlationID)
+			return
 		case blocked:
 			writePIIBlocked(w, correlationID)
 			return
@@ -185,12 +178,10 @@ func (d *piiDispatcher) serve(w http.ResponseWriter, r *http.Request, host strin
 	}
 	w.WriteHeader(resp.StatusCode)
 
+	// Response/output redaction is out of scope for now — the MITM proxy
+	// only scans request bodies (input). SSE responses pass through
+	// unmodified.
 	contentType := resp.Header.Get("Content-Type")
-	if shape != shapeUnknown && d.redactor != nil && isSSE(contentType) {
-		d.streamWithPII(w, resp.Body, shape, correlationID)
-		return
-	}
-
 	if isSSE(contentType) {
 		flusher, _ := w.(http.Flusher)
 		buf := make([]byte, 32*1024)
@@ -232,7 +223,7 @@ func classifyRequestShape(host, path string) requestShape {
 	return shapeUnknown
 }
 
-func (d *piiDispatcher) redactRequest(body []byte, shape requestShape, correlationID string) ([]byte, bool, error) {
+func (d *piiDispatcher) redactRequest(ctx context.Context, body []byte, shape requestShape, cfgs []pii.NERConfig, correlationID string) ([]byte, bool, error) {
 	var parsed any
 	var adapter pii.Adapter
 	switch shape {
@@ -265,7 +256,10 @@ func (d *piiDispatcher) redactRequest(body []byte, shape requestShape, correlati
 		if st.Text == "" {
 			continue
 		}
-		res := d.redactor.RedactWithOverrides(st.Text, nil)
+		res, err := pii.RedactNER(ctx, st.Text, cfgs)
+		if err != nil {
+			return nil, false, fmt.Errorf("ner detect: %w", err)
+		}
 		if len(res.Spans) == 0 {
 			continue
 		}
@@ -301,7 +295,7 @@ func (d *piiDispatcher) recordEvents(spans []pii.Span, correlationID string) {
 			ByteOffset:    span.Start,
 			Length:        span.End - span.Start,
 			HashPrefix:    span.HashPrefix,
-			Action:        d.patternAction[span.Pattern],
+			Action:        span.Action,
 			CreatedAt:     time.Now(),
 		}
 		if err := d.store.Record(context.Background(), ev); err != nil {
@@ -310,49 +304,6 @@ func (d *piiDispatcher) recordEvents(spans []pii.Span, correlationID string) {
 	}
 }
 
-func (d *piiDispatcher) streamWithPII(w http.ResponseWriter, src io.Reader, shape requestShape, correlationID string) {
-	flusher, _ := w.(http.Flusher)
-	filter := pii.NewStreamFilter(d.redactor, nil, d.store, correlationID, "")
-
-	provider := ssewire.OpenAI
-	if shape == shapeAnthropicMessages {
-		provider = ssewire.Anthropic
-	}
-
-	emit := func(s string) {
-		_, _ = w.Write([]byte(s))
-		if flusher != nil {
-			flusher.Flush()
-		}
-	}
-
-	scanner := ssewire.NewScanner(src)
-	for scanner.Scan() {
-		ev := scanner.Event()
-		if ssewire.IsTerminalMarker(ev.DataLine, provider) {
-			if residual := filter.Drain(); residual != "" {
-				emit(ssewire.SynthResidualEvent(provider, residual))
-			}
-			emit(ev.Raw)
-			continue
-		}
-		out := ev.Raw
-		if ev.DataLine != "" {
-			rewritten, drop := ssewire.RewritePayload(ev.DataLine, provider, filter)
-			if drop {
-				continue
-			}
-			if rewritten != ev.DataLine {
-				out = strings.Replace(ev.Raw, ev.DataLine, rewritten, 1)
-			}
-		}
-		emit(out)
-	}
-	if residual := filter.Drain(); residual != "" {
-		emit(ssewire.SynthResidualEvent(provider, residual))
-	}
-}
-
 func writePIIBlocked(w http.ResponseWriter, correlationID string) {
 	w.Header().Set("Content-Type", "application/json")
 	w.WriteHeader(http.StatusBadRequest)
diff --git a/core/services/cloudproxy/mitm/handler_test.go b/core/services/cloudproxy/mitm/handler_test.go
index d54d4957ac7b..a56e1588e049 100644
--- a/core/services/cloudproxy/mitm/handler_test.go
+++ b/core/services/cloudproxy/mitm/handler_test.go
@@ -19,34 +19,58 @@ import (
 	. "github.com/onsi/gomega"
 )
 
-// startPIITestRig is the same shape as startMITMTestRig but plugs
-// in the production PII handler instead of the passthrough fixture.
-// The "host" the client thinks it's reaching is forced to
-// api.anthropic.com so the request shape classifier matches.
+// substringDetector is a deterministic pii.NERDetector for tests: it
+// reports an entity for every occurrence of each configured substring,
+// with byte offsets into the scanned text. Lets the MITM tests drive
+// request redaction without a real token-classification backend.
+type substringDetector struct{ groups map[string]string } // substring -> entity group
+
+func (d substringDetector) Detect(_ context.Context, text string) ([]pii.NEREntity, error) {
+	var out []pii.NEREntity
+	for sub, group := range d.groups {
+		for idx := 0; ; {
+			i := strings.Index(text[idx:], sub)
+			if i < 0 {
+				break
+			}
+			start := idx + i
+			out = append(out, pii.NEREntity{Group: group, Start: start, End: start + len(sub), Score: 1})
+			idx = start + len(sub)
+		}
+	}
+	return out, nil
+}
+
+// testDetectorCfg flags emails (mask) and a known secret token (block).
+func testDetectorCfg() pii.NERConfig {
+	return pii.NERConfig{
+		Detector: substringDetector{groups: map[string]string{
+			"alice@example.com":                 "EMAIL",
+			"bob@example.org":                   "EMAIL",
+			"sk-abcdefghijklmnopqrstuvwxyz1234": "PASSWORD",
+		}},
+		EntityActions: map[string]pii.Action{"EMAIL": pii.ActionMask, "PASSWORD": pii.ActionBlock},
+	}
+}
+
+// startPIITestRig plugs the production PII handler into a CONNECT proxy,
+// with the upstream playing the role of api.anthropic.com. Request
+// bodies bound for api.anthropic.com run through the NER detector above.
 func startPIITestRig(upstream http.Handler) (*http.Client, string, *fakeStore, func()) {
-	// Upstream fake — plays the role of api.anthropic.com.
 	ts := httptest.NewTLSServer(upstream)
 	upstreamCertPool := x509.NewCertPool()
 	upstreamCertPool.AddCert(ts.Certificate())
 	upstreamURL, _ := url.Parse(ts.URL)
-
-	// Compiled patterns required for the redactor to actually fire
-	// (DefaultPatterns alone returns Pattern structs without regex).
-	patterns, err := pii.Compile(pii.DefaultPatterns())
-	ExpectWithOffset(1, err).NotTo(HaveOccurred())
-	redactor := pii.NewRedactor(patterns)
 	store := &fakeStore{}
 
 	ca, err := NewInMemoryCA()
 	ExpectWithOffset(1, err).NotTo(HaveOccurred())
 
-	// DialHost remaps the upstream dial target to the httptest
-	// fake while leaving the classifier-facing host
-	// ("api.anthropic.com") untouched. ServerName=example.com is
-	// what httptest.NewTLSServer issues its cert for.
 	upstreamHost := upstreamURL.Host
 	prodHandler := NewPIIHandler(PIIHandlerOptions{
-		Redactor:   redactor,
+		DetectorsByHost: map[string][]pii.NERConfig{
+			"api.anthropic.com": {testDetectorCfg()},
+		},
 		EventStore: store,
 		UpstreamTLS: &tls.Config{
 			RootCAs:    upstreamCertPool,
@@ -79,8 +103,6 @@ func startPIITestRig(upstream http.Handler) (*http.Client, string, *fakeStore, f
 		srv.Stop()
 		ts.Close()
 	}
-	// We point requests at api.anthropic.com so classifyRequestShape
-	// matches; the wrappedHandler retargets to the upstream fake.
 	return client, "https://api.anthropic.com", store, cleanup
 }
 
@@ -101,7 +123,7 @@ func (s *fakeStore) Close() error                         { return nil }
 func (s *fakeStore) recorded() int { return len(s.events) }
 
 var _ = Describe("PIIHandler", func() {
-	It("redacts request email", func() {
+	It("redacts request email via NER", func() {
 		var receivedBody []byte
 		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			receivedBody, _ = io.ReadAll(r.Body)
@@ -119,15 +141,11 @@ var _ = Describe("PIIHandler", func() {
 		Expect(resp.StatusCode).To(Equal(200))
 
 		Expect(string(receivedBody)).NotTo(ContainSubstring("alice@example.com"), "upstream received unredacted body")
-		Expect(string(receivedBody)).To(ContainSubstring("[REDACTED:email]"), "upstream did not see redaction marker")
+		Expect(string(receivedBody)).To(ContainSubstring("[REDACTED:ner:EMAIL]"), "upstream did not see redaction marker")
 		Expect(store.recorded()).NotTo(BeZero(), "no PIIEvent recorded for the email match")
 	})
 
 	It("refuses to follow an upstream redirect", func() {
-		// A 3xx from the upstream would otherwise be followed, replaying
-		// the request (and its provider API key, e.g. Anthropic's
-		// x-api-key which Go does NOT strip on cross-host redirects) to
-		// the Location host. The refused redirect surfaces as a 502.
 		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			http.Redirect(w, r, "https://evil.example.com/steal", http.StatusFound)
 		})
@@ -142,7 +160,7 @@ var _ = Describe("PIIHandler", func() {
 		Expect(resp.StatusCode).To(Equal(http.StatusBadGateway), "refused redirect must surface as 502, not be followed")
 	})
 
-	It("blocks api key in request", func() {
+	It("blocks a detected secret in the request", func() {
 		upstreamCalled := false
 		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			upstreamCalled = true
@@ -156,46 +174,13 @@ var _ = Describe("PIIHandler", func() {
 		resp, err := client.Post(base+"/v1/messages", "application/json", strings.NewReader(body))
 		Expect(err).NotTo(HaveOccurred(), "client.Post")
 		defer func() { _ = resp.Body.Close() }()
-		Expect(resp.StatusCode).To(Equal(400), "api_key_prefix has Block default")
+		Expect(resp.StatusCode).To(Equal(400), "PASSWORD entity action is block")
 		Expect(upstreamCalled).To(BeFalse(), "upstream was called despite block — proxy should short-circuit")
 		body2, _ := io.ReadAll(resp.Body)
 		Expect(string(body2)).To(ContainSubstring("pii_blocked"))
 	})
 
-	It("streaming redaction", func() {
-		// Anthropic-shape SSE; "alice@" + "example.com" splits the
-		// email across chunks so the StreamFilter has to buffer.
-		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			w.Header().Set("Content-Type", "text/event-stream")
-			w.WriteHeader(200)
-			flusher := w.(http.Flusher)
-			chunks := []string{
-				`{"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"contact me at alice@"}}`,
-				`{"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"example.com any time"}}`,
-				`{"type":"message_stop"}`,
-			}
-			for _, c := range chunks {
-				_, _ = fmt.Fprintf(w, "event: %s\ndata: %s\n\n", "content_block_delta", c)
-				flusher.Flush()
-			}
-		})
-
-		client, base, _, cleanup := startPIITestRig(upstream)
-		defer cleanup()
-
-		body := `{"model":"claude-3-5-sonnet","max_tokens":100,"stream":true,"messages":[{"role":"user","content":"hi"}]}`
-		resp, err := client.Post(base+"/v1/messages", "application/json", strings.NewReader(body))
-		Expect(err).NotTo(HaveOccurred(), "Post")
-		defer func() { _ = resp.Body.Close() }()
-		out, _ := io.ReadAll(resp.Body)
-		outStr := string(out)
-		Expect(outStr).NotTo(ContainSubstring("alice@example.com"), "email leaked through MITM stream")
-		Expect(outStr).To(ContainSubstring("[REDACTED:email]"), "redaction marker missing from MITM stream")
-	})
-
 	It("non-chat path passes through", func() {
-		// A path the classifier doesn't recognise (e.g. an OAuth
-		// callback) must forward the body verbatim, no PII parsing.
 		var receivedBody []byte
 		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			receivedBody, _ = io.ReadAll(r.Body)
@@ -216,14 +201,12 @@ var _ = Describe("PIIHandler", func() {
 
 var _ = Describe("redactRequest", func() {
 	It("handles anthropic shape", func() {
-		patterns, _ := pii.Compile(pii.DefaultPatterns())
-		r := pii.NewRedactor(patterns)
 		body := []byte(`{"model":"claude","max_tokens":10,"messages":[{"role":"user","content":"reach me at bob@example.org"}]}`)
 
-		d := &piiDispatcher{redactor: r, patternAction: map[string]pii.Action{}}
-		out, blocked, err := d.redactRequest(body, shapeAnthropicMessages, "corr-1")
+		d := &piiDispatcher{}
+		out, blocked, err := d.redactRequest(context.Background(), body, shapeAnthropicMessages, []pii.NERConfig{testDetectorCfg()}, "corr-1")
 		Expect(err).NotTo(HaveOccurred())
-		Expect(blocked).To(BeFalse(), "email is mask, not block — blocked should be false")
+		Expect(blocked).To(BeFalse(), "EMAIL is mask, not block — blocked should be false")
 		var parsed map[string]any
 		Expect(json.Unmarshal(out, &parsed)).To(Succeed())
 		msgs := parsed["messages"].([]any)
@@ -273,9 +256,6 @@ var _ = Describe("Proxy events", func() {
 	})
 
 	It("tunneled host emits connect event only", func() {
-		// A non-allowlisted CONNECT must record a proxy_connect with
-		// Intercepted=false and NOT a proxy_traffic event (tunneled
-		// bytes never reach the dispatcher).
 		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			_, _ = fmt.Fprint(w, "passthrough")
 		})
diff --git a/core/services/cloudproxy/proxy.go b/core/services/cloudproxy/proxy.go
index 879f353a35b0..6819b28e843b 100644
--- a/core/services/cloudproxy/proxy.go
+++ b/core/services/cloudproxy/proxy.go
@@ -1,8 +1,9 @@
 // Package cloudproxy stitches the cloud-proxy gRPC backend to the
-// HTTP edge: model rewrite, body shaping, and SSE-aware PII filtering
-// on the response. The outbound HTTP request itself lives inside the
-// cloud-proxy backend binary (backend/go/cloud-proxy), not here — this
-// package is the core-side glue.
+// HTTP edge: model rewrite and body shaping. The outbound HTTP request
+// itself lives inside the cloud-proxy backend binary
+// (backend/go/cloud-proxy), not here — this package is the core-side
+// glue. PII redaction runs request-side (the NER middleware + MITM
+// input path); response/output is forwarded unmodified.
 package cloudproxy
 
 import (
@@ -10,11 +11,8 @@ import (
 	"fmt"
 	"io"
 	"net/http"
-	"strings"
 
 	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/services/cloudproxy/ssewire"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/xlog"
 )
 
@@ -61,65 +59,30 @@ func forwardBuffered(c echo.Context, statusCode int, contentType string, body io
 	return err
 }
 
-// forwardStream applies SSE-aware PII rewriting as the response flows
-// to the client. provider selects the dialect (openai vs anthropic);
-// it comes from cfg.Proxy.Provider on the cloud-proxy backend.
-func forwardStream(c echo.Context, body io.Reader, provider string, filter *pii.StreamFilter) error {
+// forwardStream relays the upstream SSE response to the client,
+// flushing per read so events arrive in real time. Response/output PII
+// redaction is out of scope for now, so the stream is forwarded
+// unmodified.
+func forwardStream(c echo.Context, body io.Reader) error {
 	c.Response().Header().Set("Content-Type", "text/event-stream")
 	c.Response().Header().Set("Cache-Control", "no-cache")
 	c.Response().Header().Set("Connection", "keep-alive")
 	c.Response().WriteHeader(http.StatusOK)
 
-	emit := func(line string) error {
-		_, err := fmt.Fprint(c.Response().Writer, line)
-		if err != nil {
-			return err
-		}
-		c.Response().Flush()
-		return nil
-	}
-
-	flushResidual := func() {
-		if filter == nil {
-			return
-		}
-		residual := filter.Drain()
-		if residual == "" {
-			return
-		}
-		if line := ssewire.SynthResidualEvent(ssewire.Provider(provider), residual); line != "" {
-			_ = emit(line)
-		}
-	}
-
-	prov := ssewire.Provider(provider)
-	scanner := ssewire.NewScanner(body)
-	for scanner.Scan() {
-		ev := scanner.Event()
-		if ssewire.IsTerminalMarker(ev.DataLine, prov) {
-			flushResidual()
-			_ = emit(ev.Raw)
-			continue
-		}
-		out := ev.Raw
-		if filter != nil && ev.DataLine != "" {
-			rewritten, drop := ssewire.RewritePayload(ev.DataLine, prov, filter)
-			if drop {
-				continue
-			}
-			if rewritten != ev.DataLine {
-				// strings.Replace with n=1 touches only the data line,
-				// preserving any "event:"/"id:" preamble.
-				out = strings.Replace(ev.Raw, ev.DataLine, rewritten, 1)
+	buf := make([]byte, 32*1024)
+	for {
+		n, rErr := body.Read(buf)
+		if n > 0 {
+			if _, wErr := c.Response().Writer.Write(buf[:n]); wErr != nil {
+				return nil
 			}
+			c.Response().Flush()
 		}
-		if err := emit(out); err != nil {
+		if rErr != nil {
+			if rErr != io.EOF {
+				xlog.Debug("cloudproxy: stream read error", "error", rErr)
+			}
 			return nil
 		}
 	}
-	if err := scanner.Err(); err != nil && err != io.EOF {
-		xlog.Debug("cloudproxy: stream read error", "error", err)
-	}
-	flushResidual()
-	return nil
 }
diff --git a/core/services/cloudproxy/ssewire/ssewire.go b/core/services/cloudproxy/ssewire/ssewire.go
deleted file mode 100644
index ed3cb862ba01..000000000000
--- a/core/services/cloudproxy/ssewire/ssewire.go
+++ /dev/null
@@ -1,218 +0,0 @@
-// Package ssewire holds the SSE-format helpers shared between
-// the request-shape cloud proxy (core/services/cloudproxy) and the
-// TLS-terminating MITM proxy (core/services/cloudproxy/mitm). Both
-// run a pii.StreamFilter over per-token text extracted from
-// provider-specific JSON chunks; this package owns the JSON shapes
-// so a future provider addition is one edit, not two.
-package ssewire
-
-import (
-	"bufio"
-	"encoding/json"
-	"io"
-	"strings"
-
-	"github.com/mudler/LocalAI/core/services/routing/pii"
-)
-
-// Provider is the upstream wire format an SSE stream conforms to.
-type Provider string
-
-const (
-	OpenAI    Provider = "openai"
-	Anthropic Provider = "anthropic"
-)
-
-// Event is one SSE event with its exact wire bytes preserved in
-// Raw (so unmodified events round-trip byte-for-byte) and the
-// extracted JSON payload from the data: line in DataLine.
-type Event struct {
-	Raw      string
-	DataLine string
-}
-
-// Scanner reads SSE events one at a time from an upstream body.
-type Scanner struct {
-	r   *bufio.Reader
-	ev  Event
-	err error
-}
-
-func NewScanner(r io.Reader) *Scanner {
-	return &Scanner{r: bufio.NewReaderSize(r, 64*1024)}
-}
-
-func (s *Scanner) Scan() bool {
-	var raw strings.Builder
-	var dataLine string
-	for {
-		line, err := s.r.ReadString('\n')
-		if line != "" {
-			raw.WriteString(line)
-			trimmed := strings.TrimRight(line, "\r\n")
-			if trimmed == "" {
-				if raw.Len() == len(line) {
-					raw.Reset()
-					continue
-				}
-				s.ev = Event{Raw: raw.String(), DataLine: dataLine}
-				return true
-			}
-			if strings.HasPrefix(trimmed, "data:") && dataLine == "" {
-				payload := strings.TrimPrefix(trimmed, "data:")
-				payload = strings.TrimPrefix(payload, " ")
-				dataLine = payload
-			}
-		}
-		if err != nil {
-			s.err = err
-			if raw.Len() > 0 {
-				s.ev = Event{Raw: raw.String(), DataLine: dataLine}
-				return true
-			}
-			return false
-		}
-	}
-}
-
-func (s *Scanner) Event() Event { return s.ev }
-func (s *Scanner) Err() error   { return s.err }
-
-// IsTerminalMarker reports whether the data line is the per-provider
-// end-of-stream sentinel. The streaming PII filter must drain its
-// residue before the caller forwards a terminal marker — clients
-// stop reading after it.
-func IsTerminalMarker(dataLine string, provider Provider) bool {
-	if dataLine == "" {
-		return false
-	}
-	if strings.TrimSpace(dataLine) == "[DONE]" {
-		return true
-	}
-	if provider == Anthropic {
-		var probe struct {
-			Type string `json:"type"`
-		}
-		if err := json.Unmarshal([]byte(dataLine), &probe); err == nil {
-			return probe.Type == "message_stop"
-		}
-	}
-	return false
-}
-
-// RewritePayload runs the data line's content-bearing field through
-// the streaming filter. drop=true tells the caller to suppress the
-// SSE event entirely (the filter buffered the whole token while
-// disambiguating a pattern boundary).
-func RewritePayload(dataLine string, provider Provider, filter *pii.StreamFilter) (rewritten string, drop bool) {
-	if strings.TrimSpace(dataLine) == "[DONE]" {
-		return dataLine, false
-	}
-	switch provider {
-	case Anthropic:
-		return rewriteAnthropic(dataLine, filter)
-	default:
-		return rewriteOpenAI(dataLine, filter)
-	}
-}
-
-func rewriteOpenAI(dataLine string, filter *pii.StreamFilter) (string, bool) {
-	var m map[string]any
-	if err := json.Unmarshal([]byte(dataLine), &m); err != nil {
-		return dataLine, false
-	}
-	choices, ok := m["choices"].([]any)
-	if !ok || len(choices) == 0 {
-		return dataLine, false
-	}
-	first, ok := choices[0].(map[string]any)
-	if !ok {
-		return dataLine, false
-	}
-	delta, ok := first["delta"].(map[string]any)
-	if !ok {
-		return dataLine, false
-	}
-	content, ok := delta["content"].(string)
-	if !ok || content == "" {
-		return dataLine, false
-	}
-	rewritten := filter.Push(content)
-	if rewritten == "" {
-		return "", true
-	}
-	if rewritten == content {
-		return dataLine, false
-	}
-	delta["content"] = rewritten
-	out, err := json.Marshal(m)
-	if err != nil {
-		return dataLine, false
-	}
-	return string(out), false
-}
-
-func rewriteAnthropic(dataLine string, filter *pii.StreamFilter) (string, bool) {
-	var m map[string]any
-	if err := json.Unmarshal([]byte(dataLine), &m); err != nil {
-		return dataLine, false
-	}
-	if t, _ := m["type"].(string); t != "content_block_delta" {
-		return dataLine, false
-	}
-	delta, ok := m["delta"].(map[string]any)
-	if !ok {
-		return dataLine, false
-	}
-	if dt, _ := delta["type"].(string); dt != "text_delta" {
-		return dataLine, false
-	}
-	text, ok := delta["text"].(string)
-	if !ok || text == "" {
-		return dataLine, false
-	}
-	rewritten := filter.Push(text)
-	if rewritten == "" {
-		return "", true
-	}
-	if rewritten == text {
-		return dataLine, false
-	}
-	delta["text"] = rewritten
-	out, err := json.Marshal(m)
-	if err != nil {
-		return dataLine, false
-	}
-	return string(out), false
-}
-
-// SynthResidualEvent builds a provider-shaped SSE event carrying
-// the streaming filter's drained tail so the response body remains
-// a valid event stream after the proxy splices in held-back text.
-func SynthResidualEvent(provider Provider, text string) string {
-	switch provider {
-	case Anthropic:
-		payload := map[string]any{
-			"type":  "content_block_delta",
-			"index": 0,
-			"delta": map[string]string{"type": "text_delta", "text": text},
-		}
-		b, err := json.Marshal(payload)
-		if err != nil {
-			return ""
-		}
-		return "event: content_block_delta\ndata: " + string(b) + "\n\n"
-	default:
-		payload := map[string]any{
-			"object": "chat.completion.chunk",
-			"choices": []map[string]any{
-				{"index": 0, "delta": map[string]string{"content": text}},
-			},
-		}
-		b, err := json.Marshal(payload)
-		if err != nil {
-			return ""
-		}
-		return "data: " + string(b) + "\n\n"
-	}
-}
diff --git a/core/services/cloudproxy/ssewire/ssewire_suite_test.go b/core/services/cloudproxy/ssewire/ssewire_suite_test.go
deleted file mode 100644
index 6925017f0171..000000000000
--- a/core/services/cloudproxy/ssewire/ssewire_suite_test.go
+++ /dev/null
@@ -1,13 +0,0 @@
-package ssewire
-
-import (
-	"testing"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestSsewire(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "ssewire test suite")
-}
diff --git a/core/services/cloudproxy/ssewire/ssewire_test.go b/core/services/cloudproxy/ssewire/ssewire_test.go
deleted file mode 100644
index 2750367fda48..000000000000
--- a/core/services/cloudproxy/ssewire/ssewire_test.go
+++ /dev/null
@@ -1,114 +0,0 @@
-package ssewire
-
-import (
-	"strings"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// Scanner contract: returns one Event per double-newline-terminated
-// SSE block, preserving the raw bytes (so unmodified events round-trip
-// exactly) and extracting the first data: payload as DataLine.
-
-var _ = Describe("Scanner", func() {
-	It("scans a basic event", func() {
-		in := "event: foo\ndata: hello\n\n"
-		s := NewScanner(strings.NewReader(in))
-		Expect(s.Scan()).To(BeTrue(), "Scan returned false on a well-formed event; err=%v", s.Err())
-		ev := s.Event()
-		Expect(ev.Raw).To(Equal(in))
-		Expect(ev.DataLine).To(Equal("hello"))
-		Expect(s.Scan()).To(BeFalse(), "Scan should return false after the only event")
-	})
-
-	It("handles CRLF", func() {
-		// Some upstreams emit CRLF instead of LF. The scanner trims
-		// trailing \r off the data line so DataLine carries the same
-		// bytes whichever line ending the producer chose.
-		in := "event: foo\r\ndata: hello\r\n\r\n"
-		s := NewScanner(strings.NewReader(in))
-		Expect(s.Scan()).To(BeTrue(), "Scan returned false on CRLF event; err=%v", s.Err())
-		Expect(s.Event().DataLine).To(Equal("hello"))
-	})
-
-	It("scans multiple events", func() {
-		in := "data: one\n\ndata: two\n\ndata: three\n\n"
-		s := NewScanner(strings.NewReader(in))
-		got := []string{}
-		for s.Scan() {
-			got = append(got, s.Event().DataLine)
-		}
-		Expect(got).To(Equal([]string{"one", "two", "three"}))
-	})
-
-	It("handles empty data payload", func() {
-		// "data:" with no payload is valid SSE — DataLine should be empty
-		// and Scan should still surface the event so callers can decide.
-		in := "data:\n\n"
-		s := NewScanner(strings.NewReader(in))
-		Expect(s.Scan()).To(BeTrue(), "Scan returned false on empty data payload; err=%v", s.Err())
-		Expect(s.Event().DataLine).To(Equal(""))
-	})
-
-	It("skips leading blank lines", func() {
-		// A producer that prints a blank "keep-alive" before the first
-		// real event must not produce a phantom event.
-		in := "\n\n\ndata: real\n\n"
-		s := NewScanner(strings.NewReader(in))
-		Expect(s.Scan()).To(BeTrue(), "Scan returned false; err=%v", s.Err())
-		Expect(s.Event().DataLine).To(Equal("real"))
-	})
-
-	It("handles mid-event EOF", func() {
-		// EOF mid-event still surfaces the partial event with whatever
-		// data was extracted — the StreamFilter+caller decides how to
-		// handle a truncated upstream rather than silently dropping it.
-		in := "data: half"
-		s := NewScanner(strings.NewReader(in))
-		Expect(s.Scan()).To(BeTrue(), "Scan returned false on partial event")
-		ev := s.Event()
-		Expect(ev.DataLine).To(Equal("half"))
-		Expect(s.Scan()).To(BeFalse(), "Scan should not surface a second event after EOF")
-	})
-})
-
-var _ = Describe("IsTerminalMarker", func() {
-	cases := []struct {
-		name     string
-		dataLine string
-		provider Provider
-		want     bool
-	}{
-		{"openai DONE", "[DONE]", OpenAI, true},
-		{"openai DONE with whitespace", "  [DONE]  ", OpenAI, true},
-		{"anthropic DONE also recognised", "[DONE]", Anthropic, true},
-		{"anthropic message_stop", `{"type":"message_stop"}`, Anthropic, true},
-		{"anthropic content_block_delta is not terminal", `{"type":"content_block_delta"}`, Anthropic, false},
-		{"openai chat.completion.chunk is not terminal", `{"object":"chat.completion.chunk"}`, OpenAI, false},
-		{"openai message_stop is not terminal (wrong provider)", `{"type":"message_stop"}`, OpenAI, false},
-		{"empty data", "", OpenAI, false},
-		{"non-json garbage", "garbage", Anthropic, false},
-	}
-	for _, c := range cases {
-		It(c.name, func() {
-			Expect(IsTerminalMarker(c.dataLine, c.provider)).To(Equal(c.want))
-		})
-	}
-})
-
-var _ = Describe("SynthResidualEvent", func() {
-	It("anthropic", func() {
-		got := SynthResidualEvent(Anthropic, "tail")
-		Expect(strings.HasPrefix(got, "event: content_block_delta\ndata:")).To(BeTrue(), "Anthropic residual event missing event/data lines: %q", got)
-		Expect(strings.HasSuffix(got, "\n\n")).To(BeTrue(), "Anthropic residual event missing trailing blank line: %q", got)
-		Expect(got).To(ContainSubstring(`"text":"tail"`))
-	})
-
-	It("openai", func() {
-		got := SynthResidualEvent(OpenAI, "tail")
-		Expect(strings.HasPrefix(got, "data: ")).To(BeTrue(), "OpenAI residual event missing data: prefix: %q", got)
-		Expect(strings.HasSuffix(got, "\n\n")).To(BeTrue(), "OpenAI residual event missing trailing blank line: %q", got)
-		Expect(got).To(ContainSubstring(`"content":"tail"`))
-	})
-})
diff --git a/core/services/modeladmin/config.go b/core/services/modeladmin/config.go
index e9ae341c864d..c01e2fb4cf6d 100644
--- a/core/services/modeladmin/config.go
+++ b/core/services/modeladmin/config.go
@@ -6,12 +6,13 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"reflect"
 	"strings"
 
-	"dario.cat/mergo"
 	"gopkg.in/yaml.v3"
 
 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/config/meta"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/utils"
@@ -114,9 +115,7 @@ func (s *ConfigService) PatchConfig(_ context.Context, name string, patch map[st
 	if existingMap == nil {
 		existingMap = map[string]any{}
 	}
-	if err := mergo.Merge(&existingMap, patch, mergo.WithOverride); err != nil {
-		return nil, fmt.Errorf("merge configs: %w", err)
-	}
+	patchMerge(existingMap, patch, mapLeafFieldPaths(), "")
 	yamlData, err := yaml.Marshal(existingMap)
 	if err != nil {
 		return nil, fmt.Errorf("marshal merged YAML: %w", err)
@@ -142,6 +141,55 @@ func (s *ConfigService) PatchConfig(_ context.Context, name string, patch map[st
 	return &updated, nil
 }
 
+// mapLeafFieldPaths returns the set of dotted config paths whose schema type is
+// a map that the editor edits as one complete value (e.g.
+// pii_detection.entity_actions, roles, engine_args). A PATCH must REPLACE these
+// wholesale rather than union them: the deep-merge only adds and overrides
+// keys, so a map entry the admin deleted in the editor would otherwise silently
+// survive. Derived from the config schema so it stays correct as map fields are
+// added. (UIType comes from reflection, independent of any registry override.)
+func mapLeafFieldPaths() map[string]struct{} {
+	md := meta.BuildConfigMetadata(reflect.TypeFor[config.ModelConfig]())
+	out := make(map[string]struct{})
+	for _, f := range md.Fields {
+		if f.UIType == "map" {
+			out[f.Path] = struct{}{}
+		}
+	}
+	return out
+}
+
+// patchMerge deep-merges src into dst with the same shape as the previous
+// mergo.WithOverride behaviour — scalars and slices replace; nested
+// struct-maps (e.g. pii_detection, parameters) recurse so unknown sibling keys
+// the editor doesn't model survive — EXCEPT that any path in mapLeaves is
+// replaced wholesale, and removed when the patch sets it empty, so deletions
+// inside a map field persist to disk.
+func patchMerge(dst, src map[string]any, mapLeaves map[string]struct{}, prefix string) {
+	for k, sv := range src {
+		path := k
+		if prefix != "" {
+			path = prefix + "." + k
+		}
+		if _, isLeaf := mapLeaves[path]; isLeaf {
+			if m, ok := sv.(map[string]any); ok && len(m) == 0 {
+				delete(dst, k) // emptied map field -> drop it from the YAML
+			} else {
+				dst[k] = sv
+			}
+			continue
+		}
+		// Recurse into struct-like nesting so dst-only sibling keys survive.
+		if sm, ok := sv.(map[string]any); ok {
+			if dm, ok2 := dst[k].(map[string]any); ok2 {
+				patchMerge(dm, sm, mapLeaves, path)
+				continue
+			}
+		}
+		dst[k] = sv
+	}
+}
+
 // EditYAML replaces the YAML for an installed model, with optional rename
 // support. ml may be nil; when set, EditYAML calls ml.ShutdownModel(oldName)
 // after a successful write so the next inference picks up the new config.
diff --git a/core/services/modeladmin/config_test.go b/core/services/modeladmin/config_test.go
index 2ccf68917fd4..d4157047d0d8 100644
--- a/core/services/modeladmin/config_test.go
+++ b/core/services/modeladmin/config_test.go
@@ -107,6 +107,64 @@ var _ = Describe("ConfigService", func() {
 			_, err := svc.PatchConfig(ctx, "qwen", map[string]any{})
 			Expect(err).To(MatchError(ErrEmptyBody))
 		})
+
+		It("replaces a map field wholesale so deleted entries do not survive", func() {
+			// A detector model with a populated entity_actions map. The editor
+			// removes SSN and re-sends the remaining map; a naive deep-merge
+			// would re-add SSN (it only adds/overrides keys, never deletes).
+			writeModelYAML(svc, dir, "ner", map[string]any{
+				"backend":        "llama-cpp",
+				"known_usecases": []any{"token_classify"},
+				"pii_detection": map[string]any{
+					"default_action": "mask",
+					"entity_actions": map[string]any{"SSN": "block", "EMAIL": "mask"},
+				},
+			})
+
+			_, err := svc.PatchConfig(ctx, "ner", map[string]any{
+				"pii_detection": map[string]any{
+					"default_action": "mask",
+					"entity_actions": map[string]any{"EMAIL": "mask"},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			raw, err := os.ReadFile(filepath.Join(dir, "ner.yaml"))
+			Expect(err).ToNot(HaveOccurred())
+			var got map[string]any
+			Expect(yaml.Unmarshal(raw, &got)).To(Succeed())
+			pii := got["pii_detection"].(map[string]any)
+			ea := pii["entity_actions"].(map[string]any)
+			Expect(ea).To(HaveKeyWithValue("EMAIL", "mask"))
+			Expect(ea).NotTo(HaveKey("SSN"), "deleted map entry must not survive the patch")
+			// The scalar sibling in the same nested block is still preserved.
+			Expect(pii).To(HaveKeyWithValue("default_action", "mask"))
+		})
+
+		It("drops a map field entirely when the patch empties it", func() {
+			writeModelYAML(svc, dir, "ner", map[string]any{
+				"backend":        "llama-cpp",
+				"known_usecases": []any{"token_classify"},
+				"pii_detection": map[string]any{
+					"default_action": "mask",
+					"entity_actions": map[string]any{"SSN": "block"},
+				},
+			})
+
+			_, err := svc.PatchConfig(ctx, "ner", map[string]any{
+				"pii_detection": map[string]any{
+					"entity_actions": map[string]any{},
+				},
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			raw, err := os.ReadFile(filepath.Join(dir, "ner.yaml"))
+			Expect(err).ToNot(HaveOccurred())
+			var got map[string]any
+			Expect(yaml.Unmarshal(raw, &got)).To(Succeed())
+			pii := got["pii_detection"].(map[string]any)
+			Expect(pii).NotTo(HaveKey("entity_actions"))
+		})
 	})
 
 	Describe("EditYAML", func() {
diff --git a/core/services/routing/pii/config.go b/core/services/routing/pii/config.go
deleted file mode 100644
index 64f7096750d2..000000000000
--- a/core/services/routing/pii/config.go
+++ /dev/null
@@ -1,71 +0,0 @@
-package pii
-
-import (
-	"fmt"
-	"os"
-
-	"gopkg.in/yaml.v3"
-)
-
-// FileConfig is the on-disk schema for pii.yaml. Each Pattern entry
-// overrides the matching default by ID; missing fields fall back to
-// the default. Unknown IDs are rejected at load time so an admin who
-// fat-fingers a pattern name gets a clear error rather than a silent
-// no-op.
-type FileConfig struct {
-	Patterns []FilePattern `yaml:"patterns"`
-}
-
-type FilePattern struct {
-	ID     string `yaml:"id"`
-	Action Action `yaml:"action"`
-}
-
-// LoadConfig reads pii.yaml from path and merges it on top of
-// DefaultPatterns(). path == "" returns the defaults compiled and
-// ready. The returned slice is already Compile()'d, so callers can
-// pass it straight to NewRedactor.
-func LoadConfig(path string) ([]Pattern, error) {
-	defaults := DefaultPatterns()
-	if path == "" {
-		return Compile(defaults)
-	}
-
-	raw, err := os.ReadFile(path)
-	if err != nil {
-		return nil, fmt.Errorf("pii: read config %q: %w", path, err)
-	}
-	var cfg FileConfig
-	if err := yaml.Unmarshal(raw, &cfg); err != nil {
-		return nil, fmt.Errorf("pii: parse config %q: %w", path, err)
-	}
-
-	overrides := make(map[string]Action, len(cfg.Patterns))
-	known := make(map[string]bool, len(defaults))
-	for _, d := range defaults {
-		known[d.ID] = true
-	}
-	for _, p := range cfg.Patterns {
-		if !known[p.ID] {
-			return nil, fmt.Errorf("pii: unknown pattern id %q in %q", p.ID, path)
-		}
-		if p.Action == "" {
-			continue
-		}
-		switch p.Action {
-		case ActionMask, ActionBlock, ActionRouteLocal:
-			overrides[p.ID] = p.Action
-		default:
-			return nil, fmt.Errorf("pii: invalid action %q for pattern %q", p.Action, p.ID)
-		}
-	}
-
-	merged := make([]Pattern, len(defaults))
-	for i, d := range defaults {
-		if a, ok := overrides[d.ID]; ok {
-			d.Action = a
-		}
-		merged[i] = d
-	}
-	return Compile(merged)
-}
diff --git a/core/services/routing/pii/config_test.go b/core/services/routing/pii/config_test.go
deleted file mode 100644
index 650b804f01a7..000000000000
--- a/core/services/routing/pii/config_test.go
+++ /dev/null
@@ -1,56 +0,0 @@
-package pii
-
-import (
-	"os"
-	"path/filepath"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("LoadConfig", func() {
-	It("returns defaults when no path given", func() {
-		patterns, err := LoadConfig("")
-		Expect(err).NotTo(HaveOccurred())
-		Expect(patterns).To(HaveLen(len(DefaultPatterns())))
-	})
-
-	It("overrides action", func() {
-		dir := GinkgoT().TempDir()
-		path := filepath.Join(dir, "pii.yaml")
-		body := []byte(`patterns:
-  - id: email
-    action: block
-  - id: ssn
-    action: route_local
-`)
-		Expect(os.WriteFile(path, body, 0o600)).To(Succeed())
-		patterns, err := LoadConfig(path)
-		Expect(err).NotTo(HaveOccurred())
-
-		got := map[string]Action{}
-		for _, p := range patterns {
-			got[p.ID] = p.Action
-		}
-		Expect(got["email"]).To(Equal(ActionBlock))
-		Expect(got["ssn"]).To(Equal(ActionRouteLocal))
-		// Unmentioned patterns keep their default action.
-		Expect(got["credit_card"]).To(Equal(ActionMask), "credit_card default action lost")
-	})
-
-	It("rejects unknown id", func() {
-		dir := GinkgoT().TempDir()
-		path := filepath.Join(dir, "pii.yaml")
-		Expect(os.WriteFile(path, []byte("patterns:\n  - id: nonsense\n    action: mask\n"), 0o600)).To(Succeed())
-		_, err := LoadConfig(path)
-		Expect(err).To(HaveOccurred(), "expected error on unknown pattern id")
-	})
-
-	It("rejects invalid action", func() {
-		dir := GinkgoT().TempDir()
-		path := filepath.Join(dir, "pii.yaml")
-		Expect(os.WriteFile(path, []byte("patterns:\n  - id: email\n    action: lolwhat\n"), 0o600)).To(Succeed())
-		_, err := LoadConfig(path)
-		Expect(err).To(HaveOccurred(), "expected error on invalid action")
-	})
-})
diff --git a/core/services/routing/pii/middleware.go b/core/services/routing/pii/middleware.go
index 0994c32ba927..ff46a21c1a7b 100644
--- a/core/services/routing/pii/middleware.go
+++ b/core/services/routing/pii/middleware.go
@@ -19,29 +19,71 @@ import (
 // drag the http/middleware package into pii's import graph and create
 // a cycle (http/middleware will import this one).
 const (
-	ctxKeyCorrelationID    = "routing.correlation_id"
-	ctxKeyPIIEventID       = "routing.pii_event_id"
-	ctxKeyLocalOnly        = "routing.local_only"
+	ctxKeyCorrelationID = "routing.correlation_id"
+	ctxKeyPIIEventID    = "routing.pii_event_id"
 	// Must match the constants in core/http/middleware/request.go.
 	// Echoing them across packages would create an import cycle
 	// (http/middleware imports this package). Drift is caught by
 	// integration tests against the chat route.
-	ctxKeyParsedRequest    = "LOCALAI_REQUEST"
-	ctxKeyModelConfig      = "MODEL_CONFIG"
+	ctxKeyParsedRequest = "LOCALAI_REQUEST"
+	ctxKeyModelConfig   = "MODEL_CONFIG"
 )
 
 // ModelPIIConfig is the duck-typed view this middleware needs of the
-// per-model PII configuration carried on the echo context. *config.ModelConfig
-// satisfies it via PIIIsEnabled / PIIPatternOverrides; the indirection
-// keeps the pii package from importing core/config.
+// per-model PII configuration carried on the echo context.
+// *config.ModelConfig satisfies it via PIIIsEnabled / PIIDetectors; the
+// indirection keeps the pii package from importing core/config.
 //
-// Consumers of the override map: the action returned from PIIPatternOverrides
-// is the raw YAML string (e.g. "block"). Validation against the canonical
-// ActionMask/Block/RouteLocal constants happens here, so a typo in a model
-// YAML logs and is ignored rather than panicking.
+// PIIDetectors lists the token-classification models whose detections
+// drive redaction for this (consuming) model. The detection policy lives
+// on each named detector model — resolved via NERDetectorResolver — so
+// this consuming view carries no per-entity actions of its own.
 type ModelPIIConfig interface {
 	PIIIsEnabled() bool
-	PIIPatternOverrides() map[string]string
+	PIIDetectors() []string
+}
+
+// NERDetectorResolver resolves a detector model name to a ready-to-use
+// NERConfig — the detector plus the policy (min score, entity→action
+// map, default action) read from that model's own pii_detection block.
+// ok is false when the name can't supply a detector (unknown model, not
+// a token_classify model, or load failure); the middleware fails closed
+// in that case. Supplied by the application layer, which owns the model
+// loader and the core/backend dependency, keeping the pii package free of
+// both. A nil resolver (or the option being unset) disables the NER tier.
+type NERDetectorResolver func(modelName string) (NERConfig, bool)
+
+// Option configures optional RequestMiddleware behaviour. Threaded as
+// variadic options so adding the NER tier doesn't break the existing
+// four-argument call sites (routes and tests).
+type Option func(*mwOptions)
+
+type mwOptions struct {
+	nerResolver    NERDetectorResolver
+	policyResolver PolicyResolver
+}
+
+// PolicyResolver returns the effective (enabled, detectors) for the model
+// carried on the request context, layering instance-wide PII defaults over the
+// per-model config. Supplied by the application layer (which owns core/config),
+// keeping this package decoupled from it — the middleware passes the raw
+// context value through as `any`. When unset, the middleware falls back to the
+// duck-typed ModelPIIConfig (explicit per-model config only, no global default).
+type PolicyResolver func(modelCfg any) (enabled bool, detectors []string)
+
+// WithPolicyResolver overrides how the middleware decides enablement and the
+// detector list, so the instance-wide default detector / default-on usecases
+// apply. Without it the middleware reads ModelPIIConfig off the context.
+func WithPolicyResolver(r PolicyResolver) Option {
+	return func(o *mwOptions) { o.policyResolver = r }
+}
+
+// WithNERResolver enables the NER tier. When a request's model lists
+// pii.detectors, the middleware resolves each to a NERConfig and runs
+// RedactNER (the union of all detectors' hits, merged). Without this
+// option, or when a model lists no detectors, redaction is a no-op.
+func WithNERResolver(r NERDetectorResolver) Option {
+	return func(o *mwOptions) { o.nerResolver = r }
 }
 
 // ScannedText is one piece of user text from the request. Index is
@@ -77,39 +119,40 @@ type Adapter struct {
 //     to the client.
 //   - On match with action=mask: the redacted text replaces the
 //     original on the parsed request. PIIEvents are recorded.
-//   - On match with action=route_local: the original text is left
-//     intact, but the echo context is annotated so the (future) router
-//     middleware refuses cloud-proxy candidates.
+//   - On match with action=allow: the original text is left intact; a
+//     PIIEvent is still recorded so the detection is auditable.
 //
 // recorder is the Recorder on which to record events; nil disables
 // recording (the redaction still happens). fallbackUser supplies the
 // no-auth identity. The middleware writes ctxKeyPIIEventID on the echo
 // context so the usage middleware can later cross-reference the event
 // with the UsageRecord.
-func RequestMiddleware(redactor *Redactor, store EventStore, adapter Adapter, fallbackUser *auth.User) echo.MiddlewareFunc {
+func RequestMiddleware(redactor *Redactor, store EventStore, adapter Adapter, fallbackUser *auth.User, opts ...Option) echo.MiddlewareFunc {
+	var o mwOptions
+	for _, opt := range opts {
+		opt(&o)
+	}
 	return func(next echo.HandlerFunc) echo.HandlerFunc {
 		return func(c echo.Context) error {
-			if redactor == nil || len(redactor.Patterns()) == 0 || adapter.Scan == nil {
+			if redactor == nil || adapter.Scan == nil {
 				return next(c)
 			}
 
-			// Per-model gating: redaction is opt-in per model. If the
-			// resolved config disables PII for this model (the default
-			// for non-proxy backends), pass through immediately. We do
-			// this before parsing the request so a disabled model
-			// doesn't pay the regex scan cost.
-			if cfg, ok := c.Get(ctxKeyModelConfig).(ModelPIIConfig); ok {
-				if !cfg.PIIIsEnabled() {
-					return next(c)
-				}
-			} else {
-				// No ModelPIIConfig on context → fail-closed: skip
-				// redaction. This protects routes that wire the
-				// middleware before SetModelAndConfig runs (or non-chat
-				// routes that don't carry a model). The middleware was
-				// previously fail-open, applying the global redactor
-				// unconditionally; the new contract is per-model
-				// opt-in, and a missing model is treated as disabled.
+			// Per-model gating: redaction is opt-in per model. The policy
+			// resolver (when wired) layers instance-wide defaults over the
+			// per-model config; otherwise we read the per-model config
+			// directly. A missing config (non-chat routes, or middleware
+			// wired before SetModelAndConfig) or a not-enabled result passes
+			// through.
+			rawCfg := c.Get(ctxKeyModelConfig)
+			var enabled bool
+			var detectors []string
+			if o.policyResolver != nil {
+				enabled, detectors = o.policyResolver(rawCfg)
+			} else if cfg, ok := rawCfg.(ModelPIIConfig); ok {
+				enabled, detectors = cfg.PIIIsEnabled(), cfg.PIIDetectors()
+			}
+			if !enabled {
 				return next(c)
 			}
 
@@ -118,6 +161,12 @@ func RequestMiddleware(redactor *Redactor, store EventStore, adapter Adapter, fa
 				return next(c)
 			}
 
+			// A PII-enabled model with no detectors (or no resolver wired)
+			// has nothing to scan with — pass through.
+			if len(detectors) == 0 || o.nerResolver == nil {
+				return next(c)
+			}
+
 			user := auth.GetUser(c)
 			if user == nil {
 				user = fallbackUser
@@ -128,48 +177,48 @@ func RequestMiddleware(redactor *Redactor, store EventStore, adapter Adapter, fa
 			}
 			correlationID, _ := c.Get(ctxKeyCorrelationID).(string)
 
-			// Resolve per-model action overrides once per request. The
-			// raw map is YAML strings; convert to the typed Action set
-			// and silently drop unknown values rather than failing the
-			// request — model YAML typos shouldn't take chat down.
-			var overrides map[string]Action
-			if cfg, ok := c.Get(ctxKeyModelConfig).(ModelPIIConfig); ok {
-				if raw := cfg.PIIPatternOverrides(); len(raw) > 0 {
-					overrides = make(map[string]Action, len(raw))
-					for id, action := range raw {
-						switch Action(action) {
-						case ActionMask, ActionBlock, ActionRouteLocal:
-							overrides[id] = Action(action)
-						default:
-							xlog.Warn("pii: ignoring unknown action in per-model override",
-								"pattern", id, "action", action)
-						}
-					}
+			// Resolve each named detector to its NERConfig (detector +
+			// the policy from that model's own pii_detection block). A
+			// configured detector that can't be resolved fails closed:
+			// serving the request without the semantic check the operator
+			// asked for is exactly the leak this tier exists to prevent.
+			cfgs := make([]NERConfig, 0, len(detectors))
+			for _, name := range detectors {
+				nc, ok := o.nerResolver(name)
+				if !ok {
+					xlog.Error("pii: configured detector model could not be resolved; blocking request (fail-closed)", "detector", name)
+					return blockNERUnavailable(c, store, correlationID, userID)
 				}
+				cfgs = append(cfgs, nc)
 			}
 
 			texts := adapter.Scan(parsed)
 			updates := make([]ScannedText, 0, len(texts))
 			var blocked bool
-			var localOnly bool
 			var firstEventID string
 
 			for _, st := range texts {
 				if st.Text == "" {
 					continue
 				}
-				res := redactor.RedactWithOverrides(st.Text, overrides)
+				// Fail closed: a detector outage at request time must NOT
+				// silently serve the request. The NER tier was explicitly
+				// configured for this model, so the semantic check is part
+				// of the contract.
+				res, nerErr := RedactNER(c.Request().Context(), st.Text, cfgs)
+				if nerErr != nil {
+					xlog.Error("pii: NER detector failed; blocking request (fail-closed)", "error", nerErr)
+					return blockNERUnavailable(c, store, correlationID, userID)
+				}
 				if len(res.Spans) == 0 {
 					continue
 				}
 
-				// Persist one event per span so admins can see exactly
-				// which patterns fired in which positions. The action
-				// recorded is the resolved one (after override), so the
-				// events log reflects what actually happened to the
-				// request, not the global default.
+				// Persist one event per detected span. The action recorded
+				// is the one that actually fired (carried on the span after
+				// the overlap merge), so the events log reflects what
+				// happened to the request.
 				for _, span := range res.Spans {
-					action := actionForSpan(redactor.Patterns(), span.Pattern, overrides)
 					ev := PIIEvent{
 						ID:            newEventID(),
 						CorrelationID: correlationID,
@@ -179,7 +228,8 @@ func RequestMiddleware(redactor *Redactor, store EventStore, adapter Adapter, fa
 						ByteOffset:    span.Start,
 						Length:        span.End - span.Start,
 						HashPrefix:    span.HashPrefix,
-						Action:        action,
+						Action:        span.Action,
+						Score:         span.Score,
 						CreatedAt:     time.Now().UTC(),
 					}
 					if firstEventID == "" {
@@ -201,9 +251,6 @@ func RequestMiddleware(redactor *Redactor, store EventStore, adapter Adapter, fa
 				if res.Blocked {
 					blocked = true
 				}
-				if res.LocalOnly {
-					localOnly = true
-				}
 				updates = append(updates, ScannedText{Index: st.Index, Text: res.Redacted})
 			}
 
@@ -224,33 +271,89 @@ func RequestMiddleware(redactor *Redactor, store EventStore, adapter Adapter, fa
 			if firstEventID != "" {
 				c.Set(ctxKeyPIIEventID, firstEventID)
 			}
-			if localOnly {
-				c.Set(ctxKeyLocalOnly, true)
-			}
-
 			return next(c)
 		}
 	}
 }
 
-func actionForPattern(patterns []Pattern, id string) Action {
-	for _, p := range patterns {
-		if p.ID == id {
-			return p.Action
+// nerUnavailablePattern is the sentinel PatternID recorded on the
+// fail-closed audit event when a model's configured NER tier cannot
+// run. It is not a real regex pattern — it marks a request blocked
+// because the encoder/NER check was unavailable (model unresolved or
+// backend error), so the events log distinguishes it from a content
+// block (which carries a real pattern ID).
+const nerUnavailablePattern = "__ner_unavailable__"
+
+// blockNERUnavailable records a fail-closed audit event and returns the
+// response used when a model has an NER tier configured but it could
+// not run. Failing closed is deliberate for a PII filter: if the
+// semantic check the operator asked for cannot execute, refusing the
+// request is safer than serving it with only the cheap regex tier. The
+// 503 (vs the 400 used for a content block) tells clients and operators
+// this was a dependency outage, not sensitive data in the request.
+func blockNERUnavailable(c echo.Context, store EventStore, correlationID, userID string) error {
+	ev := PIIEvent{
+		ID:            newEventID(),
+		Kind:          KindPII,
+		CorrelationID: correlationID,
+		UserID:        userID,
+		Direction:     DirectionIn,
+		PatternID:     nerUnavailablePattern,
+		Action:        ActionBlock,
+		CreatedAt:     time.Now().UTC(),
+	}
+	if store != nil {
+		if err := store.Record(context.Background(), ev); err != nil {
+			xlog.Error("pii: failed to record NER-unavailable event", "error", err)
 		}
 	}
-	return ActionMask
+	c.Set(ctxKeyPIIEventID, ev.ID)
+	return c.JSON(http.StatusServiceUnavailable, map[string]any{
+		"error": map[string]string{
+			"message": "request blocked: PII NER check is configured but unavailable",
+			"type":    "pii_ner_unavailable",
+		},
+		"correlation_id": correlationID,
+		"pii_event_id":   ev.ID,
+	})
 }
 
-// actionForSpan returns the resolved action for a span, preferring a
-// per-request override over the pattern's stored action. Used so the
-// PIIEvent log reflects the action that actually fired (e.g., a model
-// upgraded email from mask to block — the event row says "block").
-func actionForSpan(patterns []Pattern, id string, overrides map[string]Action) Action {
-	if action, ok := overrides[id]; ok {
-		return action
+// validAction converts a raw YAML action string to the typed Action,
+// returning "" for anything that isn't a known action.
+func validAction(raw string) Action {
+	switch Action(raw) {
+	case ActionMask, ActionBlock, ActionAllow:
+		return Action(raw)
+	default:
+		return ""
+	}
+}
+
+// validActionOr is validAction with a fallback for empty/invalid input.
+func validActionOr(raw string, fallback Action) Action {
+	if a := validAction(raw); a != "" {
+		return a
+	}
+	return fallback
+}
+
+// validActions converts a raw entity-group->action map to typed
+// Actions, dropping (and logging) unknown actions so a model YAML typo
+// is ignored rather than taking the request down — mirroring how the
+// per-pattern overrides are validated above.
+func validActions(raw map[string]string) map[string]Action {
+	if len(raw) == 0 {
+		return nil
+	}
+	out := make(map[string]Action, len(raw))
+	for group, action := range raw {
+		if a := validAction(action); a != "" {
+			out[group] = a
+		} else {
+			xlog.Warn("pii: ignoring unknown NER entity action", "group", group, "action", action)
+		}
 	}
-	return actionForPattern(patterns, id)
+	return out
 }
 
 func newEventID() string {
diff --git a/core/services/routing/pii/middleware_test.go b/core/services/routing/pii/middleware_test.go
index d3bbbb2e7219..c3cae3c2b077 100644
--- a/core/services/routing/pii/middleware_test.go
+++ b/core/services/routing/pii/middleware_test.go
@@ -3,12 +3,12 @@ package pii
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"net/http"
 	"net/http/httptest"
 	"strings"
 
 	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/http/auth"
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -56,21 +56,15 @@ func setRequestOnContext(req *fakeRequest) echo.MiddlewareFunc {
 }
 
 // fakeModelPIIConfig satisfies the duck-typed ModelPIIConfig interface
-// the middleware expects on the echo context. The real implementation
-// lives on *config.ModelConfig; using a fake here keeps these tests
-// out of the core/config import graph.
+// the middleware expects on the echo context (PIIIsEnabled + PIIDetectors).
 type fakeModelPIIConfig struct {
 	enabled   bool
-	overrides map[string]string
+	detectors []string
 }
 
-func (f fakeModelPIIConfig) PIIIsEnabled() bool                     { return f.enabled }
-func (f fakeModelPIIConfig) PIIPatternOverrides() map[string]string { return f.overrides }
+func (f fakeModelPIIConfig) PIIIsEnabled() bool     { return f.enabled }
+func (f fakeModelPIIConfig) PIIDetectors() []string { return f.detectors }
 
-// withModelConfig wires a ModelPIIConfig onto the context so the
-// middleware's per-model gate doesn't fail-closed during tests. Pass
-// enabled=true for the default test path; explicit-false tests should
-// use the gating spec further down instead.
 func withModelConfig(cfg fakeModelPIIConfig) echo.MiddlewareFunc {
 	return func(next echo.HandlerFunc) echo.HandlerFunc {
 		return func(c echo.Context) error {
@@ -80,230 +74,235 @@ func withModelConfig(cfg fakeModelPIIConfig) echo.MiddlewareFunc {
 	}
 }
 
-func newTestRedactor(ids ...string) *Redactor {
-	patterns, err := Compile(pick(DefaultPatterns(), ids))
-	ExpectWithOffset(1, err).NotTo(HaveOccurred(), "compile")
-	return NewRedactor(patterns)
+// resolverFor returns a NERDetectorResolver that maps each named model to
+// the supplied NERConfig. Names absent from the map resolve to (zero,
+// false) so the middleware fails closed — mirroring an unresolvable model.
+func resolverFor(byName map[string]NERConfig) NERDetectorResolver {
+	return func(name string) (NERConfig, bool) {
+		cfg, ok := byName[name]
+		return cfg, ok
+	}
 }
 
-var _ = Describe("RequestMiddleware", func() {
-	It("masks email", func() {
-		red := newTestRedactor("email")
-		store := NewMemoryEventStore(0)
-		defer func() { _ = store.Close() }()
-		user := &auth.User{ID: "user-1", Name: "alice"}
-
-		body := &fakeRequest{Messages: []string{"contact me at alice@example.com"}}
-		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
-
-		e := echo.New()
-		e.POST("/chat", func(c echo.Context) error {
-			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
-		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: true}), mw, func(next echo.HandlerFunc) echo.HandlerFunc {
-			// Inject the user as if upstream auth ran.
-			return func(c echo.Context) error {
-				c.Set("auth_user", user)
-				return next(c)
-			}
-		})
+func serve(body *fakeRequest, cfg fakeModelPIIConfig, mw echo.MiddlewareFunc, withConfig bool) (*httptest.ResponseRecorder, *bool) {
+	called := new(bool)
+	e := echo.New()
+	chain := []echo.MiddlewareFunc{setRequestOnContext(body)}
+	if withConfig {
+		chain = append(chain, withModelConfig(cfg))
+	}
+	chain = append(chain, mw)
+	e.POST("/chat", func(c echo.Context) error {
+		*called = true
+		return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
+	}, chain...)
+	req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
+	w := httptest.NewRecorder()
+	e.ServeHTTP(w, req)
+	return w, called
+}
 
-		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
-		w := httptest.NewRecorder()
-		e.ServeHTTP(w, req)
+func nerCfg(action Action, entities ...NEREntity) NERConfig {
+	return NERConfig{
+		Detector:      &stubNERDetector{entities: entities},
+		DefaultAction: action,
+	}
+}
 
-		Expect(w.Code).To(Equal(http.StatusOK), "body=%s", w.Body.String())
-		Expect(body.Messages[0]).NotTo(ContainSubstring("alice@example.com"), "request body should be redacted in place")
-		Expect(body.Messages[0]).To(ContainSubstring("[REDACTED:email]"))
+var _ = Describe("RequestMiddleware (NER)", func() {
+	store := func() EventStore { return NewMemoryEventStore(0) }
+
+	It("masks a detected entity end-to-end", func() {
+		st := store()
+		body := &fakeRequest{Messages: []string{"Hi I'm Alice today"}}
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{
+				"privacy-filter": nerCfg(ActionMask, NEREntity{Group: "PER", Start: 6, End: 11, Score: 0.95}),
+			})))
+		w, _ := serve(body, fakeModelPIIConfig{enabled: true, detectors: []string{"privacy-filter"}}, mw, true)
 
-		events, err := store.List(context.Background(), ListQuery{Limit: 100})
-		Expect(err).NotTo(HaveOccurred(), "list events")
+		Expect(w.Code).To(Equal(http.StatusOK), "body=%s", w.Body.String())
+		Expect(body.Messages[0]).To(ContainSubstring("[REDACTED:ner:PER]"))
+		events, _ := st.List(context.Background(), ListQuery{Limit: 100})
 		Expect(events).To(HaveLen(1))
-		Expect(events[0].PatternID).To(Equal("email"))
+		Expect(events[0].PatternID).To(Equal("ner:PER"))
 		Expect(events[0].Direction).To(Equal(DirectionIn))
 	})
 
-	It("blocks api key", func() {
-		red := newTestRedactor("api_key_prefix")
-		store := NewMemoryEventStore(0)
-		defer func() { _ = store.Close() }()
-
-		body := &fakeRequest{Messages: []string{"my key is sk-abcdefghijklmnopqrstuvwxyz0123456789"}}
-		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
-
-		e := echo.New()
-		handlerCalled := false
-		e.POST("/chat", func(c echo.Context) error {
-			handlerCalled = true
-			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
-		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: true}), mw)
-
-		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
-		w := httptest.NewRecorder()
-		e.ServeHTTP(w, req)
-
-		Expect(w.Code).To(Equal(http.StatusBadRequest), "expected 400 on block; body=%s", w.Body.String())
-		Expect(handlerCalled).To(BeFalse(), "handler must not run when request is blocked")
-		// Ensure the matched value never appears in the response body.
-		Expect(w.Body.String()).NotTo(ContainSubstring("abcdefghijklmnopqrstuvwxyz0123456789"), "blocked response leaks the matched value")
+	It("blocks (400) when a detected entity's action is block", func() {
+		st := store()
+		body := &fakeRequest{Messages: []string{"my password is hunter2 ok"}}
+		cfg := NERConfig{
+			Detector:      &stubNERDetector{entities: []NEREntity{{Group: "PASSWORD", Start: 15, End: 22, Score: 0.99}}},
+			EntityActions: map[string]Action{"PASSWORD": ActionBlock},
+		}
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{"pf": cfg})))
+		w, called := serve(body, fakeModelPIIConfig{enabled: true, detectors: []string{"pf"}}, mw, true)
 
+		Expect(w.Code).To(Equal(http.StatusBadRequest), "body=%s", w.Body.String())
+		Expect(*called).To(BeFalse(), "handler must not run when blocked")
 		var resp map[string]any
 		Expect(json.Unmarshal(w.Body.Bytes(), &resp)).To(Succeed())
-		errBlock, ok := resp["error"].(map[string]any)
-		Expect(ok).To(BeTrue())
+		errBlock, _ := resp["error"].(map[string]any)
 		Expect(errBlock["type"]).To(Equal("pii_blocked"))
 	})
 
-	It("route_local sets context flag", func() {
-		patterns, _ := Compile([]Pattern{{
-			ID: "email", Description: "Email", Action: ActionRouteLocal, MaxMatchLength: 254,
-		}})
-		red := NewRedactor(patterns)
-		store := NewMemoryEventStore(0)
-		defer func() { _ = store.Close() }()
-
+	It("allow leaves text intact but records an event", func() {
+		st := store()
 		body := &fakeRequest{Messages: []string{"hi at alice@example.com"}}
-		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
-
-		e := echo.New()
-		var observedLocalOnly bool
-		e.POST("/chat", func(c echo.Context) error {
-			v, _ := c.Get(ctxKeyLocalOnly).(bool)
-			observedLocalOnly = v
-			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
-		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: true}), mw)
-
-		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
-		w := httptest.NewRecorder()
-		e.ServeHTTP(w, req)
+		cfg := NERConfig{
+			Detector:      &stubNERDetector{entities: []NEREntity{{Group: "EMAIL", Start: 6, End: 23, Score: 0.9}}},
+			EntityActions: map[string]Action{"EMAIL": ActionAllow},
+		}
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{"pf": cfg})))
+		w, _ := serve(body, fakeModelPIIConfig{enabled: true, detectors: []string{"pf"}}, mw, true)
 
 		Expect(w.Code).To(Equal(http.StatusOK))
-		Expect(observedLocalOnly).To(BeTrue(), "ctxKeyLocalOnly should be true on route_local match")
-		// route_local does NOT mutate the body — the model still sees the email.
-		Expect(body.Messages[0]).To(ContainSubstring("alice@example.com"), "route_local should leave text intact")
+		Expect(body.Messages[0]).To(ContainSubstring("alice@example.com"))
+		events, _ := st.List(context.Background(), ListQuery{Limit: 100})
+		Expect(events).To(HaveLen(1))
+		Expect(events[0].Action).To(Equal(ActionAllow))
 	})
 
-	It("no match passes through", func() {
-		red := newTestRedactor()
-		store := NewMemoryEventStore(0)
-		defer func() { _ = store.Close() }()
-
+	It("passes through on no match", func() {
+		st := store()
 		body := &fakeRequest{Messages: []string{"perfectly innocent text"}}
-		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
-
-		e := echo.New()
-		e.POST("/chat", func(c echo.Context) error {
-			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
-		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: true}), mw)
-
-		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
-		w := httptest.NewRecorder()
-		e.ServeHTTP(w, req)
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{"pf": nerCfg(ActionMask)})))
+		w, _ := serve(body, fakeModelPIIConfig{enabled: true, detectors: []string{"pf"}}, mw, true)
 
 		Expect(w.Code).To(Equal(http.StatusOK))
-		Expect(body.Messages[0]).To(Equal("perfectly innocent text"), "body should be untouched")
-		events, _ := store.List(context.Background(), ListQuery{Limit: 100})
-		Expect(events).To(BeEmpty(), "expected 0 events on no-match input")
+		Expect(body.Messages[0]).To(Equal("perfectly innocent text"))
+		events, _ := st.List(context.Background(), ListQuery{Limit: 100})
+		Expect(events).To(BeEmpty())
 	})
 
-	It("skips when model config disabled", func() {
-		// Per-model gating is the new contract: a model with PIIIsEnabled
-		// returning false must bypass redaction entirely, even if the
-		// global redactor has matching patterns.
-		red := newTestRedactor("email")
-		store := NewMemoryEventStore(0)
-		defer func() { _ = store.Close() }()
+	It("skips when the model has PII disabled", func() {
+		st := store()
+		body := &fakeRequest{Messages: []string{"Hi I'm Alice"}}
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{
+				"pf": nerCfg(ActionMask, NEREntity{Group: "PER", Start: 6, End: 11, Score: 0.95}),
+			})))
+		w, _ := serve(body, fakeModelPIIConfig{enabled: false, detectors: []string{"pf"}}, mw, true)
 
-		body := &fakeRequest{Messages: []string{"contact alice@example.com"}}
-		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
-
-		e := echo.New()
-		e.POST("/chat", func(c echo.Context) error {
-			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
-		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: false}), mw)
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(body.Messages[0]).To(Equal("Hi I'm Alice"), "disabled model must not redact")
+	})
 
-		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
-		w := httptest.NewRecorder()
-		e.ServeHTTP(w, req)
+	It("passes through when the model lists no detectors", func() {
+		st := store()
+		body := &fakeRequest{Messages: []string{"Hi I'm Alice"}}
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{})))
+		w, _ := serve(body, fakeModelPIIConfig{enabled: true}, mw, true)
 
 		Expect(w.Code).To(Equal(http.StatusOK))
-		Expect(body.Messages[0]).To(ContainSubstring("alice@example.com"), "disabled model must not redact")
-		events, _ := store.List(context.Background(), ListQuery{Limit: 100})
-		Expect(events).To(BeEmpty(), "disabled model must produce no events")
+		Expect(body.Messages[0]).To(Equal("Hi I'm Alice"))
 	})
 
-	It("fails closed without model config", func() {
-		// Routes that wire the middleware before SetModelAndConfig, or
-		// non-chat routes lacking a model, hit this path. The contract
-		// is fail-closed: pass through without redaction so a missing
-		// model can't accidentally leak through global defaults.
-		red := newTestRedactor("email")
-		store := NewMemoryEventStore(0)
-		defer func() { _ = store.Close() }()
-
-		body := &fakeRequest{Messages: []string{"contact alice@example.com"}}
-		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
+	It("fails closed without a model config", func() {
+		st := store()
+		body := &fakeRequest{Messages: []string{"Hi I'm Alice"}}
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{
+				"pf": nerCfg(ActionMask, NEREntity{Group: "PER", Start: 6, End: 11, Score: 0.95}),
+			})))
+		w, _ := serve(body, fakeModelPIIConfig{}, mw, false) // no model config on context
 
-		e := echo.New()
-		// Note: no withModelConfig in the chain.
-		e.POST("/chat", func(c echo.Context) error {
-			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
-		}, setRequestOnContext(body), mw)
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(body.Messages[0]).To(Equal("Hi I'm Alice"), "missing ModelPIIConfig should pass through")
+	})
 
-		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
-		w := httptest.NewRecorder()
-		e.ServeHTTP(w, req)
+	It("unions multiple detectors", func() {
+		st := store()
+		body := &fakeRequest{Messages: []string{"Alice at acme"}}
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{
+				"names": nerCfg(ActionMask, NEREntity{Group: "PER", Start: 0, End: 5, Score: 0.9}),
+				"orgs":  nerCfg(ActionMask, NEREntity{Group: "ORG", Start: 9, End: 13, Score: 0.9}),
+			})))
+		w, _ := serve(body, fakeModelPIIConfig{enabled: true, detectors: []string{"names", "orgs"}}, mw, true)
 
 		Expect(w.Code).To(Equal(http.StatusOK))
-		Expect(body.Messages[0]).To(ContainSubstring("alice@example.com"), "missing ModelPIIConfig should fail-closed (no redaction)")
+		Expect(body.Messages[0]).To(ContainSubstring("[REDACTED:ner:PER]"))
+		Expect(body.Messages[0]).To(ContainSubstring("[REDACTED:ner:ORG]"))
+		events, _ := st.List(context.Background(), ListQuery{Limit: 100})
+		Expect(events).To(HaveLen(2))
 	})
 
-	It("applies per-model override", func() {
-		// email defaults to mask. A per-model override upgrades it to
-		// block. The middleware short-circuits with 400, the request
-		// body is never touched, and the events log records action=block.
-		red := newTestRedactor("email")
-		store := NewMemoryEventStore(0)
-		defer func() { _ = store.Close() }()
+	It("fails closed (503) when a detector errors", func() {
+		st := store()
+		body := &fakeRequest{Messages: []string{"contact alice@example.com"}}
+		cfg := NERConfig{Detector: &stubNERDetector{err: errors.New("backend offline")}, DefaultAction: ActionMask}
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{"pf": cfg})))
+		w, called := serve(body, fakeModelPIIConfig{enabled: true, detectors: []string{"pf"}}, mw, true)
+
+		Expect(w.Code).To(Equal(http.StatusServiceUnavailable), "body=%s", w.Body.String())
+		Expect(*called).To(BeFalse())
+		Expect(body.Messages[0]).To(ContainSubstring("alice@example.com"), "request body must be untouched on a fail-closed block")
+		var resp map[string]any
+		Expect(json.Unmarshal(w.Body.Bytes(), &resp)).To(Succeed())
+		errBlock, _ := resp["error"].(map[string]any)
+		Expect(errBlock["type"]).To(Equal("pii_ner_unavailable"))
+		events, _ := st.List(context.Background(), ListQuery{Limit: 100})
+		Expect(events).To(HaveLen(1))
+		Expect(events[0].PatternID).To(Equal(nerUnavailablePattern))
+	})
 
+	It("fails closed (503) when a configured detector can't be resolved", func() {
+		st := store()
 		body := &fakeRequest{Messages: []string{"contact alice@example.com"}}
-		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
-
-		e := echo.New()
-		handlerCalled := false
-		e.POST("/chat", func(c echo.Context) error {
-			handlerCalled = true
-			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
-		}, setRequestOnContext(body),
-			withModelConfig(fakeModelPIIConfig{
-				enabled:   true,
-				overrides: map[string]string{"email": "block"},
-			}), mw)
-
-		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
-		w := httptest.NewRecorder()
-		e.ServeHTTP(w, req)
-
-		Expect(w.Code).To(Equal(http.StatusBadRequest), "expected 400 from override-block; body=%s", w.Body.String())
-		Expect(handlerCalled).To(BeFalse(), "handler must not run when override blocks")
-		events, _ := store.List(context.Background(), ListQuery{Limit: 100})
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{}))) // "missing" not present
+		w, called := serve(body, fakeModelPIIConfig{enabled: true, detectors: []string{"missing"}}, mw, true)
+
+		Expect(w.Code).To(Equal(http.StatusServiceUnavailable))
+		Expect(*called).To(BeFalse())
+		events, _ := st.List(context.Background(), ListQuery{Limit: 100})
 		Expect(events).To(HaveLen(1))
-		Expect(events[0].Action).To(Equal(ActionBlock), "event must record the resolved (override) action")
+		Expect(events[0].PatternID).To(Equal(nerUnavailablePattern))
 	})
 
 	It("nil redactor is passthrough", func() {
 		body := &fakeRequest{Messages: []string{"alice@example.com"}}
 		mw := RequestMiddleware(nil, nil, fakeAdapter(), nil)
+		w, _ := serve(body, fakeModelPIIConfig{enabled: true, detectors: []string{"pf"}}, mw, true)
 
-		e := echo.New()
-		e.POST("/chat", func(c echo.Context) error {
-			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
-		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: true}), mw)
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(body.Messages[0]).To(Equal("alice@example.com"), "nil redactor must be a no-op")
+	})
 
-		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
-		w := httptest.NewRecorder()
-		e.ServeHTTP(w, req)
+	It("WithPolicyResolver enables a model the per-model config left off (global default)", func() {
+		st := store()
+		body := &fakeRequest{Messages: []string{"Hi I'm Alice today"}}
+		// The per-model config is disabled with no detectors; the policy
+		// resolver (instance-wide default) turns it on and supplies one.
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{
+				"global-pf": nerCfg(ActionMask, NEREntity{Group: "PER", Start: 6, End: 11, Score: 0.95}),
+			})),
+			WithPolicyResolver(func(_ any) (bool, []string) { return true, []string{"global-pf"} }))
+		w, _ := serve(body, fakeModelPIIConfig{enabled: false}, mw, true)
+
+		Expect(w.Code).To(Equal(http.StatusOK), "body=%s", w.Body.String())
+		Expect(body.Messages[0]).To(ContainSubstring("[REDACTED:ner:PER]"))
+	})
+
+	It("WithPolicyResolver returning disabled short-circuits an otherwise-enabled model", func() {
+		st := store()
+		body := &fakeRequest{Messages: []string{"Hi I'm Alice today"}}
+		mw := RequestMiddleware(&Redactor{}, st, fakeAdapter(), nil,
+			WithNERResolver(resolverFor(map[string]NERConfig{
+				"pf": nerCfg(ActionMask, NEREntity{Group: "PER", Start: 6, End: 11, Score: 0.95}),
+			})),
+			WithPolicyResolver(func(_ any) (bool, []string) { return false, nil }))
+		w, _ := serve(body, fakeModelPIIConfig{enabled: true, detectors: []string{"pf"}}, mw, true)
 
 		Expect(w.Code).To(Equal(http.StatusOK))
-		Expect(body.Messages[0]).To(Equal("alice@example.com"), "nil redactor must be a no-op")
+		Expect(body.Messages[0]).To(Equal("Hi I'm Alice today"), "resolver disabled => no redaction")
 	})
 })
diff --git a/core/services/routing/pii/ner.go b/core/services/routing/pii/ner.go
index 57d25cded5eb..0d2ee2b592db 100644
--- a/core/services/routing/pii/ner.go
+++ b/core/services/routing/pii/ner.go
@@ -28,6 +28,10 @@ type NEREntity struct {
 	Start int
 	End   int
 	Score float32
+	// Text is the matched substring as the detector saw it. Carried for
+	// debug logging only (the persisted PIIEvent never stores the raw
+	// value); the redactor re-slices the original text for masking.
+	Text string
 }
 
 // NERConfig configures the encoder tier for one redactor invocation.
@@ -56,8 +60,22 @@ type NERConfig struct {
 	// entities silently" — useful when the model returns a broad
 	// taxonomy but the admin only cares about a subset.
 	DefaultAction Action
+
+	// Source labels where this detector's hits come from. It becomes the
+	// PatternID prefix on events and the [REDACTED:<id>] mask, so neural NER
+	// detections (Source "ner") and deterministic pattern-matcher detections
+	// (Source "pattern") are told apart in the events log and to the model.
+	// Empty defaults to "ner" for backward compatibility.
+	Source string
 }
 
+// Detector source labels (the PatternID prefix). Kept short and stable —
+// they appear in the events log and the [REDACTED:...] mask.
+const (
+	SourceNER     = "ner"
+	SourcePattern = "pattern"
+)
+
 // ResolveAction returns the action configured for a detected entity
 // group, falling back to DefaultAction. Returns ("", false) when the
 // entity should be ignored entirely (no override + no default).
@@ -71,13 +89,39 @@ func (c NERConfig) ResolveAction(group string) (Action, bool) {
 	return "", false
 }
 
-// nerPatternID returns the synthetic pattern ID that audit rows carry
-// for NER hits. Prefixing with "ner:" keeps these distinguishable from
-// regex pattern IDs in the events tab and in filter queries; admins
-// can switch off a single entity type with the same Disabled-pattern
-// machinery used for regex.
-func nerPatternID(group string) string {
-	return "ner:" + group
+// NERConfigFromRaw builds a typed NERConfig from a detector plus the raw
+// policy strings carried on a detector model's pii_detection config. An
+// empty or invalid default_action becomes ActionMask — the safe-by-default
+// policy for a PII filter (a detected entity is masked unless an admin
+// downgrades it). Unknown per-entity actions are dropped (and logged by
+// validActions). This is the single conversion point the application-layer
+// resolver uses, so the detector model's policy reaches the redactor in
+// exactly one shape. source labels the detector kind (SourceNER /
+// SourcePattern) and becomes the PatternID prefix; empty defaults to
+// SourceNER.
+func NERConfigFromRaw(detector NERDetector, minScore float32, defaultAction string, entityActions map[string]string, source string) NERConfig {
+	if source == "" {
+		source = SourceNER
+	}
+	return NERConfig{
+		Detector:      detector,
+		MinScore:      minScore,
+		DefaultAction: validActionOr(defaultAction, ActionMask),
+		EntityActions: validActions(entityActions),
+		Source:        source,
+	}
+}
+
+// patternID returns the synthetic pattern ID that audit rows and masks carry
+// for this detector's hits, e.g. "ner:EMAIL" or "pattern:ANTHROPIC_KEY". The
+// source prefix keeps neural and deterministic detections distinguishable in
+// the events tab and in pattern_id filter queries.
+func (c NERConfig) patternID(group string) string {
+	source := c.Source
+	if source == "" {
+		source = SourceNER
+	}
+	return source + ":" + group
 }
 
 // errNERDetector is a NERDetector that always returns the wrapped
diff --git a/core/services/routing/pii/ner_test.go b/core/services/routing/pii/ner_test.go
index b4d822234a6c..8e677a25f6f7 100644
--- a/core/services/routing/pii/ner_test.go
+++ b/core/services/routing/pii/ner_test.go
@@ -9,8 +9,7 @@ import (
 )
 
 // stubNERDetector returns a fixed slice of entities and tracks call
-// count so tests can assert the detector isn't called when text is
-// empty / no patterns / detector disabled.
+// count so tests can assert the detector isn't called when text is empty.
 type stubNERDetector struct {
 	entities []NEREntity
 	err      error
@@ -22,43 +21,39 @@ func (s *stubNERDetector) Detect(_ context.Context, _ string) ([]NEREntity, erro
 	return s.entities, s.err
 }
 
-var _ = Describe("RedactWithNER", func() {
-	It("nil detector is regex-only", func() {
-		// When the NER tier is disabled (Detector == nil) the redactor
-		// must behave exactly like the existing regex-only path — no
-		// detector call, same Result shape, no error.
-		r := NewRedactor([]Pattern{pickEmail()})
-		res, err := r.RedactWithNER(context.Background(), "ping me at alice@example.com", nil, NERConfig{})
+var _ = Describe("RedactNER", func() {
+	It("no detectors is a no-op", func() {
+		res, err := RedactNER(context.Background(), "ping me at alice@example.com", nil)
 		Expect(err).NotTo(HaveOccurred())
-		Expect(res.Redacted).To(ContainSubstring("[REDACTED:email]"), "regex tier should still run when Detector is nil")
+		Expect(res.Redacted).To(Equal("ping me at alice@example.com"))
+		Expect(res.Spans).To(BeEmpty())
 	})
 
 	It("applies entity actions", func() {
 		det := &stubNERDetector{entities: []NEREntity{
 			{Group: "PER", Start: 6, End: 11, Score: 0.95}, // "Alice" in "Hi I'm Alice today"
 		}}
-		r := NewRedactor(nil)
-		res, err := r.RedactWithNER(context.Background(), "Hi I'm Alice today", nil, NERConfig{
+		res, err := RedactNER(context.Background(), "Hi I'm Alice today", []NERConfig{{
 			Detector:      det,
 			EntityActions: map[string]Action{"PER": ActionMask},
-		})
+		}})
 		Expect(err).NotTo(HaveOccurred())
 		Expect(det.calls).To(Equal(1))
 		Expect(res.Redacted).To(ContainSubstring("[REDACTED:ner:PER]"))
 		Expect(res.Spans).To(HaveLen(1))
 		Expect(res.Spans[0].Pattern).To(Equal("ner:PER"))
+		Expect(res.Spans[0].Action).To(Equal(ActionMask))
 	})
 
 	It("filters below MinScore", func() {
 		det := &stubNERDetector{entities: []NEREntity{
 			{Group: "PER", Start: 0, End: 5, Score: 0.20},
 		}}
-		r := NewRedactor(nil)
-		res, err := r.RedactWithNER(context.Background(), "Alice", nil, NERConfig{
+		res, err := RedactNER(context.Background(), "Alice", []NERConfig{{
 			Detector:      det,
 			MinScore:      0.50,
 			EntityActions: map[string]Action{"PER": ActionMask},
-		})
+		}})
 		Expect(err).NotTo(HaveOccurred())
 		Expect(res.Redacted).To(Equal("Alice"), "low-confidence entity should be dropped")
 	})
@@ -67,108 +62,120 @@ var _ = Describe("RedactWithNER", func() {
 		det := &stubNERDetector{entities: []NEREntity{
 			{Group: "ORG", Start: 7, End: 11, Score: 0.9}, // "Acme" in "Hello, Acme!"
 		}}
-		r := NewRedactor(nil)
-		res, err := r.RedactWithNER(context.Background(), "Hello, Acme!", nil, NERConfig{
+		res, err := RedactNER(context.Background(), "Hello, Acme!", []NERConfig{{
 			Detector:      det,
 			DefaultAction: ActionMask,
-		})
+		}})
 		Expect(err).NotTo(HaveOccurred())
 		Expect(res.Redacted).To(ContainSubstring("[REDACTED:ner:ORG]"), "DefaultAction should apply to ORG")
 	})
 
 	It("drops unconfigured groups with no default", func() {
-		// EntityActions has no entry for ORG and DefaultAction is empty —
-		// the detected entity must be ignored entirely (no audit row, no
-		// redaction).
 		det := &stubNERDetector{entities: []NEREntity{
 			{Group: "ORG", Start: 0, End: 4, Score: 0.9},
 		}}
-		r := NewRedactor(nil)
-		res, err := r.RedactWithNER(context.Background(), "Acme", nil, NERConfig{
+		res, err := RedactNER(context.Background(), "Acme", []NERConfig{{
 			Detector:      det,
 			EntityActions: map[string]Action{"PER": ActionMask}, // ORG is unconfigured
-		})
+		}})
 		Expect(err).NotTo(HaveOccurred())
 		Expect(res.Redacted).To(Equal("Acme"))
 		Expect(res.Spans).To(BeEmpty())
 	})
 
-	It("overlapping hits keep stronger action", func() {
-		// Regex marks 0..10 as mask; NER marks 5..15 as block. After
-		// merge, the union 0..15 keeps the strongest action (block).
-		pat := Pattern{ID: "test", Action: ActionMask, regex: rangeRegex(0, 10)}
-		r := NewRedactor([]Pattern{pat})
-		det := &stubNERDetector{entities: []NEREntity{
-			{Group: "PER", Start: 5, End: 15, Score: 0.9},
-		}}
+	It("unions multiple detectors and keeps the stronger action on overlap", func() {
+		// Detector A marks 0..10 as mask; detector B marks 5..15 as block.
+		// After merge, the union 0..15 keeps the strongest action (block).
+		detA := &stubNERDetector{entities: []NEREntity{{Group: "A", Start: 0, End: 10, Score: 0.9}}}
+		detB := &stubNERDetector{entities: []NEREntity{{Group: "B", Start: 5, End: 15, Score: 0.9}}}
 		text := "0123456789ABCDEF"
-		res, err := r.RedactWithNER(context.Background(), text, nil, NERConfig{
-			Detector:      det,
-			EntityActions: map[string]Action{"PER": ActionBlock},
+		res, err := RedactNER(context.Background(), text, []NERConfig{
+			{Detector: detA, EntityActions: map[string]Action{"A": ActionMask}},
+			{Detector: detB, EntityActions: map[string]Action{"B": ActionBlock}},
 		})
 		Expect(err).NotTo(HaveOccurred())
+		Expect(detA.calls).To(Equal(1))
+		Expect(detB.calls).To(Equal(1))
 		Expect(res.Blocked).To(BeTrue(), "overlapping mask+block should set Blocked=true")
 	})
 
-	It("detector error returns regex result and error", func() {
-		// Fail-open: when the NER detector errors, the redactor still
-		// returns regex-tier hits so an offline NER backend doesn't strip
-		// the cheap protection. Caller can read the error and decide
-		// whether to surface it.
-		det := &stubNERDetector{err: errors.New("backend offline")}
-		r := NewRedactor([]Pattern{pickEmail()})
-		res, err := r.RedactWithNER(context.Background(), "ping alice@example.com", nil, NERConfig{
-			Detector:      det,
-			DefaultAction: ActionMask,
+	It("returns a best-effort result and the error when a detector fails (fail-closed contract)", func() {
+		// One healthy detector, one failing. RedactNER returns the healthy
+		// detector's hits AND the error, so the caller can fail closed.
+		good := &stubNERDetector{entities: []NEREntity{{Group: "PER", Start: 0, End: 5, Score: 0.9}}}
+		bad := &stubNERDetector{err: errors.New("backend offline")}
+		res, err := RedactNER(context.Background(), "Alice", []NERConfig{
+			{Detector: good, DefaultAction: ActionMask},
+			{Detector: bad, DefaultAction: ActionMask},
 		})
-		Expect(err).To(HaveOccurred(), "expected detector error to surface")
-		Expect(res.Redacted).To(ContainSubstring("[REDACTED:email]"), "regex tier should still apply on NER failure")
+		Expect(err).To(HaveOccurred())
+		Expect(res.Redacted).To(ContainSubstring("[REDACTED:ner:PER]"), "healthy detector's hits should still apply")
 	})
 
-	It("out-of-bounds offsets are skipped", func() {
-		// A misconfigured / buggy backend could return offsets past the
-		// end of text. The redactor must not panic on slice OOB.
+	It("skips out-of-bounds offsets without panicking", func() {
 		det := &stubNERDetector{entities: []NEREntity{
 			{Group: "PER", Start: 0, End: 999, Score: 0.9},
 			{Group: "PER", Start: -1, End: 3, Score: 0.9},
 			{Group: "PER", Start: 5, End: 5, Score: 0.9}, // zero-length
 		}}
-		r := NewRedactor(nil)
-		res, err := r.RedactWithNER(context.Background(), "Alice", nil, NERConfig{
+		res, err := RedactNER(context.Background(), "Alice", []NERConfig{{
 			Detector:      det,
 			DefaultAction: ActionMask,
-		})
+		}})
 		Expect(err).NotTo(HaveOccurred())
 		Expect(res.Redacted).To(Equal("Alice"))
 		Expect(res.Spans).To(BeEmpty())
 	})
 })
 
-// --- test helpers ---
+var _ = Describe("NERConfigFromRaw", func() {
+	det := &stubNERDetector{}
 
-// rangeMatcher is a deterministic regexpMatcher stub: it claims one
-// fixed range regardless of input. Lets the overlap-merge test
-// produce a known regex/NER intersection without depending on a real
-// compiled regex.
-type rangeMatcher struct{ start, end int }
+	It("defaults an empty default_action to mask and an empty source to ner", func() {
+		cfg := NERConfigFromRaw(det, 0.4, "", nil, "")
+		Expect(cfg.DefaultAction).To(Equal(ActionMask))
+		Expect(cfg.MinScore).To(BeNumerically("~", 0.4, 1e-6))
+		Expect(cfg.Source).To(Equal(SourceNER))
+		Expect(cfg.patternID("EMAIL")).To(Equal("ner:EMAIL"))
+	})
 
-func (m rangeMatcher) FindAllStringIndex(_ string, _ int) [][]int {
-	return [][]int{{m.start, m.end}}
-}
+	It("passes through valid actions and drops invalid ones", func() {
+		cfg := NERConfigFromRaw(det, 0, "block", map[string]string{
+			"PASSWORD": "block",
+			"EMAIL":    "mask",
+			"BOGUS":    "nonsense", // dropped
+		}, SourceNER)
+		Expect(cfg.DefaultAction).To(Equal(ActionBlock))
+		Expect(cfg.EntityActions).To(HaveKeyWithValue("PASSWORD", ActionBlock))
+		Expect(cfg.EntityActions).To(HaveKeyWithValue("EMAIL", ActionMask))
+		Expect(cfg.EntityActions).NotTo(HaveKey("BOGUS"))
+	})
 
-func rangeRegex(start, end int) regexpMatcher { return rangeMatcher{start: start, end: end} }
-
-// pickEmail returns the compiled "email" pattern from DefaultPatterns
-// — the NER tests use it as the regex tier's contribution.
-func pickEmail() Pattern {
-	for _, p := range DefaultPatterns() {
-		if p.ID == "email" {
-			compiled, err := Compile([]Pattern{p})
-			ExpectWithOffset(1, err).NotTo(HaveOccurred(), "compile")
-			return compiled[0]
-		}
-	}
-	Fail("email pattern missing from DefaultPatterns")
-	return Pattern{}
-}
+	It("prefixes pattern-detector hits with the pattern source", func() {
+		cfg := NERConfigFromRaw(det, 0, "mask", nil, SourcePattern)
+		Expect(cfg.Source).To(Equal(SourcePattern))
+		Expect(cfg.patternID("ANTHROPIC_KEY")).To(Equal("pattern:ANTHROPIC_KEY"))
+	})
+})
 
+var _ = Describe("NERConfig.ResolveAction", func() {
+	It("prefers an explicit entity action over the default", func() {
+		cfg := NERConfig{EntityActions: map[string]Action{"EMAIL": ActionBlock}, DefaultAction: ActionMask}
+		a, ok := cfg.ResolveAction("EMAIL")
+		Expect(ok).To(BeTrue())
+		Expect(a).To(Equal(ActionBlock))
+	})
+
+	It("falls back to the default action", func() {
+		cfg := NERConfig{DefaultAction: ActionMask}
+		a, ok := cfg.ResolveAction("ANYTHING")
+		Expect(ok).To(BeTrue())
+		Expect(a).To(Equal(ActionMask))
+	})
+
+	It("ignores a group with no override and no default", func() {
+		cfg := NERConfig{}
+		_, ok := cfg.ResolveAction("ANYTHING")
+		Expect(ok).To(BeFalse())
+	})
+})
diff --git a/core/services/routing/pii/patterns.go b/core/services/routing/pii/patterns.go
deleted file mode 100644
index 1e1ef50a14f7..000000000000
--- a/core/services/routing/pii/patterns.go
+++ /dev/null
@@ -1,188 +0,0 @@
-package pii
-
-import (
-	"fmt"
-	"regexp"
-	"strings"
-)
-
-// regexpMatcher is a thin wrapper so tests can swap in a deterministic
-// matcher without touching the regexp package. Real usage uses
-// regexpMatcherFromPattern; tests can construct fakes.
-type regexpMatcher interface {
-	FindAllStringIndex(s string, n int) [][]int
-}
-
-type goRegexp struct{ r *regexp.Regexp }
-
-func (g goRegexp) FindAllStringIndex(s string, n int) [][]int {
-	return g.r.FindAllStringIndex(s, n)
-}
-
-// DefaultPatterns returns the built-in regex set. Each entry includes
-// a conservative MaxMatchLength so the streaming filter can size its
-// tail buffer without re-parsing the regex at runtime.
-//
-// Caveats by design:
-//   - The phone pattern matches international and US formats but does
-//     not validate area codes. False positives on numbers that look
-//     phone-like (e.g., timestamps in some formats) are accepted in
-//     return for reliable coverage.
-//   - The credit card pattern requires the Luhn check (verifyLuhn) to
-//     reduce false positives — random 16-digit strings won't match.
-//   - The API-key pattern targets common provider prefixes (sk-, pk-,
-//     xoxb-, ghp_, github_pat_) rather than guessing entropy. Adding
-//     new providers should append a new Pattern, not extend an
-//     existing alternation, so the admin UI can show one row per
-//     provider with its own toggle.
-func DefaultPatterns() []Pattern {
-	return []Pattern{
-		{
-			ID:             "email",
-			Description:    "Email address",
-			Action:         ActionMask,
-			MaxMatchLength: 254, // RFC 5321 max
-		},
-		{
-			ID:             "phone",
-			Description:    "Phone number (international or US format)",
-			Action:         ActionMask,
-			MaxMatchLength: 24,
-		},
-		{
-			ID:             "ssn",
-			Description:    "US Social Security Number (NNN-NN-NNNN)",
-			Action:         ActionMask,
-			MaxMatchLength: 11,
-		},
-		{
-			ID:             "credit_card",
-			Description:    "Credit card number (Luhn-verified)",
-			Action:         ActionMask,
-			MaxMatchLength: 19,
-		},
-		{
-			ID:             "ipv4",
-			Description:    "IPv4 address",
-			Action:         ActionMask,
-			MaxMatchLength: 15,
-		},
-		{
-			ID:             "api_key_prefix",
-			Description:    "Common API key prefixes (sk-, pk-, xoxb-, ghp_, github_pat_)",
-			Action:         ActionBlock, // tighter default — leaked credentials are higher harm
-			MaxMatchLength: 200,
-		},
-	}
-}
-
-// patternRegexps maps Pattern.ID to its compiled regex. Kept separate
-// from the Pattern struct so DefaultPatterns can be data-only and
-// tests can swap matchers via Compile().
-var patternRegexps = map[string]*regexp.Regexp{
-	// Pragmatic email — does not implement RFC 5322 in full (no one
-	// sane does in a regex). Catches the common shape; the encoder
-	// NER tier (future) catches edge cases.
-	"email": regexp.MustCompile(`(?i)[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}`),
-	// US: (123) 456-7890, 123-456-7890, 123.456.7890, 1234567890.
-	// International: +<country>-<area>-<rest> with separators.
-	"phone": regexp.MustCompile(`(?:\+?\d{1,3}[\s\-.]?)?(?:\(\d{3}\)|\d{3})[\s\-.]?\d{3}[\s\-.]?\d{4}`),
-	"ssn":   regexp.MustCompile(`\b\d{3}-\d{2}-\d{4}\b`),
-	// 13-19 digit Luhn-eligible runs. The verifier in match() rejects
-	// non-Luhn matches.
-	"credit_card": regexp.MustCompile(`\b(?:\d[ \-]?){13,19}\b`),
-	"ipv4":        regexp.MustCompile(`\b(?:\d{1,3}\.){3}\d{1,3}\b`),
-	// Common provider prefixes; each alternative is a separate
-	// well-known marker rather than a permissive entropy match.
-	"api_key_prefix": regexp.MustCompile(`(?:sk-[A-Za-z0-9]{20,}|pk-[A-Za-z0-9]{20,}|xoxb-[A-Za-z0-9\-]{20,}|ghp_[A-Za-z0-9]{20,}|github_pat_[A-Za-z0-9_]{20,})`),
-}
-
-// Compile attaches matchers to each pattern. Patterns whose ID is not
-// in patternRegexps are returned as a typed error so an admin who
-// adds a custom pattern via config gets a clear "no regex registered"
-// message instead of silent skip.
-func Compile(patterns []Pattern) ([]Pattern, error) {
-	out := make([]Pattern, len(patterns))
-	for i, p := range patterns {
-		r, ok := patternRegexps[p.ID]
-		if !ok {
-			return nil, fmt.Errorf("pii: no regex registered for pattern id %q", p.ID)
-		}
-		p.regex = goRegexp{r: r}
-		out[i] = p
-	}
-	return out, nil
-}
-
-// VerifyMatch applies pattern-specific post-checks (e.g. Luhn for
-// credit_card). Returns the original match or "" to discard it.
-func VerifyMatch(patternID, candidate string) string {
-	switch patternID {
-	case "credit_card":
-		digits := stripNonDigits(candidate)
-		if len(digits) < 13 || len(digits) > 19 {
-			return ""
-		}
-		if !verifyLuhn(digits) {
-			return ""
-		}
-	case "ipv4":
-		// Each octet must be 0..255. The regex allows 0..999 since
-		// regex isn't great at numeric ranges; we tighten here.
-		for oct := range strings.SplitSeq(candidate, ".") {
-			n := 0
-			for _, c := range oct {
-				if c < '0' || c > '9' {
-					return ""
-				}
-				n = n*10 + int(c-'0')
-			}
-			if n > 255 {
-				return ""
-			}
-		}
-	}
-	return candidate
-}
-
-func stripNonDigits(s string) string {
-	var b strings.Builder
-	b.Grow(len(s))
-	for _, c := range s {
-		if c >= '0' && c <= '9' {
-			b.WriteRune(c)
-		}
-	}
-	return b.String()
-}
-
-// verifyLuhn implements the Luhn checksum used by credit-card numbers.
-// Returns true iff the digits pass.
-func verifyLuhn(digits string) bool {
-	sum := 0
-	double := false
-	for i := len(digits) - 1; i >= 0; i-- {
-		d := int(digits[i] - '0')
-		if double {
-			d *= 2
-			if d > 9 {
-				d -= 9
-			}
-		}
-		sum += d
-		double = !double
-	}
-	return sum%10 == 0
-}
-
-// MaxPatternLength returns the longest MaxMatchLength across the input
-// patterns. Used by the streaming filter to size its tail buffer.
-func MaxPatternLength(patterns []Pattern) int {
-	max := 0
-	for _, p := range patterns {
-		if p.MaxMatchLength > max {
-			max = p.MaxMatchLength
-		}
-	}
-	return max
-}
diff --git a/core/services/routing/pii/redactor.go b/core/services/routing/pii/redactor.go
index b70192cacd2f..838ff82574b8 100644
--- a/core/services/routing/pii/redactor.go
+++ b/core/services/routing/pii/redactor.go
@@ -4,211 +4,72 @@ import (
 	"context"
 	"crypto/sha256"
 	"encoding/hex"
-	"fmt"
-	"slices"
 	"sort"
 	"strings"
-	"sync"
+
+	"github.com/mudler/xlog"
 )
 
-// rawHit is one detection — regex-side or NER-side — before
-// overlap-merging. Lifted to file scope so the regex and NER
-// collectors can both produce them and feed the same merge/emit step.
+// rawHit is one detection before overlap-merging. Lifted to file scope so
+// the NER collector and the merge/emit step can share it.
 type rawHit struct {
 	patternID string
 	action    Action
 	start     int
 	end       int
-}
-
-// Redactor scans text against a configured pattern set and applies the
-// per-pattern action. The pattern set itself is mutable at runtime via
-// SetAction (the /api/pii/patterns/:id admin endpoint mutates it
-// in-place); reads are guarded by a mutex so concurrent requests stay
-// race-free.
-type Redactor struct {
-	mu       sync.RWMutex
-	patterns []Pattern
-	maxLen   int
-}
-
-// NewRedactor constructs a redactor from a list of compiled patterns
-// (use Compile() to compile config-loaded patterns first). nil
-// patterns is valid and produces a no-op redactor — convenient for the
-// "PII disabled" deployment.
-func NewRedactor(patterns []Pattern) *Redactor {
-	return &Redactor{
-		patterns: patterns,
-		maxLen:   MaxPatternLength(patterns),
-	}
-}
-
-// MaxPatternLength is exposed so the streaming wrapper can size its
-// tail buffer to match.
-func (r *Redactor) MaxPatternLength() int { return r.maxLen }
-
-// Patterns returns a copy of the configured pattern set so callers can
-// iterate without holding the redactor lock. The compiled regexes are
-// shared — they are immutable once built.
-func (r *Redactor) Patterns() []Pattern {
-	r.mu.RLock()
-	defer r.mu.RUnlock()
-	return slices.Clone(r.patterns)
-}
-
-// SetAction overrides the action for a single pattern. Used by the
-// /api/pii/patterns/:id admin endpoint and the set_pii_pattern_action
-// MCP tool — transient until process restart unless persisted via
-// --pii-config.
-//
-// Publishes a new slice so concurrent Redact callers iterating an
-// older snapshot don't race on the per-element Action string (Go
-// strings are not atomic two-word values).
-func (r *Redactor) SetAction(id string, action Action) error {
-	if action != ActionMask && action != ActionBlock && action != ActionRouteLocal {
-		return fmt.Errorf("unknown action %q (must be mask, block, or route_local)", action)
-	}
-	r.mu.Lock()
-	defer r.mu.Unlock()
-	for i := range r.patterns {
-		if r.patterns[i].ID == id {
-			next := slices.Clone(r.patterns)
-			next[i].Action = action
-			r.patterns = next
-			return nil
-		}
-	}
-	return fmt.Errorf("unknown pattern id %q", id)
-}
-
-// SetDisabled toggles a pattern's enabled state in the live redactor.
-// Same COW publish as SetAction.
-func (r *Redactor) SetDisabled(id string, disabled bool) error {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-	for i := range r.patterns {
-		if r.patterns[i].ID == id {
-			next := slices.Clone(r.patterns)
-			next[i].Disabled = disabled
-			r.patterns = next
-			return nil
-		}
-	}
-	return fmt.Errorf("unknown pattern id %q", id)
-}
-
-// Redact is a thin wrapper for callers that don't need per-request
-// action overrides. It applies each pattern's compiled-in default
-// action.
-func (r *Redactor) Redact(text string) Result {
-	return r.RedactWithOverrides(text, nil)
-}
-
-// RedactWithOverrides scans text and returns the result. The override
-// map is keyed by pattern id; when present, the value replaces the
-// pattern's compiled-in action for this call only — the redactor's
-// stored action is unchanged. Pattern ids missing from the map use
-// their stored action.
-//
-// For every match it records a Span (with HashPrefix, never the value)
-// and applies the resolved Action:
-//   - block: sets Result.Blocked, leaves text intact (caller decides
-//     whether to surface the redacted form).
-//   - mask: replaces the span with maskFor(pattern.ID).
-//   - route_local: sets Result.LocalOnly, leaves text intact.
+	score     float32
+}
+
+// Redactor is a stateless handle for the PII subsystem. The regex tier
+// was removed: detection is driven entirely by per-model NER detectors
+// (see RedactNER), whose policy lives on each detector model's
+// pii_detection config. The type is retained (zero-field) as the
+// on/off sentinel the application wiring and middleware gate on, so a
+// nil *Redactor still means "PII subsystem unavailable".
+type Redactor struct{}
+
+// RedactNER runs every configured NER detector over text, unions their
+// detections, and emits one redacted output. Each NERConfig carries its
+// own detector and policy (min score, entity→action map, default
+// action), so a consuming model that references several detector models
+// gets each model's policy applied to its own hits before the overlap
+// merge (block > mask > allow) resolves any span two detectors both
+// claim.
 //
-// Spans are returned in the original input's coordinate system so the
-// PIIEvent record can be written without re-running the scan.
-func (r *Redactor) RedactWithOverrides(text string, overrides map[string]Action) Result {
-	return r.redact(context.Background(), text, overrides, NERConfig{})
-}
-
-// RedactWithNER is the encoder-tier variant: runs both the regex tier
-// (with per-pattern overrides) and the NER tier, merges hits, and
-// emits one redacted output. A nil NERConfig.Detector skips the NER
-// pass — callers can hand the same path the same NERConfig{} whether
-// or not the model has NER configured.
+// Any detector error is returned alongside a best-effort Result built
+// from the detectors that did succeed, so the caller can fail closed
+// (refuse the request) while still seeing what the healthy detectors
+// found. Configs with a nil Detector are skipped.
 //
-// Errors from the NER detector are returned alongside a best-effort
-// regex-only Result so the caller can decide whether to fail open
-// (return the regex Result, log the error) or fail closed (refuse the
-// request). The regex tier never errors.
-func (r *Redactor) RedactWithNER(ctx context.Context, text string, overrides map[string]Action, nerCfg NERConfig) (Result, error) {
-	if nerCfg.Detector == nil {
-		return r.redact(ctx, text, overrides, nerCfg), nil
-	}
-	hits, err := r.collectRegexHits(text, overrides)
-	if err != nil {
-		return Result{Redacted: text}, err
-	}
-	nerHits, nerErr := collectNERHits(ctx, text, nerCfg)
-	if nerErr != nil {
-		// Return the regex-only result so a NER-backend outage doesn't
-		// strip the cheap protection. Caller decides fail-open vs
-		// fail-closed via the returned error.
-		return mergeAndEmit(text, hits), nerErr
-	}
-	return mergeAndEmit(text, append(hits, nerHits...)), nil
-}
-
-// redact is the internal regex-only entry point. RedactWithOverrides
-// is the public wrapper; RedactWithNER routes through here only when
-// the NER detector is nil (so the call site doesn't need a separate
-// "regex-only" code path).
-func (r *Redactor) redact(_ context.Context, text string, overrides map[string]Action, _ NERConfig) Result {
-	hits, _ := r.collectRegexHits(text, overrides)
-	return mergeAndEmit(text, hits)
-}
-
-// collectRegexHits walks the configured pattern set against text and
-// returns each verified match as a rawHit. The redactor lock is held
-// only long enough to snapshot the pattern slice — regex evaluation
-// runs lock-free against the snapshot, so SetAction/SetDisabled don't
-// stall a long-running Redact.
-func (r *Redactor) collectRegexHits(text string, overrides map[string]Action) ([]rawHit, error) {
-	r.mu.RLock()
-	patterns := r.patterns
-	r.mu.RUnlock()
-
-	if len(patterns) == 0 || text == "" {
-		return nil, nil
+// Package-level (no Redactor state): both the in-band request middleware
+// and the MITM request path call it with their own resolved []NERConfig.
+func RedactNER(ctx context.Context, text string, cfgs []NERConfig) (Result, error) {
+	if text == "" || len(cfgs) == 0 {
+		return Result{Redacted: text}, nil
 	}
 	var hits []rawHit
-	for _, p := range patterns {
-		if p.regex == nil {
-			// Pattern declared but Compile() not called. Skip rather
-			// than panic; the caller already saw an error from Compile.
+	var firstErr error
+	for _, cfg := range cfgs {
+		if cfg.Detector == nil {
 			continue
 		}
-		if p.Disabled {
-			continue
-		}
-		action := p.Action
-		if override, ok := overrides[p.ID]; ok {
-			action = override
-		}
-		idxs := p.regex.FindAllStringIndex(text, -1)
-		for _, idx := range idxs {
-			candidate := text[idx[0]:idx[1]]
-			if VerifyMatch(p.ID, candidate) == "" {
-				continue
+		h, err := collectNERHits(ctx, text, cfg)
+		if err != nil {
+			if firstErr == nil {
+				firstErr = err
 			}
-			hits = append(hits, rawHit{
-				patternID: p.ID,
-				action:    action,
-				start:     idx[0],
-				end:       idx[1],
-			})
+			continue
 		}
+		hits = append(hits, h...)
 	}
-	return hits, nil
+	return mergeAndEmit(text, hits), firstErr
 }
 
 // collectNERHits invokes the configured NERDetector and converts each
 // returned entity into a rawHit using the NERConfig's action map.
 // Entities below MinScore or with no resolved action are dropped — the
 // detector doesn't know which entity groups the admin cares about, so
-// the redactor filters here.
+// the policy filters here.
 func collectNERHits(ctx context.Context, text string, cfg NERConfig) ([]rawHit, error) {
 	if cfg.Detector == nil || text == "" {
 		return nil, nil
@@ -219,42 +80,58 @@ func collectNERHits(ctx context.Context, text string, cfg NERConfig) ([]rawHit,
 	}
 	var hits []rawHit
 	for _, e := range entities {
+		// One DEBUG line per raw detection with the model's confidence, the
+		// byte range, the matched substring, and the policy decision. This is
+		// the lowest-level view of why a request was masked/blocked — e.g. a
+		// phone number scored as SSN — and answers "what was in that range and
+		// how sure was the model" without re-running the detector. DEBUG-gated
+		// because the matched value is sensitive.
 		if e.Score < cfg.MinScore {
+			xlog.Debug("pii/ner: detection dropped (below min score)",
+				"group", e.Group, "score", e.Score, "min_score", cfg.MinScore,
+				"start", e.Start, "end", e.End, "text", e.Text)
 			continue
 		}
 		action, ok := cfg.ResolveAction(e.Group)
 		if !ok {
+			xlog.Debug("pii/ner: detection ignored (no action for group)",
+				"group", e.Group, "score", e.Score,
+				"start", e.Start, "end", e.End, "text", e.Text)
 			continue
 		}
 		if e.Start < 0 || e.End <= e.Start || e.End > len(text) {
-			// Defensive: the backend should return byte offsets into
-			// the original text, but a misconfigured model could
-			// produce garbage. Skip rather than panic on slice OOB.
+			// Defensive: the backend should return byte offsets into the
+			// original text, but a misconfigured model could produce
+			// garbage. Skip rather than panic on slice OOB.
+			xlog.Warn("pii/ner: detection has out-of-range offsets; skipping",
+				"group", e.Group, "start", e.Start, "end", e.End, "text_len", len(text))
 			continue
 		}
+		xlog.Debug("pii/ner: detection accepted",
+			"group", e.Group, "score", e.Score, "action", action,
+			"start", e.Start, "end", e.End, "text", e.Text)
 		hits = append(hits, rawHit{
-			patternID: nerPatternID(e.Group),
+			patternID: cfg.patternID(e.Group),
 			action:    action,
 			start:     e.Start,
 			end:       e.End,
+			score:     e.Score,
 		})
 	}
 	return hits, nil
 }
 
-// mergeAndEmit handles the overlap-merge + masked-output step that
-// regex-only and combined regex+NER redactions both perform. Sorts by
+// mergeAndEmit handles the overlap-merge + masked-output step. Sorts by
 // start (stable on equal starts by descending action strength), drops
-// overlapping hits in favour of the stronger action, and walks the
-// text once to emit replacement spans.
+// overlapping hits in favour of the stronger action, and walks the text
+// once to emit replacement spans.
 func mergeAndEmit(text string, hits []rawHit) Result {
 	if len(hits) == 0 {
 		return Result{Redacted: text}
 	}
-	// Sort and deduplicate overlapping hits — when two patterns claim
-	// the same span (e.g., a credit-card-shaped value also scans as
-	// digits, or NER tags a span the regex also caught), keep the one
-	// with the strongest action. Order: block > route_local > mask.
+	// Sort and deduplicate overlapping hits — when two detectors claim
+	// the same span, keep the one with the strongest action. Order:
+	// block > mask > allow.
 	sort.Slice(hits, func(i, j int) bool {
 		if hits[i].start != hits[j].start {
 			return hits[i].start < hits[j].start
@@ -269,6 +146,7 @@ func mergeAndEmit(text string, hits []rawHit) Result {
 				if actionRank(h.action) > actionRank(last.action) {
 					last.action = h.action
 					last.patternID = h.patternID
+					last.score = h.score
 				}
 				if h.end > last.end {
 					last.end = h.end
@@ -290,6 +168,8 @@ func mergeAndEmit(text string, hits []rawHit) Result {
 			End:        h.end,
 			Pattern:    h.patternID,
 			HashPrefix: hashPrefix(matched),
+			Action:     h.action,
+			Score:      h.score,
 		}
 		res.Spans = append(res.Spans, span)
 
@@ -298,10 +178,11 @@ func mergeAndEmit(text string, hits []rawHit) Result {
 		case ActionBlock:
 			res.Blocked = true
 			out.WriteString(matched)
-		case ActionRouteLocal:
-			res.LocalOnly = true
+		case ActionAllow:
+			// Detect-and-log only: leave the matched text in place.
 			out.WriteString(matched)
 		default:
+			res.Masked = true
 			out.WriteString(maskFor(h.patternID))
 		}
 		cursor = h.end
@@ -313,17 +194,15 @@ func mergeAndEmit(text string, hits []rawHit) Result {
 
 // maskFor returns the placeholder that replaces a matched span. The
 // shape "[REDACTED:<id>]" is intentionally stable — it surfaces the
-// pattern id back to the model, which is sometimes useful (e.g., the
-// model can say "I see you redacted an email"). Admins who want a
-// less informative replacement can build one in front of this.
+// detector group back to the model (e.g. "I see you redacted an email").
 func maskFor(patternID string) string {
 	return "[REDACTED:" + patternID + "]"
 }
 
-// hashPrefix returns the first 8 chars of sha256(value). Two calls
-// with the same input produce the same prefix so an admin auditing
-// the PIIEvent log can spot a recurring leak ("the same SSN appears
-// 200 times this hour") without ever recovering the value.
+// hashPrefix returns the first 8 chars of sha256(value). Two calls with
+// the same input produce the same prefix so an admin auditing the
+// PIIEvent log can spot a recurring leak without ever recovering the
+// value.
 func hashPrefix(value string) string {
 	sum := sha256.Sum256([]byte(value))
 	return hex.EncodeToString(sum[:])[:8]
@@ -333,9 +212,9 @@ func actionRank(a Action) int {
 	switch a {
 	case ActionBlock:
 		return 3
-	case ActionRouteLocal:
-		return 2
 	case ActionMask:
+		return 2
+	case ActionAllow:
 		return 1
 	}
 	return 0
diff --git a/core/services/routing/pii/redactor_race_test.go b/core/services/routing/pii/redactor_race_test.go
deleted file mode 100644
index f926ea64dea0..000000000000
--- a/core/services/routing/pii/redactor_race_test.go
+++ /dev/null
@@ -1,66 +0,0 @@
-package pii
-
-import (
-	"sync"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// Redactor_SetActionConcurrentRedact pins the SetAction copy-on-
-// write contract: concurrent SetAction must not race with readers
-// iterating an older patterns snapshot. Run with -race to surface the
-// regression that motivated the COW (in-place mutation of the
-// per-element Action string is not atomic).
-var _ = Describe("Redactor", func() {
-	It("SetAction concurrent with Redact", func() {
-		patterns, err := Compile(DefaultPatterns())
-		Expect(err).NotTo(HaveOccurred(), "compile")
-		r := NewRedactor(patterns)
-
-		const writers = 4
-		const readers = 8
-		const iter = 100
-
-		var wg sync.WaitGroup
-		stop := make(chan struct{})
-
-		for w := 0; w < writers; w++ {
-			wg.Add(1)
-			go func() {
-				defer wg.Done()
-				for i := 0; i < iter; i++ {
-					select {
-					case <-stop:
-						return
-					default:
-					}
-					action := ActionMask
-					if i%2 == 0 {
-						action = ActionBlock
-					}
-					_ = r.SetAction("email", action)
-				}
-			}()
-		}
-
-		for rd := 0; rd < readers; rd++ {
-			wg.Add(1)
-			go func() {
-				defer wg.Done()
-				text := "contact alice@example.com please"
-				for i := 0; i < iter*2; i++ {
-					select {
-					case <-stop:
-						return
-					default:
-					}
-					_ = r.Redact(text)
-				}
-			}()
-		}
-
-		wg.Wait()
-		close(stop)
-	})
-})
diff --git a/core/services/routing/pii/redactor_test.go b/core/services/routing/pii/redactor_test.go
index a084e4d542f5..22a5a413e36e 100644
--- a/core/services/routing/pii/redactor_test.go
+++ b/core/services/routing/pii/redactor_test.go
@@ -1,184 +1,88 @@
 package pii
 
 import (
+	"context"
+
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 
-func mustCompile(ids ...string) []Pattern {
-	all := DefaultPatterns()
-	if len(ids) == 0 {
-		out, err := Compile(all)
-		ExpectWithOffset(1, err).NotTo(HaveOccurred(), "compile")
-		return out
-	}
-	pickP := pick(all, ids)
-	out, err := Compile(pickP)
-	ExpectWithOffset(1, err).NotTo(HaveOccurred(), "compile")
-	return out
+// detect builds a single-detector []NERConfig that reports one entity
+// over the whole input under the given group/action.
+func oneShot(group string, action Action, start, end int) []NERConfig {
+	return []NERConfig{{
+		Detector:      &stubNERDetector{entities: []NEREntity{{Group: group, Start: start, End: end, Score: 1}}},
+		EntityActions: map[string]Action{group: action},
+	}}
 }
 
-func pick(all []Pattern, ids []string) []Pattern {
-	keep := map[string]bool{}
-	for _, id := range ids {
-		keep[id] = true
-	}
-	var out []Pattern
-	for _, p := range all {
-		if keep[p.ID] {
-			out = append(out, p)
-		}
-	}
-	return out
-}
+var _ = Describe("RedactNER emission", func() {
+	ctx := context.Background()
 
-var _ = Describe("Redactor", func() {
-	It("masks email", func() {
-		r := NewRedactor(mustCompile("email"))
-		res := r.Redact("Contact me at alice@example.com any time.")
-		Expect(res.Blocked).To(BeFalse(), "email is mask-action by default, should not block")
-		Expect(res.Redacted).To(ContainSubstring("[REDACTED:email]"))
+	It("masks with a [REDACTED:ner:GROUP] placeholder and records a hash prefix", func() {
+		res, err := RedactNER(ctx, "Contact me at alice@example.com any time.", oneShot("EMAIL", ActionMask, 14, 31))
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Masked).To(BeTrue())
+		Expect(res.Blocked).To(BeFalse())
+		Expect(res.Redacted).To(ContainSubstring("[REDACTED:ner:EMAIL]"))
 		Expect(res.Redacted).NotTo(ContainSubstring("alice@example.com"))
 		Expect(res.Spans).To(HaveLen(1))
 		Expect(res.Spans[0].HashPrefix).NotTo(BeEmpty(), "hash prefix must be set so audits can dedupe leaks")
 	})
 
-	It("masks SSN", func() {
-		r := NewRedactor(mustCompile("ssn"))
-		res := r.Redact("call me about SSN 123-45-6789 please")
-		Expect(res.Redacted).To(ContainSubstring("[REDACTED:ssn]"))
+	It("labels pattern-detector hits with the pattern source, not ner", func() {
+		cfgs := []NERConfig{{
+			Detector:      &stubNERDetector{entities: []NEREntity{{Group: "ANTHROPIC_KEY", Start: 4, End: 24, Score: 1}}},
+			EntityActions: map[string]Action{"ANTHROPIC_KEY": ActionMask},
+			Source:        SourcePattern,
+		}}
+		res, err := RedactNER(ctx, "use sk-ant-aaaaaaaaaaaaaaaa now", cfgs)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Redacted).To(ContainSubstring("[REDACTED:pattern:ANTHROPIC_KEY]"))
+		Expect(res.Redacted).NotTo(ContainSubstring("[REDACTED:ner:"))
+		Expect(res.Spans).To(HaveLen(1))
+		Expect(res.Spans[0].Pattern).To(Equal("pattern:ANTHROPIC_KEY"))
 	})
 
-	It("uses Luhn for credit card", func() {
-		r := NewRedactor(mustCompile("credit_card"))
-
-		// 4111 1111 1111 1111 — canonical Luhn-valid Visa test number.
-		good := r.Redact("card: 4111 1111 1111 1111")
-		Expect(good.Spans).To(HaveLen(1))
-		Expect(good.Redacted).To(ContainSubstring("[REDACTED:credit_card]"))
-
-		// 4111 1111 1111 1112 — same shape, fails Luhn. Must NOT match.
-		bad := r.Redact("card: 4111 1111 1111 1112")
-		Expect(bad.Spans).To(BeEmpty(), "Luhn-invalid 16-digit run must not be redacted")
-		Expect(bad.Redacted).To(ContainSubstring("1112"), "Luhn-invalid input should pass through untouched")
+	It("block leaves the matched span intact and sets Blocked", func() {
+		res, err := RedactNER(ctx, "token sk-abcdef here", oneShot("PASSWORD", ActionBlock, 6, 15))
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Blocked).To(BeTrue())
+		Expect(res.Redacted).To(ContainSubstring("sk-abcdef"), "block leaves the value intact for the caller to discard")
+		Expect(res.Spans[0].Action).To(Equal(ActionBlock))
 	})
 
-	It("validates IPv4 octets", func() {
-		r := NewRedactor(mustCompile("ipv4"))
-
-		good := r.Redact("server at 192.168.1.10 is up")
-		Expect(good.Spans).To(HaveLen(1))
-
-		// 999.999.999.999 — regex matches but octet > 255 must reject.
-		bad := r.Redact("not an ip: 999.999.999.999")
-		Expect(bad.Spans).To(BeEmpty(), "ipv4 with octet>255 must not match")
-	})
-
-	It("api_key defaults to block", func() {
-		r := NewRedactor(mustCompile("api_key_prefix"))
-		res := r.Redact("here's a token sk-abcdefghijklmnopqrstuvwxyz0123456789 to use")
-		Expect(res.Blocked).To(BeTrue(), "api_key default action is block; Result.Blocked must be true")
-		// The redacted output keeps the matched value when blocking — the
-		// caller is expected to refuse the request, not to forward a partial.
-		Expect(res.Redacted).To(ContainSubstring("sk-abcdefghijklmn"), "blocked actions leave the matched span intact for caller inspection")
+	It("allow leaves text intact but still records the span", func() {
+		res, err := RedactNER(ctx, "Hello Acme!", oneShot("ORG", ActionAllow, 6, 10))
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Masked).To(BeFalse())
+		Expect(res.Blocked).To(BeFalse())
+		Expect(res.Redacted).To(Equal("Hello Acme!"))
+		Expect(res.Spans).To(HaveLen(1))
 	})
 
-	It("preserves non-matching text", func() {
-		r := NewRedactor(mustCompile()) // all default patterns
-		in := "no PII here at all, just words and numbers like 42 and 1.5"
-		res := r.Redact(in)
-		Expect(res.Redacted).To(Equal(in), "non-PII input should pass through unchanged")
+	It("passes non-matching text through unchanged", func() {
+		det := &stubNERDetector{} // no entities
+		res, err := RedactNER(ctx, "no PII here, just words", []NERConfig{{Detector: det, DefaultAction: ActionMask}})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Redacted).To(Equal("no PII here, just words"))
 		Expect(res.Spans).To(BeEmpty())
 	})
 
-	It("handles empty input", func() {
-		r := NewRedactor(mustCompile())
-		res := r.Redact("")
+	It("handles empty input without calling the detector", func() {
+		det := &stubNERDetector{entities: []NEREntity{{Group: "X", Start: 0, End: 1, Score: 1}}}
+		res, err := RedactNER(ctx, "", []NERConfig{{Detector: det, DefaultAction: ActionMask}})
+		Expect(err).NotTo(HaveOccurred())
 		Expect(res.Redacted).To(BeEmpty())
-		Expect(res.Blocked).To(BeFalse())
-		Expect(res.LocalOnly).To(BeFalse())
 		Expect(res.Spans).To(BeEmpty())
+		Expect(det.calls).To(Equal(0))
 	})
 
-	It("nil patterns is a no-op", func() {
-		// Disabled-PII deployment: pii.NewRedactor(nil) is a no-op.
-		r := NewRedactor(nil)
-		res := r.Redact("alice@example.com sent it")
-		Expect(res.Redacted).To(Equal("alice@example.com sent it"))
-	})
-
-	It("hash prefix is stable", func() {
-		r := NewRedactor(mustCompile("email"))
-		a := r.Redact("a@b.com")
-		b := r.Redact("hi a@b.com again")
+	It("produces a stable hash prefix for the same matched value", func() {
+		a, _ := RedactNER(ctx, "a@b.com", oneShot("EMAIL", ActionMask, 0, 7))
+		b, _ := RedactNER(ctx, "hi a@b.com", oneShot("EMAIL", ActionMask, 3, 10))
 		Expect(a.Spans).To(HaveLen(1))
 		Expect(b.Spans).To(HaveLen(1))
 		Expect(a.Spans[0].HashPrefix).To(Equal(b.Spans[0].HashPrefix), "same matched value must produce same hash prefix")
 	})
 })
-
-var _ = Describe("Compile", func() {
-	It("rejects unknown pattern id", func() {
-		_, err := Compile([]Pattern{{ID: "nonexistent", Action: ActionMask}})
-		Expect(err).To(HaveOccurred(), "Compile must error on unknown pattern id")
-	})
-})
-
-var _ = Describe("MaxPatternLength", func() {
-	It("returns the longest pattern's max length", func() {
-		patterns := mustCompile("email", "ssn")
-		got := MaxPatternLength(patterns)
-		// email is the longer of the two (254). The streaming filter
-		// will use this to size its tail buffer.
-		Expect(got).To(Equal(254))
-	})
-})
-
-var _ = Describe("RedactWithOverrides", func() {
-	It("upgrades action", func() {
-		// email is mask by default; the per-model override turns it into a
-		// hard block for one request without mutating the redactor.
-		r := NewRedactor(mustCompile("email"))
-		res := r.RedactWithOverrides("contact alice@example.com",
-			map[string]Action{"email": ActionBlock})
-		Expect(res.Blocked).To(BeTrue(), "override should have set Blocked")
-		// Block leaves the value intact (the caller short-circuits the
-		// request) — the redactor never echoes the matched text.
-		Expect(res.Redacted).To(ContainSubstring("alice@example.com"), "block leaves text intact for the caller to discard")
-		// Stored action is unchanged so a subsequent default Redact still
-		// masks rather than blocks.
-		res2 := r.Redact("contact alice@example.com")
-		Expect(res2.Blocked).To(BeFalse(), "override must not mutate stored action")
-	})
-
-	It("ignores unknown IDs", func() {
-		// An override for a pattern this redactor doesn't know about is a
-		// no-op rather than an error — per-model configs may reference
-		// patterns from a wider catalogue than the active redactor holds.
-		r := NewRedactor(mustCompile("email"))
-		res := r.RedactWithOverrides("contact alice@example.com",
-			map[string]Action{"ssn": ActionBlock})
-		Expect(res.Blocked).To(BeFalse(), "ssn override against email-only redactor must be no-op")
-	})
-})
-
-var _ = Describe("SetAction", func() {
-	It("swaps in place", func() {
-		r := NewRedactor(mustCompile("email"))
-		Expect(r.SetAction("email", ActionRouteLocal)).To(Succeed())
-		res := r.Redact("contact alice@example.com")
-		Expect(res.LocalOnly).To(BeTrue(), "expected LocalOnly after SetAction(route_local)")
-		Expect(res.Blocked).To(BeFalse(), "SetAction(route_local) should not block")
-	})
-
-	It("rejects unknown id", func() {
-		r := NewRedactor(mustCompile("email"))
-		Expect(r.SetAction("nonexistent", ActionMask)).NotTo(Succeed(), "expected error for unknown pattern id")
-	})
-
-	It("rejects unknown action", func() {
-		r := NewRedactor(mustCompile("email"))
-		Expect(r.SetAction("email", Action("frobnicate"))).NotTo(Succeed(), "expected error for unknown action")
-	})
-})
-
diff --git a/core/services/routing/pii/stream.go b/core/services/routing/pii/stream.go
deleted file mode 100644
index 93a5cd261f75..000000000000
--- a/core/services/routing/pii/stream.go
+++ /dev/null
@@ -1,197 +0,0 @@
-package pii
-
-import (
-	"context"
-	"crypto/rand"
-	"encoding/hex"
-	"strings"
-	"time"
-	"unicode/utf8"
-)
-
-// StreamFilter applies the regex PII tier to a streaming response,
-// chunk by chunk, with a buffered-emit invariant: for any active
-// pattern with bounded max-length L, the filter never emits the
-// trailing L-1 characters of the cumulative input until either
-//
-//   (a) more text arrives that disambiguates the boundary, or
-//   (b) the stream closes (Drain).
-//
-// That keeps the redactor honest across chunk splits — an email
-// arriving as "alice@" + "example.com" still masks the same way as
-// "alice@example.com" arriving in one piece.
-//
-// Action handling in stream mode differs from the request-side
-// middleware. Earlier chunks of the response are already on the wire
-// by the time later chunks are scanned, so a "block" can't actually
-// reject the request. We remap block → mask for redaction purposes
-// while still recording PIIEvent rows with action="block" so audits
-// surface the original intent ("the model would have leaked X here,
-// suppressed in flight"). route_local on the output side is a no-op
-// (the dispatch decision was already made on the request side).
-//
-// StreamFilter is NOT safe for concurrent use across goroutines; one
-// instance per response stream.
-type StreamFilter struct {
-	redactor      *Redactor
-	maskOverrides map[string]Action // block → mask map used for redaction
-	auditActions  map[string]Action // original action per pattern, used for events
-	store         EventStore
-	correlationID string
-	userID        string
-	holdLen       int
-	buffer        strings.Builder
-	emittedBytes  int
-}
-
-// NewStreamFilter constructs a per-response filter. modelOverrides is
-// the per-model action override map (same shape the request-side
-// middleware uses); it can be nil when the model only accepts global
-// defaults.
-//
-// store may be nil — events are then computed but not persisted, which
-// is what the chat handler does when --disable-stats is set.
-func NewStreamFilter(redactor *Redactor, modelOverrides map[string]Action, store EventStore, correlationID, userID string) *StreamFilter {
-	if redactor == nil {
-		return &StreamFilter{}
-	}
-
-	patterns := redactor.Patterns()
-
-	// auditActions: the action we *would* have applied if this match
-	// occurred on the request side. Honours the per-model override.
-	auditActions := make(map[string]Action, len(patterns))
-	for _, p := range patterns {
-		auditActions[p.ID] = p.Action
-	}
-	for id, action := range modelOverrides {
-		auditActions[id] = action
-	}
-
-	// maskOverrides: the action we actually apply to the stream. Same
-	// as auditActions, but with every block remapped to mask.
-	maskOverrides := make(map[string]Action, len(auditActions))
-	for id, action := range auditActions {
-		if action == ActionBlock {
-			maskOverrides[id] = ActionMask
-		} else {
-			maskOverrides[id] = action
-		}
-	}
-
-	return &StreamFilter{
-		redactor:      redactor,
-		maskOverrides: maskOverrides,
-		auditActions:  auditActions,
-		store:         store,
-		correlationID: correlationID,
-		userID:        userID,
-		holdLen:       redactor.MaxPatternLength() - 1,
-	}
-}
-
-// Push appends new text to the filter's buffer and returns the prefix
-// safe to emit downstream — the cumulative input minus a tail of
-// holdLen characters that might still be the start of a longer match.
-// Returned text has masks already applied.
-//
-// Returns an empty string when not enough text has arrived to clear
-// the hold window.
-func (sf *StreamFilter) Push(text string) string {
-	if sf.redactor == nil || sf.holdLen <= 0 {
-		return text
-	}
-	sf.buffer.WriteString(text)
-	bufStr := sf.buffer.String()
-	n := len(bufStr)
-
-	if n <= sf.holdLen {
-		return ""
-	}
-
-	emitBoundary := n - sf.holdLen
-
-	// Scan the entire buffer. A match whose start is before the
-	// boundary but whose end runs past it crosses the window — pull
-	// the boundary back to match.start so the pattern stays whole in
-	// the buffer for the next Push to scan again.
-	full := sf.redactor.RedactWithOverrides(bufStr, sf.maskOverrides)
-	for _, span := range full.Spans {
-		if span.Start < emitBoundary && span.End > emitBoundary {
-			emitBoundary = span.Start
-		}
-	}
-
-	// holdLen is byte-sized but a chunk boundary may land mid-codepoint.
-	// Snap back to the nearest rune start so neither the emitted prefix
-	// nor the retained tail contains a split codepoint — otherwise the
-	// next regex scan over an invalid-UTF-8 prefix could mis-match.
-	for emitBoundary > 0 && emitBoundary < n && !utf8.RuneStart(bufStr[emitBoundary]) {
-		emitBoundary--
-	}
-
-	if emitBoundary <= 0 {
-		return ""
-	}
-
-	emitted := sf.applyAndEmit(bufStr[:emitBoundary])
-	sf.buffer.Reset()
-	sf.buffer.WriteString(bufStr[emitBoundary:])
-	return emitted
-}
-
-// Drain emits whatever's left in the buffer with all matches applied.
-// Call exactly once when the stream closes — repeat calls return the
-// empty string.
-func (sf *StreamFilter) Drain() string {
-	if sf.redactor == nil {
-		return sf.buffer.String()
-	}
-	bufStr := sf.buffer.String()
-	if bufStr == "" {
-		return ""
-	}
-	emitted := sf.applyAndEmit(bufStr)
-	sf.buffer.Reset()
-	return emitted
-}
-
-// applyAndEmit runs the redactor over a committed-for-emit fragment,
-// substitutes mask/block placeholders inline, and records one
-// PIIEvent per matched span (with the audit action, not the masked
-// one). ByteOffset is referenced to the cumulative emitted output so
-// admins can correlate event positions against the streamed body.
-func (sf *StreamFilter) applyAndEmit(fragment string) string {
-	res := sf.redactor.RedactWithOverrides(fragment, sf.maskOverrides)
-	output := res.Redacted
-
-	if len(res.Spans) > 0 {
-		now := time.Now().UTC()
-		for _, span := range res.Spans {
-			ev := PIIEvent{
-				ID:            newStreamEventID(),
-				CorrelationID: sf.correlationID,
-				UserID:        sf.userID,
-				Direction:     DirectionOut,
-				PatternID:     span.Pattern,
-				ByteOffset:    sf.emittedBytes + span.Start,
-				Length:        span.End - span.Start,
-				HashPrefix:    span.HashPrefix,
-				Action:        sf.auditActions[span.Pattern],
-				CreatedAt:     now,
-			}
-			if sf.store != nil {
-				_ = sf.store.Record(context.Background(), ev)
-			}
-		}
-	}
-
-	sf.emittedBytes += len(fragment)
-	return output
-}
-
-func newStreamEventID() string {
-	var b [12]byte
-	_, _ = rand.Read(b[:])
-	return "pii_" + hex.EncodeToString(b[:])
-}
diff --git a/core/services/routing/pii/stream_test.go b/core/services/routing/pii/stream_test.go
deleted file mode 100644
index 037020609d85..000000000000
--- a/core/services/routing/pii/stream_test.go
+++ /dev/null
@@ -1,184 +0,0 @@
-package pii
-
-import (
-	"context"
-	"fmt"
-	"math/rand"
-	"strings"
-	"unicode/utf8"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func newStreamRedactor(ids ...string) *Redactor {
-	all := DefaultPatterns()
-	chosen := all
-	if len(ids) > 0 {
-		chosen = pick(all, ids)
-	}
-	patterns, err := Compile(chosen)
-	ExpectWithOffset(1, err).NotTo(HaveOccurred(), "compile")
-	return NewRedactor(patterns)
-}
-
-var _ = Describe("StreamFilter", func() {
-	It("masks across chunks", func() {
-		// The most important streaming test: an email split arbitrarily
-		// across chunk boundaries must mask exactly the same way as one
-		// arriving in a single Push.
-		red := newStreamRedactor("email")
-		sf := NewStreamFilter(red, nil, nil, "", "")
-
-		// "alice@example.com" (17 bytes) split between '@' and 'e'.
-		out := ""
-		out += sf.Push("hi alice@")
-		out += sf.Push("example.com! end")
-		out += sf.Drain()
-
-		Expect(out).NotTo(ContainSubstring("alice@example.com"), "stream leaked email across chunk boundary")
-		Expect(out).To(ContainSubstring("[REDACTED:email]"))
-	})
-
-	It("block becomes mask", func() {
-		// api_key_prefix is block by default. In stream mode the earlier
-		// chunks are already on the wire so block is impossible — the
-		// filter remaps to mask while still recording action="block" so
-		// the audit log keeps the original intent.
-		red := newStreamRedactor("api_key_prefix")
-		store := NewMemoryEventStore(0)
-		defer func() { _ = store.Close() }()
-		sf := NewStreamFilter(red, nil, store, "corr-1", "user-1")
-
-		out := sf.Push("here is your token: sk-abcdefghijklmnopqrstuvwxyz0123456789 done")
-		out += sf.Drain()
-
-		Expect(out).NotTo(ContainSubstring("abcdefghijklmnopqrstuvwxyz0123456789"), "block-in-stream must mask, leaked the value")
-		Expect(out).To(ContainSubstring("[REDACTED:api_key_prefix]"))
-
-		events, _ := store.List(context.Background(), ListQuery{Limit: 10})
-		Expect(events).To(HaveLen(1))
-		Expect(events[0].Action).To(Equal(ActionBlock), "audit must record original block action")
-		Expect(events[0].Direction).To(Equal(DirectionOut), "stream events must be DirectionOut")
-	})
-
-	It("no match passthrough", func() {
-		red := newStreamRedactor("email")
-		sf := NewStreamFilter(red, nil, nil, "", "")
-		out := sf.Push("perfectly clean text that should") + sf.Push(" pass through unchanged.") + sf.Drain()
-		Expect(out).To(Equal("perfectly clean text that should pass through unchanged."))
-	})
-
-	It("nil redactor passthrough", func() {
-		// --disable-pii path: NewStreamFilter(nil, ...) returns a filter
-		// that just forwards Push input verbatim.
-		sf := NewStreamFilter(nil, nil, nil, "", "")
-		out := sf.Push("any old text including alice@example.com") + sf.Drain()
-		Expect(out).To(Equal("any old text including alice@example.com"))
-	})
-
-	It("per-model overrides", func() {
-		// email defaults to mask; per-model override upgrades to block.
-		// In stream mode the override still maps to mask placeholder, but
-		// the audit event records action="block".
-		red := newStreamRedactor("email")
-		store := NewMemoryEventStore(0)
-		defer func() { _ = store.Close() }()
-		sf := NewStreamFilter(red, map[string]Action{"email": ActionBlock}, store, "corr-2", "user-2")
-
-		out := sf.Push("contact alice@example.com please") + sf.Drain()
-		Expect(out).NotTo(ContainSubstring("alice@example.com"), "override block-in-stream must mask")
-		events, _ := store.List(context.Background(), ListQuery{Limit: 10})
-		Expect(events).To(HaveLen(1))
-		Expect(events[0].Action).To(Equal(ActionBlock))
-	})
-
-	// StreamFilter_BufferedEmitInvariant feeds the redactor a corpus
-	// one rune at a time, randomly chunked, and asserts:
-	//
-	//   1. Across all (input, splitting) pairs, the cumulative emitted
-	//      output never contains any of the secret values that were
-	//      embedded in the input.
-	//   2. The output, fully drained, equals what Redact would have
-	//      produced on the unsplit input.
-	//
-	// This is the load-bearing property of streaming PII: regardless of
-	// where chunks split, the emitted bytes cannot contain a value that a
-	// single-shot redactor would have masked.
-	It("buffered emit invariant", func() {
-		corpus := []struct {
-			text    string
-			secrets []string
-		}{
-			{"contact alice@example.com or bob@example.org", []string{"alice@example.com", "bob@example.org"}},
-			{"my SSN is 123-45-6789 and his is 987-65-4321", []string{"123-45-6789", "987-65-4321"}},
-			{"sk-abcdefghijklmnopqrstuvwxyz0123456789 leaked", []string{"sk-abcdefghijklmnopqrstuvwxyz0123456789"}},
-			{"repeats: alice@example.com / alice@example.com / alice@example.com", []string{"alice@example.com"}},
-			// Multibyte UTF-8 corpora pin the rune-boundary snap in
-			// StreamFilter.Push: holdLen is byte-sized, so a chunk boundary
-			// may land mid-codepoint. Without the snap, the retained tail
-			// has a partial codepoint and the next regex scan can mis-align.
-			// Each entry mixes ASCII secrets with surrounding multibyte text
-			// so a byte-aligned cut would land inside a CJK or accented
-			// character on at least some splits.
-			{"こんにちは alice@example.com さようなら", []string{"alice@example.com"}},
-			{"クレジットカード: 4111-1111-1111-1111 終わり", []string{"4111-1111-1111-1111"}},
-			{"naïve résumé: alice@example.com, façade", []string{"alice@example.com"}},
-		}
-
-		red := newStreamRedactor()        // all default patterns
-		rng := rand.New(rand.NewSource(1)) // seeded for reproducibility
-
-		for _, tc := range corpus {
-			for trial := 0; trial < 10; trial++ {
-				sf := NewStreamFilter(red, nil, nil, "", "")
-				var out strings.Builder
-				for i := 0; i < utf8.RuneCountInString(tc.text); {
-					// Random chunk size 1-8 runes, never crossing the end.
-					chunk := 1 + rng.Intn(8)
-					if i+chunk > utf8.RuneCountInString(tc.text) {
-						chunk = utf8.RuneCountInString(tc.text) - i
-					}
-					out.WriteString(sf.Push(stringSlice(tc.text, i, i+chunk)))
-					i += chunk
-				}
-				out.WriteString(sf.Drain())
-				result := out.String()
-
-				// Property 1: no secret value appears anywhere in the
-				// output.
-				for _, secret := range tc.secrets {
-					Expect(result).NotTo(ContainSubstring(secret),
-						fmt.Sprintf("trial %d: secret %q leaked through streaming\n  input: %q\n  output: %q", trial, secret, tc.text, result))
-				}
-
-				// Property 2: the streamed output equals what a single-shot
-				// Redact would have produced on the same input. (Block
-				// patterns get masked in stream mode, so we compare against
-				// a remapped redaction.)
-				expected := singleShotMaskAll(red, tc.text)
-				Expect(result).To(Equal(expected),
-					fmt.Sprintf("trial %d: stream != single-shot\n  input: %q", trial, tc.text))
-			}
-		}
-	})
-})
-
-// singleShotMaskAll runs the redactor in one pass with all blocks
-// remapped to mask — the same view the StreamFilter produces.
-func singleShotMaskAll(red *Redactor, text string) string {
-	patterns := red.Patterns()
-	overrides := make(map[string]Action, len(patterns))
-	for _, p := range patterns {
-		if p.Action == ActionBlock {
-			overrides[p.ID] = ActionMask
-		}
-	}
-	res := red.RedactWithOverrides(text, overrides)
-	return res.Redacted
-}
-
-func stringSlice(s string, fromRune, toRune int) string {
-	runes := []rune(s)
-	return string(runes[fromRune:toRune])
-}
diff --git a/core/services/routing/pii/types.go b/core/services/routing/pii/types.go
index afdcc7ad44be..de10b4764528 100644
--- a/core/services/routing/pii/types.go
+++ b/core/services/routing/pii/types.go
@@ -11,13 +11,14 @@
 // drops in without changing call sites.
 //
 // Configuration model: each pattern has an Action (block | mask |
-// route_local). Actions are evaluated in this order:
+// allow). Actions are evaluated in this order:
 //   - block: short-circuits the request with an error (the middleware
 //     returns 400 to the client).
 //   - mask: replaces the matched span with ReplacementFor(pattern).
-//   - route_local: leaves the text alone but sets a context flag the
-//     router (subsystem 2) treats as "this request must stay on a local
-//     model" — never crosses the boundary to a cloud proxy backend.
+//   - allow: detect-and-log only — the span is left intact and a
+//     PIIEvent is still recorded, but the text passes through
+//     unchanged. Useful to downgrade a pattern's default while keeping
+//     it visible in the audit log.
 package pii
 
 import "time"
@@ -36,11 +37,13 @@ const (
 	// the matched value).
 	ActionBlock Action = "block"
 
-	// ActionRouteLocal leaves the text intact but flags the request so
-	// the content router will refuse to dispatch it to a cloud proxy
-	// backend. Useful when a deployment trusts local models with
-	// sensitive data but not external providers.
-	ActionRouteLocal Action = "route_local"
+	// ActionAllow detects and logs the match but leaves the text
+	// intact — no masking, no blocking. A PIIEvent is still recorded,
+	// so the detection is auditable and forms the basis for surfacing
+	// detected-PII labels to the router (a future router-model
+	// feature). Use it to downgrade a pattern's default action for a
+	// model while keeping the pattern visible.
+	ActionAllow Action = "allow"
 )
 
 // Direction tags whether a PIIEvent fired on input (request body before
@@ -59,10 +62,12 @@ const (
 // substring slicing; call sites that need to log it strip it via
 // HashPrefix.
 type Span struct {
-	Start     int
-	End       int
-	Pattern   string // matches Pattern.ID
-	HashPrefix string // first 8 chars of sha256(matched value); audit-safe
+	Start      int
+	End        int
+	Pattern    string  // synthetic detector id, "<source>:<GROUP>" (e.g. "ner:EMAIL", "pattern:ANTHROPIC_KEY")
+	HashPrefix string  // first 8 chars of sha256(matched value); audit-safe
+	Action     Action  // the action that fired for this span (after merge)
+	Score      float32 // detector confidence for the (winning) hit, 0..1
 }
 
 // Result is what Redact returns. Redacted is the input string after
@@ -74,38 +79,15 @@ type Span struct {
 // the call site must enforce this by returning a 400 / refusing to
 // dispatch.
 //
-// LocalOnly is true iff at least one matched pattern had
-// Action=route_local. The router middleware reads this and constrains
-// candidate selection.
+// Masked is true iff at least one matched span was replaced with a
+// placeholder (Action=mask). Spans with Action=allow are recorded but
+// leave Masked false. Lets callers (e.g. the decision oracle)
+// distinguish "matched and redacted" from "matched but passed through".
 type Result struct {
-	Redacted  string
-	Spans     []Span
-	Blocked   bool
-	LocalOnly bool
-}
-
-// Pattern is one configurable rule. Description is shown in the admin
-// UI alongside the pattern; the regex itself stays an implementation
-// detail (a leak-prone admin showing an SSN regex with a sample value
-// in the field is a risk we deliberately design around).
-type Pattern struct {
-	ID          string
-	Description string
-	Action      Action
-	// Disabled skips the pattern entirely when true — useful for
-	// admins who want to keep a regex around (visible in the UI) but
-	// turn it off without removing the YAML entry. Default-false so
-	// every existing pattern stays active without touching its config.
-	Disabled bool
-	// MaxMatchLength is the longest possible match in characters. The
-	// streaming filter (subsystem 3, follow-up commit) uses this to
-	// size its tail buffer. For regex patterns we compute it at
-	// compile time from the pattern's structure when possible, or set
-	// a conservative upper bound otherwise.
-	MaxMatchLength int
-
-	// internal — populated by Compile().
-	regex regexpMatcher
+	Redacted string
+	Spans    []Span
+	Blocked  bool
+	Masked   bool
 }
 
 // EventKind classifies a stored audit event. The store is shared by the
@@ -150,7 +132,11 @@ type PIIEvent struct {
 	Length        int       `json:"length,omitempty"`
 	HashPrefix    string    `json:"hash_prefix,omitempty"`
 	Action        Action    `json:"action,omitempty"`
-	CreatedAt     time.Time `json:"created_at"`
+	// Score is the detector confidence (0..1) for an NER PII hit. Metadata
+	// only — never the matched value. Lets admins see how sure the model was
+	// about a (possibly false-positive) detection without re-running it.
+	Score     float32   `json:"score,omitempty"`
+	CreatedAt time.Time `json:"created_at"`
 
 	Host          string `json:"host,omitempty"`
 	Intercepted   *bool  `json:"intercepted,omitempty"`
diff --git a/core/services/routing/piiadapter/ollama.go b/core/services/routing/piiadapter/ollama.go
new file mode 100644
index 000000000000..f837cc9a0cb9
--- /dev/null
+++ b/core/services/routing/piiadapter/ollama.go
@@ -0,0 +1,119 @@
+package piiadapter
+
+import (
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+)
+
+// OllamaChat returns a pii.Adapter for *schema.OllamaChatRequest (POST
+// /api/chat). It scans each message's text content (Ollama messages carry a
+// plain string, no multimodal block form) and writes redacted text back.
+func OllamaChat() pii.Adapter {
+	return pii.Adapter{
+		Scan: func(parsed any) []pii.ScannedText {
+			req, ok := parsed.(*schema.OllamaChatRequest)
+			if !ok || req == nil {
+				return nil
+			}
+			var out []pii.ScannedText
+			for i := range req.Messages {
+				if req.Messages[i].Content != "" {
+					out = append(out, pii.ScannedText{Index: i, Text: req.Messages[i].Content})
+				}
+			}
+			return out
+		},
+		Apply: func(parsed any, updates []pii.ScannedText) {
+			req, ok := parsed.(*schema.OllamaChatRequest)
+			if !ok || req == nil {
+				return
+			}
+			for _, u := range updates {
+				if u.Index >= 0 && u.Index < len(req.Messages) {
+					req.Messages[u.Index].Content = u.Text
+				}
+			}
+		},
+	}
+}
+
+// Field selectors for OllamaGenerate (Prompt + System).
+const (
+	ollamaGenPrompt = iota
+	ollamaGenSystem
+)
+
+// OllamaGenerate returns a pii.Adapter for *schema.OllamaGenerateRequest (POST
+// /api/generate). It scans the Prompt and System strings.
+func OllamaGenerate() pii.Adapter {
+	return pii.Adapter{
+		Scan: func(parsed any) []pii.ScannedText {
+			req, ok := parsed.(*schema.OllamaGenerateRequest)
+			if !ok || req == nil {
+				return nil
+			}
+			var out []pii.ScannedText
+			if req.Prompt != "" {
+				out = append(out, pii.ScannedText{Index: ollamaGenPrompt, Text: req.Prompt})
+			}
+			if req.System != "" {
+				out = append(out, pii.ScannedText{Index: ollamaGenSystem, Text: req.System})
+			}
+			return out
+		},
+		Apply: func(parsed any, updates []pii.ScannedText) {
+			req, ok := parsed.(*schema.OllamaGenerateRequest)
+			if !ok || req == nil {
+				return
+			}
+			for _, u := range updates {
+				switch u.Index {
+				case ollamaGenPrompt:
+					req.Prompt = u.Text
+				case ollamaGenSystem:
+					req.System = u.Text
+				}
+			}
+		},
+	}
+}
+
+// Field selectors for OllamaEmbed (Input + its Prompt alias). Reuses the
+// shared encField/decField packing.
+const (
+	ollamaEmbInput = iota
+	ollamaEmbPrompt
+)
+
+// OllamaEmbed returns a pii.Adapter for *schema.OllamaEmbedRequest (POST
+// /api/embed, /api/embeddings). Input and its Prompt alias may be a string or
+// a []any of strings; non-string elements are skipped.
+func OllamaEmbed() pii.Adapter {
+	return pii.Adapter{
+		Scan: func(parsed any) []pii.ScannedText {
+			req, ok := parsed.(*schema.OllamaEmbedRequest)
+			if !ok || req == nil {
+				return nil
+			}
+			var out []pii.ScannedText
+			scanAnyText(ollamaEmbInput, req.Input, &out)
+			scanAnyText(ollamaEmbPrompt, req.Prompt, &out)
+			return out
+		},
+		Apply: func(parsed any, updates []pii.ScannedText) {
+			req, ok := parsed.(*schema.OllamaEmbedRequest)
+			if !ok || req == nil {
+				return
+			}
+			for _, u := range updates {
+				field, elem := decField(u.Index)
+				switch field {
+				case ollamaEmbInput:
+					req.Input = applyAnyText(req.Input, elem, u.Text)
+				case ollamaEmbPrompt:
+					req.Prompt = applyAnyText(req.Prompt, elem, u.Text)
+				}
+			}
+		},
+	}
+}
diff --git a/core/services/routing/piiadapter/ollama_test.go b/core/services/routing/piiadapter/ollama_test.go
new file mode 100644
index 000000000000..8e43091ff3fd
--- /dev/null
+++ b/core/services/routing/piiadapter/ollama_test.go
@@ -0,0 +1,46 @@
+package piiadapter
+
+import (
+	"github.com/mudler/LocalAI/core/schema"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Ollama adapters", func() {
+	It("OllamaChat scans and rewrites message content", func() {
+		req := &schema.OllamaChatRequest{Messages: []schema.OllamaMessage{
+			{Role: "user", Content: "I'm alice@example.com"},
+			{Role: "assistant", Content: ""},
+		}}
+		a := OllamaChat()
+		Expect(a.Scan(req)).To(HaveLen(1))
+		applyAll(a, req, func(string) string { return "X" })
+		Expect(req.Messages[0].Content).To(Equal("X"))
+		Expect(req.Messages[1].Content).To(Equal(""))
+	})
+
+	It("OllamaGenerate scans Prompt and System", func() {
+		req := &schema.OllamaGenerateRequest{Prompt: "ssn 123", System: "be terse"}
+		a := OllamaGenerate()
+		Expect(a.Scan(req)).To(HaveLen(2))
+		applyAll(a, req, func(string) string { return "Y" })
+		Expect(req.Prompt).To(Equal("Y"))
+		Expect(req.System).To(Equal("Y"))
+	})
+
+	It("OllamaEmbed scans string and array Input, skipping non-strings", func() {
+		a := OllamaEmbed()
+
+		s := &schema.OllamaEmbedRequest{Input: "secret email"}
+		Expect(a.Scan(s)).To(HaveLen(1))
+		applyAll(a, s, func(string) string { return "Z" })
+		Expect(s.Input).To(Equal("Z"))
+
+		arr := &schema.OllamaEmbedRequest{Input: []any{"a secret", float64(1)}}
+		Expect(a.Scan(arr)).To(HaveLen(1))
+		applyAll(a, arr, func(string) string { return "Z" })
+		got, _ := arr.Input.([]any)
+		Expect(got).To(Equal([]any{"Z", float64(1)}))
+	})
+})
diff --git a/core/services/routing/piiadapter/openai_completion.go b/core/services/routing/piiadapter/openai_completion.go
new file mode 100644
index 000000000000..53e158fd91fd
--- /dev/null
+++ b/core/services/routing/piiadapter/openai_completion.go
@@ -0,0 +1,91 @@
+package piiadapter
+
+import (
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+)
+
+// Field selectors for the prompt-style OpenAI requests (/v1/completions,
+// /v1/embeddings, /v1/edits), which carry user text in Prompt / Input /
+// Instruction rather than Messages.
+const (
+	fldPrompt = iota
+	fldInput
+	fldInstruction
+)
+
+// encField packs (field, element) into one ScannedText.Index. element=-1
+// means the field is a whole string; element>=0 indexes into a []any value.
+// Stored as element+1 so -1 maps to 0, with the field in the high bits.
+func encField(field, elem int) int     { return (field << 24) | (elem + 1) }
+func decField(p int) (field, elem int) { return p >> 24, (p & 0xFFFFFF) - 1 }
+
+// scanAnyText appends scannable strings from a string-or-[]any field. Non-string
+// array elements (token-id arrays, numbers) are skipped — only human text is
+// redacted.
+func scanAnyText(field int, v any, out *[]pii.ScannedText) {
+	switch t := v.(type) {
+	case string:
+		if t != "" {
+			*out = append(*out, pii.ScannedText{Index: encField(field, -1), Text: t})
+		}
+	case []any:
+		for k, e := range t {
+			if s, ok := e.(string); ok && s != "" {
+				*out = append(*out, pii.ScannedText{Index: encField(field, k), Text: s})
+			}
+		}
+	}
+}
+
+// applyAnyText writes redacted text back to a string-or-[]any field, returning
+// the (possibly replaced) value to assign back to the struct field.
+func applyAnyText(v any, elem int, text string) any {
+	if elem < 0 {
+		return text
+	}
+	if arr, ok := v.([]any); ok && elem >= 0 && elem < len(arr) {
+		arr[elem] = text
+	}
+	return v
+}
+
+// OpenAICompletion returns a pii.Adapter for the prompt-style OpenAI requests
+// (completions, embeddings, edits) on *schema.OpenAIRequest. It scans Prompt,
+// Input and Instruction — the string form and the string elements of an array
+// form — and writes redacted text back. Chat uses the separate OpenAI()
+// adapter (Messages); these endpoints leave Messages empty and vice versa.
+func OpenAICompletion() pii.Adapter {
+	return pii.Adapter{
+		Scan: func(parsed any) []pii.ScannedText {
+			req, ok := parsed.(*schema.OpenAIRequest)
+			if !ok || req == nil {
+				return nil
+			}
+			var out []pii.ScannedText
+			scanAnyText(fldPrompt, req.Prompt, &out)
+			scanAnyText(fldInput, req.Input, &out)
+			if req.Instruction != "" {
+				out = append(out, pii.ScannedText{Index: encField(fldInstruction, -1), Text: req.Instruction})
+			}
+			return out
+		},
+		Apply: func(parsed any, updates []pii.ScannedText) {
+			req, ok := parsed.(*schema.OpenAIRequest)
+			if !ok || req == nil {
+				return
+			}
+			for _, u := range updates {
+				field, elem := decField(u.Index)
+				switch field {
+				case fldPrompt:
+					req.Prompt = applyAnyText(req.Prompt, elem, u.Text)
+				case fldInput:
+					req.Input = applyAnyText(req.Input, elem, u.Text)
+				case fldInstruction:
+					req.Instruction = u.Text
+				}
+			}
+		},
+	}
+}
diff --git a/core/services/routing/piiadapter/openai_completion_test.go b/core/services/routing/piiadapter/openai_completion_test.go
new file mode 100644
index 000000000000..c1af142291af
--- /dev/null
+++ b/core/services/routing/piiadapter/openai_completion_test.go
@@ -0,0 +1,59 @@
+package piiadapter
+
+import (
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// applyAll feeds every scanned span back through Apply with the text
+// transformed by fn — the shape the middleware uses (scan, redact, apply).
+func applyAll(a pii.Adapter, parsed any, fn func(string) string) {
+	scanned := a.Scan(parsed)
+	updates := make([]pii.ScannedText, 0, len(scanned))
+	for _, s := range scanned {
+		updates = append(updates, pii.ScannedText{Index: s.Index, Text: fn(s.Text)})
+	}
+	a.Apply(parsed, updates)
+}
+
+var _ = Describe("OpenAICompletion adapter", func() {
+	a := OpenAICompletion()
+
+	It("scans and rewrites a string prompt", func() {
+		req := &schema.OpenAIRequest{}
+		req.Prompt = "contact alice@example.com"
+		got := a.Scan(req)
+		Expect(got).To(HaveLen(1))
+		Expect(got[0].Text).To(Equal("contact alice@example.com"))
+		applyAll(a, req, func(string) string { return "REDACTED" })
+		Expect(req.Prompt).To(Equal("REDACTED"))
+	})
+
+	It("scans array prompt elements and skips non-strings (token ids)", func() {
+		req := &schema.OpenAIRequest{}
+		req.Prompt = []any{"first secret", float64(42), "second secret"}
+		got := a.Scan(req)
+		Expect(got).To(HaveLen(2))
+		applyAll(a, req, func(s string) string { return "[X]" })
+		arr, _ := req.Prompt.([]any)
+		Expect(arr).To(Equal([]any{"[X]", float64(42), "[X]"}))
+	})
+
+	It("scans Input and Instruction (the edit/embeddings shape)", func() {
+		req := &schema.OpenAIRequest{Instruction: "fix the SSN 123-45-6789"}
+		req.Input = "my email is bob@example.com"
+		got := a.Scan(req)
+		Expect(got).To(HaveLen(2))
+		applyAll(a, req, func(string) string { return "*" })
+		Expect(req.Input).To(Equal("*"))
+		Expect(req.Instruction).To(Equal("*"))
+	})
+
+	It("returns nothing for an empty / non-matching request", func() {
+		Expect(a.Scan(&schema.OpenAIRequest{})).To(BeEmpty())
+		Expect(a.Scan(nil)).To(BeNil())
+	})
+})
diff --git a/core/services/routing/piidetector/detector.go b/core/services/routing/piidetector/detector.go
new file mode 100644
index 000000000000..75d997f55093
--- /dev/null
+++ b/core/services/routing/piidetector/detector.go
@@ -0,0 +1,86 @@
+// Package piidetector adapts the core/backend token-classification
+// wrapper to the PII redactor's pii.NERDetector seam. It lives outside
+// the pii package so pii stays free of core/backend imports (the
+// redactor is unit-tested with stub detectors). The dependency runs one
+// way: piidetector -> {core/backend, pii}.
+package piidetector
+
+import (
+	"context"
+	"unicode/utf8"
+
+	"github.com/mudler/xlog"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	model "github.com/mudler/LocalAI/pkg/model"
+)
+
+// New builds a pii.NERDetector backed by the token-classification model
+// in modelConfig. Phase 0: the Python `transformers` backend loaded with
+// Type=TokenClassification; Phase 2: the GGML privacy-filter backend —
+// both speak the same gRPC TokenClassify contract, so this adapter is
+// unchanged across the swap. The model is resolved lazily on first
+// Detect, so building a detector for a not-yet-loaded model is cheap and
+// never blocks startup.
+func New(loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) pii.NERDetector {
+	return &nerDetector{
+		classifier: backend.NewTokenClassifier(loader, modelConfig, appConfig, backend.TokenClassifyOptions{}),
+		modelName:  modelConfig.Name,
+	}
+}
+
+type nerDetector struct {
+	classifier backend.TokenClassifier
+	modelName  string
+}
+
+// Detect runs the model and maps its spans onto pii.NEREntity. Offsets
+// pass through as BYTE offsets per the TokenClassify proto contract.
+// Spans whose offsets fall outside the text or land off a UTF-8 rune
+// boundary are dropped: a bad offset must never reach the redactor,
+// which splices text[Start:End] and would otherwise corrupt output or
+// panic. The redactor applies NERConfig.MinScore and the entity->action
+// map itself, so we deliberately return every (validated) span here.
+//
+// CONTRACT NOTE: the proto defines start/end as UTF-8 byte offsets. The
+// Python transformers backend converts HuggingFace's codepoint offsets to
+// bytes before responding (see TokenClassify in backend.py), and the GGML
+// privacy-filter backend will emit bytes natively. The boundary check
+// below is defense-in-depth against a backend that regresses to codepoint
+// offsets: it downgrades the bug from "corrupted redaction / panic" to
+// "dropped span + warning" rather than trusting the wire blindly.
+func (d *nerDetector) Detect(ctx context.Context, text string) ([]pii.NEREntity, error) {
+	ents, err := d.classifier.TokenClassify(ctx, text)
+	if err != nil {
+		return nil, err
+	}
+
+	n := len(text)
+	out := make([]pii.NEREntity, 0, len(ents))
+	for _, e := range ents {
+		if e.Group == "" || e.Start < 0 || e.Start >= e.End || e.End > n {
+			xlog.Warn("pii NER: dropping span with invalid byte range",
+				"model", d.modelName, "group", e.Group, "start", e.Start, "end", e.End, "len", n)
+			continue
+		}
+		// text[e.Start] is safe (Start < End <= n => Start < n). End is
+		// exclusive: when End < n, text[End] is the first byte past the
+		// span and must itself start a rune. Off-boundary offsets are the
+		// signature of codepoint-vs-byte offset confusion.
+		if !utf8.RuneStart(text[e.Start]) || (e.End < n && !utf8.RuneStart(text[e.End])) {
+			xlog.Warn("pii NER: dropping span off UTF-8 boundary (offset units mismatch?)",
+				"model", d.modelName, "group", e.Group, "start", e.Start, "end", e.End)
+			continue
+		}
+		out = append(out, pii.NEREntity{
+			Group: e.Group,
+			Start: e.Start,
+			End:   e.End,
+			Score: e.Score,
+			Text:  e.Text,
+		})
+	}
+	return out, nil
+}
diff --git a/core/services/routing/piidetector/pattern.go b/core/services/routing/piidetector/pattern.go
new file mode 100644
index 000000000000..1f4e01d1d929
--- /dev/null
+++ b/core/services/routing/piidetector/pattern.go
@@ -0,0 +1,80 @@
+package piidetector
+
+import (
+	"context"
+	"time"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/core/services/routing/piipattern"
+	"github.com/mudler/LocalAI/core/trace"
+)
+
+// NewPattern builds a pii.NERDetector that matches secrets with the restricted
+// regex tier (built-ins + operator-defined patterns) instead of a neural model.
+// It runs entirely in-process — no backend, GGUF, or VRAM — and the patterns
+// compile once here, so an invalid pattern is reported now (the resolver fails
+// closed) rather than per request. Matches are reported under their group with
+// a deterministic Score of 1.0.
+func NewPattern(modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (pii.NERDetector, error) {
+	custom := make([]piipattern.Pattern, 0, len(modelConfig.PIIDetection.Patterns))
+	for _, p := range modelConfig.PIIDetection.Patterns {
+		custom = append(custom, piipattern.Pattern{Group: p.Name, Pattern: p.Match, MinLen: p.MinLen})
+	}
+	m, err := piipattern.NewMatcher(modelConfig.PIIDetection.Builtins, custom)
+	if err != nil {
+		return nil, err
+	}
+	return &patternDetector{matcher: m, modelName: modelConfig.Name, appConfig: appConfig}, nil
+}
+
+type patternDetector struct {
+	matcher   *piipattern.Matcher
+	modelName string
+	appConfig *config.ApplicationConfig
+}
+
+// Detect runs the compiled patterns and maps each match onto a pii.NEREntity.
+// When tracing is enabled it records a pattern_pii BackendTrace so the matches
+// (group, byte range, text) show in the Traces UI alongside NER detections.
+func (d *patternDetector) Detect(_ context.Context, text string) ([]pii.NEREntity, error) {
+	var start time.Time
+	if d.appConfig != nil && d.appConfig.EnableTracing {
+		trace.InitBackendTracingIfEnabled(d.appConfig.TracingMaxItems, d.appConfig.TracingMaxBodyBytes)
+		start = time.Now()
+	}
+
+	matches := d.matcher.Find(text)
+	out := make([]pii.NEREntity, 0, len(matches))
+	var traceEnts []backend.TokenEntity
+	for _, mt := range matches {
+		out = append(out, pii.NEREntity{Group: mt.Group, Start: mt.Start, End: mt.End, Score: 1.0, Text: mt.Text})
+		if d.appConfig != nil && d.appConfig.EnableTracing {
+			traceEnts = append(traceEnts, backend.TokenEntity{Group: mt.Group, Start: mt.Start, End: mt.End, Score: 1.0, Text: mt.Text})
+		}
+	}
+
+	if d.appConfig != nil && d.appConfig.EnableTracing {
+		trace.RecordBackendTrace(patternPIITrace(d.modelName, text, traceEnts, start))
+	}
+	return out, nil
+}
+
+// patternPIITrace assembles the Traces-UI row for one pattern-detector run.
+// Split out so the Data assembly is unit-testable without a request.
+func patternPIITrace(modelName, text string, entities []backend.TokenEntity, start time.Time) trace.BackendTrace {
+	return trace.BackendTrace{
+		Timestamp: start,
+		Duration:  time.Since(start),
+		Type:      trace.BackendTracePatternPII,
+		ModelName: modelName,
+		Backend:   "pattern",
+		Summary:   trace.TruncateString(text, 200),
+		Data: map[string]any{
+			"input_chars": len(text),
+			"matches":     len(entities),
+			"entities":    entities,
+		},
+	}
+}
diff --git a/core/services/routing/piidetector/pattern_test.go b/core/services/routing/piidetector/pattern_test.go
new file mode 100644
index 000000000000..90fed1b3b79e
--- /dev/null
+++ b/core/services/routing/piidetector/pattern_test.go
@@ -0,0 +1,61 @@
+package piidetector_test
+
+import (
+	"context"
+	"testing"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/core/services/routing/piidetector"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestPiidetector(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "piidetector suite")
+}
+
+func patternModel() config.ModelConfig {
+	c := config.ModelConfig{Name: "secret-filter", Backend: "pattern"}
+	c.PIIDetection.Builtins = []string{"anthropic_api_key"}
+	c.PIIDetection.Patterns = []config.PIIPattern{{Name: "INTERNAL_TOKEN", Match: `tok-[A-Za-z0-9]{8,}`}}
+	return c
+}
+
+var _ = Describe("pattern detector", func() {
+	It("matches built-in and custom secrets as whole-span deterministic hits", func() {
+		det, err := piidetector.NewPattern(patternModel(), &config.ApplicationConfig{})
+		Expect(err).NotTo(HaveOccurred())
+
+		ents, err := det.Detect(context.Background(), "use sk-ant-api03-AAAABBBBCCCCDDDDEEEE and tok-ABCD1234 ok")
+		Expect(err).NotTo(HaveOccurred())
+
+		byGroup := map[string]pii.NEREntity{}
+		for _, e := range ents {
+			byGroup[e.Group] = e
+			Expect(e.Score).To(BeEquivalentTo(float32(1.0)), "pattern matches are deterministic")
+		}
+		Expect(byGroup).To(HaveKey("ANTHROPIC_KEY"))
+		Expect(byGroup["INTERNAL_TOKEN"].Text).To(Equal("tok-ABCD1234"))
+	})
+
+	It("still detects (and exercises the trace path) with tracing enabled", func() {
+		det, err := piidetector.NewPattern(patternModel(), &config.ApplicationConfig{
+			EnableTracing: true, TracingMaxItems: 8,
+		})
+		Expect(err).NotTo(HaveOccurred())
+		ents, err := det.Detect(context.Background(), "sk-ant-api03-AAAABBBBCCCCDDDDEEEE")
+		Expect(err).NotTo(HaveOccurred())
+		Expect(ents).To(HaveLen(1))
+		Expect(ents[0].Group).To(Equal("ANTHROPIC_KEY"))
+	})
+
+	It("fails to build on an invalid (unanchored) custom pattern", func() {
+		c := config.ModelConfig{Name: "bad", Backend: "pattern"}
+		c.PIIDetection.Patterns = []config.PIIPattern{{Name: "X", Match: `.*`}}
+		_, err := piidetector.NewPattern(c, &config.ApplicationConfig{})
+		Expect(err).To(HaveOccurred())
+	})
+})
diff --git a/core/services/routing/piipattern/builtins.go b/core/services/routing/piipattern/builtins.go
new file mode 100644
index 000000000000..ad57334cabf6
--- /dev/null
+++ b/core/services/routing/piipattern/builtins.go
@@ -0,0 +1,61 @@
+package piipattern
+
+import "sort"
+
+// Builtin is a named, ready-made secret pattern. Group is the uppercase entity
+// label a match is reported under (so it keys into a detector model's
+// pii_detection.entity_actions, exactly like an NER group). Every Builtin
+// pattern is written in the restricted subset and is verified at test time to
+// pass ValidatePattern and compile.
+type Builtin struct {
+	Name        string
+	Group       string
+	Pattern     string
+	Description string
+}
+
+// builtins is the curated catalogue. Patterns intentionally anchor on each
+// provider's fixed prefix and require a long high-entropy tail, so they fire on
+// real credentials and not on ordinary prose. Names are stable identifiers
+// referenced from a model config's pii_detection.builtins list.
+var builtins = []Builtin{
+	{"anthropic_api_key", "ANTHROPIC_KEY", `sk-ant-[A-Za-z0-9_-]{20,}`, "Anthropic API key (sk-ant-…)"},
+	{"openai_api_key", "OPENAI_KEY", `sk-(?:proj-)?[A-Za-z0-9_-]{20,}`, "OpenAI API key (sk-… / sk-proj-…)"},
+	{"github_token", "GITHUB_TOKEN", `(?:ghp|gho|ghs|ghr|ghu)_[A-Za-z0-9]{36,}`, "GitHub access token (ghp_/gho_/ghs_/ghr_/ghu_)"},
+	{"github_pat", "GITHUB_TOKEN", `github_pat_[A-Za-z0-9_]{20,}`, "GitHub fine-grained personal access token"},
+	{"aws_access_key", "AWS_ACCESS_KEY", `AKIA[0-9A-Z]{16}`, "AWS access key ID (AKIA…)"},
+	{"google_api_key", "GOOGLE_API_KEY", `AIza[0-9A-Za-z_-]{35}`, "Google API key (AIza…)"},
+	{"slack_token", "SLACK_TOKEN", `xox[baprs]-[0-9A-Za-z-]{10,}`, "Slack token (xoxb-/xoxa-/xoxp-/xoxr-/xoxs-)"},
+	{"stripe_key", "STRIPE_KEY", `(?:sk|rk)_live_[0-9A-Za-z]{16,}`, "Stripe live secret/restricted key"},
+	{"jwt", "JWT", `eyJ[A-Za-z0-9_-]{10,}\.eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}`, "JSON Web Token (eyJ….eyJ….…)"},
+	{"private_key_block", "PRIVATE_KEY", `-----BEGIN [A-Z ]*PRIVATE KEY-----`, "PEM private-key header"},
+}
+
+// BuiltinCatalogue returns the built-in patterns sorted by name. Used by the
+// config-metadata registry to populate the editor's builtins checklist.
+func BuiltinCatalogue() []Builtin {
+	out := make([]Builtin, len(builtins))
+	copy(out, builtins)
+	sort.Slice(out, func(i, j int) bool { return out[i].Name < out[j].Name })
+	return out
+}
+
+// BuiltinNames returns the built-in pattern names, sorted.
+func BuiltinNames() []string {
+	out := make([]string, 0, len(builtins))
+	for _, b := range builtins {
+		out = append(out, b.Name)
+	}
+	sort.Strings(out)
+	return out
+}
+
+// LookupBuiltin finds a built-in by name.
+func LookupBuiltin(name string) (Builtin, bool) {
+	for _, b := range builtins {
+		if b.Name == name {
+			return b, true
+		}
+	}
+	return Builtin{}, false
+}
diff --git a/core/services/routing/piipattern/compile.go b/core/services/routing/piipattern/compile.go
new file mode 100644
index 000000000000..2d780ba32e7a
--- /dev/null
+++ b/core/services/routing/piipattern/compile.go
@@ -0,0 +1,20 @@
+package piipattern
+
+import "regexp"
+
+// Compile validates src against the restricted grammar and, if it passes,
+// compiles it to an RE2 program set to leftmost-longest matching so a hit grabs
+// the whole secret (the entire key) rather than the shortest prefix.
+func Compile(src string) (*regexp.Regexp, error) {
+	if err := ValidatePattern(src); err != nil {
+		return nil, err
+	}
+	re, err := regexp.Compile(src)
+	if err != nil {
+		// ValidatePattern already parsed with the same flags, so this is
+		// effectively unreachable, but surface it rather than panic.
+		return nil, err
+	}
+	re.Longest()
+	return re, nil
+}
diff --git a/core/services/routing/piipattern/grammar.go b/core/services/routing/piipattern/grammar.go
new file mode 100644
index 000000000000..93ca34c7279f
--- /dev/null
+++ b/core/services/routing/piipattern/grammar.go
@@ -0,0 +1,163 @@
+// Package piipattern is a bounded, restricted-regex matcher for high-entropy,
+// highly-regular secrets (API keys, tokens, private-key blocks) that the NER
+// PII tier cannot catch — it has no credential class, so it fragments a key
+// into the nearest-looking trained categories and may leave the secret part
+// exposed.
+//
+// The language is a deliberately restricted subset of regular expressions
+// compiled to Go's RE2 engine (regexp), which is linear-time with no
+// backtracking — there is no ReDoS class of failure. On top of RE2 we cap the
+// pattern source length, the {n,m} expansion bound, the pattern count, and the
+// scanned input, and we require every pattern to carry a fixed literal
+// "anchor". The anchor rule is what admits `sk-ant-…` / `ghp_…` style keys
+// while rejecting open-ended shapes like an email address or a bare `\w+`
+// (which would match almost anything) — those stay with the NER tier.
+//
+// This package is a leaf: it imports only the standard library, so both
+// core/config (validation at load) and core/application (the resolver) can use
+// it without an import cycle.
+package piipattern
+
+import (
+	"fmt"
+	"regexp/syntax"
+)
+
+const (
+	// MaxPatternLen caps the source length of a single pattern. Generous for a
+	// credential shape, small enough that the compiled program stays tiny.
+	MaxPatternLen = 256
+	// MaxQuantifier caps an explicit {n,m} upper bound. RE2 expands a bounded
+	// repeat into that many copies, so an uncapped {0,1000000} would blow up
+	// the compiled program's memory. Unbounded {n,} (no upper) is a loop, not
+	// an expansion, and is allowed.
+	MaxQuantifier = 4096
+	// MaxAlternation caps the arms of a single `a|b|c` alternation.
+	MaxAlternation = 64
+	// MaxAST bounds recursion depth so a pathologically nested pattern can't
+	// blow the stack during validation.
+	MaxAST = 64
+	// MinAnchorLen is the shortest fixed literal run a pattern must contain to
+	// be considered "anchored" to a recognisable secret prefix/shape.
+	MinAnchorLen = 3
+)
+
+// parseFlags enables Perl character classes (\w \d \s) and word boundaries,
+// matching what regexp.Compile uses, so validation and compilation agree.
+const parseFlags = syntax.Perl
+
+// ValidatePattern reports whether src is an acceptable restricted-subset
+// pattern. It returns a descriptive error naming the offending construct so an
+// operator editing a model config gets actionable feedback (the error is
+// surfaced by config Validate at load and by the resolver, which fails closed).
+func ValidatePattern(src string) error {
+	if src == "" {
+		return fmt.Errorf("pattern is empty")
+	}
+	if len(src) > MaxPatternLen {
+		return fmt.Errorf("pattern is too long (%d chars; max %d)", len(src), MaxPatternLen)
+	}
+	re, err := syntax.Parse(src, parseFlags)
+	if err != nil {
+		return fmt.Errorf("invalid pattern: %w", err)
+	}
+	if err := walk(re, 0); err != nil {
+		return err
+	}
+	if anchorLen(re) < MinAnchorLen {
+		return fmt.Errorf("pattern must contain a fixed literal run of at least %d characters "+
+			"(e.g. \"sk-ant-\", \"ghp_\", \"AKIA\") so it is anchored to a recognisable secret; "+
+			"open-ended shapes like emails or bare \\w+ belong to the NER tier", MinAnchorLen)
+	}
+	return nil
+}
+
+// walk enforces the allow-list of regex constructs.
+func walk(re *syntax.Regexp, depth int) error {
+	if depth > MaxAST {
+		return fmt.Errorf("pattern is too deeply nested")
+	}
+	switch re.Op {
+	case syntax.OpAnyChar, syntax.OpAnyCharNotNL:
+		return fmt.Errorf("'.' (any character) is not allowed; use an explicit class like [A-Za-z0-9]")
+	case syntax.OpCapture:
+		return fmt.Errorf("capturing groups are not allowed; use a non-capturing group (?:…) if you need grouping")
+	case syntax.OpRepeat:
+		if re.Min > MaxQuantifier || (re.Max >= 0 && re.Max > MaxQuantifier) {
+			return fmt.Errorf("{n,m} bound is too large (max %d)", MaxQuantifier)
+		}
+	case syntax.OpAlternate:
+		if len(re.Sub) > MaxAlternation {
+			return fmt.Errorf("too many alternation arms (%d; max %d)", len(re.Sub), MaxAlternation)
+		}
+	case syntax.OpLiteral, syntax.OpCharClass, syntax.OpConcat,
+		syntax.OpStar, syntax.OpPlus, syntax.OpQuest,
+		syntax.OpEmptyMatch,
+		syntax.OpBeginLine, syntax.OpEndLine, syntax.OpBeginText, syntax.OpEndText,
+		syntax.OpWordBoundary, syntax.OpNoWordBoundary:
+		// allowed
+	default:
+		return fmt.Errorf("unsupported construct in pattern")
+	}
+	for _, sub := range re.Sub {
+		if err := walk(sub, depth+1); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// anchorLen returns the number of fixed (non-space) literal characters every
+// match of re is guaranteed to contain — the pattern's "anchor strength".
+// Concatenation sums its parts; alternation takes the min (every arm must
+// carry the anchor); a `+`/{n,} with n>=1 contributes its body's literal once;
+// `*`, `?`, {0,m} and char classes/anchors contribute 0 (they may be absent).
+//
+// We sum rather than measure the longest contiguous run because RE2 factors
+// common prefixes — `(?:ghp|gho|ghs)_…` parses to `gh[ops]_…`, whose longest
+// contiguous literal is only "gh" (2) but whose guaranteed literals are
+// "gh"+"_" (3). Summing keeps such real key prefixes admissible while still
+// rejecting open-ended shapes: an email `[\w.]+@[\w.]+\.\w+` guarantees only
+// `@` and `.` (2 < MinAnchorLen).
+func anchorLen(re *syntax.Regexp) int {
+	switch re.Op {
+	case syntax.OpLiteral:
+		n := 0
+		for _, r := range re.Rune {
+			if r != ' ' && r != '\t' && r != '\n' && r != '\r' {
+				n++
+			}
+		}
+		return n
+	case syntax.OpConcat:
+		sum := 0
+		for _, sub := range re.Sub {
+			sum += anchorLen(sub)
+		}
+		return sum
+	case syntax.OpAlternate:
+		if len(re.Sub) == 0 {
+			return 0
+		}
+		min := -1
+		for _, sub := range re.Sub {
+			if a := anchorLen(sub); min < 0 || a < min {
+				min = a
+			}
+		}
+		return min
+	case syntax.OpPlus:
+		if len(re.Sub) == 1 {
+			return anchorLen(re.Sub[0])
+		}
+		return 0
+	case syntax.OpRepeat:
+		if re.Min >= 1 && len(re.Sub) == 1 {
+			return anchorLen(re.Sub[0])
+		}
+		return 0
+	default:
+		// char classes, anchors, OpStar, OpQuest carry no guaranteed literal.
+		return 0
+	}
+}
diff --git a/core/services/routing/piipattern/matcher.go b/core/services/routing/piipattern/matcher.go
new file mode 100644
index 000000000000..8da31ed311fb
--- /dev/null
+++ b/core/services/routing/piipattern/matcher.go
@@ -0,0 +1,100 @@
+package piipattern
+
+import (
+	"fmt"
+	"regexp"
+)
+
+const (
+	// MaxPatternsPerMatcher bounds how many patterns one detector may hold.
+	MaxPatternsPerMatcher = 128
+	// MaxMatchesPerPattern bounds matches emitted per pattern per call, so a
+	// pathological input can't produce an unbounded result set.
+	MaxMatchesPerPattern = 1000
+)
+
+// Pattern is one compiled-ready rule: matches are reported under Group, and a
+// match shorter than MinLen bytes is dropped (0 = no floor).
+type Pattern struct {
+	Group   string
+	Pattern string
+	MinLen  int
+}
+
+// Match is one detected span: a half-open byte range [Start,End) into the
+// scanned text, the matched text, and the reporting Group.
+type Match struct {
+	Group string
+	Start int
+	End   int
+	Text  string
+}
+
+type compiled struct {
+	group  string
+	re     *regexp.Regexp
+	minLen int
+}
+
+// Matcher holds a set of compiled patterns and scans text for all of them.
+type Matcher struct {
+	pats []compiled
+}
+
+// NewMatcher compiles the named built-ins plus the custom patterns into a
+// Matcher. Unknown built-in names and patterns that fail the restricted grammar
+// are reported as errors (the caller fails closed). Built-in and custom counts
+// together may not exceed MaxPatternsPerMatcher.
+func NewMatcher(builtinNames []string, custom []Pattern) (*Matcher, error) {
+	if len(builtinNames)+len(custom) > MaxPatternsPerMatcher {
+		return nil, fmt.Errorf("too many patterns (%d; max %d)", len(builtinNames)+len(custom), MaxPatternsPerMatcher)
+	}
+	m := &Matcher{}
+	for _, name := range builtinNames {
+		b, ok := LookupBuiltin(name)
+		if !ok {
+			return nil, fmt.Errorf("unknown built-in pattern %q", name)
+		}
+		re, err := Compile(b.Pattern)
+		if err != nil {
+			return nil, fmt.Errorf("built-in %q: %w", name, err)
+		}
+		m.pats = append(m.pats, compiled{group: b.Group, re: re})
+	}
+	for _, p := range custom {
+		if p.Group == "" {
+			return nil, fmt.Errorf("custom pattern is missing a name/group")
+		}
+		re, err := Compile(p.Pattern)
+		if err != nil {
+			return nil, fmt.Errorf("pattern %q: %w", p.Group, err)
+		}
+		m.pats = append(m.pats, compiled{group: p.Group, re: re, minLen: p.MinLen})
+	}
+	return m, nil
+}
+
+// Find returns every match of every pattern over text. Spans from different
+// patterns may overlap; the caller (the redactor) unions and resolves them.
+func (m *Matcher) Find(text string) []Match {
+	if m == nil || text == "" {
+		return nil
+	}
+	var out []Match
+	for _, p := range m.pats {
+		locs := p.re.FindAllStringIndex(text, MaxMatchesPerPattern)
+		for _, loc := range locs {
+			start, end := loc[0], loc[1]
+			if end-start < p.minLen {
+				continue
+			}
+			out = append(out, Match{
+				Group: p.group,
+				Start: start,
+				End:   end,
+				Text:  text[start:end],
+			})
+		}
+	}
+	return out
+}
diff --git a/core/services/routing/piipattern/piipattern_test.go b/core/services/routing/piipattern/piipattern_test.go
new file mode 100644
index 000000000000..ef38a4992d06
--- /dev/null
+++ b/core/services/routing/piipattern/piipattern_test.go
@@ -0,0 +1,105 @@
+package piipattern
+
+import (
+	"strings"
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestPiipattern(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "piipattern suite")
+}
+
+var _ = Describe("ValidatePattern", func() {
+	DescribeTable("accepts anchored, bounded patterns",
+		func(src string) { Expect(ValidatePattern(src)).To(Succeed()) },
+		Entry("anthropic", `sk-ant-[A-Za-z0-9_-]{20,200}`),
+		Entry("github via alternation", `(?:ghp|gho|ghs)_[A-Za-z0-9]{36,}`),
+		Entry("custom token", `tok-\w{32,64}`),
+		Entry("aws", `AKIA[0-9A-Z]{16}`),
+		Entry("anchored by mid-literal", `(?:sk|rk)_live_[0-9A-Za-z]{16,}`),
+	)
+
+	DescribeTable("rejects unanchored or unsafe patterns",
+		func(src string) { Expect(ValidatePattern(src)).NotTo(Succeed()) },
+		Entry("email (no fixed anchor)", `[\w.]+@[\w.]+\.\w+`),
+		Entry("bare word run", `\w+`),
+		Entry("any-char greedy", `sk-.*`),
+		Entry("capturing group", `(sk-ant-[A-Za-z0-9]+)`),
+		Entry("two fixed chars only", `ab[0-9]{8,}`),
+		Entry("over-long source", "sk-ant-"+strings.Repeat("a", MaxPatternLen)),
+		Entry("huge bounded repeat", `sk-ant-[A-Za-z0-9]{5000}`),
+		Entry("empty", ``),
+	)
+})
+
+var _ = Describe("Compile", func() {
+	It("compiles a valid pattern with leftmost-longest semantics", func() {
+		re, err := Compile(`sk-ant-[A-Za-z0-9_-]{4,}`)
+		Expect(err).NotTo(HaveOccurred())
+		// Longest() makes the match span the whole key, not a shorter prefix.
+		loc := re.FindString("key sk-ant-AAAA1111bbbb end")
+		Expect(loc).To(Equal("sk-ant-AAAA1111bbbb"))
+	})
+	It("refuses an invalid pattern", func() {
+		_, err := Compile(`.*`)
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+var _ = Describe("builtins", func() {
+	It("every built-in validates, compiles, and is uniquely named", func() {
+		seen := map[string]bool{}
+		for _, b := range BuiltinCatalogue() {
+			Expect(seen[b.Name]).To(BeFalse(), "duplicate builtin %s", b.Name)
+			seen[b.Name] = true
+			Expect(ValidatePattern(b.Pattern)).To(Succeed(), "builtin %s pattern %q", b.Name, b.Pattern)
+		}
+	})
+
+	DescribeTable("matches a real sample and not a decoy",
+		func(name, sample, decoy string) {
+			b, ok := LookupBuiltin(name)
+			Expect(ok).To(BeTrue())
+			re, err := Compile(b.Pattern)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(re.MatchString(sample)).To(BeTrue(), "should match %q", sample)
+			Expect(re.MatchString(decoy)).To(BeFalse(), "should not match %q", decoy)
+		},
+		Entry("anthropic", "anthropic_api_key", "sk-ant-api03-AbCdEf012345_-AbCdEf012345", "sk-ant-short"),
+		Entry("aws", "aws_access_key", "AKIAIOSFODNN7EXAMPLE", "AKIAshort"),
+		Entry("github", "github_token", "ghp_"+strings.Repeat("a", 36), "ghp_short"),
+	)
+})
+
+var _ = Describe("Matcher", func() {
+	It("reports the whole key as one span under its group", func() {
+		m, err := NewMatcher([]string{"anthropic_api_key"}, nil)
+		Expect(err).NotTo(HaveOccurred())
+		got := m.Find("my key is sk-ant-api03-AbCdEf012345AbCdEf012345 thanks")
+		Expect(got).To(HaveLen(1))
+		Expect(got[0].Group).To(Equal("ANTHROPIC_KEY"))
+		Expect(got[0].Text).To(Equal("sk-ant-api03-AbCdEf012345AbCdEf012345"))
+	})
+
+	It("compiles custom patterns and honours MinLen", func() {
+		m, err := NewMatcher(nil, []Pattern{{Group: "INTERNAL", Pattern: `tok-[A-Za-z0-9]{4,}`, MinLen: 12}})
+		Expect(err).NotTo(HaveOccurred())
+		// "tok-AAAA" (8 bytes) is below MinLen 12 and is dropped.
+		Expect(m.Find("tok-AAAA")).To(BeEmpty())
+		Expect(m.Find("tok-AAAABBBBCCCC")).To(HaveLen(1))
+	})
+
+	It("fails closed on an unknown built-in", func() {
+		_, err := NewMatcher([]string{"nope"}, nil)
+		Expect(err).To(HaveOccurred())
+	})
+
+	It("rejects an invalid custom pattern", func() {
+		_, err := NewMatcher(nil, []Pattern{{Group: "X", Pattern: `.*`}})
+		Expect(err).To(HaveOccurred())
+	})
+})
diff --git a/core/services/routing/router/embedding_cache.go b/core/services/routing/router/embedding_cache.go
index ba90635341a4..756464dd75f9 100644
--- a/core/services/routing/router/embedding_cache.go
+++ b/core/services/routing/router/embedding_cache.go
@@ -52,6 +52,10 @@ type EmbeddingCacheClassifier struct {
 	similarityThreshold float64
 	confidenceThreshold float64
 
+	// budget trims the conversation to the embedder model's own context
+	// before embedding; nil embeds Probe.Prompt as built by the caller.
+	budget *lazyBudget
+
 	hits           atomic.Uint64
 	misses         atomic.Uint64
 	nearMisses     atomic.Uint64
@@ -100,6 +104,15 @@ func NewEmbeddingCacheClassifier(inner Classifier, embedder backend.Embedder, st
 	}
 }
 
+// WithTokenTrim wires the embedder model's own tokenizer and context so the
+// probe embeds the most recent turns that fit instead of a caller-chosen size.
+// nil tokenizer / non-positive context leaves trimming off. Returns the
+// receiver for chaining at construction.
+func (c *EmbeddingCacheClassifier) WithTokenTrim(tokenize func(string) (int, error), maxContextTokens int) *EmbeddingCacheClassifier {
+	c.budget = &lazyBudget{tokenize: tokenize, maxContext: maxContextTokens}
+	return c
+}
+
 // Name is the inner classifier's name — the decision-log "classifier"
 // field should reflect *what* made the decision, not the caching
 // transport. Cache hits set Decision.Cached separately so admins can
@@ -127,7 +140,7 @@ func (c *EmbeddingCacheClassifier) Stats() EmbeddingCacheStats {
 func (c *EmbeddingCacheClassifier) Classify(ctx context.Context, p Probe) (Decision, error) {
 	start := time.Now()
 
-	vec, err := c.embedder.Embed(ctx, p.Prompt)
+	vec, err := c.embedder.Embed(ctx, trimmedProbeText(p, c.budget, identityRender))
 	if err != nil {
 		c.embedderErrors.Add(1)
 		xlog.Warn("router: embedding cache embed failed", "error", err)
diff --git a/core/services/routing/router/embedding_cache_test.go b/core/services/routing/router/embedding_cache_test.go
index 726614d0e966..e36b049c3d87 100644
--- a/core/services/routing/router/embedding_cache_test.go
+++ b/core/services/routing/router/embedding_cache_test.go
@@ -4,6 +4,8 @@ import (
 	"context"
 	"encoding/json"
 	"errors"
+	"fmt"
+	"strings"
 	"sync"
 	"time"
 
@@ -13,6 +15,20 @@ import (
 	. "github.com/onsi/gomega"
 )
 
+// capturingEmbedder records the text it was last asked to embed and returns a
+// fixed vector, so a test can assert what the cache fed the embedder.
+type capturingEmbedder struct {
+	mu       sync.Mutex
+	lastText string
+}
+
+func (e *capturingEmbedder) Embed(_ context.Context, text string) ([]float32, error) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.lastText = text
+	return []float32{1, 2, 3}, nil
+}
+
 // fakeEmbedder returns a vector keyed by a lookup table; this lets the
 // test exercise hit/miss control without depending on a real model.
 type fakeEmbedder struct {
@@ -294,6 +310,45 @@ var _ = Describe("EmbeddingCache", func() {
 	})
 })
 
+var _ = Describe("EmbeddingCache WithTokenTrim", func() {
+	ctx := context.Background()
+	wordCount := func(s string) (int, error) { return len(strings.Fields(s)), nil }
+
+	It("embeds the most recent turns that fit the embedder context, not the full prompt", func() {
+		emb := &capturingEmbedder{}
+		store := &memVectorStore{}
+		inner := &stubInner{name: "score", decision: router.Decision{Labels: []string{"x"}, Score: 0.1}}
+		// context_size 50 → budget 50−16 margin ≈ 34 tokens, far under the
+		// ~120-word transcript below, so the oldest turns must be dropped.
+		cache := router.NewEmbeddingCacheClassifier(inner, emb, store, 0.92, 0.6).
+			WithTokenTrim(wordCount, 50)
+
+		msgs := make([]string, 0, 31)
+		for i := range 30 {
+			msgs = append(msgs, fmt.Sprintf("OLDturn%d filler filler filler", i))
+		}
+		msgs = append(msgs, "NEWESTTURN final words here")
+		full := strings.Join(msgs, "\n")
+
+		_, err := cache.Classify(ctx, router.Probe{Prompt: full, Messages: msgs})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(emb.lastText).To(ContainSubstring("NEWESTTURN"), "newest turn must survive")
+		Expect(emb.lastText).NotTo(ContainSubstring("OLDturn0 "), "oldest turns trimmed to fit context")
+		Expect(emb.lastText).NotTo(Equal(full), "must not embed the untrimmed prompt")
+	})
+
+	It("embeds Probe.Prompt unchanged when no trim is wired", func() {
+		emb := &capturingEmbedder{}
+		store := &memVectorStore{}
+		inner := &stubInner{name: "score", decision: router.Decision{Labels: []string{"x"}, Score: 0.1}}
+		cache := router.NewEmbeddingCacheClassifier(inner, emb, store, 0.92, 0.6)
+
+		_, err := cache.Classify(ctx, router.Probe{Prompt: "PROMPTASIS", Messages: []string{"ignored-no-tokenizer"}})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(emb.lastText).To(Equal("PROMPTASIS"))
+	})
+})
+
 var _ = Describe("EmbeddingCache latency", func() {
 	It("is populated on hits", func() {
 		embedder := &fakeEmbedder{table: map[string][]float32{"p": {1}}}
diff --git a/core/services/routing/router/rerank.go b/core/services/routing/router/rerank.go
index d422a58db433..e83f0186425b 100644
--- a/core/services/routing/router/rerank.go
+++ b/core/services/routing/router/rerank.go
@@ -23,6 +23,11 @@ type RerankClassifier struct {
 	labels    []string
 	documents []string
 	cache     *labelSetCache
+
+	// budget trims the query to the reranker model's context minus the
+	// longest policy description (paired with the query per rerank call);
+	// nil reranks Probe.Prompt as built by the caller.
+	budget *lazyBudget
 }
 
 // defaultRerankActivationThreshold is the relevance floor a label
@@ -64,16 +69,26 @@ func NewRerankClassifier(policies []ScorePolicy, reranker backend.Reranker, cach
 	}
 }
 
+// WithTokenTrim wires the reranker model's own tokenizer and context so the
+// query is trimmed to the most recent turns that fit alongside the longest
+// policy description. nil tokenizer / non-positive context leaves trimming
+// off. Returns the receiver for chaining at construction.
+func (c *RerankClassifier) WithTokenTrim(tokenize func(string) (int, error), maxContextTokens int) *RerankClassifier {
+	c.budget = &lazyBudget{tokenize: tokenize, maxContext: maxContextTokens, extras: c.documents}
+	return c
+}
+
 func (c *RerankClassifier) Name() string { return ClassifierColbert }
 
 func (c *RerankClassifier) Classify(ctx context.Context, p Probe) (Decision, error) {
 	start := time.Now()
-	key := cacheKey(p.Prompt)
+	query := trimmedProbeText(p, c.budget, identityRender)
+	key := cacheKey(query)
 	if hit, ok := c.cache.get(key); ok {
 		return Decision{Labels: hit, Score: 1.0, Latency: time.Since(start)}, nil
 	}
 
-	results, err := c.reranker.Rerank(ctx, p.Prompt, c.documents)
+	results, err := c.reranker.Rerank(ctx, query, c.documents)
 	if err != nil {
 		return errDecision(start, fmt.Errorf("rerank classify: %w", err))
 	}
diff --git a/core/services/routing/router/rerank_test.go b/core/services/routing/router/rerank_test.go
index 5b88d0bf0530..ed13648604f6 100644
--- a/core/services/routing/router/rerank_test.go
+++ b/core/services/routing/router/rerank_test.go
@@ -3,6 +3,8 @@ package router
 import (
 	"context"
 	"errors"
+	"fmt"
+	"strings"
 
 	"github.com/mudler/LocalAI/core/backend"
 	. "github.com/onsi/ginkgo/v2"
@@ -43,6 +45,31 @@ var _ = Describe("RerankClassifier", func() {
 		Expect(d.Score).To(BeNumerically(">=", 0.9))
 	})
 
+	It("trims the query to the reranker context, keeping the newest turns", func() {
+		r := &stubReranker{results: []backend.RerankResult{
+			{Index: 0, RelevanceScore: 0.92},
+			{Index: 1, RelevanceScore: 0.10},
+			{Index: 2, RelevanceScore: 0.05},
+		}}
+		wordCount := func(s string) (int, error) { return len(strings.Fields(s)), nil }
+		// budget = 60 − longest policy description − 16 margin; still well under
+		// the ~120-word transcript, so the oldest turns drop.
+		c := NewRerankClassifier(testPolicies(), r, 0, 0).WithTokenTrim(wordCount, 60)
+
+		msgs := make([]string, 0, 31)
+		for i := range 30 {
+			msgs = append(msgs, fmt.Sprintf("OLDturn%d aaa bbb ccc", i))
+		}
+		msgs = append(msgs, "NEWESTTURN zzz")
+		full := strings.Join(msgs, "\n")
+
+		_, err := c.Classify(context.Background(), Probe{Prompt: full, Messages: msgs})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(r.lastQ).To(ContainSubstring("NEWESTTURN"), "newest turn must survive")
+		Expect(r.lastQ).NotTo(ContainSubstring("OLDturn0 "), "oldest turns trimmed to fit context")
+		Expect(r.lastQ).NotTo(Equal(full), "must not rerank the untrimmed prompt")
+	})
+
 	It("activates multiple labels when several descriptions clear threshold", func() {
 		r := &stubReranker{results: []backend.RerankResult{
 			{Index: 0, RelevanceScore: 0.85},
diff --git a/core/services/routing/router/score.go b/core/services/routing/router/score.go
index 19da0d2c6bc4..34beeb5c6088 100644
--- a/core/services/routing/router/score.go
+++ b/core/services/routing/router/score.go
@@ -91,6 +91,13 @@ type ScoreClassifierOptions struct {
 	// override that instructs the model to emit a different schema
 	// would silently desync from what the scorer actually scores.
 	SystemPromptTemplate string
+
+	// TokenCounter + MaxContextTokens drive conversation trimming: when
+	// both are set, Classify drops the oldest turns until the rendered
+	// prompt fits the classifier's context. Nil/0 disables — Classify
+	// sends Probe.Prompt as-is and relies on the backend's n_ctx guard.
+	TokenCounter     func(string) (int, error)
+	MaxContextTokens int
 }
 
 // ScoreClassifier scores every policy label as the model's actual
@@ -127,6 +134,10 @@ type ScoreClassifier struct {
 	// log-prob. Built once at construction; same list every call.
 	candidates []string
 
+	// budget caps the rendered prompt at the classifier's context minus the
+	// longest candidate; nil/disabled sends Probe.Prompt as-is.
+	budget *lazyBudget
+
 	cache *labelSetCache
 }
 
@@ -191,6 +202,7 @@ func NewScoreClassifier(policies []ScorePolicy, scorer backend.Scorer, opts Scor
 		systemPrompt:        systemPrompt,
 		labelOrder:          labels,
 		candidates:          candidates,
+		budget:              &lazyBudget{tokenize: opts.TokenCounter, maxContext: opts.MaxContextTokens, extras: candidates},
 		cache:               newLabelSetCache(opts.CacheCap),
 	}
 }
@@ -218,11 +230,19 @@ func (c *ScoreClassifier) Name() string { return ClassifierScore }
 
 func (c *ScoreClassifier) Classify(ctx context.Context, p Probe) (Decision, error) {
 	start := time.Now()
-	key := cacheKey(p.Prompt)
+
+	// Trim oldest turns until the rendered prompt fits the classifier's
+	// context. Cache-keyed on the trimmed text so conversations that
+	// trim to the same tail share an entry.
+	userText := trimmedProbeText(p, c.budget, func(joined string) (string, error) {
+		return c.renderer(c.systemPrompt, joined)
+	})
+
+	key := cacheKey(userText)
 	if hit, ok := c.cache.get(key); ok {
 		return Decision{Labels: hit, Score: 1.0, Latency: time.Since(start)}, nil
 	}
-	prompt, err := c.renderer(c.systemPrompt, p.Prompt)
+	prompt, err := c.renderer(c.systemPrompt, userText)
 	if err != nil {
 		return errDecision(start, fmt.Errorf("score classify: render prompt: %w", err))
 	}
@@ -331,6 +351,12 @@ func softmax(logProbs []float64) []float64 {
 
 func (c *ScoreClassifier) CacheLen() int { return c.cache.len() }
 
+// probeTokenBudget returns the token ceiling for the rendered prompt (context
+// − longest candidate − margin), computed once via the shared lazyBudget. 0
+// means trimming is off (no tokenizer/context) or impossible (candidates fill
+// the context).
+func (c *ScoreClassifier) probeTokenBudget() int { return c.budget.get() }
+
 // buildScoreSystemPrompt renders the Arch-Router-style routing
 // instructions: routes listed in a structured block, output schema
 // declared as JSON {"route": "<name>"}. Candidates are scored as
diff --git a/core/services/routing/router/score_test.go b/core/services/routing/router/score_test.go
index 8b9b0fe9bfb0..75707186efdd 100644
--- a/core/services/routing/router/score_test.go
+++ b/core/services/routing/router/score_test.go
@@ -3,8 +3,10 @@ package router
 import (
 	"context"
 	"errors"
+	"fmt"
 	"sort"
 	"strings"
+	"unicode/utf8"
 
 	"github.com/mudler/LocalAI/core/backend"
 	. "github.com/onsi/ginkgo/v2"
@@ -335,3 +337,138 @@ Reply: {"route": "<name>"}`
 		Expect(c.Name()).To(Equal(ClassifierScore))
 	})
 })
+
+var _ = Describe("ScoreClassifier conversation trimming", func() {
+	wordCount := func(s string) (int, error) { return len(strings.Fields(s)), nil }
+	threeScores := []backend.CandidateScore{
+		{LogProb: -0.05, NumTokens: 3},
+		{LogProb: -3.0, NumTokens: 3},
+		{LogProb: -4.0, NumTokens: 3},
+	}
+
+	It("drops the oldest turns when the conversation exceeds the context budget", func() {
+		s := &stubScorer{results: threeScores}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{
+			TokenCounter:     wordCount,
+			MaxContextTokens: 10000,
+		})
+		Expect(c.probeTokenBudget()).To(BeNumerically(">", 0), "budget should be positive for a 10k context")
+
+		msgs := make([]string, 0, 200)
+		msgs = append(msgs, "OLDESTMARKER "+strings.Repeat("x ", 99)) // 100 words
+		for range 198 {
+			msgs = append(msgs, strings.Repeat("y ", 100))
+		}
+		msgs = append(msgs, "NEWESTMARKER "+strings.Repeat("z ", 99)) // 100 words; ~20k words total
+
+		_, err := c.Classify(context.Background(), Probe{Messages: msgs, Prompt: strings.Join(msgs, "\n")})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(s.lastP).To(ContainSubstring("NEWESTMARKER"), "newest turn must survive the trim")
+		Expect(s.lastP).NotTo(ContainSubstring("OLDESTMARKER"), "oldest turn must be dropped")
+		Expect(len(strings.Fields(s.lastP))).To(BeNumerically("<", 20000), "must be trimmed, not the full transcript")
+	})
+
+	It("keeps the newest turn whole even when it alone exceeds the budget", func() {
+		s := &stubScorer{results: threeScores}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{
+			TokenCounter:     wordCount,
+			MaxContextTokens: 10000,
+		})
+		msgs := []string{
+			"OLDMARKER short",
+			"NEWESTMARKER " + strings.Repeat("z ", 12000), // far over budget
+		}
+		_, err := c.Classify(context.Background(), Probe{Messages: msgs})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(s.lastP).To(ContainSubstring("NEWESTMARKER"))
+		Expect(s.lastP).NotTo(ContainSubstring("OLDMARKER"), "older turn drops once the newest fills the budget")
+	})
+
+	It("does not tokenize per message and bounds what it tokenizes for a long conversation", func() {
+		// Regression: the original trim tokenized one message at a time,
+		// newest-first, so a 500-turn conversation produced hundreds of
+		// tokenize RPCs. The render-once design must tokenize the candidates
+		// (budget setup) plus a small constant for the measurement/confirm
+		// passes — and the rune pre-trim must keep the tokenized prompt far
+		// smaller than the full transcript.
+		calls := 0
+		maxRunes := 0
+		counting := func(s string) (int, error) {
+			calls++
+			if r := utf8.RuneCountInString(s); r > maxRunes {
+				maxRunes = r
+			}
+			return len(strings.Fields(s)), nil
+		}
+		s := &stubScorer{results: threeScores}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{
+			TokenCounter:     counting,
+			MaxContextTokens: 4000,
+		})
+
+		msgs := make([]string, 500)
+		totalRunes := 0
+		for i := range msgs {
+			msgs[i] = fmt.Sprintf("msg%d %s", i, strings.Repeat("w ", 50))
+			totalRunes += utf8.RuneCountInString(msgs[i])
+		}
+
+		_, err := c.Classify(context.Background(), Probe{Messages: msgs})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(s.lastP).To(ContainSubstring("msg499"), "newest turn must survive")
+		Expect(s.lastP).NotTo(ContainSubstring("msg0 "), "oldest turns must be dropped")
+		Expect(calls).To(BeNumerically("<", 20),
+			"tokenizer must not be called per message (got %d calls for 500 messages)", calls)
+		Expect(maxRunes).To(BeNumerically("<", totalRunes/2),
+			"rune pre-trim must keep the tokenized prompt well under the full transcript")
+	})
+
+	It("uses Probe.Prompt unchanged when no tokenizer is wired", func() {
+		s := &stubScorer{results: threeScores}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{})
+		Expect(c.probeTokenBudget()).To(Equal(0))
+
+		_, err := c.Classify(context.Background(), Probe{
+			Prompt:   "PROMPTONLYMARKER",
+			Messages: []string{"ignored-because-no-tokenizer"},
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(s.lastP).To(ContainSubstring("PROMPTONLYMARKER"))
+		Expect(s.lastP).NotTo(ContainSubstring("ignored-because-no-tokenizer"))
+	})
+
+	It("disables trimming (budget 0) when the tokenizer errors", func() {
+		s := &stubScorer{results: threeScores}
+		boom := func(string) (int, error) { return 0, errors.New("tokenizer down") }
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{
+			TokenCounter:     boom,
+			MaxContextTokens: 10000,
+		})
+		Expect(c.probeTokenBudget()).To(Equal(0), "a tokenizer error must disable trimming, not panic")
+
+		_, err := c.Classify(context.Background(), Probe{Prompt: "FALLBACKMARKER", Messages: []string{"a", "b"}})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(s.lastP).To(ContainSubstring("FALLBACKMARKER"))
+	})
+
+	It("retries the budget after a TRANSIENT tokenizer error instead of disabling permanently", func() {
+		// Regression: a sync.Once would memoize the first failure and never
+		// recompute. The first call (model still loading) errors; a later
+		// call must succeed and yield a real budget.
+		s := &stubScorer{results: threeScores}
+		calls := 0
+		flaky := func(text string) (int, error) {
+			calls++
+			if calls == 1 {
+				return 0, errors.New("model still loading")
+			}
+			return len(strings.Fields(text)), nil
+		}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{
+			TokenCounter:     flaky,
+			MaxContextTokens: 10000,
+		})
+		Expect(c.probeTokenBudget()).To(Equal(0), "first call: tokenizer error leaves budget uncomputed")
+		Expect(c.probeTokenBudget()).To(BeNumerically(">", 0), "retry: budget computes once the tokenizer recovers")
+	})
+})
diff --git a/core/services/routing/router/trim.go b/core/services/routing/router/trim.go
new file mode 100644
index 000000000000..50f752d9d0b7
--- /dev/null
+++ b/core/services/routing/router/trim.go
@@ -0,0 +1,178 @@
+package router
+
+import (
+	"math"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"unicode/utf8"
+
+	"github.com/mudler/xlog"
+)
+
+// pretrimRunesPerToken is deliberately high (most text is 3–5 runes/token,
+// tokenisers rarely exceed 6) so the cheap rune pre-trim keeps a superset of
+// what fits before any tokenize call.
+const pretrimRunesPerToken = 6
+
+// tokenBudgetMargin absorbs BPE-boundary drift and the framing tokens a
+// renderer adds, so a prompt measured at exactly the budget still fits n_ctx.
+const tokenBudgetMargin = 16
+
+// JoinTurns joins per-turn texts oldest→newest with a trailing newline each.
+// The probe builder, the trimmer, and every classifier share this so the text
+// a model sees has one canonical shape.
+func JoinTurns(turns []string) string {
+	var b strings.Builder
+	for _, m := range turns {
+		b.WriteString(m)
+		b.WriteByte('\n')
+	}
+	return b.String()
+}
+
+// promptTrimmer fits an oldest→newest turn list into a token budget for one
+// model: optimistic rune pre-trim, tokenize once, then recalibrate with the
+// real runes/token and drop whole turns oldest-first until the rendered prompt
+// fits. The newest turn is never dropped — if it alone overflows it's sent
+// whole and the backend's n_ctx guard is the backstop.
+//
+// render wraps the joined turns into what the model actually tokenizes: a chat
+// template for the scorer, identityRender for an embedder/reranker on raw text.
+type promptTrimmer struct {
+	tokenize func(string) (int, error)
+	render   func(joined string) (string, error)
+	budget   int
+}
+
+func identityRender(s string) (string, error) { return s, nil }
+
+func (t promptTrimmer) fit(turns []string) string {
+	if len(turns) == 0 {
+		return ""
+	}
+	kept := turns[runePretrimStart(turns, t.budget*pretrimRunesPerToken):]
+
+	joined := JoinTurns(kept)
+	rendered, err := t.render(joined)
+	if err != nil {
+		return joined
+	}
+	total, err := t.tokenize(rendered)
+	if err != nil || total <= t.budget {
+		return joined
+	}
+
+	runesPerToken := float64(utf8.RuneCountInString(rendered)) / float64(total)
+	if runesPerToken <= 0 {
+		runesPerToken = 1
+	}
+	est := total
+	keep := 0
+	for keep < len(kept)-1 && est > t.budget {
+		est -= int(math.Ceil(float64(utf8.RuneCountInString(kept[keep])) / runesPerToken))
+		keep++
+	}
+
+	for {
+		tail := JoinTurns(kept[keep:])
+		rendered, err := t.render(tail)
+		if err != nil {
+			return tail
+		}
+		n, err := t.tokenize(rendered)
+		if err != nil || n <= t.budget {
+			return tail
+		}
+		if keep >= len(kept)-1 {
+			xlog.Warn("router: newest turn alone exceeds model context; sending it whole — backend n_ctx guard is the backstop",
+				"tokens", n, "budget", t.budget)
+			return tail
+		}
+		keep++
+	}
+}
+
+// runePretrimStart returns the oldest index to keep so the joined tail stays
+// within budgetRunes. The newest turn is always kept; older ones are added
+// while they fit.
+func runePretrimStart(turns []string, budgetRunes int) int {
+	if budgetRunes <= 0 || len(turns) == 0 {
+		return 0
+	}
+	start := len(turns) - 1
+	total := utf8.RuneCountInString(turns[start])
+	for i := len(turns) - 2; i >= 0; i-- {
+		r := utf8.RuneCountInString(turns[i])
+		if total+r > budgetRunes {
+			break
+		}
+		total += r
+		start = i
+	}
+	return start
+}
+
+// lazyBudget computes a model's probe token budget once, on first use, caching
+// the result: maxContext minus the longest per-call extra (scorer candidates,
+// reranker documents; none for a plain embed) minus tokenBudgetMargin. A
+// tokenizer error leaves it uncomputed so a transient failure (model still
+// loading) recovers on a later call; extras that already fill the context are
+// cached as disabled.
+type lazyBudget struct {
+	tokenize   func(string) (int, error)
+	maxContext int
+	extras     []string
+
+	mu    sync.Mutex
+	value atomic.Int64 // 0=unset, >0=budget, -1=disabled
+}
+
+func (l *lazyBudget) get() int {
+	if l == nil || l.tokenize == nil || l.maxContext <= 0 {
+		return 0
+	}
+	if v := l.value.Load(); v != 0 {
+		if v < 0 {
+			return 0
+		}
+		return int(v)
+	}
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	if v := l.value.Load(); v != 0 {
+		if v < 0 {
+			return 0
+		}
+		return int(v)
+	}
+	longest := 0
+	for _, e := range l.extras {
+		n, err := l.tokenize(e)
+		if err != nil {
+			return 0 // transient: leave unset so a later call retries
+		}
+		if n > longest {
+			longest = n
+		}
+	}
+	b := l.maxContext - longest - tokenBudgetMargin
+	if b <= 0 {
+		l.value.Store(-1)
+		return 0
+	}
+	l.value.Store(int64(b))
+	return b
+}
+
+// trimmedProbeText returns the text to feed a model: the most recent turns
+// that fit its token budget, or p.Prompt when trimming is disabled (no
+// tokenizer/context wired, or a single-input probe with no Messages).
+func trimmedProbeText(p Probe, b *lazyBudget, render func(string) (string, error)) string {
+	if len(p.Messages) > 0 {
+		if budget := b.get(); budget > 0 {
+			return promptTrimmer{tokenize: b.tokenize, render: render, budget: budget}.fit(p.Messages)
+		}
+	}
+	return p.Prompt
+}
diff --git a/core/services/routing/router/types.go b/core/services/routing/router/types.go
index 178cafbaeabe..05efdc3497fa 100644
--- a/core/services/routing/router/types.go
+++ b/core/services/routing/router/types.go
@@ -31,6 +31,15 @@ type Probe struct {
 	// is the concatenation of message contents (separated by newlines);
 	// for plain completions it is the raw prompt.
 	Prompt string
+
+	// Messages carries the per-turn texts (oldest→newest) when the probe
+	// came from a multi-message chat request. A classifier with a real
+	// tokenizer (the score classifier) uses these to trim an over-long
+	// conversation to the classifier model's context window on turn
+	// boundaries, keeping the most recent turns. Empty for single-input
+	// probes (plain completions, /router/decide), in which case the
+	// classifier falls back to Prompt verbatim.
+	Messages []string
 }
 
 // Decision is the classifier's output. Labels carries the SET of
diff --git a/core/trace/backend_trace.go b/core/trace/backend_trace.go
index 83af1339e120..f5b5e9d450be 100644
--- a/core/trace/backend_trace.go
+++ b/core/trace/backend_trace.go
@@ -33,6 +33,9 @@ const (
 	BackendTraceAudioTransform  BackendTraceType = "audio_transform"
 	BackendTraceModelLoad       BackendTraceType = "model_load"
 	BackendTraceScore           BackendTraceType = "score"
+	BackendTraceTokenClassify   BackendTraceType = "token_classify"
+	BackendTracePatternPII      BackendTraceType = "pattern_pii"
+	BackendTraceVectorStore     BackendTraceType = "vector_store"
 )
 
 type BackendTrace struct {
@@ -57,10 +60,12 @@ type BackendTrace struct {
 // runaway buffer when a caller streams MB-scale payloads.
 const MaxTraceBodyBytes = 1 << 20
 
-var backendTraceBuffer *circularbuffer.Queue[*BackendTrace]
-var backendMu sync.Mutex
-var backendLogChan = make(chan *BackendTrace, 100)
-var backendInitOnce sync.Once
+var (
+	backendTraceBuffer *circularbuffer.Queue[*BackendTrace]
+	backendMu          sync.Mutex
+	backendLogChan     = make(chan *BackendTrace, 100)
+	backendInitOnce    sync.Once
+)
 
 // backendMaxBodyBytes caps each captured string value in a BackendTrace.Data
 // field to keep the /api/backend-traces JSON small enough for the admin UI to
diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md
index 1d3268b11afa..dbe791e50fc4 100644
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -808,6 +808,44 @@ known_usecases:
 
 Available flags: `chat`, `completion`, `edit`, `embeddings`, `rerank`, `image`, `transcript`, `tts`, `sound_generation`, `tokenize`, `vad`, `video`, `detection`, `llm` (combination of CHAT, COMPLETION, EDIT).
 
+`token_classify` marks a model as a token-classification (NER) provider for the PII filter (e.g. an `openai-privacy-filter` GGUF). Declare it explicitly together with `embeddings: true` (the classifier loads via TOKEN_CLS pooling). On the `llama-cpp` backend it must not be combined with `chat`/`completion` in the same config — `TokenClassify` bypasses the slot loop and would race generation, so the loader rejects that mix; split into separate model configs.
+
+## PII filtering
+
+PII redaction is NER-based and runs on the **request** (input) side. It has two halves:
+
+- **Detector models** are `token_classify` models that carry the detection *policy* in a top-level `pii_detection:` block. The policy is defined once, on the model itself:
+
+  ```yaml
+  name: privacy-filter-multilingual
+  backend: llama-cpp
+  embeddings: true
+  known_usecases:
+    - token_classify
+  pii_detection:
+    min_score: 0.5            # drop detections below this confidence
+    default_action: mask      # mask | block | allow — applied to any detected
+                              # group with no explicit entry (empty = mask)
+    entity_actions:           # which PII to block vs mask vs allow-log
+      PASSWORD: block
+      CREDITCARD: block
+      EMAIL: mask
+  ```
+
+- **Consuming models** opt in and reference one or more detectors by name — no per-consumer policy:
+
+  ```yaml
+  name: my-assistant
+  pii:
+    enabled: true             # default: off for local backends, on for cloud-proxy
+    detectors:
+      - privacy-filter-multilingual
+  ```
+
+Multiple detectors union their detections; overlapping spans resolve to the strongest action (`block` > `mask` > `allow`). A configured detector that can't be loaded fails the request closed (HTTP 503) rather than silently skipping the check. Detections are audited at `/api/pii/events` (hash-prefix only, never the raw value).
+
+> The earlier regex pattern tier (`pii.patterns`, the global pattern catalogue, `--pii-config`, and the `/api/pii/patterns` admin endpoints) has been removed, along with response/streaming-side redaction. Those keys now no-op with a startup warning; migrate to `pii.detectors` + a detector's `pii_detection` block.
+
 ## Complete Example
 
 Here's a comprehensive example combining many options:
diff --git a/docs/content/features/middleware.md b/docs/content/features/middleware.md
index 84b8fb3825bb..6033ee4e89ba 100644
--- a/docs/content/features/middleware.md
+++ b/docs/content/features/middleware.md
@@ -12,9 +12,9 @@ categories = ["Features"]
 LocalAI ships a request-middleware layer that sits between the HTTP API and
 the backend dispatcher. Two subsystems share that layer because they share
 the same lifecycle hook: **PII filtering** scans the request body before it
-reaches a backend (and the SSE stream on the way out), and the **intelligent
-router** rewrites `input.Model` so a single client-facing model name fans
-out across multiple downstream targets.
+reaches a backend, and the **intelligent router** rewrites `input.Model` so
+a single client-facing model name fans out across multiple downstream
+targets.
 
 Both are inspected and configured from the same admin page
 (`/app/middleware`), backed by the same REST surface (`/api/middleware/*`,
@@ -23,67 +23,128 @@ Both are inspected and configured from the same admin page
 ## Request lifecycle
 
 ```
-client ── auth ── route-model ── per-model PII ── backend ── streaming PII ── client
-                       │                              │
-                       └─── decision log              └─── event log
+client ── auth ── route-model ── per-model PII ── backend ── client
+                       │              │
+                       │              └─── event log
+                       └─── decision log
 ```
 
 The router runs first (it picks the target model so per-model PII has
 something to gate on), per-model PII runs next (gated by the resolved
-config), the backend executes, and the streaming PII filter rewrites the
-SSE response in flight. Each subsystem writes to its own admin-visible
-log: `/api/router/decisions` for routing, `/api/pii/events` for redaction
-and block actions.
+config), and the backend executes. Filtering is **request-side only** —
+the request body is scanned and rewritten before forwarding; the response
+is not touched (NER over a streamed response is left as a follow-up). Each
+subsystem writes to its own admin-visible log: `/api/router/decisions` for
+routing, `/api/pii/events` for redaction and block actions.
 
 ---
 
 ## PII filtering
 
-PII redaction is **per-model and off by default**. The default flips to
-**on for any backend whose name starts with `proxy-`** because that traffic
-crosses the network to a third-party provider. Explicit `pii.enabled`
-in a model's YAML always wins over the backend default.
+PII redaction is **NER-based and runs request-side (input)**. It is
+**off by default**, flipping to **on for any `cloud-proxy` backend**
+because that traffic crosses the network to a third-party provider. Pick a
+[default detector](#instance-wide-defaults) so those models are actually
+scanned. Explicit `pii.enabled` in a model's YAML always wins over the
+backend default.
+
+Filtering runs on every text-accepting endpoint that has an adapter wired:
+`/v1/chat/completions` and `/v1/messages` (chat), `/v1/completions`,
+`/v1/embeddings`, `/v1/edits`, and the Ollama `/api/chat`, `/api/generate`
+and `/api/embed` endpoints, plus the [MITM proxy]({{< relref "mitm-proxy.md" >}})
+request body. Image, audio (TTS/STT), video, rerank, and the realtime
+WebSocket are not filtered yet (different prompt-PII semantics; realtime is
+not HTTP middleware).
+
+> The earlier regex pattern tier (`pii.patterns`, the built-in pattern
+> catalogue, `--pii-config`, the `/api/pii/patterns|test|decide` endpoints)
+> and response/streaming-side redaction have been **removed**. Detection is
+> now driven entirely by token-classification (NER) models. Legacy keys
+> no-op with a startup warning.
+
+### Detector models
+
+A **detector** is a `token_classify` model (e.g. an `openai-privacy-filter`
+GGUF) that carries the detection *policy* in a top-level `pii_detection:`
+block — defined once, on the model itself:
 
-### Pattern catalog
-
-The built-in regex tier ships six patterns. Each has a default action
-(`mask`, `block`, or `route_local`) and a length cap that prevents
-pathological inputs from blowing up scanning time:
+```yaml
+name: privacy-filter-multilingual
+backend: llama-cpp
+embeddings: true              # TOKEN_CLS pooling
+known_usecases:
+  - token_classify
+pii_detection:
+  min_score: 0.5              # drop detections below this confidence
+  default_action: mask        # applied to any detected group with no entry
+  entity_actions:             # which PII to block vs mask vs allow-log
+    PASSWORD: block
+    CREDITCARD: block
+    EMAIL: mask
+```
 
-| ID | Description | Default action | Max length |
-|---|---|---|---|
-| `email` | Email address | `mask` | 254 |
-| `phone` | Phone number (international or US) | `mask` | 24 |
-| `ssn` | US Social Security Number | `mask` | 11 |
-| `credit_card` | Credit card number (Luhn-verified) | `mask` | 19 |
-| `ipv4` | IPv4 address | `mask` | 15 |
-| `api_key_prefix` | `sk-`, `pk-`, `xoxb-`, `ghp_`, `github_pat_` | **`block`** | 200 |
+`mask` rewrites the matched span to `[REDACTED:ner:<GROUP>]` in the request
+body before forwarding. `block` returns HTTP 400 (`error.type=pii_blocked`)
+without forwarding. `allow` detects and logs (a PIIEvent is still recorded)
+but leaves the text unchanged. The entity-group names are whatever the model
+emits (the privacy-filter family uses uppercase names like `EMAIL`,
+`PASSWORD`, `CREDITCARD`).
 
-`mask` rewrites the match to `[REDACTED:<id>]` in the request body before
-forwarding. `block` returns HTTP 400 with `error.type=pii_blocked` to the
-client without forwarding. `route_local` is reserved for the routing
-integration (see below) and falls back to `mask` when no local route is
-available.
+### Pattern detector tier
 
-### Per-model configuration
+NER is the wrong tool for high-entropy, highly-regular **secrets** — API keys,
+tokens, private-key blocks. A trained NER model has no "API key" class, so it
+fragments a key into the nearest categories it *does* know and can leave the
+secret part exposed. Those secrets are exactly what a regex catches cheaply.
 
-Add a `pii:` block to a model YAML to opt in (or out, or to override
-per-pattern actions):
+A **pattern detector** is a detector model (`backend: pattern`) that matches
+secrets with a **restricted regex subset** compiled to Go's RE2 engine —
+linear-time, no backtracking, no ReDoS. It runs entirely in-process: no model
+download, no backend, zero VRAM. Install the gallery's **`secret-filter`** for a
+ready-made set, or define your own:
 
 ```yaml
-# Local model — explicit opt-in so chats with this model get redaction
-# applied request-side.
-name: qwen-7b-local
-backend: llama-cpp
-pii:
-  enabled: true
+name: secret-filter
+backend: pattern
+known_usecases: [token_classify]        # so it appears in the detector picker
+pii_detection:
+  default_action: block                 # a leaked credential shouldn't leave
+  builtins:                             # built-in catalogue (enable by name)
+    - anthropic_api_key
+    - openai_api_key
+    - github_token
+    - aws_access_key
+    - private_key_block
+  patterns:                             # operator-defined, restricted subset
+    - name: INTERNAL_TOKEN
+      match: "tok-[A-Za-z0-9]{32,64}"
+      action: block                      # optional per-pattern override
+      min_len: 36                        # optional length floor
 ```
 
+A match is reported under its group (built-in group name, or the pattern
+`name`), so `entity_actions` / `default_action` apply exactly as for NER.
+
+**The restricted grammar** (validated at load — an invalid pattern is rejected,
+not silently ignored):
+- Allowed: literals, character classes `[…]` and `\w \d \s`, alternation,
+  anchors `^ $ \b`, and quantifiers `? * + {m,n}`.
+- Rejected: `.` (any-char), capturing groups, and `{n,m}` bounds over 4096.
+- **Required anchor**: every pattern must contain a fixed literal run of at
+  least 3 characters (e.g. `sk-ant-`, `ghp_`, `AKIA`). This admits real key
+  shapes but rejects open-ended ones — an email or a bare `\w+` has no such
+  anchor and belongs to the [NER tier](#detector-models).
+
+Use both tiers together: reference an NER detector *and* a pattern detector in a
+model's `pii.detectors` (or as instance defaults); their hits union, and a
+`block` from either rejects the request.
+
+### Consuming models
+
+Any model opts in by enabling PII and referencing one or more detectors —
+no per-consumer policy:
+
 ```yaml
-# Cloud-bound model — defaults to enabled because backend is cloud-proxy.
-# Tighten api_key_prefix from the global default and downgrade email to
-# route_local so emails route to a local model rather than leaving the
-# network.
 name: claude-strict
 backend: cloud-proxy
 proxy:
@@ -92,85 +153,86 @@ proxy:
   upstream_url: https://api.anthropic.com/v1/messages
   api_key_env: ANTHROPIC_API_KEY
 pii:
-  patterns:
-    - id: api_key_prefix
-      action: block        # already the default, made explicit for audit
-    - id: email
-      action: route_local
+  enabled: true               # default-on for cloud-proxy; explicit for audit
+  detectors:
+    - privacy-filter-multilingual
 ```
 
-The regex itself stays global — only the action is settable per-model.
-Adding new patterns is a build-time concern (extend `patternRegexps` in
-`core/services/routing/pii/patterns.go`).
-
-### NER tier (optional)
-
-The regex matcher covers high-precision patterns. For natural-language
-PII (proper names, addresses, organization names) LocalAI carries an
-**encoder NER tier** that runs after the regex pass. It expects a
-transformers token-classification model wired through the `TokenClassify`
-gRPC primitive (e.g. `dslim/bert-base-NER`). The detector annotates
-spans with an entity group (`PER`, `LOC`, `ORG`, `MISC`); per-group
-actions are configurable through the same `pii:` block.
-
-The NER tier ships as a contract (`NERDetector`, `NERConfig` in
-`core/services/routing/pii/ner.go`); an operator-facing knob to load and
-attach a detector is not plumbed yet. When no detector is configured the
-regex tier still runs.
-
-### Streaming PII filter
-
-Buffered (`/v1/chat/completions` without `"stream": true`) responses are
-forwarded verbatim today — only the request-side scan runs. Streaming
-responses run through `pii.StreamFilter` which buffers SSE chunks until
-either a full pattern matches or the buffer's max length is reached,
-then emits the safe prefix. The streaming filter is what makes the
-cloud-proxy backend and the MITM proxy safe to expose to clients that
-issue streaming requests.
-
-The streaming filter is wired automatically for any model with `pii.enabled`
-true — there is no separate streaming toggle.
+Multiple detectors **union** their detections; overlapping spans resolve to
+the strongest action (`block` > `mask` > `allow`). A configured detector
+that can't be loaded **fails the request closed** (HTTP 503,
+`error.type=pii_ner_unavailable`) rather than silently skipping the check.
+The same NER path runs on the [MITM proxy]({{< relref "mitm-proxy.md" >}})
+request body for intercepted hosts. Response/output redaction is out of
+scope for now.
+
+### Instance-wide default detector
+
+The **Detector models** table on the Middleware → Filtering page lists every
+`token_classify` detector model (neural NER models and in-process pattern
+matchers alike) and exposes a per-row **Default** toggle. Toggling a detector
+on adds it to the instance-wide default detector set — one or more models
+applied to any PII-enabled model that names none of its own `pii.detectors`.
+It is persisted through `POST /api/settings` and read live, so a change takes
+effect on the next request without a restart. A default that names a model no
+longer loaded still appears (marked *not loaded*) so it can be toggled off.
+
+This is what makes `cloud-proxy` / MITM redaction work out of the box: those
+backends default to PII-enabled but ship no detector list, so without a
+default detector the filter runs with nothing to scan. Set one here and
+cloud-proxy traffic is scanned with no per-model config.
+
+Resolution precedence (the single decision point is `ResolvePIIPolicy`,
+shared by the chat middleware and the MITM listener so both agree):
+
+1. An explicit `pii.enabled` on the model wins — `true` or `false`.
+2. Otherwise PII is on if the backend defaults it on (`cloud-proxy`).
+3. Detectors are the model's own `pii.detectors`; if it lists none, the
+   instance-wide default detector(s) are used.
+
+A model that resolves enabled but ends up with no detector at all (a
+cloud-proxy model with no model detectors and no instance default) scans
+nothing — set a default detector to close that gap.
 
 ### Admin page
 
 The `/app/middleware` page (admin role only) has four tabs — **Filtering**,
 **Routing**, **MITM Proxy** (see the [MITM doc]({{< relref "mitm-proxy.md" >}})),
-and **Events**. The Filtering tab shows:
-
-- The pattern catalogue with live action dropdowns. Changing an action via
-  the UI calls `PUT /api/pii/patterns/:id` and updates the live redactor
-  in-process. Click **Persist** in the action header to write the current
-  state into `runtime_settings.json` so the next process start re-applies it.
-- A per-model resolved-state table — each model row reports `enabled`,
-  the per-pattern overrides, and which patterns are effectively active.
-- A live test panel that posts sample text to `/api/pii/test` and
-  highlights matches with their resolved actions, without storing the
-  text in the event log.
+and **Events**. The Filtering tab has a **Detector models** table (every
+`token_classify` filter model, with the per-row Default toggle above and an
+edit link to each detector's config, plus an *Add detector model* button) and
+a per-model table listing only the models PII can actually apply to — chat /
+completion / embeddings / edit consumers and cloud-proxy models, not
+VAD/STT/image models or the detector models themselves. Each row reports the
+**effective** `enabled` state as an inline **toggle** — flipping it writes an
+explicit `pii.enabled` to that model's YAML (a server-side deep-merge that
+preserves `pii.detectors` and every other field), so a cloud-proxy model shown
+on by backend default can be turned off, and vice-versa — plus the
+resolved detector(s) — with a *(default)* marker when they come from the
+instance-wide default rather than the model's YAML — why it is on (`YAML` /
+`backend default`), and the recent event count. Detection *policy*
+(entity→action, min score) is still edited on each detector model's config
+(Models → edit → PII), not globally.
 
 ### REST surface
 
 | Method | Path | Auth | Purpose |
 |---|---|---|---|
-| GET | `/api/pii/patterns` | any | Live pattern list with current actions. Used by the UI catalogue. |
-| POST | `/api/pii/test` | any | Dry-run the redactor on `{"text":"..."}`. Returns hits and the would-be-rewritten body. Does not write to the event log. |
-| GET | `/api/pii/events` | admin | Recent middleware events — PII redactions, MITM connect/traffic, admission denials. Filterable by `correlation_id`, `user_id`, `pattern_id`, `kind`. |
-| PUT | `/api/pii/patterns/:id` | admin | Update a pattern in-process. Body accepts `{"action":"mask"\|"block"\|"route_local"}` and/or `{"disabled":true\|false}`. Transient — reverts on restart unless persisted. |
-| POST | `/api/pii/patterns/persist` | admin | Snapshot the live per-pattern (action, disabled) state into `runtime_settings.json`. |
-| GET | `/api/middleware/status` | admin | Aggregated dashboard data: patterns + per-model resolved state + router status + MITM status + admission status. One round-trip for the UI. |
+| GET | `/api/pii/events` | admin | Recent middleware events — PII redactions, MITM connect/traffic, admission denials. Filterable by `correlation_id`, `user_id`, `pattern_id` (e.g. `ner:EMAIL`), `kind`. |
+| GET | `/api/middleware/status` | admin | Aggregated dashboard data: per-model PII state + detectors + router status + MITM status + admission status. One round-trip for the UI. |
 
 ### MCP tools
 
-The same surface is mirrored through the LocalAI Assistant MCP server so
-the in-process and stdio assistants can manage the filter conversationally:
+The same surface is mirrored through the LocalAI Assistant MCP server:
 
 | Tool | Read/Write | Purpose |
 |---|---|---|
-| `list_pii_patterns` | read | Returns the live pattern list. |
 | `get_pii_events` | read | Recent redaction / block events with optional filters. |
-| `test_pii_redaction` | read | Dry-run sample text without writing to the event log. |
 | `get_middleware_status` | read | Aggregator — the same payload as `GET /api/middleware/status`. |
-| `set_pii_pattern_action` | write | Update a pattern's action. Admin-only. |
-| `persist_pii_patterns` | write | Snapshot live state to `runtime_settings.json`. Admin-only. |
+
+Detection policy is part of a detector model's config, so it is managed
+through the model-config tools (`edit_model_config`), not a dedicated PII
+tool.
 
 ---
 
diff --git a/docs/plans/pii-ner-ggml-backend.md b/docs/plans/pii-ner-ggml-backend.md
new file mode 100644
index 000000000000..c8f14eecfb46
--- /dev/null
+++ b/docs/plans/pii-ner-ggml-backend.md
@@ -0,0 +1,715 @@
+# Plan: ML-based PII filter (privacy-filter family) via a GGML token-classification backend
+
+Status: **research / pre-decision**. This document captures the research findings and a
+proposed direction. We decide the plan after reviewing it together.
+
+Author note: AI-assisted research per `.agents/ai-coding-assistants.md` — attribution via
+`Assisted-by:` trailer on any resulting commits; the human submitter owns/reviews the code.
+
+---
+
+## 1. Goal
+
+Add a *semantic* (model-based) tier to LocalAI's PII filter middleware, driven by the
+**openai/privacy-filter** family of token-classification models — in particular
+**OpenMed/privacy-filter-multilingual** for its 16-language coverage. The model output (BIOES
+token labels → entity spans) must map onto the existing PII redactor's NER seam.
+
+Hard constraint from the request: **no Python at inference time**. If no suitable C++/GGML
+runtime exists, we implement one in GGML, following the methodology used for vibevoice.cpp,
+LocalVQE, and parakeet.cpp.
+
+---
+
+## 2. The model family (research findings)
+
+### 2.1 openai/privacy-filter (the base)
+
+Source: <https://github.com/openai/privacy-filter>, model card
+<https://huggingface.co/openai/privacy-filter>, and the HF Transformers integration
+(`OpenAIPrivacyFilterConfig` / `OpenAIPrivacyFilterModel` /
+`OpenAIPrivacyFilterForTokenClassification`).
+
+- **Task**: bidirectional **token classification** for PII (not generative). One forward
+  pass labels every token; spans are then decoded.
+- **Lineage**: starts from a **gpt-oss-style autoregressive** checkpoint, then *converted to
+  bidirectional* and post-trained with supervised token-classification loss.
+- **Architecture** (per the model card):
+  - 8 transformer blocks, `d_model = 640`
+  - Grouped-query attention: 14 query heads, 2 KV heads (group size 7)
+  - Rotary position embeddings (RoPE)
+  - **Sparse MoE FFN**: 128 experts total, **top-4 routing** per token
+  - **Banded (local/sliding-window) attention**, band size 128 → effective window 257 tokens
+  - Token-classification head over `d_model = 640`
+  - **1.5B total params, ~50M active per token**, 128k context
+- **Output classes**: 33 = 1 background (`O`) + 8 categories × 4 BIOES tags
+  (B/I/E/S). Categories: account_number, private_address, private_email, private_person,
+  private_phone, private_url, private_date, secret.
+- **Decoding**: a **constrained Viterbi** over a linear-chain BIOES grammar (not per-token
+  argmax). Exposes **6 transition-bias parameters** to tune precision/recall at runtime.
+- **Tokenizer**: tiktoken **o200k_base** (gpt-oss family).
+- **License**: **Apache-2.0** — commercial use OK.
+- **Runtimes shipped**: PyTorch only (the `opf` CLI: redact/eval/train). Now also a
+  HF Transformers model class.
+
+### 2.2 OpenMed/privacy-filter-multilingual (the target)
+
+Source: <https://huggingface.co/OpenMed/privacy-filter-multilingual>.
+
+- Fine-tune of `openai/privacy-filter`; **same architecture** (gpt-oss-style sparse MoE,
+  128 experts top-4, BIOES head).
+- **1.4B total / ~50M active**.
+- **54 PII categories** across 7 domains (identity, contact, address, dates, gov-IDs,
+  financial, crypto, vehicle, digital, auth) → **217 output classes** (1 + 54×4).
+- **16 languages**: ar, bn, zh, nl, en, fr, de, hi, it, ja, ko, pt, es, te, tr, vi.
+  Strongest: de/es/fr/it/hi/te/en; weaker on CJK & low-resource morphology.
+- Trained on AI4Privacy `pii-masking-{200k,400k}` + `open-pii-masking-500k`, language-balanced.
+- Runtimes: PyTorch + **MLX** (bf16 ~2.6 GB / 8-bit ~1.4 GB). MLX is explicitly out of scope
+  for us. There is also a `privacy-filter-nemotron` (NVIDIA Nemotron PII data) clinical-leaning
+  variant.
+
+### 2.3 Existing non-Python / non-MLX implementations — survey result
+
+**None found.** Concretely:
+
+- No GGUF, no llama.cpp support, no standalone C++/Rust/GGML port, no ONNX export published.
+- `screenpipe/privacy-filter` (GitHub) sounded promising but is a **Python + vLLM + ONNX**
+  HTTP service (PolyForm Noncommercial license), not a portable runtime.
+- Community claim "llama.cpp can't run it / no GGUF exists" is *true as published* — but the
+  reasoning ("GGUF is only for generative models") is a packaging convention, not a hard
+  limit. The compute kernels this model needs already exist in ggml/llama.cpp (see §6).
+
+**Conclusion**: to drop Python we must build the runtime ourselves in GGML. This is the same
+situation as vibevoice/parakeet/LocalVQE.
+
+---
+
+## 3. How LocalAI's PII filter works today
+
+Package: `core/services/routing/pii/` (+ adapters in `core/services/routing/piiadapter/`,
+routes in `core/http/routes/pii.go`).
+
+- **Tier 1 (live)**: deterministic **regex** redactor (`redactor.go`, `patterns.go`) —
+  email, phone, SSN, credit-card (Luhn), IPv4, API-key prefixes. Actions per pattern:
+  `block` (HTTP 400), `mask` (placeholder), `allow` (audit only).
+- **Request path**: `pii.RequestMiddleware(...)` runs innermost (after RouteModel /
+  admission), per-model opt-in via the model config `pii:` block. Adapters
+  (`piiadapter.OpenAI()` / `.Anthropic()`) extract scannable text and re-apply redactions by
+  index.
+- **Response/streaming path**: `stream.go` buffers a tail sized to the longest pattern so a
+  redaction is never split across SSE chunks; remaps `block`→`mask` on the wire.
+- **Audit**: one `PIIEvent` per detected span (hash-prefixed, never the raw value) into a
+  10k ring buffer; admin API at `/api/pii/*`.
+- **Config**: model YAML `pii: { enabled, patterns: [...] }`; global `--pii-config`,
+  `--disable-pii`; runtime overrides persisted to `runtime_settings.json`.
+
+### 3.1 The NER seam that's already waiting (this is the key finding)
+
+The redactor was built with a **Tier-2 encoder/NER hook already designed in but unwired**:
+
+- `core/services/routing/pii/ner.go`:
+  - `type NERDetector interface { Detect(ctx, text) ([]NEREntity, error) }`
+  - `NEREntity{ Group string; Start, End int /*byte offsets*/; Score float32 }`
+  - `NERConfig{ Detector; MinScore; EntityActions map[group]Action; DefaultAction }`
+  - `ResolveAction(group)`; audit rows use synthetic pattern IDs `ner:<group>` so the
+    existing disable/override/events machinery works unchanged.
+- `redactor.go`: `RedactWithNER(ctx, text, overrides, nerCfg)` already exists; nil Detector =
+  zero-cost fallback to regex-only.
+- `types.go:8` explicitly notes the encoder tier is "out of scope for this slice — added
+  later, fed by the gRPC TokenClassify RPC."
+
+### 3.2 The gRPC `TokenClassify` RPC is also already plumbed
+
+`backend/backend.proto`:
+```
+rpc TokenClassify(TokenClassifyRequest) returns (TokenClassifyResponse) {}
+message TokenClassifyRequest  { string text = 1; float threshold = 2; }
+message TokenClassifyEntity   { string entity_group = 1; int32 start = 2;  // byte offsets
+                                int32 end = 3; float score = 4; string text = 5; }
+message TokenClassifyResponse { repeated TokenClassifyEntity entities = 1; }
+```
+Client-side plumbing is complete: `pkg/grpc/{backend,client,embed}.go`,
+`pkg/model/connection_evicting_client.go`, generated stubs.
+
+**Server-side, the only implementer today is `backend/python/transformers/backend.py`**
+(`Type=TokenClassification`), which uses the HF `pipeline("token-classification",
+aggregation_strategy="simple")`. Note: "simple" aggregation = argmax + B/I merge — it does
+**not** run the model's constrained-Viterbi BIOES decode, so it's a lossy approximation of
+the intended decoding (see §6.4).
+
+### 3.3 What is therefore *missing* to light up model-based PII
+
+1. A **`core/backend` wrapper** for TokenClassify (none exists yet — `grep` confirms).
+2. A **`NERDetector` implementation** in `core/application` that calls that wrapper for a
+   configured model and converts char→byte offsets.
+3. **Wiring** `RedactWithNER` into the request/stream middleware + the model `pii:` config
+   (entity→action map, min score, which model to use).
+4. **Capability metadata**: there is no `Usecase`/`Method` for token classification in
+   `core/config/backend_capabilities.go` (it has tokenize/rerank/detection but not
+   classification). Add `MethodTokenClassify` + a `classification`/`ner` usecase + register
+   it on whichever backend(s) implement it. Follow the
+   `.agents/api-endpoints-and-auth.md` checklist for the surface.
+5. The **GGML backend** that implements `TokenClassify` server-side (the Python-removal goal).
+
+Items 1–4 are needed regardless of Python-vs-GGML; item 5 is the substantive new work.
+
+---
+
+## 4. Mapping model output → middleware
+
+```
+text ──► [GGML privacy-filter backend]
+            tokenize (o200k_base)
+            forward (bidirectional MoE transformer) ──► per-token logits [T, 217]
+            constrained Viterbi (BIOES + 6 transition biases) ──► label path
+            spans + char offsets ──► byte offsets
+         ──► TokenClassifyResponse{ entities:[{entity_group, start, end, score, text}] }
+                       │
+                       ▼
+   core/backend.TokenClassify wrapper
+                       │
+                       ▼
+   pii.NERDetector.Detect ──► []NEREntity{Group, Start, End, Score}
+                       │
+                       ▼
+   Redactor.RedactWithNER(text, overrides, NERConfig{EntityActions, MinScore, DefaultAction})
+                       │  merges NER hits with regex hits, resolves action per entity group
+                       ▼
+   mask / block / allow  +  PIIEvent{ PatternID: "ner:FIRSTNAME", ... }
+```
+
+Mapping notes / gotchas:
+- **Entity-group vocabulary**: the model emits 54 category names (FIRSTNAME, IBAN, …). These
+  become `NERConfig.EntityActions` keys and `ner:<GROUP>` audit IDs. We should ship a sane
+  default action map (e.g. block secrets/credentials, mask names/contact, allow-log
+  low-risk) and let admins override per model.
+- **Offsets**: `NEREntity.Start/End` and `TokenClassifyEntity.start/end` are **byte** offsets;
+  HF "simple" aggregation and tiktoken offsets are **character/codepoint**-based. For
+  multilingual UTF-8 (the whole point of this model) we must convert char→byte carefully. The
+  existing Python path returns Python `str` indices — a latent bug for non-ASCII that we
+  should fix in the wrapper/backend regardless.
+- **Threshold**: `TokenClassifyRequest.threshold` ↔ `NERConfig.MinScore`. The Viterbi
+  transition biases are a *second*, orthogonal knob (precision/recall of span boundaries) —
+  we should expose them as backend load options, not per-request, to start.
+- **Streaming**: model-based detection needs the full text; on the response path it composes
+  with the existing tail-buffer stream filter, but a 50M-active forward per chunk is costly.
+  Decision needed: request-side only first, or buffer-and-classify on response too (§9).
+
+---
+
+## 5. Runtime strategy: llama.cpp vs a standalone ggml graph
+
+The request is to drop Python at inference. Two ways to get a C++/ggml runtime:
+
+- **(A) Extend llama.cpp** — the model is *literally a gpt-oss variant* (config confirms
+  `model_type: openai_privacy_filter`, 8 layers, d640, 128 experts top-4, `sliding_window:
+  128`, YaRN θ=150000 factor 32, vocab 200064 = o200k). llama.cpp already ships the entire
+  gpt-oss graph (`LLM_ARCH_OPENAI_MOE`, `src/models/openai-moe.cpp`): MoE top-k routing,
+  attention sinks, sliding-window (iswa) attention, RoPE/YaRN, and the o200k tokenizer. The
+  only missing pieces are a token-classification head and per-token logit output — and an
+  open upstream PR already adds exactly that substrate (see §6.2).
+- **(B) Standalone `privacy-filter.cpp` ggml graph** (the original plan; kept as fallback in
+  §6.7) — hand-build the MoE + banded-attention + sinks graph from scratch, à la
+  vibevoice/parakeet/LocalVQE.
+
+**Recommendation: pursue (A), the llama.cpp path.** Re-implementing the gpt-oss MoE graph
+(experts + sinks + iswa + YaRN) by hand in (B) is exactly the work llama.cpp has already done
+and hardened across CPU/CUDA/Metal/Vulkan, and would also force us to vendor the o200k
+tokenizer and reinvent quantization. (A) reuses all of it and rides upstream momentum;
+quantization (incl. apex-quant `--tensor-type-file`) and multi-backend acceleration come for
+free. The cost is carrying a llama.cpp patch until/unless we upstream it (see trade-offs in
+§6.5 and the new decisions in §9). **This supersedes the earlier "own a privacy-filter.cpp
+repo" framing** — under path (A) the artifact is a llama.cpp patch (in
+`backend/cpp/llama-cpp/patches/`) + a GGUF converter, not a separate C++ engine, and the
+o200k-tokenizer-vendoring question (old §9.6) is moot because llama.cpp already has it.
+
+This does **not** change Phase 0: the Python `transformers` backend stays the interim path and
+the reference oracle (§3.3 items 1–4, decision §9.1 = "try the existing python backend first").
+
+---
+
+## 6. Implementing in llama.cpp (recommended path)
+
+### 6.1 The model is gpt-oss + a token-classification head (config-confirmed)
+
+`openai/privacy-filter` config.json: `architectures: [OpenAIPrivacyFilterForTokenClassification]`,
+`model_type: openai_privacy_filter`, `num_hidden_layers: 8`, `hidden_size: 640`,
+`num_attention_heads: 14`, `num_key_value_heads: 2`, `head_dim: 64`, `intermediate_size: 640`,
+`num_local_experts: 128`, `num_experts_per_tok: 4`, `sliding_window: 128`, `vocab_size:
+200064`, `rope_parameters: {yarn, θ=150000, factor=32, orig=4096}`, `num_labels: 33`. The
+multilingual fine-tune is identical except `num_labels: 217`. Aside from the classification
+head and the (non-causal) attention question, this is the gpt-oss architecture llama.cpp
+already runs.
+
+### 6.2 What llama.cpp already provides (verified against the vendored checkout)
+
+- **gpt-oss arch**: `LLM_ARCH_OPENAI_MOE` → gguf `"gpt-oss"` (`src/llama-arch.cpp:115`), graph
+  in `src/models/openai-moe.cpp` — `build_moe_ffn` top-k routing, `attn_sinks`
+  (`ggml_soft_max_add_sinks`), iswa sliding-window (`build_attn_inp_kv_iswa`), NEOX
+  RoPE + YaRN, gate/up/down expert tensors + biases. (PR #15091.)
+- **o200k tokenizer**: `LLAMA_VOCAB_PRE_TYPE_GPT4O` regex + harmony special tokens
+  (`src/llama-vocab.cpp:2207-2213`); convert via `_set_vocab_gpt2()` BPE export.
+- **Non-causal / bidirectional attention**: per-context `cparams.causal_attn`,
+  `llama_set_causal_attn(ctx,false)`, `LLAMA_ATTENTION_TYPE_NON_CAUSAL`
+  (`src/llama-context.cpp:161-181, 1108-1118`). Non-causal needs the whole sequence in one
+  ubatch (`n_ubatch ≥ n_tokens`). Used by BERT/ModernBERT/EuroBERT/T5-encoder.
+- **Sequence-level classification head (merged)**: `LLAMA_POOLING_TYPE_RANK`, `cls`/`cls_out`
+  tensors (`LLM_TENSOR_CLS{,_OUT}`), label metadata key `*.classifier.output_labels`,
+  `n_cls_out` width. (PR #9510 reranking.)
+- **Token-level classification head (OPEN upstream PR #19725 "llama: add
+  BertForTokenClassification support")** — the substrate we want, and it explicitly targets
+  OpenMed NER models:
+  - adds `LLAMA_POOLING_TYPE_TOKEN_CLS` (`--pooling token-cls`);
+  - in `build_pooling` applies `cls_out` projection at **every** token position →
+    `[n_cls_out, n_tokens]`;
+  - repurposes `llama_get_embeddings_ith(ctx,i)` to return `n_cls_out` **label logits** per
+    token (instead of `n_embd` embeddings);
+  - convert support for `*ForTokenClassification` (drops `head.` tensors, sets
+    `PoolingType.TOKEN_CLS`, writes `n_cls_out` from `id2label`).
+  - Small/contained: +209/−56 across 15 files. **But wired only for encoder BERT/ModernBERT,
+    not a decoder arch like gpt-oss.**
+
+### 6.3 The gap to close for a gpt-oss token classifier
+
+On top of PR #19725's substrate, `LLM_ARCH_OPENAI_MOE` needs:
+
+1. **A `GptOssForTokenClassification` (here: `OpenAIPrivacyFilterForTokenClassification`)
+   convert class** in `convert_hf_to_gguf.py` that emits `cls.output`(+bias) and the
+   `output_labels` metadata, sets `PoolingType.TOKEN_CLS`, and writes `n_cls_out` from
+   `id2label` (33 or 217). Reuse the gpt-oss tensor mapping + `_set_vocab_gpt2()`.
+2. **Load `cls_out`/`cls_out_b` for `LLM_ARCH_OPENAI_MOE`** in `src/llama-model.cpp`.
+3. **Invoke `build_pooling` from `src/models/openai-moe.cpp`** — today that graph sets
+   `t_logits` (LM head) and returns; it must instead expose per-token hidden states and run
+   the TOKEN_CLS projection.
+4. **Attention mode** (resolved from the HF source — see §6.9): the model is **bidirectional**
+   (`self.is_causal = False`, no KV cache, `position_ids = arange`) with a **symmetric band**
+   `|q − kv| ≤ 128` (the config `sliding_window: 128`; the `+1`→129 in the code is an FA-only
+   symmetry fudge — use 128 for a manual mask). **Attention sinks are retained** (14, one per
+   query head) and the **softmax is forced fp32**. All 8 layers use the *same* mask (no
+   causal/global alternation). So we run `causal_attn=false` with a **non-causal,
+   symmetric-banded mask** — which, per §6.10, **already exists** in llama.cpp
+   (`LLAMA_SWA_TYPE_SYMMETRIC` + the no-cache `fill_mask`); we just select it via hparams. The
+   residual risk is numerical parity vs HF (the `n_swa=256` mapping, fp32 softmax), not new ggml.
+
+This is roughly the order of work of PR #19725 (a convert class + a handful of `src/` edits)
+**plus** the non-causal banded-mask variant and the gpt-oss-reuse fixes in §6.9. We build on
+PR #19725 (decision §9.10 = build on upstream, carry its diff in
+`backend/cpp/llama-cpp/patches/` for dev — `prepare.sh` already injects files; we extend it to
+apply patches). If upstream gets messy or stalls, the fallback ladder is: carry a patch →
+copy the needed bits into a standalone project (§6.7).
+
+### 6.4 Wiring into LocalAI's vendored llama-cpp backend
+
+LocalAI's llama.cpp backend (`backend/cpp/llama-cpp/grpc-server.cpp`, injected into the
+upstream tree by `prepare.sh`; llama.cpp pinned by commit in the `Makefile`) already
+implements `Embedding`, `Rerank` (`POOLING_TYPE_RANK`), and `Score` via the server-context
+task queue — but **not** `TokenClassify`. Adding it mirrors the Rerank path:
+
+1. New `BackendServiceImpl::TokenClassify` RPC + a `SERVER_TASK_TYPE_TOKEN_CLASSIFY`.
+2. Load the model with `pooling_type = TOKEN_CLS` and `causal_attn = false`
+   (a new load flag, à la `--reranking`).
+3. Run one non-causal forward (full sequence in one ubatch); read per-token logits via
+   `llama_get_embeddings_ith` (= `n_cls_out` logits/token under TOKEN_CLS).
+4. **Viterbi BIOES decode + span assembly + offset mapping** in the grpc-server (C++), using
+   llama.cpp's tokenizer offsets to produce **byte** offsets, then fill
+   `TokenClassifyEntity{entity_group, start, end, score, text}`. (Alternatively decode on the
+   Go side, but C++ keeps the token→byte offset mapping next to the tokenizer.)
+5. Capability metadata: add `MethodTokenClassify` + a `classification`/`ner` usecase in
+   `core/config/backend_capabilities.go` and register it on the `llama-cpp` backend; follow
+   the `.agents/api-endpoints-and-auth.md` checklist.
+
+The Go `NERDetector` (§3.3 item 2) then just calls `TokenClassify` over the existing gRPC
+client plumbing — identical contract whether the server is the Python backend (Phase 0) or
+the llama.cpp backend (Phase 2).
+
+### 6.5 Trade-offs of the llama.cpp path (vs standalone)
+
+**Pros**: reuse the hardened gpt-oss MoE/sinks/iswa/YaRN graph; o200k tokenizer for free;
+quantization (incl. apex-quant `--tensor-type-file`) and CUDA/Metal/Vulkan/CPU acceleration
+for free; same backend LocalAI already ships and updates; potential to upstream and share
+maintenance. **Cons**: we carry a patch against a pinned llama.cpp commit (rebases on bumps)
+until upstreamed; depends on (or vendors) PR #19725; the bidirectional + banded-mask + sinks
+combination is novel for a decoder arch and must be numerically verified; non-causal forces
+single-ubatch (fine for short PII inputs, caps very long contexts).
+
+### 6.6 Decoding correctness (a feature, not just a port)
+
+The current Python path's `aggregation_strategy="simple"` ignores the model's intended
+**constrained Viterbi** over the BIOES grammar (6 transition-bias params for precision/recall).
+Implementing Viterbi properly (§6.4 step 4) is both the faithful port **and** an accuracy
+improvement over what LocalAI does today. Keep the 6 transition biases configurable (backend
+load options to start).
+
+### 6.7 Fallback: standalone `privacy-filter.cpp` ggml graph
+
+If the llama.cpp path is blocked (PR #19725 abandoned + we don't want to carry it, or the
+bidirectional/sinks combination proves intractable inside llama.cpp), fall back to a dedicated
+ggml graph under `localai-org` (decision §9.4), following vibevoice/parakeet/LocalVQE:
+`*_graph.cpp` (banded non-causal MoE), `*_model.cpp`, `*_api.{h,cpp}`, vendored o200k
+tokenizer, Viterbi, GGUF converter; `backend/go/privacy-filter-cpp/` dlopens it via `purego`
+and implements `TokenClassify`. This is more code and duplicates llama.cpp's MoE graph, hence
+the fallback ranking.
+
+### 6.8 Quantization (apex-quant) — not needed day one
+
+Decision §9.8 = F16 is acceptable (~3 GB). When/if we quantize: apex-quant
+(<https://github.com/localai-org/apex-quant>) does MoE-aware mixed precision on **stock
+llama.cpp** via `--tensor-type-file` — routed experts aggressive (IQ4_XS/Q4_K mid, Q5_K
+near-edge, Q6_K edge), shared/always-active Q8_0, attention Q6_K. Under the llama.cpp path
+this works out of the box. Verify with a **task metric (span-F1 per language) + KL-vs-F16**,
+*not* perplexity (this is a classifier, not an LM).
+
+### 6.9 Layer-by-layer reference (HF) + gpt-oss reuse risks
+
+Verified against the HF *modular* source (`modular_openai_privacy_filter.py`, which inherits
+gpt-oss + `masking_utils.py` + `modeling_rope_utils.py`) and the `opf/_core/` decoder. This is
+the contract the llama.cpp port must match numerically, and it lists where the model **differs
+from stock gpt-oss** (so a naive `LLM_ARCH_OPENAI_MOE` reuse drifts).
+
+**Block order** (pre-norm, 8 layers, RMSNorm eps 1e-5, fp32 norm; no embedding scaling; final
+`model.norm`; no LM head, no tied embeddings):
+`x → input_layernorm → attn → +residual → post_attention_layernorm → MoE → +residual`, then
+final norm → `score` (Linear 640→33/217, **with bias**, no activation, dropout 0.0) →
+`log_softmax` → Viterbi.
+
+**Tensor names → GGUF mapping** (per layer N):
+
+| HF tensor | shape | note for the port |
+|---|---|---|
+| `model.embed_tokens.weight` | [200064, 640] | o200k vocab |
+| `model.layers.N.input_layernorm.weight` | [640] | RMSNorm |
+| `…self_attn.q_proj.{weight,bias}` | [896,640]/[896] | 14×64; **bias present** |
+| `…self_attn.k_proj.{weight,bias}` | [128,640]/[128] | 2×64 (GQA group 7) |
+| `…self_attn.v_proj.{weight,bias}` | [128,640]/[128] | |
+| `…self_attn.o_proj.{weight,bias}` | [640,896]/[640] | |
+| `…self_attn.sinks` | [14] | one per query head — **keep** |
+| `…post_attention_layernorm.weight` | [640] | RMSNorm |
+| `…mlp.router.{weight,bias}` | [128,640]/[128] | top-4 router, **bias present** |
+| `…mlp.experts.gate_up_proj` | [128,640,1280] | fused, **chunk layout** (see risk #7) |
+| `…mlp.experts.gate_up_proj_bias` | [128,1280] | |
+| `…mlp.experts.down_proj` | [128,640,640] | |
+| `…mlp.experts.down_proj_bias` | [128,640] | |
+| `model.norm.weight` | [640] | |
+| `score.{weight,bias}` | [33,640]/[33] | classification head (217 for multilingual) |
+
+**Reuse risks vs stock gpt-oss (each is a known drift source — verify per §7):**
+
+1. **Non-causal symmetric band** `|q−kv| ≤ 128`, same mask all layers (no causal, no
+   global-layer alternation). gpt-oss uses causal iswa. → new mask variant.
+2. **Window 128, not 129** (the code's `+1` is FA-only). Use 128 for the manual mask.
+3. **Attention sinks present + softmax forced fp32** (sink logit column appended to the
+   denominator, then dropped). gpt-oss has sinks too, but confirm fp32 softmax.
+4. **Q and K each scaled by `head_dim**-0.25` *after* RoPE**, attention `scaling = 1.0`
+   (algebraically `1/√d`, but split across q/k and post-RoPE — matters in low precision).
+5. **No q_norm / k_norm.**
+6. **RoPE = YaRN** with `attention_scaling = 0.1·ln(32)+1 = 1.34657` baked into cos/sin,
+   θ=150000, **`truncate=False`** (no floor/ceil on the correction range — a YaRN impl that
+   always floors/ceils diverges); cos/sin fp32; gpt-oss chunked-half rotary pairing.
+7. **MoE expert gate/up uses `chunk(2)` (concatenated) layout, NOT gpt-oss interleaved
+   `::2`/`1::2`.** The convert script must emit the layout llama.cpp's gpt-oss graph expects
+   (or we adjust the graph). Wrong split → wrong gate/up assignment. Clamp: gate `max=7`, up
+   `±7`, `(up+1)·(gate·σ(1.702·gate))`.
+8. **MoE double-scaling**: router does `softmax(top4)/4`; the MLP then multiplies the expert
+   sum by `num_experts_per_tok (=4)`. Net = `softmax(top4)`, but the divide/multiply happen at
+   different points and everything runs **fp32** ("very sensitive to accumulation order").
+9. **Router has a bias** `[128]` — don't drop it.
+10. **Classification `score` head has a bias, no tanh/activation** (unlike some rerank heads).
+    `n_cls_out` = 33 (base) / 217 (multilingual); label 0 = `O`.
+
+**Decoder (`opf/_core/`)**: per-token `log_softmax` (fp32) → **constrained Viterbi** over BIOES
+with start/end scores and valid-transition rules; fallback to per-token argmax if all paths
+die. The **6 transition-bias params are all 0.0 in the shipped `viterbi_calibration.json`** —
+so at the default operating point the biases are inert and only the *structural* BIOES
+constraints matter (we still expose them as load options). Spans: BIOES walk → token spans →
+**byte-accurate char offsets** from the tiktoken byte stream (`decode_text_with_offsets`,
+UTF-8 aware) → convert to the **byte** offsets the proto/`NEREntity` want. Optional whitespace
+trim + per-label de-overlap.
+
+### 6.10 Concrete llama.cpp changes + arch decision (grounded in the vendored source)
+
+Read against the vendored checkout (commit `22d66b56`): the gpt-oss graph
+`src/models/openai-moe.cpp` (169 lines) and PR #19725's `build_pooling` branch. Any in-tree
+work must follow `backend/cpp/llama-cpp/llama.cpp/AGENTS.md`.
+
+**What we reuse unchanged** (this is why we don't hand-roll a graph): `build_inp_embd`,
+`build_norm` (RMS), `build_qkv` (q/k/v + biases), `ggml_rope_ext` (YaRN via hparams),
+`build_attn` (already takes `attn_sinks`!), and **`build_moe_ffn(..., LLM_FFN_SWIGLU_OAI_MOE,
+…, SOFTMAX_WEIGHT, …)`** with the per-expert gate/up/down **biases** and router bias — exactly
+the gpt-oss MoE. The expert tensors are loaded **already split** into `ffn_gate_exps` /
+`ffn_up_exps` (the convert script does the splitting), so the chunk-vs-interleaved gate/up
+issue (§6.9 risk #7) is **convert-time only** — once we split with `chunk(2)` the graph is
+identical.
+
+**Two numerical clarifications from reading the graph:**
+- `build_attn(...)` is called with `kq_scale = 1.0f/sqrtf(n_rot) = 1/8`. The HF model scales q
+  and k each by `head_dim**-0.25` post-RoPE with attn scale 1.0 → net `q·k/√d = 1/8` too. So
+  **the standard `1/√d` path is numerically equivalent**; we do *not* need to replicate the
+  split (verify within tolerance, §7).
+- **YaRN `truncate=false`**: HF keeps the correction range as floats (no floor/ceil). ggml's
+  `ggml_rope_ext` corr-dims may floor/ceil — a small potential drift. Treat as a verification
+  risk; add a flag only if parity fails.
+
+**What differs from gpt-oss (the actual new work):**
+1. **Attention input** — *already supported, not new code.* gpt-oss uses
+   `build_attn_inp_kv_iswa()` (causal KV-cache SWA). We instead use the **no-cache** path
+   `build_attn_inp_no_cache()` whose `fill_mask` (`llama-graph.cpp:428-462`) already honors
+   `cparams.causal_attn` **and** `is_masked_swa(...)`. Setting (in `load_hparams`)
+   `causal_attn=false`, `swa_type=LLAMA_SWA_TYPE_SYMMETRIC`, `n_swa=256` yields exactly
+   `|q−kv| ≤ 128` bidirectional — `LLAMA_SWA_TYPE_SYMMETRIC` (`llama-hparams.h:342`) masks
+   `|p1−p0| > n_swa/2`. So the band falls out of existing primitives; the only trap is the
+   **`n_swa = 2·sliding_window` (=256)** mapping (most likely off-by-one — verify against an HF
+   mask dump). The band is **required for correctness**: real PII inputs routinely exceed the
+   257-token window, and unbanded attention on them computes a different function. (An earlier
+   draft called this "the one new primitive" and proposed "full non-causal first" — both wrong.)
+2. **Serving long inputs — overlapping windows, exact.** Because attention is *strictly* local
+   (±128), a token's logits depend only on its ±128 neighborhood. Processing the text in
+   windows of width W with a halo ≥128 each side and keeping only the interior labels is
+   **bit-exact** vs a single banded forward (window `[0,W)` keep `[0,W−128)`; stride `W−256`;
+   absolute start/end need no halo). This bounds compute/memory to O(N·W), keeps each ubatch
+   small (non-causal needs `n_ubatch ≥ window`), and is streamable. It is also *better* than
+   OpenAI's `opf` runtime, which uses non-overlapping `n_ctx` windows and degrades at seams.
+   Plan: ship the **single banded forward** first (correct at any length, simplest to verify),
+   add **windowing** as the throughput/memory path for long inputs. HF parity tests use
+   single-window inputs vs one banded forward.
+3. **Output tail**: gpt-oss ends with the lm_head (`build_lora_mm(model.output)` → `t_logits`,
+   lines 162-166). We instead stop at `result_norm` (`res->t_embd`) and route through
+   **`build_pooling` with `LLAMA_POOLING_TYPE_TOKEN_CLS`** (PR #19725): `cur = inp; cur =
+   ggml_mul_mat(cls_out, cur); cur = ggml_add(cur, cls_out_b);` → `[n_cls_out, n_tokens]`, like
+   the encoder graphs. No lm_head, no `output`/`output_norm`-as-lm_head.
+4. **Tensors**: load `cls_out`/`cls_out_b` (from `score.{weight,bias}`) instead of `output`;
+   set `pooling_type = TOKEN_CLS`, `n_cls_out` (33/217), and write `output_labels` metadata —
+   all already supported by PR #19725's plumbing.
+5. **`build_inp_out_ids`**: with token-level pooling, n_outputs = all tokens (PR #19725's
+   server already branches `token_level_pooling = NONE || TOKEN_CLS`).
+
+**Decision §9.14 — register a SEPARATE arch.** Recommend a new
+`LLM_ARCH_OPENAI_PRIVACY_FILTER` (gguf name `openai-privacy-filter`, matching HF
+`model_type: openai_privacy_filter`) with its own ~120-line `src/models/openai-privacy-filter.cpp`
+that *composes the shared helpers above* but swaps in the non-causal banded attn input and the
+TOKEN_CLS tail. Rationale: (a) it does **not** fork the hot, well-maintained gpt-oss graph with
+causal-vs-non-causal / lm_head-vs-cls conditionals (maintainers dislike that); (b) it mirrors
+HF's separate model type; (c) it reuses every expensive kernel, so the diff is "a new graph
+file + arch enum + loader + convert class," not new ggml ops. Touch points (per the
+add-architecture guide): `gguf-py/gguf/constants.py` (MODEL_ARCH + tensor list incl.
+`CLS_OUT`), `tensor_mapping.py`, a `convert_hf_to_gguf.py` class (subclass the gpt-oss
+converter to reuse o200k vocab + tensor map; override the expert split to `chunk(2)`; emit
+`cls.output(+bias)`, `PoolingType.TOKEN_CLS`, `n_cls_out`, `output_labels`; drop lm_head),
+`src/llama-arch.{h,cpp}`, `src/llama-model.cpp` (`load_hparams` + tensor load + rope-type),
+`src/models/openai-privacy-filter.cpp` + `models.h` + `src/CMakeLists.txt`.
+
+**Draft skeletons** (grounded in the source above) live in `docs/plans/pii-ner-ggml/`:
+`conversion_openai_privacy_filter.py` (the `GptOssModel` subclass — incl. the critical
+`chunk(2)` expert split and the `score`→`cls.output` head), `openai-privacy-filter.cpp` (the
+graph: `openai-moe.cpp` + the 3 marked changes), and `INTEGRATION.md` (the full touch-point
+list, the PR #19725 carry, the `n_swa=256` trap, the LocalAI `TokenClassify` + windowing +
+Viterbi wiring, and the verification ladder). They are design drafts, not built code.
+
+---
+
+## 7. Conversion methodology (from LocalVQE / parakeet experience)
+
+Distilled from `~/c/LocalVQE-train/PROCESS.md` + `~/c/LocalVQE/ggml/` and the request's
+pointers. These are the rules that made those ports succeed:
+
+1. **Block-level numeric equivalence first.** Implement faithfully, then prove each block
+   matches a PyTorch reference *before* the end-to-end path. LocalVQE dumps reference
+   activations (`compare.py`) and asserts per-block match in C++ tests
+   (`test_encoder.cpp`, …). For us: dump from the HF model, for a fixed multilingual prompt
+   set, the intermediates most likely to drift given §6.9 — **post-RoPE q/k (after the
+   `head_dim**-0.25` scaling)**, **attention probs with the sink column included (fp32)**,
+   **router top-4 scores**, **per-expert gate/up/down outputs**, **post-MoE hidden (after the
+   ×4 rescale)**, **final-norm hidden**, and **`score` logits** — then assert the llama.cpp
+   graph matches each within tolerance, layer by layer. Also check the **YaRN inv_freq /
+   attention_scaling** table directly (truncate=False). **Re-run the comparison after every
+   change.** End-to-end accuracy hides where drift is introduced.
+2. **One change at a time;** run a clean check before concluding. (PROCESS principle #5.)
+3. **Architect for inference / quant-friendly shapes.** gpt-oss dims (640, 128 experts) are
+   already fixed by the checkpoint, but choose Viterbi/buffer layouts and any padding to be
+   SIMD- and block-size friendly (Q4_K block 256). (PROCESS principle #8.)
+4. **No per-call allocation on the hot path.** LocalVQE pre-allocates with a ggml graph
+   allocator and reports flat peak RSS regardless of thread count. Build a static graph sized
+   to max context; reuse buffers across calls. (PROCESS §5 / §6.)
+5. **Fuzz + assert.** LocalVQE ships a `fuzz/` harness and a `build-fuzz` config. Fuzz the
+   tokenizer, offset mapping, and Viterbi decoder (empty input, all-`O`, adversarial UTF-8,
+   max-length) with assertions on invariants (valid BIOES paths, offsets within bounds,
+   no OOB).
+6. **Cross-check references.** LocalVQE found real bugs in *both* upstream references by
+   checking against two. Our references: the `opf` CLI output and the HF
+   `OpenAIPrivacyFilterForTokenClassification` — compare both, especially around Viterbi vs
+   pipeline aggregation.
+7. **Verify per scenario/distribution.** Their lesson: averaged metrics hide failures. For
+   us: evaluate span-F1 **per language** (the model is known-weaker on CJK), not one blended
+   number. Build a small held-out set per language from AI4Privacy.
+
+---
+
+## 8. Proposed phased plan (for discussion)
+
+**Phase −1 — optional upstream warm-up (parallelizable, decision §9.13).**
+Help review/test/land **PR #19725** (the TOKEN_CLS substrate we build on) and/or revive
+**PR #15189** (`echo` logprobs). Builds the maintainer relationship and de-risks Phase 1.
+*Not* the originally-proposed standalone Score PR (that niche is taken).
+
+**Phase 0 — interim, Python-backed (proves the seam end to end).**
+Wire §3.3 items 1–4 against the existing `transformers` backend (`Type=TokenClassification`,
+model = OpenMed/privacy-filter-multilingual). Fix char→byte offsets. Ship default
+entity→action map + model `pii:` config. This delivers working ML PII filtering and gives us
+the **reference oracle** for the port. (Removable later; keep behind config.)
+
+**Phase 1 — llama.cpp conversion + offline parity** (path A, §6).
+Track/grab PR #19725 for the TOKEN_CLS substrate. Add the
+`OpenAIPrivacyFilterForTokenClassification` convert class (emits `cls.output` + `output_labels`
++ `n_cls_out`, reuses gpt-oss mapping/vocab); load `cls_out` for `LLM_ARCH_OPENAI_MOE`; call
+`build_pooling` from `openai-moe.cpp`; resolve the non-causal/banded-mask + sinks question
+(§6.3.4) **numerically against HF**. Achieve **block-level + final-logit parity** (§7.1) at
+F16 on a per-language prompt set, using `llama-embedding --pooling token-cls` (or the
+grpc-server) before any LocalAI wiring.
+
+**Phase 2 — LocalAI llama-cpp backend `TokenClassify`.**
+Add `TokenClassify` RPC + `SERVER_TASK_TYPE_TOKEN_CLASSIFY` to `grpc-server.cpp` (load with
+`pooling=TOKEN_CLS`, `causal_attn=false`), Viterbi + offset mapping in C++ (§6.4); capability
+metadata; carry the llama.cpp diff via `backend/cpp/llama-cpp/patches/` if not upstream. Point
+the Go `NERDetector` at the llama-cpp backend; the Python path becomes optional/fallback.
+
+**Phase 3 — productionization.**
+Streaming/response-path decision (§9), gallery entry, docs (`docs/content/`), admin UI knobs,
+MCP tool if it becomes admin-managed (`.agents/localai-assistant-mcp.md`).
+
+**Phase 4 — quantization (apex-quant) + eval.**
+Mixed-precision GGUF; verify with **span-F1 per language + KL-vs-F16**, not perplexity. Ship
+F16 first; quantize only if footprint matters.
+
+---
+
+## 9. Open questions / decisions to make together
+
+1. **Phase 0 first?** Do we ship the Python-backed interim to validate the middleware seam and
+   get a reference oracle, or go straight to GGML? 
+
+Try the existing python backend first.
+
+2. **Model variant**: multilingual (217 classes, 16 langs) as primary? Also support the base
+   (33 classes, en) and/or nemotron (clinical)? The label→action map and tests differ per
+   variant.
+
+   multilingual as primary
+
+3. **Response/streaming**: request-side redaction only at first (cheapest, safest), or also
+   buffer-and-classify model output? A 50M-active forward per SSE flush is non-trivial.
+
+   Request side only at first
+
+4. **Backend home**: new `privacy-filter.cpp` repo under which org, and do we own the upstream
+   (like vibevoice/parakeet under `mudler`) or `localai-org` (like LocalVQE/apex-quant)?
+
+   localai-org and yes we own the upstream
+
+5. **Banded bidirectional attention**: confirm the band/window (card says band 128 → window
+   257) against the actual config.json before building the mask.
+
+   I don't know, but if there is a difference then I would have thought that the config.json is correct, but this is somethign to test
+
+6. **Tokenizer**: reuse llama.cpp's o200k_base/tiktoken support as a vendored lib, or a
+   minimal standalone BPE in `privacy-filter.cpp`?
+
+   vendor it
+7. **Default action policy**: what's the out-of-the-box mapping of 54 categories →
+   block/mask/allow, and how much do we trust the model to *block* vs only *mask*?
+
+   mask
+
+8. **Quantization need**: is F16 (~3 GB) acceptable, or do we need apex-quant from day one
+   (e.g. for edge/Vulkan targets)?
+
+   F16 is OK
+
+### New decisions raised by the llama.cpp path (§5/§6)
+
+9. **Runtime path**: confirm **(A) extend llama.cpp** as primary (recommended), with the
+   standalone `privacy-filter.cpp` graph as fallback (§6.7)? This reframes the old decision
+   §9.4 — under (A) the artifact is a llama.cpp patch + GGUF converter, not a `localai-org`
+   C++ engine — and makes §9.6 (vendor a tokenizer) moot (llama.cpp already has o200k).
+
+10. **Upstream vs carry** — DECIDED: **build on upstream PR #19725** and carry its diff in
+    `backend/cpp/llama-cpp/patches/` for dev. Escalation ladder if upstream is slow/messy:
+    carry a patch → if real friction, copy only the bits we need into a standalone project
+    (§6.7). We still aim to upstream our gpt-oss token-classification arch.
+
+11. **Bidirectional + banded + sinks** — RESOLVED from the HF source (§6.3.4 / §6.9):
+    **symmetric band `|q−kv| ≤ 128`** (config 128, not 129), **non-causal**, **sinks retained**
+    (14/query-head), **fp32 softmax**, same mask all 8 layers. The new primitive is a
+    non-causal symmetric-banded SWA mask. *Remaining* validation = numerical parity vs HF
+    (§7.1), not "does it exist". Upstream-friendliness: this is a clean new arch
+    (`openai_privacy_filter`) rather than a hack on gpt-oss; keep it that way (see §9.13).
+
+12. **Where Viterbi lives**: decode BIOES + map token→byte offsets in the C++ grpc-server
+    (next to the tokenizer, recommended) or on the Go side after receiving raw per-token
+    logits (would need a new proto carrying logits + offsets)? Recommend C++ to keep the
+    `TokenClassify` contract unchanged.
+
+13. **Easy upstream warm-up** — the proposed "Score / continuation (echo) logprobs into
+    llama-server" is **already owned by a core maintainer**: ngxson's PR #17935 (closed) and
+    fo40225's PR #15189 (open) implement OpenAI-style `echo`+prompt-logprobs, modelled on
+    `tools/perplexity/perplexity.cpp` (the same primitive LocalAI's `Score` uses —
+    `core/backend/score.go`, grpc-server `Score`). Opening a competing PR would collide. Better
+    warm-ups that build the *same* familiarity/relationship and de-risk our path:
+    **(a) help review/test and land PR #19725** (the substrate we depend on — author pinged
+    CISC; it needs 2 approvals and is unreviewed); **(b) help revive PR #15189** (gets `echo`
+    logprobs upstream so LocalAI can eventually retire its custom `Score`). Recommend (a).
+    Note hygiene norms before any PR: snake_case, `LLAMA_*` enum prefixes, 4-space + brace-on-
+    same-line, plain `for` loops, **no new deps/files/headers**, reuse existing machinery,
+    one PR at a time, and an `Assisted-by:` trailer (LocalAI policy) / plain AI-disclosure line
+    (upstream norm).
+
+14. **Arch identity upstream** — RECOMMENDED (analysis in §6.10): register a **separate**
+    `LLM_ARCH_OPENAI_PRIVACY_FILTER` (gguf `openai-privacy-filter`, matching
+    `config.model_type`) with its own small graph file composing the shared gpt-oss helpers,
+    rather than forking `LLM_ARCH_OPENAI_MOE` with causal/lm_head conditionals. Confirm this
+    framing (it drives the upstream PR shape). Open sub-question: does the non-causal
+    symmetric-banded mask go in as a reusable primitive or a privacy-filter-local mask?
+
+---
+
+## 10. Key references
+
+- openai/privacy-filter — model card: <https://huggingface.co/openai/privacy-filter>;
+  repo: <https://github.com/openai/privacy-filter>; model card PDF:
+  <https://cdn.openai.com/pdf/c66281ed-b638-456a-8ce1-97e9f5264a90/OpenAI-Privacy-Filter-Model-Card.pdf>
+- OpenMed/privacy-filter-multilingual: <https://huggingface.co/OpenMed/privacy-filter-multilingual>
+  (MLX variants `-mlx-8bit`; clinical `privacy-filter-nemotron`); docs:
+  <https://openmed.life/docs/anonymization/>
+- HF Transformers integration:
+  <https://github.com/huggingface/transformers/blob/main/docs/source/en/model_doc/openai_privacy_filter.md>
+- apex-quant (MoE GGUF quant): <https://github.com/localai-org/apex-quant>
+- llama.cpp gpt-oss support: PR <https://github.com/ggml-org/llama.cpp/pull/15091>,
+  guide <https://github.com/ggml-org/llama.cpp/discussions/15396>
+- llama.cpp token-classification substrate (the key precedent, OPEN): PR
+  <https://github.com/ggml-org/llama.cpp/pull/19725> ("add BertForTokenClassification support")
+- llama.cpp reranking / sequence-classification head (merged): PR
+  <https://github.com/ggml-org/llama.cpp/pull/9510>
+- llama.cpp `echo`/prompt-logprobs (the contested "Score" warm-up): PR
+  <https://github.com/ggml-org/llama.cpp/pull/15189> (open),
+  <https://github.com/ggml-org/llama.cpp/pull/17935> (ngxson, closed),
+  issues <https://github.com/ggml-org/llama.cpp/issues/8942>,
+  <https://github.com/ggml-org/llama.cpp/issues/12591>; primitive in
+  `tools/perplexity/perplexity.cpp` (`log_softmax` + `llama_get_logits_ith`)
+- llama.cpp "add a new architecture" guide:
+  <https://github.com/ggml-org/llama.cpp/discussions/16770>,
+  <https://github.com/ggml-org/llama.cpp/blob/master/docs/development/HOWTO-add-model.md>;
+  contribution norms: <https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md>
+- HF reference (source of truth for layer parity): `transformers`
+  `models/openai_privacy_filter/modular_openai_privacy_filter.py` (+ inherited `gpt_oss`,
+  `masking_utils.py`, `modeling_rope_utils.py`); decoder `openai/privacy-filter`
+  `opf/_core/{decoding,sequence_labeling,spans,runtime}.py`; `viterbi_calibration.json`
+- LocalAI vendored llama.cpp backend: `backend/cpp/llama-cpp/{grpc-server.cpp,prepare.sh,Makefile}`,
+  upstream graph `src/models/openai-moe.cpp`; LocalAI Score shape: `core/backend/score.go`
+- Conversion methodology: `~/c/LocalVQE-train/PROCESS.md`, `~/c/LocalVQE/ggml/`,
+  `~/c/LocalVQE/README.md`
+- LocalAI integration points (in-repo):
+  - PII seam: `core/services/routing/pii/{ner.go,redactor.go,types.go}`
+  - gRPC: `backend/backend.proto` (TokenClassify), `pkg/grpc/*`, `pkg/model/connection_evicting_client.go`
+  - Existing Python impl (reference): `backend/python/transformers/backend.py:203,271`
+  - Capability registry: `core/config/backend_capabilities.go`
+  - GGML backend templates: `backend/go/{vibevoice-cpp,localvqe,parakeet-cpp}/`
+</content>
+</invoke>
diff --git a/docs/plans/pii-ner-ggml/INTEGRATION.md b/docs/plans/pii-ner-ggml/INTEGRATION.md
new file mode 100644
index 000000000000..b18c43590524
--- /dev/null
+++ b/docs/plans/pii-ner-ggml/INTEGRATION.md
@@ -0,0 +1,112 @@
+# Phase-1 integration guide — `openai-privacy-filter` in llama.cpp + LocalAI
+
+DRAFT companion to `../pii-ner-ggml-backend.md`. Skeletons in this dir:
+`conversion_openai_privacy_filter.py` (HF→GGUF) and `openai-privacy-filter.cpp` (graph). All
+line/symbol references are against the vendored checkout at commit `22d66b56`
+(`backend/cpp/llama-cpp/llama.cpp`). Any in-tree work must follow that tree's `AGENTS.md`.
+
+## What's already there vs what we add
+
+| Capability | Status @ 22d66b56 | Action |
+|---|---|---|
+| gpt-oss graph (MoE top-k, sinks, RoPE/YaRN, o200k vocab) | present (`src/models/openai-moe.cpp`, `conversion/gpt_oss.py`) | reuse |
+| `cls_out` / `cls_out_b` tensors, `n_cls_out`, `*.classifier.output_labels` | present (reranker; `bert.cpp:38`, `llama-arch.cpp:288,394`) | reuse |
+| **Symmetric banded non-causal mask** | present — `LLAMA_SWA_TYPE_SYMMETRIC` (`llama-hparams.h:342`) + no-cache `fill_mask` honors `causal_attn` + `is_masked_swa` (`llama-graph.cpp:428-462`) | reuse (set hparams) |
+| `LLAMA_POOLING_TYPE_TOKEN_CLS` + per-token `cls_out` in `build_pooling` + per-token extract in `llama-context` | **absent** (only RANK uses `cls_out`) | **carry PR #19725** |
+| `TokenClassify` in LocalAI llama-cpp grpc-server | absent (`Embedding`/`Rerank`/`Score` only) | **add** |
+
+The big de-risk: the bidirectional ±128 band is *not* new ggml code — it's
+`causal_attn=false` + `swa_type=SYMMETRIC` + `n_swa=256` on the existing no-cache path.
+
+## A. New architecture registration (llama.cpp)
+
+1. **`gguf-py/gguf/constants.py`**: add `MODEL_ARCH.OPENAI_PRIVACY_FILTER`,
+   `MODEL_ARCH_NAMES[...] = "openai-privacy-filter"`, and a `MODEL_TENSORS[...]` list =
+   gpt-oss's tensor set **minus `OUTPUT`** (no lm_head) **plus `CLS_OUT`**. Also `PoolingType.TOKEN_CLS = 5` (PR #19725).
+2. **`gguf-py/gguf/tensor_mapping.py`**: map HF `"score"` → `MODEL_TENSOR.CLS_OUT` (so
+   `score.weight`/`score.bias` → `cls.output.{weight,bias}`).
+3. **`conversion/__init__.py`**: `"OpenAIPrivacyFilterForTokenClassification": "openai_privacy_filter"`.
+4. **`conversion/openai_privacy_filter.py`**: the skeleton here (subclass `GptOssModel`).
+5. **`src/llama-arch.h` / `.cpp`**: `LLM_ARCH_OPENAI_PRIVACY_FILTER`, name string, and a
+   per-arch tensor-name table (clone gpt-oss's, drop `OUTPUT`, add `CLS_OUT` → `cls.output`).
+6. **`src/llama-model.cpp`**: register `llama_model_openai_privacy_filter` (load_hparams,
+   load_tensors, build_graph dispatch — see the `.cpp` skeleton); ensure `llama_model_rope_type`
+   returns NEOX for it (same as gpt-oss).
+7. **`src/models/models.h`** + **`src/CMakeLists.txt`**: declare the class + add the source.
+
+## B. Carry PR #19725 (token-level classification substrate)
+
+Apply as `backend/cpp/llama-cpp/patches/0001-token-cls.patch` (extend `prepare.sh` to `git apply`
+patches after the source copy). The diff (verified from the PR) adds:
+- `include/llama.h`: `LLAMA_POOLING_TYPE_TOKEN_CLS = 5`.
+- `src/llama-graph.cpp` `build_pooling`: a `case LLAMA_POOLING_TYPE_TOKEN_CLS` that applies
+  `cls_out` (+`cls_out_b`) to **every** token → `[n_cls_out, n_tokens]` (vs RANK pooling to one).
+- `src/llama-context.cpp`: under TOKEN_CLS, size the embd buffer to `n_tokens*n_cls_out` and
+  have `llama_get_embeddings_ith(i)` return `n_cls_out` logits for token `i`.
+- `convert`/gguf-py label plumbing (`add_classifier_output_labels`, `n_cls_out`).
+- `tools/server/server-context.cpp`: `token_level_pooling = NONE || TOKEN_CLS`.
+
+We depend on this; if it changes under review, re-sync the patch.
+
+## C. The one numeric trap — `n_swa` mapping (verify first)
+
+`is_masked_swa(SYMMETRIC)` masks when `|p1−p0| > n_swa/2`. HF band is `|q−kv| ≤ 128`. So
+**`n_swa = 256`** (the loader doubles `sliding_window`). This ×2 is the most likely bug.
+Verify by dumping the HF attention mask for a >257-token input and asserting the GGUF run masks
+the identical (i,j) pairs. (Other parity checks: YaRN `inv_freq`/`attention_scaling` with
+`truncate=false`; expert gate/up split is `chunk(2)` not interleaved; fp32 softmax incl. sinks.)
+
+## D. LocalAI llama-cpp backend — add `TokenClassify`
+
+In `backend/cpp/llama-cpp/grpc-server.cpp`, mirror `Rerank`/`Embedding`:
+1. Load with `pooling_type = LLAMA_POOLING_TYPE_TOKEN_CLS` and (forced by the arch) non-causal.
+   Add a load flag analogous to `--reranking`.
+2. `TokenClassify(text, threshold)`:
+   - tokenize once (o200k, with offsets — keep the token→byte map);
+   - run the windowed forward (§E) → per-token `n_cls_out` logits via `llama_get_embeddings_ith`;
+   - `log_softmax` (fp32) per token → constrained **Viterbi** over BIOES (§F);
+   - assemble spans → byte offsets → `TokenClassifyEntity{entity_group, start, end, score, text}`.
+3. Capability metadata: add `MethodTokenClassify` + a `classification`/`ner` usecase in
+   `core/config/backend_capabilities.go`; register on the `llama-cpp` backend. Follow
+   `.agents/api-endpoints-and-auth.md`.
+
+The Go `NERDetector` then calls `TokenClassify` over the existing gRPC client — same contract
+as the Phase-0 Python backend.
+
+## E. Windowed inference (long inputs, exact)
+
+Attention is strictly local (±128), so per-token logits depend only on the ±128 neighborhood.
+Process in windows of width `W` (e.g. 1024), keeping only interior labels:
+
+```
+HALF = 128                       // = sliding_window
+stride = W - 2*HALF              // 768 for W=1024
+for start in 0, stride, 2*stride, ...:
+    win = tokens[start : start+W]            // one non-causal ubatch (n_ubatch >= W)
+    logits = forward(win)                     // [n_cls_out, len(win)]
+    lo = (start == 0)            ? 0   : HALF              // drop left halo
+    hi = (start+W >= N)         ? len(win) : W - HALF      // drop right halo
+    emit logits[lo:hi] as global positions [start+lo : start+hi]
+```
+
+Bit-exact vs a single banded forward (interior tokens see their full receptive field), bounds
+memory/compute to O(N·W), keeps ubatches small, and is streamable. Strictly better than the
+`opf` runtime's non-overlapping `n_ctx` windows (no seam loss). For ≤W inputs it's one forward.
+
+## F. Viterbi + offsets (C++, next to the tokenizer)
+
+Port `opf/_core/{decoding,sequence_labeling,spans}.py`: constrained linear-chain Viterbi over
+BIOES with start/end scores + the 6 transition biases (all 0.0 in the shipped
+`viterbi_calibration.json` → structural constraints only at default; expose as load options);
+per-token-argmax fallback if all paths die. Map token spans → byte offsets from the o200k byte
+stream (UTF-8 aware). Optional whitespace-trim + per-label de-overlap. Keep it here (not Go) so
+the token→byte mapping stays with the tokenizer and the `TokenClassify` proto is unchanged.
+
+## Verification ladder (per `../pii-ner-ggml-backend.md` §7)
+
+1. Mask parity (§C) on a >257-token input.
+2. Per-layer parity vs HF (post-RoPE q/k, sink-incl. fp32 probs, router top-4, expert outs,
+   post-MoE ×4 rescale, final norm, `score` logits) — single-window inputs.
+3. Span-F1 per language vs the `opf` CLI on an AI4Privacy held-out slice.
+4. Windowing equivalence: windowed vs single banded forward on a long input → identical labels.
+</content>
diff --git a/docs/plans/pii-ner-ggml/conversion_openai_privacy_filter.py b/docs/plans/pii-ner-ggml/conversion_openai_privacy_filter.py
new file mode 100644
index 000000000000..3bf2056e051c
--- /dev/null
+++ b/docs/plans/pii-ner-ggml/conversion_openai_privacy_filter.py
@@ -0,0 +1,111 @@
+# DRAFT / SKELETON — not wired into the build yet.
+#
+# Target location once real: llama.cpp `conversion/openai_privacy_filter.py`, registered in
+# `conversion/__init__.py` ("OpenAIPrivacyFilterForTokenClassification": "openai_privacy_filter").
+#
+# Converts openai/privacy-filter + OpenMed/privacy-filter-multilingual (HF
+# `OpenAIPrivacyFilterForTokenClassification`, model_type `openai_privacy_filter`) to GGUF.
+#
+# It subclasses the gpt-oss converter to reuse the o200k/harmony BPE vocab and the gpt-oss
+# tensor map, and overrides only what differs (verified against the vendored
+# `conversion/gpt_oss.py` @ commit 22d66b56 and the HF modular source):
+#   1. a new arch (MODEL_ARCH.OPENAI_PRIVACY_FILTER) — see INTEGRATION.md
+#   2. experts: privacy-filter packs gate_up as CONCATENATED halves (chunk(2)), NOT gpt-oss's
+#      INTERLEAVED ::2/1::2 — this is the single most important override (wrong split = wrong model)
+#   3. a token-classification head: HF `score.{weight,bias}` -> ggml `cls.output.{weight,bias}`
+#   4. pooling = TOKEN_CLS + classifier output labels (depends on PR #19725's writer API)
+#   5. bidirectional encoder: no lm_head; runtime runs non-causal (see INTEGRATION.md / load_hparams)
+#
+# privacy-filter ships bf16 dense experts (NOT MXFP4), so we always hit the gpt-oss
+# "not in MXFP4" dense path (transpose + split); none of the MXFP4 repack code runs.
+
+from __future__ import annotations
+
+from typing import Iterable, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+from .base import ModelBase, gguf, logger
+from .gpt_oss import GptOssModel
+
+
+@ModelBase.register("OpenAIPrivacyFilterForTokenClassification")
+class OpenAIPrivacyFilterModel(GptOssModel):
+    model_arch = gguf.MODEL_ARCH.OPENAI_PRIVACY_FILTER  # NEW — add in gguf-py/gguf/constants.py
+
+    def set_vocab(self):
+        # identical to gpt-oss: o200k_base / harmony special tokens via GPT2 BPE export
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        # GptOssModel.set_gguf_parameters writes: base text params + sliding_window +
+        # expert_feed_forward_length + (inherited) rope/yarn. We keep all of it.
+        super().set_gguf_parameters()
+
+        # --- token-classification head ---
+        # PoolingType.TOKEN_CLS == 5 (added by PR #19725 to gguf-py constants).
+        self.gguf_writer.add_pooling_type(gguf.PoolingType.TOKEN_CLS)
+        # ordered label list, index 0 == "O"; writer key "*.classifier.output_labels".
+        # NOTE: confirm the exact writer method name against PR #19725
+        # (add_classifier_output_labels(...) is what its diff adds); n_cls_out is derived
+        # from the label count by the loader.
+        self.gguf_writer.add_classifier_output_labels(self._ordered_labels())  # TODO: verify API
+
+        # Bidirectional encoder. There is no dedicated "is_causal" GGUF key today; we make the
+        # runtime non-causal in load_hparams (causal_attn=false) + SYMMETRIC SWA. See
+        # INTEGRATION.md §"non-causal banded mask". sliding_window (=128, the HALF-window) is
+        # already written by super(); the loader maps it to n_swa=256 for SWA_TYPE_SYMMETRIC.
+
+    def _ordered_labels(self) -> list[str]:
+        # HF id2label is {"0": "O", "1": "B-private_person", ...}; emit in index order so the
+        # GGUF label table lines up with the score-head rows. 33 (base) / 217 (multilingual).
+        id2label = self.hparams["id2label"]
+        return [id2label[str(i)] for i in range(len(id2label))]
+
+    def modify_tensors(self, data_torch: "Tensor", name: str, bid: int | None) -> Iterable[tuple[str, "Tensor"]]:
+        # --- classification head -> cls.output ---
+        # Relies on a "score" -> MODEL_TENSOR.CLS_OUT entry in tensor_mapping.py (add it), so
+        # map_tensor_name resolves both score.weight and score.bias.
+        if name in ("score.weight", "score.bias"):
+            yield from super(GptOssModel, self).modify_tensors(data_torch, name, bid)
+            return
+
+        # --- experts: CONCATENATED (chunk) split, NOT gpt-oss interleaved ---
+        # gpt-oss dense path does: transpose(-1,-2) then gate=[:, ::2, :], up=[:, 1::2, :].
+        # privacy-filter's _apply_gate uses gate_up.chunk(2, dim=-1): gate = first half,
+        # up = second half. After transpose [E, 2*inter, hidden] -> gate=[:, :inter, :],
+        # up=[:, inter:, :].
+        if "gate_up_proj" in name:
+            inter = self.hparams["intermediate_size"]  # 640
+            if name.endswith("_bias"):
+                gate_b, up_b = data_torch[..., :inter], data_torch[..., inter:]
+                name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias")
+                name_up = name.replace("gate_up_proj_bias", "up_proj.bias")
+                # bypass GptOssModel.modify_tensors (interleaved) -> straight to TextModel
+                yield from super(GptOssModel, self).modify_tensors(gate_b, name_gate, bid)
+                yield from super(GptOssModel, self).modify_tensors(up_b, name_up, bid)
+                return
+            if "_blocks" not in name and "_scales" not in name:  # dense bf16 (always true here)
+                data_torch = data_torch.transpose(-1, -2)
+                gate_w, up_w = data_torch[:, :inter, :], data_torch[:, inter:, :]
+                name_gate = name.replace("gate_up_proj", "gate_proj.weight")
+                name_up = name.replace("gate_up_proj", "up_proj.weight")
+                yield from super(GptOssModel, self).modify_tensors(gate_w, name_gate, bid)
+                yield from super(GptOssModel, self).modify_tensors(up_w, name_up, bid)
+                return
+            logger.warning(f"unexpected MXFP4 expert tensor in privacy-filter: {name}")
+
+        # down_proj (dense bf16): naming + transpose, same as gpt-oss non-MXFP4 path
+        if "down_proj" in name and not name.endswith("_bias"):
+            name = name.replace("down_proj", "down_proj.weight")
+            data_torch = data_torch.transpose(-1, -2)
+            yield from super(GptOssModel, self).modify_tensors(data_torch, name, bid)
+            return
+
+        # everything else (q/k/v/o + biases, attn sinks, router + bias, norms, embeddings):
+        # gpt-oss handles these correctly (note filter_tensors appends ".weight" to sinks).
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    # NOTE: we do NOT emit `output` / lm_head. The base may try to tie/emit an output tensor;
+    # ensure the arch's tensor list (constants.py) omits MODEL_TENSOR.OUTPUT so nothing expects it.
diff --git a/docs/plans/pii-ner-ggml/openai-privacy-filter.cpp b/docs/plans/pii-ner-ggml/openai-privacy-filter.cpp
new file mode 100644
index 000000000000..37395f98e248
--- /dev/null
+++ b/docs/plans/pii-ner-ggml/openai-privacy-filter.cpp
@@ -0,0 +1,149 @@
+// DRAFT / SKELETON — not wired into the build yet.
+//
+// Target: llama.cpp `src/models/openai-privacy-filter.cpp` (+ class decl in src/models/models.h,
+// + add_subdirectory/source entry in src/CMakeLists.txt, + arch in src/llama-arch.{h,cpp},
+// + loader in src/llama-model.cpp). See INTEGRATION.md for the full touch-point list.
+//
+// This is `src/models/openai-moe.cpp` (gpt-oss) with exactly three changes, each marked
+// "CHANGE vs gpt-oss". Everything else is copied so the shared, hardened kernels
+// (build_qkv, ggml_rope_ext/YaRN, build_attn + attn_sinks, build_moe_ffn SWIGLU_OAI_MOE) are
+// reused verbatim. Verified against openai-moe.cpp @ commit 22d66b56.
+//
+// Reuse note: the symmetric banded NON-CAUSAL mask is NOT new code — it falls out of the
+// no-cache attention path (build_attn_inp_no_cache -> fill_mask) once load_hparams sets
+//   hparams.causal_attn = false; hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC; hparams.n_swa = 256;
+// (SYMMETRIC masks |p1-p0| > n_swa/2, so n_swa = 2*sliding_window = 256 gives |q-kv| <= 128.)
+
+#include "models.h"
+
+void llama_model_openai_privacy_filter::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+
+    uint32_t sliding_window = 0;
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, sliding_window);   // = 128 (HALF-window)
+
+    // CHANGE vs gpt-oss: bidirectional, symmetric band, no causal/global alternation.
+    hparams.causal_attn = false;                       // encoder; whole seq in one ubatch
+    hparams.swa_type    = LLAMA_SWA_TYPE_SYMMETRIC;    // |p1-p0| > n_swa/2 -> masked
+    hparams.n_swa       = 2 * sliding_window;          // 256 -> half-window 128 == HF band
+    // NB(verify): SYMMETRIC half = n_swa/2. The ×2 here is the most likely off-by-one source —
+    // assert against an HF reference attention map on a >257-token input (see INTEGRATION.md §verify).
+
+    // n_cls_out / pooling_type / classifier labels are read by the generic loader from GGUF
+    // (already present for the reranker; TOKEN_CLS pooling value comes from PR #19725).
+
+    type = LLM_TYPE_UNKNOWN;  // ~1.5B, single published size; n_layer == 8
+}
+
+void llama_model_openai_privacy_filter::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+    const int64_t n_ff_exp = hparams.n_ff_exp;
+
+    tok_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+    // CHANGE vs gpt-oss: NO lm_head (`output`). Instead the token-classification head.
+    // cls_out/cls_out_b are model-level (like bert.cpp), loaded into model.cls_out / cls_out_b.
+    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, 0);
+    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_head * n_rot, n_head_kv * n_rot, n_head_kv * n_rot, 0);
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT,  "weight", i), {n_head * n_rot, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT,  "bias",   i), {n_embd}, 0);
+
+        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
+
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+        layer.ffn_gate_inp_b  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "bias", i), {n_expert}, 0);
+        layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {n_embd, n_expert}, 0);
+        layer.ffn_up_exps_b   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "bias", i), {n_ff_exp, n_expert}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_openai_privacy_filter::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_openai_privacy_filter::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // CHANGE vs gpt-oss (1/2): no-cache, non-causal attention input. The symmetric band is
+    // applied by fill_mask from hparams.{swa_type,n_swa}. (gpt-oss used build_attn_inp_kv_iswa.)
+    auto * inp_attn = build_attn_inp_no_cache();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+        ggml_tensor * inpSA = inpL;
+
+        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention (identical to gpt-oss: RoPE/YaRN + sinks + 1/sqrt(d) kq_scale)
+        {
+            auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, n_rot, n_head, n_head_kv, il);
+
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+                                 freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow);
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+                                 freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il);
+
+            // kq_scale 1/sqrt(n_rot) == HF's head_dim**-0.25 applied to q and k (verify in fp16).
+            cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
+                             Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr,
+                             1.0f/sqrtf(float(n_rot)), il);
+            cb(cur, "attn_out", il);
+        }
+        // NOTE: gpt-oss does build_inp_out_ids() + ggml_get_rows on the last layer to drop
+        // unused tokens. For token classification we need ALL token outputs, so we do NOT
+        // prune here (n_outputs == n_tokens under token-level pooling).
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_post_norm", il);
+
+        cur = build_moe_ffn(cur,
+                model.layers[il].ffn_gate_inp,  model.layers[il].ffn_gate_inp_b,
+                model.layers[il].ffn_up_exps,   model.layers[il].ffn_up_exps_b,
+                model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
+                model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
+                nullptr, n_expert, n_expert_used,
+                LLM_FFN_SWIGLU_OAI_MOE, false, hparams.expert_weights_scale,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT, il);
+        cb(cur, "ffn_moe_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+        inpL = cur;
+    }
+
+    cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // CHANGE vs gpt-oss (2/2): no lm_head. build_pooling applies cls_out per token under
+    // LLAMA_POOLING_TYPE_TOKEN_CLS (PR #19725) -> result is [n_cls_out, n_tokens].
+    // Mirrors the encoder graphs (bert.cpp ends at res->t_embd and lets the framework pool).
+    ggml_build_forward_expand(gf, res->t_embd);
+}
diff --git a/flake.nix b/flake.nix
index 30f57a05759b..e7145d393e89 100644
--- a/flake.nix
+++ b/flake.nix
@@ -70,6 +70,19 @@
           protoc-gen-go
           protoc-gen-go-grpc
 
+          # C++ gRPC + protobuf for the vendored llama.cpp backend
+          # (backend/cpp/llama-cpp `make grpc-server`). The CMake build does
+          # find_package(gRPC)/find_package(Protobuf); without grpc here the
+          # shell exposes protobuf alone and the build fails to locate gRPC
+          # (or links a stale, version-skewed grpc from the store). nixpkgs
+          # builds `grpc` against this same `protobuf`, so the pair is
+          # self-consistent. Docker (backend/Dockerfile.base-grpc-builder)
+          # compiles gRPC v1.65.0 / protoc v27.1 from source; nixpkgs here is
+          # newer (grpc 1.80 / protobuf 34) but wire- and ABI-consistent
+          # within the backend. Pin protobuf_27 + a grpc override if exact
+          # Docker version parity is ever required.
+          grpc
+
           # React UI build (core/http/react-ui — `make react-ui`)
           nodejs
           bun  # alternative to npm, used by `make react-ui-docker`
diff --git a/gallery/index.yaml b/gallery/index.yaml
index d8e62c9b03d7..eb57c79c2fc4 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -106,6 +106,111 @@
     - filename: llama-cpp/mmproj/Step-3.7-Flash-GGUF/mmproj-F32.gguf
       sha256: 2fab13dcd32e4b3dc4410297df80f4d82627308e725dedac802940ceca7dff13
       uri: https://huggingface.co/unsloth/Step-3.7-Flash-GGUF/resolve/main/mmproj-F32.gguf
+- name: "privacy-filter-multilingual"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/5fd5e18a90b6dc4633f6d292/QPiv8pt4JNxr0FdGnpFef.png
+  urls:
+    - https://huggingface.co/OpenMed/privacy-filter-multilingual
+    - https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF
+  description: |
+    A multilingual PII token-classification model: a fine-tune of
+    openai/privacy-filter by OpenMed. It labels every token with a BIOES tag
+    over 54 PII categories (217 classes) across 16 languages (ar, bn, de, en,
+    es, fr, hi, it, ja, ko, nl, pt, te, tr, vi, zh), spanning identity, contact,
+    address, financial, vehicle, digital, and crypto entities.
+
+    In LocalAI this is a PII detector for the NER redactor tier: set
+    known_usecases to [token_classify] (as below), and any model opts into
+    redaction by listing this one under pii.detectors. The detection policy
+    (which categories to mask vs block, and the score threshold) lives on this
+    model's own pii_detection block - see the overrides below. It runs locally
+    with no Python, served by the vendored llama.cpp backend's TokenClassify
+    RPC (constrained BIOES Viterbi decode into UTF-8 byte-offset entity spans).
+
+    Architecture: gpt-oss-style sparse MoE (8 layers, 128 experts top-4, ~50M
+    active per token), bidirectional banded attention, o200k tokenizer; served
+    via the openai-privacy-filter architecture. F16, ~2.7 GB.
+  license: apache-2.0
+  tags:
+    - token-classification
+    - ner
+    - pii
+    - privacy
+    - multilingual
+    - gguf
+  overrides:
+    backend: llama-cpp
+    embeddings: true
+    known_usecases:
+      - token_classify
+    parameters:
+      model: llama-cpp/models/privacy-filter-multilingual/privacy-filter-multilingual-f16.gguf
+    # Detection policy used when another model references this one via
+    # pii.detectors. Default-mask everything the model flags; block the
+    # credential/financial-secret/crypto categories. Keys are the model's
+    # own entity-group names (uppercase, no separators); anything not
+    # listed falls through to default_action: mask.
+    pii_detection:
+      min_score: 0.5
+      default_action: mask
+      entity_actions:
+        PASSWORD: block
+        PIN: block
+        CVV: block
+        CREDITCARD: block
+        IBAN: block
+        BIC: block
+        BANKACCOUNT: block
+        SSN: block
+        BITCOINADDRESS: block
+        ETHEREUMADDRESS: block
+        LITECOINADDRESS: block
+  files:
+    - filename: llama-cpp/models/privacy-filter-multilingual/privacy-filter-multilingual-f16.gguf
+      sha256: 01b76572f80b7d2ebee80a27cb9c3699c26b04cae1c402eee7664fc17a4b5ce6
+      uri: https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF/resolve/main/privacy-filter-multilingual-f16.gguf
+- name: "secret-filter"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  description: |
+    A pattern-based PII detector for high-entropy, highly-regular secrets —
+    API keys, tokens, and private-key blocks — that the NER tier cannot catch
+    (it has no credential class, so it fragments a key and may leave the secret
+    part exposed). Detection is bounded restricted-regex compiled to RE2
+    (linear time, no backtracking); it runs entirely in-process with no model
+    download, no backend, and zero VRAM.
+
+    Install it, then reference it under another model's pii.detectors (or set it
+    as the instance-wide default detector on the Middleware page) to block leaks
+    of known credential formats out of the box. Add your own patterns under
+    pii_detection.patterns in a restricted regex subset (e.g. "tok-\\w{32,}");
+    each must carry a fixed literal anchor of at least 3 characters, so open-
+    ended shapes like email addresses are rejected and left to the NER tier.
+  license: apache-2.0
+  tags:
+    - pii
+    - privacy
+    - secrets
+    - pattern
+  overrides:
+    backend: pattern
+    known_usecases:
+      - token_classify
+    # Matched secrets are blocked by default (a leaked credential should not
+    # reach an upstream provider); downgrade individual groups to mask/allow
+    # via entity_actions if needed. Group names mirror the built-in catalogue.
+    pii_detection:
+      default_action: block
+      builtins:
+        - anthropic_api_key
+        - openai_api_key
+        - github_token
+        - github_pat
+        - aws_access_key
+        - google_api_key
+        - slack_token
+        - stripe_key
+        - jwt
+        - private_key_block
 - name: "lfm2.5-8b-a1b"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:
@@ -24092,6 +24197,14 @@
     # concurrent generation traffic; see model_config.go validation).
     known_usecases:
       - score
+    # Scoring decodes the whole prompt+candidate in one llama_decode and reads
+    # a logit row per candidate token. The llama.cpp server caps the causal
+    # output rows at n_parallel, so the default (1) aborts with
+    # GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max). Raise it to cover
+    # multi-token route labels; kv_unified (the grpc-server default) keeps the
+    # full context per sequence, so this does not split the KV cache.
+    options:
+      - parallel:64
     parameters:
       model: Arch-Router-1.5B.Q4_K_M.gguf
   files:
@@ -24103,6 +24216,9 @@
   overrides:
     known_usecases:
       - score
+    # See the q4 entry: lift the scoring output-row cap above the default 1.
+    options:
+      - parallel:64
     parameters:
       model: Arch-Router-1.5B.Q8_0.gguf
   files:
diff --git a/pkg/mcp/localaitools/client.go b/pkg/mcp/localaitools/client.go
index 60090d63aebf..5ac519aca46d 100644
--- a/pkg/mcp/localaitools/client.go
+++ b/pkg/mcp/localaitools/client.go
@@ -76,24 +76,11 @@ type LocalAIClient interface {
 	GetUsageStats(ctx context.Context, q UsageStatsQuery) (*UsageStats, error)
 
 	// ---- PII filter ----
-	// ListPIIPatterns returns the active PII pattern set with each
-	// one's action.
-	ListPIIPatterns(ctx context.Context) ([]PIIPattern, error)
 	// GetPIIEvents returns recent redaction events. Implementation
-	// enforces "admin required" when auth is on.
+	// enforces "admin required" when auth is on. The regex pattern tools
+	// were removed — detection policy lives on each detector model's
+	// pii_detection block, managed via the model-config tools.
 	GetPIIEvents(ctx context.Context, q PIIEventsQuery) ([]PIIEvent, error)
-	// TestPIIRedaction dry-runs the redactor against text. No event
-	// is recorded.
-	TestPIIRedaction(ctx context.Context, req PIIRedactTestRequest) (*PIIRedactTestResult, error)
-	// SetPIIPatternAction mutates the named pattern's action and/or
-	// disabled state in-process. Transient until PersistPIIPatterns is
-	// called — runtime_settings.json then applies the deltas on the
-	// next start. Admin-required.
-	SetPIIPatternAction(ctx context.Context, req PIIPatternActionUpdate) error
-
-	// PersistPIIPatterns snapshots the live redactor's per-pattern
-	// (action, disabled) state into runtime_settings.json. Admin-required.
-	PersistPIIPatterns(ctx context.Context) error
 
 	// ---- Middleware admin ----
 	// GetMiddlewareStatus returns the aggregated state surfaced on the
diff --git a/pkg/mcp/localaitools/coverage_test.go b/pkg/mcp/localaitools/coverage_test.go
index 8159afcf9e89..ddf5e9c1d5be 100644
--- a/pkg/mcp/localaitools/coverage_test.go
+++ b/pkg/mcp/localaitools/coverage_test.go
@@ -38,25 +38,21 @@ var toolToHTTPRoute = map[string]string{
 	ToolVRAMEstimate:        "POST /api/models/vram-estimate",
 	ToolGetBranding:         "GET /api/branding",
 	ToolGetUsageStats:       "GET /api/usage (or /api/usage/all when all=true)",
-	ToolListPIIPatterns:     "GET /api/pii/patterns",
 	ToolGetPIIEvents:        "GET /api/pii/events",
-	ToolTestPIIRedaction:    "POST /api/pii/test",
 	ToolGetMiddlewareStatus: "GET /api/middleware/status",
 	ToolGetRouterDecisions:  "GET /api/router/decisions",
 
 	// Mutating tools.
-	ToolInstallModel:        "POST /models/apply",
-	ToolImportModelURI:      "POST /models/import-uri",
-	ToolDeleteModel:         "POST /models/delete/:name",
-	ToolEditModelConfig:     "PATCH /api/models/config-json/:name",
-	ToolReloadModels:        "POST /models/reload",
-	ToolInstallBackend:      "POST /backends/apply",
-	ToolUpgradeBackend:      "POST /backends/upgrade/:name",
-	ToolToggleModelState:    "PUT /models/toggle-state/:name/:action",
-	ToolToggleModelPinned:   "PUT /models/toggle-pinned/:name/:action",
-	ToolSetBranding:         "POST /api/settings (instance_name, instance_tagline)",
-	ToolSetPIIPatternAction: "PUT /api/pii/patterns/:id",
-	ToolPersistPIIPatterns:  "POST /api/pii/patterns/persist",
+	ToolInstallModel:      "POST /models/apply",
+	ToolImportModelURI:    "POST /models/import-uri",
+	ToolDeleteModel:       "POST /models/delete/:name",
+	ToolEditModelConfig:   "PATCH /api/models/config-json/:name",
+	ToolReloadModels:      "POST /models/reload",
+	ToolInstallBackend:    "POST /backends/apply",
+	ToolUpgradeBackend:    "POST /backends/upgrade/:name",
+	ToolToggleModelState:  "PUT /models/toggle-state/:name/:action",
+	ToolToggleModelPinned: "PUT /models/toggle-pinned/:name/:action",
+	ToolSetBranding:       "POST /api/settings (instance_name, instance_tagline)",
 }
 
 // allKnownTools is the union of expectedFullCatalog (defined in
diff --git a/pkg/mcp/localaitools/dto.go b/pkg/mcp/localaitools/dto.go
index 85136c60ae3a..77e9a9065e37 100644
--- a/pkg/mcp/localaitools/dto.go
+++ b/pkg/mcp/localaitools/dto.go
@@ -77,11 +77,11 @@ type Backend struct {
 
 // SystemInfo summarises the LocalAI deployment.
 type SystemInfo struct {
-	Version          string   `json:"version"`
-	Distributed      bool     `json:"distributed"`
-	BackendsPath     string   `json:"backends_path,omitempty"`
-	ModelsPath       string   `json:"models_path,omitempty"`
-	LoadedModels     []string `json:"loaded_models,omitempty"`
+	Version           string   `json:"version"`
+	Distributed       bool     `json:"distributed"`
+	BackendsPath      string   `json:"backends_path,omitempty"`
+	ModelsPath        string   `json:"models_path,omitempty"`
+	LoadedModels      []string `json:"loaded_models,omitempty"`
 	InstalledBackends []string `json:"installed_backends,omitempty"`
 }
 
@@ -184,19 +184,11 @@ type UsageBucket struct {
 
 // ---- PII / sensitive data tools ----
 
-// PIIPattern is one row in the list_pii_patterns response.
-type PIIPattern struct {
-	ID             string `json:"id"`
-	Description    string `json:"description"`
-	Action         string `json:"action"` // mask | block | route_local
-	MaxMatchLength int    `json:"max_match_length"`
-}
-
 // PIIEventsQuery filters get_pii_events.
 type PIIEventsQuery struct {
 	CorrelationID string `json:"correlation_id,omitempty" jsonschema:"Optional X-Correlation-ID join key (binds events to the request and usage record)."`
 	UserID        string `json:"user_id,omitempty"        jsonschema:"Optional user id to scope the query."`
-	PatternID     string `json:"pattern_id,omitempty"     jsonschema:"Optional pattern id (e.g. email, ssn)."`
+	PatternID     string `json:"pattern_id,omitempty"     jsonschema:"Optional detector group id (e.g. ner:EMAIL)."`
 	Limit         int    `json:"limit,omitempty"          jsonschema:"Maximum events. Defaults to 100."`
 }
 
@@ -215,38 +207,6 @@ type PIIEvent struct {
 	CreatedAt     string `json:"created_at"`
 }
 
-// PIIRedactTestRequest is the input for test_pii_redaction.
-type PIIRedactTestRequest struct {
-	Text string `json:"text" jsonschema:"The candidate text. Will be run through the redactor without recording an event."`
-}
-
-// PIIRedactTestResult is the output for test_pii_redaction. spans
-// describes where the redactor matched; redacted is the text after
-// applying mask actions; blocked / local_only flag stronger actions.
-type PIIRedactTestResult struct {
-	Redacted  string        `json:"redacted"`
-	Spans     []PIIEventSpan `json:"spans"`
-	Blocked   bool          `json:"blocked"`
-	LocalOnly bool          `json:"local_only"`
-}
-
-type PIIEventSpan struct {
-	Start      int    `json:"start"`
-	End        int    `json:"end"`
-	Pattern    string `json:"pattern"`
-	HashPrefix string `json:"hash_prefix"`
-}
-
-// PIIPatternActionUpdate is the input for set_pii_pattern_action.
-// At least one of Action or Disabled must be set. Mutations are
-// transient by default — call persist_pii_patterns to flush them
-// to runtime_settings.json so the next start re-applies them.
-type PIIPatternActionUpdate struct {
-	ID       string `json:"id" jsonschema:"Pattern id to mutate (e.g. email, ssn, credit_card, api_key_prefix)."`
-	Action   string `json:"action,omitempty" jsonschema:"New action: mask, block, or route_local. Optional — omit to leave the action unchanged."`
-	Disabled *bool  `json:"disabled,omitempty" jsonschema:"Set true to skip this pattern entirely; false to re-enable. Optional — omit to leave enabled-state unchanged."`
-}
-
 // MiddlewareStatus is the aggregated /api/middleware/status payload —
 // the React Middleware page renders this in one go. Routing is a
 // placeholder until subsystem 2 lands.
@@ -255,25 +215,25 @@ type MiddlewareStatus struct {
 	Router MiddlewareRouterStatus `json:"router"`
 }
 
-// MiddlewarePIIStatus shows what the redactor is doing right now and
-// which models opt in. enabled_globally=false means --disable-pii.
+// MiddlewarePIIStatus shows which models opt in to PII redaction and the
+// NER detector models they reference. The detection policy itself lives
+// on each detector model's pii_detection block.
 type MiddlewarePIIStatus struct {
-	EnabledGlobally           bool                  `json:"enabled_globally"`
-	Reason                    string                `json:"reason,omitempty"`
-	DefaultEnabledForBackends []string              `json:"default_enabled_for_backends,omitempty"`
-	Patterns                  []PIIPattern          `json:"patterns"`
-	Models                    []MiddlewarePIIModel  `json:"models"`
-	RecentEventCount          int                   `json:"recent_event_count"`
+	EnabledGlobally           bool                 `json:"enabled_globally"`
+	Reason                    string               `json:"reason,omitempty"`
+	DefaultEnabledForBackends []string             `json:"default_enabled_for_backends,omitempty"`
+	Models                    []MiddlewarePIIModel `json:"models"`
+	RecentEventCount          int                  `json:"recent_event_count"`
 }
 
 // MiddlewarePIIModel is one model row in the per-model PII table.
 type MiddlewarePIIModel struct {
-	Name              string            `json:"name"`
-	Backend           string            `json:"backend"`
-	Enabled           bool              `json:"enabled"`
-	Explicit          bool              `json:"explicit"`             // Did YAML set Enabled, or did the backend prefix decide?
-	DefaultForBackend bool              `json:"default_for_backend"`  // Backend matches the auto-on rule (proxy-*).
-	Overrides         map[string]string `json:"overrides,omitempty"`
+	Name              string   `json:"name"`
+	Backend           string   `json:"backend"`
+	Enabled           bool     `json:"enabled"`
+	Explicit          bool     `json:"explicit"`            // Did YAML set Enabled, or did the backend prefix decide?
+	DefaultForBackend bool     `json:"default_for_backend"` // Backend matches the auto-on rule (proxy-*).
+	Detectors         []string `json:"detectors,omitempty"` // NER detector model names this config references.
 }
 
 // MiddlewareRouterStatus is the placeholder shape the Routing tab
diff --git a/pkg/mcp/localaitools/fakes_test.go b/pkg/mcp/localaitools/fakes_test.go
index cbe429a081a6..3d76ae8b96da 100644
--- a/pkg/mcp/localaitools/fakes_test.go
+++ b/pkg/mcp/localaitools/fakes_test.go
@@ -45,10 +45,7 @@ type fakeClient struct {
 	getBranding         func() (*Branding, error)
 	setBranding         func(SetBrandingRequest) (*Branding, error)
 	getUsageStats       func(UsageStatsQuery) (*UsageStats, error)
-	listPIIPatterns     func() ([]PIIPattern, error)
 	getPIIEvents        func(PIIEventsQuery) ([]PIIEvent, error)
-	testPIIRedaction    func(PIIRedactTestRequest) (*PIIRedactTestResult, error)
-	setPIIPatternAction func(PIIPatternActionUpdate) error
 	getMiddlewareStatus func() (*MiddlewareStatus, error)
 	getRouterDecisions  func(RouterDecisionsQuery) ([]RouterDecision, error)
 }
@@ -253,14 +250,6 @@ func (f *fakeClient) GetUsageStats(_ context.Context, q UsageStatsQuery) (*Usage
 	}, nil
 }
 
-func (f *fakeClient) ListPIIPatterns(_ context.Context) ([]PIIPattern, error) {
-	f.record("ListPIIPatterns", nil)
-	if f.listPIIPatterns != nil {
-		return f.listPIIPatterns()
-	}
-	return []PIIPattern{}, nil
-}
-
 func (f *fakeClient) GetPIIEvents(_ context.Context, q PIIEventsQuery) ([]PIIEvent, error) {
 	f.record("GetPIIEvents", q)
 	if f.getPIIEvents != nil {
@@ -269,27 +258,6 @@ func (f *fakeClient) GetPIIEvents(_ context.Context, q PIIEventsQuery) ([]PIIEve
 	return []PIIEvent{}, nil
 }
 
-func (f *fakeClient) TestPIIRedaction(_ context.Context, req PIIRedactTestRequest) (*PIIRedactTestResult, error) {
-	f.record("TestPIIRedaction", req)
-	if f.testPIIRedaction != nil {
-		return f.testPIIRedaction(req)
-	}
-	return &PIIRedactTestResult{Redacted: req.Text}, nil
-}
-
-func (f *fakeClient) SetPIIPatternAction(_ context.Context, req PIIPatternActionUpdate) error {
-	f.record("SetPIIPatternAction", req)
-	if f.setPIIPatternAction != nil {
-		return f.setPIIPatternAction(req)
-	}
-	return nil
-}
-
-func (f *fakeClient) PersistPIIPatterns(_ context.Context) error {
-	f.record("PersistPIIPatterns", nil)
-	return nil
-}
-
 func (f *fakeClient) GetRouterDecisions(_ context.Context, q RouterDecisionsQuery) ([]RouterDecision, error) {
 	f.record("GetRouterDecisions", q)
 	if f.getRouterDecisions != nil {
@@ -306,10 +274,8 @@ func (f *fakeClient) GetMiddlewareStatus(_ context.Context) (*MiddlewareStatus,
 	return &MiddlewareStatus{
 		PII: MiddlewarePIIStatus{
 			EnabledGlobally: true,
-			Patterns:        []PIIPattern{},
 			Models:          []MiddlewarePIIModel{},
 		},
 		Router: MiddlewareRouterStatus{Configured: false, Models: []string{}},
 	}, nil
 }
-
diff --git a/pkg/mcp/localaitools/httpapi/client.go b/pkg/mcp/localaitools/httpapi/client.go
index c180b79c2655..d2947a5b1450 100644
--- a/pkg/mcp/localaitools/httpapi/client.go
+++ b/pkg/mcp/localaitools/httpapi/client.go
@@ -582,16 +582,6 @@ func (c *Client) GetUsageStats(ctx context.Context, q localaitools.UsageStatsQue
 
 // ---- PII filter ----
 
-func (c *Client) ListPIIPatterns(ctx context.Context) ([]localaitools.PIIPattern, error) {
-	var raw struct {
-		Patterns []localaitools.PIIPattern `json:"patterns"`
-	}
-	if err := c.do(ctx, http.MethodGet, routePIIPatterns, nil, &raw); err != nil {
-		return nil, err
-	}
-	return raw.Patterns, nil
-}
-
 func (c *Client) GetPIIEvents(ctx context.Context, q localaitools.PIIEventsQuery) ([]localaitools.PIIEvent, error) {
 	qs := url.Values{}
 	if q.CorrelationID != "" {
@@ -624,35 +614,6 @@ func (c *Client) GetPIIEvents(ctx context.Context, q localaitools.PIIEventsQuery
 	return raw.Events, nil
 }
 
-func (c *Client) TestPIIRedaction(ctx context.Context, req localaitools.PIIRedactTestRequest) (*localaitools.PIIRedactTestResult, error) {
-	var out localaitools.PIIRedactTestResult
-	if err := c.do(ctx, http.MethodPost, routePIITest, map[string]string{"text": req.Text}, &out); err != nil {
-		return nil, err
-	}
-	return &out, nil
-}
-
-func (c *Client) SetPIIPatternAction(ctx context.Context, req localaitools.PIIPatternActionUpdate) error {
-	if req.ID == "" {
-		return fmt.Errorf("pattern id is required")
-	}
-	body := map[string]any{}
-	if req.Action != "" {
-		body["action"] = req.Action
-	}
-	if req.Disabled != nil {
-		body["disabled"] = *req.Disabled
-	}
-	if len(body) == 0 {
-		return fmt.Errorf("must specify action and/or disabled")
-	}
-	return c.do(ctx, http.MethodPut, routePIIPatternByID(req.ID), body, nil)
-}
-
-func (c *Client) PersistPIIPatterns(ctx context.Context) error {
-	return c.do(ctx, http.MethodPost, routePIIPatternsPersist, nil, nil)
-}
-
 func (c *Client) GetMiddlewareStatus(ctx context.Context) (*localaitools.MiddlewareStatus, error) {
 	var out localaitools.MiddlewareStatus
 	if err := c.do(ctx, http.MethodGet, routeMiddleware, nil, &out); err != nil {
diff --git a/pkg/mcp/localaitools/httpapi/routes.go b/pkg/mcp/localaitools/httpapi/routes.go
index 4be8f2ad87d1..79504dc1be5f 100644
--- a/pkg/mcp/localaitools/httpapi/routes.go
+++ b/pkg/mcp/localaitools/httpapi/routes.go
@@ -11,33 +11,26 @@ import (
 // registrations in core/http/routes/localai.go — the Tool↔REST drift detector
 // in coverage_test.go documents the mapping.
 const (
-	routeWelcome            = "/"
-	routeModelsApply        = "/models/apply"
-	routeModelsAvail        = "/models/available"
-	routeModelsGall         = "/models/galleries"
-	routeModelsImport       = "/models/import-uri"
-	routeModelsReload       = "/models/reload"
-	routeBackends           = "/backends"
-	routeBackendsKnown      = "/backends/known"
-	routeBackendsApply      = "/backends/apply"
-	routeNodes              = "/api/nodes"
-	routeVRAMEstimate       = "/api/models/vram-estimate"
-	routeBranding           = "/api/branding"
-	routeSettings           = "/api/settings"
-	routeUsage              = "/api/usage"
-	routeUsageAll           = "/api/usage/all"
-	routePIIPatterns        = "/api/pii/patterns"
-	routePIIPatternsPersist = "/api/pii/patterns/persist"
-	routePIIEvents          = "/api/pii/events"
-	routePIITest            = "/api/pii/test"
-	routeMiddleware         = "/api/middleware/status"
-	routeRouterDecisions    = "/api/router/decisions"
+	routeWelcome         = "/"
+	routeModelsApply     = "/models/apply"
+	routeModelsAvail     = "/models/available"
+	routeModelsGall      = "/models/galleries"
+	routeModelsImport    = "/models/import-uri"
+	routeModelsReload    = "/models/reload"
+	routeBackends        = "/backends"
+	routeBackendsKnown   = "/backends/known"
+	routeBackendsApply   = "/backends/apply"
+	routeNodes           = "/api/nodes"
+	routeVRAMEstimate    = "/api/models/vram-estimate"
+	routeBranding        = "/api/branding"
+	routeSettings        = "/api/settings"
+	routeUsage           = "/api/usage"
+	routeUsageAll        = "/api/usage/all"
+	routePIIEvents       = "/api/pii/events"
+	routeMiddleware      = "/api/middleware/status"
+	routeRouterDecisions = "/api/router/decisions"
 )
 
-func routePIIPatternByID(id string) string {
-	return "/api/pii/patterns/" + url.PathEscape(id)
-}
-
 func routeJobStatus(jobID string) string {
 	return "/models/jobs/" + url.PathEscape(jobID)
 }
diff --git a/pkg/mcp/localaitools/inproc/client.go b/pkg/mcp/localaitools/inproc/client.go
index e1d190dcd3c8..6e047d751a3c 100644
--- a/pkg/mcp/localaitools/inproc/client.go
+++ b/pkg/mcp/localaitools/inproc/client.go
@@ -14,10 +14,10 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/gallery/importers"
+	"github.com/mudler/LocalAI/core/http/auth"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/galleryop"
 	"github.com/mudler/LocalAI/core/services/modeladmin"
-	"github.com/mudler/LocalAI/core/http/auth"
 	"github.com/mudler/LocalAI/core/services/routing/billing"
 	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/core/services/routing/router"
@@ -619,23 +619,6 @@ func (c *Client) GetUsageStats(ctx context.Context, q localaitools.UsageStatsQue
 
 // ---- PII filter ----
 
-func (c *Client) ListPIIPatterns(_ context.Context) ([]localaitools.PIIPattern, error) {
-	if c.PIIRedactor == nil {
-		return nil, errors.New("PII filter is disabled")
-	}
-	patterns := c.PIIRedactor.Patterns()
-	out := make([]localaitools.PIIPattern, 0, len(patterns))
-	for _, p := range patterns {
-		out = append(out, localaitools.PIIPattern{
-			ID:             p.ID,
-			Description:    p.Description,
-			Action:         string(p.Action),
-			MaxMatchLength: p.MaxMatchLength,
-		})
-	}
-	return out, nil
-}
-
 func (c *Client) GetPIIEvents(ctx context.Context, q localaitools.PIIEventsQuery) ([]localaitools.PIIEvent, error) {
 	if c.PIIEvents == nil {
 		return nil, errors.New("PII filter is disabled")
@@ -668,77 +651,6 @@ func (c *Client) GetPIIEvents(ctx context.Context, q localaitools.PIIEventsQuery
 	return out, nil
 }
 
-func (c *Client) SetPIIPatternAction(_ context.Context, req localaitools.PIIPatternActionUpdate) error {
-	if c.PIIRedactor == nil {
-		return errors.New("PII filter is disabled")
-	}
-	if req.ID == "" {
-		return errors.New("pattern id is required")
-	}
-	if req.Action == "" && req.Disabled == nil {
-		return errors.New("must specify action and/or disabled")
-	}
-	if req.Action != "" {
-		if err := c.PIIRedactor.SetAction(req.ID, pii.Action(req.Action)); err != nil {
-			return err
-		}
-	}
-	if req.Disabled != nil {
-		if err := c.PIIRedactor.SetDisabled(req.ID, *req.Disabled); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-// PersistPIIPatterns snapshots the current redactor state into
-// runtime_settings.json. Mirrors POST /api/pii/patterns/persist.
-func (c *Client) PersistPIIPatterns(_ context.Context) error {
-	if c.PIIRedactor == nil {
-		return errors.New("PII filter is disabled")
-	}
-	if c.AppConfig == nil {
-		return errors.New("app config not available")
-	}
-	existing, err := c.AppConfig.ReadPersistedSettings()
-	if err != nil {
-		return fmt.Errorf("read settings: %w", err)
-	}
-	defaults, err := pii.LoadConfig(c.AppConfig.PIIConfigPath)
-	if err != nil {
-		return fmt.Errorf("reload defaults: %w", err)
-	}
-	defaultByID := make(map[string]pii.Pattern, len(defaults))
-	for _, d := range defaults {
-		defaultByID[d.ID] = d
-	}
-	overrides := map[string]config.PIIPatternRuntimeOverride{}
-	for _, p := range c.PIIRedactor.Patterns() {
-		d, known := defaultByID[p.ID]
-		ov := config.PIIPatternRuntimeOverride{}
-		changed := false
-		if !known || p.Action != d.Action {
-			action := string(p.Action)
-			ov.Action = &action
-			changed = true
-		}
-		if !known || p.Disabled != d.Disabled {
-			disabled := p.Disabled
-			ov.Disabled = &disabled
-			changed = true
-		}
-		if changed {
-			overrides[p.ID] = ov
-		}
-	}
-	existing.PIIPatternOverrides = &overrides
-	if err := c.AppConfig.WritePersistedSettings(existing); err != nil {
-		return fmt.Errorf("write settings: %w", err)
-	}
-	c.AppConfig.PIIPatternOverrides = overrides
-	return nil
-}
-
 func (c *Client) GetRouterDecisions(ctx context.Context, q localaitools.RouterDecisionsQuery) ([]localaitools.RouterDecision, error) {
 	if c.RouterDecisions == nil {
 		return []localaitools.RouterDecision{}, nil
@@ -779,23 +691,10 @@ func (c *Client) GetMiddlewareStatus(ctx context.Context) (*localaitools.Middlew
 		Note:       "Intelligent routing is not yet implemented.",
 	}
 	piiSection := localaitools.MiddlewarePIIStatus{
-		EnabledGlobally: c.PIIRedactor != nil,
-		Patterns:        []localaitools.PIIPattern{},
+		EnabledGlobally: c.PIIEvents != nil,
 		Models:          []localaitools.MiddlewarePIIModel{},
 	}
-	if c.PIIRedactor == nil {
-		piiSection.Reason = "--disable-pii"
-		return &localaitools.MiddlewareStatus{PII: piiSection, Router: router}, nil
-	}
 	piiSection.DefaultEnabledForBackends = []string{"cloud-proxy"}
-	for _, p := range c.PIIRedactor.Patterns() {
-		piiSection.Patterns = append(piiSection.Patterns, localaitools.PIIPattern{
-			ID:             p.ID,
-			Description:    p.Description,
-			Action:         string(p.Action),
-			MaxMatchLength: p.MaxMatchLength,
-		})
-	}
 	if c.ConfigLoader != nil {
 		for _, cfg := range c.ConfigLoader.GetAllModelsConfigs() {
 			cfg := cfg
@@ -805,7 +704,7 @@ func (c *Client) GetMiddlewareStatus(ctx context.Context) (*localaitools.Middlew
 				Enabled:           cfg.PIIIsEnabled(),
 				Explicit:          cfg.PII.Enabled != nil,
 				DefaultForBackend: cfg.Backend == "cloud-proxy",
-				Overrides:         cfg.PIIPatternOverrides(),
+				Detectors:         cfg.PIIDetectors(),
 			})
 		}
 	}
@@ -817,27 +716,6 @@ func (c *Client) GetMiddlewareStatus(ctx context.Context) (*localaitools.Middlew
 	return &localaitools.MiddlewareStatus{PII: piiSection, Router: router}, nil
 }
 
-func (c *Client) TestPIIRedaction(_ context.Context, req localaitools.PIIRedactTestRequest) (*localaitools.PIIRedactTestResult, error) {
-	if c.PIIRedactor == nil {
-		return nil, errors.New("PII filter is disabled")
-	}
-	res := c.PIIRedactor.Redact(req.Text)
-	out := &localaitools.PIIRedactTestResult{
-		Redacted:  res.Redacted,
-		Blocked:   res.Blocked,
-		LocalOnly: res.LocalOnly,
-	}
-	for _, s := range res.Spans {
-		out.Spans = append(out.Spans, localaitools.PIIEventSpan{
-			Start:      s.Start,
-			End:        s.End,
-			Pattern:    s.Pattern,
-			HashPrefix: s.HashPrefix,
-		})
-	}
-	return out, nil
-}
-
 func capabilityFlagsOf(m *config.ModelConfig) []string {
 	var out []string
 	for label, flag := range config.GetAllModelConfigUsecases() {
diff --git a/pkg/mcp/localaitools/server_test.go b/pkg/mcp/localaitools/server_test.go
index f82d0ae415c5..eb1579449db1 100644
--- a/pkg/mcp/localaitools/server_test.go
+++ b/pkg/mcp/localaitools/server_test.go
@@ -91,13 +91,9 @@ var expectedFullCatalog = sortedStrings(
 	ToolListInstalledModels,
 	ToolListKnownBackends,
 	ToolListNodes,
-	ToolListPIIPatterns,
-	ToolPersistPIIPatterns,
 	ToolReloadModels,
 	ToolSetBranding,
-	ToolSetPIIPatternAction,
 	ToolSystemInfo,
-	ToolTestPIIRedaction,
 	ToolToggleModelPinned,
 	ToolToggleModelState,
 	ToolUpgradeBackend,
@@ -119,9 +115,7 @@ var expectedReadOnlyCatalog = sortedStrings(
 	ToolListInstalledModels,
 	ToolListKnownBackends,
 	ToolListNodes,
-	ToolListPIIPatterns,
 	ToolSystemInfo,
-	ToolTestPIIRedaction,
 	ToolVRAMEstimate,
 )
 
diff --git a/pkg/mcp/localaitools/tools.go b/pkg/mcp/localaitools/tools.go
index 57b2638e3065..c7bf620c3a1d 100644
--- a/pkg/mcp/localaitools/tools.go
+++ b/pkg/mcp/localaitools/tools.go
@@ -20,9 +20,7 @@ const (
 	ToolVRAMEstimate        = "vram_estimate"
 	ToolGetBranding         = "get_branding"
 	ToolGetUsageStats       = "get_usage_stats"
-	ToolListPIIPatterns     = "list_pii_patterns"
 	ToolGetPIIEvents        = "get_pii_events"
-	ToolTestPIIRedaction    = "test_pii_redaction"
 	ToolGetMiddlewareStatus = "get_middleware_status"
 	ToolGetRouterDecisions  = "get_router_decisions"
 
@@ -38,8 +36,6 @@ const (
 	ToolToggleModelState  = "toggle_model_state"
 	ToolToggleModelPinned = "toggle_model_pinned"
 	ToolSetBranding       = "set_branding"
-	ToolSetPIIPatternAction = "set_pii_pattern_action"
-	ToolPersistPIIPatterns  = "persist_pii_patterns"
 )
 
 // DefaultServerName is the MCP Implementation.Name surfaced when
diff --git a/pkg/mcp/localaitools/tools_middleware.go b/pkg/mcp/localaitools/tools_middleware.go
index 626609bb027e..5dd8066fd4fb 100644
--- a/pkg/mcp/localaitools/tools_middleware.go
+++ b/pkg/mcp/localaitools/tools_middleware.go
@@ -7,21 +7,21 @@ import (
 )
 
 // registerMiddlewareTools wires the routing-module admin surface for the
-// MCP server. The two tools mirror what the React /app/middleware page
-// exposes:
+// MCP server, mirroring what the React /app/middleware page exposes:
 //
 //   - get_middleware_status: read-only aggregator. The agent can ask
-//     "what's filtering my requests?" and get back the active PII
-//     pattern set, the per-model resolved enabled/override state, and
-//     a placeholder for routing.
-//   - set_pii_pattern_action: mutating. Mutations are TRANSIENT — they
-//     live until process restart, when patterns reload from the YAML
-//     defaults. The skill prompt should warn the user about that
-//     before applying lasting changes.
-func registerMiddlewareTools(s *mcp.Server, client LocalAIClient, opts Options) {
+//     "what's filtering my requests?" and get back the per-model PII
+//     enabled state + the detector models each references, recent event
+//     count, plus the active router models and their classifier configs.
+//   - get_router_decisions: read-only routing-decision log.
+//
+// PII detection policy lives on each detector model's pii_detection
+// block, edited via the model-config tools — there is no global pattern
+// set to mutate here anymore.
+func registerMiddlewareTools(s *mcp.Server, client LocalAIClient, _ Options) {
 	mcp.AddTool(s, &mcp.Tool{
 		Name:        ToolGetMiddlewareStatus,
-		Description: "Aggregated routing-module status: PII pattern catalogue with current actions, per-model resolved PII state and overrides, recent event count, plus the active router models and their classifier configs. Read-only.",
+		Description: "Aggregated routing-module status: per-model resolved PII state and the NER detector models each one references, recent event count, plus the active router models and their classifier configs. Read-only.",
 	}, func(ctx context.Context, _ *mcp.CallToolRequest, _ struct{}) (*mcp.CallToolResult, any, error) {
 		status, err := client.GetMiddlewareStatus(ctx)
 		if err != nil {
@@ -40,39 +40,4 @@ func registerMiddlewareTools(s *mcp.Server, client LocalAIClient, opts Options)
 		}
 		return jsonResult(decisions), nil, nil
 	})
-
-	if opts.DisableMutating {
-		return
-	}
-
-	mcp.AddTool(s, &mcp.Tool{
-		Name:        ToolSetPIIPatternAction,
-		Description: "Change a PII pattern's action (mask|block|route_local) and/or disabled state in-process. TRANSIENT: the mutation is lost on restart unless followed by persist_pii_patterns. Admin-required.",
-	}, func(ctx context.Context, _ *mcp.CallToolRequest, args PIIPatternActionUpdate) (*mcp.CallToolResult, any, error) {
-		if args.ID == "" {
-			return errorResultf("id is required"), nil, nil
-		}
-		if args.Action == "" && args.Disabled == nil {
-			return errorResultf("at least one of action (mask, block, route_local) or disabled must be set"), nil, nil
-		}
-		if err := client.SetPIIPatternAction(ctx, args); err != nil {
-			return errorResult(err), nil, nil
-		}
-		return jsonResult(map[string]any{
-			"id":        args.ID,
-			"action":    args.Action,
-			"disabled":  args.Disabled,
-			"persisted": false,
-		}), nil, nil
-	})
-
-	mcp.AddTool(s, &mcp.Tool{
-		Name:        ToolPersistPIIPatterns,
-		Description: "Snapshot the live PII redactor's per-pattern (action, disabled) state into runtime_settings.json so it re-applies on the next process start. Pairs with set_pii_pattern_action — that one is in-process; this one persists. Admin-required.",
-	}, func(ctx context.Context, _ *mcp.CallToolRequest, _ struct{}) (*mcp.CallToolResult, any, error) {
-		if err := client.PersistPIIPatterns(ctx); err != nil {
-			return errorResult(err), nil, nil
-		}
-		return jsonResult(map[string]any{"persisted": true}), nil, nil
-	})
 }
diff --git a/pkg/mcp/localaitools/tools_pii.go b/pkg/mcp/localaitools/tools_pii.go
index e53a27dbeb2b..4027d14d3922 100644
--- a/pkg/mcp/localaitools/tools_pii.go
+++ b/pkg/mcp/localaitools/tools_pii.go
@@ -7,20 +7,13 @@ import (
 )
 
 func registerPIITools(s *mcp.Server, client LocalAIClient, _ Options) {
-	mcp.AddTool(s, &mcp.Tool{
-		Name:        ToolListPIIPatterns,
-		Description: "List the active PII regex pattern set. Each entry shows the pattern id, description, and current action (mask, block, route_local). Read-only.",
-	}, func(ctx context.Context, _ *mcp.CallToolRequest, _ struct{}) (*mcp.CallToolResult, any, error) {
-		patterns, err := client.ListPIIPatterns(ctx)
-		if err != nil {
-			return errorResult(err), nil, nil
-		}
-		return jsonResult(patterns), nil, nil
-	})
-
+	// The regex pattern tools (list/test/set/persist) were removed with
+	// the regex tier. Detection policy now lives on each detector model's
+	// pii_detection block (managed via the model config tools/UI), so the
+	// only PII tool is the read-only audit-event view.
 	mcp.AddTool(s, &mcp.Tool{
 		Name:        ToolGetPIIEvents,
-		Description: "Recent PII redaction events. Filter by correlation_id (joins to a usage record), user_id, or pattern_id. Events never carry the matched value — only an 8-char sha256 prefix so admins can dedupe recurring leaks.",
+		Description: "Recent PII redaction events. Filter by correlation_id (joins to a usage record), user_id, or pattern_id (e.g. ner:EMAIL). Events never carry the matched value — only an 8-char sha256 prefix so admins can dedupe recurring leaks.",
 	}, func(ctx context.Context, _ *mcp.CallToolRequest, args PIIEventsQuery) (*mcp.CallToolResult, any, error) {
 		events, err := client.GetPIIEvents(ctx, args)
 		if err != nil {
@@ -28,18 +21,4 @@ func registerPIITools(s *mcp.Server, client LocalAIClient, _ Options) {
 		}
 		return jsonResult(events), nil, nil
 	})
-
-	mcp.AddTool(s, &mcp.Tool{
-		Name:        ToolTestPIIRedaction,
-		Description: "Dry-run the PII redactor against text without recording a real event. Useful for tuning patterns: paste a candidate string and see whether it would be masked, blocked, or routed locally.",
-	}, func(ctx context.Context, _ *mcp.CallToolRequest, args PIIRedactTestRequest) (*mcp.CallToolResult, any, error) {
-		if args.Text == "" {
-			return errorResultf("text is required"), nil, nil
-		}
-		res, err := client.TestPIIRedaction(ctx, args)
-		if err != nil {
-			return errorResult(err), nil, nil
-		}
-		return jsonResult(res), nil, nil
-	})
 }
diff --git a/tests/e2e-aio/e2e_test.go b/tests/e2e-aio/e2e_test.go
index 19b310460ccb..6472b5d63cff 100644
--- a/tests/e2e-aio/e2e_test.go
+++ b/tests/e2e-aio/e2e_test.go
@@ -222,6 +222,37 @@ var _ = Describe("E2E test", func() {
 				Expect(resp3.Data[1].Embedding).To(Equal(resp2.Data[0].Embedding))
 				Expect(resp3.Data[0].Embedding).ToNot(Equal(resp3.Data[1].Embedding))
 			})
+
+			// Regression guard for the auto-batch fix (core/backend/options.go
+			// EffectiveBatchSize). Embeddings pool over the whole sequence in a
+			// single physical batch (n_ubatch == n_batch), so an input longer
+			// than n_batch is rejected by the backend with "input is too large
+			// to process". Before the fix n_batch defaulted to 512 regardless of
+			// the model's context, so any prompt over ~512 tokens failed here.
+			// The embedding model is configured with a 2048 context (see
+			// models/embeddings.yaml); this input is comfortably over 512 tokens
+			// and under that context, so it must embed in one pass.
+			It("embeds an input larger than the default 512 batch", func() {
+				var b bytes.Buffer
+				// ~100 short sentences ≈ 1000+ tokens: well past the old 512
+				// batch ceiling, well within the 2048 context.
+				for i := range 100 {
+					fmt.Fprintf(&b, "This is sentence number %d discussing organic skincare and machine learning. ", i)
+				}
+				longInput := b.String()
+
+				resp, err := client.Embeddings.New(context.TODO(),
+					openai.EmbeddingNewParams{
+						Input: openai.EmbeddingNewParamsInputUnion{
+							OfArrayOfStrings: []string{longInput},
+						},
+						Model: openai.EmbeddingModelTextEmbeddingAda002,
+					},
+				)
+				Expect(err).ToNot(HaveOccurred(), "a >512-token input must embed in a single batch (auto-batch sizing)")
+				Expect(len(resp.Data)).To(Equal(1), fmt.Sprint(resp))
+				Expect(resp.Data[0].Embedding).ToNot(BeEmpty())
+			})
 		})
 
 		Context("vision", func() {
diff --git a/tests/e2e-aio/models/embeddings.yaml b/tests/e2e-aio/models/embeddings.yaml
index 8613f2c33b17..426035d4d77c 100644
--- a/tests/e2e-aio/models/embeddings.yaml
+++ b/tests/e2e-aio/models/embeddings.yaml
@@ -1,5 +1,12 @@
 embeddings: true
 name: text-embedding-ada-002
 backend: llama-cpp
+# nomic-embed-text-v1.5 has a 2048-token context, unlike the previous 512-token
+# granite model. The larger context is what makes the long-input embedding test
+# (e2e_test.go) meaningful: it exercises the auto-batch fix where n_batch is
+# sized up to the context window (core/backend/options.go EffectiveBatchSize) so
+# a >512-token input embeds in a single pass instead of failing with "input is
+# too large to process" against the default 512 batch.
+context_size: 2048
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: huggingface://nomic-ai/nomic-embed-text-v1.5-GGUF/nomic-embed-text-v1.5.f16.gguf
diff --git a/tests/e2e-backends/backend_test.go b/tests/e2e-backends/backend_test.go
index 4c7dac33cd0e..6d0d27276851 100644
--- a/tests/e2e-backends/backend_test.go
+++ b/tests/e2e-backends/backend_test.go
@@ -105,6 +105,7 @@ const (
 	capAudioTransform = "audio_transform"
 	capLogprobs      = "logprobs"
 	capLogitBias     = "logit_bias"
+	capTokenize      = "tokenize"
 
 	defaultPrompt             = "The capital of France is"
 	streamPrompt              = "Once upon a time"
@@ -426,6 +427,23 @@ var _ = Describe("Backend container", Ordered, func() {
 			res.GetMessage(), res.GetTokens(), res.GetPromptTokens())
 	})
 
+	// Regression guard for the raw-prompt tokenize RPC. The llama.cpp handler
+	// read the prompt from the wrong JSON key ("content" instead of "prompt"),
+	// so any non-empty prompt threw and the RPC returned "Unexpected error in
+	// RPC handling". The mock backend reimplements TokenizeString in Go, so only
+	// a real backend exercises the C++ path — this spec is that coverage.
+	It("tokenizes a prompt via TokenizeString", func() {
+		if !caps[capTokenize] {
+			Skip("tokenize capability not enabled")
+		}
+		ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+		defer cancel()
+		res, err := client.TokenizeString(ctx, &pb.PredictOptions{Prompt: prompt})
+		Expect(err).NotTo(HaveOccurred(), "TokenizeString RPC failed")
+		Expect(res.GetTokens()).NotTo(BeEmpty(), "TokenizeString returned no tokens for a non-empty prompt")
+		GinkgoWriter.Printf("Tokenize: %d tokens for %q\n", len(res.GetTokens()), prompt)
+	})
+
 	It("streams output via PredictStream", func() {
 		if !caps[capStream] {
 			Skip("stream capability not enabled")
diff --git a/tests/e2e/e2e_router_test.go b/tests/e2e/e2e_router_test.go
new file mode 100644
index 000000000000..560f5a9bc2de
--- /dev/null
+++ b/tests/e2e/e2e_router_test.go
@@ -0,0 +1,90 @@
+package e2e_test
+
+import (
+	"context"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/openai/openai-go/v3"
+)
+
+// Router e2e: drives /v1/chat/completions through the RouteModel middleware
+// against a configured score classifier (mock-classifier from the suite
+// fixtures) and two candidates. The mock-backend's Score handler ranks
+// candidates by looking for a `ROUTE_HINT=<label>` marker in the prompt and
+// boosting the candidate whose label matches; without a hint, all candidates
+// score equally and the router falls back. The ECHO_SERVED_MODEL trigger
+// makes the chosen candidate echo its loaded model file path so the test can
+// verify routing decisively rather than infer it from content shape.
+var _ = Describe("Router E2E", Label("Router"), func() {
+	chat := func(message string) (*openai.ChatCompletion, error) {
+		return client.Chat.Completions.New(
+			context.TODO(),
+			openai.ChatCompletionNewParams{
+				Model: "smart-router",
+				Messages: []openai.ChatCompletionMessageParamUnion{
+					openai.UserMessage(message),
+				},
+			},
+		)
+	}
+
+	It("routes a casual probe to the casual-chat candidate", func() {
+		resp, err := chat("ROUTE_HINT=casual-chat ECHO_SERVED_MODEL")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Choices).To(HaveLen(1))
+		Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-casual.bin"),
+			"casual hint should have routed to mock-cand-casual; got %q", resp.Choices[0].Message.Content)
+	})
+
+	It("routes a code probe to the code-generation candidate", func() {
+		resp, err := chat("ROUTE_HINT=code-generation ECHO_SERVED_MODEL")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Choices).To(HaveLen(1))
+		Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-code.bin"),
+			"code hint should have routed to mock-cand-code; got %q", resp.Choices[0].Message.Content)
+	})
+
+	It("falls back when no policy label matches the probe", func() {
+		// No ROUTE_HINT marker — the mock Score handler gives every candidate
+		// the same base log-prob, softmax goes uniform, no label clears
+		// activation_threshold=0.40, so the router falls back to
+		// mock-cand-casual.
+		resp, err := chat("ECHO_SERVED_MODEL hello world")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Choices).To(HaveLen(1))
+		Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-casual.bin"),
+			"unhinted probe should have fallen back; got %q", resp.Choices[0].Message.Content)
+	})
+
+	It("routes correctly over a long conversation (exercises fitMessages)", func() {
+		// Build a conversation long enough that the score classifier's
+		// probeTokenBudget kicks in and fitMessages has to trim. mock-backend's
+		// TokenizeString returns ~1 token per 4 prompt characters, and the
+		// classifier ContextSize is 4096, so >40k chars guarantees the trim
+		// path. The ROUTE_HINT marker is placed ONLY in the newest message —
+		// if fitMessages dropped it during trim, no candidate would win and we
+		// would route to the fallback (mock-cand-casual) instead of the code
+		// candidate.
+		filler := strings.Repeat("background context, lorem ipsum dolor sit amet. ", 200) // ~10k chars × 5 turns
+		msgs := make([]openai.ChatCompletionMessageParamUnion, 0, 6)
+		for range 5 {
+			msgs = append(msgs, openai.UserMessage(filler))
+		}
+		msgs = append(msgs, openai.UserMessage("ROUTE_HINT=code-generation ECHO_SERVED_MODEL"))
+
+		resp, err := client.Chat.Completions.New(
+			context.TODO(),
+			openai.ChatCompletionNewParams{Model: "smart-router", Messages: msgs},
+		)
+		Expect(err).ToNot(HaveOccurred(), "router must survive a long conversation without erroring")
+		Expect(resp.Choices).To(HaveLen(1))
+		// The newest turn carries the routing intent ("code"); fitMessages must
+		// keep it intact even after dropping older fillers, so the code
+		// candidate still wins.
+		Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-code.bin"),
+			"long-conversation routing must still resolve to the code candidate; got %q",
+			resp.Choices[0].Message.Content)
+	})
+})
diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go
index 49e21f4174c3..6c8a6c9a85a4 100644
--- a/tests/e2e/e2e_suite_test.go
+++ b/tests/e2e/e2e_suite_test.go
@@ -236,6 +236,65 @@ var _ = BeforeSuite(func() {
 	Expect(err).ToNot(HaveOccurred())
 	Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline.yaml"), pipelineData, 0644)).To(Succeed())
 
+	// Router model setup: a score classifier (mock-backend Score) selects
+	// between two candidate chat models based on keyword matches against the
+	// candidate label fragments. Exercises the full RouteModel middleware path
+	// — probe extraction, ScoreClassifier.fitMessages (with the classifier's
+	// real TokenizeString and ContextSize wired), Score RPC, and fanout to
+	// the chosen candidate. The classifier MUST carry a chat template, since
+	// buildClassifier now rejects routers whose classifier model has none.
+	chatMLTpl := map[string]any{
+		"chat":         "{{.Input -}}\n<|im_start|>assistant\n",
+		"chat_message": "<|im_start|>{{ .RoleName }}\n{{ if .Content }}{{ .Content }}{{ end }}<|im_end|>",
+	}
+	classifierCfg := map[string]any{
+		"name":           "mock-classifier",
+		"backend":        "mock-backend",
+		"known_usecases": []string{"score"},
+		"context_size":   4096,
+		"stopwords":      []string{"<|im_end|>"},
+		"parameters":     map[string]any{"model": "mock-classifier.bin"},
+		"template":       chatMLTpl,
+	}
+	classifierData, err := yaml.Marshal(classifierCfg)
+	Expect(err).ToNot(HaveOccurred())
+	Expect(os.WriteFile(filepath.Join(modelsPath, "mock-classifier.yaml"), classifierData, 0644)).To(Succeed())
+
+	for _, name := range []string{"mock-cand-casual", "mock-cand-code"} {
+		candCfg := map[string]any{
+			"name":           name,
+			"backend":        "mock-backend",
+			"known_usecases": []string{"chat"},
+			"parameters":     map[string]any{"model": name + ".bin"},
+		}
+		candData, err := yaml.Marshal(candCfg)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(os.WriteFile(filepath.Join(modelsPath, name+".yaml"), candData, 0644)).To(Succeed())
+	}
+
+	routerCfg := map[string]any{
+		"name":           "smart-router",
+		"known_usecases": []string{"chat"},
+		"router": map[string]any{
+			"classifier":           "score",
+			"classifier_model":     "mock-classifier",
+			"activation_threshold": 0.40,
+			"fallback":             "mock-cand-casual",
+			"policies": []map[string]any{
+				{"label": "casual-chat", "description": "small talk and general conversation"},
+				{"label": "code-generation", "description": "writing or debugging code"},
+				{"label": "math-reasoning", "description": "arithmetic and word problems"},
+			},
+			"candidates": []map[string]any{
+				{"model": "mock-cand-casual", "labels": []string{"casual-chat"}},
+				{"model": "mock-cand-code", "labels": []string{"code-generation", "math-reasoning"}},
+			},
+		},
+	}
+	routerData, err := yaml.Marshal(routerCfg)
+	Expect(err).ToNot(HaveOccurred())
+	Expect(os.WriteFile(filepath.Join(modelsPath, "smart-router.yaml"), routerData, 0644)).To(Succeed())
+
 	// If REALTIME_TEST_MODEL=realtime-test-pipeline, auto-create a pipeline
 	// config from the REALTIME_VAD/STT/LLM/TTS env vars so real-model tests
 	// can run without the user having to write a YAML file manually.
diff --git a/tests/e2e/mock-backend/main.go b/tests/e2e/mock-backend/main.go
index 46c4e51d6a4a..50ac696e2a40 100644
--- a/tests/e2e/mock-backend/main.go
+++ b/tests/e2e/mock-backend/main.go
@@ -109,6 +109,23 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R
 		}, nil
 	}
 
+	// ECHO_SERVED_MODEL returns the loaded model file path so router e2e
+	// tests can verify which candidate actually served the request without
+	// adding a new RPC. The router fans out to a single backend process per
+	// candidate, so lastLoadParams.Model is unique per candidate.
+	if strings.Contains(in.Prompt, "ECHO_SERVED_MODEL") {
+		opts := snapshotLoadParams()
+		modelID := ""
+		if opts != nil {
+			modelID = opts.Model
+		}
+		return &pb.Reply{
+			Message:      []byte("SERVED_MODEL=" + modelID),
+			Tokens:       2,
+			PromptTokens: 1,
+		}, nil
+	}
+
 	// Simulate C++ autoparser: tool call via ChatDeltas, empty message
 	if strings.Contains(in.Prompt, "AUTOPARSER_TOOL_CALL") {
 		toolName := mockToolNameFromRequest(in)
@@ -171,7 +188,7 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R
 	// Simulate multiple tool calls in a single response (Go-side JSON parser path).
 	if strings.Contains(in.Prompt, "MULTI_TOOL_CALL") {
 		return &pb.Reply{
-			Message:      []byte(`{"name": "get_weather", "arguments": {"location": "Rome"}}
+			Message: []byte(`{"name": "get_weather", "arguments": {"location": "Rome"}}
 {"name": "get_weather", "arguments": {"location": "Paris"}}`),
 			Tokens:       30,
 			PromptTokens: 10,
@@ -540,15 +557,91 @@ func (m *MockBackend) AudioTranscription(ctx context.Context, in *pb.TranscriptR
 }
 
 func (m *MockBackend) TokenizeString(ctx context.Context, in *pb.PredictOptions) (*pb.TokenizationResponse, error) {
-	xlog.Debug("TokenizeString called", "prompt", in.Prompt)
-	// Return mock token IDs
-	tokens := []int32{101, 2023, 2003, 1037, 3231, 1012}
+	xlog.Debug("TokenizeString called", "prompt_len", len(in.Prompt))
+	// Approximate BPE: ~4 chars/token, minimum 1. Realistic enough for the
+	// router's fitMessages to exercise the budget/rune-pretrim path with
+	// recognisable counts that scale with input size.
+	n := max((len(in.Prompt)+3)/4, 1)
+	tokens := make([]int32, n)
+	for i := range tokens {
+		tokens[i] = int32(i + 1)
+	}
 	return &pb.TokenizationResponse{
-		Length: int32(len(tokens)),
+		Length: int32(n),
 		Tokens: tokens,
 	}, nil
 }
 
+// Score implements deterministic marker-driven ranking for router e2e
+// tests. The Score RPC receives the full rendered routing prompt (system
+// prompt + chat envelope + user turn), and the system prompt by construction
+// lists every policy label — so any keyword-against-prompt heuristic would
+// match every candidate. Instead we look for an explicit `ROUTE_HINT=<label>`
+// marker, which only appears when a test deliberately places one in a user
+// message. The candidate whose extracted label equals the hint gets a large
+// log-prob boost; all others stay at the base. With no hint, every candidate
+// scores equally, softmax is uniform, and (with a sensible activation
+// threshold) the router falls back.
+func (m *MockBackend) Score(ctx context.Context, in *pb.ScoreRequest) (*pb.ScoreResponse, error) {
+	xlog.Debug("Score called", "candidates", len(in.Candidates))
+	hint := extractRouteHint(in.Prompt)
+	out := &pb.ScoreResponse{Candidates: make([]*pb.CandidateScore, len(in.Candidates))}
+	for i, c := range in.Candidates {
+		label := extractRouteLabel(c)
+		// Base -5 (softmax ≈ 0.003), hint match +5 → 0 (softmax ≈ 0.99).
+		logProb := -5.0
+		if hint != "" && label == hint {
+			logProb = 0.0
+		}
+		// num_tokens matches TokenizeString's heuristic so per-token mean
+		// log-prob consumers see consistent values.
+		nTok := max((len(c)+3)/4, 1)
+		out.Candidates[i] = &pb.CandidateScore{
+			LogProb:                 logProb,
+			NumTokens:               int32(nTok),
+			LengthNormalizedLogProb: logProb / float64(nTok),
+		}
+	}
+	return out, nil
+}
+
+// extractRouteHint returns the label after the LAST occurrence of
+// `ROUTE_HINT=` in the prompt, terminated by whitespace or end-of-string.
+// Using the last occurrence makes the marker stable across long
+// conversations: the *newest* user message's hint wins, mirroring how the
+// router's fitMessages keeps the newest turn whole.
+func extractRouteHint(prompt string) string {
+	const key = "ROUTE_HINT="
+	i := strings.LastIndex(prompt, key)
+	if i < 0 {
+		return ""
+	}
+	rest := prompt[i+len(key):]
+	end := strings.IndexAny(rest, " \t\r\n<")
+	if end < 0 {
+		return rest
+	}
+	return rest[:end]
+}
+
+// extractRouteLabel returns the label inside `{"route": "<label>"}`. Returns
+// "" on any shape it doesn't recognise — the caller treats that as a no-match.
+func extractRouteLabel(candidate string) string {
+	_, rest, ok := strings.Cut(candidate, `"route"`)
+	if !ok {
+		return ""
+	}
+	_, rest, ok = strings.Cut(rest, `"`)
+	if !ok {
+		return ""
+	}
+	label, _, ok := strings.Cut(rest, `"`)
+	if !ok {
+		return ""
+	}
+	return label
+}
+
 func (m *MockBackend) Status(ctx context.Context, in *pb.HealthMessage) (*pb.StatusResponse, error) {
 	xlog.Debug("Status called")
 	return &pb.StatusResponse{