diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index 12c186ca2870..3296c6d84578 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -563,7 +563,7 @@ jobs: - name: Run e2e-backends smoke env: BACKEND_IMAGE: quay.io/go-skynet/local-ai-backends:master-cpu-llama-cpp - BACKEND_TEST_CAPS: health,load,predict,stream,logprobs,logit_bias + BACKEND_TEST_CAPS: health,load,predict,stream,logprobs,logit_bias,tokenize run: | make test-extra-backend # Realtime e2e with sherpa-onnx driving VAD + STT + TTS against a mocked LLM. diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index ac5521bc44ae..9f82001260c5 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -1282,6 +1283,232 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt } +// ============================================================================ +// Token-classification (NER) support for the openai-privacy-filter arch. +// +// The model emits BIOES-tagged per-token logits (217 classes for the +// multilingual privacy filter: "O" plus {B,I,E,S}-CATEGORY). We decode the +// most likely *valid* BIOES path with a constrained linear-chain Viterbi (the +// model's 6 transition biases are all 0.0 in the shipped +// viterbi_calibration.json, so only the structural BIOES constraints apply), +// assemble spans, and map token spans to UTF-8 byte offsets in the source text. +// +// Receptive-field note: attention is a symmetric +/-sliding_window band PER +// LAYER, so after n_layer layers a token's logits depend on its +// +/-(n_layer * sliding_window) neighbourhood -- NOT +/-sliding_window. Windowed +// inference for long inputs must therefore use a halo of n_layer*sliding_window +// to stay bit-exact with a single forward (see TokenClassify below). 
+// ============================================================================
+namespace pf_ner {
+
+// Per-layer attention half-window for openai-privacy-filter (config
+// sliding_window = 128). Only used to size the windowing halo for inputs that
+// exceed a single forward; short inputs (the common PII case) never window.
+static constexpr int PF_SLIDING_WINDOW = 128;
+
+enum bioes_tag { TAG_O = 0, TAG_B, TAG_I, TAG_E, TAG_S };
+
+struct label_info {
+    bioes_tag tag;
+    int cat; // index into label_table::categories; -1 for O / unknown
+};
+
+// Parsed view of the model's classifier labels.
+struct label_table {
+    std::vector<label_info> labels;      // size n_cls, indexed by class id
+    std::vector<std::string> categories; // distinct entity-group names
+    int o_label = 0;                     // class id of the "O" (outside) label
+
+    // Per-category open-state class ids (B/I), used by the Viterbi inner loop.
+    struct open_ids { int b = -1; int i = -1; };
+    std::vector<open_ids> per_cat;
+
+    const std::string & category_name(int cat) const { return categories[cat]; }
+};
+
+// Build the label table from the model's classifier labels.
+//
+// Each label is either "O" (outside) or "<tag>-<category>" with <tag> one of
+// B/I/E/S, e.g. "B-CATEGORY". Everything after the "<tag>-" prefix is the
+// category name. A label that is empty, or that does not have exactly that
+// shape (no '-' in second position, empty category, unknown tag letter), is
+// mapped to TAG_O with cat = -1 so it can never open or continue a span; a
+// label such as "SECRET" is therefore NOT mistaken for an S tag.
+//
+// o_label is the class id of the first "O"/empty label (0 if there is none).
+// per_cat[c] holds the class ids of B-c and I-c (-1 when the model lacks one).
+static label_table build_label_table(const llama_model * model) {
+    label_table t;
+    const uint32_t n = llama_model_n_cls_out(model);
+    t.labels.resize(n, { TAG_O, -1 });
+    std::map<std::string, int> cat_index;
+    bool found_o = false;
+    for (uint32_t i = 0; i < n; i++) {
+        const char * raw = llama_model_cls_label(model, i);
+        std::string s = raw ? raw : "";
+        if (s.empty() || s == "O") {
+            t.labels[i] = { TAG_O, -1 };
+            if (!found_o) { t.o_label = (int) i; found_o = true; }
+            continue;
+        }
+        // Require the "<tag>-<category>" shape before trusting the first
+        // character as a BIOES tag.
+        if (s.size() < 3 || s[1] != '-') {
+            t.labels[i] = { TAG_O, -1 }; // unknown -> treat as O
+            continue;
+        }
+        bioes_tag tag;
+        switch (s[0]) {
+            case 'B': tag = TAG_B; break;
+            case 'I': tag = TAG_I; break;
+            case 'E': tag = TAG_E; break;
+            case 'S': tag = TAG_S; break;
+            default: t.labels[i] = { TAG_O, -1 }; continue; // unknown -> treat as O
+        }
+        const std::string cat = s.substr(2);
+        int ci;
+        auto it = cat_index.find(cat);
+        if (it == cat_index.end()) {
+            ci = (int) t.categories.size();
+            cat_index.emplace(cat, ci);
+            t.categories.push_back(cat);
+        } else {
+            ci = it->second;
+        }
+        t.labels[i] = { tag, ci };
+    }
+    t.per_cat.assign(t.categories.size(), {});
+    for (uint32_t i = 0; i < n; i++) {
+        const auto & li = t.labels[i];
+        if (li.cat < 0) continue;
+        if (li.tag == TAG_B) t.per_cat[li.cat].b = (int) i;
+        if (li.tag == TAG_I) t.per_cat[li.cat].i = (int) i;
+    }
+    return t;
+}
+
+// A "closed" state is one after which a fresh label (O/B/S) may start.
+static inline bool tag_is_closed(bioes_tag tg) { return tg == TAG_O || tg == TAG_E || tg == TAG_S; }
+
+// Constrained linear-chain Viterbi over BIOES. `emit` is row-major
+// [n_tok * n_cls] of per-token LOG-probabilities. Returns the best valid label
+// per token (one class id per token), or an empty vector when n_tok or n_cls
+// is not positive. Exploits the BIOES structure so each step is O(n_cls), not
+// O(n_cls^2): a fresh label (O/B/S) may only follow a closed state (O/E/S) and
+// can take the single best closed predecessor; a continuation (I/E of category
+// c) may only follow B-c or I-c. Falls back to per-token argmax only if no
+// valid path survives numerically (the all-O path always exists, so this is a
+// safety net); the fallback path is NOT guaranteed to be a valid BIOES sequence.
+static std::vector<int> bioes_viterbi(const label_table & lt,
+                                      const std::vector<float> & emit,
+                                      int n_tok, int n_cls) {
+    // Nothing to decode; also keeps the emit[] reads below in bounds.
+    if (n_tok <= 0 || n_cls <= 0) return {};
+
+    const float NEG = -std::numeric_limits<float>::infinity();
+    std::vector<float> prev_dp(n_cls, NEG), dp(n_cls, NEG);
+    std::vector<int> bp((size_t) n_tok * n_cls, -1);
+
+    // t == 0: a span may only start with O / B / S.
+    for (int j = 0; j < n_cls; j++) {
+        const bioes_tag tg = lt.labels[j].tag;
+        if (tg == TAG_O || tg == TAG_B || tg == TAG_S) prev_dp[j] = emit[j];
+    }
+
+    for (int t = 1; t < n_tok; t++) {
+        std::fill(dp.begin(), dp.end(), NEG);
+        const float * e = &emit[(size_t) t * n_cls];
+
+        // best closed predecessor (O/E/S) from the previous step
+        float best_closed = NEG; int best_closed_arg = -1;
+        for (int i = 0; i < n_cls; i++) {
+            if (prev_dp[i] == NEG) continue;
+            if (tag_is_closed(lt.labels[i].tag) && prev_dp[i] > best_closed) {
+                best_closed = prev_dp[i];
+                best_closed_arg = i;
+            }
+        }
+
+        for (int j = 0; j < n_cls; j++) {
+            const auto & lj = lt.labels[j];
+            float pred = NEG; int arg = -1;
+            if (lj.tag == TAG_O || lj.tag == TAG_B || lj.tag == TAG_S) {
+                pred = best_closed; arg = best_closed_arg; // fresh start
+            } else {
+                // I-c or E-c: predecessor must be B-c or I-c
+                const auto & oc = lt.per_cat[lj.cat];
+                if (oc.b >= 0 && prev_dp[oc.b] > pred) { pred = prev_dp[oc.b]; arg = oc.b; }
+                if (oc.i >= 0 && prev_dp[oc.i] > pred) { pred = prev_dp[oc.i]; arg = oc.i; }
+            }
+            if (arg >= 0 && pred != NEG) {
+                dp[j] = pred + e[j];
+                bp[(size_t) t * n_cls + j] = arg;
+            }
+        }
+        prev_dp.swap(dp);
+    }
+
+    // terminate only on a closed state (no dangling B/I span)
+    float best = NEG; int arg = -1;
+    for (int j = 0; j < n_cls; j++) {
+        if (prev_dp[j] == NEG) continue;
+        if (tag_is_closed(lt.labels[j].tag) && prev_dp[j] > best) { best = prev_dp[j]; arg = j; }
+    }
+
+    std::vector<int> path(n_tok, lt.o_label);
+    if (arg < 0) {
+        // No valid path survived: per-token argmax safety net.
+        for (int t = 0; t < n_tok; t++) {
+            const float * e = &emit[(size_t) t * n_cls];
+            int a = 0; float m = e[0];
+            for (int j = 1; j < n_cls; j++) if (e[j] > m) { m = e[j]; a = j; }
+            path[t] = a;
+        }
+        return path;
+    }
+    // Follow the back-pointers from the best closed final state.
+    int cur = arg;
+    for (int t = n_tok - 1; t >= 0; t--) {
+        path[t] = cur;
+        if (t > 0) cur = bp[(size_t) t * n_cls + cur];
+    }
+    return path;
+}
+
+// One assembled entity span over token indices [tok_begin, tok_end] inclusive.
+struct span {
+    int cat;
+    int tok_begin;
+    int tok_end;
+    float score; // mean per-token probability of the chosen labels
+};
+
+// Walk a BIOES label path into spans. `path` holds one class id per token and
+// `emit` the matching row-major [n_tok * n_cls] log-probabilities. A path from
+// the constrained Viterbi is valid, so B is always closed by a matching E and
+// S stands alone. The argmax fallback may produce an invalid sequence; there,
+// an I/E without a matching open B contributes nothing and an unmatched E
+// discards the open span.
+static std::vector<span> assemble_spans(const label_table & lt,
+                                        const std::vector<int> & path,
+                                        const std::vector<float> & emit,
+                                        int n_cls) {
+    std::vector<span> out;
+    int n_tok = (int) path.size();
+    int begin = -1, cat = -1;
+    double prob_sum = 0.0;
+    // Probability of the label chosen at token t (emit stores log-probs).
+    auto prob_at = [&](int t) {
+        return (double) std::exp(emit[(size_t) t * n_cls + path[t]]);
+    };
+    for (int t = 0; t < n_tok; t++) {
+        const auto & li = lt.labels[path[t]];
+        switch (li.tag) {
+            case TAG_S:
+                out.push_back({ li.cat, t, t, (float) prob_at(t) });
+                begin = -1;
+                break;
+            case TAG_B:
+                begin = t; cat = li.cat; prob_sum = prob_at(t);
+                break;
+            case TAG_I:
+                if (begin >= 0 && li.cat == cat) prob_sum += prob_at(t);
+                break;
+            case TAG_E:
+                if (begin >= 0 && li.cat == cat) {
+                    prob_sum += prob_at(t);
+                    const int len = t - begin + 1;
+                    out.push_back({ cat, begin, t, (float) (prob_sum / len) });
+                }
+                begin = -1;
+                break;
+            case TAG_O:
+            default:
+                begin = -1;
+                break;
+        }
+    }
+    return out;
+}
+
+} // namespace pf_ner
+
+
 // GRPC Server start
 class BackendServiceImpl final : public backend::Backend::Service {
 private:
@@ -3444,6 +3671,186 @@ class BackendServiceImpl final : public backend::Backend::Service {
         return grpc::Status::OK;
     }
 
+    // TokenClassify runs the openai-privacy-filter token classifier (a
+    // bidirectional MoE encoder with a per-token BIOES head) over the supplied
+    // text and returns the detected entity spans. It mirrors Score's
+    // direct-decode strategy (bypassing the slot/task queue) because it needs
+    // full control over batch construction, per-token logit readout, and
+    // overlapping-window stitching for long inputs.
+    //
+    // The model must be loaded with embeddings enabled and TOKEN_CLS pooling
+    // (the converter writes pooling_type = TOKEN_CLS into the GGUF, so a model
+    // YAML only needs `embeddings: true`). Pipeline:
+    //   tokenize (+offsets) -> windowed non-causal forward -> per-token
+    //   log_softmax -> constrained BIOES Viterbi -> spans -> byte offsets.
+    //
+    // Status codes: OK (possibly with zero entities, e.g. for empty text);
+    // FAILED_PRECONDITION when no model / llama context is available;
+    // UNIMPLEMENTED when the model is not a TOKEN_CLS embedding model;
+    // OUT_OF_RANGE when the input needs windowing but the batch is too small
+    // for an exact halo; INTERNAL for an unexpected classifier width or a
+    // decode / readout failure.
+    grpc::Status TokenClassify(ServerContext* context, const backend::TokenClassifyRequest* request, backend::TokenClassifyResponse* response) override {
+        auto auth = checkAuth(context);
+        if (!auth.ok()) return auth;
+        if (params_base.model.path.empty()) {
+            return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
+        }
+
+        // Tripwire against the slot loop + serialise concurrent TokenClassify
+        // calls, exactly as Score does (see Score's class comment): we drive
+        // llama_decode directly, so we must not race the slot loop or another
+        // direct-decode RPC.
+        conflict_guard guard("TokenClassify", score_inflight, slot_loop_inflight, "slot_loop_inflight");
+        static std::mutex token_classify_mutex;
+        std::lock_guard<std::mutex> tc_lock(token_classify_mutex);
+
+        llama_context * lctx = ctx_server.get_llama_context();
+        if (lctx == nullptr) {
+            return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "llama context unavailable (sleeping?)");
+        }
+        if (!params_base.embedding || llama_pooling_type(lctx) != LLAMA_POOLING_TYPE_TOKEN_CLS) {
+            return grpc::Status(grpc::StatusCode::UNIMPLEMENTED,
+                "This model does not support token classification. Load a TOKEN_CLS-pooling model (e.g. openai-privacy-filter) with `embeddings: true`");
+        }
+
+        const llama_model * model = ctx_server.impl->model_tgt;
+        const llama_vocab * vocab = ctx_server.impl->vocab;
+        const int n_cls = (int) llama_model_n_cls_out(model);
+        const int n_embd_out = llama_model_n_embd_out(model);
+        if (n_cls <= 0 || n_embd_out != n_cls) {
+            return grpc::Status(grpc::StatusCode::INTERNAL,
+                "TokenClassify: unexpected classifier output width (n_cls_out=" + std::to_string(n_cls) +
+                ", n_embd_out=" + std::to_string(n_embd_out) + ")");
+        }
+
+        const std::string & text = request->text();
+        if (text.empty()) {
+            return grpc::Status::OK; // no text -> no entities
+        }
+
+        // Tokenize once. add_special matches the verified llama-embedding parity
+        // path; rendering pieces with special=false makes any control tokens
+        // (e.g. an injected BOS) zero-width so they never fall inside a span and
+        // do not perturb byte offsets.
+        // NOTE(review): with parse_special=true, special-token text that occurs
+        // literally in the request would presumably also become a zero-width
+        // control token and shift every later byte offset (the length check
+        // below only warns) -- confirm whether parse_special should be false
+        // for untrusted text.
+        std::vector<llama_token> tokens = common_tokenize(vocab, text, /*add_special=*/true, /*parse_special=*/true);
+        const int n_tok = (int) tokens.size();
+        if (n_tok == 0) {
+            return grpc::Status::OK;
+        }
+
+        // Per-token UTF-8 byte offsets into `text`, by accumulating piece lengths.
+        // o200k is byte-level reversible, so piece concatenation reproduces the
+        // input bytes exactly; we validate and warn (best-effort) if it doesn't.
+        std::vector<int> tok_off(n_tok), tok_end(n_tok);
+        {
+            size_t running = 0;
+            for (int k = 0; k < n_tok; k++) {
+                std::string piece = common_token_to_piece(vocab, tokens[k], /*special=*/false);
+                tok_off[k] = (int) running;
+                running += piece.size();
+                tok_end[k] = (int) running;
+            }
+            if (running != text.size()) {
+                LOG_WRN("TokenClassify: detokenized length %zu != input length %zu; byte offsets may be approximate\n",
+                        running, text.size());
+            }
+        }
+
+        // Window geometry. A single forward is exact whenever the input fits one
+        // ubatch (the common short-PII case). For longer inputs we slide
+        // overlapping windows with a halo of n_layer*sliding_window so interior
+        // tokens see their full receptive field (see the namespace note).
+        const int W = std::min((int) llama_n_ubatch(lctx), (int) llama_n_ctx(lctx));
+        const int halo = (int) llama_model_n_layer(model) * pf_ner::PF_SLIDING_WINDOW;
+        if (W <= 0) {
+            return grpc::Status(grpc::StatusCode::INTERNAL, "TokenClassify: invalid ubatch/context size");
+        }
+        if (n_tok > W && W <= 2 * halo) {
+            return grpc::Status(grpc::StatusCode::OUT_OF_RANGE,
+                "TokenClassify: input (" + std::to_string(n_tok) + " tokens) exceeds the single-forward window (" +
+                std::to_string(W) + ") and exact windowing needs nbatch > " + std::to_string(2 * halo) +
+                "; increase the model's nbatch/n_ctx");
+        }
+
+        // Row-major [n_tok * n_cls] log-probabilities, filled window by window.
+        std::vector<float> emit((size_t) n_tok * n_cls);
+        llama_batch batch = llama_batch_init(W, 0, 1);
+
+        // Decode one window [start, start+wlen) and write log-softmax rows for
+        // the interior global positions [start+lo, start+hi). Positions are
+        // window-local (0..wlen-1): RoPE is relative and the symmetric band uses
+        // |p1-p0|, so local positions are equivalent to absolute ones here.
+        auto run_window = [&](int start, int wlen, int lo, int hi) -> grpc::Status {
+            common_batch_clear(batch);
+            for (int j = 0; j < wlen; j++) {
+                common_batch_add(batch, tokens[start + j], j, { 0 }, /*logits=*/true);
+            }
+            llama_memory_clear(llama_get_memory(lctx), true);
+            // Only 0 means the batch was decoded. Negative codes are errors, and
+            // positive codes (e.g. no free KV slot, aborted) also leave the
+            // batch unprocessed, so the embeddings read below would be stale or
+            // missing. Treat every non-zero code as a failure.
+            const int rc = llama_decode(lctx, batch);
+            if (rc != 0) {
+                return grpc::Status(grpc::StatusCode::INTERNAL,
+                    "TokenClassify: llama_decode failed (" + std::to_string(rc) + ")");
+            }
+            for (int li = lo; li < hi; li++) {
+                const float * row = llama_get_embeddings_ith(lctx, li);
+                if (row == nullptr) {
+                    return grpc::Status(grpc::StatusCode::INTERNAL,
+                        "TokenClassify: null embeddings at window position " + std::to_string(li));
+                }
+                // log_softmax over the n_cls logits (fp32, max-subtraction stable)
+                float maxv = row[0];
+                for (int c = 1; c < n_cls; c++) if (row[c] > maxv) maxv = row[c];
+                double sum = 0.0;
+                for (int c = 0; c < n_cls; c++) sum += std::exp((double) (row[c] - maxv));
+                const double logsum = std::log(sum);
+                float * dst = &emit[(size_t) (start + li) * n_cls];
+                for (int c = 0; c < n_cls; c++) {
+                    dst[c] = (float) ((double) (row[c] - maxv) - logsum);
+                }
+            }
+            return grpc::Status::OK;
+        };
+
+        grpc::Status st = grpc::Status::OK;
+        if (n_tok <= W) {
+            st = run_window(0, n_tok, 0, n_tok);
+        } else {
+            // Consecutive windows overlap by 2*halo; each one contributes only
+            // the rows at least `halo` tokens away from a cut edge, so the
+            // written ranges tile [0, n_tok) without gaps.
+            const int stride = W - 2 * halo;
+            for (int start = 0; start < n_tok; start += stride) {
+                const int wlen = std::min(W, n_tok - start);
+                const int lo = (start == 0) ? 0 : halo;
+                const int hi = (start + wlen >= n_tok) ? wlen : (wlen - halo);
+                st = run_window(start, wlen, lo, hi);
+                if (!st.ok()) break;
+                if (start + wlen >= n_tok) break;
+            }
+        }
+        llama_batch_free(batch);
+        if (!st.ok()) {
+            return st;
+        }
+
+        // Decode the BIOES path and assemble spans.
+        const pf_ner::label_table lt = pf_ner::build_label_table(model);
+        const std::vector<int> path = pf_ner::bioes_viterbi(lt, emit, n_tok, n_cls);
+        const std::vector<pf_ner::span> spans = pf_ner::assemble_spans(lt, path, emit, n_cls);
+
+        // Spans scoring below the request threshold are dropped.
+        const float threshold = request->threshold();
+        for (const auto & sp : spans) {
+            if (sp.score < threshold) continue;
+            int bstart = tok_off[sp.tok_begin];
+            int bend = tok_end[sp.tok_end];
+            if (bstart < 0 || bend > (int) text.size() || bstart >= bend) continue;
+            // Trim leading/trailing ASCII whitespace: the o200k tokenizer folds a
+            // leading space into the token piece, so a span would otherwise read
+            // " John" instead of "John" — masking the trimmed form is cleaner.
+            while (bstart < bend && (unsigned char) text[bstart] <= ' ') bstart++;
+            while (bend > bstart && (unsigned char) text[bend - 1] <= ' ') bend--;
+            if (bstart >= bend) continue;
+            backend::TokenClassifyEntity * ent = response->add_entities();
+            ent->set_entity_group(lt.category_name(sp.cat));
+            ent->set_start(bstart);
+            ent->set_end(bend);
+            ent->set_score(sp.score);
+            ent->set_text(text.substr(bstart, (size_t) (bend - bstart)));
+        }
+
+        return grpc::Status::OK;
+    }
+
     grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response) override {
         auto auth = checkAuth(context);
         if (!auth.ok()) return auth;
@@ -3458,7 +3865,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
         if (body.count("prompt") != 0) {
             const bool add_special = json_value(body, "add_special", false);
-            llama_tokens tokens = tokenize_mixed(ctx_server.impl->vocab, body.at("content"), add_special, true);
+            llama_tokens tokens = tokenize_mixed(ctx_server.impl->vocab, body.at("prompt"), add_special, true);
 
 
             for (const auto& token : tokens) {
diff --git a/backend/cpp/llama-cpp/patches/0001-token-cls-pooling-substrate.patch b/backend/cpp/llama-cpp/patches/0001-token-cls-pooling-substrate.patch
new file mode 100644
index 
000000000000..81e62851d54f --- /dev/null +++ b/backend/cpp/llama-cpp/patches/0001-token-cls-pooling-substrate.patch @@ -0,0 +1,157 @@ +diff --git a/common/arg.cpp b/common/arg.cpp +index e0f6c6066..6a62c43f2 100644 +--- a/common/arg.cpp ++++ b/common/arg.cpp +@@ -1923,14 +1923,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex + } + ).set_sampling().set_env("LLAMA_ARG_BACKEND_SAMPLING")); + add_opt(common_arg( +- {"--pooling"}, "{none,mean,cls,last,rank}", ++ {"--pooling"}, "{none,mean,cls,last,rank,token-cls}", + "pooling type for embeddings, use model default if unspecified", + [](common_params & params, const std::string & value) { +- /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } +- else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } +- else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } +- else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } +- else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; } ++ /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } ++ else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } ++ else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } ++ else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } ++ else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; } ++ else if (value == "token-cls") { params.pooling_type = LLAMA_POOLING_TYPE_TOKEN_CLS; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING")); +diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp +index f6a20ef9d..4281da592 100644 +--- a/examples/embedding/embedding.cpp ++++ b/examples/embedding/embedding.cpp +@@ -54,7 +54,7 @@ static void 
batch_decode(llama_context * ctx, llama_batch & batch, float * outpu + const float * embd = nullptr; + int embd_pos = 0; + +- if (pooling_type == LLAMA_POOLING_TYPE_NONE) { ++ if (pooling_type == LLAMA_POOLING_TYPE_NONE || pooling_type == LLAMA_POOLING_TYPE_TOKEN_CLS) { + // try to get token embeddings + embd = llama_get_embeddings_ith(ctx, i); + embd_pos = i; +@@ -246,7 +246,7 @@ int main(int argc, char ** argv) { + + // count number of embeddings + int n_embd_count = 0; +- if (pooling_type == LLAMA_POOLING_TYPE_NONE) { ++ if (pooling_type == LLAMA_POOLING_TYPE_NONE || pooling_type == LLAMA_POOLING_TYPE_TOKEN_CLS) { + for (int k = 0; k < n_prompts; k++) { + n_embd_count += inputs[k].size(); + } +@@ -272,7 +272,7 @@ int main(int argc, char ** argv) { + if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) { + float * out = emb + e * n_embd_out; + batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize); +- e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; ++ e += (pooling_type == LLAMA_POOLING_TYPE_NONE || pooling_type == LLAMA_POOLING_TYPE_TOKEN_CLS) ? 
batch.n_tokens : s; + s = 0; + common_batch_clear(batch); + } +@@ -289,7 +289,7 @@ int main(int argc, char ** argv) { + if (params.embd_out.empty()) { + LOG("\n"); + +- if (pooling_type == LLAMA_POOLING_TYPE_NONE) { ++ if (pooling_type == LLAMA_POOLING_TYPE_NONE || pooling_type == LLAMA_POOLING_TYPE_TOKEN_CLS) { + for (int j = 0; j < n_embd_count; j++) { + LOG("embedding %d: ", j); + for (int i = 0; i < std::min(3, n_embd_out); i++) { +diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py +index 5a567e2d1..d2763dfed 100644 +--- a/gguf-py/gguf/constants.py ++++ b/gguf-py/gguf/constants.py +@@ -4172,6 +4172,7 @@ class PoolingType(IntEnum): + CLS = 2 + LAST = 3 + RANK = 4 ++ TOKEN_CLS = 5 + + + class GGMLQuantizationType(IntEnum): +diff --git a/include/llama.h b/include/llama.h +index e8374c53b..d22f3c816 100644 +--- a/include/llama.h ++++ b/include/llama.h +@@ -175,6 +175,7 @@ extern "C" { + LLAMA_POOLING_TYPE_CLS = 2, + LLAMA_POOLING_TYPE_LAST = 3, + LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph ++ LLAMA_POOLING_TYPE_TOKEN_CLS = 5, // used by token classification models to attach the classification head to each token + }; + + enum llama_attention_type { +diff --git a/src/llama-context.cpp b/src/llama-context.cpp +index ad36c0666..769cc620c 100644 +--- a/src/llama-context.cpp ++++ b/src/llama-context.cpp +@@ -1420,6 +1420,17 @@ int llama_context::encode(const llama_batch & batch_inp) { + GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd.size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd.data, 0, n_tokens*n_embd_out*sizeof(float)); + } break; ++ case LLAMA_POOLING_TYPE_TOKEN_CLS: ++ { ++ // extract token classification outputs ++ GGML_ASSERT(embd.data != nullptr); ++ GGML_ASSERT(hparams.n_cls_out > 0); ++ GGML_ASSERT(hparams.n_embd_out() == hparams.n_cls_out); ++ ++ const uint32_t n_cls_out = hparams.n_cls_out; ++ GGML_ASSERT(n_tokens*n_cls_out <= (int64_t) embd.size); ++ 
ggml_backend_tensor_get_async(backend_embd, t_embd, embd.data, 0, n_tokens*n_cls_out*sizeof(float)); ++ } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: +@@ -1864,6 +1875,22 @@ int llama_context::decode(const llama_batch & batch_inp) { + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd_out*sizeof(float)); + } + } break; ++ case LLAMA_POOLING_TYPE_TOKEN_CLS: ++ { ++ // extract token classification outputs ++ GGML_ASSERT(embd.data != nullptr); ++ GGML_ASSERT(hparams.n_cls_out > 0); ++ GGML_ASSERT(hparams.n_embd_out() == hparams.n_cls_out); ++ ++ const uint32_t n_cls_out = hparams.n_cls_out; ++ float * embd_out = embd.data + n_outputs_prev*n_cls_out; ++ ++ if (n_outputs) { ++ GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); ++ GGML_ASSERT((n_outputs_prev + n_outputs)*n_cls_out <= (int64_t) embd.size); ++ ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_cls_out*sizeof(float)); ++ } ++ } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: +diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp +index e6ec3054d..afa2eb665 100644 +--- a/src/llama-graph.cpp ++++ b/src/llama-graph.cpp +@@ -2939,6 +2939,17 @@ void llm_graph_context::build_pooling( + { + cur = inp; + } break; ++ case LLAMA_POOLING_TYPE_TOKEN_CLS: ++ { ++ cur = inp; ++ ++ if (cls_out) { ++ cur = ggml_mul_mat(ctx0, cls_out, cur); ++ if (cls_out_b) { ++ cur = ggml_add(ctx0, cur, cls_out_b); ++ } ++ } ++ } break; + case LLAMA_POOLING_TYPE_MEAN: + { + ggml_tensor * inp_mean = build_inp_mean(); diff --git a/backend/cpp/llama-cpp/patches/0002-arch-openai-privacy-filter.patch b/backend/cpp/llama-cpp/patches/0002-arch-openai-privacy-filter.patch new file mode 100644 index 000000000000..a0a7aae47085 --- /dev/null +++ b/backend/cpp/llama-cpp/patches/0002-arch-openai-privacy-filter.patch @@ -0,0 +1,84 @@ +diff --git a/gguf-py/gguf/constants.py 
b/gguf-py/gguf/constants.py +index 5a567e2d1..59f69ce5e 100644 +--- a/gguf-py/gguf/constants.py ++++ b/gguf-py/gguf/constants.py +@@ -485,6 +485,7 @@ class MODEL_ARCH(IntEnum): + HUNYUAN_VL = auto() + SMOLLM3 = auto() + GPT_OSS = auto() ++ OPENAI_PRIVACY_FILTER = auto() # tracks upstream model_type "openai_privacy_filter"; shares the gpt-oss MoE body but is NOT a gpt-oss variant (interleaved/NORM rope, bidirectional, no LM head) + LFM2 = auto() + LFM2MOE = auto() + DREAM = auto() +@@ -1005,6 +1006,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { + MODEL_ARCH.HUNYUAN_VL: "hunyuan_vl", + MODEL_ARCH.SMOLLM3: "smollm3", + MODEL_ARCH.GPT_OSS: "gpt-oss", ++ MODEL_ARCH.OPENAI_PRIVACY_FILTER: "openai-privacy-filter", + MODEL_ARCH.LFM2: "lfm2", + MODEL_ARCH.LFM2MOE: "lfm2moe", + MODEL_ARCH.DREAM: "dream", +@@ -3702,6 +3704,27 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], ++ MODEL_ARCH.OPENAI_PRIVACY_FILTER: [ ++ # gpt-oss tensor set, minus the LM head (OUTPUT), plus the ++ # token-classification head (CLS_OUT -> "cls.output"). The encoder ++ # graph ends at output_norm and the TOKEN_CLS pooling attaches the ++ # score head per token (see patch 0001). 
++ MODEL_TENSOR.TOKEN_EMBD, ++ MODEL_TENSOR.OUTPUT_NORM, ++ MODEL_TENSOR.ATTN_NORM, ++ MODEL_TENSOR.ATTN_POST_NORM, ++ MODEL_TENSOR.ATTN_Q, ++ MODEL_TENSOR.ATTN_K, ++ MODEL_TENSOR.ATTN_V, ++ MODEL_TENSOR.ATTN_OUT, ++ MODEL_TENSOR.ATTN_SINKS, ++ MODEL_TENSOR.ROPE_FREQS, ++ MODEL_TENSOR.FFN_GATE_INP, ++ MODEL_TENSOR.FFN_GATE_EXP, ++ MODEL_TENSOR.FFN_DOWN_EXP, ++ MODEL_TENSOR.FFN_UP_EXP, ++ MODEL_TENSOR.CLS_OUT, ++ ], + MODEL_ARCH.LFM2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, +diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py +index 444f0f285..d3e5d1d89 100644 +--- a/gguf-py/gguf/tensor_mapping.py ++++ b/gguf-py/gguf/tensor_mapping.py +@@ -1287,6 +1287,7 @@ class TensorNameMap: + + MODEL_TENSOR.CLS_OUT: ( + "classifier.out_proj", # roberta ++ "score", # openai-privacy-filter (token-classification head) + ), + + MODEL_TENSOR.CLS_NORM: ( +diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp +index b485ac02e..aaf166680 100644 +--- a/src/llama-arch.cpp ++++ b/src/llama-arch.cpp +@@ -135,6 +135,7 @@ static const std::map LLM_ARCH_NAMES = { + { LLM_ARCH_MAINCODER, "maincoder" }, + { LLM_ARCH_KIMI_LINEAR, "kimi-linear" }, + { LLM_ARCH_TALKIE, "talkie" }, ++ { LLM_ARCH_OPENAI_PRIVACY_FILTER, "openai-privacy-filter" }, + { LLM_ARCH_UNKNOWN, "(unknown)" }, + }; + +diff --git a/src/llama-arch.h b/src/llama-arch.h +index b59043e40..edd6b2ad6 100644 +--- a/src/llama-arch.h ++++ b/src/llama-arch.h +@@ -139,6 +139,7 @@ enum llm_arch { + LLM_ARCH_MAINCODER, + LLM_ARCH_KIMI_LINEAR, + LLM_ARCH_TALKIE, ++ LLM_ARCH_OPENAI_PRIVACY_FILTER, + LLM_ARCH_UNKNOWN, + }; + diff --git a/backend/cpp/llama-cpp/patches/0003-convert-openai-privacy-filter.patch b/backend/cpp/llama-cpp/patches/0003-convert-openai-privacy-filter.patch new file mode 100644 index 000000000000..eeb72ad2f138 --- /dev/null +++ b/backend/cpp/llama-cpp/patches/0003-convert-openai-privacy-filter.patch @@ -0,0 +1,178 @@ +diff --git a/conversion/__init__.py b/conversion/__init__.py 
+index 222005740..ab54e15a6 100644 +--- a/conversion/__init__.py ++++ b/conversion/__init__.py +@@ -87,6 +87,7 @@ TEXT_MODEL_MAP: dict[str, str] = { + "GlmMoeDsaForCausalLM": "glm", + "GlmOcrForConditionalGeneration": "glm", + "GptOssForCausalLM": "gpt_oss", ++ "OpenAIPrivacyFilterForTokenClassification": "openai_privacy_filter", + "GraniteForCausalLM": "granite", + "GraniteMoeForCausalLM": "granite", + "GraniteMoeHybridForCausalLM": "granite", +diff --git a/conversion/openai_privacy_filter.py b/conversion/openai_privacy_filter.py +new file mode 100644 +index 000000000..c6e4cf2e3 +--- /dev/null ++++ b/conversion/openai_privacy_filter.py +@@ -0,0 +1,160 @@ ++from __future__ import annotations ++ ++from typing import Iterable, TYPE_CHECKING ++ ++if TYPE_CHECKING: ++ from torch import Tensor ++ ++from .base import ModelBase, gguf ++from .gpt_oss import GptOssModel ++ ++ ++@ModelBase.register("OpenAIPrivacyFilterForTokenClassification") ++class OpenAIPrivacyFilterModel(GptOssModel): ++ # openai/privacy-filter + OpenMed/privacy-filter-multilingual: a gpt-oss MoE ++ # body (8 layers, 14/2 heads, head_dim 64, d_model 640, 128 experts top-4, ++ # o200k vocab, attn sinks, YaRN) re-purposed as a *bidirectional token ++ # classifier*. config.model_type == "openai_privacy_filter". ++ # ++ # We subclass the gpt-oss converter to reuse its vocab and tensor handling, ++ # and override only what differs: ++ # 1. expert gate_up split: CONCATENATED halves, not gpt-oss interleaving; ++ # 2. a token-classification head (score -> cls.output) + TOKEN_CLS pooling; ++ # 3. no LM head (the arch's MODEL_TENSORS omits OUTPUT). ++ # The body is bf16 dense (not MXFP4), so the gpt-oss dense paths are the ones ++ # that run; the MXFP4 repack code in the base is never exercised. ++ model_arch = gguf.MODEL_ARCH.OPENAI_PRIVACY_FILTER ++ ++ def set_gguf_parameters(self): ++ # HF renamed rope_scaling -> rope_parameters for this arch. 
Alias it ++ # before super() so the base TextModel YaRN handling (which keys off ++ # "rope_scaling") still writes the rope KVs. The dict keys match what the ++ # base expects (rope_type=yarn, factor, beta_fast/slow, ++ # original_max_position_embeddings). Verify the rope KVs landed with ++ # gguf_dump as part of parity (patch 0004 / Task 5). ++ if "rope_scaling" not in self.hparams and "rope_parameters" in self.hparams: ++ self.hparams["rope_scaling"] = self.hparams["rope_parameters"] ++ ++ # GptOssModel.set_gguf_parameters writes base text params + ++ # sliding_window + expert_feed_forward_length (= intermediate_size). ++ super().set_gguf_parameters() ++ ++ # Token-classification head. PoolingType.TOKEN_CLS == 5 (patch 0001). ++ # The loader derives n_cls_out from the label count; n_embd_out must ++ # equal it (llama-context asserts n_embd_out() == n_cls_out under ++ # TOKEN_CLS), so we write both from the same ordered label list. ++ labels = self._ordered_labels() ++ self.gguf_writer.add_pooling_type(gguf.PoolingType.TOKEN_CLS) ++ self.gguf_writer.add_classifier_output_labels(labels) ++ self.gguf_writer.add_embedding_length_out(len(labels)) ++ ++ def generate_extra_tensors(self) -> Iterable[tuple[str, "Tensor"]]: ++ # Emit the gpt-oss base's extra tensors (MXFP4 repack; a no-op here since ++ # privacy-filter is dense bf16), then our per-dim RoPE frequency factors. ++ yield from super().generate_extra_tensors() ++ ++ # YaRN with truncate=False. The model's rope_parameters set ++ # truncate=False, but ggml's rope_yarn corr_dims unconditionally ++ # floor()/ceil() the interpolation ramp boundaries. That rounding shifts ++ # the ramp in the transition band (here dims ~20-34), giving a per-dim ++ # frequency error up to ~21% that mis-rotates Q/K, softens attention ++ # (worse at higher positions), and attenuates the final logits. 
Instead ++ # of changing ggml's shared YaRN (which would perturb every other YaRN ++ # model), we bake the *exact* HF inv_freq into per-dim rope_freqs ++ # (freq_factors); the loader disables ggml's YaRN ramp for this arch and ++ # keeps only the YaRN attention mscale (see ++ # src/models/openai-privacy-filter.cpp and patches/README.md). ++ import math ++ import torch ++ ++ rope = self.hparams.get("rope_parameters") or self.hparams.get("rope_scaling") or {} ++ if str(rope.get("rope_type", "")).lower() != "yarn": ++ return ++ ++ dim = int(self.hparams["head_dim"]) ++ base = float(rope.get("rope_theta", self.hparams.get("rope_theta", 10000.0))) ++ factor = float(rope["factor"]) ++ orig = float(rope["original_max_position_embeddings"]) ++ beta_fast = float(rope.get("beta_fast", 32.0)) ++ beta_slow = float(rope.get("beta_slow", 1.0)) ++ truncate = bool(rope.get("truncate", True)) ++ ++ # HF transformers _compute_yarn_parameters (modeling_rope_utils). ++ def correction_dim(num_rotations: float) -> float: ++ return (dim * math.log(orig / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) ++ ++ low, high = correction_dim(beta_fast), correction_dim(beta_slow) ++ if truncate: ++ low, high = math.floor(low), math.ceil(high) ++ low, high = max(low, 0.0), min(high, dim - 1) ++ if low == high: ++ high += 0.001 ++ ++ half = dim // 2 ++ pos_freqs = base ** (torch.arange(0, dim, 2, dtype=torch.float64) / dim) ++ extrap = 1.0 / pos_freqs # high-frequency dims: no scaling ++ interp = 1.0 / (factor * pos_freqs) # low-frequency dims: divide by factor ++ ramp = torch.clamp((torch.arange(half, dtype=torch.float64) - low) / (high - low), 0.0, 1.0) ++ extrap_factor = 1.0 - ramp ++ inv_freq = interp * (1.0 - extrap_factor) + extrap * extrap_factor ++ ++ # ggml divides theta_base (= pos * extrap) by the freq factor, so the ++ # per-dim factor that reproduces inv_freq is extrap / inv_freq (1..factor). 
++ freq_factors = (extrap / inv_freq).to(torch.float32) ++ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), freq_factors) ++ ++ def _ordered_labels(self) -> list[str]: ++ # id2label is {"0": "O", "1": "B-ACCOUNTNAME", ...}; emit in index order ++ # so the GGUF label table row i lines up with score-head output row i. ++ # Keys arrive as int (config parsing coerces them) or str (raw JSON), ++ # so normalize to int before ordering. ++ # 217 labels (multilingual) / 33 (base english), BIOES-encoded. ++ id2label = {int(k): v for k, v in self.hparams["id2label"].items()} ++ return [id2label[i] for i in range(len(id2label))] ++ ++ def modify_tensors(self, data_torch: "Tensor", name: str, bid: int | None) -> Iterable[tuple[str, "Tensor"]]: ++ # Experts: privacy-filter packs the fused gate_up projection as two ++ # CONCATENATED halves (gate = first intermediate_size columns, up = the ++ # rest) rather than gpt-oss's INTERLEAVED even/odd (::2 / 1::2). This is ++ # the single load-bearing divergence from the base converter: an ++ # interleaved split here would silently produce a numerically wrong ++ # model that still loads and runs. ++ # ++ # If per-layer parity vs the HF reference (Task 5) fails at the first ++ # expert FFN, the fix is to revert the two slicings below to the gpt-oss ++ # interleaving (data_torch[:, ::2, :] / [:, 1::2, :] and ++ # [..., ::2] / [..., 1::2]). ++ # ++ # privacy-filter is dense bf16, so we only handle the non-MXFP4 case ++ # (no _blocks/_scales tensors exist). 
++ if "gate_up_proj" in name and "_blocks" not in name and "_scales" not in name: ++ inter = self.hparams["intermediate_size"] # 640 ++ if name.endswith("_bias"): ++ gate_b, up_b = data_torch[..., :inter], data_torch[..., inter:] ++ name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias") ++ name_up = name.replace("gate_up_proj_bias", "up_proj.bias") ++ # bypass GptOssModel.modify_tensors (interleaved) -> TextModel ++ yield from super(GptOssModel, self).modify_tensors(gate_b, name_gate, bid) ++ yield from super(GptOssModel, self).modify_tensors(up_b, name_up, bid) ++ return ++ # weight: HF stores [E, in, 2*inter]; transpose to [E, 2*inter, in] ++ # then split the output dim into the two contiguous halves. ++ data_torch = data_torch.transpose(-1, -2) ++ gate_w, up_w = data_torch[:, :inter, :], data_torch[:, inter:, :] ++ name_gate = name.replace("gate_up_proj", "gate_proj.weight") ++ name_up = name.replace("gate_up_proj", "up_proj.weight") ++ yield from super(GptOssModel, self).modify_tensors(gate_w, name_gate, bid) ++ yield from super(GptOssModel, self).modify_tensors(up_w, name_up, bid) ++ return ++ ++ # Everything else converts correctly via the base: ++ # - down_proj (dense): GptOssModel.modify_tensors does the rename + ++ # transpose; ++ # - q/k/v/o (+biases), attn sinks, router (+bias), norms, embeddings: ++ # GptOssModel / its filter_tensors handle these; ++ # - score.{weight,bias}: fall through to TextModel.map_tensor_name, ++ # which maps "score" -> cls.output via the tensor_mapping.py entry ++ # added in patch 0002. ++ # We never emit an LM head: tie_word_embeddings is false and the arch's ++ # MODEL_TENSORS list omits MODEL_TENSOR.OUTPUT, so nothing expects one. 
++ yield from super().modify_tensors(data_torch, name, bid) diff --git a/backend/cpp/llama-cpp/patches/0004-graph-openai-privacy-filter.patch b/backend/cpp/llama-cpp/patches/0004-graph-openai-privacy-filter.patch new file mode 100644 index 000000000000..f9de3578e86a --- /dev/null +++ b/backend/cpp/llama-cpp/patches/0004-graph-openai-privacy-filter.patch @@ -0,0 +1,285 @@ +diff --git a/src/llama-model.cpp b/src/llama-model.cpp +index 3e236f8c1..465641b63 100644 +--- a/src/llama-model.cpp ++++ b/src/llama-model.cpp +@@ -257,6 +257,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params + return new llama_model_smollm3(params); + case LLM_ARCH_OPENAI_MOE: + return new llama_model_openai_moe(params); ++ case LLM_ARCH_OPENAI_PRIVACY_FILTER: ++ return new llama_model_openai_privacy_filter(params); + case LLM_ARCH_FALCON_H1: + return new llama_model_falcon_h1(params); + case LLM_ARCH_LFM2: +@@ -1794,7 +1796,7 @@ void llama_model::print_info() const { + LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); + } + +- if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) { ++ if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1 || arch == LLM_ARCH_OPENAI_PRIVACY_FILTER) { + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + } + +@@ -2315,6 +2317,10 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { + case LLM_ARCH_LLAMA_EMBED: + case LLM_ARCH_MAINCODER: + case LLM_ARCH_GLM_DSA: ++ // openai-privacy-filter uses the interleaved (GPT-J) rope layout ++ // (_apply_rotary_emb pairs x[..., ::2]/x[..., 1::2]), unlike gpt-oss ++ // (OPENAI_MOE) which uses NEOX rotate-half. See patches/README.md. 
++ case LLM_ARCH_OPENAI_PRIVACY_FILTER: + return LLAMA_ROPE_TYPE_NORM; + + // the pairs of head values are offset by n_rot/2 +diff --git a/src/models/models.h b/src/models/models.h +index 5251e2d82..ab78f4bdc 100644 +--- a/src/models/models.h ++++ b/src/models/models.h +@@ -1591,6 +1591,22 @@ struct llama_model_openai_moe : public llama_model_base { + }; + + ++// openai/privacy-filter token classifier: gpt-oss MoE body re-purposed as a ++// bidirectional NER encoder with a per-token classification head (see ++// src/models/openai-privacy-filter.cpp). ++struct llama_model_openai_privacy_filter : public llama_model_base { ++ llama_model_openai_privacy_filter(const struct llama_model_params & params) : llama_model_base(params) {} ++ void load_arch_hparams(llama_model_loader & ml) override; ++ void load_arch_tensors(llama_model_loader & ml) override; ++ ++ struct graph : public llm_graph_context { ++ graph(const llama_model & model, const llm_graph_params & params); ++ }; ++ ++ std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; ++}; ++ ++ + struct llama_model_falcon_h1 : public llama_model_base { + llama_model_falcon_h1(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; +diff --git a/src/models/openai-privacy-filter.cpp b/src/models/openai-privacy-filter.cpp +new file mode 100644 +index 000000000..9e40391db +--- /dev/null ++++ b/src/models/openai-privacy-filter.cpp +@@ -0,0 +1,219 @@ ++#include "models.h" ++ ++// openai/privacy-filter + OpenMed/privacy-filter-multilingual. ++// ++// A gpt-oss MoE *body* (8 layers, 14/2 heads, head_dim 64, d_model 640, ++// 128 experts top-4, o200k vocab, attention sinks, YaRN) re-purposed as a ++// BIDIRECTIONAL token classifier. The body is identical to llama_model_openai_moe; ++// the differences are all "this is an encoder with a per-token classification ++// head", not "the transformer block is different": ++// ++// 1. 
non-causal attention over a SYMMETRIC sliding-window band, no KV cache ++// (build_attn_inp_no_cache instead of the kv_iswa input); ++// 2. every layer is windowed (uniform band) — no alternating dense layers; ++// 3. NO build_inp_out_ids() pruning: a token classifier needs a logit for ++// every token, so we must not collapse to the last token; ++// 4. NO LM head. The graph stops at the per-token hidden states (res->t_embd); ++// the framework then calls build_pooling(), which under pooling_type == ++// TOKEN_CLS applies model.cls_out (+cls_out_b) to each token to produce ++// [n_cls_out, n_tokens] logits (carry-patch 0001). ++ ++void llama_model_openai_privacy_filter::load_arch_hparams(llama_model_loader & ml) { ++ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ++ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ++ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ++ ++ // Bidirectional encoder. ++ hparams.causal_attn = false; ++ ++ // The HF sliding_window is the window half-width (a token attends to ++ // ±sliding_window). LLAMA_SWA_TYPE_SYMMETRIC masks |p1 - p0| > n_swa/2, ++ // so n_swa = 2 * sliding_window reproduces that ±window. ++ hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC; ++ hparams.n_swa = 2 * hparams.n_swa; ++ ++ // Uniform band: every layer is windowed (unlike gpt-oss's alternating ++ // dense/SWA pattern). set_swa_pattern(0) marks all layers SWA, so the ++ // no-cache attn path uses the symmetric mask on every layer. ++ hparams.set_swa_pattern(0); ++ ++ // RoPE: the model uses YaRN with truncate=false, but ggml's rope_yarn ++ // floor()/ceil()s the interpolation-ramp boundaries, which mis-rotates Q/K ++ // in the transition band (softened attention, attenuated logits). We bake ++ // the exact HF inv_freq into per-dim rope_freqs (freq_factors) in the ++ // converter, and here disable ggml's YaRN ramp while keeping only the YaRN ++ // attention mscale. 
With ramp off, freq_factors fully define the per-dim ++ // frequencies; the kernel divides theta by them regardless of ext_factor. ++ const float yarn_factor = hparams.rope_freq_scale_train > 0.0f ++ ? 1.0f / hparams.rope_freq_scale_train : 1.0f; ++ hparams.rope_attn_factor = 1.0f + 0.1f * logf(yarn_factor); // YaRN mscale (get_mscale, mscale=1) ++ hparams.rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE; // no ramp; rope_freqs carry the scaling ++ ++ // Every layer is SWA, so the graph reads the *_swa rope params. Base stays ++ // at the trained value; scale is 1.0 (all per-dim scaling lives in ++ // rope_freqs now, not in a global freq_scale). ++ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; ++ hparams.rope_freq_scale_train_swa = 1.0f; ++ ++ // No dedicated size label — the privacy-filter configs are tiny ++ // (8 layers) and don't match any LLM_TYPE bucket. ++ type = LLM_TYPE_UNKNOWN; ++} ++ ++void llama_model_openai_privacy_filter::load_arch_tensors(llama_model_loader &) { ++ LLAMA_LOAD_LOCALS; ++ ++ const int64_t n_ff_exp = hparams.n_ff_exp; ++ ++ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); ++ ++ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); ++ ++ // Token-classification head (no LM head). The converter maps the HF ++ // `score.{weight,bias}` to `cls.output.*` (carry-patch 0002), and the ++ // arch's MODEL_TENSORS omits OUTPUT. n_cls_out is derived from the ++ // classifier label table during the generic hparams load. 
++ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, 0); ++ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, 0); ++ ++ for (int i = 0; i < n_layer; ++i) { ++ auto & layer = layers[i]; ++ ++ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); ++ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); ++ ++ create_tensor_qkv(layer, i, n_embd, n_head * n_rot, n_head_kv * n_rot, n_head_kv * n_rot, 0); ++ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0); ++ ++ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0); ++ ++ // Per-dim RoPE frequency factors (single shared "rope_freqs.weight"; ++ // ROPE_FREQS has no per-layer name, so every layer resolves the same ++ // tensor). Carries the exact HF YaRN inv_freq; see load_arch_hparams. ++ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); ++ ++ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0); ++ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); ++ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); ++ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); ++ ++ layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); ++ ++ layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0); ++ layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0); ++ layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0); ++ layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0); ++ } ++} ++ ++std::unique_ptr llama_model_openai_privacy_filter::build_arch_graph(const llm_graph_params & params) const { ++ return std::make_unique(*this, params); ++} ++ ++llama_model_openai_privacy_filter::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { ++ ggml_tensor * cur; ++ ggml_tensor * inpL; ++ ++ inpL = build_inp_embd(model.tok_embd); ++ ++ // inp_pos - contains the positions (used by RoPE) ++ ggml_tensor * inp_pos = build_inp_pos(); ++ ++ // Bidirectional encoder: no KV cache, non-causal. The symmetric ++ // sliding-window band is applied via the no-cache SWA mask, which ++ // build_attn_inp_no_cache() allocates because swa_type != NONE. ++ auto * inp_attn = build_attn_inp_no_cache(); ++ ++ // NOTE: deliberately no build_inp_out_ids() / ggml_get_rows() pruning — ++ // a token classifier produces a logit per token, so every position must ++ // survive to the classification head. 
++ ++ for (int il = 0; il < n_layer; ++il) { ++ const float freq_base_l = model.get_rope_freq_base (cparams, il); ++ const float freq_scale_l = model.get_rope_freq_scale(cparams, il); ++ ++ ggml_tensor * inpSA = inpL; ++ ++ // Per-dim YaRN frequency factors (see load_arch_hparams): bypasses ++ // ggml's truncate=true ramp and reproduces the HF inv_freq exactly. ++ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); ++ ++ // norm ++ cur = build_norm(inpL, ++ model.layers[il].attn_norm, nullptr, ++ LLM_NORM_RMS, il); ++ cb(cur, "attn_norm", il); ++ ++ // self-attention ++ { ++ auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, ++ n_rot, n_head, n_head_kv, il); ++ ++ Qcur = ggml_rope_ext( ++ ctx0, Qcur, inp_pos, rope_factors, ++ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ++ ext_factor, attn_factor, beta_fast, beta_slow ++ ); ++ ++ Kcur = ggml_rope_ext( ++ ctx0, Kcur, inp_pos, rope_factors, ++ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ++ ext_factor, attn_factor, beta_fast, beta_slow ++ ); ++ ++ cb(Qcur, "Qcur", il); ++ cb(Kcur, "Kcur", il); ++ cb(Vcur, "Vcur", il); ++ ++ cur = build_attn(inp_attn, ++ model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s, ++ Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il); ++ ++ cb(cur, "attn_out", il); ++ } ++ ++ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); ++ cb(ffn_inp, "ffn_inp", il); ++ ++ cur = build_norm(ffn_inp, ++ model.layers[il].attn_post_norm, nullptr, ++ LLM_NORM_RMS, il); ++ cb(cur, "attn_post_norm", il); ++ ++ // MoE branch ++ cur = build_moe_ffn(cur, ++ model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b, ++ model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b, ++ model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b, ++ model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b, ++ nullptr, ++ n_expert, n_expert_used, ++ LLM_FFN_SWIGLU_OAI_MOE, false, ++ 
hparams.expert_weights_scale, ++ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT, ++ il); ++ cb(cur, "ffn_moe_out", il); ++ ++ cur = ggml_add(ctx0, cur, ffn_inp); ++ ++ cur = build_cvec(cur, il); ++ cb(cur, "l_out", il); ++ ++ // input for next layer ++ inpL = cur; ++ } ++ cur = inpL; ++ ++ cur = build_norm(cur, ++ model.output_norm, NULL, ++ LLM_NORM_RMS, -1); ++ ++ cb(cur, "result_norm", -1); ++ ++ // Stop at the per-token hidden states. The framework calls ++ // build_pooling() next; under pooling_type == TOKEN_CLS it applies ++ // model.cls_out (+cls_out_b) to every token (carry-patch 0001). ++ res->t_embd = cur; ++ ++ ggml_build_forward_expand(gf, cur); ++} diff --git a/backend/cpp/llama-cpp/patches/0005-no-cache-all-swa-mask-fix.patch b/backend/cpp/llama-cpp/patches/0005-no-cache-all-swa-mask-fix.patch new file mode 100644 index 000000000000..b712b3c127eb --- /dev/null +++ b/backend/cpp/llama-cpp/patches/0005-no-cache-all-swa-mask-fix.patch @@ -0,0 +1,34 @@ +--- a/src/llama-graph.cpp ++++ b/src/llama-graph.cpp +@@ -463,16 +463,24 @@ + } + }; + ++ // The non-SWA (full) and SWA masks are separate graph inputs, but a ++ // given model may consume only one of them: an encoder where *every* ++ // layer is SWA (e.g. the openai-privacy-filter token classifier, whose ++ // every layer uses a symmetric sliding window) never references the full ++ // mask, so the graph allocator leaves self_kq_mask unallocated (null ++ // buffer). Only fill a mask that actually got a buffer — filling an ++ // unallocated input would write through a null data pointer. 
+ GGML_ASSERT(self_kq_mask); +- GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer)); +- if (self_kq_mask->type == GGML_TYPE_F16) { +- fill_mask((ggml_fp16_t *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE); +- } else { +- fill_mask((float *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE); ++ if (self_kq_mask->buffer) { ++ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer)); ++ if (self_kq_mask->type == GGML_TYPE_F16) { ++ fill_mask((ggml_fp16_t *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE); ++ } else { ++ fill_mask((float *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE); ++ } + } + +- if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { +- GGML_ASSERT(self_kq_mask_swa); ++ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE && self_kq_mask_swa && self_kq_mask_swa->buffer) { + GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer)); + if (self_kq_mask_swa->type == GGML_TYPE_F16) { + fill_mask((ggml_fp16_t *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type); diff --git a/backend/cpp/llama-cpp/patches/README.md b/backend/cpp/llama-cpp/patches/README.md new file mode 100644 index 000000000000..cc60f26aa3c1 --- /dev/null +++ b/backend/cpp/llama-cpp/patches/README.md @@ -0,0 +1,183 @@ +# llama.cpp carry-patches + +`prepare.sh` applies every file in this directory to the freshly-cloned +`llama.cpp/` tree with `patch -p1`, in lexical order, before the grpc-server +sources are copied in. Keep patches small, ordered, and documented here. + +## 0001-token-cls-pooling-substrate.patch + +**What:** adds a per-token classification pooling path to llama.cpp: +`LLAMA_POOLING_TYPE_TOKEN_CLS` (= 5). 
Under this pooling type `build_pooling` +applies the model's `cls_out` (+`cls_out_b`) head to **every** token instead of +to a single pooled vector, and `llama_context::{encode,decode}` copy the +resulting `[n_cls_out, n_tokens]` logits into the embeddings buffer +(`llama_get_embeddings_ith(i)` then returns the `n_cls_out` logits for token +`i`). The `--pooling token-cls` CLI flag and the `llama-embedding` example are +taught to treat it as token-level (like `none`). + +This is the substrate the `openai-privacy-filter` token-classifier arch needs +(patches 0002–0004): the encoder graph ends at `result_norm` and lets the +framework attach the score head per token. + +**Provenance:** a reduced subset of upstream PR +[ggml-org/llama.cpp#19725](https://github.com/ggml-org/llama.cpp/pull/19725) +("llama: add BertForTokenClassification support"). We carry **only** the +pooling-mechanism hunks (`include/llama.h`, `src/llama-graph.cpp`, +`src/llama-context.cpp`, `common/arg.cpp`, `examples/embedding/embedding.cpp`, +`gguf-py/gguf/constants.py`). We deliberately drop the PR's BERT/WPM-specific +parts (the `convert_hf_to_gguf.py` BertModel changes — our converter is its own +`conversion/openai_privacy_filter.py`; and the WPM `do_lower_case` tokenizer +plumbing — privacy-filter uses o200k BPE, not WordPiece). The prerequisites the +substrate assumes (`gguf_writer.add_embedding_length_out` / +`add_classifier_output_labels`, `hparams.n_cls_out` / `n_embd_out()`, +`model.cls_out` / `cls_out_b`) already exist in the pinned tree. + +**Re-sync:** PR #19725 is still OPEN; if it changes under review, re-diff. +If/when we upstream the `openai-privacy-filter` arch we will depend on TOKEN_CLS +having landed (or keep carrying this). + +**Version note:** authored against `d6588daa8`; re-verified (line-offset +only) against the current pin `5dcb71166`. See the consolidated version +note at the bottom of this file. 
+ +## 0002-arch-openai-privacy-filter.patch + +**What:** registers the `openai-privacy-filter` architecture (matching the +model's `config.model_type == "openai_privacy_filter"`): +- `src/llama-arch.h` / `.cpp`: `LLM_ARCH_OPENAI_PRIVACY_FILTER` + name string. + No per-arch tensor-name table is needed — this llama.cpp uses a single global + `LLM_TENSOR_NAMES` map, and every tensor we use (incl. `cls.output`, + `attn_sinks`) is already in it. +- `gguf-py/gguf/constants.py`: `MODEL_ARCH.OPENAI_PRIVACY_FILTER`, its name, and + a `MODEL_TENSORS` list = the gpt-oss set **minus `OUTPUT`** (no LM head) + **plus `CLS_OUT`** (the score head). +- `gguf-py/gguf/tensor_mapping.py`: maps HF `score` → `MODEL_TENSOR.CLS_OUT`, so + `score.{weight,bias}` convert to `cls.output.{weight,bias}`. + +The loader/graph for the arch (`llama-model.cpp`, `src/models/…`) come in 0004. +`patch -p1 --dry-run` clean atop 0001 against the current pin `5dcb71166`. + +## 0003-convert-openai-privacy-filter.patch + +**What:** the HF→GGUF converter. Adds `conversion/openai_privacy_filter.py` +(`OpenAIPrivacyFilterModel`, a `GptOssModel` subclass) and registers it in +`conversion/__init__.py` (`OpenAIPrivacyFilterForTokenClassification` → +`openai_privacy_filter`). It reuses the gpt-oss vocab and tensor handling and +overrides only: +- **expert `gate_up` split** — privacy-filter packs gate/up as **concatenated + halves** (`chunk(2)`), *not* gpt-oss's interleaved `::2`/`1::2`. This is the + one load-bearing divergence; a wrong split yields a silently-wrong model. + **Confirmed correct by per-layer parity** (full-logit cos = 1.0 vs HF; the FFN + out matched once the attention upstream was fixed — see below). +- **per-dim RoPE frequency factors** (`generate_extra_tensors`) — the model's + `rope_parameters` set YaRN with `truncate: false`, but ggml's `rope_yarn` + unconditionally `floor()/ceil()`s the interpolation-ramp boundaries. 
That + rounding shifts the ramp in the transition band (here dims ~20–34), a per-dim + frequency error up to ~21% that mis-rotates Q/K and softens attention. Rather + than change ggml's shared YaRN (which would perturb every other YaRN model), + the converter computes HF's *exact* `inv_freq` (truncate=false) and writes + `rope_freqs.weight = extrap / inv_freq` (1.0 … factor). The loader (0004) + then disables ggml's YaRN ramp and keeps only the YaRN attention mscale, so + these freq-factors fully define the per-dim frequencies. +- **token-classification head** — writes `pooling_type = TOKEN_CLS`, the ordered + `id2label` table (`add_classifier_output_labels`), and `n_embd_out = + len(labels)` (= n_cls_out). `score.{weight,bias}` map to `cls.output.*` via + the 0002 `tensor_mapping` entry; no LM head is emitted. +- aliases `rope_parameters` → `rope_scaling` so the base YaRN handling fires + (this arch renamed the key). + +Everything else (down_proj, q/k/v/o + biases, attn sinks, router, norms, +embeddings) converts via the gpt-oss base unchanged. + +**Validated end-to-end** against the real `OpenMed/privacy-filter-multilingual` +weights: `convert_hf_to_gguf.py` produces a 156-tensor F16 GGUF whose metadata +is correct — `general.architecture = openai-privacy-filter`, `pooling_type = 5` +(TOKEN_CLS), 217 `classifier.output_labels`, `embedding_length_out = 217`, +`cls.output.{weight 640×217, bias 217}`, rope `yarn`/factor 32/orig_ctx 4096/ +freq_base 150000, experts 128/4, sliding_window 128. The only thing the GGUF +structure can't confirm is the gate_up *packing order* — that's a numeric check +deferred to per-layer parity (Task 5). + +Repro (one-shot env; torch needs a 64-bit libstdc++ on `LD_LIBRARY_PATH` under +nix): `pip install torch numpy safetensors sentencepiece protobuf transformers` +into a venv, then +`PYTHONPATH=gguf-py python convert_hf_to_gguf.py --outtype f16`. 
+ +## 0004-graph-openai-privacy-filter.patch + +**What:** the model class, graph, and loader wiring for the +`openai-privacy-filter` arch. Adds `src/models/openai-privacy-filter.cpp` +(`llama_model_openai_privacy_filter` — `load_arch_hparams` / +`load_arch_tensors` / `build_arch_graph` + nested `graph`), its `struct` +declaration in `src/models/models.h`, and wiring sites in +`src/llama-model.cpp` (the factory `case`, the **NORM** `rope_type` list, and +the `n_ff_exp` info-log condition). No `CMakeLists.txt` change — model +sources are gathered by `file(GLOB "models/*.cpp")`. + +The graph is the `llama_model_openai_moe` body re-purposed as a +bidirectional token classifier: +- `load_arch_hparams` sets `causal_attn = false`, `swa_type = + LLAMA_SWA_TYPE_SYMMETRIC`, `n_swa = 2 * sliding_window` (SYMMETRIC masks + `|p1-p0| > n_swa/2`, so the HF half-width window round-trips), and + `set_swa_pattern(0)` so **every** layer is windowed (uniform band, no + alternating dense layers). +- the graph uses `build_attn_inp_no_cache()` (no KV cache; the no-cache + input allocates the SWA mask because `swa_type != NONE`), passes the + per-layer `attn_sinks` to `build_attn`, and **omits `build_inp_out_ids()` + pruning** so every token keeps a logit. +- **RoPE.** privacy-filter uses the **interleaved (GPT-J) rope layout** + (`_apply_rotary_emb` pairs `x[..., ::2]/x[..., 1::2]`), so the arch returns + `LLAMA_ROPE_TYPE_NORM` — *unlike* gpt-oss (`OPENAI_MOE`), which uses NEOX + rotate-half. (This was the dominant parity bug: NEOX mis-pairs the rotated + dims, leaving a per-token cos ≈ 0.82 that no frequency tweak could fix.) 
+ `load_arch_hparams` also bakes the YaRN `truncate=false` fix: it sets + `rope_scaling_type_train = NONE` (disables ggml's floor/ceil YaRN ramp), keeps the + YaRN mscale via `rope_attn_factor = 1 + 0.1·ln(factor)`, sets the SWA + freq-scale to 1.0, and the graph passes the per-layer `rope_freqs` + (loaded from `rope_freqs.weight`, written by 0003) into `ggml_rope_ext` so + the per-dim frequencies reproduce HF exactly. +- it ends at `res->t_embd` (no LM head). The framework then calls + `build_pooling()`, which under `pooling_type == TOKEN_CLS` applies + `cls_out`/`cls_out_b` per token (carry-patch 0001). `load_arch_tensors` + loads `cls.output.{weight,bias}`, the per-layer `rope_freqs`, and no + `output`/LM head. + +**Parity: solved.** Against `OpenMed/privacy-filter-multilingual` at F16, the +new arch matches the HF reference token-for-token (12/12 argmax, full-logit +cosine = 1.0; every layer's residual stream cos = 1.0, relerr ≈ 2e-4 = F16 +rounding), including the e-mail BIOES span. Verified on the real +`llama-embedding` binary (model-default TOKEN_CLS pooling — do **not** pass +`--pooling none`, which overrides it). The two parity-gated assumptions — +`n_swa = 2 * sliding_window` and 0003's gate_up packing — are both confirmed +correct. All five patches apply, in order, against `5dcb71166`. + +## 0005-no-cache-all-swa-mask-fix.patch + +**What:** a robustness fix to `llm_graph_input_attn_no_cache::set_input` +(`src/llama-graph.cpp`). The no-cache attention input creates two mask +tensors — the full (non-SWA) mask and, when `swa_type != NONE`, the SWA +mask — but a model may consume only one. The openai-privacy-filter encoder +makes **every** layer SWA (uniform symmetric window), so the full +`self_kq_mask` is never referenced by the graph and the allocator leaves it +unallocated (null buffer). The stock `set_input` unconditionally fills it, +dereferencing a null `->data` and aborting at +`GGML_ASSERT(ggml_backend_buffer_is_host(...))`. 
The fix only fills a mask +that actually received a buffer (and guards the SWA branch symmetrically). + +This is a general fix — any all-SWA no-cache (encoder) model needs it — and +is a candidate to upstream separately. Without it the model loads but +aborts on first `decode`. Discovered via the CPU smoke test (it loads, +tokenizes 12 tokens, then aborts in `set_input`); after the fix the model +runs and produces `[n_cls_out, n_tokens]` logits. With the RoPE fixes in +0003/0004 the per-token logits now match the HF reference exactly (12/12 +argmax, full-logit cosine = 1.0). + +--- + +**Version note (applies to all patches here):** patches 0001–0003 were +originally authored against `d6588daa8`; after LocalAI bumped `Makefile` +`LLAMA_VERSION` to `5dcb71166686799f0d873eab7386234302d05ecf` (upstream +#10128) all patches were regenerated and re-verified against that commit. All +five apply in order with `patch -p1` (no fuzz, no rejected hunks) and the +result compiles and reaches full HF parity. Re-run the apply check after any +further `LLAMA_VERSION` bump. 
diff --git a/backend/index.yaml b/backend/index.yaml index 37e6890710e4..d2ced5d356f6 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -1557,6 +1557,7 @@ - localai/localai-backends:master-metal-darwin-arm64-kitten-tts - !!merge <<: *local-store name: "local-store-development" + alias: "local-store" uri: "quay.io/go-skynet/local-ai-backends:master-cpu-local-store" mirrors: - localai/localai-backends:master-cpu-local-store @@ -1567,6 +1568,7 @@ - localai/localai-backends:latest-metal-darwin-arm64-local-store - !!merge <<: *local-store name: "metal-local-store-development" + alias: "local-store" uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-local-store" mirrors: - localai/localai-backends:master-metal-darwin-arm64-local-store diff --git a/backend/python/transformers/backend.py b/backend/python/transformers/backend.py index a8c1840b3c46..4b251c7f1dca 100644 --- a/backend/python/transformers/backend.py +++ b/backend/python/transformers/backend.py @@ -270,10 +270,17 @@ def LoadModel(self, request, context): def TokenClassify(self, request, context): # Runs HuggingFace's token-classification pipeline and returns - # the aggregated entity spans. The pipeline gives us byte - # offsets via aggregation_strategy="simple" (set at load - # time), so the caller can slice the original text without - # re-tokenising on the Go side. + # the aggregated entity spans. + # + # OFFSET UNITS: the proto contract (TokenClassifyEntity.start/end) + # is UTF-8 BYTE offsets into request.text. HuggingFace's pipeline, + # however, reports start/end as CODEPOINT offsets into the Python + # str (derived from the fast tokenizer's offset_mapping). Those + # coincide only for ASCII; for any multi-byte character they + # diverge — and this entry point exists to serve the explicitly + # multilingual privacy-filter model, so the conversion is + # mandatory, not a nicety. 
We build one prefix table mapping each + # codepoint index to its byte offset and translate every span. if not getattr(self, "TokenClassification", False): context.set_code(grpc.StatusCode.FAILED_PRECONDITION) context.set_details("model was not loaded as Type=TokenClassification") @@ -286,18 +293,50 @@ def TokenClassify(self, request, context): context.set_details(f"token-classification failed: {err}") return backend_pb2.TokenClassifyResponse() + text = request.text + # byte_at[i] = byte length of text[:i]; len == len(text)+1 so an + # exclusive end offset that points one past the last codepoint + # maps to len(text.encode("utf-8")). Built in a single O(n) pass. + byte_at = [0] * (len(text) + 1) + acc = 0 + for i, ch in enumerate(text): + byte_at[i] = acc + acc += len(ch.encode("utf-8")) + byte_at[len(text)] = acc + + def to_byte(cp_index, default): + # Clamp out-of-range codepoint indices into the table rather + # than throwing: a span we can't place is better dropped Go-side + # than crashing the RPC. + if cp_index is None: + cp_index = default + if cp_index < 0: + cp_index = 0 + elif cp_index > len(text): + cp_index = len(text) + return byte_at[cp_index] + threshold = request.threshold if request.threshold > 0 else 0.0 entities = [] for r in results: score = float(r.get("score", 0.0)) if score < threshold: continue + cp_start = r.get("start") + cp_end = r.get("end") + start = to_byte(cp_start, 0) + end = to_byte(cp_end, 0) entities.append(backend_pb2.TokenClassifyEntity( entity_group=str(r.get("entity_group") or r.get("entity") or ""), - start=int(r.get("start", 0)), - end=int(r.get("end", 0)), + start=start, + end=end, score=score, - text=str(r.get("word", "")), + # Slice the original text by the (codepoint) span so the + # echoed text matches start..end exactly, instead of the + # pipeline's reconstructed "word" which can carry wordpiece + # artifacts. Falls back to "word" when offsets are absent. 
+ text=(text[cp_start:cp_end] if cp_start is not None and cp_end is not None + else str(r.get("word", ""))), )) return backend_pb2.TokenClassifyResponse(entities=entities) diff --git a/core/application/application.go b/core/application/application.go index 29e05b6d1900..d5c286318dbd 100644 --- a/core/application/application.go +++ b/core/application/application.go @@ -12,14 +12,15 @@ import ( "github.com/mudler/LocalAI/core/http/auth" mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp" "github.com/mudler/LocalAI/core/services/agentpool" + "github.com/mudler/LocalAI/core/services/cloudproxy/mitm" "github.com/mudler/LocalAI/core/services/facerecognition" "github.com/mudler/LocalAI/core/services/galleryop" "github.com/mudler/LocalAI/core/services/monitoring" "github.com/mudler/LocalAI/core/services/nodes" "github.com/mudler/LocalAI/core/services/routing/admission" "github.com/mudler/LocalAI/core/services/routing/billing" - "github.com/mudler/LocalAI/core/services/cloudproxy/mitm" "github.com/mudler/LocalAI/core/services/routing/pii" + "github.com/mudler/LocalAI/core/services/routing/piidetector" "github.com/mudler/LocalAI/core/services/routing/router" "github.com/mudler/LocalAI/core/services/voicerecognition" "github.com/mudler/LocalAI/core/templates" @@ -71,15 +72,15 @@ type Application struct { // 1-to-1 host↔model invariant the dispatcher relies on. Read by // /api/middleware/status so the admin UI can surface the cause. 
mitmHostConflicts atomic.Pointer[map[string][]string] - routerDecisions router.DecisionStore - routerRegistry *router.Registry - admissionLimiter *admission.Limiter - watchdogMutex sync.Mutex - watchdogStop chan bool - p2pMutex sync.Mutex - p2pCtx context.Context - p2pCancel context.CancelFunc - agentJobMutex sync.Mutex + routerDecisions router.DecisionStore + routerRegistry *router.Registry + admissionLimiter *admission.Limiter + watchdogMutex sync.Mutex + watchdogStop chan bool + p2pMutex sync.Mutex + p2pCtx context.Context + p2pCancel context.CancelFunc + agentJobMutex sync.Mutex // Distributed mode services (nil when not in distributed mode) distributed *DistributedServices @@ -254,6 +255,122 @@ func (a *Application) PIIEvents() pii.EventStore { return a.piiEvents } +// PIINERResolver returns the resolver the chat PII middleware uses to +// turn a configured detector model name into a ready-to-use NERConfig: +// a token-classifier bound over the shared model loader (lazy — the +// model loads on first Detect) plus the detection policy read from that +// model's own pii_detection block. Unknown names resolve to (zero, +// false) so the middleware fails closed. Pass it via pii.WithNERResolver. +func (a *Application) PIINERResolver() pii.NERDetectorResolver { + return func(modelName string) (pii.NERConfig, bool) { + if modelName == "" { + return pii.NERConfig{}, false + } + cfg, ok := a.ModelConfigLoader().GetModelConfig(modelName) + if !ok { + return pii.NERConfig{}, false + } + + // Pattern detectors match secrets with the restricted-regex tier + // in-process (no backend load). Build a pattern matcher instead of the + // gRPC token-classifier; on a compile error fail closed with an error + // detector so the request is blocked, not silently unscanned. 
+ if cfg.IsPatternDetector() { + det, err := piidetector.NewPattern(cfg, a.ApplicationConfig()) + if err != nil { + det = pii.NewErrNERDetector(err.Error()) + } + return pii.NERConfigFromRaw( + det, + 0, // patterns are deterministic — no confidence floor + cfg.PIIDetectionDefaultAction(), + patternEntityActions(cfg), + pii.SourcePattern, + ), true + } + + det := piidetector.New(a.ModelLoader(), cfg, a.ApplicationConfig()) + return pii.NERConfigFromRaw( + det, + cfg.PIIDetectionMinScore(), + cfg.PIIDetectionDefaultAction(), + cfg.PIIDetectionEntityActions(), + pii.SourceNER, + ), true + } +} + +// patternEntityActions merges a pattern detector's per-pattern Action overrides +// into its entity_actions map. A pattern reports matches under its Name, so a +// per-pattern action is just an entity_actions[Name] entry; explicit +// entity_actions still win if both are set. +func patternEntityActions(cfg config.ModelConfig) map[string]string { + out := cfg.PIIDetectionEntityActions() + for _, p := range cfg.PIIDetection.Patterns { + if p.Action == "" || p.Name == "" { + continue + } + if out == nil { + out = map[string]string{} + } + if _, exists := out[p.Name]; !exists { + out[p.Name] = p.Action + } + } + return out +} + +// ResolvePIIPolicy resolves the effective request-side PII policy for a +// consuming model, layering the instance-wide default detector +// (PIIDefaultDetectors, set via POST /api/settings) on top of the per-model +// config. It is the single decision point shared by the chat middleware (via +// WithPolicyResolver) and the MITM listener so both agree. +// +// - enabled: an explicit pii.enabled on the model always wins (true OR +// false). Otherwise PII is on when the backend defaults it on — today +// that means cloud-proxy models, which cross the network to a third party. +// - detectors: the model's own pii.detectors, or — when it lists none — the +// global PIIDefaultDetectors fallback. 
This is what makes cloud-proxy/MITM +// redaction work out of the box. +// +// appConfig is read live, so changes via the settings API take effect on the +// next request without a restart. +func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, detectors []string) { + if cfg == nil { + return false, nil + } + appCfg := a.ApplicationConfig() + + if cfg.PII.Enabled != nil { + enabled = *cfg.PII.Enabled + } else { + enabled = cfg.PIIIsEnabled() // backend default (cloud-proxy) + } + if !enabled { + return false, nil + } + + detectors = cfg.PIIDetectors() + if len(detectors) == 0 { + detectors = append([]string(nil), appCfg.PIIDefaultDetectors...) + } + return enabled, detectors +} + +// PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for +// pii.WithPolicyResolver. The middleware carries the resolved model config as +// `any` (the MODEL_CONFIG context value, a *config.ModelConfig); this asserts +// it back and applies the instance-wide defaults. +func (a *Application) PIIPolicyResolver() pii.PolicyResolver { + return func(modelCfg any) (bool, []string) { + cfg, ok := modelCfg.(*config.ModelConfig) + if !ok { + return false, nil + } + return a.ResolvePIIPolicy(cfg) + } +} + // MITMCA returns the cloudproxy MITM proxy's CA, or nil when the // MITM listener is disabled. func (a *Application) MITMCA() *mitm.CA { return a.mitmCA.Load() } diff --git a/core/application/mitm.go b/core/application/mitm.go index 293b3d449c20..cb1ab3993c8c 100644 --- a/core/application/mitm.go +++ b/core/application/mitm.go @@ -8,9 +8,33 @@ import ( "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/services/cloudproxy/mitm" + "github.com/mudler/LocalAI/core/services/routing/pii" "github.com/mudler/xlog" ) +// startMITMIfConfigured brings up the cloudproxy MITM listener when an +// address is configured, treating any startup failure as non-fatal. 
+// +// The listener is opt-in middleware whose address is persisted in runtime +// settings (/api/settings → runtime_settings.json) and replayed on every +// boot. A bad value — e.g. a host the process can't bind, like a LAN IP +// inside a container — must NOT abort the whole server: doing so crash-loops +// with no way out, because the Settings UI used to correct the address can't +// load if startup never completes. So on failure we log loudly and carry on; +// the admin fixes the address via /api/settings, which calls RestartMITM. +func startMITMIfConfigured(app *Application, options *config.ApplicationConfig) { + if options.MITMListen == "" { + return + } + if err := startMITMProxy(app, options); err != nil { + xlog.Error("mitm: cloudproxy listener failed to start — continuing without it", + "listen", options.MITMListen, + "error", err, + "hint", "fix the address via Settings (e.g. \":8082\" to bind all interfaces) and the listener will restart", + ) + } +} + func startMITMProxy(app *Application, options *config.ApplicationConfig) error { app.mitmMutex.Lock() defer app.mitmMutex.Unlock() @@ -68,25 +92,41 @@ func startMITMLocked(app *Application, options *config.ApplicationConfig) error } sort.Strings(effectiveHosts) - // Per-host PII gate inherits from the owning model's pii.enabled. - // A non-cloud-proxy backend with no explicit pii.enabled resolves - // to false → host is intercepted but the regex pass is skipped - // (audit events still record). - var piiDisabled []string + // Per-host NER detectors come from the owning model's pii.detectors + // (resolved against each detector model's pii_detection policy). A + // host whose model has pii.enabled=false, lists no detectors, or + // whose detectors can't be resolved gets no entry → it is intercepted + // and forwarded unredacted (audit events still record traffic). An + // unresolvable detector is recorded as an error-detector so the + // request fails closed at request time rather than leaking. 
+ resolver := app.PIINERResolver() + detectorsByHost := map[string][]pii.NERConfig{} for host, modelName := range ownership.Owners { cfg, exists := app.backendLoader.GetModelConfig(modelName) if !exists { continue } - if !cfg.PIIIsEnabled() { - piiDisabled = append(piiDisabled, host) + // Resolve through the shared policy so cloud-proxy hosts inherit the + // instance-wide default detector when they name none of their own. + enabled, detectors := app.ResolvePIIPolicy(&cfg) + if !enabled || len(detectors) == 0 { + continue + } + cfgs := make([]pii.NERConfig, 0, len(detectors)) + for _, name := range detectors { + nc, ok := resolver(name) + if !ok { + xlog.Error("mitm: detector model not resolvable; requests to host will fail closed", "host", host, "detector", name) + nc = pii.NERConfig{Detector: pii.NewErrNERDetector("detector model '" + name + "' not resolvable")} + } + cfgs = append(cfgs, nc) } + detectorsByHost[host] = cfgs } handler := mitm.NewPIIHandler(mitm.PIIHandlerOptions{ - Redactor: app.piiRedactor, - EventStore: app.piiEvents, - HostsWithPIIDisabled: piiDisabled, + EventStore: app.piiEvents, + DetectorsByHost: detectorsByHost, }) srv, err := mitm.NewServer(mitm.Config{ @@ -109,7 +149,7 @@ func startMITMLocked(app *Application, options *config.ApplicationConfig) error "ca_dir", caDir, "intercept_hosts", effectiveHosts, "model_owned_hosts", len(ownership.Owners), - "pii_disabled_hosts", len(piiDisabled), + "pii_detector_hosts", len(detectorsByHost), ) return nil } diff --git a/core/application/mitm_test.go b/core/application/mitm_test.go new file mode 100644 index 000000000000..b7627fa2d66c --- /dev/null +++ b/core/application/mitm_test.go @@ -0,0 +1,58 @@ +package application + +import ( + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/system" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +// minimal Application wired enough for startMITMProxy: an empty model +// config loader (no host claims), CA written under a temp DataPath. +func newMITMTestApp(dataPath string) (*Application, *config.ApplicationConfig) { + state, err := system.GetSystemState() + Expect(err).NotTo(HaveOccurred()) + state.Model.ModelsPath = dataPath + opts := config.NewApplicationConfig( + config.WithSystemState(state), + config.WithDataPath(dataPath), + ) + return newApplication(opts), opts +} + +var _ = Describe("startMITMIfConfigured", func() { + It("does nothing when no listen address is configured", func() { + app, opts := newMITMTestApp(GinkgoT().TempDir()) + opts.MITMListen = "" + + Expect(func() { startMITMIfConfigured(app, opts) }).NotTo(Panic()) + Expect(app.mitmServer.Load()).To(BeNil(), "no listener should be stored when disabled") + }) + + // Regression: a persisted-but-unbindable MITM address (e.g. a LAN host + // inside a container) must not abort startup. startMITMIfConfigured + // swallows the bind error so the rest of LocalAI still comes up and the + // admin can fix the address via the Settings UI. + It("logs and continues when the listen address cannot be bound", func() { + app, opts := newMITMTestApp(GinkgoT().TempDir()) + // 192.0.2.1 is TEST-NET-1 (RFC 5737): guaranteed not assigned to any + // local interface, so bind fails deterministically without DNS. 
+ opts.MITMListen = "192.0.2.1:8082" + + Expect(func() { startMITMIfConfigured(app, opts) }).NotTo(Panic()) + Expect(app.mitmServer.Load()).To(BeNil(), "failed listener must not be stored") + }) + + It("starts and stores the listener on a bindable address", func() { + app, opts := newMITMTestApp(GinkgoT().TempDir()) + opts.MITMListen = "127.0.0.1:0" // OS-assigned free port + + startMITMIfConfigured(app, opts) + + srv := app.mitmServer.Load() + Expect(srv).NotTo(BeNil(), "listener should be stored on success") + DeferCleanup(srv.Stop) + Expect(srv.Addr()).NotTo(BeEmpty()) + }) +}) diff --git a/core/application/pii_policy_test.go b/core/application/pii_policy_test.go new file mode 100644 index 000000000000..e221293c3e64 --- /dev/null +++ b/core/application/pii_policy_test.go @@ -0,0 +1,51 @@ +package application + +import ( + "github.com/mudler/LocalAI/core/config" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("ResolvePIIPolicy", func() { + chat := config.FLAG_CHAT + bp := func(b bool) *bool { return &b } + mk := func(c *config.ApplicationConfig) *Application { + return &Application{applicationConfig: c} + } + + It("lets an explicit pii.enabled=false win over the global default detector", func() { + app := mk(&config.ApplicationConfig{PIIDefaultDetectors: []string{"pf"}}) + cfg := &config.ModelConfig{Backend: "cloud-proxy", KnownUsecases: &chat} + cfg.PII.Enabled = bp(false) + enabled, dets := app.ResolvePIIPolicy(cfg) + Expect(enabled).To(BeFalse()) + Expect(dets).To(BeNil()) + }) + + It("enables a cloud-proxy model with the global default detector (closes the no-op gap)", func() { + // cloud-proxy defaults PIIIsEnabled()==true but lists no detectors, so + // without a global default it scans with nothing. 
+ app := mk(&config.ApplicationConfig{PIIDefaultDetectors: []string{"pf"}}) + cfg := &config.ModelConfig{Backend: "cloud-proxy"} + enabled, dets := app.ResolvePIIPolicy(cfg) + Expect(enabled).To(BeTrue()) + Expect(dets).To(Equal([]string{"pf"})) + }) + + It("leaves a non-cloud model off by default (no instance usecase default-on)", func() { + app := mk(&config.ApplicationConfig{PIIDefaultDetectors: []string{"pf"}}) + cfg := &config.ModelConfig{Backend: "llama-cpp", KnownUsecases: &chat} + enabled, _ := app.ResolvePIIPolicy(cfg) + Expect(enabled).To(BeFalse()) + }) + + It("prefers the model's own detectors over the global default", func() { + app := mk(&config.ApplicationConfig{PIIDefaultDetectors: []string{"global-pf"}}) + cfg := &config.ModelConfig{Backend: "cloud-proxy"} + cfg.PII.Detectors = []string{"own-pf"} + enabled, dets := app.ResolvePIIPolicy(cfg) + Expect(enabled).To(BeTrue()) + Expect(dets).To(Equal([]string{"own-pf"})) + }) +}) diff --git a/core/application/router_factories.go b/core/application/router_factories.go index d37cfb9d8115..879c43a835ee 100644 --- a/core/application/router_factories.go +++ b/core/application/router_factories.go @@ -1,63 +1,120 @@ package application import ( + "context" + "fmt" + "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" ) -// adapterConfig resolves a model name to its runtime ModelConfig, or -// nil when the name is unknown. Shared by the router-facing factories -// below and by ModelConfigLookup. +// adapterConfig resolves a model name to its runtime ModelConfig, or nil when +// unknown. LoadModelConfigFileByNameDefaultOptions never returns nil — for an +// unknown name it returns a defaults-filled stub with an empty Name (the YAML +// `name:` field is required by Validate), which is how we tell the two apart. 
func (a *Application) adapterConfig(modelName string) *config.ModelConfig { cfg, err := a.backendLoader.LoadModelConfigFileByNameDefaultOptions(modelName, a.applicationConfig) - if err != nil || cfg == nil { + if err != nil || cfg == nil || cfg.Name == "" { return nil } return cfg } -// ModelConfigLookup is the lookup function the router middleware's -// classifier validator uses to confirm classifier_model declares -// FLAG_SCORE before binding it. +// ModelConfigLookup is the lookup the router middleware's classifier validator +// uses to confirm classifier_model declares FLAG_SCORE before binding it. func (a *Application) ModelConfigLookup() func(modelName string) *config.ModelConfig { return a.adapterConfig } -// Scorer returns a backend.Scorer bound to the named model, or nil -// when the model is unknown. Used as a method value (app.Scorer) by -// router.ClassifierDeps — no factory-of-factory wrapper needed. +// The router-facing factories below (Scorer, Embedder, Reranker, TokenCounter) +// bind a model NAME at construction and re-resolve the CONFIG on every call. +// Capturing the config at construction would bake in whatever state +// adapterConfig saw first — including a stub returned before the YAML reached +// bcl.configs (e.g. /import-model or gallery install racing startup). The +// classifier registry caches factories by router-config fingerprint, so a +// once-stale capture stays stale until the router config is edited. 
+ func (a *Application) Scorer(modelName string) backend.Scorer { - cfg := a.adapterConfig(modelName) + if a.adapterConfig(modelName) == nil { + return nil + } + return &lazyScorer{app: a, modelName: modelName} +} + +type lazyScorer struct { + app *Application + modelName string +} + +func (l *lazyScorer) Score(ctx context.Context, prompt string, candidates []string) ([]backend.CandidateScore, error) { + cfg := l.app.adapterConfig(l.modelName) if cfg == nil { + return nil, fmt.Errorf("scorer: model %q no longer available", l.modelName) + } + return backend.NewScorer(l.app.modelLoader, *cfg, l.app.applicationConfig).Score(ctx, prompt, candidates) +} + +// TokenCounter returns a func so the middleware's literal field type accepts +// it as a method value without importing core/http/middleware from here. +func (a *Application) TokenCounter(modelName string) func(string) (int, error) { + if a.adapterConfig(modelName) == nil { return nil } - return backend.NewScorer(a.modelLoader, *cfg, a.applicationConfig) + return func(text string) (int, error) { + cfg := a.adapterConfig(modelName) + if cfg == nil { + return 0, fmt.Errorf("token counter: model %q no longer available", modelName) + } + resp, err := backend.ModelTokenize(text, a.modelLoader, *cfg, a.applicationConfig) + if err != nil { + return 0, err + } + return len(resp.Tokens), nil + } } -// Reranker returns a backend.Reranker bound to the named model, or -// nil when unknown. The reranker model's `type:` (e.g. "colbert") -// selects the scoring head inside the rerankers backend. func (a *Application) Reranker(modelName string) backend.Reranker { - cfg := a.adapterConfig(modelName) - if cfg == nil { + if a.adapterConfig(modelName) == nil { return nil } - return backend.NewReranker(a.modelLoader, *cfg, a.applicationConfig) + return &lazyReranker{app: a, modelName: modelName} } -// Embedder returns a backend.Embedder bound to the named model, or -// nil when unknown. Used by the router's L2 embedding cache. 
-func (a *Application) Embedder(modelName string) backend.Embedder { - cfg := a.adapterConfig(modelName) +type lazyReranker struct { + app *Application + modelName string +} + +func (l *lazyReranker) Rerank(ctx context.Context, query string, documents []string) ([]backend.RerankResult, error) { + cfg := l.app.adapterConfig(l.modelName) if cfg == nil { + return nil, fmt.Errorf("reranker: model %q no longer available", l.modelName) + } + return backend.NewReranker(l.app.modelLoader, *cfg, l.app.applicationConfig).Rerank(ctx, query, documents) +} + +func (a *Application) Embedder(modelName string) backend.Embedder { + if a.adapterConfig(modelName) == nil { return nil } - return backend.NewEmbedder(a.modelLoader, *cfg, a.applicationConfig) + return &lazyEmbedder{app: a, modelName: modelName} +} + +type lazyEmbedder struct { + app *Application + modelName string +} + +func (l *lazyEmbedder) Embed(ctx context.Context, text string) ([]float32, error) { + cfg := l.app.adapterConfig(l.modelName) + if cfg == nil { + return nil, fmt.Errorf("embedder: model %q no longer available", l.modelName) + } + return backend.NewEmbedder(l.app.modelLoader, *cfg, l.app.applicationConfig).Embed(ctx, text) } -// VectorStore returns a backend.VectorStore for the named collection, -// or nil when the name is empty. Each router model gets its own -// backend process via the model loader's cache keyed by storeName. +// VectorStore takes a store name, not a model name — no adapterConfig, no +// staleness to avoid. 
func (a *Application) VectorStore(storeName string) backend.VectorStore { return backend.NewVectorStore(a.modelLoader, a.applicationConfig, storeName) } diff --git a/core/application/router_factories_test.go b/core/application/router_factories_test.go new file mode 100644 index 000000000000..5a6988a88fba --- /dev/null +++ b/core/application/router_factories_test.go @@ -0,0 +1,155 @@ +package application + +import ( + "context" + "os" + "path/filepath" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/system" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// Regression: the router-facing factories used to capture +// *config.ModelConfig at construction. A gallery install that raced +// startup left a stub (Backend="") bound for the lifetime of the +// classifier registry's cache entry, bypassing the user's `backend:` +// config. These specs pin the lazy re-resolve. +var _ = Describe("router_factories lazy config resolution", func() { + var ( + tmpDir string + app *Application + ) + + BeforeEach(func() { + var err error + tmpDir, err = os.MkdirTemp("", "router-factories-*") + Expect(err).NotTo(HaveOccurred()) + + appCfg := &config.ApplicationConfig{ + Context: context.Background(), + SystemState: &system.SystemState{Model: system.Model{ModelsPath: tmpDir}}, + } + app = &Application{ + backendLoader: config.NewModelConfigLoader(tmpDir), + modelLoader: model.NewModelLoader(appCfg.SystemState), + applicationConfig: appCfg, + } + }) + + AfterEach(func() { + _ = os.RemoveAll(tmpDir) + }) + + // writeCfg seeds both the on-disk YAML and the in-memory cache — + // removing only the cache would fall through to file-read. 
+ writeCfg := func(name, backend string) { + yaml := "name: " + name + "\nbackend: " + backend + "\nparameters:\n model: " + name + ".bin\n" + Expect(os.WriteFile(filepath.Join(tmpDir, name+".yaml"), []byte(yaml), 0644)).To(Succeed()) + Expect(app.backendLoader.LoadModelConfigsFromPath(tmpDir)).To(Succeed()) + cfg, ok := app.backendLoader.GetModelConfig(name) + Expect(ok).To(BeTrue(), "config must be loaded before the spec runs") + Expect(cfg.Backend).To(Equal(backend)) + } + + // removeCfg purges both the cache and the YAML so LoadModelConfigFileByName + // returns the empty-stub case and adapterConfig returns nil. + removeCfg := func(name string) { + app.backendLoader.RemoveModelConfig(name) + Expect(os.Remove(filepath.Join(tmpDir, name+".yaml"))).To(Succeed()) + } + + Context("Embedder", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.Embedder("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each Embed call", func() { + writeCfg("emb-test", "llama-cpp") + emb := app.Embedder("emb-test") + Expect(emb).NotTo(BeNil()) + + // The factory must hold the NAME, not a captured config — + // otherwise stale captures survive cache invalidation. + lazy, ok := emb.(*lazyEmbedder) + Expect(ok).To(BeTrue(), "Embedder must return *lazyEmbedder") + Expect(lazy.modelName).To(Equal("emb-test")) + + // Mutate the cached config. A lazy implementation sees the + // update on the next adapterConfig call; a captured-at- + // construction implementation would still see "llama-cpp". + app.backendLoader.UpdateModelConfig("emb-test", func(c *config.ModelConfig) { + c.Backend = "rerankers" + }) + Expect(lazy.app.adapterConfig("emb-test").Backend).To(Equal("rerankers")) + + // Remove the config entirely → Embed must surface the disappearance. 
+ removeCfg("emb-test") + _, err := emb.Embed(context.Background(), "anything") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + }) + + Context("Scorer", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.Scorer("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each Score call", func() { + writeCfg("score-test", "llama-cpp") + sc := app.Scorer("score-test") + Expect(sc).NotTo(BeNil()) + + lazy, ok := sc.(*lazyScorer) + Expect(ok).To(BeTrue(), "Scorer must return *lazyScorer") + Expect(lazy.modelName).To(Equal("score-test")) + + removeCfg("score-test") + _, err := sc.Score(context.Background(), "prompt", []string{"a"}) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + }) + + Context("Reranker", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.Reranker("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each Rerank call", func() { + writeCfg("rerank-test", "rerankers") + rr := app.Reranker("rerank-test") + Expect(rr).NotTo(BeNil()) + + lazy, ok := rr.(*lazyReranker) + Expect(ok).To(BeTrue(), "Reranker must return *lazyReranker") + Expect(lazy.modelName).To(Equal("rerank-test")) + + removeCfg("rerank-test") + _, err := rr.Rerank(context.Background(), "q", []string{"d"}) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + }) + + Context("TokenCounter", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.TokenCounter("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each call", func() { + writeCfg("tok-test", "llama-cpp") + tc := app.TokenCounter("tok-test") + Expect(tc).NotTo(BeNil()) + + removeCfg("tok-test") + _, err := tc("anything") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + 
}) +}) diff --git a/core/application/startup.go b/core/application/startup.go index be559479f2f8..6019c565d924 100644 --- a/core/application/startup.go +++ b/core/application/startup.go @@ -23,9 +23,9 @@ import ( "github.com/mudler/LocalAI/core/services/routing/pii" "github.com/mudler/LocalAI/core/services/routing/router" "github.com/mudler/LocalAI/core/services/storage" - "github.com/mudler/LocalAI/pkg/signals" coreStartup "github.com/mudler/LocalAI/core/startup" "github.com/mudler/LocalAI/internal" + "github.com/mudler/LocalAI/pkg/signals" "github.com/mudler/LocalAI/pkg/vram" "github.com/mudler/LocalAI/pkg/model" @@ -53,7 +53,6 @@ func New(opts ...config.AppOption) (*Application, error) { caps, err := xsysinfo.CPUCapabilities() if err == nil { xlog.Debug("CPU capabilities", "capabilities", caps) - } gpus, err := xsysinfo.GPUs() if err == nil { @@ -68,18 +67,18 @@ func New(opts ...config.AppOption) (*Application, error) { return nil, fmt.Errorf("models path cannot be empty") } - err = os.MkdirAll(options.SystemState.Model.ModelsPath, 0750) + err = os.MkdirAll(options.SystemState.Model.ModelsPath, 0o750) if err != nil { return nil, fmt.Errorf("unable to create ModelPath: %q", err) } if options.GeneratedContentDir != "" { - err := os.MkdirAll(options.GeneratedContentDir, 0750) + err := os.MkdirAll(options.GeneratedContentDir, 0o750) if err != nil { return nil, fmt.Errorf("unable to create ImageDir: %q", err) } } if options.UploadDir != "" { - err := os.MkdirAll(options.UploadDir, 0750) + err := os.MkdirAll(options.UploadDir, 0o750) if err != nil { return nil, fmt.Errorf("unable to create UploadDir: %q", err) } @@ -87,7 +86,7 @@ func New(opts ...config.AppOption) (*Application, error) { // Create and migrate data directory if options.DataPath != "" { - if err := os.MkdirAll(options.DataPath, 0750); err != nil { + if err := os.MkdirAll(options.DataPath, 0o750); err != nil { return nil, fmt.Errorf("unable to create DataPath: %q", err) } // Migrate data from 
DynamicConfigsDir to DataPath if needed @@ -192,44 +191,14 @@ func New(opts ...config.AppOption) (*Application, error) { xlog.Info("stats: disabled by --disable-stats") } - // Wire the regex PII filter. Default-on: a single-user box gets - // the built-in pattern set the first time it starts, with email/ - // phone/SSN/credit-card on mask and api_key_prefix on block. If - // the operator wants different actions, --pii-config points at a - // YAML file that overrides per-id; --disable-pii turns it off - // entirely. - if !options.DisablePII { - patterns, err := pii.LoadConfig(options.PIIConfigPath) - if err != nil { - return nil, fmt.Errorf("pii config: %w", err) - } - application.piiRedactor = pii.NewRedactor(patterns) - application.piiEvents = pii.NewMemoryEventStore(0) - // Apply persisted per-pattern overrides — admins toggling - // action/disabled via the UI and clicking "Save to disk" land - // here on the next start. Bad ids are warned and ignored so a - // stale entry doesn't block startup. - for id, ov := range options.PIIPatternOverrides { - if ov.Action != nil { - if err := application.piiRedactor.SetAction(id, pii.Action(*ov.Action)); err != nil { - xlog.Warn("pii: persisted override skipped", "pattern", id, "error", err) - continue - } - } - if ov.Disabled != nil { - if err := application.piiRedactor.SetDisabled(id, *ov.Disabled); err != nil { - xlog.Warn("pii: persisted disable skipped", "pattern", id, "error", err) - } - } - } - xlog.Info("pii: filter enabled", - "patterns", len(patterns), - "config_path", options.PIIConfigPath, - "persisted_overrides", len(options.PIIPatternOverrides), - ) - } else { - xlog.Info("pii: disabled by --disable-pii") - } + // Wire the PII filter subsystem. The redactor is now a stateless + // handle — detection is driven by per-model NER detectors + // (pii.detectors → the detector model's pii_detection policy), run + // request-side by the chat middleware and the MITM input path. 
The + // regex tier was removed; redaction is opt-in per model via + // PIIIsEnabled(). The event store backs the /api/pii/events audit log. + application.piiRedactor = &pii.Redactor{} + application.piiEvents = pii.NewMemoryEventStore(0) // Wire the routing decision log. Always-on when stats are enabled — // the per-router admin page reads this as the live activity feed @@ -441,11 +410,7 @@ func New(opts ...config.AppOption) (*Application, error) { // traffic doesn't need a parallel config for MITM traffic. // Runs after loadRuntimeSettingsFromFile so a listener configured // via /api/settings is brought back up across restarts. - if options.MITMListen != "" { - if err := startMITMProxy(application, options); err != nil { - return nil, fmt.Errorf("mitm: startup: %w", err) - } - } + startMITMIfConfigured(application, options) application.ModelLoader().SetBackendLoggingEnabled(options.EnableBackendLogging) @@ -500,7 +465,7 @@ func startWatcher(options *config.ApplicationConfig) { if _, err := os.Stat(options.DynamicConfigsDir); err != nil { if os.IsNotExist(err) { // We try to create the directory if it does not exist and was specified - if err := os.MkdirAll(options.DynamicConfigsDir, 0700); err != nil { + if err := os.MkdirAll(options.DynamicConfigsDir, 0o700); err != nil { xlog.Error("failed creating DynamicConfigsDir", "error", err) } } else { @@ -747,16 +712,6 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) { options.MITMListen = *settings.MITMListen } - // PII pattern overrides — file is the only source; CLI flags don't - // reach into this map. Apply unconditionally when present; the - // redactor wiring below sees the result on first construction. 
- if settings.PIIPatternOverrides != nil { - options.PIIPatternOverrides = make(map[string]config.PIIPatternRuntimeOverride, len(*settings.PIIPatternOverrides)) - for id, ov := range *settings.PIIPatternOverrides { - options.PIIPatternOverrides[id] = ov - } - } - // Backend upgrade flags if settings.AutoUpgradeBackends != nil { if !options.AutoUpgradeBackends { @@ -907,7 +862,7 @@ func loadOrGenerateHMACSecret(path string) (string, error) { } secret := hex.EncodeToString(b) - if err := os.WriteFile(path, []byte(secret), 0600); err != nil { + if err := os.WriteFile(path, []byte(secret), 0o600); err != nil { return "", fmt.Errorf("failed to persist HMAC secret: %w", err) } diff --git a/core/backend/embeddings.go b/core/backend/embeddings.go index 4be2bc346ef9..eff88ef04b19 100644 --- a/core/backend/embeddings.go +++ b/core/backend/embeddings.go @@ -100,8 +100,13 @@ func ModelEmbedding(ctx context.Context, s string, tokens []int, loader *model.M trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes) traceData := map[string]any{ - "input_text": trace.TruncateString(s, 1000), - "input_tokens_count": len(tokens), + "input_text": trace.TruncateString(s, 1000), + } + // Only present for token-mode callers (pre-tokenized override); + // emitting "0" alongside input_text would read as "consumed zero + // tokens", which is wrong. + if len(tokens) > 0 { + traceData["input_tokens_count"] = len(tokens) } startTime := time.Now() diff --git a/core/backend/options.go b/core/backend/options.go index 0274bdb6e78a..09a00fd93107 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -87,11 +87,57 @@ func getSeed(c config.ModelConfig) int32 { return seed } -func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { - b := 512 +// DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a +// model config leaves them unset. 
Exported so callers that must respect the +// effective decode window — notably the router's prompt trimmer — resolve the +// same numbers grpcModelOpts does instead of guessing. +const ( + DefaultContextSize = 4096 + DefaultBatchSize = 512 +) + +// EffectiveContextSize is the context window the backend will run with: the +// configured value, or DefaultContextSize when unset. +func EffectiveContextSize(c config.ModelConfig) int { + if c.ContextSize != nil { + return *c.ContextSize + } + return DefaultContextSize +} + +// EffectiveBatchSize is the single-decode batch the backend will run with. +// Score, embedding, rerank and token-classification (NER) all process the whole +// input in one pass: score decodes prompt+candidate (asserts n_tokens <= +// n_batch), embedding/rerank pool over the full sequence in one physical batch +// (n_ubatch), and the NER encoder runs one forward per n_ubatch-sized window. +// So the batch is sized to the context — anything that fits the context fits +// one pass, avoiding both the GGML_ASSERT crash (n_outputs_max <= +// cparams.n_outputs_max, where n_outputs_max defaults to n_batch) and the +// "input is too large to process" error. Explicit `batch:` always wins. +func EffectiveBatchSize(c config.ModelConfig) int { if c.Batch != 0 { - b = c.Batch + return c.Batch } + // token_classify is checked explicitly AND via the embeddings flag: a + // token-classification (NER) model sets embeddings:true but declares + // known_usecases:[token_classify], and that declaration is authoritative — + // it suppresses the embeddings usecase guess, so HasUsecases(FLAG_EMBEDDINGS) + // is false here. Any pooled encoder (embeddings:true) is single-pass + // regardless of how its usecases resolved, so key off the flag as a catch-all. 
+ singlePass := c.HasUsecases(config.FLAG_SCORE) || + c.HasUsecases(config.FLAG_EMBEDDINGS) || + c.HasUsecases(config.FLAG_RERANK) || + c.HasUsecases(config.FLAG_TOKEN_CLASSIFY) || + (c.Embeddings != nil && *c.Embeddings) + if ctx := EffectiveContextSize(c); singlePass && ctx > DefaultBatchSize { + return ctx + } + return DefaultBatchSize +} + +func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { + ctxSize := EffectiveContextSize(c) + b := EffectiveBatchSize(c) flashAttention := "auto" @@ -134,11 +180,6 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { } } - ctxSize := 4096 - if c.ContextSize != nil { - ctxSize = *c.ContextSize - } - mmlock := false if c.MMlock != nil { mmlock = *c.MMlock diff --git a/core/backend/options_internal_test.go b/core/backend/options_internal_test.go index 5e1848f0f5bd..1c1bd1493545 100644 --- a/core/backend/options_internal_test.go +++ b/core/backend/options_internal_test.go @@ -97,3 +97,95 @@ var _ = Describe("gRPCPredictOpts reasoning_effort metadata", func() { Expect(opts.Metadata).ToNot(HaveKey("reasoning_effort")) }) }) + +var _ = Describe("grpcModelOpts NBatch", func() { + scoreUsecase := config.FLAG_SCORE + threads := 1 + ctx := 4096 + + It("defaults to 512 for an ordinary model", func() { + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}} + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(512)) + }) + + It("sizes the batch to the context window for score models", func() { + // Score models decode the whole prompt+candidate in one + // llama_decode; n_batch must cover it or the backend aborts. 
+ cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &scoreUsecase} + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + }) + + It("keeps an explicit batch over the score default", func() { + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &scoreUsecase} + cfg.Batch = 1024 + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(1024)) + }) + + It("sizes the batch to the context window for embedding models", func() { + // Embedding/rerank pool over the whole sequence in one physical batch + // (n_ubatch); without this the input is capped at the 512 default and + // the backend returns "input is too large to process". + embeddings := true + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}} + cfg.Embeddings = &embeddings + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + }) + + It("sizes the batch to the context window for rerank models", func() { + reranking := true + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}} + cfg.Reranking = &reranking + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + }) + + It("sizes the batch to the context window for token-classification (NER) models", func() { + // The privacy-filter regression: a token_classify model sets + // embeddings:true but declares known_usecases:[token_classify], which + // is authoritative and suppresses the embeddings usecase guess — so + // HasUsecases(FLAG_EMBEDDINGS) is false. Without sizing the batch to + // the context the NER encoder loads at 512, shrinking the exact-pass + // window and tripping the GGML_ASSERT on longer inputs. 
+ tokenClassify := config.FLAG_TOKEN_CLASSIFY + embeddings := true + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &tokenClassify} + cfg.Embeddings = &embeddings + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + }) + + It("sizes the batch to the effective context for a token_classify model with no explicit context_size", func() { + // Mirrors the shipped gallery config (no batch, no context_size): the + // backend defaults n_ctx to 4096, so n_batch must follow. + tokenClassify := config.FLAG_TOKEN_CLASSIFY + embeddings := true + cfg := config.ModelConfig{Threads: &threads, KnownUsecases: &tokenClassify} + cfg.Embeddings = &embeddings + Expect(cfg.ContextSize).To(BeNil()) + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + Expect(opts.ContextSize).To(BeEquivalentTo(4096)) + }) + + It("does not raise the batch when a score model's context is below the default", func() { + small := 256 + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &small}, KnownUsecases: &scoreUsecase} + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(512)) + }) + + It("sizes the batch to the effective 4096 default for a score model with no explicit context_size", func() { + // The crash case: the backend defaults n_ctx to 4096, so n_batch must + // follow even when context_size is unset — otherwise n_batch stays 512 + // against a 4096 window and the score decode hits the GGML_ASSERT. 
+ cfg := config.ModelConfig{Threads: &threads, KnownUsecases: &scoreUsecase} + Expect(cfg.ContextSize).To(BeNil()) + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + Expect(opts.ContextSize).To(BeEquivalentTo(4096), "n_batch must match the effective n_ctx the backend receives") + }) +}) diff --git a/core/backend/stores.go b/core/backend/stores.go index 4884765f2f93..8b73ee17c017 100644 --- a/core/backend/stores.go +++ b/core/backend/stores.go @@ -3,9 +3,10 @@ package backend import ( "context" "fmt" - "strings" + "time" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/trace" "github.com/mudler/LocalAI/pkg/grpc" "github.com/mudler/LocalAI/pkg/model" @@ -39,34 +40,85 @@ func (s *localVectorStore) backend(_ context.Context) (grpc.Backend, error) { return StoreBackend(s.loader, s.appConfig, s.storeName, "") } -func (s *localVectorStore) Search(ctx context.Context, vec []float32) (float64, []byte, bool, error) { - be, err := s.backend(ctx) - if err != nil { - return 0, nil, false, fmt.Errorf("vector store load: %w", err) +func (s *localVectorStore) Search(ctx context.Context, vec []float32) (sim float64, payload []byte, ok bool, err error) { + start := time.Now() + outcome := "hit" + defer func() { + s.recordTrace(start, "search", len(vec), sim, outcome, err) + }() + be, berr := s.backend(ctx) + if berr != nil { + outcome = "backend_load_error" + return 0, nil, false, fmt.Errorf("vector store load: %w", berr) } - _, values, similarities, err := store.Find(ctx, be, vec, 1) - if err != nil { - // local-store's Find returns "existing length is -1" before - // any keys are inserted. Surface that as a clean miss so the - // cache layer treats it as an empty store and proceeds to - // Insert rather than skipping. 
- if strings.Contains(err.Error(), "existing length is -1") { - return 0, nil, false, nil - } - return 0, nil, false, fmt.Errorf("vector store find: %w", err) + _, values, similarities, ferr := store.Find(ctx, be, vec, 1) + if ferr != nil { + outcome = "find_error" + return 0, nil, false, fmt.Errorf("vector store find: %w", ferr) } if len(values) == 0 || len(similarities) == 0 { + outcome = "miss" return 0, nil, false, nil } return float64(similarities[0]), values[0], true, nil } -func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) error { - be, err := s.backend(ctx) +func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) (err error) { + start := time.Now() + outcome := "ok" + defer func() { + s.recordTrace(start, "insert", len(vec), 0, outcome, err) + }() + be, berr := s.backend(ctx) + if berr != nil { + outcome = "backend_load_error" + return fmt.Errorf("vector store load: %w", berr) + } + if serr := store.SetSingle(ctx, be, vec, payload); serr != nil { + outcome = "insert_error" + return serr + } + return nil +} + +// recordTrace surfaces vector-store calls in /api/backend-traces, including +// the backend-load-failure path that otherwise vanishes into an xlog.Warn. +// modelName uses the store namespace (e.g. "router-cache-smart-router") so +// admins can tell which router's cache misbehaved; the backend is always +// "local-store" and can't disambiguate. 
+func (s *localVectorStore) recordTrace(start time.Time, op string, vecDim int, sim float64, outcome string, err error) { + if s.appConfig == nil || !s.appConfig.EnableTracing { + return + } + trace.InitBackendTracingIfEnabled(s.appConfig.TracingMaxItems, s.appConfig.TracingMaxBodyBytes) + errStr := "" if err != nil { - return fmt.Errorf("vector store load: %w", err) + errStr = err.Error() + } + summary := op + " " + outcome + if op == "search" && outcome == "hit" { + summary = fmt.Sprintf("search hit (sim=%.3f)", sim) + } + data := map[string]any{ + "op": op, + "outcome": outcome, + "vector_dim": vecDim, + } + // Only include similarity for a real neighbor — miss/empty_store would + // otherwise render "similarity: 0" and read as a measured value. + if op == "search" && outcome == "hit" { + data["similarity"] = sim } - return store.SetSingle(ctx, be, vec, payload) + trace.RecordBackendTrace(trace.BackendTrace{ + Timestamp: start, + Duration: time.Since(start), + Type: trace.BackendTraceVectorStore, + ModelName: s.storeName, + Backend: model.LocalStoreBackend, + Summary: summary, + Error: errStr, + Data: data, + }) } func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string, backend string) (grpc.Backend, error) { diff --git a/core/backend/stores_test.go b/core/backend/stores_test.go new file mode 100644 index 000000000000..e9d5208a3d45 --- /dev/null +++ b/core/backend/stores_test.go @@ -0,0 +1,88 @@ +package backend + +import ( + "context" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/trace" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/system" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// findVectorStoreTrace returns the most recent vector_store trace whose +// model_name matches storeName, or nil if none was recorded. 
Used by +// the specs below to assert the trace landed without relying on +// ring-buffer ordering across other tests in the suite. +func findVectorStoreTrace(storeName string) *trace.BackendTrace { + traces := trace.GetBackendTraces() + for i := range traces { + bt := &traces[i] + if bt.Type == trace.BackendTraceVectorStore && bt.ModelName == storeName { + return bt + } + } + return nil +} + +var _ = Describe("localVectorStore tracing", func() { + // Pin the trace surface admins read from /api/backend-traces. + // The original failure mode that motivated these specs — the + // local-store backend not installed — was silent on every surface + // except a per-call xlog.Warn. With tracing wired in, the row + // appears next to the embedder/score traces for the same request. + BeforeEach(func() { + trace.ClearBackendTraces() + }) + + It("records a vector_store trace with outcome=backend_load_error when the backend can't be loaded", func() { + // nil ModelLoader → s.backend → StoreBackend → panics on load. + // Use a real-but-empty loader so the failure surfaces as an + // error instead, exercising the load-failure trace path the + // admin would hit when local-store isn't installed. + appCfg := &config.ApplicationConfig{ + EnableTracing: true, + TracingMaxItems: 16, + TracingMaxBodyBytes: 1024, + } + s := &localVectorStore{ + loader: model.NewModelLoader(&system.SystemState{}), + appConfig: appCfg, + storeName: "router-cache-test", + } + + // Search must surface the error AND record a trace describing it. 
+ _, _, _, err := s.Search(context.Background(), []float32{0.1, 0.2, 0.3}) + Expect(err).To(HaveOccurred()) + + Eventually(func() *trace.BackendTrace { + return findVectorStoreTrace("router-cache-test") + }).ShouldNot(BeNil()) + + bt := findVectorStoreTrace("router-cache-test") + Expect(bt.Backend).To(Equal(model.LocalStoreBackend)) + Expect(bt.Data["op"]).To(Equal("search")) + Expect(bt.Data["outcome"]).To(Equal("backend_load_error")) + Expect(bt.Data["vector_dim"]).To(Equal(3)) + // Error is the wrapped "vector store load: …" surfaced to the caller. + Expect(bt.Error).To(ContainSubstring("vector store load")) + }) + + It("does not record a trace when tracing is disabled", func() { + // Opt-out path: appConfig.EnableTracing=false must short-circuit + // before InitBackendTracingIfEnabled, so a workload with tracing + // turned off doesn't pay the channel-send cost per cache call. + appCfg := &config.ApplicationConfig{EnableTracing: false} + s := &localVectorStore{ + loader: model.NewModelLoader(&system.SystemState{}), + appConfig: appCfg, + storeName: "router-cache-disabled", + } + _, _, _, _ = s.Search(context.Background(), []float32{1}) + Consistently(func() *trace.BackendTrace { + return findVectorStoreTrace("router-cache-disabled") + }).Should(BeNil()) + }) +}) diff --git a/core/backend/token_classify.go b/core/backend/token_classify.go new file mode 100644 index 000000000000..cb1e6b638c21 --- /dev/null +++ b/core/backend/token_classify.go @@ -0,0 +1,150 @@ +package backend + +import ( + "context" + "time" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/trace" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + model "github.com/mudler/LocalAI/pkg/model" +) + +// TokenEntity is one detected span from a token-classification (NER) +// model. Mirrors pb.TokenClassifyEntity but keeps the proto type out of +// consumers. Start/End are BYTE offsets into the classified text, +// half-open (addressing text[Start:End]) — the proto contract. 
Group is +// the model's entity label (e.g. "private_person", "EMAIL"). +type TokenEntity struct { + Group string `json:"group"` + Start int `json:"start"` + End int `json:"end"` + Score float32 `json:"score"` + Text string `json:"text"` +} + +// TokenClassifyOptions controls a single TokenClassify request. +type TokenClassifyOptions struct { + // Threshold drops entities the backend scores below this value at + // the source. 0 returns everything the model emits; downstream + // callers (e.g. the PII redactor's MinScore) can still filter + // further once they know the per-request policy. + Threshold float32 +} + +// TokenClassifier runs a token-classification model over text and +// returns the detected entity spans. Implemented by NewTokenClassifier +// over a model-loaded backend; the PII redactor's encoder/NER tier +// consumes this via a pii.NERDetector adapter (see +// core/services/routing/piidetector). +type TokenClassifier interface { + TokenClassify(ctx context.Context, text string) ([]TokenEntity, error) +} + +// NewTokenClassifier binds (loader, modelConfig, appConfig) into a +// TokenClassifier. The underlying backend is resolved lazily on the +// first call, mirroring NewScorer. 
+func NewTokenClassifier(loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, opts TokenClassifyOptions) TokenClassifier { + return &modelTokenClassifier{loader: loader, modelConfig: modelConfig, appConfig: appConfig, opts: opts} +} + +type modelTokenClassifier struct { + loader *model.ModelLoader + modelConfig config.ModelConfig + appConfig *config.ApplicationConfig + opts TokenClassifyOptions +} + +func (m *modelTokenClassifier) TokenClassify(ctx context.Context, text string) ([]TokenEntity, error) { + fn, err := ModelTokenClassify(text, m.opts, m.loader, m.modelConfig, m.appConfig) + if err != nil { + return nil, err + } + return fn(ctx) +} + +// ModelTokenClassify loads the backend for modelConfig and returns a +// closure that classifies `text`. Mirrors ModelScore: the closure is +// bound to the loaded model so a caller can reuse it within a request +// without re-resolving the backend. +// +// When tracing is enabled it records a BackendTraceTokenClassify row so the +// detector's output — every entity's group, byte range, confidence and the +// matched substring — shows in the Traces UI alongside the request it gated. +// This is the technical view for debugging false positives (e.g. a phone +// number scored as SSN); the persisted PIIEvent keeps only a hash. +func ModelTokenClassify(text string, opts TokenClassifyOptions, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (func(ctx context.Context) ([]TokenEntity, error), error) { + modelOpts := ModelOptions(modelConfig, appConfig) + inferenceModel, err := loader.Load(modelOpts...) 
+ if err != nil { + recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil) + return nil, err + } + return func(ctx context.Context) ([]TokenEntity, error) { + var startTime time.Time + if appConfig.EnableTracing { + trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes) + startTime = time.Now() + } + resp, err := inferenceModel.TokenClassify(ctx, &pb.TokenClassifyRequest{ + Text: text, + Threshold: opts.Threshold, + }) + entities := tokenClassifyResponseToEntities(resp) + if appConfig.EnableTracing { + trace.RecordBackendTrace(tokenClassifyTrace(modelConfig, text, opts.Threshold, entities, startTime, err)) + } + if err != nil { + return nil, err + } + return entities, nil + }, nil +} + +// tokenClassifyTrace assembles the Traces-UI row for one NER call: the input +// preview, the threshold, and every detected entity (group, byte range, +// confidence, matched text). Split out from the closure so the Data assembly +// is unit-testable without a live backend. +func tokenClassifyTrace(modelConfig config.ModelConfig, text string, threshold float32, entities []TokenEntity, start time.Time, callErr error) trace.BackendTrace { + errStr := "" + if callErr != nil { + errStr = callErr.Error() + } + return trace.BackendTrace{ + Timestamp: start, + Duration: time.Since(start), + Type: trace.BackendTraceTokenClassify, + ModelName: modelConfig.Name, + Backend: modelConfig.Backend, + Summary: trace.TruncateString(text, 200), + Error: errStr, + Data: map[string]any{ + "input_chars": len(text), + "threshold": threshold, + "entities": entities, + }, + } +} + +// tokenClassifyResponseToEntities converts the wire-format response into +// the value type consumed by callers. Extracted so the conversion can be +// unit-tested without a real backend (see token_classify_test.go). 
+func tokenClassifyResponseToEntities(resp *pb.TokenClassifyResponse) []TokenEntity { + if resp == nil { + return nil + } + out := make([]TokenEntity, 0, len(resp.Entities)) + for _, e := range resp.Entities { + if e == nil { + continue + } + out = append(out, TokenEntity{ + Group: e.EntityGroup, + Start: int(e.Start), + End: int(e.End), + Score: e.Score, + Text: e.Text, + }) + } + return out +} diff --git a/core/backend/token_classify_test.go b/core/backend/token_classify_test.go new file mode 100644 index 000000000000..3b9adda0e5a5 --- /dev/null +++ b/core/backend/token_classify_test.go @@ -0,0 +1,61 @@ +package backend + +import ( + "errors" + "time" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/trace" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("tokenClassifyResponseToEntities", func() { + It("returns nil for a nil response", func() { + Expect(tokenClassifyResponseToEntities(nil)).To(BeNil()) + }) + + It("maps proto entities to TokenEntity, skipping nil rows", func() { + resp := &pb.TokenClassifyResponse{ + Entities: []*pb.TokenClassifyEntity{ + {EntityGroup: "private_person", Start: 3, End: 8, Score: 0.97, Text: "Alice"}, + nil, + {EntityGroup: "EMAIL", Start: 20, End: 40, Score: 0.5, Text: "a@b.com"}, + }, + } + Expect(tokenClassifyResponseToEntities(resp)).To(Equal([]TokenEntity{ + {Group: "private_person", Start: 3, End: 8, Score: 0.97, Text: "Alice"}, + {Group: "EMAIL", Start: 20, End: 40, Score: 0.5, Text: "a@b.com"}, + })) + }) + + It("returns an empty (non-nil) slice for a response with no entities", func() { + out := tokenClassifyResponseToEntities(&pb.TokenClassifyResponse{}) + Expect(out).NotTo(BeNil()) + Expect(out).To(BeEmpty()) + }) +}) + +var _ = Describe("tokenClassifyTrace", func() { + cfg := config.ModelConfig{Name: "privacy-filter", Backend: "llama-cpp"} + ents := []TokenEntity{{Group: "SSN", Start: 5, End: 16, Score: 
0.62, Text: "123-45-6789"}} + + It("captures model, input preview, threshold and per-entity detail", func() { + tr := tokenClassifyTrace(cfg, "ssn is 123-45-6789", 0.5, ents, time.Now(), nil) + Expect(tr.Type).To(Equal(trace.BackendTraceTokenClassify)) + Expect(tr.ModelName).To(Equal("privacy-filter")) + Expect(tr.Backend).To(Equal("llama-cpp")) + Expect(tr.Summary).To(ContainSubstring("ssn is")) + Expect(tr.Error).To(BeEmpty()) + Expect(tr.Data["input_chars"]).To(Equal(len("ssn is 123-45-6789"))) + Expect(tr.Data["threshold"]).To(BeEquivalentTo(float32(0.5))) + Expect(tr.Data["entities"]).To(Equal(ents)) + }) + + It("records the backend error string when the call failed", func() { + tr := tokenClassifyTrace(cfg, "x", 0, nil, time.Now(), errors.New("boom")) + Expect(tr.Error).To(Equal("boom")) + }) +}) diff --git a/core/backend/tokenize.go b/core/backend/tokenize.go index 96618d89cdc2..6b926b1793a1 100644 --- a/core/backend/tokenize.go +++ b/core/backend/tokenize.go @@ -7,9 +7,23 @@ import ( "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/trace" "github.com/mudler/LocalAI/pkg/grpc" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" "github.com/mudler/LocalAI/pkg/model" ) +// tokenizeTokenCount returns the number of tokens in a backend response, +// treating a nil response as zero. The gRPC client returns (nil, err) on +// failure, and the tracing block below runs before that error is returned — +// so the count must be read nil-safely here. Reading resp.Tokens on a nil +// resp previously panicked the whole HTTP handler when tracing was enabled +// (e.g. a transient tokenize failure during router probe-budget sizing). 
+func tokenizeTokenCount(resp *pb.TokenizationResponse) int { + if resp == nil { + return 0 + } + return len(resp.Tokens) +} + func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) { var inferenceModel grpc.Backend @@ -40,10 +54,7 @@ func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.Model errStr = err.Error() } - tokenCount := 0 - if resp.Tokens != nil { - tokenCount = len(resp.Tokens) - } + tokenCount := tokenizeTokenCount(resp) trace.RecordBackendTrace(trace.BackendTrace{ Timestamp: startTime, @@ -64,8 +75,8 @@ func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.Model return schema.TokenizeResponse{}, err } - if resp.Tokens == nil { - resp.Tokens = make([]int32, 0) + if resp == nil || resp.Tokens == nil { + return schema.TokenizeResponse{Tokens: make([]int32, 0)}, nil } return schema.TokenizeResponse{ diff --git a/core/backend/tokenize_test.go b/core/backend/tokenize_test.go new file mode 100644 index 000000000000..3b5c8e9fbc6f --- /dev/null +++ b/core/backend/tokenize_test.go @@ -0,0 +1,27 @@ +package backend + +import ( + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("tokenizeTokenCount", func() { + // Regression: the gRPC client returns (nil, err) when a tokenize call + // fails, and ModelTokenize's tracing block reads the token count before + // the error is returned. Dereferencing a nil response there panicked the + // HTTP handler (nil pointer dereference) — e.g. a transient tokenize + // failure while the router sized its probe-token budget. 
+ It("returns zero for a nil response instead of panicking", func() { + Expect(tokenizeTokenCount(nil)).To(Equal(0)) + }) + + It("returns zero when the response carries no tokens", func() { + Expect(tokenizeTokenCount(&pb.TokenizationResponse{})).To(Equal(0)) + }) + + It("counts the tokens present on the response", func() { + Expect(tokenizeTokenCount(&pb.TokenizationResponse{Tokens: []int32{1, 2, 3}})).To(Equal(3)) + }) +}) diff --git a/core/config/application_config.go b/core/config/application_config.go index dd36b97b90fe..12799b1dd1c6 100644 --- a/core/config/application_config.go +++ b/core/config/application_config.go @@ -48,25 +48,6 @@ type ApplicationConfig struct { // touch disk or memory. DisableStats bool - // PIIConfigPath points to an optional YAML file describing the PII - // pattern set. When empty, the routing/pii module's DefaultPatterns() - // (email, phone, SSN, credit card, IPv4, API key prefixes) are - // loaded with their default actions. Each entry overrides the - // matching default by ID: - // - // patterns: - // - id: email - // action: route_local # downgrade default mask -> route_local - // - id: ssn - // action: block # upgrade default mask -> block - // - // Unknown ids are rejected with a clear error at startup. - PIIConfigPath string - - // DisablePII turns the regex PII filter off entirely. Default - // (false) enables it on the OpenAI chat completions route. - DisablePII bool - // MITMListen is the address (host:port) the cloudproxy MITM // listener binds on. Empty disables the MITM proxy entirely. // Use case: redacting PII from Claude Code / Codex CLI traffic @@ -75,19 +56,20 @@ type ApplicationConfig struct { // LocalAI exposes at /api/middleware/proxy-ca.crt. MITMListen string + // PIIDefaultDetectors lists token-classification (NER) detector model + // names applied to any PII-enabled model that does not name its own + // pii.detectors. 
This makes cloud-proxy / MITM redaction work out of the + // box (those default to PII-enabled but carry no detector list) and lets + // an operator set one detector for the whole instance. Set at runtime via + // POST /api/settings; read live by Application.ResolvePIIPolicy. + PIIDefaultDetectors []string + // MITMCADir holds the persisted MITM proxy CA cert and private // key. The CA is generated on first start; subsequent starts // reload it so clients keep trusting the same root. The key // file is mode 0600. MITMCADir string - - // PIIPatternOverrides applies persisted per-id deltas (action, - // disabled) to the live redactor at startup. Loaded from - // runtime_settings.json and applied right after pii.NewRedactor. - // nil/empty leaves the YAML defaults in place. - PIIPatternOverrides map[string]PIIPatternRuntimeOverride - DisableWebUI bool OllamaAPIRootEndpoint bool EnforcePredownloadScans bool @@ -116,11 +98,11 @@ type ApplicationConfig struct { // --require-backend-integrity / LOCALAI_REQUIRE_BACKEND_INTEGRITY. 
RequireBackendIntegrity bool - SingleBackend bool // Deprecated: use MaxActiveBackends = 1 instead - MaxActiveBackends int // Maximum number of active backends (0 = unlimited, 1 = single backend mode) - WatchDogIdle bool - WatchDogBusy bool - WatchDog bool + SingleBackend bool // Deprecated: use MaxActiveBackends = 1 instead + MaxActiveBackends int // Maximum number of active backends (0 = unlimited, 1 = single backend mode) + WatchDogIdle bool + WatchDogBusy bool + WatchDog bool // Memory Reclaimer settings (works with GPU if available, otherwise RAM) MemoryReclaimerEnabled bool // Enable memory threshold monitoring @@ -583,6 +565,7 @@ func WithJSONStringPreload(configFile string) AppOption { o.PreloadJSONModels = configFile } } + func WithConfigFile(configFile string) AppOption { return func(o *ApplicationConfig) { o.ConfigFile = configFile @@ -671,21 +654,6 @@ func WithDisableStats(disable bool) AppOption { } } -// WithPIIConfigPath points the routing PII filter at a YAML config -// file. CLI: --pii-config. -func WithPIIConfigPath(path string) AppOption { - return func(o *ApplicationConfig) { - o.PIIConfigPath = path - } -} - -// WithDisablePII turns the regex PII filter off. CLI: --disable-pii. -func WithDisablePII(disable bool) AppOption { - return func(o *ApplicationConfig) { - o.DisablePII = disable - } -} - // WithMITMListen sets the address the cloudproxy MITM listener // binds on. Empty = disabled. CLI: --mitm-listen. func WithMITMListen(addr string) AppOption { @@ -702,7 +670,6 @@ func WithMITMCADir(dir string) AppOption { } } - func WithDynamicConfigDir(dynamicConfigsDir string) AppOption { return func(o *ApplicationConfig) { o.DynamicConfigsDir = dynamicConfigsDir @@ -1108,6 +1075,8 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings { mitmListen := o.MITMListen + piiDefaultDetectors := append([]string(nil), o.PIIDefaultDetectors...) 
+ return RuntimeSettings{ WatchdogEnabled: &watchdogEnabled, WatchdogIdleEnabled: &watchdogIdle, @@ -1162,6 +1131,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings { LogoHorizontalFile: &logoHorizontalFile, FaviconFile: &faviconFile, MITMListen: &mitmListen, + PIIDefaultDetectors: &piiDefaultDetectors, } } @@ -1391,6 +1361,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req o.MITMListen = *settings.MITMListen } + if settings.PIIDefaultDetectors != nil { + o.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...) + } + // Note: ApiKeys requires special handling (merging with startup keys) - handled in caller return requireRestart diff --git a/core/config/backend_capabilities.go b/core/config/backend_capabilities.go index 19da89462fdb..a483a5cc53a9 100644 --- a/core/config/backend_capabilities.go +++ b/core/config/backend_capabilities.go @@ -8,25 +8,26 @@ import ( // Usecase name constants — the canonical string values used in gallery entries, // model configs (known_usecases), and UsecaseInfoMap keys. 
const ( - UsecaseChat = "chat" - UsecaseCompletion = "completion" - UsecaseEdit = "edit" - UsecaseVision = "vision" - UsecaseEmbeddings = "embeddings" - UsecaseTokenize = "tokenize" - UsecaseImage = "image" - UsecaseVideo = "video" - UsecaseTranscript = "transcript" - UsecaseTTS = "tts" - UsecaseSoundGeneration = "sound_generation" - UsecaseRerank = "rerank" - UsecaseDetection = "detection" - UsecaseVAD = "vad" - UsecaseAudioTransform = "audio_transform" - UsecaseDiarization = "diarization" - UsecaseRealtimeAudio = "realtime_audio" - UsecaseFaceRecognition = "face_recognition" - UsecaseSpeakerRecognition = "speaker_recognition" + UsecaseChat = "chat" + UsecaseCompletion = "completion" + UsecaseEdit = "edit" + UsecaseVision = "vision" + UsecaseEmbeddings = "embeddings" + UsecaseTokenize = "tokenize" + UsecaseImage = "image" + UsecaseVideo = "video" + UsecaseTranscript = "transcript" + UsecaseTTS = "tts" + UsecaseSoundGeneration = "sound_generation" + UsecaseRerank = "rerank" + UsecaseDetection = "detection" + UsecaseVAD = "vad" + UsecaseAudioTransform = "audio_transform" + UsecaseDiarization = "diarization" + UsecaseRealtimeAudio = "realtime_audio" + UsecaseFaceRecognition = "face_recognition" + UsecaseSpeakerRecognition = "speaker_recognition" + UsecaseTokenClassify = "token_classify" ) // GRPCMethod identifies a Backend service RPC from backend.proto. 
@@ -54,6 +55,7 @@ const ( MethodVoiceVerify GRPCMethod = "VoiceVerify" MethodVoiceEmbed GRPCMethod = "VoiceEmbed" MethodVoiceAnalyze GRPCMethod = "VoiceAnalyze" + MethodTokenClassify GRPCMethod = "TokenClassify" ) // UsecaseInfo describes a single known_usecase value and how it maps @@ -171,6 +173,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{ GRPCMethod: MethodVoiceVerify, Description: "Speaker recognition — verify identity, embed and analyze voice via VoiceVerify, VoiceEmbed and VoiceAnalyze RPCs.", }, + UsecaseTokenClassify: { + Flag: FLAG_TOKEN_CLASSIFY, + GRPCMethod: MethodTokenClassify, + Description: "Per-token classification (NER) via the TokenClassify RPC — the PII detector tier. Declared explicitly via known_usecases; never auto-guessed, since the token-classification head is not useful as general generation or embeddings.", + }, } // BackendCapability describes which gRPC methods and usecases a backend supports. @@ -202,10 +209,14 @@ var BackendCapabilities = map[string]BackendCapability{ // --- LLM / text generation backends --- "llama-cpp": { GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding, MethodTokenizeString}, - PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEdit, UsecaseEmbeddings, UsecaseTokenize, UsecaseVision}, + PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEdit, UsecaseEmbeddings, UsecaseTokenize, UsecaseVision, UsecaseTokenClassify}, DefaultUsecases: []string{UsecaseChat}, AcceptsImages: true, // requires mmproj - Description: "llama.cpp GGUF models — LLM inference with optional vision via mmproj", + // token_classify is supported only with a patched llama.cpp that + // exposes per-token classification logits (the PII NER detector + // path); it is never auto-guessed and must be declared explicitly + // via known_usecases. 
+ Description: "llama.cpp GGUF models — LLM inference with optional vision via mmproj", }, "vllm": { GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding}, diff --git a/core/config/gguf.go b/core/config/gguf.go index c373561b6319..6f82c809e5aa 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -19,8 +19,18 @@ const ( defaultNGPULayers = 99999999 ) -func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { +// reservedNonChatModel reports whether the operator reserved this model for an +// internal direct-decode primitive — the router score classifier or the PII +// NER token_classify tier. Such a model has no chat template and must not be +// given the generative-chat defaults the GGUF importer otherwise applies +// (FLAG_CHAT, jinja templating); doing so trips the llama-cpp known_usecases +// conflict check and makes the config invalid. +func reservedNonChatModel(cfg *ModelConfig) bool { + return cfg.KnownUsecases != nil && + (*cfg.KnownUsecases&(FLAG_SCORE|FLAG_TOKEN_CLASSIFY)) != 0 +} +func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { if defaultCtx == 0 && cfg.ContextSize == nil { ctxSize := f.EstimateLLaMACppRun().ContextSize if ctxSize > 0 { @@ -77,11 +87,20 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { cfg.Name = f.Metadata().Name } - // Instruct to use template from llama.cpp - cfg.TemplateConfig.UseTokenizerTemplate = true - cfg.FunctionsConfig.GrammarConfig.NoGrammar = true - cfg.Options = append(cfg.Options, "use_jinja:true") - cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT") + // A model the operator reserved for an internal direct-decode primitive + // (the router score classifier, or the PII NER token_classify tier) is not + // a chat model: it carries no chat template and must not be painted with + // the generative-chat defaults. 
In particular appending FLAG_CHAT here + // would fold chat into KnownUsecases on the next sync and trip the + // llama-cpp known_usecases conflict check in Validate(), making the config + // invalid so it is silently skipped at load. Respect the declaration. + if !reservedNonChatModel(cfg) { + // Instruct to use template from llama.cpp + cfg.TemplateConfig.UseTokenizerTemplate = true + cfg.FunctionsConfig.GrammarConfig.NoGrammar = true + cfg.Options = append(cfg.Options, "use_jinja:true") + cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT") + } // Apply per-model-family inference parameter defaults (temperature, top_p, etc.) ApplyInferenceDefaults(cfg, f.Metadata().Name) diff --git a/core/config/meta/build.go b/core/config/meta/build.go index 24cfb86b7962..39235b9998dd 100644 --- a/core/config/meta/build.go +++ b/core/config/meta/build.go @@ -93,6 +93,9 @@ func applyOverride(f *FieldMeta, o FieldMetaOverride) { if o.Component != "" { f.Component = o.Component } + if o.Language != "" { + f.Language = o.Language + } if o.Placeholder != "" { f.Placeholder = o.Placeholder } diff --git a/core/config/meta/constants.go b/core/config/meta/constants.go index b15eb53d0d94..9be49fec0eed 100644 --- a/core/config/meta/constants.go +++ b/core/config/meta/constants.go @@ -8,6 +8,7 @@ const ( ProviderModelsTTS = "models:tts" ProviderModelsTranscript = "models:transcript" ProviderModelsVAD = "models:vad" + ProviderModelsScore = "models:score" ) // Static option lists embedded directly in field metadata. diff --git a/core/config/meta/pattern_meta_test.go b/core/config/meta/pattern_meta_test.go new file mode 100644 index 000000000000..0b75f5055777 --- /dev/null +++ b/core/config/meta/pattern_meta_test.go @@ -0,0 +1,41 @@ +package meta_test + +import ( + "reflect" + "testing" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/config/meta" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestMeta(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "config/meta suite") +} + +var _ = Describe("pattern detector field metadata", func() { + byPath := func() map[string]meta.FieldMeta { + md := meta.BuildForTest(reflect.TypeOf(config.ModelConfig{}), meta.DefaultRegistry()) + out := make(map[string]meta.FieldMeta, len(md.Fields)) + for _, f := range md.Fields { + out[f.Path] = f + } + return out + } + + It("renders builtins as a select with the catalogue as options", func() { + f, ok := byPath()["pii_detection.builtins"] + Expect(ok).To(BeTrue(), "pii_detection.builtins field should exist") + Expect(f.Component).To(Equal("pii-builtins-select")) + Expect(f.Options).NotTo(BeEmpty()) + }) + + It("renders custom patterns with the pattern-list editor", func() { + f, ok := byPath()["pii_detection.patterns"] + Expect(ok).To(BeTrue(), "pii_detection.patterns field should exist") + Expect(f.Component).To(Equal("pii-pattern-list")) + }) +}) diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go index 548b218921ba..b8222fe8210f 100644 --- a/core/config/meta/registry.go +++ b/core/config/meta/registry.go @@ -1,5 +1,19 @@ package meta +import "github.com/mudler/LocalAI/core/services/routing/piipattern" + +// builtinPatternOptions turns the piipattern built-in catalogue into select +// options for the editor's built-in-patterns checklist, keeping the catalogue +// the single source of truth. +func builtinPatternOptions() []FieldOption { + cat := piipattern.BuiltinCatalogue() + out := make([]FieldOption, 0, len(cat)) + for _, b := range cat { + out = append(out, FieldOption{Value: b.Name, Label: b.Name + " — " + b.Description}) + } + return out +} + // DefaultRegistry returns enrichment overrides for the ~30 most commonly used // config fields. Fields not listed here still appear with auto-generated // labels and type-inferred components. 
@@ -226,6 +240,7 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Chat Template", Description: "Go template for chat completion requests", Component: "code-editor", + Language: "gotemplate", Order: 40, }, "template.chat_message": { @@ -233,6 +248,7 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Chat Message Template", Description: "Go template for individual chat messages", Component: "code-editor", + Language: "gotemplate", Order: 41, }, "template.completion": { @@ -240,13 +256,22 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Completion Template", Description: "Go template for completion requests", Component: "code-editor", + Language: "gotemplate", Order: 42, }, + "template.function": { + Section: "templates", + Label: "Functions Template", + Description: "Go template applied when tools/functions are present in the request", + Component: "code-editor", + Language: "gotemplate", + Order: 43, + }, "template.use_tokenizer_template": { Section: "templates", Label: "Use Tokenizer Template", Description: "Use the chat template from the model's tokenizer config", - Order: 43, + Order: 44, }, // Router section template — kept in the templates UI section // (rather than the router section under "other") so operators @@ -257,7 +282,8 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Router Classifier System Prompt", Description: "Go text/template (with sprig functions) for the routing system prompt the score classifier feeds to its classifier_model. Executed with `.Policies` ([]{Label, Description}). Empty falls back to the built-in Arch-Router-shaped prompt (route-listing block + JSON output schema). Override when the classifier model was trained on a different schema or you need the routing instructions in a different language. The candidate format scored against the model is fixed at `{\"route\": \"