fix(turboquant): guard upstream-only grpc-server fields for fork (#10043)

localai-bot · mudler · web-flow · commit 1c92b0091889 · 2026-05-28T17:37:54.000+02:00
fix(turboquant): guard upstream-only grpc-server fields for fork build

backend/cpp/llama-cpp/grpc-server.cpp is reused by the turboquant build, which compiles against an older llama.cpp fork (TheTom/llama-cpp-turboquant). Two recent changes added references to upstream-only struct fields outside the existing LOCALAI_LEGACY_LLAMA_CPP_SPEC guards:

- common_params::checkpoint_min_step (default + option handler), added with the ggml-org/llama.cpp 35c9b1f3 bump (#9998)
- the common_params_speculative::draft tensor_buft_overrides sentinel termination (#9919), which sat after the guard's #endif

The fork has neither field, so grpc-server.cpp failed to compile for every turboquant flavor. Wrap the three references in #ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC, matching the existing fork-compat guards, so the stock llama-cpp build is unchanged and the fork build skips them. Update patch-grpc-server.sh's doc comment to record what the macro now gates out.

Verified by a local fallback-flavor turboquant build: grpc-server.cpp compiles against the fork and the backend image builds.

Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -573,8 +573,12 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
     // checkpoint_min_step: minimum spacing between context checkpoints in
     // tokens (0 disables the minimum). Match upstream's default (256). This
     // field was renamed from `checkpoint_every_nt` in llama.cpp; the semantics
-    // also shifted from a fixed cadence to a minimum spacing.
+    // also shifted from a fixed cadence to a minimum spacing. The turboquant
+    // fork branched before the field existed, so skip it on the legacy path
+    // (LOCALAI_LEGACY_LLAMA_CPP_SPEC is injected by patch-grpc-server.sh).
+#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC
     params.checkpoint_min_step = 256;
+#endif
 
      // decode options. Options are in the form optname:optval, or, for booleans, just optname.
     for (int i = 0; i < request->options_size(); i++) {
@@ -748,11 +752,18 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                 params.cache_idle_slots = false;
             }
 
+#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC
         // --- minimum context-checkpoint spacing (upstream -cms / --checkpoint-min-step) ---
         // 0 disables the minimum-spacing gate. Old option names (`checkpoint_every_nt`,
         // `checkpoint_every_n_tokens`) are kept as aliases for backward compatibility
         // with existing user configs: upstream renamed the field and shifted its
         // semantics from a fixed cadence to a minimum spacing.
+        //
+        // Gated out for the turboquant fork, which lacks common_params::
+        // checkpoint_min_step. The leading `}` closing the cache_idle_slots
+        // branch is removed with this block; the next `} else if` (n_ubatch)
+        // then closes cache_idle_slots, so braces stay balanced under both
+        // preprocessor branches.
         } else if (!strcmp(optname, "checkpoint_min_step") || !strcmp(optname, "checkpoint_min_spacing") ||
                    !strcmp(optname, "checkpoint_every_nt") || !strcmp(optname, "checkpoint_every_n_tokens")) {
             if (optval != NULL) {
@@ -762,6 +773,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                     // If conversion fails, keep default value (256)
                 }
             }
+#endif
 
         // --- physical batch size (upstream -ub / --ubatch-size) ---
         // Note: line ~482 already aliases n_ubatch to n_batch as a default; this
@@ -1165,9 +1177,15 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
             params.tensor_buft_overrides.push_back({nullptr, nullptr});
         }
     }
+    // The draft tensor_buft_overrides are only populated under the modern
+    // (post-#22838) layout, whose population code is itself gated by
+    // LOCALAI_LEGACY_LLAMA_CPP_SPEC above. The turboquant fork lacks
+    // common_params_speculative::draft entirely, so skip the sentinel there too.
+#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC
     if (!params.speculative.draft.tensor_buft_overrides.empty()) {
         params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
+#endif
 
     // TODO: Add yarn
 
diff --git a/backend/cpp/turboquant/patch-grpc-server.sh b/backend/cpp/turboquant/patch-grpc-server.sh
@@ -124,8 +124,11 @@ fi
 # 5. Define LOCALAI_LEGACY_LLAMA_CPP_SPEC at the top of the file so the
 #    grpc-server option parser skips the new option-handler blocks (ngram_mod,
 #    ngram_map_k, ngram_map_k4v, ngram_cache, draft.cache_type_*, draft.cpuparams*,
-#    draft.tensor_buft_overrides) introduced for the post-#22838 layout. Those
-#    blocks reference struct fields that simply do not exist in the fork.
+#    draft.tensor_buft_overrides) introduced for the post-#22838 layout, the
+#    draft.tensor_buft_overrides sentinel termination, and the
+#    common_params::checkpoint_min_step default/option (added with the
+#    35c9b1f3 bump). Those blocks reference struct fields that simply do not
+#    exist in the fork.
 if grep -q '^#define LOCALAI_LEGACY_LLAMA_CPP_SPEC' "$SRC"; then
     echo "==> $SRC already defines LOCALAI_LEGACY_LLAMA_CPP_SPEC, skipping"
 else