
Commit 0755f27

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/openvino.Dockerfile
#	.github/workflows/build-self-hosted.yml
#	.github/workflows/build.yml
#	common/chat.cpp
#	docs/backend/OPENVINO.md
#	examples/speculative-simple/speculative-simple.cpp
#	ggml/src/ggml-hexagon/ggml-hexagon.cpp
#	ggml/src/ggml-hexagon/htp/CMakeLists.txt
#	ggml/src/ggml-hexagon/htp/htp-ctx.h
#	ggml/src/ggml-hexagon/htp/htp-ops.h
#	ggml/src/ggml-hexagon/htp/main.c
#	ggml/src/ggml-hexagon/libggml-htp.inf
#	ggml/src/ggml-openvino/ggml-decoder.cpp
#	ggml/src/ggml-openvino/ggml-openvino-extra.cpp
#	ggml/src/ggml-openvino/ggml-openvino.cpp
#	ggml/src/ggml-openvino/ggml-quants.cpp
#	ggml/src/ggml-openvino/openvino/op/rope.cpp
#	ggml/src/ggml-openvino/openvino/op_table.cpp
#	ggml/src/ggml-openvino/openvino/op_table.h
#	ggml/src/ggml-openvino/openvino/translate_session.cpp
#	ggml/src/ggml-openvino/openvino/utils.cpp
#	ggml/src/ggml-openvino/openvino/utils.h
#	ggml/src/ggml-openvino/utils.cpp
#	ggml/src/ggml-openvino/utils.h
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/convert.cpp
#	ggml/src/ggml-sycl/convert.hpp
#	ggml/src/ggml-sycl/gemm.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/set_rows.cpp
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	scripts/sync_vendor.py
#	tests/CMakeLists.txt
#	tests/test-chat.cpp
#	tools/cli/cli.cpp
#	tools/mtmd/CMakeLists.txt
#	tools/server/CMakeLists.txt
2 parents becf70d + 8bccdbb commit 0755f27

42 files changed

Lines changed: 1531 additions & 3199 deletions


common/arg.cpp

Lines changed: 13 additions & 2 deletions
@@ -3124,14 +3124,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
         [](common_params & params, int value) {
             if (value < -1) { throw std::invalid_argument("invalid value"); }
-            params.reasoning_budget = value;
+            params.sampling.reasoning_budget_tokens = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--reasoning-budget-message"}, "MESSAGE",
         "message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
         [](common_params & params, const std::string & value) {
-            params.reasoning_budget_message = value;
+            params.sampling.reasoning_budget_message = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
     add_opt(common_arg(
@@ -3904,6 +3904,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

+    add_opt(common_arg(
+        {"--spec-default"},
+        string_format("enable default speculative decoding config"),
+        [](common_params & params) {
+            params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
+            params.speculative.ngram_size_n = 24;
+            params.speculative.n_min = 48;
+            params.speculative.n_max = 64;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+
     return ctx_arg;
 }
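For orientation, a minimal sketch of how a sampling-side consumer might use the relocated reasoning-budget fields. The helper name and the thinking-token counter are hypothetical illustrations, not code from this commit; the -1/0/N semantics are from the --think-budget help string above.

// Hypothetical illustration only, not part of this commit.
#include <vector>

using llama_token = int; // simplified; llama.cpp uses int32_t

struct sampling_budget_view {
    int reasoning_budget_tokens = -1;                 // from --think-budget
    std::vector<llama_token> reasoning_budget_forced; // message + end-of-thinking tag
};

// Returns the token sequence to force-emit once the thinking budget runs out,
// or nullptr while generation may continue freely.
static const std::vector<llama_token> * budget_forced_sequence(
        const sampling_budget_view & sp, int n_thinking_tokens) {
    if (sp.reasoning_budget_tokens < 0) {
        return nullptr; // -1: unrestricted thinking
    }
    if (n_thinking_tokens >= sp.reasoning_budget_tokens) {
        return &sp.reasoning_budget_forced; // inject the message, then close the tag
    }
    return nullptr;
}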

common/chat.cpp

Lines changed: 19 additions & 51 deletions
@@ -408,6 +408,25 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
     return render_message_to_json(msgs, c);
 }

+json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
+    if (tools.empty()) {
+        return json();
+    }
+
+    auto result = json::array();
+    for (const auto & tool : tools) {
+        result.push_back({
+            { "type", "function" },
+            { "function", {
+                { "name", tool.name },
+                { "description", tool.description },
+                { "parameters", json::parse(tool.parameters) },
+            }},
+        });
+    }
+    return result;
+}
+
 std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
     std::vector<common_chat_tool> result;

@@ -443,60 +462,9 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
     return result;
 }

-json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
-    if (tools.empty()) {
-        return json();
-    }
-
-    auto result = json::array();
-    for (const auto & tool : tools) {
-        result.push_back({
-            { "type", "function" },
-            { "function",
-              {
-                  { "name", tool.name },
-                  { "description", tool.description },
-                  { "parameters", json::parse(tool.parameters) },
-              } },
-        });
-    }
-    return result;
-}
-
-json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
-    json delta = json::object();
-    if (!diff.reasoning_content_delta.empty()) {
-        delta["reasoning_content"] = diff.reasoning_content_delta;
-    }
-    if (!diff.content_delta.empty()) {
-        delta["content"] = diff.content_delta;
-    }
-    if (diff.tool_call_index != std::string::npos) {
-        json tool_call;
-        tool_call["index"] = diff.tool_call_index;
-        if (!diff.tool_call_delta.id.empty()) {
-            tool_call["id"] = diff.tool_call_delta.id;
-            tool_call["type"] = "function";
-        }
-        if (!diff.tool_call_delta.name.empty() || !diff.tool_call_delta.arguments.empty()) {
-            json function = json::object();
-            if (!diff.tool_call_delta.name.empty()) {
-                function["name"] = diff.tool_call_delta.name;
-            }
-            if (!diff.tool_call_delta.arguments.empty()) {
-                function["arguments"] = diff.tool_call_delta.arguments;
-            }
-            tool_call["function"] = function;
-        }
-        delta["tool_calls"] = json::array({ tool_call });
-    }
-    return delta;
-}
-
 #include "common/unicode.h"
 #include "peg-parser.cpp"
 #include "chat-peg-parser.cpp"
-
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
     if (use_jinja) {
         try {
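A quick usage sketch of the relocated serializer, assuming the common_chat_tool fields implied by the diff above (name, description, and parameters held as a JSON string). The tool values are made up for illustration.

// Sketch only: exercising common_chat_tools_to_json_oaicompat inside the
// llama.cpp tree (include path and field names assumed from the diff above).
#include "common/chat.h"
#include <cstdio>

int main() {
    common_chat_tool tool;
    tool.name        = "get_weather";
    tool.description = "Look up the current weather for a city";
    tool.parameters  = R"({"type":"object","properties":{"city":{"type":"string"}}})";

    // Expected shape:
    // [{"type":"function","function":{"name":...,"description":...,"parameters":{...}}}]
    auto tools_json = common_chat_tools_to_json_oaicompat({tool});
    printf("%s\n", tools_json.dump(2).c_str());
    return 0;
}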

common/chat.h

Lines changed: 2 additions & 3 deletions
@@ -256,14 +256,13 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);

+std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
+
 // DEPRECATED: only used in tests
 nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);

-std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
 nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);

-nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
-
 // get template caps, useful for reporting to server /props endpoint
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);

common/common.h

Lines changed: 1 addition & 2 deletions
@@ -275,6 +275,7 @@ struct common_params_sampling {
     std::vector<llama_token> reasoning_budget_start;  // start tag token sequence
     std::vector<llama_token> reasoning_budget_end;    // end tag token sequence
     std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
+    std::string reasoning_budget_message; // message injected before end tag when budget exhausted

     bool backend_sampling = false;

@@ -582,8 +583,6 @@ struct common_params {
     bool force_pure_content_parser = false;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
-    int reasoning_budget = -1;
-    std::string reasoning_budget_message; // message injected before end tag when budget exhausted
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
     int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time

common/speculative.cpp

Lines changed: 1 addition & 0 deletions
@@ -749,6 +749,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {

             mod.reset();
             n_low = 0;
+            i_last = 0;
         }
     } else {
         n_low = 0;

convert_hf_to_gguf.py

Lines changed: 96 additions & 11 deletions
@@ -11855,7 +11855,7 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
+@ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE

@@ -11994,40 +11994,125 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter


 @ModelBase.register("HunYuanVLForConditionalGeneration")
-class HunyuanOCRVisionModel(MmprojModel):
+class HunyuanVLVisionModel(MmprojModel):
+    # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
+    # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
+    # Each variant maps to a different projector type in clip.cpp so image
+    # preprocessing follows the correct code path.
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         assert self.hparams_vision is not None
-        # HunyuanOCR uses max_image_size instead of image_size
+        # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
         if "image_size" not in self.hparams_vision:
             self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)

+    @staticmethod
+    def is_ocr_variant(hparams: dict) -> bool:
+        """Return True for HunyuanOCR, False for HunyuanVL.
+
+        The projector's output dim must equal the text model's hidden_size by
+        construction (that's what "projector" means). HunyuanOCR pairs a 1B text
+        backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
+        ViT -> LLM projection dim is a hard architectural signature, not a
+        magic number.
+        """
+        vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
+        return vision_out == 1024
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         assert self.hparams_vision is not None
-        hparams = self.hparams_vision
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
-        self.gguf_writer.add_vision_use_gelu(True)
-        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
-        self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
-        self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
-        self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+        vcfg = self.hparams_vision
+
+        if self.is_ocr_variant(self.global_config):
+            # --- HunyuanOCR ---
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
+            self.gguf_writer.add_vision_use_gelu(True)
+            self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
+            self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
+            self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
+            self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+            return
+
+        # --- HunyuanVL ---
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
+        self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
+        self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
+        self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
+        self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
+        self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if not name.startswith("vit."):
-            return  # skip text tensors
+            return
         # strip CLS token (row 0) from position embeddings so resize_position_embeddings works
         if "position_embedding" in name:
             data_torch = data_torch[1:]  # [n_patches+1, n_embd] -> [n_patches, n_embd]
         yield from super().modify_tensors(data_torch, name, bid)

     def tensor_force_quant(self, name, new_name, bid, n_dims):
         # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
+        # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
         if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
             return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
         return super().tensor_force_quant(name, new_name, bid, n_dims)


+@ModelBase.register("HunYuanVLForConditionalGeneration")
+class HunyuanVLTextModel(HunYuanModel):
+    # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
+    # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
+    # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
+    # the config and pick the matching GGUF architecture.
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
+
+    @staticmethod
+    def _is_ocr_config(hparams: dict) -> bool:
+        # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
+        # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
+        # HunyuanVLVisionModel.is_ocr_variant.
+        return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
+
+    def __init__(self, dir_model: Path, *args, **kwargs):
+        raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
+        if self._is_ocr_config(raw_hparams):
+            self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
+        else:
+            self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
+        super().__init__(dir_model, *args, **kwargs)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
+        # the HunYuan-Dense arch which already handles standard rope in super().
+        if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
+            return
+
+        if self.rope_parameters.get("rope_type") != "xdrope":
+            return
+
+        # defaults for HunyuanVL. The C++ side later computes:
+        #   freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
+        self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
+        self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1)))
+
+        ctx_len = int(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len)
+        self.gguf_writer.add_context_length(ctx_len)
+
+        self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"]))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision tensors — they are written by HunyuanVLVisionModel
+        if name.startswith("vit."):
+            return
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3
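The projector-dimension check above is the single signal used to tell the two variants apart; a hedged C++ restatement (illustrative only, not part of this commit) for any consumer that has the raw config.json in hand:

// Mirrors HunyuanVLVisionModel.is_ocr_variant / HunyuanVLTextModel._is_ocr_config.
#include <nlohmann/json.hpp>

// True for HunyuanOCR (1024-d ViT->LLM projection), false for HunyuanVL (3072-d).
static bool hunyuan_is_ocr_variant(const nlohmann::json & hparams) {
    if (!hparams.contains("vision_config")) {
        return false;
    }
    return hparams["vision_config"].value("out_hidden_size", 0) == 1024;
}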
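The alpha-scaled frequency base quoted in the set_gguf_parameters comment can be sanity-checked numerically; the rope_theta, alpha, and head_dim values below are assumed placeholders, not HunyuanVL's actual metadata:

// Worked instance of: freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
#include <cmath>
#include <cstdio>

int main() {
    const double rope_theta = 10000.0; // rope_freq_base from GGUF (assumed value)
    const double alpha      = 1000.0;  // rope_scaling_alpha from GGUF (assumed value)
    const int    head_dim   = 128;

    const double freq_base = rope_theta * std::pow(alpha, (double) head_dim / (head_dim - 2));
    printf("freq_base = %.0f\n", freq_base); // ~1.12e7 for these inputs
    return 0;
}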

docs/backend/snapdragon/linux.md

Lines changed: 0 additions & 58 deletions
This file was deleted.
