Skip to content

Commit e12a190

Browse files
Merge pull request #549 from janhq/update-dev-from-master-2026-06-04-01-25
Sync master with upstream release b9496
2 parents 3eb29eb + 94a220c commit e12a190

91 files changed

Lines changed: 3724 additions & 1091 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/release.yml

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -619,10 +619,11 @@ jobs:
619619
run: |
620620
choco install ninja
621621
622-
- name: ccache
623-
uses: ggml-org/ccache-action@v1.2.21
624-
with:
625-
key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
622+
# TODO: these jobs need to use the LLVM toolchain in order to utilize ccache
623+
#- name: ccache
624+
# uses: ggml-org/ccache-action@v1.2.21
625+
# with:
626+
# key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
626627

627628
- name: Install OpenCL Headers and Libs
628629
id: install_opencl
@@ -650,10 +651,10 @@ jobs:
650651
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
651652
cmake --build build --config Release --target ${{ matrix.target }}
652653
653-
- name: ccache-clear
654-
uses: ./.github/actions/ccache-clear
655-
with:
656-
key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
654+
#- name: ccache-clear
655+
# uses: ./.github/actions/ccache-clear
656+
# with:
657+
# key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
657658

658659
- name: Pack artifacts
659660
id: pack_artifacts

common/arg.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
353353
model.path = "";
354354
}
355355
common_download_opts hf_opts = opts;
356-
hf_opts.download_mmproj = true; // also look for mmproj when downloading hf model
357356
auto download_result = common_download_model(model, hf_opts);
358357

359358
if (download_result.model_path.empty()) {
@@ -441,10 +440,11 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
441440
COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
442441

443442
common_download_opts opts;
444-
opts.bearer_token = params.hf_token;
445-
opts.offline = params.offline;
446-
opts.skip_download = params.skip_download;
447-
opts.download_mtp = spec_type_draft_mtp;
443+
opts.bearer_token = params.hf_token;
444+
opts.offline = params.offline;
445+
opts.skip_download = params.skip_download;
446+
opts.download_mtp = spec_type_draft_mtp;
447+
opts.download_mmproj = !params.no_mmproj;
448448

449449
try {
450450
auto res = common_params_handle_model(params.model, opts);

common/speculative.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#include "common.h"
44
#include "ggml.h"
55
#include "llama.h"
6-
#include "../src/llama-ext.h" // staging API: llama_set_embeddings_pre_norm / llama_get_embeddings_pre_norm_ith (used by MTP)
6+
#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP)
77
#include "log.h"
88
#include "ngram-cache.h"
99
#include "ngram-map.h"
@@ -162,7 +162,7 @@ struct common_speculative_impl {
162162
virtual bool need_embd() const = 0;
163163

164164
// true if this implementation requires the target context to extract nextn embeddings
165-
virtual bool need_embd_pre_norm() const { return false; }
165+
virtual bool need_embd_nextn() const { return false; }
166166
};
167167

168168
struct common_speculative_impl_draft_simple : public common_speculative_impl {
@@ -487,8 +487,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
487487
}
488488
}
489489

490-
llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false);
491-
llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
490+
llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false);
491+
llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);
492492

493493
pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));
494494

@@ -583,7 +583,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
583583
// ^--- this is a problem
584584
// TODO: this is generally true, but it would be nice to assert it
585585
{
586-
const float * h_tgt = llama_get_embeddings_pre_norm(ctx_tgt);
586+
const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt);
587587
std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1));
588588

589589
//{
@@ -625,7 +625,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
625625
verify_h[seq_id].resize((size_t) n_rows * n_embd);
626626

627627
for (int32_t i = 0; i < n_rows; ++i) {
628-
const float * h = llama_get_embeddings_pre_norm_ith(ctx_tgt, i_batch_beg[seq_id] + i);
628+
const float * h = llama_get_embeddings_nextn_ith(ctx_tgt, i_batch_beg[seq_id] + i);
629629
std::memcpy(verify_h[seq_id].data() + (size_t) i * n_embd, h, row_bytes);
630630
}
631631

@@ -686,7 +686,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
686686
auto * smpl = smpls[seq_id].get();
687687

688688
common_sampler_sample(smpl, ctx_dft, i_batch, true);
689-
h_row = llama_get_embeddings_pre_norm_ith(ctx_dft, i_batch);
689+
h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
690690
++i_batch;
691691

692692
const auto * cur_p = common_sampler_get_candidates(smpl, true);
@@ -772,7 +772,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
772772
return false;
773773
}
774774

775-
bool need_embd_pre_norm() const override {
775+
bool need_embd_nextn() const override {
776776
return true;
777777
}
778778
};
@@ -1539,13 +1539,13 @@ bool common_speculative_need_embd(common_speculative * spec) {
15391539
return false;
15401540
}
15411541

1542-
bool common_speculative_need_embd_pre_norm(common_speculative * spec) {
1542+
bool common_speculative_need_embd_nextn(common_speculative * spec) {
15431543
if (spec == nullptr) {
15441544
return false;
15451545
}
15461546

15471547
for (auto & impl : spec->impls) {
1548-
if (impl->need_embd_pre_norm()) {
1548+
if (impl->need_embd_nextn()) {
15491549
return true;
15501550
}
15511551
}

common/speculative.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ bool common_speculative_process(common_speculative * spec, const llama_batch & b
5959
// true if any implementation requires target post-norm embeddings to be extracted
6060
bool common_speculative_need_embd(common_speculative * spec);
6161

62-
// true if any implementation requires target pre-norm embeddings to be extracted
63-
bool common_speculative_need_embd_pre_norm(common_speculative * spec);
62+
// true if any implementation requires target nextn embeddings to be extracted
63+
bool common_speculative_need_embd_nextn(common_speculative * spec);
6464

6565
// generate drafts for the sequences specified with `common_speculative_get_draft_params`
6666
void common_speculative_draft(common_speculative * spec);

conversion/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
"Gemma3nForConditionalGeneration": "gemma",
7878
"Gemma4ForConditionalGeneration": "gemma",
7979
"Gemma4ForCausalLM": "gemma",
80+
"Gemma4UnifiedForConditionalGeneration": "gemma",
8081
"GemmaForCausalLM": "gemma",
8182
"Glm4ForCausalLM": "glm",
8283
"Glm4MoeForCausalLM": "glm",
@@ -247,6 +248,7 @@
247248
"Gemma3ForConditionalGeneration": "gemma",
248249
"Gemma3nForConditionalGeneration": "gemma",
249250
"Gemma4ForConditionalGeneration": "gemma",
251+
"Gemma4UnifiedForConditionalGeneration": "gemma",
250252
"Glm4vForConditionalGeneration": "qwen3vl",
251253
"Glm4vMoeForConditionalGeneration": "qwen3vl",
252254
"GlmOcrForConditionalGeneration": "qwen3vl",

conversion/gemma.py

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import json
44
import re
55

6-
from typing import Callable, Iterable, TYPE_CHECKING
6+
from typing import Callable, Iterable, TYPE_CHECKING, Sequence
77

88
import torch
99

@@ -765,6 +765,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
765765
yield from super().modify_tensors(data_torch, name, bid)
766766

767767

768+
@ModelBase.register("Gemma4UnifiedForConditionalGeneration")
class Gemma4UnifiedModel(Gemma4Model):
    """Text-model converter for ``Gemma4UnifiedForConditionalGeneration`` checkpoints.

    Behaves like ``Gemma4Model`` and additionally forwards the ``suppress_tokens``
    entry of ``generation_config.json`` (when present) to the GGUF writer.
    """

    model_arch = gguf.MODEL_ARCH.GEMMA4

    def _get_suppress_tokens(self) -> Sequence[int] | None:
        """Read ``suppress_tokens`` from the model's ``generation_config.json``.

        Returns ``None`` when the file does not exist or the key is missing.
        """
        config_file = self.dir_model / "generation_config.json"
        if not config_file.is_file():
            return None

        with open(config_file, encoding="utf-8") as fp:
            generation_config = json.load(fp)
        return generation_config.get("suppress_tokens")

    def set_gguf_parameters(self):
        """Write the inherited parameters, then the suppress-token list if one was found."""
        super().set_gguf_parameters()

        suppress_tokens = self._get_suppress_tokens()
        if suppress_tokens is None:
            # nothing to record for this checkpoint
            return
        self.gguf_writer.add_suppress_tokens(suppress_tokens)
786+
787+
768788
@ModelBase.register("Gemma4ForConditionalGeneration")
769789
class Gemma4VisionAudioModel(MmprojModel):
770790
has_audio_encoder = True
@@ -839,3 +859,61 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
839859
data_torch = data_torch.permute(0, 3, 1, 2).contiguous()
840860
mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
841861
yield (mapped_name, data_torch)
862+
863+
864+
@ModelBase.register("Gemma4UnifiedForConditionalGeneration")
class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
    """Multimodal-projector converter for ``Gemma4UnifiedForConditionalGeneration``.

    Extends ``Gemma4VisionAudioModel`` by:
      * overriding the vision/audio hparams in ``__init__`` (hidden size taken
        from ``mm_embed_dim``, transformer-related sizes forced to 0),
      * tagging the output with the GEMMA4UV / GEMMA4UA projector types,
      * renaming and re-ordering a few tensors in ``modify_tensors``.
    """

    has_audio_encoder = True
    has_vision_encoder = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        assert self.hparams_audio is not None
        text_embd_dim = self.hparams_vision["mm_embed_dim"]
        self.hparams_vision["hidden_size"] = text_embd_dim
        self.hparams_audio["hidden_size"] = text_embd_dim
        # this is a transformer-less vision tower, the params below are redundant but set to avoid error
        for hparams in (self.hparams_vision, self.hparams_audio):
            hparams["intermediate_size"] = 0
            hparams["num_layers"] = 0
            hparams["num_attention_heads"] = 0

    def set_gguf_parameters(self):
        """Write the inherited parameters, then the unified vision/audio projector types."""
        super().set_gguf_parameters()
        self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4UV)
        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4UA)

    @staticmethod
    def _chw_to_hwc_perm(patch_size: int) -> torch.Tensor:
        """Build the index permutation that reorders a flattened HWC patch into CHW order.

        ggml im2col outputs in RR..GG..BB.. (CHW) order, but the checkpoint weights
        expect RGBRGB.. (HWC). ``perm[i]`` is the HWC index for CHW position ``i``,
        for a ``patch_size`` x ``patch_size`` patch with 3 channels.
        """
        p = patch_size
        i = torch.arange(p * p * 3)
        ch = i // (p * p)
        row = (i % (p * p)) // p
        col = i % p
        return row * p * 3 + col * 3 + ch

    def modify_tensors(self, data_torch, name, bid):
        """Rename / re-order selected tensors, then defer to the parent implementation."""
        if name.endswith("pos_embedding"):
            name += ".weight"
            data_torch = data_torch.permute(1, 0, 2)
        elif ".pos_norm." in name:
            # rename to patch_ln3 to reuse the tensor name scheme
            name = name.replace(".pos_norm.", ".patch_ln3.")
        elif "patch_dense.weight" in name:
            # Permute columns so column i aligns with CHW input position i.
            assert self.hparams_vision is not None
            perm = self._chw_to_hwc_perm(self.hparams_vision["model_patch_size"])
            data_torch = data_torch[:, perm]
        elif "patch_ln1.weight" in name or "patch_ln1.bias" in name:
            # same permutation for patch_ln1 as patch_dense to align with CHW input order
            assert self.hparams_vision is not None
            perm = self._chw_to_hwc_perm(self.hparams_vision["model_patch_size"])
            data_torch = data_torch[perm]
        return super().modify_tensors(data_torch, name, bid)

ggml/src/ggml-cpu/ops.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8955,7 +8955,12 @@ static void ggml_compute_forward_flash_attn_ext_f16(
89558955
k->type == v->type &&
89568956
neq1 >= Q_TILE_SZ);
89578957
#ifdef GGML_SIMD
8958-
use_tiled &= (DV % GGML_F32_EPR == 0);
8958+
#if defined(__ARM_FEATURE_SVE)
8959+
const int64_t f32_epr = svcntw();
8960+
#else
8961+
const int64_t f32_epr = GGML_F32_EPR;
8962+
#endif
8963+
use_tiled &= (DV % f32_epr == 0);
89598964
#endif
89608965
int current_chunk = ith;
89618966

@@ -11358,7 +11363,11 @@ static void ggml_compute_forward_fwht_f32(const ggml_compute_params * params, gg
1135811363

1135911364
// Scalar passes
1136011365
#if defined(GGML_SIMD)
11366+
#if defined(__ARM_FEATURE_SVE)
11367+
const int step = svcntw();
11368+
#else
1136111369
const int step = GGML_F32_EPR;
11370+
#endif
1136211371
#else
1136311372
const int step = n;
1136411373
#endif

ggml/src/ggml-cuda/common.cuh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1611,6 +1611,12 @@ static bool ggml_cuda_kernel_can_use_pdl(const void * kernel) {
16111611

16121612
#endif //defined(GGML_CUDA_USE_PDL)
16131613

1614+
// PDL and __restrict__ need to be mutually exclusive, see https://github.com/ggml-org/llama.cpp/pull/24030
// GGML_CUDA_RESTRICT is the pointer qualifier to use in place of a bare __restrict__:
// it expands to nothing when GGML_CUDA_USE_PDL is defined and device code is compiled
// for __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER, and to __restrict__ in every other case.
1615+
# if (defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER)
1616+
# define GGML_CUDA_RESTRICT
1617+
# else
1618+
# define GGML_CUDA_RESTRICT __restrict__
1619+
# endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
16141620

16151621
template<typename Kernel, typename... Args>
16161622
static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_kernel_launch_params & launch_params, Args&&... args) {

0 commit comments

Comments
 (0)