janhq
diff --git a/‎.github/workflows/release.yml‎
Lines changed: 9 additions & 8 deletions b/‎.github/workflows/release.yml‎
Lines changed: 9 additions & 8 deletions
diff --git a/‎common/arg.cpp‎
Lines changed: 5 additions & 5 deletions b/‎common/arg.cpp‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎common/speculative.cpp‎
Lines changed: 10 additions & 10 deletions b/‎common/speculative.cpp‎
Lines changed: 10 additions & 10 deletions
diff --git a/‎common/speculative.h‎
Lines changed: 2 additions & 2 deletions b/‎common/speculative.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎conversion/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎conversion/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎conversion/gemma.py‎
Lines changed: 79 additions & 1 deletion b/‎conversion/gemma.py‎
Lines changed: 79 additions & 1 deletion
diff --git a/‎ggml/src/ggml-cpu/ops.cpp‎
Lines changed: 10 additions & 1 deletion b/‎ggml/src/ggml-cpu/ops.cpp‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎ggml/src/ggml-cuda/common.cuh‎
Lines changed: 6 additions & 0 deletions b/‎ggml/src/ggml-cuda/common.cuh‎
Lines changed: 6 additions & 0 deletions
@@ -619,10 +619,11 @@ jobs:
         run: |
           choco install ninja
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
+      # TODO: these jobs need to use llvm toolchain in order to utilize the ccache
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@v1.2.21
+      #  with:
+      #    key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
 
       - name: Install OpenCL Headers and Libs
         id: install_opencl
@@ -650,10 +651,10 @@ jobs:
           cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
           cmake --build build --config Release --target ${{ matrix.target }}
 
-      - name: ccache-clear
-        uses: ./.github/actions/ccache-clear
-        with:
-          key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
+      #- name: ccache-clear
+      #  uses: ./.github/actions/ccache-clear
+      #  with:
+      #    key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
 
       - name: Pack artifacts
         id: pack_artifacts
 
@@ -353,7 +353,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
             model.path = "";
         }
         common_download_opts hf_opts = opts;
-        hf_opts.download_mmproj = true; // also look for mmproj when downloading hf model
         auto download_result = common_download_model(model, hf_opts);
 
         if (download_result.model_path.empty()) {
@@ -441,10 +440,11 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
                                          COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
 
     common_download_opts opts;
-    opts.bearer_token  = params.hf_token;
-    opts.offline       = params.offline;
-    opts.skip_download = params.skip_download;
-    opts.download_mtp  = spec_type_draft_mtp;
+    opts.bearer_token    = params.hf_token;
+    opts.offline         = params.offline;
+    opts.skip_download   = params.skip_download;
+    opts.download_mtp    = spec_type_draft_mtp;
+    opts.download_mmproj = !params.no_mmproj;
 
     try {
         auto res = common_params_handle_model(params.model, opts);
 
@@ -3,7 +3,7 @@
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
-#include "../src/llama-ext.h" // staging API: llama_set_embeddings_pre_norm / llama_get_embeddings_pre_norm_ith (used by MTP)
+#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP)
 #include "log.h"
 #include "ngram-cache.h"
 #include "ngram-map.h"
@@ -162,7 +162,7 @@ struct common_speculative_impl {
     virtual bool need_embd() const = 0;
 
-    // true if this implementation requires the target context to extract pre-norm embeddings
+    // true if this implementation requires the target context to extract nextn embeddings
-    virtual bool need_embd_pre_norm() const { return false; }
+    virtual bool need_embd_nextn() const { return false; }
 };
 
 struct common_speculative_impl_draft_simple : public common_speculative_impl {
@@ -487,8 +487,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
             }
         }
 
-        llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false);
-        llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
+        llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false);
+        llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);
 
         pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));
 
@@ -583,7 +583,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
         //                                                       ^--- this is a problem
         // TODO:this is generally true, but would be nice to assert it
         {
-            const float * h_tgt = llama_get_embeddings_pre_norm(ctx_tgt);
+            const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt);
             std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1));
 
             //{
@@ -625,7 +625,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
             verify_h[seq_id].resize((size_t) n_rows * n_embd);
 
             for (int32_t i = 0; i < n_rows; ++i) {
-                const float * h = llama_get_embeddings_pre_norm_ith(ctx_tgt, i_batch_beg[seq_id] + i);
+                const float * h = llama_get_embeddings_nextn_ith(ctx_tgt, i_batch_beg[seq_id] + i);
                 std::memcpy(verify_h[seq_id].data() + (size_t) i * n_embd, h, row_bytes);
             }
 
@@ -686,7 +686,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                 auto * smpl = smpls[seq_id].get();
 
                 common_sampler_sample(smpl, ctx_dft, i_batch, true);
-                h_row = llama_get_embeddings_pre_norm_ith(ctx_dft, i_batch);
+                h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
                 ++i_batch;
 
                 const auto * cur_p = common_sampler_get_candidates(smpl, true);
@@ -772,7 +772,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
         return false;
     }
 
-    bool need_embd_pre_norm() const override {
+    bool need_embd_nextn() const override {
         return true;
     }
 };
@@ -1539,13 +1539,13 @@ bool common_speculative_need_embd(common_speculative * spec) {
     return false;
 }
 
-bool common_speculative_need_embd_pre_norm(common_speculative * spec) {
+bool common_speculative_need_embd_nextn(common_speculative * spec) {
     if (spec == nullptr) {
         return false;
     }
 
     for (auto & impl : spec->impls) {
-        if (impl->need_embd_pre_norm()) {
+        if (impl->need_embd_nextn()) {
             return true;
         }
     }
 
@@ -59,8 +59,8 @@ bool common_speculative_process(common_speculative * spec, const llama_batch & b
 // true if any implementation requires target post-norm embeddings to be extracted
 bool common_speculative_need_embd(common_speculative * spec);
 
-// true if any implementation requires target pre-norm embeddings to be extracted
-bool common_speculative_need_embd_pre_norm(common_speculative * spec);
+// true if any implementation requires target nextn embeddings to be extracted
+bool common_speculative_need_embd_nextn(common_speculative * spec);
 
 // generate drafts for the sequences specified with `common_speculative_get_draft_params`
 void common_speculative_draft(common_speculative * spec);
 
@@ -77,6 +77,7 @@
     "Gemma3nForConditionalGeneration": "gemma",
     "Gemma4ForConditionalGeneration": "gemma",
     "Gemma4ForCausalLM": "gemma",
+    "Gemma4UnifiedForConditionalGeneration": "gemma",
     "GemmaForCausalLM": "gemma",
     "Glm4ForCausalLM": "glm",
     "Glm4MoeForCausalLM": "glm",
@@ -247,6 +248,7 @@
     "Gemma3ForConditionalGeneration": "gemma",
     "Gemma3nForConditionalGeneration": "gemma",
     "Gemma4ForConditionalGeneration": "gemma",
+    "Gemma4UnifiedForConditionalGeneration": "gemma",
     "Glm4vForConditionalGeneration": "qwen3vl",
     "Glm4vMoeForConditionalGeneration": "qwen3vl",
     "GlmOcrForConditionalGeneration": "qwen3vl",
 
@@ -3,7 +3,7 @@
 import json
 import re
 
-from typing import Callable, Iterable, TYPE_CHECKING
+from typing import Callable, Iterable, TYPE_CHECKING, Sequence
 
 import torch
 
@@ -765,6 +765,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+# Text-model converter registered for the "Gemma4UnifiedForConditionalGeneration" architecture.
+# Inherits everything from Gemma4Model and additionally writes the optional
+# "suppress_tokens" list found in generation_config.json into the GGUF metadata.
+@ModelBase.register("Gemma4UnifiedForConditionalGeneration")
+class Gemma4UnifiedModel(Gemma4Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA4
+
+    # Returns the value stored under "suppress_tokens" in
+    # <dir_model>/generation_config.json, or None when the file does not exist
+    # or does not contain that key.
+    def _get_suppress_tokens(self) -> Sequence[int] | None:
+        gen_cfg_path = self.dir_model / "generation_config.json"
+        if gen_cfg_path.is_file():
+            with open(gen_cfg_path, encoding="utf-8") as f:
+                gen_cfg = json.load(f)
+                # dict.get() yields None for a missing key, same as the fallthrough below
+                return gen_cfg.get("suppress_tokens")
+        return None
+
+    # Writes the parent's GGUF parameters first, then adds the suppress-token
+    # list only when one was found (None means "nothing to write").
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        suppress_tokens = self._get_suppress_tokens()
+        if suppress_tokens is not None:
+            self.gguf_writer.add_suppress_tokens(suppress_tokens)
+
+
 @ModelBase.register("Gemma4ForConditionalGeneration")
 class Gemma4VisionAudioModel(MmprojModel):
     has_audio_encoder = True
@@ -839,3 +859,61 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 data_torch = data_torch.permute(0, 3, 1, 2).contiguous()
             mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
             yield (mapped_name, data_torch)
+
+
+# Multimodal-projector converter registered for "Gemma4UnifiedForConditionalGeneration".
+# Builds on Gemma4VisionAudioModel: patches the vision/audio hparams in __init__,
+# tags the projector types as GEMMA4UV / GEMMA4UA, and renames/permutes a few
+# tensors before delegating to the parent's modify_tensors.
+@ModelBase.register("Gemma4UnifiedForConditionalGeneration")
+class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
+    has_audio_encoder = True
+    has_vision_encoder = True
+
+    # After the parent constructor has run, requires both vision and audio hparams
+    # to be present and overrides the fields listed below.
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        assert self.hparams_audio is not None
+        # NOTE(review): the audio hidden_size is also taken from the *vision* config's
+        # "mm_embed_dim" - confirm that both towers share this dimension.
+        text_embd_dim = self.hparams_vision["mm_embed_dim"]
+        self.hparams_vision["hidden_size"] = text_embd_dim
+        self.hparams_audio["hidden_size"] = text_embd_dim
+        # this is a transformer-less vision tower, the params below are redundant but set to avoid error
+        self.hparams_vision["intermediate_size"] = 0
+        self.hparams_vision["num_layers"] = 0
+        self.hparams_vision["num_attention_heads"] = 0
+        self.hparams_audio["intermediate_size"] = 0
+        self.hparams_audio["num_layers"] = 0
+        self.hparams_audio["num_attention_heads"] = 0
+
+    # Writes the parent's GGUF parameters, then sets the vision and audio
+    # projector types to GEMMA4UV and GEMMA4UA respectively.
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4UV)
+        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4UA)
+
+    # Adjusts selected tensors by name, then hands the (possibly renamed/permuted)
+    # tensor to the parent implementation and returns whatever it returns.
+    # Only the first matching branch applies; all other tensors pass through unchanged.
+    def modify_tensors(self, data_torch, name, bid):
+        if name.endswith("pos_embedding"):
+            # give the tensor a ".weight" suffix and swap its first two dimensions
+            name += ".weight"
+            data_torch = data_torch.permute(1, 0, 2)
+        elif ".pos_norm." in name:
+            # rename to patch_ln3 to reuse the tensor name scheme
+            name = name.replace(".pos_norm.", ".patch_ln3.")
+        elif "patch_dense.weight" in name:
+            # ggml im2col outputs in RR..GG..BB.. (CHW) order, but weight expects RGBRGB.. (HWC).
+            # Permute columns so column i aligns with CHW input position i.
+            assert self.hparams_vision is not None
+            p = self.hparams_vision["model_patch_size"]
+            # i enumerates CHW positions: i = ch * p*p + row * p + col (3 channels hard-coded)
+            i = torch.arange(p * p * 3)
+            ch  = i // (p * p)
+            row = (i % (p * p)) // p
+            col = i % p
+            # perm[i] = HWC column index for CHW position i
+            perm = row * p * 3 + col * 3 + ch
+            # reorder along the second dimension (columns)
+            data_torch = data_torch[:, perm]
+        elif "patch_ln1.weight" in name or "patch_ln1.bias" in name:
+            # same permutation for patch_ln1 as patch_dense to align with CHW input order
+            assert self.hparams_vision is not None
+            p = self.hparams_vision["model_patch_size"]
+            i = torch.arange(p * p * 3)
+            ch  = i // (p * p)
+            row = (i % (p * p)) // p
+            col = i % p
+            # perm[i] = HWC index for CHW position i
+            perm = row * p * 3 + col * 3 + ch
+            # reorder along the first dimension here, unlike patch_dense above
+            data_torch = data_torch[perm]
+        return super().modify_tensors(data_torch, name, bid)
@@ -8955,7 +8955,12 @@ static void ggml_compute_forward_flash_attn_ext_f16(
                                 k->type == v->type &&
                                 neq1 >= Q_TILE_SZ);
 #ifdef GGML_SIMD
-        use_tiled &= (DV % GGML_F32_EPR == 0);
+#if defined(__ARM_FEATURE_SVE)
+        const int64_t f32_epr = svcntw();
+#else
+        const int64_t f32_epr = GGML_F32_EPR;
+#endif
+        use_tiled &= (DV % f32_epr == 0);
 #endif
         int current_chunk = ith;
 
@@ -11358,7 +11363,11 @@ static void ggml_compute_forward_fwht_f32(const ggml_compute_params * params, gg
 
         // Scalar passes
 #if defined(GGML_SIMD)
+#if defined(__ARM_FEATURE_SVE)
+        const int step = svcntw();
+#else
         const int step = GGML_F32_EPR;
+#endif
 #else
         const int step = n;
 #endif
 
@@ -1611,6 +1611,12 @@ static bool ggml_cuda_kernel_can_use_pdl(const void * kernel) {
 
 #endif //defined(GGML_CUDA_USE_PDL)
 
+// PDL and __restrict__ need to be mutually exclusive, see https://github.com/ggml-org/llama.cpp/pull/24030
+// GGML_CUDA_RESTRICT is a drop-in replacement for the __restrict__ qualifier:
+//   - expands to nothing when GGML_CUDA_USE_PDL is defined and the current compilation pass
+//     defines __CUDA_ARCH__ with a value >= GGML_CUDA_CC_HOPPER
+//   - expands to __restrict__ in every other case (including passes where __CUDA_ARCH__ is undefined)
+# if (defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER)
+# define GGML_CUDA_RESTRICT
+# else
+# define GGML_CUDA_RESTRICT __restrict__
+# endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
 
 template<typename Kernel, typename... Args>
 static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_kernel_launch_params & launch_params, Args&&... args) {