Skip to content

Commit 1877e6a

Browse files
Merge pull request #480 from janhq/update-dev-from-master-2026-04-09-00-44
Sync master with upstream release b8712
2 parents 6578d82 + dcdcbad commit 1877e6a

76 files changed

Lines changed: 1172 additions & 11140 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/labeler.yml

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,26 @@ android:
7373
- changed-files:
7474
- any-glob-to-any-file:
7575
- examples/llama.android/**
76+
server/webui:
77+
- changed-files:
78+
- all:
79+
- any-glob-to-any-file:
80+
- tools/server/webui/**
81+
- tools/server/public/**
82+
- all-globs-to-all-files:
83+
- '!tools/server/webui/**'
84+
- '!tools/server/public/**'
7685
server:
7786
- changed-files:
78-
- any-glob-to-any-file:
79-
- tools/server/**
87+
- all:
88+
- any-glob-to-any-file:
89+
- tools/server/**
90+
- all-globs-to-all-files:
91+
- '!tools/server/webui/**'
92+
- '!tools/server/public/**'
93+
94+
95+
8096
ggml:
8197
- changed-files:
8298
- any-glob-to-any-file:

.github/workflows/release.yml

Lines changed: 27 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -36,55 +36,26 @@ env:
3636
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
3737

3838
jobs:
39-
macOS-arm64:
40-
runs-on: macos-14
41-
42-
steps:
43-
- name: Clone
44-
id: checkout
45-
uses: actions/checkout@v6
46-
with:
47-
fetch-depth: 0
48-
49-
- name: ccache
50-
uses: ggml-org/ccache-action@v1.2.21
51-
with:
52-
key: macOS-latest-arm64
53-
evict-old-files: 1d
54-
55-
- name: Build
56-
id: cmake_build
57-
run: |
58-
sysctl -a
59-
cmake -B build \
60-
-DCMAKE_INSTALL_RPATH='@loader_path' \
61-
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
62-
-DLLAMA_FATAL_WARNINGS=ON \
63-
-DLLAMA_BUILD_BORINGSSL=ON \
64-
-DGGML_METAL_USE_BF16=ON \
65-
-DGGML_METAL_EMBED_LIBRARY=ON \
66-
-DGGML_RPC=ON \
67-
${{ env.CMAKE_ARGS }}
68-
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
69-
70-
- name: Determine tag name
71-
id: tag
72-
uses: ./.github/actions/get-tag-name
73-
74-
- name: Pack artifacts
75-
id: pack_artifacts
76-
run: |
77-
cp LICENSE ./build/bin/
78-
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
79-
80-
- name: Upload artifacts
81-
uses: actions/upload-artifact@v6
82-
with:
83-
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
84-
name: llama-bin-macos-arm64.tar.gz
39+
macOS-cpu:
40+
strategy:
41+
matrix:
42+
include:
43+
- build: 'arm64'
44+
arch: 'arm64'
45+
os: macos-14
46+
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
47+
- build: 'arm64-kleidiai'
48+
arch: 'arm64'
49+
os: macos-14
50+
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
51+
- build: 'x64'
52+
arch: 'x64'
53+
os: macos-15-intel
54+
# Metal is disabled on x64 due to intermittent failures with Github runners not having a GPU:
55+
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
56+
defines: "-DGGML_METAL=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3"
8557

86-
macOS-x64:
87-
runs-on: macos-15-intel
58+
runs-on: ${{ matrix.os }}
8859

8960
steps:
9061
- name: Clone
@@ -96,23 +67,20 @@ jobs:
9667
- name: ccache
9768
uses: ggml-org/ccache-action@v1.2.21
9869
with:
99-
key: macOS-latest-x64
70+
key: macOS-latest-${{ matrix.arch }}
10071
evict-old-files: 1d
10172

10273
- name: Build
10374
id: cmake_build
10475
run: |
10576
sysctl -a
106-
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
107-
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
10877
cmake -B build \
78+
${{ matrix.defines }} \
10979
-DCMAKE_INSTALL_RPATH='@loader_path' \
11080
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
11181
-DLLAMA_FATAL_WARNINGS=ON \
11282
-DLLAMA_BUILD_BORINGSSL=ON \
113-
-DGGML_METAL=OFF \
114-
-DGGML_RPC=ON \
115-
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
83+
${{ env.CMAKE_ARGS }}
11684
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
11785
11886
- name: Determine tag name
@@ -123,13 +91,13 @@ jobs:
12391
id: pack_artifacts
12492
run: |
12593
cp LICENSE ./build/bin/
126-
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
94+
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
12795
12896
- name: Upload artifacts
12997
uses: actions/upload-artifact@v6
13098
with:
131-
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
132-
name: llama-bin-macos-x64.tar.gz
99+
path: llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz
100+
name: llama-bin-macos-${{ matrix.build }}.tar.gz
133101

134102
ubuntu-cpu:
135103
strategy:
@@ -1003,8 +971,7 @@ jobs:
1003971
- ubuntu-cpu
1004972
- ubuntu-vulkan
1005973
- ubuntu-24-openvino
1006-
- macOS-arm64
1007-
- macOS-x64
974+
- macOS-cpu
1008975
- ios-xcode-build
1009976
- openEuler-cann
1010977

@@ -1079,6 +1046,7 @@ jobs:
10791046
10801047
**macOS/iOS:**
10811048
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
1049+
- [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
10821050
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
10831051
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
10841052

common/chat.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1963,7 +1963,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
19631963
params.add_generation_prompt = true;
19641964
std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
19651965
auto diff = calculate_diff_split(no_gen_prompt, gen_prompt);
1966-
params.generation_prompt = diff.right;
1966+
params.generation_prompt = diff.right + diff.suffix;
19671967

19681968
params.add_generation_prompt = inputs.add_generation_prompt;
19691969

convert_hf_to_gguf.py

Lines changed: 85 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2219,10 +2219,10 @@ def set_gguf_parameters(self):
22192219
self.image_size = self.find_vparam(["image_size"])
22202220
self.gguf_writer.add_vision_image_size(self.image_size)
22212221
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
2222-
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
2222+
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", "vt_hidden_size"]))
22232223
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
22242224
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
2225-
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))
2225+
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", "vt_num_attention_heads"]))
22262226

22272227
# preprocessor config
22282228
image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
@@ -4949,6 +4949,73 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
49494949
yield from super().modify_tensors(data_torch, name, bid)
49504950

49514951

4952+
@ModelBase.register("StepVLForConditionalGeneration")
4953+
class Step3VLVisionModel(MmprojModel):
4954+
def __init__(self, *args, **kwargs):
4955+
super().__init__(*args, **kwargs)
4956+
assert self.hparams_vision is not None
4957+
4958+
if not self.hparams_vision.get("intermediate_size"):
4959+
hidden_size = self.hparams_vision.get("hidden_size") or self.hparams_vision.get("width") or 0
4960+
assert hidden_size > 0
4961+
mlp_ratio = float(self.hparams_vision.get("mlp_ratio", 8960 / 1536))
4962+
self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
4963+
4964+
self.preprocessor_config.setdefault("image_mean", list(_MISTRAL_COMMON_DATASET_MEAN))
4965+
self.preprocessor_config.setdefault("image_std", list(_MISTRAL_COMMON_DATASET_STD))
4966+
4967+
def set_gguf_parameters(self):
4968+
super().set_gguf_parameters()
4969+
assert self.hparams_vision is not None
4970+
4971+
projector_stride = int(self.global_config.get("understand_projector_stride", -1))
4972+
hidden_size = int(self.hparams_vision.get("hidden_size", self.hparams_vision.get("width", -1)))
4973+
num_layers = int(self.hparams_vision.get("num_hidden_layers", self.hparams_vision.get("layers", -1)))
4974+
assert (projector_stride, int(self.hparams_vision.get("image_size", -1)), hidden_size, num_layers) == (2, 728, 1536, 47), (
4975+
"current Step3-VL conversion path is only validated for Step3-VL-10B"
4976+
)
4977+
4978+
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL)
4979+
self.gguf_writer.add_vision_attention_layernorm_eps(float(self.hparams_vision.get("layer_norm_eps", 1e-5)))
4980+
self.gguf_writer.add_vision_projector_scale_factor(projector_stride ** 2)
4981+
# 3024 max resize comes from step3-vl-10b processing_step3.py.
4982+
self.gguf_writer.add_vision_preproc_image_size(3024)
4983+
4984+
def tensor_force_quant(self, name, new_name, bid, n_dims):
4985+
if ".position_embd." in new_name:
4986+
return gguf.GGMLQuantizationType.F32
4987+
return super().tensor_force_quant(name, new_name, bid, n_dims)
4988+
4989+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4990+
if name.startswith("model.") or name.startswith("lm_head."):
4991+
return
4992+
4993+
if name.startswith("vision_model.vit_downsampler"):
4994+
match = re.match(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name)
4995+
if match is None:
4996+
raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}")
4997+
4998+
proj_id = int(match.group(1)) - 1
4999+
suffix = f".{match.group(2)}"
5000+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, proj_id, suffix=suffix), data_torch)
5001+
return
5002+
5003+
if name == "vit_large_projector.weight":
5004+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch)
5005+
return
5006+
5007+
if name.startswith("vision_model."):
5008+
if name == "vision_model.positional_embedding":
5009+
name += ".weight"
5010+
elif name.endswith(".gamma") and ".ls_" in name:
5011+
name = name.removesuffix(".gamma") + ".weight"
5012+
5013+
name = name.replace("attn.in_proj_weight", "attn.in_proj.weight")
5014+
name = name.replace("attn.in_proj_bias", "attn.in_proj.bias")
5015+
5016+
yield from super().modify_tensors(data_torch, name, bid)
5017+
5018+
49525019
@ModelBase.register("Qwen3VLForConditionalGeneration")
49535020
class Qwen3VLTextModel(Qwen3Model):
49545021
model_arch = gguf.MODEL_ARCH.QWEN3VL
@@ -4969,6 +5036,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
49695036
yield from super().modify_tensors(data_torch, name, bid)
49705037

49715038

5039+
@ModelBase.register("StepVLForConditionalGeneration")
5040+
class Step3VLTextModel(Qwen3Model):
5041+
model_arch = gguf.MODEL_ARCH.QWEN3
5042+
5043+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5044+
if name.startswith("vision_model.") or name.startswith("model.vision_model.") or name.startswith("vit_large_projector."):
5045+
return
5046+
yield from super().modify_tensors(data_torch, name, bid)
5047+
5048+
49725049
@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
49735050
class Qwen3VLMoeTextModel(Qwen3MoeModel):
49745051
model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
@@ -12994,6 +13071,12 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
1299413071
# For non-hf Mamba and Mamba2 models
1299513072
arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
1299613073

13074+
# Step3-VL keeps text config under text_config but uses a custom top-level architecture.
13075+
# For text conversion we route to a dedicated text-only class.
13076+
# TODO: refactor this later to avoid adding exception here
13077+
if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
13078+
return arch
13079+
1299713080
# if "architectures" is found in the sub-config, use that instead
1299813081
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
1299913082
arch = text_config["architectures"][0]

examples/debug/debug.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <vector>
1010
#include <filesystem>
1111
#include <fstream>
12+
#include <optional>
1213
#include <regex>
1314

1415
static void print_usage(int /*argc*/, char ** argv) {
@@ -222,7 +223,10 @@ int main(int argc, char ** argv) {
222223
llama_backend_init();
223224
llama_numa_init(params.numa);
224225

225-
base_callback_data cb_data(params, params.tensor_filter);
226+
std::optional<base_callback_data> cb_data;
227+
if (!params.save_logits) {
228+
cb_data.emplace(params, params.tensor_filter);
229+
}
226230

227231
auto llama_init = common_init_from_params(params);
228232

ggml/src/ggml-cuda/common.cuh

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,19 +1157,6 @@ struct ggml_tensor_extra_gpu {
11571157
#define USE_CUDA_GRAPH
11581158
#endif
11591159

1160-
struct ggml_cuda_graph_node_properties {
1161-
void * node_data;
1162-
ggml_op node_op;
1163-
enum ggml_type node_type;
1164-
int32_t flags;
1165-
int64_t ne[GGML_MAX_DIMS];
1166-
size_t nb[GGML_MAX_DIMS];
1167-
void * src_data[GGML_MAX_SRC];
1168-
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
1169-
};
1170-
1171-
static_assert(std::is_trivial<ggml_cuda_graph_node_properties>::value, "ggml_cuda_graph_node_properties must be trivial");
1172-
11731160
struct ggml_cuda_graph {
11741161
#ifdef USE_CUDA_GRAPH
11751162
~ggml_cuda_graph() {
@@ -1186,13 +1173,7 @@ struct ggml_cuda_graph {
11861173
std::vector<cudaGraphNode_t> nodes;
11871174
bool disable_due_to_gpu_arch = false;
11881175
bool warmup_complete = false;
1189-
std::vector<ggml_cuda_graph_node_properties> props;
1190-
1191-
// these are extra tensors (inputs) that participate in the ggml graph but are not nodes
1192-
// they properties also have to match in order to be able to safely reuse a CUDA graph
1193-
// ref: https://github.com/ggml-org/llama.cpp/pull/18583
1194-
// ref: https://github.com/ggml-org/llama.cpp/pull/19165
1195-
std::vector<ggml_cuda_graph_node_properties> extra;
1176+
std::vector<ggml_tensor> nodes_copy;
11961177

11971178
bool is_enabled() const {
11981179
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

0 commit comments

Comments
 (0)