
Commit 37073bc

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cuda/mmq.cuh
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	scripts/sync-ggml.last
#	tests/test-backend-ops.cpp
#	tests/test-log.cpp

2 parents: ef79904 + 683c5ac

27 files changed

Lines changed: 1040 additions & 622 deletions

common/log.cpp

Lines changed: 11 additions & 8 deletions
@@ -49,7 +49,7 @@ enum common_log_col : int {
 };

 // disable colors by default
-static std::vector<const char *> g_col = {
+static const char * g_col[] = {
     "",
     "",
     "",
@@ -247,7 +247,6 @@ struct common_log {

             entries = std::move(new_entries);
         }
-
         cv.notify_one();
     }

@@ -265,7 +264,6 @@ struct common_log {
         {
             std::unique_lock<std::mutex> lock(mtx);
             cv.wait(lock, [this]() { return head != tail; });
-
             cur = entries[head];

             head = (head + 1) % entries.size();
@@ -301,7 +299,6 @@ struct common_log {

             tail = (tail + 1) % entries.size();
         }
-
         cv.notify_one();
     }

@@ -338,7 +335,7 @@ struct common_log {
            g_col[COMMON_LOG_COL_CYAN]  = LOG_COL_CYAN;
            g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
        } else {
-           for (size_t i = 0; i < g_col.size(); i++) {
+           for (size_t i = 0; i < std::size(g_col); i++) {
                g_col[i] = "";
            }
        }
@@ -368,14 +365,20 @@ struct common_log * common_log_init() {
 }

 struct common_log * common_log_main() {
-    static struct common_log log;
+    // We intentionally leak (i.e. do not delete) the logger singleton because
+    // common_log destructor called at DLL teardown phase will cause hanging on Windows.
+    // OS will release resources anyway so it should not be a significant issue,
+    // though this design may cause logs to be lost if not flushed before the program exits.
+    // Refer to https://github.com/ggml-org/llama.cpp/issues/22142 for details.
+    static struct common_log * log;
     static std::once_flag init_flag;
     std::call_once(init_flag, [&]() {
+        log = new common_log;
         // Set default to auto-detect colors
-        log.set_colors(tty_can_use_colors());
+        log->set_colors(tty_can_use_colors());
     });

-    return &log;
+    return log;
 }

 void common_log_pause(struct common_log * log) {
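
The change above replaces a function-local static object with a once-initialized, intentionally leaked heap allocation. A minimal standalone sketch of the same idiom, with illustrative names rather than the actual common_log API:

#include <mutex>

struct logger {
    // ... worker thread, ring buffer, etc.
};

// Sketch of the leaked-singleton idiom: the object is created exactly once
// and never deleted, so no destructor runs during static/DLL teardown.
logger * logger_main() {
    static logger * inst = nullptr;
    static std::once_flag init_flag;
    std::call_once(init_flag, []() {
        inst = new logger;  // intentionally leaked; the OS reclaims it at exit
    });
    return inst;
}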

common/log.h

Lines changed: 5 additions & 1 deletion
@@ -49,7 +49,11 @@ void common_log_default_callback(enum ggml_log_level level, const char * text, v
 struct common_log;

 struct common_log * common_log_init();
-struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+
+// Singleton, intentionally leaked to avoid Windows teardown hangs.
+// Call common_log_flush() before exit if you want to ensure all logs are flushed.
+struct common_log * common_log_main();
+
 void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
 void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
 void common_log_free  (struct common_log * log);
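
The new header comment implies a flush-before-exit pattern. A sketch of what that might look like, assuming common_log_flush() takes the log instance like the other common_log_* functions here (hypothetical usage, not verbatim from the codebase):

// Because the singleton is leaked, its destructor never runs, so buffered
// messages must be flushed explicitly before the process exits.
int main() {
    struct common_log * log = common_log_main();
    // ... emit log messages ...
    common_log_flush(log);  // assumed signature, mirroring common_log_pause/resume
    return 0;
}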

common/speculative.cpp

Lines changed: 6 additions & 6 deletions
@@ -467,7 +467,7 @@ struct common_speculative_state_draft : public common_speculative_state {

         prompt_dft.push_back(id_last);

-        LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
+        //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());

         int ret = llama_decode(ctx_dft, batch);
         if (ret != 0 && ret != 1) {
@@ -495,14 +495,14 @@ struct common_speculative_state_draft : public common_speculative_state {

             common_sampler_accept(smpl, id, true);

-            result.push_back(id);
-
-            if (sparams.n_max <= (int) result.size()) {
+            // only collect very high-confidence draft tokens
+            if (cur_p->data[0].p < sparams.p_min) {
                 break;
             }

-            // only collect very high-confidence draft tokens
-            if (cur_p->data[0].p < sparams.p_min) {
+            result.push_back(id);
+
+            if (sparams.n_max <= (int) result.size()) {
                 break;
             }

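
The reordering above matters because the confidence check now runs before the token is appended: a token whose top probability falls below p_min is rejected outright instead of being pushed and only breaking the loop afterwards. A minimal sketch of the corrected control flow, with simplified stand-in types in place of the sampler state:

#include <utility>
#include <vector>

// Sketch: collect draft tokens until either confidence drops below p_min
// or n_max tokens have been collected. Checking p_min *before* push_back
// keeps the low-confidence token out of the draft entirely.
std::vector<int> collect_draft(const std::vector<std::pair<int, float>> & candidates,
                               float p_min, int n_max) {
    std::vector<int> result;
    for (const auto & [id, p_top] : candidates) {
        if (p_top < p_min) {
            break;  // reject the low-confidence token; do not include it
        }
        result.push_back(id);
        if (n_max <= (int) result.size()) {
            break;
        }
    }
    return result;
}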

convert_hf_to_gguf.py

Lines changed: 28 additions & 2 deletions
@@ -728,6 +728,9 @@ def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_s

         del experts, merged

+    def _needs_nvfp4_processing(self) -> bool:
+        return True
+
     def prepare_tensors(self):
         # detect NVFP4 quantization (ModelOpt format)
         quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
@@ -758,7 +761,7 @@ def prepare_tensors(self):
         # NVFP4 weights are repacked and written directly to gguf_writer.
         # This must run before dequant_model so NVFP4 tensors are removed
         # from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
-        if self._is_nvfp4:
+        if self._is_nvfp4 and self._needs_nvfp4_processing():
             self._generate_nvfp4_tensors()

         self.dequant_model()
@@ -2190,6 +2193,10 @@ def __init__(self, *args, **kwargs):
         # merge configs
         self.preprocessor_config = {**self.preprocessor_config, **cfg}

+    def _needs_nvfp4_processing(self) -> bool:
+        # nvfp4 quantization applies to the text model only.
+        return False
+
     def get_vision_config(self) -> dict[str, Any] | None:
         config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
         return self.global_config.get(config_name)
@@ -4450,6 +4457,12 @@ def get_vision_config(self) -> dict[str, Any] | None:
         }
         return vision_config

+    def dequant_model(self):
+        if self._is_nvfp4:
+            # Skip nvfp4 quantization for vision/audio model.
+            return
+        super().dequant_model()
+
     def set_gguf_parameters(self):
         if "image_mean" not in self.preprocessor_config:
             self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
@@ -4473,6 +4486,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if "input_conditioner" in name:
             return

+        # mtmd does not support video yet so skip tensors related to video.
+        if "radio_model.model.patch_generator.video_embedder" in name:
+            return
+
         # RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it
         if "patch_generator.pos_embed" in name:
             if not name.endswith(".weight"):
@@ -10820,7 +10837,11 @@ def __init__(self, *args, **kwargs):
         # uses self.model_arch to build the tensor name map, and all MoE-specific
         # mappings would be missed if it were called with the default non-MoE arch.
         hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
-        if "num_experts_per_tok" in hparams:
+        has_moe_params = (
+            "num_experts_per_tok" in hparams
+            or (isinstance(hparams.get("llm_config"), dict) and "num_experts_per_tok" in hparams["llm_config"])
+        )
+        if has_moe_params:
             self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
             self.is_moe = True

@@ -10967,6 +10988,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if name.startswith(("vision_model.", "mlp1.")):
             return

+        if name.startswith(("sound_encoder.")):
+            return
+        if name.startswith(("sound_projection.")):
+            return
+
         # Strip language_model. prefix for VLM models (e.g., Nemotron Nano 12B v2 VL)
         if name.startswith("language_model."):
             name = name[len("language_model."):]

ggml/src/ggml-backend-meta.cpp

Lines changed: 18 additions & 1 deletion
@@ -1826,7 +1826,24 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
                 continue;
             }

-            i = get_i_delayed(i);
+            const int i_delayed = get_i_delayed(i);
+
+            // If we can delay the AllReduce we need to consider the interaction with zero-sized tensor slices.
+            // A backend with such a slice would normally have valid data after participating in the AllReduce with a node that has
+            // its compute flag disabled and thus gets its data zeroed out.
+            // If the AllReduce is delayed then the nodes until that point also need to have their compute flag disabled.
+            if (i_delayed > i) {
+                for (size_t j = 0; j < n_backends; j++) {
+                    auto & bcj = backend_ctx->backend_configs[j];
+                    if ((bcj.nodes[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+                        for (int ii = i + 1; ii <= i_delayed; ii++) {
+                            bcj.nodes[ii]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
+                        }
+                    }
+                }
+            }
+
+            i = i_delayed;

             for (size_t j = 0; j < n_backends; j++) {
                 auto & bcj = backend_ctx->backend_configs[j];
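
The core mechanism in the hunk above is propagating a cleared compute flag forward across the delayed range. A reduced sketch of just that flag propagation, with hypothetical stand-in types (the real code walks backend_ctx->backend_configs per backend):

#include <cstdint>
#include <vector>

constexpr uint32_t FLAG_COMPUTE = 1u << 0;  // stand-in for GGML_TENSOR_FLAG_COMPUTE

// Sketch: if node i has compute disabled and the AllReduce is delayed until
// i_delayed, every node in (i, i_delayed] must have compute disabled too,
// since none of them gets refreshed by the AllReduce before that point.
void propagate_disabled_compute(std::vector<uint32_t> & node_flags, int i, int i_delayed) {
    if ((node_flags[i] & FLAG_COMPUTE) == 0) {
        for (int ii = i + 1; ii <= i_delayed; ii++) {
            node_flags[ii] &= ~FLAG_COMPUTE;
        }
    }
}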

ggml/src/ggml-cpu/arch/arm/repack.cpp

Lines changed: 65 additions & 0 deletions
@@ -4867,6 +4867,71 @@ void ggml_gemm_q8_0_4x8_q8_0(int n,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);

+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (svcntb() * 8 == 256) {
+        const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
+
+        static const uint32_t idx_arr[8] = {0, 1, 4, 5, 2, 3, 6, 7};
+        svuint32_t idx = svld1(svptrue_b32(), idx_arr);
+        static const uint32_t idx_arr1[8] = {0, 1, 2, 3, 1, 2, 3, 0};
+        svuint32_t idx_sc1 = svld1(svptrue_b32(), idx_arr1);
+        static const uint32_t idx_arr2[8] = {0, 1, 2, 3, 0, 1, 2, 3};
+        svuint32_t idx_sc2 = svld1(svptrue_b32(), idx_arr2);
+
+        for (int y = 0; y < nr; y += 4) {
+            const block_q8_0x4 * a_ptr_base = (const block_q8_0x4 *) vy + (y / 4) * nb;
+
+            for (int x = 0; x < nc; x += ncols_interleaved) {
+                const block_q8_0x4 * b_ptr = b_ptr_base + (x / 4) * nb;
+                const block_q8_0x4 * a_ptr = a_ptr_base;
+
+                svfloat32_t acc_f32_01 = svdup_f32(0);
+                svfloat32_t acc_f32_23 = svdup_f32(0);
+
+                for (int b = 0; b < nb; b++) {
+
+                    svint32_t acc_01 = svdup_s32(0);
+                    svint32_t acc_23 = svdup_s32(0);
+
+                    // Process 4 chunks of 8 positions each
+                    for (int chunk = 0; chunk < 4; chunk++) {
+                        svint8_t s_a01 = svld1rq_s8(svptrue_b8(), a_ptr->qs + chunk * 32);
+                        svint8_t s_a23 = svld1rq_s8(svptrue_b8(), a_ptr->qs + chunk * 32 + 16);
+                        svint8_t s_b0123 = svld1_s8(svptrue_b8(), b_ptr->qs + chunk * 32);
+
+                        acc_01 = svmmla_s32(acc_01, s_a01, s_b0123);
+                        acc_23 = svmmla_s32(acc_23, s_a23, s_b0123);
+                    }
+
+                    // Reorder outputs from 2×2 tiles to row-major
+                    // acc[01] = [r0c0, r0c1, r1c0, r1c1, r0c2, r0c3, r1c2, r1c3]
+                    // acc[23] = [r2c0, r2c1, r3c0, r3c1, r2c2, r2c3, r3c2, r3c3]
+
+                    svint32_t row01 = svtbl_s32(acc_01, idx);
+                    svint32_t row23 = svtbl_s32(acc_23, idx);
+
+                    svfloat16_t temp1 = svld1_f16(svptrue_pat_b16(SV_VL4), (const __fp16 *) a_ptr->d);
+                    svfloat16_t temp2 = svld1_f16(svptrue_pat_b16(SV_VL4), (const __fp16 *) b_ptr->d);
+                    svfloat32_t sv_a_d = svtbl_f32(svcvt_f32_f16_x(svptrue_b32(), svzip1_f16(temp1, temp1)), idx_sc1);
+                    svfloat32_t sv_b_d = svtbl_f32(svcvt_f32_f16_x(svptrue_b32(), svzip1_f16(temp2, temp2)), idx_sc2);
+
+                    acc_f32_01 = svmla_f32_x(svptrue_b32(), acc_f32_01, svcvt_f32_s32_x(svptrue_b32(), row01), svmul_lane_f32(sv_b_d, sv_a_d, 0));
+                    acc_f32_23 = svmla_f32_x(svptrue_b32(), acc_f32_23, svcvt_f32_s32_x(svptrue_b32(), row23), svmul_lane_f32(sv_b_d, sv_a_d, 2));
+                    a_ptr++;
+                    b_ptr++;
+                }
+
+                svbool_t pg4 = svptrue_pat_b32(SV_VL4);
+                svst1_f32(pg4, s + (y+0) * bs + x, acc_f32_01);
+                svst1_f32(pg4, s + (y+1) * bs + x, svext_f32(acc_f32_01, acc_f32_01, 4));
+                svst1_f32(pg4, s + (y+2) * bs + x, acc_f32_23);
+                svst1_f32(pg4, s + (y+3) * bs + x, svext_f32(acc_f32_23, acc_f32_23, 4));
+            }
+        }
+        return;
+    }
+#endif // SVE compile-time end

 #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
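
The svtbl index table {0, 1, 4, 5, 2, 3, 6, 7} is what converts svmmla's 2×2-tile output into row-major order. A plain scalar C++ sketch of that permutation, independent of SVE, just to make the layout change concrete:

#include <array>
#include <cstdio>

int main() {
    // svmmla produces two rows as interleaved 2x2 tiles:
    // [r0c0, r0c1, r1c0, r1c1, r0c2, r0c3, r1c2, r1c3]
    std::array<int, 8> tiled = {0, 1, 10, 11, 2, 3, 12, 13};  // values encode row*10 + col
    constexpr std::array<int, 8> idx = {0, 1, 4, 5, 2, 3, 6, 7};

    std::array<int, 8> row_major{};
    for (int i = 0; i < 8; i++) {
        row_major[i] = tiled[idx[i]];  // the same per-lane gather svtbl performs
    }
    // row_major is now [r0c0, r0c1, r0c2, r0c3, r1c0, r1c1, r1c2, r1c3]
    for (int v : row_major) {
        printf("%d ", v);
    }
    printf("\n");  // prints: 0 1 2 3 10 11 12 13
}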

ggml/src/ggml-cpu/llamafile/sgemm.cpp

Lines changed: 8 additions & 1 deletion
@@ -2321,6 +2321,9 @@ class tinyBLAS_Q0_PPC {
     }

     void matmul(int64_t m, int64_t n) {
+#if defined(_AIX) || defined(__BIG_ENDIAN__)
+        mnpack(0, m, 0, n);
+#else
         const int64_t mc = 64;
         const int64_t kc = 64;
         int64_t nc = 64;
@@ -2334,7 +2337,6 @@ class tinyBLAS_Q0_PPC {
         } else {
             n_aligned = (n / 64) * 64;
         }
-
         if (n_aligned > 0) {
             if (n_aligned % 64 == 0) nc = 64;
             else if (n_aligned == n) nc = n;
@@ -2352,6 +2354,7 @@ class tinyBLAS_Q0_PPC {
         } else {
             mnpack(0, m, 0, n);
         }
+#endif
     }

   private:
@@ -3191,12 +3194,16 @@ class tinyBLAS_PPC {
     }

     void matmul(int64_t m, int64_t n) {
+#if defined(_AIX) || defined(__BIG_ENDIAN__)
+        mnpack(0, m, 0, n);
+#else
         int64_t mc = 256; int64_t nc = 256; int64_t kc = 256;
         if (m % mc == 0 && n % nc == 0 && k % kc == 0) {
             matmul_tiled(m, n, mc, nc, kc);
         } else {
             mnpack(0, m, 0, n);
         }
+#endif
     }

   private:
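
The change above routes AIX/big-endian builds straight to the generic mnpack() path, bypassing the cache-blocked tiled kernel. A self-contained sketch of this dispatch shape, with stub kernels standing in for the tinyBLAS internals (hypothetical names, not the real API):

#include <cstdint>
#include <cstdio>

static void generic_kernel(int64_t m, int64_t n) { printf("generic %lld x %lld\n", (long long) m, (long long) n); }
static void tiled_kernel(int64_t m, int64_t n)   { printf("tiled %lld x %lld\n",   (long long) m, (long long) n); }

static void matmul(int64_t m, int64_t n, int64_t k) {
#if defined(_AIX) || defined(__BIG_ENDIAN__)
    // The cache-blocked tiled path is little-endian only here; on AIX or
    // big-endian targets everything goes through the generic path.
    (void) k;
    generic_kernel(m, n);
#else
    const int64_t mc = 256, nc = 256, kc = 256;
    if (m % mc == 0 && n % nc == 0 && k % kc == 0) {
        tiled_kernel(m, n);  // cache-blocked fast path for exact multiples
    } else {
        generic_kernel(m, n);
    }
#endif
}

int main() {
    matmul(512, 512, 512);  // tiled on little-endian builds
    matmul(100, 100, 100);  // generic fallback
}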

ggml/src/ggml-cuda/common.cuh

Lines changed: 12 additions & 0 deletions
@@ -837,6 +837,18 @@ static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
 #endif // defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
 }

+static __device__ __forceinline__ uint8_t ggml_cuda_fp32_to_ue4m3(float x) {
+#if defined(BLACKWELL_MMA_AVAILABLE) // This is used for NVFP4 subblock scale quantizations only
+    if (!(x > 0.0f)) {
+        return 0;
+    }
+    const __nv_fp8_e4m3 xf(x);
+    return xf.__x;
+#else
+    NO_DEVICE_CODE; // Used only for NVFP4 Scales for Activations, only for Blackwell
+#endif // defined(BLACKWELL_MMA_AVAILABLE)
+}
+
 __device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
     const uint8_t sign_bit = (x < 0.0f) << 3;
     float ax = fabsf(x) * e;
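
One subtlety in the new conversion helper: the guard is written as !(x > 0.0f) rather than x <= 0.0f, so it also catches NaN (any ordered comparison with NaN is false). A tiny host-side C++ check of that predicate (illustration only; the real function is device code):

#include <cmath>
#include <cstdio>

// !(x > 0.0f) is true for 0, negatives, and NaN, so all of them map to an
// encoded scale of 0 before the fp8 e4m3 cast is ever attempted.
static bool maps_to_zero(float x) {
    return !(x > 0.0f);
}

int main() {
    printf("%d\n", maps_to_zero(1.5f));   // 0: positive values are converted
    printf("%d\n", maps_to_zero(0.0f));   // 1
    printf("%d\n", maps_to_zero(-2.0f));  // 1
    printf("%d\n", maps_to_zero(NAN));    // 1: NaN fails the > comparison
}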

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 14 additions & 1 deletion
@@ -66,6 +66,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 32, 128, 128, 128, 2,  true);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 32, 128, 128, 128, 2,  true);

+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 32, 128, 128, 128, 1, false);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1, 32, 128, 128, 128, 1, false);
+
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512,  8,  64, 4, 32, 256, 256, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16,  64, 4, 32, 256, 256, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
@@ -85,6 +88,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 64, 128, 128,  64, 2,  true);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 64, 128, 128,  64, 2,  true);

+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 32, 128, 128, 128, 1, false);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1, 32, 128, 128, 128, 1, false);
+
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512,  8,  64, 4, 32,  96,  64, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16,  64, 4, 32,  96,  64, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
@@ -118,6 +124,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 64, 128, 128,  64, 2,  true);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 64, 128, 128,  64, 2,  true);

+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 64, 160, 128,  64, 2,  true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 128, 2, 64, 160, 128,  64, 2, false);
+
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16,  64, 4, 32, 128, 128, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 64, 256, 1, 32, 128, 128, 128, 1, false);
@@ -1217,7 +1226,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
     float KQ_max_scale[cols_per_thread];
 #pragma unroll
     for (int col = 0; col < cols_per_thread; ++col) {
-        const int jc = cols_per_warp == 8 ? T_C_KQ::get_j(col) : T_C_KQ::get_i(2*col);
+        const int jc = (threadIdx.y/np)*cols_per_warp + (cols_per_warp == 8 ? T_C_KQ::get_j(col) : T_C_KQ::get_i(2*col));
         const float sink = sinks_f[jc % ncols2];

         const float KQ_max_new = fmaxf(KQ_max[col], sink);
@@ -1825,6 +1834,10 @@ extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);

+// Mistral Small 4 (DKQ=320, DV=256), GQA=32-only build:
+extern DECL_FATTN_MMA_F16_CASE(320, 256, 1, 32);
+extern DECL_FATTN_MMA_F16_CASE(320, 256, 2, 32);
+
 // For GLM 4.7 Flash
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
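
The one-line fix in the attention-sink hunk adds the warp-group offset to the in-tile column index; without it, every warp group read the sink values belonging to the first group's columns. A scalar sketch of the corrected index arithmetic (simplified; in the real kernel the in-tile index comes from the mma tile accessors):

#include <cstdio>

// Sketch: jc = base column of this warp's group, plus the column within the
// warp's tile. Omitting the first term made all warp groups alias group 0.
static int compute_jc(int thread_y, int np, int cols_per_warp, int col_in_tile) {
    return (thread_y / np) * cols_per_warp + col_in_tile;
}

int main() {
    const int np = 2, cols_per_warp = 8;
    for (int thread_y = 0; thread_y < 4; thread_y++) {
        // warps {0,1} share column base 0; warps {2,3} share column base 8
        printf("threadIdx.y=%d -> jc base %d\n", thread_y, compute_jc(thread_y, np, cols_per_warp, 0));
    }
}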
