
Commit 37073bc

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cuda/mmq.cuh
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	scripts/sync-ggml.last
#	tests/test-backend-ops.cpp
#	tests/test-log.cpp

2 parents: ef79904 + 683c5ac

27 files changed

Lines changed: 1040 additions & 622 deletions

common/log.cpp

Lines changed: 11 additions & 8 deletions
@@ -49,7 +49,7 @@ enum common_log_col : int {
 };

 // disable colors by default
-static std::vector<const char *> g_col = {
+static const char * g_col[] = {
     "",
     "",
     "",
@@ -247,7 +247,6 @@ struct common_log {

             entries = std::move(new_entries);
         }
-
         cv.notify_one();
     }

@@ -265,7 +264,6 @@ struct common_log {
         {
             std::unique_lock<std::mutex> lock(mtx);
             cv.wait(lock, [this]() { return head != tail; });
-
             cur = entries[head];

             head = (head + 1) % entries.size();
@@ -301,7 +299,6 @@ struct common_log {

             tail = (tail + 1) % entries.size();
         }
-
         cv.notify_one();
     }

@@ -338,7 +335,7 @@ struct common_log {
            g_col[COMMON_LOG_COL_CYAN]  = LOG_COL_CYAN;
            g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
        } else {
-           for (size_t i = 0; i < g_col.size(); i++) {
+           for (size_t i = 0; i < std::size(g_col); i++) {
                g_col[i] = "";
            }
        }
@@ -368,14 +365,20 @@ struct common_log * common_log_init() {
 }

 struct common_log * common_log_main() {
-    static struct common_log log;
+    // We intentionally leak (i.e. do not delete) the logger singleton because
+    // common_log destructor called at DLL teardown phase will cause hanging on Windows.
+    // OS will release resources anyway so it should not be a significant issue,
+    // though this design may cause logs to be lost if not flushed before the program exits.
+    // Refer to https://github.com/ggml-org/llama.cpp/issues/22142 for details.
+    static struct common_log * log;
     static std::once_flag init_flag;
     std::call_once(init_flag, [&]() {
+        log = new common_log;
         // Set default to auto-detect colors
-        log.set_colors(tty_can_use_colors());
+        log->set_colors(tty_can_use_colors());
     });

-    return &log;
+    return log;
 }

 void common_log_pause(struct common_log * log) {
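
The change above replaces a function-local static object with a once-initialized, intentionally leaked heap allocation. A minimal standalone sketch of the same idiom, with illustrative names rather than the actual common_log API:

#include <mutex>

struct logger {
    // ... worker thread, ring buffer, etc.
};

// Sketch of the leaked-singleton idiom: the object is created exactly once
// and never deleted, so no destructor runs during static/DLL teardown.
logger * logger_main() {
    static logger * inst = nullptr;
    static std::once_flag init_flag;
    std::call_once(init_flag, []() {
        inst = new logger;  // intentionally leaked; the OS reclaims it at exit
    });
    return inst;
}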

common/log.h

Lines changed: 5 additions & 1 deletion
@@ -49,7 +49,11 @@ void common_log_default_callback(enum ggml_log_level level, const char * text, v
 struct common_log;

 struct common_log * common_log_init();
-struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+
+// Singleton, intentionally leaked to avoid Windows teardown hangs.
+// Call common_log_flush() before exit if you want to ensure all logs are flushed.
+struct common_log * common_log_main();
+
 void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
 void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
 void common_log_free  (struct common_log * log);
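
The new header comment implies a flush-before-exit pattern. A sketch of what that might look like, assuming common_log_flush() takes the log instance like the other common_log_* functions here (hypothetical usage, not verbatim from the codebase):

// Because the singleton is leaked, its destructor never runs, so buffered
// messages must be flushed explicitly before the process exits.
int main() {
    struct common_log * log = common_log_main();
    // ... emit log messages ...
    common_log_flush(log);  // assumed signature, mirroring common_log_pause/resume
    return 0;
}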

common/speculative.cpp

Lines changed: 6 additions & 6 deletions
@@ -467,7 +467,7 @@ struct common_speculative_state_draft : public common_speculative_state {

         prompt_dft.push_back(id_last);

-        LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
+        //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());

         int ret = llama_decode(ctx_dft, batch);
         if (ret != 0 && ret != 1) {
@@ -495,14 +495,14 @@ struct common_speculative_state_draft : public common_speculative_state {

             common_sampler_accept(smpl, id, true);

-            result.push_back(id);
-
-            if (sparams.n_max <= (int) result.size()) {
+            // only collect very high-confidence draft tokens
+            if (cur_p->data[0].p < sparams.p_min) {
                 break;
             }

-            // only collect very high-confidence draft tokens
-            if (cur_p->data[0].p < sparams.p_min) {
+            result.push_back(id);
+
+            if (sparams.n_max <= (int) result.size()) {
                 break;
             }

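
The reordering above matters because the confidence check now runs before the token is appended: a token whose top probability falls below p_min is rejected outright instead of being pushed and only breaking the loop afterwards. A minimal sketch of the corrected control flow, with simplified stand-in types in place of the sampler state:

#include <utility>
#include <vector>

// Sketch: collect draft tokens until either confidence drops below p_min
// or n_max tokens have been collected. Checking p_min *before* push_back
// keeps the low-confidence token out of the draft entirely.
std::vector<int> collect_draft(const std::vector<std::pair<int, float>> & candidates,
                               float p_min, int n_max) {
    std::vector<int> result;
    for (const auto & [id, p_top] : candidates) {
        if (p_top < p_min) {
            break;  // reject the low-confidence token; do not include it
        }
        result.push_back(id);
        if (n_max <= (int) result.size()) {
            break;
        }
    }
    return result;
}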

convert_hf_to_gguf.py

Lines changed: 28 additions & 2 deletions
@@ -728,6 +728,9 @@ def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_s

         del experts, merged

+    def _needs_nvfp4_processing(self) -> bool:
+        return True
+
     def prepare_tensors(self):
         # detect NVFP4 quantization (ModelOpt format)
         quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
@@ -758,7 +761,7 @@ def prepare_tensors(self):
         # NVFP4 weights are repacked and written directly to gguf_writer.
         # This must run before dequant_model so NVFP4 tensors are removed
         # from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
-        if self._is_nvfp4:
+        if self._is_nvfp4 and self._needs_nvfp4_processing():
             self._generate_nvfp4_tensors()

         self.dequant_model()
@@ -2190,6 +2193,10 @@ def __init__(self, *args, **kwargs):
         # merge configs
         self.preprocessor_config = {**self.preprocessor_config, **cfg}

+    def _needs_nvfp4_processing(self) -> bool:
+        # nvfp4 quantization applies to the text model only.
+        return False
+
     def get_vision_config(self) -> dict[str, Any] | None:
         config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
         return self.global_config.get(config_name)
@@ -4450,6 +4457,12 @@ def get_vision_config(self) -> dict[str, Any] | None:
         }
         return vision_config

+    def dequant_model(self):
+        if self._is_nvfp4:
+            # Skip nvfp4 quantization for vision/audio model.
+            return
+        super().dequant_model()
+
     def set_gguf_parameters(self):
         if "image_mean" not in self.preprocessor_config:
             self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
@@ -4473,6 +4486,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if "input_conditioner" in name:
             return

+        # mtmd does not support video yet so skip tensors related to video.
+        if "radio_model.model.patch_generator.video_embedder" in name:
+            return
+
         # RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it
         if "patch_generator.pos_embed" in name:
             if not name.endswith(".weight"):
@@ -10820,7 +10837,11 @@ def __init__(self, *args, **kwargs):
         # uses self.model_arch to build the tensor name map, and all MoE-specific
         # mappings would be missed if it were called with the default non-MoE arch.
         hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
-        if "num_experts_per_tok" in hparams:
+        has_moe_params = (
+            "num_experts_per_tok" in hparams
+            or (isinstance(hparams.get("llm_config"), dict) and "num_experts_per_tok" in hparams["llm_config"])
+        )
+        if has_moe_params:
             self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
             self.is_moe = True

@@ -10967,6 +10988,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if name.startswith(("vision_model.", "mlp1.")):
             return

+        if name.startswith(("sound_encoder.")):
+            return
+        if name.startswith(("sound_projection.")):
+            return
+
         # Strip language_model. prefix for VLM models (e.g., Nemotron Nano 12B v2 VL)
         if name.startswith("language_model."):
             name = name[len("language_model."):]

ggml/src/ggml-backend-meta.cpp

Lines changed: 18 additions & 1 deletion
@@ -1826,7 +1826,24 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
                 continue;
             }

-            i = get_i_delayed(i);
+            const int i_delayed = get_i_delayed(i);
+
+            // If we can delay the AllReduce we need to consider the interaction with zero-sized tensor slices.
+            // A backend with such a slice would normally have valid data after participating in the AllReduce with a node that has
+            // its compute flag disabled and thus gets its data zeroed out.
+            // If the AllReduce is delayed then the nodes until that point also need to have their compute flag disabled.
+            if (i_delayed > i) {
+                for (size_t j = 0; j < n_backends; j++) {
+                    auto & bcj = backend_ctx->backend_configs[j];
+                    if ((bcj.nodes[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+                        for (int ii = i + 1; ii <= i_delayed; ii++) {
+                            bcj.nodes[ii]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
+                        }
+                    }
+                }
+            }
+
+            i = i_delayed;

             for (size_t j = 0; j < n_backends; j++) {
                 auto & bcj = backend_ctx->backend_configs[j];
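
The core mechanism in the hunk above is propagating a cleared compute flag forward across the delayed range. A reduced sketch of just that flag propagation, with hypothetical stand-in types (the real code walks backend_ctx->backend_configs per backend):

#include <cstdint>
#include <vector>

constexpr uint32_t FLAG_COMPUTE = 1u << 0;  // stand-in for GGML_TENSOR_FLAG_COMPUTE

// Sketch: if node i has compute disabled and the AllReduce is delayed until
// i_delayed, every node in (i, i_delayed] must have compute disabled too,
// since none of them gets refreshed by the AllReduce before that point.
void propagate_disabled_compute(std::vector<uint32_t> & node_flags, int i, int i_delayed) {
    if ((node_flags[i] & FLAG_COMPUTE) == 0) {
        for (int ii = i + 1; ii <= i_delayed; ii++) {
            node_flags[ii] &= ~FLAG_COMPUTE;
        }
    }
}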

ggml/src/ggml-cpu/arch/arm/repack.cpp

Lines changed: 65 additions & 0 deletions
@@ -4867,6 +4867,71 @@ void ggml_gemm_q8_0_4x8_q8_0(int n,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);

+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (svcntb() * 8 == 256) {
+        const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
+
+        static const uint32_t idx_arr[8] = {0, 1, 4, 5, 2, 3, 6, 7};
+        svuint32_t idx = svld1(svptrue_b32(), idx_arr);
+        static const uint32_t idx_arr1[8] = {0, 1, 2, 3, 1, 2, 3, 0};
+        svuint32_t idx_sc1 = svld1(svptrue_b32(), idx_arr1);
+        static const uint32_t idx_arr2[8] = {0, 1, 2, 3, 0, 1, 2, 3};
+        svuint32_t idx_sc2 = svld1(svptrue_b32(), idx_arr2);
+
+        for (int y = 0; y < nr; y += 4) {
+            const block_q8_0x4 * a_ptr_base = (const block_q8_0x4 *) vy + (y / 4) * nb;
+
+            for (int x = 0; x < nc; x += ncols_interleaved) {
+                const block_q8_0x4 * b_ptr = b_ptr_base + (x / 4) * nb;
+                const block_q8_0x4 * a_ptr = a_ptr_base;
+
+                svfloat32_t acc_f32_01 = svdup_f32(0);
+                svfloat32_t acc_f32_23 = svdup_f32(0);
+
+                for (int b = 0; b < nb; b++) {
+
+                    svint32_t acc_01 = svdup_s32(0);
+                    svint32_t acc_23 = svdup_s32(0);
+
+                    // Process 4 chunks of 8 positions each
+                    for (int chunk = 0; chunk < 4; chunk++) {
+                        svint8_t s_a01 = svld1rq_s8(svptrue_b8(), a_ptr->qs + chunk * 32);
+                        svint8_t s_a23 = svld1rq_s8(svptrue_b8(), a_ptr->qs + chunk * 32 + 16);
+                        svint8_t s_b0123 = svld1_s8(svptrue_b8(), b_ptr->qs + chunk * 32);
+
+                        acc_01 = svmmla_s32(acc_01, s_a01, s_b0123);
+                        acc_23 = svmmla_s32(acc_23, s_a23, s_b0123);
+                    }
+
+                    // Reorder outputs from 2×2 tiles to row-major
+                    // acc[01] = [r0c0, r0c1, r1c0, r1c1, r0c2, r0c3, r1c2, r1c3]
+                    // acc[23] = [r2c0, r2c1, r3c0, r3c1, r2c2, r2c3, r3c2, r3c3]
+
+                    svint32_t row01 = svtbl_s32(acc_01, idx);
+                    svint32_t row23 = svtbl_s32(acc_23, idx);
+
+                    svfloat16_t temp1 = svld1_f16(svptrue_pat_b16(SV_VL4), (const __fp16 *) a_ptr->d);
+                    svfloat16_t temp2 = svld1_f16(svptrue_pat_b16(SV_VL4), (const __fp16 *) b_ptr->d);
+                    svfloat32_t sv_a_d = svtbl_f32(svcvt_f32_f16_x(svptrue_b32(), svzip1_f16(temp1, temp1)), idx_sc1);
+                    svfloat32_t sv_b_d = svtbl_f32(svcvt_f32_f16_x(svptrue_b32(), svzip1_f16(temp2, temp2)), idx_sc2);
+
+                    acc_f32_01 = svmla_f32_x(svptrue_b32(), acc_f32_01, svcvt_f32_s32_x(svptrue_b32(), row01), svmul_lane_f32(sv_b_d, sv_a_d, 0));
+                    acc_f32_23 = svmla_f32_x(svptrue_b32(), acc_f32_23, svcvt_f32_s32_x(svptrue_b32(), row23), svmul_lane_f32(sv_b_d, sv_a_d, 2));
+                    a_ptr++;
+                    b_ptr++;
+                }
+
+                svbool_t pg4 = svptrue_pat_b32(SV_VL4);
+                svst1_f32(pg4, s + (y+0) * bs + x, acc_f32_01);
+                svst1_f32(pg4, s + (y+1) * bs + x, svext_f32(acc_f32_01, acc_f32_01, 4));
+                svst1_f32(pg4, s + (y+2) * bs + x, acc_f32_23);
+                svst1_f32(pg4, s + (y+3) * bs + x, svext_f32(acc_f32_23, acc_f32_23, 4));
+            }
+        }
+        return;
+    }
+#endif // SVE compile-time end

 #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
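
The svtbl index table {0, 1, 4, 5, 2, 3, 6, 7} is what converts svmmla's 2×2-tile output into row-major order. A plain scalar C++ sketch of that permutation, independent of SVE, just to make the layout change concrete:

#include <array>
#include <cstdio>

int main() {
    // svmmla produces two rows as interleaved 2x2 tiles:
    // [r0c0, r0c1, r1c0, r1c1, r0c2, r0c3, r1c2, r1c3]
    std::array<int, 8> tiled = {0, 1, 10, 11, 2, 3, 12, 13};  // values encode row*10 + col
    constexpr std::array<int, 8> idx = {0, 1, 4, 5, 2, 3, 6, 7};

    std::array<int, 8> row_major{};
    for (int i = 0; i < 8; i++) {
        row_major[i] = tiled[idx[i]];  // the same per-lane gather svtbl performs
    }
    // row_major is now [r0c0, r0c1, r0c2, r0c3, r1c0, r1c1, r1c2, r1c3]
    for (int v : row_major) {
        printf("%d ", v);
    }
    printf("\n");  // prints: 0 1 2 3 10 11 12 13
}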

ggml/src/ggml-cpu/llamafile/sgemm.cpp

Lines changed: 8 additions & 1 deletion
@@ -2321,6 +2321,9 @@ class tinyBLAS_Q0_PPC {
     }

     void matmul(int64_t m, int64_t n) {
+#if defined(_AIX) || defined(__BIG_ENDIAN__)
+        mnpack(0, m, 0, n);
+#else
         const int64_t mc = 64;
         const int64_t kc = 64;
         int64_t nc = 64;
@@ -2334,7 +2337,6 @@ class tinyBLAS_Q0_PPC {
         } else {
             n_aligned = (n / 64) * 64;
         }
-
         if (n_aligned > 0) {
             if (n_aligned % 64 == 0) nc = 64;
             else if (n_aligned == n) nc = n;
@@ -2352,6 +2354,7 @@ class tinyBLAS_Q0_PPC {
         } else {
             mnpack(0, m, 0, n);
         }
+#endif
     }

   private:
@@ -3191,12 +3194,16 @@ class tinyBLAS_PPC {
     }

     void matmul(int64_t m, int64_t n) {
+#if defined(_AIX) || defined(__BIG_ENDIAN__)
+        mnpack(0, m, 0, n);
+#else
         int64_t mc = 256; int64_t nc = 256; int64_t kc = 256;
         if (m % mc == 0 && n % nc == 0 && k % kc == 0) {
             matmul_tiled(m, n, mc, nc, kc);
         } else {
             mnpack(0, m, 0, n);
         }
+#endif
     }

   private:
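
The change above routes AIX/big-endian builds straight to the generic mnpack() path, bypassing the cache-blocked tiled kernel. A self-contained sketch of this dispatch shape, with stub kernels standing in for the tinyBLAS internals (hypothetical names, not the real API):

#include <cstdint>
#include <cstdio>

static void generic_kernel(int64_t m, int64_t n) { printf("generic %lld x %lld\n", (long long) m, (long long) n); }
static void tiled_kernel(int64_t m, int64_t n)   { printf("tiled %lld x %lld\n",   (long long) m, (long long) n); }

static void matmul(int64_t m, int64_t n, int64_t k) {
#if defined(_AIX) || defined(__BIG_ENDIAN__)
    // The cache-blocked tiled path is little-endian only here; on AIX or
    // big-endian targets everything goes through the generic path.
    (void) k;
    generic_kernel(m, n);
#else
    const int64_t mc = 256, nc = 256, kc = 256;
    if (m % mc == 0 && n % nc == 0 && k % kc == 0) {
        tiled_kernel(m, n);  // cache-blocked fast path for exact multiples
    } else {
        generic_kernel(m, n);
    }
#endif
}

int main() {
    matmul(512, 512, 512);  // tiled on little-endian builds
    matmul(100, 100, 100);  // generic fallback
}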

ggml/src/ggml-cuda/common.cuh

Lines changed: 12 additions & 0 deletions
@@ -837,6 +837,18 @@ static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
 #endif // defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
 }

+static __device__ __forceinline__ uint8_t ggml_cuda_fp32_to_ue4m3(float x) {
+#if defined(BLACKWELL_MMA_AVAILABLE) // This is used for NVFP4 subblock scale quantizations only
+    if (!(x > 0.0f)) {
+        return 0;
+    }
+    const __nv_fp8_e4m3 xf(x);
+    return xf.__x;
+#else
+    NO_DEVICE_CODE; // Used only for NVFP4 Scales for Activations, only for Blackwell
+#endif // defined(BLACKWELL_MMA_AVAILABLE)
+}
+
 __device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
     const uint8_t sign_bit = (x < 0.0f) << 3;
     float ax = fabsf(x) * e;
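
One subtlety in the new conversion helper: the guard is written as !(x > 0.0f) rather than x <= 0.0f, so it also catches NaN (any ordered comparison with NaN is false). A tiny host-side C++ check of that predicate (illustration only; the real function is device code):

#include <cmath>
#include <cstdio>

// !(x > 0.0f) is true for 0, negatives, and NaN, so all of them map to an
// encoded scale of 0 before the fp8 e4m3 cast is ever attempted.
static bool maps_to_zero(float x) {
    return !(x > 0.0f);
}

int main() {
    printf("%d\n", maps_to_zero(1.5f));   // 0: positive values are converted
    printf("%d\n", maps_to_zero(0.0f));   // 1
    printf("%d\n", maps_to_zero(-2.0f));  // 1
    printf("%d\n", maps_to_zero(NAN));    // 1: NaN fails the > comparison
}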

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 14 additions & 1 deletion
@@ -66,6 +66,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 32, 128, 128, 128, 2,  true);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 32, 128, 128, 128, 2,  true);

+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 32, 128, 128, 128, 1, false);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1, 32, 128, 128, 128, 1, false);
+
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512,  8,  64, 4, 32, 256, 256, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16,  64, 4, 32, 256, 256, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
@@ -85,6 +88,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 64, 128, 128,  64, 2,  true);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 64, 128, 128,  64, 2,  true);

+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 32, 128, 128, 128, 1, false);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1, 32, 128, 128, 128, 1, false);
+
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512,  8,  64, 4, 32,  96,  64, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16,  64, 4, 32,  96,  64, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
@@ -118,6 +124,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 64, 128, 128,  64, 2,  true);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 64, 128, 128,  64, 2,  true);

+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 64, 160, 128,  64, 2,  true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 128, 2, 64, 160, 128,  64, 2, false);
+
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16,  64, 4, 32, 128, 128, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 64, 256, 1, 32, 128, 128, 128, 1, false);
@@ -1217,7 +1226,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
     float KQ_max_scale[cols_per_thread];
 #pragma unroll
     for (int col = 0; col < cols_per_thread; ++col) {
-        const int jc = cols_per_warp == 8 ? T_C_KQ::get_j(col) : T_C_KQ::get_i(2*col);
+        const int jc = (threadIdx.y/np)*cols_per_warp + (cols_per_warp == 8 ? T_C_KQ::get_j(col) : T_C_KQ::get_i(2*col));
         const float sink = sinks_f[jc % ncols2];

         const float KQ_max_new = fmaxf(KQ_max[col], sink);
@@ -1825,6 +1834,10 @@ extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);

+// Mistral Small 4 (DKQ=320, DV=256), GQA=32-only build:
+extern DECL_FATTN_MMA_F16_CASE(320, 256, 1, 32);
+extern DECL_FATTN_MMA_F16_CASE(320, 256, 2, 32);
+
 // For GLM 4.7 Flash
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
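
The one-line fix in the attention-sink hunk adds the warp-group offset to the in-tile column index; without it, every warp group read the sink values belonging to the first group's columns. A scalar sketch of the corrected index arithmetic (simplified; in the real kernel the in-tile index comes from the mma tile accessors):

#include <cstdio>

// Sketch: jc = base column of this warp's group, plus the column within the
// warp's tile. Omitting the first term made all warp groups alias group 0.
static int compute_jc(int thread_y, int np, int cols_per_warp, int col_in_tile) {
    return (thread_y / np) * cols_per_warp + col_in_tile;
}

int main() {
    const int np = 2, cols_per_warp = 8;
    for (int thread_y = 0; thread_y < 4; thread_y++) {
        // warps {0,1} share column base 0; warps {2,3} share column base 8
        printf("threadIdx.y=%d -> jc base %d\n", thread_y, compute_jc(thread_y, np, cols_per_warp, 0));
    }
}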
