
Commit 84e44b3

Author: Vrajang Parikh
Message: Cleanup: Improve naming, and file structure
Parent: bc085c5

9 files changed, +404 -410 lines changed


onnxruntime/core/mlas/lib/mlasi.h

Lines changed: 3 additions & 3 deletions
@@ -1243,10 +1243,10 @@ extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchLasx;
 
 struct MLAS_QNBIT_LUT_GEMM_DISPATCH;
 
-extern const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGenKernelAvx2;
+extern const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGemmDispatchAvx2;
 
 #if defined(MLAS_TARGET_ARM64)
-extern const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGenKernelNeon;
+extern const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGemmDispatchNeon;
 #endif
 
 //
@@ -1457,7 +1457,7 @@ struct MLAS_PLATFORM {
     const MLAS_Q8Q4GEMM_DISPATCH* Q8Q4GemmDispatch{nullptr};
 
     const MLAS_QNBIT_GEMM_DISPATCH* QNBitGemmDispatch{nullptr};
-    const MLAS_QNBIT_LUT_GEMM_DISPATCH* LutGenKernel{nullptr};
+    const MLAS_QNBIT_LUT_GEMM_DISPATCH* LutGemmDispatch{nullptr};
 
     MLAS_CAST_F16_TO_F32_KERNEL* CastF16ToF32Kernel;
     MLAS_CAST_F32_TO_F16_KERNEL* CastF32ToF16Kernel;
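Note: the rename follows the existing MLAS convention visible in this header — each kernel family is exposed as a const dispatch struct of function pointers per architecture, plus a nullable pointer slot on MLAS_PLATFORM. The snippet below is a self-contained sketch of that pattern only; the member names are taken from the dispatch assignments later in this commit, and the "Sketch" types and signatures are hypothetical, not the real MLAS_QNBIT_LUT_GEMM_DISPATCH declaration.

    // Self-contained sketch; "Sketch" names are illustrative, not MLAS declarations.
    #include <cstddef>
    #include <cstdint>

    struct LutGemmDispatchSketch {
        // One function-pointer slot per kernel entry point; unset slots stay null.
        void (*GenerateLUT)(const float* B, std::int8_t* QLut, float* LutScales,
                            float* LutBiases, std::size_t K) = nullptr;
        void (*ComputeGemm)(const std::uint8_t* PackedB, const float* Scales,
                            const std::int8_t* QLut, float* C,
                            std::size_t M, std::size_t N, std::size_t K) = nullptr;
    };

    // One const table per architecture, defined in the matching kernel source file.
    extern const LutGemmDispatchSketch LutGemmDispatchSketchAvx2;
    extern const LutGemmDispatchSketch LutGemmDispatchSketchNeon;

    struct PlatformSketch {
        // Null means "no LUT GEMM kernels for this CPU".
        const LutGemmDispatchSketch* LutGemmDispatch = nullptr;
    };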

onnxruntime/core/mlas/lib/platform.cpp

Lines changed: 2 additions & 2 deletions
@@ -422,7 +422,7 @@ Return Value:
     this->RopeDispatch = &MlasRopeDispatchAvx2;
 
     // TODO(vraspar): check if this really goes here or if there are other platform reqs that we need to fulfill
-    this->LutGenKernel = &MlasLutGenKernelAvx2;
+    this->LutGemmDispatch = &MlasLutGemmDispatchAvx2;
 
     //
     // Check if the processor supports Hybrid core architecture.
@@ -655,7 +655,7 @@ Return Value:
     this->QNBitGemmDispatch = &GetMlasQNBitGemmDispatchNeon(HasDotProductInstructions, HasI8MMInstructions);
 
     // Enable LUT-based GEMM for 2-bit quantization on ARM64
-    this->LutGenKernel = &MlasLutGenKernelNeon;
+    this->LutGemmDispatch = &MlasLutGemmDispatchNeon;
 
 #if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
     this->CastF16ToF32Kernel = &MlasCastF16ToF32KernelNeon;
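Both hunks wire the same selection step: during platform initialization the per-architecture dispatch table is installed once, and CPUs without a matching kernel set simply leave the pointer null. A minimal sketch of that idea follows, with stand-in feature probes and stand-in types rather than the real MLAS platform detection:

    // Stand-in feature probes; a real build would query CPUID / hwcaps instead.
    static bool CpuHasAvx2() { return false; }
    static bool CpuIsArm64() { return true; }

    struct DispatchSketch { int id; };

    static const DispatchSketch kAvx2DispatchSketch{1};
    static const DispatchSketch kNeonDispatchSketch{2};

    struct PlatformInitSketch {
        const DispatchSketch* LutGemmDispatch = nullptr;

        PlatformInitSketch() {
            if (CpuHasAvx2()) {
                LutGemmDispatch = &kAvx2DispatchSketch;   // x64 path in the hunk above
            } else if (CpuIsArm64()) {
                LutGemmDispatch = &kNeonDispatchSketch;   // ARM64 path in the hunk above
            }
            // Otherwise the pointer stays null and LUT GEMM is reported unavailable.
        }
    };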

onnxruntime/core/mlas/lib/qlutgemm.cpp

Lines changed: 5 additions & 5 deletions
@@ -191,7 +191,7 @@ LutGemmPackQuantBData(
     const size_t kfactor = tmac_params.kfactor;
 
     // LUT GEMM requires a valid LUT dispatch implementation, so dispatch must be available
-    const auto* Dispatch = GetMlasPlatform().LutGenKernel;
+    const auto* Dispatch = GetMlasPlatform().LutGemmDispatch;
     if (Dispatch == nullptr || Dispatch->PackQuantBData == nullptr) {
         MLAS_THROW_EX(std::runtime_error, "PackQuantBData requires LUT GEMM dispatch support");
     }
@@ -240,9 +240,9 @@ LutPackScalesAndZeroPoints(
     const size_t bm = tmac_params.bm;
 
     // LUT GEMM is only available for AVX2, so dispatch must be available
-    const auto* Dispatch = GetMlasPlatform().LutGenKernel;
+    const auto* Dispatch = GetMlasPlatform().LutGemmDispatch;
     if (Dispatch == nullptr || Dispatch->PackScalesAndZeroPoints == nullptr) {
-        MLAS_THROW_EX(std::runtime_error, "PackScalesAndZeroPoints requires AVX2 dispatch");
+        MLAS_THROW_EX(std::runtime_error, "PackScalesAndZeroPoints requires LUT GEMM dispatch support");
     }
 
     Dispatch->PackScalesAndZeroPoints(
@@ -320,7 +320,7 @@ MlasIsLutGemmAvailable(
     size_t BlkLen
 )
 {
-    const auto* lut_kernel = GetMlasPlatform().LutGenKernel;
+    const auto* lut_kernel = GetMlasPlatform().LutGemmDispatch;
     if (lut_kernel == nullptr ||
         lut_kernel->GenerateLUT == nullptr ||
         lut_kernel->ComputeGemm == nullptr ||
@@ -392,7 +392,7 @@ MlasLutGemm(
 )
 {
     // adapted from ggml_backend_tmac_mul_mat
-    const auto* Dispatch = GetMlasPlatform().LutGenKernel;
+    const auto* Dispatch = GetMlasPlatform().LutGemmDispatch;
     // This should be ensured by calling MlasIsLutGemmAvailable() before MlasLutGemm()
     if (Dispatch == nullptr || Dispatch->GenerateLUT == nullptr || Dispatch->ComputeGemm == nullptr) {
         MLAS_THROW_EX(std::runtime_error, "TMAC not supported in this configuration");
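These hunks show the caller-side contract: MlasIsLutGemmAvailable() verifies that every required slot of the dispatch table is populated, and MlasLutGemm() re-checks before dereferencing. A simplified, self-contained sketch of that guard pattern (types, signatures, and names are illustrative, not the MLAS API):

    #include <stdexcept>

    struct LutDispatchSketch {
        void (*GenerateLUT)() = nullptr;   // signatures elided for brevity
        void (*ComputeGemm)() = nullptr;
    };

    // Usable only if every required slot is populated.
    bool IsLutGemmAvailableSketch(const LutDispatchSketch* d)
    {
        return d != nullptr && d->GenerateLUT != nullptr && d->ComputeGemm != nullptr;
    }

    // Callers should have checked availability first, but re-verify before
    // dereferencing the function pointers.
    void LutGemmSketch(const LutDispatchSketch* d)
    {
        if (!IsLutGemmAvailableSketch(d)) {
            throw std::runtime_error("LUT GEMM not supported in this configuration");
        }
        d->GenerateLUT();   // quantize the activations into a lookup table
        d->ComputeGemm();   // run the table-lookup GEMM on the packed weights
    }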

onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp

Lines changed: 27 additions & 19 deletions
@@ -54,6 +54,12 @@ _mm256_addv_ps(const __m256 v)
 #define extract_low_epi16_epi32(v) _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v))
 #define extract_high_epi16_epi32(v) _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v, 1))
 
+namespace lutgemm_avx2
+{
+
+namespace
+{
+
 // Template classes for accumulation
 template <int N>
 struct SignedHalvingAdder {
@@ -324,9 +330,11 @@ lut_ctor_g4_int8_impl(
     *lut_biases = biases;
 }
 
-// based on lut_ctor_g4_int8_impl
+} // namespace
+
+// LutGemmGenerateLUT_CompFp32 - Entry point for LUT generation
 void
-GenerateLUT_avx2(
+LutGemmGenerateLUT_CompFp32(
     const float* b,
     int8_t* qlut,
     float* lut_scales,
@@ -495,10 +503,9 @@ tbl_g4_int8_float_update_impl(int32_t m, float* c, const int8_t* lut, const uint
     return 0;
 }
 
-// based on qgemm_lut_int8_g4
-// Simplified version with hardcoded configuration for 2-bit quantization
+// LutGemmCompute_CompFp32 - Entry point for GEMM computation
 void
-TMACComputeGemm_avx2(
+LutGemmCompute_CompFp32(
     const uint8_t* A, // Quantized packed weights
     const float* Scales, // Weight scales (and optionally zero-points)
     const int8_t* LUT, // Pre-computed quantized lookup table
@@ -651,11 +658,11 @@ TMACComputeGemm_avx2(
 }
 
 //
-// AVX2 optimized weight packing for T-MAC LUT GEMM
+// LutGemmPackQuantBData_CompFp32 - AVX2 optimized weight packing for T-MAC LUT GEMM
 // This performs the same transformation as the scalar version but uses SIMD operations
 //
 void
-PackQuantBData_avx2(
+LutGemmPackQuantBData_CompFp32(
     size_t N,
     size_t K,
     size_t bits,
@@ -864,12 +871,11 @@ PackQuantBData_avx2(
 }
 
 //
-// AVX2 optimized scales and zero points packing for T-MAC LUT GEMM
-// This performs the same transformation as the scalar version but uses SIMD operations
+// LutGemmPackScalesAndZeroPoints_CompFp32 - Scales and zero points packing
 //
 template <bool HasZeroPoint>
-static void
-PackScalesAndZeroPoints_avx2_impl(
+void
+LutGemmPackScalesAndZeroPoints_CompFp32_Impl(
     size_t N,
     size_t K,
     size_t bits,
@@ -984,7 +990,7 @@ PackScalesAndZeroPoints_avx2_impl(
 }
 
 void
-PackScalesAndZeroPoints_avx2(
+LutGemmPackScalesAndZeroPoints_CompFp32(
     size_t N,
     size_t K,
     size_t bits,
@@ -1002,25 +1008,27 @@ PackScalesAndZeroPoints_avx2(
     assert(bits == 2);
 
     if (HasZeroPoint) {
-        PackScalesAndZeroPoints_avx2_impl<true>(
+        LutGemmPackScalesAndZeroPoints_CompFp32_Impl<true>(
            N, K, bits, BlkLen, simd_n_out, bm,
            PackedScalesBegin, QuantBScale, QuantBZeroPoint, ThreadPool
        );
    } else {
-        PackScalesAndZeroPoints_avx2_impl<false>(
+        LutGemmPackScalesAndZeroPoints_CompFp32_Impl<false>(
            N, K, bits, BlkLen, simd_n_out, bm,
            PackedScalesBegin, QuantBScale, QuantBZeroPoint, ThreadPool
        );
    }
 }
 
+} // namespace lutgemm_avx2
+
 // Kernel dispatch structure definition.
 
-const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGenKernelAvx2 = []() {
+const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGemmDispatchAvx2 = []() {
     MLAS_QNBIT_LUT_GEMM_DISPATCH d;
-    d.GenerateLUT = GenerateLUT_avx2;
-    d.ComputeGemm = TMACComputeGemm_avx2;
-    d.PackQuantBData = PackQuantBData_avx2;
-    d.PackScalesAndZeroPoints = PackScalesAndZeroPoints_avx2;
+    d.GenerateLUT = lutgemm_avx2::LutGemmGenerateLUT_CompFp32;
+    d.ComputeGemm = lutgemm_avx2::LutGemmCompute_CompFp32;
+    d.PackQuantBData = lutgemm_avx2::LutGemmPackQuantBData_CompFp32;
+    d.PackScalesAndZeroPoints = lutgemm_avx2::LutGemmPackScalesAndZeroPoints_CompFp32;
     return d;
 }();
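The structural change in this file (mirrored in the NEON version below) is the namespace layout: helpers keep internal linkage inside an unnamed namespace, the entry points move into a per-architecture namespace and drop `static`, and only the dispatch constant remains at global scope. A compact, self-contained sketch of that layout, with hypothetical names:

    struct DispatchSketch {
        void (*Compute)() = nullptr;
    };

    namespace lutgemm_arch_sketch {
    namespace {

    // Internal helper: the unnamed namespace gives it internal linkage, so it cannot
    // collide with the identically structured helper in another architecture's file.
    void HelperKernel() { /* arch-specific details */ }

    }  // namespace

    // Entry point: lives in the per-architecture namespace instead of being `static`.
    void ComputeEntryPoint() { HelperKernel(); }

    }  // namespace lutgemm_arch_sketch

    // Only the dispatch constant is visible to the rest of the library.
    const DispatchSketch DispatchSketchForArch = []() {
        DispatchSketch d;
        d.Compute = lutgemm_arch_sketch::ComputeEntryPoint;
        return d;
    }();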

onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.h

Lines changed: 1 addition & 1 deletion
@@ -23,4 +23,4 @@ Module Name:
 // External dispatch table for AVX2 LUT GEMM kernels.
 // Kernel functions are internal to the .cpp file and accessed via this dispatch.
 //
-extern const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGenKernelAvx2;
+extern const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGemmDispatchAvx2;

onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_neon.cpp

Lines changed: 31 additions & 21 deletions
@@ -46,6 +46,12 @@ Module Name:
 #define PRAGMA_UNROLL
 #endif
 
+namespace lutgemm_neon
+{
+
+namespace
+{
+
 //
 // Template classes for accumulation - adapted from llama.cpp tbl.cpp
 //
@@ -282,11 +288,13 @@ lut_ctor_g4_int8_impl_neon(
     *lut_biases = biases;
 }
 
+} // namespace
+
 //
-// GenerateLUT - Entry point for LUT generation
+// LutGemmGenerateLUT_CompFp32 - Entry point for LUT generation
 //
-static void
-GenerateLUT_neon(
+void
+LutGemmGenerateLUT_CompFp32(
     const float* b,
     int8_t* qlut,
     float* lut_scales,
@@ -620,10 +628,10 @@ tbl_g4_int8_float_update_impl_neon(
 }
 
 //
-// TMACComputeGemm - Entry point for GEMM computation
+// LutGemmCompute_CompFp32 - Entry point for GEMM computation
 //
-static void
-TMACComputeGemm_neon(
+void
+LutGemmCompute_CompFp32(
     const uint8_t* A,
     const float* Scales,
     const int8_t* LUT,
@@ -756,11 +764,11 @@ TMACComputeGemm_neon(
 }
 
 //
-// Weight packing for NEON (can use scalar or NEON implementation)
+// LutGemmPackQuantBData_CompFp32 - Weight packing for NEON
 // This is done during model load, so performance is less critical
 //
-static void
-PackQuantBData_neon(
+void
+LutGemmPackQuantBData_CompFp32(
     size_t N,
     size_t K,
     size_t bits,
@@ -917,11 +925,11 @@ PackQuantBData_neon(
 }
 
 //
-// Scales and zero points packing
+// LutGemmPackScalesAndZeroPoints_CompFp32 - Scales and zero points packing
 //
 template <bool HasZeroPoint>
-static void
-PackScalesAndZeroPoints_neon_impl(
+void
+LutGemmPackScalesAndZeroPoints_CompFp32_Impl(
     size_t N,
     size_t K,
     size_t bits,
@@ -991,8 +999,8 @@ PackScalesAndZeroPoints_neon_impl(
     );
 }
 
-static void
-PackScalesAndZeroPoints_neon(
+void
+LutGemmPackScalesAndZeroPoints_CompFp32(
     size_t N,
     size_t K,
     size_t bits,
@@ -1009,27 +1017,29 @@ PackScalesAndZeroPoints_neon(
     assert(bits == 2);
 
     if (HasZeroPoint) {
-        PackScalesAndZeroPoints_neon_impl<true>(
+        LutGemmPackScalesAndZeroPoints_CompFp32_Impl<true>(
            N, K, bits, BlkLen, simd_n_out, bm,
           PackedScalesBegin, QuantBScale, QuantBZeroPoint, ThreadPool
        );
    } else {
-        PackScalesAndZeroPoints_neon_impl<false>(
+        LutGemmPackScalesAndZeroPoints_CompFp32_Impl<false>(
           N, K, bits, BlkLen, simd_n_out, bm,
           PackedScalesBegin, QuantBScale, QuantBZeroPoint, ThreadPool
        );
    }
 }
 
+} // namespace lutgemm_neon
+
 //
 // Kernel dispatch structure definition
 //
-const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGenKernelNeon = []() {
+const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGemmDispatchNeon = []() {
     MLAS_QNBIT_LUT_GEMM_DISPATCH d;
-    d.GenerateLUT = GenerateLUT_neon;
-    d.ComputeGemm = TMACComputeGemm_neon;
-    d.PackQuantBData = PackQuantBData_neon;
-    d.PackScalesAndZeroPoints = PackScalesAndZeroPoints_neon;
+    d.GenerateLUT = lutgemm_neon::LutGemmGenerateLUT_CompFp32;
+    d.ComputeGemm = lutgemm_neon::LutGemmCompute_CompFp32;
+    d.PackQuantBData = lutgemm_neon::LutGemmPackQuantBData_CompFp32;
+    d.PackScalesAndZeroPoints = lutgemm_neon::LutGemmPackScalesAndZeroPoints_CompFp32;
    return d;
 }();
 

onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_neon.h

Lines changed: 1 addition & 1 deletion
@@ -23,5 +23,5 @@ Module Name:
 // External dispatch table for ARM NEON LUT GEMM kernels.
 // Kernel functions are internal to the .cpp file and accessed via this dispatch.
 //
-extern const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGenKernelNeon;
+extern const MLAS_QNBIT_LUT_GEMM_DISPATCH MlasLutGemmDispatchNeon;
 