microsoft
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
Lines changed: 1 addition & 0 deletions
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
Lines changed: 6 additions & 0 deletions
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
Lines changed: 1 addition & 0 deletions
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
Lines changed: 1 addition & 0 deletions
diff --git a/include/onnxruntime/core/framework/data_types.h b/include/onnxruntime/core/framework/data_types.h
Lines changed: 5 additions & 2 deletions
diff --git a/include/onnxruntime/core/framework/data_types_internal.h b/include/onnxruntime/core/framework/data_types_internal.h
Lines changed: 48 additions & 0 deletions
diff --git a/include/onnxruntime/core/framework/int2.h b/include/onnxruntime/core/framework/int2.h
Lines changed: 178 additions & 0 deletions
diff --git a/include/onnxruntime/core/framework/to_tensor_proto_element_type.h b/include/onnxruntime/core/framework/to_tensor_proto_element_type.h
Lines changed: 10 additions & 0 deletions
@@ -28,6 +28,7 @@ function(get_c_cxx_api_headers HEADERS_VAR)
     "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_ep_device_ep_metadata_keys.h"
     "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_float16.h"
     "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h"
+    "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_env_config_keys.h"
     "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h"
     "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h"
   )
 
@@ -45,6 +45,8 @@ onnxruntime_add_static_library(onnxruntime_mlas
   ${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
   ${MLAS_SRC_DIR}/qnbitgemm.h
   ${MLAS_SRC_DIR}/qnbitgemm.cpp
+  ${MLAS_SRC_DIR}/qlutgemm.h
+  ${MLAS_SRC_DIR}/qlutgemm.cpp
   ${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
   ${MLAS_SRC_DIR}/flashattn.cpp
   ${MLAS_SRC_DIR}/cast.cpp
@@ -209,6 +211,8 @@ function(setup_mlas_source_for_windows)
       ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
       ${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
       ${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
+      ${MLAS_SRC_DIR}/sqnbitgemm_lut_kernel_avx2.h
+      ${MLAS_SRC_DIR}/sqnbitgemm_lut_kernel_avx2.cpp
       ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
       ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
       ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
@@ -693,6 +697,8 @@ else()
           ${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
           ${MLAS_SRC_DIR}/intrinsics/avx2/saturation_check_avx2.cpp
           ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
+          ${MLAS_SRC_DIR}/sqnbitgemm_lut_kernel_avx2.h
+          ${MLAS_SRC_DIR}/sqnbitgemm_lut_kernel_avx2.cpp
           ${MLAS_SRC_DIR}/rotary_embedding_kernel_avx2.h
           ${MLAS_SRC_DIR}/rotary_embedding_kernel_avx2.cpp
           ${MLAS_SRC_DIR}/rotary_embedding_kernel_avx2.cpp
 
@@ -597,6 +597,7 @@ set (onnxruntime_shared_lib_test_SRC
 if (NOT onnxruntime_MINIMAL_BUILD)  # The test sources below are appended only when this is not a minimal build.
   list(APPEND onnxruntime_shared_lib_test_SRC ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_inference.cc)
   list(APPEND onnxruntime_shared_lib_test_SRC ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_model_builder_api.cc)
+  list(APPEND onnxruntime_shared_lib_test_SRC ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_env_creation.cc)
 endif()
 
 if(onnxruntime_RUN_ONNX_TESTS)
 
@@ -653,6 +653,7 @@ Do not modify directly.*
 |ArgMin|*in* data:**T**<br> *out* reduced:**tensor(int64)**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
 |||12|**T** = tensor(double), tensor(float), tensor(float16)|
 |||[1, 11]|**T** = tensor(double), tensor(float), tensor(float16)|
+|Attention|*in* Q:**T1**<br> *in* K:**T1**<br> *in* V:**T2**<br> *in* attn_mask:**U**<br> *in* past_key:**T1**<br> *in* past_value:**T2**<br> *in* nonpad_kv_seqlen:**tensor(int64)**<br> *out* Y:**T1**<br> *out* present_key:**T1**<br> *out* present_value:**T2**<br> *out* qk_matmul_output:**T1**<br><br>or<br><br>*in* Q:**T1**<br> *in* K:**T1**<br> *in* V:**T2**<br> *in* attn_mask:**U**<br> *in* past_key:**T1**<br> *in* past_value:**T2**<br> *out* Y:**T1**<br> *out* present_key:**T1**<br> *out* present_value:**T2**<br> *out* qk_matmul_output:**T1**|23+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)<br/> **T2** = tensor(bfloat16), tensor(float), tensor(float16)<br/> **U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)|
 |AveragePool|*in* X:**T**<br> *out* Y:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
 |||[19, 21]|**T** = tensor(double), tensor(float), tensor(float16)|
 |||[11, 18]|**T** = tensor(double), tensor(float), tensor(float16)|
 
@@ -16,6 +16,7 @@
 #include "core/common/float8.h"
 #include "core/common/float16.h"
 #include "core/framework/int4.h"
+#include "core/framework/int2.h"
 #include "core/framework/float4.h"
 #include "core/graph/onnx_protobuf.h"
 #include "core/framework/to_tensor_proto_element_type.h"
@@ -211,6 +212,7 @@ class DataTypeImpl {
   static const std::vector<MLDataType>& AllTensorTypesIRv9();
   static const std::vector<MLDataType>& AllTensorTypesIRv10();
   static const std::vector<MLDataType>& AllTensorTypesIRv11();
+  static const std::vector<MLDataType>& AllTensorTypesIRv13();
 
   static const std::vector<MLDataType>& AllFixedSizeTensorTypes();  // up to IR4 (no float 8), deprecated
   static const std::vector<MLDataType>& AllFixedSizeTensorTypesIRv4();
@@ -285,7 +287,7 @@ template <typename T>
 struct IsTensorContainedType : public IsAnyOf<T, float, uint8_t, int8_t, uint16_t, int16_t,
                                               int32_t, int64_t, std::string, bool, MLFloat16,
                                               double, uint32_t, uint64_t, BFloat16,
-                                              Int4x2, UInt4x2
+                                              Int4x2, UInt4x2, Int2x4, UInt2x4
 #if !defined(DISABLE_FLOAT8_TYPES)
                                               ,
                                               Float8E4M3FN, Float8E4M3FNUZ, Float8E5M2, Float8E5M2FNUZ
@@ -304,7 +306,8 @@ struct IsTensorContainedType : public IsAnyOf<T, float, uint8_t, int8_t, uint16_
 template <typename T>
 struct IsSparseTensorContainedType : public IsAnyOf<T, float, uint8_t, int8_t, uint16_t, int16_t,
                                                     int32_t, int64_t, std::string, bool, MLFloat16,
-                                                    double, uint32_t, uint64_t, BFloat16
+                                                    double, uint32_t, uint64_t, BFloat16,
+                                                    Int4x2, UInt4x2, Int2x4, UInt2x4
 #if !defined(DISABLE_FLOAT8_TYPES)
                                                     ,
                                                     Float8E4M3FN, Float8E4M3FNUZ, Float8E5M2, Float8E5M2FNUZ
 
@@ -102,6 +102,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:              \
       function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:               \
+      function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:              \
+      function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                      \
     default:                                                      \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type); \
   }
@@ -171,6 +177,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:                       \
       retval = function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:                        \
+      retval = function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:                       \
+      retval = function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                               \
     default:                                                               \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);          \
   }
@@ -230,6 +242,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:              \
       function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:               \
+      function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:              \
+      function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                      \
     default:                                                      \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type); \
   }
@@ -287,6 +305,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:                       \
       retval = function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:                        \
+      retval = function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:                       \
+      retval = function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                               \
     default:                                                               \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);          \
   }
@@ -355,6 +379,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:              \
       function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:               \
+      function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:              \
+      function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                      \
     default:                                                      \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type); \
   }
@@ -421,6 +451,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:                       \
       retval = function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:                        \
+      retval = function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:                       \
+      retval = function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                               \
     default:                                                               \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);          \
   }
@@ -477,6 +513,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:              \
       function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:               \
+      function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                      \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:              \
+      function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                      \
     default:                                                      \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type); \
   }
@@ -531,6 +573,12 @@ namespace utils {
     case ONNX_NAMESPACE::TensorProto_DataType_UINT4:                       \
       retval = function<UInt4x2>(__VA_ARGS__);                             \
       break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT2:                        \
+      retval = function<Int2x4>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT2:                       \
+      retval = function<UInt2x4>(__VA_ARGS__);                             \
+      break;                                                               \
     default:                                                               \
       ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);          \
   }
 
@@ -0,0 +1,178 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cassert>
+#include <type_traits>
+#include "core/common/common.h"
+#include <gsl/gsl>
+
+namespace onnxruntime {
+
+template <bool Signed>  // Primary template is only declared; the two explicit specializations below define it.
+struct Int2Traits;
+
+template <>
+struct Int2Traits<true> {  // Traits for the signed 2-bit element.
+  using UnpackedType = int8_t;  // Type used to hold one element once it is unpacked.
+  static constexpr int8_t min_val = -2;  // Lowest value of a 2-bit two's-complement integer.
+  static constexpr int8_t max_val = 1;  // Highest value of a 2-bit two's-complement integer.
+};
+
+template <>
+struct Int2Traits<false> {  // Traits for the unsigned 2-bit element.
+  using UnpackedType = uint8_t;  // Type used to hold one element once it is unpacked.
+  static constexpr uint8_t min_val = 0;  // Lowest value of an unsigned 2-bit integer.
+  static constexpr uint8_t max_val = 3;  // Highest value of an unsigned 2-bit integer.
+};
+
+/// <summary>
+/// Stores 4 packed 2-bit elements in 1 byte; element i occupies bits [2*i, 2*i + 1].
+/// Packing follows ONNX spec: x0 | (x1 << 2) | (x2 << 4) | (x3 << 6)
+/// </summary>
+/// <typeparam name="Signed">Set to true if signed int2, or false if unsigned uint2.</typeparam>
+template <bool Signed>
+struct Int2x4Base {
+  using UnpackedType = typename Int2Traits<Signed>::UnpackedType;  // int8_t when Signed, uint8_t otherwise.
+  static constexpr UnpackedType min_val = Int2Traits<Signed>::min_val;  // Smallest representable element value.
+  static constexpr UnpackedType max_val = Int2Traits<Signed>::max_val;  // Largest representable element value.
+
+  std::byte bits_{};  // Packed storage for the four elements; value-initialized to zero.
+
+  Int2x4Base() = default;
+
+  explicit Int2x4Base(std::byte bits) {  // Adopts an already-packed byte unchanged.
+    bits_ = bits;
+  }
+
+  Int2x4Base(UnpackedType val0, UnpackedType val1, UnpackedType val2, UnpackedType val3) {  // Packs four values.
+    bits_ = static_cast<std::byte>(
+        (val0 & 0x3) |  // Only the low 2 bits of each value are kept; val0 lands in the lowest bit pair.
+        ((val1 & 0x3) << 2) |
+        ((val2 & 0x3) << 4) |
+        ((val3 & 0x3) << 6));
+  }
+
+  static inline int8_t SignExtendLower2Bits(std::byte bits) {
+    // Sign-extend the lower 2 bits by left shifting and then doing an arithmetic right shift.
+    constexpr uint8_t shift = (sizeof(int32_t) * 8) - 2;  // 30: moves bit 1 of the input up to the int32_t sign bit.
+    return static_cast<int8_t>((static_cast<int32_t>(bits) << shift) >> shift);
+  }
+
+  inline UnpackedType GetElem(size_t index) const {  // Returns the element stored in slot `index` (0..3).
+    assert(index <= 3);  // Bounds check is an assert only, so it is compiled out when NDEBUG is defined.
+    const uint8_t shift = 2 * static_cast<uint8_t>(index);  // Bit offset of the requested slot.
+    const std::byte val = (bits_ >> shift) & std::byte{0x3};  // Isolate the 2-bit field in the low bits.
+
+    if constexpr (Signed) {
+      return SignExtendLower2Bits(val);  // Signed variant: reinterpret the 2-bit field as a signed value.
+    } else {
+      return static_cast<UnpackedType>(val);  // Unsigned variant: the field value is returned directly.
+    }
+  }
+
+  inline void SetElem(size_t index, UnpackedType val) {  // Overwrites slot `index` (0..3) with the low 2 bits of val.
+    assert(index <= 3);  // Bounds check is an assert only, so it is compiled out when NDEBUG is defined.
+    const uint8_t shift = 2 * static_cast<uint8_t>(index);  // Bit offset of the target slot.
+    const std::byte clear_mask = ~(std::byte{0x3} << shift);  // All ones except the target 2-bit field.
+
+    bits_ &= clear_mask;                                    // Clear 2-bit element to 0
+    bits_ |= static_cast<std::byte>((val & 0x3) << shift);  // Set 2-bit element to val
+  }
+
+  inline std::byte ToBits() const {  // Returns the raw packed byte.
+    return bits_;
+  }
+
+  /// <summary>
+  /// Calculates the number of packed byte units needed to store the given number of 2-bit elements.
+  /// Each byte stores 4 x 2-bit elements, so the count is rounded up to a whole byte.
+  /// </summary>
+  static size_t CalcNumInt2Quads(size_t num_int2_elems) {
+    return (num_int2_elems + 3) / 4;  // Ceiling division by 4.
+  }
+
+  /// <summary>
+  /// Copy a source buffer of 2-bit elements (packed) into a destination buffer of 8-bit elements (unpacked).
+  /// </summary>
+  /// <param name="dst">Destination buffer to store unpacked 8-bit elements</param>
+  /// <param name="src">Source buffer with 2-bit elements</param>
+  /// <returns>True on success; false if src.size() is not CalcNumInt2Quads(dst.size())</returns>
+  static bool Unpack(gsl::span<UnpackedType> dst, gsl::span<const Int2x4Base<Signed>> src) {
+    if (CalcNumInt2Quads(dst.size()) != src.size()) {
+      return false;  // Buffer sizes are inconsistent; nothing is written.
+    }
+
+    if (src.empty()) {
+      return true;  // Nothing to unpack.
+    }
+
+    for (size_t i = 0; i < dst.size(); i++) {
+      size_t byte_idx = i >> 2;   // i / 4
+      size_t elem_idx = i & 0x3;  // i % 4
+      dst[i] = src[byte_idx].GetElem(elem_idx);
+    }
+
+    return true;
+  }
+
+  /// <summary>
+  /// Copy a source buffer of 8-bit elements (unpacked) into a destination buffer of 2-bit elements (packed).
+  /// </summary>
+  /// <param name="dst">Destination buffer to store packed 2-bit elements</param>
+  /// <param name="src">Source buffer with 8-bit elements</param>
+  /// <returns>True on success; false if dst.size() is not CalcNumInt2Quads(src.size())</returns>
+  static bool Pack(gsl::span<Int2x4Base<Signed>> dst, gsl::span<const UnpackedType> src) {
+    if (CalcNumInt2Quads(src.size()) != dst.size()) {
+      return false;  // Buffer sizes are inconsistent; nothing is written.
+    }
+
+    if (src.empty()) {
+      return true;  // Nothing to pack.
+    }
+
+    size_t src_i = 0;  // Index of the next unpacked source element.
+    size_t dst_i = 0;  // Index of the next packed destination byte.
+    const size_t full_quads = src.size() / 4;  // Number of destination bytes that are completely filled.
+
+    // Process complete groups of 4 elements
+    for (; dst_i < full_quads; dst_i++) {
+      dst[dst_i] = Int2x4Base<Signed>(src[src_i], src[src_i + 1], src[src_i + 2], src[src_i + 3]);
+      src_i += 4;
+    }
+
+    // Handle remaining elements (1-3); unused slots of the final byte are filled with 0.
+    if (src_i < src.size()) {
+      UnpackedType vals[4] = {0, 0, 0, 0};
+      size_t remaining = src.size() - src_i;
+      for (size_t j = 0; j < remaining; j++) {
+        vals[j] = src[src_i + j];
+      }
+      dst[dst_i] = Int2x4Base<Signed>(vals[0], vals[1], vals[2], vals[3]);
+    }
+
+    return true;
+  }
+
+  /// <summary>
+  /// Returns hierarchical indices for a packed int2 element from the given element index.
+  ///
+  /// Usage:
+  ///   Int2x4* data = ...;
+  ///   auto indices = Int2x4::GetTensorElemIndices(5);  // 6th int2 element -> {1, 1}
+  ///   int8_t elem = data[indices.first].GetElem(indices.second);
+  /// </summary>
+  /// <param name="index">Index of 2-bit element</param>
+  /// <returns>Pair of (byte_index, element_index_within_byte)</returns>
+  static inline std::pair<size_t, size_t> GetTensorElemIndices(size_t index) {
+    return {index >> 2, index & 0x3};  // {index / 4, index % 4}
+  }
+};
+
+using Int2x4 = Int2x4Base<true>;  // Four signed 2-bit elements packed in one byte.
+using UInt2x4 = Int2x4Base<false>;  // Four unsigned 2-bit elements packed in one byte.
+static_assert(sizeof(Int2x4) == sizeof(std::byte));  // Compile-time check that the wrapper is exactly one byte.
+static_assert(sizeof(UInt2x4) == sizeof(std::byte));  // Same one-byte size check for the unsigned variant.
+
+}  // namespace onnxruntime
@@ -13,6 +13,7 @@
 #include "core/framework/float4.h"
 #include "core/common/float8.h"
 #include "core/common/float16.h"
+#include "core/framework/int2.h"
 #include "core/framework/int4.h"
 
 namespace onnxruntime {
@@ -116,5 +117,14 @@ constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorProtoElementType<UInt4x2>
   return ONNX_NAMESPACE::TensorProto_DataType_UINT4;
 }
 
+template <>  // Maps the packed signed 2-bit type Int2x4 to the INT2 TensorProto element type.
+constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorProtoElementType<Int2x4>() {
+  return ONNX_NAMESPACE::TensorProto_DataType_INT2;
+}
+template <>  // Maps the packed unsigned 2-bit type UInt2x4 to the UINT2 TensorProto element type.
+constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorProtoElementType<UInt2x4>() {
+  return ONNX_NAMESPACE::TensorProto_DataType_UINT2;
+}
+
 }  // namespace utils
 }  // namespace onnxruntime
Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,7 @@ function(get_c_cxx_api_headers HEADERS_VAR)`
`28`	`28`	`"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_ep_device_ep_metadata_keys.h"`
`29`	`29`	`"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_float16.h"`
`30`	`30`	`"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h"`
	`31`	`+ "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_env_config_keys.h"`
`31`	`32`	`"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h"`
`32`	`33`	`"${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h"`
`33`	`34`	`)`