LinAlg packed type clarifications/protections (#845)

pow2clk · web-flow · commit 0d6b554f1d20 · 2026-04-29T15:43:03.000-06:00
Packed types are only intended to be used as inputs to MakeInterpretedVector, after which they are used only through that wrapper. Using them directly in the APIs introduces complicated dimension checking and isn't intended to be supported. This change mostly adds is_arithmetic checks for all native vector inputs; some of these were already added for MultiplyAdd, but were left out elsewhere. Removes the restriction on using packed types for groupshared load/store/accumulate operations. Makes a few incidental typo corrections here and there. Removes mention of matrices of packed types, as matrices cannot have packed types, though they may have types that can only be represented as packed when converted and wrapped in interpreted vectors. Fixes #823
diff --git a/proposals/0035-linalg-matrix.md b/proposals/0035-linalg-matrix.md
@@ -111,8 +111,7 @@ class Matrix {
                      MatrixLayoutEnum Layout, uint Align = 128);
 
   template <typename T, SIZE_TYPE Size>
-  static typename hlsl::enable_if<hlsl::is_arithmetic<T>::value &&
-                                      (M * N / ElementsPerScalar <= Size),
+  static typename hlsl::enable_if<M * N / ElementsPerScalar <= Size,
                                   Matrix>::type
   Load(/*groupshared*/ T Arr[Size], uint StartIdx, uint Stride,
        MatrixLayoutEnum Layout);
@@ -141,8 +140,7 @@ class Matrix {
              MatrixLayoutEnum Layout, uint Align = 128);
 
   template <typename T, SIZE_TYPE Size>
-  typename hlsl::enable_if<hlsl::is_arithmetic<T>::value &&
-                               (M * N / ElementsPerScalar <= Size),
+  typename hlsl::enable_if<M * N / ElementsPerScalar <= Size,
                            void>::type
   Store(/*groupshared*/ T Arr[Size], uint StartIdx, uint Stride,
         MatrixLayoutEnum Layout);
@@ -158,8 +156,7 @@ class Matrix {
   template <typename T, MatrixUseEnum UseLocal = Use,
             MatrixScopeEnum ScopeLocal = Scope, SIZE_TYPE Size>
   typename hlsl::enable_if<
-      hlsl::is_arithmetic<T>::value && Use == MatrixUse::Accumulator &&
-          UseLocal == Use && (M * N / ElementsPerScalar <= Size) &&
+      UseLocal == Use && (M * N / ElementsPerScalar <= Size) &&
           Scope == MatrixScope::Wave && ScopeLocal == Scope,
       void>::type
   InterlockedAccumulate(/*groupshared*/ T Arr[Size], uint StartIdx, uint Stride,
@@ -232,21 +229,25 @@ Matrix<CompTy, M, N, MatrixUse::Accumulator, MatrixScope::ThreadGroup> Multiply(
 
 template <typename OutputElTy, typename InputElTy, SIZE_TYPE M, SIZE_TYPE K,
           ComponentEnum MatrixDT>
-vector<OutputElTy, M>
+typename hlsl::enable_if<hlsl::is_arithmetic<InputElTy>::value,
+                         vector<OutputElTy, M> >::type
 Multiply(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
          vector<InputElTy, K> Vec);
 
 template <typename OutputElTy, typename InputElTy, typename BiasElTy,
           SIZE_TYPE M, SIZE_TYPE K, ComponentEnum MatrixDT>
-vector<OutputElTy, M>
+typename hlsl::enable_if<hlsl::is_arithmetic<InputElTy>::value &&
+                         hlsl::is_arithmetic<BiasElTy>::value,
+                         vector<OutputElTy, M> >::type
 MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
             vector<InputElTy, K>, vector<BiasElTy, M> Vec);
 
 template <typename OutputElTy, typename InputElTy, ComponentEnum InputInterp,
           typename BiasElTy, SIZE_TYPE M, SIZE_TYPE VecK, SIZE_TYPE K,
           ComponentEnum MatrixDT>
 typename hlsl::enable_if<
-    InterpretedVector<InputElTy, VecK, InputInterp>::Size == K,
+    InterpretedVector<InputElTy, VecK, InputInterp>::Size == K &&
+    hlsl::is_arithmetic<BiasElTy>::value,
     vector<OutputElTy, M> >::type
 MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
             InterpretedVector<InputElTy, VecK, InputInterp> InterpVec,
@@ -270,7 +271,8 @@ MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
             VectorRef<BiasElTy, M> BiasRef);
 
 template <ComponentEnum OutTy, typename InputElTy, SIZE_TYPE M, SIZE_TYPE N>
-Matrix<OutTy, M, N, MatrixUse::Accumulator, MatrixScope::Thread>
+typename hlsl::enable_if<hlsl::is_arithmetic<InputElTy>::value,
+                         Matrix<OutTy, M, N, MatrixUse::Accumulator, MatrixScope::Thread> >::type
 OuterProduct(vector<InputElTy, M> VecA, vector<InputElTy, N> VecB);
 
 template <typename InputElTy, SIZE_TYPE M>
@@ -529,9 +531,6 @@ DXIL validation.
 | Wave         | [4,128]                   |
 | ThreadGroup  | [1,1024]                  |
 
-Sizes for matrices of packed data types are 4 times the valid size for a scalar
-element.
-
 Not all hardware is required to support all possible dimensions for thread and
 wave scope matrices, or all possible element types. The shader compiler will
 encode the dimensions and input and output data types used by each shader in the
@@ -1016,7 +1015,7 @@ When accumulating to `RWByteAddressBuffer` objects, the accumulation is
 performed on the component type of the matrix object. When accumulating to
 `groupshared` memory, the matrix component data is converted to the target
 arithmetic or packed data type before atomic arithmetic is performed. No
-conversion is performed if the target aritmetic type matches the matrix
+conversion is performed if the target arithmetic type matches the matrix
 component type.
 
 #### Matrix::MultiplyAccumulate(Matrix, Matrix)
@@ -1117,7 +1116,8 @@ type and takes arguments with potentially mismatched element types.
 ``` c++
 template <typename OutputElTy, typename InputElTy, SIZE_TYPE M, SIZE_TYPE K,
           ComponentEnum MatrixDT>
-vector<OutputElTy, M>
+typename hlsl::enable_if<hlsl::is_arithmetic<InputElTy>::value,
+                         vector<OutputElTy, M> >::type
 linalg::Multiply(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
                  vector<InputElTy, K> Vec);
 ```
@@ -1133,7 +1133,8 @@ matrix by the `K`-element vector `Vec` producing a result `M`-element vector.
 ```c++
 template <ComponentType OutTy, typename InputElTy,
           uint M, uint N>
-Matrix<OutTy, M, N, MatrixUse::Accumulator, MatrixScope::Thread>
+typename hlsl::enable_if<hlsl::is_arithmetic<InputElTy>::value,
+                         Matrix<OutTy, M, N, MatrixUse::Accumulator, MatrixScope::Thread> >::type
     linalg::OuterProduct(vector<InputElTy, M> VecA, vector<InputElTy, N> VecB);
 ```
 
@@ -1147,7 +1148,9 @@ parameter for the output matrix element type.
 ``` c++
 template <typename OutputElTy, typename InputElTy, typename BiasElTy,
           SIZE_TYPE M, SIZE_TYPE K, ComponentEnum MatrixDT>
-vector<OutputElTy, M>
+typename hlsl::enable_if<hlsl::is_arithmetic<InputElTy>::value &&
+                         hlsl::is_arithmetic<BiasElTy>::value,
+                         vector<OutputElTy, M> >::type
 linalg::MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
                     vector<InputElTy, K> Vec, vector<BiasElTy, M> Bias);
 ```
@@ -1579,9 +1582,9 @@ declare void @dx.op.linAlgMatrixAccumulateToDescriptor.[MatTy](
 Accumulates a matrix to a RWByteAddressBuffer at a specified offset. This
 operation is only available for matrices with `MatrixUse::Accumulator`. The
 matrix data is added to the existing data in the buffer. The matrix component
-data is converted to the target arithmetic or packed data type if the data types
-do not match, then added to the existing data in memory. This operation must
-observe [bounds checking behavior](#bounds-checking-behavior) described below.
+data is added to the existing data in memory using the component type of the
+matrix. This operation must observe
+[bounds checking behavior](#bounds-checking-behavior) described below.
 
 Validation rules will enforce that:
 * `Layout` is `OuterProductOptimal` for matrix with `MatrixScope` of `Thread`
@@ -2150,21 +2153,25 @@ Matrix<CompTy, M, N, MatrixUse::Accumulator, MatrixScope::ThreadGroup> Multiply(
 
 template <typename OutputElTy, typename InputElTy, SIZE_TYPE M, SIZE_TYPE K,
           ComponentEnum MatrixDT>
-vector<OutputElTy, M>
+typename hlsl::enable_if<hlsl::is_arithmetic<InputElTy>::value,
+                         vector<OutputElTy, M> >::type
 Multiply(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
          vector<InputElTy, K> Vec);
 
 template <typename OutputElTy, typename InputElTy, typename BiasElTy,
           SIZE_TYPE M, SIZE_TYPE K, ComponentEnum MatrixDT>
-vector<OutputElTy, M>
+typename hlsl::enable_if<hlsl::is_arithmetic<InputElTy>::value &&
+                         hlsl::is_arithmetic<BiasElTy>::value,
+                         vector<OutputElTy, M> >::type
 MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
             vector<InputElTy, K> Vec, vector<BiasElTy, M> Vec);
 
 template <typename OutputElTy, typename InputElTy, ComponentEnum InputInterp,
           typename BiasElTy, SIZE_TYPE M, SIZE_TYPE VecK, SIZE_TYPE K,
           ComponentEnum MatrixDT>
 typename hlsl::enable_if<
-    InterpretedVector<InputElTy, VecK, InputInterp>::Size == K,
+    InterpretedVector<InputElTy, VecK, InputInterp>::Size == K &&
+    hlsl::is_arithmetic<BiasElTy>::value,
     vector<OutputElTy, M> >::type
 MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
             InterpretedVector<InputElTy, VecK, InputInterp> InterpVec,
@@ -2188,7 +2195,8 @@ MultiplyAdd(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
             VectorRef<BiasElTy, M> BiasRef);
 
 template <ComponentEnum OutTy, typename InputElTy, SIZE_TYPE M, SIZE_TYPE N>
-Matrix<OutTy, M, N, MatrixUse::Accumulator, MatrixScope::Thread>
+typename hlsl::enable_if<hlsl::is_arithmetic<InputElTy>::value,
+                         Matrix<OutTy, M, N, MatrixUse::Accumulator, MatrixScope::Thread> >::type
 OuterProduct(vector<InputElTy, M> VecA, vector<InputElTy, N> VecB);
 
 template <typename InputElTy, SIZE_TYPE M>