Merge branch 'main' of https://github.com/microsoft/hlsl-specs into cbuffer

hekota · hekota · commit 7b2b62de3b28 · 2025-05-06T11:16:31.000-07:00
diff --git a/proposals/0029-cooperative-vector.md b/proposals/0029-cooperative-vector.md
@@ -77,9 +77,11 @@ void ps_main(args) // args: texture, normal, position
 
 **Neural Network based shader**
 
-Below shader is in HLSL-like psuedocode, to highlight the idea of what replacing physical computations with a neural network based evaluation looks like. The exact syntax for the new intrinsics is intentionally skipped to keep it simple, later sections contain examples with the correct syntax and sample descriptors.
+The shader below shows the idea of what replacing physical computations with a
+neural network based evaluation looks like. Some details have been omitted, but
+this should give a sense of how these new operations can be used.
 
-> NOTE: see proposal [0031] for the latest on the HLSL API.
+> NOTE: see proposal [0031] for full details on the HLSL API.
 
 ```c++
 ByteAddressBuffer inputMatrix0; 
@@ -89,29 +91,38 @@ ByteAddressBuffer biasVector1;
 
 void ps_main(args) // args: texture, normal, position
 {   
+    using namespace dx::linalg;
+
     PreProcessing(args);
     // Neural Network computes the output vector
     // using the same input args and trained data
     // in the form of matrices and bias vectors.
 
     // The input vector is computed from the shader input
-    vector<uint32_t, M> inputVector = SomeFunction(args);
+    vector<uint32_t, INPUT_SIZE> inputVector = SomeFunction(args);
 
     // Below the physical calculations are replaced by NN evaluation
     // the Matrix and Bias are trained offline and loaded to memory
 
     // layer0 = inputVector*inputMatrix + biasVector0
     // The matrix and bias are loaded from memory at offsets : moffset0 and boffset0
-    vector<uint32_t, K> layer0 = MatrixVectorMulAdd(inputVector, inputMatrix0, moffset0, biasVector0, boffset0);
+    MatrixRef<DATA_TYPE_UINT32, N, INPUT_SIZE, MATRIX_LAYOUT_MUL_OPTIMAL> M0 = { inputMatrix0, moffset0, 0 }; 
+    VectorRef<DATA_TYPE_UINT32> B0 = { biasVector0, boffset0 };
+
+    vector<uint32_t, N> layer0 = MulAdd<uint32_t>(M0, MakeInterpretedVector<DATA_TYPE_UINT32>(inputVector), B0);
     layer0 = max(layer0,0); // Apply activation function
 
-    // layer0 = inputVector*inputMatrix0 + biasVector0
+    // layer1 = layer0*inputMatrix0 + biasVector0
     // The matrix and bias are loaded from memory at offsets : moffset1 and boffset1
-    vector<uint32_t, K> layer1 = MatrixVectorMulAdd(layer0, inputMatrix0, moffset1, biasVector0, boffset1);
+    MatrixRef<DATA_TYPE_UINT32, N, N, MATRIX_LAYOUT_MUL_OPTIMAL> M1 = { inputMatrix0, moffset1, 0 };
+    VectorRef<DATA_TYPE_UINT32> B1 = { biasVector0, boffset1 };
+    vector<uint32_t, N> layer1 = MulAdd<uint32_t>(M1, MakeInterpretedVector<DATA_TYPE_UINT32>(layer0), B1);
     layer1 = max(layer1,0); // Apply activation function
 
     // output = layer1*inputMatrix1 + biasVector1 
-    vector<uint32_t, N> output = MatrixVectorMulAdd(layer1, inputMatrix1, biasVector1);
+    MatrixRef<DATA_TYPE_UINT32, OUTPUT_SIZE, N, MATRIX_LAYOUT_MUL_OPTIMAL> M2 = { inputMatrix1, 0, 0 };
+    VectorRef<DATA_TYPE_UINT32> B2 = { biasVector1, 0 };
+    vector<uint32_t, OUTPUT_SIZE> output = MulAdd<uint32_t>(M2, MakeInterpretedVector<DATA_TYPE_UINT32>(layer1), B2);
 
     output = exp(output); 
     
diff --git a/proposals/0030-dxil-vectors.md b/proposals/0030-dxil-vectors.md
@@ -100,6 +100,7 @@ Previously usage of `extractelement` and `insertelement` in DXIL didn't allow dy
 #### Elementwise intrinsics
 
 A selection of elementwise intrinsics are given additional native vector forms.
+The full list of intrinsics with elementwise overloads is given in [Appendix 1](#appendix-1-new-elementwise-overloads).
 Elementwise intrinsics are those that perform their calculations irrespective of the location of the element
  in the vector or matrix arguments except insofar as that position corresponds to those of the other elements
  that might be used in the individual element calculations.
@@ -183,6 +184,71 @@ Calculations should produce the correct results in all cases for a range of vect
 In practice, this testing will largely represent verifying correct intrinsic output
  with the new shader model.
 
+## Appendix 1: New Elementwise Overloads
+
+| Opcode |  Name              | Class              |
+| ------ | --------------     | --------           |
+| 6      | FAbs               | Unary              |
+| 7      | Saturate           | Unary              |
+| 8      | IsNaN              | IsSpecialFloat     |
+| 9      | IsInf              | IsSpecialFloat     |
+| 10     | IsFinite           | IsSpecialFloat     |
+| 11     | IsNormal           | IsSpecialFloat     |
+| 12     | Cos                | Unary              |
+| 13     | Sin                | Unary              |
+| 14     | Tan                | Unary              |
+| 15     | Acos               | Unary              |
+| 16     | Asin               | Unary              |
+| 17     | Atan               | Unary              |
+| 18     | Hcos               | Unary              |
+| 19     | Hsin               | Unary              |
+| 20     | Htan               | Unary              |
+| 21     | Exp                | Unary              |
+| 22     | Frc                | Unary              |
+| 23     | Log                | Unary              |
+| 24     | Sqrt               | Unary              |
+| 25     | Rsqrt              | Unary              |
+| 26     | Round_ne           | Unary              |
+| 27     | Round_ni           | Unary              |
+| 28     | Round_pi           | Unary              |
+| 29     | Round_z            | Unary              |
+| 30     | Bfrev              | Unary              |
+| 31     | Countbits          | UnaryBits          |
+| 32     | FirstBitLo         | UnaryBits          |
+| 33     | FirstBitHi         | UnaryBits          |
+| 34     | FirstBitSHi        | UnaryBits          |
+| 35     | FMax               | Binary             |
+| 36     | FMin               | Binary             |
+| 37     | IMax               | Binary             |
+| 38     | IMin               | Binary             |
+| 39     | UMax               | Binary             |
+| 40     | UMin               | Binary             |
+| 46     | FMad               | Tertiary           |
+| 47     | Fma                | Tertiary           |
+| 48     | IMad               | Tertiary           |
+| 49     | UMad               | Tertiary           |
+| 83     | DerivCoarseX       | Unary              |
+| 84     | DerivCoarseY       | Unary              |
+| 85     | DerivFineX         | Unary              |
+| 86     | DerivFineY         | Unary              |
+| 115    | WaveActiveAllEqual | WaveActiveAllEqual |
+| 117    | WaveReadLaneAt     | WaveReadLaneAt     |
+| 118    | WaveReadLaneFirst  | WaveReadLaneFirst  |
+| 119    | WaveActiveOp       | WaveActiveOp       |
+| 120    | WaveActiveBit      | WaveActiveBit      |
+| 121    | WavePrefixOp       | WavePrefixOp       |
+| 122    | QuadReadLaneAt     | QuadReadLaneAt     |
+| 123    | QuadOp             | QuadOp             |
+| 124    | BitcastI16toF16    | BitcastI16toF16    |
+| 125    | BitcastF16toI16    | BitcastF16toI16    |
+| 126    | BitcastI32toF32    | BitcastI32toF32    |
+| 127    | BitcastF32toI32    | BitcastF32toI32    |
+| 128    | BitcastI64toF64    | BitcastI64toF64    |
+| 129    | BitcastF64toI64    | BitcastF64toI64    |
+| 165    | WaveMatch          | WaveMatch          |
+
+
+
 ## Acknowledgments
 
 * [Anupama Chandrasekhar](https://github.com/anupamachandra) and [Tex Riddell](https://github.com/tex3d) for foundational contributions to the design.