[TLERaw] Enable fp16 for TLERaw CUDA (#471)

sgjzfzzf · web-flow · commit eb9acb37bdb4 · 2026-03-23T18:48:57.000+08:00
Signed-off-by: Jinjie Liu <jjliu@baai.ac.cn>
diff --git a/python/tutorials/tle/raw/cuda/02-fused-softmax.cu b/python/tutorials/tle/raw/cuda/02-fused-softmax.cu
@@ -1,4 +1,5 @@
 #include <math_constants.h>
+
 __device__ auto
 SoftmaxKernel(__attribute__((address_space(3))) float *output_allocated,
               __attribute__((address_space(3))) float *output_aligned,
@@ -50,11 +51,11 @@ SoftmaxKernel(__attribute__((address_space(3))) float *output_allocated,
     __attribute__((address_space(3))) float *allocated;
     __attribute__((address_space(3))) float *aligned;
     int64_t offsets;
-    int64_t sizes1[1];
-    int64_t stride1[1];
+    int64_t sizes[1];
+    int64_t strides[1];
   } r{
-      output_allocated, output_aligned, output_offsets,
-      output_size,      output_stride,
+      output_allocated, output_aligned,  output_offsets,
+      {output_size},    {output_stride},
   };
   return r;
 }
diff --git a/python/tutorials/tle/raw/cuda/03-matrix-multiplication.cu b/python/tutorials/tle/raw/cuda/03-matrix-multiplication.cu
@@ -1,23 +1,17 @@
 #include <stdint.h>
 
-__device__ __forceinline__ float raw_half_to_float(uint16_t h) {
-  float out;
-  asm volatile("cvt.f32.f16 %0, %1;" : "=f"(out) : "h"(h));
-  return out;
-}
-
 __device__ auto
 MatMul(__attribute__((address_space(3))) float *output_allocated,
        __attribute__((address_space(3))) float *output_aligned,
        const int64_t output_offsets, const int64_t output_size1,
        const int64_t output_size2, const int64_t output_stride1,
        const int64_t output_stride2,
-       __attribute__((address_space(3))) uint16_t *a_allocated,
-       __attribute__((address_space(3))) uint16_t *a_aligned,
+       __attribute__((address_space(3))) __fp16 *a_allocated,
+       __attribute__((address_space(3))) __fp16 *a_aligned,
        const int64_t a_offsets, const int64_t a_size1, const int64_t a_size2,
        const int64_t a_stride1, const int64_t a_stride2,
-       __attribute__((address_space(3))) uint16_t *b_allocated,
-       __attribute__((address_space(3))) uint16_t *b_aligned,
+       __attribute__((address_space(3))) __fp16 *b_allocated,
+       __attribute__((address_space(3))) __fp16 *b_aligned,
        const int64_t b_offsets, const int64_t b_size1, const int64_t b_size2,
        const int64_t b_stride1, const int64_t b_stride2) {
   const int idx = threadIdx.x;
@@ -29,13 +23,10 @@ MatMul(__attribute__((address_space(3))) float *output_allocated,
   for (int i = idx; i < m * n; i += bdimx) {
     int row = i / n;
     int col = i % n;
-    float acc = 0.0f;
+    float acc = 0;
     for (int j = 0; j < k; j++) {
-      float a_val = raw_half_to_float(
-          a_aligned[a_offsets + row * a_stride1 + j * a_stride2]);
-      float b_val = raw_half_to_float(
-          b_aligned[b_offsets + j * b_stride1 + col * b_stride2]);
-      acc += a_val * b_val;
+      acc += a_aligned[a_offsets + row * a_stride1 + j * a_stride2] *
+             b_aligned[b_offsets + j * b_stride1 + col * b_stride2];
     }
     output_aligned[output_offsets + row * output_stride1 +
                    col * output_stride2] += acc;
@@ -47,8 +38,8 @@ MatMul(__attribute__((address_space(3))) float *output_allocated,
     __attribute__((address_space(3))) float *allocated;
     __attribute__((address_space(3))) float *aligned;
     int64_t offsets;
-    int64_t sizes1[2];
-    int64_t stride1[2];
+    int64_t sizes[2];
+    int64_t strides[2];
   } r{output_allocated,
       output_aligned,
       output_offsets,