Skip to content

Commit 9e3f85a

Browse files
author
sunyuechi
committed
Add RVV f16-f32-vcvt kernels and configs.
1 parent 260e0ec commit 9e3f85a

8 files changed

+175
-0
lines changed

cmake/gen/rvvfp16arith_microkernels.cmake

+3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
SET(PROD_RVVFP16ARITH_MICROKERNEL_SRCS)
1313

1414
SET(NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS
15+
src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u1v.c
16+
src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u2v.c
17+
src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u4v.c
1518
src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u1v.c
1619
src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u2v.c
1720
src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u4v.c

gen/rvvfp16arith_microkernels.bzl

+3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
99
]
1010

1111
NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
12+
"src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u1v.c",
13+
"src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u2v.c",
14+
"src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u4v.c",
1215
"src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u1v.c",
1316
"src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u2v.c",
1417
"src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u4v.c",

scripts/generate-f16-f32-vcvt.sh

+5
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ tools/xngen src/f16-f32-vcvt/neon-int32.c.in -D BATCH_TILE=32 -o src/f16-f32-vcv
1818
tools/xngen src/f16-f32-vcvt/neonfp16.c.in -D BATCH_TILE=8 -o src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u8.c &
1919
tools/xngen src/f16-f32-vcvt/neonfp16.c.in -D BATCH_TILE=16 -o src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u16.c &
2020

21+
################################ RISC-V Vector ################################
22+
tools/xngen src/f16-f32-vcvt/rvvfp16arith.c.in -D LMUL=1 -o src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u1v.c &
23+
tools/xngen src/f16-f32-vcvt/rvvfp16arith.c.in -D LMUL=2 -o src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u2v.c &
24+
tools/xngen src/f16-f32-vcvt/rvvfp16arith.c.in -D LMUL=4 -o src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u4v.c &
25+
2126
################################# x86 128-bit #################################
2227
tools/xngen src/f16-f32-vcvt/sse-int16.c.in -D SSE=2 -D AVX=0 -D BATCH_TILE=8 -o src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u8.c &
2328
tools/xngen src/f16-f32-vcvt/sse-int16.c.in -D SSE=2 -D AVX=0 -D BATCH_TILE=16 -o src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u16.c &

src/f16-f32-vcvt/f16-f32-vcvt.h

+6
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,12 @@ XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u
8585
XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32, 32, false, xnn_float16, float, void, NULL)
8686
#endif // XNN_ARCH_WASMRELAXEDSIMD
8787

88+
#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
89+
XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector_fp16_arith, xnn_f16_f32_vcvt_ukernel__rvvfp16arith_u1v, 1, true, xnn_float16, float, void, NULL)
90+
XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector_fp16_arith, xnn_f16_f32_vcvt_ukernel__rvvfp16arith_u2v, 2, true, xnn_float16, float, void, NULL)
91+
XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector_fp16_arith, xnn_f16_f32_vcvt_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, float, void, NULL)
92+
#endif
93+
8894
XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f16_f32_vcvt_ukernel__scalar_u1, 1, false, xnn_float16, float, void, NULL)
8995
XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f16_f32_vcvt_ukernel__scalar_u2, 2, false, xnn_float16, float, void, NULL)
9096
XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f16_f32_vcvt_ukernel__scalar_u3, 3, false, xnn_float16, float, void, NULL)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// Auto-generated file. Do not edit!
2+
// Template: src/f16-f32-vcvt/rvvfp16arith.c.in
3+
// Generator: tools/xngen
4+
//
5+
// Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
6+
//
7+
// This source code is licensed under the BSD-style license found in the
8+
// LICENSE file in the root directory of this source tree.
9+
10+
#include <assert.h>
11+
12+
#include <riscv_vector.h>
13+
14+
#include "xnnpack/vcvt.h"
15+
16+
17+
void xnn_f16_f32_vcvt_ukernel__rvvfp16arith_u1v(
18+
size_t batch,
19+
const xnn_float16* input,
20+
float* output,
21+
const void* params)
22+
{
23+
assert(batch != 0);
24+
assert(batch % sizeof(xnn_float16) == 0);
25+
assert(input != NULL);
26+
assert(output != NULL);
27+
28+
batch >>= XNN_LOG2_SIZEOF_HALF;
29+
30+
const _Float16* i = (const _Float16*) input;
31+
for (; batch > 0;) {
32+
const int32_t n = __riscv_vsetvl_e16m1(batch); batch -= n;
33+
34+
vfloat16m1_t x_f16v = __riscv_vle16_v_f16m1(i, n); i += n;
35+
36+
vfloat32m2_t y_f32v = __riscv_vfwcvt_f_f_v_f32m2(x_f16v, n);
37+
38+
__riscv_vse32_v_f32m2(output, y_f32v, n); output += n;
39+
}
40+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// Auto-generated file. Do not edit!
2+
// Template: src/f16-f32-vcvt/rvvfp16arith.c.in
3+
// Generator: tools/xngen
4+
//
5+
// Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
6+
//
7+
// This source code is licensed under the BSD-style license found in the
8+
// LICENSE file in the root directory of this source tree.
9+
10+
#include <assert.h>
11+
12+
#include <riscv_vector.h>
13+
14+
#include "xnnpack/vcvt.h"
15+
16+
17+
void xnn_f16_f32_vcvt_ukernel__rvvfp16arith_u2v(
18+
size_t batch,
19+
const xnn_float16* input,
20+
float* output,
21+
const void* params)
22+
{
23+
assert(batch != 0);
24+
assert(batch % sizeof(xnn_float16) == 0);
25+
assert(input != NULL);
26+
assert(output != NULL);
27+
28+
batch >>= XNN_LOG2_SIZEOF_HALF;
29+
30+
const _Float16* i = (const _Float16*) input;
31+
for (; batch > 0;) {
32+
const int32_t n = __riscv_vsetvl_e16m2(batch); batch -= n;
33+
34+
vfloat16m2_t x_f16v = __riscv_vle16_v_f16m2(i, n); i += n;
35+
36+
vfloat32m4_t y_f32v = __riscv_vfwcvt_f_f_v_f32m4(x_f16v, n);
37+
38+
__riscv_vse32_v_f32m4(output, y_f32v, n); output += n;
39+
}
40+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// Auto-generated file. Do not edit!
2+
// Template: src/f16-f32-vcvt/rvvfp16arith.c.in
3+
// Generator: tools/xngen
4+
//
5+
// Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
6+
//
7+
// This source code is licensed under the BSD-style license found in the
8+
// LICENSE file in the root directory of this source tree.
9+
10+
#include <assert.h>
11+
12+
#include <riscv_vector.h>
13+
14+
#include "xnnpack/vcvt.h"
15+
16+
17+
void xnn_f16_f32_vcvt_ukernel__rvvfp16arith_u4v(
18+
size_t batch,
19+
const xnn_float16* input,
20+
float* output,
21+
const void* params)
22+
{
23+
assert(batch != 0);
24+
assert(batch % sizeof(xnn_float16) == 0);
25+
assert(input != NULL);
26+
assert(output != NULL);
27+
28+
batch >>= XNN_LOG2_SIZEOF_HALF;
29+
30+
const _Float16* i = (const _Float16*) input;
31+
for (; batch > 0;) {
32+
const int32_t n = __riscv_vsetvl_e16m4(batch); batch -= n;
33+
34+
vfloat16m4_t x_f16v = __riscv_vle16_v_f16m4(i, n); i += n;
35+
36+
vfloat32m8_t y_f32v = __riscv_vfwcvt_f_f_v_f32m8(x_f16v, n);
37+
38+
__riscv_vse32_v_f32m8(output, y_f32v, n); output += n;
39+
}
40+
}

src/f16-f32-vcvt/rvvfp16arith.c.in

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
2+
//
3+
// This source code is licensed under the BSD-style license found in the
4+
// LICENSE file in the root directory of this source tree.
5+
6+
$assert LMUL in [1, 2, 4]
7+
$LMUL_32 = {1: "2", 2: "4", 4: "8"}[LMUL]
8+
#include <assert.h>
9+
10+
#include <riscv_vector.h>
11+
12+
#include "xnnpack/vcvt.h"
13+
14+
15+
void xnn_f16_f32_vcvt_ukernel__rvvfp16arith_u${LMUL}v(
16+
size_t batch,
17+
const xnn_float16* input,
18+
float* output,
19+
const void* params)
20+
{
21+
assert(batch != 0);
22+
assert(batch % sizeof(xnn_float16) == 0);
23+
assert(input != NULL);
24+
assert(output != NULL);
25+
26+
batch >>= XNN_LOG2_SIZEOF_HALF;
27+
28+
const _Float16* i = (const _Float16*) input;
29+
for (; batch > 0;) {
30+
const int32_t n = __riscv_vsetvl_e16m${LMUL}(batch); batch -= n;
31+
32+
vfloat16m${LMUL}_t x_f16v = __riscv_vle16_v_f16m${LMUL}(i, n); i += n;
33+
34+
vfloat32m${LMUL_32}_t y_f32v = __riscv_vfwcvt_f_f_v_f32m${LMUL_32}(x_f16v, n);
35+
36+
__riscv_vse32_v_f32m${LMUL_32}(output, y_f32v, n); output += n;
37+
}
38+
}

0 commit comments

Comments
 (0)