47 changes: 47 additions & 0 deletions bench/qs8-qc8w-gemm-fp32.cc
@@ -20,6 +20,53 @@
#include "src/xnnpack/packw.h"


+ #if XNN_ENABLE_HVX && XNN_ARCH_HEXAGON
+ static void qs8_qc8w_gemm_minmax_fp32_ukernel_1x32c4__hvx(benchmark::State& state, const char* net) {
+   GEMMBenchmark(state,
+     xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x32c4__hvx,
+     xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params,
+     xnn_pack_qs8_gemm_goi_w,
+     /*mr=*/1, /*nr=*/32, /*kr=*/4, /*sr=*/1,
+     benchmark::utils::CheckHVX);
+ }
+
+ BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_1x32c4__hvx)
+
+ static void qs8_qc8w_gemm_minmax_fp32_ukernel_4x32c4__hvx(benchmark::State& state, const char* net) {
+   GEMMBenchmark(state,
+     xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x32c4__hvx,
+     xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params,
+     xnn_pack_qs8_gemm_goi_w,
+     /*mr=*/4, /*nr=*/32, /*kr=*/4, /*sr=*/1,
+     benchmark::utils::CheckHVX);
+ }
+
+ BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_4x32c4__hvx)
+
+ static void qs8_qc8w_gemm_minmax_fp32_ukernel_8x32c4__hvx(benchmark::State& state, const char* net) {
+   GEMMBenchmark(state,
+     xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x32c4__hvx,
+     xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params,
+     xnn_pack_qs8_gemm_goi_w,
+     /*mr=*/8, /*nr=*/32, /*kr=*/4, /*sr=*/1,
+     benchmark::utils::CheckHVX);
+ }
+
+ BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_8x32c4__hvx)
+
+ static void qs8_qc8w_gemm_minmax_fp32_ukernel_16x32c4__hvx(benchmark::State& state, const char* net) {
+   GEMMBenchmark(state,
+     xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_16x32c4__hvx,
+     xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params,
+     xnn_pack_qs8_gemm_goi_w,
+     /*mr=*/16, /*nr=*/32, /*kr=*/4, /*sr=*/1,
+     benchmark::utils::CheckHVX);
+ }
+
+ BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_16x32c4__hvx)
+ #endif  // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON


#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV
static void qs8_qc8w_gemm_minmax_fp32_ukernel_1x4v__rvv(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
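Each wrapper above binds one tile size of the new HVX micro-kernel to the shared GEMMBenchmark driver: mr is the number of output rows produced per call (1, 4, 8, or 16), nr = 32 is the number of output columns, and kr = 4 is the number of input bytes consumed per inner-product group; benchmark::utils::CheckHVX skips the benchmark on targets without HVX. As a rough sketch of what an mr x nr tile means for kernel invocations (an illustrative helper, not part of this change or of XNNPACK):

#include <stddef.h>

// Number of micro-kernel calls needed to cover an M x N GEMM output with
// mr x nr tiles: ceil(M / mr) * ceil(N / nr).
static size_t gemm_tile_calls(size_t M, size_t N, size_t mr, size_t nr) {
  return ((M + mr - 1) / mr) * ((N + nr - 1) / nr);
}

For example, a 256 x 256 output takes 16 * 8 = 128 calls with the 16x32 variant but 256 * 8 = 2048 with the 1x32 variant, which is why the taller tiles generally benchmark faster per element.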
26 changes: 16 additions & 10 deletions src/qs8-gemm/MRx32c4-hvx.c.in
@@ -7,6 +7,7 @@ $assert REQUANTIZATION == "FP32" or not REQUANTIZATION
$assert DATATYPE in ["QC8"]

#include <assert.h>
+ #include <string.h> // for memcpy

#include <hexagon_types.h>
#include <hexagon_protos.h>
@@ -18,6 +19,11 @@ $assert DATATYPE in ["QC8"]
#include "src/xnnpack/unaligned.h"


+ static XNN_INTRINSIC void xnn_Q6_V_vstu_variable(void* addr, uint32_t n,
+                                                  HVX_Vector vin) {
+   memcpy(addr, &vin, n);
+ }

$DATATYPE_SPEC = {"QC8": "qs8_qc8w", "QD8": "qd8_f32_qc8w", "QS8": "qs8", "QU8": "qu8", "QC4": "qd8_f32_qc4w"}[DATATYPE]
$REQUANTIZATION_SPEC = "" if DATATYPE in ["QD8", "QC4"] else "_" + REQUANTIZATION.lower()
$PARAMS_STRUCT = REQUANTIZATION.lower() + "_scalar" if REQUANTIZATION else "scalar"
@@ -74,23 +80,23 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x32c4__
const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->${PARAMS_STRUCT}.output_min);

do {
- HVX_Vector vacc0x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc0x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x0x32 = Q6_V_vsplat_R(0);
$for M in range(1, MR):
- HVX_Vector vacc${M}x32 = *((HVX_Vector*)w);
- HVX_Vector vacc1x${M}x32 = Q6_V_vsplat_R(0);
+ HVX_Vector vacc${M}x32 = *((HVX_UVector*)w);
+ HVX_Vector vacc1x${M}x32 = Q6_V_vsplat_R(0);

w = (const int32_t*) w + 32;

size_t k = kc;
while (k >= 8 * sizeof(${XINT8_T})) {
$for M in range(MR):
const HVX_Vector va${M}x0123 = Q6_V_vsplat_R(unaligned_load_s32(a${M}));
const HVX_Vector va${M}x4567 = Q6_V_vsplat_R(unaligned_load_s32(a${M}+4));
a${M} += 8;

- const HVX_Vector vb32x0123 = *((HVX_Vector *)((${XINT8_T} *)w));
- const HVX_Vector vb32x4567 = *((HVX_Vector *)((${XINT8_T} *)w + 128));
+ const HVX_Vector vb32x0123 = *((HVX_UVector *)((${XINT8_T} *)w));
+ const HVX_Vector vb32x4567 = *((HVX_UVector *)((${XINT8_T} *)w + 128));
$for M in range(MR):
vacc${M}x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc${M}x32, va${M}x0123, vb32x0123);
vacc1x${M}x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc1x${M}x32, va${M}x4567, vb32x4567);
@@ -107,7 +113,7 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x32c4__
const HVX_Vector va${M}x0123 = Q6_V_vsplat_R(unaligned_load_s32(a${M}));
a${M} += 4;

- const HVX_Vector vb32x0123 = *((HVX_Vector *)((${XINT8_T} *)w));
+ const HVX_Vector vb32x0123 = *((HVX_UVector *)((${XINT8_T} *)w));
$for M in range(MR):
vacc${M}x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc${M}x32, va${M}x0123, vb32x0123);

@@ -153,7 +159,7 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x32c4__
} else {
// Prepare mask for valid 8-bit elements (depends on nc).
$for M in range(MR):
- Q6_V_vstu_variable(c${M}, nc, vout${M}x32);
+ xnn_Q6_V_vstu_variable(c${M}, nc, vout${M}x32);
nc = 0;
}
} while (nc != 0);
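The template change above (and the regenerated kernels that follow) amounts to two fixes. First, loads of packed weights through HVX_Vector* become loads through HVX_UVector*: HVX_Vector carries a full vector-alignment assumption, while HVX_UVector is its unaligned counterpart, so dereferencing it makes the compiler emit loads that are safe when w is not vector-aligned. Second, the remainder store Q6_V_vstu_variable is replaced by the local xnn_Q6_V_vstu_variable, which simply memcpy's the first nc bytes of the result vector; memcpy is well defined for any destination alignment and any byte count up to the vector size, so no masked-store sequence is needed. A standalone sketch of the load idiom, assuming the usual hexagon_types.h definitions (the function names here are illustrative):

#include <hexagon_types.h>

// Aligned load: only valid when p is aligned to the full HVX vector size.
static HVX_Vector load_aligned(const void* p) {
  return *(const HVX_Vector*)p;
}

// Unaligned load: HVX_UVector relaxes the alignment assumption, so the
// compiler generates an unaligned vector load sequence instead.
static HVX_Vector load_unaligned(const void* p) {
  return *(const HVX_UVector*)p;
}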
76 changes: 41 additions & 35 deletions src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-16x32c4-minmax-fp32-hvx.c
@@ -10,6 +10,7 @@


#include <assert.h>
+ #include <string.h> // for memcpy

#include <hexagon_types.h>
#include <hexagon_protos.h>
@@ -21,6 +22,11 @@
#include "src/xnnpack/unaligned.h"


static XNN_INTRINSIC void xnn_Q6_V_vstu_variable(void* addr, uint32_t n,
HVX_Vector vin) {
memcpy(addr, &vin, n);
}


void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_16x32c4__hvx(
size_t mr,
@@ -143,37 +149,37 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_16x32c4__hvx(
const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->fp32_scalar.output_min);

do {
- HVX_Vector vacc0x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc0x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x0x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc1x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc1x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x1x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc2x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc2x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x2x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc3x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc3x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x3x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc4x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc4x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x4x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc5x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc5x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x5x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc6x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc6x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x6x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc7x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc7x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x7x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc8x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc8x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x8x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc9x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc9x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x9x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc10x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc10x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x10x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc11x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc11x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x11x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc12x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc12x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x12x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc13x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc13x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x13x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc14x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc14x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x14x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc15x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc15x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x15x32 = Q6_V_vsplat_R(0);

w = (const int32_t*) w + 32;
@@ -229,8 +235,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_16x32c4__hvx(
const HVX_Vector va15x4567 = Q6_V_vsplat_R(unaligned_load_s32(a15+4));
a15 += 8;

- const HVX_Vector vb32x0123 = *((HVX_Vector *)((int8_t *)w));
- const HVX_Vector vb32x4567 = *((HVX_Vector *)((int8_t *)w + 128));
+ const HVX_Vector vb32x0123 = *((HVX_UVector *)((int8_t *)w));
+ const HVX_Vector vb32x4567 = *((HVX_UVector *)((int8_t *)w + 128));
vacc0x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc0x32, va0x0123, vb32x0123);
vacc1x0x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc1x0x32, va0x4567, vb32x4567);
vacc1x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc1x32, va1x0123, vb32x0123);
@@ -319,7 +325,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_16x32c4__hvx(
const HVX_Vector va15x0123 = Q6_V_vsplat_R(unaligned_load_s32(a15));
a15 += 4;

- const HVX_Vector vb32x0123 = *((HVX_Vector *)((int8_t *)w));
+ const HVX_Vector vb32x0123 = *((HVX_UVector *)((int8_t *)w));
vacc0x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc0x32, va0x0123, vb32x0123);
vacc1x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc1x32, va1x0123, vb32x0123);
vacc2x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc2x32, va2x0123, vb32x0123);
@@ -548,22 +554,22 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_16x32c4__hvx(
nc -= 32;
} else {
// Prepare mask for valid 8-bit elements (depends on nc).
- Q6_V_vstu_variable(c0, nc, vout0x32);
- Q6_V_vstu_variable(c1, nc, vout1x32);
- Q6_V_vstu_variable(c2, nc, vout2x32);
- Q6_V_vstu_variable(c3, nc, vout3x32);
- Q6_V_vstu_variable(c4, nc, vout4x32);
- Q6_V_vstu_variable(c5, nc, vout5x32);
- Q6_V_vstu_variable(c6, nc, vout6x32);
- Q6_V_vstu_variable(c7, nc, vout7x32);
- Q6_V_vstu_variable(c8, nc, vout8x32);
- Q6_V_vstu_variable(c9, nc, vout9x32);
- Q6_V_vstu_variable(c10, nc, vout10x32);
- Q6_V_vstu_variable(c11, nc, vout11x32);
- Q6_V_vstu_variable(c12, nc, vout12x32);
- Q6_V_vstu_variable(c13, nc, vout13x32);
- Q6_V_vstu_variable(c14, nc, vout14x32);
- Q6_V_vstu_variable(c15, nc, vout15x32);
+ xnn_Q6_V_vstu_variable(c0, nc, vout0x32);
+ xnn_Q6_V_vstu_variable(c1, nc, vout1x32);
+ xnn_Q6_V_vstu_variable(c2, nc, vout2x32);
+ xnn_Q6_V_vstu_variable(c3, nc, vout3x32);
+ xnn_Q6_V_vstu_variable(c4, nc, vout4x32);
+ xnn_Q6_V_vstu_variable(c5, nc, vout5x32);
+ xnn_Q6_V_vstu_variable(c6, nc, vout6x32);
+ xnn_Q6_V_vstu_variable(c7, nc, vout7x32);
+ xnn_Q6_V_vstu_variable(c8, nc, vout8x32);
+ xnn_Q6_V_vstu_variable(c9, nc, vout9x32);
+ xnn_Q6_V_vstu_variable(c10, nc, vout10x32);
+ xnn_Q6_V_vstu_variable(c11, nc, vout11x32);
+ xnn_Q6_V_vstu_variable(c12, nc, vout12x32);
+ xnn_Q6_V_vstu_variable(c13, nc, vout13x32);
+ xnn_Q6_V_vstu_variable(c14, nc, vout14x32);
+ xnn_Q6_V_vstu_variable(c15, nc, vout15x32);
nc = 0;
}
} while (nc != 0);
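Two details of the generated 16x32 kernel above are worth noting. Each output row carries two accumulators, vaccNx32 for input bytes 0-3 and vacc1xNx32 for bytes 4-7, which lets the two Q6_Vw_vrmpyacc_VwVbVb chains run independently; presumably they are summed after the main k loop, outside the hunks shown here. And despite the retained "Prepare mask" comment, the new tail store involves no mask at all: it is behaviorally a truncated byte copy. A reference equivalent of xnn_Q6_V_vstu_variable (illustrative only):

#include <stdint.h>

// Reference behavior of the new tail store: write exactly nc bytes of the
// 128-byte result vector to c and leave everything past c[nc - 1] untouched.
static void store_tail_ref(int8_t* c, uint32_t nc, const int8_t vout[128]) {
  for (uint32_t i = 0; i < nc; i++) {
    c[i] = vout[i];
  }
}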
16 changes: 11 additions & 5 deletions src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x32c4-minmax-fp32-hvx.c
@@ -10,6 +10,7 @@


#include <assert.h>
+ #include <string.h> // for memcpy

#include <hexagon_types.h>
#include <hexagon_protos.h>
@@ -21,6 +22,11 @@
#include "src/xnnpack/unaligned.h"


+ static XNN_INTRINSIC void xnn_Q6_V_vstu_variable(void* addr, uint32_t n,
+                                                  HVX_Vector vin) {
+   memcpy(addr, &vin, n);
+ }


void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x32c4__hvx(
size_t mr,
@@ -53,7 +59,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x32c4__hvx(
const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->fp32_scalar.output_min);

do {
- HVX_Vector vacc0x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc0x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x0x32 = Q6_V_vsplat_R(0);

w = (const int32_t*) w + 32;
@@ -64,8 +70,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x32c4__hvx(
const HVX_Vector va0x4567 = Q6_V_vsplat_R(unaligned_load_s32(a0+4));
a0 += 8;

- const HVX_Vector vb32x0123 = *((HVX_Vector *)((int8_t *)w));
- const HVX_Vector vb32x4567 = *((HVX_Vector *)((int8_t *)w + 128));
+ const HVX_Vector vb32x0123 = *((HVX_UVector *)((int8_t *)w));
+ const HVX_Vector vb32x4567 = *((HVX_UVector *)((int8_t *)w + 128));
vacc0x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc0x32, va0x0123, vb32x0123);
vacc1x0x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc1x0x32, va0x4567, vb32x4567);

@@ -79,7 +85,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x32c4__hvx(
const HVX_Vector va0x0123 = Q6_V_vsplat_R(unaligned_load_s32(a0));
a0 += 4;

- const HVX_Vector vb32x0123 = *((HVX_Vector *)((int8_t *)w));
+ const HVX_Vector vb32x0123 = *((HVX_UVector *)((int8_t *)w));
vacc0x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc0x32, va0x0123, vb32x0123);

w = (const int8_t*) w + 128;
@@ -113,7 +119,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x32c4__hvx(
nc -= 32;
} else {
// Prepare mask for valid 8-bit elements (depends on nc).
- Q6_V_vstu_variable(c0, nc, vout0x32);
+ xnn_Q6_V_vstu_variable(c0, nc, vout0x32);
nc = 0;
}
} while (nc != 0);
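The 1x32 variant above is the single-row edition of the same kernel; when the number of rows is not a multiple of the largest tile, callers fall back to smaller mr variants for the leftovers. An illustrative dispatch (this is not XNNPACK's actual selection logic, just the shape of the idea):

#include <stddef.h>

// Pick the tallest available tile that does not exceed the rows remaining.
static size_t pick_mr(size_t remaining_rows) {
  if (remaining_rows >= 16) return 16;
  if (remaining_rows >= 8) return 8;
  if (remaining_rows >= 4) return 4;
  return 1;
}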
28 changes: 17 additions & 11 deletions src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x32c4-minmax-fp32-hvx.c
@@ -10,6 +10,7 @@


#include <assert.h>
+ #include <string.h> // for memcpy

#include <hexagon_types.h>
#include <hexagon_protos.h>
@@ -21,6 +22,11 @@
#include "src/xnnpack/unaligned.h"


+ static XNN_INTRINSIC void xnn_Q6_V_vstu_variable(void* addr, uint32_t n,
+                                                  HVX_Vector vin) {
+   memcpy(addr, &vin, n);
+ }


void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x32c4__hvx(
size_t mr,
@@ -71,13 +77,13 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x32c4__hvx(
const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->fp32_scalar.output_min);

do {
- HVX_Vector vacc0x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc0x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x0x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc1x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc1x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x1x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc2x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc2x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x2x32 = Q6_V_vsplat_R(0);
- HVX_Vector vacc3x32 = *((HVX_Vector*)w);
+ HVX_Vector vacc3x32 = *((HVX_UVector*)w);
HVX_Vector vacc1x3x32 = Q6_V_vsplat_R(0);

w = (const int32_t*) w + 32;
@@ -97,8 +103,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x32c4__hvx(
const HVX_Vector va3x4567 = Q6_V_vsplat_R(unaligned_load_s32(a3+4));
a3 += 8;

- const HVX_Vector vb32x0123 = *((HVX_Vector *)((int8_t *)w));
- const HVX_Vector vb32x4567 = *((HVX_Vector *)((int8_t *)w + 128));
+ const HVX_Vector vb32x0123 = *((HVX_UVector *)((int8_t *)w));
+ const HVX_Vector vb32x4567 = *((HVX_UVector *)((int8_t *)w + 128));
vacc0x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc0x32, va0x0123, vb32x0123);
vacc1x0x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc1x0x32, va0x4567, vb32x4567);
vacc1x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc1x32, va1x0123, vb32x0123);
@@ -127,7 +133,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x32c4__hvx(
const HVX_Vector va3x0123 = Q6_V_vsplat_R(unaligned_load_s32(a3));
a3 += 4;

- const HVX_Vector vb32x0123 = *((HVX_Vector *)((int8_t *)w));
+ const HVX_Vector vb32x0123 = *((HVX_UVector *)((int8_t *)w));
vacc0x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc0x32, va0x0123, vb32x0123);
vacc1x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc1x32, va1x0123, vb32x0123);
vacc2x32 = Q6_Vw_vrmpyacc_VwVbVb(vacc2x32, va2x0123, vb32x0123);
@@ -200,10 +206,10 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x32c4__hvx(
nc -= 32;
} else {
// Prepare mask for valid 8-bit elements (depends on nc).
- Q6_V_vstu_variable(c0, nc, vout0x32);
- Q6_V_vstu_variable(c1, nc, vout1x32);
- Q6_V_vstu_variable(c2, nc, vout2x32);
- Q6_V_vstu_variable(c3, nc, vout3x32);
+ xnn_Q6_V_vstu_variable(c0, nc, vout0x32);
+ xnn_Q6_V_vstu_variable(c1, nc, vout1x32);
+ xnn_Q6_V_vstu_variable(c2, nc, vout2x32);
+ xnn_Q6_V_vstu_variable(c3, nc, vout3x32);
nc = 0;
}
} while (nc != 0);