diff --git a/BUILD.bazel b/BUILD.bazel index 6432817b6e1..7d7a96e3fa8 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -178,6 +178,9 @@ MICROKERNEL_DEFS = [ "src/qs8-vmul/qs8-vmul-minmax-rndnu.h", "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h", "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h", + "src/qs8-vprelu/qs8-vprelu.h", + "src/qs8-vpreluc/qs8-vpreluc.h", + "src/qs8-vrpreluc/qs8-vrpreluc.h", "src/qu8-dwconv/qu8-dwconv-minmax-fp32.h", "src/qu8-dwconv/qu8-dwconv-minmax-rndnu.h", "src/qu8-f32-vcvt/qu8-f32-vcvt.h", @@ -189,6 +192,9 @@ MICROKERNEL_DEFS = [ "src/qu8-vmul/qu8-vmul-minmax-rndnu.h", "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h", "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h", + "src/qu8-vprelu/qu8-vprelu.h", + "src/qu8-vpreluc/qu8-vpreluc.h", + "src/qu8-vrpreluc/qu8-vrpreluc.h", "src/s8-maxpool/s8-maxpool-minmax.h", "src/s8-vclamp/s8-vclamp.h", "src/u8-maxpool/u8-maxpool-minmax.h", diff --git a/cmake/gen/avx2_microkernels.cmake b/cmake/gen/avx2_microkernels.cmake index 29a419e7153..3eb82a52097 100644 --- a/cmake/gen/avx2_microkernels.cmake +++ b/cmake/gen/avx2_microkernels.cmake @@ -61,6 +61,9 @@ SET(PROD_AVX2_MICROKERNEL_SRCS src/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-u16.c src/qs8-vcvt/gen/qs8-vcvt-avx2-u32.c src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c + src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c + src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c + src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx2-mul32.c src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx2-mul32.c src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-u16.c @@ -73,6 +76,9 @@ SET(PROD_AVX2_MICROKERNEL_SRCS src/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-u16.c src/qu8-vcvt/gen/qu8-vcvt-avx2-u32.c src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c + src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c + src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c + src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c src/s8-vclamp/s8-vclamp-avx2-u128.c src/u8-vclamp/u8-vclamp-avx2-u128.c src/x8-lut/gen/x8-lut-avx2-u128.c diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index 77e90c12e9d..2d44202c44a 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -198,6 +198,9 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c src/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-u4.c src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u4.c + src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c + src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c + src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c @@ -226,6 +229,9 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c src/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-u4.c src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u4.c + src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c + src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c + src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c src/s8-ibilinear/gen/s8-ibilinear-scalar-c1.c src/s8-maxpool/gen/s8-maxpool-9p-minmax-scalar-u1.c src/s8-rdminmax/gen/s8-rdmax-2p2x-scalar-c2.c @@ -659,6 +665,15 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-u2.c src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u1.c src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u2.c + src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c + src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c + 
src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c + src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c + src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c + src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c + src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c + src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c + src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c @@ -745,6 +760,15 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-u2.c src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u1.c src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u2.c + src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c + src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c + src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c + src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c + src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c + src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c + src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c + src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c + src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c src/s8-ibilinear/gen/s8-ibilinear-scalar-c2.c src/s8-ibilinear/gen/s8-ibilinear-scalar-c4.c src/s8-rminmax/gen/s8-rmax-scalar-u1.c diff --git a/gen/avx2_microkernels.bzl b/gen/avx2_microkernels.bzl index 61961a62c9b..ef8ea72d8d2 100644 --- a/gen/avx2_microkernels.bzl +++ b/gen/avx2_microkernels.bzl @@ -57,6 +57,9 @@ PROD_AVX2_MICROKERNEL_SRCS = [ "src/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-u16.c", "src/qs8-vcvt/gen/qs8-vcvt-avx2-u32.c", "src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c", + "src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c", + "src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c", + "src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c", "src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx2-mul32.c", "src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx2-mul32.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-u16.c", @@ -69,6 +72,9 @@ PROD_AVX2_MICROKERNEL_SRCS = [ "src/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-u16.c", "src/qu8-vcvt/gen/qu8-vcvt-avx2-u32.c", "src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c", + "src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c", + "src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c", + "src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c", "src/s8-vclamp/s8-vclamp-avx2-u128.c", "src/u8-vclamp/u8-vclamp-avx2-u128.c", "src/x8-lut/gen/x8-lut-avx2-u128.c", diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index ad7d1cf7385..9d83428319e 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -194,6 +194,9 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c", "src/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-u4.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u4.c", + "src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c", + "src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c", + "src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c", "src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c", "src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c", "src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c", @@ -222,6 +225,9 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c", "src/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-u4.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u4.c", + "src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c", + "src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c", + "src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c", "src/s8-ibilinear/gen/s8-ibilinear-scalar-c1.c", 
"src/s8-maxpool/gen/s8-maxpool-9p-minmax-scalar-u1.c", "src/s8-rdminmax/gen/s8-rdmax-2p2x-scalar-c2.c", @@ -656,6 +662,15 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-u2.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u1.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u2.c", + "src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c", + "src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c", + "src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c", + "src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c", + "src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c", + "src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c", + "src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c", + "src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c", + "src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c", "src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c", "src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c", "src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c", @@ -742,6 +757,15 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-u2.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u1.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u2.c", + "src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c", + "src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c", + "src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c", + "src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c", + "src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c", + "src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c", + "src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c", + "src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c", + "src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c", "src/s8-ibilinear/gen/s8-ibilinear-scalar-c2.c", "src/s8-ibilinear/gen/s8-ibilinear-scalar-c4.c", "src/s8-rminmax/gen/s8-rmax-scalar-u1.c", diff --git a/scripts/generate-qs8-vprelu.sh b/scripts/generate-qs8-vprelu.sh new file mode 100755 index 00000000000..08add47036e --- /dev/null +++ b/scripts/generate-qs8-vprelu.sh @@ -0,0 +1,48 @@ +# Copyright (C) 2024 Intel Corporation +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# +# SPDX-License-Identifier: BSD-3-Clause + +#################################### Scalar ################################### +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QS8 -o src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QS8 -o src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QS8 -o src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c & + +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QU8 -o src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QU8 -o src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QU8 -o src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c & + +#################################### AVX2 ################################### +tools/xngen src/qs8-vprelu/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QS8 -o src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c & + +tools/xngen src/qs8-vprelu/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QU8 -o src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c & + + +wait + diff --git a/scripts/generate-qs8-vpreluc.sh b/scripts/generate-qs8-vpreluc.sh new file mode 100755 index 00000000000..2fd28edc7f2 --- /dev/null +++ b/scripts/generate-qs8-vpreluc.sh @@ -0,0 +1,48 @@ +# Copyright (C) 2024 Intel Corporation +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# +# SPDX-License-Identifier: BSD-3-Clause + +#################################### Scalar ################################### +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QS8 -o src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QS8 -o src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QS8 -o src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c & + +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QU8 -o src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QU8 -o src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QU8 -o src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c & + +#################################### AVX2 ################################### +tools/xngen src/qs8-vpreluc/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QS8 -o src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c & + +tools/xngen src/qs8-vpreluc/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QU8 -o src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c & + + +wait + diff --git a/scripts/generate-qs8-vrpreluc.sh b/scripts/generate-qs8-vrpreluc.sh new file mode 100755 index 00000000000..485ce7d74f6 --- /dev/null +++ b/scripts/generate-qs8-vrpreluc.sh @@ -0,0 +1,47 @@ +# Copyright (C) 2024 Intel Corporation +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# +# SPDX-License-Identifier: BSD-3-Clause + +#################################### Scalar ################################### +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QS8 -o src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QS8 -o src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QS8 -o src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c & + +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QU8 -o src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QU8 -o src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QU8 -o src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c & + +#################################### AVX2 ################################### +tools/xngen src/qs8-vrpreluc/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QS8 -o src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c & + +tools/xngen src/qs8-vrpreluc/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QU8 -o src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c & + + +wait diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index acb71e91197..48c9012b4ab 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -136,6 +136,12 @@ tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b - tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qu8-vmulc-minmax-fp32 --output test/qu8-vmulc-minmax-fp32.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qu8-vmulc-minmax-rndnu --output test/qu8-vmulc-minmax-rndnu.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel qs8-vprelu --output test/qs8-vprelu.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qs8-vpreluc --output test/qs8-vpreluc.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qs8-vrpreluc --output test/qs8-vrpreluc.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel qu8-vprelu --output test/qu8-vprelu.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qu8-vpreluc --output test/qu8-vpreluc.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qu8-vrpreluc --output test/qu8-vrpreluc.cc & ### Tests for VUnary micro-kernels tools/generate-vunary-test.py --ukernel f16-vabs --output test/f16-vabs.cc & tools/generate-vunary-test.py --ukernel f16-vapproxgelu --output test/f16-vapproxgelu.cc & diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 0fc41e25fef..652182bd6f0 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -34,9 +34,11 @@ static struct xnn_binary_elementwise_config f32_vsqrdiff_config = {0}; static struct xnn_binary_elementwise_config qs8_vadd_config = {0}; static struct xnn_binary_elementwise_config qs8_vmul_config = {0}; +static struct xnn_binary_elementwise_config qs8_vprelu_config = 
{0}; static struct xnn_binary_elementwise_config qu8_vadd_config = {0}; static struct xnn_binary_elementwise_config qu8_vmul_config = {0}; +static struct xnn_binary_elementwise_config qu8_vprelu_config = {0}; XNN_INIT_ONCE_GUARD(f16_vadd); XNN_INIT_ONCE_GUARD(f16_vdiv); @@ -57,9 +59,10 @@ XNN_INIT_ONCE_GUARD(f32_vsub); XNN_INIT_ONCE_GUARD(f32_vsqrdiff); XNN_INIT_ONCE_GUARD(qs8_vadd); XNN_INIT_ONCE_GUARD(qs8_vmul); +XNN_INIT_ONCE_GUARD(qs8_vprelu); XNN_INIT_ONCE_GUARD(qu8_vadd); XNN_INIT_ONCE_GUARD(qu8_vmul); - +XNN_INIT_ONCE_GUARD(qu8_vprelu); static void init_f16_vadd_config(void) { #if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR @@ -1143,6 +1146,32 @@ static void init_qs8_vmul_config(void) { #endif } +static void init_qs8_vprelu_config(void) { + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->use_x86_avx2) { + qs8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vprelu_ukernel__avx2_u16; + qs8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vpreluc_ukernel__avx2_u16; + qs8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vrpreluc_ukernel__avx2_u16; + qs8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_vprelu_scalar_params; + qs8_vprelu_config.element_tile = 16; + } else { + qs8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vprelu_ukernel__scalar_u8; + qs8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vpreluc_ukernel__scalar_u8; + qs8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vrpreluc_ukernel__scalar_u8; + qs8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_vprelu_scalar_params; + qs8_vprelu_config.element_tile = 8; + } + #else + qs8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vprelu_ukernel__scalar_u8; + qs8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vpreluc_ukernel__scalar_u8; + qs8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vrpreluc_ukernel__scalar_u8; + qs8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_vprelu_scalar_params; + qs8_vprelu_config.element_tile = 8; + #endif +} + static void init_qu8_vadd_config(void) { #if XNN_ARCH_ARM const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1292,6 +1321,32 @@ static void init_qu8_vmul_config(void) { #endif } +static void init_qu8_vprelu_config(void) { + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->use_x86_avx2) { + qu8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vprelu_ukernel__avx2_u16; + qu8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vpreluc_ukernel__avx2_u16; + qu8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vrpreluc_ukernel__avx2_u16; + qu8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_vprelu_scalar_params; + qu8_vprelu_config.element_tile = 16; + } else { + qu8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vprelu_ukernel__scalar_u8; + qu8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vpreluc_ukernel__scalar_u8; + qu8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vrpreluc_ukernel__scalar_u8; + qu8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_vprelu_scalar_params; + qu8_vprelu_config.element_tile = 8; + } + #else + 
qu8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vprelu_ukernel__scalar_u8; + qu8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vpreluc_ukernel__scalar_u8; + qu8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vrpreluc_ukernel__scalar_u8; + qu8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_vprelu_scalar_params; + qu8_vprelu_config.element_tile = 8; + #endif +} + const struct xnn_binary_elementwise_config* xnn_init_f16_vadd_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { @@ -1463,6 +1518,15 @@ const struct xnn_binary_elementwise_config* xnn_init_qs8_vmul_config() { return &qs8_vmul_config; } +const struct xnn_binary_elementwise_config* xnn_init_qs8_vprelu_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL) { + return NULL; + } + XNN_INIT_ONCE(qs8_vprelu); + return &qs8_vprelu_config; +} + const struct xnn_binary_elementwise_config* xnn_init_qu8_vadd_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { @@ -1480,3 +1544,12 @@ const struct xnn_binary_elementwise_config* xnn_init_qu8_vmul_config() { XNN_INIT_ONCE(qu8_vmul); return &qu8_vmul_config; } + +const struct xnn_binary_elementwise_config* xnn_init_qu8_vprelu_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL) { + return NULL; + } + XNN_INIT_ONCE(qu8_vprelu); + return &qu8_vprelu_config; +} diff --git a/src/microparams-init.c b/src/microparams-init.c index 04ebc7472f2..adf2485afc3 100644 --- a/src/microparams-init.c +++ b/src/microparams-init.c @@ -934,6 +934,30 @@ size_t xnn_init_qs8_add_minmax_scalar_params( return sizeof(uparams->scalar); } +size_t xnn_init_qs8_vprelu_scalar_params( + union xnn_qs8_vprelu_scalar_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization) { + assert(a_quantization); + assert(b_quantization); + assert(output_quantization); + const float negative_product_scale = (a_quantization->scale * b_quantization->scale) / output_quantization->scale; + const float positive_product_scale = a_quantization->scale / output_quantization->scale; + const float rprelu_positive_product_scale = b_quantization->scale / output_quantization->scale; + assert(negative_product_scale >= 0x1.0p-16f); + assert(negative_product_scale < 0x1.0p+8f); + uparams->scalar.input_zero_point = a_quantization->zero_point; + uparams->scalar.slope_zero_point = b_quantization->zero_point; + uparams->scalar.negative_multiplier = negative_product_scale; + uparams->scalar.positive_multiplier = positive_product_scale; + uparams->scalar.rprelu_positive_multiplier = rprelu_positive_product_scale; + uparams->scalar.output_zero_point = output_quantization->zero_point; + uparams->scalar.output_min = INT8_MIN; + uparams->scalar.output_max = INT8_MAX; + return sizeof(uparams->scalar); +} + size_t xnn_init_qu8_mul_minmax_scalar_params( union xnn_qu8_mul_minmax_params uparams[XNN_MIN_ELEMENTS(1)], const struct xnn_quantization_params* a_quantization, @@ -1000,6 +1024,31 @@ size_t xnn_init_qu8_mul_minmax_rndnu_neon_params( } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +size_t xnn_init_qu8_vprelu_scalar_params( + union 
xnn_qs8_vprelu_scalar_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization) { + assert(a_quantization); + assert(b_quantization); + assert(output_quantization); + const float negative_product_scale = (a_quantization->scale * b_quantization->scale) / output_quantization->scale; + const float positive_product_scale = a_quantization->scale / output_quantization->scale; + const float rprelu_positive_product_scale = b_quantization->scale / output_quantization->scale; + assert(negative_product_scale >= 0x1.0p-16f); + assert(negative_product_scale < 0x1.0p+8f); + uparams->scalar.input_zero_point = a_quantization->zero_point; + uparams->scalar.slope_zero_point = b_quantization->zero_point; + uparams->scalar.negative_multiplier = negative_product_scale; + uparams->scalar.positive_multiplier = positive_product_scale; + uparams->scalar.rprelu_positive_multiplier = rprelu_positive_product_scale; + uparams->scalar.output_zero_point = output_quantization->zero_point; + uparams->scalar.output_min = 0; + uparams->scalar.output_max = UINT8_MAX; + return sizeof(uparams->scalar); +} + + size_t xnn_init_qs8_mul_minmax_scalar_params( union xnn_qs8_mul_minmax_params uparams[XNN_MIN_ELEMENTS(1)], const struct xnn_quantization_params* a_quantization, diff --git a/src/operators/binary-elementwise-nd.c b/src/operators/binary-elementwise-nd.c index 7e100fa5784..1a3b9f863ae 100644 --- a/src/operators/binary-elementwise-nd.c +++ b/src/operators/binary-elementwise-nd.c @@ -119,6 +119,10 @@ static const struct xnn_binary_elementwise_config* init_config( return xnn_init_f32_vprelu_config(); case xnn_datatype_fp16: return xnn_init_f16_vprelu_config(); + case xnn_datatype_qint8: + return xnn_init_qs8_vprelu_config(); + case xnn_datatype_quint8: + return xnn_init_qu8_vprelu_config(); default: return NULL; } diff --git a/src/qs8-vprelu/avx2.c.in b/src/qs8-vprelu/avx2.c.in new file mode 100644 index 00000000000..df3569883cf --- /dev/null +++ b/src/qs8-vprelu/avx2.c.in @@ -0,0 +1,167 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+$assert DATATYPE in ["QS8", "QU8"]
+$assert BATCH_TILE >= 8
+$assert BATCH_TILE == 8 or BATCH_TILE % 16 == 0
+$SIMD_TILE = BATCH_TILE // 16
+
+#include <assert.h>
+#include <immintrin.h>
+#include <stdint.h>
+#include "src/xnnpack/intrinsics-polyfill.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vbinary.h"
+
+$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
+$_MM256_CVTEPX8_EPI32 = {"QS8": "_mm256_cvtepi8_epi32", "QU8": "_mm256_cvtepu8_epi32"}[DATATYPE]
+$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE]
+void xnn_${DATATYPE.lower()}_vprelu_ukernel__avx2_u${BATCH_TILE}(
+    size_t batch,
+    const ${XINT8_T}* input_a,
+    const ${XINT8_T}* input_b,
+    ${XINT8_T}* output,
+    const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(batch != 0);
+  assert(batch % sizeof(${XINT8_T}) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point);
+  const __m256i vslope_zero_point = _mm256_set1_epi32(params->scalar.slope_zero_point);
+  const __m256i voutput_zero_point = _mm256_set1_epi32(params->scalar.output_zero_point);
+  const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier);
+  const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier);
+  const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point);
+  const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point);
+  const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f);
+  const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point);
+
+  $if BATCH_TILE > 8:
+    for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) {
+      __m256i va0 = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a));
+      __m256i vb0 = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_b));
+
+      $for N in range(1, 2*SIMD_TILE):
+        __m256i va${N} = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) (input_a + ${N * 8})));
+        __m256i vb${N} = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) (input_b + ${N * 8})));
+      input_a += ${BATCH_TILE};
+      input_b += ${BATCH_TILE};
+
+      $for N in range(2*SIMD_TILE):
+        __m256i va${N}_sub = _mm256_sub_epi32(va${N}, vinput_zero_point);
+        __m256i vb${N}_sub = _mm256_sub_epi32(vb${N}, vslope_zero_point);
+        __m256i vcompare${N} = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va${N}_sub);
+        __m256i vacc${N} = _mm256_blendv_epi8(va${N}_sub, _mm256_mullo_epi32(va${N}_sub, vb${N}_sub), vcompare${N});
+
+      $for N in range(2*SIMD_TILE):
+        __m256 vscale${N} = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare${N}));
+ __m256 vfpacc${N} = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc${N}), vscale${N}); + + $for N in range(2*SIMD_TILE): + __m256 vfpacc_clamped${N} = _mm256_min_ps(_mm256_max_ps(vfpacc${N}, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased${N} = _mm256_add_ps(vfpacc_clamped${N}, vmagic_bias); + __m256i vout${N} = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased${N}), vmagic_bias_less_output_zero_point); + + $for N in range(2*SIMD_TILE): + const __m128i vout_low${N} = _mm256_castsi256_si128(vout${N}); + const __m128i vout_high${N} = _mm256_extracti128_si256(vout${N}, 1); + const __m128i vout_packed16${N} = _mm_packs_epi32(vout_low${N}, vout_high${N}); + __m128i vout_final${N} = ${_MM_PACKXS_EPI16}(vout_packed16${N}, vout_packed16${N}); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + $for N in range(1, 2*SIMD_TILE): + _mm_storeu_si64((__m128i*)(output + ${N*8}), vout_final${N}); + + output += ${BATCH_TILE}; + } + + for (; batch >= 8 * sizeof(${XINT8_T}); batch -= 8 * sizeof(${XINT8_T})) { + __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a)); + __m256i vb = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_b)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + input_b+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed16, vout_packed16); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(${XINT8_T})); + assert(batch <= 7 * sizeof(${XINT8_T})); + + const __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i vb = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) input_b)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = 
_mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed16, vout_packed16); + + if (batch & (4 * sizeof(${XINT8_T}))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(${XINT8_T}))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(${XINT8_T}))) { + *output = (${XINT8_T}) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c b/src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c new file mode 100644 index 00000000000..3ca94449696 --- /dev/null +++ b/src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c @@ -0,0 +1,171 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+
+#include <assert.h>
+#include <immintrin.h>
+#include <stdint.h>
+#include "src/xnnpack/intrinsics-polyfill.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vbinary.h"
+
+void xnn_qs8_vprelu_ukernel__avx2_u16(
+    size_t batch,
+    const int8_t* input_a,
+    const int8_t* input_b,
+    int8_t* output,
+    const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(batch != 0);
+  assert(batch % sizeof(int8_t) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point);
+  const __m256i vslope_zero_point = _mm256_set1_epi32(params->scalar.slope_zero_point);
+  const __m256i voutput_zero_point = _mm256_set1_epi32(params->scalar.output_zero_point);
+  const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier);
+  const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier);
+  const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point);
+  const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point);
+  const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f);
+  const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point);
+
+  for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) {
+    __m256i va0 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a));
+    __m256i vb0 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_b));
+
+    __m256i va1 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8)));
+    __m256i vb1 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) (input_b + 8)));
+    input_a += 16;
+    input_b += 16;
+
+    __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point);
+    __m256i vb0_sub = _mm256_sub_epi32(vb0, vslope_zero_point);
+    __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va0_sub);
+    __m256i vacc0 = _mm256_blendv_epi8(va0_sub, _mm256_mullo_epi32(va0_sub, vb0_sub), vcompare0);
+    __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point);
+    __m256i vb1_sub = _mm256_sub_epi32(vb1, vslope_zero_point);
+    __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va1_sub);
+    __m256i vacc1 = _mm256_blendv_epi8(va1_sub, _mm256_mullo_epi32(va1_sub, vb1_sub), vcompare1);
+
+    __m256 vscale0 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0));
+    __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0);
+    __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1));
+    __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1);
+
+    __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point);
+    __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias);
+    __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point);
+    __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point);
+    __m256 vfpacc_biased1 = _mm256_add_ps(vfpacc_clamped1, vmagic_bias);
+    __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point);
+
+    const __m128i vout_low0 = 
_mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed160 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packs_epi16(vout_packed160, vout_packed160); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed161 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packs_epi16(vout_packed161, vout_packed161); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i vb = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_b)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + input_b+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed16, vout_packed16); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(int8_t)); + assert(batch <= 7 * sizeof(int8_t)); + + const __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i vb = _mm256_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) input_b)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed16, vout_packed16); + + if (batch & (4 * sizeof(int8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(int8_t))) { + 
_mm_storeu_si16(output, vout_final);
+      vout_final = _mm_srli_epi32(vout_final, 16);
+      output += 2;
+    }
+    if (batch & (1 * sizeof(int8_t))) {
+      *output = (int8_t) _mm_extract_epi8(vout_final, 0);
+    }
+  }
+}
diff --git a/src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c
new file mode 100644
index 00000000000..cc182272768
--- /dev/null
+++ b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c
@@ -0,0 +1,76 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vprelu/scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+
+#include <assert.h>
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vbinary.h"
+
+
+void xnn_qs8_vprelu_ukernel__scalar_u1(
+    size_t batch,
+    const int8_t* input_a,
+    const int8_t* input_b,
+    int8_t* output,
+    const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(int8_t) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  const int32_t input_zero_point = params->scalar.input_zero_point;
+  const int32_t slope_zero_point = params->scalar.slope_zero_point;
+  const float vpositive_multiplier = params->scalar.positive_multiplier;
+  const float vnegative_multiplier = params->scalar.negative_multiplier;
+  const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point;
+  const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point;
+  const float vmagic_bias = 12582912.0f;
+  const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point;
+
+  do {
+    const int32_t va = (int32_t) *input_a++ - input_zero_point;
+    const int32_t vb = (int32_t) *input_b++ - slope_zero_point;
+    int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va;
+    float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); +} diff --git a/src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c new file mode 100644 index 00000000000..79778d1efcd --- /dev/null +++ b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c @@ -0,0 +1,110 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+
+#include <assert.h>
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vbinary.h"
+
+
+void xnn_qs8_vprelu_ukernel__scalar_u2(
+    size_t batch,
+    const int8_t* input_a,
+    const int8_t* input_b,
+    int8_t* output,
+    const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(int8_t) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  const int32_t input_zero_point = params->scalar.input_zero_point;
+  const int32_t slope_zero_point = params->scalar.slope_zero_point;
+  const float vpositive_multiplier = params->scalar.positive_multiplier;
+  const float vnegative_multiplier = params->scalar.negative_multiplier;
+  const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point;
+  const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point;
+  const float vmagic_bias = 12582912.0f;
+  const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point;
+
+  for (; batch >= 2 * sizeof(int8_t); batch -= 2 * sizeof(int8_t)) {
+    const int32_t va0 = input_a[0] - input_zero_point;
+    const int32_t va1 = input_a[1] - input_zero_point;
+    input_a += 2;
+
+    const int32_t vb0 = input_b[0] - slope_zero_point;
+    const int32_t vb1 = input_b[1] - slope_zero_point;
+    input_b += 2;
+
+    int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0;
+    int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1;
+
+    float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier;
+    float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier;
+
+    float vfpacc0 = (float) vacc0 * vscale0;
+    float vfpacc1 = (float) vacc1 * vscale1;
+
+    vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
+    vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
+
+    vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
+    vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
+
+    vfpacc0 += vmagic_bias;
+    vfpacc1 += vmagic_bias;
+
+    const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point;
+    const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point;
+
+    output[0] = (int8_t) vout0;
+    output[1] = (int8_t) vout1;
+    output += 2;
+  }
+
+  if XNN_UNLIKELY(batch != 0) {
+    const int32_t va = (int32_t) *input_a - input_zero_point;
+    const int32_t vb = (int32_t) *input_b - slope_zero_point;
+    int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va;
+    float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier;
+    float vfpacc = (float) vacc * vscale;
+    vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+    vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+    vfpacc += vmagic_bias;
+    const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
+    *output = (int8_t) vout;
+  }
+}
diff --git a/src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c
new file mode 100644
index 00000000000..f1154357c54
--- /dev/null
+++ b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c
@@ -0,0 +1,133 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vprelu/scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+
+#include <assert.h>
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vbinary.h"
+
+
+void xnn_qs8_vprelu_ukernel__scalar_u4(
+    size_t batch,
+    const int8_t* input_a,
+    const int8_t* input_b,
+    int8_t* output,
+    const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(int8_t) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  const int32_t input_zero_point = params->scalar.input_zero_point;
+  const int32_t slope_zero_point = params->scalar.slope_zero_point;
+  const float vpositive_multiplier = params->scalar.positive_multiplier;
+  const float vnegative_multiplier = params->scalar.negative_multiplier;
+  const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point;
+  const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point;
+  const float vmagic_bias = 12582912.0f;
+  const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point;
+
+  for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) {
+    const int32_t va0 = input_a[0] - input_zero_point;
+    const int32_t va1 = input_a[1] - input_zero_point;
+    const int32_t va2 = input_a[2] - input_zero_point;
+    const int32_t va3 = input_a[3] - input_zero_point;
+    input_a += 4;
+
+    const int32_t vb0 = input_b[0] - slope_zero_point;
+    const int32_t vb1 = input_b[1] - slope_zero_point;
+    const int32_t vb2 = input_b[2] - slope_zero_point;
+    const int32_t vb3 = input_b[3] - slope_zero_point;
+    input_b += 4;
+
+    int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0;
+    int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1;
+    int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? 
va2 * vb2 : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * vb3 : va3; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c new file mode 100644 index 00000000000..222001b2fdf --- /dev/null +++ b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c @@ -0,0 +1,173 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. 
Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +void xnn_qs8_vprelu_ukernel__scalar_u8( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + const int32_t vb0 = input_b[0] - slope_zero_point; + const int32_t vb1 = input_b[1] - slope_zero_point; + const int32_t vb2 = input_b[2] - slope_zero_point; + const int32_t vb3 = input_b[3] - slope_zero_point; + const int32_t vb4 = input_b[4] - slope_zero_point; + const int32_t vb5 = input_b[5] - slope_zero_point; + const int32_t vb6 = input_b[6] - slope_zero_point; + const int32_t vb7 = input_b[7] - slope_zero_point; + input_b += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * vb2 : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * vb3 : va3; + int32_t vacc4 = XNN_UNPREDICTABLE(va4 < 0) ? va4 * vb4 : va4; + int32_t vacc5 = XNN_UNPREDICTABLE(va5 < 0) ? 
va5 * vb5 : va5; + int32_t vacc6 = XNN_UNPREDICTABLE(va6 < 0) ? va6 * vb6 : va6; + int32_t vacc7 = XNN_UNPREDICTABLE(va7 < 0) ? va7 * vb7 : va7; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(va4 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(va5 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(va6 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(va7 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output[4] = (int8_t) vout4; + output[5] = (int8_t) vout5; + output[6] = (int8_t) vout6; + output[7] = (int8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const 
int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vprelu/qs8-vprelu.h b/src/qs8-vprelu/qs8-vprelu.h new file mode 100644 index 00000000000..adaa8f4ed16 --- /dev/null +++ b/src/qs8-vprelu/qs8-vprelu.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
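// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the generated diff): the
// scalar kernels above convert the scaled accumulator back to an integer with
// the "magic bias" trick instead of lrintf(). Adding 12582912.0f (0x1.8p+23,
// bit pattern 0x4B400000) to a float of magnitude below 2^22 forces
// round-to-nearest-even into the low mantissa bits, so reinterpreting the bits
// and subtracting 0x4B400000 - output_zero_point yields the rounded value
// already shifted by the output zero point. A minimal standalone version of
// the same idea, assuming default rounding mode:
#include <stdint.h>
#include <string.h>

static inline int32_t round_with_magic_bias(float x, int32_t output_zero_point) {
  const float vmagic_bias = 12582912.0f;  // 0x1.8p+23f, bits 0x4B400000
  const int32_t vmagic_bias_less_output_zero_point =
      INT32_C(0x4B400000) - output_zero_point;
  const float vbiased = x + vmagic_bias;  // the addition rounds x to the nearest integer
  uint32_t vbits;
  memcpy(&vbits, &vbiased, sizeof(vbits));  // same role as float_as_uint32()
  return (int32_t) vbits - vmagic_bias_less_output_zero_point;
}
// e.g. round_with_magic_bias(-1.3f, 5) == 4, matching lrintf(-1.3f) + 5.
// ---------------------------------------------------------------------------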
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vprelu_ukernel__avx2_u16, 16, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vprelu_ukernel__scalar_u1, 1, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vprelu_ukernel__scalar_u2, 2, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vprelu_ukernel__scalar_u4, 4, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vprelu_ukernel__scalar_u8, 8, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif \ No newline at end of file diff --git a/src/qs8-vprelu/scalar.c.in b/src/qs8-vprelu/scalar.c.in new file mode 100644 index 00000000000..abd18dd6c71 --- /dev/null +++ b/src/qs8-vprelu/scalar.c.in @@ -0,0 +1,138 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +$assert DATATYPE in ["QS8", "QU8"] +$assert BATCH_TILE >= 1 + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +void xnn_${DATATYPE.lower()}_vprelu_ukernel__scalar_u${BATCH_TILE}( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + $if BATCH_TILE == 1: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + $else: + for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) { + $for N in range(BATCH_TILE): + const int32_t va${N} = input_a[${N}] - input_zero_point; + input_a += ${BATCH_TILE}; + + $for N in range(BATCH_TILE): + const int32_t vb${N} = input_b[${N}] - slope_zero_point; + input_b += ${BATCH_TILE}; + + $for N in range(BATCH_TILE): + int32_t vacc${N} = XNN_UNPREDICTABLE(va${N} < 0) ? va${N} * vb${N} : va${N}; + + $for N in range(BATCH_TILE): + float vscale${N} = XNN_UNPREDICTABLE(va${N} < 0) ? vnegative_multiplier : vpositive_multiplier; + + $for N in range(BATCH_TILE): + float vfpacc${N} = (float) vacc${N} * vscale${N}; + + $for N in range(BATCH_TILE): + vfpacc${N} = math_max_f32(vfpacc${N}, voutput_min_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} = math_min_f32(vfpacc${N}, voutput_max_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} += vmagic_bias; + + $for N in range(BATCH_TILE): + const int32_t vout${N} = (int32_t) float_as_uint32(vfpacc${N}) - vmagic_bias_less_output_zero_point; + + $for N in range(BATCH_TILE): + output[${N}] = (${XINT8_T}) vout${N}; + output += ${BATCH_TILE}; + } + + if XNN_UNLIKELY(batch != 0) { + $if BATCH_TILE == 2: + const int32_t va = (int32_t) *input_a - input_zero_point; + const int32_t vb = (int32_t) *input_b - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (${XINT8_T}) vout; + $else: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + } +} diff --git a/src/qs8-vpreluc/avx2.c.in b/src/qs8-vpreluc/avx2.c.in new file mode 100644 index 00000000000..540c93f0827 --- /dev/null +++ b/src/qs8-vpreluc/avx2.c.in @@ -0,0 +1,159 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +$assert DATATYPE in ["QS8", "QU8"] +$assert BATCH_TILE >= 8 +$assert BATCH_TILE == 8 or BATCH_TILE % 16 == 0 +$SIMD_TILE = BATCH_TILE // 16 + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +$_MM256_CVTEPX8_EPI32 = {"QS8": "_mm256_cvtepi8_epi32", "QU8": "_mm256_cvtepu8_epi32"}[DATATYPE] +$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] +void xnn_${DATATYPE.lower()}_vpreluc_ukernel__avx2_u${BATCH_TILE}( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + $if BATCH_TILE > 8: + for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) { + __m256i va0 = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a)); + + + $for N in range(1, 2*SIMD_TILE): + __m256i va${N} = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) (input_a + ${N * 8}))); + input_a += ${BATCH_TILE}; + + + $for N in range(2*SIMD_TILE): + __m256i va${N}_sub = _mm256_sub_epi32(va${N}, vinput_zero_point); + __m256i vcompare${N} = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va${N}_sub); + __m256i vacc${N} = _mm256_blendv_epi8(va${N}_sub, _mm256_mullo_epi32(va${N}_sub, vslope), vcompare${N}); + + $for N in range(2*SIMD_TILE): + __m256 vscale${N} = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare${N})); + __m256 vfpacc${N} = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc${N}), vscale${N}); + + $for N in range(2*SIMD_TILE): + __m256 vfpacc_clamped${N} = _mm256_min_ps(_mm256_max_ps(vfpacc${N}, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased${N} = _mm256_add_ps(vfpacc_clamped${N}, vmagic_bias); + __m256i vout${N} = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased${N}), vmagic_bias_less_output_zero_point); + + $for N in range(2*SIMD_TILE): + const __m128i vout_low${N} = _mm256_castsi256_si128(vout${N}); + const __m128i vout_high${N} = _mm256_extracti128_si256(vout${N}, 1); + const __m128i vout_packed${N} = _mm_packs_epi32(vout_low${N}, vout_high${N}); + __m128i vout_final${N} = ${_MM_PACKXS_EPI16}(vout_packed${N}, vout_packed${N}); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + $for N in range(1, 2*SIMD_TILE): + 
_mm_storeu_si64((__m128i*)(output + ${N*8}), vout_final${N}); + + output += ${BATCH_TILE}; + } + + for (; batch >= 8 * sizeof(${XINT8_T}); batch -= 8 * sizeof(${XINT8_T})) { + __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(${XINT8_T})); + assert(batch <= 7 * sizeof(${XINT8_T})); + + const __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed, vout_packed); + + if (batch & (4 * sizeof(${XINT8_T}))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(${XINT8_T}))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(${XINT8_T}))) { + *output = (${XINT8_T}) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c b/src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c new file mode 100644 index 00000000000..0be14824148 --- /dev/null +++ b/src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c @@ -0,0 +1,162 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. 
Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vpreluc_ukernel__avx2_u16( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { + __m256i va0 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + + + __m256i va1 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8))); + input_a += 16; + + + __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point); + __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va0_sub); + __m256i vacc0 = _mm256_blendv_epi8(va0_sub, _mm256_mullo_epi32(va0_sub, vslope), vcompare0); + __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point); + __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va1_sub); + __m256i vacc1 = _mm256_blendv_epi8(va1_sub, _mm256_mullo_epi32(va1_sub, vslope), vcompare1); + + __m256 vscale0 = 
_mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0)); + __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0); + __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1)); + __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1); + + __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias); + __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point); + __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased1 = _mm256_add_ps(vfpacc_clamped1, vmagic_bias); + __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point); + + const __m128i vout_low0 = _mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed0 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packs_epi16(vout_packed0, vout_packed0); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed1 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packs_epi16(vout_packed1, vout_packed1); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(int8_t)); + assert(batch <= 7 * sizeof(int8_t)); + + const __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), 
voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed, vout_packed); + + if (batch & (4 * sizeof(int8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(int8_t))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(int8_t))) { + *output = (int8_t) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c new file mode 100644 index 00000000000..9de87e0a41e --- /dev/null +++ b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c @@ -0,0 +1,73 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
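// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the generated diff): the
// AVX2 vpreluc path above is branch-free. It builds a per-lane "is negative"
// mask with _mm256_cmpgt_epi32(0, x) and uses blendv to pick either the
// pass-through value (and positive multiplier) or the slope product (and
// negative multiplier). A minimal illustration of just that select idiom, on
// values that already have the input zero point subtracted:
#include <immintrin.h>
#include <stdint.h>

// out[i] = x[i] < 0 ? x[i] * slope : x[i], for 8 int32 lanes.
static inline __m256i prelu_core_avx2(__m256i vx, int32_t slope) {
  const __m256i vslope = _mm256_set1_epi32(slope);
  const __m256i vneg_mask = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vx);
  const __m256i vscaled = _mm256_mullo_epi32(vx, vslope);
  // blendv takes bytes from the second operand where the mask byte's sign bit
  // is set; the cmpgt mask is all-ones or all-zeros per 32-bit lane.
  return _mm256_blendv_epi8(vx, vscaled, vneg_mask);
}
// The float multiplier is selected the same way via _mm256_blendv_ps on the
// mask reinterpreted with _mm256_castsi256_ps.
// ---------------------------------------------------------------------------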
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vpreluc_ukernel__scalar_u1( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); +} diff --git a/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c new file mode 100644 index 00000000000..852603e354c --- /dev/null +++ b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vpreluc_ukernel__scalar_u2( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 2 * sizeof(int8_t); batch -= 2 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + input_a += 2; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output += 2; + } + + if XNN_UNLIKELY(batch != 0) { + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va >= 0) ? 
vpositive_multiplier : vnegative_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (int8_t) vout; + } +} diff --git a/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c new file mode 100644 index 00000000000..996f8739ef0 --- /dev/null +++ b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c @@ -0,0 +1,124 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
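// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the generated diff): the
// "vpreluc" kernels apply a single broadcast slope read once from *input_b.
// Per element, the math above reduces to the following reference; parameter
// names mirror the fields of xnn_qs8_vprelu_scalar_params, and lrintf() stands
// in for the magic-bias rounding used by the kernels.
#include <math.h>
#include <stdint.h>

static inline int8_t qs8_preluc_ref(
    int8_t a, int8_t b,
    int32_t input_zero_point, int32_t slope_zero_point,
    float positive_multiplier, float negative_multiplier,
    int32_t output_zero_point, int8_t output_min, int8_t output_max) {
  const int32_t va = (int32_t) a - input_zero_point;
  const int32_t slope = (int32_t) b - slope_zero_point;
  // Negative inputs are multiplied by the slope and rescaled with the negative
  // multiplier; non-negative inputs are only rescaled.
  const int32_t acc = va < 0 ? va * slope : va;
  const float scale = va < 0 ? negative_multiplier : positive_multiplier;
  float fpacc = (float) acc * scale;
  const float out_min = (float) ((int32_t) output_min - output_zero_point);
  const float out_max = (float) ((int32_t) output_max - output_zero_point);
  fpacc = fpacc < out_min ? out_min : fpacc;
  fpacc = fpacc > out_max ? out_max : fpacc;
  return (int8_t) (lrintf(fpacc) + output_zero_point);
}
// ---------------------------------------------------------------------------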
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vpreluc_ukernel__scalar_u4( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + input_a += 4; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * slope : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * slope : va3; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c new file mode 100644 index 00000000000..36344d5210c --- /dev/null +++ b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c @@ -0,0 +1,160 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vpreluc_ukernel__scalar_u8( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * slope : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * slope : va3; + int32_t vacc4 = XNN_UNPREDICTABLE(va4 < 0) ? va4 * slope : va4; + int32_t vacc5 = XNN_UNPREDICTABLE(va5 < 0) ? va5 * slope : va5; + int32_t vacc6 = XNN_UNPREDICTABLE(va6 < 0) ? va6 * slope : va6; + int32_t vacc7 = XNN_UNPREDICTABLE(va7 < 0) ? va7 * slope : va7; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(va4 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(va5 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(va6 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(va7 < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output[4] = (int8_t) vout4; + output[5] = (int8_t) vout5; + output[6] = (int8_t) vout6; + output[7] = (int8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vpreluc/qs8-vpreluc.h b/src/qs8-vpreluc/qs8-vpreluc.h new file mode 100644 index 00000000000..403889fc111 --- /dev/null +++ b/src/qs8-vpreluc/qs8-vpreluc.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vpreluc_ukernel__avx2_u16, 16, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vpreluc_ukernel__scalar_u1, 1, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vpreluc_ukernel__scalar_u2, 2, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vpreluc_ukernel__scalar_u4, 4, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vpreluc_ukernel__scalar_u8, 8, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/qs8-vpreluc/scalar.c.in b/src/qs8-vpreluc/scalar.c.in new file mode 100644 index 00000000000..0a7dab0e10d --- /dev/null +++ b/src/qs8-vpreluc/scalar.c.in @@ -0,0 +1,129 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +$assert DATATYPE in ["QS8", "QU8"] +$assert BATCH_TILE >= 1 + +#include <assert.h> +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +void xnn_${DATATYPE.lower()}_vpreluc_ukernel__scalar_u${BATCH_TILE}( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + $if BATCH_TILE == 1: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + $else: + for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) { + $for N in range(BATCH_TILE): + const int32_t va${N} = input_a[${N}] - input_zero_point; + input_a += ${BATCH_TILE}; + + $for N in range(BATCH_TILE): + int32_t vacc${N} = XNN_UNPREDICTABLE(va${N} < 0) ? va${N} * slope : va${N}; + + $for N in range(BATCH_TILE): + float vscale${N} = XNN_UNPREDICTABLE(va${N} < 0) ? vnegative_multiplier : vpositive_multiplier; + + $for N in range(BATCH_TILE): + float vfpacc${N} = (float) vacc${N} * vscale${N}; + + $for N in range(BATCH_TILE): + vfpacc${N} = math_max_f32(vfpacc${N}, voutput_min_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} = math_min_f32(vfpacc${N}, voutput_max_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} += vmagic_bias; + + $for N in range(BATCH_TILE): + const int32_t vout${N} = (int32_t) float_as_uint32(vfpacc${N}) - vmagic_bias_less_output_zero_point; + + $for N in range(BATCH_TILE): + output[${N}] = (${XINT8_T}) vout${N}; + output += ${BATCH_TILE}; + } + + if XNN_UNLIKELY(batch != 0) { + $if BATCH_TILE == 2: + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va >= 0) ?
vpositive_multiplier : vnegative_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (${XINT8_T}) vout; + $else: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + } +} diff --git a/src/qs8-vrpreluc/avx2.c.in b/src/qs8-vrpreluc/avx2.c.in new file mode 100644 index 00000000000..df04f427419 --- /dev/null +++ b/src/qs8-vrpreluc/avx2.c.in @@ -0,0 +1,159 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +$assert DATATYPE in ["QS8", "QU8"] +$assert BATCH_TILE >= 8 +$assert BATCH_TILE == 8 or BATCH_TILE % 16 == 0 +$SIMD_TILE = BATCH_TILE // 16 + +#include <assert.h> +#include <immintrin.h> +#include <stdint.h> +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +$_MM256_CVTEPX8_EPI32 = {"QS8": "_mm256_cvtepi8_epi32", "QU8": "_mm256_cvtepu8_epi32"}[DATATYPE] +$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] +void xnn_${DATATYPE.lower()}_vrpreluc_ukernel__avx2_u${BATCH_TILE}( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.rprelu_positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + $if BATCH_TILE > 8: + for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) { + __m256i va0 = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a)); + + + $for N in range(1, 2*SIMD_TILE): + __m256i va${N} = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) (input_a + ${N * 8}))); + input_a += ${BATCH_TILE}; + + + $for N in range(2*SIMD_TILE): + __m256i va${N}_sub = _mm256_sub_epi32(va${N}, vinput_zero_point); + __m256i vcompare${N} = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc${N} = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va${N}_sub, vslope), vcompare${N}); + + $for N in range(2*SIMD_TILE): + __m256 vscale${N} = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare${N})); + __m256 vfpacc${N} = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc${N}), vscale${N}); + + $for N in range(2*SIMD_TILE): + __m256 vfpacc_clamped${N} = _mm256_min_ps(_mm256_max_ps(vfpacc${N}, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased${N} = _mm256_add_ps(vfpacc_clamped${N}, vmagic_bias); + __m256i vout${N} = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased${N}), vmagic_bias_less_output_zero_point); + + $for N in range(2*SIMD_TILE): + const __m128i vout_low${N} = _mm256_castsi256_si128(vout${N}); + const __m128i vout_high${N} = _mm256_extracti128_si256(vout${N}, 1); + const __m128i vout_packed${N} = _mm_packs_epi32(vout_low${N}, vout_high${N}); + __m128i vout_final${N} = ${_MM_PACKXS_EPI16}(vout_packed${N}, vout_packed${N}); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + $for N in range(1, 2*SIMD_TILE): +
_mm_storeu_si64((__m128i*)(output + ${N*8}), vout_final${N}); + + output += ${BATCH_TILE}; + } + + for (; batch >= 8 * sizeof(${XINT8_T}); batch -= 8 * sizeof(${XINT8_T})) { + __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(${XINT8_T})); + assert(batch <= 7 * sizeof(${XINT8_T})); + + const __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + const __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed, vout_packed); + + if (batch & (4 * sizeof(${XINT8_T}))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(${XINT8_T}))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(${XINT8_T}))) { + *output = (${XINT8_T}) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c b/src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c new file mode 100644 index 00000000000..2a70f7a1f4f --- /dev/null +++ b/src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c @@ -0,0 +1,162 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. 
Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vrpreluc_ukernel__avx2_u16( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.rprelu_positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { + __m256i va0 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + + + __m256i va1 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8))); + input_a += 16; + + + __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point); + __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc0 = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va0_sub, vslope), vcompare0); + __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point); + __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc1 = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va1_sub, vslope), vcompare1); + + __m256 vscale0 = 
_mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0)); + __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0); + __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1)); + __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1); + + __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias); + __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point); + __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased1 = _mm256_add_ps(vfpacc_clamped1, vmagic_bias); + __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point); + + const __m128i vout_low0 = _mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed0 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packs_epi16(vout_packed0, vout_packed0); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed1 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packs_epi16(vout_packed1, vout_packed1); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(int8_t)); + assert(batch <= 7 * sizeof(int8_t)); + + const __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + const __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), 
voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed, vout_packed); + + if (batch & (4 * sizeof(int8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(int8_t))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(int8_t))) { + *output = (int8_t) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c new file mode 100644 index 00000000000..74113dfcc2c --- /dev/null +++ b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c @@ -0,0 +1,73 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include <assert.h> +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vrpreluc_ukernel__scalar_u1( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); +} diff --git a/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c new file mode 100644 index 00000000000..f5169605632 --- /dev/null +++ b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vrpreluc_ukernel__scalar_u2( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 2 * sizeof(int8_t); batch -= 2 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + input_a += 2; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output += 2; + } + + if XNN_UNLIKELY(batch != 0) { + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (int8_t) vout; + } +} diff --git a/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c new file mode 100644 index 00000000000..8913be2e207 --- /dev/null +++ b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c @@ -0,0 +1,124 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vrpreluc_ukernel__scalar_u4( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + input_a += 4; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + int32_t vacc2 = XNN_UNPREDICTABLE(slope < 0) ? va2 * slope : slope; + int32_t vacc3 = XNN_UNPREDICTABLE(slope < 0) ? va3 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c new file mode 100644 index 00000000000..ba74ff74a30 --- /dev/null +++ b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c @@ -0,0 +1,160 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vrpreluc_ukernel__scalar_u8( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + int32_t vacc2 = XNN_UNPREDICTABLE(slope < 0) ? va2 * slope : slope; + int32_t vacc3 = XNN_UNPREDICTABLE(slope < 0) ? va3 * slope : slope; + int32_t vacc4 = XNN_UNPREDICTABLE(slope < 0) ? va4 * slope : slope; + int32_t vacc5 = XNN_UNPREDICTABLE(slope < 0) ? va5 * slope : slope; + int32_t vacc6 = XNN_UNPREDICTABLE(slope < 0) ? va6 * slope : slope; + int32_t vacc7 = XNN_UNPREDICTABLE(slope < 0) ? va7 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output[4] = (int8_t) vout4; + output[5] = (int8_t) vout5; + output[6] = (int8_t) vout6; + output[7] = (int8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vrpreluc/qs8-vrpreluc.h b/src/qs8-vrpreluc/qs8-vrpreluc.h new file mode 100644 index 00000000000..d499dff23e0 --- /dev/null +++ b/src/qs8-vrpreluc/qs8-vrpreluc.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vrpreluc_ukernel__avx2_u16, 16, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vrpreluc_ukernel__scalar_u1, 1, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vrpreluc_ukernel__scalar_u2, 2, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vrpreluc_ukernel__scalar_u4, 4, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vrpreluc_ukernel__scalar_u8, 8, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/qs8-vrpreluc/scalar.c.in b/src/qs8-vrpreluc/scalar.c.in new file mode 100644 index 00000000000..c7087e6032f --- /dev/null +++ b/src/qs8-vrpreluc/scalar.c.in @@ -0,0 +1,129 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +$assert DATATYPE in ["QS8", "QU8"] +$assert BATCH_TILE >= 1 + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +void xnn_${DATATYPE.lower()}_vrpreluc_ukernel__scalar_u${BATCH_TILE}( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + $if BATCH_TILE == 1: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + $else: + for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) { + $for N in range(BATCH_TILE): + const int32_t va${N} = input_a[${N}] - input_zero_point; + input_a += ${BATCH_TILE}; + + $for N in range(BATCH_TILE): + int32_t vacc${N} = XNN_UNPREDICTABLE(slope < 0) ? va${N} * slope : slope; + + $for N in range(BATCH_TILE): + float vscale${N} = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + + $for N in range(BATCH_TILE): + float vfpacc${N} = (float) vacc${N} * vscale${N}; + + $for N in range(BATCH_TILE): + vfpacc${N} = math_max_f32(vfpacc${N}, voutput_min_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} = math_min_f32(vfpacc${N}, voutput_max_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} += vmagic_bias; + + $for N in range(BATCH_TILE): + const int32_t vout${N} = (int32_t) float_as_uint32(vfpacc${N}) - vmagic_bias_less_output_zero_point; + + $for N in range(BATCH_TILE): + output[${N}] = (${XINT8_T}) vout${N}; + output += ${BATCH_TILE}; + } + + if XNN_UNLIKELY(batch != 0) { + $if BATCH_TILE == 2: + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (${XINT8_T}) vout; + $else: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + } +} diff --git a/src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c b/src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c new file mode 100644 index 00000000000..4c449c5e4e0 --- /dev/null +++ b/src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c @@ -0,0 +1,171 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vprelu_ukernel__avx2_u16( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256i vslope_zero_point = _mm256_set1_epi32(params->scalar.slope_zero_point); + const __m256i voutput_zero_point = _mm256_set1_epi32(params->scalar.output_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + + for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { + __m256i va0 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i vb0 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_b)); + + __m256i va1 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8))); + __m256i vb1 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) (input_b + 8))); + input_a += 16; + input_b += 16; + + __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point); + __m256i vb0_sub = _mm256_sub_epi32(vb0, vslope_zero_point); + __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va0_sub); + __m256i vacc0 = _mm256_blendv_epi8(va0_sub, _mm256_mullo_epi32(va0_sub, vb0_sub), vcompare0); + __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point); + __m256i vb1_sub = _mm256_sub_epi32(vb1, vslope_zero_point); + __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va1_sub); + __m256i vacc1 = _mm256_blendv_epi8(va1_sub, _mm256_mullo_epi32(va1_sub, vb1_sub), vcompare1); + + __m256 vscale0 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0)); + __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0); + __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1)); + __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1); + + __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias); + __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point); + __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased1 = _mm256_add_ps(vfpacc_clamped1, vmagic_bias); + __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point); + + const __m128i vout_low0 = 
_mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed160 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packus_epi16(vout_packed160, vout_packed160); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed161 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packus_epi16(vout_packed161, vout_packed161); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i vb = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_b)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + input_b+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed16, vout_packed16); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint8_t)); + assert(batch <= 7 * sizeof(uint8_t)); + + const __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i vb = _mm256_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) input_b)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed16, vout_packed16); + + if (batch & (4 * sizeof(uint8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(uint8_t))) { + 
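+      // Store two more output bytes, then shift right by 16 bits so that the last
+      // remaining byte (when batch is odd) lands in byte 0 for the final extract below.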
_mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(uint8_t))) { + *output = (uint8_t) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c new file mode 100644 index 00000000000..ada1fb5810c --- /dev/null +++ b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c @@ -0,0 +1,76 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +void xnn_qu8_vprelu_ukernel__scalar_u1( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); +} diff --git a/src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c new file mode 100644 index 00000000000..7420ba1e7bc --- /dev/null +++ b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c @@ -0,0 +1,110 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +void xnn_qu8_vprelu_ukernel__scalar_u2( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + for (; batch >= 2 * sizeof(uint8_t); batch -= 2 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + input_a += 2; + + const int32_t vb0 = input_b[0] - slope_zero_point; + const int32_t vb1 = input_b[1] - slope_zero_point; + input_b += 2; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output += 2; + } + + if XNN_UNLIKELY(batch != 0) { + const int32_t va = (int32_t) *input_a - input_zero_point; + const int32_t vb = (int32_t) *input_b - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (uint8_t) vout; + } +} diff --git a/src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c new file mode 100644 index 00000000000..feb88e84360 --- /dev/null +++ b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c @@ -0,0 +1,133 @@ +// clang-format off +// Auto-generated file. Do not edit! 
+// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +void xnn_qu8_vprelu_ukernel__scalar_u4( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + input_a += 4; + + const int32_t vb0 = input_b[0] - slope_zero_point; + const int32_t vb1 = input_b[1] - slope_zero_point; + const int32_t vb2 = input_b[2] - slope_zero_point; + const int32_t vb3 = input_b[3] - slope_zero_point; + input_b += 4; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? 
va2 * vb2 : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * vb3 : va3; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c new file mode 100644 index 00000000000..b865f02a6c7 --- /dev/null +++ b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c @@ -0,0 +1,173 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. 
Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +void xnn_qu8_vprelu_ukernel__scalar_u8( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + const int32_t vb0 = input_b[0] - slope_zero_point; + const int32_t vb1 = input_b[1] - slope_zero_point; + const int32_t vb2 = input_b[2] - slope_zero_point; + const int32_t vb3 = input_b[3] - slope_zero_point; + const int32_t vb4 = input_b[4] - slope_zero_point; + const int32_t vb5 = input_b[5] - slope_zero_point; + const int32_t vb6 = input_b[6] - slope_zero_point; + const int32_t vb7 = input_b[7] - slope_zero_point; + input_b += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * vb2 : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * vb3 : va3; + int32_t vacc4 = XNN_UNPREDICTABLE(va4 < 0) ? va4 * vb4 : va4; + int32_t vacc5 = XNN_UNPREDICTABLE(va5 < 0) ? 
va5 * vb5 : va5; + int32_t vacc6 = XNN_UNPREDICTABLE(va6 < 0) ? va6 * vb6 : va6; + int32_t vacc7 = XNN_UNPREDICTABLE(va7 < 0) ? va7 * vb7 : va7; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(va4 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(va5 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(va6 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(va7 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output[4] = (uint8_t) vout4; + output[5] = (uint8_t) vout5; + output[6] = (uint8_t) vout6; + output[7] = (uint8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { 
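+      // Scalar tail: center the input and slope on their zero points, apply the slope only to
+      // negative inputs, rescale by the sign-specific multiplier, clamp, and round via the
+      // magic-bias trick: adding 12582912.0f (0x4B400000) places the rounded integer in the low
+      // mantissa bits, so reinterpreting the float bits and subtracting the bias (folded with
+      // the output zero point) yields the requantized result.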
+ const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vprelu/qu8-vprelu.h b/src/qu8-vprelu/qu8-vprelu.h new file mode 100644 index 00000000000..860c35f2d56 --- /dev/null +++ b/src/qu8-vprelu/qu8-vprelu.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vprelu_ukernel__avx2_u16, 16, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vprelu_ukernel__scalar_u1, 1, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vprelu_ukernel__scalar_u2, 2, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vprelu_ukernel__scalar_u4, 4, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vprelu_ukernel__scalar_u8, 8, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c b/src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c new file mode 100644 index 00000000000..16eea7bf320 --- /dev/null +++ b/src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c @@ -0,0 +1,162 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vpreluc_ukernel__avx2_u16( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { + __m256i va0 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + + + __m256i va1 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8))); + input_a += 16; + + + __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point); + __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va0_sub); + __m256i vacc0 = _mm256_blendv_epi8(va0_sub, _mm256_mullo_epi32(va0_sub, vslope), vcompare0); + __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point); + __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va1_sub); + __m256i vacc1 = _mm256_blendv_epi8(va1_sub, _mm256_mullo_epi32(va1_sub, vslope), vcompare1); + + __m256 vscale0 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0)); + __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0); + __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1)); + __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1); + + __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias); + __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point); + __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased1 = 
_mm256_add_ps(vfpacc_clamped1, vmagic_bias); + __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point); + + const __m128i vout_low0 = _mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed0 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packus_epi16(vout_packed0, vout_packed0); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed1 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packus_epi16(vout_packed1, vout_packed1); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint8_t)); + assert(batch <= 7 * sizeof(uint8_t)); + + const __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed, vout_packed); + + if (batch & (4 * sizeof(uint8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(uint8_t))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(uint8_t))) { + *output = 
(uint8_t) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c new file mode 100644 index 00000000000..2a4b5d61ef6 --- /dev/null +++ b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c @@ -0,0 +1,73 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vpreluc_ukernel__scalar_u1( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); +} diff --git a/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c new file mode 100644 index 00000000000..2e6ca54fe52 --- /dev/null +++ b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vpreluc_ukernel__scalar_u2( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 2 * sizeof(uint8_t); batch -= 2 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + input_a += 2; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output += 2; + } + + if XNN_UNLIKELY(batch != 0) { + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va >= 0) ? vpositive_multiplier : vnegative_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (uint8_t) vout; + } +} diff --git a/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c new file mode 100644 index 00000000000..52f6af22463 --- /dev/null +++ b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c @@ -0,0 +1,124 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. 
Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vpreluc_ukernel__scalar_u4( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + input_a += 4; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * slope : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * slope : va3; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c new file mode 100644 index 00000000000..e882addc000 --- /dev/null +++ b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c @@ -0,0 +1,160 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vpreluc_ukernel__scalar_u8( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * slope : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * slope : va3; + int32_t vacc4 = XNN_UNPREDICTABLE(va4 < 0) ? va4 * slope : va4; + int32_t vacc5 = XNN_UNPREDICTABLE(va5 < 0) ? va5 * slope : va5; + int32_t vacc6 = XNN_UNPREDICTABLE(va6 < 0) ? va6 * slope : va6; + int32_t vacc7 = XNN_UNPREDICTABLE(va7 < 0) ? va7 * slope : va7; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(va4 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(va5 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(va6 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(va7 < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output[4] = (uint8_t) vout4; + output[5] = (uint8_t) vout5; + output[6] = (uint8_t) vout6; + output[7] = (uint8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vpreluc/qu8-vpreluc.h b/src/qu8-vpreluc/qu8-vpreluc.h new file mode 100644 index 00000000000..d37edff2bf0 --- /dev/null +++ b/src/qu8-vpreluc/qu8-vpreluc.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
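Taken together, the u2/u4/u8 variants above implement the same per-element computation with different unroll factors. A compact reference restatement follows; it is a sketch rather than XNNPACK code: parameters are passed individually instead of through union xnn_qs8_vprelu_scalar_params, and lrintf replaces the magic-bias rounding, which matches it under the default round-to-nearest-even mode.

#include <math.h>
#include <stddef.h>
#include <stdint.h>

// Reference model of xnn_qu8_vpreluc_ukernel__scalar_*: the slope is a single
// quantized value (*input_b), and both the slope branch and the rescaling
// multiplier are selected by the sign of the zero-point-adjusted input element.
static void qu8_vpreluc_reference(
    size_t batch, const uint8_t* input_a, const uint8_t* input_b,
    uint8_t* output,
    int32_t input_zero_point, int32_t slope_zero_point, int32_t output_zero_point,
    float positive_multiplier, float negative_multiplier,
    int32_t output_min, int32_t output_max) {
  const int32_t slope = (int32_t) *input_b - slope_zero_point;
  for (size_t i = 0; i < batch; i++) {
    const int32_t va = (int32_t) input_a[i] - input_zero_point;
    const int32_t vacc = va < 0 ? va * slope : va;
    const float vscale = va < 0 ? negative_multiplier : positive_multiplier;
    float vfpacc = (float) vacc * vscale;
    vfpacc = fminf(fmaxf(vfpacc, (float) (output_min - output_zero_point)),
                   (float) (output_max - output_zero_point));
    output[i] = (uint8_t) (lrintf(vfpacc) + output_zero_point);
  }
}

The trailing "c" follows the existing vaddc/vmulc naming: input_b contributes one broadcast quantized slope, read once before the loop, rather than a per-element slope tensor.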
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vpreluc_ukernel__avx2_u16, 16, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vpreluc_ukernel__scalar_u1, 1, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vpreluc_ukernel__scalar_u2, 2, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vpreluc_ukernel__scalar_u4, 4, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vpreluc_ukernel__scalar_u8, 8, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c b/src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c new file mode 100644 index 00000000000..a79c185e155 --- /dev/null +++ b/src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c @@ -0,0 +1,162 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vrpreluc_ukernel__avx2_u16( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.rprelu_positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { + __m256i va0 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + + + __m256i va1 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8))); + input_a += 16; + + + __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point); + __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc0 = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va0_sub, vslope), vcompare0); + __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point); + __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc1 = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va1_sub, vslope), vcompare1); + + __m256 vscale0 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0)); + __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0); + __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1)); + __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1); + + __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias); + __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point); + __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased1 = 
_mm256_add_ps(vfpacc_clamped1, vmagic_bias); + __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point); + + const __m128i vout_low0 = _mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed0 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packus_epi16(vout_packed0, vout_packed0); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed1 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packus_epi16(vout_packed1, vout_packed1); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint8_t)); + assert(batch <= 7 * sizeof(uint8_t)); + + const __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + const __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed, vout_packed); + + if (batch & (4 * sizeof(uint8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(uint8_t))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(uint8_t))) { + 
*output = (uint8_t) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c new file mode 100644 index 00000000000..30d699efe88 --- /dev/null +++ b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c @@ -0,0 +1,73 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vrpreluc_ukernel__scalar_u1( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); +} diff --git a/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c new file mode 100644 index 00000000000..b3fc0aaec21 --- /dev/null +++ b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vrpreluc_ukernel__scalar_u2( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 2 * sizeof(uint8_t); batch -= 2 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + input_a += 2; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output += 2; + } + + if XNN_UNLIKELY(batch != 0) { + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (uint8_t) vout; + } +} diff --git a/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c new file mode 100644 index 00000000000..c5dfaa0f9ea --- /dev/null +++ b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c @@ -0,0 +1,124 @@ +// clang-format off +// Auto-generated file. Do not edit! 
+// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vrpreluc_ukernel__scalar_u4( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + input_a += 4; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + int32_t vacc2 = XNN_UNPREDICTABLE(slope < 0) ? va2 * slope : slope; + int32_t vacc3 = XNN_UNPREDICTABLE(slope < 0) ? va3 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c new file mode 100644 index 00000000000..259b81e234a --- /dev/null +++ b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c @@ -0,0 +1,160 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vrpreluc_ukernel__scalar_u8( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + int32_t vacc2 = XNN_UNPREDICTABLE(slope < 0) ? va2 * slope : slope; + int32_t vacc3 = XNN_UNPREDICTABLE(slope < 0) ? va3 * slope : slope; + int32_t vacc4 = XNN_UNPREDICTABLE(slope < 0) ? va4 * slope : slope; + int32_t vacc5 = XNN_UNPREDICTABLE(slope < 0) ? va5 * slope : slope; + int32_t vacc6 = XNN_UNPREDICTABLE(slope < 0) ? va6 * slope : slope; + int32_t vacc7 = XNN_UNPREDICTABLE(slope < 0) ? va7 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output[4] = (uint8_t) vout4; + output[5] = (uint8_t) vout5; + output[6] = (uint8_t) vout6; + output[7] = (uint8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vrpreluc/qu8-vrpreluc.h b/src/qu8-vrpreluc/qu8-vrpreluc.h new file mode 100644 index 00000000000..4d9f66110bc --- /dev/null +++ b/src/qu8-vrpreluc/qu8-vrpreluc.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
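The reversed-operand variant above swaps the roles of the two inputs relative to qu8-vpreluc: the broadcast value read from *input_b is the quantity being activated and each input_a element supplies the slope, so both the branch and the multiplier selection depend only on the sign of the broadcast value. A reference sketch under the same caveats as the qu8-vpreluc one (individual parameters instead of the params union, lrintf in place of the magic-bias rounding):

#include <math.h>
#include <stddef.h>
#include <stdint.h>

// Reference model of xnn_qu8_vrpreluc_ukernel__scalar_*: evaluates prelu(b, a)
// with b broadcast, keeping the generated code's "slope" naming for the
// zero-point-adjusted broadcast operand.
static void qu8_vrpreluc_reference(
    size_t batch, const uint8_t* input_a, const uint8_t* input_b,
    uint8_t* output,
    int32_t input_zero_point, int32_t slope_zero_point, int32_t output_zero_point,
    float rprelu_positive_multiplier, float negative_multiplier,
    int32_t output_min, int32_t output_max) {
  const int32_t slope = (int32_t) *input_b - slope_zero_point;
  for (size_t i = 0; i < batch; i++) {
    const int32_t va = (int32_t) input_a[i] - input_zero_point;
    const int32_t vacc = slope < 0 ? va * slope : slope;
    const float vscale =
        slope < 0 ? negative_multiplier : rprelu_positive_multiplier;
    float vfpacc = (float) vacc * vscale;
    vfpacc = fminf(fmaxf(vfpacc, (float) (output_min - output_zero_point)),
                   (float) (output_max - output_zero_point));
    output[i] = (uint8_t) (lrintf(vfpacc) + output_zero_point);
  }
}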
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vrpreluc_ukernel__avx2_u16, 16, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vrpreluc_ukernel__scalar_u1, 1, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vrpreluc_ukernel__scalar_u2, 2, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vrpreluc_ukernel__scalar_u4, 4, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vrpreluc_ukernel__scalar_u8, 8, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index dd29c5c381b..1e870773edc 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -67,9 +67,13 @@ xnn_init_qs8_vadd_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_qs8_vmul_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* +xnn_init_qs8_vprelu_config(); +XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_qu8_vadd_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_qu8_vmul_config(); +XNN_INTERNAL const struct xnn_binary_elementwise_config* +xnn_init_qu8_vprelu_config(); XNN_INTERNAL const struct xnn_unary_elementwise_config* xnn_init_f16_abs_config(); diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index 2e9a11629e5..f1d40a546f4 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -982,6 +982,22 @@ typedef void (*xnn_qu8_vmul_minmax_ukernel_fn)( const union xnn_qu8_mul_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + + // VPRELU: Vector PRELU elementwise + +typedef void (*xnn_qs8_vprelu_ukernel_fn)( + size_t batch, const int8_t* input_x, const int8_t* input_y, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params + params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + +typedef void (*xnn_qu8_vprelu_ukernel_fn)( + size_t batch, const uint8_t* input_x, const uint8_t* input_y, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params + params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + + /***************** Microkernel pointers for sparse inference *****************/ // SpMM: Sparse Matrix-Matrix multiplication @@ -1180,6 +1196,18 @@ typedef size_t (*xnn_init_qu8_mul_minmax_params_fn)( const struct xnn_quantization_params* b_quantization, const struct xnn_quantization_params* output_quantization); +typedef size_t(*xnn_init_qs8_vprelu_params_fn)( + union 
xnn_qs8_vprelu_scalar_params params[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); + +typedef size_t(*xnn_init_qu8_vprelu_params_fn)( + union xnn_qs8_vprelu_scalar_params params[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); + typedef size_t (*xnn_init_bf16_default_params_fn)( struct xnn_bf16_default_params params[XNN_MIN_ELEMENTS(1)]); diff --git a/src/xnnpack/microparams-init.h b/src/xnnpack/microparams-init.h index 0b6c9066fa9..b7aad90ee29 100644 --- a/src/xnnpack/microparams-init.h +++ b/src/xnnpack/microparams-init.h @@ -250,6 +250,25 @@ DECLARE_INIT_QU8_MUL_MINMAX_PARAMS_FUNCTION( xnn_init_qu8_mul_minmax_rndnu_neon_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#define DECLARE_INIT_QS8_VPRELU_PARAMS_FUNCTION(fn_name) \ + XNN_INTERNAL size_t fn_name( \ + union xnn_qs8_vprelu_scalar_params uparams[XNN_MIN_ELEMENTS(1)], \ + const struct xnn_quantization_params* a_quantization, \ + const struct xnn_quantization_params* b_quantization, \ + const struct xnn_quantization_params* output_quantization); + +DECLARE_INIT_QS8_VPRELU_PARAMS_FUNCTION(xnn_init_qs8_vprelu_scalar_params) + +#define DECLARE_INIT_QU8_VPRELU_PARAMS_FUNCTION(fn_name) \ + XNN_INTERNAL size_t fn_name( \ + union xnn_qs8_vprelu_scalar_params uparams[XNN_MIN_ELEMENTS(1)], \ + const struct xnn_quantization_params* a_quantization, \ + const struct xnn_quantization_params* b_quantization, \ + const struct xnn_quantization_params* output_quantization); + +DECLARE_INIT_QU8_VPRELU_PARAMS_FUNCTION(xnn_init_qu8_vprelu_scalar_params) + + #ifdef __cplusplus } // extern "C" #endif diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index dfa52593c4d..3575f324bd5 100644 --- a/src/xnnpack/microparams.h +++ b/src/xnnpack/microparams.h @@ -145,8 +145,22 @@ struct xnn_u8_minmax_params { } scalar; }; -// Conv w. Min+Max: used by quantized GEMM/IGEMM/DWCONV microkernels with MINMAX -// activation. +// VPReLU: used by VPRELU microkernels. + +union xnn_qs8_vprelu_scalar_params { + struct { + int32_t input_zero_point; + int32_t slope_zero_point; + int32_t output_zero_point; + float positive_multiplier; + float rprelu_positive_multiplier; + float negative_multiplier; + int32_t output_min; + int32_t output_max; + } scalar; +}; + +// Conv w. Min+Max: used by quantized GEMM/IGEMM/DWCONV microkernels with MINMAX activation. 
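The three multiplier fields in the new params union map onto the three rescaling cases the kernels need: the ordinary positive branch passes the zero-point-adjusted input through, the reversed positive branch passes the broadcast operand through, and the shared negative branch rescales a product of two quantized values. The init functions declared here are not part of this excerpt, so the following is only a sketch of how those fields are plausibly derived from the a/b/output quantization parameters, matching how the kernels consume them; the quantization struct layout and the full-range clamp values are assumptions.

#include <stdint.h>

// Illustrative stand-ins, not XNNPACK types.
struct quantization { int32_t zero_point; float scale; };

struct vprelu_params {  // mirrors union xnn_qs8_vprelu_scalar_params.scalar
  int32_t input_zero_point, slope_zero_point, output_zero_point;
  float positive_multiplier, rprelu_positive_multiplier, negative_multiplier;
  int32_t output_min, output_max;
};

static void init_vprelu_params_sketch(struct vprelu_params* p,
                                      const struct quantization* a,
                                      const struct quantization* b,
                                      const struct quantization* out) {
  p->input_zero_point = a->zero_point;
  p->slope_zero_point = b->zero_point;
  p->output_zero_point = out->zero_point;
  // Positive branches pass one zero-point-adjusted quantized value through,
  // so they only need that value's scale divided by the output scale.
  p->positive_multiplier = a->scale / out->scale;
  p->rprelu_positive_multiplier = b->scale / out->scale;
  // The negative branch multiplies two quantized values, so both scales appear.
  p->negative_multiplier = a->scale * b->scale / out->scale;
  p->output_min = 0;    // full uint8 range shown; real operators may clamp tighter
  p->output_max = 255;
}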
struct xnn_qd8_quantization_params { int32_t zero_point; float inv_scale; @@ -392,6 +406,8 @@ union xnn_binary_uparams { struct xnn_qu8_add_minmax_params qu8_addsub; union xnn_qs8_mul_minmax_params qs8_mul; union xnn_qu8_mul_minmax_params qu8_mul; + union xnn_qs8_vprelu_scalar_params qs8_vprelu; + union xnn_qs8_vprelu_scalar_params qu8_vprelu; struct xnn_f16_minmax_params f16; struct xnn_f32_minmax_params f32; struct xnn_binary_reference_params reference; diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index 65e9c9dfe02..7b6d51d473f 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -324,6 +324,7 @@ struct xnn_operator { const struct xnn_binary_elementwise_config* vadd_config; const struct xnn_binary_elementwise_config* vmul_config; const struct xnn_unary_elementwise_config* vtanh_config; + const struct xnn_binary_elementwise_config* vprelu_config; enum xnn_attention_logits_cap_type cap_type; struct xnn_attention_logits_cap_tanh_params cap_params; } attention; // For attention operator. diff --git a/src/xnnpack/vbinary.h b/src/xnnpack/vbinary.h index e9fa14244fb..31c59eadb75 100644 --- a/src/xnnpack/vbinary.h +++ b/src/xnnpack/vbinary.h @@ -88,6 +88,9 @@ extern "C" { #include "src/qu8-vmul/qu8-vmul-minmax-rndnu.h" #include "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h" #include "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h" +#include "src/qu8-vprelu/qu8-vprelu.h" +#include "src/qu8-vpreluc/qu8-vpreluc.h" +#include "src/qu8-vrpreluc/qu8-vrpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ @@ -101,6 +104,9 @@ extern "C" { #include "src/qs8-vmul/qs8-vmul-minmax-rndnu.h" #include "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h" #include "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h" +#include "src/qs8-vprelu/qs8-vprelu.h" +#include "src/qs8-vpreluc/qs8-vpreluc.h" +#include "src/qs8-vrpreluc/qs8-vrpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS #ifdef __cplusplus diff --git a/test/BUILD.bazel b/test/BUILD.bazel index 158601b7985..73401ce31af 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -251,12 +251,18 @@ xnnpack_cxx_library( "qs8_vmul_minmax_rndnu", "qs8_vmulc_minmax_fp32", "qs8_vmulc_minmax_rndnu", + "qs8_vprelu", + "qs8_vpreluc", + "qs8_vrpreluc", "qu8_vadd_minmax", "qu8_vaddc_minmax", "qu8_vmul_minmax_fp32", "qu8_vmul_minmax_rndnu", "qu8_vmulc_minmax_fp32", "qu8_vmulc_minmax_rndnu", + "qu8_vprelu", + "qu8_vpreluc", + "qu8_vrpreluc", ]] [xnnpack_unit_test( diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4f470f678a0..29714ece197 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -319,12 +319,18 @@ SET(MICROKERNEL_VBINARY_UNIT_TESTS qs8-vaddc-minmax qs8-vmul-minmax-fp32 qs8-vmulc-minmax-fp32 + qs8-vprelu + qs8-vpreluc + qs8-vrpreluc qu8-vadd-minmax qu8-vaddc-minmax qu8-vmul-minmax-fp32 qu8-vmul-minmax-rndnu qu8-vmulc-minmax-fp32 - qu8-vmulc-minmax-rndnu) + qu8-vmulc-minmax-rndnu + qu8-vprelu + qu8-vpreluc + qu8-vrpreluc) FOREACH(TEST ${MICROKERNEL_VBINARY_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test ${TEST}.cc) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE diff --git a/test/qs8-vprelu.cc b/test/qs8-vprelu.cc new file mode 100644 index 00000000000..3223267225d --- /dev/null +++ b/test/qs8-vprelu.cc @@ -0,0 +1,32 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! 
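Both new per-datatype headers follow the existing X-macro convention: a consumer defines XNN_UKERNEL_WITH_PARAMS before including the header and gets one expansion per declared microkernel, which is how the vbinary.h hunk above and the generated tests below enumerate the new kernels. A hypothetical consumer building a flat dispatch table is sketched here; the entry struct is illustrative, and the usual XNNPACK headers (common.h, hardware-config.h, microfnptr.h) are assumed to be in scope so that XNN_ARCH_*, xnn_arch_x86_avx2 and xnn_qu8_vprelu_ukernel_fn resolve.

#include <stddef.h>
#include <stdint.h>

struct qu8_vpreluc_entry {
  uint64_t arch_flags;           // e.g. xnn_arch_x86_avx2, or 0 for scalar
  xnn_qu8_vprelu_ukernel_fn fn;  // typedef added to microfnptr.h by this change
  size_t batch_tile;
};

#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
                                datatype, params_type, init_params)           \
  { (uint64_t) (arch_flags), (ukernel), (batch_tile) },
static const struct qu8_vpreluc_entry qu8_vpreluc_table[] = {
#include "src/qu8-vpreluc/qu8-vpreluc.h"
};
#undef XNN_UKERNEL_WITH_PARAMS

The header defines XNN_UKERNEL in terms of XNN_UKERNEL_WITH_PARAMS when the consumer has not, and undefines only the macros it defined itself, so the consumer-side #undef above is still required.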
+// Microkernel: qs8-vprelu
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);
+#include "src/qs8-vprelu/qs8-vprelu.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/qs8-vpreluc.cc b/test/qs8-vpreluc.cc
new file mode 100644
index 00000000000..e44237a865f
--- /dev/null
+++ b/test/qs8-vpreluc.cc
@@ -0,0 +1,32 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Microkernel: qs8-vpreluc
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);
+#include "src/qs8-vpreluc/qs8-vpreluc.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/qs8-vrpreluc.cc b/test/qs8-vrpreluc.cc
new file mode 100644
index 00000000000..6499f9123b3
--- /dev/null
+++ b/test/qs8-vrpreluc.cc
@@ -0,0 +1,32 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Microkernel: qs8-vrpreluc
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params);
+#include "src/qs8-vrpreluc/qs8-vrpreluc.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/qu8-vprelu.cc b/test/qu8-vprelu.cc
new file mode 100644
index 00000000000..1eda9bc36db
--- /dev/null
+++ b/test/qu8-vprelu.cc
@@ -0,0 +1,32 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Microkernel: qu8-vprelu
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);
+#include "src/qu8-vprelu/qu8-vprelu.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/qu8-vpreluc.cc b/test/qu8-vpreluc.cc
new file mode 100644
index 00000000000..cc0bd37d37b
--- /dev/null
+++ b/test/qu8-vpreluc.cc
@@ -0,0 +1,32 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Microkernel: qu8-vpreluc
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);
+#include "src/qu8-vpreluc/qu8-vpreluc.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/qu8-vrpreluc.cc b/test/qu8-vrpreluc.cc
new file mode 100644
index 00000000000..a1f09d9a87e
--- /dev/null
+++ b/test/qu8-vrpreluc.cc
@@ -0,0 +1,32 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Microkernel: qu8-vrpreluc
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params);
+#include "src/qu8-vrpreluc/qu8-vrpreluc.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/vbinary-microkernel-tester.cc b/test/vbinary-microkernel-tester.cc
index 198c6206213..c84ff052532 100644
--- a/test/vbinary-microkernel-tester.cc
+++ b/test/vbinary-microkernel-tester.cc
@@ -397,3 +397,170 @@
     }
   }
 }
+
+void VBinaryMicrokernelTester::Test(
+    xnn_qs8_vprelu_ukernel_fn vprelu, OpType op_type,
+    xnn_init_qs8_vprelu_params_fn init_params) const {
+  xnnpack::ReplicableRandomDevice rng;
+  auto i8rng = [&rng]() {
+    return std::uniform_int_distribution<int32_t>(
+        std::numeric_limits<int8_t>::min(),
+        std::numeric_limits<int8_t>::max())(rng);
+  };
+
+  xnnpack::Buffer<int8_t> a(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
+  xnnpack::Buffer<int8_t> b(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
+  xnnpack::Buffer<int8_t> y(
+      batch_size() +
+      (inplace_a() || inplace_b() ? XNN_EXTRA_BYTES / sizeof(int8_t) : 0));
+  xnnpack::Buffer<float> y_fp(batch_size());
+  xnnpack::Buffer<int8_t> y_ref(batch_size());
+  for (size_t iteration = 0; iteration < iterations(); iteration++) {
+    if (!inplace_a()) {
+      std::generate(a.begin(), a.end(), [&]() { return i8rng(); });
+    }
+    if (!inplace_b()) {
+      std::generate(b.begin(), b.end(), [&]() { return i8rng(); });
+    }
+    if (inplace_a() || inplace_b()) {
+      std::generate(y.begin(), y.end(), [&]() { return i8rng(); });
+    }
+    const int8_t* a_data = inplace_a() ? y.data() : a.data();
+    const int8_t* b_data = inplace_b() ? y.data() : b.data();
+    const size_t stride_b = broadcast_b() ? 0 : 1;
+
+    // Prepare parameters.
+    xnn_qs8_vprelu_scalar_params params;
+    struct xnn_quantization_params a_quantization = {a_zero_point() - 0x80,
+                                                     a_scale()};
+    struct xnn_quantization_params b_quantization = {b_zero_point() - 0x80,
+                                                     b_scale()};
+    struct xnn_quantization_params y_quantization = {y_zero_point() - 0x80,
+                                                     y_scale()};
+    init_params(&params, &a_quantization, &b_quantization, &y_quantization);
+
+    // Compute reference results.
+    const float positive_multiplier = a_scale() / y_scale();
+    const float rprelu_pos_multiplier = b_scale() / y_scale();
+    const float negative_multiplier = (a_scale() * b_scale()) / y_scale();
+    EXPECT_GE(positive_multiplier, 0x1.0p-32f);
+    EXPECT_GE(negative_multiplier, 0x1.0p-32f);
+    for (size_t i = 0; i < batch_size(); i++) {
+      int32_t acc;
+      float scale;
+      const int32_t a_val = static_cast<int32_t>(a_data[i]) - static_cast<int32_t>(a_zero_point() - 0x80);
+      const int32_t b_val = static_cast<int32_t>(b_data[i * stride_b]) - static_cast<int32_t>(b_zero_point() - 0x80);
+      switch (op_type)
+      {
+        case OpType::Prelu:
+          acc = (a_val < 0) ? a_val * b_val : a_val;
+          scale = (a_val < 0) ? negative_multiplier : positive_multiplier;
+          break;
+        default:
+          acc = (b_val < 0) ? a_val * b_val : b_val;
+          scale = (b_val < 0) ? negative_multiplier : rprelu_pos_multiplier;
+          break;
+      }
+      y_fp[i] = static_cast<float>(y_zero_point() - 0x80) + scale * static_cast<float>(acc);
+      y_fp[i] = std::min(y_fp[i], static_cast<float>(INT8_MAX));
+      y_fp[i] = std::max(y_fp[i], static_cast<float>(INT8_MIN));
+      y_ref[i] = xnn_qs8_requantize_fp32(
+          acc, scale, static_cast<int8_t>(y_zero_point() - 0x80),
+          INT8_MIN, INT8_MAX);
+    }
+
+    // Call optimized micro-kernel.
+    vprelu(batch_size(), a_data, b_data, y.data(), &params);
+
+    // Verify results.
+    for (size_t i = 0; i < batch_size(); i++) {
+      EXPECT_NEAR(static_cast<int32_t>(y_ref[i]), static_cast<int32_t>(y[i]), 1)
+          << "at element " << i << " / " << batch_size();
+      EXPECT_NEAR(static_cast<float>(static_cast<int32_t>(y[i])), y_fp[i], 1.0f)
+          << "at element " << i << " / " << batch_size();
+    }
+  }
+}
+
+
+
+void VBinaryMicrokernelTester::Test(
+    xnn_qu8_vprelu_ukernel_fn vprelu, OpType op_type,
+    xnn_init_qu8_vprelu_params_fn init_params) const {
+  xnnpack::ReplicableRandomDevice rng;
+  auto u8rng = [&rng]() {
+    return std::uniform_int_distribution<uint32_t>(
+        0, std::numeric_limits<uint8_t>::max())(rng);
+  };
+
+  xnnpack::Buffer<uint8_t> a(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
+  xnnpack::Buffer<uint8_t> b(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
+  xnnpack::Buffer<uint8_t> y(
+      batch_size() +
+      (inplace_a() || inplace_b() ? XNN_EXTRA_BYTES / sizeof(uint8_t) : 0));
+  xnnpack::Buffer<float> y_fp(batch_size());
+  xnnpack::Buffer<uint8_t> y_ref(batch_size());
+  for (size_t iteration = 0; iteration < iterations(); iteration++) {
+    if (!inplace_a()) {
+      std::generate(a.begin(), a.end(), [&]() { return u8rng(); });
+    }
+    if (!inplace_b()) {
+      std::generate(b.begin(), b.end(), [&]() { return u8rng(); });
+    }
+    if (inplace_a() || inplace_b()) {
+      std::generate(y.begin(), y.end(), [&]() { return u8rng(); });
+    }
+    const uint8_t* a_data = inplace_a() ? y.data() : a.data();
+    const uint8_t* b_data = inplace_b() ? y.data() : b.data();
+    const size_t stride_b = broadcast_b() ? 0 : 1;
+
+    // Prepare parameters.
+    xnn_qs8_vprelu_scalar_params params;
+    struct xnn_quantization_params a_quantization = {a_zero_point(), a_scale()};
+    struct xnn_quantization_params b_quantization = {b_zero_point(), b_scale()};
+    struct xnn_quantization_params y_quantization = {y_zero_point(), y_scale()};
+    init_params(&params, &a_quantization, &b_quantization, &y_quantization);
+
+    // Compute reference results.
+    const float positive_multiplier = a_scale() / y_scale();
+    const float rprelu_pos_multiplier = b_scale() / y_scale();
+    const float negative_multiplier = (a_scale() * b_scale()) / y_scale();
+
+    for (size_t i = 0; i < batch_size(); i++) {
+      int32_t acc;
+      float scale;
+      const int32_t a_val = static_cast<int32_t>(a_data[i]) - static_cast<int32_t>(a_zero_point());
+      const int32_t b_val = static_cast<int32_t>(b_data[i * stride_b]) - static_cast<int32_t>(b_zero_point());
+      switch (op_type)
+      {
+        case OpType::Prelu:
+          acc = (a_val < 0) ? a_val * b_val : a_val;
+          scale = (a_val < 0) ? negative_multiplier : positive_multiplier;
+          break;
+        default:
+          acc = (b_val < 0) ? a_val * b_val : b_val;
+          scale = (b_val < 0) ? negative_multiplier : rprelu_pos_multiplier;
+          break;
+      }
+      y_fp[i] = static_cast<float>(y_zero_point()) + scale * static_cast<float>(acc);
+      y_fp[i] = std::min(y_fp[i], static_cast<float>(UINT8_MAX));
+      y_fp[i] = std::max(y_fp[i], static_cast<float>(0));
+      y_ref[i] = xnn_qu8_requantize_fp32(
+          acc, scale, static_cast<uint8_t>(y_zero_point()),
+          0, UINT8_MAX);
+    }
+
+    // Call optimized micro-kernel.
+    vprelu(batch_size(), a_data, b_data, y.data(), &params);
+
+    // Verify results.
+    for (size_t i = 0; i < batch_size(); i++) {
+      EXPECT_NEAR(static_cast<int32_t>(y_ref[i]), static_cast<int32_t>(y[i]), 1)
+          << "at element " << i << " / " << batch_size();
+      EXPECT_NEAR(static_cast<float>(static_cast<int32_t>(y[i])), y_fp[i], 1.0f)
+          << "at element " << i << " / " << batch_size();
+    }
+  }
+}
+
+
diff --git a/test/vbinary-microkernel-tester.h b/test/vbinary-microkernel-tester.h
index fca254596eb..d30ff1cf077 100644
--- a/test/vbinary-microkernel-tester.h
+++ b/test/vbinary-microkernel-tester.h
@@ -201,6 +201,12 @@ class VBinaryMicrokernelTester {
 
   void Test(xnn_qs8_vmul_minmax_ukernel_fn vmul_minmax,
             xnn_init_qs8_mul_minmax_params_fn init_params) const;
+
+  void Test(xnn_qs8_vprelu_ukernel_fn vprelu, OpType op_type,
+            xnn_init_qs8_vprelu_params_fn init_params) const;
+
+  void Test(xnn_qu8_vprelu_ukernel_fn vprelu, OpType op_type,
+            xnn_init_qu8_vprelu_params_fn init_params) const;
 
  private:
   size_t batch_size_{1};
diff --git a/tools/generate-vbinary-test.py b/tools/generate-vbinary-test.py
index 9e5700ac56a..f9ead3d8dd6 100755
--- a/tools/generate-vbinary-test.py
+++ b/tools/generate-vbinary-test.py
@@ -144,8 +144,12 @@ def main(args):
     op_type = OP_TYPES[op]
 
     test_args = ["ukernel"]
-    if tester in ["VBinaryMicrokernelTester"] and not datatype in ["qs8", "qu8"]:
-      test_args.append("%s::OpType::%s" % (tester, op_type))
+    if tester in ["VBinaryMicrokernelTester"]:
+      if datatype in ['qs8', 'qu8'] and op in ['vprelu', 'vpreluc', 'vrpreluc']:
+        op_type = "Prelu" if op in ['vprelu', 'vpreluc'] else "RPrelu"
+        test_args.append("%s::OpType::%s" % (tester, op_type))
+      elif not datatype in ['qs8', 'qu8']:
+        test_args.append("%s::OpType::%s" % (tester, op_type))
     test_args.append("init_params")
     tests += xnncommon.make_multiline_macro(
         xngen.preprocess(