diff --git a/BUILD.bazel b/BUILD.bazel index 6432817b6e1..7d7a96e3fa8 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -178,6 +178,9 @@ MICROKERNEL_DEFS = [ "src/qs8-vmul/qs8-vmul-minmax-rndnu.h", "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h", "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h", + "src/qs8-vprelu/qs8-vprelu.h", + "src/qs8-vpreluc/qs8-vpreluc.h", + "src/qs8-vrpreluc/qs8-vrpreluc.h", "src/qu8-dwconv/qu8-dwconv-minmax-fp32.h", "src/qu8-dwconv/qu8-dwconv-minmax-rndnu.h", "src/qu8-f32-vcvt/qu8-f32-vcvt.h", @@ -189,6 +192,9 @@ MICROKERNEL_DEFS = [ "src/qu8-vmul/qu8-vmul-minmax-rndnu.h", "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h", "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h", + "src/qu8-vprelu/qu8-vprelu.h", + "src/qu8-vpreluc/qu8-vpreluc.h", + "src/qu8-vrpreluc/qu8-vrpreluc.h", "src/s8-maxpool/s8-maxpool-minmax.h", "src/s8-vclamp/s8-vclamp.h", "src/u8-maxpool/u8-maxpool-minmax.h", diff --git a/cmake/gen/avx2_microkernels.cmake b/cmake/gen/avx2_microkernels.cmake index 29a419e7153..3eb82a52097 100644 --- a/cmake/gen/avx2_microkernels.cmake +++ b/cmake/gen/avx2_microkernels.cmake @@ -61,6 +61,9 @@ SET(PROD_AVX2_MICROKERNEL_SRCS src/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-u16.c src/qs8-vcvt/gen/qs8-vcvt-avx2-u32.c src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c + src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c + src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c + src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx2-mul32.c src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx2-mul32.c src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-u16.c @@ -73,6 +76,9 @@ SET(PROD_AVX2_MICROKERNEL_SRCS src/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-u16.c src/qu8-vcvt/gen/qu8-vcvt-avx2-u32.c src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c + src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c + src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c + src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c src/s8-vclamp/s8-vclamp-avx2-u128.c src/u8-vclamp/u8-vclamp-avx2-u128.c src/x8-lut/gen/x8-lut-avx2-u128.c diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index 77e90c12e9d..2d44202c44a 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -198,6 +198,9 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c src/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-u4.c src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u4.c + src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c + src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c + src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c @@ -226,6 +229,9 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c src/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-u4.c src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u4.c + src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c + src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c + src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c src/s8-ibilinear/gen/s8-ibilinear-scalar-c1.c src/s8-maxpool/gen/s8-maxpool-9p-minmax-scalar-u1.c src/s8-rdminmax/gen/s8-rdmax-2p2x-scalar-c2.c @@ -659,6 +665,15 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-u2.c src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u1.c src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u2.c + src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c + src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c + 
src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c + src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c + src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c + src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c + src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c + src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c + src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c @@ -745,6 +760,15 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-u2.c src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u1.c src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u2.c + src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c + src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c + src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c + src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c + src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c + src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c + src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c + src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c + src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c src/s8-ibilinear/gen/s8-ibilinear-scalar-c2.c src/s8-ibilinear/gen/s8-ibilinear-scalar-c4.c src/s8-rminmax/gen/s8-rmax-scalar-u1.c diff --git a/gen/avx2_microkernels.bzl b/gen/avx2_microkernels.bzl index 61961a62c9b..ef8ea72d8d2 100644 --- a/gen/avx2_microkernels.bzl +++ b/gen/avx2_microkernels.bzl @@ -57,6 +57,9 @@ PROD_AVX2_MICROKERNEL_SRCS = [ "src/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-u16.c", "src/qs8-vcvt/gen/qs8-vcvt-avx2-u32.c", "src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c", + "src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c", + "src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c", + "src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c", "src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx2-mul32.c", "src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx2-mul32.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-u16.c", @@ -69,6 +72,9 @@ PROD_AVX2_MICROKERNEL_SRCS = [ "src/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-u16.c", "src/qu8-vcvt/gen/qu8-vcvt-avx2-u32.c", "src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c", + "src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c", + "src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c", + "src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c", "src/s8-vclamp/s8-vclamp-avx2-u128.c", "src/u8-vclamp/u8-vclamp-avx2-u128.c", "src/x8-lut/gen/x8-lut-avx2-u128.c", diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index ad7d1cf7385..9d83428319e 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -194,6 +194,9 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c", "src/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-u4.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u4.c", + "src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c", + "src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c", + "src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c", "src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c", "src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c", "src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c", @@ -222,6 +225,9 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c", "src/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-u4.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u4.c", + "src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c", + "src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c", + "src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c", "src/s8-ibilinear/gen/s8-ibilinear-scalar-c1.c", 
"src/s8-maxpool/gen/s8-maxpool-9p-minmax-scalar-u1.c", "src/s8-rdminmax/gen/s8-rdmax-2p2x-scalar-c2.c", @@ -656,6 +662,15 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-u2.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u1.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-u2.c", + "src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c", + "src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c", + "src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c", + "src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c", + "src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c", + "src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c", + "src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c", + "src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c", + "src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c", "src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c", "src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c", "src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c", @@ -742,6 +757,15 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-u2.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u1.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-u2.c", + "src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c", + "src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c", + "src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c", + "src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c", + "src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c", + "src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c", + "src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c", + "src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c", + "src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c", "src/s8-ibilinear/gen/s8-ibilinear-scalar-c2.c", "src/s8-ibilinear/gen/s8-ibilinear-scalar-c4.c", "src/s8-rminmax/gen/s8-rmax-scalar-u1.c", diff --git a/scripts/generate-qs8-vprelu.sh b/scripts/generate-qs8-vprelu.sh new file mode 100755 index 00000000000..08add47036e --- /dev/null +++ b/scripts/generate-qs8-vprelu.sh @@ -0,0 +1,48 @@ +# Copyright (C) 2024 Intel Corporation +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# +# SPDX-License-Identifier: BSD-3-Clause + +#################################### Scalar ################################### +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QS8 -o src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QS8 -o src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QS8 -o src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c & + +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QU8 -o src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QU8 -o src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QU8 -o src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c & +tools/xngen src/qs8-vprelu/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c & + +#################################### AVX2 ################################### +tools/xngen src/qs8-vprelu/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QS8 -o src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c & + +tools/xngen src/qs8-vprelu/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QU8 -o src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c & + + +wait + diff --git a/scripts/generate-qs8-vpreluc.sh b/scripts/generate-qs8-vpreluc.sh new file mode 100755 index 00000000000..2fd28edc7f2 --- /dev/null +++ b/scripts/generate-qs8-vpreluc.sh @@ -0,0 +1,48 @@ +# Copyright (C) 2024 Intel Corporation +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# +# SPDX-License-Identifier: BSD-3-Clause + +#################################### Scalar ################################### +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QS8 -o src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QS8 -o src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QS8 -o src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c & + +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QU8 -o src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QU8 -o src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QU8 -o src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c & +tools/xngen src/qs8-vpreluc/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c & + +#################################### AVX2 ################################### +tools/xngen src/qs8-vpreluc/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QS8 -o src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c & + +tools/xngen src/qs8-vpreluc/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QU8 -o src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c & + + +wait + diff --git a/scripts/generate-qs8-vrpreluc.sh b/scripts/generate-qs8-vrpreluc.sh new file mode 100755 index 00000000000..485ce7d74f6 --- /dev/null +++ b/scripts/generate-qs8-vrpreluc.sh @@ -0,0 +1,47 @@ +# Copyright (C) 2024 Intel Corporation +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# +# SPDX-License-Identifier: BSD-3-Clause + +#################################### Scalar ################################### +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QS8 -o src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QS8 -o src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QS8 -o src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c & + +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QU8 -o src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QU8 -o src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=4 -D DATATYPE=QU8 -o src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c & +tools/xngen src/qs8-vrpreluc/scalar.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c & + +#################################### AVX2 ################################### +tools/xngen src/qs8-vrpreluc/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QS8 -o src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c & + +tools/xngen src/qs8-vrpreluc/avx2.c.in -D BATCH_TILE=16 -D AVX=1 -D DATATYPE=QU8 -o src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c & + + +wait diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index acb71e91197..48c9012b4ab 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -136,6 +136,12 @@ tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b - tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qu8-vmulc-minmax-fp32 --output test/qu8-vmulc-minmax-fp32.cc & tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qu8-vmulc-minmax-rndnu --output test/qu8-vmulc-minmax-rndnu.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel qs8-vprelu --output test/qs8-vprelu.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qs8-vpreluc --output test/qs8-vpreluc.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qs8-vrpreluc --output test/qs8-vrpreluc.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel qu8-vprelu --output test/qu8-vprelu.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qu8-vpreluc --output test/qu8-vpreluc.cc & +tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel qu8-vrpreluc --output test/qu8-vrpreluc.cc & ### Tests for VUnary micro-kernels tools/generate-vunary-test.py --ukernel f16-vabs --output test/f16-vabs.cc & tools/generate-vunary-test.py --ukernel f16-vapproxgelu --output test/f16-vapproxgelu.cc & diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 0fc41e25fef..652182bd6f0 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -34,9 +34,11 @@ static struct xnn_binary_elementwise_config f32_vsqrdiff_config = {0}; static struct xnn_binary_elementwise_config qs8_vadd_config = {0}; static struct xnn_binary_elementwise_config qs8_vmul_config = {0}; +static struct xnn_binary_elementwise_config qs8_vprelu_config = 
{0}; static struct xnn_binary_elementwise_config qu8_vadd_config = {0}; static struct xnn_binary_elementwise_config qu8_vmul_config = {0}; +static struct xnn_binary_elementwise_config qu8_vprelu_config = {0}; XNN_INIT_ONCE_GUARD(f16_vadd); XNN_INIT_ONCE_GUARD(f16_vdiv); @@ -57,9 +59,10 @@ XNN_INIT_ONCE_GUARD(f32_vsub); XNN_INIT_ONCE_GUARD(f32_vsqrdiff); XNN_INIT_ONCE_GUARD(qs8_vadd); XNN_INIT_ONCE_GUARD(qs8_vmul); +XNN_INIT_ONCE_GUARD(qs8_vprelu); XNN_INIT_ONCE_GUARD(qu8_vadd); XNN_INIT_ONCE_GUARD(qu8_vmul); - +XNN_INIT_ONCE_GUARD(qu8_vprelu); static void init_f16_vadd_config(void) { #if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR @@ -1143,6 +1146,32 @@ static void init_qs8_vmul_config(void) { #endif } +static void init_qs8_vprelu_config(void) { + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->use_x86_avx2) { + qs8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vprelu_ukernel__avx2_u16; + qs8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vpreluc_ukernel__avx2_u16; + qs8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vrpreluc_ukernel__avx2_u16; + qs8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_vprelu_scalar_params; + qs8_vprelu_config.element_tile = 16; + } else { + qs8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vprelu_ukernel__scalar_u8; + qs8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vpreluc_ukernel__scalar_u8; + qs8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vrpreluc_ukernel__scalar_u8; + qs8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_vprelu_scalar_params; + qs8_vprelu_config.element_tile = 8; + } + #else + qs8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vprelu_ukernel__scalar_u8; + qs8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vpreluc_ukernel__scalar_u8; + qs8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vrpreluc_ukernel__scalar_u8; + qs8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_vprelu_scalar_params; + qs8_vprelu_config.element_tile = 8; + #endif +} + static void init_qu8_vadd_config(void) { #if XNN_ARCH_ARM const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1292,6 +1321,32 @@ static void init_qu8_vmul_config(void) { #endif } +static void init_qu8_vprelu_config(void) { + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->use_x86_avx2) { + qu8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vprelu_ukernel__avx2_u16; + qu8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vpreluc_ukernel__avx2_u16; + qu8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vrpreluc_ukernel__avx2_u16; + qu8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_vprelu_scalar_params; + qu8_vprelu_config.element_tile = 16; + } else { + qu8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vprelu_ukernel__scalar_u8; + qu8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vpreluc_ukernel__scalar_u8; + qu8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vrpreluc_ukernel__scalar_u8; + qu8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_vprelu_scalar_params; + qu8_vprelu_config.element_tile = 8; + } + #else + 
qu8_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vprelu_ukernel__scalar_u8; + qu8_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vpreluc_ukernel__scalar_u8; + qu8_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vrpreluc_ukernel__scalar_u8; + qu8_vprelu_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_vprelu_scalar_params; + qu8_vprelu_config.element_tile = 8; + #endif +} + const struct xnn_binary_elementwise_config* xnn_init_f16_vadd_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { @@ -1463,6 +1518,15 @@ const struct xnn_binary_elementwise_config* xnn_init_qs8_vmul_config() { return &qs8_vmul_config; } +const struct xnn_binary_elementwise_config* xnn_init_qs8_vprelu_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL) { + return NULL; + } + XNN_INIT_ONCE(qs8_vprelu); + return &qs8_vprelu_config; +} + const struct xnn_binary_elementwise_config* xnn_init_qu8_vadd_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { @@ -1480,3 +1544,12 @@ const struct xnn_binary_elementwise_config* xnn_init_qu8_vmul_config() { XNN_INIT_ONCE(qu8_vmul); return &qu8_vmul_config; } + +const struct xnn_binary_elementwise_config* xnn_init_qu8_vprelu_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL) { + return NULL; + } + XNN_INIT_ONCE(qu8_vprelu); + return &qu8_vprelu_config; +} diff --git a/src/microparams-init.c b/src/microparams-init.c index 04ebc7472f2..adf2485afc3 100644 --- a/src/microparams-init.c +++ b/src/microparams-init.c @@ -934,6 +934,30 @@ size_t xnn_init_qs8_add_minmax_scalar_params( return sizeof(uparams->scalar); } +size_t xnn_init_qs8_vprelu_scalar_params( + union xnn_qs8_vprelu_scalar_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization) { + assert(a_quantization); + assert(b_quantization); + assert(output_quantization); + const float negative_product_scale = (a_quantization->scale * b_quantization->scale) / output_quantization->scale; + const float positive_product_scale = a_quantization->scale / output_quantization->scale; + const float rprelu_positive_product_scale = b_quantization->scale / output_quantization->scale; + assert(negative_product_scale >= 0x1.0p-16f); + assert(negative_product_scale < 0x1.0p+8f); + uparams->scalar.input_zero_point = a_quantization->zero_point; + uparams->scalar.slope_zero_point = b_quantization->zero_point; + uparams->scalar.negative_multiplier = negative_product_scale; + uparams->scalar.positive_multiplier = positive_product_scale; + uparams->scalar.rprelu_positive_multiplier = rprelu_positive_product_scale; + uparams->scalar.output_zero_point = output_quantization->zero_point; + uparams->scalar.output_min = INT8_MIN; + uparams->scalar.output_max = INT8_MAX; + return sizeof(uparams->scalar); +} + size_t xnn_init_qu8_mul_minmax_scalar_params( union xnn_qu8_mul_minmax_params uparams[XNN_MIN_ELEMENTS(1)], const struct xnn_quantization_params* a_quantization, @@ -1000,6 +1024,31 @@ size_t xnn_init_qu8_mul_minmax_rndnu_neon_params( } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +size_t xnn_init_qu8_vprelu_scalar_params( + union 
xnn_qs8_vprelu_scalar_params uparams[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization) { + assert(a_quantization); + assert(b_quantization); + assert(output_quantization); + const float negative_product_scale = (a_quantization->scale * b_quantization->scale) / output_quantization->scale; + const float positive_product_scale = a_quantization->scale / output_quantization->scale; + const float rprelu_positive_product_scale = b_quantization->scale / output_quantization->scale; + assert(negative_product_scale >= 0x1.0p-16f); + assert(negative_product_scale < 0x1.0p+8f); + uparams->scalar.input_zero_point = a_quantization->zero_point; + uparams->scalar.slope_zero_point = b_quantization->zero_point; + uparams->scalar.negative_multiplier = negative_product_scale; + uparams->scalar.positive_multiplier = positive_product_scale; + uparams->scalar.rprelu_positive_multiplier = rprelu_positive_product_scale; + uparams->scalar.output_zero_point = output_quantization->zero_point; + uparams->scalar.output_min = 0; + uparams->scalar.output_max = UINT8_MAX; + return sizeof(uparams->scalar); +} + + size_t xnn_init_qs8_mul_minmax_scalar_params( union xnn_qs8_mul_minmax_params uparams[XNN_MIN_ELEMENTS(1)], const struct xnn_quantization_params* a_quantization, diff --git a/src/operators/binary-elementwise-nd.c b/src/operators/binary-elementwise-nd.c index 7e100fa5784..1a3b9f863ae 100644 --- a/src/operators/binary-elementwise-nd.c +++ b/src/operators/binary-elementwise-nd.c @@ -119,6 +119,10 @@ static const struct xnn_binary_elementwise_config* init_config( return xnn_init_f32_vprelu_config(); case xnn_datatype_fp16: return xnn_init_f16_vprelu_config(); + case xnn_datatype_qint8: + return xnn_init_qs8_vprelu_config(); + case xnn_datatype_quint8: + return xnn_init_qu8_vprelu_config(); default: return NULL; } diff --git a/src/qs8-vprelu/avx2.c.in b/src/qs8-vprelu/avx2.c.in new file mode 100644 index 00000000000..df3569883cf --- /dev/null +++ b/src/qs8-vprelu/avx2.c.in @@ -0,0 +1,167 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+$assert DATATYPE in ["QS8", "QU8"]
+$assert BATCH_TILE >= 8
+$assert BATCH_TILE == 8 or BATCH_TILE % 16 == 0
+$SIMD_TILE = BATCH_TILE // 16
+
+#include <assert.h>
+#include <immintrin.h>
+#include <stdint.h>
+#include "src/xnnpack/intrinsics-polyfill.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vbinary.h"
+
+$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
+$_MM256_CVTEPX8_EPI32 = {"QS8": "_mm256_cvtepi8_epi32", "QU8": "_mm256_cvtepu8_epi32"}[DATATYPE]
+$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE]
+void xnn_${DATATYPE.lower()}_vprelu_ukernel__avx2_u${BATCH_TILE}(
+    size_t batch,
+    const ${XINT8_T}* input_a,
+    const ${XINT8_T}* input_b,
+    ${XINT8_T}* output,
+    const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(batch != 0);
+  assert(batch % sizeof(${XINT8_T}) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point);
+  const __m256i vslope_zero_point = _mm256_set1_epi32(params->scalar.slope_zero_point);
+  const __m256i voutput_zero_point = _mm256_set1_epi32(params->scalar.output_zero_point);
+  const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier);
+  const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier);
+  const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point);
+  const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point);
+  const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f);
+  const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point);
+
+  $if BATCH_TILE > 8:
+    for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) {
+      __m256i va0 = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a));
+      __m256i vb0 = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_b));
+
+      $for N in range(1, 2*SIMD_TILE):
+        __m256i va${N} = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) (input_a + ${N * 8})));
+        __m256i vb${N} = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) (input_b + ${N * 8})));
+      input_a += ${BATCH_TILE};
+      input_b += ${BATCH_TILE};
+
+      $for N in range(2*SIMD_TILE):
+        __m256i va${N}_sub = _mm256_sub_epi32(va${N}, vinput_zero_point);
+        __m256i vb${N}_sub = _mm256_sub_epi32(vb${N}, vslope_zero_point);
+        __m256i vcompare${N} = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va${N}_sub);
+        __m256i vacc${N} = _mm256_blendv_epi8(va${N}_sub, _mm256_mullo_epi32(va${N}_sub, vb${N}_sub), vcompare${N});
+
+      $for N in range(2*SIMD_TILE):
+        __m256 vscale${N} = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare${N}));
+ __m256 vfpacc${N} = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc${N}), vscale${N}); + + $for N in range(2*SIMD_TILE): + __m256 vfpacc_clamped${N} = _mm256_min_ps(_mm256_max_ps(vfpacc${N}, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased${N} = _mm256_add_ps(vfpacc_clamped${N}, vmagic_bias); + __m256i vout${N} = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased${N}), vmagic_bias_less_output_zero_point); + + $for N in range(2*SIMD_TILE): + const __m128i vout_low${N} = _mm256_castsi256_si128(vout${N}); + const __m128i vout_high${N} = _mm256_extracti128_si256(vout${N}, 1); + const __m128i vout_packed16${N} = _mm_packs_epi32(vout_low${N}, vout_high${N}); + __m128i vout_final${N} = ${_MM_PACKXS_EPI16}(vout_packed16${N}, vout_packed16${N}); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + $for N in range(1, 2*SIMD_TILE): + _mm_storeu_si64((__m128i*)(output + ${N*8}), vout_final${N}); + + output += ${BATCH_TILE}; + } + + for (; batch >= 8 * sizeof(${XINT8_T}); batch -= 8 * sizeof(${XINT8_T})) { + __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a)); + __m256i vb = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_b)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + input_b+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed16, vout_packed16); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(${XINT8_T})); + assert(batch <= 7 * sizeof(${XINT8_T})); + + const __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i vb = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) input_b)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = 
_mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed16, vout_packed16); + + if (batch & (4 * sizeof(${XINT8_T}))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(${XINT8_T}))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(${XINT8_T}))) { + *output = (${XINT8_T}) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c b/src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c new file mode 100644 index 00000000000..3ca94449696 --- /dev/null +++ b/src/qs8-vprelu/gen/qs8-vprelu-avx2-u16.c @@ -0,0 +1,171 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+
+#include <assert.h>
+#include <immintrin.h>
+#include <stdint.h>
+#include "src/xnnpack/intrinsics-polyfill.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vbinary.h"
+
+void xnn_qs8_vprelu_ukernel__avx2_u16(
+    size_t batch,
+    const int8_t* input_a,
+    const int8_t* input_b,
+    int8_t* output,
+    const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(batch != 0);
+  assert(batch % sizeof(int8_t) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point);
+  const __m256i vslope_zero_point = _mm256_set1_epi32(params->scalar.slope_zero_point);
+  const __m256i voutput_zero_point = _mm256_set1_epi32(params->scalar.output_zero_point);
+  const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier);
+  const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier);
+  const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point);
+  const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point);
+  const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f);
+  const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point);
+
+  for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) {
+    __m256i va0 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a));
+    __m256i vb0 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_b));
+
+    __m256i va1 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8)));
+    __m256i vb1 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) (input_b + 8)));
+    input_a += 16;
+    input_b += 16;
+
+    __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point);
+    __m256i vb0_sub = _mm256_sub_epi32(vb0, vslope_zero_point);
+    __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va0_sub);
+    __m256i vacc0 = _mm256_blendv_epi8(va0_sub, _mm256_mullo_epi32(va0_sub, vb0_sub), vcompare0);
+    __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point);
+    __m256i vb1_sub = _mm256_sub_epi32(vb1, vslope_zero_point);
+    __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va1_sub);
+    __m256i vacc1 = _mm256_blendv_epi8(va1_sub, _mm256_mullo_epi32(va1_sub, vb1_sub), vcompare1);
+
+    __m256 vscale0 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0));
+    __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0);
+    __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1));
+    __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1);
+
+    __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point);
+    __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias);
+    __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point);
+    __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point);
+    __m256 vfpacc_biased1 = _mm256_add_ps(vfpacc_clamped1, vmagic_bias);
+    __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point);
+
+    const __m128i vout_low0 = 
_mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed160 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packs_epi16(vout_packed160, vout_packed160); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed161 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packs_epi16(vout_packed161, vout_packed161); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i vb = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_b)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + input_b+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed16, vout_packed16); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(int8_t)); + assert(batch <= 7 * sizeof(int8_t)); + + const __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i vb = _mm256_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) input_b)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed16, vout_packed16); + + if (batch & (4 * sizeof(int8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(int8_t))) { + 
_mm_storeu_si16(output, vout_final);
+      vout_final = _mm_srli_epi32(vout_final, 16);
+      output += 2;
+    }
+    if (batch & (1 * sizeof(int8_t))) {
+      *output = (int8_t) _mm_extract_epi8(vout_final, 0);
+    }
+  }
+}
diff --git a/src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c
new file mode 100644
index 00000000000..cc182272768
--- /dev/null
+++ b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u1.c
@@ -0,0 +1,76 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vprelu/scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+
+#include <assert.h>
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vbinary.h"
+
+
+void xnn_qs8_vprelu_ukernel__scalar_u1(
+    size_t batch,
+    const int8_t* input_a,
+    const int8_t* input_b,
+    int8_t* output,
+    const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(int8_t) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  const int32_t input_zero_point = params->scalar.input_zero_point;
+  const int32_t slope_zero_point = params->scalar.slope_zero_point;
+  const float vpositive_multiplier = params->scalar.positive_multiplier;
+  const float vnegative_multiplier = params->scalar.negative_multiplier;
+  const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point;
+  const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point;
+  const float vmagic_bias = 12582912.0f;
+  const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point;
+
+  do {
+    const int32_t va = (int32_t) *input_a++ - input_zero_point;
+    const int32_t vb = (int32_t) *input_b++ - slope_zero_point;
+    int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va;
+    float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); +} diff --git a/src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c new file mode 100644 index 00000000000..79778d1efcd --- /dev/null +++ b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u2.c @@ -0,0 +1,110 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+
+#include <assert.h>
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vbinary.h"
+
+
+void xnn_qs8_vprelu_ukernel__scalar_u2(
+    size_t batch,
+    const int8_t* input_a,
+    const int8_t* input_b,
+    int8_t* output,
+    const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(int8_t) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  const int32_t input_zero_point = params->scalar.input_zero_point;
+  const int32_t slope_zero_point = params->scalar.slope_zero_point;
+  const float vpositive_multiplier = params->scalar.positive_multiplier;
+  const float vnegative_multiplier = params->scalar.negative_multiplier;
+  const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point;
+  const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point;
+  const float vmagic_bias = 12582912.0f;
+  const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point;
+
+  for (; batch >= 2 * sizeof(int8_t); batch -= 2 * sizeof(int8_t)) {
+    const int32_t va0 = input_a[0] - input_zero_point;
+    const int32_t va1 = input_a[1] - input_zero_point;
+    input_a += 2;
+
+    const int32_t vb0 = input_b[0] - slope_zero_point;
+    const int32_t vb1 = input_b[1] - slope_zero_point;
+    input_b += 2;
+
+    int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0;
+    int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1;
+
+    float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier;
+    float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier;
+
+    float vfpacc0 = (float) vacc0 * vscale0;
+    float vfpacc1 = (float) vacc1 * vscale1;
+
+    vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
+    vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
+
+    vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
+    vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
+
+    vfpacc0 += vmagic_bias;
+    vfpacc1 += vmagic_bias;
+
+    const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point;
+    const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point;
+
+    output[0] = (int8_t) vout0;
+    output[1] = (int8_t) vout1;
+    output += 2;
+  }
+
+  if XNN_UNLIKELY(batch != 0) {
+    const int32_t va = (int32_t) *input_a - input_zero_point;
+    const int32_t vb = (int32_t) *input_b - slope_zero_point;
+    int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va;
+    float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier;
+    float vfpacc = (float) vacc * vscale;
+    vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+    vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+    vfpacc += vmagic_bias;
+    const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
+    *output = (int8_t) vout;
+  }
+}
diff --git a/src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c
new file mode 100644
index 00000000000..f1154357c54
--- /dev/null
+++ b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u4.c
@@ -0,0 +1,133 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vprelu/scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// 3. Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+
+#include <assert.h>
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vbinary.h"
+
+
+void xnn_qs8_vprelu_ukernel__scalar_u4(
+    size_t batch,
+    const int8_t* input_a,
+    const int8_t* input_b,
+    int8_t* output,
+    const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(batch % sizeof(int8_t) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  const int32_t input_zero_point = params->scalar.input_zero_point;
+  const int32_t slope_zero_point = params->scalar.slope_zero_point;
+  const float vpositive_multiplier = params->scalar.positive_multiplier;
+  const float vnegative_multiplier = params->scalar.negative_multiplier;
+  const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point;
+  const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point;
+  const float vmagic_bias = 12582912.0f;
+  const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point;
+
+  for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) {
+    const int32_t va0 = input_a[0] - input_zero_point;
+    const int32_t va1 = input_a[1] - input_zero_point;
+    const int32_t va2 = input_a[2] - input_zero_point;
+    const int32_t va3 = input_a[3] - input_zero_point;
+    input_a += 4;
+
+    const int32_t vb0 = input_b[0] - slope_zero_point;
+    const int32_t vb1 = input_b[1] - slope_zero_point;
+    const int32_t vb2 = input_b[2] - slope_zero_point;
+    const int32_t vb3 = input_b[3] - slope_zero_point;
+    input_b += 4;
+
+    int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0;
+    int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1;
+    int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? 
va2 * vb2 : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * vb3 : va3; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c new file mode 100644 index 00000000000..222001b2fdf --- /dev/null +++ b/src/qs8-vprelu/gen/qs8-vprelu-scalar-u8.c @@ -0,0 +1,173 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. 
Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +void xnn_qs8_vprelu_ukernel__scalar_u8( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + const int32_t vb0 = input_b[0] - slope_zero_point; + const int32_t vb1 = input_b[1] - slope_zero_point; + const int32_t vb2 = input_b[2] - slope_zero_point; + const int32_t vb3 = input_b[3] - slope_zero_point; + const int32_t vb4 = input_b[4] - slope_zero_point; + const int32_t vb5 = input_b[5] - slope_zero_point; + const int32_t vb6 = input_b[6] - slope_zero_point; + const int32_t vb7 = input_b[7] - slope_zero_point; + input_b += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * vb2 : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * vb3 : va3; + int32_t vacc4 = XNN_UNPREDICTABLE(va4 < 0) ? va4 * vb4 : va4; + int32_t vacc5 = XNN_UNPREDICTABLE(va5 < 0) ? 
va5 * vb5 : va5; + int32_t vacc6 = XNN_UNPREDICTABLE(va6 < 0) ? va6 * vb6 : va6; + int32_t vacc7 = XNN_UNPREDICTABLE(va7 < 0) ? va7 * vb7 : va7; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(va4 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(va5 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(va6 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(va7 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output[4] = (int8_t) vout4; + output[5] = (int8_t) vout5; + output[6] = (int8_t) vout6; + output[7] = (int8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const 
int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vprelu/qs8-vprelu.h b/src/qs8-vprelu/qs8-vprelu.h new file mode 100644 index 00000000000..adaa8f4ed16 --- /dev/null +++ b/src/qs8-vprelu/qs8-vprelu.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
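// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the generated diff): the
// scalar kernels above convert the scaled accumulator back to an integer with
// the "magic bias" trick instead of lrintf(). Adding 12582912.0f (0x1.8p+23,
// bit pattern 0x4B400000) to a float of magnitude below 2^22 forces
// round-to-nearest-even into the low mantissa bits, so reinterpreting the bits
// and subtracting 0x4B400000 - output_zero_point yields the rounded value
// already shifted by the output zero point. A minimal standalone version of
// the same idea, assuming default rounding mode:
#include <stdint.h>
#include <string.h>

static inline int32_t round_with_magic_bias(float x, int32_t output_zero_point) {
  const float vmagic_bias = 12582912.0f;  // 0x1.8p+23f, bits 0x4B400000
  const int32_t vmagic_bias_less_output_zero_point =
      INT32_C(0x4B400000) - output_zero_point;
  const float vbiased = x + vmagic_bias;  // the addition rounds x to the nearest integer
  uint32_t vbits;
  memcpy(&vbits, &vbiased, sizeof(vbits));  // same role as float_as_uint32()
  return (int32_t) vbits - vmagic_bias_less_output_zero_point;
}
// e.g. round_with_magic_bias(-1.3f, 5) == 4, matching lrintf(-1.3f) + 5.
// ---------------------------------------------------------------------------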
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vprelu_ukernel__avx2_u16, 16, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vprelu_ukernel__scalar_u1, 1, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vprelu_ukernel__scalar_u2, 2, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vprelu_ukernel__scalar_u4, 4, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vprelu_ukernel__scalar_u8, 8, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif \ No newline at end of file diff --git a/src/qs8-vprelu/scalar.c.in b/src/qs8-vprelu/scalar.c.in new file mode 100644 index 00000000000..abd18dd6c71 --- /dev/null +++ b/src/qs8-vprelu/scalar.c.in @@ -0,0 +1,138 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +$assert DATATYPE in ["QS8", "QU8"] +$assert BATCH_TILE >= 1 + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +void xnn_${DATATYPE.lower()}_vprelu_ukernel__scalar_u${BATCH_TILE}( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + $if BATCH_TILE == 1: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + $else: + for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) { + $for N in range(BATCH_TILE): + const int32_t va${N} = input_a[${N}] - input_zero_point; + input_a += ${BATCH_TILE}; + + $for N in range(BATCH_TILE): + const int32_t vb${N} = input_b[${N}] - slope_zero_point; + input_b += ${BATCH_TILE}; + + $for N in range(BATCH_TILE): + int32_t vacc${N} = XNN_UNPREDICTABLE(va${N} < 0) ? va${N} * vb${N} : va${N}; + + $for N in range(BATCH_TILE): + float vscale${N} = XNN_UNPREDICTABLE(va${N} < 0) ? vnegative_multiplier : vpositive_multiplier; + + $for N in range(BATCH_TILE): + float vfpacc${N} = (float) vacc${N} * vscale${N}; + + $for N in range(BATCH_TILE): + vfpacc${N} = math_max_f32(vfpacc${N}, voutput_min_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} = math_min_f32(vfpacc${N}, voutput_max_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} += vmagic_bias; + + $for N in range(BATCH_TILE): + const int32_t vout${N} = (int32_t) float_as_uint32(vfpacc${N}) - vmagic_bias_less_output_zero_point; + + $for N in range(BATCH_TILE): + output[${N}] = (${XINT8_T}) vout${N}; + output += ${BATCH_TILE}; + } + + if XNN_UNLIKELY(batch != 0) { + $if BATCH_TILE == 2: + const int32_t va = (int32_t) *input_a - input_zero_point; + const int32_t vb = (int32_t) *input_b - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (${XINT8_T}) vout; + $else: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + } +} diff --git a/src/qs8-vpreluc/avx2.c.in b/src/qs8-vpreluc/avx2.c.in new file mode 100644 index 00000000000..540c93f0827 --- /dev/null +++ b/src/qs8-vpreluc/avx2.c.in @@ -0,0 +1,159 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +$assert DATATYPE in ["QS8", "QU8"] +$assert BATCH_TILE >= 8 +$assert BATCH_TILE == 8 or BATCH_TILE % 16 == 0 +$SIMD_TILE = BATCH_TILE // 16 + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +$_MM256_CVTEPX8_EPI32 = {"QS8": "_mm256_cvtepi8_epi32", "QU8": "_mm256_cvtepu8_epi32"}[DATATYPE] +$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] +void xnn_${DATATYPE.lower()}_vpreluc_ukernel__avx2_u${BATCH_TILE}( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + $if BATCH_TILE > 8: + for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) { + __m256i va0 = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a)); + + + $for N in range(1, 2*SIMD_TILE): + __m256i va${N} = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) (input_a + ${N * 8}))); + input_a += ${BATCH_TILE}; + + + $for N in range(2*SIMD_TILE): + __m256i va${N}_sub = _mm256_sub_epi32(va${N}, vinput_zero_point); + __m256i vcompare${N} = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va${N}_sub); + __m256i vacc${N} = _mm256_blendv_epi8(va${N}_sub, _mm256_mullo_epi32(va${N}_sub, vslope), vcompare${N}); + + $for N in range(2*SIMD_TILE): + __m256 vscale${N} = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare${N})); + __m256 vfpacc${N} = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc${N}), vscale${N}); + + $for N in range(2*SIMD_TILE): + __m256 vfpacc_clamped${N} = _mm256_min_ps(_mm256_max_ps(vfpacc${N}, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased${N} = _mm256_add_ps(vfpacc_clamped${N}, vmagic_bias); + __m256i vout${N} = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased${N}), vmagic_bias_less_output_zero_point); + + $for N in range(2*SIMD_TILE): + const __m128i vout_low${N} = _mm256_castsi256_si128(vout${N}); + const __m128i vout_high${N} = _mm256_extracti128_si256(vout${N}, 1); + const __m128i vout_packed${N} = _mm_packs_epi32(vout_low${N}, vout_high${N}); + __m128i vout_final${N} = ${_MM_PACKXS_EPI16}(vout_packed${N}, vout_packed${N}); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + $for N in range(1, 2*SIMD_TILE): + 
_mm_storeu_si64((__m128i*)(output + ${N*8}), vout_final${N}); + + output += ${BATCH_TILE}; + } + + for (; batch >= 8 * sizeof(${XINT8_T}); batch -= 8 * sizeof(${XINT8_T})) { + __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(${XINT8_T})); + assert(batch <= 7 * sizeof(${XINT8_T})); + + const __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed, vout_packed); + + if (batch & (4 * sizeof(${XINT8_T}))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(${XINT8_T}))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(${XINT8_T}))) { + *output = (${XINT8_T}) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c b/src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c new file mode 100644 index 00000000000..0be14824148 --- /dev/null +++ b/src/qs8-vpreluc/gen/qs8-vpreluc-avx2-u16.c @@ -0,0 +1,162 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. 
Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vpreluc_ukernel__avx2_u16( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { + __m256i va0 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + + + __m256i va1 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8))); + input_a += 16; + + + __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point); + __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va0_sub); + __m256i vacc0 = _mm256_blendv_epi8(va0_sub, _mm256_mullo_epi32(va0_sub, vslope), vcompare0); + __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point); + __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va1_sub); + __m256i vacc1 = _mm256_blendv_epi8(va1_sub, _mm256_mullo_epi32(va1_sub, vslope), vcompare1); + + __m256 vscale0 = 
_mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0)); + __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0); + __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1)); + __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1); + + __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias); + __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point); + __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased1 = _mm256_add_ps(vfpacc_clamped1, vmagic_bias); + __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point); + + const __m128i vout_low0 = _mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed0 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packs_epi16(vout_packed0, vout_packed0); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed1 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packs_epi16(vout_packed1, vout_packed1); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(int8_t)); + assert(batch <= 7 * sizeof(int8_t)); + + const __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), 
voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed, vout_packed); + + if (batch & (4 * sizeof(int8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(int8_t))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(int8_t))) { + *output = (int8_t) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c new file mode 100644 index 00000000000..9de87e0a41e --- /dev/null +++ b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u1.c @@ -0,0 +1,73 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
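// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the generated diff): the
// AVX2 vpreluc path above is branch-free. It builds a per-lane "is negative"
// mask with _mm256_cmpgt_epi32(0, x) and uses blendv to pick either the
// pass-through value (and positive multiplier) or the slope product (and
// negative multiplier). A minimal illustration of just that select idiom, on
// values that already have the input zero point subtracted:
#include <immintrin.h>
#include <stdint.h>

// out[i] = x[i] < 0 ? x[i] * slope : x[i], for 8 int32 lanes.
static inline __m256i prelu_core_avx2(__m256i vx, int32_t slope) {
  const __m256i vslope = _mm256_set1_epi32(slope);
  const __m256i vneg_mask = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vx);
  const __m256i vscaled = _mm256_mullo_epi32(vx, vslope);
  // blendv takes bytes from the second operand where the mask byte's sign bit
  // is set; the cmpgt mask is all-ones or all-zeros per 32-bit lane.
  return _mm256_blendv_epi8(vx, vscaled, vneg_mask);
}
// The float multiplier is selected the same way via _mm256_blendv_ps on the
// mask reinterpreted with _mm256_castsi256_ps.
// ---------------------------------------------------------------------------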
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vpreluc_ukernel__scalar_u1( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); +} diff --git a/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c new file mode 100644 index 00000000000..852603e354c --- /dev/null +++ b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u2.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vpreluc_ukernel__scalar_u2( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 2 * sizeof(int8_t); batch -= 2 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + input_a += 2; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output += 2; + } + + if XNN_UNLIKELY(batch != 0) { + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va >= 0) ? 
vpositive_multiplier : vnegative_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (int8_t) vout; + } +} diff --git a/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c new file mode 100644 index 00000000000..996f8739ef0 --- /dev/null +++ b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u4.c @@ -0,0 +1,124 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
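// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the generated diff): the
// "vpreluc" kernels apply a single broadcast slope read once from *input_b.
// Per element, the math above reduces to the following reference; parameter
// names mirror the fields of xnn_qs8_vprelu_scalar_params, and lrintf() stands
// in for the magic-bias rounding used by the kernels.
#include <math.h>
#include <stdint.h>

static inline int8_t qs8_preluc_ref(
    int8_t a, int8_t b,
    int32_t input_zero_point, int32_t slope_zero_point,
    float positive_multiplier, float negative_multiplier,
    int32_t output_zero_point, int8_t output_min, int8_t output_max) {
  const int32_t va = (int32_t) a - input_zero_point;
  const int32_t slope = (int32_t) b - slope_zero_point;
  // Negative inputs are multiplied by the slope and rescaled with the negative
  // multiplier; non-negative inputs are only rescaled.
  const int32_t acc = va < 0 ? va * slope : va;
  const float scale = va < 0 ? negative_multiplier : positive_multiplier;
  float fpacc = (float) acc * scale;
  const float out_min = (float) ((int32_t) output_min - output_zero_point);
  const float out_max = (float) ((int32_t) output_max - output_zero_point);
  fpacc = fpacc < out_min ? out_min : fpacc;
  fpacc = fpacc > out_max ? out_max : fpacc;
  return (int8_t) (lrintf(fpacc) + output_zero_point);
}
// ---------------------------------------------------------------------------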
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vpreluc_ukernel__scalar_u4( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + input_a += 4; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * slope : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * slope : va3; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c new file mode 100644 index 00000000000..36344d5210c --- /dev/null +++ b/src/qs8-vpreluc/gen/qs8-vpreluc-scalar-u8.c @@ -0,0 +1,160 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vpreluc_ukernel__scalar_u8( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * slope : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * slope : va3; + int32_t vacc4 = XNN_UNPREDICTABLE(va4 < 0) ? va4 * slope : va4; + int32_t vacc5 = XNN_UNPREDICTABLE(va5 < 0) ? va5 * slope : va5; + int32_t vacc6 = XNN_UNPREDICTABLE(va6 < 0) ? va6 * slope : va6; + int32_t vacc7 = XNN_UNPREDICTABLE(va7 < 0) ? va7 * slope : va7; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(va4 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(va5 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(va6 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(va7 < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output[4] = (int8_t) vout4; + output[5] = (int8_t) vout5; + output[6] = (int8_t) vout6; + output[7] = (int8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vpreluc/qs8-vpreluc.h b/src/qs8-vpreluc/qs8-vpreluc.h new file mode 100644 index 00000000000..403889fc111 --- /dev/null +++ b/src/qs8-vpreluc/qs8-vpreluc.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vpreluc_ukernel__avx2_u16, 16, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vpreluc_ukernel__scalar_u1, 1, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vpreluc_ukernel__scalar_u2, 2, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vpreluc_ukernel__scalar_u4, 4, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vpreluc_ukernel__scalar_u8, 8, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/qs8-vpreluc/scalar.c.in b/src/qs8-vpreluc/scalar.c.in new file mode 100644 index 00000000000..0a7dab0e10d --- /dev/null +++ b/src/qs8-vpreluc/scalar.c.in @@ -0,0 +1,129 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +$assert DATATYPE in ["QS8", "QU8"] +$assert BATCH_TILE >= 1 + +#include <assert.h> +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +void xnn_${DATATYPE.lower()}_vpreluc_ukernel__scalar_u${BATCH_TILE}( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + $if BATCH_TILE == 1: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + $else: + for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) { + $for N in range(BATCH_TILE): + const int32_t va${N} = input_a[${N}] - input_zero_point; + input_a += ${BATCH_TILE}; + + $for N in range(BATCH_TILE): + int32_t vacc${N} = XNN_UNPREDICTABLE(va${N} < 0) ? va${N} * slope : va${N}; + + $for N in range(BATCH_TILE): + float vscale${N} = XNN_UNPREDICTABLE(va${N} < 0) ? vnegative_multiplier : vpositive_multiplier; + + $for N in range(BATCH_TILE): + float vfpacc${N} = (float) vacc${N} * vscale${N}; + + $for N in range(BATCH_TILE): + vfpacc${N} = math_max_f32(vfpacc${N}, voutput_min_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} = math_min_f32(vfpacc${N}, voutput_max_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} += vmagic_bias; + + $for N in range(BATCH_TILE): + const int32_t vout${N} = (int32_t) float_as_uint32(vfpacc${N}) - vmagic_bias_less_output_zero_point; + + $for N in range(BATCH_TILE): + output[${N}] = (${XINT8_T}) vout${N}; + output += ${BATCH_TILE}; + } + + if XNN_UNLIKELY(batch != 0) { + $if BATCH_TILE == 2: + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va >= 0) ?
vpositive_multiplier : vnegative_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (${XINT8_T}) vout; + $else: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + } +} diff --git a/src/qs8-vrpreluc/avx2.c.in b/src/qs8-vrpreluc/avx2.c.in new file mode 100644 index 00000000000..df04f427419 --- /dev/null +++ b/src/qs8-vrpreluc/avx2.c.in @@ -0,0 +1,159 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +$assert DATATYPE in ["QS8", "QU8"] +$assert BATCH_TILE >= 8 +$assert BATCH_TILE == 8 or BATCH_TILE % 16 == 0 +$SIMD_TILE = BATCH_TILE // 16 + +#include <assert.h> +#include <immintrin.h> +#include <stdint.h> +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +$_MM256_CVTEPX8_EPI32 = {"QS8": "_mm256_cvtepi8_epi32", "QU8": "_mm256_cvtepu8_epi32"}[DATATYPE] +$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] +void xnn_${DATATYPE.lower()}_vrpreluc_ukernel__avx2_u${BATCH_TILE}( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.rprelu_positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + $if BATCH_TILE > 8: + for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) { + __m256i va0 = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a)); + + + $for N in range(1, 2*SIMD_TILE): + __m256i va${N} = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) (input_a + ${N * 8}))); + input_a += ${BATCH_TILE}; + + + $for N in range(2*SIMD_TILE): + __m256i va${N}_sub = _mm256_sub_epi32(va${N}, vinput_zero_point); + __m256i vcompare${N} = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc${N} = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va${N}_sub, vslope), vcompare${N}); + + $for N in range(2*SIMD_TILE): + __m256 vscale${N} = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare${N})); + __m256 vfpacc${N} = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc${N}), vscale${N}); + + $for N in range(2*SIMD_TILE): + __m256 vfpacc_clamped${N} = _mm256_min_ps(_mm256_max_ps(vfpacc${N}, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased${N} = _mm256_add_ps(vfpacc_clamped${N}, vmagic_bias); + __m256i vout${N} = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased${N}), vmagic_bias_less_output_zero_point); + + $for N in range(2*SIMD_TILE): + const __m128i vout_low${N} = _mm256_castsi256_si128(vout${N}); + const __m128i vout_high${N} = _mm256_extracti128_si256(vout${N}, 1); + const __m128i vout_packed${N} = _mm_packs_epi32(vout_low${N}, vout_high${N}); + __m128i vout_final${N} = ${_MM_PACKXS_EPI16}(vout_packed${N}, vout_packed${N}); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + $for N in range(1, 2*SIMD_TILE): +
_mm_storeu_si64((__m128i*)(output + ${N*8}), vout_final${N}); + + output += ${BATCH_TILE}; + } + + for (; batch >= 8 * sizeof(${XINT8_T}); batch -= 8 * sizeof(${XINT8_T})) { + __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(${XINT8_T})); + assert(batch <= 7 * sizeof(${XINT8_T})); + + const __m256i va = ${_MM256_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + const __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = ${_MM_PACKXS_EPI16}(vout_packed, vout_packed); + + if (batch & (4 * sizeof(${XINT8_T}))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(${XINT8_T}))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(${XINT8_T}))) { + *output = (${XINT8_T}) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c b/src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c new file mode 100644 index 00000000000..2a70f7a1f4f --- /dev/null +++ b/src/qs8-vrpreluc/gen/qs8-vrpreluc-avx2-u16.c @@ -0,0 +1,162 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. 
Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vrpreluc_ukernel__avx2_u16( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.rprelu_positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { + __m256i va0 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + + + __m256i va1 = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8))); + input_a += 16; + + + __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point); + __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc0 = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va0_sub, vslope), vcompare0); + __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point); + __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc1 = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va1_sub, vslope), vcompare1); + + __m256 vscale0 = 
_mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0)); + __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0); + __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1)); + __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1); + + __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias); + __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point); + __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased1 = _mm256_add_ps(vfpacc_clamped1, vmagic_bias); + __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point); + + const __m128i vout_low0 = _mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed0 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packs_epi16(vout_packed0, vout_packed0); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed1 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packs_epi16(vout_packed1, vout_packed1); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(int8_t)); + assert(batch <= 7 * sizeof(int8_t)); + + const __m256i va = _mm256_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + const __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), 
voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packs_epi16(vout_packed, vout_packed); + + if (batch & (4 * sizeof(int8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(int8_t))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(int8_t))) { + *output = (int8_t) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c new file mode 100644 index 00000000000..74113dfcc2c --- /dev/null +++ b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u1.c @@ -0,0 +1,73 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include <assert.h> +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vrpreluc_ukernel__scalar_u1( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); +} diff --git a/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c new file mode 100644 index 00000000000..f5169605632 --- /dev/null +++ b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u2.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vrpreluc_ukernel__scalar_u2( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 2 * sizeof(int8_t); batch -= 2 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + input_a += 2; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output += 2; + } + + if XNN_UNLIKELY(batch != 0) { + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (int8_t) vout; + } +} diff --git a/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c new file mode 100644 index 00000000000..8913be2e207 --- /dev/null +++ b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u4.c @@ -0,0 +1,124 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vrpreluc_ukernel__scalar_u4( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 4 * sizeof(int8_t); batch -= 4 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + input_a += 4; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + int32_t vacc2 = XNN_UNPREDICTABLE(slope < 0) ? va2 * slope : slope; + int32_t vacc3 = XNN_UNPREDICTABLE(slope < 0) ? va3 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c new file mode 100644 index 00000000000..ba74ff74a30 --- /dev/null +++ b/src/qs8-vrpreluc/gen/qs8-vrpreluc-scalar-u8.c @@ -0,0 +1,160 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qs8_vrpreluc_ukernel__scalar_u8( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + int32_t vacc2 = XNN_UNPREDICTABLE(slope < 0) ? va2 * slope : slope; + int32_t vacc3 = XNN_UNPREDICTABLE(slope < 0) ? va3 * slope : slope; + int32_t vacc4 = XNN_UNPREDICTABLE(slope < 0) ? va4 * slope : slope; + int32_t vacc5 = XNN_UNPREDICTABLE(slope < 0) ? va5 * slope : slope; + int32_t vacc6 = XNN_UNPREDICTABLE(slope < 0) ? va6 * slope : slope; + int32_t vacc7 = XNN_UNPREDICTABLE(slope < 0) ? va7 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (int8_t) vout0; + output[1] = (int8_t) vout1; + output[2] = (int8_t) vout2; + output[3] = (int8_t) vout3; + output[4] = (int8_t) vout4; + output[5] = (int8_t) vout5; + output[6] = (int8_t) vout6; + output[7] = (int8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (int8_t) vout; + batch -= sizeof(int8_t); + } while (batch != 0); + } +} diff --git a/src/qs8-vrpreluc/qs8-vrpreluc.h b/src/qs8-vrpreluc/qs8-vrpreluc.h new file mode 100644 index 00000000000..d499dff23e0 --- /dev/null +++ b/src/qs8-vrpreluc/qs8-vrpreluc.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vrpreluc_ukernel__avx2_u16, 16, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vrpreluc_ukernel__scalar_u1, 1, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vrpreluc_ukernel__scalar_u2, 2, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vrpreluc_ukernel__scalar_u4, 4, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vrpreluc_ukernel__scalar_u8, 8, false, int8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qs8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/qs8-vrpreluc/scalar.c.in b/src/qs8-vrpreluc/scalar.c.in new file mode 100644 index 00000000000..c7087e6032f --- /dev/null +++ b/src/qs8-vrpreluc/scalar.c.in @@ -0,0 +1,129 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +$assert DATATYPE in ["QS8", "QU8"] +$assert BATCH_TILE >= 1 + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +void xnn_${DATATYPE.lower()}_vrpreluc_ukernel__scalar_u${BATCH_TILE}( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + $if BATCH_TILE == 1: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + $else: + for (; batch >= ${BATCH_TILE} * sizeof(${XINT8_T}); batch -= ${BATCH_TILE} * sizeof(${XINT8_T})) { + $for N in range(BATCH_TILE): + const int32_t va${N} = input_a[${N}] - input_zero_point; + input_a += ${BATCH_TILE}; + + $for N in range(BATCH_TILE): + int32_t vacc${N} = XNN_UNPREDICTABLE(slope < 0) ? va${N} * slope : slope; + + $for N in range(BATCH_TILE): + float vscale${N} = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + + $for N in range(BATCH_TILE): + float vfpacc${N} = (float) vacc${N} * vscale${N}; + + $for N in range(BATCH_TILE): + vfpacc${N} = math_max_f32(vfpacc${N}, voutput_min_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} = math_min_f32(vfpacc${N}, voutput_max_less_zero_point); + + $for N in range(BATCH_TILE): + vfpacc${N} += vmagic_bias; + + $for N in range(BATCH_TILE): + const int32_t vout${N} = (int32_t) float_as_uint32(vfpacc${N}) - vmagic_bias_less_output_zero_point; + + $for N in range(BATCH_TILE): + output[${N}] = (${XINT8_T}) vout${N}; + output += ${BATCH_TILE}; + } + + if XNN_UNLIKELY(batch != 0) { + $if BATCH_TILE == 2: + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (${XINT8_T}) vout; + $else: + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (${XINT8_T}) vout; + batch -= sizeof(${XINT8_T}); + } while (batch != 0); + } +} diff --git a/src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c b/src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c new file mode 100644 index 00000000000..4c449c5e4e0 --- /dev/null +++ b/src/qu8-vprelu/gen/qu8-vprelu-avx2-u16.c @@ -0,0 +1,171 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vprelu_ukernel__avx2_u16( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256i vslope_zero_point = _mm256_set1_epi32(params->scalar.slope_zero_point); + const __m256i voutput_zero_point = _mm256_set1_epi32(params->scalar.output_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + + for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { + __m256i va0 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i vb0 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_b)); + + __m256i va1 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8))); + __m256i vb1 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) (input_b + 8))); + input_a += 16; + input_b += 16; + + __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point); + __m256i vb0_sub = _mm256_sub_epi32(vb0, vslope_zero_point); + __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va0_sub); + __m256i vacc0 = _mm256_blendv_epi8(va0_sub, _mm256_mullo_epi32(va0_sub, vb0_sub), vcompare0); + __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point); + __m256i vb1_sub = _mm256_sub_epi32(vb1, vslope_zero_point); + __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va1_sub); + __m256i vacc1 = _mm256_blendv_epi8(va1_sub, _mm256_mullo_epi32(va1_sub, vb1_sub), vcompare1); + + __m256 vscale0 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0)); + __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0); + __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1)); + __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1); + + __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias); + __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point); + __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased1 = _mm256_add_ps(vfpacc_clamped1, vmagic_bias); + __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point); + + const __m128i vout_low0 = 
_mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed160 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packus_epi16(vout_packed160, vout_packed160); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed161 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packus_epi16(vout_packed161, vout_packed161); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i vb = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_b)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + input_b+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed16, vout_packed16); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint8_t)); + assert(batch <= 7 * sizeof(uint8_t)); + + const __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i vb = _mm256_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) input_b)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vb_sub = _mm256_sub_epi32(vb, vslope_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vb_sub), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed16 = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed16, vout_packed16); + + if (batch & (4 * sizeof(uint8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(uint8_t))) { + 
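+      // Store two more output bytes, then shift right by 16 bits so that the last
+      // remaining byte (when batch is odd) lands in byte 0 for the final extract below.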
_mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(uint8_t))) { + *output = (uint8_t) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c new file mode 100644 index 00000000000..ada1fb5810c --- /dev/null +++ b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u1.c @@ -0,0 +1,76 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +void xnn_qu8_vprelu_ukernel__scalar_u1( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); +} diff --git a/src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c new file mode 100644 index 00000000000..7420ba1e7bc --- /dev/null +++ b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u2.c @@ -0,0 +1,110 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +void xnn_qu8_vprelu_ukernel__scalar_u2( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + for (; batch >= 2 * sizeof(uint8_t); batch -= 2 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + input_a += 2; + + const int32_t vb0 = input_b[0] - slope_zero_point; + const int32_t vb1 = input_b[1] - slope_zero_point; + input_b += 2; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output += 2; + } + + if XNN_UNLIKELY(batch != 0) { + const int32_t va = (int32_t) *input_a - input_zero_point; + const int32_t vb = (int32_t) *input_b - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (uint8_t) vout; + } +} diff --git a/src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c new file mode 100644 index 00000000000..feb88e84360 --- /dev/null +++ b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u4.c @@ -0,0 +1,133 @@ +// clang-format off +// Auto-generated file. Do not edit! 
+// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +void xnn_qu8_vprelu_ukernel__scalar_u4( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + input_a += 4; + + const int32_t vb0 = input_b[0] - slope_zero_point; + const int32_t vb1 = input_b[1] - slope_zero_point; + const int32_t vb2 = input_b[2] - slope_zero_point; + const int32_t vb3 = input_b[3] - slope_zero_point; + input_b += 4; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? 
va2 * vb2 : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * vb3 : va3; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c new file mode 100644 index 00000000000..b865f02a6c7 --- /dev/null +++ b/src/qu8-vprelu/gen/qu8-vprelu-scalar-u8.c @@ -0,0 +1,173 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vprelu/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. 
Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + + +void xnn_qu8_vprelu_ukernel__scalar_u8( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const int32_t slope_zero_point = params->scalar.slope_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + const int32_t vb0 = input_b[0] - slope_zero_point; + const int32_t vb1 = input_b[1] - slope_zero_point; + const int32_t vb2 = input_b[2] - slope_zero_point; + const int32_t vb3 = input_b[3] - slope_zero_point; + const int32_t vb4 = input_b[4] - slope_zero_point; + const int32_t vb5 = input_b[5] - slope_zero_point; + const int32_t vb6 = input_b[6] - slope_zero_point; + const int32_t vb7 = input_b[7] - slope_zero_point; + input_b += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * vb0 : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * vb1 : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * vb2 : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * vb3 : va3; + int32_t vacc4 = XNN_UNPREDICTABLE(va4 < 0) ? va4 * vb4 : va4; + int32_t vacc5 = XNN_UNPREDICTABLE(va5 < 0) ? 
va5 * vb5 : va5; + int32_t vacc6 = XNN_UNPREDICTABLE(va6 < 0) ? va6 * vb6 : va6; + int32_t vacc7 = XNN_UNPREDICTABLE(va7 < 0) ? va7 * vb7 : va7; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(va4 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(va5 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(va6 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(va7 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output[4] = (uint8_t) vout4; + output[5] = (uint8_t) vout5; + output[6] = (uint8_t) vout6; + output[7] = (uint8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { 
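+      // Scalar tail: center the input and slope on their zero points, apply the slope only to
+      // negative inputs, rescale by the sign-specific multiplier, clamp, and round via the
+      // magic-bias trick: adding 12582912.0f (0x4B400000) places the rounded integer in the low
+      // mantissa bits, so reinterpreting the float bits and subtracting the bias (folded with
+      // the output zero point) yields the requantized result.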
+ const int32_t va = (int32_t) *input_a++ - input_zero_point; + const int32_t vb = (int32_t) *input_b++ - slope_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * vb : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vprelu/qu8-vprelu.h b/src/qu8-vprelu/qu8-vprelu.h new file mode 100644 index 00000000000..860c35f2d56 --- /dev/null +++ b/src/qu8-vprelu/qu8-vprelu.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vprelu_ukernel__avx2_u16, 16, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vprelu_ukernel__scalar_u1, 1, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vprelu_ukernel__scalar_u2, 2, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vprelu_ukernel__scalar_u4, 4, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vprelu_ukernel__scalar_u8, 8, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c b/src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c new file mode 100644 index 00000000000..16eea7bf320 --- /dev/null +++ b/src/qu8-vpreluc/gen/qu8-vpreluc-avx2-u16.c @@ -0,0 +1,162 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vpreluc_ukernel__avx2_u16( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { + __m256i va0 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + + + __m256i va1 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8))); + input_a += 16; + + + __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point); + __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va0_sub); + __m256i vacc0 = _mm256_blendv_epi8(va0_sub, _mm256_mullo_epi32(va0_sub, vslope), vcompare0); + __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point); + __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va1_sub); + __m256i vacc1 = _mm256_blendv_epi8(va1_sub, _mm256_mullo_epi32(va1_sub, vslope), vcompare1); + + __m256 vscale0 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0)); + __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0); + __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1)); + __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1); + + __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias); + __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point); + __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased1 = 
_mm256_add_ps(vfpacc_clamped1, vmagic_bias); + __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point); + + const __m128i vout_low0 = _mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed0 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packus_epi16(vout_packed0, vout_packed0); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed1 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packus_epi16(vout_packed1, vout_packed1); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint8_t)); + assert(batch <= 7 * sizeof(uint8_t)); + + const __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), va_sub); + const __m256i vacc = _mm256_blendv_epi8(va_sub, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed, vout_packed); + + if (batch & (4 * sizeof(uint8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(uint8_t))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(uint8_t))) { + *output = 
(uint8_t) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c new file mode 100644 index 00000000000..2a4b5d61ef6 --- /dev/null +++ b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u1.c @@ -0,0 +1,73 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vpreluc_ukernel__scalar_u1( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); +} diff --git a/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c new file mode 100644 index 00000000000..2e6ca54fe52 --- /dev/null +++ b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u2.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vpreluc_ukernel__scalar_u2( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 2 * sizeof(uint8_t); batch -= 2 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + input_a += 2; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output += 2; + } + + if XNN_UNLIKELY(batch != 0) { + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va >= 0) ? vpositive_multiplier : vnegative_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (uint8_t) vout; + } +} diff --git a/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c new file mode 100644 index 00000000000..52f6af22463 --- /dev/null +++ b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u4.c @@ -0,0 +1,124 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. 
Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vpreluc_ukernel__scalar_u4( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + input_a += 4; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * slope : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * slope : va3; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c new file mode 100644 index 00000000000..e882addc000 --- /dev/null +++ b/src/qu8-vpreluc/gen/qu8-vpreluc-scalar-u8.c @@ -0,0 +1,160 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vpreluc_ukernel__scalar_u8( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(va0 < 0) ? va0 * slope : va0; + int32_t vacc1 = XNN_UNPREDICTABLE(va1 < 0) ? va1 * slope : va1; + int32_t vacc2 = XNN_UNPREDICTABLE(va2 < 0) ? va2 * slope : va2; + int32_t vacc3 = XNN_UNPREDICTABLE(va3 < 0) ? va3 * slope : va3; + int32_t vacc4 = XNN_UNPREDICTABLE(va4 < 0) ? va4 * slope : va4; + int32_t vacc5 = XNN_UNPREDICTABLE(va5 < 0) ? va5 * slope : va5; + int32_t vacc6 = XNN_UNPREDICTABLE(va6 < 0) ? va6 * slope : va6; + int32_t vacc7 = XNN_UNPREDICTABLE(va7 < 0) ? va7 * slope : va7; + + float vscale0 = XNN_UNPREDICTABLE(va0 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(va1 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(va2 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(va3 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(va4 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(va5 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(va6 < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(va7 < 0) ? 
vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output[4] = (uint8_t) vout4; + output[5] = (uint8_t) vout5; + output[6] = (uint8_t) vout6; + output[7] = (uint8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(va < 0) ? va * slope : va; + float vscale = XNN_UNPREDICTABLE(va < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vpreluc/qu8-vpreluc.h b/src/qu8-vpreluc/qu8-vpreluc.h new file mode 100644 index 00000000000..d37edff2bf0 --- /dev/null +++ b/src/qu8-vpreluc/qu8-vpreluc.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
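Taken together, the u2/u4/u8 variants above implement the same per-element computation with different unroll factors. A compact reference restatement follows; it is a sketch rather than XNNPACK code: parameters are passed individually instead of through union xnn_qs8_vprelu_scalar_params, and lrintf replaces the magic-bias rounding, which matches it under the default round-to-nearest-even mode.

#include <math.h>
#include <stddef.h>
#include <stdint.h>

// Reference model of xnn_qu8_vpreluc_ukernel__scalar_*: the slope is a single
// quantized value (*input_b), and both the slope branch and the rescaling
// multiplier are selected by the sign of the zero-point-adjusted input element.
static void qu8_vpreluc_reference(
    size_t batch, const uint8_t* input_a, const uint8_t* input_b,
    uint8_t* output,
    int32_t input_zero_point, int32_t slope_zero_point, int32_t output_zero_point,
    float positive_multiplier, float negative_multiplier,
    int32_t output_min, int32_t output_max) {
  const int32_t slope = (int32_t) *input_b - slope_zero_point;
  for (size_t i = 0; i < batch; i++) {
    const int32_t va = (int32_t) input_a[i] - input_zero_point;
    const int32_t vacc = va < 0 ? va * slope : va;
    const float vscale = va < 0 ? negative_multiplier : positive_multiplier;
    float vfpacc = (float) vacc * vscale;
    vfpacc = fminf(fmaxf(vfpacc, (float) (output_min - output_zero_point)),
                   (float) (output_max - output_zero_point));
    output[i] = (uint8_t) (lrintf(vfpacc) + output_zero_point);
  }
}

The trailing "c" follows the existing vaddc/vmulc naming: input_b contributes one broadcast quantized slope, read once before the loop, rather than a per-element slope tensor.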
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vpreluc_ukernel__avx2_u16, 16, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vpreluc_ukernel__scalar_u1, 1, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vpreluc_ukernel__scalar_u2, 2, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vpreluc_ukernel__scalar_u4, 4, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vpreluc_ukernel__scalar_u8, 8, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c b/src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c new file mode 100644 index 00000000000..a79c185e155 --- /dev/null +++ b/src/qu8-vrpreluc/gen/qu8-vrpreluc-avx2-u16.c @@ -0,0 +1,162 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/avx2.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include +#include +#include "src/xnnpack/intrinsics-polyfill.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vrpreluc_ukernel__avx2_u16( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const __m256i vinput_zero_point = _mm256_set1_epi32(params->scalar.input_zero_point); + const __m256 vpositive_multiplier = _mm256_set1_ps(params->scalar.rprelu_positive_multiplier); + const __m256 vnegative_multiplier = _mm256_set1_ps(params->scalar.negative_multiplier); + const __m256 voutput_min_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const __m256 vmagic_bias = _mm256_set1_ps(12582912.0f); + const __m256i vmagic_bias_less_output_zero_point = _mm256_set1_epi32(INT32_C(0x4B400000) - (int32_t)params->scalar.output_zero_point); + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + const __m256i vslope = _mm256_set1_epi32(slope); + for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { + __m256i va0 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + + + __m256i va1 = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) (input_a + 8))); + input_a += 16; + + + __m256i va0_sub = _mm256_sub_epi32(va0, vinput_zero_point); + __m256i vcompare0 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc0 = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va0_sub, vslope), vcompare0); + __m256i va1_sub = _mm256_sub_epi32(va1, vinput_zero_point); + __m256i vcompare1 = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc1 = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va1_sub, vslope), vcompare1); + + __m256 vscale0 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare0)); + __m256 vfpacc0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc0), vscale0); + __m256 vscale1 = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare1)); + __m256 vfpacc1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc1), vscale1); + + __m256 vfpacc_clamped0 = _mm256_min_ps(_mm256_max_ps(vfpacc0, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased0 = _mm256_add_ps(vfpacc_clamped0, vmagic_bias); + __m256i vout0 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased0), vmagic_bias_less_output_zero_point); + __m256 vfpacc_clamped1 = _mm256_min_ps(_mm256_max_ps(vfpacc1, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased1 = 
_mm256_add_ps(vfpacc_clamped1, vmagic_bias); + __m256i vout1 = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased1), vmagic_bias_less_output_zero_point); + + const __m128i vout_low0 = _mm256_castsi256_si128(vout0); + const __m128i vout_high0 = _mm256_extracti128_si256(vout0, 1); + const __m128i vout_packed0 = _mm_packs_epi32(vout_low0, vout_high0); + __m128i vout_final0 = _mm_packus_epi16(vout_packed0, vout_packed0); + const __m128i vout_low1 = _mm256_castsi256_si128(vout1); + const __m128i vout_high1 = _mm256_extracti128_si256(vout1, 1); + const __m128i vout_packed1 = _mm_packs_epi32(vout_low1, vout_high1); + __m128i vout_final1 = _mm_packus_epi16(vout_packed1, vout_packed1); + + _mm_storeu_si64((__m128i*)(output), vout_final0); + + _mm_storeu_si64((__m128i*)(output + 8), vout_final1); + + output += 16; + } + + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si64((const __m128i*) input_a)); + __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + input_a+=8; + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed, vout_packed); + _mm_storeu_si64((__m128i*) output, vout_final); + output+=8; + + + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint8_t)); + assert(batch <= 7 * sizeof(uint8_t)); + + const __m256i va = _mm256_cvtepu8_epi32(_mm_loadu_si128((const __m128i*) input_a)); + const __m256i va_sub = _mm256_sub_epi32(va, vinput_zero_point); + const __m256i vcompare = _mm256_cmpgt_epi32(_mm256_setzero_si256(), vslope); + const __m256i vacc = _mm256_blendv_epi8(vslope, _mm256_mullo_epi32(va_sub, vslope), vcompare); + const __m256 vscale = _mm256_blendv_ps(vpositive_multiplier, vnegative_multiplier, _mm256_castsi256_ps(vcompare)); + const __m256 vfpacc = _mm256_mul_ps(_mm256_cvtepi32_ps(vacc), vscale); + const __m256 vfpacc_clamped = _mm256_min_ps(_mm256_max_ps(vfpacc, voutput_min_less_zero_point), voutput_max_less_zero_point); + const __m256 vfpacc_biased = _mm256_add_ps(vfpacc_clamped, vmagic_bias); + const __m256i vout = _mm256_sub_epi32(_mm256_castps_si256(vfpacc_biased), vmagic_bias_less_output_zero_point); + const __m128i vout_low = _mm256_castsi256_si128(vout); + const __m128i vout_high = _mm256_extracti128_si256(vout, 1); + const __m128i vout_packed = _mm_packs_epi32(vout_low, vout_high); + __m128i vout_final = _mm_packus_epi16(vout_packed, vout_packed); + + if (batch & (4 * sizeof(uint8_t))) { + _mm_storeu_si32(output, vout_final); + vout_final = _mm_srli_epi64(vout_final, 32); + output += 4; + } + + if (batch & (2 * sizeof(uint8_t))) { + _mm_storeu_si16(output, vout_final); + vout_final = _mm_srli_epi32(vout_final, 16); + output += 2; + } + if (batch & (1 * sizeof(uint8_t))) { + 
*output = (uint8_t) _mm_extract_epi8(vout_final, 0); + } + } +} diff --git a/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c new file mode 100644 index 00000000000..30d699efe88 --- /dev/null +++ b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u1.c @@ -0,0 +1,73 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vrpreluc_ukernel__scalar_u1( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); +} diff --git a/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c new file mode 100644 index 00000000000..b3fc0aaec21 --- /dev/null +++ b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u2.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vrpreluc_ukernel__scalar_u2( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 2 * sizeof(uint8_t); batch -= 2 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + input_a += 2; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output += 2; + } + + if XNN_UNLIKELY(batch != 0) { + const int32_t va = (int32_t) *input_a - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output = (uint8_t) vout; + } +} diff --git a/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c new file mode 100644 index 00000000000..c5dfaa0f9ea --- /dev/null +++ b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u4.c @@ -0,0 +1,124 @@ +// clang-format off +// Auto-generated file. Do not edit! 
+// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vrpreluc_ukernel__scalar_u4( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 4 * sizeof(uint8_t); batch -= 4 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + input_a += 4; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + int32_t vacc2 = XNN_UNPREDICTABLE(slope < 0) ? va2 * slope : slope; + int32_t vacc3 = XNN_UNPREDICTABLE(slope < 0) ? va3 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output += 4; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c new file mode 100644 index 00000000000..259b81e234a --- /dev/null +++ b/src/qu8-vrpreluc/gen/qu8-vrpreluc-scalar-u8.c @@ -0,0 +1,160 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/qs8-vrpreluc/scalar.c.in +// Generator: tools/xngen +// +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// +// SPDX-License-Identifier: BSD-3-Clause + + +#include +#include "src/xnnpack/math.h" +#include "src/xnnpack/vbinary.h" + +void xnn_qu8_vrpreluc_ukernel__scalar_u8( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t input_zero_point = params->scalar.input_zero_point; + const float vpositive_multiplier = params->scalar.rprelu_positive_multiplier; + const float vnegative_multiplier = params->scalar.negative_multiplier; + const float voutput_min_less_zero_point = (int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point; + const float voutput_max_less_zero_point = (int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point; + const float vmagic_bias = 12582912.0f; + const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; + const int32_t slope = (int32_t) *input_b - params->scalar.slope_zero_point; + for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { + const int32_t va0 = input_a[0] - input_zero_point; + const int32_t va1 = input_a[1] - input_zero_point; + const int32_t va2 = input_a[2] - input_zero_point; + const int32_t va3 = input_a[3] - input_zero_point; + const int32_t va4 = input_a[4] - input_zero_point; + const int32_t va5 = input_a[5] - input_zero_point; + const int32_t va6 = input_a[6] - input_zero_point; + const int32_t va7 = input_a[7] - input_zero_point; + input_a += 8; + + int32_t vacc0 = XNN_UNPREDICTABLE(slope < 0) ? va0 * slope : slope; + int32_t vacc1 = XNN_UNPREDICTABLE(slope < 0) ? va1 * slope : slope; + int32_t vacc2 = XNN_UNPREDICTABLE(slope < 0) ? va2 * slope : slope; + int32_t vacc3 = XNN_UNPREDICTABLE(slope < 0) ? va3 * slope : slope; + int32_t vacc4 = XNN_UNPREDICTABLE(slope < 0) ? va4 * slope : slope; + int32_t vacc5 = XNN_UNPREDICTABLE(slope < 0) ? va5 * slope : slope; + int32_t vacc6 = XNN_UNPREDICTABLE(slope < 0) ? va6 * slope : slope; + int32_t vacc7 = XNN_UNPREDICTABLE(slope < 0) ? va7 * slope : slope; + + float vscale0 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale1 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale2 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale3 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale4 = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vscale5 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale6 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + float vscale7 = XNN_UNPREDICTABLE(slope < 0) ? vnegative_multiplier : vpositive_multiplier; + + float vfpacc0 = (float) vacc0 * vscale0; + float vfpacc1 = (float) vacc1 * vscale1; + float vfpacc2 = (float) vacc2 * vscale2; + float vfpacc3 = (float) vacc3 * vscale3; + float vfpacc4 = (float) vacc4 * vscale4; + float vfpacc5 = (float) vacc5 * vscale5; + float vfpacc6 = (float) vacc6 * vscale6; + float vfpacc7 = (float) vacc7 * vscale7; + + vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); + vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); + vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); + vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); + vfpacc4 = math_max_f32(vfpacc4, voutput_min_less_zero_point); + vfpacc5 = math_max_f32(vfpacc5, voutput_min_less_zero_point); + vfpacc6 = math_max_f32(vfpacc6, voutput_min_less_zero_point); + vfpacc7 = math_max_f32(vfpacc7, voutput_min_less_zero_point); + + vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); + vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); + vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); + vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); + vfpacc4 = math_min_f32(vfpacc4, voutput_max_less_zero_point); + vfpacc5 = math_min_f32(vfpacc5, voutput_max_less_zero_point); + vfpacc6 = math_min_f32(vfpacc6, voutput_max_less_zero_point); + vfpacc7 = math_min_f32(vfpacc7, voutput_max_less_zero_point); + + vfpacc0 += vmagic_bias; + vfpacc1 += vmagic_bias; + vfpacc2 += vmagic_bias; + vfpacc3 += vmagic_bias; + vfpacc4 += vmagic_bias; + vfpacc5 += vmagic_bias; + vfpacc6 += vmagic_bias; + vfpacc7 += vmagic_bias; + + const int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; + const int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; + const int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; + const int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; + const int32_t vout4 = (int32_t) float_as_uint32(vfpacc4) - vmagic_bias_less_output_zero_point; + const int32_t vout5 = (int32_t) float_as_uint32(vfpacc5) - vmagic_bias_less_output_zero_point; + const int32_t vout6 = (int32_t) float_as_uint32(vfpacc6) - vmagic_bias_less_output_zero_point; + const int32_t vout7 = (int32_t) float_as_uint32(vfpacc7) - vmagic_bias_less_output_zero_point; + + output[0] = (uint8_t) vout0; + output[1] = (uint8_t) vout1; + output[2] = (uint8_t) vout2; + output[3] = (uint8_t) vout3; + output[4] = (uint8_t) vout4; + output[5] = (uint8_t) vout5; + output[6] = (uint8_t) vout6; + output[7] = (uint8_t) vout7; + output += 8; + } + + if XNN_UNLIKELY(batch != 0) { + do { + const int32_t va = (int32_t) *input_a++ - input_zero_point; + int32_t vacc = XNN_UNPREDICTABLE(slope < 0) ? va * slope : slope; + float vscale = XNN_UNPREDICTABLE(slope < 0) ? 
vnegative_multiplier : vpositive_multiplier; + float vfpacc = (float) vacc * vscale; + vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); + vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); + vfpacc += vmagic_bias; + const int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; + *output++ = (uint8_t) vout; + batch -= sizeof(uint8_t); + } while (batch != 0); + } +} diff --git a/src/qu8-vrpreluc/qu8-vrpreluc.h b/src/qu8-vrpreluc/qu8-vrpreluc.h new file mode 100644 index 00000000000..4d9f66110bc --- /dev/null +++ b/src/qu8-vrpreluc/qu8-vrpreluc.h @@ -0,0 +1,59 @@ +// Copyright (C) 2024 Intel Corporation +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
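The reversed-operand variant above swaps the roles of the two inputs relative to qu8-vpreluc: the broadcast value read from *input_b is the quantity being activated and each input_a element supplies the slope, so both the branch and the multiplier selection depend only on the sign of the broadcast value. A reference sketch under the same caveats as the qu8-vpreluc one (individual parameters instead of the params union, lrintf in place of the magic-bias rounding):

#include <math.h>
#include <stddef.h>
#include <stdint.h>

// Reference model of xnn_qu8_vrpreluc_ukernel__scalar_*: evaluates prelu(b, a)
// with b broadcast, keeping the generated code's "slope" naming for the
// zero-point-adjusted broadcast operand.
static void qu8_vrpreluc_reference(
    size_t batch, const uint8_t* input_a, const uint8_t* input_b,
    uint8_t* output,
    int32_t input_zero_point, int32_t slope_zero_point, int32_t output_zero_point,
    float rprelu_positive_multiplier, float negative_multiplier,
    int32_t output_min, int32_t output_max) {
  const int32_t slope = (int32_t) *input_b - slope_zero_point;
  for (size_t i = 0; i < batch; i++) {
    const int32_t va = (int32_t) input_a[i] - input_zero_point;
    const int32_t vacc = slope < 0 ? va * slope : slope;
    const float vscale =
        slope < 0 ? negative_multiplier : rprelu_positive_multiplier;
    float vfpacc = (float) vacc * vscale;
    vfpacc = fminf(fmaxf(vfpacc, (float) (output_min - output_zero_point)),
                   (float) (output_max - output_zero_point));
    output[i] = (uint8_t) (lrintf(vfpacc) + output_zero_point);
  }
}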
+// +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) +#define XNN_DEFINED_UKERNEL_WITH_PARAMS +#endif + +#ifndef XNN_UKERNEL +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, vector_tile, datatype) \ + XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, void, /*init_params=*/nullptr) +#define XNN_DEFINED_UKERNEL +#endif + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vrpreluc_ukernel__avx2_u16, 16, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vrpreluc_ukernel__scalar_u1, 1, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vrpreluc_ukernel__scalar_u2, 2, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vrpreluc_ukernel__scalar_u4, 4, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vrpreluc_ukernel__scalar_u8, 8, false, uint8_t, union xnn_qs8_vprelu_scalar_params, xnn_init_qu8_vprelu_scalar_params) + +#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_DEFINED_UKERNEL_WITH_PARAMS +#undef XNN_UKERNEL_WITH_PARAMS +#endif + +#ifdef XNN_DEFINED_UKERNEL +#undef XNN_DEFINED_UKERNEL +#undef XNN_UKERNEL +#endif diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index dd29c5c381b..1e870773edc 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -67,9 +67,13 @@ xnn_init_qs8_vadd_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_qs8_vmul_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* +xnn_init_qs8_vprelu_config(); +XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_qu8_vadd_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_qu8_vmul_config(); +XNN_INTERNAL const struct xnn_binary_elementwise_config* +xnn_init_qu8_vprelu_config(); XNN_INTERNAL const struct xnn_unary_elementwise_config* xnn_init_f16_abs_config(); diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index 2e9a11629e5..f1d40a546f4 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -982,6 +982,22 @@ typedef void (*xnn_qu8_vmul_minmax_ukernel_fn)( const union xnn_qu8_mul_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + + // VPRELU: Vector PRELU elementwise + +typedef void (*xnn_qs8_vprelu_ukernel_fn)( + size_t batch, const int8_t* input_x, const int8_t* input_y, + int8_t* output, + const union xnn_qs8_vprelu_scalar_params + params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + +typedef void (*xnn_qu8_vprelu_ukernel_fn)( + size_t batch, const uint8_t* input_x, const uint8_t* input_y, + uint8_t* output, + const union xnn_qs8_vprelu_scalar_params + params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + + /***************** Microkernel pointers for sparse inference *****************/ // SpMM: Sparse Matrix-Matrix multiplication @@ -1180,6 +1196,18 @@ typedef size_t (*xnn_init_qu8_mul_minmax_params_fn)( const struct xnn_quantization_params* b_quantization, const struct xnn_quantization_params* output_quantization); +typedef size_t(*xnn_init_qs8_vprelu_params_fn)( + union 
xnn_qs8_vprelu_scalar_params params[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); + +typedef size_t(*xnn_init_qu8_vprelu_params_fn)( + union xnn_qs8_vprelu_scalar_params params[XNN_MIN_ELEMENTS(1)], + const struct xnn_quantization_params* a_quantization, + const struct xnn_quantization_params* b_quantization, + const struct xnn_quantization_params* output_quantization); + typedef size_t (*xnn_init_bf16_default_params_fn)( struct xnn_bf16_default_params params[XNN_MIN_ELEMENTS(1)]); diff --git a/src/xnnpack/microparams-init.h b/src/xnnpack/microparams-init.h index 0b6c9066fa9..b7aad90ee29 100644 --- a/src/xnnpack/microparams-init.h +++ b/src/xnnpack/microparams-init.h @@ -250,6 +250,25 @@ DECLARE_INIT_QU8_MUL_MINMAX_PARAMS_FUNCTION( xnn_init_qu8_mul_minmax_rndnu_neon_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#define DECLARE_INIT_QS8_VPRELU_PARAMS_FUNCTION(fn_name) \ + XNN_INTERNAL size_t fn_name( \ + union xnn_qs8_vprelu_scalar_params uparams[XNN_MIN_ELEMENTS(1)], \ + const struct xnn_quantization_params* a_quantization, \ + const struct xnn_quantization_params* b_quantization, \ + const struct xnn_quantization_params* output_quantization); + +DECLARE_INIT_QS8_VPRELU_PARAMS_FUNCTION(xnn_init_qs8_vprelu_scalar_params) + +#define DECLARE_INIT_QU8_VPRELU_PARAMS_FUNCTION(fn_name) \ + XNN_INTERNAL size_t fn_name( \ + union xnn_qs8_vprelu_scalar_params uparams[XNN_MIN_ELEMENTS(1)], \ + const struct xnn_quantization_params* a_quantization, \ + const struct xnn_quantization_params* b_quantization, \ + const struct xnn_quantization_params* output_quantization); + +DECLARE_INIT_QU8_VPRELU_PARAMS_FUNCTION(xnn_init_qu8_vprelu_scalar_params) + + #ifdef __cplusplus } // extern "C" #endif diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index dfa52593c4d..3575f324bd5 100644 --- a/src/xnnpack/microparams.h +++ b/src/xnnpack/microparams.h @@ -145,8 +145,22 @@ struct xnn_u8_minmax_params { } scalar; }; -// Conv w. Min+Max: used by quantized GEMM/IGEMM/DWCONV microkernels with MINMAX -// activation. +// VPReLU: used by VPRELU microkernels. + +union xnn_qs8_vprelu_scalar_params { + struct { + int32_t input_zero_point; + int32_t slope_zero_point; + int32_t output_zero_point; + float positive_multiplier; + float rprelu_positive_multiplier; + float negative_multiplier; + int32_t output_min; + int32_t output_max; + } scalar; +}; + +// Conv w. Min+Max: used by quantized GEMM/IGEMM/DWCONV microkernels with MINMAX activation. 
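The three multiplier fields in the new params union map onto the three rescaling cases the kernels need: the ordinary positive branch passes the zero-point-adjusted input through, the reversed positive branch passes the broadcast operand through, and the shared negative branch rescales a product of two quantized values. The init functions declared here are not part of this excerpt, so the following is only a sketch of how those fields are plausibly derived from the a/b/output quantization parameters, matching how the kernels consume them; the quantization struct layout and the full-range clamp values are assumptions.

#include <stdint.h>

// Illustrative stand-ins, not XNNPACK types.
struct quantization { int32_t zero_point; float scale; };

struct vprelu_params {  // mirrors union xnn_qs8_vprelu_scalar_params.scalar
  int32_t input_zero_point, slope_zero_point, output_zero_point;
  float positive_multiplier, rprelu_positive_multiplier, negative_multiplier;
  int32_t output_min, output_max;
};

static void init_vprelu_params_sketch(struct vprelu_params* p,
                                      const struct quantization* a,
                                      const struct quantization* b,
                                      const struct quantization* out) {
  p->input_zero_point = a->zero_point;
  p->slope_zero_point = b->zero_point;
  p->output_zero_point = out->zero_point;
  // Positive branches pass one zero-point-adjusted quantized value through,
  // so they only need that value's scale divided by the output scale.
  p->positive_multiplier = a->scale / out->scale;
  p->rprelu_positive_multiplier = b->scale / out->scale;
  // The negative branch multiplies two quantized values, so both scales appear.
  p->negative_multiplier = a->scale * b->scale / out->scale;
  p->output_min = 0;    // full uint8 range shown; real operators may clamp tighter
  p->output_max = 255;
}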
struct xnn_qd8_quantization_params { int32_t zero_point; float inv_scale; @@ -392,6 +406,8 @@ union xnn_binary_uparams { struct xnn_qu8_add_minmax_params qu8_addsub; union xnn_qs8_mul_minmax_params qs8_mul; union xnn_qu8_mul_minmax_params qu8_mul; + union xnn_qs8_vprelu_scalar_params qs8_vprelu; + union xnn_qs8_vprelu_scalar_params qu8_vprelu; struct xnn_f16_minmax_params f16; struct xnn_f32_minmax_params f32; struct xnn_binary_reference_params reference; diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index 65e9c9dfe02..7b6d51d473f 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -324,6 +324,7 @@ struct xnn_operator { const struct xnn_binary_elementwise_config* vadd_config; const struct xnn_binary_elementwise_config* vmul_config; const struct xnn_unary_elementwise_config* vtanh_config; + const struct xnn_binary_elementwise_config* vprelu_config; enum xnn_attention_logits_cap_type cap_type; struct xnn_attention_logits_cap_tanh_params cap_params; } attention; // For attention operator. diff --git a/src/xnnpack/vbinary.h b/src/xnnpack/vbinary.h index e9fa14244fb..31c59eadb75 100644 --- a/src/xnnpack/vbinary.h +++ b/src/xnnpack/vbinary.h @@ -88,6 +88,9 @@ extern "C" { #include "src/qu8-vmul/qu8-vmul-minmax-rndnu.h" #include "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h" #include "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h" +#include "src/qu8-vprelu/qu8-vprelu.h" +#include "src/qu8-vpreluc/qu8-vpreluc.h" +#include "src/qu8-vrpreluc/qu8-vrpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ @@ -101,6 +104,9 @@ extern "C" { #include "src/qs8-vmul/qs8-vmul-minmax-rndnu.h" #include "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h" #include "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h" +#include "src/qs8-vprelu/qs8-vprelu.h" +#include "src/qs8-vpreluc/qs8-vpreluc.h" +#include "src/qs8-vrpreluc/qs8-vrpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS #ifdef __cplusplus diff --git a/test/BUILD.bazel b/test/BUILD.bazel index 158601b7985..73401ce31af 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -251,12 +251,18 @@ xnnpack_cxx_library( "qs8_vmul_minmax_rndnu", "qs8_vmulc_minmax_fp32", "qs8_vmulc_minmax_rndnu", + "qs8_vprelu", + "qs8_vpreluc", + "qs8_vrpreluc", "qu8_vadd_minmax", "qu8_vaddc_minmax", "qu8_vmul_minmax_fp32", "qu8_vmul_minmax_rndnu", "qu8_vmulc_minmax_fp32", "qu8_vmulc_minmax_rndnu", + "qu8_vprelu", + "qu8_vpreluc", + "qu8_vrpreluc", ]] [xnnpack_unit_test( diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4f470f678a0..29714ece197 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -319,12 +319,18 @@ SET(MICROKERNEL_VBINARY_UNIT_TESTS qs8-vaddc-minmax qs8-vmul-minmax-fp32 qs8-vmulc-minmax-fp32 + qs8-vprelu + qs8-vpreluc + qs8-vrpreluc qu8-vadd-minmax qu8-vaddc-minmax qu8-vmul-minmax-fp32 qu8-vmul-minmax-rndnu qu8-vmulc-minmax-fp32 - qu8-vmulc-minmax-rndnu) + qu8-vmulc-minmax-rndnu + qu8-vprelu + qu8-vpreluc + qu8-vrpreluc) FOREACH(TEST ${MICROKERNEL_VBINARY_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test ${TEST}.cc) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE diff --git a/test/qs8-vprelu.cc b/test/qs8-vprelu.cc new file mode 100644 index 00000000000..3223267225d --- /dev/null +++ b/test/qs8-vprelu.cc @@ -0,0 +1,32 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! 
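Both new per-datatype headers follow the existing X-macro convention: a consumer defines XNN_UKERNEL_WITH_PARAMS before including the header and gets one expansion per declared microkernel, which is how the vbinary.h hunk above and the generated tests below enumerate the new kernels. A hypothetical consumer building a flat dispatch table is sketched here; the entry struct is illustrative, and the usual XNNPACK headers (common.h, hardware-config.h, microfnptr.h) are assumed to be in scope so that XNN_ARCH_*, xnn_arch_x86_avx2 and xnn_qu8_vprelu_ukernel_fn resolve.

#include <stddef.h>
#include <stdint.h>

struct qu8_vpreluc_entry {
  uint64_t arch_flags;           // e.g. xnn_arch_x86_avx2, or 0 for scalar
  xnn_qu8_vprelu_ukernel_fn fn;  // typedef added to microfnptr.h by this change
  size_t batch_tile;
};

#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
                                datatype, params_type, init_params)           \
  { (uint64_t) (arch_flags), (ukernel), (batch_tile) },
static const struct qu8_vpreluc_entry qu8_vpreluc_table[] = {
#include "src/qu8-vpreluc/qu8-vpreluc.h"
};
#undef XNN_UKERNEL_WITH_PARAMS

The header defines XNN_UKERNEL in terms of XNN_UKERNEL_WITH_PARAMS when the consumer has not, and undefines only the macros it defined itself, so the consumer-side #undef above is still required.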
+// Microkernel: qs8-vprelu
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);
+#include "src/qs8-vprelu/qs8-vprelu.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/qs8-vpreluc.cc b/test/qs8-vpreluc.cc
new file mode 100644
index 00000000000..e44237a865f
--- /dev/null
+++ b/test/qs8-vpreluc.cc
@@ -0,0 +1,32 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Microkernel: qs8-vpreluc
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);
+#include "src/qs8-vpreluc/qs8-vpreluc.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/qs8-vrpreluc.cc b/test/qs8-vrpreluc.cc
new file mode 100644
index 00000000000..6499f9123b3
--- /dev/null
+++ b/test/qs8-vrpreluc.cc
@@ -0,0 +1,32 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Microkernel: qs8-vrpreluc
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params);
+#include "src/qs8-vrpreluc/qs8-vrpreluc.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/qu8-vprelu.cc b/test/qu8-vprelu.cc
new file mode 100644
index 00000000000..1eda9bc36db
--- /dev/null
+++ b/test/qu8-vprelu.cc
@@ -0,0 +1,32 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Microkernel: qu8-vprelu
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);
+#include "src/qu8-vprelu/qu8-vprelu.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/qu8-vpreluc.cc b/test/qu8-vpreluc.cc
new file mode 100644
index 00000000000..cc0bd37d37b
--- /dev/null
+++ b/test/qu8-vpreluc.cc
@@ -0,0 +1,32 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Microkernel: qu8-vpreluc
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params);
+#include "src/qu8-vpreluc/qu8-vpreluc.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/qu8-vrpreluc.cc b/test/qu8-vrpreluc.cc
new file mode 100644
index 00000000000..a1f09d9a87e
--- /dev/null
+++ b/test/qu8-vrpreluc.cc
@@ -0,0 +1,32 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Microkernel: qu8-vrpreluc
+// Generator: tools/generate-vbinary-test.py
+
+
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/vbinary.h"
+#include "test/vbinary-microkernel-tester.h"
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_DIV(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_LT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+ \
+XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params);\
+ \
+XNN_TEST_BINARY_A_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_B_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_Y_ZERO_POINT(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_A_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_B_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \
+XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params);
+#include "src/qu8-vrpreluc/qu8-vrpreluc.h"
+#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/vbinary-microkernel-tester.cc b/test/vbinary-microkernel-tester.cc
index 198c6206213..c84ff052532 100644
--- a/test/vbinary-microkernel-tester.cc
+++ b/test/vbinary-microkernel-tester.cc
@@ -397,3 +397,170 @@
     }
   }
 }
+
+void VBinaryMicrokernelTester::Test(
+    xnn_qs8_vprelu_ukernel_fn vprelu, OpType op_type,
+    xnn_init_qs8_vprelu_params_fn init_params) const {
+  xnnpack::ReplicableRandomDevice rng;
+  auto i8rng = [&rng]() {
+    return std::uniform_int_distribution<int32_t>(
+        std::numeric_limits<int8_t>::min(),
+        std::numeric_limits<int8_t>::max())(rng);
+  };
+
+  xnnpack::Buffer<int8_t> a(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
+  xnnpack::Buffer<int8_t> b(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
+  xnnpack::Buffer<int8_t> y(
+      batch_size() +
+      (inplace_a() || inplace_b() ? XNN_EXTRA_BYTES / sizeof(int8_t) : 0));
+  xnnpack::Buffer<float> y_fp(batch_size());
+  xnnpack::Buffer<int8_t> y_ref(batch_size());
+  for (size_t iteration = 0; iteration < iterations(); iteration++) {
+    if (!inplace_a()) {
+      std::generate(a.begin(), a.end(), [&]() { return i8rng(); });
+    }
+    if (!inplace_b()) {
+      std::generate(b.begin(), b.end(), [&]() { return i8rng(); });
+    }
+    if (inplace_a() || inplace_b()) {
+      std::generate(y.begin(), y.end(), [&]() { return i8rng(); });
+    }
+    const int8_t* a_data = inplace_a() ? y.data() : a.data();
+    const int8_t* b_data = inplace_b() ? y.data() : b.data();
+    const size_t stride_b = broadcast_b() ? 0 : 1;
+
+    // Prepare parameters.
+    xnn_qs8_vprelu_scalar_params params;
+    struct xnn_quantization_params a_quantization = {a_zero_point() - 0x80,
+                                                     a_scale()};
+    struct xnn_quantization_params b_quantization = {b_zero_point() - 0x80,
+                                                     b_scale()};
+    struct xnn_quantization_params y_quantization = {y_zero_point() - 0x80,
+                                                     y_scale()};
+    init_params(&params, &a_quantization, &b_quantization, &y_quantization);
+
+    // Compute reference results.
+    const float positive_multiplier = a_scale() / y_scale();
+    const float rprelu_pos_multiplier = b_scale() / y_scale();
+    const float negative_multiplier = (a_scale() * b_scale()) / y_scale();
+    EXPECT_GE(positive_multiplier, 0x1.0p-32f);
+    EXPECT_GE(negative_multiplier, 0x1.0p-32f);
+    for (size_t i = 0; i < batch_size(); i++) {
+      int32_t acc;
+      float scale;
+      const int32_t a_val = static_cast<int32_t>(a_data[i]) - static_cast<int32_t>(a_zero_point() - 0x80);
+      const int32_t b_val = static_cast<int32_t>(b_data[i * stride_b]) - static_cast<int32_t>(b_zero_point() - 0x80);
+      switch (op_type)
+      {
+        case OpType::Prelu:
+          acc = (a_val < 0) ? a_val * b_val : a_val;
+          scale = (a_val < 0) ? negative_multiplier : positive_multiplier;
+          break;
+        default:
+          acc = (b_val < 0) ? a_val * b_val : b_val;
+          scale = (b_val < 0) ? negative_multiplier : rprelu_pos_multiplier;
+          break;
+      }
+      y_fp[i] = static_cast<float>(y_zero_point() - 0x80) + scale * static_cast<float>(acc);
+      y_fp[i] = std::min(y_fp[i], static_cast<float>(INT8_MAX));
+      y_fp[i] = std::max(y_fp[i], static_cast<float>(INT8_MIN));
+      y_ref[i] = xnn_qs8_requantize_fp32(
+          acc, scale, static_cast<int8_t>(y_zero_point() - 0x80),
+          INT8_MIN, INT8_MAX);
+    }
+
+    // Call optimized micro-kernel.
+    vprelu(batch_size(), a_data, b_data, y.data(), &params);
+
+    // Verify results.
+    for (size_t i = 0; i < batch_size(); i++) {
+      EXPECT_NEAR(static_cast<int32_t>(y_ref[i]), static_cast<int32_t>(y[i]), 1)
+          << "at element " << i << " / " << batch_size();
+      EXPECT_NEAR(static_cast<float>(static_cast<int32_t>(y[i])), y_fp[i], 1.0f)
+          << "at element " << i << " / " << batch_size();
+    }
+  }
+}
+
+
+
+void VBinaryMicrokernelTester::Test(
+    xnn_qu8_vprelu_ukernel_fn vprelu, OpType op_type,
+    xnn_init_qu8_vprelu_params_fn init_params) const {
+  xnnpack::ReplicableRandomDevice rng;
+  auto u8rng = [&rng]() {
+    return std::uniform_int_distribution<uint32_t>(
+        0, std::numeric_limits<uint8_t>::max())(rng);
+  };
+
+  xnnpack::Buffer<uint8_t> a(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
+  xnnpack::Buffer<uint8_t> b(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
+  xnnpack::Buffer<uint8_t> y(
+      batch_size() +
+      (inplace_a() || inplace_b() ? XNN_EXTRA_BYTES / sizeof(uint8_t) : 0));
+  xnnpack::Buffer<float> y_fp(batch_size());
+  xnnpack::Buffer<uint8_t> y_ref(batch_size());
+  for (size_t iteration = 0; iteration < iterations(); iteration++) {
+    if (!inplace_a()) {
+      std::generate(a.begin(), a.end(), [&]() { return u8rng(); });
+    }
+    if (!inplace_b()) {
+      std::generate(b.begin(), b.end(), [&]() { return u8rng(); });
+    }
+    if (inplace_a() || inplace_b()) {
+      std::generate(y.begin(), y.end(), [&]() { return u8rng(); });
+    }
+    const uint8_t* a_data = inplace_a() ? y.data() : a.data();
+    const uint8_t* b_data = inplace_b() ? y.data() : b.data();
+    const size_t stride_b = broadcast_b() ? 0 : 1;
+
+    // Prepare parameters.
+    xnn_qs8_vprelu_scalar_params params;
+    struct xnn_quantization_params a_quantization = {a_zero_point(), a_scale()};
+    struct xnn_quantization_params b_quantization = {b_zero_point(), b_scale()};
+    struct xnn_quantization_params y_quantization = {y_zero_point(), y_scale()};
+    init_params(&params, &a_quantization, &b_quantization, &y_quantization);
+
+    // Compute reference results.
+    const float positive_multiplier = a_scale() / y_scale();
+    const float rprelu_pos_multiplier = b_scale() / y_scale();
+    const float negative_multiplier = (a_scale() * b_scale()) / y_scale();
+
+    for (size_t i = 0; i < batch_size(); i++) {
+      int32_t acc;
+      float scale;
+      const int32_t a_val = static_cast<int32_t>(a_data[i]) - static_cast<int32_t>(a_zero_point());
+      const int32_t b_val = static_cast<int32_t>(b_data[i * stride_b]) - static_cast<int32_t>(b_zero_point());
+      switch (op_type)
+      {
+        case OpType::Prelu:
+          acc = (a_val < 0) ? a_val * b_val : a_val;
+          scale = (a_val < 0) ? negative_multiplier : positive_multiplier;
+          break;
+        default:
+          acc = (b_val < 0) ? a_val * b_val : b_val;
+          scale = (b_val < 0) ? negative_multiplier : rprelu_pos_multiplier;
+          break;
+      }
+      y_fp[i] = static_cast<float>(y_zero_point()) + scale * static_cast<float>(acc);
+      y_fp[i] = std::min(y_fp[i], static_cast<float>(UINT8_MAX));
+      y_fp[i] = std::max(y_fp[i], static_cast<float>(0));
+      y_ref[i] = xnn_qu8_requantize_fp32(
+          acc, scale, static_cast<uint8_t>(y_zero_point()),
+          0, UINT8_MAX);
+    }
+
+    // Call optimized micro-kernel.
+    vprelu(batch_size(), a_data, b_data, y.data(), &params);
+
+    // Verify results.
+    for (size_t i = 0; i < batch_size(); i++) {
+      EXPECT_NEAR(static_cast<int32_t>(y_ref[i]), static_cast<int32_t>(y[i]), 1)
+          << "at element " << i << " / " << batch_size();
+      EXPECT_NEAR(static_cast<float>(static_cast<int32_t>(y[i])), y_fp[i], 1.0f)
+          << "at element " << i << " / " << batch_size();
+    }
+  }
+}
+
+
diff --git a/test/vbinary-microkernel-tester.h b/test/vbinary-microkernel-tester.h
index fca254596eb..d30ff1cf077 100644
--- a/test/vbinary-microkernel-tester.h
+++ b/test/vbinary-microkernel-tester.h
@@ -201,6 +201,12 @@ class VBinaryMicrokernelTester {
 
   void Test(xnn_qs8_vmul_minmax_ukernel_fn vmul_minmax,
             xnn_init_qs8_mul_minmax_params_fn init_params) const;
+
+  void Test(xnn_qs8_vprelu_ukernel_fn vprelu, OpType op_type,
+            xnn_init_qs8_vprelu_params_fn init_params) const;
+
+  void Test(xnn_qu8_vprelu_ukernel_fn vprelu, OpType op_type,
+            xnn_init_qu8_vprelu_params_fn init_params) const;
 
  private:
   size_t batch_size_{1};
diff --git a/tools/generate-vbinary-test.py b/tools/generate-vbinary-test.py
index 9e5700ac56a..f9ead3d8dd6 100755
--- a/tools/generate-vbinary-test.py
+++ b/tools/generate-vbinary-test.py
@@ -144,8 +144,12 @@ def main(args):
     op_type = OP_TYPES[op]
 
     test_args = ["ukernel"]
-    if tester in ["VBinaryMicrokernelTester"] and not datatype in ["qs8", "qu8"]:
-      test_args.append("%s::OpType::%s" % (tester, op_type))
+    if tester in ["VBinaryMicrokernelTester"]:
+      if datatype in ['qs8', 'qu8'] and op in ['vprelu', 'vpreluc', 'vrpreluc']:
+        op_type = "Prelu" if op in ['vprelu', 'vpreluc'] else "RPrelu"
+        test_args.append("%s::OpType::%s" % (tester, op_type))
+      elif not datatype in ['qs8', 'qu8']:
+        test_args.append("%s::OpType::%s" % (tester, op_type))
     test_args.append("init_params")
     tests += xnncommon.make_multiline_macro(
         xngen.preprocess(