From 2b909bb9654bd0776ef7201d34e7e74052696021 Mon Sep 17 00:00:00 2001 From: "Mihai.Olinovici" Date: Thu, 19 Sep 2024 06:45:36 +0000 Subject: [PATCH] Add RVV qs8/qu8-vcvt kernels and configs. --- cmake/gen/rvv_microkernels.cmake | 4 ++ gen/rvv_microkernels.bzl | 4 ++ scripts/generate-qs8-vcvt.sh | 7 ++++ src/configs/unary-elementwise-config.c | 14 ++++--- src/qs8-vcvt/gen/qs8-vcvt-rvv-u1v.c | 46 ++++++++++++++++++++++ src/qs8-vcvt/gen/qs8-vcvt-rvv-u2v.c | 46 ++++++++++++++++++++++ src/qs8-vcvt/qs8-vcvt.h | 5 +++ src/qs8-vcvt/rvv.c.in | 53 ++++++++++++++++++++++++++ src/qu8-vcvt/gen/qu8-vcvt-rvv-u1v.c | 47 +++++++++++++++++++++++ src/qu8-vcvt/gen/qu8-vcvt-rvv-u2v.c | 47 +++++++++++++++++++++++ src/qu8-vcvt/qu8-vcvt.h | 5 +++ 11 files changed, 272 insertions(+), 6 deletions(-) create mode 100644 src/qs8-vcvt/gen/qs8-vcvt-rvv-u1v.c create mode 100644 src/qs8-vcvt/gen/qs8-vcvt-rvv-u2v.c create mode 100644 src/qs8-vcvt/rvv.c.in create mode 100644 src/qu8-vcvt/gen/qu8-vcvt-rvv-u1v.c create mode 100644 src/qu8-vcvt/gen/qu8-vcvt-rvv-u2v.c diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index cd41d9717bb..5c82d7c4bdb 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -54,8 +54,10 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-vrnd/gen/f32-vrndu-rvv-u4v.c src/f32-vrnd/gen/f32-vrndz-rvv-u4v.c src/f32-vrsqrt/gen/f32-vrsqrt-rvv-rsqrt-u4v.c + src/qs8-vcvt/gen/qs8-vcvt-rvv-u2v.c src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u2v.c src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u2v.c + src/qu8-vcvt/gen/qu8-vcvt-rvv-u2v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c @@ -180,8 +182,10 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x4v-minmax-rvv.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c + src/qs8-vcvt/gen/qs8-vcvt-rvv-u1v.c src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u1v.c src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u1v.c + src/qu8-vcvt/gen/qu8-vcvt-rvv-u1v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index 397b67d9954..9129faa27a8 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -50,8 +50,10 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-vrnd/gen/f32-vrndu-rvv-u4v.c", "src/f32-vrnd/gen/f32-vrndz-rvv-u4v.c", "src/f32-vrsqrt/gen/f32-vrsqrt-rvv-rsqrt-u4v.c", + "src/qs8-vcvt/gen/qs8-vcvt-rvv-u2v.c", "src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u2v.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u2v.c", + "src/qu8-vcvt/gen/qu8-vcvt-rvv-u2v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c", "src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c", @@ -177,8 +179,10 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x4v-minmax-rvv.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c", + "src/qs8-vcvt/gen/qs8-vcvt-rvv-u1v.c", "src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u1v.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u1v.c", + "src/qu8-vcvt/gen/qu8-vcvt-rvv-u1v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c", diff --git a/scripts/generate-qs8-vcvt.sh b/scripts/generate-qs8-vcvt.sh index 2bfb4ac2e95..632f53c3fe4 100755 --- a/scripts/generate-qs8-vcvt.sh +++ b/scripts/generate-qs8-vcvt.sh @@ -77,6 +77,13 @@ tools/xngen src/qs8-vcvt/armsimd32.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/q tools/xngen src/qs8-vcvt/armsimd32.c.in -D BATCH_TILE=4 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/qu8-vcvt-armsimd32-u4.c & tools/xngen src/qs8-vcvt/armsimd32.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/qu8-vcvt-armsimd32-u8.c & +################################ RISC-V Vector ################################ +tools/xngen src/qs8-vcvt/rvv.c.in -D LMUL=1 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/qs8-vcvt-rvv-u1v.c & +tools/xngen src/qs8-vcvt/rvv.c.in -D LMUL=2 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/qs8-vcvt-rvv-u2v.c & + +tools/xngen src/qs8-vcvt/rvv.c.in -D LMUL=1 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/qu8-vcvt-rvv-u1v.c & +tools/xngen src/qs8-vcvt/rvv.c.in -D LMUL=2 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/qu8-vcvt-rvv-u2v.c & + #################################### Scalar ################################### tools/xngen src/qs8-vcvt/scalar.c.in -D BATCH_TILE=1 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/qs8-vcvt-scalar-u1.c & tools/xngen src/qs8-vcvt/scalar.c.in -D BATCH_TILE=2 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/qs8-vcvt-scalar-u2.c & diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c index 6e794819768..0612a3ad975 100644 --- a/src/configs/unary-elementwise-config.c +++ b/src/configs/unary-elementwise-config.c @@ -1850,10 +1850,11 @@ static void init_qs8_cvt_config(void) { qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; qs8_cvt_config.element_tile = 4; } - #elif XNN_ARCH_RISCV - qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__scalar_u4; + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__rvv_u2v; qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; - qs8_cvt_config.element_tile = 4; + qs8_cvt_config.element_tile = hardware_config->vlenb / sizeof(int8_t) * 2; // (VLENB/sizeof)*LMUL; #else qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__scalar_u4; qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; @@ -2140,10 +2141,11 @@ static void init_qu8_cvt_config(void) { qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; qu8_cvt_config.element_tile = 4; } - #elif XNN_ARCH_RISCV - qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__scalar_u4; + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__rvv_u2v; qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; - qu8_cvt_config.element_tile = 4; + qu8_cvt_config.element_tile = hardware_config->vlenb / sizeof(int8_t) * 2; // (VLENB/sizeof)*LMUL; #else qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__scalar_u4; qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; diff --git a/src/qs8-vcvt/gen/qs8-vcvt-rvv-u1v.c b/src/qs8-vcvt/gen/qs8-vcvt-rvv-u1v.c new file mode 100644 index 00000000000..c65b840a503 --- /dev/null +++ b/src/qs8-vcvt/gen/qs8-vcvt-rvv-u1v.c @@ -0,0 +1,46 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vcvt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vcvt.h" + + +void xnn_qs8_vcvt_ukernel__rvv_u1v( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int32_t bias = + ((int32_t) params->scalar.output_zero_point << 8) - + (int32_t) params->scalar.multiplier * (int32_t) params->scalar.input_zero_point + + INT32_C(0x80); + const int32_t multiplier = params->scalar.multiplier; + int32_t n = __riscv_vsetvl_e8m1(batch); + vint32m4_t bias_i32v = __riscv_vmv_v_x_i32m4(bias, n); + + do { + n = __riscv_vsetvl_e8m1(batch); batch -= n; + + vint8m1_t acc_i8v = __riscv_vle8_v_i8m1(input, n); input += n; + vint16m2_t acc_i16v = __riscv_vwcvt_x_x_v_i16m2(acc_i8v, n); + vint32m4_t acc_i32v = __riscv_vwmacc_vx_i32m4(bias_i32v, multiplier, acc_i16v, n); + vint16m2_t out_i16v = __riscv_vnclip_wx_i16m2(acc_i32v, 8, __RISCV_VXRM_RDN, n); + vint8m1_t out_i8v = __riscv_vnclip_wx_i8m1(out_i16v, 0, __RISCV_VXRM_RNU, n); + __riscv_vse8_v_i8m1(output, out_i8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qs8-vcvt/gen/qs8-vcvt-rvv-u2v.c b/src/qs8-vcvt/gen/qs8-vcvt-rvv-u2v.c new file mode 100644 index 00000000000..8ab1a6f4da9 --- /dev/null +++ b/src/qs8-vcvt/gen/qs8-vcvt-rvv-u2v.c @@ -0,0 +1,46 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vcvt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vcvt.h" + + +void xnn_qs8_vcvt_ukernel__rvv_u2v( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int32_t bias = + ((int32_t) params->scalar.output_zero_point << 8) - + (int32_t) params->scalar.multiplier * (int32_t) params->scalar.input_zero_point + + INT32_C(0x80); + const int32_t multiplier = params->scalar.multiplier; + int32_t n = __riscv_vsetvl_e8m2(batch); + vint32m8_t bias_i32v = __riscv_vmv_v_x_i32m8(bias, n); + + do { + n = __riscv_vsetvl_e8m2(batch); batch -= n; + + vint8m2_t acc_i8v = __riscv_vle8_v_i8m2(input, n); input += n; + vint16m4_t acc_i16v = __riscv_vwcvt_x_x_v_i16m4(acc_i8v, n); + vint32m8_t acc_i32v = __riscv_vwmacc_vx_i32m8(bias_i32v, multiplier, acc_i16v, n); + vint16m4_t out_i16v = __riscv_vnclip_wx_i16m4(acc_i32v, 8, __RISCV_VXRM_RDN, n); + vint8m2_t out_i8v = __riscv_vnclip_wx_i8m2(out_i16v, 0, __RISCV_VXRM_RNU, n); + __riscv_vse8_v_i8m2(output, out_i8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qs8-vcvt/qs8-vcvt.h b/src/qs8-vcvt/qs8-vcvt.h index 9ab18389c83..36ac6e69207 100644 --- a/src/qs8-vcvt/qs8-vcvt.h +++ b/src/qs8-vcvt/qs8-vcvt.h @@ -55,6 +55,11 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qs8_vcvt_ukernel__armsimd32_u4, XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qs8_vcvt_ukernel__armsimd32_u8, 8, false, int8_t, int8_t, struct xnn_qs8_cvt_params, xnn_init_qs8_cvt_scalar_params) #endif // XNN_ARCH_ARM +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_vcvt_ukernel__rvv_u1v, 1, true, int8_t, int8_t, struct xnn_qs8_cvt_params, xnn_init_qs8_cvt_scalar_params) +XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_vcvt_ukernel__rvv_u2v, 2, true, int8_t, int8_t, struct xnn_qs8_cvt_params, xnn_init_qs8_cvt_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_vcvt_ukernel__scalar_u1, 1, false, int8_t, int8_t, struct xnn_qs8_cvt_params, xnn_init_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_vcvt_ukernel__scalar_u2, 2, false, int8_t, int8_t, struct xnn_qs8_cvt_params, xnn_init_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_vcvt_ukernel__scalar_u4, 4, false, int8_t, int8_t, struct xnn_qs8_cvt_params, xnn_init_qs8_cvt_scalar_params) diff --git a/src/qs8-vcvt/rvv.c.in b/src/qs8-vcvt/rvv.c.in new file mode 100644 index 00000000000..9470a580495 --- /dev/null +++ b/src/qs8-vcvt/rvv.c.in @@ -0,0 +1,53 @@ +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert DATATYPE in ["QS8", "QU8"] +#include + +#include + +#include "xnnpack/vcvt.h" + + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +void xnn_${DATATYPE.lower()}_vcvt_ukernel__rvv_u${LMUL}v( + size_t batch, + const ${XINT8_T}* input, + ${XINT8_T}* output, + const struct xnn_${DATATYPE.lower()}_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input != NULL); + assert(output != NULL); + + const int32_t bias = + ((int32_t) params->scalar.output_zero_point << 8) - + (int32_t) params->scalar.multiplier * (int32_t) params->scalar.input_zero_point + + INT32_C(0x80); + const int32_t multiplier = params->scalar.multiplier; + int32_t n = __riscv_vsetvl_e8m${LMUL}(batch); + vint32m${LMUL*4}_t bias_i32v = __riscv_vmv_v_x_i32m${LMUL*4}(bias, n); + + do { + n = __riscv_vsetvl_e8m${LMUL}(batch); batch -= n; + + $if DATATYPE == "QS8": + vint8m${LMUL}_t acc_i8v = __riscv_vle8_v_i8m${LMUL}(input, n); input += n; + vint16m${LMUL*2}_t acc_i16v = __riscv_vwcvt_x_x_v_i16m${LMUL*2}(acc_i8v, n); + vint32m${LMUL*4}_t acc_i32v = __riscv_vwmacc_vx_i32m${LMUL*4}(bias_i32v, multiplier, acc_i16v, n); + vint16m${LMUL*2}_t out_i16v = __riscv_vnclip_wx_i16m${LMUL*2}(acc_i32v, 8, __RISCV_VXRM_RDN, n); + vint8m${LMUL}_t out_i8v = __riscv_vnclip_wx_i8m${LMUL}(out_i16v, 0, __RISCV_VXRM_RNU, n); + __riscv_vse8_v_i8m${LMUL}(output, out_i8v, n); output += n; + $else: + vuint8m${LMUL}_t acc_u8v = __riscv_vle8_v_u8m${LMUL}(input, n); input += n; + vuint16m${LMUL*2}_t acc_u16v = __riscv_vwcvtu_x_x_v_u16m${LMUL*2}(acc_u8v, n); + vint32m${LMUL*4}_t acc_i32v = __riscv_vwmacc_vx_i32m${LMUL*4}(bias_i32v, multiplier, __riscv_vreinterpret_v_u16m${LMUL*2}_i16m${LMUL*2}(acc_u16v), n); + vint16m${LMUL*2}_t out_i16v = __riscv_vnclip_wx_i16m${LMUL*2}(acc_i32v, 8, __RISCV_VXRM_RDN, n); + out_i16v = __riscv_vmax_vx_i16m${LMUL*2}(out_i16v, 0, n); + vuint8m${LMUL}_t out_u8v = __riscv_vnclipu_wx_u8m${LMUL}(__riscv_vreinterpret_v_i16m${LMUL*2}_u16m${LMUL*2}(out_i16v), 0, __RISCV_VXRM_RNU, n); + __riscv_vse8_v_u8m${LMUL}(output, out_u8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qu8-vcvt/gen/qu8-vcvt-rvv-u1v.c b/src/qu8-vcvt/gen/qu8-vcvt-rvv-u1v.c new file mode 100644 index 00000000000..b746dbbf705 --- /dev/null +++ b/src/qu8-vcvt/gen/qu8-vcvt-rvv-u1v.c @@ -0,0 +1,47 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vcvt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vcvt.h" + + +void xnn_qu8_vcvt_ukernel__rvv_u1v( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int32_t bias = + ((int32_t) params->scalar.output_zero_point << 8) - + (int32_t) params->scalar.multiplier * (int32_t) params->scalar.input_zero_point + + INT32_C(0x80); + const int32_t multiplier = params->scalar.multiplier; + int32_t n = __riscv_vsetvl_e8m1(batch); + vint32m4_t bias_i32v = __riscv_vmv_v_x_i32m4(bias, n); + + do { + n = __riscv_vsetvl_e8m1(batch); batch -= n; + + vuint8m1_t acc_u8v = __riscv_vle8_v_u8m1(input, n); input += n; + vuint16m2_t acc_u16v = __riscv_vwcvtu_x_x_v_u16m2(acc_u8v, n); + vint32m4_t acc_i32v = __riscv_vwmacc_vx_i32m4(bias_i32v, multiplier, __riscv_vreinterpret_v_u16m2_i16m2(acc_u16v), n); + vint16m2_t out_i16v = __riscv_vnclip_wx_i16m2(acc_i32v, 8, __RISCV_VXRM_RDN, n); + out_i16v = __riscv_vmax_vx_i16m2(out_i16v, 0, n); + vuint8m1_t out_u8v = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(out_i16v), 0, __RISCV_VXRM_RNU, n); + __riscv_vse8_v_u8m1(output, out_u8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qu8-vcvt/gen/qu8-vcvt-rvv-u2v.c b/src/qu8-vcvt/gen/qu8-vcvt-rvv-u2v.c new file mode 100644 index 00000000000..dbc4637437d --- /dev/null +++ b/src/qu8-vcvt/gen/qu8-vcvt-rvv-u2v.c @@ -0,0 +1,47 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vcvt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vcvt.h" + + +void xnn_qu8_vcvt_ukernel__rvv_u2v( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int32_t bias = + ((int32_t) params->scalar.output_zero_point << 8) - + (int32_t) params->scalar.multiplier * (int32_t) params->scalar.input_zero_point + + INT32_C(0x80); + const int32_t multiplier = params->scalar.multiplier; + int32_t n = __riscv_vsetvl_e8m2(batch); + vint32m8_t bias_i32v = __riscv_vmv_v_x_i32m8(bias, n); + + do { + n = __riscv_vsetvl_e8m2(batch); batch -= n; + + vuint8m2_t acc_u8v = __riscv_vle8_v_u8m2(input, n); input += n; + vuint16m4_t acc_u16v = __riscv_vwcvtu_x_x_v_u16m4(acc_u8v, n); + vint32m8_t acc_i32v = __riscv_vwmacc_vx_i32m8(bias_i32v, multiplier, __riscv_vreinterpret_v_u16m4_i16m4(acc_u16v), n); + vint16m4_t out_i16v = __riscv_vnclip_wx_i16m4(acc_i32v, 8, __RISCV_VXRM_RDN, n); + out_i16v = __riscv_vmax_vx_i16m4(out_i16v, 0, n); + vuint8m2_t out_u8v = __riscv_vnclipu_wx_u8m2(__riscv_vreinterpret_v_i16m4_u16m4(out_i16v), 0, __RISCV_VXRM_RNU, n); + __riscv_vse8_v_u8m2(output, out_u8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qu8-vcvt/qu8-vcvt.h b/src/qu8-vcvt/qu8-vcvt.h index 6ae370c3e29..cb138c927a1 100644 --- a/src/qu8-vcvt/qu8-vcvt.h +++ b/src/qu8-vcvt/qu8-vcvt.h @@ -55,6 +55,11 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qu8_vcvt_ukernel__armsimd32_u4, XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qu8_vcvt_ukernel__armsimd32_u8, 8, false, uint8_t, uint8_t, struct xnn_qu8_cvt_params, xnn_init_qu8_cvt_scalar_params) #endif // XNN_ARCH_ARM +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_vcvt_ukernel__rvv_u1v, 1, true, uint8_t, uint8_t, struct xnn_qu8_cvt_params, xnn_init_qu8_cvt_scalar_params) +XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_vcvt_ukernel__rvv_u2v, 2, true, uint8_t, uint8_t, struct xnn_qu8_cvt_params, xnn_init_qu8_cvt_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_vcvt_ukernel__scalar_u1, 1, false, uint8_t, uint8_t, struct xnn_qu8_cvt_params, xnn_init_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_vcvt_ukernel__scalar_u2, 2, false, uint8_t, uint8_t, struct xnn_qu8_cvt_params, xnn_init_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_vcvt_ukernel__scalar_u4, 4, false, uint8_t, uint8_t, struct xnn_qu8_cvt_params, xnn_init_qu8_cvt_scalar_params)