From d779b27bbcf9790ed515e768acc3afc7a949a589 Mon Sep 17 00:00:00 2001 From: kaustubh-raste Date: Fri, 20 Sep 2024 09:37:46 +0100 Subject: [PATCH 1/4] In qs8/qu8 vadd/vaddc update microparams from union to struct in .in --- src/qs8-vadd/avx2-mul32-ld64.c.in | 2 +- src/qs8-vadd/avx512skx-mul32-ld128.c.in | 2 +- src/qs8-vadd/hvx.c.in | 2 +- src/qs8-vadd/neon.c.in | 2 +- src/qs8-vadd/scalar.c.in | 2 +- src/qs8-vadd/sse-mul16-ld64.c.in | 2 +- src/qs8-vadd/sse-mul32-ld32.c.in | 2 +- src/qs8-vadd/wasmsimd.c.in | 2 +- src/qs8-vaddc/avx2-mul32-ld64.c.in | 2 +- src/qs8-vaddc/avx512skx-mul32-ld128.c.in | 2 +- src/qs8-vaddc/neon.c.in | 2 +- src/qs8-vaddc/scalar.c.in | 2 +- src/qs8-vaddc/sse-mul16-ld64.c.in | 2 +- src/qs8-vaddc/sse-mul32-ld32.c.in | 2 +- src/qs8-vaddc/wasmsimd.c.in | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/qs8-vadd/avx2-mul32-ld64.c.in b/src/qs8-vadd/avx2-mul32-ld64.c.in index a1aeceb8306..d7947fa0a7b 100644 --- a/src/qs8-vadd/avx2-mul32-ld64.c.in +++ b/src/qs8-vadd/avx2-mul32-ld64.c.in @@ -26,7 +26,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__avx2_mul32_ld64_u${BATCH_TILE} const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vadd/avx512skx-mul32-ld128.c.in b/src/qs8-vadd/avx512skx-mul32-ld128.c.in index b7b1243afe2..1a8b1266194 100644 --- a/src/qs8-vadd/avx512skx-mul32-ld128.c.in +++ b/src/qs8-vadd/avx512skx-mul32-ld128.c.in @@ -29,7 +29,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__avx512skx_mul32_ld128_u${BATCH const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vadd/hvx.c.in b/src/qs8-vadd/hvx.c.in index 888009fe821..23b9456e246 100644 --- a/src/qs8-vadd/hvx.c.in +++ b/src/qs8-vadd/hvx.c.in @@ -22,7 +22,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__hvx_u${BATCH_TILE}( const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vadd/neon.c.in b/src/qs8-vadd/neon.c.in index c68a1f91749..f8db561cecd 100644 --- a/src/qs8-vadd/neon.c.in +++ b/src/qs8-vadd/neon.c.in @@ -39,7 +39,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__neon_${"ld128" if LD128 else " const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vadd/scalar.c.in b/src/qs8-vadd/scalar.c.in index 75d857bb760..1be74bb06fc 100644 --- a/src/qs8-vadd/scalar.c.in +++ b/src/qs8-vadd/scalar.c.in @@ -17,7 +17,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__scalar_u${BATCH_TILE}( const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vadd/sse-mul16-ld64.c.in b/src/qs8-vadd/sse-mul16-ld64.c.in index 1d9ea23b7de..60b09843cf2 100644 --- a/src/qs8-vadd/sse-mul16-ld64.c.in +++ b/src/qs8-vadd/sse-mul16-ld64.c.in @@ -29,7 +29,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__${ISA}_mul16_ld64_u${BATCH_TIL const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vadd/sse-mul32-ld32.c.in b/src/qs8-vadd/sse-mul32-ld32.c.in index 0a719e3f9cb..bd9c8b15e98 100644 --- a/src/qs8-vadd/sse-mul32-ld32.c.in +++ b/src/qs8-vadd/sse-mul32-ld32.c.in @@ -29,7 +29,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__${ISA}_mul32_ld32_u${BATCH_TIL const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vadd/wasmsimd.c.in b/src/qs8-vadd/wasmsimd.c.in index f0fcd728d1c..86c31555c63 100644 --- a/src/qs8-vadd/wasmsimd.c.in +++ b/src/qs8-vadd/wasmsimd.c.in @@ -27,7 +27,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__wasmsimd_u${BATCH_TILE}( const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vaddc/avx2-mul32-ld64.c.in b/src/qs8-vaddc/avx2-mul32-ld64.c.in index fe837277db7..276a0d084bb 100644 --- a/src/qs8-vaddc/avx2-mul32-ld64.c.in +++ b/src/qs8-vaddc/avx2-mul32-ld64.c.in @@ -25,7 +25,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__avx2_mul32_ld64_u${BATCH_TILE const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vaddc/avx512skx-mul32-ld128.c.in b/src/qs8-vaddc/avx512skx-mul32-ld128.c.in index d754c937db1..3756361ff15 100644 --- a/src/qs8-vaddc/avx512skx-mul32-ld128.c.in +++ b/src/qs8-vaddc/avx512skx-mul32-ld128.c.in @@ -28,7 +28,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u${BATC const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vaddc/neon.c.in b/src/qs8-vaddc/neon.c.in index 2741f9a2979..60e4d8ea6ce 100644 --- a/src/qs8-vaddc/neon.c.in +++ b/src/qs8-vaddc/neon.c.in @@ -39,7 +39,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__neon_${"ld128" if LD128 else const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vaddc/scalar.c.in b/src/qs8-vaddc/scalar.c.in index 964dd93ed42..aa7e022d242 100644 --- a/src/qs8-vaddc/scalar.c.in +++ b/src/qs8-vaddc/scalar.c.in @@ -17,7 +17,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__scalar_u${BATCH_TILE}( const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vaddc/sse-mul16-ld64.c.in b/src/qs8-vaddc/sse-mul16-ld64.c.in index 9983d8a4555..11c0778d5c6 100644 --- a/src/qs8-vaddc/sse-mul16-ld64.c.in +++ b/src/qs8-vaddc/sse-mul16-ld64.c.in @@ -30,7 +30,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__${ISA}_mul16_ld64_u${BATCH_TI const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vaddc/sse-mul32-ld32.c.in b/src/qs8-vaddc/sse-mul32-ld32.c.in index b297eaf9f88..a570fe48ed7 100644 --- a/src/qs8-vaddc/sse-mul32-ld32.c.in +++ b/src/qs8-vaddc/sse-mul32-ld32.c.in @@ -30,7 +30,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__${ISA}_mul32_ld32_u${BATCH_TI const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); diff --git a/src/qs8-vaddc/wasmsimd.c.in b/src/qs8-vaddc/wasmsimd.c.in index 33e8c55cf0d..f75945e05bf 100644 --- a/src/qs8-vaddc/wasmsimd.c.in +++ b/src/qs8-vaddc/wasmsimd.c.in @@ -27,7 +27,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__wasmsimd_u${BATCH_TILE}( const ${XINT8_T}* input_a, const ${XINT8_T}* input_b, ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(${XINT8_T}) == 0); From a08c147da201a96ec4a6b6253e95e40db1a3fa84 Mon Sep 17 00:00:00 2001 From: kaustubh-raste Date: Fri, 20 Sep 2024 10:18:07 +0100 Subject: [PATCH 2/4] Add qs8/qu8 vadd/vaddc RVV microkernel implementations and configs --- cmake/gen/rvv_microkernels.cmake | 8 +++ gen/rvv_microkernels.bzl | 8 +++ scripts/generate-qs8-vadd.sh | 13 ++++ src/configs/binary-elementwise-config.c | 14 ++++ src/qs8-vadd/qs8-vadd-minmax.h | 5 ++ src/qs8-vadd/rvv.c.in | 71 ++++++++++++++++++++ src/qs8-vaddc/qs8-vaddc-minmax.h | 5 ++ src/qs8-vaddc/rvv.c.in | 62 +++++++++++++++++ src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c | 59 ++++++++++++++++ src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c | 59 ++++++++++++++++ src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c | 53 +++++++++++++++ src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c | 53 +++++++++++++++ src/qu8-vadd/qu8-vadd-minmax.h | 5 ++ src/qu8-vaddc/qu8-vaddc-minmax.h | 5 ++ src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c | 62 +++++++++++++++++ src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c | 62 +++++++++++++++++ src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c | 55 +++++++++++++++ src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c | 55 +++++++++++++++ 18 files changed, 654 insertions(+) create mode 100755 src/qs8-vadd/rvv.c.in create mode 100755 src/qs8-vaddc/rvv.c.in create mode 100644 src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c create mode 100644 src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c create mode 100644 src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c create mode 100644 src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c create mode 100644 src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c create mode 100644 src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c create mode 100644 src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c create mode 100644 src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index cd41d9717bb..9b38f154d5a 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -54,9 +54,13 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-vrnd/gen/f32-vrndu-rvv-u4v.c src/f32-vrnd/gen/f32-vrndz-rvv-u4v.c src/f32-vrsqrt/gen/f32-vrsqrt-rvv-rsqrt-u4v.c + src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u2v.c + src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u2v.c + src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c + src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c src/x32-transposec/gen/x32-transposec-4x4-rvv.c @@ -180,9 +184,13 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x4v-minmax-rvv.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c + src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u1v.c + src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u1v.c + src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c + src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u4.c diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index 397b67d9954..0afed1fe70e 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -50,9 +50,13 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-vrnd/gen/f32-vrndu-rvv-u4v.c", "src/f32-vrnd/gen/f32-vrndz-rvv-u4v.c", "src/f32-vrsqrt/gen/f32-vrsqrt-rvv-rsqrt-u4v.c", + "src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c", "src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u2v.c", + "src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u2v.c", + "src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c", + "src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c", "src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c", "src/x32-transposec/gen/x32-transposec-4x4-rvv.c", @@ -177,9 +181,13 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x4v-minmax-rvv.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c", + "src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c", "src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u1v.c", + "src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u1v.c", + "src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c", + "src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u4.c", diff --git a/scripts/generate-qs8-vadd.sh b/scripts/generate-qs8-vadd.sh index 037dd753c7e..7df87238abb 100755 --- a/scripts/generate-qs8-vadd.sh +++ b/scripts/generate-qs8-vadd.sh @@ -69,6 +69,19 @@ tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=32 -D LD128=0 -D DATATYPE=QU8 tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=16 -D LD128=1 -D DATATYPE=QU8 -o src/qu8-vaddc/gen/qu8-vaddc-minmax-neon-ld128-u16.c & +################################ RISC-V Vector ################################ +tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=1 -D DATATYPE=QS8 -o src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c & +tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=2 -D DATATYPE=QS8 -o src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c & + +tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=1 -D DATATYPE=QU8 -o src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c & +tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=2 -D DATATYPE=QU8 -o src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c & + +tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=1 -D DATATYPE=QS8 -o src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c & +tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=2 -D DATATYPE=QS8 -o src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c & + +tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=1 -D DATATYPE=QU8 -o src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c & +tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=2 -D DATATYPE=QU8 -o src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c & + ################################### x86 SSE ################################### tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=8 -D SSE=2 -D AVX=0 -D DATATYPE=QS8 -o src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u8.c & tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=16 -D SSE=2 -D AVX=0 -D DATATYPE=QS8 -o src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u16.c & diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 0e7750621bb..d321a16b8e3 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -1094,6 +1094,13 @@ static void init_qs8_vadd_config(void) { qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_u32; qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; qs8_vadd_config.minmax.element_tile = 32; + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__rvv_u2v; + qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__rvv_u2v; + qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__rvv_u2v; + qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.minmax.element_tile = hardware_config->vlenb; #else qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__scalar_u4; qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__scalar_u4; @@ -1232,6 +1239,13 @@ static void init_qu8_vadd_config(void) { qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_u32; qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.minmax.element_tile = 32; + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__rvv_u2v; + qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__rvv_u2v; + qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__rvv_u2v; + qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.minmax.element_tile = hardware_config->vlenb; #else qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__scalar_u4; qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__scalar_u4; diff --git a/src/qs8-vadd/qs8-vadd-minmax.h b/src/qs8-vadd/qs8-vadd-minmax.h index 7dc637cb967..ab650a763eb 100644 --- a/src/qs8-vadd/qs8-vadd-minmax.h +++ b/src/qs8-vadd/qs8-vadd-minmax.h @@ -68,6 +68,11 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_qs8_vadd_minmax_ukernel__hvx_u96, 96, XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_qs8_vadd_minmax_ukernel__hvx_u128, 128, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) #endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vadd_minmax_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vadd_minmax_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vadd_minmax_ukernel__scalar_u1, 1, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vadd_minmax_ukernel__scalar_u2, 2, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vadd_minmax_ukernel__scalar_u4, 4, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) diff --git a/src/qs8-vadd/rvv.c.in b/src/qs8-vadd/rvv.c.in new file mode 100755 index 00000000000..fbda19177ad --- /dev/null +++ b/src/qs8-vadd/rvv.c.in @@ -0,0 +1,71 @@ +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert DATATYPE in ["QS8", "QU8"] +$assert LMUL in [1, 2, 4, 8] +#include + +#include + +#include "xnnpack/vbinary.h" + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] + +void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__rvv_u${LMUL}v( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t bias = params->scalar.bias; + const int32_t a_multiplier = params->scalar.a_multiplier; + const int32_t b_multiplier = params->scalar.b_multiplier; + const uint32_t shift = params->scalar.shift; + const int32_t output_min = params->scalar.output_min; + const int32_t output_max = params->scalar.output_max; + const int32_t output_zero_point = params->scalar.output_zero_point; + + do { + int32_t n = __riscv_vsetvl_e8m${LMUL}(batch); batch -= n; + + $if DATATYPE == "QS8": + vint8m${LMUL}_t in_a_i8v = __riscv_vle8_v_i8m${LMUL}(input_a, n); input_a += n; + vint8m${LMUL}_t in_b_i8v = __riscv_vle8_v_i8m${LMUL}(input_b, n); input_b += n; + vint16m${LMUL*2}_t a_i16v = __riscv_vwcvt_x_x_v_i16m${LMUL*2}(in_a_i8v, n); + vint16m${LMUL*2}_t b_i16v = __riscv_vwcvt_x_x_v_i16m${LMUL*2}(in_b_i8v, n); + $else: + vuint8m${LMUL}_t in_a_u8v = __riscv_vle8_v_u8m${LMUL}(input_a, n); input_a += n; + vuint8m${LMUL}_t in_b_u8v = __riscv_vle8_v_u8m${LMUL}(input_b, n); input_b += n; + vuint16m${LMUL*2}_t a_u16v = __riscv_vwcvtu_x_x_v_u16m${LMUL*2}(in_a_u8v, n); + vuint16m${LMUL*2}_t b_u16v = __riscv_vwcvtu_x_x_v_u16m${LMUL*2}(in_b_u8v, n); + vint16m${LMUL*2}_t a_i16v = __riscv_vreinterpret_v_u16m${LMUL*2}_i16m${LMUL*2}(a_u16v); + vint16m${LMUL*2}_t b_i16v = __riscv_vreinterpret_v_u16m${LMUL*2}_i16m${LMUL*2}(b_u16v); + vint32m${LMUL*4}_t a_i32v = __riscv_vwcvt_x_x_v_i32m${LMUL*4}(a_i16v, n); + vint32m${LMUL*4}_t b_i32v = __riscv_vwcvt_x_x_v_i32m${LMUL*4}(b_i16v, n); + a_i32v = __riscv_vmul_vx_i32m${LMUL*4}(a_i32v, a_multiplier, n); + b_i32v = __riscv_vmul_vx_i32m${LMUL*4}(b_i32v, b_multiplier, n); + vint32m${LMUL*4}_t acc_i32v = __riscv_vadd_vx_i32m${LMUL*4}(a_i32v, bias, n); + acc_i32v = __riscv_vadd_vv_i32m${LMUL*4}(acc_i32v, b_i32v, n); + vint32m${LMUL*4}_t out_i32v = __riscv_vsra_vx_i32m${LMUL*4}(acc_i32v, shift, n); + out_i32v = __riscv_vadd_vx_i32m${LMUL*4}(out_i32v, output_zero_point, n); + out_i32v = __riscv_vmax_vx_i32m${LMUL*4}(out_i32v, output_min, n); + out_i32v = __riscv_vmin_vx_i32m${LMUL*4}(out_i32v, output_max, n); + vint16m${LMUL*2}_t out_i16v = __riscv_vncvt_x_x_w_i16m${LMUL*2}(out_i32v, n); + $if DATATYPE == "QS8": + vint8m${LMUL}_t out_i8v = __riscv_vncvt_x_x_w_i8m${LMUL}(out_i16v, n); + __riscv_vse8_v_i8m${LMUL}(output, out_i8v, n); output += n; + $else: + a_u16v = __riscv_vreinterpret_v_i16m${LMUL*2}_u16m${LMUL*2}(out_i16v); + vuint8m${LMUL}_t out_u8v = __riscv_vncvt_x_x_w_u8m${LMUL}(a_u16v, n); + __riscv_vse8_v_u8m${LMUL}(output, out_u8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qs8-vaddc/qs8-vaddc-minmax.h b/src/qs8-vaddc/qs8-vaddc-minmax.h index 4b355506b4a..1d39167110f 100644 --- a/src/qs8-vaddc/qs8-vaddc-minmax.h +++ b/src/qs8-vaddc/qs8-vaddc-minmax.h @@ -61,6 +61,11 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vaddc_minmax_ukernel__wasmsimd_u24, 24, false XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vaddc_minmax_ukernel__wasmsimd_u32, 32, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vaddc_minmax_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vaddc_minmax_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vaddc_minmax_ukernel__scalar_u1, 1, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vaddc_minmax_ukernel__scalar_u2, 2, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vaddc_minmax_ukernel__scalar_u4, 4, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) diff --git a/src/qs8-vaddc/rvv.c.in b/src/qs8-vaddc/rvv.c.in new file mode 100755 index 00000000000..e9a3fc8f4da --- /dev/null +++ b/src/qs8-vaddc/rvv.c.in @@ -0,0 +1,62 @@ +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert DATATYPE in ["QS8", "QU8"] +$assert LMUL in [1, 2, 4, 8] +#include + +#include + +#include "xnnpack/vbinary.h" + +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] + +void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__rvv_u${LMUL}v( + size_t batch, + const ${XINT8_T}* input_a, + const ${XINT8_T}* input_b, + ${XINT8_T}* output, + const struct xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t bias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier; + const int32_t a_multiplier = params->scalar.a_multiplier; + const uint32_t shift = params->scalar.shift; + const int32_t output_min = params->scalar.output_min; + const int32_t output_max = params->scalar.output_max; + const int32_t output_zero_point = params->scalar.output_zero_point; + + do { + int32_t n = __riscv_vsetvl_e8m${LMUL}(batch); batch -= n; + + $if DATATYPE == "QS8": + vint8m${LMUL}_t in_a_i8v = __riscv_vle8_v_i8m${LMUL}(input_a, n); input_a += n; + vint16m${LMUL*2}_t a_i16v = __riscv_vwcvt_x_x_v_i16m${LMUL*2}(in_a_i8v, n); + $else: + vuint8m${LMUL}_t in_a_u8v = __riscv_vle8_v_u8m${LMUL}(input_a, n); input_a += n; + vuint16m${LMUL*2}_t a_u16v = __riscv_vwcvtu_x_x_v_u16m${LMUL*2}(in_a_u8v, n); + vint16m${LMUL*2}_t a_i16v = __riscv_vreinterpret_v_u16m${LMUL*2}_i16m${LMUL*2}(a_u16v); + vint32m${LMUL*4}_t a_i32v = __riscv_vwcvt_x_x_v_i32m${LMUL*4}(a_i16v, n); + a_i32v = __riscv_vmul_vx_i32m${LMUL*4}(a_i32v, a_multiplier, n); + vint32m${LMUL*4}_t acc_i32v = __riscv_vadd_vx_i32m${LMUL*4}(a_i32v, bias, n); + vint32m${LMUL*4}_t out_i32v = __riscv_vsra_vx_i32m${LMUL*4}(acc_i32v, shift, n); + out_i32v = __riscv_vadd_vx_i32m${LMUL*4}(out_i32v, output_zero_point, n); + out_i32v = __riscv_vmax_vx_i32m${LMUL*4}(out_i32v, output_min, n); + out_i32v = __riscv_vmin_vx_i32m${LMUL*4}(out_i32v, output_max, n); + vint16m${LMUL*2}_t out_i16v = __riscv_vncvt_x_x_w_i16m${LMUL*2}(out_i32v, n); + $if DATATYPE == "QS8": + vint8m${LMUL}_t out_i8v = __riscv_vncvt_x_x_w_i8m${LMUL}(out_i16v, n); + __riscv_vse8_v_i8m${LMUL}(output, out_i8v, n); output += n; + $else: + a_u16v = __riscv_vreinterpret_v_i16m${LMUL*2}_u16m${LMUL*2}(out_i16v); + vuint8m${LMUL}_t out_u8v = __riscv_vncvt_x_x_w_u8m${LMUL}(a_u16v, n); + __riscv_vse8_v_u8m${LMUL}(output, out_u8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c b/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c new file mode 100644 index 00000000000..e72fd205a6d --- /dev/null +++ b/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c @@ -0,0 +1,59 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vadd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vbinary.h" + + +void xnn_qs8_vadd_minmax_ukernel__rvv_u1v( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const struct xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t bias = params->scalar.bias; + const int32_t a_multiplier = params->scalar.a_multiplier; + const int32_t b_multiplier = params->scalar.b_multiplier; + const uint32_t shift = params->scalar.shift; + const int32_t output_min = params->scalar.output_min; + const int32_t output_max = params->scalar.output_max; + const int32_t output_zero_point = params->scalar.output_zero_point; + + do { + int32_t n = __riscv_vsetvl_e8m1(batch); batch -= n; + + vint8m1_t in_a_i8v = __riscv_vle8_v_i8m1(input_a, n); input_a += n; + vint8m1_t in_b_i8v = __riscv_vle8_v_i8m1(input_b, n); input_b += n; + vint16m2_t a_i16v = __riscv_vwcvt_x_x_v_i16m2(in_a_i8v, n); + vint16m2_t b_i16v = __riscv_vwcvt_x_x_v_i16m2(in_b_i8v, n); + vint32m4_t a_i32v = __riscv_vwcvt_x_x_v_i32m4(a_i16v, n); + vint32m4_t b_i32v = __riscv_vwcvt_x_x_v_i32m4(b_i16v, n); + a_i32v = __riscv_vmul_vx_i32m4(a_i32v, a_multiplier, n); + b_i32v = __riscv_vmul_vx_i32m4(b_i32v, b_multiplier, n); + vint32m4_t acc_i32v = __riscv_vadd_vx_i32m4(a_i32v, bias, n); + acc_i32v = __riscv_vadd_vv_i32m4(acc_i32v, b_i32v, n); + vint32m4_t out_i32v = __riscv_vsra_vx_i32m4(acc_i32v, shift, n); + out_i32v = __riscv_vadd_vx_i32m4(out_i32v, output_zero_point, n); + out_i32v = __riscv_vmax_vx_i32m4(out_i32v, output_min, n); + out_i32v = __riscv_vmin_vx_i32m4(out_i32v, output_max, n); + vint16m2_t out_i16v = __riscv_vncvt_x_x_w_i16m2(out_i32v, n); + vint8m1_t out_i8v = __riscv_vncvt_x_x_w_i8m1(out_i16v, n); + __riscv_vse8_v_i8m1(output, out_i8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c b/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c new file mode 100644 index 00000000000..c6e7588970a --- /dev/null +++ b/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c @@ -0,0 +1,59 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vadd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vbinary.h" + + +void xnn_qs8_vadd_minmax_ukernel__rvv_u2v( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const struct xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t bias = params->scalar.bias; + const int32_t a_multiplier = params->scalar.a_multiplier; + const int32_t b_multiplier = params->scalar.b_multiplier; + const uint32_t shift = params->scalar.shift; + const int32_t output_min = params->scalar.output_min; + const int32_t output_max = params->scalar.output_max; + const int32_t output_zero_point = params->scalar.output_zero_point; + + do { + int32_t n = __riscv_vsetvl_e8m2(batch); batch -= n; + + vint8m2_t in_a_i8v = __riscv_vle8_v_i8m2(input_a, n); input_a += n; + vint8m2_t in_b_i8v = __riscv_vle8_v_i8m2(input_b, n); input_b += n; + vint16m4_t a_i16v = __riscv_vwcvt_x_x_v_i16m4(in_a_i8v, n); + vint16m4_t b_i16v = __riscv_vwcvt_x_x_v_i16m4(in_b_i8v, n); + vint32m8_t a_i32v = __riscv_vwcvt_x_x_v_i32m8(a_i16v, n); + vint32m8_t b_i32v = __riscv_vwcvt_x_x_v_i32m8(b_i16v, n); + a_i32v = __riscv_vmul_vx_i32m8(a_i32v, a_multiplier, n); + b_i32v = __riscv_vmul_vx_i32m8(b_i32v, b_multiplier, n); + vint32m8_t acc_i32v = __riscv_vadd_vx_i32m8(a_i32v, bias, n); + acc_i32v = __riscv_vadd_vv_i32m8(acc_i32v, b_i32v, n); + vint32m8_t out_i32v = __riscv_vsra_vx_i32m8(acc_i32v, shift, n); + out_i32v = __riscv_vadd_vx_i32m8(out_i32v, output_zero_point, n); + out_i32v = __riscv_vmax_vx_i32m8(out_i32v, output_min, n); + out_i32v = __riscv_vmin_vx_i32m8(out_i32v, output_max, n); + vint16m4_t out_i16v = __riscv_vncvt_x_x_w_i16m4(out_i32v, n); + vint8m2_t out_i8v = __riscv_vncvt_x_x_w_i8m2(out_i16v, n); + __riscv_vse8_v_i8m2(output, out_i8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c b/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c new file mode 100644 index 00000000000..65991ef2e6c --- /dev/null +++ b/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c @@ -0,0 +1,53 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vaddc/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vbinary.h" + + +void xnn_qs8_vaddc_minmax_ukernel__rvv_u1v( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const struct xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t bias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier; + const int32_t a_multiplier = params->scalar.a_multiplier; + const uint32_t shift = params->scalar.shift; + const int32_t output_min = params->scalar.output_min; + const int32_t output_max = params->scalar.output_max; + const int32_t output_zero_point = params->scalar.output_zero_point; + + do { + int32_t n = __riscv_vsetvl_e8m1(batch); batch -= n; + + vint8m1_t in_a_i8v = __riscv_vle8_v_i8m1(input_a, n); input_a += n; + vint16m2_t a_i16v = __riscv_vwcvt_x_x_v_i16m2(in_a_i8v, n); + vint32m4_t a_i32v = __riscv_vwcvt_x_x_v_i32m4(a_i16v, n); + a_i32v = __riscv_vmul_vx_i32m4(a_i32v, a_multiplier, n); + vint32m4_t acc_i32v = __riscv_vadd_vx_i32m4(a_i32v, bias, n); + vint32m4_t out_i32v = __riscv_vsra_vx_i32m4(acc_i32v, shift, n); + out_i32v = __riscv_vadd_vx_i32m4(out_i32v, output_zero_point, n); + out_i32v = __riscv_vmax_vx_i32m4(out_i32v, output_min, n); + out_i32v = __riscv_vmin_vx_i32m4(out_i32v, output_max, n); + vint16m2_t out_i16v = __riscv_vncvt_x_x_w_i16m2(out_i32v, n); + vint8m1_t out_i8v = __riscv_vncvt_x_x_w_i8m1(out_i16v, n); + __riscv_vse8_v_i8m1(output, out_i8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c b/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c new file mode 100644 index 00000000000..bc512319791 --- /dev/null +++ b/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c @@ -0,0 +1,53 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vaddc/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vbinary.h" + + +void xnn_qs8_vaddc_minmax_ukernel__rvv_u2v( + size_t batch, + const int8_t* input_a, + const int8_t* input_b, + int8_t* output, + const struct xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t bias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier; + const int32_t a_multiplier = params->scalar.a_multiplier; + const uint32_t shift = params->scalar.shift; + const int32_t output_min = params->scalar.output_min; + const int32_t output_max = params->scalar.output_max; + const int32_t output_zero_point = params->scalar.output_zero_point; + + do { + int32_t n = __riscv_vsetvl_e8m2(batch); batch -= n; + + vint8m2_t in_a_i8v = __riscv_vle8_v_i8m2(input_a, n); input_a += n; + vint16m4_t a_i16v = __riscv_vwcvt_x_x_v_i16m4(in_a_i8v, n); + vint32m8_t a_i32v = __riscv_vwcvt_x_x_v_i32m8(a_i16v, n); + a_i32v = __riscv_vmul_vx_i32m8(a_i32v, a_multiplier, n); + vint32m8_t acc_i32v = __riscv_vadd_vx_i32m8(a_i32v, bias, n); + vint32m8_t out_i32v = __riscv_vsra_vx_i32m8(acc_i32v, shift, n); + out_i32v = __riscv_vadd_vx_i32m8(out_i32v, output_zero_point, n); + out_i32v = __riscv_vmax_vx_i32m8(out_i32v, output_min, n); + out_i32v = __riscv_vmin_vx_i32m8(out_i32v, output_max, n); + vint16m4_t out_i16v = __riscv_vncvt_x_x_w_i16m4(out_i32v, n); + vint8m2_t out_i8v = __riscv_vncvt_x_x_w_i8m2(out_i16v, n); + __riscv_vse8_v_i8m2(output, out_i8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qu8-vadd/qu8-vadd-minmax.h b/src/qu8-vadd/qu8-vadd-minmax.h index 2e9658f7fe9..501045de1cf 100644 --- a/src/qu8-vadd/qu8-vadd-minmax.h +++ b/src/qu8-vadd/qu8-vadd-minmax.h @@ -46,6 +46,11 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vadd_minmax_ukernel__wasmsimd_u16, 16, false, XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vadd_minmax_ukernel__wasmsimd_u32, 32, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vadd_minmax_ukernel__rvv_u1v, 1, true, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vadd_minmax_ukernel__rvv_u2v, 2, true, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vadd_minmax_ukernel__scalar_u1, 1, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vadd_minmax_ukernel__scalar_u2, 2, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vadd_minmax_ukernel__scalar_u4, 4, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) diff --git a/src/qu8-vaddc/qu8-vaddc-minmax.h b/src/qu8-vaddc/qu8-vaddc-minmax.h index 08c23dd2a97..06c46f467dd 100644 --- a/src/qu8-vaddc/qu8-vaddc-minmax.h +++ b/src/qu8-vaddc/qu8-vaddc-minmax.h @@ -46,6 +46,11 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vaddc_minmax_ukernel__wasmsimd_u16, 16, false XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vaddc_minmax_ukernel__wasmsimd_u32, 32, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vaddc_minmax_ukernel__rvv_u1v, 1, true, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vaddc_minmax_ukernel__rvv_u2v, 2, true, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vaddc_minmax_ukernel__scalar_u1, 1, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vaddc_minmax_ukernel__scalar_u2, 2, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vaddc_minmax_ukernel__scalar_u4, 4, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) diff --git a/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c b/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c new file mode 100644 index 00000000000..6b1e27390d3 --- /dev/null +++ b/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c @@ -0,0 +1,62 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vadd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vbinary.h" + + +void xnn_qu8_vadd_minmax_ukernel__rvv_u1v( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const struct xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t bias = params->scalar.bias; + const int32_t a_multiplier = params->scalar.a_multiplier; + const int32_t b_multiplier = params->scalar.b_multiplier; + const uint32_t shift = params->scalar.shift; + const int32_t output_min = params->scalar.output_min; + const int32_t output_max = params->scalar.output_max; + const int32_t output_zero_point = params->scalar.output_zero_point; + + do { + int32_t n = __riscv_vsetvl_e8m1(batch); batch -= n; + + vuint8m1_t in_a_u8v = __riscv_vle8_v_u8m1(input_a, n); input_a += n; + vuint8m1_t in_b_u8v = __riscv_vle8_v_u8m1(input_b, n); input_b += n; + vuint16m2_t a_u16v = __riscv_vwcvtu_x_x_v_u16m2(in_a_u8v, n); + vuint16m2_t b_u16v = __riscv_vwcvtu_x_x_v_u16m2(in_b_u8v, n); + vint16m2_t a_i16v = __riscv_vreinterpret_v_u16m2_i16m2(a_u16v); + vint16m2_t b_i16v = __riscv_vreinterpret_v_u16m2_i16m2(b_u16v); + vint32m4_t a_i32v = __riscv_vwcvt_x_x_v_i32m4(a_i16v, n); + vint32m4_t b_i32v = __riscv_vwcvt_x_x_v_i32m4(b_i16v, n); + a_i32v = __riscv_vmul_vx_i32m4(a_i32v, a_multiplier, n); + b_i32v = __riscv_vmul_vx_i32m4(b_i32v, b_multiplier, n); + vint32m4_t acc_i32v = __riscv_vadd_vx_i32m4(a_i32v, bias, n); + acc_i32v = __riscv_vadd_vv_i32m4(acc_i32v, b_i32v, n); + vint32m4_t out_i32v = __riscv_vsra_vx_i32m4(acc_i32v, shift, n); + out_i32v = __riscv_vadd_vx_i32m4(out_i32v, output_zero_point, n); + out_i32v = __riscv_vmax_vx_i32m4(out_i32v, output_min, n); + out_i32v = __riscv_vmin_vx_i32m4(out_i32v, output_max, n); + vint16m2_t out_i16v = __riscv_vncvt_x_x_w_i16m2(out_i32v, n); + a_u16v = __riscv_vreinterpret_v_i16m2_u16m2(out_i16v); + vuint8m1_t out_u8v = __riscv_vncvt_x_x_w_u8m1(a_u16v, n); + __riscv_vse8_v_u8m1(output, out_u8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c b/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c new file mode 100644 index 00000000000..0bc7417d848 --- /dev/null +++ b/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c @@ -0,0 +1,62 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vadd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vbinary.h" + + +void xnn_qu8_vadd_minmax_ukernel__rvv_u2v( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const struct xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t bias = params->scalar.bias; + const int32_t a_multiplier = params->scalar.a_multiplier; + const int32_t b_multiplier = params->scalar.b_multiplier; + const uint32_t shift = params->scalar.shift; + const int32_t output_min = params->scalar.output_min; + const int32_t output_max = params->scalar.output_max; + const int32_t output_zero_point = params->scalar.output_zero_point; + + do { + int32_t n = __riscv_vsetvl_e8m2(batch); batch -= n; + + vuint8m2_t in_a_u8v = __riscv_vle8_v_u8m2(input_a, n); input_a += n; + vuint8m2_t in_b_u8v = __riscv_vle8_v_u8m2(input_b, n); input_b += n; + vuint16m4_t a_u16v = __riscv_vwcvtu_x_x_v_u16m4(in_a_u8v, n); + vuint16m4_t b_u16v = __riscv_vwcvtu_x_x_v_u16m4(in_b_u8v, n); + vint16m4_t a_i16v = __riscv_vreinterpret_v_u16m4_i16m4(a_u16v); + vint16m4_t b_i16v = __riscv_vreinterpret_v_u16m4_i16m4(b_u16v); + vint32m8_t a_i32v = __riscv_vwcvt_x_x_v_i32m8(a_i16v, n); + vint32m8_t b_i32v = __riscv_vwcvt_x_x_v_i32m8(b_i16v, n); + a_i32v = __riscv_vmul_vx_i32m8(a_i32v, a_multiplier, n); + b_i32v = __riscv_vmul_vx_i32m8(b_i32v, b_multiplier, n); + vint32m8_t acc_i32v = __riscv_vadd_vx_i32m8(a_i32v, bias, n); + acc_i32v = __riscv_vadd_vv_i32m8(acc_i32v, b_i32v, n); + vint32m8_t out_i32v = __riscv_vsra_vx_i32m8(acc_i32v, shift, n); + out_i32v = __riscv_vadd_vx_i32m8(out_i32v, output_zero_point, n); + out_i32v = __riscv_vmax_vx_i32m8(out_i32v, output_min, n); + out_i32v = __riscv_vmin_vx_i32m8(out_i32v, output_max, n); + vint16m4_t out_i16v = __riscv_vncvt_x_x_w_i16m4(out_i32v, n); + a_u16v = __riscv_vreinterpret_v_i16m4_u16m4(out_i16v); + vuint8m2_t out_u8v = __riscv_vncvt_x_x_w_u8m2(a_u16v, n); + __riscv_vse8_v_u8m2(output, out_u8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c b/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c new file mode 100644 index 00000000000..5632688ac9e --- /dev/null +++ b/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c @@ -0,0 +1,55 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vaddc/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vbinary.h" + + +void xnn_qu8_vaddc_minmax_ukernel__rvv_u1v( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const struct xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t bias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier; + const int32_t a_multiplier = params->scalar.a_multiplier; + const uint32_t shift = params->scalar.shift; + const int32_t output_min = params->scalar.output_min; + const int32_t output_max = params->scalar.output_max; + const int32_t output_zero_point = params->scalar.output_zero_point; + + do { + int32_t n = __riscv_vsetvl_e8m1(batch); batch -= n; + + vuint8m1_t in_a_u8v = __riscv_vle8_v_u8m1(input_a, n); input_a += n; + vuint16m2_t a_u16v = __riscv_vwcvtu_x_x_v_u16m2(in_a_u8v, n); + vint16m2_t a_i16v = __riscv_vreinterpret_v_u16m2_i16m2(a_u16v); + vint32m4_t a_i32v = __riscv_vwcvt_x_x_v_i32m4(a_i16v, n); + a_i32v = __riscv_vmul_vx_i32m4(a_i32v, a_multiplier, n); + vint32m4_t acc_i32v = __riscv_vadd_vx_i32m4(a_i32v, bias, n); + vint32m4_t out_i32v = __riscv_vsra_vx_i32m4(acc_i32v, shift, n); + out_i32v = __riscv_vadd_vx_i32m4(out_i32v, output_zero_point, n); + out_i32v = __riscv_vmax_vx_i32m4(out_i32v, output_min, n); + out_i32v = __riscv_vmin_vx_i32m4(out_i32v, output_max, n); + vint16m2_t out_i16v = __riscv_vncvt_x_x_w_i16m2(out_i32v, n); + a_u16v = __riscv_vreinterpret_v_i16m2_u16m2(out_i16v); + vuint8m1_t out_u8v = __riscv_vncvt_x_x_w_u8m1(a_u16v, n); + __riscv_vse8_v_u8m1(output, out_u8v, n); output += n; + } while (batch != 0); +} diff --git a/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c b/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c new file mode 100644 index 00000000000..17a6b4e05c6 --- /dev/null +++ b/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c @@ -0,0 +1,55 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-vaddc/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/vbinary.h" + + +void xnn_qu8_vaddc_minmax_ukernel__rvv_u2v( + size_t batch, + const uint8_t* input_a, + const uint8_t* input_b, + uint8_t* output, + const struct xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input_a != NULL); + assert(input_b != NULL); + assert(output != NULL); + + const int32_t bias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier; + const int32_t a_multiplier = params->scalar.a_multiplier; + const uint32_t shift = params->scalar.shift; + const int32_t output_min = params->scalar.output_min; + const int32_t output_max = params->scalar.output_max; + const int32_t output_zero_point = params->scalar.output_zero_point; + + do { + int32_t n = __riscv_vsetvl_e8m2(batch); batch -= n; + + vuint8m2_t in_a_u8v = __riscv_vle8_v_u8m2(input_a, n); input_a += n; + vuint16m4_t a_u16v = __riscv_vwcvtu_x_x_v_u16m4(in_a_u8v, n); + vint16m4_t a_i16v = __riscv_vreinterpret_v_u16m4_i16m4(a_u16v); + vint32m8_t a_i32v = __riscv_vwcvt_x_x_v_i32m8(a_i16v, n); + a_i32v = __riscv_vmul_vx_i32m8(a_i32v, a_multiplier, n); + vint32m8_t acc_i32v = __riscv_vadd_vx_i32m8(a_i32v, bias, n); + vint32m8_t out_i32v = __riscv_vsra_vx_i32m8(acc_i32v, shift, n); + out_i32v = __riscv_vadd_vx_i32m8(out_i32v, output_zero_point, n); + out_i32v = __riscv_vmax_vx_i32m8(out_i32v, output_min, n); + out_i32v = __riscv_vmin_vx_i32m8(out_i32v, output_max, n); + vint16m4_t out_i16v = __riscv_vncvt_x_x_w_i16m4(out_i32v, n); + a_u16v = __riscv_vreinterpret_v_i16m4_u16m4(out_i16v); + vuint8m2_t out_u8v = __riscv_vncvt_x_x_w_u8m2(a_u16v, n); + __riscv_vse8_v_u8m2(output, out_u8v, n); output += n; + } while (batch != 0); +} From 763c584ec281750a6f18ae53dc07162de9cb81c3 Mon Sep 17 00:00:00 2001 From: kaustubh-raste Date: Wed, 25 Sep 2024 11:58:02 +0100 Subject: [PATCH 3/4] Use vnclip+max instead of min+max+vncvt --- src/configs/binary-elementwise-config.c | 4 ++-- src/qs8-vadd/rvv.c.in | 5 ++--- src/qs8-vaddc/rvv.c.in | 5 ++--- src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c | 5 ++--- src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c | 5 ++--- src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c | 5 ++--- src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c | 5 ++--- src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c | 5 ++--- src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c | 5 ++--- src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c | 5 ++--- src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c | 5 ++--- 11 files changed, 22 insertions(+), 32 deletions(-) diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 3a91d1b7e9d..c1f258483ed 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -965,7 +965,7 @@ static void init_qs8_vadd_config(void) { qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__rvv_u2v; qs8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__rvv_u2v; qs8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__rvv_u2v; - qs8_vadd_config.init = xnn_init_qs8_add_minmax_scalar_params; + qs8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qs8_add_minmax_scalar_params; qs8_vadd_config.element_tile = hardware_config->vlenb; #else qs8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__scalar_u4; @@ -1110,7 +1110,7 @@ static void init_qu8_vadd_config(void) { qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__rvv_u2v; qu8_vadd_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__rvv_u2v; qu8_vadd_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__rvv_u2v; - qu8_vadd_config.init = xnn_init_qu8_add_minmax_scalar_params; + qu8_vadd_config.init = (xnn_init_binary_params_fn) xnn_init_qu8_add_minmax_scalar_params; qu8_vadd_config.element_tile = hardware_config->vlenb; #else qu8_vadd_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__scalar_u4; diff --git a/src/qs8-vadd/rvv.c.in b/src/qs8-vadd/rvv.c.in index fbda19177ad..78fd1b453de 100755 --- a/src/qs8-vadd/rvv.c.in +++ b/src/qs8-vadd/rvv.c.in @@ -57,9 +57,8 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__rvv_u${LMUL}v( acc_i32v = __riscv_vadd_vv_i32m${LMUL*4}(acc_i32v, b_i32v, n); vint32m${LMUL*4}_t out_i32v = __riscv_vsra_vx_i32m${LMUL*4}(acc_i32v, shift, n); out_i32v = __riscv_vadd_vx_i32m${LMUL*4}(out_i32v, output_zero_point, n); - out_i32v = __riscv_vmax_vx_i32m${LMUL*4}(out_i32v, output_min, n); - out_i32v = __riscv_vmin_vx_i32m${LMUL*4}(out_i32v, output_max, n); - vint16m${LMUL*2}_t out_i16v = __riscv_vncvt_x_x_w_i16m${LMUL*2}(out_i32v, n); + vint16m${LMUL*2}_t out_i16v = __riscv_vnclip_wx_i16m${LMUL*2}(out_i32v, output_max, __RISCV_VXRM_RNE, n); + out_i16v = __riscv_vmax_vx_i16m${LMUL*2}(out_i16v, output_min, n); $if DATATYPE == "QS8": vint8m${LMUL}_t out_i8v = __riscv_vncvt_x_x_w_i8m${LMUL}(out_i16v, n); __riscv_vse8_v_i8m${LMUL}(output, out_i8v, n); output += n; diff --git a/src/qs8-vaddc/rvv.c.in b/src/qs8-vaddc/rvv.c.in index e9a3fc8f4da..ffc01e31e59 100755 --- a/src/qs8-vaddc/rvv.c.in +++ b/src/qs8-vaddc/rvv.c.in @@ -48,9 +48,8 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__rvv_u${LMUL}v( vint32m${LMUL*4}_t acc_i32v = __riscv_vadd_vx_i32m${LMUL*4}(a_i32v, bias, n); vint32m${LMUL*4}_t out_i32v = __riscv_vsra_vx_i32m${LMUL*4}(acc_i32v, shift, n); out_i32v = __riscv_vadd_vx_i32m${LMUL*4}(out_i32v, output_zero_point, n); - out_i32v = __riscv_vmax_vx_i32m${LMUL*4}(out_i32v, output_min, n); - out_i32v = __riscv_vmin_vx_i32m${LMUL*4}(out_i32v, output_max, n); - vint16m${LMUL*2}_t out_i16v = __riscv_vncvt_x_x_w_i16m${LMUL*2}(out_i32v, n); + vint16m${LMUL*2}_t out_i16v = __riscv_vnclip_wx_i16m${LMUL*2}(out_i32v, output_max, __RISCV_VXRM_RNE, n); + out_i16v = __riscv_vmax_vx_i16m${LMUL*2}(out_i16v, output_min, n); $if DATATYPE == "QS8": vint8m${LMUL}_t out_i8v = __riscv_vncvt_x_x_w_i8m${LMUL}(out_i16v, n); __riscv_vse8_v_i8m${LMUL}(output, out_i8v, n); output += n; diff --git a/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c b/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c index e72fd205a6d..71ede41724b 100644 --- a/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c +++ b/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c @@ -50,9 +50,8 @@ void xnn_qs8_vadd_minmax_ukernel__rvv_u1v( acc_i32v = __riscv_vadd_vv_i32m4(acc_i32v, b_i32v, n); vint32m4_t out_i32v = __riscv_vsra_vx_i32m4(acc_i32v, shift, n); out_i32v = __riscv_vadd_vx_i32m4(out_i32v, output_zero_point, n); - out_i32v = __riscv_vmax_vx_i32m4(out_i32v, output_min, n); - out_i32v = __riscv_vmin_vx_i32m4(out_i32v, output_max, n); - vint16m2_t out_i16v = __riscv_vncvt_x_x_w_i16m2(out_i32v, n); + vint16m2_t out_i16v = __riscv_vnclip_wx_i16m2(out_i32v, output_max, __RISCV_VXRM_RNE, n); + out_i16v = __riscv_vmax_vx_i16m2(out_i16v, output_min, n); vint8m1_t out_i8v = __riscv_vncvt_x_x_w_i8m1(out_i16v, n); __riscv_vse8_v_i8m1(output, out_i8v, n); output += n; } while (batch != 0); diff --git a/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c b/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c index c6e7588970a..cc495efb25a 100644 --- a/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c +++ b/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c @@ -50,9 +50,8 @@ void xnn_qs8_vadd_minmax_ukernel__rvv_u2v( acc_i32v = __riscv_vadd_vv_i32m8(acc_i32v, b_i32v, n); vint32m8_t out_i32v = __riscv_vsra_vx_i32m8(acc_i32v, shift, n); out_i32v = __riscv_vadd_vx_i32m8(out_i32v, output_zero_point, n); - out_i32v = __riscv_vmax_vx_i32m8(out_i32v, output_min, n); - out_i32v = __riscv_vmin_vx_i32m8(out_i32v, output_max, n); - vint16m4_t out_i16v = __riscv_vncvt_x_x_w_i16m4(out_i32v, n); + vint16m4_t out_i16v = __riscv_vnclip_wx_i16m4(out_i32v, output_max, __RISCV_VXRM_RNE, n); + out_i16v = __riscv_vmax_vx_i16m4(out_i16v, output_min, n); vint8m2_t out_i8v = __riscv_vncvt_x_x_w_i8m2(out_i16v, n); __riscv_vse8_v_i8m2(output, out_i8v, n); output += n; } while (batch != 0); diff --git a/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c b/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c index 65991ef2e6c..44c9d854299 100644 --- a/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c +++ b/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c @@ -44,9 +44,8 @@ void xnn_qs8_vaddc_minmax_ukernel__rvv_u1v( vint32m4_t acc_i32v = __riscv_vadd_vx_i32m4(a_i32v, bias, n); vint32m4_t out_i32v = __riscv_vsra_vx_i32m4(acc_i32v, shift, n); out_i32v = __riscv_vadd_vx_i32m4(out_i32v, output_zero_point, n); - out_i32v = __riscv_vmax_vx_i32m4(out_i32v, output_min, n); - out_i32v = __riscv_vmin_vx_i32m4(out_i32v, output_max, n); - vint16m2_t out_i16v = __riscv_vncvt_x_x_w_i16m2(out_i32v, n); + vint16m2_t out_i16v = __riscv_vnclip_wx_i16m2(out_i32v, output_max, __RISCV_VXRM_RNE, n); + out_i16v = __riscv_vmax_vx_i16m2(out_i16v, output_min, n); vint8m1_t out_i8v = __riscv_vncvt_x_x_w_i8m1(out_i16v, n); __riscv_vse8_v_i8m1(output, out_i8v, n); output += n; } while (batch != 0); diff --git a/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c b/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c index bc512319791..06daea3a11b 100644 --- a/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c +++ b/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c @@ -44,9 +44,8 @@ void xnn_qs8_vaddc_minmax_ukernel__rvv_u2v( vint32m8_t acc_i32v = __riscv_vadd_vx_i32m8(a_i32v, bias, n); vint32m8_t out_i32v = __riscv_vsra_vx_i32m8(acc_i32v, shift, n); out_i32v = __riscv_vadd_vx_i32m8(out_i32v, output_zero_point, n); - out_i32v = __riscv_vmax_vx_i32m8(out_i32v, output_min, n); - out_i32v = __riscv_vmin_vx_i32m8(out_i32v, output_max, n); - vint16m4_t out_i16v = __riscv_vncvt_x_x_w_i16m4(out_i32v, n); + vint16m4_t out_i16v = __riscv_vnclip_wx_i16m4(out_i32v, output_max, __RISCV_VXRM_RNE, n); + out_i16v = __riscv_vmax_vx_i16m4(out_i16v, output_min, n); vint8m2_t out_i8v = __riscv_vncvt_x_x_w_i8m2(out_i16v, n); __riscv_vse8_v_i8m2(output, out_i8v, n); output += n; } while (batch != 0); diff --git a/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c b/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c index 6b1e27390d3..e2bf648875e 100644 --- a/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c +++ b/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c @@ -52,9 +52,8 @@ void xnn_qu8_vadd_minmax_ukernel__rvv_u1v( acc_i32v = __riscv_vadd_vv_i32m4(acc_i32v, b_i32v, n); vint32m4_t out_i32v = __riscv_vsra_vx_i32m4(acc_i32v, shift, n); out_i32v = __riscv_vadd_vx_i32m4(out_i32v, output_zero_point, n); - out_i32v = __riscv_vmax_vx_i32m4(out_i32v, output_min, n); - out_i32v = __riscv_vmin_vx_i32m4(out_i32v, output_max, n); - vint16m2_t out_i16v = __riscv_vncvt_x_x_w_i16m2(out_i32v, n); + vint16m2_t out_i16v = __riscv_vnclip_wx_i16m2(out_i32v, output_max, __RISCV_VXRM_RNE, n); + out_i16v = __riscv_vmax_vx_i16m2(out_i16v, output_min, n); a_u16v = __riscv_vreinterpret_v_i16m2_u16m2(out_i16v); vuint8m1_t out_u8v = __riscv_vncvt_x_x_w_u8m1(a_u16v, n); __riscv_vse8_v_u8m1(output, out_u8v, n); output += n; diff --git a/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c b/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c index 0bc7417d848..8cb9af0d519 100644 --- a/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c +++ b/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c @@ -52,9 +52,8 @@ void xnn_qu8_vadd_minmax_ukernel__rvv_u2v( acc_i32v = __riscv_vadd_vv_i32m8(acc_i32v, b_i32v, n); vint32m8_t out_i32v = __riscv_vsra_vx_i32m8(acc_i32v, shift, n); out_i32v = __riscv_vadd_vx_i32m8(out_i32v, output_zero_point, n); - out_i32v = __riscv_vmax_vx_i32m8(out_i32v, output_min, n); - out_i32v = __riscv_vmin_vx_i32m8(out_i32v, output_max, n); - vint16m4_t out_i16v = __riscv_vncvt_x_x_w_i16m4(out_i32v, n); + vint16m4_t out_i16v = __riscv_vnclip_wx_i16m4(out_i32v, output_max, __RISCV_VXRM_RNE, n); + out_i16v = __riscv_vmax_vx_i16m4(out_i16v, output_min, n); a_u16v = __riscv_vreinterpret_v_i16m4_u16m4(out_i16v); vuint8m2_t out_u8v = __riscv_vncvt_x_x_w_u8m2(a_u16v, n); __riscv_vse8_v_u8m2(output, out_u8v, n); output += n; diff --git a/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c b/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c index 5632688ac9e..1641df58ce5 100644 --- a/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c +++ b/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c @@ -45,9 +45,8 @@ void xnn_qu8_vaddc_minmax_ukernel__rvv_u1v( vint32m4_t acc_i32v = __riscv_vadd_vx_i32m4(a_i32v, bias, n); vint32m4_t out_i32v = __riscv_vsra_vx_i32m4(acc_i32v, shift, n); out_i32v = __riscv_vadd_vx_i32m4(out_i32v, output_zero_point, n); - out_i32v = __riscv_vmax_vx_i32m4(out_i32v, output_min, n); - out_i32v = __riscv_vmin_vx_i32m4(out_i32v, output_max, n); - vint16m2_t out_i16v = __riscv_vncvt_x_x_w_i16m2(out_i32v, n); + vint16m2_t out_i16v = __riscv_vnclip_wx_i16m2(out_i32v, output_max, __RISCV_VXRM_RNE, n); + out_i16v = __riscv_vmax_vx_i16m2(out_i16v, output_min, n); a_u16v = __riscv_vreinterpret_v_i16m2_u16m2(out_i16v); vuint8m1_t out_u8v = __riscv_vncvt_x_x_w_u8m1(a_u16v, n); __riscv_vse8_v_u8m1(output, out_u8v, n); output += n; diff --git a/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c b/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c index 17a6b4e05c6..a46e9e6d9a6 100644 --- a/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c +++ b/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c @@ -45,9 +45,8 @@ void xnn_qu8_vaddc_minmax_ukernel__rvv_u2v( vint32m8_t acc_i32v = __riscv_vadd_vx_i32m8(a_i32v, bias, n); vint32m8_t out_i32v = __riscv_vsra_vx_i32m8(acc_i32v, shift, n); out_i32v = __riscv_vadd_vx_i32m8(out_i32v, output_zero_point, n); - out_i32v = __riscv_vmax_vx_i32m8(out_i32v, output_min, n); - out_i32v = __riscv_vmin_vx_i32m8(out_i32v, output_max, n); - vint16m4_t out_i16v = __riscv_vncvt_x_x_w_i16m4(out_i32v, n); + vint16m4_t out_i16v = __riscv_vnclip_wx_i16m4(out_i32v, output_max, __RISCV_VXRM_RNE, n); + out_i16v = __riscv_vmax_vx_i16m4(out_i16v, output_min, n); a_u16v = __riscv_vreinterpret_v_i16m4_u16m4(out_i16v); vuint8m2_t out_u8v = __riscv_vncvt_x_x_w_u8m2(a_u16v, n); __riscv_vse8_v_u8m2(output, out_u8v, n); output += n; From a57be02c764045dd46befebdb8162bd3fdbfccf1 Mon Sep 17 00:00:00 2001 From: kaustubh-raste Date: Thu, 3 Oct 2024 07:44:00 +0100 Subject: [PATCH 4/4] Fix the path of generated rvv files --- cmake/gen/rvv_microkernels.cmake | 16 ++++++++-------- gen/rvv_microkernels.bzl | 16 ++++++++-------- scripts/generate-qs8-vadd.sh | 16 ++++++++-------- .../gen/qs8-vadd-minmax-rvv-u1v.c | 0 .../gen/qs8-vadd-minmax-rvv-u2v.c | 0 .../gen/qs8-vaddc-minmax-rvv-u1v.c | 0 .../gen/qs8-vaddc-minmax-rvv-u2v.c | 0 .../gen/qu8-vadd-minmax-rvv-u1v.c | 0 .../gen/qu8-vadd-minmax-rvv-u2v.c | 0 .../gen/qu8-vaddc-minmax-rvv-u1v.c | 0 .../gen/qu8-vaddc-minmax-rvv-u2v.c | 0 11 files changed, 24 insertions(+), 24 deletions(-) rename src/{qs8-vmul => qs8-vadd}/gen/qs8-vadd-minmax-rvv-u1v.c (100%) rename src/{qs8-vmul => qs8-vadd}/gen/qs8-vadd-minmax-rvv-u2v.c (100%) rename src/{qs8-vmulc => qs8-vaddc}/gen/qs8-vaddc-minmax-rvv-u1v.c (100%) rename src/{qs8-vmulc => qs8-vaddc}/gen/qs8-vaddc-minmax-rvv-u2v.c (100%) rename src/{qu8-vmul => qu8-vadd}/gen/qu8-vadd-minmax-rvv-u1v.c (100%) rename src/{qu8-vmul => qu8-vadd}/gen/qu8-vadd-minmax-rvv-u2v.c (100%) rename src/{qu8-vmulc => qu8-vaddc}/gen/qu8-vaddc-minmax-rvv-u1v.c (100%) rename src/{qu8-vmulc => qu8-vaddc}/gen/qu8-vaddc-minmax-rvv-u2v.c (100%) diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index c6821e7fced..f9f20d78a2c 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -55,14 +55,14 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-vrnd/gen/f32-vrndz-rvv-u4v.c src/f32-vrsqrt/gen/f32-vrsqrt-rvv-rsqrt-u4v.c src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c - src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c + src/qs8-vadd/gen/qs8-vadd-minmax-rvv-u2v.c + src/qs8-vaddc/gen/qs8-vaddc-minmax-rvv-u2v.c src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u2v.c - src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u2v.c src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c - src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c + src/qu8-vadd/gen/qu8-vadd-minmax-rvv-u2v.c + src/qu8-vaddc/gen/qu8-vaddc-minmax-rvv-u2v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c - src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c src/x32-transposec/gen/x32-transposec-4x4-rvv.c @@ -185,14 +185,14 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u1v.c - src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c + src/qs8-vadd/gen/qs8-vadd-minmax-rvv-u1v.c + src/qs8-vaddc/gen/qs8-vaddc-minmax-rvv-u1v.c src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u1v.c - src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u1v.c src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u1v.c - src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c + src/qu8-vadd/gen/qu8-vadd-minmax-rvv-u1v.c + src/qu8-vaddc/gen/qu8-vaddc-minmax-rvv-u1v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c - src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u4.c diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index cd5e4d1c5b5..d0900b5adaa 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -51,14 +51,14 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-vrnd/gen/f32-vrndz-rvv-u4v.c", "src/f32-vrsqrt/gen/f32-vrsqrt-rvv-rsqrt-u4v.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c", - "src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c", + "src/qs8-vadd/gen/qs8-vadd-minmax-rvv-u2v.c", + "src/qs8-vaddc/gen/qs8-vaddc-minmax-rvv-u2v.c", "src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u2v.c", - "src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u2v.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c", - "src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c", + "src/qu8-vadd/gen/qu8-vadd-minmax-rvv-u2v.c", + "src/qu8-vaddc/gen/qu8-vaddc-minmax-rvv-u2v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c", - "src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c", "src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c", "src/x32-transposec/gen/x32-transposec-4x4-rvv.c", @@ -182,14 +182,14 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u1v.c", - "src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c", + "src/qs8-vadd/gen/qs8-vadd-minmax-rvv-u1v.c", + "src/qs8-vaddc/gen/qs8-vaddc-minmax-rvv-u1v.c", "src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u1v.c", - "src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u1v.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u1v.c", - "src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c", + "src/qu8-vadd/gen/qu8-vadd-minmax-rvv-u1v.c", + "src/qu8-vaddc/gen/qu8-vaddc-minmax-rvv-u1v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c", - "src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u4.c", diff --git a/scripts/generate-qs8-vadd.sh b/scripts/generate-qs8-vadd.sh index 7df87238abb..6e0ed10cf84 100755 --- a/scripts/generate-qs8-vadd.sh +++ b/scripts/generate-qs8-vadd.sh @@ -70,17 +70,17 @@ tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=32 -D LD128=0 -D DATATYPE=QU8 tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=16 -D LD128=1 -D DATATYPE=QU8 -o src/qu8-vaddc/gen/qu8-vaddc-minmax-neon-ld128-u16.c & ################################ RISC-V Vector ################################ -tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=1 -D DATATYPE=QS8 -o src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c & -tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=2 -D DATATYPE=QS8 -o src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c & +tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=1 -D DATATYPE=QS8 -o src/qs8-vadd/gen/qs8-vadd-minmax-rvv-u1v.c & +tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=2 -D DATATYPE=QS8 -o src/qs8-vadd/gen/qs8-vadd-minmax-rvv-u2v.c & -tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=1 -D DATATYPE=QU8 -o src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c & -tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=2 -D DATATYPE=QU8 -o src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c & +tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=1 -D DATATYPE=QU8 -o src/qu8-vadd/gen/qu8-vadd-minmax-rvv-u1v.c & +tools/xngen src/qs8-vadd/rvv.c.in -D LMUL=2 -D DATATYPE=QU8 -o src/qu8-vadd/gen/qu8-vadd-minmax-rvv-u2v.c & -tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=1 -D DATATYPE=QS8 -o src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c & -tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=2 -D DATATYPE=QS8 -o src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c & +tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=1 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/qs8-vaddc-minmax-rvv-u1v.c & +tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=2 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/qs8-vaddc-minmax-rvv-u2v.c & -tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=1 -D DATATYPE=QU8 -o src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c & -tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=2 -D DATATYPE=QU8 -o src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c & +tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=1 -D DATATYPE=QU8 -o src/qu8-vaddc/gen/qu8-vaddc-minmax-rvv-u1v.c & +tools/xngen src/qs8-vaddc/rvv.c.in -D LMUL=2 -D DATATYPE=QU8 -o src/qu8-vaddc/gen/qu8-vaddc-minmax-rvv-u2v.c & ################################### x86 SSE ################################### tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=8 -D SSE=2 -D AVX=0 -D DATATYPE=QS8 -o src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u8.c & diff --git a/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c b/src/qs8-vadd/gen/qs8-vadd-minmax-rvv-u1v.c similarity index 100% rename from src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u1v.c rename to src/qs8-vadd/gen/qs8-vadd-minmax-rvv-u1v.c diff --git a/src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c b/src/qs8-vadd/gen/qs8-vadd-minmax-rvv-u2v.c similarity index 100% rename from src/qs8-vmul/gen/qs8-vadd-minmax-rvv-u2v.c rename to src/qs8-vadd/gen/qs8-vadd-minmax-rvv-u2v.c diff --git a/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c b/src/qs8-vaddc/gen/qs8-vaddc-minmax-rvv-u1v.c similarity index 100% rename from src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u1v.c rename to src/qs8-vaddc/gen/qs8-vaddc-minmax-rvv-u1v.c diff --git a/src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c b/src/qs8-vaddc/gen/qs8-vaddc-minmax-rvv-u2v.c similarity index 100% rename from src/qs8-vmulc/gen/qs8-vaddc-minmax-rvv-u2v.c rename to src/qs8-vaddc/gen/qs8-vaddc-minmax-rvv-u2v.c diff --git a/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c b/src/qu8-vadd/gen/qu8-vadd-minmax-rvv-u1v.c similarity index 100% rename from src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u1v.c rename to src/qu8-vadd/gen/qu8-vadd-minmax-rvv-u1v.c diff --git a/src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c b/src/qu8-vadd/gen/qu8-vadd-minmax-rvv-u2v.c similarity index 100% rename from src/qu8-vmul/gen/qu8-vadd-minmax-rvv-u2v.c rename to src/qu8-vadd/gen/qu8-vadd-minmax-rvv-u2v.c diff --git a/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c b/src/qu8-vaddc/gen/qu8-vaddc-minmax-rvv-u1v.c similarity index 100% rename from src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u1v.c rename to src/qu8-vaddc/gen/qu8-vaddc-minmax-rvv-u1v.c diff --git a/src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c b/src/qu8-vaddc/gen/qu8-vaddc-minmax-rvv-u2v.c similarity index 100% rename from src/qu8-vmulc/gen/qu8-vaddc-minmax-rvv-u2v.c rename to src/qu8-vaddc/gen/qu8-vaddc-minmax-rvv-u2v.c