From 86e94db839cb27dc8c48fac9ff0ba3fe03afc2c1 Mon Sep 17 00:00:00 2001
From: Quentin Khan
Date: Tue, 1 Oct 2024 07:38:29 -0700
Subject: [PATCH] Add `sum_nd_f32|f16|qs8|qu8` operator.

PiperOrigin-RevId: 681013868
---
 build_srcs.bzl                   |    1 +
 include/xnnpack.h                |  103 +++
 src/operators/reduce-nd.c        |  275 +++
 src/subgraph/static-sum.c        |  366 ++++
 src/xnnpack/node-type-defs.h     |    1 +
 src/xnnpack/operator-type-defs.h |    4 +
 test/BUILD.bazel                 |   27 +
 test/static-sum.cc               | 1168 ++++++++++++++++++++++++++++++
 test/sum-nd.cc                   |  875 ++++++++++++++++++++++
 test/sum-operator-tester.h       |  633 ++++++++++++++++
 10 files changed, 3453 insertions(+)
 create mode 100644 src/subgraph/static-sum.c
 create mode 100644 test/static-sum.cc
 create mode 100644 test/sum-nd.cc
 create mode 100644 test/sum-operator-tester.h

diff --git a/build_srcs.bzl b/build_srcs.bzl
index a4f0471792e..7a89891a16d 100644
--- a/build_srcs.bzl
+++ b/build_srcs.bzl
@@ -85,6 +85,7 @@ SUBGRAPH_SRCS = [
     "src/subgraph/static-mean.c",
     "src/subgraph/static-resize-bilinear-2d.c",
     "src/subgraph/static-slice.c",
+    "src/subgraph/static-sum.c",
     "src/subgraph/static-transpose.c",
     "src/subgraph/tanh.c",
     "src/subgraph/unpooling-2d.c",
diff --git a/include/xnnpack.h b/include/xnnpack.h
index 579b6c04ef4..4a274c9eaff 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -1334,6 +1334,26 @@ enum xnn_status xnn_define_static_mean(
   uint32_t output_id,
   uint32_t flags);
 
+/// Define a Sum Node and add it to a Subgraph.
+///
+/// @param subgraph - a Subgraph object that will own the created Node.
+/// @param num_reduction_axes - number of axes along which sum is computed.
+/// @param reduction_axes - axes along which sum is computed.
+/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with at least
+///                   @a num_reduction_axes dimensions defined in the @a subgraph.
+/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor defined in the
+///                    @a subgraph with @a num_reduction_axes fewer dimensions than the input tensor (if
+///                    XNN_FLAG_KEEP_DIMS is not specified), or with the same rank but with the dimensions at
+///                    @a reduction_axes reduced to 1 (if XNN_FLAG_KEEP_DIMS is specified).
+/// @param flags - binary features of the Sum Node. The only currently supported value is XNN_FLAG_KEEP_DIMS.
+enum xnn_status xnn_define_static_sum(
+  xnn_subgraph_t subgraph,
+  size_t num_reduction_axes,
+  const size_t* reduction_axes,
+  uint32_t input_id,
+  uint32_t output_id,
+  uint32_t flags);
+
 /// Define a 2-Input Concatenate Node and add it to a Subgraph.
 ///
 /// The 2-Input Concatenate Node concatenates two tensors along a specified axis.
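For context, a minimal sketch (not part of this patch) of how the new xnn_define_static_sum entry point is intended to be driven, following the pattern of the subgraph tests added below. The 2x3x4 input shape, the {0, 2} reduction axes, and the use of XNN_FLAG_KEEP_DIMS are illustrative only.

    #include <stddef.h>
    #include <stdint.h>

    #include "xnnpack.h"

    // Bail out on the first failing XNNPACK call; real code would log the status.
    #define CHECK_XNN(expr)                            \
      do {                                             \
        if ((expr) != xnn_status_success) return 1;    \
      } while (0)

    int main(void) {
      // Illustrative data: sum a 2x3x4 FP32 tensor over axes {0, 2}; with
      // XNN_FLAG_KEEP_DIMS the output keeps rank 3, with shape 1x3x1.
      float input[2 * 3 * 4] = {0.0f};  // fill with real data in practice
      float output[1 * 3 * 1];

      CHECK_XNN(xnn_initialize(/*allocator=*/NULL));

      xnn_subgraph_t subgraph = NULL;
      CHECK_XNN(xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));

      const size_t input_dims[3] = {2, 3, 4};
      const size_t output_dims[3] = {1, 3, 1};
      uint32_t input_id = XNN_INVALID_VALUE_ID;
      uint32_t output_id = XNN_INVALID_VALUE_ID;
      CHECK_XNN(xnn_define_tensor_value(
          subgraph, xnn_datatype_fp32, 3, input_dims, /*data=*/NULL,
          /*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
      CHECK_XNN(xnn_define_tensor_value(
          subgraph, xnn_datatype_fp32, 3, output_dims, /*data=*/NULL,
          /*external_id=*/1, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));

      // The new Sum Node: reduction axes must be unique and in ascending order.
      const size_t reduction_axes[2] = {0, 2};
      CHECK_XNN(xnn_define_static_sum(subgraph, 2, reduction_axes,
                                      input_id, output_id, XNN_FLAG_KEEP_DIMS));

      xnn_runtime_t runtime = NULL;
      CHECK_XNN(xnn_create_runtime_v3(subgraph, /*weights_cache=*/NULL,
                                      /*threadpool=*/NULL, /*flags=*/0, &runtime));

      const struct xnn_external_value externals[2] = {
          {input_id, input}, {output_id, output}};
      CHECK_XNN(xnn_setup_runtime(runtime, 2, externals));
      CHECK_XNN(xnn_invoke_runtime(runtime));

      xnn_delete_runtime(runtime);
      xnn_delete_subgraph(subgraph);
      return 0;
    }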
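The operator-level entry points declared in the second xnnpack.h hunk below follow the existing create/reshape/setup pattern of the mean_nd operators; the f16/qs8/qu8 variants additionally return a workspace size from reshape and take a workspace pointer in setup, while the f32 variant does not need one. A short sketch of the f32 path (not part of this patch; the 4x8 shape and row-reduction axis are illustrative):

    #include <stddef.h>

    #include "xnnpack.h"

    // Sums each row of a 4x8 FP32 tensor into a 4-element output.
    // Assumes xnn_initialize() has already been called.
    static enum xnn_status sum_rows_f32(const float* input, float* output,
                                        pthreadpool_t threadpool) {
      xnn_operator_t sum_op = NULL;
      enum xnn_status status = xnn_create_sum_nd_f32(/*flags=*/0, &sum_op);
      if (status != xnn_status_success) {
        return status;
      }

      const size_t input_shape[2] = {4, 8};
      const size_t reduction_axes[1] = {1};  // reduce over the innermost axis
      status = xnn_reshape_sum_nd_f32(sum_op, 1, reduction_axes,
                                      2, input_shape, threadpool);
      if (status == xnn_status_success) {
        status = xnn_setup_sum_nd_f32(sum_op, input, output);
      }
      if (status == xnn_status_success) {
        status = xnn_run_operator(sum_op, threadpool);
      }
      xnn_delete_operator(sum_op);
      return status;
    }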
@@ -5080,6 +5100,89 @@ enum xnn_status xnn_setup_mean_nd_qu8( const void* input, void* output); +enum xnn_status xnn_create_sum_nd_f16( + uint32_t flags, + xnn_operator_t* sum_op_out); + +enum xnn_status xnn_create_sum_nd_f32( + uint32_t flags, + xnn_operator_t* sum_op_out); + +enum xnn_status xnn_create_sum_nd_qs8( + float scale, + int8_t input_zero_point, + int8_t output_zero_point, + uint32_t flags, + xnn_operator_t* sum_op_out); + +enum xnn_status xnn_create_sum_nd_qu8( + float scale, + uint8_t input_zero_point, + uint8_t output_zero_point, + uint32_t flags, + xnn_operator_t* sum_op_out); + +enum xnn_status xnn_reshape_sum_nd_f16( + xnn_operator_t sum_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_reshape_sum_nd_f32( + xnn_operator_t sum_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_reshape_sum_nd_qs8( + xnn_operator_t sum_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_reshape_sum_nd_qu8( + xnn_operator_t sum_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_sum_nd_f16( + xnn_operator_t sum_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_setup_sum_nd_f32( + xnn_operator_t sum_op, + const float* input, + float* output); + +enum xnn_status xnn_setup_sum_nd_qs8( + xnn_operator_t sum_op, + void* workspace, + const void* input, + void* output); + +enum xnn_status xnn_setup_sum_nd_qu8( + xnn_operator_t sum_op, + void* workspace, + const void* input, + void* output); + enum xnn_status xnn_create_negate_nc_f16( uint32_t flags, xnn_operator_t* negate_op_out); diff --git a/src/operators/reduce-nd.c b/src/operators/reduce-nd.c index a808a0dd3c1..81673f569e3 100644 --- a/src/operators/reduce-nd.c +++ b/src/operators/reduce-nd.c @@ -596,3 +596,278 @@ enum xnn_status xnn_setup_mean_nd_qu8( workspace, input, output, xnn_operator_type_mean_nd_qu8); } + +enum xnn_status xnn_create_sum_nd_f16( + uint32_t flags, + xnn_operator_t* sum_op_out) +{ + const struct xnn_reduce_config* rsum_config = xnn_init_f16_f32acc_rsum_config(); + const struct xnn_reduce_config* rdsum_config = xnn_init_f16_f32acc_rdsum_config(); + const struct xnn_unary_elementwise_config* f32_to_f16_cvt_config = xnn_init_f32_to_f16_cvt_config(); + if (rdsum_config == NULL || rsum_config == NULL || f32_to_f16_cvt_config == NULL) { + xnn_log_error("failed to create %s operator: unsupported hardware configuration", + xnn_operator_type_to_string(xnn_operator_type_sum_nd_f16)); + return xnn_status_unsupported_hardware; + } + struct f16_f32acc_mean_params params; + rsum_config->init.f16_f32acc_scale(¶ms.f16_f32acc_scale, /*scale=*/1.0f); + return create_mean_nd( + flags, + /*log2_element_size=*/XNN_LOG2_SIZEOF_HALF, + xnn_operator_type_sum_nd_f16, + rdsum_config, rsum_config, f32_to_f16_cvt_config, /*s32_f32_cvt_config=*/NULL, + /*u32_f32_cvt_config=*/NULL, ¶ms, sizeof(params), sum_op_out); +} + +static void update_params_sum_f16( + xnn_operator_t 
sum_op, + size_t num_elements) +{ + const float scale = 1.0f; + sum_op->rsum_config->init.f16_f32acc_scale(&sum_op->params.mean_params.f16_f32acc_scale, scale); +} + +enum xnn_status xnn_reshape_sum_nd_f16( + xnn_operator_t sum_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool) +{ + return reshape_mean_nd( + sum_op, + num_reduction_axes, reduction_axes, + num_input_dims, input_shape, + workspace_size, workspace_alignment, + /*log2_data_element_size=*/XNN_LOG2_SIZEOF_HALF, + /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_FLOAT, + xnn_operator_type_sum_nd_f16, + /*scale_params=*/&sum_op->params.mean_params.f16_f32acc_scale, + /*scale_params_size=*/sizeof(sum_op->params.mean_params.f16_f32acc_scale), + update_params_sum_f16, + threadpool); +} + +enum xnn_status xnn_setup_sum_nd_f16( + xnn_operator_t sum_op, + void* workspace, + const void* input, + void* output) +{ + return setup_mean_nd( + sum_op, + workspace, input, output, + xnn_operator_type_sum_nd_f16); +} + +enum xnn_status xnn_create_sum_nd_f32( + uint32_t flags, + xnn_operator_t* sum_op_out) +{ + const struct xnn_reduce_config* rsum_config = xnn_init_f32_rsum_config(); + const struct xnn_reduce_config* rdsum_config = xnn_init_f32_rdsum_config(); + if (rdsum_config == NULL || rsum_config == NULL) { + xnn_log_error("failed to create %s operator: unsupported hardware configuration", + xnn_operator_type_to_string(xnn_operator_type_sum_nd_f32)); + return xnn_status_unsupported_hardware; + } + + struct xnn_f32_scaleminmax_params params; + rsum_config->init.f32_scaleminmax(¶ms, /*scale=*/1.0f, /*min=*/-INFINITY, /*max=*/INFINITY); + return create_mean_nd( + flags, + /*log2_element_size=*/XNN_LOG2_SIZEOF_FLOAT, + xnn_operator_type_sum_nd_f32, + rdsum_config, rsum_config, /*cvt_config=*/NULL, /*s32_f32_cvt_config=*/NULL, + /*u32_f32_cvt_config=*/NULL, ¶ms, sizeof(params), sum_op_out); +} + +static void update_params_sum_f32( + xnn_operator_t sum_op, + size_t num_elements) +{ + const float scale = 1.0f; + sum_op->rsum_config->init.f32_scaleminmax(&sum_op->params.f32_scaleminmax, scale, -INFINITY, INFINITY); + sum_op->rdsum_config->init.f32_scaleminmax(&sum_op->params.f32_scaleminmax, scale, -INFINITY, INFINITY); +} + +enum xnn_status xnn_reshape_sum_nd_f32( + xnn_operator_t sum_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + pthreadpool_t threadpool) +{ + return reshape_mean_nd( + sum_op, + num_reduction_axes, reduction_axes, + num_input_dims, input_shape, + /*workspace_size=*/NULL, /*workspace_alignment=*/NULL, + /*log2_data_element_size=*/XNN_LOG2_SIZEOF_FLOAT, + /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_FLOAT, + xnn_operator_type_sum_nd_f32, + /*scale_params=*/&sum_op->params.f32_scaleminmax, + /*scale_params_size=*/sizeof(sum_op->params.f32_scaleminmax), + update_params_sum_f32, + threadpool); +} + +enum xnn_status xnn_setup_sum_nd_f32( + xnn_operator_t sum_op, + const float* input, + float* output) +{ + return setup_mean_nd( + sum_op, + /*workspace=*/NULL, input, output, + xnn_operator_type_sum_nd_f32); +} + +enum xnn_status xnn_create_sum_nd_qs8( + float scale, + int8_t input_zero_point, + int8_t output_zero_point, + uint32_t flags, + xnn_operator_t* sum_op_out) +{ + const struct xnn_reduce_config* rsum_config = xnn_init_qs8_rsum_config(); + const struct xnn_reduce_config* rdsum_config = 
xnn_init_qs8_rdsum_config(); + const struct xnn_unary_elementwise_config* f32_qs8_cvt_config = xnn_init_f32_to_qs8_cvt_config(); + const struct xnn_unary_elementwise_config* s32_f32_cvt_config = xnn_init_s32_to_f32_cvt_config(); + if (rdsum_config == NULL || rsum_config == NULL || f32_qs8_cvt_config == NULL || s32_f32_cvt_config == NULL) { + xnn_log_error("failed to create %s operator: unsupported hardware configuration", + xnn_operator_type_to_string(xnn_operator_type_sum_nd_qs8)); + return xnn_status_unsupported_hardware; + } + + struct xnn_qs8_mean_minmax_params params; + rsum_config->init.qs8_mean(¶ms, scale, -1, input_zero_point, output_zero_point); + + return create_mean_nd( + flags, + /*log2_element_size=*/XNN_LOG2_SIZEOF_INT8_T, + xnn_operator_type_sum_nd_qs8, + rdsum_config, rsum_config, f32_qs8_cvt_config, s32_f32_cvt_config, /*u32_f32_cvt_config=*/NULL, + ¶ms, sizeof(params), + sum_op_out); +} + +static void update_params_sum_qs8( + xnn_operator_t sum_op, + size_t num_elements) { + sum_op->params.qs8_mean.scalar.scale *= 1.0f; + sum_op->params.qs8_mean.scalar.num_elements = num_elements; +} + +enum xnn_status xnn_reshape_sum_nd_qs8( + xnn_operator_t sum_op, + size_t num_reduction_axes, + const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool) +{ + return reshape_mean_nd( + sum_op, + num_reduction_axes, reduction_axes, + num_input_dims, input_shape, + workspace_size, workspace_alignment, + /*log2_data_element_size=*/XNN_LOG2_SIZEOF_INT8_T, + /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_FLOAT, + xnn_operator_type_sum_nd_qs8, + /*scale_params=*/&sum_op->params.qs8_mean, + /*scale_params_size=*/sizeof(sum_op->params.qs8_mean), + update_params_sum_qs8, + threadpool); +} + +enum xnn_status xnn_setup_sum_nd_qs8( + xnn_operator_t sum_op, + void* workspace, + const void* input, + void* output) +{ + return setup_mean_nd( + sum_op, + workspace, input, output, + xnn_operator_type_sum_nd_qs8); +} + +enum xnn_status xnn_create_sum_nd_qu8( + float scale, + uint8_t input_zero_point, + uint8_t output_zero_point, + uint32_t flags, + xnn_operator_t* sum_op_out) +{ + const struct xnn_reduce_config* rsum_config = xnn_init_qu8_rsum_config(); + const struct xnn_reduce_config* rdsum_config = xnn_init_qu8_rdsum_config(); + const struct xnn_unary_elementwise_config* f32_qu8_cvt_config = xnn_init_f32_to_qu8_cvt_config(); + const struct xnn_unary_elementwise_config* u32_f32_cvt_config = xnn_init_u32_to_f32_cvt_config(); + if (rdsum_config == NULL || rsum_config == NULL || f32_qu8_cvt_config == NULL || u32_f32_cvt_config == NULL) { + xnn_log_error("failed to create %s operator: unsupported hardware configuration", + xnn_operator_type_to_string(xnn_operator_type_sum_nd_qu8)); + return xnn_status_unsupported_hardware; + } + + struct xnn_qu8_mean_minmax_params params; + rsum_config->init.qu8_mean(¶ms, scale, -1, input_zero_point, output_zero_point); + + return create_mean_nd( + flags, + /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, + xnn_operator_type_sum_nd_qu8, + rdsum_config, rsum_config, f32_qu8_cvt_config, /*s32_f32_cvt_config=*/NULL, u32_f32_cvt_config, + ¶ms, sizeof(params), + sum_op_out); +} + +static void update_params_sum_qu8( + xnn_operator_t sum_op, + size_t num_elements) { + sum_op->params.qu8_mean.scalar.scale *= 1.0f; + sum_op->params.qu8_mean.scalar.num_elements = num_elements; +} + + +enum xnn_status xnn_reshape_sum_nd_qu8( + xnn_operator_t sum_op, + size_t num_reduction_axes, 
+ const size_t* reduction_axes, + size_t num_input_dims, + const size_t* input_shape, + size_t* workspace_size, + size_t* workspace_alignment, + pthreadpool_t threadpool) +{ + return reshape_mean_nd( + sum_op, + num_reduction_axes, reduction_axes, + num_input_dims, input_shape, + workspace_size, workspace_alignment, + /*log2_data_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, + /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_FLOAT, + xnn_operator_type_sum_nd_qu8, + /*scale_params=*/&sum_op->params.qu8_mean, + /*scale_params_size=*/sizeof(sum_op->params.qu8_mean), + update_params_sum_qu8, + threadpool); +} + +enum xnn_status xnn_setup_sum_nd_qu8( + xnn_operator_t sum_op, + void* workspace, + const void* input, + void* output) +{ + return setup_mean_nd( + sum_op, + workspace, input, output, + xnn_operator_type_sum_nd_qu8); +} diff --git a/src/subgraph/static-sum.c b/src/subgraph/static-sum.c new file mode 100644 index 00000000000..ed57c227816 --- /dev/null +++ b/src/subgraph/static-sum.c @@ -0,0 +1,366 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include +#include +#include + +#include "xnnpack.h" +#include "xnnpack/common.h" +#include "xnnpack/log.h" +#include "xnnpack/node-type.h" +#include "xnnpack/operator-type.h" +#include "xnnpack/operator.h" +#include "xnnpack/subgraph-validation.h" +#include "xnnpack/subgraph.h" +#include "pthreadpool.h" + +static enum xnn_status create_sum_operator( + const struct xnn_node* node, + const struct xnn_value* values, + size_t num_values, + struct xnn_operator_data* opdata, + struct xnn_code_cache* code_cache, + xnn_weights_cache_t weights_cache) +{ + assert(node->num_inputs == 1); + assert(node->num_outputs == 1); + + enum xnn_status status; + const uint32_t input_id = node->inputs[0]; + assert(input_id != XNN_INVALID_VALUE_ID); + assert(input_id < num_values); + const struct xnn_value *input_value = &values[input_id]; + + assert(node->num_outputs == 1); + const uint32_t output_id = node->outputs[0]; + assert(output_id != XNN_INVALID_VALUE_ID); + assert(output_id < num_values); + + switch (input_value->datatype) { + case xnn_datatype_fp16: + status = xnn_create_sum_nd_f16( + node->flags, + &opdata->operator_objects[0]); + break; + case xnn_datatype_fp32: + status = xnn_create_sum_nd_f32( + node->flags, + &opdata->operator_objects[0]); + break; + case xnn_datatype_qint8: + { + const float input_scale = values[input_id].quantization.scale; + const float output_scale = values[output_id].quantization.scale; + const int8_t input_zero_point = (int8_t) values[input_id].quantization.zero_point; + const int8_t output_zero_point = (int8_t) values[output_id].quantization.zero_point; + + status = xnn_create_sum_nd_qs8( + input_scale / output_scale, input_zero_point, output_zero_point, + node->flags, + &opdata->operator_objects[0]); + break; + } + case xnn_datatype_quint8: + { + const float input_scale = values[input_id].quantization.scale; + const float output_scale = values[output_id].quantization.scale; + const uint8_t input_zero_point = (uint8_t) values[input_id].quantization.zero_point; + const uint8_t output_zero_point = (uint8_t) values[output_id].quantization.zero_point; + + status = xnn_create_sum_nd_qu8( + input_scale / output_scale, input_zero_point, output_zero_point, + node->flags, + &opdata->operator_objects[0]); + break; + } + default: + XNN_UNREACHABLE; + } + if (status == xnn_status_success) { + const 
size_t num_reduction_axes = node->params.reduce.num_reduction_axes; + opdata->num_reduction_axes = num_reduction_axes; + memcpy(opdata->reduction_axes, node->params.reduce.reduction_axes, num_reduction_axes * sizeof(size_t)); + } + return status; +} + +static enum xnn_status reshape_sum_operator( + struct xnn_operator_data* opdata, + struct xnn_value* values, + size_t num_values, + pthreadpool_t threadpool) +{ + const uint32_t input_id = opdata->inputs[0]; + assert(input_id != XNN_INVALID_VALUE_ID); + assert(input_id < num_values); + + const struct xnn_value* input_value = values + input_id; + assert(input_value->type == xnn_value_type_dense_tensor); + + const uint32_t output_id = opdata->outputs[0]; + assert(output_id != XNN_INVALID_VALUE_ID); + assert(output_id < num_values); + + enum xnn_status status = xnn_status_invalid_state; + switch (opdata->operator_objects[0]->type) { + case xnn_operator_type_sum_nd_f16: + status = xnn_reshape_sum_nd_f16( + opdata->operator_objects[0], + opdata->num_reduction_axes, + opdata->reduction_axes, + input_value->shape.num_dims, + input_value->shape.dim, + &opdata->workspace_size, + &opdata->workspace_alignment, + threadpool); + break; + case xnn_operator_type_sum_nd_f32: + status = xnn_reshape_sum_nd_f32( + opdata->operator_objects[0], + opdata->num_reduction_axes, + opdata->reduction_axes, + input_value->shape.num_dims, + input_value->shape.dim, + threadpool); + break; + case xnn_operator_type_sum_nd_qs8: + status = xnn_reshape_sum_nd_qs8( + opdata->operator_objects[0], + opdata->num_reduction_axes, + opdata->reduction_axes, + input_value->shape.num_dims, + input_value->shape.dim, + &opdata->workspace_size, + &opdata->workspace_alignment, + threadpool); + break; + case xnn_operator_type_sum_nd_qu8: + status = xnn_reshape_sum_nd_qu8( + opdata->operator_objects[0], + opdata->num_reduction_axes, + opdata->reduction_axes, + input_value->shape.num_dims, + input_value->shape.dim, + &opdata->workspace_size, + &opdata->workspace_alignment, + threadpool); + break; + default: + XNN_UNREACHABLE; + } + struct xnn_value* output_value = values + output_id; + size_t input_num_dims = input_value->shape.num_dims; + size_t num_reduction_axes = opdata->num_reduction_axes; + if (opdata->operator_objects[0]->flags & XNN_FLAG_KEEP_DIMS) { + output_value->shape.num_dims = input_value->shape.num_dims; + for (size_t idx = 0; idx < input_num_dims; ++idx) { + bool is_axis = false; + for (size_t axis_idx = 0; axis_idx < num_reduction_axes; ++axis_idx) { + if (opdata->reduction_axes[axis_idx] == idx) { + is_axis = true; + break; + } + } + if (is_axis) { + output_value->shape.dim[idx] = 1; + } else { + output_value->shape.dim[idx] = input_value->shape.dim[idx]; + } + } + } else { + size_t num_skip_axis = 0; + for (size_t idx = 0; idx < input_num_dims; ++idx) { + bool is_axis = false; + for (size_t axis_idx = 0; axis_idx < num_reduction_axes; ++axis_idx) { + if (opdata->reduction_axes[axis_idx] == idx) { + ++num_skip_axis; + is_axis = true; + break; + } + } + if (!is_axis) { + output_value->shape.dim[idx - num_skip_axis] = input_value->shape.dim[idx]; + } + } + output_value->shape.num_dims = input_value->shape.num_dims - num_skip_axis; + } + const size_t new_size = xnn_tensor_get_size(output_value); + if (new_size > output_value->size) { + output_value->size = new_size; + return xnn_status_reallocation_required; + } + return status; +} + +static enum xnn_status setup_sum_operator( + const struct xnn_operator_data* opdata, + const struct xnn_value* values, + size_t num_values, + 
pthreadpool_t threadpool) +{ + const uint32_t input_id = opdata->inputs[0]; + assert(input_id != XNN_INVALID_VALUE_ID); + assert(input_id < num_values); + + const uint32_t output_id = opdata->outputs[0]; + assert(output_id != XNN_INVALID_VALUE_ID); + assert(output_id < num_values); + + const struct xnn_value* input_value = values + input_id; + assert(input_value->type == xnn_value_type_dense_tensor); + const void* input_data = input_value->data; + assert(input_data != NULL); + + const struct xnn_value* output_value = values + output_id; + assert(output_value->type == xnn_value_type_dense_tensor); + void* output_data = output_value->data; + assert(output_data != NULL); + + switch (opdata->operator_objects[0]->type) { + case xnn_operator_type_sum_nd_f16: + return xnn_setup_sum_nd_f16( + opdata->operator_objects[0], + opdata->workspace, + input_data, output_data); + case xnn_operator_type_sum_nd_f32: + return xnn_setup_sum_nd_f32( + opdata->operator_objects[0], + input_data, output_data); + case xnn_operator_type_sum_nd_qs8: + return xnn_setup_sum_nd_qs8( + opdata->operator_objects[0], + opdata->workspace, + input_data, output_data); + case xnn_operator_type_sum_nd_qu8: + return xnn_setup_sum_nd_qu8( + opdata->operator_objects[0], + opdata->workspace, + input_data, output_data); + default: + XNN_UNREACHABLE; + } +} + +enum xnn_status xnn_define_static_sum( + xnn_subgraph_t subgraph, + size_t num_reduction_axes, + const size_t* reduction_axes, + uint32_t input_id, + uint32_t output_id, + uint32_t flags) +{ + enum xnn_status status; + if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_static_sum)) != xnn_status_success) { + return status; + } + + status = xnn_subgraph_check_nth_input_node_id(xnn_node_type_static_sum, input_id, subgraph->num_values, 1); + if (status != xnn_status_success) { + return status; + } + + const struct xnn_value* input_value = &subgraph->values[input_id]; + status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_static_sum, input_id, input_value, 1); + if (status != xnn_status_success) { + return status; + } + + switch (input_value->datatype) { + case xnn_datatype_fp16: + case xnn_datatype_fp32: + case xnn_datatype_qint8: + case xnn_datatype_quint8: + break; + default: + xnn_log_error( + "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_static_sum), input_id, + xnn_datatype_to_string(input_value->datatype), input_value->datatype); + return xnn_status_invalid_parameter; + } + + status = xnn_subgraph_check_output_node_id(xnn_node_type_static_sum, output_id, subgraph->num_values); + if (status != xnn_status_success) { + return status; + } + + const struct xnn_value* output_value = &subgraph->values[output_id]; + status = xnn_subgraph_check_output_type_dense(xnn_node_type_static_sum, output_id, output_value); + if (status != xnn_status_success) { + return status; + } + + enum xnn_compute_type compute_type = xnn_compute_type_invalid; + switch (output_value->datatype) { + case xnn_datatype_fp16: + compute_type = xnn_compute_type_fp16; + break; + case xnn_datatype_fp32: + compute_type = xnn_compute_type_fp32; + break; + case xnn_datatype_qint8: + compute_type = xnn_compute_type_qs8; + break; + case xnn_datatype_quint8: + compute_type = xnn_compute_type_qu8; + break; + default: + xnn_log_error( + "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_static_sum), output_id, + 
xnn_datatype_to_string(output_value->datatype), output_value->datatype); + return xnn_status_invalid_parameter; + } + + if (num_reduction_axes == 0) { + xnn_log_error( + "failed to define %s operator with %zu reduction axes: the number of reduction axes must be non-zero", + xnn_node_type_to_string(xnn_node_type_static_sum), num_reduction_axes); + return xnn_status_invalid_parameter; + } + + for (size_t i = 0; i < num_reduction_axes; i++) { + if (reduction_axes[i] > input_value->shape.num_dims) { + xnn_log_error( + "failed to define %s operator with #%zu reduction axis of %zu: the index is out of bounds for a %zuD input shape", + xnn_node_type_to_string(xnn_node_type_static_sum), i, reduction_axes[i], input_value->shape.num_dims); + return xnn_status_invalid_parameter; + } + } + + for (size_t i = 1; i < num_reduction_axes; i++) { + if (reduction_axes[i] <= reduction_axes[i - 1]) { + xnn_log_error( + "failed to define %s operator with #%zu reduction axis of %zu: the reduction " + "axes must be in ascending order and unique", + xnn_node_type_to_string(xnn_node_type_static_sum), i, reduction_axes[i]); + return xnn_status_invalid_parameter; + } + } + + struct xnn_node* node = xnn_subgraph_new_node(subgraph); + if (node == NULL) { + return xnn_status_out_of_memory; + } + + node->type = xnn_node_type_static_sum; + node->compute_type = compute_type; + node->params.reduce.num_reduction_axes = num_reduction_axes; + memcpy(node->params.reduce.reduction_axes, reduction_axes, num_reduction_axes * sizeof(size_t)); + node->num_inputs = 1; + node->inputs[0] = input_id; + node->num_outputs = 1; + node->outputs[0] = output_id; + node->flags = flags; + + node->create = create_sum_operator; + node->reshape = reshape_sum_operator; + node->setup = setup_sum_operator; + + return xnn_status_success; +} diff --git a/src/xnnpack/node-type-defs.h b/src/xnnpack/node-type-defs.h index 1ccf512d668..08d7fe411b5 100644 --- a/src/xnnpack/node-type-defs.h +++ b/src/xnnpack/node-type-defs.h @@ -66,6 +66,7 @@ XNN_ENUM_ITEM(xnn_node_type_static_mean, "Static Mean") XNN_ENUM_ITEM(xnn_node_type_static_reshape, "Static Reshape") XNN_ENUM_ITEM(xnn_node_type_static_resize_bilinear_2d, "Static Resize Bilinear 2D") XNN_ENUM_ITEM(xnn_node_type_static_slice, "Static Slice") +XNN_ENUM_ITEM(xnn_node_type_static_sum, "Static Sum") XNN_ENUM_ITEM(xnn_node_type_static_transpose, "Static Transpose") XNN_ENUM_ITEM(xnn_node_type_subtract, "Subtract") XNN_ENUM_ITEM(xnn_node_type_tanh, "Tanh") diff --git a/src/xnnpack/operator-type-defs.h b/src/xnnpack/operator-type-defs.h index defc763b888..1c9359f3ced 100644 --- a/src/xnnpack/operator-type-defs.h +++ b/src/xnnpack/operator-type-defs.h @@ -155,6 +155,10 @@ XNN_ENUM_ITEM(xnn_operator_type_square_root_nc_f16, "Square Root (NC, F16)") XNN_ENUM_ITEM(xnn_operator_type_square_root_nc_f32, "Square Root (NC, F32)") XNN_ENUM_ITEM(xnn_operator_type_squared_difference, "Squared Difference (NC)") XNN_ENUM_ITEM(xnn_operator_type_subtract, "Subtract (ND)") +XNN_ENUM_ITEM(xnn_operator_type_sum_nd_f16, "Sum (ND, F16)") +XNN_ENUM_ITEM(xnn_operator_type_sum_nd_f32, "Sum (ND, F32)") +XNN_ENUM_ITEM(xnn_operator_type_sum_nd_qs8, "Sum (ND, QS8)") +XNN_ENUM_ITEM(xnn_operator_type_sum_nd_qu8, "Sum (ND, QU8)") XNN_ENUM_ITEM(xnn_operator_type_tanh_nc_f16, "Tanh (NC, F16)") XNN_ENUM_ITEM(xnn_operator_type_tanh_nc_f32, "Tanh (NC, F32)") XNN_ENUM_ITEM(xnn_operator_type_tanh_nc_qs8, "Tanh (NC, QS8)") diff --git a/test/BUILD.bazel b/test/BUILD.bazel index b36d2b0c0f8..e3226c17ec3 100644 --- a/test/BUILD.bazel +++ 
b/test/BUILD.bazel @@ -1587,6 +1587,16 @@ xnnpack_unit_test( deps = OPERATOR_TEST_DEPS + ["//:requantization"], ) +xnnpack_unit_test( + name = "sum_nd_test", + timeout = "moderate", + srcs = [ + "sum-nd.cc", + "sum-operator-tester.h", + ], + deps = OPERATOR_TEST_DEPS + ["//:requantization"], +) + xnnpack_unit_test( name = "slice_normalization_test", srcs = [ @@ -2211,6 +2221,23 @@ xnnpack_unit_test( ], ) +xnnpack_unit_test( + name = "static_sum_test", + srcs = [ + "static-sum.cc", + ], + deps = [ + ":replicable_random_device", + "@FP16", + "//:XNNPACK", + "//:aligned_allocator", + "//:common", + "//:node_type", + "//:operators", + "//:subgraph", + ], +) + xnnpack_unit_test( name = "reshape_helpers_test", srcs = [ diff --git a/test/static-sum.cc b/test/static-sum.cc new file mode 100644 index 00000000000..aca32b2954e --- /dev/null +++ b/test/static-sum.cc @@ -0,0 +1,1168 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include // For std::generate, std::min. +#include // For std::array. +#include // For std::lrintf. +#include // For size_t. +#include // For uint32_t. +#include // For std::multiplies. +#include // For std::numeric_limits. +#include // For std::unique_ptr. +#include // For std::accumulate. +#include // For std::uniform_real_distribution. +#include // For std::vector. + +#include +#include +#include "xnnpack.h" +#include "xnnpack/aligned-allocator.h" +#include "xnnpack/common.h" +#include "xnnpack/node-type.h" +#include "xnnpack/operator.h" +#include "xnnpack/subgraph.h" +#include "replicable_random_device.h" + +namespace xnnpack { +template +class SumTestBase : public ::testing::TestWithParam { + protected: + SumTestBase() { + f32dist = std::uniform_real_distribution(-1.0f, 1.0f); + + auto num_input_dim_dist = std::uniform_int_distribution(2, XNN_MAX_TENSOR_DIMS); + const size_t num_input_dims = num_input_dim_dist(rng); + auto num_reduction_axes_dist = std::uniform_int_distribution(1, num_input_dims); + const size_t num_reduction_axes = num_reduction_axes_dist(rng); + + auto axes_dist = std::uniform_int_distribution(0, num_input_dims - 1); + reduction_axes.resize(num_reduction_axes); + std::generate(reduction_axes.begin(), reduction_axes.end(), [&]() { return axes_dist(rng); }); + std::sort(reduction_axes.begin(), reduction_axes.end()); + auto end = std::unique(reduction_axes.begin(), reduction_axes.end()); + reduction_axes.erase(end, reduction_axes.end()); + + auto shape_dist = std::uniform_int_distribution(2, 15); + input_shape.resize(num_input_dims); + std::generate(input_shape.begin(), input_shape.end(), [&]() { return shape_dist(rng); }); + num_input_elements = std::accumulate(input_shape.cbegin(), input_shape.cend(), size_t(1), std::multiplies()); + + output_shape = input_shape; + for (size_t axis : reduction_axes) { + output_shape[axis] = 1; + } + num_output_elements = std::accumulate(output_shape.cbegin(), output_shape.cend(), size_t(1), std::multiplies()); + + input = std::vector(XNN_EXTRA_BYTES / sizeof(T) + num_input_elements); + operator_output = std::vector(num_output_elements); + subgraph_output = std::vector(num_output_elements); + } + + xnnpack::ReplicableRandomDevice rng; + std::uniform_real_distribution f32dist; + std::uniform_int_distribution i8dist = std::uniform_int_distribution( + std::numeric_limits::min(), std::numeric_limits::max()); + std::uniform_int_distribution u8dist = std::uniform_int_distribution( + 
std::numeric_limits::min(), std::numeric_limits::max()); + + std::vector reduction_axes; + std::vector input_shape; + size_t num_input_elements; + std::vector output_shape; + size_t num_output_elements; + + std::vector input; + std::vector operator_output; + std::vector subgraph_output; +}; + +using SumTestF16 = SumTestBase; +using SumTestF32 = SumTestBase; +using SumTestQS8 = SumTestBase; +using SumTestQU8 = SumTestBase; + +INSTANTIATE_TEST_SUITE_P(KeepDims, SumTestF16, testing::Bool()); +INSTANTIATE_TEST_SUITE_P(KeepDims, SumTestF32, testing::Bool()); +INSTANTIATE_TEST_SUITE_P(KeepDims, SumTestQS8, testing::Bool()); +INSTANTIATE_TEST_SUITE_P(KeepDims, SumTestQU8, testing::Bool()); + +TEST_F(SumTestF16, define) +{ + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp16, input_shape.size(), input_shape.data(), + nullptr, /*external_id=*/0, /*flags=*/0, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp16, output_shape.size(), output_shape.data(), + nullptr, /*external_id=*/1, /*flags=*/0, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum( + subgraph, + reduction_axes.size(), reduction_axes.data(), + input_id, output_id, + /*flags=*/0)); + + ASSERT_EQ(subgraph->num_nodes, 1); + const struct xnn_node* node = &subgraph->nodes[0]; + ASSERT_EQ(node->type, xnn_node_type_static_sum); + ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); + ASSERT_EQ(node->params.reduce.num_reduction_axes, reduction_axes.size()); + for (size_t i = 0; i < reduction_axes.size(); i++) { + ASSERT_EQ(node->params.reduce.reduction_axes[i], reduction_axes[i]); + } + ASSERT_EQ(node->num_inputs, 1); + ASSERT_EQ(node->inputs[0], input_id); + ASSERT_EQ(node->num_outputs, 1); + ASSERT_EQ(node->outputs[0], output_id); + ASSERT_EQ(node->flags, 0); +} + +TEST_F(SumTestF32, define) +{ + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp32, input_shape.size(), input_shape.data(), + nullptr, /*external_id=*/0, /*flags=*/0, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp32, output_shape.size(), output_shape.data(), + nullptr, /*external_id=*/1, /*flags=*/0, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum( + subgraph, + reduction_axes.size(), reduction_axes.data(), + input_id, output_id, + /*flags=*/0)); + + ASSERT_EQ(subgraph->num_nodes, 1); + const struct xnn_node* node = &subgraph->nodes[0]; + ASSERT_EQ(node->type, xnn_node_type_static_sum); + ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); + ASSERT_EQ(node->params.reduce.num_reduction_axes, reduction_axes.size()); + for 
(size_t i = 0; i < reduction_axes.size(); i++) { + ASSERT_EQ(node->params.reduce.reduction_axes[i], reduction_axes[i]); + } + ASSERT_EQ(node->num_inputs, 1); + ASSERT_EQ(node->inputs[0], input_id); + ASSERT_EQ(node->num_outputs, 1); + ASSERT_EQ(node->outputs[0], output_id); + ASSERT_EQ(node->flags, 0); +} + +TEST_F(SumTestQS8, define) +{ + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + std::uniform_real_distribution scale_dist = std::uniform_real_distribution(0.0f, 1.0f); + const int32_t input_zero_point = i8dist(rng); + const int32_t output_zero_point = i8dist(rng); + const float input_scale = scale_dist(rng); + const float output_scale = scale_dist(rng); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, input_zero_point, input_scale, + input_shape.size(), input_shape.data(), nullptr, /*external_id=*/0, + /*flags=*/0, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, output_zero_point, output_scale, + output_shape.size(), output_shape.data(), nullptr, /*external_id=*/1, + /*flags=*/0, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum( + subgraph, + reduction_axes.size(), reduction_axes.data(), + input_id, output_id, + /*flags=*/0)); + + ASSERT_EQ(subgraph->num_nodes, 1); + const struct xnn_node* node = &subgraph->nodes[0]; + ASSERT_EQ(node->type, xnn_node_type_static_sum); + ASSERT_EQ(node->compute_type, xnn_compute_type_qs8); + ASSERT_EQ(node->params.reduce.num_reduction_axes, reduction_axes.size()); + for (size_t i = 0; i < reduction_axes.size(); i++) { + ASSERT_EQ(node->params.reduce.reduction_axes[i], reduction_axes[i]); + } + ASSERT_EQ(node->num_inputs, 1); + ASSERT_EQ(node->inputs[0], input_id); + ASSERT_EQ(node->num_outputs, 1); + ASSERT_EQ(node->outputs[0], output_id); + ASSERT_EQ(node->flags, 0); +} + +TEST_F(SumTestQU8, define) +{ + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + std::uniform_real_distribution scale_dist = std::uniform_real_distribution(0.0f, 1.0f); + const int32_t input_zero_point = u8dist(rng); + const int32_t output_zero_point = u8dist(rng); + const float input_scale = scale_dist(rng); + const float output_scale = scale_dist(rng); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_quint8, input_zero_point, input_scale, + input_shape.size(), input_shape.data(), nullptr, /*external_id=*/0, + /*flags=*/0, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_quint8, output_zero_point, output_scale, + output_shape.size(), output_shape.data(), nullptr, /*external_id=*/1, + /*flags=*/0, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum( + subgraph, 
+ reduction_axes.size(), reduction_axes.data(), + input_id, output_id, + /*flags=*/0)); + + ASSERT_EQ(subgraph->num_nodes, 1); + const struct xnn_node* node = &subgraph->nodes[0]; + ASSERT_EQ(node->type, xnn_node_type_static_sum); + ASSERT_EQ(node->compute_type, xnn_compute_type_qu8); + ASSERT_EQ(node->params.reduce.num_reduction_axes, reduction_axes.size()); + for (size_t i = 0; i < reduction_axes.size(); i++) { + ASSERT_EQ(node->params.reduce.reduction_axes[i], reduction_axes[i]); + } + ASSERT_EQ(node->num_inputs, 1); + ASSERT_EQ(node->inputs[0], input_id); + ASSERT_EQ(node->num_outputs, 1); + ASSERT_EQ(node->outputs[0], output_id); + ASSERT_EQ(node->flags, 0); +} + +TEST_P(SumTestF16, matches_operator_api) { + bool keep_dims = GetParam(); + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_operator_t op = nullptr; + + std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); + std::fill(operator_output.begin(), operator_output.end(), std::nanf("")); + std::fill(subgraph_output.begin(), subgraph_output.end(), std::nanf("")); + + uint32_t flags = keep_dims ? XNN_FLAG_KEEP_DIMS : 0; + // Call operator API. + const xnn_status status = xnn_create_sum_nd_f16(flags, &op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, op); + + std::unique_ptr auto_op(op, xnn_delete_operator); + + size_t workspace_size = SIZE_MAX; + size_t workspace_alignment = SIZE_MAX; + ASSERT_EQ(xnn_status_success, + xnn_reshape_sum_nd_f16(op, + reduction_axes.size(), reduction_axes.data(), + input_shape.size(), input_shape.data(), + &workspace_size, &workspace_alignment, + /*threadpool=*/nullptr)); + + ASSERT_NE(workspace_size, SIZE_MAX); + ASSERT_LE(workspace_alignment, XNN_ALLOCATION_ALIGNMENT); + std::vector> workspace(workspace_size); + ASSERT_EQ(xnn_status_success, xnn_setup_sum_nd_f16(op, workspace.data(), input.data(), operator_output.data())); + + ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); + + // Call subgraph API. 
+ xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp16, input_shape.size(), input_shape.data(), + nullptr, /*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + int output_num_dims = input_shape.size(); + if (!keep_dims) { + output_num_dims -= reduction_axes.size(); + } + ASSERT_EQ( + xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp16, output_num_dims, + output_shape.data(), nullptr, /*external_id=*/1, + XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum(subgraph, reduction_axes.size(), + reduction_axes.data(), input_id, output_id, + flags)); + + xnn_runtime_t runtime = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{input_id, input.data()}, + xnn_external_value{output_id, subgraph_output.data()} + }; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + // Check outputs match. + for (size_t i = 0; i < operator_output.size(); i++) { + float sub_out = subgraph_output[i]; + float op_out = operator_output[i]; + ASSERT_NEAR(sub_out, op_out, std::abs(0.05f * std::min(sub_out, op_out))); + } +} + +TEST_P(SumTestF32, matches_operator_api) { + bool keep_dims = GetParam(); + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_operator_t op = nullptr; + + std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); + std::fill(operator_output.begin(), operator_output.end(), nanf("")); + std::fill(subgraph_output.begin(), subgraph_output.end(), nanf("")); + + uint32_t flags = keep_dims ? XNN_FLAG_KEEP_DIMS : 0; + // Call operator API. + const xnn_status status = xnn_create_sum_nd_f32(flags, &op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, op); + + std::unique_ptr auto_op(op, xnn_delete_operator); + + ASSERT_EQ(xnn_status_success, + xnn_reshape_sum_nd_f32(op, + reduction_axes.size(), reduction_axes.data(), + input_shape.size(), input_shape.data(), + /*threadpool=*/nullptr)); + + ASSERT_EQ(xnn_status_success, xnn_setup_sum_nd_f32(op, input.data(), operator_output.data())); + + ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); + + // Call subgraph API. 
+ xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp32, input_shape.size(), input_shape.data(), + nullptr, /*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + int output_num_dims = input_shape.size(); + if (!keep_dims) { + output_num_dims -= reduction_axes.size(); + } + ASSERT_EQ( + xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp32, output_num_dims, + output_shape.data(), nullptr, /*external_id=*/1, + XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum(subgraph, reduction_axes.size(), + reduction_axes.data(), input_id, output_id, + flags)); + + xnn_runtime_t runtime = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{input_id, input.data()}, + xnn_external_value{output_id, subgraph_output.data()} + }; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + // Check outputs match. + for (int i = 0; i < subgraph_output.size(); ++i) { + ASSERT_NEAR(subgraph_output[i], operator_output[i], 2.5f * std::numeric_limits::epsilon()) << " i " << i; + } +} + +TEST_P(SumTestQS8, matches_operator_api) { + bool keep_dims = GetParam(); + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_operator_t op = nullptr; + + std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); + std::fill(operator_output.begin(), operator_output.end(), INT8_C(0)); + std::fill(subgraph_output.begin(), subgraph_output.end(), INT8_C(0)); + + std::uniform_real_distribution scale_dist = std::uniform_real_distribution(0.0f, 1.0f); + const float input_scale = scale_dist(rng); + const float output_scale = scale_dist(rng); + const int32_t input_zero_point = i8dist(rng); + const int32_t output_zero_point = i8dist(rng); + + uint32_t flags = keep_dims ? XNN_FLAG_KEEP_DIMS : 0; + // Call operator API. + const xnn_status status = xnn_create_sum_nd_qs8( + input_scale / output_scale, input_zero_point, output_zero_point, flags, &op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, op); + + std::unique_ptr auto_op(op, xnn_delete_operator); + + size_t workspace_size = SIZE_MAX; + size_t workspace_alignment = SIZE_MAX; + ASSERT_EQ(xnn_status_success, + xnn_reshape_sum_nd_qs8(op, + reduction_axes.size(), reduction_axes.data(), + input_shape.size(), input_shape.data(), + &workspace_size, &workspace_alignment, + /*threadpool=*/nullptr)); + + ASSERT_NE(workspace_size, SIZE_MAX); + ASSERT_LE(workspace_alignment, XNN_ALLOCATION_ALIGNMENT); + std::vector> workspace(workspace_size); + ASSERT_EQ(xnn_status_success, xnn_setup_sum_nd_qs8(op, workspace.data(), input.data(), operator_output.data())); + + ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); + + // Call subgraph API. 
+ xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, input_zero_point, input_scale, + input_shape.size(), input_shape.data(), nullptr, /*external_id=*/0, + XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + int output_num_dims = input_shape.size(); + if (!keep_dims) { + output_num_dims -= reduction_axes.size(); + } + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, output_zero_point, output_scale, + output_shape.size(), output_shape.data(), nullptr, /*external_id=*/1, + XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum(subgraph, reduction_axes.size(), + reduction_axes.data(), input_id, output_id, + flags)); + + xnn_runtime_t runtime = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{input_id, input.data()}, + xnn_external_value{output_id, subgraph_output.data()} + }; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + // Check outputs match. + for (int i = 0; i < subgraph_output.size(); ++i) { + ASSERT_EQ(subgraph_output[i], operator_output[i]) << " i " << i; + } +} + +TEST_P(SumTestQU8, matches_operator_api) { + bool keep_dims = GetParam(); + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_operator_t op = nullptr; + + std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); + std::fill(operator_output.begin(), operator_output.end(), UINT8_C(0)); + std::fill(subgraph_output.begin(), subgraph_output.end(), UINT8_C(0)); + + std::uniform_real_distribution scale_dist = std::uniform_real_distribution(0.0f, 1.0f); + const float input_scale = scale_dist(rng); + const float output_scale = scale_dist(rng); + const int32_t input_zero_point = u8dist(rng); + const int32_t output_zero_point = u8dist(rng); + + uint32_t flags = keep_dims ? XNN_FLAG_KEEP_DIMS : 0; + // Call operator API. 
+ const xnn_status status = xnn_create_sum_nd_qu8( + input_scale / output_scale, input_zero_point, output_zero_point, flags, &op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, op); + + std::unique_ptr auto_op(op, xnn_delete_operator); + + size_t workspace_size = SIZE_MAX; + size_t workspace_alignment = SIZE_MAX; + ASSERT_EQ(xnn_status_success, + xnn_reshape_sum_nd_qu8(op, + reduction_axes.size(), reduction_axes.data(), + input_shape.size(), input_shape.data(), + &workspace_size, &workspace_alignment, + /*threadpool=*/nullptr)); + + ASSERT_NE(workspace_size, SIZE_MAX); + ASSERT_LE(workspace_alignment, XNN_ALLOCATION_ALIGNMENT); + std::vector> workspace(workspace_size); + ASSERT_EQ(xnn_status_success, xnn_setup_sum_nd_qu8(op, workspace.data(), input.data(), operator_output.data())); + + ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); + + // Call subgraph API. + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_quint8, input_zero_point, input_scale, + input_shape.size(), input_shape.data(), nullptr, /*external_id=*/0, + XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + int output_num_dims = input_shape.size(); + if (!keep_dims) { + output_num_dims -= reduction_axes.size(); + } + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_quint8, output_zero_point, output_scale, + output_shape.size(), output_shape.data(), nullptr, /*external_id=*/1, + XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum(subgraph, reduction_axes.size(), + reduction_axes.data(), input_id, output_id, + flags)); + + xnn_runtime_t runtime = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{input_id, input.data()}, + xnn_external_value{output_id, subgraph_output.data()} + }; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + // Check outputs match. + for (int i = 0; i < subgraph_output.size(); ++i) { + ASSERT_EQ(subgraph_output[i], operator_output[i]) << " i " << i; + } +} + +TEST_F(SumTestF32, reshape_output_keep_dims) +{ + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + // Call subgraph API. 
+ xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp32, input_shape.size(), input_shape.data(), + nullptr, /*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp32, output_shape.size(), output_shape.data(), + nullptr, /*external_id=*/1, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum( + subgraph, + reduction_axes.size(), reduction_axes.data(), + input_id, output_id, + /*flags=*/XNN_FLAG_KEEP_DIMS)); + + xnn_runtime_t runtime = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{input_id, input.data()}, + xnn_external_value{output_id, subgraph_output.data()} + }; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + input_shape[0] += 2; + input_shape[1] += 4; + ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), input_shape.data())); + const struct xnn_node* node = &subgraph->nodes[0]; + std::vector unique_reduction_axes = reduction_axes; + std::sort(unique_reduction_axes.begin(), unique_reduction_axes.end()); + auto end = std::unique(unique_reduction_axes.begin(), unique_reduction_axes.end()); + unique_reduction_axes.erase(end, unique_reduction_axes.end()); + // There are too many parameters which influence the workspace size so + // knowing if reallocation is required or not is messy. + node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr); + const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; + size_t current_axes = 0; + for (size_t i = 0; i < output_shape->num_dims; ++i) { + if (unique_reduction_axes[current_axes] == i) { + ASSERT_EQ(output_shape->dim[i], 1); + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[i], input_shape[i]); + } + } + + input_shape[0] -= 1; + ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), input_shape.data())); + ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_success); + current_axes = 0; + for (size_t i = 0; i < output_shape->num_dims; ++i) { + if (unique_reduction_axes[current_axes] == i) { + ASSERT_EQ(output_shape->dim[i], 1); + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[i], input_shape[i]); + } + } +} + +TEST_F(SumTestF32, reshape_output_no_keep_dims) +{ + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + // Call subgraph API. 
+ xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp32, input_shape.size(), input_shape.data(), + nullptr, /*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + int output_num_dims = input_shape.size() - reduction_axes.size(); + ASSERT_EQ( + xnn_status_success, + xnn_define_tensor_value(subgraph, xnn_datatype_fp32, output_num_dims, + output_shape.data(), nullptr, /*external_id=*/1, + XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum( + subgraph, + reduction_axes.size(), reduction_axes.data(), + input_id, output_id, + /*flags=*/0)); + + xnn_runtime_t runtime = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{input_id, input.data()}, + xnn_external_value{output_id, subgraph_output.data()} + }; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + input_shape[0] += 2; + input_shape[1] += 4; + ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), input_shape.data())); + const struct xnn_node* node = &subgraph->nodes[0]; + std::vector unique_reduction_axes = reduction_axes; + std::sort(unique_reduction_axes.begin(), unique_reduction_axes.end()); + auto end = std::unique(unique_reduction_axes.begin(), unique_reduction_axes.end()); + unique_reduction_axes.erase(end, unique_reduction_axes.end()); + // There are too many parameters which influence the workspace size so + // knowing if reallocation is required or not is messy. + node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr); + const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; + size_t current_axes = 0; + size_t current_dim = 0; + for (size_t i = 0; i < input_shape.size(); ++i) { + if (unique_reduction_axes[current_axes] == i) { + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[current_dim], input_shape[i]); + ++current_dim; + } + } + + input_shape[0] -= 1; + ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), input_shape.data())); + ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_success); + current_axes = 0; + current_dim = 0; + for (size_t i = 0; i < input_shape.size(); ++i) { + if (unique_reduction_axes[current_axes] == i) { + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[current_dim], input_shape[i]); + ++current_dim; + } + } +} + +TEST_F(SumTestQS8, reshape_output_keep_dims) +{ + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + // Call subgraph API. 
+  xnn_subgraph_t subgraph = nullptr;
+  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph));
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
+
+  std::uniform_real_distribution<float> scale_dist = std::uniform_real_distribution<float>(0.0f, 1.0f);
+  const float input_scale = scale_dist(rng);
+  const float output_scale = scale_dist(rng);
+  const int32_t input_zero_point = i8dist(rng);
+  const int32_t output_zero_point = i8dist(rng);
+
+  uint32_t input_id = XNN_INVALID_NODE_ID;
+  ASSERT_EQ(xnn_status_success,
+            xnn_define_quantized_tensor_value(
+                subgraph, xnn_datatype_qint8, input_zero_point, input_scale,
+                input_shape.size(), input_shape.data(), nullptr, /*external_id=*/0,
+                XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
+  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
+
+  uint32_t output_id = XNN_INVALID_NODE_ID;
+  ASSERT_EQ(xnn_status_success,
+            xnn_define_quantized_tensor_value(
+                subgraph, xnn_datatype_qint8, output_zero_point, output_scale,
+                output_shape.size(), output_shape.data(), nullptr, /*external_id=*/1,
+                XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
+  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
+
+  ASSERT_EQ(xnn_status_success,
+            xnn_define_static_sum(
+                subgraph,
+                reduction_axes.size(), reduction_axes.data(),
+                input_id, output_id,
+                /*flags=*/XNN_FLAG_KEEP_DIMS));
+
+  xnn_runtime_t runtime = nullptr;
+  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
+  ASSERT_NE(nullptr, runtime);
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
+
+  const std::array<xnn_external_value, 2> external = {
+    xnn_external_value{input_id, input.data()},
+    xnn_external_value{output_id, subgraph_output.data()}
+  };
+  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
+  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
+
+  input_shape[0] += 2;
+  input_shape[1] += 4;
+  ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), input_shape.data()));
+  const struct xnn_node* node = &subgraph->nodes[0];
+  std::vector<size_t> unique_reduction_axes = reduction_axes;
+  std::sort(unique_reduction_axes.begin(), unique_reduction_axes.end());
+  auto end = std::unique(unique_reduction_axes.begin(), unique_reduction_axes.end());
+  unique_reduction_axes.erase(end, unique_reduction_axes.end());
+  // There are too many parameters that influence the workspace size, so it
+  // is hard to know whether reallocation is required.
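+  // With XNN_FLAG_KEEP_DIMS the reshaped output keeps the input rank: the
+  // loop below checks that every reduced dimension is 1 and every other
+  // dimension tracks the updated input shape.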
+ node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr); + const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; + size_t current_axes = 0; + for (size_t i = 0; i < output_shape->num_dims; ++i) { + if (unique_reduction_axes[current_axes] == i) { + ASSERT_EQ(output_shape->dim[i], 1); + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[i], input_shape[i]); + } + } + + input_shape[0] -= 1; + ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), input_shape.data())); + ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_success); + current_axes = 0; + for (size_t i = 0; i < output_shape->num_dims; ++i) { + if (unique_reduction_axes[current_axes] == i) { + ASSERT_EQ(output_shape->dim[i], 1); + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[i], input_shape[i]); + } + } +} + +TEST_F(SumTestQS8, reshape_output_no_keep_dims) +{ + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + // Call subgraph API. + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + std::uniform_real_distribution scale_dist = std::uniform_real_distribution(0.0f, 1.0f); + const float input_scale = scale_dist(rng); + const float output_scale = scale_dist(rng); + const int32_t input_zero_point = i8dist(rng); + const int32_t output_zero_point = i8dist(rng); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, input_zero_point, input_scale, + input_shape.size(), input_shape.data(), nullptr, /*external_id=*/0, + XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + int output_num_dims = input_shape.size() - reduction_axes.size(); + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_qint8, output_zero_point, output_scale, + output_num_dims, output_shape.data(), nullptr, /*external_id=*/1, + XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum( + subgraph, + reduction_axes.size(), reduction_axes.data(), + input_id, output_id, + /*flags=*/0)); + + xnn_runtime_t runtime = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{input_id, input.data()}, + xnn_external_value{output_id, subgraph_output.data()} + }; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + input_shape[0] += 2; + input_shape[1] += 4; + ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), input_shape.data())); + const struct xnn_node* node = &subgraph->nodes[0]; + std::vector unique_reduction_axes = reduction_axes; + std::sort(unique_reduction_axes.begin(), unique_reduction_axes.end()); + auto end = 
std::unique(unique_reduction_axes.begin(), unique_reduction_axes.end()); + unique_reduction_axes.erase(end, unique_reduction_axes.end()); + // There are too many parameters which influence the workspace size so + // knowing if reallocation is required or not is messy. + node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr); + const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; + size_t current_axes = 0; + size_t current_dim = 0; + for (size_t i = 0; i < input_shape.size(); ++i) { + if (unique_reduction_axes[current_axes] == i) { + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[current_dim], input_shape[i]); + ++current_dim; + } + } + + input_shape[0] -= 1; + ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), input_shape.data())); + ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_success); + current_axes = 0; + current_dim = 0; + for (size_t i = 0; i < input_shape.size(); ++i) { + if (unique_reduction_axes[current_axes] == i) { + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[current_dim], input_shape[i]); + ++current_dim; + } + } +} + +TEST_F(SumTestQU8, reshape_output_keep_dims) +{ + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + // Call subgraph API. + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + std::uniform_real_distribution scale_dist = std::uniform_real_distribution(0.0f, 1.0f); + const float input_scale = scale_dist(rng); + const float output_scale = scale_dist(rng); + const int32_t input_zero_point = u8dist(rng); + const int32_t output_zero_point = u8dist(rng); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_quint8, input_zero_point, input_scale, + input_shape.size(), input_shape.data(), nullptr, /*external_id=*/0, + XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_quint8, output_zero_point, output_scale, + output_shape.size(), output_shape.data(), nullptr, /*external_id=*/1, + XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum( + subgraph, + reduction_axes.size(), reduction_axes.data(), + input_id, output_id, + /*flags=*/XNN_FLAG_KEEP_DIMS)); + + xnn_runtime_t runtime = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{input_id, input.data()}, + xnn_external_value{output_id, subgraph_output.data()} + }; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + input_shape[0] += 2; + input_shape[1] += 4; + ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), 
input_shape.data())); + const struct xnn_node* node = &subgraph->nodes[0]; + std::vector unique_reduction_axes = reduction_axes; + std::sort(unique_reduction_axes.begin(), unique_reduction_axes.end()); + auto end = std::unique(unique_reduction_axes.begin(), unique_reduction_axes.end()); + unique_reduction_axes.erase(end, unique_reduction_axes.end()); + // There are too many parameters which influence the workspace size so + // knowing if reallocation is required or not is messy. + node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr); + const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; + size_t current_axes = 0; + for (size_t i = 0; i < output_shape->num_dims; ++i) { + if (unique_reduction_axes[current_axes] == i) { + ASSERT_EQ(output_shape->dim[i], 1); + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[i], input_shape[i]); + } + } + + input_shape[0] -= 1; + ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), input_shape.data())); + ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_success); + current_axes = 0; + for (size_t i = 0; i < output_shape->num_dims; ++i) { + if (unique_reduction_axes[current_axes] == i) { + ASSERT_EQ(output_shape->dim[i], 1); + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[i], input_shape[i]); + } + } +} + +TEST_F(SumTestQU8, reshape_output_no_keep_dims) +{ + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + // Call subgraph API. + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); + std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); + + std::uniform_real_distribution scale_dist = std::uniform_real_distribution(0.0f, 1.0f); + const float input_scale = scale_dist(rng); + const float output_scale = scale_dist(rng); + const int32_t input_zero_point = u8dist(rng); + const int32_t output_zero_point = u8dist(rng); + + uint32_t input_id = XNN_INVALID_NODE_ID; + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_quint8, input_zero_point, input_scale, + input_shape.size(), input_shape.data(), nullptr, /*external_id=*/0, + XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); + ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + int output_num_dims = input_shape.size() - reduction_axes.size(); + ASSERT_EQ(xnn_status_success, + xnn_define_quantized_tensor_value( + subgraph, xnn_datatype_quint8, output_zero_point, output_scale, + output_num_dims, output_shape.data(), nullptr, /*external_id=*/1, + XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ(xnn_status_success, + xnn_define_static_sum( + subgraph, + reduction_axes.size(), reduction_axes.data(), + input_id, output_id, + /*flags=*/0)); + + xnn_runtime_t runtime = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); + ASSERT_NE(nullptr, runtime); + std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{input_id, input.data()}, + xnn_external_value{output_id, subgraph_output.data()} + }; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, 
external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + input_shape[0] += 2; + input_shape[1] += 4; + ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), input_shape.data())); + const struct xnn_node* node = &subgraph->nodes[0]; + std::vector unique_reduction_axes = reduction_axes; + std::sort(unique_reduction_axes.begin(), unique_reduction_axes.end()); + auto end = std::unique(unique_reduction_axes.begin(), unique_reduction_axes.end()); + unique_reduction_axes.erase(end, unique_reduction_axes.end()); + // There are too many parameters which influence the workspace size so + // knowing if reallocation is required or not is messy. + node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr); + const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; + size_t current_axes = 0; + size_t current_dim = 0; + for (size_t i = 0; i < input_shape.size(); ++i) { + if (unique_reduction_axes[current_axes] == i) { + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[current_dim], input_shape[i]); + ++current_dim; + } + } + + input_shape[0] -= 1; + ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_shape.size(), input_shape.data())); + ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_success); + current_axes = 0; + current_dim = 0; + for (size_t i = 0; i < input_shape.size(); ++i) { + if (unique_reduction_axes[current_axes] == i) { + ++current_axes; + if (current_axes == unique_reduction_axes.size()) { + break; + } + } else { + ASSERT_EQ(output_shape->dim[current_dim], input_shape[i]); + ++current_dim; + } + } +} + +} // namespace xnnpack diff --git a/test/sum-nd.cc b/test/sum-nd.cc new file mode 100644 index 00000000000..2eee1b04188 --- /dev/null +++ b/test/sum-nd.cc @@ -0,0 +1,875 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
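+
+// The reduce_Nd tests below enumerate every non-empty subset of reduction
+// axes with a bitmask. As a sketch of the pattern (illustration only, not an
+// additional test):
+//
+//   for (uint32_t bm = 1; bm < (uint32_t(1) << 3); bm++) {
+//     std::vector<size_t> axes;
+//     for (size_t dim = 0; dim < 3; dim++) {
+//       if ((bm & (uint32_t(1) << dim)) != 0) {
+//         axes.push_back(dim);
+//       }
+//     }
+//     // e.g. bm == 0b101 yields axes == {0, 2}.
+//     // ...run SumOperatorTester with `axes` as the reduction axes.
+//   }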
+ +#include +#include +#include +#include +#include + +#include +#include "xnnpack.h" +#include "sum-operator-tester.h" + +constexpr size_t kDim1 = 2; +constexpr size_t kDim2 = 3; +constexpr size_t kDim3 = 5; +constexpr size_t kDim4 = 7; +constexpr size_t kDim5 = 11; +constexpr size_t kDim6 = 13; + + +TEST(SUM_ND_F16, reduce_all) { + SumOperatorTester() + .input_shape({kDim1}) + .reduction_axes({0}) + .TestF16(); +} + +TEST(SUM_ND_F16, reduce_first_axis) { + SumOperatorTester() + .input_shape({kDim1, kDim2}) + .reduction_axes({0}) + .TestF16(); +} + +TEST(SUM_ND_F16, reduce_last_axis) { + SumOperatorTester() + .input_shape({kDim1, kDim2}) + .reduction_axes({1}) + .TestF16(); +} + +TEST(SUM_ND_F16, reduce_2d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 2); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + SumOperatorTester() + .input_shape({kDim1, kDim2}) + .reduction_axes(reduction_axes) + .TestF16(); + } +} + +TEST(SUM_ND_F16, reduce_3d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 3); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool reduce_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestF16(); + } +} + +TEST(SUM_ND_F16, reduce_4d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 4); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool reduce_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool reduce_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestF16(); + } +} + +TEST(SUM_ND_F16, reduce_5d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 5); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool reduce_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool reduce_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool reduce_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestF16(); + } +} + +TEST(SUM_ND_F16, 
reduce_6d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 6); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool reduce_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool reduce_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool reduce_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool reduce_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + if (reduce_dim6) { + reduction_axes.push_back(5); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestF16(); + } +} + +TEST(SUM_ND_F16, reduce_6d_multithreaded) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 6); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool reduce_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool reduce_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool reduce_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool reduce_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + if (reduce_dim6) { + reduction_axes.push_back(5); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .multithreaded(true) + .TestF16(); + } +} + +TEST(SUM_ND_F32, reduce_all) { + SumOperatorTester() + .input_shape({kDim1}) + .reduction_axes({0}) + .TestF32(); +} + +TEST(SUM_ND_F32, reduce_first_axis) { + SumOperatorTester() + .input_shape({kDim1, kDim2}) + .reduction_axes({0}) + .TestF32(); +} + +TEST(SUM_ND_F32, reduce_last_axis) { + SumOperatorTester() + .input_shape({kDim1, kDim2, kDim3}) + .reduction_axes({0,2}) + .TestF32(); +} + +TEST(SUM_ND_F32, reduce_last_axis2) { + SumOperatorTester() + .input_shape({kDim1, kDim2, kDim3}) + .reduction_axes({0,2}) + .TestF32(); +} + +TEST(SUM_ND_F32, reduce_2d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 2); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + SumOperatorTester() + .input_shape({kDim1, kDim2}) + .reduction_axes(reduction_axes) + .TestF32(); + } +} + +TEST(SUM_ND_F32, reduce_3d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 3); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool reduce_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + 
if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestF32(); + } +} + +TEST(SUM_ND_F32, reduce_4d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 4); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool reduce_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool reduce_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestF32(); + } +} + +TEST(SUM_ND_F32, reduce_5d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 5); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool reduce_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool reduce_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool reduce_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestF32(); + } +} + +TEST(SUM_ND_F32, reduce_6d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 6); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool reduce_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool reduce_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool reduce_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool reduce_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + if (reduce_dim6) { + reduction_axes.push_back(5); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestF32(); + } +} + +TEST(SUM_ND_F32, reduce_6d_multithreaded) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 6); bm1++) { + const bool reduce_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool reduce_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool reduce_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool reduce_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool reduce_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool reduce_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}}; + reduction_axes.clear(); + if 
(reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + if (reduce_dim6) { + reduction_axes.push_back(5); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .multithreaded(true) + .TestF32(); + } +} + +TEST(SUM_ND_QS8, reduce_all) { + SumOperatorTester() + .input_shape({kDim1}) + .reduction_axes({0}) + .TestQS8(); +} + +TEST(SUM_ND_QS8, reduce_first_axis) { + SumOperatorTester() + .input_shape({kDim1, kDim2}) + .reduction_axes({0}) + .TestQS8(); +} + +TEST(SUM_ND_QS8, reduce_last_axis) { + SumOperatorTester() + .input_shape({kDim1, kDim2, kDim3}) + .reduction_axes({0,2}) + .TestQS8(); +} + +TEST(SUM_ND_QS8, reduce_last_axis2) { + SumOperatorTester() + .input_shape({kDim1, kDim2, kDim3}) + .reduction_axes({0,2}) + .TestQS8(); +} + +TEST(SUM_ND_QS8, reduce_2d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 2); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + SumOperatorTester() + .input_shape({kDim1, kDim2}) + .reduction_axes(reduction_axes) + .TestQS8(); + } +} + +TEST(SUM_ND_QS8, reduce_3d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 3); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQS8(); + } +} + +TEST(SUM_ND_QS8, reduce_4d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 4); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQS8(); + } +} + +TEST(SUM_ND_QS8, reduce_5d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 5); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + const bool reduce_dim5 = (bm1 & UINT32_C(16)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + 
reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQS8(); + } +} + +TEST(SUM_ND_QS8, reduce_6d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 6); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + const bool reduce_dim5 = (bm1 & UINT32_C(16)) != 0; + const bool reduce_dim6 = (bm1 & UINT32_C(32)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + if (reduce_dim6) { + reduction_axes.push_back(5); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQS8(); + } +} + +TEST(SUM_ND_QS8, reduce_6d_multithreaded) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 6); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + const bool reduce_dim5 = (bm1 & UINT32_C(16)) != 0; + const bool reduce_dim6 = (bm1 & UINT32_C(32)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + if (reduce_dim6) { + reduction_axes.push_back(5); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .multithreaded(true) + .TestQS8(); + } +} + +TEST(SUM_ND_QU8, reduce_all) { + SumOperatorTester() + .input_shape({kDim1}) + .reduction_axes({0}) + .TestQU8(); +} + +TEST(SUM_ND_QU8, reduce_first_axis) { + SumOperatorTester() + .input_shape({kDim1, kDim2}) + .reduction_axes({0}) + .TestQU8(); +} + +TEST(SUM_ND_QU8, reduce_last_axis) { + SumOperatorTester() + .input_shape({kDim1, kDim2, kDim3}) + .reduction_axes({0,2}) + .TestQU8(); +} + +TEST(SUM_ND_QU8, reduce_last_axis2) { + SumOperatorTester() + .input_shape({kDim1, kDim2, kDim3}) + .reduction_axes({0,2}) + .TestQU8(); +} + +TEST(SUM_ND_QU8, reduce_2d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 2); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + SumOperatorTester() + .input_shape({kDim1, kDim2}) + .reduction_axes(reduction_axes) + .TestQU8(); + } +} + +TEST(SUM_ND_QU8, reduce_3d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 3); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + + const std::vector input_shape{{kDim1, 
kDim2, kDim3}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQU8(); + } +} + +TEST(SUM_ND_QU8, reduce_4d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 4); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQU8(); + } +} + +TEST(SUM_ND_QU8, reduce_5d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 5); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + const bool reduce_dim5 = (bm1 & UINT32_C(16)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQU8(); + } +} + +TEST(SUM_ND_QU8, reduce_6d) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 6); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + const bool reduce_dim5 = (bm1 & UINT32_C(16)) != 0; + const bool reduce_dim6 = (bm1 & UINT32_C(32)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + } + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + if (reduce_dim6) { + reduction_axes.push_back(5); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .TestQU8(); + } +} + +TEST(SUM_ND_QU8, reduce_6d_multithreaded) { + std::vector reduction_axes; + for (uint32_t bm1 = 1; bm1 < (uint32_t(1) << 6); bm1++) { + const bool reduce_dim1 = (bm1 & UINT32_C(1)) != 0; + const bool reduce_dim2 = (bm1 & UINT32_C(2)) != 0; + const bool reduce_dim3 = (bm1 & UINT32_C(4)) != 0; + const bool reduce_dim4 = (bm1 & UINT32_C(8)) != 0; + const bool reduce_dim5 = (bm1 & UINT32_C(16)) != 0; + const bool reduce_dim6 = (bm1 & UINT32_C(32)) != 0; + + const std::vector input_shape{{kDim1, kDim2, kDim3, kDim4, kDim5, kDim6}}; + reduction_axes.clear(); + if (reduce_dim1) { + reduction_axes.push_back(0); + 
} + if (reduce_dim2) { + reduction_axes.push_back(1); + } + if (reduce_dim3) { + reduction_axes.push_back(2); + } + if (reduce_dim4) { + reduction_axes.push_back(3); + } + if (reduce_dim5) { + reduction_axes.push_back(4); + } + if (reduce_dim6) { + reduction_axes.push_back(5); + } + + SumOperatorTester() + .input_shape(input_shape) + .reduction_axes(reduction_axes) + .multithreaded(true) + .TestQU8(); + } +} diff --git a/test/sum-operator-tester.h b/test/sum-operator-tester.h new file mode 100644 index 00000000000..646af8614a6 --- /dev/null +++ b/test/sum-operator-tester.h @@ -0,0 +1,633 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "xnnpack.h" +#include "xnnpack/aligned-allocator.h" +#include "xnnpack/common.h" +#include "xnnpack/math.h" +#include "xnnpack/requantization.h" +#include "replicable_random_device.h" +#include "pthreadpool.h" + +class SumOperatorTester { + public: + SumOperatorTester& input_shape(std::initializer_list input_shape) { + assert(input_shape.size() <= XNN_MAX_TENSOR_DIMS); + this->input_shape_ = std::vector(input_shape); + return *this; + } + + SumOperatorTester& input_shape(const std::vector& input_shape) { + assert(input_shape.size() <= XNN_MAX_TENSOR_DIMS); + this->input_shape_ = std::vector(input_shape); + return *this; + } + + const std::vector& input_shape() const { + return this->input_shape_; + } + + size_t num_input_dims() const { + return this->input_shape_.size(); + } + + size_t num_input_elements() const { + return std::accumulate( + this->input_shape_.begin(), this->input_shape_.end(), size_t(1), std::multiplies()); + } + + SumOperatorTester& reduction_axes(std::initializer_list reduction_axes) { + assert(reduction_axes.size() <= XNN_MAX_TENSOR_DIMS); + this->reduction_axes_ = std::vector(reduction_axes); + return *this; + } + + SumOperatorTester& reduction_axes(const std::vector reduction_axes) { + assert(reduction_axes.size() <= XNN_MAX_TENSOR_DIMS); + this->reduction_axes_ = reduction_axes; + return *this; + } + + const std::vector& reduction_axes() const { + return this->reduction_axes_; + } + + size_t num_reduction_axes() const { + return this->reduction_axes_.size(); + } + + SumOperatorTester& multithreaded(size_t multithreaded) { + this->multithreaded_ = multithreaded; + return *this; + } + + size_t multithreaded() const { + return this->multithreaded_; + } + + size_t num_threads() const { + // Do not spin up excessive number of threads for tests. + return multithreaded() ? 5 : 1; + } + + SumOperatorTester& iterations(size_t iterations) { + this->iterations_ = iterations; + return *this; + } + + size_t iterations() const { + return this->iterations_; + } + + void TestF16() const { + xnnpack::ReplicableRandomDevice rng; + std::uniform_real_distribution f32dist(0.01f, 1.0f); + + // Compute generalized shapes. 
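+    // The shapes are right-aligned into XNN_MAX_TENSOR_DIMS-element arrays
+    // padded with leading 1s, and reduced dimensions are set to 1 in the
+    // output. The matching output stride of 0 (computed below) makes the
+    // 6-deep reference loop accumulate every reduced element into the same
+    // output slot.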
+ std::array input_dims; + std::array output_dims; + std::fill(input_dims.begin(), input_dims.end(), 1); + std::fill(output_dims.begin(), output_dims.end(), 1); + std::copy(input_shape().cbegin(), input_shape().cend(), input_dims.end() - num_input_dims()); + std::copy(input_dims.cbegin(), input_dims.cend(), output_dims.begin()); + for (size_t axis : reduction_axes()) { + (output_dims.end() - num_input_dims())[axis] = 1; + } + const size_t num_output_elements = + std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), std::multiplies()); + + // Compute generalized strides. + std::array input_strides; + std::array output_strides; + size_t input_stride = 1, output_stride = 1; + for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { + input_strides[i - 1] = input_stride; + output_strides[i - 1] = output_dims[i - 1] == 1 ? 0 : output_stride; + input_stride *= input_dims[i - 1]; + output_stride *= output_dims[i - 1]; + } + + std::vector input(XNN_EXTRA_BYTES / sizeof(xnn_float16) + num_input_elements()); + std::vector output(num_output_elements); + std::vector output_ref(num_output_elements); + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::unique_ptr auto_threadpool{nullptr, pthreadpool_destroy}; + if (multithreaded()) { + const pthreadpool_t threadpool = pthreadpool_create(num_threads()); + if (pthreadpool_get_threads_count(threadpool) <= 1) { + GTEST_SKIP(); + } else { + auto_threadpool.reset(threadpool); + } + } + + std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); + std::fill(output.begin(), output.end(), std::nanf("")); + + // Compute reference results. + std::fill(output_ref.begin(), output_ref.end(), 0.0f); + for (size_t i = 0; i < input_dims[0]; i++) { + for (size_t j = 0; j < input_dims[1]; j++) { + for (size_t k = 0; k < input_dims[2]; k++) { + for (size_t l = 0; l < input_dims[3]; l++) { + for (size_t m = 0; m < input_dims[4]; m++) { + for (size_t n = 0; n < input_dims[5]; n++) { + output_ref[i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3] + m * output_strides[4] + n * output_strides[5]] += + input[i * input_strides[0] + j * input_strides[1] + k * input_strides[2] + l * input_strides[3] + m * input_strides[4] + n * input_strides[5]]; + } + } + } + } + } + } + + // Create, setup, run, and destroy a sum operator. + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + xnn_operator_t sum_op = nullptr; + + const xnn_status status = xnn_create_sum_nd_f16( + /*flags=*/0, &sum_op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, sum_op); + + // Smart pointer to automatically delete sum_op. + std::unique_ptr auto_sum_op(sum_op, xnn_delete_operator); + + size_t workspace_size = SIZE_MAX; + size_t workspace_alignment = SIZE_MAX; + ASSERT_EQ(xnn_status_success, + xnn_reshape_sum_nd_f16( + sum_op, + num_reduction_axes(), + reduction_axes().data(), + num_input_dims(), + input_shape().data(), + &workspace_size, &workspace_alignment, + auto_threadpool.get())); + + ASSERT_NE(workspace_size, SIZE_MAX); + ASSERT_LE(workspace_alignment, XNN_ALLOCATION_ALIGNMENT); + std::vector> workspace(workspace_size); + + ASSERT_EQ(xnn_status_success, + xnn_setup_sum_nd_f16( + sum_op, + workspace.data(), + input.data(), output.data())); + + ASSERT_EQ(xnn_status_success, + xnn_run_operator(sum_op, auto_threadpool.get())); + + // Verify results. 
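+      // The f16 output is checked against the f32 reference with a relative
+      // tolerance (3% of the reference magnitude) to absorb half-precision
+      // rounding of the accumulated sum.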
+ for (size_t i = 0; i < output_dims[0]; i++) { + for (size_t j = 0; j < output_dims[1]; j++) { + for (size_t k = 0; k < output_dims[2]; k++) { + for (size_t l = 0; l < output_dims[3]; l++) { + for (size_t m = 0; m < output_dims[4]; m++) { + for (size_t n = 0; n < output_dims[5]; n++) { + const size_t index = + i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3] + m * output_strides[4] + n * output_strides[5]; + ASSERT_NEAR(output[index], output_ref[index], 3.0e-2f * std::abs(output_ref[index])) + << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ")"; + } + } + } + } + } + } + } + } + + void TestF32() const { + xnnpack::ReplicableRandomDevice rng; + std::uniform_real_distribution f32dist(0.01f, 1.0f); + + // Compute generalized shapes. + std::array input_dims; + std::array output_dims; + std::fill(input_dims.begin(), input_dims.end(), 1); + std::fill(output_dims.begin(), output_dims.end(), 1); + std::copy(input_shape().cbegin(), input_shape().cend(), input_dims.end() - num_input_dims()); + std::copy(input_dims.cbegin(), input_dims.cend(), output_dims.begin()); + for (size_t axis : reduction_axes()) { + (output_dims.end() - num_input_dims())[axis] = 1; + } + const size_t num_output_elements = + std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), std::multiplies()); + + // Compute generalized strides. + std::array input_strides; + std::array output_strides; + size_t input_stride = 1, output_stride = 1; + for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { + input_strides[i - 1] = input_stride; + output_strides[i - 1] = output_dims[i - 1] == 1 ? 0 : output_stride; + input_stride *= input_dims[i - 1]; + output_stride *= output_dims[i - 1]; + } + + std::vector input(XNN_EXTRA_BYTES / sizeof(float) + num_input_elements()); + std::vector output(num_output_elements); + std::vector output_ref(num_output_elements); + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::unique_ptr auto_threadpool{nullptr, pthreadpool_destroy}; + if (multithreaded()) { + const pthreadpool_t threadpool = pthreadpool_create(num_threads()); + if (pthreadpool_get_threads_count(threadpool) <= 1) { + GTEST_SKIP(); + } else { + auto_threadpool.reset(threadpool); + } + } + + std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); + std::fill(output.begin(), output.end(), nanf("")); + + // Compute reference results. + std::fill(output_ref.begin(), output_ref.end(), 0.0); + for (size_t i = 0; i < input_dims[0]; i++) { + for (size_t j = 0; j < input_dims[1]; j++) { + for (size_t k = 0; k < input_dims[2]; k++) { + for (size_t l = 0; l < input_dims[3]; l++) { + for (size_t m = 0; m < input_dims[4]; m++) { + for (size_t n = 0; n < input_dims[5]; n++) { + output_ref[i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3] + m * output_strides[4] + n * output_strides[5]] += + input[i * input_strides[0] + j * input_strides[1] + k * input_strides[2] + l * input_strides[3] + m * input_strides[4] + n * input_strides[5]]; + } + } + } + } + } + } + + // Create, setup, run, and destroy a sum operator. 
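+      // Unlike the f16/qs8/qu8 paths, the f32 reshape and setup calls below
+      // take no workspace arguments.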
+ ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + xnn_operator_t sum_op = nullptr; + + const xnn_status status = xnn_create_sum_nd_f32( + /*flags=*/0, &sum_op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, sum_op); + + // Smart pointer to automatically delete sum_op. + std::unique_ptr auto_sum_op(sum_op, xnn_delete_operator); + + ASSERT_EQ(xnn_status_success, + xnn_reshape_sum_nd_f32( + sum_op, + num_reduction_axes(), + reduction_axes().data(), + num_input_dims(), + input_shape().data(), + auto_threadpool.get())); + + ASSERT_EQ(xnn_status_success, + xnn_setup_sum_nd_f32( + sum_op, + input.data(), output.data())); + + ASSERT_EQ(xnn_status_success, + xnn_run_operator(sum_op, auto_threadpool.get())); + + // Verify results. + for (size_t i = 0; i < output_dims[0]; i++) { + for (size_t j = 0; j < output_dims[1]; j++) { + for (size_t k = 0; k < output_dims[2]; k++) { + for (size_t l = 0; l < output_dims[3]; l++) { + for (size_t m = 0; m < output_dims[4]; m++) { + for (size_t n = 0; n < output_dims[5]; n++) { + const size_t index = + i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3] + m * output_strides[4] + n * output_strides[5]; + ASSERT_NEAR(output[index], output_ref[index], 3.0e-6f * std::abs(output_ref[index])) + << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ")"; + } + } + } + } + } + } + } + } + + void TestQS8() const { + xnnpack::ReplicableRandomDevice rng; + std::uniform_int_distribution i8dist( + std::numeric_limits::min(), std::numeric_limits::max()); + + // Compute generalized shapes. + std::array input_dims; + std::array output_dims; + std::fill(input_dims.begin(), input_dims.end(), 1); + std::fill(output_dims.begin(), output_dims.end(), 1); + std::copy(input_shape().cbegin(), input_shape().cend(), input_dims.end() - num_input_dims()); + std::copy(input_dims.cbegin(), input_dims.cend(), output_dims.begin()); + for (size_t axis : reduction_axes()) { + (output_dims.end() - num_input_dims())[axis] = 1; + } + const size_t num_output_elements = + std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), std::multiplies()); + + // Compute generalized strides. + std::array input_strides; + std::array output_strides; + size_t input_stride = 1, output_stride = 1; + for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { + input_strides[i - 1] = input_stride; + output_strides[i - 1] = output_dims[i - 1] == 1 ? 
0 : output_stride; + input_stride *= input_dims[i - 1]; + output_stride *= output_dims[i - 1]; + } + + std::vector input(XNN_EXTRA_BYTES / sizeof(int8_t) + num_input_elements()); + std::vector output(num_output_elements); + std::vector output_ref(num_output_elements); + std::vector output_ref_qs8(num_output_elements); + std::vector accumulator(num_output_elements); + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::fill(accumulator.begin(), accumulator.end(), 0); + + std::unique_ptr auto_threadpool{nullptr, pthreadpool_destroy}; + if (multithreaded()) { + const pthreadpool_t threadpool = pthreadpool_create(num_threads()); + if (pthreadpool_get_threads_count(threadpool) <= 1) { + GTEST_SKIP(); + } else { + auto_threadpool.reset(threadpool); + } + } + + std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); + std::fill(output.begin(), output.end(), INT8_C(0xA5)); + + const int32_t num_reduced_elements = num_input_elements() / num_output_elements; + const float input_scale = 0.5f; + const float output_scale = 0.75f; + const int8_t input_zero_point = i8dist(rng); + const int8_t output_zero_point = i8dist(rng); + const int8_t quantized_output_min = xnn_qs8_quantize(-INFINITY, output_scale, output_zero_point); + const int8_t quantized_output_max = xnn_qs8_quantize(INFINITY, output_scale, output_zero_point); + + // Compute reference results. + std::fill(output_ref.begin(), output_ref.end(), 0); + for (size_t i = 0; i < input_dims[0]; i++) { + for (size_t j = 0; j < input_dims[1]; j++) { + for (size_t k = 0; k < input_dims[2]; k++) { + for (size_t l = 0; l < input_dims[3]; l++) { + for (size_t m = 0; m < input_dims[4]; m++) { + for (size_t n = 0; n < input_dims[5]; n++) { + size_t input_idx = i * input_strides[0] + j * input_strides[1] + k * input_strides[2] + l * input_strides[3] + m * input_strides[4] + n * input_strides[5]; + size_t output_idx = i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3] + m * output_strides[4] + n * output_strides[5]; + accumulator[output_idx] += static_cast(input[input_idx]); + } + } + } + } + } + } + + for (size_t idx = 0; idx < output_ref.size(); ++idx) { + output_ref[idx] = static_cast(accumulator[idx] - static_cast(input_zero_point) * num_reduced_elements); + output_ref[idx] *= input_scale * output_scale; + output_ref[idx] = std::min(output_ref[idx], static_cast(static_cast(quantized_output_max) - static_cast(output_zero_point))); + output_ref[idx] = std::max(output_ref[idx], static_cast(static_cast(quantized_output_min) - static_cast(output_zero_point))); + output_ref_qs8[idx] = static_cast(std::lrintf(output_ref[idx]) + static_cast(output_zero_point)); + } + + // Create, setup, run, and destroy a sum operator. + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + xnn_operator_t sum_op = nullptr; + + const xnn_status status = xnn_create_sum_nd_qs8( + input_scale * output_scale, input_zero_point, output_zero_point, + /*flags=*/0, &sum_op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, sum_op); + + // Smart pointer to automatically delete sum_op. 
+ std::unique_ptr auto_sum_op(sum_op, xnn_delete_operator); + + size_t workspace_size = SIZE_MAX; + size_t workspace_alignment = SIZE_MAX; + ASSERT_EQ(xnn_status_success, + xnn_reshape_sum_nd_qs8( + sum_op, + num_reduction_axes(), + reduction_axes().data(), + num_input_dims(), + input_shape().data(), + &workspace_size, &workspace_alignment, + auto_threadpool.get())); + + ASSERT_NE(workspace_size, SIZE_MAX); + ASSERT_LE(workspace_alignment, XNN_ALLOCATION_ALIGNMENT); + std::vector> workspace(workspace_size); + + ASSERT_EQ(xnn_status_success, + xnn_setup_sum_nd_qs8( + sum_op, + workspace.data(), + input.data(), output.data())); + + ASSERT_EQ(xnn_status_success, + xnn_run_operator(sum_op, auto_threadpool.get())); + + // Verify results. + for (size_t i = 0; i < output_dims[0]; i++) { + for (size_t j = 0; j < output_dims[1]; j++) { + for (size_t k = 0; k < output_dims[2]; k++) { + for (size_t l = 0; l < output_dims[3]; l++) { + for (size_t m = 0; m < output_dims[4]; m++) { + for (size_t n = 0; n < output_dims[5]; n++) { + const size_t index = + i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3] + m * output_strides[4] + n * output_strides[5]; + ASSERT_EQ(output[index], output_ref_qs8[index]) + << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ")"; + } + } + } + } + } + } + } + } + + void TestQU8() const { + xnnpack::ReplicableRandomDevice rng; + std::uniform_int_distribution u8dist( + std::numeric_limits::min(), std::numeric_limits::max()); + + // Compute generalized shapes. + std::array input_dims; + std::array output_dims; + std::fill(input_dims.begin(), input_dims.end(), 1); + std::fill(output_dims.begin(), output_dims.end(), 1); + std::copy(input_shape().cbegin(), input_shape().cend(), input_dims.end() - num_input_dims()); + std::copy(input_dims.cbegin(), input_dims.cend(), output_dims.begin()); + for (size_t axis : reduction_axes()) { + (output_dims.end() - num_input_dims())[axis] = 1; + } + const size_t num_output_elements = + std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), std::multiplies()); + + // Compute generalized strides. + std::array input_strides; + std::array output_strides; + size_t input_stride = 1, output_stride = 1; + for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) { + input_strides[i - 1] = input_stride; + output_strides[i - 1] = output_dims[i - 1] == 1 ? 
0 : output_stride; + input_stride *= input_dims[i - 1]; + output_stride *= output_dims[i - 1]; + } + + std::vector input(XNN_EXTRA_BYTES / sizeof(int8_t) + num_input_elements()); + std::vector output(num_output_elements); + std::vector output_ref(num_output_elements); + std::vector output_ref_qu8(num_output_elements); + std::vector accumulator(num_output_elements); + for (size_t iteration = 0; iteration < iterations(); iteration++) { + std::fill(accumulator.begin(), accumulator.end(), 0); + + std::unique_ptr auto_threadpool{nullptr, pthreadpool_destroy}; + if (multithreaded()) { + const pthreadpool_t threadpool = pthreadpool_create(num_threads()); + if (pthreadpool_get_threads_count(threadpool) <= 1) { + GTEST_SKIP(); + } else { + auto_threadpool.reset(threadpool); + } + } + + std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); + std::fill(output.begin(), output.end(), INT8_C(0xA5)); + + const int32_t num_reduced_elements = num_input_elements() / num_output_elements; + const float input_scale = 0.5f; + const float output_scale = 0.75f; + const uint8_t input_zero_point = u8dist(rng); + const uint8_t output_zero_point = u8dist(rng); + const uint8_t quantized_output_min = xnn_qu8_quantize(-INFINITY, output_scale, output_zero_point); + const uint8_t quantized_output_max = xnn_qu8_quantize(INFINITY, output_scale, output_zero_point); + + // Compute reference results. + std::fill(output_ref.begin(), output_ref.end(), 0); + for (size_t i = 0; i < input_dims[0]; i++) { + for (size_t j = 0; j < input_dims[1]; j++) { + for (size_t k = 0; k < input_dims[2]; k++) { + for (size_t l = 0; l < input_dims[3]; l++) { + for (size_t m = 0; m < input_dims[4]; m++) { + for (size_t n = 0; n < input_dims[5]; n++) { + size_t input_idx = i * input_strides[0] + j * input_strides[1] + k * input_strides[2] + l * input_strides[3] + m * input_strides[4] + n * input_strides[5]; + size_t output_idx = i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3] + m * output_strides[4] + n * output_strides[5]; + accumulator[output_idx] += static_cast(input[input_idx]); + } + } + } + } + } + } + + for (size_t idx = 0; idx < output_ref.size(); ++idx) { + output_ref[idx] = static_cast(accumulator[idx] - static_cast(input_zero_point) * num_reduced_elements); + output_ref[idx] *= input_scale * output_scale; + output_ref[idx] = std::min(output_ref[idx], static_cast(static_cast(quantized_output_max) - static_cast(output_zero_point))); + output_ref[idx] = std::max(output_ref[idx], static_cast(static_cast(quantized_output_min) - static_cast(output_zero_point))); + output_ref_qu8[idx] = static_cast(std::lrintf(output_ref[idx]) + static_cast(output_zero_point)); + } + + // Create, setup, run, and destroy a sum operator. + ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); + xnn_operator_t sum_op = nullptr; + + const xnn_status status = xnn_create_sum_nd_qu8( + input_scale * output_scale, input_zero_point, output_zero_point, + /*flags=*/0, &sum_op); + if (status == xnn_status_unsupported_hardware) { + GTEST_SKIP(); + } + ASSERT_EQ(xnn_status_success, status); + ASSERT_NE(nullptr, sum_op); + + // Smart pointer to automatically delete sum_op. 
+ std::unique_ptr auto_sum_op(sum_op, xnn_delete_operator); + + size_t workspace_size = SIZE_MAX; + size_t workspace_alignment = SIZE_MAX; + ASSERT_EQ(xnn_status_success, + xnn_reshape_sum_nd_qu8( + sum_op, + num_reduction_axes(), + reduction_axes().data(), + num_input_dims(), + input_shape().data(), + &workspace_size, &workspace_alignment, + auto_threadpool.get())); + + ASSERT_NE(workspace_size, SIZE_MAX); + ASSERT_LE(workspace_alignment, XNN_ALLOCATION_ALIGNMENT); + std::vector> workspace(workspace_size); + + ASSERT_EQ(xnn_status_success, + xnn_setup_sum_nd_qu8( + sum_op, + workspace.data(), + input.data(), output.data())); + + ASSERT_EQ(xnn_status_success, + xnn_run_operator(sum_op, auto_threadpool.get())); + + // Verify results. + for (size_t i = 0; i < output_dims[0]; i++) { + for (size_t j = 0; j < output_dims[1]; j++) { + for (size_t k = 0; k < output_dims[2]; k++) { + for (size_t l = 0; l < output_dims[3]; l++) { + for (size_t m = 0; m < output_dims[4]; m++) { + for (size_t n = 0; n < output_dims[5]; n++) { + const size_t index = + i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3] + m * output_strides[4] + n * output_strides[5]; + ASSERT_EQ(output[index], output_ref_qu8[index]) + << "(i, j, k, l, m, n) = (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ")"; + } + } + } + } + } + } + } + } + + private: + std::vector input_shape_; + std::vector reduction_axes_; + bool multithreaded_{false}; + size_t iterations_{3}; +};
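
For reference, the following is a minimal, hypothetical sketch of driving the new f32 operator API end to end, mirroring the create/reshape/setup/run sequence used by SumOperatorTester::TestF32 above. The wrapper function name, shapes, and reduction axes are illustrative and not part of the patch.

    #include <cstddef>
    #include "xnnpack.h"

    // Hypothetical standalone example (not part of the patch): sum a 2x3 f32
    // tensor over axis 1, producing one value per row.
    int sum_rows_example() {
      if (xnn_initialize(/*allocator=*/nullptr) != xnn_status_success) return 1;

      const float input[2 * 3] = {1, 2, 3, 4, 5, 6};
      float output[2] = {0, 0};
      const size_t input_shape[2] = {2, 3};
      const size_t reduction_axes[1] = {1};

      xnn_operator_t sum_op = nullptr;
      if (xnn_create_sum_nd_f32(/*flags=*/0, &sum_op) != xnn_status_success) return 1;
      // Reshape binds the reduction axes and input shape; f32 needs no workspace.
      if (xnn_reshape_sum_nd_f32(sum_op, /*num_reduction_axes=*/1, reduction_axes,
                                 /*num_input_dims=*/2, input_shape,
                                 /*threadpool=*/nullptr) != xnn_status_success) return 1;
      if (xnn_setup_sum_nd_f32(sum_op, input, output) != xnn_status_success) return 1;
      if (xnn_run_operator(sum_op, /*threadpool=*/nullptr) != xnn_status_success) return 1;
      // output[0] == 6.0f, output[1] == 15.0f
      xnn_delete_operator(sum_op);
      return 0;
    }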